--- linux-rt-2.6.29.5.orig/debian/NOTES +++ linux-rt-2.6.29.5/debian/NOTES @@ -0,0 +1,4 @@ +eSCO patch removed. Replaced upstream with a disable_esco module parm. +airprime: Module gone, use option driver instead +AppArmor: Patch is all there and ported. Ooops when enabled, so default + off (still can be enabled apparmor=1) --- linux-rt-2.6.29.5.orig/debian/rules +++ linux-rt-2.6.29.5/debian/rules @@ -0,0 +1,81 @@ +#!/usr/bin/make -f +# +# debian/rules for Ubuntu linux +# +# Use this however you want, just give credit where credit is due. +# +# Copyright (c) 2007 Ben Collins +# + +# dpkg-buildpackage passes options that are incompatible +# with the kernel build. +unexport CFLAGS +unexport LDFLAGS + +# This is the debhelper compatibility version to use. +export DH_COMPAT=4 +export LC_ALL=C +export SHELL=/bin/bash -e + +# Common variables for all architectures +include debian/rules.d/0-common-vars.mk + +# Pull in some arch specific stuff +include debian/rules.d/$(arch).mk + +# Maintainer targets +include debian/rules.d/1-maintainer.mk + +# Debian Build System targets +binary: binary-indep binary-arch + +build: build-arch build-indep + +clean: unpatch debian/control + dh_testdir + dh_testroot + dh_clean + + # d-i stuff + rm -rf modules kernel-versions package-list + rm -rf debian/d-i-$(arch) + + # normal build junk + rm -rf debian/abi/$(release)-$(revision) + rm -rf $(builddir) + rm -f $(stampdir)/stamp-* + rm -rf debian/linux-* + + # This gets rid of the d-i packages in control + cp -f debian/control.stub debian/control + +# Builds the image, arch headers and debug packages +include debian/rules.d/2-binary-arch.mk + +# Rules for building the udebs (debian-installer) +#include debian/rules.d/5-udebs.mk + +# Builds the source, doc and linux-headers indep packages +include debian/rules.d/3-binary-indep.mk + +# Various checks to be performed on builds +include debian/rules.d/4-checks.mk + +# Misc stuff +debian/control.stub: debian/scripts/control-create \ + debian/control.stub.in \ + debian/changelog \ + $(wildcard debian/control.d/* debian/sub-flavours/*.vars) +# for i in debian/control.stub.in; do + for i in debian/control.stub.in; do \ + new=`echo $$i | sed 's/\.in$$//'`; \ + cat $$i | sed -e 's/PKGVER/$(release)/g' -e 's/ABINUM/$(abinum)/g' > \ + $$new; \ + done + flavours="$(wildcard debian/control.d/vars.* debian/sub-flavours/*.vars)";\ + for i in $$flavours; do \ + $(SHELL) debian/scripts/control-create $$i | \ + sed -e 's/PKGVER/$(release)/g' -e 's/ABINUM/$(abinum)/g' >> \ + debian/control.stub; \ + done + cp debian/control.stub debian/control --- linux-rt-2.6.29.5.orig/debian/control.stub +++ linux-rt-2.6.29.5/debian/control.stub @@ -0,0 +1,72 @@ +Source: linux-rt +Section: devel +Priority: optional +Maintainer: Alessio Igor Bogani +Standards-Version: 3.6.1 +Build-Depends: debhelper (>= 3), module-init-tools, kernel-wedge (>= 2.24ubuntu1), makedumpfile [!armel], quilt +Build-Depends-Indep: xmlto, docbook-utils, gs, transfig, bzip2, sharutils + +Package: linux-rt-headers-2.6.29.5-1 +Architecture: all +Section: devel +Priority: optional +Depends: coreutils | fileutils (>= 4.0) +Provides: linux-rt-headers, linux-rt-headers-2.6 +Description: Header files related to Linux kernel version 2.6.29.5 + This package provides kernel header files for version 2.6.29.5, for sites + that want the latest kernel headers. 
Please read + /usr/share/doc/linux-headers-2.6.29.5-1/debian.README.gz for details + +Package: linux-image-2.6.29.5-1-rt +Architecture: i386 amd64 +Section: base +Priority: optional +Pre-Depends: dpkg (>= 1.10.24) +Provides: linux-image, linux-image-2.6, fuse-module, kvm-api-4, redhat-cluster-modules, ivtv-modules, ndiswrapper-modules-1.9 +Depends: initramfs-tools (>= 0.36ubuntu6), coreutils | fileutils (>= 4.0), module-init-tools (>= 3.3-pre11-4ubuntu3) +Conflicts: hotplug (<< 0.0.20040105-1) +Recommends: grub | lilo (>= 19.1) +Suggests: fdutils, linux-doc-2.6.29.5 | linux-source-2.6.29.5 +Description: Linux kernel image for version 2.6.29.5 on Ingo Molnar's full real time preemption patch (2.6.28-rt) + This package contains the Linux kernel image for version 2.6.29.5 on + Ingo Molnar's full real time preemption patch (2.6.28-rt). + . + Also includes the corresponding System.map file, the modules built by the + packager, and scripts that try to ensure that the system is not left in an + unbootable state after an update. + . + Supports Generic processors. + . + Geared toward real time systems. + . + You likely do not want to install this package directly. Instead, install + the linux-rt meta-package, which will ensure that upgrades work + correctly, and that supporting packages are also installed. + +Package: linux-headers-2.6.29.5-1-rt +Architecture: i386 amd64 +Section: devel +Priority: optional +Depends: coreutils | fileutils (>= 4.0), linux-rt-headers-2.6.29.5-1, ${shlibs:Depends} +Provides: linux-headers, linux-headers-2.6 +Description: Linux kernel headers for version 2.6.29.5 on Ingo Molnar's full real time preemption patch (2.6.28-rt) + This package provides kernel header files for version 2.6.29.5 on + Ingo Molnar's full real time preemption patch (2.6.28-rt). + . + This is for sites that want the latest kernel headers. Please read + /usr/share/doc/linux-headers-2.6.29.5-1/debian.README.gz for details. + +Package: linux-image-debug-2.6.29.5-1-rt +Architecture: i386 amd64 +Section: devel +Priority: optional +Provides: linux-debug +Description: Linux kernel debug image for version 2.6.29.5 on Ingo Molnar's full real time preemption patch (2.6.28-rt) + This package provides a kernel debug image for version 2.6.29.5 on + Ingo Molnar's full real time preemption patch (2.6.28-rt). + . + This is for sites that wish to debug the kernel. + . + The kernel image contained in this package is NOT meant to be booted. It + is uncompressed and unstripped. This package also includes the + unstripped modules. --- linux-rt-2.6.29.5.orig/debian/copyright +++ linux-rt-2.6.29.5/debian/copyright @@ -0,0 +1,30 @@ +This is the Ubuntu prepackaged version of the Linux kernel. +Linux was written by Linus Torvalds +and others. + +This package was put together by the Ubuntu Kernel Team, from +sources retrieved from upstream linux git. +The sources may be found at most Linux ftp sites, including +ftp://ftp.kernel.org/pub/linux/kernel/ + +This package is currently maintained by the +Ubuntu Kernel Team + +Linux is copyrighted by Linus Torvalds and others. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 dated June, 1991. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + 02111-1307, USA. + +On Ubuntu Linux systems, the complete text of the GNU General +Public License v2 can be found in `/usr/share/common-licenses/GPL-2'. --- linux-rt-2.6.29.5.orig/debian/compat +++ linux-rt-2.6.29.5/debian/compat @@ -0,0 +1 @@ +4 --- linux-rt-2.6.29.5.orig/debian/changelog.historical +++ linux-rt-2.6.29.5/debian/changelog.historical @@ -0,0 +1,5745 @@ +linux (2.6.24-19.33) UNRELEASED; urgency=low + + CHANGELOG: Do not edit directly. Autogenerated at release. + CHANGELOG: Use the printchanges target to see the curent changes. + CHANGELOG: Use the insertchanges target to create the final log. + + -- Tim Gardner Sun, 04 May 2008 20:22:21 -0600 + +linux (2.6.24-18.32) hardy-security; urgency=low + + * CVE-2007-6694: [POWERPC] CHRP: Fix possible NULL pointer dereference + * fix SMP ordering hole in fcntl_setlk() (CVE-2008-1669) + * Fix dnotify/close race (CVE-2008-1375) + * tehuti: check register size (CVE-2008-1675) + * tehuti: move ioctl perm check closer to function start (CVE-2008-1675) + + -- Ben Collins Mon, 19 May 2008 16:50:11 +0000 + +linux (2.6.24-17.31) hardy; urgency=low + + [Alessio Igor Bogani] + + * rt: Fix mutex in the toshiba_acpi driver + * rt: Updated configuration files + + [Ben Collins] + + * build: Fix revert detection in git-ubuntu-log + * SAUCE: Re-add eeprom_bad_csum_allow module-param + - LP: #60388 + + [Stefan Bader] + + * Pulled updates to openvz custom build. Fixes openvz 'refuses to boot' problem. + - LP: #210672 + * sched: retain vruntime, fix delayed key events when CONFIG_FAIR_GROUP_SCHED. + - LP: #218516 + * UBUNTU: SAUCE: Add blacklist support to fix Belkin bluetooth dongle. + - LP: #140511 + + [Tim Gardner] + + * Enable CONFIG_ISCSI_TCP for -virtual + - LP: #218215 + * build: Add fancontrol modules to powerpc64-smp debian installer + * Fix Xen Dom0/DomU bridging + - LP: #218126 + * TSC Clocksource can cause hangs and time jumps + - LP: #221351 + * Kernel should use CONFIG_FAIR_CGROUP_SCHED. Fixes high load issues + with pulseaudio. + - LP: #188226 + + [Upstream Kernel Changes] + + * KVM: MMU: prepopulate guest pages after write-protecting + - LP: #221032 + + -- Tim Gardner Fri, 11 Apr 2008 07:59:10 -0600 + +linux (2.6.24-16.30) hardy; urgency=low + + * Fix amd64/i386 ABI and module check FTBS by creating an ignore + and ignore.modules in the ABI directory. + + -- Tim Gardner Wed, 09 Apr 2008 21:58:25 -0600 + +linux (2.6.24-16.29) hardy; urgency=low + + [Stephan Bader] + + * UBUNTU: SAUCE: mmc: Increase power_up deleay to fix TI readers + + [Alessio Igor Bogani] + + * rt: Updated configuration files + + [Chuck Short] + + * Xen updates for vitrio changes. + + [Tim Gardner] + + * openvz updates for vitrio changes. + + -- Tim Gardner Tue, 08 Apr 2008 21:48:16 -0600 + +linux (2.6.24-16.28) hardy; urgency=low + + [Tim Gardner] + + * Revert "UBUNTU: x86: tsc prevent time going backwards" + + [Kees Cook] + + * AppArmor: implement mmap_min_addr check as done in mainline. + + [Soren Hansen] + + * Bring our virtio code up to date with 2.6.25-rc7 + + [Upstream Kernel Changes] + + * Ubuntu: Revert all our virtio changes + * lguest: Reboot support + * lguest: adapt launcher to per-cpuness + * virtio: Implement skb_partial_csum_set, for setting partial csums on + untrusted packets. + * virtio: simplify config mechanism. 
+ * virtio: explicit enable_cb/disable_cb rather than callback return. + * virtio: configuration change callback + * virtio: Fix vring_init/vring_size to take unsigned long + * virtio: clarify NO_NOTIFY flag usage + * virtio: remove unused id field from struct virtio_blk_outhdr + * virtio: Net header needs hdr_len + * virtio: Tweak virtio_net defines + * virtio: populate network rings in the probe routine, not open + * virtio: reset function + * virtio: handle interrupts after callbacks turned off + * virtio: Use the sg_phys convenience function. + * virtio: Allow virtio to be modular and used by modules + * virtnet: remove double ether_setup + * virtio: flush buffers on open + * virtio: free transmit skbs when notified, not on next xmit. + * virtio_net: parametrize the napi_weight for virtio receive queue. + * virtio_blk: provide getgeo + * virtio_blk: Dont waste major numbers + * virtio_blk: implement naming for vda-vdz,vdaa-vdzz,vdaaa-vdzzz + * virtio: PCI device + * virtio: Use PCI revision field to indicate virtio PCI ABI version + * virtio: balloon driver + * virtio net: fix oops on interface-up + * virtio: add missing #include + * virtio: fix race in enable_cb + * virtio: handle > 2 billion page balloon targets + * virtio_net: Fix oops on early interrupts - introduced by virtio reset + code + * lguest: Do not append space to guests kernel command line + * virtio: Use spin_lock_irqsave/restore for virtio-pci + * virtio: Fix sysfs bits to have proper block symlink + * virtio: Enable netpoll interface for netconsole logging + * virtio_pci: unregister virtio device at device remove + * lguest: Add puppies which where previously missing. + * lguest: lguest.txt documentation fix + * lguest: Don't need comment terminator before disk section. + * virtio_pci iomem annotations + * virtio_net: remove overzealous printk + * virtio: remove overzealous BUG_ON. + + -- Tim Gardner Tue, 08 Apr 2008 11:53:49 -0600 + +linux (2.6.24-15.27) hardy; urgency=low + + [Alan Stern] + + * usb-storage: don't access beyond the end of the sg buffer + - LP: #204922 + + [Mario Limonciello] + + * Enable Reset and SCO workaround on Dell 410 BT adapter + + [Tim Gardner] + + * Enable CONFIG_E1000 in the i386 virtual image. + - LP: #205646 + + [Thomas Gleixner] + + * x86: tsc prevent time going backwards + + [Matthew Garrett] + + * Fix framebuffer fonts on non-x86 platforms + + -- Tim Gardner Fri, 04 Apr 2008 08:14:49 -0600 + +linux (2.6.24-15.26) hardy; urgency=low + + [Colin Ian King] + + * airprime.c supports more devices + - LP: #208250 + + [Kees Cook] + + * AppArmor: get latest batch of upstream fixes into Hardy (svn 1160) + + [Stefan Bader] + + * ACPI: fix boot oops regression in kernel + - LP: #207014 + + [Tim Gardner] + + * Enable CGROUPS for non x86/x86_64 arches, all flavours. + - LP: #188226 + + -- Tim Gardner Thu, 03 Apr 2008 07:00:29 -0600 + +linux (2.6.24-14.25) hardy; urgency=low + + [Mario Limonciello] + + * Resolve sky2 race condition leading to failed suspends + - LP: #210877 + + [Tim Gardner] + + * Copy drivers/media internal header files into header + package for external LUM compilation. This paves the + way for LP #202065. 
+ + -- Tim Gardner Wed, 02 Apr 2008 08:28:32 -0600 + +linux (2.6.24-14.24) hardy; urgency=low + + [Amit Kucheria] + + * LPIA: Update from moblin + * LPIA: Fix reboot problem after S3/S4 + * LPIA: Integrate latest Dabney thermal patches + * LPIA: Change-umd_dbg-debug-level-to-KERN_INFO + * LPIA: Compile modules into kernel to save on boot time + * LPIA: lots of Dabney CONFIG options dissapeared + * LPIA: Purge nonexistent config options + + [Jay Chetty] + + * UBUNTU:USBC:Integrated USBC 2.0.0.32L.0009 + + [Misha Zhilin] + + * USB: ehci: handle large bulk URBs correctly (again) + - LP: #204857 + + [Tim Gardner] + + * frame buffer regression - screen blank except for blinking cursor after + fbcon vtswitch + - LP: #201591 + * Blacklist Bluetooth Dell Wireless 370 for SCO MTU + - LP: #209715 + * Set CONFIG_FAIR_CGROUP_SCHED for server flavours. + - LP: #188226 + * Add DMI IO_DELAY support. + - LP: #200057 + + -- Tim Gardner Mon, 31 Mar 2008 11:19:49 -0600 + +linux (2.6.24-13.23) hardy; urgency=low + + [Alessio Igor Bogani] + + * rt: Updated configuration files + + [Ben Collins] + + * openvz: New custom flavour for OpenVZ + * config: Disable IDE AMD driver in favor of PATA version + - LP: #181561 + * config: Disable IDE VIA driver in favor of PATA version + - LP: #181561 + * drivers/video: Restore gutsy backlight dimming behavior + - LP: #205261 + * build/config: Enable CONFIG_CIFS_WEAK_PW_HASH + - LP: #202445 + + [Colin Ian King] + + * SAUCE: Add support for version 4 of Chelsio NICs in cxgb3 driver + - LP: #201893 + + [Kees Cook] + + * AppArmor: re-add missing "type" field in syslog reports. + - LP: #202888 + * kvm: reset TSS on x86_64 to avoid ioperm bitmap corruption + - LP: #144900 + + [Stefan Bader] + + * USB: EHCI: add separate IAA watchdog timer + - LP: #198619 + * SAUCE: Always use SCO protocol (disable eSCO support) + - LP: #39414 + * PM: Introduce PM_EVENT_HIBERNATE callback state + - LP: #201086 + + [Tim Gardner] + + * Disable DRM suspend/resume on pre-915 Intel chips + - LP: #207496 + * frame buffer regression - screen blank except for blinking cursor after fbcon + vtswitch + - LP: #201591 + + -- Tim Gardner Wed, 19 Mar 2008 10:05:05 -0400 + +linux (2.6.24-12.22) hardy; urgency=low + + [Ben Collins] + + * custom/rt: Disable toshiba_acpi, since it isn't compatible + + -- Ben Collins Wed, 12 Mar 2008 14:38:59 -0400 + +linux (2.6.24-12.21) hardy; urgency=low + + [Ben Collins] + + * build: Fix vesafb module inclusion into initrd subdir + - LP: #129910 + * net/bluetooth: POWERBOOK => APPLE, fix for apple keyboard patch + * custom/xen: Remove asix portion of xen patch, breaks driver + - LP: #199296 + + [Colin Ian King] + + * SAUCE: fix Udma not fully available in Acer 1694 Wlmi + - LP: #187121 + * SAUCE: Update toshiba_acpi.c to version 0.19a + - LP: #77026 + + [Stefan Bader] + + * x86: Clear DF before calling signal handler + * Enable FN key on Apple aluminum bluetooth keyboard + - LP: #162083 + + -- Ben Collins Tue, 11 Mar 2008 13:20:49 -0400 + +linux (2.6.24-12.20) hardy; urgency=low + + [Ben Collins] + + * Enable CONFIG_SOUND at least, so alsa build in lum works + - LP: #200338 + + -- Ben Collins Mon, 10 Mar 2008 08:15:00 -0400 + +linux (2.6.24-12.19) hardy; urgency=low + + * Re-upload of -12.18 to fix build failures + * Fixup binary-custom configs + * Fixup xen patch to cope with kvm changes + + [Amit Kucheria] + + * Move Marvell 8686 and 8688 to LUM + * Poulsbo: Sync patches with moblin/ume-hardy tree + * Break if a patch fails to apply + * SAUCE: implement smarter atime updates 
support + - LP: #199427 + * Enable USB_PERSIST to allow devices with /root on usb to work with + suspend + * Enable USB_PERSIST across the board + + [Ben Collins] + + * build/config: Really fix ide on smp ppc configs + * build/configs: Enable relatime config option for all flavors + * build/abi: Ignore ide-core module for ppc, moved to built-in + + [Colin Ian King] + + * fix reversed logic for bbuild check leads to -j1 default + - LP: #197040 + * Enable IDE_PMAC for powerpc-smp + - LP: #196686 + * Disable CONFIG_USB_OHCI_HCD_SSB + - LP: #182716 + * SAUCE: fix arcmsr + archttp64 calls dma_free_coherent() with irqs + disabled - dmesg filled with warnings + - LP: #194207 + + [Jorge Boncompte [DTI2]] + + * Fix Messed multicast lists after dev_mc_sync/unsync + - LP: #193468 + + [Stefan Bader] + + * Add support for Apple Aluminium keyboards. + - LP: #162083 + * SAUCE: Restore VT fonts on switch + + [Upstream Kernel Changes] + + * [NET]: Messed multicast lists after dev_mc_sync/unsync + * KVM: x86 emulator: add support for group decoding + * KVM: x86 emulator: group decoding for group 1A + * KVM: x86 emulator: Group decoding for group 3 + * KVM: x86 emulator: Group decoding for groups 4 and 5 + * KVM: x86 emulator: add group 7 decoding + * KVM: constify function pointer tables + * KVM: Only x86 has pio + * KVM: x86 emulator: group decoding for group 1 instructions + * KVM: MMU: Decouple mmio from shadow page tables + * KVM: Limit vcpu mmap size to one page on non-x86 + * KVM: VMX: Enable Virtual Processor Identification (VPID) + * KVM: Use CONFIG_PREEMPT_NOTIFIERS around struct preempt_notifier + * KVM: Disable pagefaults during copy_from_user_inatomic() + * KVM: make EFER_RESERVED_BITS configurable for architecture code + * KVM: align valid EFER bits with the features of the host system + * KVM: allow access to EFER in 32bit KVM + * kvm: i386 fix + * KVM: export information about NPT to generic x86 code + * KVM: MMU: make the __nonpaging_map function generic + * KVM: export the load_pdptrs() function to modules + * KVM: MMU: add TDP support to the KVM MMU + * KVM: x86 emulator: Fix 'jmp abs' + * KVM: x86 emulator: fix group 5 decoding + * KVM: Fix kvm_arch_vcpu_ioctl_set_sregs so that set_cr0 works properly + * KVM: Make the supported cpuid list a host property rather than a vm + property + * KVM: emulate access to MSR_IA32_MCG_CTL + * KVM: remove the usage of the mmap_sem for the protection of the memory + slots. 
+ * KVM: SVM: allocate the MSR permission map per VCPU + * KVM: make MMU_DEBUG compile again + * KVM: paravirtualized clocksource: host part + * KVM: Add missing semicolon + * KVM: x86 emulator: add ad_mask static inline + * KVM: x86 emulator: make register_address, address_mask static inlines + * KVM: x86 emulator: make register_address_increment and JMP_REL static + inlines + * KVM: Add API to retrieve the number of supported vcpus per vm + * KVM: Increase vcpu count to 16 + * KVM: Add API for determining the number of supported memory slots + * KVM: Increase the number of user memory slots per vm + * KVM: Add stat counter for hypercalls + * KVM: x86 emulator: fix sparse warnings in x86_emulate.c + * KVM: sparse fixes for kvm/x86.c + * KVM: Implement dummy values for MSR_PERF_STATUS + * KVM: MMU: ignore zapped root pagetables + * KVM: call write_guest_time as soon as we register the paravirt clock + * KVM: MMU: large page support + * KVM: Prefix control register accessors with kvm_ to avoid namespace + pollution + * KVM: Avoid infinite-frequency local apic timer + * KVM: Route irq 0 to vcpu 0 exclusively + * KVM: SVM: add support for Nested Paging + * KVM: SVM: enable LBR virtualization + * KVM: SVM: make iopm_base static + * KVM: SVM: let init_vmcb() take struct vcpu_svm as parameter + * KVM: VMX: fix typo in VMX header define + * KVM: SVM: fix Windows XP 64 bit installation crash + * KVM: VMX: Fix invalid opcode of VPID + * KVM: VMX: Handle machines without EFER + * KVM: move alloc_apic_access_page() outside of non-preemptable region + * KVM: VMX: unifdef the EFER specific code + * KVM: SVM: move feature detection to hardware setup code + * KVM: Export include/linux/kvm.h only if $ARCH actually supports KVM + * dlm: fix rcom_names message to self + * virtio: Net header needs hdr_len + + -- Tim Gardner Mon, 03 Mar 2008 07:07:16 -0700 + +linux (2.6.24-11.17) hardy; urgency=low + + [Alan Cox] + + * Pull in fixes for pata_it821x. + - LP: #106931 + + [Alessio Igor Bogani] + + * rt: Synchronized with upstream (2.6.24.3-rt3) + * rt: Updated configuration files + + [Amit Kucheria] + + * Add AGP support for Radeon Mobility 9000 chipset + - LP: #178634 + * Bluetooth: SCO flow control to enable bluetooth headsets + + [Ben Collins] + + * binary: Include vesafs in initrd subdir, should fix vga= usage + + [Colin Ian King] + + * AMD SB700 south bridge support patches + - LP: #195354 + * BCM4311 Revision 2 fix + - LP: #184600 + + [Mauro Carvalho Chehab] + + * V4L/DVB (6753): Fix vivi to support non-zero minor node + + [Tim Gardner] + + * Merged 2.6.24.3 + * Add atl1 to d-i bits. 
+ - LP: #159561 + * SAUCE: Add xpad support for RedOctane Guitar Hero + - LP: #196745 + + [Upstream Kernel Changes] + + * DVB: cx23885: add missing subsystem ID for Hauppauge HVR1800 Retail + * slab: fix bootstrap on memoryless node + * vm audit: add VM_DONTEXPAND to mmap for drivers that need it + (CVE-2008-0007) + * USB: keyspan: Fix oops + * usb gadget: fix fsl_usb2_udc potential OOPS + * USB: CP2101 New Device IDs + * USB: add support for 4348:5523 WinChipHead USB->RS 232 adapter + * USB: Sierra - Add support for Aircard 881U + * USB: Adding YC Cable USB Serial device to pl2303 + * USB: sierra driver - add devices + * USB: ftdi_sio - enabling multiple ELV devices, adding EM1010PC + * USB: ftdi-sio: Patch to add vendor/device id for ATK_16IC CCD + * USB: sierra: add support for Onda H600/Zte MF330 datacard to USB Driver + for Sierra Wireless + * USB: remove duplicate entry in Option driver and Pl2303 driver for + Huawei modem + * USB: pl2303: add support for RATOC REX-USB60F + * USB: ftdi driver - add support for optical probe device + * USB: use GFP_NOIO in reset path + * USB: Variant of the Dell Wireless 5520 driver + * USB: storage: Add unusual_dev for HP r707 + * USB: fix usbtest halt check on big endian systems + * USB: handle idVendor of 0x0000 + * USB: Fix usb_serial_driver structure for Kobil cardreader driver. + * forcedeth: mac address mcp77/79 + * lockdep: annotate epoll + * sys_remap_file_pages: fix ->vm_file accounting + * PCI: Fix fakephp deadlock + * ACPI: update ACPI blacklist + * x86: restore correct module name for apm + * sky2: restore multicast addresses after recovery + * sky2: fix for WOL on some devices + * b43: Fix suspend/resume + * b43: Drop packets we are not able to encrypt + * b43: Fix dma-slot resource leakage + * b43legacy: fix PIO crash + * b43legacy: fix suspend/resume + * b43legacy: drop packets we are not able to encrypt + * b43legacy: fix DMA slot resource leakage + * selinux: fix labeling of /proc/net inodes + * b43: Reject new firmware early + * sched: let +nice tasks have smaller impact + * sched: fix high wake up latencies with FAIR_USER_SCHED + * fix writev regression: pan hanging unkillable and un-straceable + * Driver core: Revert "Fix Firmware class name collision" + * drm: the drm really should call pci_set_master.. + * splice: missing user pointer access verification (CVE-2008-0009/10) + * Linux 2.6.24.1 + * splice: fix user pointer access in get_iovec_page_array() + * Linux 2.6.24.2 + * ACPI: video: Rationalise ACPI backlight implementation + * ACPI: video: Ignore ACPI video devices that aren't present in hardware + * SPARC/SPARC64: Fix usage of .section .sched.text in assembler code. + * NETFILTER: nf_conntrack_tcp: conntrack reopening fix + * NFS: Fix a potential file corruption issue when writing + * inotify: fix check for one-shot watches before destroying them + * hugetlb: add locking for overcommit sysctl + * XFS: Fix oops in xfs_file_readdir() + * Fix dl2k constants + * SCSI: sd: handle bad lba in sense information + * TCP: Fix a bug in strategy_allowed_congestion_control + * TC: oops in em_meta + * SELinux: Fix double free in selinux_netlbl_sock_setsid() + * PKT_SCHED: ematch: oops from uninitialized variable (resend) + * NET: Add if_addrlabel.h to sanitized headers. + * IPV4: fib_trie: apply fixes from fib_hash + * IPV4: fib: fix route replacement, fib_info is shared + * IPCOMP: Fix reception of incompressible packets + * IPCOMP: Fetch nexthdr before ipch is destroyed + * INET_DIAG: Fix inet_diag_lock_handler error path. 
+ * INET: Prevent out-of-sync truesize on ip_fragment slow path + * BLUETOOTH: Add conn add/del workqueues to avoid connection fail. + * AUDIT: Increase skb->truesize in audit_expand + * Be more robust about bad arguments in get_user_pages() + * Disable G5 NAP mode during SMU commands on U3 + * hrtimer: fix *rmtp handling in hrtimer_nanosleep() + * hrtimer: fix *rmtp/restarts handling in compat_sys_nanosleep() + * SLUB: Deal with annoying gcc warning on kfree() + * hrtimer: check relative timeouts for overflow + * hrtimer: catch expired CLOCK_REALTIME timers early + * genirq: do not leave interupts enabled on free_irq + * S390: Fix futex_atomic_cmpxchg_std inline assembly. + * USB: fix pm counter leak in usblp + * SCSI: gdth: scan for scsi devices + * PCMCIA: Fix station address detection in smc + * POWERPC: Revert chrp_pci_fixup_vt8231_ata devinit to fix libata on + pegasos + * bonding: fix NULL pointer deref in startup processing + * x86_64: CPA, fix cache attribute inconsistency bug + * Linux 2.6.24.3 + + -- Tim Gardner Mon, 25 Feb 2008 12:28:13 -0700 + +linux (2.6.24-10.16) hardy; urgency=low + + [Alessio Igor Bogani] + + * rt: Synchronized with upstream (2.6.24.2-rt2) + * rt: Updated configuration files + + [Eric Piel] + + * SAUCE: ACPI: Allow custom DSDT tables to be loaded from initramfs + Amit Kucheria consolidated the DSDT patch with another fix that + ifdefs symbols required when BLK_DEV_INITR is disabled. + + [Stefan Bader] + + * Add Optiarc DVD drive to audio quirks list. + - LP: #186664 + * Update drm and i915 drm driver to fix suspend issues. + - LP: #189260 + + [Tim Gardner] + + * Fix FTBS without BLK_DEV_INITRD + - LP: #193507 + * 64 bit CPA cache attribute bug + - LP: #193736 + * Implemented default EDD control + + [Upstream Kernel Changes] + + * bonding: fix NULL pointer deref in startup processing + * dlm: bind connections from known local address when using TCP + * dlm: proper prototypes + * dlm: don't print common non-errors + * dlm: use dlm prefix on alloc and free functions + * dlm: close othercons + * dlm: align midcomms message buffer + * dlm: swap bytes for rcom lock reply + * dlm: use fixed errno values in messages + * dlm: clear ast_type when removing from astqueue + * dlm: recover locks waiting for overlap replies + * dlm: another call to confirm_master in receive_request_reply + * dlm: reject messages from non-members + * dlm: validate messages before processing + * dlm: reject normal unlock when lock is waiting for lookup + * dlm: limit dir lookup loop + * dlm: fix possible use-after-free + * dlm: change error message to debug + * dlm: keep cached master rsbs during recovery + * dlm: Sanity check namelen before copying it + * dlm: clean ups + * dlm: static initialization improvements + * dlm: use proper C for dlm/requestqueue stuff (and fix alignment bug) + * dlm: dlm_process_incoming_buffer() fixes + * dlm: do not byteswap rcom_lock + * dlm: do not byteswap rcom_config + * dlm: use proper type for ->ls_recover_buf + * dlm: missing length check in check_config() + * dlm: validate data in dlm_recover_directory() + * dlm: verify that places expecting rcom_lock have packet long enough + * dlm: receive_rcom_lock_args() overflow check + * dlm: make find_rsb() fail gracefully when namelen is too large + * dlm: fix overflows when copying from ->m_extra to lvb + * dlm: fix dlm_dir_lookup() handling of too long names + * dlm: dlm/user.c input validation fixes + * dlm: proper types for asts and basts + * dlm: eliminate astparam type casting + * dlm: add __init and 
__exit marks to init and exit functions + * virtio: Use PCI revision field to indicate virtio PCI ABI version + + -- Tim Gardner Tue, 19 Feb 2008 09:57:18 -0700 + +linux (2.6.24-9.15) hardy; urgency=low + + [Alessio Igor Bogani] + + * rt: Fix FTBS + * rt: Updated configuration files + + [Tim Gardner] + + * SAUCE: make /dev/kmem a config option + * SAUCE: x86: introduce /dev/mem restrictions with a config option + * Fixed CGROUP FTBS caused by AppArmor patch. + * Enabled CGROUP and CPUSETS for server flavor. + - LP: #182434 + + [Colin King] + + * Turn on /proc/acpi/alarm for x86_64 (amd64) + - LP: #186297 + + [Upstream Kernel Changes] + + * Ubuntu: LatencyTOP infrastructure patch + + -- Tim Gardner Thu, 14 Feb 2008 13:34:55 -0700 + +linux (2.6.24-8.14) hardy; urgency=low + + [cking] + + * Support Novatel U727 EVDO modem: Add pid and vid to + drivers/usb/serial/airprime.c + - LP: #150996 + * Enable speedstep for sonoma processors. + - LP: #132271 + + [Stefan Bader] + + * SAUCE: Export dm_disk function of device-mapper + + -- Tim Gardner Wed, 13 Feb 2008 21:47:18 -0700 + +linux (2.6.24-8.13) hardy; urgency=low + + [Soren Hansen] + + * Add missing iscsi modules to kernel udebs + + [Stefan Bader] + + * Lower message level for PCI memory and I/O allocation. + + [Tim Gardner] + + * Enabled IP_ADVANCED_ROUTER and IP_MULTIPLE_TABLES in sparc, hppa + - LP: #189560 + * Compile RealTek 8139 using PIO method. + - LP: #90271 + * Add WD WD800ADFS NCQ horkage quirk support. + - LP: #147858 + + [Upstream Kernel Changes] + + * Introduce WEXT scan capabilities + * DVB: cx23885: add missing subsystem ID for Hauppauge HVR1800 Retail + * slab: fix bootstrap on memoryless node + * vm audit: add VM_DONTEXPAND to mmap for drivers that need it + (CVE-2008-0007) + * USB: keyspan: Fix oops + * usb gadget: fix fsl_usb2_udc potential OOPS + * USB: CP2101 New Device IDs + * USB: add support for 4348:5523 WinChipHead USB->RS 232 adapter + * USB: Sierra - Add support for Aircard 881U + * USB: Adding YC Cable USB Serial device to pl2303 + * USB: sierra driver - add devices + * USB: ftdi_sio - enabling multiple ELV devices, adding EM1010PC + * USB: ftdi-sio: Patch to add vendor/device id for ATK_16IC CCD + * USB: sierra: add support for Onda H600/Zte MF330 datacard to USB Driver + for Sierra Wireless + * USB: remove duplicate entry in Option driver and Pl2303 driver for + Huawei modem + * USB: pl2303: add support for RATOC REX-USB60F + * USB: ftdi driver - add support for optical probe device + * USB: use GFP_NOIO in reset path + * USB: Variant of the Dell Wireless 5520 driver + * USB: storage: Add unusual_dev for HP r707 + * USB: fix usbtest halt check on big endian systems + * USB: handle idVendor of 0x0000 + * forcedeth: mac address mcp77/79 + * lockdep: annotate epoll + * sys_remap_file_pages: fix ->vm_file accounting + * PCI: Fix fakephp deadlock + * ACPI: update ACPI blacklist + * x86: restore correct module name for apm + * sky2: restore multicast addresses after recovery + * sky2: fix for WOL on some devices + * b43: Fix suspend/resume + * b43: Drop packets we are not able to encrypt + * b43: Fix dma-slot resource leakage + * b43legacy: fix PIO crash + * b43legacy: fix suspend/resume + * b43legacy: drop packets we are not able to encrypt + * b43legacy: fix DMA slot resource leakage + * selinux: fix labeling of /proc/net inodes + * b43: Reject new firmware early + * sched: let +nice tasks have smaller impact + * sched: fix high wake up latencies with FAIR_USER_SCHED + * fix writev regression: pan hanging unkillable 
and un-straceable + * Driver core: Revert "Fix Firmware class name collision" + * drm: the drm really should call pci_set_master.. + * splice: missing user pointer access verification (CVE-2008-0009/10) + * Linux 2.6.24.1 + * splice: fix user pointer access in get_iovec_page_array() + * Linux 2.6.24.2 + + -- Tim Gardner Thu, 07 Feb 2008 06:50:13 -0700 + +linux (2.6.24-7.12) hardy; urgency=low + + [Jay Chetty] + + * Added patch to fix legacy USB interrupt issue + * Enabled Poulsbo PATA udma5 support + * Add touchscreen doubleclick workaround + + [Amit Kucheria] + + * Add AGP support for Radeon Mobility 9000 chipset + - LP: #178634 + + [Soren Hansen] + + * Add virtio modules to the relevant udebs + * Add missing "?" for virtio modules in storage-core-modules + + [Stefan Bader] + + * Added vendor id for Dell 5720 broadband modem + + -- Jay Chetty Wed, 06 Feb 2008 14:13:41 -0800 + +linux (2.6.24-7.11) hardy; urgency=low + + [Jay Chetty] + + * poulsbo: Add a 100ms delay for SiB workaround + + [Tim Gardner] + + * -6.10 should have been an ABI bump, but due to incomplete build testing + went undetected. + + -- Tim Gardner Mon, 04 Feb 2008 19:13:52 -0700 + +linux (2.6.24-6.10) hardy; urgency=low + + [Alessio Igor Bogani] + + * rt: Synced with upstream, removed old kvm related patches and updated + configurations files. + + [Chuck Short] + + * SAUCE: Enable Xen + + [Soren Hansen] + + * Update kvm driver to kvm-60. + * Added CONFIG_ARCH_SUPPORTS_KVM=y for lpia, i386, and amd64 + * Add rtl8139 driver to -virtual flavour + + [Stefan Bader] + + * Fix usb_serial_driver structure for Kobil cardreader driver. + - LP: #183109 + * Lower warning level of pci resource allocation messages. + - LP: #159241 + + [Tim Gardner] + + * Enabled CONFIG_BLK_DEV_IDE_PMAC + - LP: #185862 + * Add virtio config options to lpiacompat. + * SAUCE: Export symbols for aufs (in lum). + * Enabled Xen + + [Upstream Kernel Changes] + + * KVM: mmu: add missing dirty page tracking cases + * KVM: Move virtualization deactivation from CPU_DEAD state to + CPU_DOWN_PREPARE + * KVM: Cosmetics + * KVM: vmx: hack set_cr0_no_modeswitch() to actually do modeswitch + * KVM: Use ARRAY_SIZE macro instead of manual calculation. 
+ * KVM: Use page_private()/set_page_private() apis + * KVM: add MSR based hypercall API + * KVM: Add host hypercall support for vmx + * KVM: Add hypercall host support for svm + * KVM: Wire up hypercall handlers to a central arch-independent location + * KVM: svm: init cr0 with the wp bit set + * KVM: SVM: intercept SMI to handle it at host level + * KVM: More 0 -> NULL conversions + * kvm, dirty pages log: adding some calls to mark_page_dirty() + * KVM: Add internal filesystem for generating inodes + * KVM: Create an inode per virtual machine + * KVM: Rename some kvm_dev_ioctl_*() functions to kvm_vm_ioctl_*() + * KVM: Move kvm_vm_ioctl_create_vcpu() around + * KVM: Per-vcpu inodes + * KVM: Bump API version + * .gitignore: ignore emacs backup files (*~) + * kvm: dirty pages log: fix bitmap size/access calculation + * kvm: move do_remove_write_access() up + * kvm: dirty page logging: remove write access permissions when + dirty-page-logging is enabled + * KVM: Add missing calls to mark_page_dirty() + * KVM: Fix dirty page log bitmap size/access calculation + * kvm: move do_remove_write_access() up + * KVM: Remove write access permissions when dirty-page-logging is enabled + * KVM: Fix bogus failure in kvm.ko module initialization + * KVM: Move kvmfs magic number to + * KVM: Unset kvm_arch_ops if arch module loading failed + * KVM: Fix guest register corruption on paravirt hypercall + * KVM: Use the generic skip_emulated_instruction() in hypercall code + * KVM: Use own minor number + * KVM: Fix guest sysenter on vmx + * KVM: Export + * KVM: Fix bogus sign extension in mmu mapping audit + * KVM: MMU: Fix guest writes to nonpae pde + * KVM: MMU: Fix host memory corruption on i386 with >= 4GB ram + * KVM: trivial whitespace fixes + * KVM: always reload segment selectors + * KVM: Remove extraneous guest entry on mmio read + * added KVM_GET_MEM_MAP ioctl to get the memory bitmap for a memory slot + * KVM: Prevent system selectors leaking into guest on real->protected + mode transition on vmx + * KVM: Use a shared page for kernel/user communication when runing a vcpu + * KVM: Do not communicate to userspace through cpu registers during PIO + * KVM: Initialize PIO I/O count + * KVM: Handle cpuid in the kernel instead of punting to userspace + * KVM: Remove the 'emulated' field from the userspace interface + * KVM: Remove minor wart from KVM_CREATE_VCPU ioctl + * KVM: Renumber ioctls + * KVM: Add method to check for backwards-compatible API extensions + * KVM: Allow userspace to process hypercalls which have no kernel handler + * KVM: Fold kvm_run::exit_type into kvm_run::exit_reason + * KVM: Add a special exit reason when exiting due to an interrupt + * KVM: Initialize the apic_base msr on svm too + * KVM: Add guest mode signal mask + * KVM: Allow kernel to select size of mmap() buffer + * KVM: Future-proof argument-less ioctls + * KVM: Avoid guest virtual addresses in string pio userspace interface + * KVM: MMU: Remove unnecessary check for pdptr access + * KVM: MMU: Remove global pte tracking + * KVM: Workaround vmx inability to virtualize the reset state + * KVM: Remove set_cr0_no_modeswitch() arch op + * KVM: Modify guest segments after potentially switching modes + * KVM: Hack real-mode segments on vmx from KVM_SET_SREGS + * KVM: Don't allow the guest to turn off the cpu cache + * KVM: Remove unused and write-only variables + * KVM: Handle writes to MCG_STATUS msr + * KVM: MMU: Fix hugepage pdes mapping same physical address with + different access + * KVM: SVM: Ensure timestamp counter 
monotonicity + * KVM: Remove unused function + * KVM: Remove debug message + * KVM: x86 emulator: fix bit string operations operand size + * KVM: SVM: enable LBRV virtualization if available + * Add mmu cache clear function + * KVM: Simply gfn_to_page() + * KVM: Add physical memory aliasing feature + * KVM: Add fpu get/set operations + * KVM: Use kernel-standard types + * KVM: Fix overflow bug in overflow detection code + * KVM: Fix memory leak on pio completion + * KVM: Handle partial pae pdptr + * KVM: Fix string pio when count == 0 + * KVM: Use slab caches to allocate mmu data structures + * KVM: Retry sleeping allocation if atomic allocation fails + * KVM: Fix pio completion + * KVM: SVM: Report hardware exit reason to userspace instead of dmesg + * KVM: Handle guest page faults when emulating mmio + * KVM: VMX: Reduce unnecessary saving of host msrs + * KVM: Fix off-by-one when writing to a nonpae guest pde + * KVM: VMX: Don't switch 64-bit msrs for 32-bit guests + * KVM: Fold drivers/kvm/kvm_vmx.h into drivers/kvm/vmx.c + * KVM: VMX: Only save/restore MSR_K6_STAR if necessary + * KVM: Per-vcpu statistics + * KVM: Silence compile warning on i386 + * KVM: Allow passing 64-bit values to the emulated read/write API + * KVM: Lazy FPU support for SVM + * KVM: Fix msr-avoidance regression on Core processors + * KVM: Don't complain about cpu erratum AA15 + * KVM: Document MSR_K6_STAR's special place in the msr index array + * KVM: MMU: Avoid heavy ASSERT at non debug mode. + * KVM: Initialize cr0 to indicate an fpu is present + * KVM: We want asserts on debug builds, not release + * KVM: Avoid unused function warning due to assertion removal + * KVM: VMX: Avoid unnecessary vcpu_load()/vcpu_put() cycles + * KVM: Move need_resched() check to common code + * KVM: VMX: Properly shadow the CR0 register in the vcpu struct + * KVM: VMX: Add lazy FPU support for VT + * KVM: fix an if() condition + * KVM: SVM: Only save/restore MSRs when needed + * KVM: Remove trailing whitespace + * KVM: Remove extraneous guest entry on mmio read + * KVM: Don't require explicit indication of completion of mmio or pio + * KVM: Remove unused 'instruction_length' + * KVM: VMX: Enable io bitmaps to avoid IO port 0x80 VMEXITs + * KVM: SVM: Allow direct guest access to PC debug port + * KVM: Fix RMW mmio handling + * KVM: Assume that writes smaller than 4 bytes are to non-pagetable pages + * KVM: Avoid saving and restoring some host CPU state on lightweight + vmexit + * KVM: Unindent some code + * KVM: Reduce misfirings of the fork detector + * KVM: Be more careful restoring fs on lightweight vmexit + * KVM: Unify kvm_mmu_pre_write() and kvm_mmu_post_write() + * KVM: MMU: Respect nonpae pagetable quadrant when zapping ptes + * KVM: Update shadow pte on write to guest pte + * KVM: Increase mmu shadow cache to 1024 pages + * KVM: Fix potential guest state leak into host + * KVM: Prevent guest fpu state from leaking into the host + * KVM: Move some more msr mangling into vmx_save_host_state() + * KVM: Rationalize exception bitmap usage + * KVM: Consolidate guest fpu activation and deactivation + * KVM: Ensure host cr0.ts is saved + * KVM: Set cr0.mp for guests + * KVM: Implement IA32_EBL_CR_POWERON msr + * KVM: MMU: Simplify kvm_mmu_free_page() a tiny bit + * KVM: MMU: Store shadow page tables as kernel virtual addresses, not + physical + * KVM: VMX: Only reload guest msrs if they are already loaded + * KVM: Avoid corrupting tr in real mode + * KVM: Fix vmx I/O bitmap initialization on highmem systems + * KVM: Remove merge 
artifact + * KVM: VMX: Use local labels in inline assembly + * KVM: VMX: Handle #SS faults from real mode + * KVM: VMX: Avoid saving and restoring msrs on lightweight vmexit + * KVM: VMX: Compile-fix for 32-bit hosts + * KVM: VMX: Cleanup redundant code in MSR set + * KVM: VMX: Fix a typo which mixes X86_64 and CONFIG_X86_64 + * KVM: VMX: Avoid saving and restoring msr_efer on lightweight vmexit + * KVM: VMX: Remove warnings on i386 + * Use menuconfig objects II - KVM/Virt + * KVM: x86 emulator: implement wbinvd + * KVM: Fix includes + * KVM: Use symbolic constants instead of magic numbers + * KVM: MMU: Use slab caches for shadow pages and their headers + * KVM: MMU: Simplify fetch() a little bit + * KVM: MMU: Move set_pte_common() to pte width dependent code + * KVM: MMU: Pass the guest pde to set_pte_common + * KVM: MMU: Fold fix_read_pf() into set_pte_common() + * KVM: MMU: Fold fix_write_pf() into set_pte_common() + * KVM: Move shadow pte modifications from set_pte/set_pde to + set_pde_common() + * KVM: Make shadow pte updates atomic + * KVM: MMU: Make setting shadow ptes atomic on i386 + * KVM: MMU: Remove cr0.wp tricks + * KVM: MMU: Simpify accessed/dirty/present/nx bit handling + * KVM: MMU: Don't cache guest access bits in the shadow page table + * KVM: MMU: Remove unused large page marker + * KVM: VMX: Fix asm constraint + * KVM: Lazy guest cr3 switching + * KVM: Replace C code with call to ARRAY_SIZE() macro. + * KVM: Remove unnecessary initialization and checks in mark_page_dirty() + * KVM: Fix vcpu freeing for guest smp + * KVM: Fix adding an smp virtual machine to the vm list + * KVM: Enable guest smp + * KVM: Move duplicate halt handling code into kvm_main.c + * KVM: Emulate hlt on real mode for Intel + * KVM: Keep an upper bound of initialized vcpus + * KVM: Flush remote tlbs when reducing shadow pte permissions + * KVM: SVM: Replace memset(, 0, PAGESIZE) with clear_page() + * KVM: VMX: Replace memset(, 0, PAGESIZE) with clear_page() + * KVM: Require a cpu which can set 64-bit values atomically + * KVM: Initialize the BSP bit in the APIC_BASE msr correctly + * KVM: VMX: Ensure vcpu time stamp counter is monotonous + * KVM: Bring local tree in line with origin + * KVM: Implement emulation of "pop reg" instruction (opcode 0x58-0x5f) + * KVM: Implement emulation of instruction "ret" (opcode 0xc3) + * KVM: Adds support for in-kernel mmio handlers + * KVM: VMX: Fix interrupt checking on lightweight exit + * KVM: Add support for in-kernel pio handlers + * KVM: Fix x86 emulator writeback + * KVM: Avoid useless memory write when possible + * KVM: VMX: Reinitialize the real-mode tss when entering real mode + * KVM: MMU: Fix Wrong tlb flush order + * KVM: VMX: Remove unnecessary code in vmx_tlb_flush() + * KVM: SVM: Reliably detect if SVM was disabled by BIOS + * KVM: Remove kvmfs in favor of the anonymous inodes source + * KVM: Clean up #includes + * KVM: Fix svm availability check miscompile on i386 + * HOTPLUG: Add CPU_DYING notifier + * HOTPLUG: Adapt cpuset hotplug callback to CPU_DYING + * HOTPLUG: Adapt thermal throttle to CPU_DYING + * SMP: Implement on_cpu() + * KVM: Keep track of which cpus have virtualization enabled + * KVM: Tune hotplug/suspend IPIs + * KVM: Use CPU_DYING for disabling virtualization + * KVM: MMU: Store nx bit for large page shadows + * KVM: Fix *nopage() in kvm_main.c + * KVM: SMP: Add vcpu_id field in struct vcpu + * KVM - add hypercall nr to kvm_run + * KVM:: Future-proof the exit information union ABI + * KVM: In-kernel string pio write support + * 
KVM: Fix memory slot management functions for guest smp + * KVM: x86 emulator: implement rdmsr and wrmsr + * KVM: Trivial: /dev/kvm interface is no longer experimental. + * KVM: Trivial: Remove unused struct cpu_user_regs declaration + * KVM: Trivial: Make decode_register() static + * KVM: Trivial: Comment spelling may escape grep + * KVM: Trivial: Avoid hardware_disable predeclaration + * KVM: Trivial: Use standard CR0 flags macros from asm/cpu-features.h + * Use standard CR3 flags, tighten checking + * Use standard CR4 flags, tighten checking + * KVM: Trivial: Use standard BITMAP macros, open-code userspace-exposed + header + * KVM: Set exit_reason to KVM_EXIT_MMIO where run->mmio is initialized. + * KVM: Use standard CR8 flags, and fix TPR definition + * KVM: MMU: Fix oopses with SLUB + * KVM: x86 emulator: fix cmov for writeback changes + * KVM: MMU: Fix cleaning up the shadow page allocation cache + * KVM: Require CONFIG_ANON_INODES + * KVM: x86 emulator: fix faulty check for two-byte opcode + * KVM: Correctly handle writes crossing a page boundary + * KVM: Fix unlikely kvm_create vs decache_vcpus_on_cpu race + * KVM: Hoist kvm_mmu_reload() out of the critical section + * KVM: Fix removal of nx capability from guest cpuid + * KVM: Move gfn_to_page out of kmap/unmap pairs + * KVM: disable writeback for 0x0f 0x01 instructions. + * KVM: VMX: Import some constants of vmcs from IA32 SDM + * KVM: Remove dead code in the cmpxchg instruction emulation + * KVM: load_pdptrs() cleanups + * KVM: Remove arch specific components from the general code + * KVM: Dynamically allocate vcpus + * KVM: VMX: Improve the method of writing vmcs control + * KVM: Use the scheduler preemption notifiers to make kvm preemptible + * KVM: Convert vm lock to a mutex + * KVM: fx_init() needs preemption disabled while it plays with the FPU + state + * KVM: VMX: pass vcpu_vmx internally + * KVM: Remove three magic numbers + * KVM: SVM: de-containization + * KVM: SVM: internal function name cleanup + * KVM: x86 emulator: disable writeback for debug register instructions + * KVM: Change the emulator_{read,write,cmpxchg}_* functions to take a + vcpu + * KVM: Remove kvm_{read,write}_guest() + * KVM: Use kmem cache for allocating vcpus + * KVM: Use alignment properties of vcpu to simplify FPU ops + * KVM: kvm_vm_ioctl_get_dirty_log restore "nothing dirty" optimization + * KVM: VMX: Add cpu consistency check + * KVM: Don't assign vcpu->cr3 if it's invalid: check first, set last + * KVM: Cleanup mark_page_dirty + * KVM: SVM: Make set_msr_interception more reliable + * KVM: Remove redundant alloc_vmcs_cpu declaration + * KVM: Fix defined but not used warning in drivers/kvm/vmx.c + * KVM: Remove stat_set from debugfs + * KVM: Remove unneeded kvm_dev_open and kvm_dev_release functions. 
+ * KVM: Add and use pr_unimpl for standard formatting of unimplemented + features + * KVM: Use kmem_cache_free for kmem_cache_zalloc'ed objects + * KVM: VMX: Remove a duplicated ia32e mode vm entry control + * KVM: Remove useless assignment + * KVM: Cleanup string I/O instruction emulation + * KVM: Clean up kvm_setup_pio() + * KVM: VMX: Don't require cr8 load/store exit capability when running on + 32-bit + * KVM: Close minor race in signal handling + * KVM: Communicate cr8 changes to userspace + * KVM: x86 emulator: implement 'and $imm, %{al|ax|eax}' + * KVM: x86 emulator: implement 'jmp rel' instruction (opcode 0xe9) + * KVM: x86 emulator: Implement 'jmp rel short' instruction (opcode 0xeb) + * KVM: x86 emulator: implement 'push reg' (opcodes 0x50-0x57) + * KVM: VMX: allow rmode_tss_base() to work with >2G of guest memory + * KVM: Avoid calling smp_call_function_single() with interrupts disabled + * KVM: MMU: Fix rare oops on guest context switch + * KVM: Support more memory slots + * KVM: X86 emulator: fix 'push reg' writeback + * KVM: VMX: Split segments reload in vmx_load_host_state() + * KVM: Add support for in-kernel PIC emulation + * KVM: Define and use cr8 access functions + * KVM: Emulate local APIC in kernel + * KVM: In-kernel I/O APIC model + * KVM: Emulate hlt in the kernel + * KVM: Protect in-kernel pio using kvm->lock + * KVM: Add get/set irqchip ioctls for in-kernel PIC live migration + support + * KVM: Bypass irq_pending get/set when using in kernel irqchip + * KVM: in-kernel IOAPIC save and restore support + * KVM: in-kernel LAPIC save and restore support + * KVM: pending irq save/restore + * KVM: VMX: Use shadow TPR/cr8 for 64-bits guests + * KVM: Keep track of missed timer irq injections + * KVM: Migrate lapic hrtimer when vcpu moves to another cpu + * KVM: disable tpr/cr8 sync when in-kernel APIC is used + * KVM: VMX: Fix tpr threshold updating + * KVM: deliver PIC interrupt only to vcpu0 + * KVM: round robin for APIC lowest priority delivery mode + * KVM: enable in-kernel APIC INIT/SIPI handling + * KVM: Set the ET flag in CR0 after initializing FX + * KVM: Remove the unused invlpg member of struct kvm_arch_ops. + * KVM: Clean up unloved invlpg emulation + * KVM: Keep control regs in sync + * KVM: Hoist SVM's get_cs_db_l_bits into core code. 
+ * KVM: Simplify memory allocation + * KVM: Rename kvm_arch_ops to kvm_x86_ops + * KVM: Fix lapic 64-bit division on 32-bit hosts + * KVM: fix apic timer migration when inactive + * KVM: MMU: Don't do GFP_NOWAIT allocations + * KVM: Remove smp_processor_id() in kvm_vcpu_kick() + * KVM: VMX: Move vm entry failure handling to the exit handler + * KVM: Move main vcpu loop into subarch independent code + * KVM: Fix link error to "genapic" + * KVM: VMX: Fix exit qualification width on i386 + * KVM: x86 emulator: push imm8 + * KVM: x86 emulator: call near + * KVM: x86 emulator: pushf + * KVM: Improve emulation failure reporting + * KVM: VMX: Prevent setting CPU_BASED_TPR_SHADOW on i386 host + * KVM: x86 emulator: sort opcodes into ascending order + * KVM: x86 emulator: imlpement jump conditional relative + * KVM: X86 emulator: jump conditional short + * KVM: x86 emulator: lea + * KVM: x86 emulator: jmp abs + * KVM: x86 emulator: fix src, dst value initialization + * KVM: x86 emulator: popf + * KVM: Skip pio instruction when it is emulated, not executed + * KVM: fix PIC interrupt delivery on different APIC conditions + * KVM: Fix kvm_vcpu_ioctl_get_sregs() warning on i386 + * KVM: Remove errant printk() in kvm_vcpu_ioctl_get_sregs() + * KVM: Fix virtualization menu help text + * KVM: x86 emulator: Add vmmcall/vmcall to x86_emulate (v3) + * KVM: Refactor hypercall infrastructure (v3) + * KVM: x86 emulator: remove unused functions + * KVM: x86 emulator: move all x86_emulate_memop() to a structure + * KVM: x86 emulator: move all decoding process to function + x86_decode_insn() + * KVM: emulate_instruction() calls now x86_decode_insn() and + x86_emulate_insn() + * KVM: Call x86_decode_insn() only when needed + * KVM: Fix ioapic level-triggered interrupt redelivery + * KVM: Fix #UD exception delivery + * KVM: VMX: Further reduce efer reloads + * KVM: VMX: Fix build on i386 due to EFER_LMA not defined + * KVM: Fix ioapic.c compilation failure due to missing include + * KVM: x86 emulator: fix merge screwup due to emulator split + * KVM: x85 emulator: Correct inconcistency in between cr2 and ctxt->cr2. + * KVM: Avoid redelivery of edge-triggered irq if it is already in service + * KVM: Implement ioapic irq polarity bit + * KVM: x86 emulator: fix repne/repnz decoding + * KVM: Fix host oops due to guest changing efer + * KVM: Fix ioapic edge-triggered interrupts + * KVM: MMU: Set shadow pte atomically in mmu_pte_write_zap_pte() + * KVM: Allow not-present guest page faults to bypass kvm + * KVM: MMU: Make flooding detection work when guest page faults are + bypassed + * KVM: MMU: Ignore reserved bits in cr3 in non-pae mode + * KVM: x86 emulator: split some decoding into functions for readability + * KVM: x86 emulator: remove _eflags and use directly ctxt->eflags. 
+ * KVM: x86 emulator: Remove no_wb, use dst.type = OP_NONE instead + * KVM: x86_emulator: no writeback for bt + * KVM: apic round robin cleanup + * KVM: Purify x86_decode_insn() error case management + * KVM: x86 emulator: Any legacy prefix after a REX prefix nullifies its + effect + * i386: Expose IOAPIC register definitions even if CONFIG_X86_IO_APIC is + not set + * KVM: x86 emulator: On a pop instruction, don't restore ECX and EIP on + error + * KVM: x86 emulator: remove unused variable + * KVM: VMX: Don't clear the vmcs if the vcpu is not loaded on any + processor + * KVM: VMX: Simplify vcpu_clear() + * KVM: Remove the usage of paeg->private field by rmap + * KVM: x86 emulator: Correct management of REP prefix + * KVM: Add general accessors to read and write guest memory + * KVM: Allow dynamic allocation of the mmu shadow cache size + * KVM: Check I/O APIC indirect index before writing + * KVM: Add kvm_free_lapic() to pair with kvm_create_lapic() + * KVM: Hoist kvm_create_lapic() into kvm_vcpu_init() + * KVM: Remove gratuitous casts from lapic.c + * KVM: CodingStyle cleanup + * KVM: VMX: Handle NMIs before enabling interrupts and preemption + * KVM: Support assigning userspace memory to the guest + * KVM: Export PIC reset for kernel device reset + * KVM: Split IOAPIC reset function and export for kernel RESET + * KVM: VMX: Reset mmu context when entering real mode + * KVM: Replace enum by #define + * KVM: Move x86 msr handling to new files x86.[ch] + * KVM: MMU: Clean up MMU functions to take struct kvm when appropriate + * KVM: MMU: More struct kvm_vcpu -> struct kvm cleanups + * KVM: Move guest pte dirty bit management to the guest pagetable walker + * KVM: MMU: Fix nx access bit for huge pages + * KVM: MMU: Disable write access on clean large pages + * KVM: MMU: Instatiate real-mode shadows as user writable shadows + * KVM: MMU: Move dirty bit updates to a separate function + * KVM: MMU: When updating the dirty bit, inform the mmu about it + * KVM: Portability: split kvm_vcpu_ioctl + * KVM: Restore missing #include + * KVM: Add some \n in ioapic_debug() + * KVM: x86 emulator: implement 'movnti mem, reg' + * KVM: MMU: Call update_dirty_bit() without disabling preemption + * KVM: Move apic timer interrupt backlog processing to common code + * KVM: Move interrupt injection out of interrupt disabled section + * KVM: Rename KVM_TLB_FLUSH to KVM_REQ_TLB_FLUSH + * KVM: VMX: Force vm86 mode if setting flags during real mode + * KVM: MMU: Simplify page table walker + * KVM: Actually move the interrupt injection code out of the critical + section + * KVM: x86 emulator: cmc, clc, cli, sti + * KVM: x86 emulator: use a defined flag definition + * KVM: x86 emulator: fix access registers for instructions with ModR/M + byte and Mod = 3 + * KVM: MMU: Add rmap_next(), a helper for walking kvm rmaps + * KVM: MMU: Keep a reverse mapping of non-writable translations + * KVM: MMU: Make gfn_to_page() always safe + * KVM: Partial swapping of guest memory + * KVM: VMX: Initialize vcpu with preemption enabled + * KVM: Use virtual cpu accounting if available for guest times. 
+ * KVM: Move kvm_guest_exit() after local_irq_enable() + * KVM: MMU: Fix dirty bit pte gpa calculation + * KVM: Allocate userspace memory for older userspace + * KVM: Portability: Split kvm_vcpu into arch dependent and independent + parts (part 1) + * KVM: Fix local apic timer divide by zero + * KVM: Move vmx_vcpu_reset() out of vmx_vcpu_setup() + * KVM: Add a might_sleep() annotation to gfn_to_page() + * KVM: VMX: vmx_vcpu_setup(): remove unused variable. + * KVM: Per-architecture hypercall definitions + * KVM: Use new smp_call_function_mask() in kvm_flush_remote_tlbs() + * KVM: Unmap kernel-allocated memory on slot destruction + * KVM: Export memory slot allocation mechanism + * KVM: Add kernel-internal memory slots + * KVM: Add ioctl to tss address from userspace, + * KVM: x86 emulator: fix 'push imm8' emulation + * KVM: VMX: Let gcc to choose which registers to save (x86_64) + * KVM: VMX: Let gcc to choose which registers to save (i386) + * KVM: SVM: Let gcc to choose which registers to save (x86_64) + * KVM: SVM: Let gcc to choose which registers to save (i386) + * KVM: x86 emulator: invd instruction + * KVM: SVM: Intercept the 'invd' and 'wbinvd' instructions + * KVM: x86 emulator: don't depend on cr2 for mov abs emulation + * KVM: Move page fault processing to common code + * KVM: MMU: Topup the mmu memory preallocation caches before emulating an + insn + * KVM: Portability: Split kvm_vm_ioctl v3 + * KVM: Portability: Move memory segmentation to x86.c + * KVM: Portability: move get/set_apic_base to x86.c + * KVM: Portability: Move control register helper functions to x86.c + * KVM: VMX: Enable memory mapped TPR shadow (FlexPriority) + * KVM: Fix gfn_to_page() acquiring mmap_sem twice + * KVM: Portability: Move kvm_get/set_msr[_common] to x86.c + * KVM: Portability: Move x86 emulation and mmio device hook to x86.c + * KVM: Portability: Move pio emulation functions to x86.c + * KVM: x86 emulator: Extract the common code of SrcReg and DstReg + * KVM: x86 emulator: centralize decoding of one-byte register access + insns + * KVM: Simplify decode_register_operand() calling convention + * KVM: Make mark_page_dirty() work for aliased pages too. 
+ * KVM: x86 emulator: Hoist modrm and abs decoding into separate functions + * KVM: Portability: Make exported debugfs data architecture-specific + * KVM: Portability: Move x86 instruction emulation code to x86.c + * KVM: Portability: Move x86 FPU handling to x86.c + * KVM: Portability: Move x86 vcpu ioctl handlers to x86.c + * KVM: x86 emulator: Move one-byte insns with reg operand into one-byte + section + * KVM: VMX: Fix repeated allocation of apic access page on smp + * KVM: SVM: Fix SMP with kernel apic + * KVM: Add make_page_dirty() to kvm_clear_guest_page() + * KVM: SVM: Defer nmi processing until switch to host state is complete + * KVM: VMX: Avoid reloading host efer on cpus that don't have it + * KVM: VMX: Use vmx to inject real interrupts + * KVM: Go back to atomically injecting interrupts + * KVM: VMX: Comment VMX primary/secondary exec ctl definitions + * KVM: VMX: wbinvd exiting + * KVM: x86 emulator: fix JMP_REL + * KVM: x86 emulator: fix the saving of of the eip value + * KVM: x86 emulator: remove 8 bytes operands emulator for call near + instruction + * KVM: Simplify CPU_TASKS_FROZEN cpu notifier handling + * KVM: add kvm_is_error_hva() + * KVM: introduce gfn_to_hva() + * KVM: Change kvm_{read,write}_guest() to use copy_{from,to}_user() + * KVM: Portability: Move some includes to x86.c + * KVM: Portability: Move kvm_x86_ops to x86.c + * KVM: Portability: Add vcpu and hardware management arch hooks + * KVM: Portability: Combine kvm_init and kvm_init_x86 + * KVM: Portability: Move x86 specific code from kvm_init() to kvm_arch() + * KVM: x86 emulator: modify 'lods', and 'stos' not to depend on CR2 + * KVM: Portability: move KVM_CHECK_EXTENSION + * KVM: VMX: Consolidate register usage in vmx_vcpu_run() + * KVM: Portability: Make kvm_vcpu_ioctl_translate arch dependent + * KVM: x86 emulator: Rename 'cr2' to 'memop' + * KVM: Remove ptr comparisons to 0 + * KVM: Remove __init attributes for kvm_init_debug and kvm_init_msr_list + * KVM: Portability: Add two hooks to handle kvm_create and destroy vm + * KVM: Replace 'light_exits' stat with 'host_state_reload' + * KVM: Add fpu_reload counter + * KVM: Add instruction emulation statistics + * KVM: Extend stats support for VM stats + * KVM: MMU: Add some mmu statistics + * KVM: x86 emulator: Use emulator_write_emulated and not + emulator_write_std + * KVM: Make unloading of FPU state when putting vcpu arch-independent + * KVM: SVM: Disable Lazy FPU optimization + * KVM: Portability: Move kvm_vcpu_ioctl_get_dirty_log to arch-specific + file + * KVM: Portability: MMU initialization and teardown split + * KVM: Portability: Move some macro definitions from kvm.h to x86.h + * KVM: Portability: Move struct kvm_x86_ops definition to x86.h + * KVM: Portability: Move vcpu regs enumeration definition to x86.h + * KVM: Move some static inline functions out from kvm.h into x86.h + * KVM: Portability: Move some function declarations to x86.h + * KVM: VMX: Force seg.base == (seg.sel << 4) in real mode + * KVM: MMU: Change guest pte access to kvm_{read,write}_guest() + * kvm: simplify kvm_clear_guest_page() + * KVM: Add missing #include + * KVM: MMU: Remove unused variable + * KVM: Remove unused "rmap_overflow" variable + * KVM: Correct consistent typo: "destory" -> "destroy" + * KVM: Move misplaced comment + * KVM: Portability: Move kvm_memory_alias to asm/kvm.h + * KVM: Portability: Move x86 pic strutctures + * KVM: Portability: Move kvm_regs to + * KVM: Portability: Move structure lapic_state to + * KVM: Portability: Move kvm_segment & kvm_dtable 
structure to + + * KVM: Portability: Move kvm_sregs and msr structures to + * KVM: Portability: Move cpuid structures to + * KVM: Export include/asm-x86/kvm.h + * KVM: MMU: Fix potential memory leak with smp real-mode + * KVM: MMU: Selectively set PageDirty when releasing guest memory + * KVM: x86 emulator: retire ->write_std() + * KVM: x86 emulator: prefetch up to 15 bytes of the instruction executed + * KVM: SVM: Fix FPU leak and re-enable lazy FPU switching + * KVM: Recalculate mmu pages needed for every memory region change + * KVM: Portability: Split kvm_set_memory_region() to have an arch + callout + * KVM: Split vcpu creation to avoid vcpu_load() before preemption setup + * KVM: MMU: Implement guest page fault bypass for nonpae + * KVM: Add statistic for remote tlb flushes + * KVM: MMU: Avoid unnecessary remote tlb flushes when guest updates a pte + * KVM: Add parentheses to silence gcc + * KVM: Don't bother the mmu if cr3 load doesn't change cr3 + * KVM: MMU: Code cleanup + * KVM: MMU: Introduce and use gpte_to_gfn() + * KVM: MMU: Move pse36 handling to the guest walker + * KVM: MMU: Remove extra gaddr parameter from set_pte_common() + * KVM: MMU: Remove set_pde() + * KVM: MMU: Adjust page_header_update_slot() to accept a gfn instead of a + gpa + * KVM: MMU: Introduce gfn_to_gpa() + * KVM: MMU: Simplify nonpaging_map() + * KVM: MMU: Remove gva_to_hpa() + * KVM: Remove gpa_to_hpa() + * KVM: MMU: Rename variable of type 'struct kvm_mmu_page *' + * KVM: MMU: Rename 'release_page' + * KVM: Disallow fork() and similar games when using a VM + * KVM: Enhance guest cpuid management + * KVM: Replace private 'struct segment descriptor' by x86's desc_struct + * KVM: Remove segment_descriptor, part 2 + * KVM: Fix compile error on i386 + * KVM: VMX: Read & store IDT_VECTORING_INFO_FIELD + * KVM: Fix faults during injection of real-mode interrupts + * KVM: x86 emulator: Fix instruction fetch cache hit check + * KVM: VMX: Remove the secondary execute control dependency on irqchip + * KVM: Portability: Move unalias_gfn to arch dependent file + * KVM: x86 emulator: Make a distinction between repeat prefixes F3 and F2 + * KVM: x86 emulator: address size and operand size overrides are sticky + * KVM: Remove desc.h include in kvm_main.c + * KVM: Revert segment_descriptor.h removal + * KVM: Remove misleading check for mmio during event injection + * KVM: MMU: mark pages that were inserted to the shadow pages table as + accessed + * KVM: x86 emulator: rename REP_REPE_PREFIX + * KVM: x86 emulator: cmps instruction + * KVM: Add ifdef in irqchip struct for x86 only structures + * KVM: Fix cpuid2 killing 32-bit guests on non-NX machines + * KVM: x86 emulator: Move rep processing before instruction execution + * KVM: x86 emulator: unify two switches + * KVM: x86 emulator: unify four switch statements into two + * KVM: Don't bypass the mmu if in pae and pdptrs changed + * KVM: Portability: Move KVM_INTERRUPT vcpu ioctl to x86.c + * KVM: Correct kvm_init() error paths not freeing bad_pge. 
+ * KVM: Export include/linux/kvm.h only if $ARCH actually supports KVM + * KVM: SVM: Remove KVM specific defines for MSR_EFER + * KVM: Replace kvm_lapic with kvm_vcpu in ioapic/lapic interface + * KVM: Replace dest_Lowest_Prio and dest_Fixed with self-defined macros + * KVM: Extend ioapic code to support iosapic + * KVM: Portability: Move address types to their own header file + * KVM: Portability: Move IO device definitions to its own header file + * KVM: Portability: Stop including x86-specific headers in kvm_main.c + * KVM: Portability: Create kvm_arch_vcpu_runnable() function + * KVM: Convert KVM from ->nopage() to ->fault() + * KVM: MMU: Remove unused prev_shadow_ent variable from fetch() + * KVM: Generalize exception injection mechanism + * KVM: Replace page fault injection by the generalized exception queue + * KVM: Replace #GP injection by the generalized exception queue + * KVM: Use generalized exception queue for injecting #UD + * KVM: x86 emulator: fix eflags preparation for emulation + * KVM: VMX: Avoid exit when setting cr8 if the local apic is in the + kernel + * KVM: SVM: Emulate read/write access to cr8 + * KVM: x86 emulator: Fix stack instructions on 64-bit mode + * KVM: SVM: Trap access to the cr8 register + * KVM: VMX: Fix cr8 exit optimization + * KVM: MMU: Use cmpxchg for pte updates on walk_addr() + * KVM: MMU: Simplify calculation of pte access + * KVM: MMU: Set nx bit correctly on shadow ptes + * KVM: MMU: Move pte access calculation into a helper function + * KVM: MMU: Fix inherited permissions for emulated guest pte updates + * KVM: MMU: No need to pick up nx bit from guest pte + * KVM: MMU: Pass pte dirty flag to set_pte() instead of calculating it + on-site + * KVM: MMU: Remove walker argument to set_pte() + * KVM: MMU: Move set_pte() into guest paging mode independent code + * KVM: MMU: Adjust mmu_set_spte() debug code for gpte removal + * KVM: MMU: Use mmu_set_spte() for real-mode shadows + * KVM: SVM: Exit to userspace if write to cr8 and not using in-kernel + apic + * KVM: SVM: support writing 0 to K8 performance counter control registers + * KVM: MMU: Fix kunmap_atomic() call in cmpxchg_gpte() + * KVM: MMU: Fix SMP shadow instantiation race + * KVM: LAPIC: minor debugging compile fix + * KVM: MMU: emulated cmpxchg8b should be atomic on i386 + * KVM: Fix bad kunmap_atomic() paramerter inm cmpxchg emulation + * KVM: Make cmpxchg emulation compile on i386 + * KVM: Another cmpxchg i386 compile fix + * KVM: Another cmpxchg emulation compile fix + * KVM: Another cmpxchg emulation compile fix + * KVM: Portability: Move kvm{pic,ioapic} accesors to x86 specific code + * KVM: Portability: Introduce kvm_vcpu_arch + * KVM: Portability: Split mmu-related static inline functions to mmu.h + * KVM: Portability: Move kvm_vcpu definition back to kvm.h + * KVM: Portability: Expand the KVM_VCPU_COMM in kvm_vcpu structure. 
+ * KVM: Portability: Move kvm_vcpu_stat to x86.h + * KVM: Portability: Move memslot aliases to new struct kvm_arch + * KVM: Portability: Move mmu-related fields to kvm_arch + * KVM: Portability: move vpic and vioapic to kvm_arch + * KVM: Portability: Move round_robin_prev_vcpu and tss_addr to kvm_arch + * KVM: Portability: Move kvm_vm_stat to x86.h + * KVM: VMX: Add printk_ratelimit in vmx_intr_assist + * KVM: Move arch dependent files to new directory arch/x86/kvm/ + * KVM: Move drivers/kvm/* to virt/kvm/ + * KVM: Fix compile error in asm/kvm_host.h + * KVM: Move irqchip declarations into new ioapic.h and lapic.h + * KVM: Move ioapic code to common directory. + * KVM: Move kvm_vcpu_kick() to x86.c + * KVM: Expose ioapic to ia64 save/restore APIs + * KVM: MMU: Coalesce remote tlb flushes + * KVM: MMU: Add cache miss statistic + * KVM: Print data for unimplemented wrmsr + * KVM: Ensure pages are copied on write + * KVM: MMU: Fix cmpxchg8b emulation on i386 (again) + * KVM: x86 emulator: Add vmmcall/vmcall to x86_emulate (v3) + * KVM: Refactor hypercall infrastructure (v3) + * KVM: x86 emulator: remove unused functions + * KVM: x86 emulator: move all x86_emulate_memop() to a structure + * KVM: x86 emulator: move all decoding process to function + x86_decode_insn() + * KVM: emulate_instruction() calls now x86_decode_insn() and + x86_emulate_insn() + * KVM: Call x86_decode_insn() only when needed + * KVM: VMX: Further reduce efer reloads + * KVM: Allow not-present guest page faults to bypass kvm + * KVM: MMU: Make flooding detection work when guest page faults are + bypassed + * KVM: MMU: Ignore reserved bits in cr3 in non-pae mode + * KVM: x86 emulator: split some decoding into functions for readability + * KVM: x86 emulator: remove _eflags and use directly ctxt->eflags. 
+ * KVM: x86 emulator: Remove no_wb, use dst.type = OP_NONE instead + * KVM: x86_emulator: no writeback for bt + * KVM: Purify x86_decode_insn() error case management + * KVM: x86 emulator: Any legacy prefix after a REX prefix nullifies its + effect + * KVM: VMX: Don't clear the vmcs if the vcpu is not loaded on any + processor + * KVM: VMX: Simplify vcpu_clear() + * KVM: Remove the usage of page->private field by rmap + * KVM: Add general accessors to read and write guest memory + * KVM: Allow dynamic allocation of the mmu shadow cache size + * KVM: Add kvm_free_lapic() to pair with kvm_create_lapic() + * KVM: Hoist kvm_create_lapic() into kvm_vcpu_init() + * KVM: Remove gratuitous casts from lapic.c + * KVM: CodingStyle cleanup + * KVM: Support assigning userspace memory to the guest + * KVM: Move x86 msr handling to new files x86.[ch] + * KVM: MMU: Clean up MMU functions to take struct kvm when appropriate + * KVM: MMU: More struct kvm_vcpu -> struct kvm cleanups + * KVM: Move guest pte dirty bit management to the guest pagetable walker + * KVM: MMU: Fix nx access bit for huge pages + * KVM: MMU: Disable write access on clean large pages + * KVM: MMU: Instantiate real-mode shadows as user writable shadows + * KVM: MMU: Move dirty bit updates to a separate function + * KVM: MMU: When updating the dirty bit, inform the mmu about it + * KVM: Portability: split kvm_vcpu_ioctl + * KVM: apic round robin cleanup + * KVM: Add some \n in ioapic_debug() + * KVM: Move apic timer interrupt backlog processing to common code + * KVM: Rename KVM_TLB_FLUSH to KVM_REQ_TLB_FLUSH + * KVM: x86 emulator: Implement emulation of instruction: inc & dec + * KVM: MMU: Simplify page table walker + * KVM: x86 emulator: cmc, clc, cli, sti + * KVM: MMU: Add rmap_next(), a helper for walking kvm rmaps + * KVM: MMU: Keep a reverse mapping of non-writable translations + * KVM: MMU: Make gfn_to_page() always safe + * KVM: MMU: Partial swapping of guest memory + * KVM: Use virtual cpu accounting if available for guest times. 
+ * KVM: Allocate userspace memory for older userspace + * KVM: Portability: Split kvm_vcpu into arch dependent and independent + parts (part 1) + * KVM: Move vmx_vcpu_reset() out of vmx_vcpu_setup() + * KVM: Add a might_sleep() annotation to gfn_to_page() + * KVM: Export PIC reset for kernel device reset + * KVM: Split IOAPIC reset function and export for kernel RESET + * KVM: Per-architecture hypercall definitions + * KVM: Unmap kernel-allocated memory on slot destruction + * KVM: Export memory slot allocation mechanism + * KVM: Add kernel-internal memory slots + * KVM: Add ioctl to tss address from userspace, + * KVM: VMX: Let gcc to choose which registers to save (x86_64) + * KVM: VMX: Let gcc to choose which registers to save (i386) + * KVM: SVM: Let gcc to choose which registers to save (x86_64) + * KVM: SVM: Let gcc to choose which registers to save (i386) + * KVM: x86 emulator: don't depend on cr2 for mov abs emulation + * KVM: Move page fault processing to common code + * KVM: MMU: Topup the mmu memory preallocation caches before emulating an + insn + * KVM: Portability: Split kvm_vm_ioctl v3 + * KVM: Portability: Move memory segmentation to x86.c + * KVM: Portability: move get/set_apic_base to x86.c + * KVM: Portability: Move control register helper functions to x86.c + * KVM: VMX: Enable memory mapped TPR shadow (FlexPriority) + * KVM: Fix gfn_to_page() acquiring mmap_sem twice + * KVM: Portability: Move kvm_get/set_msr[_common] to x86.c + * KVM: Portability: Move x86 emulation and mmio device hook to x86.c + * KVM: Portability: Move pio emulation functions to x86.c + * KVM: x86 emulator: Extract the common code of SrcReg and DstReg + * KVM: x86 emulator: centralize decoding of one-byte register access + insns + * KVM: Simplify decode_register_operand() calling convention + * KVM: Make mark_page_dirty() work for aliased pages too. 
+ * KVM: x86 emulator: Hoist modrm and abs decoding into separate functions + * KVM: Portability: Make exported debugfs data architecture-specific + * KVM: Portability: Move x86 instruction emulation code to x86.c + * KVM: Portability: Move x86 FPU handling to x86.c + * KVM: Portability: Move x86 vcpu ioctl handlers to x86.c + * KVM: Add make_page_dirty() to kvm_clear_guest_page() + * KVM: VMX: Use vmx to inject real-mode interrupts + * KVM: VMX: Read & store IDT_VECTORING_INFO_FIELD + * KVM: Fix faults during injection of real-mode interrupts + * KVM: VMX: Comment VMX primary/secondary exec ctl definitions + * KVM: VMX: wbinvd exiting + * KVM: x86 emulator: remove 8 bytes operands emulator for call near + instruction + * KVM: Simplify CPU_TASKS_FROZEN cpu notifier handling + * KVM: add kvm_is_error_hva() + * KVM: introduce gfn_to_hva() + * KVM: Change kvm_{read,write}_guest() to use copy_{from,to}_user() + * KVM: Portability: Move some includes to x86.c + * KVM: Portability: Move kvm_x86_ops to x86.c + * KVM: Portability: Add vcpu and hardware management arch hooks + * KVM: Portability: Combine kvm_init and kvm_init_x86 + * KVM: Portability: Move x86 specific code from kvm_init() to kvm_arch() + * KVM: x86 emulator: modify 'lods', and 'stos' not to depend on CR2 + * KVM: Portability: move KVM_CHECK_EXTENSION + * KVM: VMX: Consolidate register usage in vmx_vcpu_run() + * KVM: Portability: Make kvm_vcpu_ioctl_translate arch dependent + * KVM: Remove ptr comparisons to 0 + * KVM: Remove __init attributes for kvm_init_debug and kvm_init_msr_list + * KVM: Portability: Add two hooks to handle kvm_create and destroy vm + * KVM: Replace 'light_exits' stat with 'host_state_reload' + * KVM: Add fpu_reload counter + * KVM: Add instruction emulation statistics + * KVM: Extend stats support for VM stats + * KVM: MMU: Add some mmu statistics + * KVM: Make unloading of FPU state when putting vcpu arch-independent + * KVM: Portability: Move kvm_vcpu_ioctl_get_dirty_log to arch-specific + file + * KVM: Portability: MMU initialization and teardown split + * KVM: Portability: Move some macro definitions from kvm.h to x86.h + * KVM: Portability: Move struct kvm_x86_ops definition to x86.h + * KVM: Portability: Move vcpu regs enumeration definition to x86.h + * KVM: Move some static inline functions out from kvm.h into x86.h + * KVM: Portability: Move some function declarations to x86.h + * KVM: VMX: Force seg.base == (seg.sel << 4) in real mode + * KVM: MMU: Change guest pte access to kvm_{read,write}_guest() + * KVM: Simplify kvm_clear_guest_page() + * KVM: Add missing #include + * KVM: MMU: Remove unused variable + * KVM: Remove unused "rmap_overflow" variable + * KVM: Correct consistent typo: "destory" -> "destroy" + * KVM: Move misplaced comment + * KVM: Portability: Move kvm_memory_alias to asm/kvm.h + * KVM: Portability: Move x86 pic strutctures + * KVM: Portability: Move kvm_regs to + * KVM: Portability: Move structure lapic_state to + * KVM: Portability: Move kvm_segment & kvm_dtable structure to + + * KVM: Portability: Move kvm_sregs and msr structures to + * KVM: Portability: Move cpuid structures to + * KVM: Export include/asm-x86/kvm.h + * KVM: MMU: Fix potential memory leak with smp real-mode + * KVM: MMU: Selectively set PageDirty when releasing guest memory + * KVM: x86 emulator: retire ->write_std() + * KVM: x86 emulator: prefetch up to 15 bytes of the instruction executed + * KVM: Recalculate mmu pages needed for every memory region change + * KVM: Portability: Split kvm_set_memory_region() 
to have an arch + callout + * KVM: Split vcpu creation to avoid vcpu_load() before preemption setup + * KVM: MMU: Implement guest page fault bypass for nonpae + * KVM: Add statistic for remote tlb flushes + * KVM: MMU: Avoid unnecessary remote tlb flushes when guest updates a pte + * KVM: Don't bother the mmu if cr3 load doesn't change cr3 + * KVM: MMU: Code cleanup + * KVM: MMU: Introduce and use gpte_to_gfn() + * KVM: MMU: Move pse36 handling to the guest walker + * KVM: MMU: Remove extra gaddr parameter from set_pte_common() + * KVM: MMU: Remove set_pde() + * KVM: MMU: Adjust page_header_update_slot() to accept a gfn instead of a + gpa + * KVM: MMU: Introduce gfn_to_gpa() + * KVM: MMU: Simplify nonpaging_map() + * KVM: MMU: Remove gva_to_hpa() + * KVM: Remove gpa_to_hpa() + * KVM: MMU: Rename variables of type 'struct kvm_mmu_page *' + * KVM: MMU: Rename 'release_page' + * KVM: Disallow fork() and similar games when using a VM + * KVM: Enhance guest cpuid management + * KVM: VMX: Remove the secondary execute control dependency on irqchip + * KVM: Portability: Move unalias_gfn to arch dependent file + * KVM: x86 emulator: Make a distinction between repeat prefixes F3 and F2 + * KVM: x86 emulator: address size and operand size overrides are sticky + * KVM: Remove misleading check for mmio during event injection + * KVM: MMU: mark pages that were inserted to the shadow pages table as + accessed + * KVM: x86 emulator: rename REP_REPE_PREFIX + * KVM: x86 emulator: Rename 'cr2' to 'memop' + * KVM: x86 emulator: cmps instruction + * KVM: Add ifdef in irqchip struct for x86 only structures + * KVM: Fix cpuid2 killing 32-bit guests on non-NX machines + * KVM: x86 emulator: Move rep processing before instruction execution + * KVM: x86 emulator: unify two switches + * KVM: x86 emulator: unify four switch statements into two + * KVM: Portability: Move KVM_INTERRUPT vcpu ioctl to x86.c + * KVM: Correct kvm_init() error paths not freeing bad_pge. 
+ * KVM: Export include/linux/kvm.h only if $ARCH actually supports KVM + * KVM: SVM: Remove KVM specific defines for MSR_EFER + * KVM: Replace kvm_lapic with kvm_vcpu in ioapic/lapic interface + * KVM: Replace dest_Lowest_Prio and dest_Fixed with self-defined macros + * KVM: Extend ioapic code to support iosapic + * KVM: Portability: Move address types to their own header file + * KVM: Portability: Move IO device definitions to its own header file + * KVM: Portability: Stop including x86-specific headers in kvm_main.c + * KVM: Portability: Create kvm_arch_vcpu_runnable() function + * KVM: Convert KVM from ->nopage() to ->fault() + * KVM: MMU: Remove unused prev_shadow_ent variable from fetch() + * KVM: Generalize exception injection mechanism + * KVM: Replace page fault injection by the generalized exception queue + * KVM: Replace #GP injection by the generalized exception queue + * KVM: Use generalized exception queue for injecting #UD + * KVM: x86 emulator: fix eflags preparation for emulation + * KVM: VMX: Avoid exit when setting cr8 if the local apic is in the + kernel + * KVM: SVM: Emulate read/write access to cr8 + * KVM: x86 emulator: Fix stack instructions on 64-bit mode + * KVM: SVM: Trap access to the cr8 register + * KVM: VMX: Fix cr8 exit optimization + * KVM: MMU: Use cmpxchg for pte updates on walk_addr() + * KVM: MMU: Simplify calculation of pte access + * KVM: MMU: Set nx bit correctly on shadow ptes + * KVM: MMU: Move pte access calculation into a helper function + * KVM: MMU: Fix inherited permissions for emulated guest pte updates + * KVM: MMU: No need to pick up nx bit from guest pte + * KVM: MMU: Pass pte dirty flag to set_pte() instead of calculating it + on-site + * KVM: MMU: Remove walker argument to set_pte() + * KVM: MMU: Move set_pte() into guest paging mode independent code + * KVM: MMU: Adjust mmu_set_spte() debug code for gpte removal + * KVM: MMU: Use mmu_set_spte() for real-mode shadows + * KVM: SVM: Exit to userspace if write to cr8 and not using in-kernel + apic + * KVM: MMU: Fix SMP shadow instantiation race + * KVM: LAPIC: minor debugging compile fix + * KVM: SVM: support writing 0 to K8 performance counter control registers + * KVM: MMU: emulated cmpxchg8b should be atomic on i386 + * KVM: Portability: Move kvm{pic,ioapic} accesors to x86 specific code + * KVM: Portability: Introduce kvm_vcpu_arch + * KVM: Portability: Split mmu-related static inline functions to mmu.h + * KVM: Portability: Move kvm_vcpu definition back to kvm.h + * KVM: Portability: Expand the KVM_VCPU_COMM in kvm_vcpu structure. + * KVM: Portability: Move kvm_vcpu_stat to x86.h + * KVM: Portability: Move memslot aliases to new struct kvm_arch + * KVM: Portability: Move mmu-related fields to kvm_arch + * KVM: Portability: move vpic and vioapic to kvm_arch + * KVM: Portability: Move round_robin_prev_vcpu and tss_addr to kvm_arch + * KVM: Portability: Move kvm_vm_stat to x86.h + * KVM: VMX: Add printk_ratelimit in vmx_intr_assist + * KVM: Move arch dependent files to new directory arch/x86/kvm/ + * KVM: Move drivers/kvm/* to virt/kvm/ + * KVM: Move irqchip declarations into new ioapic.h and lapic.h + * KVM: Move ioapic code to common directory. 
+ * KVM: Move kvm_vcpu_kick() to x86.c + * KVM: Expose ioapic to ia64 save/restore APIs + * KVM: MMU: Coalesce remote tlb flushes + * KVM: MMU: Add cache miss statistic + * KVM: Print data for unimplemented wrmsr + * KVM: Ensure pages are copied on write + * KVM: local APIC TPR access reporting facility + * KVM: Accelerated apic support + * KVM: Disable vapic support on Intel machines with FlexPriority + * KVM: MMU: Concurrent guest walkers + * KVM: Add kvm_read_guest_atomic() + * KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte() + * KVM: MMU: Switch to mmu spinlock + * KVM: MMU: Move kvm_free_some_pages() into critical section + * KVM: MMU: Broaden scope of mmap_sem to include actual mapping + * KVM: MMU: Fix recursive locking of mmap_sem() + * KVM: Fix unbalanced mmap_sem operations in cmpxchg8b emulation + * KVM: Mark vapic page as dirty for save/restore/migrate + * KVM: x86 emulator: Only allow VMCALL/VMMCALL trapped by #UD + * KVM: MMU: Update shadow ptes on partial guest pte writes + * KVM: MMU: Simplify hash table indexing + * KVM: Portability: Move kvm_fpu to asm-x86/kvm.h + * KVM: MMU: Fix dirty page setting for pages removed from rmap + * KVM: Initialize the mmu caches only after verifying cpu support + * KVM: Fix unbounded preemption latency + * KVM: Put kvm_para.h include outside __KERNEL__ + * KVM: Move apic timer migration away from critical section + * KVM: SVM: Fix lazy FPU switching + * KVM: MMU: Fix gpa truncation when reading a pte + * [GFS2] Handle multiple glock demote requests + * [GFS2] Clean up internal read function + * [GFS2] Use ->page_mkwrite() for mmap() + * [GFS2] Remove useless i_cache from inodes + * [GFS2] Remove unused field in struct gfs2_inode + * [GFS2] Add gfs2_is_writeback() + * [GFS2] Introduce gfs2_set_aops() + * [GFS2] Split gfs2_writepage into three cases + * [GFS2] Add writepages for GFS2 jdata + * [GFS2] Don't hold page lock when starting transaction + * [GFS2] Use correct include file in ops_address.c + * [GFS2] Remove unused variables + * [GFS2] Remove "reclaim limit" + * [GFS2] Add sync_page to metadata address space operations + * [GFS2] Reorder writeback for glock sync + * [GFS2] Remove flags no longer required + * [GFS2] Given device ID rather than s_id in "id" sysfs file + * [GFS2] check kthread_should_stop when waiting + * [GFS2] Don't add glocks to the journal + * [GFS2] Use atomic_t for journal free blocks counter + * [GFS2] Move gfs2_logd into log.c + * [GFS2] Don't periodically update the jindex + * [GFS2] Check for installation of mount helpers for DLM mounts + * [GFS2] tidy up error message + * [GFS2] Fix runtime issue with UP kernels + * [GFS2] remove unnecessary permission checks + * [GFS2] Fix build warnings + * [GFS2] Remove unrequired code + * [GFS2] Remove lock methods for lock_nolock protocol + * [GFS2] patch to check for recursive lock requests in gfs2_rename code + path + * [GFS2] Remove unused variable + * [GFS2] use pid for plock owner for nfs clients + * [GFS2] Remove function gfs2_get_block + * [GFS2] Journal extent mapping + * [GFS2] Get rid of useless "found" variable in quota.c + * [GFS2] Run through full bitmaps quicker in gfs2_bitfit + * [GFS2] Reorganize function gfs2_glmutex_lock + * [GFS2] Only fetch the dinode once in block_map + * [GFS2] Function meta_read optimization + * [GFS2] Incremental patch to fix compiler warning + * [GFS2] Eliminate the no longer needed sd_statfs_mutex + * [GFS2] Minor correction + * [GFS2] Fix log block mapper + * [GFS2] Remove unused variable + * [GFS2] Allow page migration 
for writeback and ordered pages + * [GFS2] Initialize extent_list earlier + * [GFS2] Fix problems relating to execution of files on GFS2 + * [GFS2] Fix assert in log code + * [GFS2] Reduce inode size by moving i_alloc out of line + * [GFS2] Remove unneeded i_spin + * [GFS2] gfs2_alloc_required performance + * [GFS2] Fix write alloc required shortcut calculation + * [GFS2] Fix typo + * [GFS2] Fix page_mkwrite truncation race path + * [GFS2] Lockup on error + * [GFS2] Allow journal recovery on read-only mount + + -- Tim Gardner Sun, 27 Jan 2008 20:37:18 -0700 + +linux (2.6.24-5.9) hardy; urgency=low + + [Amit Kucheria] + + * Fix LPIA FTBFS due to virtio Ignore: yes + + [Upstream Kernel Changes] + + * ACPI: processor: Fix null pointer dereference in throttling + * [SPARC64]: Fix of section mismatch warnings. + * [SPARC64]: Fix section error in sparcspkr + * [SPARC]: Constify function pointer tables. + * [BLUETOOTH]: Move children of connection device to NULL before + connection down. + * [TULIP] DMFE: Fix SROM parsing regression. + * [IPV4]: Add missing skb->truesize increment in ip_append_page(). + * iwlwifi: fix possible read attempt on ucode that is not available + * [NETNS]: Re-export init_net via EXPORT_SYMBOL. + * [INET]: Fix truesize setting in ip_append_data + * sis190: add cmos ram access code for the SiS19x/968 chipset pair + * sis190: remove duplicate INIT_WORK + * sis190: mdio operation failure is not correctly detected + * sis190: scheduling while atomic error + * Update ctime and mtime for memory-mapped files + * [SCSI] initio: fix module hangs on loading + * xen: disable vcpu_info placement for now + * agp/intel: add support for E7221 chipset + * drm/i915: add support for E7221 chipset + * DMI: move dmi_available declaration to linux/dmi.h + * DMI: create dmi_get_slot() + * ACPI: create acpi_dmi_dump() + * ACPI: on OSI(Linux), print needed DMI rather than requesting dmidecode + output + * ACPI: Delete Intel Customer Reference Board (CRB) from OSI(Linux) DMI + list + * ACPI: make _OSI(Linux) console messages smarter + * ACPI: Add ThinkPad R61, ThinkPad T61 to OSI(Linux) white-list + * ACPI: DMI blacklist to reduce console warnings on OSI(Linux) systems. + * ACPI: EC: fix dmesg spam regression + * ACPI: EC: add leading zeros to debug messages + * Pull bugzilla-9747 into release branch + * Pull bugzilla-8459 into release branch + * Pull bugzilla-9798 into release branch + * Pull dmi-2.6.24 into release branch + * [SPARC64]: Partially revert "Constify function pointer tables." + * lockdep: fix kernel crash on module unload + * sysctl: kill binary sysctl KERN_PPC_L2CR + * fix hugepages leak due to pagetable page sharing + * spi: omap2_mcspi PIO RX fix + * Linux 2.6.24 + + -- Tim Gardner Fri, 25 Jan 2008 01:44:27 -0700 + +linux (2.6.24-5.8) hardy; urgency=low + + [Alessio Igor Bogani] + + * rt: Update to 2.6.24-rc8-rt1 + * rt: Update configuration files + + [Amit Kucheria] + + * Asix: fix breakage caused in 2.6.24-rc7 + * Add CONFIG_CPUSETS to server-related flavours + - LP: #182434 + + [Chuck Short] + + * SAUCE: ata: blacklist FUJITSU MHW2160BH PL + - LP: #175834 + + [Kees Cook] + + * AppArmor: updated patch series to upstream SVN 1079. + + [Soren Hansen] + + * Updated configs to enable virtio stuff Ignore: yes + + [Stefan Bader] + + * Enabled CONFIG_BSD_PROCESS_ACCT=y for sparc. + - LP: #176587 + * Enable CONFIG_AUDITSYSCALL=y. + - LP: #140784 + * Added CONFIG_AUDIT_SYSCALL=y to custom lpia(compat) + * Enabled CONFIG_HUGETLBFS=y for i386/server amd64/server and ia64. 
+ * Lower priority of pnpacpi resource messages to warning level. + - LP: #159241 + * Fix the messed up message level of pnpacpi parser. + + [Tim Gardner] + + * Start new release, bump ABI to -5 + * Disabled iwlwifi preperatory to moving it to l-u-m. + * Enabled CONFIG_USB_SERIAL_KEYSPAN + * Disabled CONFIG_CGROUPS. + * Virtio config settings for -rt. + * Re-enable IWLWIFI in the kernel. + * Fixed -rt saa7134-core.c FTBS + + [Upstream Kernel Changes] + + * Input: Handle EV_PWR type of input caps in input_set_capability. + * Input: jornada680_kbd - fix default keymap + * increase PNP_MAX_PORT to 40 from 24 + * sched: fix gcc warnings + * leds: Fix leds_list_lock locking issues + * leds: Fix locomo LED driver oops + * x86: fix asm-x86/byteorder.h for userspace export + * x86: fix asm-x86/msr.h for user-space export + * ACPI: EC: Enable boot EC before bus_scan + * ACPI: Make sysfs interface in ACPI power optional. + * fix lguest rmmod "bad pgd" + * slub: provide /proc/slabinfo + * [POWERPC] Fix build failure on Cell when CONFIG_SPU_FS=y + * slub: register slabinfo to procfs + * [SCSI] scsi_sysfs: restore prep_fn when ULD is removed + * Unify /proc/slabinfo configuration + * scsi: revert "[SCSI] Get rid of scsi_cmnd->done" + * restrict reading from /proc//maps to those who share ->mm or can + ptrace pid + * Fix kernel/ptrace.c compile problem (missing "may_attach()") + * hwmon: (w83627ehf) Be more careful when changing VID input level + * NFS: Fix a possible Oops in fs/nfs/super.c + * NFSv4: Fix circular locking dependency in nfs4_kill_renewd + * NFS: add newline to kernel warning message in auth_gss code + * NFSv4: nfs4_open_confirm must not set the open_owner as confirmed on + error + * NFSv4: Fix open_to_lock_owner sequenceid allocation... + * gameport: don't export functions that are static inline + * Input: spitzkbd - fix suspend key handling + * Input: pass EV_PWR events to event handlers + * [ARM] 4735/1: Unbreak pxa25x suspend/resume + * IB/srp: Fix list corruption/oops on module reload + * Console is utf-8 by default + * [IA64] Update Altix BTE error return status patch + * [IA64] Update Altix nofault code + * [X25]: Add missing x25_neigh_put + * [XFRM]: Do not define km_migrate() if !CONFIG_XFRM_MIGRATE + * [CASSINI]: Fix endianness bug. + * [CASSINI]: Revert 'dont touch page_count'. + * [CASSINI]: Program parent Intel31154 bridge when necessary. + * [CASSINI]: Set skb->truesize properly on receive packets. + * [CASSINI]: Fix two obvious NAPI bugs. + * [CASSINI]: Bump driver version and release date. + * [INET]: Fix netdev renaming and inet address labels + * [CONNECTOR]: Return proper error code in cn_call_callback() + * [ISDN] i4l: 'NO CARRIER' message lost after ldisc flush + * [ISDN]: i4l: Fix DLE handling for i4l-audio + * fix: using joysticks in 32 bit applications on 64 bit systems + * [ARM] 4691/1: add missing i2c_board_info struct for at91rm9200 + * hda_intel suspend latency: shorten codec read + * CPU hotplug: fix cpu_is_offline() on !CONFIG_HOTPLUG_CPU + * Linux 2.6.24-rc7 + * sh: Fix argument page dcache flushing regression. + * V4L/DVB (6944a): Fix Regression VIDIOCGMBUF ioctl hangs on bttv driver + * V4L/DVB (6916): ivtv: udelay has to be changed *after* the eeprom was + read, not before + * [MIPS] Move inclusing of kernel/time/Kconfig menu to appropriate place + * [MIPS] Alchemy: Fix use of __init code bug exposed by modpost warning + * [MIPS] Fix IP32 breakage + * [MIPS] Assume R4000/R4400 newer than 3.0 don't have the mfc0 count bug + * [MIPS] Fix CONFIG_BOOT_RAW. 
+ * ACPI: Reintroduce run time configurable max_cstate for !CPU_IDLE case + * core dump: real_parent ppid + * acct: real_parent ppid + * IB/mlx4: Fix value of pkey_index in QP1 completions + * IB/srp: Release transport before removing host + * x86: fix do_fork_idle section mismatch + * spi_bitbang: always grab lock with irqs blocked + * fat: optimize fat_count_free_clusters() + * KEYS: fix macro + * md: fix data corruption when a degraded raid5 array is reshaped + * xip: fix get_zeroed_page with __GFP_HIGHMEM + * eCryptfs: fix dentry handling on create error, unlink, and inode + destroy + * vmcoreinfo: add the array length of "free_list" for filtering free + pages + * dmi-id: fix for __you_cannot_kmalloc_that_much failure + * snd_mixer_oss_build_input(): fix for __you_cannot_kmalloc_that_much + failure with gcc-3.2 + * Fix crash with FLAT_MEMORY and ARCH_PFN_OFFSET != 0 + * hfs: handle more on-disk corruptions without oopsing + * pl2303: Fix mode switching regression + * futex: Prevent stale futex owner when interrupted/timeout + * [NIU]: Fix slowpath interrupt handling. + * [NIU]: Missing ->last_rx update. + * [NIU]: Fix potentially stuck TCP socket send queues. + * [NIU]: Update driver version and release date. + * [IPV4] raw: Strengthen check on validity of iph->ihl + * [IPV4] ipconfig: Fix regression in ip command line processing + * [NET]: Fix netx-eth.c compilation. + * [METH]: Fix MAC address handling. + * [TULIP]: NAPI full quantum bug. + * [ATM]: [nicstar] delay irq setup until card is configured + * [SCTP]: Fix the name of the authentication event. + * [SCTP]: Correctly handle AUTH parameters in unexpected INIT + * [SCTP]: Add back the code that accounted for FORWARD_TSN parameter in + INIT. + * [IRDA]: irda_create() nuke user triggable printk + * b43: Fix rxheader channel parsing + * [NET]: Do not grab device reference when scheduling a NAPI poll. + * [NET]: Add NAPI_STATE_DISABLE. + * [NET]: Do not check netif_running() and carrier state in ->poll() + * ssb: Fix probing of PCI cores if PCI and PCIE core is available + * mac80211: return an error when SIWRATE doesn't match any rate + * [NETXEN]: Fix ->poll() done logic. + * [NET]: Fix drivers to handle napi_disable() disabling interrupts. + * [NET]: Stop polling when napi_disable() is pending. + * [NET]: Make ->poll() breakout consistent in Intel ethernet drivers. + * [NET] Intel ethernet drivers: update MAINTAINERS + * [NET]: kaweth was forgotten in msec switchover of usb_start_wait_urb + * [IPV4] ROUTE: ip_rt_dump() is unecessary slow + * [NET]: Clone the sk_buff 'iif' field in __skb_clone() + * [LRO] Fix lro_mgr->features checks + * [NET]: mcs7830 passes msecs instead of jiffies to usb_control_msg + * [FORCEDETH]: Fix reversing the MAC address on suspend. + * [XFRM]: xfrm_algo_clone() allocates too much memory + * [SOCK]: Adds a rcu_dereference() in sk_filter + * [CONNECTOR]: Don't touch queue dev after decrement of ref count. + * [IPV6]: IPV6_MULTICAST_IF setting is ignored on link-local connect() + * [ATM]: Check IP header validity in mpc_send_packet + * show_task: real_parent + * [SCSI] qla1280: fix 32 bit segment code + * [NIU]: Support for Marvell PHY + * [NEIGH]: Fix race between neigh_parms_release and neightbl_fill_parms + * [IPV4] ROUTE: fix rcu_dereference() uses in /proc/net/rt_cache + * [AX25]: Kill user triggable printks. 
+ * [ARM] pxa: silence warnings from cpu_is_xxx() macros + * [POWERPC] efika: add phy-handle property for fec_mpc52xx + * [ARM] vfp: fix fuitod/fsitod instructions + * [CRYPTO] padlock: Fix alignment fault in aes_crypt_copy + * rt2x00: Allow rt61 to catch up after a missing tx report + * rt2x00: Corectly initialize rt2500usb MAC + * rt2x00: Put 802.11 data on 4 byte boundary + * NFSv4: Give the lock stateid its own sequence queue + * sata_qstor: use hardreset instead of softreset + * libata-sff: PCI IRQ handling fix + * pata_pdc202xx_old: Further fixups + * pata_ixp4xx_cf: fix compilation introduced by ata_port_desc() + conversion + * libata-pmp: 4726 hates SRST + * libata-pmp: propagate timeout to host link + * libata: don't normalize UNKNOWN to NONE after reset + * Update kernel parameter document for libata DMA mode setting knobs. + * sata_sil24: prevent hba lockup when pass-through ATA commands are used + * ide: workaround suspend bug for ACPI IDE + * ide: fix cable detection for SATA bridges + * trm290: do hook dma_host_{on,off} methods (take 2) + * libata and starting/stopping ATAPI floppy devices + * ACPI : Not register gsi for PCI IDE controller in legacy mode + * ACPICA: fix acpi_serialize hang regression + * sh: Force __access_ok() to obey address space limit. + * [AX25] af_ax25: Possible circular locking. + * ACPI: apply quirk_ich6_lpc_acpi to more ICH8 and ICH9 + * [POWERPC] Fix CPU hotplug when using the SLB shadow buffer + * [BLUETOOTH]: rfcomm tty BUG_ON() code fix + * [BLUETOOTH]: Always send explicit hci_ll wake-up acks. + * [DECNET] ROUTE: fix rcu_dereference() uses in /proc/net/decnet_cache + * [VLAN]: nested VLAN: fix lockdep's recursive locking warning + * [MACVLAN]: Prevent nesting macvlan devices + * [NETFILTER]: ip6t_eui64: Fixes calculation of Universal/Local bit + * [NETFILTER]: xt_helper: Do not bypass RCU + * [XFS] fix unaligned access in readdir + * Don't blatt first element of prv in sg_chain() + * loop: fix bad bio_alloc() nr_iovec request + * block: fix blktrace timestamps + * blktrace: kill the unneeded initcall + * V4L/DVB (6999): ivtv: stick to udelay=10 after all + * V4L/DVB (7001): av7110: fix section mismatch + * [MIPS] Wrong CONFIG option prevents setup of DMA zone. + * [MIPS] pnx8xxx: move to clocksource + * [MIPS] Malta: Fix software reset on big endian + * [MIPS] Lasat: Fix built in separate object directory. 
+ * [MIPS] Replace 40c7869b693b18412491fdcff64682215b739f9e kludge + * Pull bugzilla-5637 into release branch + * Pull bugzilla-8171 into release branch + * Pull bugzilla-8973 into release branch + * PM: ACPI and APM must not be enabled at the same time + * Pull bugzilla-9194 into release branch + * Pull bugzilla-9494 into release branch + * Pull bugzilla-9535 into release branch + * Pull bugzilla-9627 into release branch + * Pull bugzilla-9683 into release branch + * IDE: terminate ACPI DMI list + * cache invalidation error for buffered write + * ps3fb: prevent use after free of fb_info + * ps3fb: fix deadlock on kexec() + * [NETFILTER]: bridge: fix double POST_ROUTING invocation + * xircom_cb endianness fixes + * de4x5 fixes + * endianness noise in tulip_core + * netxen: update MAINTAINERS + * netxen: update driver version + * netxen: stop second phy correctly + * netxen: optimize tx handling + * netxen: fix byte-swapping in tx and rx + * 3c509: PnP resource management fix + * Fixed a small typo in the loopback driver + * ip1000: menu location change + * r8169: fix missing loop variable increment + * [usb netdev] asix: fix regression + * fs_enet: check for phydev existence in the ethtool handlers + * Use access mode instead of open flags to determine needed permissions + * sky2: large memory workaround. + * sky2: remove check for PCI wakeup setting from BIOS + * spidernet MAINTAINERship update + * pnpacpi: print resource shortage message only once + * Pull bugzilla-9535 into release branch + * [SPARC]: Make gettimeofday() monotonic again. + * [SPARC64]: Fix build with SPARSEMEM_VMEMMAP disabled. + * remove task_ppid_nr_ns + * knfsd: Allow NFSv2/3 WRITE calls to succeed when krb5i etc is used. + * Input: improve Kconfig help entries for HP Jornada devices + * [TOKENRING]: rif_timer not initialized properly + * modules: de-mutex more symbol lookup paths in the module code + * w1: decrement slave counter only in ->release() callback + * Kick CPUS that might be sleeping in cpus_idle_wait + * TPM: fix suspend and resume failure + * MAINTAINERS: email update and add missing entry + * quicklists: Only consider memory that can be used with GFP_KERNEL + * macintosh: fix fabrication of caplock key events + * scsi/qla2xxx/qla_os.c section fix + * cciss: section mismatch + * advansys: fix section mismatch warning + * hugetlbfs: fix quota leak + * s3c2410fb: fix incorrect argument type in resume function + * CRIS: define __ARCH_WANT_SYS_RT_SIGSUSPEND in unistd.h for CRIS + * CRIS v10: correct do_signal to fix oops and clean up signal handling in + general + * CRIS v10: kernel/time.c needs to include linux/vmstat.h to compile + * uvesafb: fix section mismatch warnings + * CRIS v10: driver for ds1302 needs to include cris-specific i2c.h + * OSS msnd: fix array overflows + * i2c-omap: Fix NULL pointer dereferencing + * i2c: Spelling fixes + * i2c: Driver IDs are optional + * i2c-sibyte: Fix an error path + * fix the "remove task_ppid_nr_ns" commit + * [MIPS] Kconfig fixes for BCM47XX platform + * [MIPS] Cobalt: Fix ethernet interrupts for RaQ1 + * [MIPS] Cobalt: Qube1 has no serial port so don't use it + * [MIPS] Cacheops.h: Fix typo. 
+ * ata_piix: ignore ATA_DMA_ERR on vmware ich4 + * sata_sil24: fix stupid typo + * sata_sil24: freeze on non-dev errors reported via CERR + * libata: relocate sdev->manage_start_stop configuration + * [POWERPC] Fix boot failure on POWER6 + * x86: fix boot crash on HIGHMEM4G && SPARSEMEM + * x86: asm-x86/msr.h: pull in linux/types.h + * x86: fix RTC_AIE with CONFIG_HPET_EMULATE_RTC + * Fix ARM profiling/instrumentation configuration + * Fix Blackfin HARDWARE_PM support + * libata fixes for sparse-found problems + * [libata] pata_bf54x: checkpatch fixes + * [libata] core checkpatch fix + * libata: correct handling of TSS DVD + * [IA64] Fix unaligned handler for floating point instructions with base + update + * Linux 2.6.24-rc8 + * lockdep: fix internal double unlock during self-test + * lockdep: fix workqueue creation API lockdep interaction + * lockdep: more hardirq annotations for notify_die() + * hostap: section mismatch warning + * wireless/libertas support for 88w8385 sdio older revision + * ipw2200: fix typo in kerneldoc + * b43: fix use-after-free rfkill bug + * rt2x00: Fix ieee80211 payload alignment + * sysfs: make sysfs_lookup() return ERR_PTR(-ENOENT) on failed lookup + * sysfs: fix bugs in sysfs_rename/move_dir() + * Use access mode instead of open flags to determine needed permissions + (CVE-2008-0001) + * IB/ipath: Fix receiving UD messages with immediate data + * [NET]: Fix TX timeout regression in Intel drivers. + * [NIU]: Fix 1G PHY link state handling. + * [SPARC64]: Fix hypervisor TLB operation error reporting. + * Input: mousedev - handle mice that use absolute coordinates + * Input: usbtouchscreen - fix buffer overflow, make more egalax work + * Input: psmouse - fix potential memory leak in psmouse_connect() + * Input: psmouse - fix input_dev leak in lifebook driver + * Input: ALPS - fix sync loss on Acer Aspire 5720ZG + * ipg: balance locking in irq handler + * ipg: plug Tx completion leak + * ipg: fix queue stop condition in the xmit handler + * ipg: fix Tx completion irq request + * cpufreq: Initialise default governor before use + * hfs: fix coverity-found null deref + * pnpacpi: print resource shortage message only once (more) + * CRIS v10: vmlinux.lds.S: ix kernel oops on boot and use common defines + * mm: fix section mismatch warning in page_alloc.c + * jbd: do not try lock_acquire after handle made invalid + * alpha: fix conversion from denormal float to double + * #ifdef very expensive debug check in page fault path + * Fix unbalanced helper_lock in kernel/kmod.c + * fix wrong sized spinlock flags argument + * bonding: fix locking in sysfs primary/active selection + * bonding: fix ASSERT_RTNL that produces spurious warnings + * bonding: fix locking during alb failover and slave removal + * bonding: release slaves when master removed via sysfs + * bonding: Fix up parameter parsing + * bonding: fix lock ordering for rtnl and bonding_rwsem + * bonding: Don't hold lock when calling rtnl_unlock + * Documentation: add a guideline for hard_start_xmit method + * atl1: fix frame length bug + * S2io: Fixed synchronization between scheduling of napi with card reset + and close + * dscc4 endian fixes + * wan/lmc bitfields fixes + * sbni endian fixes + * 3c574, 3c515 bitfields abuse + * dl2k: BMCR_t fixes + * dl2k: ANAR, ANLPAR fixes + * dl2k: BMSR fixes + * dl2k: MSCR, MSSR, ESR, PHY_SCR fixes + * dl2k: the rest + * Replace cpmac fix + * [WATCHDOG] Revert "Stop looking for device as soon as one is found" + * [WATCHDOG] clarify watchdog operation in documentation + * x86: 
add support for the latest Intel processors to Oprofile + * Selecting LGUEST should turn on Guest support, as in 2.6.23. + * ARM: OMAP1: Keymap fix for f-sample and p2-sample + * ARM: OMAP1: Fix compile for board-nokia770 + * pata_pdc202xx_old: Fix crashes with ATAPI + * arch: Ignore arch/i386 and arch/x86_64 + * Remove bogus duplicate CONFIG_LGUEST_GUEST entry. + * [ARM] pxa: don't rely on r2 being preserved over a function call + * [ARM] 4748/1: dca: source drivers/dca/Kconfig in arch/arm/Kconfig to + fix warning + * rfkill: call rfkill_led_trigger_unregister() on error + * [IPV6]: Mischecked tw match in __inet6_check_established. + * [IPV4] fib_hash: fix duplicated route issue + * [IPV4] fib_trie: fix duplicated route issue + * [NET]: Fix interrupt semaphore corruption in Intel drivers. + * [IPV4] FIB_HASH : Avoid unecessary loop in fn_hash_dump_zone() + * [IPV6] ROUTE: Make sending algorithm more friendly with RFC 4861. + * [NETFILTER]: bridge-netfilter: fix net_device refcnt leaks + * [NEIGH]: Revert 'Fix race between neigh_parms_release and + neightbl_fill_parms' + * [IrDA]: af_irda memory leak fixes + * [ATM] atm/idt77105.c: Fix section mismatch. + * [ATM] atm/suni.c: Fix section mismatch. + * [AF_KEY]: Fix skb leak on pfkey_send_migrate() error + * [NET]: rtnl_link: fix use-after-free + * [IPV6]: ICMP6_MIB_OUTMSGS increment duplicated + * [IPV6]: RFC 2011 compatibility broken + * [ICMP]: ICMP_MIB_OUTMSGS increment duplicated + * selinux: fix memory leak in netlabel code + * [MIPS] SMTC: Fix build error. + * [MIPS] Malta: Fix reading the PCI clock frequency on big-endian + * tc35815: Use irq number for tc35815-mac platform device id + * keyspan: fix oops + * hrtimer: fix section mismatch + * timer: fix section mismatch + * CRIS: add missed local_irq_restore call + * s3c2410_fb: fix line length calculation + * Fix filesystem capability support + * sched: group scheduler, set uid share fix + * hwmon: (it87) request only Environment Controller ports + * W1: w1_therm.c ds18b20 decode freezing temperatures correctly + * W1: w1_therm.c is flagging 0C etc as invalid + * rcu: fix section mismatch + * Fix file references in documentation and Kconfig + * x86: GEODE fix a race condition in the MFGPT timer tick + * virtnet: remove double ether_setup + * virtio:simplify-config-mechanism + * virtio: An entropy device, as suggested by hpa. + * virtio: Export vring functions for modules to use + * virtio: Put the virtio under the virtualization menu + * virtio:pci-device + * Fix vring_init/vring_size to take unsigned long + * virtio:vring-kick-when-empty + * virtio:explicit-callback-disable + * virtio:net-flush-queue-on-init + * virtio:net-fix-xmit-skb-free-real + * Parametrize the napi_weight for virtio receive queue. + * Handle module unload Add the device release function. + * Update all status fields on driver unload + * Make virtio modules GPL + * Make virtio_pci license be GPL2+ + * Use Qumranet donated PCI vendor/device IDs + * virtio:more-interrupt-suppression + * Reboot Implemented + * lguest:reboot-fix + * introduce vcpu struct + * adapt lguest launcher to per-cpuness + * initialize vcpu + * per-cpu run guest + * make write() operation smp aware + * make hypercalls use the vcpu struct + * per-vcpu lguest timers + * per-vcpu interrupt processing. + * map_switcher_in_guest() per-vcpu + * make emulate_insn receive a vcpu struct. + * make registers per-vcpu + * replace lguest_arch with lg_cpu_arch. 
+ * per-vcpu lguest task management + * makes special fields be per-vcpu + * make pending notifications per-vcpu + * per-vcpu lguest pgdir management + + -- Tim Gardner Thu, 17 Jan 2008 14:45:01 -0700 + +linux (2.6.24-4.7) hardy; urgency=low + + [Amit Kucheria] + + * Poulsbo: Add SD8686 and 8688 WLAN drivers + * Poulsbo: Mass update of patches to be identical to those on moblin + * SAUCE: make fc transport removal of target configurable OriginalAuthor: + Michael Reed sgi.com> OriginalLocation: + http://thread.gmane.org/gmane.linux.scsi/25318 Bug: 163075 + + [Fabio M. Di Nitto] + + * Fix handling of gcc-4.1 for powerpc and ia64 + + [Tim Gardner] + + * Re-engineered architecture specific linux-headers compiler version + dependencies. + * Doh! Changed header-depends to header_depends. + + -- Tim Gardner Fri, 11 Jan 2008 07:10:46 -0700 + +linux (2.6.24-4.6) hardy; urgency=low + + [Alessio Igor Bogani] + + * Fix -rt build FTBS. + + [Amit Kucheria] + + * LPIACOMPAT: Update thermal patches to be inline with lpia flavour + * Poulsbo: Add USB Controller patch and corresponding config change + + [Fabio M. Di Nitto] + + * Enable aoe and nbd modules on hppa Ignore: yes + * Fix ia64 build by using gcc-4.1 + + [Tim Gardner] + + * Enable JFFS2 LZO compression. + - LP: #178343 + * Remove IS_G33 special handling. + - LP: #174367 + * Enabled CONFIG_SECURITY_CAPABILITIES and + CONFIG_SECURITY_FILE_CAPABILITIES + - LP: #95089 + * Enabled CONFIG_TASKSTATS and CONFIG_TASK_IO_ACCOUNTING + * Turned CONFIG_SECURITY_FILE_CAPABILITIES back off. + * Enabled CONFIG_B43LEGACY=m + * Enabled CONFIG_SCSI_QLOGIC_1280=m + * Enabled CONFIG_FUSION=y for virtual + * USB bluetooth device 0x0e5e:0x6622 floods errors to syslog + - LP: #152689 + * Removed lpia from d-i. + * Added ia64 modules. + * Added hppa32/64 modules. + + [Upstream Kernel Changes] + + * DMI autoload dcdbas on all Dell systems. + * sched: fix gcc warnings + * leds: Fix leds_list_lock locking issues + * leds: Fix locomo LED driver oops + * x86: fix asm-x86/byteorder.h for userspace export + * x86: fix asm-x86/msr.h for user-space export + * fix lguest rmmod "bad pgd" + * slub: provide /proc/slabinfo + * [POWERPC] Fix build failure on Cell when CONFIG_SPU_FS=y + * slub: register slabinfo to procfs + * [SCSI] scsi_sysfs: restore prep_fn when ULD is removed + * Unify /proc/slabinfo configuration + * scsi: revert "[SCSI] Get rid of scsi_cmnd->done" + * restrict reading from /proc//maps to those who share ->mm or can + ptrace pid + * Fix kernel/ptrace.c compile problem (missing "may_attach()") + * hwmon: (w83627ehf) Be more careful when changing VID input level + * NFS: Fix a possible Oops in fs/nfs/super.c + * NFSv4: Fix circular locking dependency in nfs4_kill_renewd + * NFS: add newline to kernel warning message in auth_gss code + * NFSv4: nfs4_open_confirm must not set the open_owner as confirmed on + error + * NFSv4: Fix open_to_lock_owner sequenceid allocation... + * IB/srp: Fix list corruption/oops on module reload + * Console is utf-8 by default + * [IA64] Update Altix BTE error return status patch + * [IA64] Update Altix nofault code + * [X25]: Add missing x25_neigh_put + * [XFRM]: Do not define km_migrate() if !CONFIG_XFRM_MIGRATE + * [CASSINI]: Fix endianness bug. + * [CASSINI]: Revert 'dont touch page_count'. + * [CASSINI]: Program parent Intel31154 bridge when necessary. + * [CASSINI]: Set skb->truesize properly on receive packets. + * [CASSINI]: Fix two obvious NAPI bugs. + * [CASSINI]: Bump driver version and release date. 
+ * [INET]: Fix netdev renaming and inet address labels + * [CONNECTOR]: Return proper error code in cn_call_callback() + * [ISDN] i4l: 'NO CARRIER' message lost after ldisc flush + * [ISDN]: i4l: Fix DLE handling for i4l-audio + * fix: using joysticks in 32 bit applications on 64 bit systems + * hda_intel suspend latency: shorten codec read + * CPU hotplug: fix cpu_is_offline() on !CONFIG_HOTPLUG_CPU + * Linux 2.6.24-rc7 + * PIE executable randomization (upstream cherry pick by kees) + + -- Tim Gardner Fri, 04 Jan 2008 07:15:47 -0700 + +linux (2.6.24-3.5) hardy; urgency=low + + [Alessio Igor Bogani] + + * rt: Fix rt preempt patchset version + * Updated README file for binary custom flavours + * Fix -rt build FTBS. + * rt: Update configuration files + + [Tim Gardner] + + * SAUCE: Add extra headers to linux-libc-dev + + [Upstream Kernel Changes] + + * [WATCHDOG] at32ap700x_wdt: add support for boot status and add fix for + silicon errata + * [WATCHDOG] Stop looking for device as soon as one is found + * [WATCHDOG] bfin_wdt, remove SPIN_LOCK_UNLOCKED + * [WATCHDOG] Sbus: cpwatchdog, remove SPIN_LOCK_UNLOCKED + * [WATCHDOG] IT8212F watchdog driver + * ACPI: acpiphp: Remove dmesg spam on device remove + * [WATCHDOG] ipmi: add the standard watchdog timeout ioctls + * [WATCHDOG] add Nano 7240 driver + * ACPI: battery: fix ACPI battery technology reporting + * [ARM] 4667/1: CM-X270 fixes + * [ARM] 4690/1: PXA: fix CKEN corruption in PXA27x AC97 cold reset code + * [IPV6] XFRM: Fix auditing rt6i_flags; use RTF_xxx flags instead of + RTCF_xxx. + * [IPV4]: Swap the ifa allocation with the"ipv4_devconf_setall" call + * [IPv4] ESP: Discard dummy packets introduced in rfc4303 + * [IPv6] ESP: Discard dummy packets introduced in rfc4303 + * [UM]: Fix use of skb after netif_rx + * [XTENSA]: Fix use of skb after netif_rx + * [S390]: Fix use of skb after netif_rx + * [BNX2]: Add PHY_DIS_EARLY_DAC workaround. + * [BNX2]: Fix RX packet rot. + * [BNX2]: Update version to 1.6.9. + * [NET]: Fix wrong comments for unregister_net* + * [VLAN]: Fix potential race in vlan_cleanup_module vs + vlan_ioctl_handler. + * [IPSEC]: Fix potential dst leak in xfrm_lookup + * V4L/DVB (6485): ivtv: fix compile warning + * V4L/DVB (6540): em28xx: fix failing autodetection after the reboot + * V4L/DVB (6542): Fix S-video mode on tvp5150 + * V4L/DVB (6579): Fix bug #8824: Correct support for Diseqc on tda10086 + * V4L/DVB (6581): Fix: avoids negative vma usage count + * V4L/DVB (6601): V4L: videobuf-core locking fixes and comments + * V4L/DVB (6602): V4L: Convert videobuf drivers to videobuf_stop + * V4L/DVB (6615): V4L: Fix VIDIOCGMBUF locking in saa7146 + * V4L/DVB (6629): zl10353: fix default adc_clock and TRL nominal rate + calculation + * V4L/DVB (6666): saa7134-alsa: fix period handling + * V4L/DVB (6684): Complement va_start() with va_end() + style fixes + * V4L/DVB (6686): saa7134: fix composite over s-video input on the Tevion + MD 9717 + * V4L/DVB (6690): saa7134: fix ignored interrupts + * V4L/DVB (6751): V4L: Memory leak! Fix count in videobuf-vmalloc mmap + * V4L/DVB (6746): saa7134-dvb: fix tuning for WinTV HVR-1110 + * V4L/DVB (6750): Fix in-kernel compilation for cxusb + * V4L/DVB (6733): DVB: Compile 3000MC-specific DIB code only for + CONFIG_DVB_DIB3000MC + * V4L/DVB (6794): Fix compilation when dib3000mc is compiled as a module + * NFS: Fix NFS mountpoint crossing... 
+ * V4L/DVB (6796): ivtv/ section fix + * V4L/DVB (6797): bt8xx/ section fixes + * NFSv2/v3: Fix a memory leak when using -onolock + * V4L/DVB (6609): Re-adds lock safe videobuf_read_start + * i2c: Delete an outdated piece of documentation + * i2c-gpio: Initialize adapter class + * i2c: Add missing spaces in split log messages + * i2c/isp1301_omap: Build fix + * [SERIAL] sparc: Infrastructure to fix section mismatch bugs. + * NFS: Fix an Oops in NFS unmount + * sdhci: describe quirks + * sdhci: don't warn about sdhci 2.0 controllers + * sdhci: use PIO when DMA can't satisfy the request + * sdhci: support JMicron JMB38x chips + * mmc: remove unused 'mode' from the mmc_host structure + * IB/ehca: Return correct number of SGEs for SRQ + * IB/ehca: Serialize HCA-related hCalls if necessary + * ide-scsi: add ide_scsi_hex_dump() helper + * ide: add missing checks for control register existence + * ide: deprecate CONFIG_BLK_DEV_OFFBOARD + * ide: fix ide_scan_pcibus() error message + * ide: coding style fixes for drivers/ide/setup-pci.c + * ide: add /sys/bus/ide/devices/*/{model,firmware,serial} sysfs entries + * ide: DMA reporting and validity checking fixes (take 3) + * ide-cd: remove dead post_transform_command() + * pdc202xx_new: fix Promise TX4 support + * hpt366: fix HPT37x PIO mode timings (take 2) + * ide: remove dead code from __ide_dma_test_irq() + * ide: remove stale changelog from ide-disk.c + * ide: remove stale changelog from ide-probe.c + * ide: fix ->io_32bit race in set_io_32bit() + * MAINTAINERS: update the NFS CLIENT entry + * V4L/DVB (6803): buf-core.c locking fixes + * [SPARC64]: Fix two kernel linear mapping setup bugs. + * IB/ehca: Fix lock flag variable location, bump version number + * kbuild: re-enable Makefile generation in a new O=... directory + * V4L/DVB (6798): saa7134: enable LNA in analog mode for Hauppauge WinTV + HVR-1110 + * V4L/DVB (6814): Makefile: always enter video/ + * V4L/DVB (6819): i2c: fix drivers/media/video/bt866.c + * V4L/DVB (6820): s5h1409: QAM SNR related fixes + * ACPI: video_device_list corruption + * ACPI: fix modpost warnings + * ACPI: thinkpad-acpi: fix lenovo keymap for brightness + * Pull thinkpad-2.6.24 into release branch + * Pull battery-2.6.24 into release branch + * [POWERPC] Fix typo #ifdef -> #ifndef + * [POWERPC] Kill non-existent symbols from ksyms and commproc.h + * [POWRPC] CPM2: Eliminate section mismatch warning in cpm2_reset(). + * [POWERPC] 82xx: mpc8272ads, pq2fads: Update defconfig with + CONFIG_FS_ENET_MDIO_FCC + * [POWERPC] iSeries: don't printk with HV spinlock held + * [POWERPC] Fix rounding bug in emulation for double float operating + * [POWERPC] Make PS3_SYS_MANAGER default y, not m + * [MIPS] time: Set up Cobalt's mips_hpt_frequency + * [MIPS] Alchemy: fix PCI resource conflict + * [MIPS] Alchemy: fix off by two error in __fixup_bigphys_addr() + * [MIPS] Atlas, Malta: Don't free firmware memory on free_initmem. + * [MIPS] PCI: Make pcibios_fixup_device_resources ignore legacy + resources. + * [MIPS] time: Delete weak definition of plat_time_init() due to gcc bug. + * [MIPS] Ensure that ST0_FR is never set on a 32 bit kernel + * [SPARC32]: Silence sparc32 warnings on missing syscalls. 
+ * Pull hotplug into release branch + * ACPI: SBS: Reset alarm bit + * ACPI: SBS: Ignore alarms coming from unknown devices + * ACPI: SBS: Return rate in mW if capacity in mWh + * Pull bugzilla-9362 into release branch + * sky2: RX lockup fix + * sundance fixes + * starfire VLAN fix + * e100: free IRQ to remove warningwhenrebooting + * hamachi endianness fixes + * drivers/net/sis190.c section fix + * drivers/net/s2io.c section fixes + * ucc_geth: minor whitespace fix + * net: smc911x: shut up compiler warnings + * Net: ibm_newemac, remove SPIN_LOCK_UNLOCKED + * ixgb: make sure jumbos stay enabled after reset + * [NETFILTER]: ctnetlink: set expected bit for related conntracks + * [NETFILTER]: ip_tables: fix compat copy race + * [XFRM]: Display the audited SPI value in host byte order. + * [NETFILTER]: xt_hashlimit should use time_after_eq() + * [TIPC]: Fix semaphore handling. + * [SYNCPPP]: Endianness and 64bit fixes. + * [NETFILTER]: bridge: fix missing link layer headers on outgoing routed + packets + * [ATM]: Fix compiler warning noise with FORE200E driver + * [IPV4]: Updates to nfsroot documentation + * [BRIDGE]: Assign random address. + * [IPV6]: Fix the return value of ipv6_getsockopt + * [IPV4]: Make tcp_input_metrics() get minimum RTO via tcp_rto_min() + * [AX25]: Locking dependencies fix in ax25_disconnect(). + * [SCTP]: Flush fragment queue when exiting partial delivery. + * [IRDA]: Race between open and disconnect in irda-usb. + * [IRDA]: mcs7780 needs to free allocated rx buffer. + * [IRDA]: irlmp_unregister_link() needs to free lsaps. + * [IRDA]: stir4200 fixes. + * [IRDA]: irda parameters warning fixes. + * [S390] pud_present/pmd_present bug. + * [ARM] 4710/1: Fix coprocessor 14 usage for debug messages via ICEDCC + * [ARM] 4694/1: IXP4xx: Update clockevent support for shutdown and resume + * kobject: fix the documentation of how kobject_set_name works + * tipar: remove obsolete module + * HOWTO: Change man-page maintainer address for Japanese HOWTO + * Add Documentation for FAIR_USER_SCHED sysfs files + * HOWTO: change addresses of maintainer and lxr url for Korean HOWTO + * add stable_api_nonsense.txt in korean + * HOWTO: update misspelling and word incorrected + * PCI: Restore PCI expansion ROM P2P prefetch window creation + * USB: sierra: fix product id + * usb-storage: Fix devices that cannot handle 32k transfers + * USB: cp2101: new device id + * USB: option: Bind to the correct interface of the Huawei E220 + * usb.h: fix kernel-doc warning + * USB: fix locking loop by avoiding flush_scheduled_work + * USB: use IRQF_DISABLED for HCD interrupt handlers + * USB: at91_udc: correct hanging while disconnecting usb cable + * usb: Remove broken optimisation in OHCI IRQ handler + * USB: revert portions of "UNUSUAL_DEV: Sync up some reported devices + from Ubuntu" + * ocfs2: fix exit-while-locked bug in ocfs2_queue_orphans() + * ocfs2: Don't panic when truncating an empty extent + * ocfs2: Allow for debugging of transaction extends + * ocfs2: Re-journal buffers after transaction extend + * pcnet_cs: add new id + * ucc_geth: really fix section mismatch + * sis190 endianness + * libertas: add Dan Williams as maintainer + * zd1211rw: Fix alignment problems + * wireless/ipw2200.c: add __dev{init,exit} annotations + * ieee80211_rate: missed unlock + * iwlwifi3945/4965: fix rate control algo reference leak + * libertas: select WIRELESS_EXT + * bcm43xx_debugfs sscanf fix + * b43: Fix rfkill radio LED + * iwlwifi: fix rf_kill state inconsistent during suspend and resume + * sata_sil: 
fix spurious IRQ handling + * libata: clear link->eh_info.serror from ata_std_postreset() + * libata: add ST3160023AS / 3.42 to NCQ blacklist + * sata_mv: improve warnings about Highpoint RocketRAID 23xx cards + * libata-acpi: adjust constness in ata_acpi_gtm/stm() parameters + * libata: update ata_*_printk() macros such that level can be a variable + * libata: add more opcodes to ata.h + * libata: ata_dev_disable() should be called from EH context + * libata-acpi: add new hooks ata_acpi_dissociate() and + ata_acpi_on_disable() + * libata-acpi: implement and use ata_acpi_init_gtm() + * libata-acpi: implement dev->gtf_cache and evaluate _GTF right after + _STM during resume + * libata-acpi: improve ACPI disabling + * libata-acpi: improve _GTF execution error handling and reporting + * libata-acpi: implement _GTF command filtering + * libata: update atapi_eh_request_sense() such that lbam/lbah contains + buffer size + * libata: fix ATAPI draining + * fix headers_install + * revert "Hibernation: Use temporary page tables for kernel text mapping + on x86_64" + * uml: stop gdb from deleting breakpoints when running UML + * alpha: strncpy/strncat fixes + * rtc-at32ap700x: fix irq init oops + * parport: "dev->timeslice" is an unsigned long, not an int + * ecryptfs: initialize new auth_tokens before teardown + * Fix lguest documentation + * sparsemem: make SPARSEMEM_VMEMMAP selectable + * fs/Kconfig: grammar fix + * ext3, ext4: avoid divide by zero + * alpha: build fixes + * cpufreq: fix missing unlocks in cpufreq_add_dev error paths. + * mm/sparse.c: check the return value of sparse_index_alloc() + * mm/sparse.c: improve the error handling for sparse_add_one_section() + * pktcdvd: add kobject_put when kobject register fails + * drivers/macintosh/via-pmu.c: Added a missing iounmap + * drivers/cpufreq/cpufreq_stats.c section fix + * apm_event{,info}_t are userspace types + * mm: fix page allocation for larger I/O segments + * ecryptfs: set s_blocksize from lower fs in sb + * I/OAT: fixups from code comments + * I/OAT: fix null device in call to dev_err() + * fix bloat-o-meter for ppc64 + * ecryptfs: fix fsx data corruption problems + * Documentation: update hugetlb information + * Fix compilation warning in dquot.c + * SLUB: remove useless masking of GFP_ZERO + * quicklist: Set tlb->need_flush if pages are remaining in quicklist 0 + * sysctl: fix ax25 checks + * [XFS] Don't wait for pending I/Os when purging blocks beyond eof. + * [XFS] Put the correct offset in dirent d_off + * block: use jiffies conversion functions in scsi_ioctl.c + * as-iosched: fix incorrect comments + * as-iosched: fix write batch start point + * block: let elv_register() return void + * Cleanup umem driver: fix most checkpatch warnings, conform to kernel + * sched: fix crash on ia64, introduce task_current() + * sched: mark rwsem functions as __sched for wchan/profiling + * sched: sysctl, proc_dointvec_minmax() expects int values for + * sched: touch softlockup watchdog after idling + * sched: do not hurt SCHED_BATCH on wakeup + * oprofile: op_model_athlon.c support for AMD family 10h barcelona + performance counters + * clockevents: fix reprogramming decision in oneshot broadcast + * genirq: add unlocked version of set_irq_handler() + * timer: kernel/timer.c section fixes + * x86: jprobe bugfix + * x86: kprobes bugfix + * x86: also define AT_VECTOR_SIZE_ARCH + * genirq: revert lazy irq disable for simple irqs + * x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" 
+ * [SCSI] sym53c8xx: fix free_irq() regression + * [SCSI] dpt_i2o: driver is only 32 bit so don't set 64 bit DMA mask + * [SCSI] sym53c8xx: fix "irq X: nobody cared" regression + * [SCSI] initio: fix conflict when loading driver + * [SCSI] st: fix kernel BUG at include/linux/scatterlist.h:59! + * [SCSI] initio: bugfix for accessors patch + * IA64: Slim down __clear_bit_unlock + * [IA64] signal: remove redundant code in setup_sigcontext() + * [IA64] ia32 nopage + * [IA64] Avoid unnecessary TLB flushes when allocating memory + * [IA64] Two trivial spelling fixes + * [IA64] print kernel release in OOPS to make kerneloops.org happy + * [IA64] set_thread_area fails in IA32 chroot + * [IA64] Remove compiler warinings about uninitialized variable in + irq_ia64.c + * [IA64] Remove assembler warnings on head.S + * [IA64] Fix Altix BTE error return status + * [IA64] Guard elfcorehdr_addr with #if CONFIG_PROC_FS + * [IA64] make flush_tlb_kernel_range() an inline function + * [IA64] Adjust CMCI mask on CPU hotplug + * Do dirty page accounting when removing a page from the page cache + * x86 apic_32.c section fix + * x86 smpboot_32.c section fixes + * x86_32: select_idle_routine() must be __cpuinit + * x86_32: disable_pse must be __cpuinitdata + * x86: fix show cpuinfo cpu number always zero + * ps3fb: Update for firmware 2.10 + * ps3fb: Fix ps3fb free_irq() dev_id + * pata_hpt37x: Fix HPT374 detection + * mac80211: Drop out of associated state if link is lost + * mac80211: fix header ops + * NET: mac80211: fix inappropriate memory freeing + * [TG3]: Endianness annotations. + * [TG3]: Endianness bugfix. + * rtl8187: Add USB ID for Sitecom WL-168 v1 001 + * p54: add Kconfig description + * iwlwifi: fix possible priv->mutex deadlock during suspend + * ipw2200: prevent alloc of unspecified size on stack + * [IPV4] ARP: Remove not used code + * [IPSEC]: Avoid undefined shift operation when testing algorithm ID + * [XFRM]: Audit function arguments misordered + * [IPV4] ip_gre: set mac_header correctly in receive path + * [NET]: Correct two mistaken skb_reset_mac_header() conversions. 
+ * [SPARC64]: Fix OOPS in dma_sync_*_for_device() + * sched: rt: account the cpu time during the tick + * debug: add end-of-oops marker + * mm: fix exit_mmap BUG() on a.out binary exit + * dm: table detect io beyond device + * dm mpath: hp requires scsi + * dm crypt: fix write endio + * dm: trigger change uevent on rename + * dm: merge max_hw_sector + * dm crypt: use bio_add_page + * [SPARC64]: Spelling fixes + * [SPARC32]: Spelling fixes + * [NET] include/net/: Spelling fixes + * [DCCP]: Spelling fixes + * [IRDA]: Spelling fixes + * [IPV6]: Spelling fixes + * [NET] net/core/: Spelling fixes + * [PKT_SCHED]: Spelling fixes + * [NETLABEL]: Spelling fixes + * [SCTP]: Spelling fixes + * [NETFILTER]: Spelling fixes + * [NETFILTER] ipv4: Spelling fixes + * [ATM]: Spelling fixes + * [NET]: Fix function put_cmsg() which may cause usr application memory + overflow + * x86: fix die() to not be preemptible + * x86: intel_cacheinfo.c: cpu cache info entry for Intel Tolapai + * [XFS] Fix mknod regression + * [XFS] Initialise current offset in xfs_file_readdir correctly + * Linux 2.6.24-rc6 + * [IPV4]: OOPS with NETLINK_FIB_LOOKUP netlink socket + * SLUB: Improve hackbench speed + * typhoon: endianness bug in tx/rx byte counters + * typhoon: missing le32_to_cpu() in get_drvinfo + * typhoon: set_settings broken on big-endian + * typhoon: missed rx overruns on big-endian + * typhoon: memory corruptor on big-endian if TSO is enabled + * typhoon: trivial endianness annotations + * cycx: annotations and fixes (.24 fodder?) + * asix fixes + * yellowfin: annotations and fixes (.24 fodder?) + * dl2k endianness fixes (.24 fodder?) + * r8169 endianness + * rrunner: use offsetof() instead of homegrown insanity + * 3c574 and 3c589 endianness fixes (.24?) + * fec_mpc52xx: write in C... + * 3c359 endianness annotations and fixes + * MACB: clear transmit buffers properly on transmit underrun + * UIO: Add a MAINTAINERS entry for Userspace I/O + * Modules: fix memory leak of module names + * USB: Unbreak fsl_usb2_udc + * USB: VID/PID update for sierra + * USB: New device ID for the CP2101 driver + * quicklists: do not release off node pages early + * ecryptfs: fix string overflow on long cipher names + * Fix computation of SKB size for quota messages + * Don't send quota messages repeatedly when hardlimit reached + * ecryptfs: fix unlocking in error paths + * ecryptfs: redo dget,mntget on dentry_open failure + * MAINTAINERS: mailing list archives are web links + * ps3: vuart: fix error path locking + * lib: proportion: fix underflow in prop_norm_percpu() + * pcmcia: remove pxa2xx_lubbock build warning + * kconfig: obey KCONFIG_ALLCONFIG choices with randconfig. 
+ * tty: fix logic change introduced by wait_event_interruptible_timeout() + * uml: user of helper_wait() got missed when it got extra arguments + * V4L/DVB (6871): Kconfig: VIDEO_CX23885 must select DVB_LGDT330X + * V4L/DVB (6876): ivtv: mspx4xx needs a longer i2c udelay + * drivers/ide/: Spelling fixes + * ide-cd: fix SAMSUNG CD-ROM SCR-3231 quirk + * ide-cd: fix ACER/AOpen 24X CDROM speed reporting on big-endian machines + * ide-cd: use ide_cd_release() in ide_cd_probe() + * ide-cd: fix error messages in cdrom_{read,write}_check_ireason() + * ide-cd: add missing 'ireason' masking to cdrom_write_intr() + * ide-cd: fix error messages in cdrom_write_intr() + * ide-cd: add error message for DMA error to cdrom_read_intr() + * ide-cd: fix error message in cdrom_pc_intr() + * ide-cd: fix 'ireason' reporting in cdrom_pc_intr() + * MAINTAINERS: update ide-cd entry + * [SPARC64]: Implement pci_resource_to_user() + * mac80211: round station cleanup timer + * mac80211: warn when receiving frames with unaligned data + * [NETFILTER]: nf_conntrack_ipv4: fix module parameter compatibility + * [TUNTAP]: Fix wrong debug message. + * [NET] tc_nat: header install + * [VETH]: move veth.h to include/linux + * [IPV4]: Fix ip command line processing. + * Revert quicklist need->flush fix + * [CRYPTO] padlock: Fix spurious ECB page fault + * [POWERPC] Oprofile: Remove dependency on spufs module + * [POWERPC] PS3: Fix printing of os-area magic numbers + * [PCI] Do not enable CRS Software Visibility by default + * [IPV4] Fix ip=dhcp regression + * [SERIAL]: Fix section mismatches in Sun serial console drivers. + * [TCP]: use non-delayed ACK for congestion control RTT + * [BLUETOOTH]: put_device before device_del fix + + -- Tim Gardner Sat, 22 Dec 2007 15:16:11 -0700 + +linux (2.6.24-2.4) hardy; urgency=low + + [Alessio Igor Bogani] + + * rt: First import for Hardy + + [Amit Kucheria] + + * LPIA: Fix FTBFS for hda + * LPIA: Trim configs including disabling stock DRM + + [Tim Gardner] + + * SAUCE: Increase CONFIG_IDE_MAX_HWIFS to 8 (from 4) + - LP: #157909 + Then reverted since it causes an ABI bump. Will pick it up + again when next the ABI changes. + * Expose apm for applications. + + -- Tim Gardner Wed, 19 Dec 2007 13:17:31 -0700 + +linux (2.6.24-2.3) hardy; urgency=low + + [Amit Kucheria] + + * LPIA: Add thermal framework from Intel + * LPIA: Poulsbo-specific patches + * LPIA: Add thermal framework from Intel + + [Tim Gardner] + + * SAUCE: hdaps module does not load on Thinkpad T61P + - LP: #133636 + + [Upstream Kernel Changes] + + * Rebased against 2.6.24-rc5 + + -- Tim Gardner Wed, 12 Dec 2007 13:58:52 -0700 + +linux (2.6.24-1.2) hardy; urgency=low + + [Ben Collins] + + * cell: Remove cell custom flavour, merged upstream + * apparmor: Added module from SVN repo + * ubuntu: Update configs to enable apparmor + * ubuntu/configs: Disable vga type framebuffers on hppa32. Fixes FTBFS + + [Tim Gardner] + + * Add support for PPA builds. + + [Upstream Kernel Changes] + + * [SPARC64] Export symbols for sunvnet and sunvdc to be built modular + + -- Ben Collins Fri, 07 Dec 2007 15:18:32 -0500 + +linux (2.6.24-1.1) hardy; urgency=low + + [Ben Collins] + + * ubuntu: Disable custom binary flavours for now + * ubuntu: Remove cruft in headers-postinst + * ubuntu: Set skipabi/skipmodule to true if prev_revions == 0.0 + * ubuntu: Do not fail on missing module lists when skipmodule is set + * ubuntu: capability.ko is built-in now, no need to place in initrd. 
+ * ubuntu: Change to "linux" instead of "linux-source-2.6.x" + * d-i: cdrom-modules disappeared, and sha256/aes modules renamed. + * ubuntu-build: Add asm_link= to arch rules, and use them + * config: Re-enable snd-hda-intel + + -- Ben Collins Wed, 28 Nov 2007 12:58:37 -0500 + +linux-source-2.6.22 (2.6.22-14.46) gutsy; urgency=low + + [Upstream Kernel Changes] + + * [SPARC64]: Fix bugs in SYSV IPC handling in 64-bit processes. + + -- Kyle McMartin Sun, 14 Oct 2007 20:30:09 +0000 + +linux-source-2.6.22 (2.6.22-14.45) gutsy; urgency=low + + [Upstream Kernel Changes] + + * [SPARC64]: Fix register usage in xor_raid_4(). + + -- Kyle McMartin Sun, 14 Oct 2007 12:34:44 -0400 + +linux-source-2.6.22 (2.6.22-14.44) gutsy; urgency=low + + [Kyle McMartin] + + * Revert "sparc wants ehci built in" + + [Upstream Kernel Changes] + + * Revert "[PATCH]: Gutsy OHCI hang workaround for Huron" + * [USB]: Serialize EHCI CF initialization. + + -- Kyle McMartin Sun, 14 Oct 2007 16:25:51 +0000 + +linux-source-2.6.22 (2.6.22-14.43) gutsy; urgency=low + + [Kyle McMartin] + + * sparc wants ehci built in + + -- Kyle McMartin Tue, 09 Oct 2007 20:07:58 +0000 + +linux-source-2.6.22 (2.6.22-14.42) gutsy; urgency=low + + [Kyle McMartin] + + * fix up module-check to bail early if asked to ignore modules + * disable kernel DRM on lpia (we provide one in lum) + - LP: #145168 + * add ignore for ia64 abi too + + [Upstream Kernel Changes] + + * [NIU]: Use netif_msg_*(). + * [NIU]: Use pr_info(). + * [NIU]: Remove redundant BUILD_BUG_ON() in __niu_wait_bits_clear(). + * [NIU]: Remove BUG_ON() NULL pointer checks. + * [NIU]: Use dev_err(). + * [NIU]: Fix x86_64 build failure. + * [NIU]: Use linux/io.h instead of asm/io.h + * [NIU]: Fix some checkpatch caught coding style issues. + * [NIU]: Fix shadowed local variables. + * [NIU]: Fix locking errors in link_status_10g(). + * [NIU]: Document a few magic constants using comments. + * [NIU]: MII phy handling fixes. + * [NIU]: Make sure link_up status is set to something in + link_status_{1,10}g(). + * [PATCH]: Gutsy OHCI hang workaround for Huron + + -- Kyle McMartin Tue, 09 Oct 2007 17:25:06 +0000 + +linux-source-2.6.22 (2.6.22-14.41) gutsy; urgency=low + + [Ben Collins] + + * ubuntu/d-i: Add niu to nic-modules + + [Kyle McMartin] + + * vesafb is not for ia64 + * remove CONFIG_NIU from places it shouldn't be + * fix orinoco_cs oops + - LP: #149997 + + [Upstream Kernel Changes] + + * [SPARC64]: Allow userspace to get at the machine description. + * [SPARC64]: Niagara-2 optimized copies. + * [SPARC64]: Do not touch %tick_cmpr on sun4v cpus. + * [SPARC64]: SMP trampoline needs to avoid %tick_cmpr on sun4v too. + * [SPARC64]: Create a HWCAP_SPARC_N2 and report it to userspace on + Niagara-2. + * [MATH-EMU]: Fix underflow exception reporting. + * [SPARC64]: Need to clobber global reg vars in switch_to(). + * [MATH]: Fix typo in FP_TRAPPING_EXCEPTIONS default setting. + * [SUNVDC]: Use slice 0xff on VD_DISK_TYPE_DISK. + * [SPARC64]: Fix type and constant sizes wrt. sun4u IMAP/ICLR handling. + * [SPARC64]: Enable MSI on sun4u Fire PCI-E controllers. + * [SPARC64]: Fix several bugs in MSI handling. + * [SPARC64]: Fix booting on V100 systems. + * [SPARC64]: Fix lockdep, particularly on SMP. + * [SPARC64]: Warn user if cpu is ignored. + * [SUNSAB]: Fix several bugs. + * [SUNSAB]: Fix broken SYSRQ. + * [SPARC64]: Fix missing load-twin usage in Niagara-1 memcpy. + * [SPARC64]: Don't use in/local regs for ldx/stx data in N1 memcpy. + * [SPARC64]: Fix domain-services port probing. 
+  * [SPARC64]: VIO device addition log message level is too high.
+  * [SPARC64]: check fork_idle() error
+  * [SPARC64]: Fix 'niu' complex IRQ probing.
+  * [NIU]: Add Sun Neptune ethernet driver.
+
+ -- Kyle McMartin Tue, 09 Oct 2007 00:38:16 +0000
+
+linux-source-2.6.22 (2.6.22-13.40) gutsy; urgency=low
+
+  [Amit Kucheria]
+
+  * Enable CONFIG_VM86 for LPIA
+    - LP: #146311
+  * Update configuration files
+  * Disable MSI by default
+  * Add mmconf documentation
+  * Update configuration files
+
+  [Bartlomiej Zolnierkiewicz]
+
+  * ide-disk: workaround for buggy HPA support on ST340823A (take 3)
+    - LP: #26119
+
+  [Ben Collins]
+
+  * ubuntu/cell: Fixup ps3 related modules for d-i, enable RTAS console
+  * ubuntu/cell: Enable CELLEB and related modules (pata_scc)
+  * ubuntu/cell: Move ps3rom to storage-core. Also use spidernet, not
+    spider_net.
+  * ubuntu/cell: Set PS3_MANAGER=y
+  * ubuntu: Set NR_CPUS=256 for sparc64-smp
+
+  [Chuck Short]
+
+  * [USB] Support for MediaTek MT6227 in cdc-acm.
+    - LP: #134123
+  * [XEN] Fix xen vif create with more than 14 guests.
+    - LP: #14486
+
+  [Jorge Juan Chico]
+
+  * ide: ST320413A has the same problem as ST340823A
+    - LP: #26119
+
+  [Kyle McMartin]
+
+  * fix -rt build
+  * fix ia32entry-xen.S for CVE-2007-4573
+  * fix build when CONFIG_PCI_MSI is not set
+
+  [Matthew Garrett]
+
+  * hostap: send events on data interface as well as master interface
+    - LP: #57146
+  * A malformed _GTF object should not prevent ATA device recovery
+    - LP: #139079
+  * hostap: send events on data interface as well as master interface
+    - LP: #57146
+  * A malformed _GTF object should not prevent ATA device recovery
+    - LP: #139079
+  * Don't lose appletouch button release events
+  * Fix build with appletouch change
+  * Disable Thinkpad backlight support on machines with ACPI video
+    - LP: #148055
+  * Don't attempt to register a callback if there is no CMOS object
+    - LP: #145857
+  * Update ACPI bay hotswap code to support locking
+    - LP: #148219
+  * Update ACPI bay hotswap code to support locking
+    - LP: #148219
+  * Don't attempt to register a callback if there is no CMOS object
+    - LP: #145857
+  * Disable Thinkpad backlight support on machines with ACPI video
+    - LP: #148055
+
+  [Steffen Klassert]
+
+  * 3c59x: fix duplex configuration
+    - LP: #94186
+
+  [Thomas Gleixner]
+
+  * clockevents: remove the suspend/resume workaround^Wthinko
+
+  [Tim Gardner]
+
+  * orinoco_cs.ko missing
+    - LP: #125832
+  * Marvell Technology ethernet card not recognized and not operational
+    - LP: #135316
+  * Marvell Technology ethernet card not recognized and not operational
+    - LP: #135316
+  * acpi_scan_rsdp() breaks some PCs by not honouring ACPI specification
+    - LP: #144336
+  * VIA southbridge Intel id missing
+    - LP: #128289
+  * Add T-Sinus 111card to hostap_cs driver to be able to upload firmware
+    - LP: #132466
+  * RTL8111 PCI Express Gigabit driver r8169 big files produce slow file
+    transfer
+    - LP: #114171
+  * Guest OS does not recognize a lun with non zero target id on Vmware ESX
+    Server
+    - LP: #140761
+  * Modularize vesafb
+    - LP: #139505
+  * Nikon cameras need support in unusual_devs.h
+    - LP: #134477
+  * agp for i830m broken in gutsy
+    - LP: #139767
+  * hdaps: Added support for Thinkpad T61
+    - LP: #147383
+  * xen: Update config for i386
+    - LP: #139047
+  * xen: resync for amd64
+    - LP: #139047
+  * ide-disk: workaround for buggy HPA support on ST340823A (take 4)
+    - LP: #26119
+
+  [Upstream Kernel Changes]
+
+  * Convert snd-page-alloc proc file to use seq_file (CVE-2007-4571)
+  * Linux
2.6.22.8 + * ACPI: disable lower idle C-states across suspend/resume + * V4L: ivtv: fix VIDIOC_S_FBUF: new OSD values were never set + * DVB: get_dvb_firmware: update script for new location of sp8870 + firmware + * DVB: get_dvb_firmware: update script for new location of tda10046 + firmware + * DVB: b2c2-flexcop: fix Airstar HD5000 tuning regression + * setpgid(child) fails if the child was forked by sub-thread + * sigqueue_free: fix the race with collect_signal() + * kconfig: oldconfig shall not set symbols if it does not need to + * MTD: Makefile fix for mtdsuper + * USB: fix linked list insertion bugfix for usb core + * ACPI: Validate XSDT, use RSDT if XSDT fails + * POWERPC: Flush registers to proper task context + * 3w-9xxx: Fix dma mask setting + * MTD: Initialise s_flags in get_sb_mtd_aux() + * JFFS2: fix write deadlock regression + * V4L: cx88: Avoid a NULL pointer dereference during mpeg_open() + * hwmon: End of I/O region off-by-one + * Fix debug regression in video/pwc + * splice: fix direct splice error handling + * rpc: fix garbage in printk in svc_tcp_accept() + * disable sys_timerfd() + * afs: mntput called before dput + * Fix DAC960 driver on machines which don't support 64-bit DMA + * Fix "Fix DAC960 driver on machines which don't support 64-bit DMA" + * firewire: fw-ohci: ignore failure of pci_set_power_state (fix suspend + regression) + * futex_compat: fix list traversal bugs + * Leases can be hidden by flocks + * ext34: ensure do_split leaves enough free space in both blocks + * nfs: fix oops re sysctls and V4 support + * dir_index: error out instead of BUG on corrupt dx dirs + * ieee1394: ohci1394: fix initialization if built non-modular + * Correctly close old nfsd/lockd sockets. + * Fix race with shared tag queue maps + * crypto: blkcipher_get_spot() handling of buffer at end of page + * fix realtek phy id in forcedeth + * Fix decnet device address listing. + * Fix device address listing for ipv4. + * Fix inet_diag OOPS. + * Fix IPV6 append OOPS. + * Fix IPSEC AH4 options handling + * Fix ipv6 double-sock-release with MSG_CONFIRM + * Fix IPV6 DAD handling + * Fix ipv6 source address handling. + * Fix oops in vlan and bridging code + * Fix tc_ematch kbuild + * Handle snd_una in tcp_cwnd_down() + * Fix TCP DSACK cwnd handling + * Fix datagram recvmsg NULL iov handling regression. + * Fix pktgen src_mac handling. + * Fix sparc64 v100 platform booting. + * bcm43xx: Fix cancellation of work queue crashes + * Linux 2.6.22.9 + * usb: serial/pl2303: support for BenQ Siemens Mobile Phone EF81 + * pata_it821x: fix lost interrupt with atapi devices + * i915: make vbl interrupts work properly on i965g/gm hw. + + -- Kyle McMartin Thu, 04 Oct 2007 13:57:53 +0000 + +linux-source-2.6.22 (2.6.22-12.39) gutsy; urgency=low + + [Ben Collins] + + * ubuntu: Re-order deps so that binary-custom is done before + binary-udebs. Fixes ppc build + + [Upstream Kernel Changes] + + * x86_64: Zero extend all registers after ptrace in 32bit entry path. 
+ * Linux 2.6.22.7 + + -- Ben Collins Sun, 23 Sep 2007 11:05:32 -0400 + +linux-source-2.6.22 (2.6.22-12.38) gutsy; urgency=low + + [Kyle McMartin] + + * add -12 abi files + * update getabis for new flavours + + -- Kyle McMartin Fri, 21 Sep 2007 13:35:49 -0400 + +linux-source-2.6.22 (2.6.22-12.37) gutsy; urgency=low + + [Kyle McMartin] + + * enable d-i for cell flavour + * ignore ABI check on all hppa flavours + + -- Kyle McMartin Fri, 21 Sep 2007 11:28:34 -0400 + +linux-source-2.6.22 (2.6.22-12.36) gutsy; urgency=low + + [Ben Collins] + + * ABI bump due to LED support being enabled. + + [Kyle McMartin] + + * fix memory leak in psparse.c + - Bug introduced in previous commit to acpi + + [Upstream Kernel Changes] + + * Ubuntu: Allocate acpi_devices structure rather than leaving it on the + stack. + * ipw2100: Fix `iwpriv set_power` error + * Fix ipw2200 set wrong power parameter causing firmware error + * [SCSI] Fix async scanning double-add problems + - LP: #110997 + + -- Ben Collins Thu, 20 Sep 2007 11:34:52 -0400 + +linux-source-2.6.22 (2.6.22-11.34) gutsy; urgency=low + + [Alan Stern] + + * USB: disable autosuspend by default for non-hubs + - LP: #85488 + + [Ben Collins] + + * ubuntu: Enable LEDS_TRIGGERS and related options + - Needed for iwlwifi + * ubuntu: Add real ABI files for virtual flavour + * ubuntu: Re-enable missing CONFIG_SERPENT for hppa64 + - Noticed by Lamont + * ubuntu: Add linux-headers postinst to handle hooks + - LP: #125816 + * ubuntu: Add support for /etc/kernel/headers_postinst.d/ to + headers-postinst + - LP: #120049 + * cell: Add binary-custom flavour "cell" to support ps3 + + [Mattia Dongili] + + * sony-laptop: restore the last user requested brightness level on + resume. + - LP: #117331 + + [Tejun Heo] + + * ata_piix: fix suspend/resume for some TOSHIBA laptops + - LP: #139045 + * PCI: export __pci_reenable_device() + - needed for ata_piix change + + [Tim Gardner] + + * Enable Sierra Wireless MC8775 0x6813 + - LP: #131167 + + [Zhang Rui] + + * ACPI: work around duplicate name "VID" problem on T61 + - Noted by mjg59 + + -- Ben Collins Sun, 16 Sep 2007 22:31:47 -0400 + +linux-source-2.6.22 (2.6.22-11.33) gutsy; urgency=low + + [Alessio Igor Bogani] + + * rt: Update to rt9 + * rt: Update configuration files + + [Ben Collins] + + * ubuntu: Enable A100 driver + - LP: #138632 + * libata: Default to hpa being overridden + + [Chuck Short] + + * [HDAPS] Add support for Thinkpad R61. + * [LIBATA] Add more hard drives to blacklist. + * [USB] Added support for Sprint Pantech PX-500. + * [XEN] No really enable amd64. + * [XEN] Fix amd64 yet again. 
+ + [Matthew Garrett] + + * alter default behaviour of ACPI video module + * Add infrastructure for notification on ACPI method execution + * Get thinkpad_acpi to send notifications on CMOS updates + * Add support to libata-acpi for acpi-based bay hotplug + + [Phillip Lougher] + + * Add kernel flavour optimised for virtualised environments + * Change abi-check script to check for $flavour.ignore in previous abi + * Disable abi and module check for virtual flavour + + [Richard Hughes] + + * Refresh laptop lid status on resume + + [Upstream Kernel Changes] + + * [pata_marvell]: Add more identifiers + + -- Ben Collins Sun, 16 Sep 2007 22:13:08 -0400 + +linux-source-2.6.22 (2.6.22-11.32) gutsy; urgency=low + + [Amit Kucheria] + + * Build system: Allow custom builds to comprise multiple patches + * Move UME to a Custom build and add first setup of thermal framework + + [Ben Collins] + + * ubuntu: Enable CONFIG_BLK_DEV_IO_TRACE + * bcm203x: Fix firmware loading + - LP: #85247 + * ubuntu: mtd changes caused module renaming. Ignore + * rt: Do not patch top level Makefile for SUBLEVEL. Will always end up + breaking + + [Chuck Short] + + * [USB] Unusual Device support for Gold MP3 Player Energy + - LP: #125250 + * [SIERRA] Adds support for Onda H600 ZTE MF330 + - LP: #129433 + * [HDAPS] Add Thinkpad T61P to whitelist. + - LP: #133636 + * [USB] Add support for Toshiba (Novatel Wireless) HSDPA for M400. + - LP: #133650 + + [Kyle McMartin] + + * apparmor 10.3 hooks + * unionfs 2.1 hooks + * nuke UNION_FS stuff from fs/{Kconfig,Makefile} + + [Tim Gardner] + + * Paravirt-ops I/O hypercalls + * Fix lazy vmalloc bug for Gutsy + * bluetooth headset patch + - LP: #130870 + * Add the PCI ID of this ICH4 in list of laptops that use short cables. + * v2.6.22.5 merge + * Update Xen config options. + - LP: #132726 + * Remove mtd modules from ABI + * Support parallel= in DEB_BUILD_OPTIONS + - LP: #136426 + + [Upstream Kernel Changes] + + * hwmon: fix w83781d temp sensor type setting + * hwmon: (smsc47m1) restore missing name attribute + * sky2: restore workarounds for lost interrupts + * sky2: carrier management + * sky2: check for more work before leaving NAPI + * sky2: check drop truncated packets + * revert "x86, serial: convert legacy COM ports to platform devices" + * ACPICA: Fixed possible corruption of global GPE list + * ACPICA: Clear reserved fields for incoming ACPI 1.0 FADTs + * AVR32: Fix atomic_add_unless() and atomic_sub_unless() + * r8169: avoid needless NAPI poll scheduling + * forcedeth: fix random hang in forcedeth driver when using netconsole + * libata: add ATI SB700 device IDs to AHCI driver + * Hibernation: do not try to mark invalid PFNs as nosave + * i386: allow debuggers to access the vsyscall page with compat vDSO + * x86_64: Check for .cfi_rel_offset in CFI probe + * x86_64: Change PMDS invocation to single macro + * i386: Handle P6s without performance counters in nmi watchdog + * i386: Fix double fault handler + * JFFS2 locking regression fix. 
+ * [Input]: appletouch - improve powersaving for Geyser3 devices + * [Input]: add driver for Fujitsu serial touchscreens + * [sdhci]: add support to ENE-CB714 + * v2.6.22.5 + * [MTD] Makefile fix for mtdsuper + * ocfs2: Fix bad source start calculation during kernel writes + * NET: Share correct feature code between bridging and bonding + * sky2: don't clear phy power bits + * uml: fix previous request size limit fix + * i386: fix lazy mode vmalloc synchronization for paravirt + * signalfd: fix interaction with posix-timers + * signalfd: make it group-wide, fix posix-timers scheduling + * DCCP: Fix DCCP GFP_KERNEL allocation in atomic context + * IPV6: Fix kernel panic while send SCTP data with IP fragments + * IPv6: Invalid semicolon after if statement + * Fix soft-fp underflow handling. + * Netfilter: Missing Kbuild entry for netfilter + * SNAP: Fix SNAP protocol header accesses. + * NET: Fix missing rcu unlock in __sock_create() + * SPARC64: Fix sparc64 task stack traces. + * SPARC64: Fix sparc64 PCI config accesses on sun4u + * TCP: Do not autobind ports for TCP sockets + * TCP: Fix TCP rate-halving on bidirectional flows. + * TCP: Fix TCP handling of SACK in bidirectional flows. + * PPP: Fix PPP buffer sizing. + * PCI: lets kill the 'PCI hidden behind bridge' message + * PCI: disable MSI on RS690 + * PCI: disable MSI on RD580 + * PCI: disable MSI on RX790 + * USB: allow retry on descriptor fetch errors + * USB: fix DoS in pwc USB video driver + * usb: add PRODUCT, TYPE to usb-interface events + * Linux 2.6.22.6 + * V4L/DVB (6042): b2c2-flexcop: fix Airstar HD5000 tuning regression + * V4L/DVB (5967): ivtv: fix VIDIOC_S_FBUF:new OSD values where never set + * Re-add _GTM and _STM support + + -- Ben Collins Fri, 31 Aug 2007 16:26:56 -0400 + +linux-source-2.6.22 (2.6.22-10.30) gutsy; urgency=low + + * URGENT upload to fix FTBFS with xen-{i386,amd64} configs, + lpia d-i ftbfs, xen ftbfs. + * URGENT fix module-check to actually ignore things + * URGENT ignore ume modules + + [Alek Du] + + * Add Intel Poulsbo chipset Libata support + + [Amit Kucheria] + + * Update configuration files + * Enable stylus on Lenovo X60/X61 thinkpads + + [Ben Collins] + + * ubuntu: Disable snd-hda-intel, in favor of lum updated version + + [Kyle McMartin] + + * apparmor 10.3 hooks + * add lpia d-i udeb generation + * fix bits of rt/diff for -rt8 + * fix rt/diff for 2.6.22.3 changes + * fix up rt/diff for stable 2.6.22.4 + + [LaMont Jones] + + * Update configuration files + + [Phillip Lougher] + + * WriteSupportForNTFS: make fuse module available to d-i + + [Tim Gardner] + + * Gutsy Tribe 3 CD don't load on Dell Inspiron 1501 + - LP: #121111 + * Update configuration files + * Update configuration files + * Update configuration files + + [Upstream Kernel Changes] + + * [SPARC64]: Fix handling of multiple vdc-port nodes. + * [SPARC64]: Tweak assertions in sun4v_build_virq(). + * [SPARC64]: Fix log message type in vio_create_one(). + * [SPARC64]: Fix two year old bug in early bootup asm. + * [SPARC64]: Improve VIO device naming further. + * [SPARC64]: Handle multiple domain-services-port nodes properly. + * [SPARC64]: Add proper multicast support to VNET driver. + * [SPARC64]: Do not flood log with failed DS messages. + * [SPARC64]: Use KERN_ERR in IRQ manipulation error printks. + * [SPARC64]: Fix virq decomposition. + * [SPARC]: Fix serial console device detection. 
+ * [SPARC64]: fix section mismatch warning in pci_sunv4 + * [SPARC64]: fix section mismatch warning in mdesc.c + * [SPARC64] viohs: extern on function definition + * [SPARC64]: Fix sun4u PCI config space accesses on sun4u. + * [SPARC64]: Fix show_stack() when stack argument is NULL. + * [SUNLANCE]: Fix sparc32 crashes by using of_*() interfaces. + * [SPARC]: Centralize find_in_proplist() instead of duplicating N times. + * [SPARC64]: Fix hard-coding of cpu type output in /proc/cpuinfo on + sun4v. + * [SPARC64]: Do not assume sun4v chips have load-twin/store-init support. + * [SPARC64]: Fix memory leak when cpu hotplugging. + * USB: cdc-acm: fix sysfs attribute registration bug + * TCP FRTO retransmit bug fix + * Fix TC deadlock. + * Fix IPCOMP crashes. + * Fix console write locking in sparc drivers. + * Add a PCI ID for santa rosa's PATA controller. + * Missing header include in ipt_iprange.h + * SCTP scope_id handling fix + * Fix rfkill IRQ flags. + * gen estimator timer unload race + * gen estimator deadlock fix + * Fix error queue socket lookup in ipv6 + * Fix ipv6 link down handling. + * Netpoll leak + * Sparc64 bootup assembler bug + * Fix ipv6 tunnel endianness bug. + * Fix sparc32 memset() + * Fix sparc32 udelay() rounding errors. + * Fix TCP IPV6 MD5 bug. + * KVM: SVM: Reliably detect if SVM was disabled by BIOS + * USB: fix warning caused by autosuspend counter going negative + * usb-serial: Fix edgeport regression on non-EPiC devices + * Fix reported task file values in sense data + * aacraid: fix security hole + * firewire: fw-sbp2: set correct maximum payload (fixes CardBus adapters) + * make timerfd return a u64 and fix the __put_user + * V4L: Add check for valid control ID to v4l2_ctrl_next + * V4L: ivtv: fix broken VBI output support + * V4L: ivtv: fix DMA timeout when capturing VBI + another stream + * V4L: ivtv: Add locking to ensure stream setup is atomic + * V4L: wm8775/wm8739: Fix memory leak when unloading module + * Input: lifebook - fix an oops on Panasonic CF-18 + * splice: fix double page unlock + * drm/i915: Fix i965 secured batchbuffer usage (CVE-2007-3851) + * Fix leak on /proc/lockdep_stats + * CPU online file permission + * Fix user struct leakage with locked IPC shem segment + * md: handle writes to broken raid10 arrays gracefully + * md: raid10: fix use-after-free of bio + * pcmcia: give socket time to power down + * Fix leaks on /proc/{*/sched, sched_debug, timer_list, timer_stats} + * futex: pass nr_wake2 to futex_wake_op + * "ext4_ext_put_in_cache" uses __u32 to receive physical block number + * Include serial_reg.h with userspace headers + * dm io: fix panic on large request + * i386: HPET, check if the counter works + * fw-ohci: fix "scheduling while atomic" + * firewire: fix memory leak of fw_request instances + * softmac: Fix ESSID problem + * eCryptfs: ecryptfs_setattr() bugfix + * nfsd: fix possible read-ahead cache and export table corruption + * readahead: MIN_RA_PAGES/MAX_RA_PAGES macros + * fs: 9p/conv.c error path fix + * forcedeth bug fix: cicada phy + * forcedeth bug fix: vitesse phy + * forcedeth bug fix: realtek phy + * acpi-cpufreq: Proper ReadModifyWrite of PERF_CTL MSR + * jbd commit: fix transaction dropping + * jbd2 commit: fix transaction dropping + * hugetlb: fix race in alloc_fresh_huge_page() + * do not limit locked memory when RLIMIT_MEMLOCK is RLIM_INFINITY + * uml: limit request size on COWed devices + * sony-laptop: fix bug in event handling + * destroy_workqueue() can livelock + * drivers/video/macmodes.c:mac_find_mode() 
mustn't be __devinit + * cfq-iosched: fix async queue behaviour + * libata: add FUJITSU MHV2080BH to NCQ blacklist + * ieee1394: revert "sbp2: enforce 32bit DMA mapping" + * nfsd: fix possible oops on re-insertion of rpcsec_gss modules + * dm raid1: fix status + * dm io: fix another panic on large request + * dm snapshot: permit invalid activation + * dm: disable barriers + * cr_backlight_probe() allocates too little storage for struct cr_panel + * ACPI: dock: fix opps after dock driver fails to initialize + * Hangup TTY before releasing rfcomm_dev + * Keep rfcomm_dev on the list until it is freed + * nf_conntrack: don't track locally generated special ICMP error + * IPV6: /proc/net/anycast6 unbalanced inet6_dev refcnt + * sysfs: release mutex when kmalloc() failed in sysfs_open_file(). + * Netfilter: Fix logging regression + * USB: fix for ftdi_sio quirk handling + * sx: switch subven and subid values + * UML: exports for hostfs + * Linux 2.6.22.2 + * fix oops in __audit_signal_info() + * random: fix bound check ordering (CVE-2007-3105) + * softmac: Fix deadlock of wx_set_essid with assoc work + * ata_piix: update map 10b for ich8m + * PPC: Revert "[POWERPC] Don't complain if size-cells == 0 in + prom_parse()" + * PPC: Revert "[POWERPC] Add 'mdio' to bus scan id list for platforms + with QE UEC" + * powerpc: Fix size check for hugetlbfs + * direct-io: fix error-path crashes + * stifb: detect cards in double buffer mode more reliably + * pata_atiixp: add SB700 PCI ID + * CPUFREQ: ondemand: fix tickless accounting and software coordination + bug + * CPUFREQ: ondemand: add a check to avoid negative load calculation + * Linux 2.6.22.3 + * intel_agp: really fix 945/965GME + * Reset current->pdeath_signal on SUID binary execution (CVE-2007-3848) + * MSS(mmc/sd/sdio) driver patch + + -- Kyle McMartin Thu, 16 Aug 2007 12:17:27 -0400 + +linux-source-2.6.22 (2.6.22-9.25) gutsy; urgency=low + + [Kyle McMartin] + + * ubuntu: Fix FTBFS -- forgot to bump debian/abi + + -- Kyle McMartin Thu, 02 Aug 2007 22:13:28 +0000 + +linux-source-2.6.22 (2.6.22-9.24) gutsy; urgency=low + + [Colin Watson] + + * provide Provides for fs-*-modules udebs + + [Matthias Klose] + + * test $dilist before using it + + [Lamont Jones] + + * hppa: Update abi files + + -- Kyle McMartin Thu, 02 Aug 2007 18:26:34 +0000 + +linux-source-2.6.22 (2.6.22-9.23) gutsy; urgency=low + + [Ben Collins] + + * ubuntu: Add missing newline to module-check script + * ubuntu: Add lpia to linux-libc-dev. Should finally build now. + + -- Ben Collins Thu, 02 Aug 2007 13:10:23 -0400 + +linux-source-2.6.22 (2.6.22-9.22) gutsy; urgency=low + + [Ben Collins] + + * ubuntu: Use DEB_HOST_ARCH, not DEB_HOST_ARCH_CPU + + -- Ben Collins Thu, 02 Aug 2007 08:44:09 -0400 + +linux-source-2.6.22 (2.6.22-9.21) gutsy; urgency=low + + [Ben Collins] + + * lpia: Add build stuff for lpia architecture + + [LaMont Jones] + + * abi files for hppa + * UBUNTU-HPPA: configs that seem to work + * hppa: abi files for 9.20 + + -- Ben Collins Wed, 01 Aug 2007 11:12:59 -0400 + +linux-source-2.6.22 (2.6.22-9.20) gutsy; urgency=low + + [Ben Collins] + + * tulip: Fix for Uli5261 chipsets. + * tulip: Define ULI PCI ID's + * tulip: Let dmfe handle davicom on non-sparc + * input: Allow root to inject unknown scan codes. + * irda: Default to dongle type 9 on IBM hardware + * input/mouse/alps: Do not call psmouse_reset() for alps + * pcmcia: Do not insert pcmcia cards on resume + * ide-cd: Disable verbose errors. 
+ * block: Make CDROMEJECT more robust + * pm: Config option to disable handling of console during suspend/resume. + * version: Implement version_signature proc file. + * update toshiba_acpi to 0.19a-dev + * xpad: Update to latest version from xbox-linux. + * ubuntu: Enable setting of CONFIG_VERSION_SIGNATURE at build time + * toshiba_acpi: Don't use init_MUTEX_LOCKED + + [Chuck Short] + + * [USB]: add ASUS LCM to the blacklist + * [NET]: Add mcp73 to forcedeth. + * [USB]: Added support for Sanwa PC5000 multimeter usb cable (KB-USB2). + * [ATA] Add support for Sb700 AHCI nor-raid5 and raid5 + + [Fabio M. Di Nitto] + + * drivers/char/vt.c: make promcon driver init a boot option. + + [Kyle McMartin] + + * Disable MMCONFIG by default + + [Phillip Lougher] + + * fix NFS mounting regression from Edgy->Feisty + * r8169: disable TSO by default for RTL8111/8168B chipsets. + + [Tim Gardner] + + * Catch nonsense keycodes and silently ignore + * Cause SoftMac to emit an association event when setting ESSID. + + -- Ben Collins Mon, 30 Jul 2007 12:01:43 -0400 + +linux-source-2.6.22 (2.6.22-9.19) gutsy; urgency=low + + [Amit Kucheria] + + * Fix for FTBFS bug 123178 + * Fix for FTBFS bug 123178 + * Add devices to USB quirks to prevent USB autosuspend + * More devices added to USB quirks + - LP: #85488 + * Support for ENE CB-712/4 SD card reader + * Reorder quirk list based on Vendor/Product ID + + [Ben Collins] + + * ubuntu: Enable HOTPLUG_CPU in sparc64-smp config. + * ubuntu: Add xen to amd64 custom builds + * ubuntu: Update real-time kernel to -rt4 + * rt: Patch from Alessio Igor Bogani for RT-8 + + [Chuck Short] + + * IDE: add MHV2080BH to NCQ blacklist + * XEN: update to 2.6.22 final and amd64 support. + * NET: Add more pci-ids to zd1211rw + * IDE: add new PCI ID + * USB: fix oops in ftdi_sio + + [Eric Piel] + + * ACPI: Allow custom DSDT tables to be loaded from initramfs + + [Ryan Lortie] + + * Macbook calibration loop fix + - LP: #54621 + + [Upstream Kernel Changes] + + * NETFILTER: {ip, nf}_conntrack_sctp: fix remotely triggerable NULL ptr + dereference (CVE-2007-2876) + * Linux 2.6.22.1 + * [SPARC64]: Use KERN_ERR in sun4v IRQ printk()'s. + * [SPARC64]: Add LDOM virtual channel driver and VIO device layer. + * [SPARC64]: Add Sun LDOM virtual network driver. + * [SPARC64]: Add Sun LDOM virtual disk driver. + * [SPARC64]: Create proper obppath sysfs files for VIO bus devices. + * [SPARC64] LDC: Do limited polled retry on setting RX queue head. + * [SUNVNET]: Validate RX descriptor size field. + * [SPARC64]: Add missing symbol exports for LDOM infrastructure. + * [SPARC64]: Temporary workaround for LDC INO double-delivery. + * [SPARC64]: Create 'devspec' nodes for vio devices. + * [SPARC64]: vdev->type can be NULL, handle this in devspec_show(). + * [SPARC64]: Assorted LDC bug cures. + * [SPARC64]: Add domain-services nodes to VIO device tree. + * [SPARC64]: Export powerd facilities for external entities. + * [SPARC64]: Initial domain-services driver. + * [SPARC64]: Use more mearningful names for IRQ registry. + * [SPARC64]: Abstract out mdesc accesses for better MD update handling. + * [SPARC64]: Fix MD property lifetime bugs. + * [SPARC64]: Fix setting of variables in LDOM guest. + * [SPARC64]: Initial LDOM cpu hotplug support. + * [SPARC64]: Unconditionally register vio_bus_type. + * [SPARC64]: Fix build regressions added by dr-cpu changes. + * [SPARC64]: mdesc.c needs linux/mm.h + * [SPARC64]: SMP build fixes. + * [SPARC64]: More sensible udelay implementation. 
+ * [SPARC64]: Process dr-cpu events in a kthread instead of workqueue. + * [SPARC64]: Add ->set_affinity IRQ handlers. + * [SPARC64]: Fix leak when DR added cpu does not bootup. + * [SPARC64]: Clear cpu_{core,sibling}_map[] in + smp_fill_in_sib_core_maps() + * [SPARC64]: Give more accurate errors in dr_cpu_configure(). + * [SERIAL]: Fix console write locking in sparc drivers. + * [TIMER]: Fix clockevent notifications on 64-bit. + * [SPARC64]: dr-cpu unconfigure support. + * [SPARC64]: Fix UP build. + * [SPARC64]: SMP build fix. + * [SPARC64]: Fix race between MD update and dr-cpu add. + * [SERIAL] SUNHV: Fix jerky console on LDOM guests. + * [SPARC64]: Kill explicit %gl register reference. + * [SPARC64]: Add basic infrastructure for MD add/remove notification. + * [SPARC64]: Simplify VDC device probing. + * [SPARC64]: Simplify VNET probing. + * [SPARC64]: Massively simplify VIO device layer and support hot + add/remove. + * [SPARC64]: Handle LDC resets properly in domain-services driver. + * [SPARC64]: Handle reset events in vio_link_state_change(). + * [SPARC64]: Fix reset handling in VNET driver. + * [SPARC64]: Set vio->desc_buf to NULL after freeing. + * [SPARC64]: Fix MODULE_DEVICE_TABLE() specification in VDC and VNET. + * [SPARC64]: Fix device type matching in VIO's devspec_show(). + * Add empty + * Add dummy isa_(bus|virt)_to_(virt|bus) inlines + * Clean up sti_flush + * Do not allow STI_CONSOLE to be modular + * Use compat_sys_getdents + + -- Ben Collins Sat, 28 Jul 2007 12:30:53 -0400 + +linux-source-2.6.22 (2.6.22-8.18) gutsy; urgency=low + + [Ben Collins] + + * ubuntu: *sigh* update xen config to fix FTBFS + + -- Ben Collins Thu, 12 Jul 2007 14:23:20 +0100 + +linux-source-2.6.22 (2.6.22-8.17) gutsy; urgency=low + + [Ben Collins] + + * ubuntu: Actually enable the -xen build. + + -- Ben Collins Thu, 12 Jul 2007 09:51:01 +0100 + +linux-source-2.6.22 (2.6.22-8.16) gutsy; urgency=low + + * Removed CONFIG_BLINK from all configs and added to modules.ignore + * This fixes a build failure for 8.15 + + [Alexey Starikovskiy] + + * Fix ACPI battery detection on Asus + + [Amit Kucheria] + + * Export symbols required to build GFS1 in LUM + * Update configuration files + * 2.6.22-7.14 ABI + * Remove old ABI + * Update d-i modules to support Sparc LDOM + * Introducing the UME kernel flavour + + [Jacob Pan] + + * Poulsbo SMBus Controller + * Intel Poulsbo SCH IDE Controller + * Intel Poulsbo HD audio controller + + [Phillip Lougher] + + * xen: Update custom binary flavour (Xen 3.1 for 2.6.22-rc5) + * xen: Update xen/config.i386 to enable PAE + + [Upstream Kernel Changes] + + * [SCSI] fusion: fix for BZ 8426 - massive slowdown on SCSI CD/DVD drive + * [XFS] Update the MAINTAINERS file entry for XFS. + * IB/mlx4: Fix handling of wq->tail for send completions + * IB/mlx4: Fix warning in rounding up queue sizes + * [SCSI] ESP: Don't forget to clear ESP_FLAG_RESETTING. + * firewire: fix hang after card ejection + * ieee1394: fix to ether1394_tx in ether1394.c + * [ARM] Add support for pause_on_oops and display preempt/smp options + * sh: Fix restartable syscall arg5 clobbering. + * ACPI: gracefully print null trip-point device + * ACPICA: fix error path in new external package objects as method + arguments + * sh: oops_enter()/oops_exit() in die(). 
+ * [ARM] Update show_regs/oops register format + * IB/mlx4: Handle new FW requirement for send request prefetching + * IB/mlx4: Get rid of max_inline_data calculation + * IB/mlx4: Handle buffer wraparound in __mlx4_ib_cq_clean() + * IB/mlx4: Handle FW command interface rev 3 + * Fix signalfd interaction with thread-private signals + * sched: fix SysRq-N (normalize RT tasks) + * Fix possible runqueue lock starvation in wait_task_inactive() + * sh: Handle -ERESTART_RESTARTBLOCK for restartable syscalls. + * sh64: Handle -ERESTART_RESTARTBLOCK for restartable syscalls. + * [POWERPC] Fix snd-powermac refcounting bugs + * [XFS] s/memclear_highpage_flush/zero_user_page/ + * [XFS] Update the MAINTAINERS file entry for XFS - change git repo name. + * [XFRM]: Fix MTU calculation for non-ESP SAs + * [IPVS]: Fix state variable on failure to start ipvs threads + * [AF_RXRPC]: Return the number of bytes buffered in rxrpc_send_data() + * [S390] Missing blank when appending cio_ignore kernel parameter + * [S390] Fix zfcpdump header + * [S390] Fix yet another two section mismatches. + * [S390] Print list of modules on die(). + * [S390] Add oops_enter()/oops_exit() calls to die(). + * [S390] Move psw_set_key. + * [POWERPC] rheap - eliminates internal fragments caused by alignment + * [POWERPC] PowerPC: Prevent data exception in kernel space (32-bit) + * [POWERPC] Fix powermac late initcall to only run on powermac + * [MIPS] Don't drag a platform specific header into generic arch code. + * x86_64: Fix readahead/sync_file_range/fadvise64 compat calls + * x86_64: Fix eventd/timerfd syscalls + * x86: Disable DAC on VIA bridges + * x86_64: Quieten Atari keyboard warnings in Kconfig + * x86: Only make Macintosh drivers default on Macs + * x86: Disable KPROBES with DEBUG_RODATA for now + * x86: change_page_attr bandaids + * x86_64: fix link warning between for .text and .init.text + * Fix up CREDIT entry ordering + * firewire: Only set client->iso_context if allocation was successful. + * spidernet: null out skb pointer after its been used. + * spidernet: Cure RX ram full bug + * spidernet: Don't terminate the RX ring + * spidernet: silence the ramfull messages + * spidernet: turn off descriptor chain end interrupt. 
+ * spidernet: checksum and ethtool + * bonding: Fix use after free in unregister path + * bonding: Fix 802.3ad no carrier on "no partner found" instance + * s390: print correct level for HiperSockets devices + * s390: qeth driver does not recover + * s390: avoid inconsistent lock state in qeth + * s390: qeth: wrong packet length in qdio header + * s390: Use ccw_device_get_id() in qeth/claw drivers + * s390: don't call iucv_path_connect from tasklet context + * s390: netiucv spinlock initializer cleanup + * s390: netiucv inlining cleanup + * forcedeth: use unicast receive mode for WoL + * natsemi irq flags + * cxgb3 - fix skb->dev dereference + * cxgb3 - fix netpoll hanlder + * cxgb3 - Fix direct XAUI support + * cxgb3 - Stop mac RX when changing MTU + * cxgb3 - MAC watchdog update + * PATA: Add the MCP73/77 support to PATA driver + * pata_it821x: (partially) fix DMA in RAID mode + * libata: more NONCQ devices + * kerneldoc fix in libata + * ahci: fix PORTS_IMPL override + * fix module_param mistake in it821x + * Blackfin arch: update ANOMALY handling + * Blackfin arch: update printk to use KERN_EMERG and reformat crash + output + * Blackfin arch: add missing braces around array bfin serial init + * Blackfin arch: match kernel startup messaage with new linker script + * Blackfin arch: move cond_syscall() behind __KERNEL__ like all other + architectures + * Blackfin arch: Add definition of dma_mapping_error + * Blackfin arch: add proper const volatile to addr argument to the read + functions + * [AGPGART] intel_agp: don't load if no IGD and AGP port + * IB/umem: Fix possible hang on process exit + * IPoIB/cm: Initialize RX before moving QP to RTR + * IPoIB/cm: Fix interoperability when MTU doesn't match + * IPoIB/cm: Remove dead definition of struct ipoib_cm_id + * IB/mlx4: Correct max_srq_wr returned from mlx4_ib_query_device() + * [PARISC] stop lcd driver from stripping initial whitespace + * [PARISC] Handle wrapping in expand_upwards() + * [PARISC] Fix unwinder on 64-bit kernels + * [PARISC] unwinder improvements + * page_mapping must avoid slub pages + * posix-timers: Prevent softirq starvation by small intervals and SIG_IGN + * Allow DEBUG_RODATA and KPROBES to co-exist + * [NETFILTER]: nf_conntrack_sip: add missing message types containing RTP + info + * [NETFILTER]: nfctnetlink: Don't allow to change helper + * [IPV6] NDISC: Fix thinko to control Router Preference support. + * [IPV4]: include sysctl.h from inetdevice.h + * i386: Make CMPXCHG64 only dependent on PAE + * x86_64: Fix only make Macintosh drivers default on Macs + * x86_64: Ignore compat mode SYSCALL when IA32_EMULATION is not defined + * [AVR32] Fix bug in invalidate_dcache_region() + * [AVR32] NGW100, Remove relics of the old USART mapping scheme + * [AVR32] Initialize dma_mask and dma_coherent_mask + * [AVR32] Update defconfigs + * ACPI: fix 2.6.20 SMP boot regression + * [SKBUFF]: Fix incorrect config #ifdef around skb_copy_secmark + * [TIPC]: Fix infinite loop in netlink handler + * [PPP]: Revert 606f585e363527da9feaed79465132c0c661fd9e + * [PPP]: Fix osize too small errors when decoding mppe. + * [TCP] tcp_read_sock: Allow recv_actor() return return negative error + value. 
+ * [NET]: Re-enable irqs before pushing pending DMA requests + * [NET]: Make skb_seq_read unmap the last fragment + * hwmon/coretemp: fix a broken error path + * fix refcounting of nsproxy object when unshared + * console UTF-8 fixes (fix) + * SM501: suspend support + * SM501: initialise SDRAM clock before bus clocks + * SM501: Fix sm501_init_reg() mask/set order + * SM501: Clock updates and checks + * SM501: Add Documentation/SM501.txt + * SM501: Check SM501 ID register on initialisation + * SLUB: fix behavior if the text output of list_locations overflows + PAGE_SIZE + * sched: fix next_interval determination in idle_balance() + * update checkpatch.pl to version 0.05 + * alpha: fix alignment problem in csum_ipv6_magic() + * Char: stallion, fix oops during init with ISA cards + * uml: use generic BUG + * uml: add asm/paravirt.h + * "volatile considered harmful" + * document nlink function + * slab allocators: MAX_ORDER one off fix + * update checkpatch.pl to version 0.06 + * x86_64: fix misplaced `continue' in mce.c + * ext2: disallow setting xip on remount + * audit: fix oops removing watch if audit disabled + * ext3: lost brelse in ext3_read_inode() + * ext4: lost brelse in ext4_read_inode() + * ACPI: preserve the ebx value in acpi_copy_wakeup_routine + * FUTEX: Restore the dropped ERSCH fix + * Linus 2.6.22-rc6 + * [ARM] 4452/1: Force the literal pool dump before reloc_end + * [ARM] 4449/1: more entries in arch/arm/boot/.gitignore + * fix nmi_watchdog=2 bootup hang + * [POWERPC] Update g5_defconfig + * [POWERPC] Update defconfigs + * [POWERPC] Fix VDSO gettimeofday() when called with NULL struct timeval + * [POWERPC] Fix subtle FP state corruption bug in signal return on SMP + * USB: g_file_storage: call allow_signal() + * USB: ti serial driver sleeps with spinlock held + * USB: memory leak in iowarrior.c + * USB: usblcd doesn't limit memory consumption during write + * USB: fix race leading to use after free in io_edgeport + * USB: add new device id to option driver + * USB: ftdio_sio: New IPlus device ID + * [MIPS] __ucmpdi2 arguments are unsigned long long. + * [MIPS] add io_map_base to pci_controller on Cobalt + * [MIPS] remove "support for" from system type entry + * [MIPS] Alchemy: Fix wrong cast + * [MIPS] Fix pb1500 reg B access + * [MIPS] AP/SP requires shadow registers, auto enable support. + * [MIPS] 20K: Handle WAIT related bugs according to errata information + * [MIPS] use compat_siginfo in rt_sigframe_n32 + * [MIPS] Remove a duplicated local variable in test_and_clear_bit() + * [MIPS] EMMA2RH: Disable GEN_RTC, it can't possibly work. + * [MIPS] SMTC and non-SMTC kernel and modules are incompatible + * [MIPS] Count timer interrupts correctly. + * x86_64: set the irq_chip name for lapic + * x86_64 irq: use mask/unmask and proper locking in fixup_irqs() + * [SPARC64]: Add irqs to mdesc_node. + * [SPARC64]: Fix VIRQ enabling. + * [SPARC64]: Need to set state to IDLE during sun4v IRQ enable. + * [SPARC64]: Add LDOM virtual channel driver and VIO device layer. + * [SPARC64]: Add Sun LDOM virtual network driver. + * [SPARC64]: Add Sun LDOM virtual disk driver. + * [SPARC64]: Create proper obppath sysfs files for VIO bus devices. + * [SPARC64] LDC: Do limited polled retry on setting RX queue head. 
+ * [GFS2] Fix gfs2_block_truncate_page err return + * [DLM] Telnet to port 21064 can stop all lockspaces + * [GFS2] inode size inconsistency + * [GFS2] remounting w/o acl option leaves acls enabled + * [GFS2] System won't suspend with GFS2 file system mounted + * [GFS2] git-gfs2-nmw-build-fix + * [GFS2] Obtaining no_formal_ino from directory entry + * [GFS2] Remove i_mode passing from NFS File Handle + * [SUNVNET]: Validate RX descriptor size field. + * [SPARC64]: Add missing symbol exports for LDOM infrastructure. + * [SPARC64]: Temporary workaround for LDC INO double-delivery. + * [SPARC64]: Create 'devspec' nodes for vio devices. + * [SPARC64]: vdev->type can be NULL, handle this in devspec_show(). + + -- Amit Kucheria Mon, 09 Jul 2007 12:55:56 +0300 + +linux-source-2.6.22 (2.6.22-7.14) gutsy; urgency=low + + [Ben Collins] + + * build/vars: Provide ivtv-modules + * Bump ABI + * ubuntu/config: Enable Intermediate Functional Block device + * coredump: Fix typo in patch merge + * ubuntu/scripts: Make sure to symlink *.lds for ia64 builds + * ubuntu/config: Enable NO_HZ for server and sparc64 targets. + * ubuntu/config: Remove bigiron target, see if anyone complains + * ubuntu: Ok, really remove bigiron + * ubuntu/control-scripts: Fo sho, remove the debconf stuff from controls + scripts + * AppArmor: Enable exports and changes for AppArmor usage + * ubuntu: Add feisty changelog for historical purposes. + + [Colin Watson] + + * Move isofs to storage-core-modules udeb from fs-core-modules. + + [Upstream Kernel Changes] + + * [MTD] [MAPS] don't force uclinux mtd map to be root dev + * [MTD] generalise the handling of MTD-specific superblocks + * [SCSI] zfcp: avoid clutter in erp_dbf + * [SCSI] zfcp: IO stall after deleting and path checker changes after + reenabling zfcp devices + * [SCSI] ipr: Proper return codes for eh_dev_reset for SATA devices + * [SCSI] stex: fix id mapping issue + * [SCSI] stex: extend hard reset wait time + * [SCSI] stex: fix reset recovery for console device + * [SCSI] stex: minor cleanup and version update + * [SCSI] MegaRAID: Update MAINTAINERS email-id + * [SCSI] tgt: fix a rdma indirect transfer error bug + * [SCSI] NCR53C9x: correct spelling mistake in deprecation notice + * [SCSI] aacraid: Correct sa platform support. (Was: [Bug 8469] Bad EIP + value on pentium3 SMP kernel-2.6.21.1) + * [SCSI] aacraid: fix panic on short Inquiry + * [WATCHDOG] ks8695_wdt.c - new KS8695 watchdog driver + * [JFFS2] Fix BUG() caused by failing to discard xattrs on deleted files. + * [JFFS2] Fix potential memory leak of dead xattrs on unmount. + * [SCSI] sd: fix refcounting regression in suspend/resume routines + * [SCSI] aacraid: apply commit config for reset_devices flag + * [SCSI] aic7xxx: fix aicasm build failure with gcc-3.4.6 + * [SCSI] aic94xx: asd_clear_nexus should fail if the cleared task does + not complete + * [SCSI] fusion: Fix |/|| confusion + * parisc: make command_line[] static + * parisc: sync compat getdents + * [PARISC] Move #undef to end of syscall table + * [PARISC] Wire up kexec_load syscall + * parisc: convert /proc/gsc/pcxl_dma to seq_file + * [PARISC] Let PA-8900 processors boot + * [PARISC] Disable LWS debugging + * [PARISC] spelling fixes: arch/parisc/ + * sh: section mismatch fixes for system timer. + * [PARISC] ROUND_UP macro cleanup in arch/parisc + * [PARISC] ROUNDUP macro cleanup in drivers/parisc + * [PPC] Fix COMMON symbol warnings + * [PPC] Remove duplicate export of __div64_32. + * [POWERPC] 52xx: unbreak lite5200 dts (_pic vs. 
-pic) + * [POWERPC] QE: fix Kconfig 'select' warning with UCC_FAST + * [POWERPC] Fix Section mismatch warnings + * [POWERPC] Fix modpost warning + * [PPC] Fix modpost warning + * [CIFS] Fix oops on failed cifs mount (in kthread_stop) + * [POWERPC] Fix Kconfig warning + * [CIFS] typo in previous patch + * [SCSI] megaraid_sas: intercept cmd timeout and throttle io + * [WATCHDOG] clean-up watchdog documentation + * drm: Spinlock initializer cleanup + * drm/radeon: add more IGP chipset pci ids + * drm: make sure the drawable code doesn't call malloc(0). + * [PARISC] kobject is embedded in subsys, not kset + * [PARISC] Build fixes for power.c + * [ARM] 4401/1: S3C2443: Add definitions for port GPIOJ + * [ARM] 4402/1: S3C2443: Add physical address of HSMMC controller + * [ARM] 4403/1: Make the PXA-I2C driver work with lockdep validator + * [ARM] 4404/1: Trivial IXP42x Kconfig cleanup + * [ARM] 4405/1: NSLU2, DSM-G600 frequency fixup code + * [ARM] 4406/1: Trivial NSLU2 / NAS-100D header & setup code cleanup + * [ARM] remove unused header file: arch/arm/mach-s3c2410/bast.h + * [PARISC] fix lasi_82596 build + * [PARISC] fix section mismatch in parport_gsc + * [PARISC] fix section mismatch in parisc STI video drivers + * [PARISC] fix section mismatch in ccio-dma + * [PARISC] fix section mismatches in arch/parisc/kernel + * [PARISC] fix section mismatch in parisc eisa driver + * [PARISC] fix section mismatch in superio serial drivers + * [PARISC] Wire up utimensat/signalfd/timerfd/eventfd syscalls + * hwmon/ds1621: Fix swapped temperature limits + * hwmon/coretemp: Add more safety checks + * hwmon/w83627hf: Be quiet when no chip is found + * hwmon-vid: Don't spam the logs when VRM version is missing + * hwmon/applesmc: Simplify dependencies + * hwmon/applesmc: Handle name file creation error and deletion + * ieee1394: sbp2: include workqueue.h + * ieee1394: eth1394: remove bogus netif_wake_queue + * ieee1394: eth1394: handle tlabel exhaustion + * ieee1394: eth1394: bring back a parent device + * ieee1394: raw1394: Fix async send + * firewire: Add missing byteswapping for receive DMA programs. + * firewire: prefix modules with firewire- instead of fw- + * firewire: fix return code + * [libata] Add drive to NCQ blacklist + * [ARM] enable arbitary speed tty ioctls and split input/output speed + * Input: db9 - do not ignore dev2 module parameter + * Input: logips2pp - fix typo in Kconfig + * [XFS] Write at EOF may not update filesize correctly. + * [SCSI] pluto: Use wait_for_completion_timeout. + * [SPARC64]: Kill unused DIE_PAGE_FAULT enum value. + * [SPARC64]: Don't be picky about virtual-dma values on sun4v. + * [SPARC32]: Removes mismatch section warnigs in sparc time.c file + * [SERIAL] sunzilog: section mismatch fix + * [SPARC64]: PCI device scan is way too verbose by default. + * [SCSI] jazz_esp: Converted to use esp_core. + * [SCSI] ESP: Kill SCSI_ESP_CORE and link directly just like jazz_esp + * [SPARC64]: Fix typo in sun4v_hvapi_register error handling. + * [SPARC64]: Report proper system soft state to the hypervisor. + * [SPARC64]: Negotiate hypervisor API for PCI services. + * [SPARC64]: Use machine description and OBP properly for cpu probing. + * [SPARC64]: Eliminate NR_CPUS limitations. + * [SPARC64]: arch/sparc64/time.c doesn't compile on Ultra 1 (no PCI) + * [SPARC]: Linux always started with 9600 8N1 + * [SPARC64]: Fix _PAGE_EXEC_4U check in sun4u I-TLB miss handler. + * [SPARC]: Emulate cmpxchg like parisc + * [SPARC]: Mark as emulating cmpxchg, add appropriate depends for DRM. 
+ * [SPARC64]: Fix two bugs wrt. kernel 4MB TSB. + * [SPARC64]: Fill holes in hypervisor APIs and fix KTSB registry. + * mac80211: fail back to use associate from reassociate + * mac80211: fix memory leak when defrag fragments + * mac80211: always set carrier status on open + * mac80211: avoid null ptr deref in ieee80211_ibss_add_sta + * prism54: fix monitor mode oops + * ieee80211: fix incomplete error message + * softmac: alloc_ieee80211() NULL check + * hostap: Allocate enough tailroom for TKIP + * sparc64: fix alignment bug in linker definition script + * USB: replace flush_workqueue with cancel_sync_work + * ACPICA: allow Load(OEMx) tables + * ACPI: thermal: Replace pointer with name in trip_points + * ACPI: extend "acpi_osi=" boot option + * IB/mthca: Fix handling of send CQE with error for QPs connected to SRQ + * IPoIB/cm: Fix performance regression on Mellanox + * IB/cm: Fix stale connection detection + * IB/mlx4: Fix last allocated object tracking in bitmap allocator + * NOHZ: prevent multiplication overflow - stop timer for huge timeouts + * random: fix error in entropy extraction + * random: fix seeding with zero entropy + * ACPI: Make _OSI(Linux) a special case + * ACPI: add __init to acpi_initialize_subsystem() + * [PARISC] fix "ENTRY" macro redefinition + * [PARISC] fix section mismatch in smp.c + * [PARISC] remove remnants of parisc-specific softirq code + * [PARISC] fix trivial spelling nit in asm/linkage.h + * [PARISC] fix null ptr deref in unwind.c + * [PARISC] fix "reduce size of task_struct on 64-bit machines" fallout + * [PARISC] be more defensive in process.c::get_wchan + * [ARM] use __used attribute + * [ARM] Fix stacktrace FP range checking + * [ARM] oprofile: avoid lockdep warnings on mpcore oprofile init + * [ARM] 4411/1: KS8695: Another serial driver fix + * [ARM] 4412/1: S3C2412: reset errata fix + * [ARM] 4414/1: S3C2443: sparse fix for clock.c + * [ARM] 4415/1: AML5900: fix sparse warnings from map_io + * [ARM] 4416/1: NWFPE: fix undeclared symbols + * [ARM] 4410/1: Remove extern declarations in coyote/ixdpg425-pci.c + * [ARM] 4394/1: ARMv7: Add the TLB range operations + * [ARM] 4417/1: Serial: Fix AMBA drivers locking + * sky2: dont set bogus bit in PHY register + * sky2: checksum offload plus vlan bug + * sky2: program proper register for fiber PHY + * defxx: Fix the handling of ioremap() failures + * e1000: restore netif_poll_enable call but make sure IRQs are off + * sky2: enable IRQ on duplex renegotiation + * ehea: Fixed multi queue RX bug + * [SCSI] fix CONFIG_SCSI_WAIT_SCAN=m + * [SCSI] qla2xxx: fix timeout in qla2x00_down_timeout + * [ARM] Fix some section mismatch warnings + * alpha: cleanup in bitops.h + * alpha: support new syscalls + * fix possible null ptr deref in kallsyms_lookup + * NFS: Fix a refcount leakage in O_DIRECT + * a bug in ramfs_nommu_resize function, passing old size to vmtruncate + * sh: Fix pcrel too far for in_nmi label. + * sh: Trivial fix for dma-api compile failure. + * sh: Fix vsyscall build failure. + * sh: trivial build cleanups. 
+ * sh: support older gcc's + * [ALSA] HDA: Add support for Gateway NX860 + * [ALSA] HDA: Add more systems to Sigmatel codec + * [ALSA] HDA: Fix headphone mute issue on non-eapd Conexant systems + * [ALSA] hda-codec - Add support for ASUS A8J modem + * [ALSA] ali5451 - Fix possible NULL dereference + * [ALSA] hda-intel: fix ASUS M2V detection + * [ALSA] Fix ASoC s3c24xx-pcm spinlock bug + * [ALSA] hda-codec - Add quirk for MSI S420 + * [ALSA] hda-codec - Add quirk for Supermicro PDSBA to alc883_cfg_tbl[] + * [ALSA] hda-codec - Add support for MSI K9N Ultra + * [ALSA] hda-codec - Fix pin configs for Gateway MX6453 + * [ALSA] hda-codec - Fix input with STAC92xx + * [ALSA] hda-codec - Fix STAC922x capture boost level + * [CRYPTO] cryptd: Fix problem with cryptd and the freezer + * [CASSINI]: Fix printk message typo. + * [XFRM]: Allow XFRM_ACQ_EXPIRES to be tunable via sysctl. + * [XFRM]: xfrm_larval_drop sysctl should be __read_mostly. + * [IPSEC]: Fix IPv6 AH calculation in outbound + * [IPV6] ROUTE: No longer handle ::/0 specially. + * [NET]: parse ip:port strings correctly in in4_pton + * [IPSEC]: Fix panic when using inter address familiy IPsec on loopback. + * [IPV4]: Kill references to bogus non-existent CONFIG_IP_NOSIOCRT + * [AF_PACKET]: Kill bogus CONFIG_PACKET_MULTICAST + * [IPV6]: Fix build warning. + * [AF_PACKET]: Kill CONFIG_PACKET_SOCKET. + * [SOCK]: Shrink struct sock by 8 bytes on 64-bit. + * [TCP]: Consolidate checking for tcp orphan count being too big. + * [NET] napi: Call __netif_rx_complete in netif_rx_complete + * [IPV6] ADDRCONF: Fix conflicts in DEVCONF_xxx constant. + * [TCP] tcp_probe: a trivial fix for mismatched number of printl + arguments. + * [TCP] tcp_probe: use GCC printf attribute + * [BRIDGE]: Reduce frequency of forwarding cleanup timer in bridge. + * [BRIDGE]: Round off STP perodic timers. + * [IPSEC]: Add xfrm_sysctl.txt. + * [SPARC64]: Add missing NCS and SVC hypervisor interfaces. + * [SPARC32]: Build fix. + * [SPARC]: Missing #include in drivers/sbus/char/flash.c + * [ALSA] version 1.0.14 + * neofb: Fix pseudo_palette array overrun in neofb_setcolreg + * smpboot: fix cachesize comparison in smp_tune_scheduling() + * at91: fix enable/disable_irq_wake symmetry in pcmcia driver + * SLUB: More documentation + * pci-quirks: fix MSI disabling on RS400-200 and RS480 + * ntfs_init_locked_inode(): fix array indexing + * m68k: runtime patching infrastructure + * SLUB: Fix NUMA / SYSFS bootstrap issue + * afs: needs sched.h + * m68k: discontinuous memory support + * [S390] Add exception handler for diagnose 224 + * [S390] dasd_eer: use mutex instead of semaphore + * [S390] arch/s390/kernel/debug.c: use mutex instead of semaphore + * [S390] raw3270: use mutex instead of semaphore + * [S390] Fix section annotations. + * [S390] cio: Use device_schedule_callback() for removing disconnected + devices. + * [S390] cio: deregister ccw device when pgid disband failed + * ACPI: thinkpad-acpi: do not use named sysfs groups + * ieee1394: fix calculation of sysfs attribute "address" + * ieee1394: sbp2: offer SAM-conforming target port ID in sysfs + * firewire: fw-sbp2: implement sysfs ieee1394_id + * firewire: add to MAINTAINERS + * firewire: Implement suspend/resume PCI driver hooks. + * firewire: Change struct fw_cdev_iso_packet to not use bitfields. + * firewire: Install firewire-constants.h and firewire-cdev.h for + userspace. + * EXT4: Fix whitespace + * Remove unnecessary exported symbols. 
+ * ext4: Extent overlap bugfix + * When ext4_ext_insert_extent() fails to insert new blocks + * Define/reserve new ext4 superblock fields + * msi: fix ARM compile + * PCI: disable MSI by default on systems with Serverworks HT1000 chips + * PCI: Fix pci_find_present + * PCI: i386: fixup for Siemens Nixdorf AG FSC Multiprocessor Interrupt + Controllers + * PCI: quirk disable MSI on via vt3351 + * [XTENSA] fix bit operations in bitops.h + * [XTENSA] Spelling fixes in arch/xtensa + * [XTENSA] fix sources using deprecated assembler directive + * [XTENSA] Remove multi-exported symbols from xtensa_ksyms.c + * [XTENSA] Use generic 64-bit division + * [XTENSA] clean-up header files + * [XTENSA] Move common sections into bss sections + * [XTENSA] Remove non-rt signal handling + * Xtensa: use asm-generic/fcntl.h + * [JFFS2] Fix buffer length calculations in jffs2_get_inode_nodes() + * Fix vmi.c compilation + * x86_64: allocate sparsemem memmap above 4G + * Add select PHYLIB to the UCC_GETH Kconfig option + * Fix possible UDF data corruption + * m68k: parenthesis balance + * msi: fix the ordering of msix irqs + * msi: mask the msix vector before we unmap it + * potential parse error in ifdef + * parse errors in ifdefs + * pci_ids: update patch for Intel ICH9M + * x86: fix oprofile double free + * Work around Dell E520 BIOS reboot bug + * fix compat futex code for private futexes + * skeletonfb: fix of xxxfb_setup ifdef + * vt8623fb: arkfb: null pointer dereference fix + * cfag12864bfb: Use sys_ instead of cfb_ framebuffer accessors + * fbdev: Move declaration of fb_class to + * misc/tifm_7xx1: replace deprecated irq flag + * add a trivial patch style checker + * Documentation: How to use GDB to decode OOPSes + * RTC: use fallback IRQ if PNP tables don't provide one + * memory hotplug: fix unnecessary calling of init_currenty_empty_zone() + * tty: fix leakage of -ERESTARTSYS to userland + * ISDN4Linux: fix maturity label + * Fix broken CLIR in isdn driver + * prism54: MAINTAINERS update + * atmel_spi dma address bugfix + * h8300 trival patches + * ALPHA: support graphics on non-zero PCI domains + * ALPHA: correct low-level I/O routines for sable-lynx + * ALPHA: misc fixes + * Better documentation for ERESTARTSYS + * serial_core.h: include + * SPI: Freescale iMX SPI controller driver fixes + * SLUB: fix locking for hotplug callbacks + * pm3fb: switching between X and fb fix + * microcode: fix section mismatch warning + * isdn: fix section mismatch warnings + * acpi: fix section mismatch warning in asus + toshiba + * kvm: fix section mismatch warning in kvm-intel.o + * net/hp100: fix section mismatch warning + * timer statistics: fix race + * timer stats: speedups + * [SCSI] aacraid: fix shutdown handler to also disable interrupts. + * [MTD] Fix error checking after get_mtd_device() in get_sb_mtd functions + * [JFFS2] Fix obsoletion of metadata nodes in jffs2_add_tn_to_tree() + * ACPI: Section mismatch ... 
acpi_map_pxm_to_node + * ACPICA: Support for external package objects as method arguments + * Pull now into release branch + * Pull osi-now into release branch + * [POWERPC] Update documentation for of_find_node_by_type() + * [POWERPC] Fix ppc32 single-stepping out of syscalls + * [POWERPC] Fix compiler/assembler flags for Ebony platform boot files + * [POWERPC] Fix possible access to free pages + * [POWERPC] ps3/interrupt.c uses get_hard_smp_processor_id + * [POWERPC] pasemi idle uses hard_smp_processor_id + * [POWERPC] Create a zImage for legacy iSeries + * [POWERPC] Don't use HOSTCFLAGS in BOOTCFLAGS + * [POWERPC] Fix compile warning in pseries xics code + * [POWERPC] Fix return from pte_alloc_one() in out-of-memory case + * [POWERPC] Compare irq numbers with NO_IRQ not IRQ_NONE + * [POWERPC] Don't allow PMAC_APM_EMU for 64-bit + * [POWERPC] Fix compile breakage for IBM/AMCC 4xx arch/ppc platforms + * [POWERPC] Fix zImage.coff generation for 32-bit pmac + * [ARM] 4392/2: Do not corrupt the SP register in compressed/head.S + * [ARM] 4418/1: AT91: Number of programmable clocks differs + * [ARM] 4419/1: AT91: SAM9 USB clocks check for suspending + * [ARM] 4422/1: Fix default value handling in gpio_direction_output (PXA) + * [ARM] Solve buggy smp_processor_id() usage + * qla3xxx: device doesnt do hardware checksumming. + * VLAN: kill_vid is only useful for VLAN filtering devices + * sky2: Fix VLAN unregistration + * 8139cp: fix VLAN unregistration + * atl1: eliminate unneeded kill_vid code + * network drivers: eliminate unneeded kill_vid code + * e1000: disable polling before registering netdevice + * smc91x: sh solution engine fixes. + * Update tulip maintainer email address + * NetXen: Removal of extra free_irq call + * myri10ge: report link up/down in standard ethtool way + * NET: add MAINTAINERS entry for ucc_geth driver + * [ARM] 4421/1: AT91: Value of _KEY fields. + * [PARISC] Fix bug when syscall nr is __NR_Linux_syscalls + * [AF_UNIX]: Make socket locking much less confusing. + * [TG3]: Fix link problem on Dell's onboard 5906. + * [AF_UNIX]: Fix datagram connect race causing an OOPS. + * [TCP]: Use default 32768-61000 outgoing port range in all cases. + * [ATM]: Fix warning. + * [NET]: Make net watchdog timers 1 sec jiffy aligned. + * [NET]: Fix comparisons of unsigned < 0. + * [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) + * [NET] gso: Fix GSO feature mask in sk_setup_caps + * [IPV4]: Fix "ipOutNoRoutes" counter error for TCP and UDP + * [ICMP]: Fix icmp_errors_use_inbound_ifaddr sysctl + * [VIDEO]: XVR500 and XVR2500 require FB=y + * [ATA]: Don't allow to enable this for SPARC64 without PCI. + * sh: Fix in_nmi symbol build error. + * sh: microdev: Fix compile warnings. + * sh: Fix SH4-202 clock fwk set_rate() mismatch. + * sh: voyagergx: Fix build warnings. + * sh: ioremap() through PMB needs asm/mmu.h. + * sh: Fix se73180 platform device registration. + * Input: ucb1x00 - do not access input_dev->private directly + * Input: reduce raciness when input handlers disconnect + * [PARISC] Fix kernel panic in check_ivt + * [SCSI] atari_NCR5380: update_timeout removal + * [SCSI] JAZZ ESP and SUN ESP need SPI_ATTRS + * [CIFS] fix mempool destroy done in wrong order in cifs error path + * SPI dynamic busid generation bugfix + * mtrr atomicity fix + * vanishing ioctl handler debugging + * libata: always use polling SETXFER + * Linux 2.6.22-rc4 + * [SPARC64]: Move topology init code into new file, sysfs.c + * [SPARC64]: Export basic cpu properties via sysfs. 
+ * [SPARC64]: Fix service channel hypervisor function names. + * [SPARC64]: Provide mmu statistics via sysfs. + * [SPARC64]: Proper multi-core scheduling support. + * [SPARC64]: Make core and sibling groups equal on UltraSPARC-IV. + * [SPARC64]: Fix {mc,smt}_capable(). + * [SPARC64]: Fill in gaps in non-PCI dma_*() NOP implementation. + * [ATA]: Back out bogus (SPARC64 && !PCI) Kconfig depends. + * [VIDEO]: Fix section mismatch warning in promcon. + * [CIFS] whitespace cleanup + * [ARM] Fix 4417/1: Serial: Fix AMBA drivers locking + * [VIDEO] ffb: The pseudo_palette is only 16 elements long + * [ARM] pxa: fix pxa27x keyboard driver + * [VIDEO] sunxvr2500fb: Fix pseudo_palette array size + * [VIDEO] sunxvr500fb: Fix pseudo_palette array size + * [CIFS] whitespace cleanup part 2 + * [CIFS] Missing flag on negprot needed for some servers to force packet + signing + * [MIPS] Atlas, Malta, SEAD: Remove scroll from interrupt handler. + * [MIPS] Remove duplicate fpu enable hazard code. + * [MIPS] EMMA2RH: remove dead KGDB code + * [MIPS] RM300: Fix MMIO problems by marking the PCI INT ACK region busy + * [MIPS] Fix VGA corruption on RM300C + * [MIPS] Drop __ARCH_WANT_SYS_FADVISE64 + * [MIPS] Make dma_map_sg handle sg elements which are longer than one + page + * [MIPS] Fix some system calls with long long arguments + * [MIPS] Remove prototype for deleted function qemu_handle_int + * [MIPS] Fix some minor typoes in arch/mips/Kconfig. + * [MIPS] Fix warning by moving do_default_vi into CONFIG_CPU_MIPSR2_SRS + * [AGPGART] intel_agp: cleanup intel private data + * [AGPGART] intel_agp: use table for device probe + * [AGPGART] intel_agp: add support for 965GME/GLE + * [AGPGART] intel_agp: add support for 945GME + * [AGPGART] intel_agp: Add support for G33, Q33 and Q35 chipsets + * ocfs2: Fix masklog breakage + * ocfs2: Fix invalid assertion during write on 64k pages + * [POWERPC] pasemi: Fix iommu + 64K PAGE_SIZE bug + * [POWERPC] spufs: Refuse to load the module when not running on cell + * [POWERPC] spufs: Hook up spufs_release_mem + * [POWERPC] spufs: Fix gang destroy leaks + * [POWERPC] spufs: Free mm if spufs_fill_dir() failed + * [POWERPC] spufs: Synchronize pte invalidation vs ps close + * [POWERPC] spufs scheduler: Fix wakeup races + * [POWERPC] Fix pci_setup_phb_io_dynamic for pci_iomap + * [POWERPC] cbe_cpufreq: Limit frequency via cpufreq notifier chain + * [POWERPC] scc_sio: Fix link failure + * [POWERPC] Fix typo in booting-without-of-txt section numbering + * [POWERPC] spufs: Don't yield nosched context + * [POWERPC] Add table of contents to booting-without-of.txt + * [POWERPC] spufs: Fix error handling in spufs_fill_dir() + * mmc-atmel: remove linux/mmc/protocol.h dependencies + * au1xmmc: Replace C code with call to ARRAY_SIZE() macro. + * mmc: fix broken if clause + * mmc: don't call switch on old cards + * [POWERPC] Fix building of COFF zImages + * checkpatch.pl: should be executable + * Restrict clearing TIF_SIGPENDING + * mlx4_core: Fix CQ context layout + * mlx4_core: Initialize ctx_list and ctx_lock earlier + * mlx4_core: Free catastrophic error MSI-X interrupt with correct dev_id + * IB/mthca, mlx4_core: Fix typo in comment + * [BNX2]: Fix netdev watchdog on 5708. + * [BNX2]: Add missing wait in bnx2_init_5709_context(). + * [BNX2]: Enable DMA on 5709. + * [BNX2]: Fix occasional counter corruption on 5708. + * [BNX2]: Update version and reldate. 
+ * [TCP]: Honour sk_bound_dev_if in tcp_v4_send_ack + * [IPV4]: Only panic if inetdev_init fails for loopback + * [IPV4]: Convert IPv4 devconf to an array + * [IPV4]: Add default config support after inetdev_init + * [IPV4]: Restore old behaviour of default config values + * [RFKILL]: Make rfkill->name const + * [TCP]: Use LIMIT_NETDEBUG in tcp_retransmit_timer(). + * [TCP] tcp_probe: Attach printf attribute properly to printl(). + * [NETLINK]: Mark netlink policies const + * [RTNETLINK]: ifindex 0 does not exist + * [NETFILTER]: nf_conntrack: fix helper module unload races + * [NETFILTER]: ip_tables: fix compat related crash + * [NETFILTER]: nf_conntrack_amanda: fix textsearch_prepare() error check + * [AF_UNIX]: Fix stream recvmsg() race. + * [UDP]: Revert 2-pass hashing changes. + * [NET]: Avoid duplicate netlink notification when changing link state + * [NET_SCHED]: Fix filter double free + * xfrm: Add security check before flushing SAD/SPD + * [SPARC64]: Fix 2 bugs in PCI Sabre bus scanning. + * [SPARC64]: Fix SBUS IRQ regression caused by PCI-E driver. + * frv: build fix + * enable interrupts in user path of page fault. + * RAMFS NOMMU: missed POSIX UID/GID inode attribute checking + * [SPARC64]: Include instead of . + * [SPARC64]: Handle PCI bridges without 'ranges' property. + * mlx4_core: Check firmware command interface revision + * mlx4_core: Don't set MTT address in dMPT entries with PA set + * IB/mlx4: Fix zeroing of rnr_retry value in ib_modify_qp() + * RDMA/cma: Fix initialization of next_port + * IB/mlx4: Make sure RQ allocation is always valid + * splice: move inode size check into generic_file_splice_read() + * splice: remove do_splice_direct() symbol export + * pipe: move pipe_inode_info structure decleration up before it's used + * splice: move balance_dirty_pages_ratelimited() outside of splice actor + * splice: __generic_file_splice_read: fix i_size_read() length checks + * splice: __generic_file_splice_read: fix read/truncate race + * V4L/DVB (5702): Fix Kconfig items to avoid linkedition errors + * V4L/DVB (5700): Saa7111: fix picture settings cache bug + * V4L/DVB (5699): Cinergyt2: fix file release handler + * V4L/DVB (5675): Move big PIO accesses from the interrupt handler to a + workhandler + * V4L/DVB (5716): Tda10086,tda826x: fix tuning, STR/SNR values + * V4L/DVB (5720): Usbvision: fix urb allocation and submits + * V4L/DVB (5730): Remove unused V4L2_CAP_VIDEO_OUTPUT_POS + * V4L/DVB (5732): Add ivtv CROPCAP support and fix ivtv S_CROP for video + output. + * V4L/DVB (5736): Add V4L2_FBUF_CAP/FLAG_LOCAL/GLOBAL_INV_ALPHA + * V4L/DVB (5673): Fix audio stuttering for saa711x/ivtv when in radio + mode. + * V4L/DVB (5761): Fix broken b2c2 dependency on non x86 architectures + * V4L/DVB (5751): Ivtv: fix ia64 printk format warnings. + * serverworks: remove crappy code + * serverworks: fix CSB6 tuning logic + * it821x: RAID mode fixes + * ide: HPA detect from resume + * ide: generic IDE PCI driver, add another device exception + * hpt366: disallow Ultra133 for HPT374 + * Add the PATA controller device ID to pci_ids.h for MCP73/MCP77. 
+ * ide: Add the MCP73/77 support to PATA driver + * [CIFS] CIFS should honour umask + * update Documentation/driver-model/platform.txt + * Driver core: keep PHYSDEV for old struct class_device + * Driver core: kill unused code + * kobject: use the proper printk level for kobject error + * firmware: remove orphaned Email + * [IPV4]: Do not remove idev when addresses are cleared + * [NetLabel]: consolidate the struct socket/sock handling to just struct + sock + * [CIPSO]: Fix several unaligned kernel accesses in the CIPSO engine. + * USB: set default y for CONFIG_USB_DEVICE_CLASS + * usblp: Don't let suspend to kill ->used + * USB: usb gadgets avoid le{16,32}_to_cpup() + * USB: UNUSUAL_DEV: Sync up some reported devices from Ubuntu + * USB: cxacru: add Documentation file + * USB: cxacru: create sysfs attributes in atm_start instead of bind + * USB: cxacru: ignore error trying to start ADSL in atm_start + * USB: Fix up bogus bInterval values in endpoint descriptors + * OHCI: Fix machine check in ohci_hub_status_data + * update checkpatch.pl to version 0.03 + * m68knommu: fix ColdFire timer off by 1 + * nommu: report correct errno in message + * loop: preallocate eight loop devices + * document Acked-by: + * update feature-removal-schedule.txt to include deprecated functions + * mount -t tmpfs -o mpol=: check nodes online + * slab: fix alien cache handling + * potential parse error in ifdef part 3 + * SLUB: return ZERO_SIZE_PTR for kmalloc(0) + * uml: fix kernel stack size on x86_64 + * Documentation/atomic_ops.txt typo fix + * Move three functions that are only needed for CONFIG_MEMORY_HOTPLUG + * Char: stallion, don't fail with less than max panels + * Char: stallion, alloc tty before pci devices init + * Char: stallion, proper fail return values + * uml: get declaration of simple_strtoul + * isdn/diva: fix section mismatch + * sata_promise: use TF interface for polling NODATA commands + * rt-mutex: fix stale return value + * rt-mutex: fix chain walk early wakeup bug + * pi-futex: fix exit races and locking problems + * fix sysrq-m oops + * x86_64: oops_begin() fix + * reiserfs: mailing list has moved + * checkpatch: produce fewer lines of output + * MAINTAINERS: corrections + * hexdump: more output formatting + * update checkpatch.pl to version 0.04 + * Protect from multiple inclusion + * [IrDA]: Fix Rx/Tx path race. + * [IrDA]: f-timer reloading when sending rejected frames. 
+ * ibmveth: Fix h_free_logical_lan error on pool resize + * ibmveth: Automatically enable larger rx buffer pools for larger mtu + * typo in via-velocity.c + * NetXen: Fix ping issue after reboot on Blades with 3.4.19 firmware + * NetXen: Fix compile failure seen on PPC architecture + * ehea: Fixed possible kernel panic on VLAN packet recv + * phylib: add RGMII-ID mode to the Marvell m88e1111 PHY to fix broken + ucc_geth + * net: fix typo in drivers/net/usb/Kconfig + * remove unused variable in pata_isapnp + * libata: disable NCQ for HITACHI HTS541680J9SA00/SB21C7EP + * libata: fix probe time irq printouts + * libata: print device model and firmware revision for ATAPI devices + * libata: fix hw_sata_spd_limit initialization + * ahci: Add MCP73/MCP77 support to AHCI driver + * libata-core/sff: Fix multiple assumptions about DMA + * libata: Correct abuse of language + * libata passthru: update protocol numbers + * libata passthru: support PIO multi commands + * libata passthru: map UDMA protocols + * libata passthru: always enforce correct DEV bit + * libata passthru: update cached device paramters + * i915: add new pciids for 945GME, 965GME/GLE + * drm/i915: Add support for the G33, Q33, and Q35 chipsets. + * drm: fix radeon setparam on 32/64 bit systems. + * [ARM] VFP: fix section mismatch error + * libata: force PIO on IOMEGA ZIP 250 ATAPI + * libata: limit post SRST nsect/lbal wait to ~100ms + * Blackfin arch: remove defconfig file + * Blackfin arch: DMA code minor naming convention fix + * Blackfin arch: spelling fixes + * Blackfin arch: fix bug ad1836 fails to build properly for BF533-EZKIT + * Blackfin arch: all symbols were offset by 4k, since we didn't have the + __text label. + * Blackfin arch: mark our memory init functions with __init so they get + freed after init + * Blackfin arch: implement a basic /proc/sram file for L1 allocation + visibility + * Blackfin arch: fixup Blackfin MAINTIANERS team member list + * Blackfin arch: scrub old console defines + * Blackfin arch: update defconfigs + * Blackfin arch: unify differences between our diff head.S files -- no + functional changes + * Blackfin arch: move more of our startup code to .init so it can be + freed once we are up and running + * Blackfin arch: add proper ENDPROC() + * Blackfin arch: try to split up functions like this into smaller units + according to LKML review + * Blackfin arch: fix spelling typo in output + * Blackfin arch: As Mike pointed out range goes form m..MAX_BLACKFIN_GPIO + -1 + * Blackfin arch: add missing gpio.h header to fix compiling in some pm + configurations + * Blackfin arch: add support for Alon Bar-Lev's dynamic kernel + command-line + * Blackfin arch: fix bug can not wakeup from sleep via push buttons + * Blackfin arch: make sure we initialize our L1 Data B section properly + based on the linked kernel + * Blackfin arch: redo our linker script a bit + * Blackfin arch: move HI/LO macros into blackfin.h and punt the rest of + macros.h as it includes VDSP macros we never use + * Blackfin serial driver: hook up our UARTs STP bit with userspaces + CMSPAR + * Blackfin serial driver: ignore framing and parity errors + * Blackfin serial driver: actually implement the break_ctl() function + * Blackfin serial driver: decouple PARODD and CMSPAR checking from PARENB + * Blackfin RTC drivers: update MAINTAINERS information + * Blackfin SPI driver: tweak spi cleanup function to match newer kernel + changes + * [ARM] 4442/1: OSIRIS: Fix CPLD register definitions + * [ARM] 4443/1: OSIRIS: Add watchdog device to 
machine devices + * [ARM] 4444/2: OSIRIS: CPLD suspend fix + * [ARM] 4445/1: ANUBIS: Fix CPLD registers + * Blackfin SPI driver: fix bug SPI DMA incomplete transmission + * Blackfin SMC91X ethernet supporting driver: SMC91C111 LEDs are note + drived in the kernel like in uboot + * [MIPS] Fix KMODE for the R3000 + * [MIPS] SMTC: Don't set and restore irqregs ptr from self_ipi. + * [MIPS] Always install the DSP exception handler. + * [MIPS] Atlas: Fix build. + * [MIPS] Wire up utimensat, signalfd, timerfd, eventfd + * [MIPS] SMTC: Fix warning. + * [MIPS] SMTC: Don't continue in set_vi_srs_handler on detected bad + arguments. + * [MIPS] SMTC: The MT ASE requires to initialize c0_pagemask and + c0_wired. + * [MIPS] SMTC: Fix build error caused by nonsense code. + * [MIPS] Fix modpost warnings by making start_secondary __cpuinit + * [MIPS] Fix IP27 build + * [MIPS] Fix smp barriers in test_and_{change,clear,set}_bit + * libertas: scan two channels per scan command + * libertas: rename wlan_association_worker + * libertas: a debug output was missing a newline + * libertas: fix removal of all debugfs files + * libertas: remove deprecated pm_register and associated code + * libertas: remove __FILE__ from debug output + * libertas: remove unused/superfluous definitions of DEV_NAME_LEN + * libertas: move vendor & product id's into if_usb.c + * libertas: make libertas_wlan_data_rates static + * libertas: fix scanning from associate path + * libertas: exclude non-used code when PROC_DEBUG is not set + * libertas: make debug configurable + * libertas: tune debug code + * libertas: single out mesh code + * libertas: change debug output of libertas_interrupt() + * libertas: get rid of libertas_sbi_get_priv() + * libertas: fix SSID output + * libertas: changed some occurences of kmalloc() + memset(&a,0,sz) to + kzalloc() + * libertas: move reset_device() code main.c to if_usb.c + * libertas: split wlan_add_card() + * libertas: fixed transmission flow control on the mesh interface + * libertas: fix error handling of card initialization + * libertas: added transmission failures to mesh statistics + * libertas: wakeup both mesh and normal wakeup when getting out of scan + * libertas: indirect all hardware access via hw_XXXX functions + * libertas: move contents of fw.h to decl.h + * libertas: split module into two (libertas.ko and usb8xxx.ko) + * libertas: fix RESET logic at unload time + * libertas: let DRV_NAME be overridable + * libertas: remove unused variables in wlan_dev_t + * libertas: fixed incorrect assigment of fcs errors to frag errors + * libertas: add URB debug info + * libertas: fixed kernel oops on module/card removal + * libertas: call SET_NETDEV_DEV from common code + * libertas: replace 'macaddress' with 'bssid' + * libertas: correctly unregister mesh netdev on error + * libertas: don't tear down netdev in libertas_activate_card + * libertas: version bump (321p0) and cmds update for new fw (5.220.10.p0) + * libertas: updated mesh commands for 5.220.9.p11 + * libertas: make scan result handling more flexible + * libertas: fix 'keep previous scan' behavior + * libertas: cleanup of fwt_list_route processing + * libertas: fix oops on rmmod + * libertas: move channel changing into association framework + * libertas: make association paths consistent + * libertas: use MAC_FMT and MAC_ARG where appropriate + * libertas: use compare_ether_addr() rather than memcmp() where + appropriate + * libertas: fix debug enter/leave prints for + libertas_execute_next_command + * libertas: correctly balance 
locking in libertas_process_rx_command + * libertas: correct error report paths for wlan_fwt_list_ioctl + * libertas: fix deadlock SIOCGIWSCAN handler + * libertas: fix default adhoc channel + * libertas: honor specific channel requests during association + * libertas: send SIOCGIWSCAN event after partial scans too + * libertas: debug print spacing fixes in assoc.c + * libertas: add more verbose debugging to libertas_cmd_80211_authenticate + * libertas: Make WPA work through supplicant handshake + * libertas: updated readme file + * libertas: make mac address configuration work with mesh interface too + * libertas: split wext for eth and msh + * libertas: support for mesh autostart on firmware 5.220.11 + * libertas: fix character set in README + * libertas: sparse fixes + * libertas: first pass at fixing up endianness issues + * libertas: More endianness fixes. + * libertas: more endianness fixes, in tx.c this time + * libertas: don't byte-swap firmware version number. It's a byte array. + * libertas: fix big-endian associate command. + * libertas: tweak association debug output + * libertas: remove structure WLAN_802_11_SSID and libertas_escape_essid + * libertas: remove WPA_SUPPLICANT structure + * libertas: reduce SSID and BSSID mixed-case abuse + * kbuild: fix sh64 section mismatch problems + * cfg80211: fix signed macaddress in sysfs + * mac80211: fix debugfs tx power reduction output + * mac80211: Don't stop tx queue on master device while scanning. + * Input: usbtouchscreen - fix fallout caused by move from drivers/usb + * Input: i8042 - add ASUS P65UP5 to the noloop list + * Input: i8042 - add ULI EV4873 to noloop list + * [PARISC] remove global_ack_eiem + * libertas: pull current channel from firmware on mesh autostart + * libertas: deauthenticate from AP in channel switch + * libertas: actually send mesh frames to mesh netdev + * libertas: convert libertas_mpp into anycast_mask + * [PPP_MPPE]: Fix "osize too small" check. + * NetXen: Fix link status messages + * myri10ge: limit the number of recoveries + * myri10ge: report when the link partner is running in Myrinet mode + * myri10ge: update driver version + * sysfs: store sysfs inode nrs in s_ino to avoid readdir oopses + * sysfs: fix condition check in sysfs_drop_dentry() + * sysfs: fix race condition around sd->s_dentry, take#2 + * [TCP]: Fix left_out setting during FRTO + * Input: move input-polldev to drivers/input + * [SPARC64]: Wire up cookie based sun4v interrupt registry. + * [SPARC64]: Fix IO/MEM space sizing for PCI. + * [SPARC64]: Really fix parport. + * [SPARC64]: Fix args to sun4v_ldc_revoke(). + * [TCP]: Set initial_ssthresh default to zero in Cubic and BIC. + * mmc-omap: fix sd response type 6 vs. 
1 + * mmc: get back read-only switch function + * [SCTP]: Correctly set daddr for IPv6 sockets during peeloff + * [SCTP]: Allow unspecified port in sctp_bindx() + * [SCTP] Fix leak in sctp_getsockopt_local_addrs when copy_to_user fails + * [SCTP] Update pmtu handling to be similar to tcp + * [SCTP] Flag a pmtu change request + * [SCTP] Don't disable PMTU discovery when mtu is small + * [POWERPC] Fix per-cpu allocation on oldworld SMP powermacs + * [POWERPC] Fix console output getting dropped on platforms without + udbg_putc + * [AVR32] ratelimit segfault reporting rate + * [AVR32] gpio_*_cansleep() fix + * [AVR32] STK1000: Set SPI_MODE_3 in the ltv350qv board info + * [AVR32] Define ARCH_KMALLOC_MINALIGN to L1_CACHE_BYTES + * [MIPS] Malta: Fix for SOCitSC based Maltas + * [MIPS] Separate performance counter interrupts + * [MIPS] Fix builds where MSC01E_xxx is undefined. + * [TCP]: Add missing break to TCP option parsing code + * [IPV6] addrconf: Fix IPv6 on tuntap tunnels + * [AGPGART] intel_agp: fix device probe + * KVM: Prevent guest fpu state from leaking into the host + * splice: adjust balance_dirty_pages_ratelimited() call + * splice: fix leak of pages on short splice to pipe + * splice: only check do_wakeup in splice_to_pipe() for a real pipe + * [TCP]: Congestion control API RTT sampling fix + * [TCP]: Fix logic breakage due to DSACK separation + * [RXRPC] net/rxrpc/ar-connection.c: fix NULL dereference + * block: always requeue !fs requests at the front + * mm: Fix memory/cpu hotplug section mismatch and oops. + * Resume from RAM on HPC nx6325 broken + * ide-scsi: fix OOPS in idescsi_expiry() + * fix radeon setparam on 32/64 systems, harder. + * tty: restore locked ioctl file op + * i386: fix NMI watchdog not reserving its MSRs + * i386: use the right wrapper to disable the NMI watchdog + * SLUB slab validation: Alloc while interrupts are disabled must use + GFP_ATOMIC + * Restore shmid as inode# to fix /proc/pid/maps ABI breakage + * i386 mm: use pte_update() in ptep_test_and_clear_dirty() + * cpuset: zero malloc - fix for old cpusets + * toshiba_acpi: fix section mismatch in allyesconfig + * swsusp: Fix userland interface + * perfctr-watchdog: fix interchanged parameters to + release_{evntsel,perfctr}_nmi + * fuse: ->fs_flags fixlet + * md: fix two raid10 bugs + * md: fix bug in error handling during raid1 repair + * spi doc updates + * uml: remove PAGE_SIZE from libc code + * uml: kill x86_64 STACK_TOP_MAX + * random: fix output buffer folding + * Rework ptep_set_access_flags and fix sun4c + * SLUB: minimum alignment fixes + * udf: fix possible leakage of blocks + * hugetlb: fix get_policy for stacked shared memory files + * shm: fix the filename of hugetlb sysv shared memory + * Linux 2.6.22-rc5 + * [GFS2] flush the glock completely in inode_go_sync + * [DLM] fix a couple of races + * [GFS2] kernel changes to support new gfs2_grow command + * [GFS2] Kernel changes to support new gfs2_grow command (part 2) + * [GFS2] use zero_user_page + * [GFS2] Addendum patch 2 for gfs2_grow + * [GFS2] Reduce size of struct gdlm_lock + * [GFS2] Clean up inode number handling + * [GFS2] Quotas non-functional - fix bug + * [DLM] keep dlm from panicing when traversing rsb list in debugfs + * [DLM] block scand during recovery [1/6] + * [DLM] add lock timeouts and warnings [2/6] + * [DLM] dlm_device interface changes [3/6] + * [DLM] cancel in conversion deadlock [4/6] + * [DLM] fix new_lockspace error exit [5/6] + * [DLM] wait for config check during join [6/6] + * [DLM] fix compile breakage + * 
[GFS2] latest gfs2-nmw headers break userland build + * [DLM] Compile fix + * [DLM] timeout fixes + * [DLM] canceling deadlocked lock + * [DLM] dumping master locks + * [DLM] show default protocol + * [GFS2] Quotas non-functional - fix another bug + * [GFS2] Make the log reserved blocks depend on block size + * [DLM] fix socket shutdown + * [GFS2] fix jdata issues + * [GFS2] Fix sign problem in quota/statfs and cleanup _host structures + * [GFS2] Add nanosecond timestamp feature + * [DLM] fix reference counting + * [DLM] variable allocation + * [GFS2] Fix typo in rename of directories + * [GFS2] Fix bug in error path of inode + * [GFS2] Can't mount GFS2 file system on AoE device + * [GFS2] Recovery for lost unlinked inodes + * [GFS2] gfs2_lookupi() uninitialised var fix + * [GFS2] set plock owner in GETLK info + * [GFS2] return conflicts for GETLK + * [GFS2] Fix deallocation issues + * [DLM] don't require FS flag on all nodes + * [GFS2] Journaled file write/unstuff bug + * [GFS2] Remove bogus '\0' in rgrp.c + * [GFS2] Use zero_user_page() in stuffed_readpage() + * [GFS2] assertion failure after writing to journaled file, umount + * [GFS2] Simplify multiple glock aquisition + * [GFS2] Addendum to the journaled file/unmount patch + + -- Ben Collins Fri, 01 Jun 2007 12:15:58 -0400 + +linux-source-2.6.22 (2.6.22-6.13) gutsy; urgency=low + + [Ben Collins] + + * Bump ABI + * build/scripts: Remove all remnants of debconf from control scripts + * build/config: Re-enable paravirt/vmi + * build/config: Build ide-core as a module + * i386/x86_64: Allow disabling the putstr's from compressed boot wrapper + * PM: Do not require dev spew to get PM_DEBUG + * RTC: Ratelimit "lost interrupts" message + * UNUSUAL_DEV: Sync up some reported devices from Ubuntu + * build/d-i: Include ide-core in storage-core udeb, not that it's modular + * build/d-i: Make ide-modules depend on storage-code-modules + * build/config: Enable CONFIG_TIMER_STATS on x86_64. + * build/config: Disable CONFIG_RTC_DRV_CMOS + * build/config: Enable TIMER_STATS everywhere. + * build/config: Enable SND_AC97_POWER_SAVE + - LP: #116679 + * kmod: Improve call_usermodehelper_pipe to handle data close + * coredump: Convert to new call_usermodehelper_pipe symantics + * PPC: Only set hwif stuff when ide-core is non-modular + * PPC/MEDIABAY: Export some functions for modular ide-core/ppc-ide + + [Colin Watson] + + * Move isofs to storage-core-modules udeb from fs-core-modules. + + [Upstream Kernel Changes] + + * Input: logips2pp - add type 72 (PS/2 TrackMan Marble) + * Input: adbhid - do not access input_dev->private directly + * sh: Shut up compiler warnings in __do_page_fault(). + * sh: Fix up psw build rules for r7780rp. + * sh: Kill off pmb slab cache destructor. + * sh: landisk: rtc-rs5c313 support. + * sh: landisk: Header cleanups. + * input: hp680_ts compile fixes. 
+ * [ARM] 4375/1: sharpsl_pm: Fix compile warnings + * [ARM] 4376/1: Selects GENERIC_GPIO for ARCH_IXP4XX in Kconfig + * [ARM] 4378/1: KS8695: Serial driver fix + * [ARM] Remove Integrator/CP SMP platform support + * [ARM] 4382/1: iop13xx: fix msi support + * [ARM] 4383/1: iop: fix usage of '__init' and 'inline' in iop files + * [ARM] 4384/1: S3C2412/13 SPI registers offset correction + * [ARM] Update ARM syscalls + * [ARM] Silence OMAP kernel configuration warning + * [ARM] gic: Fix gic cascade irq handling + * [ARM] integrator: fix pci_v3 compile error with DEBUG_LL + * [ARM] ARMv6: add CPU_HAS_ASID configuration + * [CRYPTO] padlock: Make CRYPTO_DEV_PADLOCK a tristate again + * [CRYPTO] tcrypt: Add missing error check + * eventfd use waitqueue lock ... + * timerfd use waitqueue lock ... + * [IA64] Fix bogus messages about system calls not implemented. + * [IA64] Yet another section mismatch warning + * Fix roundup_pow_of_two(1) + * Further update of the i386 boot documentation + * cciss: Fix pci_driver.shutdown while device is still active + * Linux v2.6.22-rc2 + * [CRYPTO] api: Read module pointer before freeing algorithm + * powerpc: Fix the MODALIAS generation in modpost for of devices + * kbuild: include limits.h in sumversion.c for PATH_MAX + * kconfig: search harder for curses library in check-lxdialog.sh + * kbuild: make modpost section warnings clearer + * kbuild: make better section mismatch reports on i386, arm and mips + * kbuild: add "Section mismatch" warning whitelist for powerpc + * all-archs: consolidate .text section definition in asm-generic + * all-archs: consolidate .data section definition in asm-generic + * kbuild: introduce __init_refok/__initdata_refok to supress section + mismatch warnings + * init/main: use __init_refok to fix section mismatch + * mm: fix section mismatch warnings + * mm/slab: fix section mismatch warning + * IB/core: Free umem when mm is already gone + * IB/ipath: Fix potential deadlock with multicast spinlocks + * IB/core: Add helpers for uncached GID and P_Key searches + * IB/core: Use start_port() and end_port() + * IPoIB: Handle P_Key table reordering + * IB/ehca: Return proper error code if register_mr fails + * IB/mthca: Fix use-after-free on device restart + * IB/mlx4: Fix check of max_qp_dest_rdma in modify QP + * IB/mthca: Set GRH:HopLimit when building MLX headers + * IB/mlx4: Set GRH:HopLimit when sending globally routed MADs + * IB/mthca: Fix RESET to ERROR transition + * IB/mlx4: Fix RESET to RESET and RESET to ERROR transitions + * mlx4_core: Fix array overrun in dump_dev_cap_flags() + * IB/mlx4: Fix check of opcode in mlx4_ib_post_send() + * [IPV6]: Add ip6_tunnel.h to headers_install + * [RFKILL]: Fix check for correct rfkill allocation + * [NET]: Fix net/core/skbuff.c gcc-3.2.3 compilation error + * [TCP] FRTO: Add missing ECN CWR sending to one of the responses + * [TCP] FRTO: Prevent state inconsistency in corner cases + * [IPSEC] pfkey: Load specific algorithm in pfkey_add rather than all + * [NETFILTER]: nf_conntrack: fix use-after-free in helper destroy + callback invocation + * [NETFILTER]: nf_conntrack_ipv4: fix incorrect #ifdef config name + * [IPV4]: icmp: fix crash with sysctl_icmp_errors_use_inbound_ifaddr + * [NET]: Fix race condition about network device name allocation. 
+ * IB/mlx4: Pass send queue sizes from userspace to kernel + * [ARM] 4387/1: fix /proc/cpuinfo formatting for pre-ARM7 parts + * [ARM] 4388/1: no need for arm/mm mmap range checks for non-mmu + * [ARM] 4395/1: S3C24XX: add include of to relevant + machines + * [ARM] 4396/1: S3C2443: Add missing HCLK clocks + * [ARM] 4397/1: S3C2443: remove SDI0/1 IRQ ambiguity + * [ARM] 4398/1: S3C2443: Fix watchdog IRQ number + * [ARM] 4399/2: S3C2443: Fix SMDK2443 nand timings + * [ARM] 4400/1: S3C24XX: Add high-speed MMC device definition + * [ARM] at91_adc parenthesis balance + * [ARM] spelling fixes + * IB/mlx4: Check if SRQ is full when posting receive + * spelling fixes: arch/sh/ + * sh: revert addition of page fault notifiers + * sh: Wire up signalfd/timerfd/eventfd syscalls. + * sh: Fix up various compile warnings for SE boards. + * sh: Fix page size alignment in __copy_user_page(). + * sh: Disable psw support for R7785RP. + * fs: Kill sh dependency for binfmt_flat. + * sh: disable genrtc support. + * sh: sr.bl toggling around idle sleep. + * sh: Wire up kdump crash kernel exec in die(). + * sh: Fix clock multiplier on SH7722. + * sh: Fix dreamcast build for IRQ changes. + * [S390] cio: Update documentation. + * [S390] Wire up sys_utimensat. + * [S390] Wire up signald, timerfd and eventfd syscalls. + * [S390] Make use of kretprobe_assert. + * [S390] More verbose show_mem() like other architectures. + * Fix "fs: convert core functions to zero_user_page" + * Detach sched.h from mm.h + * Blackfin arch: Add Workaround for ANOMALY 05000257 + * Blackfin arch: add SPI MMC driver support on bf533-stamp, tested on + STAMP-BF533 + * Blackfin arch: ISP1761 doesn't work for USB flash disk + * Blackfin arch: fix a few random warnings + * Blackfin arch: Add configuration data for ISP176x on BF561 + * Blackfin arch: mark a bunch of local functions as static + * Blackfin arch: Fix reserved map after we changed PORT_H definition + * Blackfin arch: Move write to VR_CTL closer to IDLE + * Blackfin arch: DMA operation cleanup + * Blackfin arch: GPIO fix some defines + * Blackfin arch: fix trace output for FLAT binaries + * Blackfin arch: Fix bug using usb keyboard crashes kernel + * Blackfin arch: initial tepla-bf561 board support + * Blackfin arch: make sure we declare the revid functions as pure (since + they are) + * Blackfin arch: dont clear status register bits in SWRST so we can + actually use it + * Blackfin arch: finish removing p* volatile defines for MMRs + * Blackfin arch: move board specific setup out of common init code and + into the board specific init code + * Blackfin arch: issue reset via SWRST so we dont clobber the watchdog + state + * Blackfin arch: document why we have to touch the UART peripheral in our + boot up code + * Blackfin arch: dma_memcpy borken for > 64K + * Blackfin arch: dont clear the bit that tells coreb to start booting + * Blackfin arch: make sure we use local labels + * Blackfin arch: update blackfin header files to latest one in VDSP. + * Blackfin arch: cache SWRST value at bootup so other things like + watchdog can non-destructively query it + * Blackfin arch: fix signal handling bug + * Blackfin arch: Change NO_ACCESS_CHECK to ACCESS_CHECK + * Blackfin arch: add board default configs to blackfin arch + * Blackfin arch: update defconfig files + * Blackfin arch: update pm.c according to power management API change. 
+ * Blackfin serial driver: fix overhead issue + * Blackfin serial driver: implement support for ignoring parity/break + errors + * Blackfin SPI: cleanup according to David Brownell's review + * x86_64: Update defconfig + * i386: Update defconfig + * x86_64: Support x86_64 in make buildtar + * i386: Fix K8/core2 oprofile on multiple CPUs + * x86_64: Support gcc 5 properly + * i386: Clear MCE flag on AMD K6 + * i386: Fix wrong CPU error message in early boot path + * i386: Enable CX8/PGE CPUID bits early on VIA C3 + * x86_64: early_print kernel console should send CRLF not LFCR + * x86_64: vsyscall time() fix + * i386: fix PGE mask + * LDM: Fix for Windows Vista dynamic disks + * IB/ipoib: Fix typos in error messages + * IPoIB/cm: Fix SRQ WR leak + * IB/cm: Improve local id allocation + * e1000: Don't enable polling in open() (was: e1000: assertion hit in + e1000_clean(), kernel 2.6.21.1) + * declance: Remove a dangling spin_unlock_irq() thingy + * Add constant for FCS/CRC length (frame check sequence) + * ahci: disable 64bit dma on sb600 + * libata: Add Seagate STT20000A to DMA blacklist. + * pata_hpt366: Enable bits are unreliable so don't use them + * ata_piix: clean up + * libata: Kiss post_set_mode goodbye + * libata: Trim trailing whitespace + * partitions/LDM: build fix + * Make 'headerscheck' stop immediately on an error + * Fix headers check fallout + * [POWERPC] Fix smp_call_function to be preempt-safe + * [POWERPC] Add missing pmc_type fields in cpu_table + * [POWERPC] Fix typo: MMCR0_PMA0 != MMCR0_PMAO + * [POWERPC] Fix powerpc vmlinux.lds.S + * [POWERPC] Fix warning in 32-bit builds with CONFIG_HIGHMEM + * libertas: skb dereferenced after netif_rx + * drivers/net/wireless/libertas/fw.c: fix use-before-check + * drivers/net/wireless/libertas/rx.c: fix use-after-free + * [IA64] Improve unwind checking. + * [IA64] Only unwind non-running tasks. 
+ * [IA64] fix kmalloc(0) in arch/ia64/pci/pci.c + * i2c: Legacy i2c drivers shouldn't issue uevents + * i2c-tiny-usb: Fix truncated adapter name + * i2c-s3c2410: Fix build warning + * V4L/DVB (5639): Fix Kconfig dependencies for ivtv + * V4L/DVB (5640): Fix: em28xx shouldn't be selecting VIDEO_BUF + * V4L/DVB (5670): Adding new fields to v4l2_pix_format broke the ABI, + reverted that change + * V4L/DVB (5639a): Fix dst usage count + * V4L/DVB (5630): Dvb-core: Handle failures to create devices + * V4L/DVB (5680): Tuner-simple.c fix suport for SECAM with FI1216MF + * V4L/DVB (5690): Cafe_ccic: Properly power down the sensor + * V4L/DVB (5691): Ov7670: reset clkrc in rgb565 mode + * [IPSEC]: Fix warnings with casting int to pointer + * [AF_RXRPC]: AF_RXRPC depends on IPv4 + * [AF_RXRPC]: Make call state names available if CONFIG_PROC_FS=n + * [RTNETLINK]: Allow changing of subsets of netdevice flags in + rtnl_setlink + * [RTNETLINK]: Remove remains of wireless extensions over rtnetlink + * Input: iforce - fix force feedback not working + * Input: iforce - minor clean-ups + * Input: ALPS - force stream mode + * Input: ucb1400_ts - use sched_setscheduler() + * Input: ucb1x00-ts - remove commented out code + * Input: input-polldev - add module info + * Input: ads7846 - document that it handles tsc2046 too + * Input: ads7846 - SPI_CPHA mode bugfix + * USB: fix omninet memory leak found by coverity + * USB: remove useless check in mos7840 found by coverity + * usb-storage: ignore Sitecom WL-117 USB-WLAN + * USB: fix more ftdi-elan/u132-hcd #include lossage + * USB: handle more rndis_host oddities + * USB: remove usb DocBook warnings + * USB: address FIXME in usbnet w.r.t drivers claiming multiple interfaces + * EHCI: fix problem with BIOS handoff + * USB: more autosuspend timer stuff + * USB: remove unneeded WARN_ON + * USB: New device PID for ftdi_sio driver + * USB: set the correct Interrupt interval in usb_bulk_msg + * USB: fsl_usb2_udc: Fix UMTI_WIDE support and a compile warning + * USB: auerswald: fix file release handler + * USB: Remove duplicate IDs from option card driver + * USB: Deref URB after usbmon is done with it + * USB: remove short initial timeout for device descriptor fetch + * USB: don't try to kzalloc 0 bytes + * USB: Onetouch - switch to using input_dev->dev.parent + * USB: Fix debug output of ark3116 + * USB: usblp: Use correct DMA address in case of probe error + * USB: Fix USB OHCI Subvendor for Toshiba Portege 4000 + * USB: make the autosuspend workqueue thread freezable + * USB: handle errors in power/level attribute + * USB: fix ratelimit call semantics + * USB: ftdi_sio: Add USB Product Id for OpenDCC + * USB: ldusb bugfix + * USB: Add support for Sierra Wireless Aircard 595U + * USB: Add support for Olimex arm-usb-ocd JTAG interface serial port + * IB/mlx4: Don't allocate RQ doorbell if using SRQ + * [IA64] start_secondary() and smp_callin() should be __cpuinit + * add the IDE device ID for ATI SB700 + * ide/pci/serverworks.c: Fix corruption/timeouts with MegaIDE + * Add two missing chipsets to drivers/ide/ide-proc.c + * Match DMA blacklist entries between ide-dma.c and libata-core.c + * ide serverworks warning fixes + * freezer: close potential race between refrigerator and thaw_tasks + * freezer: fix vfork problem + * freezer: take kernel_execve into consideration + * freezer: fix kthread_create vs freezer theoretical race + * freezer: fix PF_NOFREEZE vs freezeable race + * freezer: move frozen_process() to kernel/power/process.c + * Ignore bogus ACPI info for 
offline CPUs + * SLUB Debug: Fix object size calculation + * fuse: fix mknod of regular file + * mpc52xx_psc_spi: fix it for CONFIG_PPC_MERGE + * spi doc update: describe clock mode bits + * NOHZ: Rate limit the local softirq pending warning output + * genhd: expose AN to user space + * genhd: send async notification on media change + * capability.h warning fix + * spi/spidev: check message size before copying + * uml: improve PTRACE_SYSEMU checking + * prohibit rcutorture from being compiled into the kernel + * Documentation: fix the explanation of Kconfig files + * Avoid zero size allocation in cache_k8_northbridges() + * recalc_sigpending_tsk fixes + * optimize compat_core_sys_select() by a using stack space for small fd + sets + * spi: potential memleak in spidev_ioctl + * fbdev: cleanup of sparc FB options + * pm2fb: RDAC_WR barriers clean up + * pm3fb: various fixes + * w100fb: fix compile warnings + * ps3fb: use FB_SYS_* instead of FB_CFB_* + * imxfb: remove ifdefs + * imxfb: fix memory hole + * Missing 'const' from reiserfs MIN_KEY declaration. + * uselib: add missing MNT_NOEXEC check + * fuse: generic_write_checks() for direct_io + * fuse: delete inode on drop + * fix unused setup_nr_node_ids + * SLUB Debug: fix check for super sized slabs (>512k 64bit, >256k 32bit) + * Char: cyclades, fix deadlock + * simplify cleanup_workqueue_thread() + * phantom: move to unlocked_ioctl + * Misc: phantom, take care of pci posting + * power: Fix sizeof(PAGE_SIZE) typo + * update dontdiff file + * signalfd: retrieve multiple signals with one read() call + * i2o: destroy event queue only when drv->event is set + * i2o: fix notifiers when max_drivers is configured + * i2o: eliminate a peculiar constraint on i2o_max_drivers + * i386, x86-64: show that CONFIG_HOTPLUG_CPU is required for suspend on + SMP + * md: avoid overflow in raid0 calculation with large components + * md: don't write more than is required of the last page of a bitmap + * md: fix bug with linear hot-add and elsewhere + * documentation: Documentation/initrd.txt + * HiSax: fix error checking for hisax_register()] + * applesmc - sensors patch missing from 2.6.22-rc2 + * Off by one in floppy.c + * eCryptfs: delay writing 0's after llseek until write + * document clocksources + * ehci-fsl: fix cache coherency problem on system with large memory + * Prevent going idle with softirq pending + * i386: fix early usage of atomic_add_return and local_add_return on real + i386 + * Documentation/memory-barriers.txt: various fixes + * omap_uwire: SPI_CPHA mode bugfix + * capifunc warning fixes + * drivers/isdn/hardware/eicon/message.c warning fixes + * i386 bigsmp: section mismatch fixes + * boot documentation: clarifications + * mmc: clean up unused parts of block driver + * mmc: mark unmaintained drivers + * mmc: Add maintainers for TI OMAP MMC interface + * mmc: add maintainer for iMX MMC interface + * mmc: add maintainer for ARM Primecell controller + * [CRYPTO] geode: Fix in-place operations and set key + * [Bluetooth] Always send HCI_Reset for Broadcom devices + * [Bluetooth] Fix L2CAP configuration parameter handling + * NFS: Avoid a deadlock situation on write + * NFS: Fix handful of compiler warnings in direct.c + * NFS: Fix nfs_direct_dirty_pages() + * Don't call a warnign a bug. It's a warning. 
+ * [IA64] Fix using uninitialized data in _PDC setup + * [IA64] Cleanup acpi header to reuse the generic _PDC defines + * Documentation: Fix up docs still talking about i_sem + * [IA64] acpi_get_sysname() should be __init + * IB/mlx4: Initialize send queue entry ownership bits + * IB/ehca: Fix number of send WRs reported for new QP + * IPoIB/cm: Fix timeout check in ipoib_cm_dev_stop() + * IPoIB/cm: Drain cq in ipoib_cm_dev_stop() + * ucc_geth: Fix MODULE_DEVICE_TABLE() duplication + * ucc_geth:trivial fix + * asix.c - Add Belkin F5D5055 ids + * fix compiler warning in fixed.c + * remove unnecessary dependency on VIA velocity config + * meth driver renovation + * spidernet: skb used after netif_receive_skb + * chelsio parenthesis fix + * forcedeth: fix cpu irq mask + * [NET_SCHED]: Fix qdisc_restart return value when dequeue is empty + * [IPV6]: Ignore ipv6 events on non-IPV6 capable devices. + * [ATM]: Use mutex instead of binary semaphore in idt77252 driver. + * [DCCP]: Use menuconfig objects. + * [IPVS]: Use menuconfig objects. + * [SCTP]: Use menuconfig objects. + * [TIPC]: Use menuconfig objects. + * [ARCNET]: Use menuconfig objects. + * [TR]: Use menuconfig objects. + * [RTNETLINK]: Fix sending netlink message when replace route. + * [TIPC]: Fixed erroneous introduction of for_each_netdev + * [DCCP]: Fix build warning when debugging is disabled. + * [NET_SCHED]: sch_htb: fix event cache time calculation + * [NETFILTER]: nf_conntrack_ftp: fix newline sequence number update + * [NETFILTER]: nf_conntrack_ftp: fix newline sequence number calculation + * [NETFILTER]: nf_conntrack_h323: fix ASN.1 types + * [NETFILTER]: nf_conntrack_h323: fix get_h225_addr() for IPv6 address + access + * [NETFILTER]: nf_conntrack_h323: remove unnecessary process of + Information signal + * [NETFILTER]: nf_conntrack_h323: add missing T.120 address in OLCA + * [NETFILTER]: nf_nat_h323: call set_h225_addr instead of + set_h225_addr_hook + * [NET]: "wrong timeout value" in sk_wait_data() v2 + * hpt3x2n: Correct revision boundary + * pata_sis: Fix and clean up some timing setups + * ata_piix: add short 40c quirk for Acer Aspire 2030, take #2 + * libata: don't consider 0xff as port empty if SStatus is available + * libata: -ENODEV during prereset isn't an error + * pata_via: Handle laptops via DMI + * [CASSINI]: Check pci_set_mwi() return value. + * [XFRM]: Allow packet drops during larval state resolution. + * [libata] sata_promise: fix flags typo + * [libata] sata_mv: add TODO list + * Fix build failure for drivers/ata/pata_scc.c + * libata: sata_sis fixes + * [libata] Fix decoding of 6-byte commands + * [libata] sata_via, pata_via: Add PCI IDs. 
+ * ocfs2: trylock in ocfs2_readpage() + * ocfs2: unmap_mapping_range() in ocfs2_truncate() + * ocfs2: use zero_user_page + * ocfs2: fix inode leak + * ocfs2: use generic_segment_checks + * pata: Trivia + * pata_hpt37x: Further improvements based on the IDE updates and vendor + drivers + * fix compat console unimap regression + * Linux 2.6.22-rc3 + + -- Ben Collins Thu, 31 May 2007 12:35:44 -0400 + +linux-source-2.6.22 (2.6.22-5.11) gutsy; urgency=low + + [Ben Collins] + + * build/headers/ppc: Correct asm-ppc -> asm for arch symlink + * build/headers/ia64: Fix find command line to correctly pull in *.lds + files + * Bump ABI + + [Upstream Kernel Changes] + + * [IA64] spelling fixes: arch/ia64/ + * [AVR32] Remove bogus comment in arch/avr32/kernel/irq.c + * [AVR32] optimize pagefault path + * [AVR32] Wire up signalfd, timerfd and eventfd + * [IA64] wire up {signal,timer,event}fd syscalls + * [IA64] kdump on INIT needs multi-nodes sync-up (v.2) + * [IA64] s/scalibility/scalability/ + * [AVR32] Implement platform hooks for atmel_lcdfb driver + * [IA64] Fix section conflict of ia64_mlogbuf_finish + * [SPARC64]: Add hypervisor API negotiation and fix console bugs. + * pata_scc had been missed by ata_std_prereset() switch + * libata: separate out ata_dev_reread_id() + * libata: during revalidation, check n_sectors after device is configured + * libata-acpi: add ATA_FLAG_ACPI_SATA port flag + * libata: fix shutdown warning message printing + * libata: track spindown status and skip spindown_compat if possible + * [ALSA] usb-audio: another Logitech QuickCam ID + * [ALSA] hda-codec - Make the mixer capability check more robust + * [ALSA] ASoC AC97 static GPL symbol fix + * [ALSA] ASoC AC97 device reg bugfix + * [ALSA] hda-codec - Fix ALC882/861VD codec support on some laptops + * [ALSA] version 1.0.14rc4 + * [ALSA] Fix probe of non-PnP ISA devices + * [ALSA] Include quirks from Ubuntu Dapper/Edgy/Feisty + * [ALSA] usbaudio - Coping with short replies in usbmixer + * [IA64] optimize pagefaults a little + * Fix ACPI suspend / device suspend ordering problem + * AFS: write back dirty data on unmount + * SLUB: It is legit to allocate a slab of the maximum permitted size + * slub: don't confuse ctor and dtor + * AFS: Fix afs_prepare_write() + * spi: fix spidev for >sizeof(long)/32 devices + * parport_pc needs dma-mapping.h + * Fix: find_or_create_page skips cpuset memory spreading. + * slob: implement RCU freeing + * Slab allocators: Drop support for destructors + * SLUB: Remove depends on EXPERIMENTAL and !ARCH_USES_SLAB_PAGE_STRUCT + * SLAB: Move two remaining SLAB specific definitions to slab_def.h + * SLUB: Define functions for cpu slab handling instead of using + PageActive + * slab: warn on zero-length allocations + * slub: fix handling of oversized slabs + * SLUB: slabinfo fixes + * SLUB: Do our own flags based on PG_active and PG_error + * Remove SLAB_CTOR_CONSTRUCTOR + * SLUB: Simplify debug code + * Slab allocators: define common size limitations + * acpi: fix potential call to a freed memory section. 
+ * i386/x86-64: fix section mismatch + * Make __vunmap static + * simplify compat_sys_timerfd + * Let smp_call_function_single return -EBUSY on UP + * Refine SCREEN_INFO sanity check for vgacon initialization + * make freezeable workqueues singlethread + * parport: mailing list is subscribers-only + * docbook: make kernel-locking table readable + * gpio interface loosens call restrictions + * rtc-omap build fix + * rtc kconfig clarification + * icom: add new sub-device-id to support new adapter + * make sysctl/kernel/core_pattern and fs/exec.c agree on maximum core + filename size + * ecryptfs: use zero_user_page + * i386: don't check_pgt_cache in flush_tlb_mm + * circular locking dependency found in QUOTA OFF + * swsusp: fix sysfs interface + * Fix page allocation flags in grow_dev_page() + * mm: more rmap checking + * NS16550A: Restore HS settings in EXCR2 on resume + * Fix incorrect prototype for ipxrtr_route_packet() + * sky2: remove Gigabyte 88e8056 restriction + * sky2: PHY register settings + * sky2: keep track of receive alloc failures + * sky2: MIB counter overflow handling + * sky2: remove dual port workaround + * sky2: memory barriers change + * small netdevices.txt fix + * ibm_emac: fix section mismatch warnings + * ibm_emac: improved PHY support + * ibm_emac: fix link speed detection change + * gianfar: Add I/O barriers when touching buffer descriptor ownership. + * spidernet: node-aware skbuff allocation + * NetXen: Fix NetXen driver ping on system-p + * ixgb: don't print error if pci_enable_msi() fails, cleanup minor leak + * e1000: Fix msi enable leak on error, don't print error message, cleanup + * drivers/ata: remove the wildcard from sata_nv driver + * sata_nv: fix fallout of devres conversion + * libata: remove libata.spindown_compat + * sata_via: pcim_iomap_regions() conversion missed BAR5 + + -- Ben Collins Thu, 17 May 2007 14:54:16 -0400 + +linux-source-2.6.22 (2.6.22-4.10) gutsy; urgency=low + + [Ben Collins] + + * Bump ABI + * build/config: Disable obsolete tsdev driver. + * build: Add tsdev to list of modules intentionally removed. + * build/headers: Include *.lds files (fixes ia64 headers). + * build/headers: Add arch/powerpc/include/asm symlink to get all headers. + * build/module-check: Fix logic for printed messages. + * build/maintainer: Use linux instead of upstream-linux for local diffs + * build/config: Enable SLUB slab allocator (vs. SLAB). + * build/config: Disable orinoco_nortel, use prefered hostap_plx + * build/config: Disable ir-usb in favor of irda-usb + * build/config: Disable sis5513(ide) in favor of pata_sis(libata) + * build/config: Disable piix(ide) in favour of pata_oldpiix, ata_piix and + pata_mpiix (libata) + * build/config: Disable zaurus driver in favour of the cdc_ether driver + * build/abi: Note a few modules intentionally removed. + * build/config: Disable mxb and dpc7146 driver in favour of hexium_orion + * build/config: Disable usbtest driver (for development only) + * build/config: Disable keyspan driver in favour of keyspan_pda + * build/abi: Add mxb and usbtest to list of removed modules. + + [Upstream Kernel Changes] + + * net: Trivial MLX4_DEBUG dependency fix. + * mlx4_core: Remove unused doorbell_lock + * [CPUFREQ] Support rev H AMD64s in powernow-k8 + * [CPUFREQ] powernow-k7: fix MHz rounding issue with perflib + * [AGPGART] Fix wrong ID in via-agp.c + * sh64: ROUND_UP macro cleanup in arch/sh64/kernel/pci_sh5.c + * spelling fixes: arch/sh64/ + * sh64: Wire up many new syscalls. + * sh64: Fixups for the irq_regs changes. 
+ * sh64: dma-mapping updates. + * sh64: ppoll/pselect6() and restartable syscalls. + * sh64: Fixup sh-sci build. + * sh64: Update cayman defconfig. + * sh64: generic quicklist support. + * sh64: Add .gitignore entry for syscalltab. + * IB/mlx4: Fix uninitialized spinlock for 32-bit archs + * IB/ipath: Shadow the gpio_mask register + * IB/ehca: Serialize hypervisor calls in ehca_register_mr() + * IB/ehca: Correctly set GRH mask bit in ehca_modify_qp() + * IB/ehca: Fix AQP0/1 QP number + * IB/ehca: Remove _irqsave, move #ifdef + * IB/ehca: Beautify sysfs attribute code and fix compiler warnings + * IB/ehca: Disable scaling code by default, bump version number + * RDMA/cma: Simplify device removal handling code + * RDMA/cma: Fix synchronization with device removal in cma_iw_handler + * RDMA/cma: Add check to validate that cm_id is bound to a device + * IB/mthca: Fix posting >255 recv WRs for Tavor + * IB/mthca: Set cleaned CQEs back to HW ownership when cleaning CQ + * IPoIB/cm: Optimize stale connection detection + * [CPUFREQ] Correct revision mask for powernow-k8 + * fix epoll single pass code and add wait-exclusive flag + * epoll locks changes and cleanups + * epoll: fix some comments + * epoll: move kfree inside ep_free + * nommu: add ioremap_page_range() + * h8300 atomic.h update + * alpha: fix hard_smp_processor_id compile error + * m68k: implement __clear_user() + * Remove cpu hotplug defines for __INIT & __INITDATA + * i386: move common parts of smp into their own file + * i386: fix voyager build + * SLUB: CONFIG_LARGE_ALLOCS must consider MAX_ORDER limit + * ll_rw_blk: fix gcc 4.2 warning on current_io_context() + * pasemi_mac: Fix register defines + * pasemi_mac: Interrupt ack fixes + * pasemi_mac: Terminate PCI ID list + * pasemi_mac: Fix local-mac-address parsing + * smc911x: fix compilation breakage + * ucc_geth: eliminate max-speed, change interface-type to + phy-connection-type + * pdc202xx_old: rewrite mode programming code (v2) + * serverworks: PIO mode setup fixes + * sis5513: PIO mode setup fixes + * alim15x3: use ide_tune_dma() + * pdc202xx_new: use ide_tune_dma() + * ide: always disable DMA before tuning it + * cs5530/sc1200: add ->udma_filter methods + * ide: use ide_tune_dma() part #2 + * cs5530/sc1200: DMA support cleanup + * cs5530/sc1200: add ->speedproc support + * sl82c105: add speedproc() method and MWDMA0/1 support + * ide: remove ide_dma_enable() + * ide: add missing validity checks for identify words 62 and 63 + * ide: remove ide_use_dma() + * sl82c105: Switch to ref counting API + * Use menuconfig objects: IDE + * x86: Fix discontigmem + non-HIGHMEM compile + * missing mm.h in fw-ohci + * missing dependencies for USB drivers in input + * missing includes in mlx4 + * em28xx and ivtv should depend on PCI + * rpadlpar breakage - fallout of struct subsystem removal + * m32r: __xchg() should be always_inline + * audit_match_signal() and friends are used only if CONFIG_AUDITSYSCALL + is set + * fix uml-x86_64 + * arm: walk_stacktrace() needs to be exported + + -- Ben Collins Tue, 15 May 2007 10:13:23 -0400 + +linux-source-2.6.22 (2.6.22-3.9) gutsy; urgency=low + + * Fixup firmware-modules -> efi-modules in exclude files. + + [Ben Collins] + + * build/config: Enable CONFIG_TIMER_STATS + * build/config: Disable CONFIG_IRQBALANCE, handled in userspace now + * build: Update modules that have been deprecated + * sparc64: Get some drivers compiling, till patches get upstream. + * powerpc: Add 64-bit cmp op for 32-bit. 
+ * build/config: Disable apm_emu, pasemi_mac and cbe_cpufreq on ppc64 + * build/d-i(cjwatson): Rename firmware-modules to efi-modules + + -- Ben Collins Fri, 11 May 2007 09:38:50 +0200 + +linux-source-2.6.22 (2.6.22-2.7) gutsy; urgency=low + + [Changes for 2.7] + + * Added some more modules going missing to ignore. + * Disable ADB_PMU_LED on powerpc64. FTBFS. + + [Ben Collins] + + * XXX: Well, xen and rt got disabled in this upload. Hopefully things will + get working again soon. + + * build: Add check for nrcpus on buildd's for CONCURRENCY_LEVEL + * build: No longer provide ndiswrapper or ivtv modules (l-u-m does). + * build/d-i: Remove firmware lists, since we no longer supply those udebs + * build: Remove more firmware stuff + * build/control: Build-dep on coreutils + * Update configuration files + * build/custom: Updated xen/rt patches and configs. + * build: Make sure to use /bin/bash for headers_install + * build: Add SHELL=/bin/bash to headers_install + * Update configuration files + * Bump ABI + * Update module lists to match module name changes and merges. + * build/rt: Trimmed down real-time patch from Alessio Igor Bogani. + * Update configuration files + * Update configuration files + * build/rt: Fix typo in diff + * Update configuration files + * build: make explicit binary-headers target + * Update configuration files + * build/control-scripts: Remove debconf from pre-rm script + * build/ia64: Compress and use vmlinuz for target install + * build/config: Diable OSS i810_audio driver (Alsa driver prefered) + * build/config: Disable OSS cs4232 driver (Alsa prefered) + * build/config: Disable OSS via82xx driver (Alsa prefered) + * build/config: Disable OSS trident driver (Alsa prefered) + * build/config: Disable OSS Sound Blaster driver (Alsa prefered) + * build/config: Disable IDE generic, ata_generic prefered + * build/config: Disable siimage, pata_sil680 prefered + * build/module-check: More robust module checking + * build: Call module-check with perl, not $SHELL + * Update configuration files + * build: Fixup calling conventions of module-check + * build: Add modules.ignore from 1.3 revision + * build/config: Disable obsolete MOXA_SMARTIO in favor of new driver. + * build/config: Disable orinoco_cs in favor of hostap_cs + * build/config: Disable orinoco_pci in favor of hostap_pci + * build/config: Disable orinoco_{plx,tmd} in favor of hostap_plx + * build/config: Disable sk98lin in favor of skge + * build: Add more modules intentionally removed since 1.3 + + -- Ben Collins Fri, 27 Apr 2007 09:04:29 -0400 + +linux-source-2.6.22 (2.6.22-1.3) gutsy; urgency=low + + [Ben Collins] + + * build: Only use bzip2 for linux-image, and pre-depend on proper dpkg + + [2.6.22-1.2] + + [Ben Collins] + + * build: Add build-arch target. FTBFS + + [2.6.22-1.1] + + [Ben Collins] + + * debian: New build system, from scratch + * debian: Rename place holder so debian/stamps/ sticks around + * debian: Create stamp-flavours at start of build (for build scripts) + * debian/abi: Add revision 0.0 bootstrap module list. + * debian: Fix backwards logic in module/abi checkers. + * debian: Add arch= to vars.* files + * Update configuration files + * build: Added control scripts for images + * build/config: Disable CONFIG_PARAVIRT for now + * build/config: Enable CONFIG_FB_VESA + * build: Take CONCURRENCY_LEVEL from env if it exists. 
+ * build: Do not print SHAs by default for changelog + * build/config(i386): Disable NO_HZ on all but generic + * build: Implement udeb rules + * build/d-i: Remove speakup-modules udeb + * build/udebs: Fix a couple trivial errors in the build. + * build/config: Disable CONFIG_FB_IMSTT on powerpc64-smp (no NVRAM) + * build/config: Disable some modules for ppc64 that don't use DMA API + * build/config: Yet another module to disable on ppc64 + * build/tests: New test infrastructure + * build: Special kernel build infrastructure + * build: Fix typo from last commit + * build/custom: Move custom files for each flavour into subdir. + * build/config: Disable some drivers on sparc that don't support DMA API + * build/sparc: Add new compress_file config, and use it for sparc + * build: Fix typo in compress_file commit. + * build/schedcfs: Update to v6 of the patch. + * build: Fix control file generation for custom images + * build: Correct message in link-headers + * build: 2.6.21 is released, force our SUBLEVEL to .22 + * build/vars: kvm API is at 4, provide that. + * build/custom: Allow custom builds to override things like build_image + * build/custom: Fix type causing custom rules not to be included. + * build/custom: Hello Xen 3.0.5 + * build/custom: Remove sched-cfs. Superseded in use by rt. + * build/custom: Add 2.6.21-rt1 patch for -rt custom flavour + * build/link-headers: Make sure to copy new files for custom + + -- Ben Collins Fri, 27 Apr 2007 08:29:00 -0400 --- linux-rt-2.6.29.5.orig/debian/control.stub.in +++ linux-rt-2.6.29.5/debian/control.stub.in @@ -0,0 +1,18 @@ +Source: linux-rt +Section: devel +Priority: optional +Maintainer: Alessio Igor Bogani +Standards-Version: 3.6.1 +Build-Depends: debhelper (>= 3), module-init-tools, kernel-wedge (>= 2.24ubuntu1), makedumpfile [!armel], quilt +Build-Depends-Indep: xmlto, docbook-utils, gs, transfig, bzip2, sharutils + +Package: linux-rt-headers-PKGVER-ABINUM +Architecture: all +Section: devel +Priority: optional +Depends: coreutils | fileutils (>= 4.0) +Provides: linux-rt-headers, linux-rt-headers-2.6 +Description: Header files related to Linux kernel version PKGVER + This package provides kernel header files for version PKGVER, for sites + that want the latest kernel headers. Please read + /usr/share/doc/linux-headers-PKGVER-ABINUM/debian.README.gz for details --- linux-rt-2.6.29.5.orig/debian/changelog +++ linux-rt-2.6.29.5/debian/changelog @@ -0,0 +1,429 @@ +linux-rt (2.6.29.5-1.2) karmic; urgency=low + + * Temporarily hard-code the .5 point release for the kernel, since without + this, needed files cannot be copied into the correct places, causing an + FTBFs. + + -- Luke Yelavich Sun, 28 Jun 2009 11:18:17 +1000 + +linux-rt (2.6.29.5-1.1) karmic; urgency=low + + * New 2.6.29 kernel upstream release + * New realtime patch upstream release + * Bump ABI + * Use main ubuntu kernel configs as a base for the rt configs. + + -- Luke Yelavich Sat, 27 Jun 2009 21:25:03 +1000 + +linux-rt (2.6.29-1.1) karmic; urgency=low + + * New upstream release + + -- Luke Yelavich Sun, 14 Jun 2009 15:07:25 +1000 + +linux-rt (2.6.28-3.12) jaunty; urgency=low + + * Build depend on linux-source-2.6.28 >= 2.6.28-11.42. 
+ + Rebase on jaunty mainline 2.6.28-11.41: + + [ Tim Gardner ] + + * Enabled LPIA CONFIG_PACKET=y + - LP: #362071 + + [ Upstream Kernel Changes ] + + * ext4: fix bb_prealloc_list corruption due to wrong group locking + - LP: #348836 + + -- Luke Yelavich Fri, 17 Apr 2009 15:06:04 +1000 + +linux-rt (2.6.28-3.11) jaunty; urgency=low + + Rebase on jaunty mainline 2.6.28-11.41: + + [ Amit Kucheria ] + + * ixp4xx: Enabled TCP SYN_COOKIES + - LP: #346378 + + [ Brad Figg ] + + * Change LPIA configuration to compile with CONFIG_NETFILTER_XT_MATCH_RECENT + - LP: #355291 + + [ Kay Sievers ] + + * SAUCE: driver core: allow non-root users to listen to uevents + - LP: #357124 + + [ Manoj Iyer ] + + * SAUCE: Added quirk to recognize GE0301 3G modem as an interface. + - LP: #348861 + + [ Tim Gardner ] + + * Revert "SAUCE: [i915] allocate MCHBAR space & enable if necessary" + Appears to cause hard locks in some cases. + - LP: #349314 + + [ Trond Myklebust ] + + * SAUCE: NFS: Fix the notifications when renaming onto an existing file + - LP: #224642 + + [ Upstream Kernel Changes ] + + * USB: option: add QUANTA HSDPA Data Card device ids + - LP: #353321 + * hwmon: (abituguru3) Match partial DMI board name strings + - LP: #298798 + * zd1211rw: adding Sitecom WL-603 (0df6:0036) to the USB id list + - LP: #339631 + * USB: unusual dev for Option N.V. ZeroCD modems + - LP: #348861 + + -- Luke Yelavich Thu, 09 Apr 2009 02:34:08 +1000 + +linux-rt (2.6.28-3.10) jaunty; urgency=low + + * Disable DEVKMEM for all archs on Jaunty + + Rebase on jaunty mainline 2.6.28-11.40: + + [ Amit Kucheria ] + + * Disable DEVKMEM for all archs on Jaunty + - LP: #354221 + + [ Andy Whitcroft ] + + * SAUCE: md: wait for possible pending deletes after stopping an array + - LP: #334994 + + [ Brad Figg ] + + * ARM: Setting the bootloader for imx51 flavour. + - LP: #348382 + * ARM: Add bootloader package Recomendation to iop32x and ixp4xx flavours + - LP: #348382 + + [ Tim Gardner ] + + * SAUCE: [i915] allocate MCHBAR space & enable if necessary + - LP: #349314 + + [ Upstream Kernel Changes ] + + * hpilo: open/close fix + - LP: #353496 + + [ Alan Tull ] + + * SAUCE: mx51: fix to1.1 in mxc_iomux_set_input + - LP: #348333 + + [ Andy Whitcroft ] + + * SAUCE: acer: rfkill disable quirk for ACER Aspire One + - LP: #319825 + + [ Brad Figg ] + + * ARM: Increase CONFIG_BLK_DEV_RAM_SIZE for imx51 flavour. 
+ - LP: #349842 + * ARM: Enable rtl8187 for imx51 + - LP: #349526 + * ARM: Unset CONFIG_USB_STORAGE_DEBUG for imx51 + - LP: #348504 + + [ Bryan Wu ] + + * build CRAMFS into kernel to support mounting CRAMFS initrd on iop32x + machine + - LP: #349104 + + [ Michael Casadevall ] + + * [lpia] Change ATA, SCSI, SD, ext2-4 modules into compiled-in components + - LP: #347458 + + [ Rob Herring ] + + * SAUCE: imx51: fec: fix cache operations for receive + - LP: #348333 + + [ Sam Yang ] + + * SAUCE: Revert ENGR00103870 FEC reopening causes network wdog timeout + - LP: #348333 + * SAUCE: imx51: fec cache flush functions are incorrect + - LP: #348333 + + [ Upstream Kernel Changes ] + + * Bluetooth: Add fine grained mem_flags usage to btusb driver + - LP: #268502 + * Bluetooth: Handle bulk URBs in btusb driver from notify callback + - LP: #268502 + * Bluetooth: Submit bulk URBs along with interrupt URBs + - LP: #268502 + + -- Luke Yelavich Tue, 07 Apr 2009 09:23:10 +1000 + +linux-rt (2.6.28-3.9) jaunty; urgency=low + + * link-headers: only link directories which do not already exist + + Rebase on jaunty mainline 2.6.28-11.38: + + [ Brad Figg ] + + * When AppArmor is configured, securityfs must be as well. + - LP: #344370 + * ARM: Enable AA with SECURITYFS for imx51 + - LP: #344370 + + [ Bryan Wu ] + + * Add 3 missing files to prerm remove file list + - LP: #345623 + + [ Daniel T Chen ] + + * SAUCE: (drop after 2.6.28) Don't trust hw-ptr blindly + - LP: #330814 + * SAUCE: (drop after 2.6.28) Apply further pcm_lib updates for hw_ptr + - LP: #330814 + + [ Ike Panhc ] + + * Copy header files for various kernel media driver + - LP: #322732 + + [ Tim Gardner ] + + * Revert "Fix the VFP handling on the Feroceon CPU" + Only applied to mv78xx0 ARM flavour. + * Enabled drivers/staging/at76_usb + - LP: #152626 + + [ ubuntu@tjworld.net ] + + * SAUCE: ipw2200: Enable LED by default + - LP: #21367 + * SAUCE: wistron_btns: support Prestigio Wifi RF kill button over suspend + - LP: #346586 + + [ Upstream Kernel Changes ] + + * Build fix for __early_pfn_to_nid() undefined link error + * Fix misreporting of #cores as #hyperthreads for Q9550 + * eventfd: remove fput() call from possible IRQ context + * S390: __div64_31 broken for CONFIG_MARCH_G5 + * ALSA: Fix vunmap and free order in snd_free_sgbuf_pages() + * ALSA: mixart, fix lock imbalance + * ALSA: pcm_oss, fix locking typo + * ALSA: hda - Fix DMA mask for ATI controllers + * ALSA: hda - Workaround for buggy DMA position on ATI controllers + * ALSA: opl3sa2 - Fix NULL dereference when suspending snd_opl3sa2 + * nfsd: nfsd should drop CAP_MKNOD for non-root + * NFSD: provide encode routine for OP_OPENATTR + * dm ioctl: validate name length when renaming + * dm io: respect BIO_MAX_PAGES limit + * dm crypt: fix kcryptd_async_done parameter + * dm crypt: wait for endio to complete before destruction + * ata_piix: add workaround for Samsung DB-P70 + * V4L/DVB (10218): cx23885: Fix Oops for mixed install of analog and + digital only cards + * thinkpad-acpi: fix module autoloading for older models + * Add '-fwrapv' to gcc CFLAGS + * Move cc-option to below arch-specific setup + * USB: storage: Unusual USB device Prolific 2507 variation added + * USB: Add Vendor/Product ID for new CDMA U727 to option driver + * USB: option.c: add ZTE 622 modem device + * USB: Add device id for Option GTM380 to option driver + * USB: Option: let cdc-acm handle Sony Ericsson F3507g / Dell 5530 + * USB: Updated unusual-devs entry for USB mass storage on Nokia 6233 + * USB: unusual_devs: Add 
support for GI 0431 SD-Card interface + * USB: serial: add FTDI USB/Serial converter devices + * USB: serial: ftdi: enable UART detection on gnICE JTAG adaptors + blacklist interface0 + * USB: serial: new cp2101 device id + * USB: usbtmc: fix stupid bug in open() + * USB: usbtmc: add protocol 1 support + * USB: usbfs: keep async URBs until the device file is closed + * USB: EHCI: expedite unlinks when the root hub is suspended + * USB: EHCI: Fix isochronous URB leak + * powerpc: Remove extra semicolon in fsl_soc.c + * menu: fix embedded menu snafu + * Linux 2.6.28.9 + * Add '-fwrapv' to gcc CFLAGS + - LP: #348015 + * Move cc-option to below arch-specific setup + - LP: #348015 + * Revert Staging: at76_usb: update drivers/staging/at76_usb w/ mac80211 + port + - LP: #152626 + * Staging: at76_usb: fix bugs introduced by "Staging: at76_usb: cleanup + dma on stack issues" + - LP: #152626 + * Staging: at76_usb: Add support for OQO Model 01+ + - LP: #152626 + + [ Alex Deucher ] + + * SAUCE: radeon: add some new pci ids + - LP: #334101 + + [ Amit Kucheria ] + + * Updating configs - rip out orion5x and mv78xx0 flavours + + [ Andy Whitcroft ] + + * SAUCE: tone down the synaptics warning to avoid triggering kerneloops + - LP: #330606 + + [ Upstream Kernel Changes ] + + * ext4: fix header check in ext4_ext_search_right() for deep extent + trees. + - LP: #346194 + * eCryptfs: NULL crypt_stat dereference during lookup + - LP: #345766 + * eCryptfs: Allocate a variable number of pages for file headers + (CVE-2009-0787) + - LP: #345544 + + -- Luke Yelavich Tue, 24 Mar 2009 10:40:43 +1100 + +linux-rt (2.6.28-3.8) jaunty; urgency=low + + * Rebase on jaunty mainline 2.6.28-11.36. + * Adjust CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR as per jaunty mainline. + * Adjust CONFIG_UEVENT_HELPER_PATH as per jaunty mainline. + * Adjust dependency on linux-source-2.6.28. + + -- Luke Yelavich Sun, 22 Mar 2009 10:18:54 +1100 + +linux-rt (2.6.28-3.7) jaunty; urgency=low + + * Rebase on jaunty mainline 2.6.28-11.34. + * Update config files as per jaunty mainline. + * Add hard dependency on linux-source-2.6.28 = 2.6.28-11.34 + + -- Luke Yelavich Wed, 18 Mar 2009 17:56:45 +1100 + +linux-rt (2.6.28-2.6) jaunty; urgency=low + + * Rebase on top of Jaunty 2.6.28-9.29. + * Add code to copy modules.order into the kernel packages. + * Add fuse to modules.ignore since fuse is built into the kernel. + + -- Luke Yelavich Tue, 10 Mar 2009 09:45:01 +1100 + +linux-rt (2.6.28-1.5) jaunty; urgency=low + + [ Alessio Igor Bogani ] + * Resync with ubuntu jaunty kernel v2.6.28-8.26 + + [ Luke Yelavich ] + * re-enable ABI checking + * debian/scripts/misc/getabis: tweak for fetching the rt abi files + * build our own arch independant headers package as well + * rebase against jaunty 2.6.28-8.27 + * update config files. + + -- Luke Yelavich Fri, 06 Mar 2009 07:58:51 +1100 + +linux-rt (2.6.28-1.4) jaunty; urgency=low + + * Resync with ubuntu jaunty kernel v2.6.28-8.24 + * Merge all patches into only one + * Update config files + + -- Alessio Igor Bogani Fri, 20 Feb 2009 19:54:53 +0100 + +linux-rt (2.6.28-1.3) jaunty; urgency=low + + * Update config files, and remove unneeded armel files/references. 
+ + -- Luke Yelavich Sat, 14 Feb 2009 22:18:58 +1100 + +linux-rt (2.6.28-1.2) jaunty; urgency=low + + * Resync with the ubuntu jaunty kernel, v2.6.28-8.21, and use the work done + so far by Alessio Igor Bogani for the realtime patch + * Partially remove udeb generation code, as the udebs are not ever used + for installation + * Add a hack to ensure that the linux-headers package depends on jaunty's + main linux headers package + + -- Luke Yelavich Sat, 14 Feb 2009 16:24:24 +1100 + +linux-rt (2.6.28-1.1) jaunty; urgency=low + + * Initial version based on linux-source-2.6.28 2.6.28-6.16 + * CONFIG_AUFS and CONFIG_CGROUP_MEM_RES_CTLR are still disabled + + -- Alessio Igor Bogani Thu, 29 Jan 2009 08:46:29 +0100 + +linux-rt (2.6.27-3.8) intrepid; urgency=low + + * Blacklisted XFS to prevent an ABI bump + - This is based off git commit 21c69ea60244403e503f148cd29d89df85eb0908 + from ubuntu-intrepid.git (LP: #289683) + + -- Michael Casadevall Sun, 26 Oct 2008 20:46:33 -0400 + +linux-rt (2.6.27-3.7) intrepid; urgency=low + + * Synced against linux-source-2.6.27 2.6.27-7.12 + * Added ext4-nrpages.patch and jbd2-nrpages.patch + * Synced configuration + + -- Alessio Igor Bogani Sat, 18 Oct 2008 18:08:55 +0200 + +linux-rt (2.6.27-3.6) intrepid; urgency=low + + * Synced against linux-source-2.6.27 2.6.27-7.11 + + -- Alessio Igor Bogani Wed, 15 Oct 2008 15:36:52 +0200 + +linux-rt (2.6.27-3.5) intrepid; urgency=low + + * Synced against linux-source-2.6.27 2.6.27-7.10 + * Bump ABI to -3 + + -- Alessio Igor Bogani Mon, 13 Oct 2008 15:46:02 +0200 + +linux-rt (2.6.27-2.4) intrepid; urgency=low + + * Mistake in packaging + + -- Alessio Igor Bogani Fri, 10 Oct 2008 14:07:41 +0200 + +linux-rt (2.6.27-2.3) intrepid; urgency=low + + * Fix a FTBS + + -- Alessio Igor Bogani Fri, 10 Oct 2008 11:46:25 +0200 + +linux-rt (2.6.27-2.2) intrepid; urgency=low + + * Synced against linux-source-2.6.27 2.6.27-6.9 + * Disable CONFIG_AUFS, CONFIG_CGROUP_MEM_RES_CTLR, CONFIG_FTRACE, + CONFIG_LATENCYTOP, CONFIG_CONTEXT_SWITCH_TRACER + * Set timer frequency to 1000 HZ + * Bump ABI to -2 + + -- Alessio Igor Bogani Thu, 09 Oct 2008 15:27:27 +0200 + +linux-rt (2.6.27-1.1) intrepid; urgency=low + + * Initial version based on linux-source-2.6.27 2.6.27-4.6 (LP: #281276) + + -- Alessio Igor Bogani Fri, 03 Oct 2008 09:15:26 +0200 --- linux-rt-2.6.29.5.orig/debian/changelog.jaunty +++ linux-rt-2.6.29.5/debian/changelog.jaunty @@ -0,0 +1,4737 @@ +linux (2.6.28-11.42) jaunty; urgency=low + + [ Tim Gardner ] + + * Enabled LPIA CONFIG_PACKET=y + - LP: #362071 + + [ Upstream Kernel Changes ] + + * ext4: fix bb_prealloc_list corruption due to wrong group locking + - LP: #348836 + + -- Stefan Bader Thu, 16 Apr 2009 08:10:55 +0200 + +linux (2.6.28-11.41) jaunty; urgency=low + + [ Amit Kucheria ] + + * ixp4xx: Enabled TCP SYN_COOKIES + - LP: #346378 + + [ Brad Figg ] + + * Change LPIA configuration to compile with CONFIG_NETFILTER_XT_MATCH_RECENT + - LP: #355291 + + [ Kay Sievers ] + + * SAUCE: driver core: allow non-root users to listen to uevents + - LP: #357124 + + [ Manoj Iyer ] + + * SAUCE: Added quirk to recognize GE0301 3G modem as an interface. + - LP: #348861 + + [ Tim Gardner ] + + * Revert "SAUCE: [i915] allocate MCHBAR space & enable if necessary" + Appears to cause hard locks in some cases. 
+ - LP: #349314 + + [ Trond Myklebust ] + + * SAUCE: NFS: Fix the notifications when renaming onto an existing file + - LP: #224642 + + [ Upstream Kernel Changes ] + + * USB: option: add QUANTA HSDPA Data Card device ids + - LP: #353321 + * hwmon: (abituguru3) Match partial DMI board name strings + - LP: #298798 + * zd1211rw: adding Sitecom WL-603 (0df6:0036) to the USB id list + - LP: #339631 + * USB: unusual dev for Option N.V. ZeroCD modems + - LP: #348861 + + -- Tim Gardner Sat, 04 Apr 2009 08:42:14 -0600 + +linux (2.6.28-11.40) jaunty; urgency=low + + [ Amit Kucheria ] + + * Disable DEVKMEM for all archs on Jaunty + - LP: #354221 + + [ Andy Whitcroft ] + + * SAUCE: md: wait for possible pending deletes after stopping an array + - LP: #334994 + + [ Brad Figg ] + + * ARM: Setting the bootloader for imx51 flavour. + - LP: #348382 + * ARM: Add bootloader package Recomendation to iop32x and ixp4xx flavours + - LP: #348382 + + [ Tim Gardner ] + + * SAUCE: [i915] allocate MCHBAR space & enable if necessary + - LP: #349314 + + [ Upstream Kernel Changes ] + + * hpilo: open/close fix + - LP: #353496 + + -- Amit Kucheria Thu, 02 Apr 2009 11:26:22 -0400 + +linux (2.6.28-11.39) jaunty; urgency=low + + [ Alan Tull ] + + * SAUCE: mx51: fix to1.1 in mxc_iomux_set_input + - LP: #348333 + + [ Andy Whitcroft ] + + * SAUCE: acer: rfkill disable quirk for ACER Aspire One + - LP: #319825 + + [ Brad Figg ] + + * ARM: Increase CONFIG_BLK_DEV_RAM_SIZE for imx51 flavour. + - LP: #349842 + * ARM: Enable rtl8187 for imx51 + - LP: #349526 + * ARM: Unset CONFIG_USB_STORAGE_DEBUG for imx51 + - LP: #348504 + + [ Bryan Wu ] + + * build CRAMFS into kernel to support mounting CRAMFS initrd on iop32x + machine + - LP: #349104 + + [ Michael Casadevall ] + + * [lpia] Change ATA, SCSI, SD, ext2-4 modules into compiled-in components + - LP: #347458 + + [ Rob Herring ] + + * SAUCE: imx51: fec: fix cache operations for receive + - LP: #348333 + + [ Sam Yang ] + + * SAUCE: Revert ENGR00103870 FEC reopening causes network wdog timeout + - LP: #348333 + * SAUCE: imx51: fec cache flush functions are incorrect + - LP: #348333 + + [ Upstream Kernel Changes ] + + * Bluetooth: Add fine grained mem_flags usage to btusb driver + - LP: #268502 + * Bluetooth: Handle bulk URBs in btusb driver from notify callback + - LP: #268502 + * Bluetooth: Submit bulk URBs along with interrupt URBs + - LP: #268502 + + -- Tim Gardner Wed, 01 Apr 2009 17:37:32 -0600 + +linux (2.6.28-11.38) jaunty; urgency=low + + [ Brad Figg ] + + * When AppArmor is configured, securityfs must be as well. + - LP: #344370 + * ARM: Enable AA with SECURITYFS for imx51 + - LP: #344370 + + [ Bryan Wu ] + + * Add 3 missing files to prerm remove file list + - LP: #345623 + + [ Daniel T Chen ] + + * SAUCE: (drop after 2.6.28) Don't trust hw-ptr blindly + - LP: #330814 + * SAUCE: (drop after 2.6.28) Apply further pcm_lib updates for hw_ptr + - LP: #330814 + + [ Ike Panhc ] + + * Copy header files for various kernel media driver + - LP: #322732 + + [ Tim Gardner ] + + * Revert "Fix the VFP handling on the Feroceon CPU" + Only applied to mv78xx0 ARM flavour. 
+ * Enabled drivers/staging/at76_usb + - LP: #152626 + + [ ubuntu@tjworld.net ] + + * SAUCE: ipw2200: Enable LED by default + - LP: #21367 + * SAUCE: wistron_btns: support Prestigio Wifi RF kill button over suspend + - LP: #346586 + + [ Upstream Kernel Changes ] + + * Build fix for __early_pfn_to_nid() undefined link error + * Fix misreporting of #cores as #hyperthreads for Q9550 + * eventfd: remove fput() call from possible IRQ context + * S390: __div64_31 broken for CONFIG_MARCH_G5 + * ALSA: Fix vunmap and free order in snd_free_sgbuf_pages() + * ALSA: mixart, fix lock imbalance + * ALSA: pcm_oss, fix locking typo + * ALSA: hda - Fix DMA mask for ATI controllers + * ALSA: hda - Workaround for buggy DMA position on ATI controllers + * ALSA: opl3sa2 - Fix NULL dereference when suspending snd_opl3sa2 + * nfsd: nfsd should drop CAP_MKNOD for non-root + * NFSD: provide encode routine for OP_OPENATTR + * dm ioctl: validate name length when renaming + * dm io: respect BIO_MAX_PAGES limit + * dm crypt: fix kcryptd_async_done parameter + * dm crypt: wait for endio to complete before destruction + * ata_piix: add workaround for Samsung DB-P70 + * V4L/DVB (10218): cx23885: Fix Oops for mixed install of analog and + digital only cards + * thinkpad-acpi: fix module autoloading for older models + * Add '-fwrapv' to gcc CFLAGS + * Move cc-option to below arch-specific setup + * USB: storage: Unusual USB device Prolific 2507 variation added + * USB: Add Vendor/Product ID for new CDMA U727 to option driver + * USB: option.c: add ZTE 622 modem device + * USB: Add device id for Option GTM380 to option driver + * USB: Option: let cdc-acm handle Sony Ericsson F3507g / Dell 5530 + * USB: Updated unusual-devs entry for USB mass storage on Nokia 6233 + * USB: unusual_devs: Add support for GI 0431 SD-Card interface + * USB: serial: add FTDI USB/Serial converter devices + * USB: serial: ftdi: enable UART detection on gnICE JTAG adaptors + blacklist interface0 + * USB: serial: new cp2101 device id + * USB: usbtmc: fix stupid bug in open() + * USB: usbtmc: add protocol 1 support + * USB: usbfs: keep async URBs until the device file is closed + * USB: EHCI: expedite unlinks when the root hub is suspended + * USB: EHCI: Fix isochronous URB leak + * powerpc: Remove extra semicolon in fsl_soc.c + * menu: fix embedded menu snafu + * Linux 2.6.28.9 + * Add '-fwrapv' to gcc CFLAGS + - LP: #348015 + * Move cc-option to below arch-specific setup + - LP: #348015 + * Revert Staging: at76_usb: update drivers/staging/at76_usb w/ mac80211 + port + - LP: #152626 + * Staging: at76_usb: fix bugs introduced by "Staging: at76_usb: cleanup + dma on stack issues" + - LP: #152626 + * Staging: at76_usb: Add support for OQO Model 01+ + - LP: #152626 + + -- Tim Gardner Mon, 23 Mar 2009 19:20:08 -0600 + +linux (2.6.28-11.37) jaunty; urgency=low + + [ Alex Deucher ] + + * SAUCE: radeon: add some new pci ids + - LP: #334101 + + [ Amit Kucheria ] + + * Updating configs - rip out orion5x and mv78xx0 flavours + + [ Andy Whitcroft ] + + * SAUCE: tone down the synaptics warning to avoid triggering kerneloops + - LP: #330606 + + [ Upstream Kernel Changes ] + + * ext4: fix header check in ext4_ext_search_right() for deep extent + trees. 
+ - LP: #346194 + * eCryptfs: NULL crypt_stat dereference during lookup + - LP: #345766 + * eCryptfs: Allocate a variable number of pages for file headers + (CVE-2009-0787) + - LP: #345544 + + -- Tim Gardner Mon, 23 Mar 2009 09:24:32 -0600 + +linux (2.6.28-11.36) jaunty; urgency=low + + [ Amit Kucheria ] + + * Updating imx51 configs one more time + * Disable CONFIG_UEVENT_HELPER_PATH + + [ Anton Veretenenko ] + + * SAUCE: sony-laptop: add support for Sony Vaio FW series function/media + keys + - LP: #307592 + + [ Brad Figg ] + + * Have AUFS use current VFS APIs so it can build with or without + AppArmor. + + [ Bryan Wu ] + + * Build-in "Ram block device support" to boot up with initramfs + - LP: #329098 + * Remove brd module from iop32x modules list + - LP: #329098 + * Increase the CONFIG_BLK_DEV_RAM_SIZE to 8192 on i.MX51 + + [ Ike Panhc ] + + * SAUCE: Fixing symbol name in HECI module + - LP: #336549 + + [ Manoj Iyer ] + + * SAUCE: Added quirk for Ralink rt2870 802.11n USB driver + - LP: #326621 + + [ Upstream Kernel Changes ] + + * udf:SAUCE (drop after 2.6.30): Fix oops when invalid character in + filename occurs + - LP: #321606 + + -- Stefan Bader Fri, 20 Mar 2009 16:52:08 +0100 + +linux (2.6.28-11.35) jaunty; urgency=low + + [ Amit Kucheria ] + + * Updating imx51 configs + + [ Andy Whitcroft ] + + * SAUCE: hotkey quirks for various Zeptro Znote and Fujitsu Amilo laptops + - LP: #330259 + + [ Tim Gardner ] + + * Revert "SAUCE: (drop after 2.6.28) eCryptfs: Don't encrypt file key + with filename key". Use upstream commit. + * CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR set to upstream defaults. + 64K for x86'en, 32K for ARM + - LP: #344955 + + [ Upstream Kernel Changes ] + + * eCryptfs: don't encrypt file key with filename key + * libata: set NODEV_HINT for 0x7f status + - LP: #293218 + * USB: cdc-acm: Add another conexant modem to the quirks + - LP: #323829 + * Input: elantech - touchpad driver miss-recognising logitech mice + - LP: #318722 + + -- Tim Gardner Wed, 18 Mar 2009 08:52:46 -0600 + +linux (2.6.28-11.34) jaunty; urgency=low + + [ Alex Deucher ] + + * SAUCE: (drop after 2.6.28) radeon: add support for RS600, R6xx, and + R7xx GPUs + - LP: #334101 + + [ Aristeu Sergio Rozanski Filho ] + + * SAUCE: (drop after 2.6.28) ALSA: hda: add quirk for Lenovo X200 laptop + dock + + [ Shane Huang ] + + * SAUCE: (drop after 2.6.28) i2c-piix4: Add support to SB800 SMBus + changes + - LP: #338108 + + [ Upstream Kernel Changes ] + + * net: amend the fix for SO_BSDCOMPAT gsopt infoleak + * net: Kill skb_truesize_check(), it only catches false-positives. + * sparc64: Fix crashes in jbusmc_print_dimm() + * sparc64: Fix DAX handling via userspace access from kernel. + * vfs: separate FMODE_PREAD/FMODE_PWRITE into separate flags + * seq_file: properly cope with pread + * vt: Declare PIO_CMAP/GIO_CMAP as compatbile ioctls. 
+ * timerfd: add flags check + * aoe: ignore vendor extension AoE responses + * mm: clean up for early_pfn_to_nid() + * mm: fix memmap init for handling memory hole + * Fix oops in cifs_strfromUCS_le mounting to servers which do not specify + their OS + * mm: fix lazy vmap purging (use-after-free error) + * mm: vmap fix overflow + * PCI quirk: enable MSI on 8132 + * SCSI: hptiop: Add new PCI device ID + * JFFS2: fix mount crash caused by removed nodes + * SCSI: sd: revive sd_index_lock + * USB: usb_get_string should check the descriptor type + * USB: usb-storage: add IGNORE_RESIDUE flag for Genesys Logic adapters + * USB: cdc-acm: add usb id for motomagx phones + * rtl8187: New USB ID's for RTL8187L + * WATCHDOG: ks8695_wdt.c: 'CLOCK_TICK_RATE' undeclared + * WATCHDOG: rc32434_wdt: fix watchdog driver + * WATCHDOG: rc32434_wdt: fix sections + * RDMA/nes: Don't allow userspace QPs to use STag zero + * USB: option: add BenQ 3g modem information + * md: avoid races when stopping resync. + * md/raid10: Don't call bitmap_cond_end_sync when we are doing recovery. + * md/raid10: Don't skip more than 1 bitmap-chunk at a time during + recovery. + * sound: virtuoso: revert "do not overwrite EEPROM on Xonar D2/D2X" + * ALSA: usb-audio - Fix non-continuous rate detection + * ALSA: usb-audio - Workaround for misdetected sample rate with CM6207 + * sound: usb-audio: fix uninitialized variable with M-Audio MIDI + interfaces + * ALSA: fix excessive background noise introduced by OSS emulation rate + shrink + * ALSA: hda - Fix digital mic on dell-m4-1 and dell-m4-3 + * ALSA: aw2: do not grab every saa7146 based device + * acer-wmi: fix regression in backlight detection + * vmalloc: call flush_cache_vunmap() from unmap_kernel_range() + * Fix fixpoint divide exception in acct_update_integrals + * 8250: fix boot hang with serial console when using with Serial Over Lan + port + * x86, vmi: TSC going backwards check in vmi clocksource + * HID: fix bus endianity in file2alias + * inotify: fix GFP_KERNEL related deadlock + * sdhci: fix led naming + * x86: oprofile: don't set counter width from cpuid on Core2 + * intel-agp: fix a panic with 1M of shared memory, no GTT entries + * mtd_dataflash: fix probing of AT45DB321C chips. + * proc: fix kflags to uflags copying in /proc/kpageflags + * fs: new inode i_state corruption fix + * PCIe: portdrv: call pci_disable_device during remove + * PCI: Enable PCIe AER only after checking firmware support + * jsm: additional device support + * libata: Don't trust current capacity values in identify words 57-58 + * mmc: fix data timeout for SEND_EXT_CSD + * s3cmci: Fix hangup in do_pio_write() + * mmc: s3cmci: fix s3c2410_dma_config() arguments. 
+ * MMC: fix bug - SDHC card capacity not correct + * mmc_test: fix basic read test + * x86: tone down mtrr_trim_uncached_memory() warning + * selinux: Fix a panic in selinux_netlbl_inode_permission() + * selinux: Fix the NetLabel glue code for setsockopt() + * hpilo: new pci device + * x86-64: seccomp: fix 32/64 syscall hole + * x86-64: syscall-audit: fix 32/64 syscall hole + * xen: disable interrupts early, as start_kernel expects + * xen/blkfront: use blk_rq_map_sg to generate ring entries + * asix: new device ids + * cdc_ether: add usb id for Ericsson F3507g + * zaurus: add usb id for motomagx phones + * fore200: fix oops on failed firmware load + * PCI: Add PCI quirk to disable L0s ASPM state for 82575 and 82598 + * copy_process: fix CLONE_PARENT && parent_exec_id interaction + * proc: fix PG_locked reporting in /proc/kpageflags + * powerpc: Fix load/store float double alignment handler + * sdhci: Add quirk for controllers with no end-of-busy IRQ + * sdhci: Add NO_BUSY_IRQ quirk for Marvell CAFE host chip + * pipe_rdwr_fasync: fix the error handling to prevent the leak/crash + * DVB: s5h1409: Perform s5h1409 soft reset after tuning + * V4L: tda8290: fix TDA8290 + TDA18271 initialization + * V4L: ivtv: fix decoder crash regression + * x86/paravirt: make arch_flush_lazy_mmu/cpu disable preemption + * x86, hpet: fix for LS21 + HPET = boot hang + * x86: math_emu info cleanup + * x86: fix math_emu register frame access + * ide-iops: fix odd-length ATAPI PIO transfers + * HID: move tmff and zpff devices from ignore_list to blacklist + * ARM: Add i2c_board_info for RiscPC PCF8583 + * i2c: Timeouts reach -1 + * i2c: Fix misplaced parentheses + * ACPI: fix broken usage of name.ascii + * ACPI: fix broken usage of acpi_ut_get_node_name() + * crypto: api - Fix algorithm test race that broke aead initialisation + * hwmon: (f71882fg) Hide misleading error message + * MIPS: compat: Implement is_compat_task. + * hwmon: (it87) Properly decode -128 degrees C temperature + * Linux 2.6.28.8 + + -- Tim Gardner Tue, 17 Mar 2009 07:07:33 -0600 + +linux (2.6.28-10.33) jaunty; urgency=low + + [ Scott James Remnant ] + + * SAUCE: nbd: Change default partitions per device to 15 + - LP: #342563 + + [ Tejun Heo ] + + * SAUCE: libata: make sure port is thawed when skipping resets + - LP: #269652 + + [ Tim Gardner ] + + * Revert "SAUCE: Auto-load esp module when device opened." + This driver performs unsafe ISA probes (according to Alan Cox). + * Enable CONFIG_USB_GADGET_DUMMY_HCD + This facilitates gadget slave endpoints in virtual environments. 
+ * Build ehci, uhci, and ohci into the i386/amd64 kernels + - LP: #296710 + + [ Upstream Kernel Changes ] + + * Add "thumbee" to the hwcap_str array + - LP: #343602 + * Add HWCAP_NEON to the ARM hwcap.h file + - LP: #343602 + * x86: mtrr: don't modify RdDram/WrDram bits of fixed MTRRs + - LP: #292619 + + -- Tim Gardner Mon, 16 Mar 2009 08:19:53 -0600 + +linux (2.6.28-10.32) jaunty; urgency=low + + [ Amit Kucheria ] + + * Delete prepare-ppa-source script + + [ Andy Isaacson ] + + * SAUCE: FSAM7400: select CHECK_SIGNATURE + * SAUCE: LIRC_PVR150: depends on VIDEO_IVTV + - LP: #341477 + + [ Ayaz Abdulla ] + + * SAUCE: forcedeth: msi interrupt fix + - LP: #288281 + + [ Brad Figg ] + + * Updating armel configs to remove PREEMPT + + [ Catalin Marinas ] + + * Fix the VFP handling on the Feroceon CPU + + [ Huaxu Wan ] + + * SAUCE: (drop after 2.6.28) [Jaunty] iwlagn: fix iwlagn DMA mapping + direction + + [ Ike Panhc ] + + * squashfs: correct misspelling + - LP: #322306 + + [ Theodore Ts'o ] + + * SAUCE: (drop after 2.6.28) ext4: add EXT4_IOC_ALLOC_DA_BLKS ioctl + * SAUCE: (drop after 2.6.28) ext4: Automatically allocate delay allocated + blocks on close + * SAUCE: (drop after 2.6.28) ext4: Automatically allocate delay allocated + blocks on rename + - LP: #317781 + + [ Tyler Hicks ] + + * SAUCE: (drop after 2.6.28) eCryptfs: Don't encrypt file key with + filename key + - LP: #342128 + + [ Upstream Kernel Changes ] + + * ALS: hda - Add support of iMac 24 Aluminium + * USB: fix broken OTG makefile reference + * ALSA: hda - add another MacBook Pro 3,1 SSID + * ALSA: hda - Add model entry for HP dv4 + * x86-64: fix int $0x80 -ENOSYS return + - LP: #339743 + + -- Tim Gardner Thu, 12 Mar 2009 19:16:07 -0600 + +linux (2.6.28-9.31) jaunty; urgency=low + + [ Andy Whitcroft ] + + * SAUCE: cpufreq-nforce2: probe failures are not errors + - LP: #332170 + * SAUCE: mmc: add MODALIAS linkage for MMC/SD devices + - LP: #30335 + * remove test-suspend script + - LP: #333856 + + [ Kees Cook ] + + * handle relative paths in modules.dep + Fixes 2.6.28-9.30 FTBS. 
+ + [ Upstream Kernel Changes ] + + * ricoh_mmc: Handle newer models of Ricoh controllers + + -- Tim Gardner Wed, 11 Mar 2009 08:19:24 -0600 + +linux (2.6.28-9.30) jaunty; urgency=low + + [ Amit Kucheria ] + + * ARM:mx51 Add SoC and board support for mx51 platforms + * ARM:mx51 Add CONFIG_ARCH_MXC_CANONICAL to disable parts of Freescale's + code + * MMC: Add support for 8-bit cards + * Add ARM:MX51 SoC support to the build system + * ARM: Make ARM arch aware of ubuntu/ drivers + * ARM: Add imx51 configuration + * Disable d-i modules for imx51 and mv78xx0 + * Disable Apparmor on boot for ARM + * Updating imx51 config + + [ Jason Liu ] + + * Do not use OOB with MLC NAND + + [ Richard Zhu ] + + * Support the eMMC4.3 card + + [ Rob Herring ] + + * ARM: Add more cache memory types macros + + [ Tim Gardner ] + + * Set CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y for i386/amd64/lpia + + [ Manoj Iyer ] + + * Enable CONFIG_RTL8187SE=m + + [ Upstream Kernel Changes ] + + * USB: EHCI: slow down ITD reuse + - LP: #329437 + + -- Tim Gardner Sun, 08 Mar 2009 14:14:15 -0600 + +linux (2.6.28-9.29) jaunty; urgency=low + + [ Andy Whitcroft ] + + * link-headers -- only link directories which do not already exist + - LP: #315252 + + [ Daniel Marjamäki ] + + * SAUCE: (drop after 2.6.28) netxen: fix memory leak in + drivers/net/netxen_nic_init.c + - LP: #330813 + + [ Dhananjay Phadke ] + + * SAUCE: (drop after 2.6.28) netxen: fix endianness in firmware commands + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: fix ipv6 offload and tx cleanup + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: fix link speed reporting for some + boards + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: firmware init fix + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: cleanup mac list on driver unload + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: hold tx lock while sending firmware + commands + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: handle dma mapping failures + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: avoid invalid iounmap + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: include ipv6.h (fixes build failure) + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: fix vlan tso/checksum offload + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: reduce memory footprint + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: revert jumbo ringsize + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: fix msi-x interrupt handling + - LP: #330813 + * SAUCE: (drop after 2.6.28) netxen: remove pcie workaround + - LP: #330813 + + [ Hannes Eder ] + + * SAUCE: (drop after 2.6.28) drivers/net/netxen: fix sparse warnings: use + NULL pointer instead of plain integer + - LP: #330813 + + [ Huaxu Wan ] + + * SAUCE: report rfkill changes event if interface is down + - LP: #193970 + + [ Tim Gardner ] + + * MV78XX0 must specify a target in the vars definition. 
+ + [ Upstream Kernel Changes ] + + * Revert "ext4: wait on all pending commits in ext4_sync_fs()" + * jbd2: Fix return value of jbd2_journal_start_commit() + * jbd2: Avoid possible NULL dereference in + jbd2_journal_begin_ordered_truncate() + * ext4: Fix to read empty directory blocks correctly in 64k + * ext4: Fix lockdep warning + * ext4: Initialize preallocation list_head's properly + * ext4: Implement range_cyclic in ext4_da_writepages instead of + write_cache_pages + * ext4: Fix NULL dereference in ext4_ext_migrate()'s error handling + * ext4: Add fallback for find_group_flex + * ext4: Fix deadlock in ext4_write_begin() and ext4_da_write_begin() + * Added mv78xx0 flavor + + -- Tim Gardner Fri, 06 Mar 2009 06:13:31 -0700 + +linux (2.6.28-8.28) jaunty; urgency=low + + [ Alexey Starikovskiy ] + + * SAUCE: ACPI: EC: Limit workaround for ASUS notebooks even more + - LP: #288385 + + [ Scott James Remnant ] + + * SAUCE: Auto-load esp module when device opened. + * SAUCE: Auto-load bridge module when socket opened. + * SAUCE: Auto-load af_netlink module when socket opened. + * SAUCE: Auto-load wanrouter module when socket opened. + * SAUCE: Auto-load ip_queue module when socket opened. + * SAUCE: Auto-load ip6_queue module when socket opened. + * SAUCE: Auto-load cn module when socket opened. + * SAUCE: Auto-load scsi_transport_iscsi module when socket opened. + * SAUCE: Auto-load ftl module when device opened. + * SAUCE: Auto-load pcd module when device opened. + * SAUCE: Auto-load pf module when device opened. + * SAUCE: Auto-load nftl module when device opened. + * SAUCE: Auto-load mousedev module when psaux device opened. + * SAUCE: Auto-load mousedev module when /dev/input/mice opened. + * SAUCE: Auto-load rng-core module when device opened. + * SAUCE: Auto-load openprom module when device opened. + * SAUCE: Auto-load applicom module when device opened. + * SAUCE: Auto-load toshiba module when device opened. + * SAUCE: Auto-load cyclades module when device opened. + * SAUCE: Auto-load riscom8 module when device opened. + * SAUCE: Auto-load specialix module when device opened. + * SAUCE: Auto-load videodev module when device opened. + * SAUCE: Auto-load i2c_dev module when device opened. + * SAUCE: Auto-load mtdchar module when device opened. + * SAUCE: Auto-load pt module when device opened. + * SAUCE: Auto-load pg module when device opened. + * SAUCE: Auto-load cdc_acm module when device opened. + * SAUCE: Auto-load msr module when device opened. + * SAUCE: Auto-load cpuid module when device opened. + * SAUCE: quickcam: Enable double-buffering by default + * SAUCE: libata: Ignore HPA by default. + * SAUCE: hostap: Change initial operation mode to managed (infra) + * SAUCE: floppy: Provide a PnP device table in the module. + - LP: #255651 + * SAUCE: Auto-load mwave module when device opened. + * Build CONFIG_FUSE_FS into kernel, not as module. 
+ + [ Stefan Bader ] + + * Enable build of ext4 as a module on LPIA + - LP: #331848 + + [ Tim Gardner ] + + * Update configs to fix LPIA FTBS + + -- Tim Gardner Thu, 05 Mar 2009 10:43:24 -0700 + +linux (2.6.28-8.27) jaunty; urgency=low + + [ Amit Kucheria ] + + * Updating configs (arm:ixp4xx) + + [ Andy Whitcroft ] + + * SAUCE: enable Intel HDMI output + + [ Manoj Iyer ] + + * SAUCE: Added quirk for Linksys WUSB600N USB wifi-n networking adapter + - LP: #323473 + + [ Steve Beattie ] + + * fix apparmor memory leak on unlinked file ops + - LP: #329489 + + [ Tim Gardner ] + + * SAUCE: Dell XPS710 reboot quirk + - LP: #323592 + * SAUCE: (drop after 2.6.28) ieee80211: Add infrastructure to obsolete + scan results + - LP: #336055 + * Add modules.order to the linux-image package. + + [ Upstream Kernel Changes ] + + * iwlwifi: fix time interval misuse in iwl_poll_{direct_}bit + * x86: only scan the root bus in early PCI quirks + - LP: #267295 + * ALSA: hda - Intel HDMI audio support + * ALSA: hda - Fix unused function in patch_intelhdmi.c + * ALSA: handle SiI1392 HDMI codec in patch_intelhdmi.c + * ALSA: hda-intel: reorder HDMI audio enabling sequence + * ALSA: introduce snd_print_pcm_rates() + * ALSA: create hda_eld.c for ELD routines and proc interface + * ALSA: ELD proc interface for HDMI sinks + * ALSA: hda: make standalone hdmi_fill_audio_infoframe() + * ALSA: hda: make global snd_print_channel_allocation() + * ALSA: hda: HDMI channel allocations for audio infoframe + * ALSA: hda: HDMI channel mapping cleanups + * ALSA: hda: minor code cleanups + * ALSA: hda: rename sink_eld to hdmi_eld + * ALSA: hda - Release ELD proc file + * ALSA: hda - minor HDMI code cleanups + * ALSA: hda - report selected CA index for Audio InfoFrame + * ALSA: hda - Add Intel vendor id string + + -- Tim Gardner Wed, 25 Feb 2009 14:23:46 -0700 + +linux (2.6.28-8.26) jaunty; urgency=low + + [ Amit Kucheria ] + + * Updating configs (armel:ixp4xx) + - LP: #331510 + + [ Tim Gardner ] + + * Add more missing modules + + -- Tim Gardner Tue, 24 Feb 2009 06:58:53 -0700 + +linux (2.6.28-8.25) jaunty; urgency=low + + [ Scott James Remnant ] + + * SAUCE: Prefer powernow-k8 to acpi-cpufreq + * Change CONFIG_X86_P4_CLOCKMOD to be a module again. 
+ + [ Tim Gardner ] + + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Initialize the new + group descriptor when resizing the filesystem" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Add sanity check + to make_indexed_dir" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: only use + i_size_high for regular files" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Add sanity checks + for the superblock before mounting the filesystem" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Fix + s_dirty_blocks_counter if block allocation failed with nodelalloc" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Init the complete + page while building buddy cache" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Don't allow new + groups to be added during block allocation" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: mark the + blocks/inode bitmap beyond end of group as used" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Use new + buffer_head flag to check uninit group bitmaps initialization" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Fix the race + between read_inode_bitmap() and ext4_new_inode()" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Fix race between + read_block_bitmap() and mark_diskspace_used()" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: don't use blocks + freed but not yet committed in buddy cache init" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: cleanup mballoc + header files" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Use + EXT4_GROUP_INFO_NEED_INIT_BIT during resize" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Add blocks added + during resize to bitmap" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Don't overwrite + allocation_context ac_status" + * Revert "SAUCE: (revert before 2.6.28.y update) jbd2: Add barrier not + supported test to journal_wait_on_commit_record" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Widen type of + ext4_sb_info.s_mb_maxs[]" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: avoid ext4_error + when mounting a fs with a single bg" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Fix the delalloc + writepages to allocate blocks at the right offset." 
+ * Revert "SAUCE: (revert before 2.6.28.y update) ext4: tone down + ext4_da_writepages warnings" + * Revert "SAUCE: (revert before 2.6.28.y update) ext4: Add support for + non-native signed/unsigned htree hash algorithms" + * Enabled X86_ACPI_CPUFREQ=y + + [ Upstream Kernel Changes ] + + * ath9k: quiet harmless ForceXPAon messages + - LP: #321474 + * [WATCHDOG] iTCO_wdt: fix SMI_EN regression 2 + - LP: #314050 + * pid: implement ns_of_pid + * mqueue: fix si_pid value in mqueue do_notify() + * powerpc/vsx: Fix VSX alignment handler for regs 32-63 + * sata_nv: give up hardreset on nf2 + * Fix Intel IOMMU write-buffer flushing + * SCSI: libiscsi: fix iscsi pool leak + * x86/cpa: make sure cpa is safe to call in lazy mmu mode + * sched: SCHED_OTHER vs SCHED_IDLE isolation + * x86, vm86: fix preemption bug + * Add support for VT6415 PCIE PATA IDE Host Controller + * ext2/xip: refuse to change xip flag during remount with busy inodes + * 3c505: do not set pcb->data.raw beyond its size + * Bluetooth: Fix TX error path in btsdio driver + * ext4: Add support for non-native signed/unsigned htree hash algorithms + * ext4: tone down ext4_da_writepages warnings + * ext4: Fix the delalloc writepages to allocate blocks at the right + offset. + * ext4: avoid ext4_error when mounting a fs with a single bg + * ext4: Widen type of ext4_sb_info.s_mb_maxs[] + * jbd2: Add barrier not supported test to journal_wait_on_commit_record + * ext4: Don't overwrite allocation_context ac_status + * ext4: Add blocks added during resize to bitmap + * ext4: Use EXT4_GROUP_INFO_NEED_INIT_BIT during resize + * ext4: cleanup mballoc header files + * ext4: don't use blocks freed but not yet committed in buddy cache init + * ext4: Fix race between read_block_bitmap() and mark_diskspace_used() + * ext4: Fix the race between read_inode_bitmap() and ext4_new_inode() + * ext4: Use new buffer_head flag to check uninit group bitmaps + initialization + * ext4: mark the blocks/inode bitmap beyond end of group as used + * ext4: Don't allow new groups to be added during block allocation + * ext4: Init the complete page while building buddy cache + * ext4: Fix s_dirty_blocks_counter if block allocation failed with + nodelalloc + * ext4: Add sanity checks for the superblock before mounting the + filesystem + * ext4: only use i_size_high for regular files + * ext4: Add sanity check to make_indexed_dir + * ext4: Initialize the new group descriptor when resizing the filesystem + * Fix longstanding "error: storage size of '__mod_dmi_device_table' isn't + known" + * Linux 2.6.28.7 + + -- Tim Gardner Thu, 19 Feb 2009 06:45:55 -0700 + +linux (2.6.28-8.24) jaunty; urgency=low + + [ Scott James Remnant ] + + * Change CPU_FREQ_DEFAULT_GOV_ONDEMAND to y + * SAUCE: Link acpi-cpufreq.o first + + [ Tim Gardner ] + + * Build in CPU Frequency scaling drivers + + -- Tim Gardner Wed, 18 Feb 2009 06:12:24 -0700 + +linux (2.6.28-8.23) jaunty; urgency=low + + [ Andy Whitcroft ] + + * include the kernel configuration in the sub-flavour images + - LP: #328859 + + [ Tim Gardner ] + + * Revert "SAUCE: (drop after 2.6.28) [eCryptfs] Regression in unencrypted + filename symlinks" in favor of upstream commit. 
+ * Fix compile issues with qc-usb + * SAUCE: (remove after 2.6.28) V4L/DVB (10216): saa7127: fix broken + S-Video with saa7129 + - LP: #329267 + + [ Upstream Kernel Changes ] + + * Subject:SAUCE: LP#193970 iwlagn: fix hw-rfkill while the interface is + down + - LP: #193970 + * x86, vmi: put a missing paravirt_release_pmd in pgd_dtor + * nbd: fix I/O hang on disconnected nbds + * mac80211: restrict to AP in outgoing interface heuristic + * w1: w1 temp calculation overflow fix + * zd1211rw: adding 0ace:0xa211 as a ZD1211 device + * zd1211rw: treat MAXIM_NEW_RF(0x08) as UW2453_RF(0x09) for TP-Link + WN322/422G + * parport: parport_serial, don't bind netmos ibm 0299 + * syscall define: fix uml compile bug + * kernel-doc: fix syscall wrapper processing + * Fix page writeback thinko, causing Berkeley DB slowdown + * write-back: fix nr_to_write counter + * writeback: fix break condition + * mm: rearrange exit_mmap() to unlock before arch_exit_mmap + * powerpc/fsl-booke: Fix mapping functions to use phys_addr_t + * lockd: fix regression in lockd's handling of blocked locks + * sctp: Fix crc32c calculations on big-endian arhes. + * sctp: Correctly start rtx timer on new packet transmissions. + * sctp: Properly timestamp outgoing data chunks for rtx purposes + * net: Fix frag_list handling in skb_seq_read + * net: Fix OOPS in skb_seq_read(). + * drivers/net/skfp: if !capable(CAP_NET_ADMIN): inverted logic + * ipv4: fix infinite retry loop in IP-Config + * net: Fix userland breakage wrt. linux/if_tunnel.h + * net: packet socket packet_lookup_frame fix + * packet: Avoid lock_sock in mmap handler + * sungem: Soft lockup in sungem on Netra AC200 when switching interface + up + * udp: Fix UDP short packet false positive + * udp: increments sk_drops in __udp_queue_rcv_skb() + * ipv6: Disallow rediculious flowlabel option sizes. + * ipv6: Copy cork options in ip6_append_data + * net: 4 bytes kernel memory disclosure in SO_BSDCOMPAT gsopt try #2 + * sky2: fix hard hang with netconsoling and iface going up + * tun: Add some missing TUN compat ioctl translations. + * tun: Fix unicast filter overflow + * virtio_net: Fix MAX_PACKET_LEN to support 802.1Q VLANs + * tcp: splice as many packets as possible at once + * tcp: Fix length tcp_splice_data_recv passes to skb_splice_bits. + * sparc: Enable syscall wrappers for 64-bit (CVE-2009-0029) + * sparc64: Annotate sparc64 specific syscalls with SYSCALL_DEFINEx() + * ALSA: hda - Add missing terminator in slave dig-out array + * ALSA: mtpav - Fix initial value for input hwport + * HID: adjust report descriptor fixup for MS 1028 receiver + * ide/libata: fix ata_id_is_cfa() (take 4) + * libata: fix EH device failure handling + * netfilter: fix tuple inversion for Node information request + * netfilter: xt_sctp: sctp chunk mapping doesn't work + * x86: microcode_amd: fix wrong handling of equivalent CPU id + * ide-cd: fix DMA for non bio-backed requests + * net: Fix data corruption when splicing from sockets. 
+ * Linux 2.6.28.6 + * eCryptfs: Regression in unencrypted filename symlinks + + -- Tim Gardner Mon, 16 Feb 2009 06:43:51 -0700 + +linux (2.6.28-8.22) jaunty; urgency=low + + [ Amit Kucheria ] + + * Remove perm-blacklist + + [ Andy Whitcroft ] + + * SAUCE: psmouse/synaptics: ensure we reset the device on resume + - LP: #317270 + + [ Tim Gardner ] + + * Add lpia to getabi script + * SAUCE: tracer for sreadahead + + -- Amit Kucheria Fri, 13 Feb 2009 15:23:21 +0200 + +linux (2.6.28-8.21) jaunty; urgency=low + + [ Andy Whitcroft ] + + * SAUCE: switch the Asus Pundit P1-AH2 to old acpi sleep ordering + - LP: #327267 + + [ Tim Gardner ] + + * Added LPIA arch support + * Added libdrm-dev as a 'Replaces' to linux-libc-dev + * SAUCE: LPIA support for 9202 HDA Sigmatel codec + * SAUCE: Add an X86_LPIA Kconfig option + * SAUCE: UHCI USB quirk for resume + * SAUCE: LPIA Reboot fix for Intel Crownbeach development boards + * SAUCE: LPIA Logical reset of USB port on resume + * Set CONFIG_WIRELESS_OLD_REGULATORY=n, added wireless-crda + as an install dependency. + + [ Upstream Kernel Changes ] + + * Revert "Revert "x86, early_ioremap: fix fencepost error"" + - LP: #312554 + * drm/i915: capture last_vblank count at IRQ uninstall time too + - LP: #320813 + * drm/i915: add get_vblank_counter function for GM45 + - LP: #320813 + * Staging: comedi: fix Kbuild + * Staging: meilhaus: fix Kbuild + * Staging: android: binder: fix arm build errors + * Staging: android: timed_gpio: Fix build to build on kernels after + 2.6.25. + * Staging: android: fix build error on 64bit boxes + * Staging: android: Add lowmemorykiller documentation. + * Staging: android: task_get_unused_fd_flags: fix the wrong usage of + tsk->signal + * staging: agnx: drivers/staging/agnx/agnx.h needs + * Staging: usbip: usbip_start_threads(): handle kernel_thread failure + * Staging: poch: fix verification of memory area + * Documentation: move DMA-mapping.txt to Doc/PCI/ + * sgi-xp: fix writing past the end of kzalloc()'d space + * do_wp_page: fix regression with execute in place + * wait: prevent exclusive waiter starvation + * shm: fix shmctl(SHM_INFO) lockup with !CONFIG_SHMEM + * revert "rlimit: permit setting RLIMIT_NOFILE to RLIM_INFINITY" + * prevent kprobes from catching spurious page faults + * sound: usb-audio: handle wMaxPacketSize for FIXED_ENDPOINT devices + * md: Ensure an md array never has too many devices. + * md: Fix a bug in linear.c causing which_dev() to return the wrong + device. + * ACPI: Enable bit 11 in _PDC to advertise hw coord + * ACPI: dock: Don't eval _STA on every show_docked sysfs read + * ieee1394: ohci1394: increase AT req. retries, fix ack_busy_X from + Panasonic camcorders and others + * firewire: ohci: increase AT req. retries, fix ack_busy_X from Panasonic + camcorders and others + * firewire: sbp2: fix DMA mapping leak on the failure path + * firewire: sbp2: add workarounds for 2nd and 3rd generation iPods + * ieee1394: sbp2: add workarounds for 2nd and 3rd generation iPods + * module: remove over-zealous check in __module_get() + * x86: APIC: enable workaround on AMD Fam10h CPUs + * eeepc-laptop: fix oops when changing backlight brightness during + eeepc-laptop init + * eeepc-laptop: Add support for extended hotkeys + * e1000: fix bug with shared interrupt during reset + * e1000: Fix PCI enable to honor the need_ioport flag + * agp/intel: Fix broken ® symbol in device name. 
+ * ALSA: hda - Add quirk for FSC Amilo Xi2550 + * ALSA: hda - Add missing COEF initialization for ALC887 + * ALSA: hda - Add missing initialization for ALC272 + * asus_acpi: Add R1F support + * panasonic-laptop: fix X[ ARRAY_SIZE(X) ] + * ACPI: Skip the first two elements in the _BCL package + * ACPI: proc_dir_entry 'video/VGA' already registered + * ACPI: disable ACPI cleanly when bad RSDP found + * ACPICA: Fix table entry truncation calculation + * PCI: properly clean up ASPM link state on device remove + * PCI: return error on failure to read PCI ROMs + * seq_file: move traverse so it can be used from seq_read + * seq_file: fix big-enough lseek() + read() + * serial: set correct baud_base for Oxford Semiconductor Ltd EXSYS + EX-41092 Dual 16950 Serial adapter + * Add support for '8-port RS-232 MIC-3620 from advantech' + * mm: fix error case in mlock downgrade reversion + * elf core dump: fix get_user use + * ACPI: video: Fix reversed brightness behavior on ThinkPad SL series + * ipw2200: fix scanning while associated + * XFS: set b_error from bio error in xfs_buf_bio_end_io + * Revert USB: option: add Pantech cards + * USB: option: New mobile broadband modems to be supported + * USB: new id for ti_usb_3410_5052 driver + * USB: two more usb ids for ti_usb_3410_5052 + * USB: usb-storage: add Pentax to the bad-vendor list + * sata_via: Add VT8261 support + * nbd: do not allow two clients at the same time + * sctp: Fix another socket race during accept/peeloff + * Linux 2.6.28.5 + + -- Tim Gardner Mon, 09 Feb 2009 16:11:28 -0700 + +linux (2.6.28-7.20) jaunty; urgency=low + + [ Tim Gardner ] + + * SAUCE: Input: atkbd - Samsung NC10 key repeat fix + + [ Upstream Kernel Changes ] + + * Manually revert "mlock: downgrade mmap sem while populating mlocked + regions" + * xen: make sysfs files behave as their names suggest + * sata_mv: fix 8-port timeouts on 508x/6081 chips + * m68knommu: set NO_DMA + * PCI/MSI: bugfix/utilize for msi_capability_init() + * x86: use early clobbers in usercopy*.c + * netfilter: ctnetlink: fix scheduling while atomic + * orinoco: move kmalloc(..., GFP_KERNEL) outside spinlock in + orinoco_ioctl_set_genie + * fbdev/atyfb: Fix DSP config on some PowerMacs & PowerBooks + * kmalloc: return NULL instead of link failure + * sata_nv: rename nv_nf2_hardreset() + * sata_nv: fix MCP5x reset + * sata_nv: ck804 has borked hardreset too + * Fix memory corruption in console selection + * Add enable_ms to jsm driver + * nfsd: only set file_lock.fl_lmops in nfsd4_lockt if a stateowner is + found + * nfsd: Ensure nfsv4 calls the underlying filesystem on LOCKT + * iwlwifi: fix rs_get_rate WARN_ON() + * p54: fix lm87 checksum endianness + * p54: fix p54_read_eeprom to cope with tx_hdr_len + * p54usb: rewriting rx/tx routines to make use of usb_anchor's facilities + * minstrel: fix warning if lowest supported rate index is not 0 + * PCI: irq and pci_ids patch for Intel Tigerpoint DeviceIDs + * cpuidle: Add decaying history logic to menu idle predictor + * ACPI: Avoid array address overflow when _CST MWAIT hint bits are set + * video: always update the brightness when poking "brightness" + * Newly inserted battery might differ from one just removed, so update of + battery info fields is required. 
+ * ACPI: Do not modify SCI_EN directly + * dlm: initialize file_lock struct in GETLK before copying conflicting + lock + * sata_mv: Fix chip type for Hightpoint RocketRaid 1740/1742 + * ACPICA: Allow multiple backslash prefix in namepaths + * Linux 2.6.28.4 + + -- Tim Gardner Sat, 07 Feb 2009 18:53:42 -0700 + +linux (2.6.28-7.19) jaunty; urgency=low + + * Fix missing modules FTBS + + -- Tim Gardner Thu, 05 Feb 2009 15:28:15 -0700 + +linux (2.6.28-7.18) jaunty; urgency=low + + [ Alok Kataria ] + + * SAUCE: (drop after 2.6.29) x86: add a synthetic TSC_RELIABLE feature + bit + - LP: #319945 + * SAUCE: (drop after 2.6.29) x86: add X86_FEATURE_HYPERVISOR feature bit + - LP: #319945 + * SAUCE: (drop after 2.6.29) x86: Hypervisor detection and get tsc_freq + from hypervisor + - LP: #319945 + * SAUCE: (drop after 2.6.29) x86: Add a synthetic TSC_RELIABLE feature + bit. + - LP: #319945 + * SAUCE: (drop after 2.6.29) x86: Skip verification by the watchdog for + TSC clocksource. + - LP: #319945 + * SAUCE: (drop after 2.6.29) x86: VMware: Fix vmware_get_tsc code + - LP: #319945 + * SAUCE: (drop after 2.6.29) x86: vmware: look for DMI string in the + product serial key + - LP: #319945 + + [ Andy Whitcroft ] + + * SAUCE: toshiba_acpi -- pull in current -dev version of driver + - LP: #269831 + * SAUCE: toshiba_acpi -- add acpi hotkey kernel thread + - LP: #269831 + * move toshiba laptops back from tlsup to toshiba_acpi + - LP: #269831 + + [ Aneesh Kumar K.V ] + + * SAUCE: (revert before 2.6.28.y update) ext4: Fix the delalloc + writepages to allocate blocks at the right offset. + * SAUCE: (revert before 2.6.28.y update) ext4: avoid ext4_error when + mounting a fs with a single bg + * SAUCE: (revert before 2.6.28.y update) ext4: Don't overwrite + allocation_context ac_status + * SAUCE: (revert before 2.6.28.y update) ext4: Add blocks added during + resize to bitmap + * SAUCE: (revert before 2.6.28.y update) ext4: Use + EXT4_GROUP_INFO_NEED_INIT_BIT during resize + * SAUCE: (revert before 2.6.28.y update) ext4: cleanup mballoc header + files + * SAUCE: (revert before 2.6.28.y update) ext4: don't use blocks freed but + not yet committed in buddy cache init + * SAUCE: (revert before 2.6.28.y update) ext4: Fix race between + read_block_bitmap() and mark_diskspace_used() + * SAUCE: (revert before 2.6.28.y update) ext4: Fix the race between + read_inode_bitmap() and ext4_new_inode() + * SAUCE: (revert before 2.6.28.y update) ext4: Use new buffer_head flag + to check uninit group bitmaps initialization + * SAUCE: (revert before 2.6.28.y update) ext4: mark the blocks/inode + bitmap beyond end of group as used + * SAUCE: (revert before 2.6.28.y update) ext4: Don't allow new groups to + be added during block allocation + * SAUCE: (revert before 2.6.28.y update) ext4: Init the complete page + while building buddy cache + * SAUCE: (revert before 2.6.28.y update) ext4: Fix s_dirty_blocks_counter + if block allocation failed with nodelalloc + + [ Hannes Eder ] + + * SAUCE: (drop after 2.6.29) x86: vmware - fix sparse warnings + - LP: #319945 + + [ Luke Yelavich ] + + * hid modules have hyphens instead of underscores in their names + + [ Mark Fasheh ] + + * SAUCE: (revert before 2.6.28.y update) jbd2: Add BH_JBDPrivateStart + + [ Theodore Ts'o ] + + * SAUCE: (revert before 2.6.28.y update) ext4: Add support for non-native + signed/unsigned htree hash algorithms + * SAUCE: (revert before 2.6.28.y update) ext4: tone down + ext4_da_writepages warnings + * SAUCE: (revert before 2.6.28.y update) jbd2: Add barrier not 
supported + test to journal_wait_on_commit_record + * SAUCE: (revert before 2.6.28.y update) ext4: Add sanity checks for the + superblock before mounting the filesystem + * SAUCE: (revert before 2.6.28.y update) ext4: only use i_size_high for + regular files + * SAUCE: (revert before 2.6.28.y update) ext4: Add sanity check to + make_indexed_dir + * SAUCE: (revert before 2.6.28.y update) jbd2: On a __journal_expect() + assertion failure printk "JBD2", not "EXT3-fs" + * SAUCE: (revert before 2.6.28.y update) ext4: Initialize the new group + descriptor when resizing the filesystem + + [ Tyler Hicks ] + + * SAUCE: (drop after 2.6.28) [eCryptfs] Regression in unencrypted + filename symlinks + - LP: #322532 + + [ Upstream Kernel Changes ] + + * Input: atkbd - broaden the Dell DMI signatures + - LP: #261721 + * ti_usb_3410_5052: support alternate firmware + * ath5k: fix mesh point operation + * mac80211: decrement ref count to netdev after launching mesh discovery + * inotify: clean up inotify_read and fix locking problems + * fuse: destroy bdi on umount + * fuse: fix missing fput on error + * fuse: fix NULL deref in fuse_file_alloc() + * x86, mm: fix pte_free() + * klist.c: bit 0 in pointer can't be used as flag + * sysfs: fix problems with binary files + * x86: fix page attribute corruption with cpa() + * USB: fix toggle mismatch in disable_endpoint paths + * sound: virtuoso: enable UART on Xonar HDAV1.3 + * USB: usbmon: Implement compat_ioctl + * USB: fix char-device disconnect handling + * USB: storage: add unusual devs entry + * alpha: nautilus - fix compile failure with gcc-4.3 + * alpha: fix vmalloc breakage + * resources: skip sanity check of busy resources + * rtl8187: Add termination packet to prevent stall + * it821x: Add ultra_mask quirk for Vortex86SX + * libata: pata_via: support VX855, future chips whose IDE controller use + 0x0571 + * serial_8250: support for Sealevel Systems Model 7803 COMM+8 + * drm: stash AGP include under the do-we-have-AGP ifdef + * Fix OOPS in mmap_region() when merging adjacent VM_LOCKED file segments + * bnx2x: Block nvram access when the device is inactive + * ext3: Add sanity check to make_indexed_dir + * rtl8187: Fix error in setting OFDM power settings for RTL8187L + * epoll: drop max_user_instances and rely only on max_user_watches + * gpiolib: fix request related issue + * sgi-xpc: Remove NULL pointer dereference. 
+ * sgi-xpc: ensure flags are updated before bte_copy + * include/linux: Add bsg.h to the Kernel exported headers + * ALSA: hda - Fix PCM reference NID for STAC/IDT analog outputs + * ALSA: hda - add another MacBook Pro 4, 1 subsystem ID + * ALSA: hda - Add quirk for HP DV6700 laptop + * crypto: authenc - Fix zero-length IV crash + * crypto: ccm - Fix handling of null assoc data + * x86, pat: fix reserve_memtype() for legacy 1MB range + * x86, pat: fix PTE corruption issue while mapping RAM using /dev/mem + * PCI hotplug: fix lock imbalance in pciehp + * dmaengine: fix dependency chaining + * NET: net_namespace, fix lock imbalance + * relay: fix lock imbalance in relay_late_setup_files + * Linux 2.6.28.3 + * ALSA: Enable SPDIF output on ALC655 + * ALSA: hda - Add ASUS V1Sn support + * ALSA: hda - support detecting HD Audio devices with PCI class code + * ALSA: hda: alc883 model for ASUS P5Q-EM boards + * ALSA: hda - Add quirk for MSI 7260 mobo + * ALSA: hda - Add quirk for Sony VAIO VGN-SR19XN + * ALSA: oxygen: add Claro halo support + * ALSA: hda - Add a new function to seek for a codec ID + * ALSA: patch_sigmatel: Add missing Gateway entries and autodetection + * ALSA: hda - More fixes on Gateway entries + * ALSA: hda - Add MCP67 HDMI support + * ALSA: hda - fix name for ALC1200 + * LSA: hda - Add HP Acacia detection + * ALSA: hda - Add quirk for HP 2230s + * ALSA: hda - Add quirk for Dell Inspiron Mini9 + * ALSA: hda - add support for Intel DX58SO board + * ALSA: hda - Fix silent headphone output on Panasonic CF-74 + * ALSA: USB quirk for Logitech Quickcam Pro 9000 name + * ALSA: hda - add quirks for some 82801H variants to use ALC883_MITAC + + [ Yasunori Goto ] + + * SAUCE: (revert before 2.6.28.y update) ext4: Widen type of + ext4_sb_info.s_mb_maxs[] + + -- Tim Gardner Mon, 02 Feb 2009 23:07:13 -0700 + +linux (2.6.28-6.17) jaunty; urgency=low + + [ Amit Kucheria ] + + * Updating configs: ARMEL/versatile + + -- Amit Kucheria Fri, 30 Jan 2009 13:36:59 +0200 + +linux (2.6.28-6.16) jaunty; urgency=low + + [ Luke Yelavich ] + + * Add hid quirks to input-modules udeb + + [ Tim Gardner ] + + * Revert "[arm] Fix kexec on ARM by properly calling the relocation + function". This patch was deemed 'bogus' by Russell King on the + ARM mailing list. + + [ Upstream Kernel Changes ] + + * PCI: keep ASPM link state consistent throughout PCIe hierarchy + * security: introduce missing kfree + * rt2x00: add USB ID for the Linksys WUSB200. 
+ * p54usb: Add USB ID for Thomson Speedtouch 121g + * lib/idr.c: use kmem_cache_zalloc() for the idr_layer cache + * sgi-xp: eliminate false detection of no heartbeat + * sched: fix update_min_vruntime + * IA64: Turn on CONFIG_HAVE_UNSTABLE_CLOCK + * sound: virtuoso: do not overwrite EEPROM on Xonar D2/D2X + * ALSA: hda - Add quirk for another HP dv5 + * ALSA: hda - Fix HP dv5 mic input + * ALSA: hda - Don't reset HP pinctl in patch_sigmatel.c + * ALSA: hda - make laptop-eapd model back for AD1986A + * drivers/net/irda/irda-usb.c: fix buffer overflow + * usb-storage: add last-sector hacks + * usb-storage: set CAPACITY_HEURISTICS flag for bad vendors + * pkt_sched: sch_htb: Fix deadlock in hrtimers triggered by HTB + * ipv6: Fix fib6_dump_table walker leak + * sctp: Avoid memory overflow while FWD-TSN chunk is received with bad + stream ID + * pkt_sched: cls_u32: Fix locking in u32_change() + * r6040: fix wrong logic in mdio code + * r6040: save and restore MIER correctly in the interrupt routine + * r6040: bump release number to 0.19 + * tcp: don't mask EOF and socket errors on nonblocking splice receive + * p54usb: fix traffic stalls / packet drop + * netfilter: x_tables: fix match/target revision lookup + * netfilter: ebtables: fix inversion in match code + * netfilter: nf_conntrack: fix ICMP/ICMPv6 timeout sysctls on big-endian + * dell_rbu: use scnprintf() instead of less secure sprintf() + * powerpc: is_hugepage_only_range() must account for both 4kB and 64kB + slices + * hwmon: (abituguru3) Fix CONFIG_DMI=n fallback to probe + * mm: write_cache_pages cyclic fix + * mm: write_cache_pages early loop termination + * mm: write_cache_pages writepage error fix + * mm: write_cache_pages integrity fix + * mm: write_cache_pages cleanups + * mm: write_cache_pages optimise page cleaning + * mm: write_cache_pages terminate quickly + * mm: write_cache_pages more terminate quickly + * mm: do_sync_mapping_range integrity fix + * mm: direct IO starvation improvement + * fs: remove WB_SYNC_HOLD + * fs: sync_sb_inodes fix + * fs: sys_sync fix + * Linux 2.6.28.2 + + -- Tim Gardner Sun, 25 Jan 2009 13:36:16 -0700 + +linux (2.6.28-5.15) jaunty; urgency=low + + [ Tim Gardner ] + + * Revert "Enabled CONFIG_PID_NS=y for i386/amd64" + Somehow this commit also reverted the 7 prior commits (which is bad). + * Enabled CONFIG_PID_NS=y for i386/amd64 (version 2) + + -- Tim Gardner Thu, 22 Jan 2009 13:48:34 -0700 + +linux (2.6.28-5.14) jaunty; urgency=low + + [ Ben Collins ] + + * lirc_gpio: Forward ported to current kernel (jaunty) + * configs: Enable LIRC_GPIO on 64-bit/32-bit x86 + - LP: #298791 + + [ Jeff Layton ] + + * SAUCE: cifs: make sure we allocate enough storage for socket address + - LP: #318565 + + [ Tim Gardner ] + + * check-abi: Return success when ABI skip is requested and no ABI files exist. + This ought to fix the armel FTBS. + + -- Tim Gardner Thu, 22 Jan 2009 06:42:49 -0700 + +linux (2.6.28-5.13) jaunty; urgency=low + + [ Andy Whitcroft ] + + * Revert "SAUCE: don't use buggy _BCL/_BCM/_BQC for backlight control" + + [ Tim Gardner ] + + * Fix udeb generation breakage caused by the previous armel versatile + flavour config update. 
+ + -- Tim Gardner Wed, 21 Jan 2009 12:38:35 -0700 + +linux (2.6.28-5.12) jaunty; urgency=low + + [ Ante ] + + * Update drbd to 8.3.0 + + [ Dave Airlie ] + + * i915/drm: provide compat defines for userspace for certain struct + + [ Eric Anholt ] + + * drm/i915: Don't double-unpin buffers if we take a signal in + * drm/i915: Don't complain when interrupted while pinning in execbuffers. + * drm/i915: Don't allow objects to get bound while VT switched. + + [ Jani Monoses ] + + * Fix webcam having USB ID 0ac8:303b + - LP: #292086 + + [ Jesse Barnes ] + + * drm/i915: set vblank enabled flag correctly across IRQ + * drm/i915: don't enable vblanks on disabled pipes + + [ Michael Casadevall ] + + * [arm] Fix kexec on ARM by properly calling the relocation function + + [ Tim Gardner ] + + * Enabled CONFIG_PID_NS=y for i386/amd64 + * SAUCE: Increase ATA_TMOUT_PMP_SRST_WAIT to 5 seconds. + - LP: #318978 + * Update armel versatile config + - LP: #314789 + * Enabled CONFIG_RT2860=m for i386/amd64 + * Enabled CONFIG_RT2870=m for i386/amd64 + + [ Upstream Kernel Changes ] + + * Input: atkbd - add keyboard quirk for HP Pavilion ZV6100 laptop + - LP: #291878 + * ALSA: hda - Add quirk for another HP dv7 + * ALSA: hda - Add quirk for HP6730B laptop + * ALSA: caiaq - Fix Oops with MIDI + * ALSA: hda - Fix typos for AD1882 codecs + * x86: fix intel x86_64 llc_shared_map/cpu_llc_id anomolies + * x86: default to SWIOTLB=y on x86_64 + * CIFS: make sure that DFS pathnames are properly formed + * ring-buffer: prevent false positive warning + * ring-buffer: fix dangling commit race + * iwlwifi: use GFP_KERNEL to allocate Rx SKB memory + * tx493[89]ide: Fix length for __ide_flush_dcache_range + * tx4939ide: Do not use zero count PRD entry + * SCSI: eata: fix the data buffer accessors conversion regression + * USB: emi26: fix oops on load + * x86, UV: remove erroneous BAU initialization + * x86: fix incorrect __read_mostly on _boot_cpu_pda + * vmalloc.c: fix flushing in vmap_page_range() + * fs: symlink write_begin allocation context fix + * cgroups: fix a race between cgroup_clone and umount + * dm raid1: fix error count + * dm log: fix dm_io_client leak on error paths + * minix: fix add link's wrong position calculation + * md: fix bitmap-on-external-file bug. 
+ * sched_clock: prevent scd->clock from moving backwards, take #2 + * devices cgroup: allow mkfifo + * SCSI: aha152x_cs: Fix regression that keeps driver from using shared + interrupts + * ioat: fix self test for multi-channel case + * USB: isp1760: use a specific PLX bridge instead of any bdridge + * USB: isp1760: Fix probe in PCI glue code + * USB: unusual_devs.h additions for Pentax K10D + * inotify: fix type errors in interfaces + * Move compat system call declarations to compat header file + * Convert all system calls to return a long + * Rename old_readdir to sys_old_readdir + * Remove __attribute__((weak)) from sys_pipe/sys_pipe2 + * Make sys_pselect7 static + * Make sys_syslog a conditional system call + * System call wrapper infrastructure + * powerpc: Enable syscall wrappers for 64-bit + * s390: enable system call wrappers + * System call wrapper special cases + * System call wrappers part 01 + * System call wrappers part 02 + * System call wrappers part 03 + * System call wrappers part 04 + * System call wrappers part 05 + * System call wrappers part 06 + * System call wrappers part 07 + * System call wrappers part 08 + * System call wrappers part 09 + * System call wrappers part 10 + * System call wrappers part 11 + * System call wrappers part 12 + * System call wrappers part 13 + * System call wrappers part 14 + * System call wrappers part 15 + * System call wrappers part 16 + * System call wrappers part 17 + * System call wrappers part 18 + * System call wrappers part 19 + * System call wrappers part 20 + * System call wrappers part 21 + * System call wrappers part 22 + * System call wrappers part 23 + * System call wrappers part 24 + * System call wrappers part 25 + * System call wrappers part 26 + * System call wrappers part 27 + * System call wrappers part 28 + * System call wrappers part 29 + * System call wrappers part 30 + * System call wrappers part 31 + * System call wrappers part 32 + * System call wrappers part 33 + * s390 specific system call wrappers + * x86: fix RIP printout in early_idt_handler + * Fix timeouts in sys_pselect7 + * USB: another unusual_devs entry for another bad Argosy storage device + * USB: storage: extend unusual range for 067b:3507 + * USB: storage: recognizing and enabling Nokia 5200 cell phoes + * HID: fix error condition propagation in hid-sony driver + * fix switch_names() breakage in short-to-short case + * nfs: remove redundant tests on reading new pages + * eCryptfs: check readlink result was not an error before using it + * mvsas: increase port type detection delay to suit Seagate's 10k6 drive ST3450856SS 0003 + * x86: avoid theoretical vmalloc fault loop + * ath9k: enable RXing of beacons on STA/IBSS + * mm lockless pagecache barrier fix + * powerpc: Disable Collaborative Memory Manager for kdump + * ibmvfc: Delay NPIV login retry and add retries + * ibmvfc: Improve async event handling + * getrusage: RUSAGE_THREAD should return ru_utime and ru_stime + * ath5k: ignore the return value of ath5k_hw_noise_floor_calibration + * mm: fix assertion + * XFS: truncate readdir offsets to signed 32 bit values + * Linux 2.6.28.1 + * eCryptfs: Filename Encryption: Tag 70 packets + * eCryptfs: Filename Encryption: Header updates + * eCryptfs: Filename Encryption: Encoding and encryption functions + * eCryptfs: Filename Encryption: filldir, lookup, and readlink + * eCryptfs: Filename Encryption: mount option + * eCryptfs: Replace %Z with %z + * eCryptfs: Fix data types (int/size_t) + * eCryptfs: kerneldoc for ecryptfs_parse_tag_70_packet() + * 
eCryptfs: Clean up ecryptfs_decode_from_filename() + * fs/ecryptfs/inode.c: cleanup kerneldoc + * staging-p80211: Kill directly reference of netdev->priv + * staging-slicoss: Kill directly reference of netdev->priv + * staging-winbond: Kill directly reference of netdev->priv + * Staging: go7007: fixes due to video_usercopy api change + * Staging: go7007: fixes due v4l2_file_operations api change + * staging: correct dubious use of !x & y + * Staging: w35und: make wb35_probe() and wb35_disconnect() funtions static + * Staging: w35und: remove unused wb35_open() and wb35_close() functions + * Staging: w35und: use msleep() and udelay() + * Staging: w35und: remove the no-op pa_stall_execution macro + * Staging: w35und: purb typedef removal + * Staging: w35und: reg queue struct typedef removal + * Staging: w35und: wb35reg struct typedef removal + * Staging: w35und: padapter struct typedef removal + * Staging: w35und: merge wblinux struct to adapter + * Staging: w35und: wb35_probe() cleanup + * Staging: w35und: remove usb_submit_urb wrapper function + * Staging: w35und: remove usb_alloc_urb wrapper function + * w35und: remove dead code from wbusb_f.h + * Staging: w35und: remove true/false boolean macros + * Staging: w35und: OS_MEMORY_ALLOC wrapper removal + * Staging: w35und: usb_put_dev() is missing from wb35_disconnect() + * Staging: w35und: remove macro magic from MLME_GetNextPacket() + * Staging: w35und: plug memory leak in wbsoft_tx() + * Staging: w35und: move supported band initialization out of wb35_probe() + * Staging: w35und: remove timer wrappers + * Staging: w35und: remove atomic op wrappers + * Staging: w35und: remove memcpy/memcmp wrappers + * Staging: w35und: remove abs() and BIT() macros + * Staging: w35und: remove unused macros from common.h + * Staging: w35und: remove unused link status code + * Staging: w35und: #include cleanup + * Staging: w35und: remove some dead code + * Staging: w35und: move source files to one directory + * Staging: w35und: move struct wbsoft_priv to core.h and use it + * Staging: w35und: remove ->adapter from struct _HW_DATA_T + * Staging: w35und: clean up adapter.h a bit + * Staging: w35und: merge struct wb35_adapter to struct wbsoft_priv + * Staging: w35und: remove global struct ieee80211_hw + * Staging: w35und: inline DRIVER_AUTHOR and DRIVER_DESC macros + * Staging: w35und: clean up wblinux.c a bit + * Staging: w35und: remove unused ->ShutDowned member from struct + LOCAL_PARA + * Staging: w35und: move global wbsoft_enabled to struct wbsoft_priv + * Staging: w35und: move packet_came() to wb35rx.c + * Staging: w35und: remove ->skb_array from struct wbsoft_priv + * Staging: w35und: remove ->shutdown from struct wbsoft_priv + * Staging: w35und: make functions local to mds.c static + * Staging: w35und: make functions local to mlmetxrx.c static + * Staging: w35und: remove dead code from mto.c + * Staging: w35und: make functions local to wb35rx.c static + * Staging: w35und: make functions local to wb35tx.c static + * Staging: w35und: remove dead code from wbhal.c + * Staging: w35und: remove rxisr.c as dead code + * Staging: w35und: fix Kconfig + * Staging: w35und: fix config build warnings + * Staging: wlan-ng: Remove PCI/PLX/PCMCIA files. + * Staging: wlan-ng: Update Help text to mention prism3 devices. + * Staging: wlan-ng: Delete PCI/PLX/PCMCIA-specific code. + * Staging: wlan-ng: Make wlan-ng use WEXT mode by default. + * Staging: wlan-ng: Eliminate more <2.6 kernel support. + * Staging: wlan-ng: Eliminate all backwards-compatibility for <2.6.13 kernels. 
+ * Staging: wlan-ng: Eliminate a boatload of tertiaryAP-only code. + * Staging: wlan-ng: Remove AP-only code from MLME functions. + * Staging: wlan-ng: Get rid of the MTU tests in the rx conversion path. + * Staging: wlan-ng: Eliminate one more rx mtu test. + * Staging: wlan-ng: Eliminate local 'version.h' + * Staging: wlan-ng: Eliminate usage of procfs. + * Staging: wlan-ng: Use standard kernel integer (u32/s32/etc) types. + * Staging: wlan-ng: Eliminate all backwards-compatible kernel code. + * Staging: wlan-ng: Wireless Extension support is mandatory. + * Staging: wlan-ng: use WIRELESS_EXT, not CONFIG_WIRELESS_EXT + * Staging: wlan-ng: Delete a large pile of now-unused code. + * Staging: wlan-ng: Delete a pile of unused mibs. And fix WEXT SET_TXPOWER. + * Staging: wlan-ng: Consolidate wlan-ng into a single module. + * Staging: wlan-ng: Purge all MIBs not used internally. + * Staging: wlan-ng: p80211netdev.c fix netdev alloc to prevent oops on device start + * Staging: wlan-ng: prism2_usb.c always enable the card in probe_usb + * Staging: wlan-ng: hfa384x_usb.c use newest version of 384x_drvr_start + * Staging: wlan-ng: p80211wext.c add latest changes & remove extra nulls from wext_handlers + * Staging: wlan-ng: p80211wext don't set default key id twice + * Staging: wlan-ng: hfa384x_usbin_callback: check for hardware removed + * Staging: wlan-ng: p80211conv.c copy code from wlan-ng-devel branch to not drop packets + * Staging: wlan-ng: remove unused #include + * Staging: wlan-ng: p80211wext.c: use ARRAY_SIZE + * Staging: wlan-ng: fix compiler warnings + * Staging: wlan-ng: skb_p80211_to_ether() - payload_length is unsigned, check before subtraction + * Staging: at76_usb: update drivers/staging/at76_usb w/ mac80211 port + * Staging: at76_usb: fix build breakage + * Staging: at76_usb: remove compiler warnings + * Staging: at76_usb: fix up all remaining checkpatch.pl warnings + * Staging: at76_usb: cleanup dma on stack issues + * Staging: poch: Block size bug fix + * Staging: poch: Update TODO list + * Staging: poch: Correct pages from bytes. 
+ * Staging: poch: minor fixes + * Staging: poch: Fix build warnings + * Staging: poch: Rx control register init + * Staging: poch: Fix user space protocol syncing + * Staging: poch: Fine grained locking + * Staging: sxg: remove typedefs + * Staging: sxg: break the build in a cleaner way when !x86 + * Staging: sxg: update README + * staging: struct device - replace bus_id with dev_name(), dev_set_name() + * Staging: echo: remove typedefs + * Staging: echo: Lindent drivers/staging/echo + * Staging: go7007: saa7134 updates + * Staging: go7007: add sensoray 2250/2251 support + * Staging: go7007: Convert driver to use video_ioctl2 + * Staging: go7007: annotate code pointers + * Staging: go7007: fix minor build warnings + * Staging: go7007: small cleanup + * Staging: go7007: add some more v4l2 ioctls + * Staging: et131x: Cleanup et131x_debug.h defines + * Staging: et131x: fix build failure + * Staging: et131x: remove unused variable in et1310_tx.c + * Staging: usbip: cleanup kerneldoc + * Staging: slicoss: use kzalloc + * Staging: slicoss: use correct type for memory allcations + * Staging: slicoss: use request_firmware + * Staging: add agnx wireless driver + * Staging: agnx: fix build errors due to ssid removal + * Staging: agnx: fix build errors due to rate control API changes + * Staging: agnx: fix build warnings + * Staging: add otus Atheros wireless network driver + * Staging: otus: fix netdev->priv usage + * Staging: otus: fix name clash + * Staging: otus: fix urb callback function type + * Staging: otus: remove dependence on kernel version + * Staging: add rt2860 wireless driver + * Staging: rt2860: disable root hack for reading files + * Staging: rt2860: fix up netdev->priv usage + * Staging: rt2860: use standard bit-reverse function + * Staging: rt2860: Fix minor compiler warnings + * Staging: rt2860: enable WPA_SUPPLICANT support + * Staging: Add ServerEngines benet 10Gb ethernet driver + * Staging: benet: fix netif api breakage + * Staging: benet: fix up netdev->priv change + * Staging: benet: build is broken unless CONFIG_NETPOLL is enabled + * Staging: benet: patch to remove subdirectories + * Staging: benet: fix build errors when CONFIG_NETPOLL is off + * Staging: benet: fix build error. 
+ * Staging: benet: patch to use offsetof() instead of AMAP_BYTE_OFFSET() + * Staging: benet: fix problems reported by checkpatch + * Staging: benet: cleanup a check while posting rx buffers + * Staging: add comedi core + * Staging: comedi: fix up a lot of checkpatch.pl warnings + * Staging: comedi: fix checkpatch.pl errors in comedi_fops.c + * Staging: comedi: fix build error in comedilib.h + * Staging: comedi: add kcomedilib to the tree + * Staging: comedi: set up infrastructure for individual drivers + * Staging: comedi: add local copy of interrupt.h + * Staging: comedi: add pci and usb wrapper header files + * Staging: comedi: comedi driver common function module + * Staging: comedi: add mite comedi pci driver + * Staging: comedi: add usb usbdux driver + * Staging: comedi: add usb usbduxfast driver + * Staging: comedi: add usb dt9812 driver + * Staging: comedi: add comedi_bond driver + * Staging: comedi: add comedi_test driver + * Staging: comedi: add comedi_parport driver + * Staging: comedi: dt9812: fix up a lot of coding style issues + * Staging: comedi: dt9812: remove dt9812.h + * Staging: comedi: dt9812: remove typedefs + * Staging: comedi: dt9812: fix sparse warnings + * Staging: comedi: usbdux: remove kernel version checks + * Staging: comedi: usbdux: code style cleanups + * Staging: comedi: usbdux: remove // comments + * Staging: comedi: usbdux: fix up printk calls + * Staging: comedi: usbdux: remove checkpatch.pl warnings + * Staging: comedi: usbdux: remove typedef + * Staging: comedi: usbdux: remove comedi usb wrappers + * Staging: comedi: usbduxfast: remove comedi usb wrappers + * Staging: comedi: dt9812: remove #ifdef that is not needed + * Staging: comedi: remove usb wrappers + * Staging: comedi: remove PCI wrappers + * Staging: comedi: add icp_multi driver + * Staging: comedi: add me4000 driver + * Staging: comedi: fix checkpatch.pl issues in comedi_bond.c + * Staging: comedi: fix checkpatch.pl issues in comedi_fc.c + * Staging: comedi: remove typedefs from comedi_bond.c + * Staging: comedi: fix sparse issues in comedi_bond.c + * Staging: comedi: fix checkpatch.pl issues in comedi_test.c + * Staging: comedi: fix sparse issues in comedi_test.c + * Staging: comedi: remove typedefs from comedi_test.c + * Staging: comedi: fix comedi_parport.c checkpatch.pl issues. + * Staging: comedi: fix comedi_fc.h checkpatch.pl issues. + * Staging: comedi: fix comedi_pci.h checkpatch.pl issues. 
+ * Staging: comedi: comedi_pci.h: remove unneeded wrapper + * Staging: comedi: comedi_pci.h: remove comedi_pci_enable_no_regions + * Staging: comedi: comedi_pci.h: remove comedi_pci_disable_no_regions + * Staging: comedi: add s626 driver + * Staging: comedi: add rtd520 driver + * Staging: comedi: add me_daq driver + * Staging: comedi: me_daq: fix checkpatch.pl issues + * Staging: comedi: me_daq: remove typedefs + * Staging: comedi: me_daq: fix sparse issues + * Staging: comedi: fix checkpatch.pl warning in interrupt.h + * Staging: comedi: fix build if CONFIG_PROC_FS is not set + * Staging: add asus_oled driver + * Staging: asus_oled: fix build dependancy + * Staging: Add the Meilhaus ME-IDS driver package + * Staging: meilhaus: fix __symbol_get problems + * Staging: add lcd-panel driver + * Staging: panel: major checkpatch cleanup + * Staging: panel: remove ifdefs and code for pre-2.6 kernels + * Staging: panel: remove support for smartcards + * Staging: add Driver for Altera PCI Express Chaining DMA reference design + * Staging: add rtl8187se driver + * Staging: rtl8187se: remove unneeded files + * Staging: rtl8187se: make the built module be the proper name + * Staging: rtl8187se: remove duplicate pci ids + * Staging: me4000: switch to list_for_each*() + * Staging: usbip: switch to list_for_each_entry() + * Staging: add princeton instruments usb camera driver + * Staging: add mimio xi driver + * Staging: add rt2870 wireless driver + * Staging: rt2870: disable root hack for reading files + * Staging: rt2870: fix up netdev->priv usage + * Staging: add frontier tranzport and alphatrack drivers + * Staging: frontier: remove unused alphatrack_sysfs.c file + * Staging: frontier: fix compiler warnings + * Staging: add epl stack + * Staging: epl: run Lindent on all kernel/*.h files + * Staging: epl: run Lindent on all user/*.h files + * Staging: epl: run Lindent on *.h files + * Staging: epl: run Lindent on *.c files + * Staging: epl: hr timers all run in hard irq context now + * Staging: epl: fix netdev->priv b0rkage + * Staging: add android framework + * Staging: android: add binder driver + * Staging: android: binder: Fix gcc warnings about improper format specifiers for size_t in printk + * staging: android: binder: Fix use of euid + * Staging: android: add logging driver + * Staging: android: add ram_console driver + * Staging: android: add timed_gpio driver + * Staging: android: timed_gpio: Rename android_timed_gpio to timed_gpio + * Staging: android: remove dummy android.c driver + * Staging: android: add lowmemorykiller driver + * Staging: android: binder: fix build errors + * staging: __FUNCTION__ is gcc-specific, use __func__ + * V4L/DVB (10176a): Switch remaining clear_user_page users over to + clear_user_highpage + + [ Zhenyu Wang ] + + * agp/intel: add support for G41 chipset + + -- Tim Gardner Sun, 18 Jan 2009 20:22:54 -0700 + +linux (2.6.28-4.11) jaunty; urgency=low + + [ Mario Limonciello ] + + * SAUCE: Enable HDMI audio codec on Studio XPS 1340 + - LP: #309508 + + [ Tim Gardner ] + + * Fix armel d-i FTBSs + + [ Upstream Kernel Changes ] + + * USB: re-enable interface after driver unbinds + + -- Tim Gardner Tue, 13 Jan 2009 16:33:08 -0700 + +linux (2.6.28-4.10) jaunty; urgency=low + + [ Andy Whitcroft ] + + * update kernel bootloader recommends: to prefer grub + - LP: #314004 + * SAUCE: don't use buggy _BCL/_BCM/_BQC for backlight control + - LP: #311716 + * SAUCE: test-suspend -- add the suspend test scripts + - LP: #316419 + + [ Colin Watson ] + + * Enable udebs for armel + + 
[ Tim Gardner ] + + * SAUCE: Dell laptop digital mic does not work, PCI 1028:0271 + - LP: #309508 + * Enable CIFS_XATTR=y and CONFIG_CIFS_POSIX=y + - LP: #220658 + + -- Tim Gardner Thu, 08 Jan 2009 10:38:22 -0700 + +linux (2.6.28-4.9) jaunty; urgency=low + + [ Tim Gardner ] + + * Restore DM_CRYPT, AES, ECB, and CBC as modules. This fixes + some installer issues with encrypted /home and Private directories. + * Take one more stab at building armel without module or ABI errors. + + -- Tim Gardner Tue, 06 Jan 2009 08:38:23 -0700 + +linux (2.6.28-4.8) jaunty; urgency=low + + * Fix i386/amd64 FTBS by ignoring all module and ABI changes, + not something you would normally do, but I'm sure the ABI + has not changed. This will probably also allow the ARM builds to complete. + + -- Tim Gardner Mon, 05 Jan 2009 14:42:58 -0700 + +linux (2.6.28-4.7) jaunty; urgency=low + + [ Tim Gardner ] + + * Enable CONFIG_ATH5K=m for i386/amd64 + - LP: #306719 + * Build all i386/amd64 AGP/DRM components as modules. + - LP: #312721 + * git commands are now installed outside the default $PATH + Use 'git CMD' instead of 'git-CMD'. + * Build in most PATA/SATA drivers. This should allow most i386/amd64 systems to boot + without an initramfs, though some support work is still required in initramfs-tools + and grub. + - LP: #311730 + + -- Tim Gardner Fri, 02 Jan 2009 07:33:09 -0700 + +linux (2.6.28-4.6) jaunty; urgency=low + + [ Tim Gardner ] + + * Enable CONFIG_X86_E_POWERSAVER=m for i386 generic + - LP: #237405 + * Build i386 AGP drivers as modules + - LP: #312721 + * Build i386 DRM as a module + - LP: #312721 + + [ Upstream Kernel Changes ] + + * drm/i915: Add missing userland definitions for gem init/execbuffer. + - LP: #308387 + + -- Tim Gardner Mon, 29 Dec 2008 09:16:47 -0700 + +linux (2.6.28-4.5) jaunty; urgency=low + + [ Andy Whitcroft ] + + * clean up module dependancy information on package removal/purge + - LP: #300773 + + [ Tim Gardner ] + + * Update iscsitarget to 0.4.17 + * Build in ext{234} + * Build in Crypto modules AES, CBC, ECB + * Build in ACPI AC,BATTERY,BUTTON,FAN,PCI_SLOT,PROCESSOR,SBS,THERMAL,WMI + * Build in AGP intel,via,sis,ali,amd,amd64,efficeon,nvidia,sworks + * Build in ata,dev_dm,dev_loop,dev_md,dev_sd,dev_sr + * Build in BT l2cap,rfcomm,sco + * Reduce CONFIG_LEGACY_PTY_COUNT to 0 + * Build in CDROM_PKTCDVD and CHR_DEV_SG + * Build in CPU_FREQ + GOV_CONSERVATIVE,GOV_ONDEMAND,GOV_POWERSAVE,GOV_USERSPACE,STAT,TABLE + * Build in DM CRYPT,MIRROR,MULTIPATH,SNAPSHOT + * Build in DRM + * Build in HID + * Build in HOTPLUG PCI,PCIE + * Build in I2C + * Build in IEEE1394 OHCI1394 + * Build in INPUT EVDEV + * Build in IPV6 + * Build in MMC + * Build in PACKET + * Enable both IEEE1394 (Firewire) stacks as modules + - LP: #276463 + * Disable SUNRPC_REGISTER_V4 + - LP: #306016 + * Enable dm-raid4-5 + - LP: #309378 + * Build in PPP + * Build in RFKILL + * Build in USB SERIAL + + [ Upstream Kernel Changes ] + + * Rebased to v2.6.28 + + -- Tim Gardner Thu, 18 Dec 2008 21:18:44 -0700 + +linux (2.6.28-3.4) jaunty; urgency=low + + [ Tim Gardner ] + + * Build ecryptfs into the kernel + - LP: #302870 + * Deprecated gnbd + + [ Upstream Kernel Changes ] + + * Rebased to v2.6.28-rc8 + + -- Tim Gardner Wed, 10 Dec 2008 22:45:13 -0700 + +linux (2.6.28-2.3) jaunty; urgency=low + + [ Andy Whitcroft ] + + * update the templates so that we have spaces following the title line + + [ Tim Gardner ] + + * Add upload number to kernel version signature. 
This has the side effect + of renaming kernel packages back to the original way, e.g., without '-ub' + in the name. + + -- Tim Gardner Thu, 04 Dec 2008 12:18:31 -0700 + +linux (2.6.28-2.2) jaunty; urgency=low + + [ Andy Whitcroft ] + + * Revert "SAUCE: (no-up) version: Implement version_signature proc file." + * SAUCE: (no-up) version: Implement version_signature proc file. + * SAUCE: serial: RS485 ioctl structure uses __u32 include linux/types.h + - LP: #303711 + + [ Tim Gardner ] + + * UBUNTU: Removed CONFIG_DRM_VIA_CHROME9 since it is upstream. + * UBUNTU: Removed ubuntu/via_chrome9 + + [ Upstream Kernel Changes ] + + * Rebased to v2.6.28-rc7 + + -- Tim Gardner Tue, 02 Dec 2008 07:33:32 -0700 + +linux (2.6.28-1.1) jaunty; urgency=low + + [ Amit Kucheria ] + + * SAUCE: make fc transport removal of target configurable + * SAUCE: pm: Config option to disable handling of console during + suspend/resume + * SAUCE: Adds support for COMPAL JHL90 webcam + * Map armel to arm to all editconfigs to work correctly + * Add armel to getabis for completeness sake + * Add -ub to our versioning to allow kerneloops.org to identify us + + [ Andy Whitcroft ] + + * Fix Vcs-Git path for the kernel repository. + - LP: #296915 + + [ Ben Collins ] + + * SAUCE: Lower warning level of some PCI messages + - LP: #159241 + * SAUCE: input/mouse/alps: Do not call psmouse_reset() for alps + * SAUCE: tulip: Let dmfe handle davicom on non-sparc + * SAUCE: tulip: Define ULI PCI ID's + * SAUCE: (no-up) version: Implement version_signature proc file. + * SAUCE: (no-up) connector.h: Add idx/val for drbd + * SAUCE: (no-up) swap: Add notify_swap_entry_free callback for compcache + * SAUCE: drivers: Remove some duplicate device entries in various modules + * SAUCE: (no-up) [AppArmor] merge with upstream subversion r1291 + * SAUCE: (no-up) Enable ubuntu extra subdirectory + * SAUCE: (no-up) ACPI: initramfs DSDT override support + * ubuntu: Add drbd module + * ubuntu: Add iscsitarget module + * ubuntu: Add BOM for iscsitarget + * ubuntu: Add squashfs driver + * SAUCE: (no-up) Check for squashfs superblock in initramfs mounting. + * ubuntu: Add aufs module + * ubuntu: Added atl2 driver + * ubuntu: Added et131x driver + * ubuntu: Add dm-raid4-5 driver + * ubuntu: Add ndiswrapper driver + * ubuntu: Added ram backed compressed swap module (compcache) + * ubuntu: Add misc drivers from hardy lum + * ubuntu: Add heci driver 3.2.0.24 + * ubuntu: Add ov511 and bt-sco drivers + * ubuntu: Add acx, prism2_usb wireless drivers + * ubuntu: Add at76 driver to build + * ubuntu: Add fsam7400 sw kill switch driver + * ubuntu: Added qc-usb driver + * ubuntu: e1000e: Upgraded module to 0.4.1.7 + * ubuntu: Added rfkill drivers + * ubuntu: VIA - Add VIA DRM Chrome9 3D engine + * ubuntu: unionfs: Added v1.4 module from hardy + * ubuntu: Add LIRC driver + * ubuntu: Add GFS driver + * ubuntu: New tlsup driver for toshiba laptops + * SAUCE: (no-up) Export lookup_has for aufs + * SAUCE: (no-up) Modularize vesafb + * ubuntu: Config files + * Disable some modules that need porting to 2.6.28 + * ubuntu: Fixup headers creation to include arch/*/include + * ubuntu/module-check: Ignore comment lines + + [ Chuck Short ] + + * SAUCE: ata: blacklist FUJITSU MHW2160BH PL + + [ cking ] + + * SAUCE: Enable speedstep for sonoma processors. + + [ Colin Ian King ] + + * ubuntu: Add dm-loop + * SAUCE: cx88: Support Leadtek WinFast DTV2000 H version J. 
+ * SAUCE: fix kernel oops in VirtualBox during paravirt patching + * SAUCE: qc-usb: Enable Logitech QuickCam Messenger + * SAUCE: appleir: Enable driver for new MacBook Pro + + [ Colin Watson ] + + * Enable configfs, fuse, jfs, reiserfs, and xfs for armel + * Extend debian/d-i/ modules handling to make armel easier to support + * Create udebs for armel + + [ Fabio M. Di Nitto ] + + * ubuntu: update GFS Cluster File System + + [ Kees Cook ] + + * SAUCE: AppArmor: update to upstream subversion r1302 + + [ Leann Ogasawara ] + + * Add automatic model setting for Samsung Q45 + * Add Dell Dimension 9200 reboot quirk + + [ Mackenzie Morgan ] + + * SAUCE: Add quirk for ASUS Z37E to make sound audible after resume + + [ Matthew Garrett ] + + * SAUCE: hostap: send events on data interface as well as master + interface + + [ Michael Frey (Senior Manager, MID ] + + * SAUCE: Send HCI_RESET for Broadcomm 2046 + + [ Michael Haas ] + + * add proper aufs source tree from 20080922 + * Fix AUFS compilation in vfsub.c + * Add splice-2.6.23.patch from AUFS to export a symbol needed by AUFS + * Add put_filp.patch from AUFS to export a symbol needed by AUFS + * Add deny_write_access.patch from AUFS - export deny_write_access + * Add sec_perm-2.6.24.patch from AUFS - export security_inode_permission + * make sure TMPFS_MAGIC is defined in AUFS Makefile + * SAUCE: Revert aufs changes from AppArmor merge + + [ Mohamed Abbas ] + + * SAUCE: iwlagn -- fix rfkill when on when driver loaded + + [ Phillip Lougher ] + + * SAUCE: r8169: disable TSO by default for RTL8111/8168B chipsets. + + [ Stefan Bader ] + + * SAUCE: (no-up) Export dm_disk function of device-mapper + * SAUCE: Restore VT fonts on switch + * SAUCE: mmc: Increase power_up deleay to fix TI readers + * gfs1: GFS1 can't create more than 4kb file + * uvcvideo: Commit streaming parameters when enabling the video stream. + + [ Tim Gardner ] + + * SAUCE: Add extra headers to linux-libc-dev + * SAUCE: Catch nonsense keycodes and silently ignore + * SAUCE: Added support for HDAPS on various ThinkPads from Lenovo and IBM + * SAUCE: Guest OS does not recognize a lun with non zero target id on + Vmware ESX Server + * SAUCE: (no-up) Take care of orinoco_cs overlap with hostap_cs + * ubuntu: Add GNBD driver + * SAUCE: e1000e: Map NV RAM dynamically only when needed. + * SAUCE: Correctly blacklist Thinkpad r40e in ACPI + * SAUCE: Update Wacom tablet driver to 1.49 + * SAUCE: Fix Wacom tablet 1.49 porting errors + * SAUCE: Enable an e1000e Intel Corporation 82567 Gigabit controller + * SAUCE: Fix Oops in wlan_setup + * SAUCE: ipw2200: change default policy for auto-associate + * Dell Wireless 365 needs BTUSB_RESET quirk. + * ndiswrapper remote buffer overflows on long ESSIDs (CVE 2008-4395) + * Disabled ubuntu/e1000e config + + [ Upstream Kernel Changes ] + + * Revert "[Bluetooth] Eliminate checks for impossible conditions in IRQ + handler" + * Revert "x86, early_ioremap: fix fencepost error" + * mac80211: fix two issues in debugfs + * iwl3945: do not send scan command if channel count zero + + -- Ben Collins Fri, 07 Nov 2008 09:37:42 -0700 + +linux (2.6.27-8.17) intrepid-proposed; urgency=low + + [ John W. Linville ] + + * SAUCE: iwlagn: avoid sleep in softirq context + -LP: #286285 + + [ Tim Gardner ] + + * Dell Wireless 365 needs BTUSB_RESET quirk. 
+ - LP: #293670 + * SAUCE: ALSA: hda: make a STAC_DELL_EQ option (version 2) + - LP: #293271 + + [ Upstream Kernel Changes ] + + * iwlagn: downgrade BUG_ON in interrupt + * Input: atkbd - expand Latitude's force release quirk to other Dells + * fbcon_set_all_vcs: fix kernel crash when switching the rotated consoles + * modules: fix module "notes" kobject leak + * Driver core: Fix cleanup in device_create_vargs(). + * Driver core: Clarify device cleanup. + * ath9k/mac80211: disallow fragmentation in ath9k, report to userspace + * md: Fix rdev_size_store with size == 0 + * xfs: fix remount rw with unrecognized options + * OHCI: Allow broken controllers to auto-stop + * USB: OHCI: fix endless polling behavior + * USB: Fix s3c2410_udc usb speed handling + * USB: EHCI: log a warning if ehci-hcd is not loaded first + * usb gadget: cdc ethernet notification bugfix + * usb: musb_hdrc build fixes + * drm/i915: fix ioremap of a user address for non-root (CVE-2008-3831) + * DVB: au0828: add support for another USB id for Hauppauge HVR950Q + * DVB: sms1xxx: support two new revisions of the Hauppauge WinTV + MiniStick + * security: avoid calling a NULL function pointer in + drivers/video/tvaudio.c + * Linux 2.6.27.3 + -LP: #294152 + + * gpiolib: fix oops in gpio_get_value_cansleep() + * edac cell: fix incorrect edac_mode + * x86 ACPI: fix breakage of resume on 64-bit UP systems with SMP kernel + * sched: fix the wrong mask_len + * USB: cdc-wdm: make module autoload work + * USB: don't rebind drivers after failed resume or reset + * USB: fix memory leak in cdc-acm + * USB: Speedtouch: add pre_reset and post_reset routines + * dm kcopyd: avoid queue shuffle + * dm snapshot: fix primary_pe race + * amd_iommu: fix nasty bug that caused ILLEGAL_DEVICE_TABLE_ENTRY errors + * CIFS: fix saving of resume key before CIFSFindNext + * netfilter: xt_iprange: fix range inversion match + * netfilter: snmp nat leaks memory in case of failure + * netfilter: restore lost ifdef guarding defrag exception + * anon_vma_prepare: properly lock even newly allocated entries + * hvc_console: Fix free_irq in spinlocked section + * ACPI Suspend: Enable ACPI during resume if SCI_EN is not set + * ACPI suspend: Blacklist HP xw4600 Workstation for old code ordering + * ACPI suspend: Always use the 32-bit waking vector + * proc: fix vma display mismatch between /proc/pid/{maps,smaps} + * SCSI: scsi_dh: add Dell product information into rdac device handler + * PCI hotplug: cpqphp: fix kernel NULL pointer dereference + * V4L/DVB (9300): pvrusb2: Fix deadlock problem + * Linux 2.6.27.4 + -LP: #294155 + + -- Tim Gardner Tue, 04 Nov 2008 12:16:07 -0700 + +linux (2.6.27-7.16) intrepid-security; urgency=low + + [ Tim Gardner ] + + * ndiswrapper remote buffer overflows on long ESSIDs (CVE 2008-4395) + - LP: #275860 + + [ Upstream Kernel Changes ] + + * ext[234]: Avoid printk floods in the face of directory corruption + (CVE-2008-3528) + + -- Tim Gardner Mon, 03 Nov 2008 13:34:42 -0700 + +linux (2.6.27-7.15) intrepid-security; urgency=low + + [ Upstream Kernel Changes ] + + * tcp: Restore ordering of TCP options for the sake of inter-operability + - LP: #264019 + + -- Tim Gardner Mon, 27 Oct 2008 19:28:06 -0600 + +linux (2.6.27-7.14) intrepid; urgency=low + + [ Tim Gardner ] + + * Disable ath5k in 2.6.27 + - LP: #288148 + + -- Tim Gardner Thu, 23 Oct 2008 07:40:43 -0600 + +linux (2.6.27-7.13) intrepid; urgency=low + + [ Stefan Bader ] + + * gfs1: GFS1 can't create more than 4kb file + + [ Tim Gardner ] + + * Revert "SAUCE: x86: Reserve 
FIRST_DEVICE_VECTOR in used_vectors + bitmap.". Use upstream commit to avoid future conflicts. + * Revert "STABLE queue: mac80211: fix two issues in debugfs". + Use upstream commit to avoid future conflicts. + * Revert "x86, early_ioremap: fix fencepost error" + Use upstream commit to avoid future conflicts. + + [ Upstream Kernel Changes ] + + * sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq + * x86: Reserve FIRST_DEVICE_VECTOR in used_vectors bitmap. + * mac80211: fix two issues in debugfs + * Fix barrier fail detection in XFS + * tty: Termios locking - sort out real_tty confusions and lock reads + * CIFS: make sure we have the right resume info before calling + CIFSFindNext + * rfkill: update LEDs for all state changes + * libertas: clear current command on card removal + * b43legacy: Fix failure in rate-adjustment mechanism + * x86, early_ioremap: fix fencepost error + * x86: SB450: skip IRQ0 override if it is not routed to INT2 of IOAPIC + * x86: improve UP kernel when CPU-hotplug and SMP is enabled + * sky2: Fix WOL regression + * netdrvr: atl1e: Don't take the mdio_lock in atl1e_probe + * Linux 2.6.27.2 + + [ Amit Kucheria ] + + * Ubuntu: agp: Fix stolen memory counting on G4X. + -LP: 285572 + + [ Scott Remnant ] + + * add MODULE_ALIAS to load ipmi_devintf with ipmi_si + + -- Tim Gardner Sun, 19 Oct 2008 10:06:21 -0600 + +linux (2.6.27-7.12) intrepid; urgency=low + + [ Chuck Short ] + + * xen: Add xen modules to virtual flavours. + + [ Mario Limonciello ] + + * SAUCE: Add back in lost commit for Apple BT Wireless Keyboard + - LP: #162083 + + [ Tim Gardner ] + + * Remove depmod created files from packages. + - LP: #250511 + * Changed default TCP congestion algorithm to 'cubic' (again) + - LP: #278801 + * Update configs for 'disable CONFIG_DYNAMIC_FTRACE' + - LP: #263555 + + [ Upstream Kernel Changes ] + + * x86: register a platform RTC device if PNP doesn't describe it + * disable CONFIG_DYNAMIC_FTRACE due to possible memory corruption on + module unload + + -- Tim Gardner Fri, 17 Oct 2008 11:25:39 -0600 + +linux (2.6.27-7.11) intrepid; urgency=low + + [ Amit Kucheria ] + + * STABLE queue: mac80211: fix two issues in debugfs + - LP: #275227 + * SAUCE: Adds support for COMPAL JHL90 webcam + + [ Ben Collins ] + + * SAUCE: (no-up) x86: Quiet "Kernel alive" messages + - LP: #39985 + * SAUCE: (no-up) Modularize vesafb + * build/config: Enable vesafb module + * build: Switch to vesafb as preferred. 
+ + [ Leann Ogasawara ] + + * Add Dell Dimension 9200 reboot quirk + - LP: #271370 + + [ Michael Haas ] + + * SAUCE: Revert aufs changes from AppArmor merge + + [ Tim Gardner ] + + * fix virtio udeb layout + - LP: #257739 + * Enabled CONFIG_EXT4DEV_FS=m + * Changed default TCP congestion algorithm to 'cubic' + - LP: #278801 + * SAUCE: ipw2200: change default policy for auto-associate + - LP: #264104 + + [ Upstream Kernel Changes ] + + * x86, early_ioremap: fix fencepost error + - LP: #263543 + + -- Tim Gardner Sat, 11 Oct 2008 08:07:42 -0600 + +linux (2.6.27-7.10) intrepid; urgency=low + + [ Alexey Starikovskiy ] + + * SAUCE: ACPI: EC: do transaction from interrupt context + - LP: #277802 + + [ Ben Collins ] + + * build/d-i: Change virtio-modules udeb to prio standard + + [ Colin Ian King ] + + * SAUCE: Blacklist IBM 2656 in serio/i8042 + - LP: #21558 + + [ Henrik Rydberg ] + + * Revert "SAUCE: applesmc: Add MacBookAir" + * SAUCE: [PATCH 1/5] hwmon: applesmc: Specified number of bytes to read + should match actual + * SAUCE: [PATCH 2/5] hwmon: applesmc: Fix the 'wait status failed: c != + 8' problem + * SAUCE: [PATCH 3/5] hwmon: applesmc: Prolong status wait + * SAUCE: [PATCH 4/5] hwmon: applesmc: Allow for variable ALV0 and ALV1 + package length + * SAUCE: [PATCH 5/5] hwmon: applesmc: Add support for Macbook Air + * SAUCE: hwmon: applesmc: Add support for Macbook Pro 4 + * SAUCE: hwmon: applesmc: Add support for Macbook Pro 3 + * SAUCE: hwmon: applesmc: Lighter wait mechanism, drastic improvement + + [ Leann Ogasawara ] + + * Add automatic model setting for Samsung Q45 + - LP: #200210 + + [ Tim Gardner ] + + * SAUCE: Correctly blacklist Thinkpad r40e in ACPI + - LP: #278794 + * SAUCE: Update Wacom tablet driver to 1.49 + - LP: #260675 + * SAUCE: ALPS touchpad for Dell Latitude E6500/E6400 + - LP: #270643 + * SAUCE: Fix Wacom tablet 1.49 porting errors + * SAUCE: Enable an e1000e Intel Corporation 82567 Gigabit controller + * SAUCE: Fix Oops in wlan_setup + - LP: #263309 + + [ Upstream Kernel Changes ] + + * ath9k: fix oops on trying to hold the wrong spinlock + * [Bluetooth] Fix double frees on error paths of btusb and bpa10x drivers + * [Bluetooth] Add reset quirk for new Targus and Belkin dongles + * [Bluetooth] Add reset quirk for A-Link BlueUSB21 dongle + * Revert "ax25: Fix std timer socket destroy handling." + * ax25: Quick fix for making sure unaccepted sockets get destroyed. + * netrom: Fix sock_orphan() use in nr_release + * Revert "V4L/DVB (8904): cx88: add missing unlock_kernel" + * SLOB: fix bogus ksize calculation + * net: only invoke dev->change_rx_flags when device is UP + * tcp: Fix possible double-ack w/ user dma + * net: Fix netdev_run_todo dead-lock + * tcp: Fix tcp_hybla zero congestion window growth with small rho and large cwnd. 
+ * [MIPS] Sibyte: Register PIO PATA device only for Swarm and Litte Sur + * eeepc-laptop: Fix hwmon interface + * hwmon: (it87) Prevent power-off on Shuttle SN68PT + * hwmon: Define sysfs interface for energy consumption register + * hwmon: (adt7473) Fix some bogosity in documentation file + * hwmon: (abituguru3) Enable reading from AUX3 fan on Abit AT8 32X + * hwmon: (abituguru3) Enable DMI probing feature on Abit AT8 32X + * [CPUFREQ] correct broken links and email addresses + * SLOB: fix bogus ksize calculation fix + * Don't allow splice() to files opened with O_APPEND + * Linux 2.6.27 + + -- Tim Gardner Wed, 08 Oct 2008 21:19:34 -0600 + +linux (2.6.27-6.9) intrepid; urgency=low + + [ Kees Cook ] + + * SAUCE: AppArmor: update to upstream subversion r1302 + - LP: #269921 + + [ Stefan Bader ] + + * Update configuration files to be compliant to desktop specs + - LP: #279019 + + [ Tim Gardner ] + + * Add support in e1000e for a couple of ICH10 PCI IDs + * Enable CONFIG_INPUT_PCSPKR=m + - LP: #275453 + + [ Upstream Kernel Changes ] + + * V4L/DVB (8559a): Fix a merge conflict at gspca/sonixb + * V4L/DVB (8789): wm8739: remove wrong kfree + * V4L/DVB (8883): w9968cf: Fix order of usb_alloc_urb validation + * V4L/DVB (8884): em28xx-audio: fix memory leak + * V4L/DVB (8885): cpia2_usb: fix memory leak + * V4L/DVB (8886): ov511: fix memory leak + * V4L/DVB (8887): gspca: fix memory leak + * V4L/DVB (8892): pvrusb2: Handle USB ID 2040:2950 same as 2040:2900 + * V4L/DVB (8904): cx88: add missing unlock_kernel + * V4L/DVB (8905): ov511: fix exposure sysfs attribute bug + * V4L/DVB (8909): gspca: PAC 7302 webcam 093a:262a added. + * hrtimer: migrate pending list on cpu offline + * hrtimer: fix migration of CB_IRQSAFE_NO_SOFTIRQ hrtimers + * hrtimer: mark migration state + * hrtimer: prevent migration of per CPU hrtimers + * [IA64] Put the space for cpu0 per-cpu area into .data section + * powerpc: Fix PCI in Holly device tree + * powerpc: Fix failure to shutdown with CPU hotplug + * mfd: Fix Kconfig accroding to the new gpiolib symbols + * mfd: Fix asic3 compilation + * x86: fix typo in enable_mtrr_cleanup early parameter + * ipsec: Fix pskb_expand_head corruption in xfrm_state_check_space + * iucv: Fix mismerge again. 
+ * ALSA: ASoC: Fix cs4270 error path + * ALSA: hda - Fix model for Dell Inspiron 1525 + * sctp: Fix kernel panic while process protocol violation parameter + * x86: Fix broken LDT access in VMI + * x86, vmi: fix broken LDT access + * tcp: Fix NULL dereference in tcp_4_send_ack() + * ipv6: NULL pointer dereferrence in tcp_v6_send_ack + * XFRM,IPv6: initialize ip6_dst_blackhole_ops.kmem_cachep + * af_key: Free dumping state on socket close + * dm: always allow one page in dm_merge_bvec + * dm: cope with access beyond end of device in dm_merge_bvec + * dm mpath: add missing path switching locking + * MN10300: Fix IRQ handling + * pxa2xx_spi: fix build breakage + * e1000e: write protect ICHx NVM to prevent malicious write/erase + * powerpc: Fix boot hang regression on MPC8544DS + * ASoC: Set correct name for WM8753 rec mixer output + * ALSA: snd-powermac: mixers for PowerMac G4 AGP + * ALSA: snd-powermac: HP detection for 1st iMac G3 SL + * fbcon: fix monochrome color value calculation + * inotify: fix lock ordering wrt do_page_fault's mmap_sem + * braille_console: only register notifiers when the braille console is used + * fix error-path NULL deref in alloc_posix_timer() + * memory hotplug: missing zone->lock in test_pages_isolated() + * mm: tiny-shmem nommu fix + * mm: handle initialising compound pages at orders greater than MAX_ORDER + * e1000e: reset swflag after resetting hardware + * e1000e: do not ever sleep in interrupt context + * e1000e: remove phy read from inside spinlock + * e1000e: drop stats lock + * e1000e: debug contention on NVM SWFLAG + * e1000e: update version from k4 to k6 + * Check mapped ranges on sysfs resource files + * e1000e: Fix incorrect debug warning + * [MIPS] Build fix: Fix irq flags type + * [MIPS] SMTC: Build fix: Fix filename in Makefile + * [MIPS] SMTC: Fix holes in SMTC and FPU affinity support. + * [MIPS] SMTC: Close tiny holes in the SMTC IPI replay system. + * [MIPS] SMTC: Fix SMTC dyntick support. + * [S390] nohz: Fix __udelay. + * [S390] qdio: prevent stack clobber + * Fix init/main.c to use regular printk with '%pF' for initcall fn + * x86 setup: correct segfault in generation of 32-bit reloc kernel + * selinux: Fix an uninitialized variable BUG/panic in selinux_secattr_to_sid() + * rtc: fix kernel panic on second use of SIGIO nofitication + * fbdev: fix recursive notifier and locking when fbdev console is blanked + * orion_spi: fix handling of default transfer speed + * include/linux/stacktrace.h: declare struct task_struct + * cpusets: remove pj from cpuset maintainers + * MAINTAINERS: add mailing list for man-pages + * SubmitChecklist: interfaces changes should CC linux-api@ + * Documentation/HOWTO: info about interface changes should CC linux-api@vger + * dw_dmac: fix copy/paste bug in tasklet + * leds-fsg: change order of initialization and deinitialization + * leds-pca955x: add proper error handling and fix bogus memory handling + * ACPI: Make /proc/acpi/wakeup interface handle PCI devices (again) + * clockevents: check broadcast tick device not the clock events device + * V4L/DVB (8919): cx18: Fix tuner audio input for Compro H900 cards + * V4L/DVB (8926): gspca: Bad fix of leak memory (changeset 43d2ead315b1). + * V4L/DVB (8933): gspca: Disable light frquency for zc3xx cs2102 Kokom. 
+ * V4L/DVB (8935): em28xx-cards: Remove duplicate entry (EM2800_BOARD_KWORLD_USB2800) + * V4L/DVB (8955): bttv: Prevent NULL pointer dereference in radio_open + * V4L/DVB (8957): zr36067: Restore the default pixel format + * V4L/DVB (8958): zr36067: Return proper bytes-per-line value + * V4L/DVB (8960): drivers/media/video/cafe_ccic.c needs mm.h + * V4L/DVB (8961): zr36067: Fix RGBR pixel format + * V4L/DVB (8963): s2255drv field count fix + * V4L/DVB (8967): Use correct XC3028L firmware for AMD ATI TV Wonder 600 + * V4L/DVB (8978): sms1xxx: fix product name for Hauppauge WinTV MiniStick + * V4L/DVB (8979): sms1xxx: Add new USB product ID for Hauppauge WinTV MiniStick + * V4L/DVB (9029): Fix deadlock in demux code + * V4L/DVB (9037): Fix support for Hauppauge Nova-S SE + * V4L/DVB (9043): S5H1420: Fix size of shadow-array to avoid overflow + * V4L/DVB (9053): fix buffer overflow in uvc-video + * V4L/DVB (9075): gspca: Bad check of returned status in i2c_read() spca561. + * V4L/DVB (9080): gspca: Add a delay after writing to the sonixj sensors. + * V4L/DVB (9092): gspca: Bad init values for sonixj ov7660. + * V4L/DVB (9099): em28xx: Add detection for K-WORLD DVB-T 310U + * V4L/DVB (9103): em28xx: HVR-900 B3C0 - fix audio clicking issue + * x86: gart iommu have direct mapping when agp is present too + * ide-cd: temporary tray close fix + * ide-dma: fix ide_build_dmatable() for TRM290 + * IDE: Fix platform device registration in Swarm IDE driver (v2) + * ide-cd: Optiarc DVD RW AD-7200A does play audio + * ide: workaround for bogus gcc warning in ide_sysfs_register_port() + * [MIPS] Fix CMP Kconfig configuration and mark as broken. + * [MIPS] IP27: Fix build errors if CONFIG_MAPPED_KERNEL=y + * x86 ACPI: Blacklist two HP machines with buggy BIOSes + * kgdb, x86: Avoid invoking kgdb_nmicallback twice per NMI + * kgdb: call touch_softlockup_watchdog on resume + * atmel-mci: Initialize BLKR before sending data transfer command + * Marker depmod fix core kernel list + * Linux 2.6.27-rc9 + + -- Tim Gardner Sun, 05 Oct 2008 21:27:49 -0600 + +linux (2.6.27-5.8) intrepid; urgency=low + + [ Amit Kucheria ] + + * Update AUFS-related Kconfig + - LP: #264048 + + [ Michael Haas ] + + * add proper aufs source tree from 20080922 + * Fix AUFS compilation in vfsub.c + * Add splice-2.6.23.patch from AUFS to export a symbol needed by AUFS + * Add put_filp.patch from AUFS to export a symbol needed by AUFS + * apply (modified) lhash.patch from AUFS to export __lookup_hash() + * Add deny_write_access.patch from AUFS - export deny_write_access + * Add sec_perm-2.6.24.patch from AUFS - export security_inode_permission + * make sure TMPFS_MAGIC is defined in AUFS Makefile + + [ Tim Gardner ] + + * Enabled CONFIG_IPWIRELESS + - LP: #274748 + * Enabled CONFIG_E1000E, disabled CONFIG_E1000E_NEW + This takes advantage of the upstream NVM protection fix in + commit 4a7703582836f55a1cbad0e2c1c6ebbee3f9b3a7. + + [ Upstream Kernel Changes ] + + * Revert "[Bluetooth] Eliminate checks for impossible conditions in IRQ + handler" + * [SCSI] qla2xxx: Defer enablement of RISC interrupts until ISP + initialization completes. 
+ * PCI: Fix pcie_aspm=force + * PCI: fix compiler warnings in pci_get_subsys() + * UBIFS: create the name of the background thread in every case + * UBIFS: TNC / GC race fixes + * UBIFS: remove incorrect assert + * UBIFS: fix printk format warnings + * AMD IOMMU: set iommu sunc flag after command queuing + * AMD IOMMU: protect completion wait loop with iommu lock + * sparc64: Fix disappearing PCI devices on e3500. + * x86, oprofile: BUG scheduling while atomic + * ALSA: ASoC: Fix at32-pcm build breakage with PM enabled + * ath9k: connectivity is lost after Group rekeying is done + * wireless: zd1211rw: add device ID fix wifi dongle "trust nw-3100" + * [IA64] Ski simulator doesn't need check_sal_cache_flush + * [IA64] kexec fails on systems with blocks of uncached memory + * ath9k: Fix IRQ nobody cared issue with ath9k + * [Bluetooth] Fix I/O errors on MacBooks with Broadcom chips + * [Bluetooth] Fix wrong URB handling of btusb driver + * [Bluetooth] Fix USB disconnect handling of btusb driver + * sparc64: Fix missing devices due to PCI bridge test in + of_create_pci_dev(). + * [WATCHDOG] ibmasr: remove unnecessary spin_unlock() + * [WATCHDOG] wdt285: fix sparse warnings + * [WATCHDOG] unlocked_ioctl changes + * x86: fix 27-rc crash on vsmp due to paravirt during module load + * sched: fix init_hrtick() section mismatch warning + * clockevents: prevent cpu online to interfere with nohz + * x86: prevent stale state of c1e_mask across CPU offline/online + * clockevents: prevent stale tick_next_period for onlining CPUs + * clockevents: check broadcast device not tick device + * clockevents: prevent mode mismatch on cpu online + * x86: prevent C-states hang on AMD C1E enabled machines + * x86: c1e_idle: don't mark TSC unstable if CPU has invariant TSC + * timers: fix build error in !oneshot case + * ALSA: ASoC: maintainers - update email address for Liam Girdwood + * ibmasr: remove unnecessary spin_unlock() + * smb.h: do not include linux/time.h in userspace + * kernel-doc: allow structs whose members are all private + * kexec: fix segmentation fault in kimage_add_entry + * Documentation/DMA-mapping.txt: update for pci_dma_mapping_error() + changes + * sys_paccept: disable paccept() until API design is resolved + * mm: tiny-shmem fix lock ordering: mmap_sem vs i_mutex + * Documentation/sysctl/kernel.txt: fix softlockup_thresh description + * memcg: check under limit at shrink_usage + * atmel_serial: update the powersave handler to match serial core + * [SCSI] Fix hang with split requests + * USB Storage: Sierra: Non-configurable TRU-Install + * USB Serial: Sierra: Device addition & version rev + * USB: ehci: fix some ehci hangs and crashes + * USB: Fix the Nokia 6300 storage-mode. 
+ * USB: Correct Sierra Wireless USB EVDO Modem Device ID + * USB: fix hcd interrupt disabling + * USB: update of Documentation/usb/anchors.txt + * usb gadget: fix omap_udc DMA regression + * USB: Fixing Nokia 3310c in storage mode + * usb: musb: fix include path + * USB: fix EHCI periodic transfers + * usb-serial: Add Siemens EF81 to PL-2303 hack triggers + * USB: SERIAL CP2101 add device IDs + * USB: unusual_devs addition for RockChip MP3 player + * USB: fsl_usb2_udc: fix VDBG() format string + * usb serial: ti_usb_3410_5052 obviously broken by firmware changes + * USB: ftdi_sio: Add 0x5050/0x0900 USB IDs (Papouch Quido USB 4/4) + * USB: serial: add ZTE CDMA Tech id to option driver + * USB Serial: Sierra: Add MC8785 VID/PID + * USB: drivers/usb/musb/: disable it on SuperH + * usb: ftdi_sio: add support for Domintell devices + * usb: unusual devs patch for Nokia 5310 Music Xpress + * USB: revert recovery from transient errors + * [MIPS] au1000: Fix gpio direction + * [MIPS] Fixe the definition of PTRS_PER_PGD + * x86: prevent stale state of c1e_mask across CPU offline/online, fix + * x86: disable apm on the olpc + * i2c-powermac: Fix section for probe and remove functions + * i2c-dev: Return correct error code on class_create() failure + * i2c: Fix mailing lists in two MAINTAINERS entries + * ath9k: disable MIB interrupts to fix interrupt storm + * 9p: implement proper trans module refcounting and unregistration + * 9p-trans_fd: fix trans_fd::p9_conn_destroy() + * 9p-trans_fd: clean up p9_conn_create() + * 9p-trans_fd: don't do fs segment mangling in p9_fd_poll() + * 9p-trans_fd: fix and clean up module init/exit paths + * 9p: introduce missing kfree + * 9p: use an IS_ERR test rather than a NULL test + * 9p: fix put_data error handling + * netfilter: ip6t_{hbh,dst}: Rejects not-strict mode on rule insertion + * MN10300: Move asm-arm/cnt32_to_63.h to include/linux/ + * MN10300: Make sched_clock() report time since boot + * ALSA: fix locking in snd_pcm_open*() and snd_rawmidi_open*() + * ALSA: remove unneeded power_mutex lock in snd_pcm_drop + * IPoIB: Fix crash when path record fails after path flush + * [XFS] Fix extent list corruption in xfs_iext_irec_compact_full(). + * [XFS] Remove xfs_iext_irec_compact_full() + * kgdb: could not write to the last of valid memory with kgdb + * kgdb, x86, arm, mips, powerpc: ignore user space single stepping + * kgdb, x86_64: gdb serial has BX and DX reversed + * kgdb, x86_64: fix PS CS SS registers in gdb serial + * kgdboc,tty: Fix tty polling search to use name correctly + * ARM: Delete ARM's own cnt32_to_63.h + * m32r: remove the unused NOHIGHMEM option + * m32r: don't offer CONFIG_ISA + * m32r: export empty_zero_page + * m32r: export __ndelay + * m32r/kernel/: cleanups + * [MIPS] au1000: Make sure GPIO value is zero or one + * [MIPS] IP27: Switch to dynamic interrupt routing avoding panic on + error. 
+ * [MIPS] BCM47xx: Fix build error due to missing PCI functions + * [SSB] Initialise dma_mask for SSB_BUSTYPE_SSB devices + * Swarm: Fix crash due to missing initialization + * ide-tape: fix vendor strings + * ide: note that IDE generic may prevent other drivers from attaching + * cdrom: update ioctl documentation + * [SCSI] qlogicpti: fix sg list traversal error in continuation entries + * sata_nv: reinstate nv_hardreset() for non generic controllers + * scsi: fix fall out of sg-chaining patch in qlogicpti + * ALSA: make the CS4270 driver a new-style I2C driver + * ALSA: ASoC: Fix another cs4270 error path + * Fix NULL pointer dereference in proc_sys_compare + * kconfig: fix silentoldconfig + * kconfig: readd lost change count + * mm owner: fix race between swapoff and exit + * Linux 2.6.27-rc8 + * e1000e: write protect ICHx NVM to prevent malicious write/erase + + -- Amit Kucheria Tue, 30 Sep 2008 18:22:35 +0300 + +linux (2.6.27-4.7) intrepid; urgency=low + + [ Ben Collins ] + + * build/abi: Add gfs1 to perm blacklist + * build/abi: Ignored changes in gfs2 symbols + + [ Fabio M. Di Nitto ] + + * Revert "SAUCE: Export gfs2 symbols required for gfs1 kernel module" + * ubuntu: update GFS Cluster File System + + [ Stefan Bader ] + + * SAUCE: x86: Reserve FIRST_DEVICE_VECTOR in used_vectors bitmap. + - LP: #276334 + + [ Tim Gardner ] + + * Revert "Disable e1000e until the NVRAM corruption problem is found." + * Add atl1e and atl2 to Debian installer bits + - LP: #273904 + * SAUCE: e1000e: Map NV RAM dynamically only when needed. + - LP: #263555 + + -- Tim Gardner Fri, 26 Sep 2008 20:51:22 -0600 + +linux (2.6.27-4.6) intrepid; urgency=low + + [ Tim Gardner ] + + * Disable e1000e until the NVRAM corruption problem is found. + - LP: #263555 + + [ Upstream Kernel Changes ] + + * Revert "[Bluetooth] Eliminate checks for impossible conditions in IRQ + handler" + + -- Ben Collins Tue, 23 Sep 2008 09:53:57 -0400 + +linux (2.6.27-4.5) intrepid; urgency=low + + [ Upstream Kernel Changes ] + + * Revert "b43/b43legacy: add RFKILL_STATE_HARD_BLOCKED support" + * udf: Fix lock inversion between iprune_mutex and alloc_mutex (v2) + * udf: Fix error paths in udf_new_inode() + * [SCSI] sd: select CRC_T10DIF only when necessary + * [SCSI] zfcp: Fix request queue locking + * [SCSI] zfcp: Correctly query end flag in gpn_ft response + * [SCSI] zfcp: Simplify ccw notify handler + * [SCSI] zfcp: Fix reference counter for remote ports + * [SCSI] zfcp: channel cannot be detached due to refcount imbalance + * [SCSI] zfcp: Remove duplicated unlikely() macros. + * [SCSI] scsi_dh: make check_sense return ADD_TO_MLQUEUE + * [SCSI] make scsi_check_sense HARDWARE_ERROR return ADD_TO_MLQUEUE on + retry + * [SCSI] fix check of PQ and PDT bits for WLUNs + * pcm037: add rts/cts support for serial port + * i.MX serial: fix init failure + * imx serial: set RXD mux bit on i.MX27 and i.MX31 + * imx serial: fix rts handling for non imx1 based hardware + * mlx4_core: Set RAE and init mtt_sz field in FRMR MPT entries + * udf: add llseek method + * PCI/iommu: blacklist DMAR on Intel G31/G33 chipsets + * PCI: Fix printk warnings in probe.c + * PCI: Fix printk warnings in setup-bus.c + * PCI Hotplug: fakephp: fix deadlock... again + * clockevents: remove WARN_ON which was used to gather information + * ocfs2: Fix a bug in direct IO read. 
+ * arch/x86/kernel/kdebugfs.c: introduce missing kfree + * [IA64] fix compile failure with non modular builds + * [IA64] fix up bte.h + * [IA64] arch/ia64/sn/pci/tioca_provider.c: introduce missing kfree + * PCI: fix pciehp_free_irq() + * [IA64] prevent ia64 from invoking irq handlers on offline CPUs + * ide: Fix pointer arithmetic in hpt3xx driver code (3rd try) + * add deprecated ide-scsi to feature-removal-schedule.txt + * swiotlb: fix back-off path when memory allocation fails + * sparc64: Fix interrupt register calculations on Psycho and Sabre. + * VIDEO_SH_MOBILE_CEU should depend on HAS_DMA + * m68k: Update defconfigs for 2.6.27-rc6 + * sparc32: Fix function signature of of_bus_sbus_get_flags(). + * sched: fix 2.6.27-rc5 couldn't boot on tulsa machine randomly + * sched: fix deadlock in setting scheduler parameter to zero + * KVM: SVM: fix random segfaults with NPT enabled + * KVM: SVM: fix guest global tlb flushes with NPT + * KVM: VMX: Always return old for clear_flush_young() when using EPT + * clocksource, acpi_pm.c: fix check for monotonicity + * [ARM] OMAP: Fix MMC device data + * block: disable sysfs parts of the disk command filter + * ath9k: Assign seq# when mac80211 requests this + * sg: disable interrupts inside sg_copy_buffer + * MN10300: Change the fault handler to check in_atomic() not + in_interrupt() + * [Bluetooth] Fix regression from using default link policy + * netlink: fix overrun in attribute iteration + * x86: fix possible x86_64 and EFI regression + * sparc64: Fix PCI error interrupt registry on PSYCHO. + * sparc: Fix user_regset 'n' field values. + * niu: panic on reset + * PCI: re-add debug prints for unmodified BARs + * [ARM] 5245/1: Fix warning about unused return value in drivers/pcmcia + * [ARM] 5246/1: tosa: add proper clock alias for tc6393xb clock + * [ARM] 5247/1: tosa: SW_EAR_IN support + * [ARM] Fix PCI_DMA_BUS_IS_PHYS for ARM + * ata: duplicate variable sparse warning + * sata_inic162x: enable LED blinking + * [libata] LBA28/LBA48 off-by-one bug in ata.h + * proc: more debugging for "already registered" case + * include/linux/ioport.h: add missing macro argument for devm_release_* + family + * cpuset: avoid changing cpuset's cpus when -errno returned + * cpuset: hotplug documentation fix + * coredump_filter: add description of bit 4 + * bfs: fix Lockdep warning + * mm: ifdef Quicklists in /proc/meminfo + * spi_mpc83xx: fix clockrate calculation for low speed + * spi_mpc83xx: reject invalid transfer sizes + * pxa2xx_spi: chipselect bugfixes + * pxa2xx_spi: dma bugfixes + * mm: mark the correct zone as full when scanning zonelists + * Documentation/ABI: /sys/class/gpio + * MAINTAINERS: fix USB VIDEO CLASS mail list address + * ia64: fix panic during `modprobe -r xpc' + * atmel_lcdfb: disable LCD and DMA engines when suspending + * spi_s3c24xx: fix section warning + * rescan_partitions(): make device capacity errors non-fatal + * memstick: fix MSProHG 8-bit interface mode support + * Add Uwe Kleine-König to .mailmap + * xen: fix for xen guest with mem > 3.7G + * x86/paravirt: Remove duplicate paravirt_pagetable_setup_{start, done}() + * crypto: talitos - Avoid consecutive packets going out with same IV + * slub: fixed uninitialized counter in struct kmem_cache_node + * udp: Fix rcv socket locking + * IB/mlx4: Fix up fast register page list format + * [MIPS] VR41xx: unsigned irq cannot be negative + * x86: completely disable NOPL on 32 bits + * [S390] cio: Fix driver_data handling for ccwgroup devices. 
+ * [S390] cio: fix orb initialization in cio_start_key + * sparc64: Fix OOPS in psycho_pcierr_intr_other(). + * sparc64: Fix SMP bootup with CONFIG_STACK_DEBUG or ftrace. + * RDMA/nes: Fix client side QP destroy + * IPoIB: Fix deadlock on RTNL between bcast join comp and ipoib_stop() + * clockevents: make device shutdown robust + * powerpc: Fix interrupt values for DMA2 in MPC8610 HPCD device tree + * hpplus: fix build regression + * Fix PNP build failure, bugzilla #11276 + * warn: Turn the netdev timeout WARN_ON() into a WARN() + * [XFS] Move memory allocations for log tracing out of the critical path + * [XFS] Fix regression introduced by remount fixup + * [XFS] Prevent direct I/O from mapping extents beyond eof + * [XFS] Fix barrier status change detection. + * [XFS] Prevent lockdep false positives when locking two inodes. + * [XFS] Fix use-after-free with buffers + * [XFS] Don't do I/O beyond eof when unreserving space + * powerpc: Holly board needs dtbImage target + * Fix compile failure with non modular builds + * [ARM] 5249/1: davinci: remove redundant check in davinci_psc_config() + * [ARM] omap: back out 'internal_clock' support + * sctp: set the skb->ip_summed correctly when sending over loopback. + * [ARM] 5255/1: Update jornada ssp to remove build errors/warnings + * sctp: do not enable peer features if we can't do them. + * sctp: Fix oops when INIT-ACK indicates that peer doesn't support AUTH + * bnx2: Promote vector field in bnx2_irq structure from u16 to unsigned + int + * forcedeth: call restore mac addr in nv_shutdown path + * e1000: prevent corruption of EEPROM/NVM + * e100: Use pci_pme_active to clear PME_Status and disable PME# + * md: Don't wait UNINTERRUPTIBLE for other resync to finish + * atstk1000: fix build breakage with BOARD_ATSTK100X_SW2_CUSTOM=y + * avr32: add .gitignore files + * avr32: add generic_find_next_le_bit bit function + * avr32: fix sys_sync_file_range() call convention + * avr32: nmi_enter() without nmi_exit() + * KVM: ia64: 'struct fdesc' build fix + * hwmon: (atxp1) Fix device detection logic + * hwmon: (it87) Fix fan tachometer reading in IT8712F rev 0x7 (I) + * hwmon: (ad7414) Make ad7414_update_device() static + * tmio_mmc: fix compilation with debug enabled + * atmel-mci: debugfs: enable clock before dumping regs + * atmel-mci: Fix memory leak in atmci_regs_show + * atmel-mci: Fix bogus debugfs file size + * atmel-mci: Set MMC_CAP_NEEDS_POLL if no detect_pin + * mmc_block: handle error from mmc_register_driver() + * mmc_test: initialize mmc_test_lock statically + * [MIPS] Fix 64-bit IP checksum code + * [MIPS] SMTC: Clear TIF_FPUBOUND on clone / fork. + * [MIPS] Fix potential latency problem due to non-atomic cpu_wait. 
+ * [MIPS] vmlinux.lds.S: handle .text.* + * MAINTAINERS: Trivial whitespace cleanups + * MAINTAINERS: Various fixes + * Linux 2.6.27-rc7 + + -- Tim Gardner Sun, 21 Sep 2008 21:49:28 -0600 + +linux (2.6.27-3.4) intrepid; urgency=low + + [ Colin Ian King ] + + * SAUCE: fix kernel oops in VirtualBox during paravirt patching + - LP: #246067 + * SAUCE: qc-usb: Enable Logitech QuickCam Messenger + - LP: #209901 + * SAUCE: appleir: Enable driver for new MacBook Pro + - LP: #157919 + + [ Tim Gardner ] + + * Enabled CONFIG_DEBUG_RODATA=y + + [ Upstream Kernel Changes ] + + * Revert "ALSA: hda - Added model selection for iMac 24"" + * Revert "x86: fix HPET regression in 2.6.26 versus 2.6.25, check hpet + against BAR, v3" + * Revert "[ARM] use the new byteorder headers" + * Revert "mac80211: Use IWEVASSOCREQIE instead of IWEVCUSTOM" + * Revert "crypto: camellia - Use kernel-provided bitops, unaligned access + helpers" + * svcrdma: Fix race between svc_rdma_recvfrom thread and the dto_tasklet + * sched, cpuset: rework sched domains and CPU hotplug handling (v4) + * ACPI: Fix now signed module parameter. + * ACPI: Change package length error to warning + * ACPI: Fix now signed module parameter. + * ACPI: Fix typo in "Disable MWAIT via DMI on broken Compal board" + * acpi: add checking for NULL early param + * UBIFS: fix zero-length truncations + * Input: bcm5974 - add maintainer entry + * sh64: re-add the __strnlen_user() prototype + * sh: fix ptrace_64.c:user_disable_single_step() + * PNPACPI: ignore the producer/consumer bit for extended IRQ descriptors + * UBIFS: always read hashed-key nodes under TNC mutex + * UBIFS: allow for racing between GC and TNC + * [CIFS] Fix plaintext authentication + * sparc32: Implement smp_call_function_single(). + * sh: crash kernel resource fix + * sh: fix kexec entry point for crash kernels + * sh: fix platform_resource_setup_memory() section mismatch + * sh: update Migo-R defconfig + * sh: update AP325RXA defconfig + * sh: fix semtimedop syscall + * cifs: fix O_APPEND on directio mounts + * [CIFS] update cifs change log + * [CIFS] Turn off Unicode during session establishment for plaintext + authentication + * ACPI: thinkpad-acpi: wan radio control is not experimental + * sparc: Fix resource flags for PCI children in OF device tree. + * remove blk_register_filter and blk_unregister_filter in gendisk + * ALSA: oxygen: fix distorted output on AK4396-based cards + * ipv6: When we droped a packet, we should return NET_RX_DROP instead of + 0 + * pkt_sched: Fix locking of qdisc_root with qdisc_root_sleeping_lock() + * net: Unbreak userspace usage of linux/mroute.h + * Don't trigger softlockup detector on network fs blocked tasks + * Resource handling: add 'insert_resource_expand_to_fit()' function + * sparc64: setup_valid_addr_bitmap_from_pavail() should be __init + * UBIFS: do not update min_idx_lebs in stafs + * UBIFS: push empty flash hack down + * UBIFS: remove incorrect index space check + * UBIFS: improve statfs reporting + * UBIFS: fix assertion + * UBIFS: add forgotten gc_idx_lebs component + * UBIFS: introduce LEB overhead + * UBIFS: improve statfs reporting even more + * UBIFS: fill f_fsid + * drm/radeon: downgrade debug message from info to debug. + * Remove invalidate_partition call from do_md_stop. 
+ * Fix problem with waiting while holding rcu read lock in md/bitmap.c + * ALSA: hda: Distortion fix for dell_m6_core_init + * ALSA: ASoC: fix pxa2xx-i2s clk_get call + * block: restore original behavior of /proc/partition when there's no + partition + * debugobjects: fix lockdep warning + * avr32: Fix lockup after Java stack underflow in user mode + * avr32: pm_standby low-power ram bug fix + * nfsd: fix compound state allocation error handling + * sunrpc: fix possible overrun on read of /proc/sys/sunrpc/transports + * nfsd: fix buffer overrun decoding NFSv4 acl + * audit: Moved variable declaration to beginning of function + * Fix modules_install on RO nfs-exported trees. + * Remove '#include ' from mm/page_isolation.c + * dabusb_fpga_download(): fix a memory leak + * [MTD] mtdchar.c: Fix regression in MEMGETREGIONINFO ioctl() + * ALSA: hda - Fix ALC663 auto-probe + * ALSA: hda - Add mic-boost controls to ALC662/663 auto configuration + * Un-break printk strings in x86 PCI probing code + * kernel/resource.c: fix new kernel-doc warning + * softlockup: minor cleanup, don't check task->state twice + * fix typo in arch/parisc/hpux/fs.c + * m68k: atari_keyb_init operator precedence fix + * ACPI: Fix typo in "Disable MWAIT via DMI on broken Compal board" + * don't diff generated firmware files + * IDE: compile fix for sff_dma_ops + * IDE: palm_bk3710: fix compile warning for unused variable + * ide: fix hwif_to_node() + * palm_bk3710: improve IDE registration + * ide-disk: remove stale init_idedisk_capacity() documentation + * ide/Kconfig: mark ide-scsi as deprecated + * net/wireless/Kconfig: clarify the description for + CONFIG_WIRELESS_EXT_SYSFS + * iwlwifi: do not use GFP_DMA in iwl_tx_queue_init + * iwlwifi: workaround interrupt handling no some platforms + * iwlwifi: fix apm_stop (wrong bit polarity for FLAG_INIT_DONE) + * iwlwifi: fix 64bit platform firmware loading + * orinoco: Multicast to the specified addresses + * wireless/libertas/if_cs.c: fix memory leaks + * mac80211: Fix debugfs union misuse and pointer corruption + * rt2x00: Compiler warning unmasked by fix of BUILD_BUG_ON + * ath9k: Incorrect key used when group and pairwise ciphers are + different. + * ath9: Fix ath_rx_flush_tid() for IRQs disabled kernel warning message. 
+ * net/xfrm: Use an IS_ERR test rather than a NULL test + * ipv: Re-enable IP when MTU > 68 + * NTFS: update homepage + * mm: make setup_zone_migrate_reserve() aware of overlapping nodes + * VFS: fix dio write returning EIO when try_to_release_page fails + * acer-wmi: remove debugfs entries upon unloading + * mm/bootmem: silence section mismatch warning - + contig_page_data/bootmem_node_data + * MAINTAINERS: add a maintainer for the BCM5974 multitouch driver + * 8250: improve workaround for UARTs that don't re-assert THRE correctly + * mmc: at91_mci: don't use coherent dma buffers + * pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing + * pid_ns: (BUG 11391) change ->child_reaper when init->group_leader exits + * cirrusfb: check_par fixes + * devcgroup: fix race against rmdir() + * mm: show quicklist usage in /proc/meminfo + * mm: size of quicklists shouldn't be proportional to the number of CPUs + * ipc: document the new auto_msgmni proc file + * hp-wmi: update to match current rfkill semantics + * hp-wmi: add proper hotkey support + * tdfxfb: fix SDRAM memory size detection + * tdfxfb: fix frame buffer name overrun + * rtc_time_to_tm: fix signed/unsigned arithmetic + * ibft: fix target info parsing in ibft module + * sysfs: document files in /sys/firmware/sgi_uv/ + * rtc-cmos: wake again from S5 + * pm_qos_requirement might sleep + * drivers/char/random.c: fix a race which can lead to a bogus BUG() + * ipsec: Fix deadlock in xfrm_state management. + * [x86] Fix TSC calibration issues + * tipc: Don't use structure names which easily globally conflict. + * sparc64: Fix IPI call locking. + * [ARM] omap: fix gpio.c build error + * sparc64: Prevent sparc64 from invoking irq handlers on offline CPUs + * powerpc: Fix uninitialised variable in VSX alignment code + * powerpc: Only make kernel text pages of linear mapping executable + * powerpc: Make sure _etext is after all kernel text + * powerpc: Work around gcc's -fno-omit-frame-pointer bug + * powerpc: Fix build error with 64K pages and !hugetlbfs + * powerpc: Fix for getting CPU number in power_save_ppc32_restore() + * UBIFS: amend f_fsid + * net/usb/pegasus: avoid hundreds of diagnostics + * ixgbe: initialize interrupt throttle rate + * pcnet-cs, axnet_cs: add new IDs, remove dup ID with less info + * netxen: Remove workaround for chipset quirk + * Split up PIT part of TSC calibration from native_calibrate_tsc + * iwlwifi: W/A for the TSF correction in IBSS + * iwlwifi: fix hidden ssid discovery in passive channels + * iwlwifi: remove false rxon if rx chain changes + * iwlwifi: fix station mimo power save values + * iwlwifi: fix rx_chain computation + * iwlwifi: fix Tx cmd memory allocation failure handling + * iwlwifi: call apm stop on exit + * iwlwifi: fix STATUS_EXIT_PENDING is not set on pci_remove + * ath9k: Fix TX status reporting + * ath9k: Fix TX control flag use for no ACK and RTS/CTS + * V4L/DVB (8555): au8522: add mechanism to configure IF frequency for vsb + and qam + * V4L/DVB (8556): au0828: add support for Hauppauge Woodbury + * V4L/DVB (8598): au8522: clean up function au8522_set_if + * V4L/DVB (8599): au8522: remove if frequency settings from vsb/qam + modulation tables + * V4L/DVB (8600): au0828: explicitly set 6 MHz IF frequency in + hauppauge_hvr950q_config + * V4L/DVB (8629): v4l2-ioctl: do not try to handle private V4L1 ioctls + * V4L/DVB (8633): ivtv: update ivtv version number + * V4L/DVB (8648): ivtv: improve CC support + * V4L/DVB (8660): gspca: Simplify the scan of URB packets in pac7311. 
+ * V4L/DVB (8661): gspca: Bug in the previous changeset about pac7311. + * V4L/DVB (8663): gspca: Webcam 0c45:6128 added in sonixj. + * V4L/DVB (8664): gspca: The bridge/sensor of the webcam 093a:2621 is a + PAC 7302. + * V4L/DVB (8665): gspca: Fix the 640x480 resolution of the webcam + 093a:2621. + * V4L/DVB (8666): gspca: Bad scanning of frames in pac7311. + * V4L/DVB (8667): gspca: Bad probe of Z-Star/Vimicro webcams with pas106 + sensor. + * V4L/DVB (8668): gspca: Conflict GSPCA / ET61X251 for the webcam + 102c:6251. + * V4L/DVB (8669): gspca: Add white balance control for spca561 rev 012A. + * V4L/DVB (8671): gspca: Remove the unused field 'dev_name' of the device + structure. + * V4L/DVB (8672): gspca: Big rewrite of spca561. + * V4L/DVB (8673): gspca: Bad frame scanning again and bad init in + pac7311. + * V4L/DVB (8674): gspca: Webcam 0c45:612e added in sonixj. + * V4L/DVB (8675): gspca: Pixmap PJPG (Pixart 73xx JPEG) added, generated + by pac7311. + * V4L/DVB (8678): Remove the dead CONFIG_RADIO_MIROPCM20{,_RDS} code + * V4L/DVB (8681): v4l2-ioctl.c: fix warning + * V4L/DVB (8682): V4L: fix return value of register video func + * V4L/DVB (8701): cx18: Add missing lock for when the irq handler + manipulates the queues + * V4L/DVB (8703): gspca: Do controls work for spca561 revision 12a. + * V4L/DVB (8705): gspca: Adjust some control limits in spca561. + * V4L/DVB (8706): Make contrast and brightness work for pac7302. + * V4L/DVB (8707): gspca: Colors, hflip and vflip controls added for + pac7302. + * V4L/DVB (8709): gspca: Fix initialization and controls of sn9x110 - + ov7630. + * V4L/DVB (8710): gspca: Bad color control in sonixj. + * V4L/DVB (8711): gspca: Bad controls and quantization table of pac7311. + * V4L/DVB (8712): gspca: Bad start of sonixj webcams since changeset + a8779025e7e8. + * V4L/DVB (8713): gspca: Bad color control again in sonixj. + * V4L/DVB (8714): gspca: Bad start of sn9c110 and sensor om6802. + * V4L/DVB (8715): gspca: Change the name of some webcam in the gspca doc. + * V4L/DVB (8716): gspca: Bad start of sn9c110 and sensor ov7630. + * V4L/DVB (8717): gspca: Frame buffer too small for small resolutions + (sonixj and t613). + * V4L/DVB (8718): gspca: suspend/resume added. + * V4L/DVB (8719): gspca: Have VIDIOC_QUERYCTRL more compliant to the + spec. + * V4L/DVB (8720): gspca: V4L2_CAP_SENSOR_UPSIDE_DOWN added as a cap for + some webcams. + * V4L/DVB (8722): sms1xxx: fix typo in license header + * V4L/DVB (8726): link tuner before saa7134 + * V4L/DVB (8727): V4L1: make PMS not autoprobe when builtin. + * V4L/DVB (8728): 1-make-pms-not-autoprobe-when-builtin update + * V4L/DVB (8749): Fix error code, when camera is not turned on by sonypi + * V4L/DVB (8750): V4L: check inval in video_register_device_index() + * V4L/DVB (8751): vivi: Fix some issues at vivi register routine + * V4L/DVB (8757): v4l-dvb: fix a bunch of sparse warnings + * V4L/DVB (8769): cx18: Simplify queue flush logic to prevent oops in + cx18_flush_queues() + * V4L/DVB (8778): radio: fix incorrect video_register_device result check + * V4L/DVB (8779): v4l: fix more incorrect video_register_device result + checks + * V4L/DVB (8790): saa7115: call i2c_set_clientdata only when state != + NULL + * V4L/DVB (8803): s5h1409: Enable QAM_AUTO mode + * V4L/DVB (8804): s5h1411: Enable QAM_AUTO mode + * V4L/DVB (8805): Steven Toth email address change + * V4L/DVB (8809): gspca: Revert commit + 9a9335776548d01525141c6e8f0c12e86bbde982 + * V4L/DVB (8810): gspca: Compile error when CONFIG_PM not defined. 
+ * V4L/DVB (8812): gspca: Do pac73xx webcams work. + * V4L/DVB (8813): gspca: Adjust SOF detection for pac73xx. + * V4L/DVB (8814): gspca: Set DISABLED the disabled controls at query + control time. + * V4L/DVB (8815): gspca: Fix problems with disabled controls. + * V4L/DVB (8816): gspca: Set disabled ctrls and fix a register pb with + ovxxxx in sonixb. + * V4L/DVB (8817): gspca: LED and proble changes in sonixb. + * V4L/DVB (8818): gspca: Reinitialize the device on resume. + * V4L/DVB (8819): gspca: Initialize the ov519 at open time and source + cleanup. + * V4L/DVB (8820): gspca: Change initialization and gamma of zc3xx - + pas106. + * V4L/DVB (8822): gspca: Change some subdriver functions for + suspend/resume. + * V4L/DVB (8823): gspca: H and V flips work for ov7670 only in ov519. + * V4L/DVB (8824): gspca: Too much code removed in the suspend/resume + changeset. + * V4L/DVB (8825): gspca: More controls for pac73xx and new webcam + 093a:2624. + * V4L/DVB (8826): gspca: Webcam Labtec 2200 (093a:2626) added in pac7311. + * V4L/DVB (8827): gspca: Stop pac7302 autogain oscillation. + * V4L/DVB (8828): gspca: Set the clock at the end of initialization in + sonixj. + * V4L/DVB (8829): gspca: Have a clean kmalloc-ated buffer for USB + exchanges. + * V4L/DVB (8830): gspca: Move some probe code to the new init function. + * V4L/DVB (8831): gspca: Resolve webcam conflicts between some drivers. + * V4L/DVB (8832): gspca: Bad pixelformat of vc0321 webcams. + * V4L/DVB (8833): gspca: Cleanup the sonixb code. + * V4L/DVB (8834): gspca: Have a bigger buffer for sn9c10x compressed + images. + * V4L/DVB (8835): gspca: Same pixfmt as the sn9c102 driver and raw Bayer + added in sonixb. + * V4L/DVB (8837): dvb: fix I2C adapters name size + * V4L/DVB (8839): dib0700: add comment to identify 35th USB id pair + * V4L/DVB (8840): dib0700: add basic support for Hauppauge Nova-TD-500 + (84xxx) + * V4L/DVB (8842): vivi_release(): fix use-after-free + * V4L/DVB (8843): tda10048_firmware_upload(): fix a memory leak + * V4L/DVB (8844): dabusb_fpga_download(): fix a memory leak + * bnx2x: Accessing un-mapped page + * SELinux: memory leak in security_context_to_sid_core + * x86: add io delay quirk for Presario F700 + * mmap: fix petty bug in anonymous shared mmap offset handling + * x86: Change warning message in TSC calibration. + * PCI: fix pbus_size_mem() resource alignment for CardBus controllers + * [ARM] omap: fix build error in ohci-omap.c + * [ARM] remove unused #include + * ACPI: Make Len Brown the ACPI maintainer again + * fujitsu-laptop: fix regression for P8010 in 2.6.27-rc + * ACPI: Avoid bogus timeout about SMbus check + * acer-wmi: remove debugfs entries upon unloading + * forgotten refcount on sysctl root table + * V4L/DVB (8868): gspca: Support for vga modes with sif sensors in + sonixb. + * V4L/DVB (8869): gspca: Move the Sonix webcams with TAS5110C1B from + sn9c102 to gspca. + * V4L/DVB (8870): gspca: Fix dark room problem with sonixb. + * V4L/DVB (8872): gspca: Bad image format and offset with rev072a of + spca561. + * V4L/DVB (8873): gspca: Bad image offset with rev012a of spca561 and + adjust exposure. + * V4L/DVB (8874): gspca: Adjust hstart for sn9c103/ov7630 and update + usb-id's. 
+ * [ARM] omap: fix virtual vs physical address space confusions + * V4L/DVB (8876): budget: udelay changed to mdelay + * V4L/DVB (8877): b2c2 and bt8xx: udelay to mdelay + * V4L/DVB (8880): PATCH: Fix parents on some webcam drivers + * V4L/DVB (8881): gspca: After 'while (retry--) {...}', retry will be -1 + but not 0. + * powerpc/spufs: Fix multiple get_spu_context() + * powerpc/spufs: Fix race for a free SPU + * Input: bcm5974 - small formatting cleanup + * Input: bcm5974 - improve finger tracking and counting + * Input: bcm5974 - add BTN_TOUCH event for mousedev benefit + * Input: i8042 - make Lenovo 3000 N100 blacklist entry more specific + * sh: resume_kernel fix for kernel oops built with CONFIG_BKL_PREEMPT=y. + * sh64: resume_kernel fix for kernel oops built with + CONFIG_BKL_PREEMPT=y. + * i2c: fix i2c-sh_mobile timing issues + * clockevents: prevent clockevent event_handler ending up handler_noop + * clockevents: prevent endless loop in periodic broadcast handler + * clockevents: enforce reprogram in oneshot setup + * clockevents: prevent multiple init/shutdown + * clockevents: prevent endless loop lockup + * HPET: make minimum reprogramming delta useful + * [MTD] [NAND] tmio_nand: fix base address programming + * Fix conditional export of kvh.h and a.out.h to userspace. + * async_tx: fix the bug in async_tx_run_dependencies + * sched_clock: fix NOHZ interaction + * sched: fix process time monotonicity + * UBIFS: fix division by zero + * UBIFS: make minimum fanout 3 + * [MIPS] Fix data bus error recovery + * [MIPS] Fix WARNING: at kernel/smp.c:290 + * [MIPS] TXx9: Fix txx9_pcode initialization + * [MIPS] TX39xx: Add missing local_flush_icache_range initialization + * [MIPS] Probe initrd header only if explicitly specified + * res_counter: fix off-by-one bug in setting limit + * forcedeth: fix kexec regression + * atmel_lcdfb: fix oops in rmmod when framebuffer fails to register + * tracehook: comment pasto fixes + * drivers/mmc/card/block.c: fix refcount leak in mmc_block_open() + * x86: boot: stub out unimplemented CPU feature words + * x86: add NOPL as a synthetic CPU feature bit + * x86: use X86_FEATURE_NOPL in alternatives + * clockevents: broadcast fixup possible waiters + * x86: HPET fix moronic 32/64bit thinko + * x86: HPET: read back compare register before reading counter + * Fix CONFIG_AC97_BUS dependency + * [ARM] 5241/1: provide ioremap_wc() + * ntp: fix calculation of the next jiffie to trigger RTC sync + * clocksource, acpi_pm.c: use proper read function also in errata mode + * clocksource, acpi_pm.c: check for monotonicity + * x86: delay early cpu initialization until cpuid is done + * x86: move mtrr cpu cap setting early in early_init_xxxx + * sched: arch_reinit_sched_domains() must destroy domains to force + rebuild + * x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags + * x86: pda_init(): fix memory leak when using CPU hotplug + * x86: cpu_init(): fix memory leak when using CPU hotplug + * powerpc/spufs: Fix possible scheduling of a context to multiple SPEs + * netfilter: nf_conntrack_sip: de-static helper pointers + * netfilter: nf_conntrack_gre: more locking around keymap list + * netfilter: nf_conntrack_gre: nf_ct_gre_keymap_flush() fixlet + * netfilter: nf_conntrack_irc: make sure string is terminated before + calling simple_strtoul + * pkt_sched: Fix qdisc state in net_tx_action() + * powerpc: Fix rare boot build breakage + * ahci, pata_marvell: play nicely together + * sata_mv: add RocketRaid 1720 PCI ID to driver + * ahci: disable PMP 
for marvell ahcis + * sata_nv: disable hardreset for generic + * libata-sff: kill spurious WARN_ON() in ata_hsm_move() + * pata_sil680: remove duplicate pcim_enable_device + * ahci: RAID mode SATA patch for Intel Ibex Peak DeviceIDs + * [MIPS] IP22: Fix detection of second HPC3 on Challenge S + * xen: fix 2.6.27-rc5 xen balloon driver warnings + * x86: disable static NOPLs on 32 bits + * netns : fix kernel panic in timewait socket destruction + * bridge: don't allow setting hello time to zero + * NFS: Restore missing hunk in NFS mount option parser + * usb: fix null deferences in low level usb serial + * Fix format of MAINTAINERS + * sparc64: Disable timer interrupts in fixup_irqs(). + * [Bluetooth] Fix reference counting during ACL config stage + * [Bluetooth] Enforce correct authentication requirements + * [Bluetooth] Reject L2CAP connections on an insecure ACL link + * [S390] CVE-2008-1514: prevent ptrace padding area read/write in 31-bit + mode + * [S390] cio: Correct cleanup on error. + * [S390] cio: handle ssch() return codes correctly. + * [S390] cio: allow offline processing for disconnected devices + * ipsec: Restore larval states and socket policies in dump + * update Documentation/filesystems/Locking for 2.6.27 changes + * MAINTAINERS: add Atheros maintainer for atlx + * lib: Correct printk %pF to work on all architectures + * x86: fix memmap=exactmap boot argument + * clockevents: remove WARN_ON which was used to gather information + * ipv6: Fix OOPS in ip6_dst_lookup_tail(). + * Linux 2.6.27-rc6 + + -- Ben Collins Tue, 02 Sep 2008 12:45:56 -0400 + +linux (2.6.27-2.3) intrepid; urgency=low + + [ Ben Collins ] + + * build/retag: Make script save .orig of tags for later use + * ubuntu/lirc: Fix device_create call + * build/firmware: Put in-kernel firmware into version specific subdir + - LP: #262115 + * Rebase on linux-2.6 git. + * ABI bump + + [ Herton Ronaldo Krzesinski ] + + * SAUCE: (no-up) Apparmor warning fixes + + [ John Johansen ] + + * SAUCE: (no-up) Proper AppArmor ptrace updates for newer lsm API + + [ Mackenzie Morgan ] + + * SAUCE: Add quirk for ASUS Z37E to make sound audible after resume + - LP: #25896 + + -- Ben Collins Wed, 27 Aug 2008 14:03:05 -0400 + +linux (2.6.27-1.2) intrepid; urgency=low + + [ Amit Kucheria ] + + * SAUCE: make fc transport removal of target configurable + * SAUCE: pm: Config option to disable handling of console during + suspend/resume + + [ Ben Collins ] + + * SAUCE: Lower warning level of some PCI messages + * SAUCE: input/mouse/alps: Do not call psmouse_reset() for alps + * SAUCE: tulip: Let dmfe handle davicom on non-sparc + * SAUCE: tulip: Define ULI PCI ID's + * SAUCE: (no-up) version: Implement version_signature proc file. + * SAUCE: (no-up) connector.h: Add idx/val for drbd + * SAUCE: (no-up) swap: Add notify_swap_entry_free callback for compcache + * SAUCE: drivers: Remove some duplicate device entries in various modules + * SAUCE: (no-up) [AppArmor] merge with upstream subversion r1291 + * SAUCE: apparmor: Update for changes to ptrace lsm hooks + * SAUCE: (no-up) Enable ubuntu extra subdirectory + * SAUCE: applesmc: Add MacBookAir + * SAUCE: (no-up) ACPI: initramfs DSDT override support + * ubuntu: Add drbd module + * ubuntu: Add iscsitarget module + * ubuntu: Add BOM for iscsitarget + * ubuntu: Add squashfs driver + * SAUCE: (no-up) Check for squashfs superblock in initramfs mounting. 
+ * ubuntu: Add aufs module + * ubuntu: Added atl2 driver + * ubuntu: Added et131x driver + * ubuntu: Add dm-raid4-5 driver + * ubuntu: Add ndiswrapper driver + * ubuntu: Added ram backed compressed swap module (compcache) + * ubuntu: Add misc drivers from hardy lum + * ubuntu: Add heci driver 3.2.0.24 + * ubuntu: Add ov511 and bt-sco drivers + * ubuntu: Add acx, prism2_usb wireless drivers + * ubuntu: Add at76 driver to build + * ubuntu: Add fsam7400 sw kill switch driver + * ubuntu: Added qc-usb driver + * ubuntu: e1000e: Upgraded module to 0.4.1.7 + * ubuntu: Added rfkill drivers + * ubuntu: VIA - Add VIA DRM Chrome9 3D engine + * ubuntu: unionfs: Added v1.4 module from hardy + * ubuntu: Add LIRC driver + * ubuntu: Add GFS driver + * ubuntu: New tlsup driver for toshiba laptops + * Update config files + * build/d-i: Remove obsolete dm modules + + [ Chuck Short ] + + * SAUCE: ata: blacklist FUJITSU MHW2160BH PL + + [ Colin Ian King ] + + * ubuntu: Add dm-loop + * SAUCE: Enable speedstep for sonoma processors. + + [ Dennis Noordsij ] + + * SAUCE: Work around ACPI corruption upon suspend on some Dell machines. + + [ Fabio M. Di Nitto ] + + * SAUCE: Export gfs2 symbols required for gfs1 kernel module + + [ Matthew Garrett ] + + * SAUCE: hostap: send events on data interface as well as master + interface + + [ Michael Frey (Senior Manager, MID ] + + * SAUCE: Send HCI_RESET for Broadcomm 2046 + + [ Phillip Lougher ] + + * SAUCE: r8169: disable TSO by default for RTL8111/8168B chipsets. + + [ Stefan Bader ] + + * SAUCE: (no-up) Export dm_disk function of device-mapper + * SAUCE: Restore VT fonts on switch + * SAUCE: mmc: Increase power_up deleay to fix TI readers + + [ Tim Gardner ] + + * SAUCE: Add extra headers to linux-libc-dev + * SAUCE: Catch nonsense keycodes and silently ignore + * SAUCE: Added support for HDAPS on various ThinkPads from Lenovo and IBM + * SAUCE: Guest OS does not recognize a lun with non zero target id on + Vmware ESX Server + * SAUCE: (no-up) Take care of orinoco_cs overlap with hostap_cs + * ubuntu: Add GNBD driver + + -- Ben Collins Sat, 23 Aug 2008 15:48:35 -0400 + +linux (2.6.27-0.0) intrepid; urgency=low + + * Not uploaded, placeholder for new release + + -- Ben Collins Sat, 23 Aug 2008 15:48:35 -0400 + +linux (2.6.26-5.17) intrepid; urgency=low + + [ Ben Collins ] + + * build/abi: Add tosh_smm symbol to blacklist + + -- Ben Collins Fri, 15 Aug 2008 09:29:34 -0400 + +linux (2.6.26-5.16) intrepid; urgency=low + + [ Ben Collins ] + + * Revert "SAUCE: toshiba_acpi: Rewrote most of the proc entry bits." + * Revert "SAUCE: Update toshiba_acpi.c to version 0.19a" + * build/config: Disable in-kernel toshiba driver(s) + * ubuntu/tlsup: New driver for toshiba laptops + * build/config: Enable TLSUP driver + * SAUCE: e1000e: Fix E1000E_ENABLED logic to check for our E1000E_NEW + driver as well + * ubuntu/e1000e: Remove E1000E_ENABLED option in local config + * build/config: Update configs to have E1000E_ENABLED set + * ubuntu/prism2: Remove duplicate device + + [ Fabio M. 
Di Nitto ] + + * SAUCE: Export gfs2 symbols required for gfs1 kernel module + + [ Stefan Bader ] + + * SAUCE: x86: HPET rework for SB700 + - LP: #255910 + + [ Tim Gardner ] + + * Add GNBD driver + * Enable GNBD driver + * SAUCE: Add GFS driver + * SAUCE: Enable gfs driver configs + * b43: Linksys WMP54G (BCM4306/3) card in a PCI format has an SPROM + coding + + [ Upstream Kernel Changes ] + + * KVM: x86 emulator: emulate clflush + * USB: quirk PLL power down mode + + -- Ben Collins Mon, 11 Aug 2008 13:19:28 -0400 + +linux (2.6.26-5.15) intrepid; urgency=low + + [ Ben Collins ] + + * Revert "SAUCE: Add blacklist support to fix Belkin bluetooth dongle." + - Superceded by upstream changes. + * build/config: New option enabled for uvcvideo + * build/control: Add Vcs-Git meta data to control file + * SAUCE: toshiba_acpi: Rewrote most of the new code + * abi/perm-blacklist: Add emu10k1 driver to blacklist + + [ Upstream Kernel Changes ] + + * pxamci: trivial fix of DMA alignment register bit clearing + * udplite: Protection against coverage value wrap-around + * ipv6: use timer pending + * ipv6: __KERNEL__ ifdef struct ipv6_devconf + * hdlcdrv: Fix CRC calculation. + * quota: fix possible infinite loop in quota code + * isofs: fix minor filesystem corruption + * KVM: VMX: Fix a wrong usage of vmcs_config + * KVM: SVM: fix suspend/resume support + * KVM: mmu_shrink: kvm_mmu_zap_page requires slots_lock to be held + * KVM: VMX: Add ept_sync_context in flush_tlb + * KVM: x86 emulator: Fix HLT instruction + * KVM: MMU: nuke shadowed pgtable pages and ptes on memslot destruction + * KVM: MMU: Fix potential race setting upper shadow ptes on nonpae hosts + * Patch Upstream: x86 ptrace: fix PTRACE_GETFPXREGS error + * rcu: fix rcu_try_flip_waitack_needed() to prevent grace-period stall + * Fix typos from signal_32/64.h merge + * x86 reboot quirks: add Dell Precision WorkStation T5400 + * USB: fix usb serial pm counter decrement for disconnected interfaces + * x86, suspend, acpi: enter Big Real Mode + * markers: fix duplicate modpost entry + * Fix build on COMPAT platforms when CONFIG_EPOLL is disabled + * proc: fix /proc/*/pagemap some more + * cpusets: fix wrong domain attr updates + * x86: fix crash due to missing debugctlmsr on AMD K6-3 + * ide-cd: fix oops when using growisofs + * rtc-at91rm9200: avoid spurious irqs + * vmlinux.lds: move __attribute__((__cold__)) functions back into final + .text section + * ARM: fix fls() for 64-bit arguments + * tcp: Clear probes_out more aggressively in tcp_ack(). + * sparc64: Fix lockdep issues in LDC protocol layer. + * sparc64: Fix cpufreq notifier registry. + * sparc64: Do not define BIO_VMERGE_BOUNDARY. 
+ * iop-adma: fix platform driver hotplug/coldplug + * myri10ge: do not forget to setup the single slice pointers + * myri10ge: do not use mgp->max_intr_slots before loading the firmware + * ALSA: trident - pause s/pdif output + * V4L: cx18: Upgrade to newer firmware & update documentation + * DVB: dib0700: add support for Hauppauge Nova-TD Stick 52009 + * V4L: uvcvideo: Fix a buffer overflow in format descriptor parsing + * V4L: uvcvideo: Use GFP_NOIO when allocating memory during resume + * V4L: uvcvideo: Don't free URB buffers on suspend + * V4L: uvcvideo: Make input device support optional + * V4L: uvcvideo: Add support for Medion Akoya Mini E1210 integrated + webcam + * V4L: saa7134: Copy tuner data earlier to avoid overwriting manual tuner + type + * V4L: cx23885: Bugfix for concurrent use of /dev/video0 and /dev/video1 + * DVB: cx23885: Ensure PAD_CTRL is always reset to a sensible default + * DVB: cx23885: DVB Transport cards using DVB port VIDB/TS1 did not + stream + * DVB: cx23885: Reallocated the sram to avoid concurrent VIDB/C issues + * DVB: cx23885: SRAM changes for the 885 and 887 silicon parts + * x86: fix kernel_physical_mapping_init() for large x86 systems + * eCryptfs: use page_alloc not kmalloc to get a page of memory + * UML - Fix boot crash + * ixgbe: remove device ID for unsupported device + * mpc52xx_psc_spi: fix block transfer + * tmpfs: fix kernel BUG in shmem_delete_inode + * markers: fix markers read barrier for multiple probes + * VFS: increase pseudo-filesystem block size to PAGE_SIZE + * cpufreq acpi: only call _PPC after cpufreq ACPI init funcs got called + already + * b43legacy: Release mutex in error handling code + * ath5k: don't enable MSI, we cannot handle it yet + * Fix off-by-one error in iov_iter_advance() + * Linux 2.6.26.1 + * ftrace: remove unneeded documentation + * romfs_readpage: don't report errors for pages beyond i_size + * netfilter: nf_nat_sip: c= is optional for session + * SCSI: bsg: fix bsg_mutex hang with device removal + * x86: idle process - add checking for NULL early param + * x86: io delay - add checking for NULL early param + * Close race in md_probe + * Kprobe smoke test lockdep warning + * netfilter: xt_time: fix time's time_mt()'s use of do_div() + * linear: correct disk numbering error check + * SCSI: ch: fix ch_remove oops + * NFS: Ensure we zap only the access and acl caches when setting new acls + * jbd: fix race between free buffer and commit transaction + * Input: i8042 - add Intel D845PESV to nopnp list + * Input: i8042 - add Gericom Bellagio to nomux blacklist + * Input: i8042 - add Acer Aspire 1360 to nomux blacklist + * Bluetooth: Signal user-space for HIDP and BNEP socket errors + * Add compat handler for PTRACE_GETSIGINFO + * ALSA: hda - Fix wrong volumes in AD1988 auto-probe mode + * ALSA: hda - Fix DMA position inaccuracy + * ALSA: hda - Add missing Thinkpad Z60m support + * ALSA: emu10k1 - Fix inverted Analog/Digital mixer switch on Audigy2 + * vfs: fix lookup on deleted directory + * Ath5k: fix memory corruption + * Ath5k: kill tasklets on shutdown + * sound: ensure device number is valid in snd_seq_oss_synth_make_info + * Linux 2.6.26.2 + + -- Ben Collins Sun, 03 Aug 2008 13:25:02 -0400 + +linux (2.6.26-5.14) intrepid; urgency=low + + [ Ben Collins ] + + * SAUCE: applesmc: Add MacBookAir + * build: Do not build ddeb unless we are on the buildd + * build: control: Consistency in arch fields. 
+ * SAUCE: Update toshiba_acpi.c to version 0.19a + - LP: #77026 + * build: Added perm blacklist support and per-module support to abi-check + - Blacklist p80211 module from abi checks + * ubuntu/lirc: Get rid of drivers symlink and use real include stuff + + + [ Colin Ian King ] + + * SAUCE: acerhk module - add support for Amilo A1650g keyboard + - LP: #84159 + * SAUCE: rt2x00: Fix OOPS on failed creation of rt2x00lib workqueue + - LP: #249242 + + [ Mario Limonciello ] + + * Add LIRC back in + + [ Tim Gardner ] + + * Makefile race condition can lead to ndiswrapper build failure + - LP: #241547 + * update linux-wlan-ng (prism2_usb) to upstream version 1861 + - LP: #245026 + + [ Upstream Kernel Changes ] + + * Fix typos from signal_32/64.h merge + + -- Ben Collins Fri, 01 Aug 2008 00:05:01 -0400 + +linux (2.6.26-5.13) intrepid; urgency=low + + [ Ben Collins ] + + * build: Make makedumpfile an amd64/i386 only build-dep + * ubuntu/acerhk: Fixup assembly to compile with newer binutils + + -- Ben Collins Sat, 26 Jul 2008 16:41:50 -0400 + +linux (2.6.26-4.12) intrepid; urgency=low + + [ Ben Collins ] + + * e1000e: Upgraded module to 0.4.1.7 upstream. Placed in ubuntu/, + in-kernel driver disabled + * config: Disable e1000e in-kernel, and enable newer driver in ubuntu/ + * rfkill: Update to 1.3 drivers, and move to common location + * ubuntu: Actually link kconfig/kbuild into rfkill subdir + * config: Enable loading dsdt from initramfs + - LP: #246222 + * ubuntu: [compcache] Update to fix crashes in improper BUG() + * build: Create a retag scripts to recover tags from rebases + * build: Updates for dbg pkg + * build: Make sure no empty lines show up in debian/files + * ubuntu: atl1e: Add new driver from 2.6.27-pre-rc1 + - LP: #243894 + * sys_getcwd: Fix some brokeness introduced by AppArmor __d_path + changes + - LP: #251223 + * ubuntu: unionfs: Added v1.4 module from hardy + * build: Add sub-flavour infrastructure, and virtual subflav + + [ Eric Piel ] + + * ACPI: Allow custom DSDT tables to be loaded from initramfs + + [ Kees Cook ] + + * AppArmor: Smack VFS patches + + [ Mario Limonciello ] + + * Work around ACPI corruption upon suspend on some Dell machines. + - LP: #183033 + + [ Tim Gardner ] + + * Export usbhid_modify_dquirk for LBM module bcm5974 + - LP: #250838 + * VIA - Add VIA DRM Chrome9 3D engine + - LP: #251862 + * Define TRUE/FALSE for VIA DRM driver. + + -- Ben Collins Tue, 15 Jul 2008 12:51:39 -0400 + +linux (2.6.26-4.11) intrepid; urgency=low + + [ Ben Collins ] + + * config: Enable bcm5974 driver in all configs + + [ 2.6.26-4.10 ] + + [ Amit Kucheria ] + + * Fix typo in GSPCA Makefile and make it compile + + [ Ben Collins ] + + * ubuntu: Remove UVC driver in favor of in-kernel one (-rc9) + * config: Updates for -rc9 + * ubuntu: Add acx, prism2_usb wireless drivers + * config: Enable prism2_usb and acx drivers. + * ubuntu: Add at76 driver to build + * config: Enable at76_usb driver. + * iscsitarget: Fix prototype for bi_end_io callback. 
+ * acx: Fix section type mismatch warnings + * fsam7400: Add sw kill switch driver + * config: Enable fsam7400 driver + * qc-usb: Added new driver + * config: Enable qc-usb driver + * drbd: Remove built-in connector usage + * drbd: Do not define idx/val for connector here + * connector.h: Add idx/val for drbd + * bcm5974: Added new driver + + [ Kees Cook ] + + * SAUCE: [AppArmor] merge with upstream subversion r1291 + * SAUCE: [AppArmor] fix typo in selinux_inode_link + * SAUCE: [AppArmor] aufs patches + + [ Michael Frey (Senior Manager, MID ] + + * SAUCE: Send HCI_RESET for Broadcomm 2046 + - LP: #241749 + + [ Tim Gardner ] + + * SAUCE: Medion Akoya Mini E1210 + + [ Upstream Kernel Changes ] + + * Revert "BAST: Remove old IDE driver" + * ARM: OMAP: DMA: Don't mark channel active in omap_enable_channel_irq + * ARM: OMAP: Correcting the gpmc prefetch control register address + * debugobjects: fix lockdep warning + * [ARM] 5115/1: pxafb: fix ifdef for command line option handling + * [ARM] 5116/1: pxafb: cleanup and fix order of failure handling + * [ARM] 5109/1: Mark rtc sa1100 driver as wakeup source before + registering it + * [ARM] Export dma_sync_sg_for_device() + * fix cgroup-inflicted breakage in block_dev.c + * [patch for 2.6.26 2/4] vfs: utimensat(): be consistent with utime() for + immutable and append-only files + * [patch for 2.6.26 1/4] vfs: utimensat(): ignore tv_sec if tv_nsec == + UTIME_OMIT or UTIME_NOW + * [patch for 2.6.26 3/4] vfs: utimensat(): fix error checking for + {UTIME_NOW,UTIME_OMIT} case + * [patch for 2.6.26 4/4] vfs: utimensat(): fix write access check for + futimens() + * [patch 1/4] vfs: path_{get,put}() cleanups + * [patch 2/4] fs: make struct file arg to d_path const + * [patch 3/4] vfs: fix ERR_PTR abuse in generic_readlink + * [patch 4/4] flock: remove unused fields from file_lock_operations + * [patch 3/3] vfs: make d_path() consistent across mount operations + * [patch 1/3] vfs: dcache sparse fixes + * [patch 2/3] vfs: dcache cleanups + * udf: Fix regression in UDF anchor block detection + * [SCSI] ses: Fix timeout + * netfilter: ip6table_mangle: don't reroute in LOCAL_IN + * [SCSI] esp: Fix OOPS in esp_reset_cleanup(). + * kernel/audit.c: nlh->nlmsg_type is gotten more than once + * audit: fix kernel-doc parameter notation + * remove useless argument type in audit_filter_user() + * Blackfin arch: fix bug - kernel boot fails when Spinlock and rw-lock + debugging enabled + * Blackfin arch: fix up section mismatch warning + * mac80211: implement EU regulatory domain + * b43: Do not return TX_BUSY from op_tx + * b43legacy: Do not return TX_BUSY from op_tx + * b43: Fix possible MMIO access while device is down + * b43legacy: Fix possible NULL pointer dereference in DMA code + * rt2x00: Fix unbalanced mutex locking + * iwlwifi: improve scanning band selection management + * [SCSI] esp: tidy up target reference counting + * [ARM] 5117/1: pxafb: fix __devinit/exit annotations + * thermal: Create CONFIG_THERMAL_HWMON=n + * ACPI: don't walk tables if ACPI was disabled + * dock: bay: Don't call acpi_walk_namespace() when ACPI is disabled. 
+ * x86: shift bits the right way in native_read_tscp + * x86: section/warning fixes + * V4L/DVB (8004): Fix INPUT dependency at budget-ci + * V4L/DVB (8005): Fix OOPS if frontend is null + * V4L/DVB (8007): cx18/cx25840: the S-Video LUMA input can use all + In1-In8 inputs + * V4L/DVB (8008): cx18: remove duplicate audio and video input enums + * V4L/DVB (8010): em28xx: Properly register extensions for already + attached devices + * V4L/DVB (8011): em28xx: enable DVB for HVR-900 + * V4L/DVB (8012): gl861: sleep a little to avoid I2C errors + * V4L/DVB (8013): gl861: remove useless identify_state + * V4L/DVB (8015): gl861: replace non critical msleep(0) with msleep(1) to + be on the safe side + * V4L/DVB (8017): Ensure em28xx extensions only get run against devs that + support them + * V4L/DVB (8018): Add em2860 chip ID + * V4L/DVB (8020): Fix callbacks functions of saa7134_empress + * V4L/DVB (8022): saa7134: fix race between opening and closing the + device + * V4L/DVB (8026): Avoids an OOPS if dev struct can't be successfully + recovered + * V4L/DVB (8027): saa7134: Avermedia A700: only s-video and composite + input are working + * V4L/DVB (8028): Improve error messages for tda1004x attach + * V4L/DVB (8029): Improve error message at tda1004x_attach + * V4L/DVB (8034): tda18271: fix IF notch frequency handling + * V4L/DVB (8035): tda18271: dont touch EB14 if rf_cal lookup is out of + range + * V4L/DVB (8036): tda18271: toggle rf agc speed mode on TDA18271HD/C2 + only + * V4L/DVB (8037): tda18271: ensure that the thermometer is off during + channel configuration + * V4L/DVB (8039): pxa-camera: fix platform_get_irq() error handling. + * V4L/DVB (8040): soc-camera: remove soc_camera_host_class class + * V4L/DVB (8042): DVB-USB UMT-010 channel scan oops + * V4L/DVB (8043): au0828: add support for additional USB device id's + * V4L/DVB (8044): au8522: tuning optimizations + * V4L/DVB (8048): saa7134: Fix entries for Avermedia A16d and Avermedia + E506 + * V4L/DVB (8061): cx18: only select tuner / frontend modules if + !DVB_FE_CUSTOMISE + * V4L/DVB (8063): cx18: Fix unintended auto configurations in + cx18-av-core + * V4L/DVB (8066): cx18: Fix audio mux input definitions for HVR-1600 Line + In 2 and FM radio + * V4L/DVB (8067): cx18: Fix firmware load for case when digital capture + happens first + * V4L/DVB (8068): cx18: Add I2C slave reset via GPIO upon initialization + * V4L/DVB (8069): cx18: Fix S-Video and Compsite inputs for the Yuan + MPC718 and enable card entry + * V4L/DVB (8071): tda10023: Fix possible kernel oops during + initialisation + * V4L/DVB (8073): av7110: Catch another type of ARM crash + * V4L/DVB (8074): av7110: OSD transfers should not be interrupted + * V4L/DVB (8075): stv0299: Uncorrected block count and bit error rate + fixed + * V4L/DVB (8092): videodev: simplify and fix standard enumeration + * V4L/DVB (8096): au8522: prevent false-positive lock status + * V4L/DVB (8097): xc5000: check device hardware state to determine if + firmware download is needed + * V4L/DVB (8100): V4L/vivi: fix possible memory leak in vivi_fillbuff + * V4L/DVB (8108): Fix open/close race in saa7134 + * s2io: fix documentation about intr_type + * tc35815: Mark carrier-off before starting PHY + * tc35815: Fix receiver hangup on Rx FIFO overflow + * ixgbe: fix EEH recovery during reset on PPC + * igb: fix EEH recovery during reset on PPC + * e1000e: fix EEH recovery during reset on PPC + * pcnet_cs, axnet_cs: clear bogus interrupt before request_irq + * drivers/net/r6040.c: Eliminate double 
sizeof + * ipg: fix jumbo frame compilation + * ipg: use NULL, not zero, for pointers + * [netdrvr] 3c59x: remove irqs_disabled warning from local_bh_enable + * [netdrvr] netxen: fix netxen_pci_tbl[] breakage + * e100: Do pci_dma_sync after skb_alloc for proper operation on ixp4xx + * e1000: only enable TSO6 via ethtool when using correct hardware + * [netdrvr] Fix IOMMU overflow checking in s2io.c + * qla3xxx: Hold RTNL while calling dev_close() + * Hold RTNL while calling dev_close() + * sata_uli: hardreset is broken + * rt2x00: Fix lock dependency errror + * prism: islpci_eth.c endianness fix + * mac80211: fix an oops in several failure paths in key allocation + * firewire: fw-sbp2: fix parsing of logical unit directories + * kbuild: fix a.out.h export to userspace with O= build. + * Ensure interrupted recovery completed properly (v1 metadata plus + bitmap) + * Don't acknowlege that stripe-expand is complete until it really is. + * Fix error paths if md_probe fails. + * hamradio: remove unused variable + * tcp: calculate tcp_mem based on low memory instead of all memory + * tcp: fix for splice receive when used with software LRO + * af_unix: fix 'poll for write'/connected DGRAM sockets + * netdevice: Fix typo of dev_unicast_add() comment + * pkt_sched: ERR_PTR() ususally encodes an negative errno, not positive. + * pkt_sched: Remove CONFIG_NET_SCH_RR + * include/linux/netdevice.h: don't export MAX_HEADER to userspace + * tcp: /proc/net/tcp rto,ato values not scaled properly (v2) + * netlink: Fix some doc comments in net/netlink/attr.c + * CONNECTOR: add a proc entry to list connectors + * inet fragments: fix race between inet_frag_find and + inet_frag_secret_rebuild + * net/inet_lro: remove setting skb->ip_summed when not LRO-able + * netlabel: Fix a problem when dumping the default IPv6 static labels + * ipv6 route: Convert rt6_device_match() to use RT6_LOOKUP_F_xxx flags. + * sched: fix cpu hotplug + * Fix and clean top .gitignore + * x86: fix cpu hotplug crash + * ptrace GET/SET FPXREGS broken + * Input: add KEY_MEDIA_REPEAT definition + * Input: fix locking in force-feedback core + * [ARM] 5131/1: Annotate platform_secondary_init with trace_hardirqs_off + * ide: fix /proc/ide/ide?/mate reporting + * netfilter: nf_conntrack_tcp: fixing to check the lower bound of valid + ACK + * textsearch: fix Boyer-Moore text search bug + * hostap: don't report useless WDS frames by default + * hostap: fix sparse warnings + * mac80211: don't accept WEP keys other than WEP40 and WEP104 + * V4L/DVB (8145a): USB Video Class driver + * [IA64] Bugfix for system with 32 cpus + * [IA64] export account_system_vtime + * sched: fix divide error when trying to configure rt_period to zero + * x86: fix NODES_SHIFT Kconfig range + * block: Fix the starving writes bug in the anticipatory IO scheduler + * Properly notify block layer of sync writes + * rcu: fix hotplug vs rcu race + * I2C: S3C2410: Check ACK on byte transmission + * I2C: S3C2410: Fixup error codes returned rom a transfer. + * I2C: S3C2410: Add MODULE_ALIAS() for s3c2440 device. 
+ * PCI: Restrict VPD read permission to root + * powerpc/bootwrapper: update for initrd with simpleImage + * i2c: Documentation: fix device matching description + * i2c: Fix bad hint about irqs in i2c.h + * powerpc/legacy_serial: Bail if reg-offset/shift properties are present + * powerpc/mpc5200: Fix lite5200b suspend/resume + * ipv4: fix sysctl documentation of time related values + * net-sched: change tcf_destroy_chain() to clear start of filter list + * net-sched: fix filter destruction in atm/hfsc qdisc destruction + * netlink: Unneeded local variable + * net: Tyop of sk_filter() comment + * netdevice: Fix wrong string handle in kernel command line parsing + * net: fib_rules: fix error code for unsupported families + * dm crypt: use cond_resched + * V4L/DVB (8178): uvc: Fix compilation breakage for the other drivers, if + uvc is selected + * PCI: Limit VPD read/write lengths for Broadcom 5706, 5708, 5709 rev. + * PCI: acpiphp: cleanup notify handler on all root bridges + * drivers/input/ff-core.c needs + * DRM/i915: only use tiled blits on 965+ + * tty: Fix inverted logic in send_break + * x86: fix Intel Mac booting with EFI + * arch/x86/mm/init_64.c: early_memtest(): fix types + * 9p: fix O_APPEND in legacy mode + * slub: Do not use 192 byte sized cache if minimum alignment is 128 byte + * Do not overwrite nr_zones on !NUMA when initialising zlcache_ptr + * [MIPS] IP32: Fix unexpected irq 71 + * [MIPS] IP22: Fix crashes due to wrong L1_CACHE_BYTES + * [MIPS] cevt-txx9: Reset timer counter on initialization + * hrtimer: prevent migration for raising softirq + * svcrpc: fix handling of garbage args + * OHCI: Fix problem if SM501 and another platform driver is selected + * USB: fix cdc-acm resume() + * USB: ehci - fix timer regression + * USB: ohci - record data toggle after unlink + * USB: mass storage: new id for US_SC_CYP_ATACB + * sisusbvga: Fix oops on disconnect. + * USB: New device ID for ftdi_sio driver + * USB: fix interrupt disabling for HCDs with shared interrupt handlers + * USB: don't lose disconnections during suspend + * USB: another option device id + * USB: add a pl2303 device id + * USB: fix Oops on loading ipaq module since 2.6.26 + * USB: adding comment for ipaq forcing number of ports + * [MIPS] Fix bug in atomic_sub_if_positive. + * xen: fix address truncation in pte mfn<->pfn conversion + * sata_sil24: add DID for another adaptec flavor + * ahci: always clear all bits in irq_stat + * libata-sff: improve HSM violation reporting + * sata_mv: safer logic for limit_warnings + * Update maintainers for powerpc + * Christoph has moved + * mm: dirty page accounting vs VM_MIXEDMAP + * rtc: rtc_read_alarm() handles wraparound + * firmware: fix the request_firmware() dummy + * serial: fix serial_match_port() for dynamic major tty-device numbers + * get_user_pages(): fix possible page leak on oom + * rtc-x1205: Fix alarm set + * rtc: fix CMOS time error after writing /proc/acpi/alarm + * pci: VT3336 can't do MSI either + * Miguel Ojeda has moved + * ext3: add missing unlock to error path in ext3_quota_write() + * ext4: add missing unlock to an error path in ext4_quota_write() + * reiserfs: add missing unlock to an error path in reiserfs_quota_write() + * ecryptfs: remove unnecessary mux from ecryptfs_init_ecryptfs_miscdev() + * lib: taint kernel in common report_bug() WARN path. 
+ * gpio: pca953x (i2c) handles max7310 too + * fsl_diu_fb: fix build with CONFIG_PM=y, plus fix some warnings + * Update taskstats-struct document for scaled time accounting + * cciss: fix regression that no device nodes are created if no logical + drives are configured. + * delay accounting: maintainer update + * Doc*/kernel-parameters.txt: fix stale references + * hdaps: add support for various newer Lenovo thinkpads + * mn10300: export certain arch symbols required to build allmodconfig + * mn10300: provide __ucmpdi2() for MN10300 + * Introduce rculist.h + * man-pages is supported + * ntfs: update help text + * add kernel-doc for simple_read_from_buffer and memory_read_from_buffer + * w100fb: do not depend on SHARPSL + * w100fb: add 80 MHz modeline + * MFD maintainer + * cgroups: document the effect of attaching PID 0 to a cgroup + * spi: fix the read path in spidev + * doc: doc maintainers + * security: filesystem capabilities: fix fragile setuid fixup code + * security: filesystem capabilities: fix CAP_SETPCAP handling + * Alpha Linux kernel fails with inconsistent kallsyms data + * cpusets: document proc status cpus and mems allowed lists + * MAINTAINERS: update the email address of Andreas Dilger + * cciss: read config to obtain max outstanding commands per controller + * olpc: sdhci: add quirk for the Marvell CaFe's vdd/powerup issue + * olpc: sdhci: add quirk for the Marvell CaFe's interrupt timeout + * cpumask: introduce new APIs + * mm: switch node meminfo Active & Inactive pages to Kbytes + * Update MAINTAINERS file for the TPM device driver + * devcgroup: fix odd behaviour when writing 'a' to devices.allow + * doc: document the relax_domain_level kernel boot argument + * mmc: don't use DMA on newer ENE controllers + * mempolicy: mask off internal flags for userspace API + * x86 ACPI: normalize segment descriptor register on resume + * x86 ACPI: fix resume from suspend to RAM on uniprocessor x86-64 + * softlockup: print a module list on being stuck + * ide: fix hwif->gendev refcounting + * ide: ide_unregister() warm-plug bugfix + * ide: ide_unregister() locking bugfix + * ahci: give another shot at clearing all bits in irq_stat + * Fix clear_refs_write() use of struct mm_walk + * Move _RET_IP_ and _THIS_IP_ to include/linux/kernel.h + * Fix pagemap_read() use of struct mm_walk + * Linux 2.6.26-rc9 + * Revert "USB: don't explicitly reenable root-hub status interrupts" + * Revert "PCI: Correct last two HP entries in the bfsort whitelist" + * iwlwifi: fix incorrect 5GHz rates reported in monitor mode + * iwlwifi: drop skb silently for Tx request in monitor mode + * libertas: support USB persistence on suspend/resume (resend) + * tcp: net/ipv4/tcp.c needs linux/scatterlist.h + * tcp: fix a size_t < 0 comparison in tcp_read_sock + * bridge: fix use-after-free in br_cleanup_bridges() + * Add missing skb->dev assignment in Frame Relay RX code + * forcedeth: fix lockdep warning on ethtool -s + * ehea: fix might sleep problem + * ehea: add MODULE_DEVICE_TABLE + * ehea: fix race condition + * ehea: Access iph->tot_len with correct endianness + * pasemi_mac: Access iph->tot_len with correct endianness + * ibm_newemac: Fixes kernel crashes when speed of cable connected changes + * ibm_newemac: Fixes entry of short packets + * fs_enet: restore promiscuous and multicast settings in restart() + * can: add sanity checks + * x86: KVM guest: Add memory clobber to hypercalls + * KVM: IOAPIC: Fix level-triggered irq injection hang + * [SCSI] erase invalid data returned by device + * pxamci: fix 
byte aligned DMA transfers + * vsprintf: split out '%s' handling logic + * vsprintf: split out '%p' handling logic + * vsprintf: add infrastructure support for extended '%p' specifiers + * vsprintf: add support for '%pS' and '%pF' pointer formats + * powerpc: Fix unterminated of_device_id array in legacy_serial.c + * [UML] fix gcc ICEs and unresolved externs + * ocfs2/dlm: Fixes oops in dlm_new_lockres() + * hostap_cs: correct poor NULL checks in suspend/resume routines + * drivers/net/wireless/iwlwifi/iwl-3945.c Fix type issue on 64bit + * mac80211: move netif_carrier_on to after + ieee80211_bss_info_change_notify + * mac80211: Only flush workqueue when last interface was removed + * zd1211rw: add ID for AirTies WUS-201 + * ssb-pcicore: Fix IRQ-vector init on embedded devices + * mac80211: don't report selected IBSS when not found + * crypto: tcrypt - Fix memory leak in test_cipher + * sctp: Mark the tsn as received after all allocations finish + * [S390] protect _PAGE_SPECIAL bit against mprotect + * irda: via-ircc proper dma freeing + * irda: New device ID for nsc-ircc + * irda: Fix netlink error path return value + * [SCSI] mptspi: fix oops in mptspi_dv_renegotiate_work() + * Correct hash flushing from huge_ptep_set_wrprotect() + * ide: add __ide_default_irq() inline helper + * palm_bk3710: fix IDECLK period calculation + * it8213: fix return value in it8213_init_one() + * [MIPS] Atlas, decstation: Fix section mismatches triggered by + defconfigs + * [MIPS] Fix 32bit kernels on R4k with 128 byte cache line size + * NFS: Fix readdir cache invalidation + * SUNRPC: Fix a double-free in rpcbind + * SUNRPC: Fix an rpcbind breakage for the case of IPv6 lookups + * reiserfs: discard prealloc in reiserfs_delete_inode + * Fix broken fix for fsl-diu-db + * RDMA/cxgb3: Fix regression caused by class_device -> device conversion + * ipv6: fix race between ipv6_del_addr and DAD timer + * sctp: Add documentation for sctp sysctl variable + * kernel/printk.c: Made printk_recursion_bug_msg static. + * powerpc: Add missing reference to coherent_dma_mask + * rc80211_pid: Fix fast_start parameter handling + * rt2x00: Disable synchronization during initialization + * zd1211rw: stop beacons on remove_interface + * libertas: fix memory alignment problems on the blackfin + * netfilter: nf_conntrack_tcp: fix endless loop + * netfilter: nf_nat_snmp_basic: fix a range check in NAT for SNMP + * md: ensure all blocks are uptodate or locked when syncing + * sched: fix cpu hotplug + * x86: fix /dev/mem compatibility under PAT + * crypto: chainiv - Invoke completion function + * ocfs2: Fix flags in ocfs2_file_lock + * kernel/kprobes.c: Made kprobe_blacklist static. + * arch/x86/kernel/.gitignore: Added vmlinux.lds to .gitignore file + because it shouldn't be tracked. 
+ * ftrace: Documentation + * Fix PREEMPT_RCU without HOTPLUG_CPU + * sched: fix cpu hotplug, cleanup + * exec: fix stack excutability without PT_GNU_STACK + * slub: Fix use-after-preempt of per-CPU data structure + * Documentation: clarify tcp_{r,w}mem sysctl docs + * ip: sysctl documentation cleanup + * tcp: correct kcalloc usage + * ipv4: fib_trie: Fix lookup error return + * netlabel: netlink_unicast calls kfree_skb on error path by itself + * ipv6: missed namespace context in ipv6_rthdr_rcv + * xfrm: Add a XFRM_STATE_AF_UNSPEC flag to xfrm_usersa_info + * tun: Persistent devices can get stuck in xoff state + * tpm: add Intel TPM TIS device HID + * rapidio: fix device reference counting + * Fix name of Russell King in various comments + * rtc: fix reported IRQ rate for when HPET is enabled + * libata-acpi: filter out DIPM enable + * Added Targa Visionary 1000 IDE adapter to pata_sis.c + * libata-acpi: don't call sleeping function from invalid context + * Fix reference counting race on log buffers + * [SCSI] ipr: Fix HDIO_GET_IDENTITY oops for SATA devices + * IPMI: return correct value from ipmi_write + * x86: fix ldt limit for 64 bit + * [SCSI] fusion: default MSI to disabled for SPI and FC controllers + * [SCSI] bsg: fix oops on remove + * drivers/char/pcmcia/ipwireless/hardware.c fix resource leak + * drivers/isdn/i4l/isdn_common.c fix small resource leak + * fbdev: bugfix for multiprocess defio + * serial8250: sanity check nr_uarts on all paths. + * ov7670: clean up ov7670_read semantics + * rtc-fm3130: fix chip naming + * rtc-pcf8563: add chip id + * OProfile kernel maintainership changes + * frv: fix irqs_disabled() to return an int, not an unsigned long + * cifs: fix inode leak in cifs_get_inode_info_unix + * cifs: fix wksidarr declaration to be big-endian friendly + * cpusets, hotplug, scheduler: fix scheduler domain breakage + * Documentation/HOWTO: correct wrong kernel bugzilla FAQ URL + * devcgroup: always show positive major/minor num + * devcgroup: fix permission check when adding entry to child cgroup + * Linux 2.6.26 + + -- Ben Collins Mon, 14 Jul 2008 13:41:50 -0400 + +linux (2.6.26-3.9) intrepid; urgency=low + + * abi: Add dca and ioatdma to modules.ignore + + [ 2.6.26-3.8 ] + + [ Ben Collins ] + + * ubuntu: Add heci driver 3.2.0.24 + * ubuntu: Add heci to kconfig/kbuild + * config: Enable heci module on all flavours + * dm-bbr: Update to get it to compile with 2.6.26 + * config: Enable dm-bbr + * ubuntu: Add some media drivers + * config: Enable misc media drivers + * udeb: Switch to uvesafb in fb-modules + * abi: Add more modules to ignore (known) + + [ 2.6.26-3.7 ] + + [Amit Kucheria] + + * SAUCE: make fc transport removal of target configurable + - LP: #163075 + * SAUCE: pm: Config option to disable handling of console during + suspend/resume + + [Ben Collins] + + * SAUCE: input/mouse/alps: Do not call psmouse_reset() for alps + * SAUCE: irda: Default to dongle type 9 on IBM hardware + * SAUCE: tulip: Let dmfe handle davicom on non-sparc + * SAUCE: tulip: Define ULI PCI ID's + * SAUCE: version: Implement version_signature proc file. 
+ * build: Cleanup arches + * build: Remove remnants of unused binary-custom infrastructure + * build: Remove disable_d_i (not needed) and cleanup ppa build stuff + * ubuntu: New modules, acer-acpi + * build: Remove -virtual, and rebuild configs + * ubuntu: Add drbd module + * acer-acpi: Fix makefile + * x86/Kconfig: Fix missing quote for ubuntu Kconfig source + * ubuntu: Add iscsitarget module + * ubuntu: Added Amiga FS driver + * ubuntu: Add squashfs driver + * ubuntu: Remove asfs (Amiga FS). Need to be in linux-ports instead + * squashfs: Move headers to real include directory + * build/configs: The Great Config Consistency Check of 2008 + * ubuntu: Move third-party includes to ubuntu/include + * ubuntu: Add aufs module + * ubuntu: Added atl2 driver + * ubuntu: Add dm-radi4-5 driver + * build: Add CONFIG_DEBUG_SECTION_MISMATCH=y to get old style warnings + from build + * ubuntu/Makefile: Fixup dm-raid4-5 and add kludge for kbuild + * squashfs: Fixes for VFS changes + * ubuntu/dm-raid4-5: Fixups for moved/renamed headers/functions in core + md + * ubuntu: Add ndiswrapper driver + * d-i: Update module listings + * build: Disable xd block device (ancient) + * ndiswrapper: Fixup makefile + * d-i: Remove efi-modules. The only module, efivars, is built-in + * build: Remove install-source, obsolete and caused build failure + * Ubuntu-2.6.26-1.3 + * build: linux-doc rules got broken when disabling html side. Fixed now. + * Ubuntu-2.6.26-1.4 + * x86: Update to -rc6 allows CONFIG_PCI_OLPC to work with PCI_GOANY + * d-i: Make virtio-ring optional (it's built-in on i386) + * Ubuntu-2.6.26-1.4 + * Ubuntu-2.6.26-1.5 + * config: Enable DVB devices + * ubuntu/aufs: Make aufs a bool config, since it needs to be built-in + * config: Build aufs into the kernels + * build: Fix arguments passed to link-headers script + * config: Disable early printk + * d-i: Move isofs to storage-core and kill st (scsi tape) from list + * config: Enable non-promiscuous access to /dev/mem + * x86: Add option to disable decompression info messages + * config: Enable no-bz-chatter config options + * build: Re-add linux-source package + * d-i: Re-add socket-modules. Accidentally removed + - LP: #241295 + * Ubuntu-2.6.26-2.6 + * Use makedumpfile to generate a vmcoreinfo file. + * build: Build-Depend on makedumpfile for vmcoreinfo generation + * build: Remove debug print from git-ubuntu-log + * Updated configs for -rc7 + * build: postinst, do not call depmod with -F + * config: Enable rtc-cmos as a built-in driver. + * control: Provide ndiswrapper-modules-1.9 + * build: Generate vmcoreinfo in image build for crashdumps without debug + image + * config: Disable vesafb, since we'll prefer uvesafb + * build: Copy uvesafb module to initrd mod directory + * abi-check: New, more robust script + * config: Enable heap randomization by default + * abi-check: Cleanup output and call with perl (not $SHELL) + * abi: Ignore missing vesafb (known) + * config: Disable pcspkr (in favor of snd-pcsp) + * swap: Add notify_swap_entry_free callback for compcache + * compcache: Added ram backed compressed swap module + * ubuntu: Enable kbuild and kconfig for compcache + * config: Enable compcache and tlsf allocator as modules + * config: Updated for -rc8. 
Disables XEN on i386 + * config: Switch i386-server to 64G, enable PAE, 64-bit res, and XEN + * ubuntu: Add misc drivers from hardy lum + * ubuntu: Enable build of misc/ subdir + * config: Enable misc drivers + * aufs: Fix warning about single non-string-literal arg to printf style + function + * drivers: Remove some duplicate device entries in various modules + * config: Disable some duplicate drivers + * keyspan: Remove duplicate device ID's + * check-aliases: Cleanup output, and fix rolling checks + * ubuntu: Disable dm-bbr for now + * dm-bbr: First cut at forward portiong. Still needs work. + * ubuntu: Disable dm-bbr in kbuild/kconfig + + [Chuck Short] + + * SAUCE: ata: blacklist FUJITSU MHW2160BH PL + - LP: #175834 + * SAUCE: [USB]: add ASUS LCM to the blacklist + + [Colin Ian King] + + * SAUCE: airprime.c supports more devices + - LP: #208250 + * SAUCE: Enable speedstep for sonoma processors. + - LP: #132271 + * Add dm-loop + * Add dm-loop BOM + + [Kyle McMartin] + + * SAUCE: fix orinoco_cs oops + + [Mario Limonciello] + + * SAUCE: Enable Reset and SCO workaround on Dell 410 BT adapter + + [Matthew Garrett] + + * SAUCE: hostap: send events on data interface as well as master + interface + + [Phillip Lougher] + + * SAUCE: r8169: disable TSO by default for RTL8111/8168B chipsets. + + [Stefan Bader] + + * SAUCE: Export dm_disk function of device-mapper + * SAUCE: Restore VT fonts on switch + * SAUCE: Always use SCO protocol (disable eSCO support) Bug: #39414 + * SAUCE: mmc: Increase power_up deleay to fix TI readers OriginalAuthor: + Pascal Terjan Bug: #137686 + * SAUCE: Add blacklist support to fix Belkin bluetooth dongle. Bug: + #140511 + * SAUCE: Lower warning level of pci resource allocation messages. Bug: + 159241 + * SAUCE: Lower message level for PCI memory and I/O allocation. + - LP: #159241 + * Modify log generation to catch bug numbers when adding with git-am. + + [Tim Gardner] + + * Added the debian directory. Ignore: yes + * Add support for UBUNTUINCLUDE Ignore: yes + * LUM headers go in /usr/src Ignore: yes + * First pass at 2.6.25 configs Ignore: yes + * i386 -generic builds. Ignore: yes + * SAUCE: Increase CONFIG_IDE_MAX_HWIFS to 8 (from 4) + * SAUCE: Add extra headers to linux-libc-dev OriginalAuthor: Soren Hansen + OriginalLocation: + https://lists.ubuntu.com/archives/kernel-team/2007-November/001891.html + * Set CONFIG_DEVKMEM=n Ignore: yes + * Enabled ALSA and CGROUPS for i386 Ignore: yes + * Enabled amd64 configs. Ignore: yes + * CONFIG_STANDALONE=n Ignore: yes + * CONFIG_BLK_DEV_4DRIVES=n for i386 Ignore: yes + * CONFIG: CONFIG_DEFAULT_RELATIME=y for all flavours. Ignore: yes + * Set CONFIG_EDD_OFF=y Ignore: yes + * SAUCE: Blacklist Bluetooth Dell Wireless 370 for SCO MTU + OriginalAuthor: Mario Limonciello Bug: + #209715 + * SAUCE: Catch nonsense keycodes and silently ignore + * SAUCE: frame buffer regression - screen blank except for blinking + cursor after fbcon vtswitch OriginalAuthor: Matthew Garrett + Bug: #201591 + * SAUCE: Added support for HDAPS on various ThinkPads from Lenovo and IBM + OriginalAuthor: Klaus S. Madsen + OriginalAuthor: Chuck Short + * SAUCE: Guest OS does not recognize a lun with non zero target id on + Vmware ESX Server + * SAUCE: orinoco_cs.ko missing + * Set CONFIG_FB_VESA=m for i386/amd64 Ignore: yes + * Set CONFIG_PM_DISABLE_CONSOLE=y for all flavours Ignore: yes + * Thorough review of amd64 -generic config Ignore: yes + * Build PPA packages for Hardy until the Intrepid archive is opened. 
+ * Deleted obsolete flavours Ignore: yes + * Don't build docs for PPA Ignore: yes + * Build all standard packages in PPA. Ignore: yes + * Remove duplicate USB ids + * SAUCE: DVB-USB UMT-010 driver oops on install Bug: #115284 + * Update configs after rebase to 2.6.26-rc1 Ignore: yes + * Update configs after rebase Ignore: yes + * Disable V4L until the build issues get ironed out. Ignore: yes + * Update configs after rebase. Ignore: yes + * Another device enable pass Ignore: yes + * Update configs after merge. Ignore: yes + * SAUCE: fn key doesn't work in hardy with macbook pro fourth generation + (4,1) + - LP: #207127 + * Enabled CONFIG_CIFS_DFS_UPCALL=y and CONFIG_CIFS_UPCALL=y + - LP: #236830 + + [Upstream Kernel Changes] + + * Revert "[WATCHDOG] hpwdt: Add CFLAGS to get driver working" + * mac80211: detect driver tx bugs + * hwmon: (lm85) Fix function RANGE_TO_REG() + * hwmon: (adt7473) Initialize max_duty_at_overheat before use + * hwmon: Update the sysfs interface documentation + * hwmon: (abituguru3) Identify Abit AW8D board as such + * hwmon: (w83791d) new maintainer + * hwmon: (abituguru3) update driver detection + * hwmon: (lm75) sensor reading bugfix + * ipv6: Remove options header when setsockopt's optlen is 0 + * ipv6: Drop packets for loopback address from outside of the box. + * sched: rt: dont stop the period timer when there are tasks wanting to + run + * sched: fix wait_for_completion_timeout() spurious failure under heavy + load + * x86: fix NULL pointer deref in __switch_to + * xen: Use wmb instead of rmb in xen_evtchn_do_upcall(). + * xen: mask unwanted pte bits in __supported_pte_mask + * xen: don't drop NX bit + * sched: refactor wait_for_completion_timeout() + * Ext4: Fix online resize block group descriptor corruption + * [IA64] SN2: security hole in sn2_ptc_proc_write + * alpha: fix module load failures on smp (bug #10926) + * alpha: link failure fix + * alpha: fix compile failures with gcc-4.3 (bug #10438) + * alpha: resurrect Cypress IDE quirk + * pppoe: warning fix + * sctp: Make sure N * sizeof(union sctp_addr) does not overflow. + * netns: Don't receive new packets in a dead network namespace. + * Add return value to reserve_bootmem_node() + * Slab: Fix memory leak in fallback_alloc() + * Fix performance regression on lmbench select benchmark + * ALSA: aw2 - Fix Oops at initialization + * ALSA: sb - Fix wrong assertions + * futexes: fix fault handling in futex_lock_pi + * IB/mthca: Clear ICM pages before handing to FW + * tty_driver: Update required method documentation + * removed unused var real_tty on n_tty_ioctl() + * Fix ZERO_PAGE breakage with vmware + * mm: fix race in COW logic + * NFS: Reduce the NFS mount code stack usage. 
+ * NFS: Fix filehandle size comparisons in the mount code + * NFS: nfs_updatepage(): don't mark page as dirty if an error occurred + * alpha: fix compile error in arch/alpha/mm/init.c + * KVM: Fix race between timer migration and vcpu migration + * KVM: close timer injection race window in __vcpu_run + * KVM: MMU: Fix rmap_write_protect() hugepage iteration bug + * KVM: MMU: large page update_pte issue with non-PAE 32-bit guests + (resend) + * KVM: MMU: Fix oops on guest userspace access to guest pagetable + * KVM: ioapic: fix lost interrupt when changing a device's irq + * KVM: VMX: Fix host msr corruption with preemption enabled + * [GFS2] BUG: unable to handle kernel paging request at ffff81002690e000 + * xen: remove support for non-PAE 32-bit + * kgdb: documentation update - remove kgdboe + * kgdb: sparse fix + * [IA64] Fix boot failure on ia64/sn2 + * [IA64] Handle count==0 in sn2_ptc_proc_write() + * [IA64] Eliminate NULL test after alloc_bootmem in iosapic_alloc_rte() + * [GFS2] fix gfs2 block allocation (cleaned up) + * x86: Add structs and functions for paravirt clocksource + * x86: Make xen use the paravirt clocksource structs and functions + * KVM: Make kvm host use the paravirt clocksource structs + * x86: KVM guest: Use the paravirt clocksource structs and functions + * KVM: Remove now unused structs from kvm_para.h + * enable bus mastering on i915 at resume time + * Linux 2.6.26-rc8 + * # Ubuntu external driver commit. + * # Ubuntu commit template. + + -- Ben Collins Sat, 21 Jun 2008 09:05:15 -0400 + +linux (2.6.26-2.6) intrepid; urgency=low + + [Ben Collins] + + * Revert "SAUCE: Export symbols for aufs (in lum) (not needed) + * config: Enable DVB devices + * ubuntu/aufs: Make aufs a bool config, since it needs to be built-in + * config: Build aufs into the kernels + * build: Fix arguments passed to link-headers script + * config: Disable early printk + * d-i: Move isofs to storage-core and kill st (scsi tape) from list + * config: Enable non-promiscuous access to /dev/mem + * x86: Add option to disable decompression info messages + * config: Enable no-bz-chatter config options + * build: Re-add linux-source package + * d-i: Re-add socket-modules. Accidentally removed + - LP: #241295 + + [Colin Ian King] + + * Add dm-loop + + [Tim Gardner] + + * Revert "SAUCE: USB bluetooth device 0x0e5e:0x6622 floods errors to + syslog (merged upstream) + + -- Ben Collins Mon, 16 Jun 2008 10:56:01 -0400 + +linux (2.6.26-1.5) intrepid; urgency=low + + * d-i: Make virtio-ring optional (it's built-in on i386) + * Rebased on 2.6.26-rc6 + + [Ubuntu-2.6.26-1.4 Changes below] + + * build: linux-doc rules got broken when disabling html side. Fixed now. + + [Ubuntu-2.6.26-1.3 Changes below] + + * build: Remove install-source, obsolete and caused build failure + + [Ubuntu-2.6.26-1.2 Changes below] + + * Remove efi-modules from d-i module list (efivars is built-in). Caused a + build failure. + * Patch to arch/x86/xen/time.c to remove __divdi3 usage (build failure on + i386). 
+ + [Ubuntu-2.6.26-1.1 Changes below] + + [Amit Kucheria] + + * SAUCE: make fc transport removal of target configurable + * SAUCE: Add AGP support for Radeon Mobility 9000 chipset + * SAUCE: pm: Config option to disable handling of console during + suspend/resume + + [Ben Collins] + + * SAUCE: input/mouse/alps: Do not call psmouse_reset() for alps + * SAUCE: irda: Default to dongle type 9 on IBM hardware + * SAUCE: tulip: Let dmfe handle davicom on non-sparc + * SAUCE: tulip: Define ULI PCI ID's + * SAUCE: version: Implement version_signature proc file. + * build: Remove remnants of unused binary-custom infrastructure + * mmc_block: Fix bad allocation on 64-bit (zero len array) + * ubuntu: New modules, acer-acpi + * build: Remove -virtual, and rebuild configs + * ubuntu: Add drbd module + * ubuntu: Add iscsitarget module + * ubuntu: Add squashfs driver + * build/configs: The Great Config Consistency Check of 2008 + * ubuntu: Add aufs module + * ubuntu: Added atl2 driver + * ubuntu: Add dm-radi4-5 driver + * build: Add CONFIG_DEBUG_SECTION_MISMATCH=y to get old style warnings + from build + * squashfs: Fixes for VFS changes + * ubuntu/dm-raid4-5: Fixups for moved/renamed headers/functions in core + md + * ubuntu: Add ndiswrapper driver + * d-i: Update module listings + + [Chuck Short] + + * SAUCE: ata: blacklist FUJITSU MHW2160BH PL + * SAUCE: [USB]: add ASUS LCM to the blacklist + + [Colin Ian King] + + * SAUCE: Enable speedstep for sonoma processors. + * SAUCE: airprime.c supports more devices + + [Kyle McMartin] + + * SAUCE: fix orinoco_cs oops + + [Mario Limonciello] + + * SAUCE: Enable Reset and SCO workaround on Dell 410 BT adapter + + [Matthew Garrett] + + * SAUCE: hostap: send events on data interface as well as master + interface + + [Phillip Lougher] + + * SAUCE: r8169: disable TSO by default for RTL8111/8168B chipsets. + + [Stefan Bader] + + * SAUCE: Export dm_disk function of device-mapper + * SAUCE: Restore VT fonts on switch + * SAUCE: Always use SCO protocol (disable eSCO support) Bug: #39414 + * SAUCE: mmc: Increase power_up deleay to fix TI readers + * SAUCE: Add blacklist support to fix Belkin bluetooth dongle. + * SAUCE: Lower warning level of pci resource allocation messages. + * SAUCE: Lower message level for PCI memory and I/O allocation. + - LP: #159241 + * Modify log generation to catch bug numbers when adding with git-am. + + [Tim Gardner] + + * SAUCE: hdaps module does not load on Thinkpad T61P + * SAUCE: Add extra headers to linux-libc-dev + * SAUCE: Export symbols for aufs (in lum). 
+ * SAUCE: USB bluetooth device 0x0e5e:0x6622 floods errors to syslog + * SAUCE: Blacklist Bluetooth Dell Wireless 370 for SCO MTU + * SAUCE: Catch nonsense keycodes and silently ignore + * SAUCE: frame buffer regression - screen blank except for blinking + cursor after fbcon vtswitch + * SAUCE: Added support for HDAPS on various ThinkPads from Lenovo and IBM + * SAUCE: Guest OS does not recognize a lun with non zero target id on + Vmware ESX Server + * SAUCE: Modualrize vesafb + * SAUCE: DVB-USB UMT-010 driver oops on install + * SAUCE: fn key doesn't work in hardy with macbook pro fourth generation + (4,1) + - LP: #207127 + + -- Ben Collins Wed, 11 Jun 2008 05:28:35 -0400 --- linux-rt-2.6.29.5.orig/debian/control +++ linux-rt-2.6.29.5/debian/control @@ -0,0 +1,72 @@ +Source: linux-rt +Section: devel +Priority: optional +Maintainer: Alessio Igor Bogani +Standards-Version: 3.6.1 +Build-Depends: debhelper (>= 3), module-init-tools, kernel-wedge (>= 2.24ubuntu1), makedumpfile [!armel], quilt +Build-Depends-Indep: xmlto, docbook-utils, gs, transfig, bzip2, sharutils + +Package: linux-rt-headers-2.6.29.5-1 +Architecture: all +Section: devel +Priority: optional +Depends: coreutils | fileutils (>= 4.0) +Provides: linux-rt-headers, linux-rt-headers-2.6 +Description: Header files related to Linux kernel version 2.6.29.5 + This package provides kernel header files for version 2.6.29.5, for sites + that want the latest kernel headers. Please read + /usr/share/doc/linux-headers-2.6.29.5-1/debian.README.gz for details + +Package: linux-image-2.6.29.5-1-rt +Architecture: i386 amd64 +Section: base +Priority: optional +Pre-Depends: dpkg (>= 1.10.24) +Provides: linux-image, linux-image-2.6, fuse-module, kvm-api-4, redhat-cluster-modules, ivtv-modules, ndiswrapper-modules-1.9 +Depends: initramfs-tools (>= 0.36ubuntu6), coreutils | fileutils (>= 4.0), module-init-tools (>= 3.3-pre11-4ubuntu3) +Conflicts: hotplug (<< 0.0.20040105-1) +Recommends: grub | lilo (>= 19.1) +Suggests: fdutils, linux-doc-2.6.29.5 | linux-source-2.6.29.5 +Description: Linux kernel image for version 2.6.29.5 on Ingo Molnar's full real time preemption patch (2.6.28-rt) + This package contains the Linux kernel image for version 2.6.29.5 on + Ingo Molnar's full real time preemption patch (2.6.28-rt). + . + Also includes the corresponding System.map file, the modules built by the + packager, and scripts that try to ensure that the system is not left in an + unbootable state after an update. + . + Supports Generic processors. + . + Geared toward real time systems. + . + You likely do not want to install this package directly. Instead, install + the linux-rt meta-package, which will ensure that upgrades work + correctly, and that supporting packages are also installed. + +Package: linux-headers-2.6.29.5-1-rt +Architecture: i386 amd64 +Section: devel +Priority: optional +Depends: coreutils | fileutils (>= 4.0), linux-rt-headers-2.6.29.5-1, ${shlibs:Depends} +Provides: linux-headers, linux-headers-2.6 +Description: Linux kernel headers for version 2.6.29.5 on Ingo Molnar's full real time preemption patch (2.6.28-rt) + This package provides kernel header files for version 2.6.29.5 on + Ingo Molnar's full real time preemption patch (2.6.28-rt). + . + This is for sites that want the latest kernel headers. Please read + /usr/share/doc/linux-headers-2.6.29.5-1/debian.README.gz for details. 
+ +Package: linux-image-debug-2.6.29.5-1-rt +Architecture: i386 amd64 +Section: devel +Priority: optional +Provides: linux-debug +Description: Linux kernel debug image for version 2.6.29.5 on Ingo Molnar's full real time preemption patch (2.6.28-rt) + This package provides a kernel debug image for version 2.6.29.5 on + Ingo Molnar's full real time preemption patch (2.6.28-rt). + . + This is for sites that wish to debug the kernel. + . + The kernel image contained in this package is NOT meant to boot from. It + is uncompressed, and unstripped. This package also includes the + unstripped modules. --- linux-rt-2.6.29.5.orig/debian/patches/series +++ linux-rt-2.6.29.5/debian/patches/series @@ -0,0 +1 @@ +2.6.29.5-rt22 --- linux-rt-2.6.29.5.orig/debian/patches/2.6.29.5-rt22 +++ linux-rt-2.6.29.5/debian/patches/2.6.29.5-rt22 @@ -0,0 +1,174663 @@ +Index: linux-2.6-tip/Documentation/ABI/testing/debugfs-kmemtrace +=================================================================== +--- /dev/null ++++ linux-2.6-tip/Documentation/ABI/testing/debugfs-kmemtrace +@@ -0,0 +1,71 @@ ++What: /sys/kernel/debug/kmemtrace/ ++Date: July 2008 ++Contact: Eduard - Gabriel Munteanu ++Description: ++ ++In kmemtrace-enabled kernels, the following files are created: ++ ++/sys/kernel/debug/kmemtrace/ ++ cpu (0400) Per-CPU tracing data, see below. (binary) ++ total_overruns (0400) Total number of bytes which were dropped from ++ cpu files because of full buffer condition, ++ non-binary. (text) ++ abi_version (0400) Kernel's kmemtrace ABI version. (text) ++ ++Each per-CPU file should be read according to the relay interface. That is, ++the reader should set affinity to that specific CPU and, as currently done by ++the userspace application (though there are other methods), use poll() with ++an infinite timeout before every read(). Otherwise, erroneous data may be ++read. The binary data has the following _core_ format: ++ ++ Event ID (1 byte) Unsigned integer, one of: ++ 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC) ++ 1 - represents a freeing of previously allocated memory ++ (KMEMTRACE_EVENT_FREE) ++ Type ID (1 byte) Unsigned integer, one of: ++ 0 - this is a kmalloc() / kfree() ++ 1 - this is a kmem_cache_alloc() / kmem_cache_free() ++ 2 - this is a __get_free_pages() et al. ++ Event size (2 bytes) Unsigned integer representing the ++ size of this event. Used to extend ++ kmemtrace. Discard the bytes you ++ don't know about. ++ Sequence number (4 bytes) Signed integer used to reorder data ++ logged on SMP machines. Wraparound ++ must be taken into account, although ++ it is unlikely. ++ Caller address (8 bytes) Return address to the caller. ++ Pointer to mem (8 bytes) Pointer to target memory area. Can be ++ NULL, but not all such calls might be ++ recorded. ++ ++In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow: ++ ++ Requested bytes (8 bytes) Total number of requested bytes, ++ unsigned, must not be zero. ++ Allocated bytes (8 bytes) Total number of actually allocated ++ bytes, unsigned, must not be lower ++ than requested bytes. ++ Requested flags (4 bytes) GFP flags supplied by the caller. ++ Target CPU (4 bytes) Signed integer, valid for event id 1. ++ If equal to -1, target CPU is the same ++ as origin CPU, but the reverse might ++ not be true. ++ ++The data is made available in the same endianness the machine has. ++ ++Other event ids and type ids may be defined and added. Other fields may be ++added by increasing event size, but see below for details. 
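++
++As a purely illustrative sketch, a userspace reader could decode the core
++record above roughly as follows. The struct and the read_one_event() helper
++are hypothetical reconstructions of the table (the kmemtrace-user tool listed
++below has its own format definitions) and assume the packed, native-endian
++layout described:
++
++	#include <stdint.h>
++	#include <unistd.h>
++
++	struct kmemtrace_core {		/* hypothetical, mirrors the table above */
++		uint8_t  event_id;	/* 0 = alloc, 1 = free */
++		uint8_t  type_id;	/* 0 = kmalloc, 1 = kmem_cache, 2 = pages */
++		uint16_t event_size;	/* total size of this event */
++		int32_t  seq;		/* sequence number, may wrap */
++		uint64_t call_site;	/* return address of the caller */
++		uint64_t ptr;		/* pointer to the memory area */
++	} __attribute__((packed));
++
++	/* Read one event from an already-open cpuN file (after poll() said it
++	 * is readable) and discard any trailing payload this reader does not
++	 * understand: ALLOC fields, future features. */
++	static int read_one_event(int fd, struct kmemtrace_core *ev)
++	{
++		char scratch[64];
++		size_t left;
++
++		if (read(fd, ev, sizeof(*ev)) != (ssize_t)sizeof(*ev))
++			return -1;
++		left = ev->event_size > sizeof(*ev) ?
++				ev->event_size - sizeof(*ev) : 0;
++		while (left > 0) {
++			ssize_t n = read(fd, scratch,
++					left < sizeof(scratch) ? left : sizeof(scratch));
++			if (n <= 0)
++				return -1;
++			left -= (size_t)n;
++		}
++		return 0;
++	}
++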
++Every modification to the ABI, including new id definitions, are followed ++by bumping the ABI version by one. ++ ++Adding new data to the packet (features) is done at the end of the mandatory ++data: ++ Feature size (2 byte) ++ Feature ID (1 byte) ++ Feature data (Feature size - 3 bytes) ++ ++ ++Users: ++ kmemtrace-user - git://repo.or.cz/kmemtrace-user.git ++ +Index: linux-2.6-tip/Documentation/DMA-API.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/DMA-API.txt ++++ linux-2.6-tip/Documentation/DMA-API.txt +@@ -609,3 +609,109 @@ size is the size (and should be a page-s + The return value will be either a pointer to the processor virtual + address of the memory, or an error (via PTR_ERR()) if any part of the + region is occupied. ++ ++Part III - Debug drivers use of the DMA-API ++------------------------------------------- ++ ++The DMA-API as described above as some constraints. DMA addresses must be ++released with the corresponding function with the same size for example. With ++the advent of hardware IOMMUs it becomes more and more important that drivers ++do not violate those constraints. In the worst case such a violation can ++result in data corruption up to destroyed filesystems. ++ ++To debug drivers and find bugs in the usage of the DMA-API checking code can ++be compiled into the kernel which will tell the developer about those ++violations. If your architecture supports it you can select the "Enable ++debugging of DMA-API usage" option in your kernel configuration. Enabling this ++option has a performance impact. Do not enable it in production kernels. ++ ++If you boot the resulting kernel will contain code which does some bookkeeping ++about what DMA memory was allocated for which device. If this code detects an ++error it prints a warning message with some details into your kernel log. An ++example warning message may look like this: ++ ++------------[ cut here ]------------ ++WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448 ++ check_unmap+0x203/0x490() ++Hardware name: ++forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong ++ function [device address=0x00000000640444be] [size=66 bytes] [mapped as ++single] [unmapped as page] ++Modules linked in: nfsd exportfs bridge stp llc r8169 ++Pid: 0, comm: swapper Tainted: G W 2.6.28-dmatest-09289-g8bb99c0 #1 ++Call Trace: ++ [] warn_slowpath+0xf2/0x130 ++ [] _spin_unlock+0x10/0x30 ++ [] usb_hcd_link_urb_to_ep+0x75/0xc0 ++ [] _spin_unlock_irqrestore+0x12/0x40 ++ [] ohci_urb_enqueue+0x19f/0x7c0 ++ [] queue_work+0x56/0x60 ++ [] enqueue_task_fair+0x20/0x50 ++ [] usb_hcd_submit_urb+0x379/0xbc0 ++ [] cpumask_next_and+0x23/0x40 ++ [] find_busiest_group+0x207/0x8a0 ++ [] _spin_lock_irqsave+0x1f/0x50 ++ [] check_unmap+0x203/0x490 ++ [] debug_dma_unmap_page+0x49/0x50 ++ [] nv_tx_done_optimized+0xc6/0x2c0 ++ [] nv_nic_irq_optimized+0x73/0x2b0 ++ [] handle_IRQ_event+0x34/0x70 ++ [] handle_edge_irq+0xc9/0x150 ++ [] do_IRQ+0xcb/0x1c0 ++ [] ret_from_intr+0x0/0xa ++ <4>---[ end trace f6435a98e2a38c0e ]--- ++ ++The driver developer can find the driver and the device including a stacktrace ++of the DMA-API call which caused this warning. ++ ++Per default only the first error will result in a warning message. All other ++errors will only silently counted. This limitation exist to prevent the code ++from flooding your kernel log. To support debugging a device driver this can ++be disabled via debugfs. 
See the debugfs interface documentation below for ++details. ++ ++The debugfs directory for the DMA-API debugging code is called dma-api/. In ++this directory the following files can currently be found: ++ ++ dma-api/all_errors This file contains a numeric value. If this ++ value is not equal to zero the debugging code ++ will print a warning for every error it finds ++ into the kernel log. Be carefull with this ++ option. It can easily flood your logs. ++ ++ dma-api/disabled This read-only file contains the character 'Y' ++ if the debugging code is disabled. This can ++ happen when it runs out of memory or if it was ++ disabled at boot time ++ ++ dma-api/error_count This file is read-only and shows the total ++ numbers of errors found. ++ ++ dma-api/num_errors The number in this file shows how many ++ warnings will be printed to the kernel log ++ before it stops. This number is initialized to ++ one at system boot and be set by writing into ++ this file ++ ++ dma-api/min_free_entries ++ This read-only file can be read to get the ++ minimum number of free dma_debug_entries the ++ allocator has ever seen. If this value goes ++ down to zero the code will disable itself ++ because it is not longer reliable. ++ ++ dma-api/num_free_entries ++ The current number of free dma_debug_entries ++ in the allocator. ++ ++If you have this code compiled into your kernel it will be enabled by default. ++If you want to boot without the bookkeeping anyway you can provide ++'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. ++Notice that you can not enable it again at runtime. You have to reboot to do ++so. ++ ++When the code disables itself at runtime this is most likely because it ran ++out of dma_debug_entries. These entries are preallocated at boot. The number ++of preallocated entries is defined per architecture. If it is too low for you ++boot with 'dma_debug_entries=' to overwrite the ++architectural default. +Index: linux-2.6-tip/Documentation/DocBook/genericirq.tmpl +=================================================================== +--- linux-2.6-tip.orig/Documentation/DocBook/genericirq.tmpl ++++ linux-2.6-tip/Documentation/DocBook/genericirq.tmpl +@@ -440,6 +440,7 @@ desc->chip->end(); + used in the generic IRQ layer. + + !Iinclude/linux/irq.h ++!Iinclude/linux/interrupt.h + + + +Index: linux-2.6-tip/Documentation/cputopology.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/cputopology.txt ++++ linux-2.6-tip/Documentation/cputopology.txt +@@ -18,11 +18,11 @@ For an architecture to support this feat + these macros in include/asm-XXX/topology.h: + #define topology_physical_package_id(cpu) + #define topology_core_id(cpu) +-#define topology_thread_siblings(cpu) +-#define topology_core_siblings(cpu) ++#define topology_thread_cpumask(cpu) ++#define topology_core_cpumask(cpu) + + The type of **_id is int. +-The type of siblings is cpumask_t. ++The type of siblings is (const) struct cpumask *. 
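++
++Schematically, a caller now iterates the returned mask instead of
++copying a cpumask_t by value, using the for_each_cpu() iterator; for
++example (an illustrative fragment, the function name is made up):
++
++  #include <linux/kernel.h>
++  #include <linux/cpumask.h>
++  #include <linux/topology.h>
++
++  static void print_core_siblings(int cpu)
++  {
++          int sibling;
++
++          /* topology_core_cpumask() returns a const struct cpumask * */
++          for_each_cpu(sibling, topology_core_cpumask(cpu))
++                  printk(KERN_INFO "CPU%d shares a package with CPU%d\n",
++                         cpu, sibling);
++  }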
+ + To be consistent on all architectures, include/linux/topology.h + provides default definitions for any of the above macros that are +Index: linux-2.6-tip/Documentation/feature-removal-schedule.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/feature-removal-schedule.txt ++++ linux-2.6-tip/Documentation/feature-removal-schedule.txt +@@ -344,3 +344,20 @@ Why: See commits 129f8ae9b1b5be94517da76 + Removal is subject to fixing any remaining bugs in ACPI which may + cause the thermal throttling not to happen at the right time. + Who: Dave Jones , Matthew Garrett ++ ++----------------------------- ++ ++What: __do_IRQ all in one fits nothing interrupt handler ++When: 2.6.32 ++Why: __do_IRQ was kept for easy migration to the type flow handlers. ++ More than two years of migration time is enough. ++Who: Thomas Gleixner ++ ++----------------------------- ++ ++What: obsolete generic irq defines and typedefs ++When: 2.6.30 ++Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) ++ have been kept around for migration reasons. After more than two years ++ it's time to remove them finally ++Who: Thomas Gleixner +Index: linux-2.6-tip/Documentation/ftrace.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/ftrace.txt ++++ linux-2.6-tip/Documentation/ftrace.txt +@@ -15,31 +15,31 @@ Introduction + + Ftrace is an internal tracer designed to help out developers and + designers of systems to find what is going on inside the kernel. +-It can be used for debugging or analyzing latencies and performance +-issues that take place outside of user-space. ++It can be used for debugging or analyzing latencies and ++performance issues that take place outside of user-space. + + Although ftrace is the function tracer, it also includes an +-infrastructure that allows for other types of tracing. Some of the +-tracers that are currently in ftrace include a tracer to trace +-context switches, the time it takes for a high priority task to +-run after it was woken up, the time interrupts are disabled, and +-more (ftrace allows for tracer plugins, which means that the list of +-tracers can always grow). ++infrastructure that allows for other types of tracing. Some of ++the tracers that are currently in ftrace include a tracer to ++trace context switches, the time it takes for a high priority ++task to run after it was woken up, the time interrupts are ++disabled, and more (ftrace allows for tracer plugins, which ++means that the list of tracers can always grow). + + + The File System + --------------- + +-Ftrace uses the debugfs file system to hold the control files as well +-as the files to display output. ++Ftrace uses the debugfs file system to hold the control files as ++well as the files to display output. + + To mount the debugfs system: + + # mkdir /debug + # mount -t debugfs nodev /debug + +-(Note: it is more common to mount at /sys/kernel/debug, but for simplicity +- this document will use /debug) ++( Note: it is more common to mount at /sys/kernel/debug, but for ++ simplicity this document will use /debug) + + That's it! (assuming that you have ftrace configured into your kernel) + +@@ -50,90 +50,124 @@ of ftrace. Here is a list of some of the + + Note: all time values are in microseconds. + +- current_tracer: This is used to set or display the current tracer +- that is configured. 
++ current_tracer: + +- available_tracers: This holds the different types of tracers that +- have been compiled into the kernel. The tracers +- listed here can be configured by echoing their name +- into current_tracer. +- +- tracing_enabled: This sets or displays whether the current_tracer +- is activated and tracing or not. Echo 0 into this +- file to disable the tracer or 1 to enable it. +- +- trace: This file holds the output of the trace in a human readable +- format (described below). +- +- latency_trace: This file shows the same trace but the information +- is organized more to display possible latencies +- in the system (described below). +- +- trace_pipe: The output is the same as the "trace" file but this +- file is meant to be streamed with live tracing. +- Reads from this file will block until new data +- is retrieved. Unlike the "trace" and "latency_trace" +- files, this file is a consumer. This means reading +- from this file causes sequential reads to display +- more current data. Once data is read from this +- file, it is consumed, and will not be read +- again with a sequential read. The "trace" and +- "latency_trace" files are static, and if the +- tracer is not adding more data, they will display +- the same information every time they are read. +- +- trace_options: This file lets the user control the amount of data +- that is displayed in one of the above output +- files. +- +- trace_max_latency: Some of the tracers record the max latency. +- For example, the time interrupts are disabled. +- This time is saved in this file. The max trace +- will also be stored, and displayed by either +- "trace" or "latency_trace". A new max trace will +- only be recorded if the latency is greater than +- the value in this file. (in microseconds) +- +- buffer_size_kb: This sets or displays the number of kilobytes each CPU +- buffer can hold. The tracer buffers are the same size +- for each CPU. The displayed number is the size of the +- CPU buffer and not total size of all buffers. The +- trace buffers are allocated in pages (blocks of memory +- that the kernel uses for allocation, usually 4 KB in size). +- If the last page allocated has room for more bytes +- than requested, the rest of the page will be used, +- making the actual allocation bigger than requested. +- (Note, the size may not be a multiple of the page size due +- to buffer managment overhead.) +- +- This can only be updated when the current_tracer +- is set to "nop". +- +- tracing_cpumask: This is a mask that lets the user only trace +- on specified CPUS. The format is a hex string +- representing the CPUS. +- +- set_ftrace_filter: When dynamic ftrace is configured in (see the +- section below "dynamic ftrace"), the code is dynamically +- modified (code text rewrite) to disable calling of the +- function profiler (mcount). This lets tracing be configured +- in with practically no overhead in performance. This also +- has a side effect of enabling or disabling specific functions +- to be traced. Echoing names of functions into this file +- will limit the trace to only those functions. +- +- set_ftrace_notrace: This has an effect opposite to that of +- set_ftrace_filter. Any function that is added here will not +- be traced. If a function exists in both set_ftrace_filter +- and set_ftrace_notrace, the function will _not_ be traced. +- +- set_ftrace_pid: Have the function tracer only trace a single thread. +- +- available_filter_functions: This lists the functions that ftrace +- has processed and can trace. 
These are the function +- names that you can pass to "set_ftrace_filter" or +- "set_ftrace_notrace". (See the section "dynamic ftrace" +- below for more details.) ++ This is used to set or display the current tracer ++ that is configured. ++ ++ available_tracers: ++ ++ This holds the different types of tracers that ++ have been compiled into the kernel. The ++ tracers listed here can be configured by ++ echoing their name into current_tracer. ++ ++ tracing_enabled: ++ ++ This sets or displays whether the current_tracer ++ is activated and tracing or not. Echo 0 into this ++ file to disable the tracer or 1 to enable it. ++ ++ trace: ++ ++ This file holds the output of the trace in a human ++ readable format (described below). ++ ++ latency_trace: ++ ++ This file shows the same trace but the information ++ is organized more to display possible latencies ++ in the system (described below). ++ ++ trace_pipe: ++ ++ The output is the same as the "trace" file but this ++ file is meant to be streamed with live tracing. ++ Reads from this file will block until new data ++ is retrieved. Unlike the "trace" and "latency_trace" ++ files, this file is a consumer. This means reading ++ from this file causes sequential reads to display ++ more current data. Once data is read from this ++ file, it is consumed, and will not be read ++ again with a sequential read. The "trace" and ++ "latency_trace" files are static, and if the ++ tracer is not adding more data, they will display ++ the same information every time they are read. ++ ++ trace_options: ++ ++ This file lets the user control the amount of data ++ that is displayed in one of the above output ++ files. ++ ++ tracing_max_latency: ++ ++ Some of the tracers record the max latency. ++ For example, the time interrupts are disabled. ++ This time is saved in this file. The max trace ++ will also be stored, and displayed by either ++ "trace" or "latency_trace". A new max trace will ++ only be recorded if the latency is greater than ++ the value in this file. (in microseconds) ++ ++ buffer_size_kb: ++ ++ This sets or displays the number of kilobytes each CPU ++ buffer can hold. The tracer buffers are the same size ++ for each CPU. The displayed number is the size of the ++ CPU buffer and not total size of all buffers. The ++ trace buffers are allocated in pages (blocks of memory ++ that the kernel uses for allocation, usually 4 KB in size). ++ If the last page allocated has room for more bytes ++ than requested, the rest of the page will be used, ++ making the actual allocation bigger than requested. ++ ( Note, the size may not be a multiple of the page size ++ due to buffer managment overhead. ) ++ ++ This can only be updated when the current_tracer ++ is set to "nop". ++ ++ tracing_cpumask: ++ ++ This is a mask that lets the user only trace ++ on specified CPUS. The format is a hex string ++ representing the CPUS. ++ ++ set_ftrace_filter: ++ ++ When dynamic ftrace is configured in (see the ++ section below "dynamic ftrace"), the code is dynamically ++ modified (code text rewrite) to disable calling of the ++ function profiler (mcount). This lets tracing be configured ++ in with practically no overhead in performance. This also ++ has a side effect of enabling or disabling specific functions ++ to be traced. Echoing names of functions into this file ++ will limit the trace to only those functions. ++ ++ set_ftrace_notrace: ++ ++ This has an effect opposite to that of ++ set_ftrace_filter. Any function that is added here will not ++ be traced. 
If a function exists in both set_ftrace_filter ++ and set_ftrace_notrace, the function will _not_ be traced. ++ ++ set_ftrace_pid: ++ ++ Have the function tracer only trace a single thread. ++ ++ set_graph_function: ++ ++ Set a "trigger" function where tracing should start ++ with the function graph tracer (See the section ++ "dynamic ftrace" for more details). ++ ++ available_filter_functions: ++ ++ This lists the functions that ftrace ++ has processed and can trace. These are the function ++ names that you can pass to "set_ftrace_filter" or ++ "set_ftrace_notrace". (See the section "dynamic ftrace" ++ below for more details.) + + + The Tracers +@@ -141,36 +175,66 @@ The Tracers + + Here is the list of current tracers that may be configured. + +- function - function tracer that uses mcount to trace all functions. ++ "function" ++ ++ Function call tracer to trace all kernel functions. ++ ++ "function_graph_tracer" ++ ++ Similar to the function tracer except that the ++ function tracer probes the functions on their entry ++ whereas the function graph tracer traces on both entry ++ and exit of the functions. It then provides the ability ++ to draw a graph of function calls similar to C code ++ source. ++ ++ "sched_switch" ++ ++ Traces the context switches and wakeups between tasks. ++ ++ "irqsoff" ++ ++ Traces the areas that disable interrupts and saves ++ the trace with the longest max latency. ++ See tracing_max_latency. When a new max is recorded, ++ it replaces the old trace. It is best to view this ++ trace via the latency_trace file. ++ ++ "preemptoff" ++ ++ Similar to irqsoff but traces and records the amount of ++ time for which preemption is disabled. + +- sched_switch - traces the context switches between tasks. ++ "preemptirqsoff" + +- irqsoff - traces the areas that disable interrupts and saves +- the trace with the longest max latency. +- See tracing_max_latency. When a new max is recorded, +- it replaces the old trace. It is best to view this +- trace via the latency_trace file. ++ Similar to irqsoff and preemptoff, but traces and ++ records the largest time for which irqs and/or preemption ++ is disabled. + +- preemptoff - Similar to irqsoff but traces and records the amount of +- time for which preemption is disabled. ++ "wakeup" + +- preemptirqsoff - Similar to irqsoff and preemptoff, but traces and +- records the largest time for which irqs and/or preemption +- is disabled. ++ Traces and records the max latency that it takes for ++ the highest priority task to get scheduled after ++ it has been woken up. + +- wakeup - Traces and records the max latency that it takes for +- the highest priority task to get scheduled after +- it has been woken up. ++ "hw-branch-tracer" + +- nop - This is not a tracer. To remove all tracers from tracing +- simply echo "nop" into current_tracer. ++ Uses the BTS CPU feature on x86 CPUs to traces all ++ branches executed. ++ ++ "nop" ++ ++ This is the "trace nothing" tracer. To remove all ++ tracers from tracing simply echo "nop" into ++ current_tracer. + + + Examples of using the tracer + ---------------------------- + +-Here are typical examples of using the tracers when controlling them only +-with the debugfs interface (without using any user-land utilities). ++Here are typical examples of using the tracers when controlling ++them only with the debugfs interface (without using any ++user-land utilities). 
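++
++The same control is available from a small program: every step shown
++below with echo is just a write to the corresponding debugfs file. A
++minimal sketch (the helper name is made up, and the /debug mount point
++is assumed as elsewhere in this document):
++
++  #include <fcntl.h>
++  #include <string.h>
++  #include <unistd.h>
++
++  /* e.g. select_tracer("function") or select_tracer("nop") */
++  static int select_tracer(const char *name)
++  {
++          int fd = open("/debug/tracing/current_tracer", O_WRONLY);
++
++          if (fd < 0)
++                  return -1;
++          if (write(fd, name, strlen(name)) < 0) {
++                  close(fd);
++                  return -1;
++          }
++          return close(fd);
++  }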
+ + Output format: + -------------- +@@ -187,16 +251,16 @@ Here is an example of the output format + bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput + -------- + +-A header is printed with the tracer name that is represented by the trace. +-In this case the tracer is "function". Then a header showing the format. Task +-name "bash", the task PID "4251", the CPU that it was running on +-"01", the timestamp in . format, the function name that was +-traced "path_put" and the parent function that called this function +-"path_walk". The timestamp is the time at which the function was +-entered. ++A header is printed with the tracer name that is represented by ++the trace. In this case the tracer is "function". Then a header ++showing the format. Task name "bash", the task PID "4251", the ++CPU that it was running on "01", the timestamp in . ++format, the function name that was traced "path_put" and the ++parent function that called this function "path_walk". The ++timestamp is the time at which the function was entered. + +-The sched_switch tracer also includes tracing of task wakeups and +-context switches. ++The sched_switch tracer also includes tracing of task wakeups ++and context switches. + + ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S + ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S +@@ -205,8 +269,8 @@ context switches. + kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R + ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R + +-Wake ups are represented by a "+" and the context switches are shown as +-"==>". The format is: ++Wake ups are represented by a "+" and the context switches are ++shown as "==>". The format is: + + Context switches: + +@@ -220,19 +284,20 @@ Wake ups are represented by a "+" and th + + :: + :: + +-The prio is the internal kernel priority, which is the inverse of the +-priority that is usually displayed by user-space tools. Zero represents +-the highest priority (99). Prio 100 starts the "nice" priorities with +-100 being equal to nice -20 and 139 being nice 19. The prio "140" is +-reserved for the idle task which is the lowest priority thread (pid 0). ++The prio is the internal kernel priority, which is the inverse ++of the priority that is usually displayed by user-space tools. ++Zero represents the highest priority (99). Prio 100 starts the ++"nice" priorities with 100 being equal to nice -20 and 139 being ++nice 19. The prio "140" is reserved for the idle task which is ++the lowest priority thread (pid 0). + + + Latency trace format + -------------------- + +-For traces that display latency times, the latency_trace file gives +-somewhat more information to see why a latency happened. Here is a typical +-trace. ++For traces that display latency times, the latency_trace file ++gives somewhat more information to see why a latency happened. ++Here is a typical trace. + + # tracer: irqsoff + # +@@ -259,20 +324,20 @@ irqsoff latency trace v1.1.5 on 2.6.26-r + -0 0d.s1 98us : trace_hardirqs_on (do_softirq) + + ++This shows that the current tracer is "irqsoff" tracing the time ++for which interrupts were disabled. It gives the trace version ++and the version of the kernel upon which this was executed on ++(2.6.26-rc8). Then it displays the max latency in microsecs (97 ++us). The number of trace entries displayed and the total number ++recorded (both are three: #3/3). The type of preemption that was ++used (PREEMPT). VP, KP, SP, and HP are always zero and are ++reserved for later use. #P is the number of online CPUS (#P:2). 
+ +-This shows that the current tracer is "irqsoff" tracing the time for which +-interrupts were disabled. It gives the trace version and the version +-of the kernel upon which this was executed on (2.6.26-rc8). Then it displays +-the max latency in microsecs (97 us). The number of trace entries displayed +-and the total number recorded (both are three: #3/3). The type of +-preemption that was used (PREEMPT). VP, KP, SP, and HP are always zero +-and are reserved for later use. #P is the number of online CPUS (#P:2). +- +-The task is the process that was running when the latency occurred. +-(swapper pid: 0). ++The task is the process that was running when the latency ++occurred. (swapper pid: 0). + +-The start and stop (the functions in which the interrupts were disabled and +-enabled respectively) that caused the latencies: ++The start and stop (the functions in which the interrupts were ++disabled and enabled respectively) that caused the latencies: + + apic_timer_interrupt is where the interrupts were disabled. + do_softirq is where they were enabled again. +@@ -308,12 +373,12 @@ The above is mostly meaningful for kerne + latency_trace file is relative to the start of the trace. + + delay: This is just to help catch your eye a bit better. And +- needs to be fixed to be only relative to the same CPU. +- The marks are determined by the difference between this +- current trace and the next trace. +- '!' - greater than preempt_mark_thresh (default 100) +- '+' - greater than 1 microsecond +- ' ' - less than or equal to 1 microsecond. ++ needs to be fixed to be only relative to the same CPU. ++ The marks are determined by the difference between this ++ current trace and the next trace. ++ '!' - greater than preempt_mark_thresh (default 100) ++ '+' - greater than 1 microsecond ++ ' ' - less than or equal to 1 microsecond. + + The rest is the same as the 'trace' file. + +@@ -321,14 +386,15 @@ The above is mostly meaningful for kerne + trace_options + ------------- + +-The trace_options file is used to control what gets printed in the trace +-output. To see what is available, simply cat the file: ++The trace_options file is used to control what gets printed in ++the trace output. To see what is available, simply cat the file: + + cat /debug/tracing/trace_options + print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ +- noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj ++ noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj + +-To disable one of the options, echo in the option prepended with "no". ++To disable one of the options, echo in the option prepended with ++"no". + + echo noprint-parent > /debug/tracing/trace_options + +@@ -338,8 +404,8 @@ To enable an option, leave off the "no". + + Here are the available options: + +- print-parent - On function traces, display the calling function +- as well as the function being traced. ++ print-parent - On function traces, display the calling (parent) ++ function as well as the function being traced. + + print-parent: + bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul +@@ -348,15 +414,16 @@ Here are the available options: + bash-4000 [01] 1477.606694: simple_strtoul + + +- sym-offset - Display not only the function name, but also the offset +- in the function. For example, instead of seeing just +- "ktime_get", you will see "ktime_get+0xb/0x20". ++ sym-offset - Display not only the function name, but also the ++ offset in the function. 
For example, instead of ++ seeing just "ktime_get", you will see ++ "ktime_get+0xb/0x20". + + sym-offset: + bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0 + +- sym-addr - this will also display the function address as well as +- the function name. ++ sym-addr - this will also display the function address as well ++ as the function name. + + sym-addr: + bash-4000 [01] 1477.606694: simple_strtoul +@@ -366,35 +433,41 @@ Here are the available options: + bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ + (+0.000ms): simple_strtoul (strict_strtoul) + +- raw - This will display raw numbers. This option is best for use with +- user applications that can translate the raw numbers better than +- having it done in the kernel. ++ raw - This will display raw numbers. This option is best for ++ use with user applications that can translate the raw ++ numbers better than having it done in the kernel. + +- hex - Similar to raw, but the numbers will be in a hexadecimal format. ++ hex - Similar to raw, but the numbers will be in a hexadecimal ++ format. + + bin - This will print out the formats in raw binary. + + block - TBD (needs update) + +- stacktrace - This is one of the options that changes the trace itself. +- When a trace is recorded, so is the stack of functions. +- This allows for back traces of trace sites. +- +- userstacktrace - This option changes the trace. +- It records a stacktrace of the current userspace thread. +- +- sym-userobj - when user stacktrace are enabled, look up which object the +- address belongs to, and print a relative address +- This is especially useful when ASLR is on, otherwise you don't +- get a chance to resolve the address to object/file/line after the app is no +- longer running ++ stacktrace - This is one of the options that changes the trace ++ itself. When a trace is recorded, so is the stack ++ of functions. This allows for back traces of ++ trace sites. ++ ++ userstacktrace - This option changes the trace. It records a ++ stacktrace of the current userspace thread. ++ ++ sym-userobj - when user stacktrace are enabled, look up which ++ object the address belongs to, and print a ++ relative address. This is especially useful when ++ ASLR is on, otherwise you don't get a chance to ++ resolve the address to object/file/line after ++ the app is no longer running + +- The lookup is performed when you read trace,trace_pipe,latency_trace. Example: ++ The lookup is performed when you read ++ trace,trace_pipe,latency_trace. Example: + + a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 + x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] + +- sched-tree - TBD (any users??) ++ sched-tree - trace all tasks that are on the runqueue, at ++ every scheduling event. Will add overhead if ++ there's a lot of tasks running at once. + + + sched_switch +@@ -431,18 +504,19 @@ of how to use it. + [...] + + +-As we have discussed previously about this format, the header shows +-the name of the trace and points to the options. The "FUNCTION" +-is a misnomer since here it represents the wake ups and context +-switches. +- +-The sched_switch file only lists the wake ups (represented with '+') +-and context switches ('==>') with the previous task or current task +-first followed by the next task or task waking up. The format for both +-of these is PID:KERNEL-PRIO:TASK-STATE. Remember that the KERNEL-PRIO +-is the inverse of the actual priority with zero (0) being the highest +-priority and the nice values starting at 100 (nice -20). 
Below is +-a quick chart to map the kernel priority to user land priorities. ++As we have discussed previously about this format, the header ++shows the name of the trace and points to the options. The ++"FUNCTION" is a misnomer since here it represents the wake ups ++and context switches. ++ ++The sched_switch file only lists the wake ups (represented with ++'+') and context switches ('==>') with the previous task or ++current task first followed by the next task or task waking up. ++The format for both of these is PID:KERNEL-PRIO:TASK-STATE. ++Remember that the KERNEL-PRIO is the inverse of the actual ++priority with zero (0) being the highest priority and the nice ++values starting at 100 (nice -20). Below is a quick chart to map ++the kernel priority to user land priorities. + + Kernel priority: 0 to 99 ==> user RT priority 99 to 0 + Kernel priority: 100 to 139 ==> user nice -20 to 19 +@@ -463,10 +537,10 @@ The task states are: + ftrace_enabled + -------------- + +-The following tracers (listed below) give different output depending +-on whether or not the sysctl ftrace_enabled is set. To set ftrace_enabled, +-one can either use the sysctl function or set it via the proc +-file system interface. ++The following tracers (listed below) give different output ++depending on whether or not the sysctl ftrace_enabled is set. To ++set ftrace_enabled, one can either use the sysctl function or ++set it via the proc file system interface. + + sysctl kernel.ftrace_enabled=1 + +@@ -474,12 +548,12 @@ file system interface. + + echo 1 > /proc/sys/kernel/ftrace_enabled + +-To disable ftrace_enabled simply replace the '1' with '0' in +-the above commands. ++To disable ftrace_enabled simply replace the '1' with '0' in the ++above commands. + +-When ftrace_enabled is set the tracers will also record the functions +-that are within the trace. The descriptions of the tracers +-will also show an example with ftrace enabled. ++When ftrace_enabled is set the tracers will also record the ++functions that are within the trace. The descriptions of the ++tracers will also show an example with ftrace enabled. + + + irqsoff +@@ -487,17 +561,18 @@ irqsoff + + When interrupts are disabled, the CPU can not react to any other + external event (besides NMIs and SMIs). This prevents the timer +-interrupt from triggering or the mouse interrupt from letting the +-kernel know of a new mouse event. The result is a latency with the +-reaction time. +- +-The irqsoff tracer tracks the time for which interrupts are disabled. +-When a new maximum latency is hit, the tracer saves the trace leading up +-to that latency point so that every time a new maximum is reached, the old +-saved trace is discarded and the new trace is saved. ++interrupt from triggering or the mouse interrupt from letting ++the kernel know of a new mouse event. The result is a latency ++with the reaction time. ++ ++The irqsoff tracer tracks the time for which interrupts are ++disabled. When a new maximum latency is hit, the tracer saves ++the trace leading up to that latency point so that every time a ++new maximum is reached, the old saved trace is discarded and the ++new trace is saved. + +-To reset the maximum, echo 0 into tracing_max_latency. Here is an +-example: ++To reset the maximum, echo 0 into tracing_max_latency. 
Here is ++an example: + + # echo irqsoff > /debug/tracing/current_tracer + # echo 0 > /debug/tracing/tracing_max_latency +@@ -532,10 +607,11 @@ irqsoff latency trace v1.1.5 on 2.6.26 + + + Here we see that that we had a latency of 12 microsecs (which is +-very good). The _write_lock_irq in sys_setpgid disabled interrupts. +-The difference between the 12 and the displayed timestamp 14us occurred +-because the clock was incremented between the time of recording the max +-latency and the time of recording the function that had that latency. ++very good). The _write_lock_irq in sys_setpgid disabled ++interrupts. The difference between the 12 and the displayed ++timestamp 14us occurred because the clock was incremented ++between the time of recording the max latency and the time of ++recording the function that had that latency. + + Note the above example had ftrace_enabled not set. If we set the + ftrace_enabled, we get a much larger output: +@@ -586,24 +662,24 @@ irqsoff latency trace v1.1.5 on 2.6.26-r + + + Here we traced a 50 microsecond latency. But we also see all the +-functions that were called during that time. Note that by enabling +-function tracing, we incur an added overhead. This overhead may +-extend the latency times. But nevertheless, this trace has provided +-some very helpful debugging information. ++functions that were called during that time. Note that by ++enabling function tracing, we incur an added overhead. This ++overhead may extend the latency times. But nevertheless, this ++trace has provided some very helpful debugging information. + + + preemptoff + ---------- + +-When preemption is disabled, we may be able to receive interrupts but +-the task cannot be preempted and a higher priority task must wait +-for preemption to be enabled again before it can preempt a lower +-priority task. ++When preemption is disabled, we may be able to receive ++interrupts but the task cannot be preempted and a higher ++priority task must wait for preemption to be enabled again ++before it can preempt a lower priority task. + + The preemptoff tracer traces the places that disable preemption. +-Like the irqsoff tracer, it records the maximum latency for which preemption +-was disabled. The control of preemptoff tracer is much like the irqsoff +-tracer. ++Like the irqsoff tracer, it records the maximum latency for ++which preemption was disabled. The control of preemptoff tracer ++is much like the irqsoff tracer. + + # echo preemptoff > /debug/tracing/current_tracer + # echo 0 > /debug/tracing/tracing_max_latency +@@ -637,11 +713,12 @@ preemptoff latency trace v1.1.5 on 2.6.2 + sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq) + + +-This has some more changes. Preemption was disabled when an interrupt +-came in (notice the 'h'), and was enabled while doing a softirq. +-(notice the 's'). But we also see that interrupts have been disabled +-when entering the preempt off section and leaving it (the 'd'). +-We do not know if interrupts were enabled in the mean time. ++This has some more changes. Preemption was disabled when an ++interrupt came in (notice the 'h'), and was enabled while doing ++a softirq. (notice the 's'). But we also see that interrupts ++have been disabled when entering the preempt off section and ++leaving it (the 'd'). We do not know if interrupts were enabled ++in the mean time. 
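++
++For reference, the kind of section this tracer times is an ordinary
++preempt-disabled region in kernel code, roughly as follows (a
++schematic example, the function is made up):
++
++  #include <linux/preempt.h>
++
++  static void some_code_path(void)
++  {
++          preempt_disable();  /* outermost disable: timing starts    */
++          /* ... work that must not be preempted ... */
++          preempt_enable();   /* count returns to zero: timing stops
++                                 and is compared against
++                                 tracing_max_latency */
++  }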
+ + # tracer: preemptoff + # +@@ -700,28 +777,30 @@ preemptoff latency trace v1.1.5 on 2.6.2 + sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq) + + +-The above is an example of the preemptoff trace with ftrace_enabled +-set. Here we see that interrupts were disabled the entire time. +-The irq_enter code lets us know that we entered an interrupt 'h'. +-Before that, the functions being traced still show that it is not +-in an interrupt, but we can see from the functions themselves that +-this is not the case. +- +-Notice that __do_softirq when called does not have a preempt_count. +-It may seem that we missed a preempt enabling. What really happened +-is that the preempt count is held on the thread's stack and we +-switched to the softirq stack (4K stacks in effect). The code +-does not copy the preempt count, but because interrupts are disabled, +-we do not need to worry about it. Having a tracer like this is good +-for letting people know what really happens inside the kernel. ++The above is an example of the preemptoff trace with ++ftrace_enabled set. Here we see that interrupts were disabled ++the entire time. The irq_enter code lets us know that we entered ++an interrupt 'h'. Before that, the functions being traced still ++show that it is not in an interrupt, but we can see from the ++functions themselves that this is not the case. ++ ++Notice that __do_softirq when called does not have a ++preempt_count. It may seem that we missed a preempt enabling. ++What really happened is that the preempt count is held on the ++thread's stack and we switched to the softirq stack (4K stacks ++in effect). The code does not copy the preempt count, but ++because interrupts are disabled, we do not need to worry about ++it. Having a tracer like this is good for letting people know ++what really happens inside the kernel. + + + preemptirqsoff + -------------- + +-Knowing the locations that have interrupts disabled or preemption +-disabled for the longest times is helpful. But sometimes we would +-like to know when either preemption and/or interrupts are disabled. ++Knowing the locations that have interrupts disabled or ++preemption disabled for the longest times is helpful. But ++sometimes we would like to know when either preemption and/or ++interrupts are disabled. + + Consider the following code: + +@@ -741,11 +820,13 @@ The preemptoff tracer will record the to + call_function_with_irqs_and_preemption_off() and + call_function_with_preemption_off(). + +-But neither will trace the time that interrupts and/or preemption +-is disabled. This total time is the time that we can not schedule. +-To record this time, use the preemptirqsoff tracer. ++But neither will trace the time that interrupts and/or ++preemption is disabled. This total time is the time that we can ++not schedule. To record this time, use the preemptirqsoff ++tracer. + +-Again, using this trace is much like the irqsoff and preemptoff tracers. ++Again, using this trace is much like the irqsoff and preemptoff ++tracers. + + # echo preemptirqsoff > /debug/tracing/current_tracer + # echo 0 > /debug/tracing/tracing_max_latency +@@ -781,9 +862,10 @@ preemptirqsoff latency trace v1.1.5 on 2 + + + The trace_hardirqs_off_thunk is called from assembly on x86 when +-interrupts are disabled in the assembly code. Without the function +-tracing, we do not know if interrupts were enabled within the preemption +-points. We do see that it started with preemption enabled. ++interrupts are disabled in the assembly code. 
Without the ++function tracing, we do not know if interrupts were enabled ++within the preemption points. We do see that it started with ++preemption enabled. + + Here is a trace with ftrace_enabled set: + +@@ -871,40 +953,42 @@ preemptirqsoff latency trace v1.1.5 on 2 + sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq) + + +-This is a very interesting trace. It started with the preemption of +-the ls task. We see that the task had the "need_resched" bit set +-via the 'N' in the trace. Interrupts were disabled before the spin_lock +-at the beginning of the trace. We see that a schedule took place to run +-sshd. When the interrupts were enabled, we took an interrupt. +-On return from the interrupt handler, the softirq ran. We took another +-interrupt while running the softirq as we see from the capital 'H'. ++This is a very interesting trace. It started with the preemption ++of the ls task. We see that the task had the "need_resched" bit ++set via the 'N' in the trace. Interrupts were disabled before ++the spin_lock at the beginning of the trace. We see that a ++schedule took place to run sshd. When the interrupts were ++enabled, we took an interrupt. On return from the interrupt ++handler, the softirq ran. We took another interrupt while ++running the softirq as we see from the capital 'H'. + + + wakeup + ------ + +-In a Real-Time environment it is very important to know the wakeup +-time it takes for the highest priority task that is woken up to the +-time that it executes. This is also known as "schedule latency". +-I stress the point that this is about RT tasks. It is also important +-to know the scheduling latency of non-RT tasks, but the average +-schedule latency is better for non-RT tasks. Tools like +-LatencyTop are more appropriate for such measurements. ++In a Real-Time environment it is very important to know the ++wakeup time it takes for the highest priority task that is woken ++up to the time that it executes. This is also known as "schedule ++latency". I stress the point that this is about RT tasks. It is ++also important to know the scheduling latency of non-RT tasks, ++but the average schedule latency is better for non-RT tasks. ++Tools like LatencyTop are more appropriate for such ++measurements. + + Real-Time environments are interested in the worst case latency. +-That is the longest latency it takes for something to happen, and +-not the average. We can have a very fast scheduler that may only +-have a large latency once in a while, but that would not work well +-with Real-Time tasks. The wakeup tracer was designed to record +-the worst case wakeups of RT tasks. Non-RT tasks are not recorded +-because the tracer only records one worst case and tracing non-RT +-tasks that are unpredictable will overwrite the worst case latency +-of RT tasks. +- +-Since this tracer only deals with RT tasks, we will run this slightly +-differently than we did with the previous tracers. Instead of performing +-an 'ls', we will run 'sleep 1' under 'chrt' which changes the +-priority of the task. ++That is the longest latency it takes for something to happen, ++and not the average. We can have a very fast scheduler that may ++only have a large latency once in a while, but that would not ++work well with Real-Time tasks. The wakeup tracer was designed ++to record the worst case wakeups of RT tasks. Non-RT tasks are ++not recorded because the tracer only records one worst case and ++tracing non-RT tasks that are unpredictable will overwrite the ++worst case latency of RT tasks. 
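++
++Here an RT task simply means one running under a real-time scheduling
++policy (SCHED_FIFO or SCHED_RR). Roughly, this is what chrt, used
++below, arranges through sched_setscheduler(); a minimal user-space
++sketch:
++
++  #include <sched.h>
++  #include <stdio.h>
++
++  int main(void)
++  {
++          struct sched_param sp = { .sched_priority = 5 };
++
++          /* only SCHED_FIFO/SCHED_RR tasks are recorded by this tracer */
++          if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0) {
++                  perror("sched_setscheduler");
++                  return 1;
++          }
++          /* ... wakeups of this task are now candidates for the trace ... */
++          return 0;
++  }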
++ ++Since this tracer only deals with RT tasks, we will run this ++slightly differently than we did with the previous tracers. ++Instead of performing an 'ls', we will run 'sleep 1' under ++'chrt' which changes the priority of the task. + + # echo wakeup > /debug/tracing/current_tracer + # echo 0 > /debug/tracing/tracing_max_latency +@@ -934,17 +1018,16 @@ wakeup latency trace v1.1.5 on 2.6.26-rc + -0 1d..4 4us : schedule (cpu_idle) + + +- +-Running this on an idle system, we see that it only took 4 microseconds +-to perform the task switch. Note, since the trace marker in the +-schedule is before the actual "switch", we stop the tracing when +-the recorded task is about to schedule in. This may change if +-we add a new marker at the end of the scheduler. +- +-Notice that the recorded task is 'sleep' with the PID of 4901 and it +-has an rt_prio of 5. This priority is user-space priority and not +-the internal kernel priority. The policy is 1 for SCHED_FIFO and 2 +-for SCHED_RR. ++Running this on an idle system, we see that it only took 4 ++microseconds to perform the task switch. Note, since the trace ++marker in the schedule is before the actual "switch", we stop ++the tracing when the recorded task is about to schedule in. This ++may change if we add a new marker at the end of the scheduler. ++ ++Notice that the recorded task is 'sleep' with the PID of 4901 ++and it has an rt_prio of 5. This priority is user-space priority ++and not the internal kernel priority. The policy is 1 for ++SCHED_FIFO and 2 for SCHED_RR. + + Doing the same with chrt -r 5 and ftrace_enabled set. + +@@ -1001,24 +1084,25 @@ ksoftirq-7 1d..6 49us : _spin_unlo + ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock) + ksoftirq-7 1d..4 50us : schedule (__cond_resched) + +-The interrupt went off while running ksoftirqd. This task runs at +-SCHED_OTHER. Why did not we see the 'N' set early? This may be +-a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K stacks +-configured, the interrupt and softirq run with their own stack. +-Some information is held on the top of the task's stack (need_resched +-and preempt_count are both stored there). The setting of the NEED_RESCHED +-bit is done directly to the task's stack, but the reading of the +-NEED_RESCHED is done by looking at the current stack, which in this case +-is the stack for the hard interrupt. This hides the fact that NEED_RESCHED +-has been set. We do not see the 'N' until we switch back to the task's ++The interrupt went off while running ksoftirqd. This task runs ++at SCHED_OTHER. Why did not we see the 'N' set early? This may ++be a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K ++stacks configured, the interrupt and softirq run with their own ++stack. Some information is held on the top of the task's stack ++(need_resched and preempt_count are both stored there). The ++setting of the NEED_RESCHED bit is done directly to the task's ++stack, but the reading of the NEED_RESCHED is done by looking at ++the current stack, which in this case is the stack for the hard ++interrupt. This hides the fact that NEED_RESCHED has been set. ++We do not see the 'N' until we switch back to the task's + assigned stack. + + function + -------- + + This tracer is the function tracer. Enabling the function tracer +-can be done from the debug file system. Make sure the ftrace_enabled is +-set; otherwise this tracer is a nop. ++can be done from the debug file system. Make sure the ++ftrace_enabled is set; otherwise this tracer is a nop. 
+ + # sysctl kernel.ftrace_enabled=1 + # echo function > /debug/tracing/current_tracer +@@ -1048,14 +1132,15 @@ set; otherwise this tracer is a nop. + [...] + + +-Note: function tracer uses ring buffers to store the above entries. +-The newest data may overwrite the oldest data. Sometimes using echo to +-stop the trace is not sufficient because the tracing could have overwritten +-the data that you wanted to record. For this reason, it is sometimes better to +-disable tracing directly from a program. This allows you to stop the +-tracing at the point that you hit the part that you are interested in. +-To disable the tracing directly from a C program, something like following +-code snippet can be used: ++Note: function tracer uses ring buffers to store the above ++entries. The newest data may overwrite the oldest data. ++Sometimes using echo to stop the trace is not sufficient because ++the tracing could have overwritten the data that you wanted to ++record. For this reason, it is sometimes better to disable ++tracing directly from a program. This allows you to stop the ++tracing at the point that you hit the part that you are ++interested in. To disable the tracing directly from a C program, ++something like following code snippet can be used: + + int trace_fd; + [...] +@@ -1070,10 +1155,10 @@ int main(int argc, char *argv[]) { + } + + Note: Here we hard coded the path name. The debugfs mount is not +-guaranteed to be at /debug (and is more commonly at /sys/kernel/debug). +-For simple one time traces, the above is sufficent. For anything else, +-a search through /proc/mounts may be needed to find where the debugfs +-file-system is mounted. ++guaranteed to be at /debug (and is more commonly at ++/sys/kernel/debug). For simple one time traces, the above is ++sufficent. For anything else, a search through /proc/mounts may ++be needed to find where the debugfs file-system is mounted. + + + Single thread tracing +@@ -1152,49 +1237,297 @@ int main (int argc, char **argv) + return 0; + } + ++ ++hw-branch-tracer (x86 only) ++--------------------------- ++ ++This tracer uses the x86 last branch tracing hardware feature to ++collect a branch trace on all cpus with relatively low overhead. ++ ++The tracer uses a fixed-size circular buffer per cpu and only ++traces ring 0 branches. The trace file dumps that buffer in the ++following format: ++ ++# tracer: hw-branch-tracer ++# ++# CPU# TO <- FROM ++ 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6 ++ 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a ++ 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf ++ 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf ++ 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a ++ 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf ++ ++ ++The tracer may be used to dump the trace for the oops'ing cpu on ++a kernel oops into the system log. To enable this, ++ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one ++can either use the sysctl function or set it via the proc system ++interface. 
++ ++ sysctl kernel.ftrace_dump_on_oops=1 ++ ++or ++ ++ echo 1 > /proc/sys/kernel/ftrace_dump_on_oops ++ ++ ++Here's an example of such a dump after a null pointer ++dereference in a kernel module: ++ ++[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 ++[57848.106019] IP: [] open+0x6/0x14 [oops] ++[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0 ++[57848.106019] Oops: 0002 [#1] SMP ++[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus ++[57848.106019] Dumping ftrace buffer: ++[57848.106019] --------------------------------- ++[...] ++[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24 ++[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165 ++[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165 ++[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165 ++[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165 ++[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops] ++[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30 ++[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b ++[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31 ++[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1 ++[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30 ++[...] ++[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2 ++[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881 ++[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881 ++[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96 ++[...] ++[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3 ++[57848.106019] --------------------------------- ++[57848.106019] CPU 0 ++[57848.106019] Modules linked in: oops ++[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23 ++[57848.106019] RIP: 0010:[] [] open+0x6/0x14 [oops] ++[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246 ++[...] ++ ++ ++function graph tracer ++--------------------------- ++ ++This tracer is similar to the function tracer except that it ++probes a function on its entry and its exit. This is done by ++using a dynamically allocated stack of return addresses in each ++task_struct. On function entry the tracer overwrites the return ++address of each function traced to set a custom probe. Thus the ++original return address is stored on the stack of return address ++in the task_struct. ++ ++Probing on both ends of a function leads to special features ++such as: ++ ++- measure of a function's time execution ++- having a reliable call stack to draw function calls graph ++ ++This tracer is useful in several situations: ++ ++- you want to find the reason of a strange kernel behavior and ++ need to see what happens in detail on any areas (or specific ++ ones). ++ ++- you are experiencing weird latencies but it's difficult to ++ find its origin. ++ ++- you want to find quickly which path is taken by a specific ++ function ++ ++- you just want to peek inside a working kernel and want to see ++ what happens there. 
++ ++# tracer: function_graph ++# ++# CPU DURATION FUNCTION CALLS ++# | | | | | | | ++ ++ 0) | sys_open() { ++ 0) | do_sys_open() { ++ 0) | getname() { ++ 0) | kmem_cache_alloc() { ++ 0) 1.382 us | __might_sleep(); ++ 0) 2.478 us | } ++ 0) | strncpy_from_user() { ++ 0) | might_fault() { ++ 0) 1.389 us | __might_sleep(); ++ 0) 2.553 us | } ++ 0) 3.807 us | } ++ 0) 7.876 us | } ++ 0) | alloc_fd() { ++ 0) 0.668 us | _spin_lock(); ++ 0) 0.570 us | expand_files(); ++ 0) 0.586 us | _spin_unlock(); ++ ++ ++There are several columns that can be dynamically ++enabled/disabled. You can use every combination of options you ++want, depending on your needs. ++ ++- The cpu number on which the function executed is default ++ enabled. It is sometimes better to only trace one cpu (see ++ tracing_cpu_mask file) or you might sometimes see unordered ++ function calls while cpu tracing switch. ++ ++ hide: echo nofuncgraph-cpu > /debug/tracing/trace_options ++ show: echo funcgraph-cpu > /debug/tracing/trace_options ++ ++- The duration (function's time of execution) is displayed on ++ the closing bracket line of a function or on the same line ++ than the current function in case of a leaf one. It is default ++ enabled. ++ ++ hide: echo nofuncgraph-duration > /debug/tracing/trace_options ++ show: echo funcgraph-duration > /debug/tracing/trace_options ++ ++- The overhead field precedes the duration field in case of ++ reached duration thresholds. ++ ++ hide: echo nofuncgraph-overhead > /debug/tracing/trace_options ++ show: echo funcgraph-overhead > /debug/tracing/trace_options ++ depends on: funcgraph-duration ++ ++ ie: ++ ++ 0) | up_write() { ++ 0) 0.646 us | _spin_lock_irqsave(); ++ 0) 0.684 us | _spin_unlock_irqrestore(); ++ 0) 3.123 us | } ++ 0) 0.548 us | fput(); ++ 0) + 58.628 us | } ++ ++ [...] ++ ++ 0) | putname() { ++ 0) | kmem_cache_free() { ++ 0) 0.518 us | __phys_addr(); ++ 0) 1.757 us | } ++ 0) 2.861 us | } ++ 0) ! 115.305 us | } ++ 0) ! 116.402 us | } ++ ++ + means that the function exceeded 10 usecs. ++ ! means that the function exceeded 100 usecs. ++ ++ ++- The task/pid field displays the thread cmdline and pid which ++ executed the function. It is default disabled. ++ ++ hide: echo nofuncgraph-proc > /debug/tracing/trace_options ++ show: echo funcgraph-proc > /debug/tracing/trace_options ++ ++ ie: ++ ++ # tracer: function_graph ++ # ++ # CPU TASK/PID DURATION FUNCTION CALLS ++ # | | | | | | | | | ++ 0) sh-4802 | | d_free() { ++ 0) sh-4802 | | call_rcu() { ++ 0) sh-4802 | | __call_rcu() { ++ 0) sh-4802 | 0.616 us | rcu_process_gp_end(); ++ 0) sh-4802 | 0.586 us | check_for_new_grace_period(); ++ 0) sh-4802 | 2.899 us | } ++ 0) sh-4802 | 4.040 us | } ++ 0) sh-4802 | 5.151 us | } ++ 0) sh-4802 | + 49.370 us | } ++ ++ ++- The absolute time field is an absolute timestamp given by the ++ system clock since it started. 
A snapshot of this time is ++ given on each entry/exit of functions ++ ++ hide: echo nofuncgraph-abstime > /debug/tracing/trace_options ++ show: echo funcgraph-abstime > /debug/tracing/trace_options ++ ++ ie: ++ ++ # ++ # TIME CPU DURATION FUNCTION CALLS ++ # | | | | | | | | ++ 360.774522 | 1) 0.541 us | } ++ 360.774522 | 1) 4.663 us | } ++ 360.774523 | 1) 0.541 us | __wake_up_bit(); ++ 360.774524 | 1) 6.796 us | } ++ 360.774524 | 1) 7.952 us | } ++ 360.774525 | 1) 9.063 us | } ++ 360.774525 | 1) 0.615 us | journal_mark_dirty(); ++ 360.774527 | 1) 0.578 us | __brelse(); ++ 360.774528 | 1) | reiserfs_prepare_for_journal() { ++ 360.774528 | 1) | unlock_buffer() { ++ 360.774529 | 1) | wake_up_bit() { ++ 360.774529 | 1) | bit_waitqueue() { ++ 360.774530 | 1) 0.594 us | __phys_addr(); ++ ++ ++You can put some comments on specific functions by using ++trace_printk() For example, if you want to put a comment inside ++the __might_sleep() function, you just have to include ++ and call trace_printk() inside __might_sleep() ++ ++trace_printk("I'm a comment!\n") ++ ++will produce: ++ ++ 1) | __might_sleep() { ++ 1) | /* I'm a comment! */ ++ 1) 1.449 us | } ++ ++ ++You might find other useful features for this tracer in the ++following "dynamic ftrace" section such as tracing only specific ++functions or tasks. ++ + dynamic ftrace + -------------- + + If CONFIG_DYNAMIC_FTRACE is set, the system will run with + virtually no overhead when function tracing is disabled. The way + this works is the mcount function call (placed at the start of +-every kernel function, produced by the -pg switch in gcc), starts +-of pointing to a simple return. (Enabling FTRACE will include the +--pg switch in the compiling of the kernel.) ++every kernel function, produced by the -pg switch in gcc), ++starts of pointing to a simple return. (Enabling FTRACE will ++include the -pg switch in the compiling of the kernel.) + + At compile time every C file object is run through the + recordmcount.pl script (located in the scripts directory). This + script will process the C object using objdump to find all the +-locations in the .text section that call mcount. (Note, only +-the .text section is processed, since processing other sections +-like .init.text may cause races due to those sections being freed). +- +-A new section called "__mcount_loc" is created that holds references +-to all the mcount call sites in the .text section. This section is +-compiled back into the original object. The final linker will add +-all these references into a single table. ++locations in the .text section that call mcount. (Note, only the ++.text section is processed, since processing other sections like ++.init.text may cause races due to those sections being freed). ++ ++A new section called "__mcount_loc" is created that holds ++references to all the mcount call sites in the .text section. ++This section is compiled back into the original object. The ++final linker will add all these references into a single table. + + On boot up, before SMP is initialized, the dynamic ftrace code +-scans this table and updates all the locations into nops. It also +-records the locations, which are added to the available_filter_functions +-list. Modules are processed as they are loaded and before they are +-executed. When a module is unloaded, it also removes its functions from +-the ftrace function list. This is automatic in the module unload +-code, and the module author does not need to worry about it. 
+- +-When tracing is enabled, kstop_machine is called to prevent races +-with the CPUS executing code being modified (which can cause the +-CPU to do undesireable things), and the nops are patched back +-to calls. But this time, they do not call mcount (which is just +-a function stub). They now call into the ftrace infrastructure. ++scans this table and updates all the locations into nops. It ++also records the locations, which are added to the ++available_filter_functions list. Modules are processed as they ++are loaded and before they are executed. When a module is ++unloaded, it also removes its functions from the ftrace function ++list. This is automatic in the module unload code, and the ++module author does not need to worry about it. ++ ++When tracing is enabled, kstop_machine is called to prevent ++races with the CPUS executing code being modified (which can ++cause the CPU to do undesireable things), and the nops are ++patched back to calls. But this time, they do not call mcount ++(which is just a function stub). They now call into the ftrace ++infrastructure. + + One special side-effect to the recording of the functions being + traced is that we can now selectively choose which functions we +-wish to trace and which ones we want the mcount calls to remain as +-nops. ++wish to trace and which ones we want the mcount calls to remain ++as nops. + +-Two files are used, one for enabling and one for disabling the tracing +-of specified functions. They are: ++Two files are used, one for enabling and one for disabling the ++tracing of specified functions. They are: + + set_ftrace_filter + +@@ -1202,8 +1535,8 @@ and + + set_ftrace_notrace + +-A list of available functions that you can add to these files is listed +-in: ++A list of available functions that you can add to these files is ++listed in: + + available_filter_functions + +@@ -1240,8 +1573,8 @@ hrtimer_interrupt + sys_nanosleep + + +-Perhaps this is not enough. The filters also allow simple wild cards. +-Only the following are currently available ++Perhaps this is not enough. The filters also allow simple wild ++cards. Only the following are currently available + + * - will match functions that begin with + * - will match functions that end with +@@ -1251,9 +1584,9 @@ These are the only wild cards which are + + * will not work. + +-Note: It is better to use quotes to enclose the wild cards, otherwise +- the shell may expand the parameters into names of files in the local +- directory. ++Note: It is better to use quotes to enclose the wild cards, ++ otherwise the shell may expand the parameters into names ++ of files in the local directory. + + # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter + +@@ -1299,7 +1632,8 @@ This is because the '>' and '>>' act jus + To rewrite the filters, use '>' + To append to the filters, use '>>' + +-To clear out a filter so that all functions will be recorded again: ++To clear out a filter so that all functions will be recorded ++again: + + # echo > /debug/tracing/set_ftrace_filter + # cat /debug/tracing/set_ftrace_filter +@@ -1331,7 +1665,8 @@ hrtimer_get_res + hrtimer_init_sleeper + + +-The set_ftrace_notrace prevents those functions from being traced. ++The set_ftrace_notrace prevents those functions from being ++traced. + + # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace + +@@ -1353,13 +1688,75 @@ Produces: + + We can see that there's no more lock or preempt tracing. 
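The set_ftrace_filter and set_ftrace_notrace files driven with echo above are
ordinary files, so the same thing can be done from a program.  A hedged
userspace sketch follows (it assumes debugfs is mounted at /debug as used
throughout this document; write_ftrace_filter() is an illustrative helper,
not an existing API):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Illustrative helper, equivalent to:
	 *   echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
	 * Opening with O_TRUNC mirrors '>'; O_APPEND would mirror '>>'. */
	static int write_ftrace_filter(const char *glob)
	{
		int fd = open("/debug/tracing/set_ftrace_filter",
			      O_WRONLY | O_TRUNC);

		if (fd < 0)
			return -1;

		if (write(fd, glob, strlen(glob)) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		if (write_ftrace_filter("hrtimer_*"))
			perror("write_ftrace_filter");
		return 0;
	}

Note that the quoting caveat above is purely a shell concern; from C the
wild-card string is passed to the kernel as-is.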
+ ++ ++Dynamic ftrace with the function graph tracer ++--------------------------------------------- ++ ++Although what has been explained above concerns both the ++function tracer and the function-graph-tracer, there are some ++special features only available in the function-graph tracer. ++ ++If you want to trace only one function and all of its children, ++you just have to echo its name into set_graph_function: ++ ++ echo __do_fault > set_graph_function ++ ++will produce the following "expanded" trace of the __do_fault() ++function: ++ ++ 0) | __do_fault() { ++ 0) | filemap_fault() { ++ 0) | find_lock_page() { ++ 0) 0.804 us | find_get_page(); ++ 0) | __might_sleep() { ++ 0) 1.329 us | } ++ 0) 3.904 us | } ++ 0) 4.979 us | } ++ 0) 0.653 us | _spin_lock(); ++ 0) 0.578 us | page_add_file_rmap(); ++ 0) 0.525 us | native_set_pte_at(); ++ 0) 0.585 us | _spin_unlock(); ++ 0) | unlock_page() { ++ 0) 0.541 us | page_waitqueue(); ++ 0) 0.639 us | __wake_up_bit(); ++ 0) 2.786 us | } ++ 0) + 14.237 us | } ++ 0) | __do_fault() { ++ 0) | filemap_fault() { ++ 0) | find_lock_page() { ++ 0) 0.698 us | find_get_page(); ++ 0) | __might_sleep() { ++ 0) 1.412 us | } ++ 0) 3.950 us | } ++ 0) 5.098 us | } ++ 0) 0.631 us | _spin_lock(); ++ 0) 0.571 us | page_add_file_rmap(); ++ 0) 0.526 us | native_set_pte_at(); ++ 0) 0.586 us | _spin_unlock(); ++ 0) | unlock_page() { ++ 0) 0.533 us | page_waitqueue(); ++ 0) 0.638 us | __wake_up_bit(); ++ 0) 2.793 us | } ++ 0) + 14.012 us | } ++ ++You can also expand several functions at once: ++ ++ echo sys_open > set_graph_function ++ echo sys_close >> set_graph_function ++ ++Now if you want to go back to trace all functions you can clear ++this special filter via: ++ ++ echo > set_graph_function ++ ++ + trace_pipe + ---------- + +-The trace_pipe outputs the same content as the trace file, but the effect +-on the tracing is different. Every read from trace_pipe is consumed. +-This means that subsequent reads will be different. The trace +-is live. ++The trace_pipe outputs the same content as the trace file, but ++the effect on the tracing is different. Every read from ++trace_pipe is consumed. This means that subsequent reads will be ++different. The trace is live. + + # echo function > /debug/tracing/current_tracer + # cat /debug/tracing/trace_pipe > /tmp/trace.out & +@@ -1387,38 +1784,45 @@ is live. + bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up + + +-Note, reading the trace_pipe file will block until more input is added. +-By changing the tracer, trace_pipe will issue an EOF. We needed +-to set the function tracer _before_ we "cat" the trace_pipe file. ++Note, reading the trace_pipe file will block until more input is ++added. By changing the tracer, trace_pipe will issue an EOF. We ++needed to set the function tracer _before_ we "cat" the ++trace_pipe file. + + + trace entries + ------------- + +-Having too much or not enough data can be troublesome in diagnosing +-an issue in the kernel. The file buffer_size_kb is used to modify +-the size of the internal trace buffers. The number listed +-is the number of entries that can be recorded per CPU. To know +-the full size, multiply the number of possible CPUS with the +-number of entries. ++Having too much or not enough data can be troublesome in ++diagnosing an issue in the kernel. The file buffer_size_kb is ++used to modify the size of the internal trace buffers. The ++number listed is the number of entries that can be recorded per ++CPU. 
To know the full size, multiply the number of possible CPUS ++with the number of entries. + + # cat /debug/tracing/buffer_size_kb + 1408 (units kilobytes) + +-Note, to modify this, you must have tracing completely disabled. To do that, +-echo "nop" into the current_tracer. If the current_tracer is not set +-to "nop", an EINVAL error will be returned. ++Note, to modify this, you must have tracing completely disabled. ++To do that, echo "nop" into the current_tracer. If the ++current_tracer is not set to "nop", an EINVAL error will be ++returned. + + # echo nop > /debug/tracing/current_tracer + # echo 10000 > /debug/tracing/buffer_size_kb + # cat /debug/tracing/buffer_size_kb + 10000 (units kilobytes) + +-The number of pages which will be allocated is limited to a percentage +-of available memory. Allocating too much will produce an error. ++The number of pages which will be allocated is limited to a ++percentage of available memory. Allocating too much will produce ++an error. + + # echo 1000000000000 > /debug/tracing/buffer_size_kb + -bash: echo: write error: Cannot allocate memory + # cat /debug/tracing/buffer_size_kb + 85 + ++----------- ++ ++More details can be found in the source code, in the ++kernel/tracing/*.c files. +Index: linux-2.6-tip/Documentation/kernel-parameters.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/kernel-parameters.txt ++++ linux-2.6-tip/Documentation/kernel-parameters.txt +@@ -49,6 +49,7 @@ parameter is applicable: + ISAPNP ISA PnP code is enabled. + ISDN Appropriate ISDN support is enabled. + JOY Appropriate joystick support is enabled. ++ KMEMTRACE kmemtrace is enabled. + LIBATA Libata driver is enabled + LP Printer support is enabled. + LOOP Loopback device support is enabled. +@@ -491,11 +492,23 @@ and is between 256 and 4096 characters. + Range: 0 - 8192 + Default: 64 + ++ dma_debug=off If the kernel is compiled with DMA_API_DEBUG support ++ this option disables the debugging code at boot. ++ ++ dma_debug_entries= ++ This option allows to tune the number of preallocated ++ entries for DMA-API debugging code. One entry is ++ required per DMA-API allocation. Use this if the ++ DMA-API debugging code disables itself because the ++ architectural default is too low. ++ + hpet= [X86-32,HPET] option to control HPET usage +- Format: { enable (default) | disable | force } ++ Format: { enable (default) | disable | force | ++ verbose } + disable: disable HPET and use PIT instead + force: allow force enabled of undocumented chips (ICH4, + VIA, nVidia) ++ verbose: show contents of HPET registers during setup + + com20020= [HW,NET] ARCnet - COM20020 chipset + Format: +@@ -604,6 +617,9 @@ and is between 256 and 4096 characters. + + debug_objects [KNL] Enable object debugging + ++ no_debug_objects ++ [KNL] Disable object debugging ++ + debugpat [X86] Enable PAT debugging + + decnet.addr= [HW,NET] +@@ -1047,6 +1063,15 @@ and is between 256 and 4096 characters. + use the HighMem zone if it exists, and the Normal + zone if it does not. + ++ kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } ++ Controls whether kmemtrace is enabled ++ at boot-time. ++ ++ kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of ++ subbufs kmemtrace's relay channel has. Set this ++ higher than default (KMEMTRACE_N_SUBBUFS in code) if ++ you experience buffer overruns. 
++ + movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter + is similar to kernelcore except it specifies the + amount of memory used for migratable allocations. +@@ -1310,8 +1335,13 @@ and is between 256 and 4096 characters. + + memtest= [KNL,X86] Enable memtest + Format: +- range: 0,4 : pattern number + default : 0 ++ Specifies the number of memtest passes to be ++ performed. Each pass selects another test ++ pattern from a given set of patterns. Memtest ++ fills the memory with this pattern, validates ++ memory contents and reserves bad memory ++ regions that are detected. + + meye.*= [HW] Set MotionEye Camera parameters + See Documentation/video4linux/meye.txt. +@@ -2329,6 +2359,8 @@ and is between 256 and 4096 characters. + + tp720= [HW,PS2] + ++ trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size. ++ + trix= [HW,OSS] MediaTrix AudioTrix Pro + Format: + ,,,,,,,, +Index: linux-2.6-tip/Documentation/kmemcheck.txt +=================================================================== +--- /dev/null ++++ linux-2.6-tip/Documentation/kmemcheck.txt +@@ -0,0 +1,129 @@ ++Contents ++======== ++ ++ 1. How to use ++ 2. Technical description ++ 3. Changes to the slab allocators ++ 4. Problems ++ 5. Parameters ++ 6. Future enhancements ++ ++ ++How to use (IMPORTANT) ++====================== ++ ++Always remember this: kmemcheck _will_ give false positives. So don't enable ++it and spam the mailing list with its reports; you are not going to be heard, ++and it will make people's skins thicker for when the real errors are found. ++ ++Instead, I encourage maintainers and developers to find errors in _their_ ++_own_ code. And if you find false positives, you can try to work around them, ++try to figure out if it's a real bug or not, or simply ignore them. Most ++developers know their own code and will quickly and efficiently determine the ++root cause of a kmemcheck report. This is therefore also the most efficient ++way to work with kmemcheck. ++ ++If you still want to run kmemcheck to inspect others' code, the rule of thumb ++should be: If it's not obvious (to you), don't tell us about it either. Most ++likely the code is correct and you'll only waste our time. If you can work ++out the error, please do send the maintainer a heads up and/or a patch, but ++don't expect him/her to fix something that wasn't wrong in the first place. ++ ++ ++Technical description ++===================== ++ ++kmemcheck works by marking memory pages non-present. This means that whenever ++somebody attempts to access the page, a page fault is generated. The page ++fault handler notices that the page was in fact only hidden, and so it calls ++on the kmemcheck code to make further investigations. ++ ++When the investigations are completed, kmemcheck "shows" the page by marking ++it present (as it would be under normal circumstances). This way, the ++interrupted code can continue as usual. ++ ++But after the instruction has been executed, we should hide the page again, so ++that we can catch the next access too! Now kmemcheck makes use of a debugging ++feature of the processor, namely single-stepping. When the processor has ++finished the one instruction that generated the memory access, a debug ++exception is raised. From here, we simply hide the page again and continue ++execution, this time with the single-stepping feature turned off. ++ ++ ++Changes to the slab allocators ++============================== ++ ++kmemcheck requires some assistance from the memory allocator in order to work. 
++The memory allocator needs to ++ ++1. Tell kmemcheck about newly allocated pages and pages that are about to ++ be freed. This allows kmemcheck to set up and tear down the shadow memory ++ for the pages in question. The shadow memory stores the status of each byte ++ in the allocation proper, e.g. whether it is initialized or uninitialized. ++2. Tell kmemcheck which parts of memory should be marked uninitialized. There ++ are actually a few more states, such as "not yet allocated" and "recently ++ freed". ++ ++If a slab cache is set up using the SLAB_NOTRACK flag, it will never return ++memory that can take page faults because of kmemcheck. ++ ++If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still ++request memory with the __GFP_NOTRACK flag. This does not prevent the page ++faults from occurring, however, but marks the object in question as being ++initialized so that no warnings will ever be produced for this object. ++ ++Currently, the SLAB and SLUB allocators are supported by kmemcheck. ++ ++ ++Problems ++======== ++ ++The most prominent problem seems to be that of bit-fields. kmemcheck can only ++track memory with byte granularity. Therefore, when gcc generates code to ++access only one bit in a bit-field, there is really no way for kmemcheck to ++know which of the other bits will be used or thrown away. Consequently, there ++may be bogus warnings for bit-field accesses. We have added a "bitfields" API ++to get around this problem. See include/linux/kmemcheck.h for detailed ++instructions! ++ ++ ++Parameters ++========== ++ ++In addition to enabling CONFIG_KMEMCHECK before the kernel is compiled, the ++parameter kmemcheck=1 must be passed to the kernel when it is started in order ++to actually do the tracking. So by default, there is only a very small ++(probably negligible) overhead for enabling the config option. ++ ++Similarly, kmemcheck may be turned on or off at run-time using, respectively: ++ ++echo 1 > /proc/sys/kernel/kmemcheck ++ and ++echo 0 > /proc/sys/kernel/kmemcheck ++ ++Note that this is a lazy setting; once turned off, the old allocations will ++still have to take a single page fault exception before tracking is turned off ++for that particular page. Enabling kmemcheck on will only enable tracking for ++allocations made from that point onwards. ++ ++The default mode is the one-shot mode, where only the first error is reported ++before kmemcheck is disabled. This mode can be enabled by passing kmemcheck=2 ++to the kernel at boot, or running ++ ++echo 2 > /proc/sys/kernel/kmemcheck ++ ++when the kernel is already running. ++ ++ ++Future enhancements ++=================== ++ ++There is already some preliminary support for catching use-after-free errors. ++What still needs to be done is delaying kfree() so that memory is not ++reallocated immediately after freeing it. [Suggested by Pekka Enberg.] ++ ++It should be possible to allow SMP systems by duplicating the page tables for ++each processor in the system. This is probably extremely difficult, however. ++[Suggested by Ingo Molnar.] ++ ++Support for instruction set extensions like XMM, SSE2, etc. +Index: linux-2.6-tip/Documentation/lockdep-design.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/lockdep-design.txt ++++ linux-2.6-tip/Documentation/lockdep-design.txt +@@ -27,33 +27,37 @@ lock-class. 
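(Referring back to the kmemcheck slab-allocator discussion above.)  The
SLAB_NOTRACK and __GFP_NOTRACK opt-outs described there look roughly like
this in use; a kernel-side sketch only, with a made-up cache name, assuming
the flags are as named in that text:

	#include <linux/slab.h>

	/* Hypothetical cache: objects from it never trigger kmemcheck
	 * page faults. */
	static struct kmem_cache *example_cache;

	static int example_alloc(void)
	{
		void *obj;

		example_cache = kmem_cache_create("example_cache", 128, 0,
						  SLAB_NOTRACK, NULL);
		if (!example_cache)
			return -ENOMEM;

		/* Single allocation marked initialized for kmemcheck, so it
		 * never produces warnings even though its cache is tracked. */
		obj = kmalloc(256, GFP_KERNEL | __GFP_NOTRACK);
		kfree(obj);

		return 0;
	}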
+ State + ----- + +-The validator tracks lock-class usage history into 5 separate state bits: ++The validator tracks lock-class usage history into 4n + 1 separate state bits: + +-- 'ever held in hardirq context' [ == hardirq-safe ] +-- 'ever held in softirq context' [ == softirq-safe ] +-- 'ever held with hardirqs enabled' [ == hardirq-unsafe ] +-- 'ever held with softirqs and hardirqs enabled' [ == softirq-unsafe ] ++- 'ever held in STATE context' ++- 'ever head as readlock in STATE context' ++- 'ever head with STATE enabled' ++- 'ever head as readlock with STATE enabled' ++ ++Where STATE can be either one of (kernel/lockdep_states.h) ++ - hardirq ++ - softirq ++ - reclaim_fs + + - 'ever used' [ == !unused ] + +-When locking rules are violated, these 4 state bits are presented in the +-locking error messages, inside curlies. A contrived example: ++When locking rules are violated, these state bits are presented in the ++locking error messages, inside curlies. A contrived example: + + modprobe/2287 is trying to acquire lock: +- (&sio_locks[i].lock){--..}, at: [] mutex_lock+0x21/0x24 ++ (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 + + but task is already holding lock: +- (&sio_locks[i].lock){--..}, at: [] mutex_lock+0x21/0x24 ++ (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 + + +-The bit position indicates hardirq, softirq, hardirq-read, +-softirq-read respectively, and the character displayed in each +-indicates: ++The bit position indicates STATE, STATE-read, for each of the states listed ++above, and the character displayed in each indicates: + + '.' acquired while irqs disabled + '+' acquired in irq context + '-' acquired with irqs enabled +- '?' read acquired in irq context with irqs enabled. ++ '?' acquired in irq context with irqs enabled. + + Unused mutexes cannot be part of the cause of an error. + +Index: linux-2.6-tip/Documentation/perf_counter/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6-tip/Documentation/perf_counter/Makefile +@@ -0,0 +1,12 @@ ++BINS = kerneltop perfstat ++ ++all: $(BINS) ++ ++kerneltop: kerneltop.c ../../include/linux/perf_counter.h ++ cc -O6 -Wall -lrt -o $@ $< ++ ++perfstat: kerneltop ++ ln -sf kerneltop perfstat ++ ++clean: ++ rm $(BINS) +Index: linux-2.6-tip/Documentation/perf_counter/design.txt +=================================================================== +--- /dev/null ++++ linux-2.6-tip/Documentation/perf_counter/design.txt +@@ -0,0 +1,283 @@ ++ ++Performance Counters for Linux ++------------------------------ ++ ++Performance counters are special hardware registers available on most modern ++CPUs. These registers count the number of certain types of hw events: such ++as instructions executed, cachemisses suffered, or branches mis-predicted - ++without slowing down the kernel or applications. These registers can also ++trigger interrupts when a threshold number of events have passed - and can ++thus be used to profile the code that runs on that CPU. ++ ++The Linux Performance Counter subsystem provides an abstraction of these ++hardware capabilities. It provides per task and per CPU counters, counter ++groups, and it provides event capabilities on top of those. It ++provides "virtual" 64-bit counters, regardless of the width of the ++underlying hardware counters. ++ ++Performance counters are accessed via special file descriptors. ++There's one file descriptor per virtual counter used. 
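Before the details below, here is the open/read flow reduced to a rough
userspace sketch.  The syscall number, header path, struct fields and event
ids are taken from this design text and from the kerneltop.c added later in
this patch (the design text calls the event field 'event_config' while
kerneltop.c uses 'config'; the sketch follows kerneltop.c).  This is the
early perf_counter ABI of this tree, shown for illustration only:

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#include "../../include/linux/perf_counter.h" /* path as in kerneltop.c */

	#ifndef __NR_perf_counter_open
	#define __NR_perf_counter_open 298	/* x86_64 value from kerneltop.c */
	#endif

	int main(void)
	{
		struct perf_counter_hw_event hw_event;
		unsigned long long count;
		int fd;

		memset(&hw_event, 0, sizeof(hw_event));
		/* PERF_TYPE_HARDWARE is type 0, so a bare hardware event id
		 * is a valid config value: */
		hw_event.config      = PERF_COUNT_INSTRUCTIONS;
		hw_event.record_type = PERF_RECORD_SIMPLE; /* counting, not sampling */

		/* pid == 0: current task, cpu == -1: any CPU,
		 * group_fd == -1: stand-alone counter, flags must be 0. */
		fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_counter_open");
			return 1;
		}

		/* ... run the code to be measured ... */

		if (read(fd, &count, sizeof(count)) == sizeof(count))
			printf("%llu instructions\n", count);

		close(fd);
		return 0;
	}

The details of the system call arguments, the event encoding and the
counting versus sampling modes used above are what the rest of this
document describes.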
++ ++The special file descriptor is opened via the perf_counter_open() ++system call: ++ ++ int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, ++ pid_t pid, int cpu, int group_fd, ++ unsigned long flags); ++ ++The syscall returns the new fd. The fd can be used via the normal ++VFS system calls: read() can be used to read the counter, fcntl() ++can be used to set the blocking mode, etc. ++ ++Multiple counters can be kept open at a time, and the counters ++can be poll()ed. ++ ++When creating a new counter fd, 'perf_counter_hw_event' is: ++ ++/* ++ * Event to monitor via a performance monitoring counter: ++ */ ++struct perf_counter_hw_event { ++ __u64 event_config; ++ ++ __u64 irq_period; ++ __u64 record_type; ++ __u64 read_format; ++ ++ __u64 disabled : 1, /* off by default */ ++ nmi : 1, /* NMI sampling */ ++ inherit : 1, /* children inherit it */ ++ pinned : 1, /* must always be on PMU */ ++ exclusive : 1, /* only group on PMU */ ++ exclude_user : 1, /* don't count user */ ++ exclude_kernel : 1, /* ditto kernel */ ++ exclude_hv : 1, /* ditto hypervisor */ ++ exclude_idle : 1, /* don't count when idle */ ++ ++ __reserved_1 : 55; ++ ++ __u32 extra_config_len; ++ ++ __u32 __reserved_4; ++ __u64 __reserved_2; ++ __u64 __reserved_3; ++}; ++ ++The 'event_config' field specifies what the counter should count. It ++is divided into 3 bit-fields: ++ ++raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 ++type: 7 bits (next most significant) 0x7f00_0000_0000_0000 ++event_id: 56 bits (least significant) 0x00ff_0000_0000_0000 ++ ++If 'raw_type' is 1, then the counter will count a hardware event ++specified by the remaining 63 bits of event_config. The encoding is ++machine-specific. ++ ++If 'raw_type' is 0, then the 'type' field says what kind of counter ++this is, with the following encoding: ++ ++enum perf_event_types { ++ PERF_TYPE_HARDWARE = 0, ++ PERF_TYPE_SOFTWARE = 1, ++ PERF_TYPE_TRACEPOINT = 2, ++}; ++ ++A counter of PERF_TYPE_HARDWARE will count the hardware event ++specified by 'event_id': ++ ++/* ++ * Generalized performance counter event types, used by the hw_event.event_id ++ * parameter of the sys_perf_counter_open() syscall: ++ */ ++enum hw_event_ids { ++ /* ++ * Common hardware events, generalized by the kernel: ++ */ ++ PERF_COUNT_CPU_CYCLES = 0, ++ PERF_COUNT_INSTRUCTIONS = 1, ++ PERF_COUNT_CACHE_REFERENCES = 2, ++ PERF_COUNT_CACHE_MISSES = 3, ++ PERF_COUNT_BRANCH_INSTRUCTIONS = 4, ++ PERF_COUNT_BRANCH_MISSES = 5, ++ PERF_COUNT_BUS_CYCLES = 6, ++}; ++ ++These are standardized types of events that work relatively uniformly ++on all CPUs that implement Performance Counters support under Linux, ++although there may be variations (e.g., different CPUs might count ++cache references and misses at different levels of the cache hierarchy). ++If a CPU is not able to count the selected event, then the system call ++will return -EINVAL. ++ ++More hw_event_types are supported as well, but they are CPU-specific ++and accessed as raw events. For example, to count "External bus ++cycles while bus lock signal asserted" events on Intel Core CPUs, pass ++in a 0x4064 event_id value and set hw_event.raw_type to 1. ++ ++A counter of type PERF_TYPE_SOFTWARE will count one of the available ++software events, selected by 'event_id': ++ ++/* ++ * Special "software" counters provided by the kernel, even if the hardware ++ * does not support performance counters. 
These counters measure various ++ * physical and sw events of the kernel (and allow the profiling of them as ++ * well): ++ */ ++enum sw_event_ids { ++ PERF_COUNT_CPU_CLOCK = 0, ++ PERF_COUNT_TASK_CLOCK = 1, ++ PERF_COUNT_PAGE_FAULTS = 2, ++ PERF_COUNT_CONTEXT_SWITCHES = 3, ++ PERF_COUNT_CPU_MIGRATIONS = 4, ++ PERF_COUNT_PAGE_FAULTS_MIN = 5, ++ PERF_COUNT_PAGE_FAULTS_MAJ = 6, ++}; ++ ++Counters come in two flavours: counting counters and sampling ++counters. A "counting" counter is one that is used for counting the ++number of events that occur, and is characterised by having ++irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a ++counting counter simply returns the current value of the counter as ++an 8-byte number. ++ ++A "sampling" counter is one that is set up to generate an interrupt ++every N events, where N is given by 'irq_period'. A sampling counter ++has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The ++record_type controls what data is recorded on each interrupt, and the ++available values are currently: ++ ++/* ++ * IRQ-notification data record type: ++ */ ++enum perf_counter_record_type { ++ PERF_RECORD_SIMPLE = 0, ++ PERF_RECORD_IRQ = 1, ++ PERF_RECORD_GROUP = 2, ++}; ++ ++A record_type value of PERF_RECORD_IRQ will record the instruction ++pointer (IP) at which the interrupt occurred. A record_type value of ++PERF_RECORD_GROUP will record the event_config and counter value of ++all of the other counters in the group, and should only be used on a ++group leader (see below). Currently these two values are mutually ++exclusive, but record_type will become a bit-mask in future and ++support other values. ++ ++A sampling counter has an event queue, into which an event is placed ++on each interrupt. A read() on a sampling counter will read the next ++event from the event queue. If the queue is empty, the read() will ++either block or return an EAGAIN error, depending on whether the fd ++has been set to non-blocking mode or not. ++ ++The 'disabled' bit specifies whether the counter starts out disabled ++or enabled. If it is initially disabled, it can be enabled by ioctl ++or prctl (see below). ++ ++The 'nmi' bit specifies, for hardware events, whether the counter ++should be set up to request non-maskable interrupts (NMIs) or normal ++interrupts. This bit is ignored if the user doesn't have ++CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't ++generate NMIs from hardware counters. ++ ++The 'inherit' bit, if set, specifies that this counter should count ++events on descendant tasks as well as the task specified. This only ++applies to new descendents, not to any existing descendents at the ++time the counter is created (nor to any new descendents of existing ++descendents). ++ ++The 'pinned' bit, if set, specifies that the counter should always be ++on the CPU if at all possible. It only applies to hardware counters ++and only to group leaders. If a pinned counter cannot be put onto the ++CPU (e.g. because there are not enough hardware counters or because of ++a conflict with some other event), then the counter goes into an ++'error' state, where reads return end-of-file (i.e. read() returns 0) ++until the counter is subsequently enabled or disabled. ++ ++The 'exclusive' bit, if set, specifies that when this counter's group ++is on the CPU, it should be the only group using the CPU's counters. 
++In future, this will allow sophisticated monitoring programs to supply ++extra configuration information via 'extra_config_len' to exploit ++advanced features of the CPU's Performance Monitor Unit (PMU) that are ++not otherwise accessible and that might disrupt other hardware ++counters. ++ ++The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a ++way to request that counting of events be restricted to times when the ++CPU is in user, kernel and/or hypervisor mode. ++ ++ ++The 'pid' parameter to the perf_counter_open() system call allows the ++counter to be specific to a task: ++ ++ pid == 0: if the pid parameter is zero, the counter is attached to the ++ current task. ++ ++ pid > 0: the counter is attached to a specific task (if the current task ++ has sufficient privilege to do so) ++ ++ pid < 0: all tasks are counted (per cpu counters) ++ ++The 'cpu' parameter allows a counter to be made specific to a CPU: ++ ++ cpu >= 0: the counter is restricted to a specific CPU ++ cpu == -1: the counter counts on all CPUs ++ ++(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.) ++ ++A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts ++events of that task and 'follows' that task to whatever CPU the task ++gets schedule to. Per task counters can be created by any user, for ++their own tasks. ++ ++A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts ++all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. ++ ++The 'flags' parameter is currently unused and must be zero. ++ ++The 'group_fd' parameter allows counter "groups" to be set up. A ++counter group has one counter which is the group "leader". The leader ++is created first, with group_fd = -1 in the perf_counter_open call ++that creates it. The rest of the group members are created ++subsequently, with group_fd giving the fd of the group leader. ++(A single counter on its own is created with group_fd = -1 and is ++considered to be a group with only 1 member.) ++ ++A counter group is scheduled onto the CPU as a unit, that is, it will ++only be put onto the CPU if all of the counters in the group can be ++put onto the CPU. This means that the values of the member counters ++can be meaningfully compared, added, divided (to get ratios), etc., ++with each other, since they have counted events for the same set of ++executed instructions. ++ ++Counters can be enabled and disabled in two ways: via ioctl and via ++prctl. When a counter is disabled, it doesn't count or generate ++events but does continue to exist and maintain its count value. ++ ++An individual counter or counter group can be enabled with ++ ++ ioctl(fd, PERF_COUNTER_IOC_ENABLE); ++ ++or disabled with ++ ++ ioctl(fd, PERF_COUNTER_IOC_DISABLE); ++ ++Enabling or disabling the leader of a group enables or disables the ++whole group; that is, while the group leader is disabled, none of the ++counters in the group will count. Enabling or disabling a member of a ++group other than the leader only affects that counter - disabling an ++non-leader stops that counter from counting but doesn't affect any ++other counter. ++ ++A process can enable or disable all the counter groups that are ++attached to it, using prctl: ++ ++ prctl(PR_TASK_PERF_COUNTERS_ENABLE); ++ ++ prctl(PR_TASK_PERF_COUNTERS_DISABLE); ++ ++This applies to all counters on the current process, whether created ++by this process or by another, and doesn't affect any counters that ++this process has created on other processes. 
It only enables or ++disables the group leaders, not any other members in the groups. ++ +Index: linux-2.6-tip/Documentation/perf_counter/kerneltop.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/Documentation/perf_counter/kerneltop.c +@@ -0,0 +1,1328 @@ ++/* ++ * kerneltop.c: show top kernel functions - performance counters showcase ++ ++ Build with: ++ ++ cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt ++ ++ Sample output: ++ ++------------------------------------------------------------------------------ ++ KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2) ++------------------------------------------------------------------------------ ++ ++ weight RIP kernel function ++ ______ ________________ _______________ ++ ++ 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev ++ 33.00 - ffffffff804cb740 : sock_alloc_send_skb ++ 31.26 - ffffffff804ce808 : skb_push ++ 22.43 - ffffffff80510004 : tcp_established_options ++ 19.00 - ffffffff8027d250 : find_get_page ++ 15.76 - ffffffff804e4fc9 : eth_type_trans ++ 15.20 - ffffffff804d8baa : dst_release ++ 14.86 - ffffffff804cf5d8 : skb_release_head_state ++ 14.00 - ffffffff802217d5 : read_hpet ++ 12.00 - ffffffff804ffb7f : __ip_local_out ++ 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish ++ 8.54 - ffffffff805001a3 : ip_queue_xmit ++ */ ++ ++/* ++ * perfstat: /usr/bin/time -alike performance counter statistics utility ++ ++ It summarizes the counter events of all tasks (and child tasks), ++ covering all CPUs that the command (or workload) executes on. ++ It only counts the per-task events of the workload started, ++ independent of how many other tasks run on those CPUs. ++ ++ Sample output: ++ ++ $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null ++ ++ Performance counter stats for 'ls': ++ ++ 163516953 instructions ++ 2295 cache-misses ++ 2855182 branch-misses ++ */ ++ ++ /* ++ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar ++ * ++ * Improvements and fixes by: ++ * ++ * Arjan van de Ven ++ * Yanmin Zhang ++ * Wu Fengguang ++ * Mike Galbraith ++ * Paul Mackerras ++ * ++ * Released under the GPL v2. (and only v2, not any later version) ++ */ ++ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "../../include/linux/perf_counter.h" ++ ++ ++/* ++ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all ++ * counters in the current task. 
++ */ ++#define PR_TASK_PERF_COUNTERS_DISABLE 31 ++#define PR_TASK_PERF_COUNTERS_ENABLE 32 ++ ++#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) ++ ++#define rdclock() \ ++({ \ ++ struct timespec ts; \ ++ \ ++ clock_gettime(CLOCK_MONOTONIC, &ts); \ ++ ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ ++}) ++ ++/* ++ * Pick up some kernel type conventions: ++ */ ++#define __user ++#define asmlinkage ++ ++#ifdef __x86_64__ ++#define __NR_perf_counter_open 298 ++#define rmb() asm volatile("lfence" ::: "memory") ++#define cpu_relax() asm volatile("rep; nop" ::: "memory"); ++#endif ++ ++#ifdef __i386__ ++#define __NR_perf_counter_open 336 ++#define rmb() asm volatile("lfence" ::: "memory") ++#define cpu_relax() asm volatile("rep; nop" ::: "memory"); ++#endif ++ ++#ifdef __powerpc__ ++#define __NR_perf_counter_open 319 ++#define rmb() asm volatile ("sync" ::: "memory") ++#define cpu_relax() asm volatile ("" ::: "memory"); ++#endif ++ ++#define unlikely(x) __builtin_expect(!!(x), 0) ++#define min(x, y) ({ \ ++ typeof(x) _min1 = (x); \ ++ typeof(y) _min2 = (y); \ ++ (void) (&_min1 == &_min2); \ ++ _min1 < _min2 ? _min1 : _min2; }) ++ ++asmlinkage int sys_perf_counter_open( ++ struct perf_counter_hw_event *hw_event_uptr __user, ++ pid_t pid, ++ int cpu, ++ int group_fd, ++ unsigned long flags) ++{ ++ return syscall( ++ __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); ++} ++ ++#define MAX_COUNTERS 64 ++#define MAX_NR_CPUS 256 ++ ++#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) ++ ++static int run_perfstat = 0; ++static int system_wide = 0; ++ ++static int nr_counters = 0; ++static __u64 event_id[MAX_COUNTERS] = { ++ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), ++ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), ++ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), ++ EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), ++ ++ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), ++ EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), ++ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), ++ EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), ++}; ++static int default_interval = 100000; ++static int event_count[MAX_COUNTERS]; ++static int fd[MAX_NR_CPUS][MAX_COUNTERS]; ++ ++static __u64 count_filter = 100; ++ ++static int tid = -1; ++static int profile_cpu = -1; ++static int nr_cpus = 0; ++static int nmi = 1; ++static int group = 0; ++static unsigned int page_size; ++static unsigned int mmap_pages = 16; ++ ++static char *vmlinux; ++ ++static char *sym_filter; ++static unsigned long filter_start; ++static unsigned long filter_end; ++ ++static int delay_secs = 2; ++static int zero; ++static int dump_symtab; ++ ++struct source_line { ++ uint64_t EIP; ++ unsigned long count; ++ char *line; ++ struct source_line *next; ++}; ++ ++static struct source_line *lines; ++static struct source_line **lines_tail; ++ ++const unsigned int default_count[] = { ++ 1000000, ++ 1000000, ++ 10000, ++ 10000, ++ 1000000, ++ 10000, ++}; ++ ++static char *hw_event_names[] = { ++ "CPU cycles", ++ "instructions", ++ "cache references", ++ "cache misses", ++ "branches", ++ "branch misses", ++ "bus cycles", ++}; ++ ++static char *sw_event_names[] = { ++ "cpu clock ticks", ++ "task clock ticks", ++ "pagefaults", ++ "context switches", ++ "CPU migrations", ++ "minor faults", ++ "major faults", ++}; ++ ++struct event_symbol { ++ __u64 event; ++ char *symbol; ++}; ++ ++static struct event_symbol event_symbols[] = { ++ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, ++ 
{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, ++ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, ++ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, ++ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, ++ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, ++ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, ++ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, ++ {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, ++ ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, ++ {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, ++}; ++ ++#define __PERF_COUNTER_FIELD(config, name) \ ++ ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) ++ ++#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) ++#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) ++#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) ++#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) ++ ++static void display_events_help(void) ++{ ++ unsigned int i; ++ __u64 e; ++ ++ printf( ++ " -e EVENT --event=EVENT # symbolic-name abbreviations"); ++ ++ for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { ++ int type, id; ++ ++ e = event_symbols[i].event; ++ type = PERF_COUNTER_TYPE(e); ++ id = PERF_COUNTER_ID(e); ++ ++ printf("\n %d:%d: %-20s", ++ type, id, event_symbols[i].symbol); ++ } ++ ++ printf("\n" ++ " rNNN: raw PMU events (eventsel+umask)\n\n"); ++} ++ ++static void display_perfstat_help(void) ++{ ++ printf( ++ "Usage: perfstat [] \n\n" ++ "PerfStat Options (up to %d event types can be specified):\n\n", ++ MAX_COUNTERS); ++ ++ display_events_help(); ++ ++ printf( ++ " -a # system-wide collection\n"); ++ exit(0); ++} ++ ++static void display_help(void) ++{ ++ if (run_perfstat) ++ return display_perfstat_help(); ++ ++ printf( ++ "Usage: kerneltop []\n" ++ " Or: kerneltop -S [] COMMAND [ARGS]\n\n" ++ "KernelTop Options (up to %d event types can be specified at once):\n\n", ++ MAX_COUNTERS); ++ ++ display_events_help(); ++ ++ printf( ++ " -S --stat # perfstat COMMAND\n" ++ " -a # system-wide collection (for perfstat)\n\n" ++ " -c CNT --count=CNT # event period to sample\n\n" ++ " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n" ++ " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n" ++ " -d delay --delay= # sampling/display delay [default: 2]\n" ++ " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" ++ " -s symbol --symbol= # function to be showed annotated one-shot\n" ++ " -x path --vmlinux= # the vmlinux binary, required for -s use\n" ++ " -z --zero # zero counts after display\n" ++ " -D --dump_symtab # dump symbol table to stderr on startup\n" ++ " -m pages --mmap_pages= # number of mmap data pages\n" ++ ); ++ ++ exit(0); ++} ++ 
++static char *event_name(int ctr) ++{ ++ __u64 config = event_id[ctr]; ++ int type = PERF_COUNTER_TYPE(config); ++ int id = PERF_COUNTER_ID(config); ++ static char buf[32]; ++ ++ if (PERF_COUNTER_RAW(config)) { ++ sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); ++ return buf; ++ } ++ ++ switch (type) { ++ case PERF_TYPE_HARDWARE: ++ if (id < PERF_HW_EVENTS_MAX) ++ return hw_event_names[id]; ++ return "unknown-hardware"; ++ ++ case PERF_TYPE_SOFTWARE: ++ if (id < PERF_SW_EVENTS_MAX) ++ return sw_event_names[id]; ++ return "unknown-software"; ++ ++ default: ++ break; ++ } ++ ++ return "unknown"; ++} ++ ++/* ++ * Each event can have multiple symbolic names. ++ * Symbolic names are (almost) exactly matched. ++ */ ++static __u64 match_event_symbols(char *str) ++{ ++ __u64 config, id; ++ int type; ++ unsigned int i; ++ ++ if (sscanf(str, "r%llx", &config) == 1) ++ return config | PERF_COUNTER_RAW_MASK; ++ ++ if (sscanf(str, "%d:%llu", &type, &id) == 2) ++ return EID(type, id); ++ ++ for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { ++ if (!strncmp(str, event_symbols[i].symbol, ++ strlen(event_symbols[i].symbol))) ++ return event_symbols[i].event; ++ } ++ ++ return ~0ULL; ++} ++ ++static int parse_events(char *str) ++{ ++ __u64 config; ++ ++again: ++ if (nr_counters == MAX_COUNTERS) ++ return -1; ++ ++ config = match_event_symbols(str); ++ if (config == ~0ULL) ++ return -1; ++ ++ event_id[nr_counters] = config; ++ nr_counters++; ++ ++ str = strstr(str, ","); ++ if (str) { ++ str++; ++ goto again; ++ } ++ ++ return 0; ++} ++ ++ ++/* ++ * perfstat ++ */ ++ ++char fault_here[1000000]; ++ ++static void create_perfstat_counter(int counter) ++{ ++ struct perf_counter_hw_event hw_event; ++ ++ memset(&hw_event, 0, sizeof(hw_event)); ++ hw_event.config = event_id[counter]; ++ hw_event.record_type = PERF_RECORD_SIMPLE; ++ hw_event.nmi = 0; ++ ++ if (system_wide) { ++ int cpu; ++ for (cpu = 0; cpu < nr_cpus; cpu ++) { ++ fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); ++ if (fd[cpu][counter] < 0) { ++ printf("perfstat error: syscall returned with %d (%s)\n", ++ fd[cpu][counter], strerror(errno)); ++ exit(-1); ++ } ++ } ++ } else { ++ hw_event.inherit = 1; ++ hw_event.disabled = 1; ++ ++ fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); ++ if (fd[0][counter] < 0) { ++ printf("perfstat error: syscall returned with %d (%s)\n", ++ fd[0][counter], strerror(errno)); ++ exit(-1); ++ } ++ } ++} ++ ++int do_perfstat(int argc, char *argv[]) ++{ ++ unsigned long long t0, t1; ++ int counter; ++ ssize_t res; ++ int status; ++ int pid; ++ ++ if (!system_wide) ++ nr_cpus = 1; ++ ++ for (counter = 0; counter < nr_counters; counter++) ++ create_perfstat_counter(counter); ++ ++ argc -= optind; ++ argv += optind; ++ ++ if (!argc) ++ display_help(); ++ ++ /* ++ * Enable counters and exec the command: ++ */ ++ t0 = rdclock(); ++ prctl(PR_TASK_PERF_COUNTERS_ENABLE); ++ ++ if ((pid = fork()) < 0) ++ perror("failed to fork"); ++ if (!pid) { ++ if (execvp(argv[0], argv)) { ++ perror(argv[0]); ++ exit(-1); ++ } ++ } ++ while (wait(&status) >= 0) ++ ; ++ prctl(PR_TASK_PERF_COUNTERS_DISABLE); ++ t1 = rdclock(); ++ ++ fflush(stdout); ++ ++ fprintf(stderr, "\n"); ++ fprintf(stderr, " Performance counter stats for \'%s\':\n", ++ argv[0]); ++ fprintf(stderr, "\n"); ++ ++ for (counter = 0; counter < nr_counters; counter++) { ++ int cpu; ++ __u64 count, single_count; ++ ++ count = 0; ++ for (cpu = 0; cpu < nr_cpus; cpu ++) { ++ res = read(fd[cpu][counter], ++ (char *) &single_count, 
sizeof(single_count)); ++ assert(res == sizeof(single_count)); ++ count += single_count; ++ } ++ ++ if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) || ++ event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { ++ ++ double msecs = (double)count / 1000000; ++ ++ fprintf(stderr, " %14.6f %-20s (msecs)\n", ++ msecs, event_name(counter)); ++ } else { ++ fprintf(stderr, " %14Ld %-20s (events)\n", ++ count, event_name(counter)); ++ } ++ } ++ fprintf(stderr, "\n"); ++ fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", ++ (double)(t1-t0)/1e6); ++ fprintf(stderr, "\n"); ++ ++ return 0; ++} ++ ++/* ++ * Symbols ++ */ ++ ++static uint64_t min_ip; ++static uint64_t max_ip = -1ll; ++ ++struct sym_entry { ++ unsigned long long addr; ++ char *sym; ++ unsigned long count[MAX_COUNTERS]; ++ int skip; ++ struct source_line *source; ++}; ++ ++#define MAX_SYMS 100000 ++ ++static int sym_table_count; ++ ++struct sym_entry *sym_filter_entry; ++ ++static struct sym_entry sym_table[MAX_SYMS]; ++ ++static void show_details(struct sym_entry *sym); ++ ++/* ++ * Ordering weight: count-1 * count-2 * ... / count-n ++ */ ++static double sym_weight(const struct sym_entry *sym) ++{ ++ double weight; ++ int counter; ++ ++ weight = sym->count[0]; ++ ++ for (counter = 1; counter < nr_counters-1; counter++) ++ weight *= sym->count[counter]; ++ ++ weight /= (sym->count[counter] + 1); ++ ++ return weight; ++} ++ ++static int compare(const void *__sym1, const void *__sym2) ++{ ++ const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; ++ ++ return sym_weight(sym1) < sym_weight(sym2); ++} ++ ++static time_t last_refresh; ++static long events; ++static long userspace_events; ++static const char CONSOLE_CLEAR[] = ""; ++ ++static struct sym_entry tmp[MAX_SYMS]; ++ ++static void print_sym_table(void) ++{ ++ int i, printed; ++ int counter; ++ float events_per_sec = events/delay_secs; ++ float kevents_per_sec = (events-userspace_events)/delay_secs; ++ ++ memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count); ++ qsort(tmp, sym_table_count, sizeof(tmp[0]), compare); ++ ++ write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR)); ++ ++ printf( ++"------------------------------------------------------------------------------\n"); ++ printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ", ++ events_per_sec, ++ 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)), ++ nmi ? 
"NMI" : "IRQ"); ++ ++ if (nr_counters == 1) ++ printf("%d ", event_count[0]); ++ ++ for (counter = 0; counter < nr_counters; counter++) { ++ if (counter) ++ printf("/"); ++ ++ printf("%s", event_name(counter)); ++ } ++ ++ printf( "], "); ++ ++ if (tid != -1) ++ printf(" (tid: %d", tid); ++ else ++ printf(" (all"); ++ ++ if (profile_cpu != -1) ++ printf(", cpu: %d)\n", profile_cpu); ++ else { ++ if (tid != -1) ++ printf(")\n"); ++ else ++ printf(", %d CPUs)\n", nr_cpus); ++ } ++ ++ printf("------------------------------------------------------------------------------\n\n"); ++ ++ if (nr_counters == 1) ++ printf(" events"); ++ else ++ printf(" weight events"); ++ ++ printf(" RIP kernel function\n" ++ " ______ ______ ________________ _______________\n\n" ++ ); ++ ++ printed = 0; ++ for (i = 0; i < sym_table_count; i++) { ++ int count; ++ ++ if (nr_counters == 1) { ++ if (printed <= 18 && ++ tmp[i].count[0] >= count_filter) { ++ printf("%19.2f - %016llx : %s\n", ++ sym_weight(tmp + i), tmp[i].addr, tmp[i].sym); ++ printed++; ++ } ++ } else { ++ if (printed <= 18 && ++ tmp[i].count[0] >= count_filter) { ++ printf("%8.1f %10ld - %016llx : %s\n", ++ sym_weight(tmp + i), ++ tmp[i].count[0], ++ tmp[i].addr, tmp[i].sym); ++ printed++; ++ } ++ } ++ /* ++ * Add decay to the counts: ++ */ ++ for (count = 0; count < nr_counters; count++) ++ sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8; ++ } ++ ++ if (sym_filter_entry) ++ show_details(sym_filter_entry); ++ ++ last_refresh = time(NULL); ++ ++ { ++ struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; ++ ++ if (poll(&stdin_poll, 1, 0) == 1) { ++ printf("key pressed - exiting.\n"); ++ exit(0); ++ } ++ } ++} ++ ++static int read_symbol(FILE *in, struct sym_entry *s) ++{ ++ static int filter_match = 0; ++ char *sym, stype; ++ char str[500]; ++ int rc, pos; ++ ++ rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str); ++ if (rc == EOF) ++ return -1; ++ ++ assert(rc == 3); ++ ++ /* skip until end of line: */ ++ pos = strlen(str); ++ do { ++ rc = fgetc(in); ++ if (rc == '\n' || rc == EOF || pos >= 499) ++ break; ++ str[pos] = rc; ++ pos++; ++ } while (1); ++ str[pos] = 0; ++ ++ sym = str; ++ ++ /* Filter out known duplicates and non-text symbols. */ ++ if (!strcmp(sym, "_text")) ++ return 1; ++ if (!min_ip && !strcmp(sym, "_stext")) ++ return 1; ++ if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext")) ++ return 1; ++ if (stype != 'T' && stype != 't') ++ return 1; ++ if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14)) ++ return 1; ++ if (strstr(sym, "_text_start") || strstr(sym, "_text_end")) ++ return 1; ++ ++ s->sym = malloc(strlen(str)); ++ assert(s->sym); ++ ++ strcpy((char *)s->sym, str); ++ s->skip = 0; ++ ++ /* Tag events to be skipped. 
*/ ++ if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym)) ++ s->skip = 1; ++ else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym)) ++ s->skip = 1; ++ else if (!strcmp("mwait_idle", s->sym)) ++ s->skip = 1; ++ ++ if (filter_match == 1) { ++ filter_end = s->addr; ++ filter_match = -1; ++ if (filter_end - filter_start > 10000) { ++ printf("hm, too large filter symbol <%s> - skipping.\n", ++ sym_filter); ++ printf("symbol filter start: %016lx\n", filter_start); ++ printf(" end: %016lx\n", filter_end); ++ filter_end = filter_start = 0; ++ sym_filter = NULL; ++ sleep(1); ++ } ++ } ++ if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) { ++ filter_match = 1; ++ filter_start = s->addr; ++ } ++ ++ return 0; ++} ++ ++int compare_addr(const void *__sym1, const void *__sym2) ++{ ++ const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; ++ ++ return sym1->addr > sym2->addr; ++} ++ ++static void sort_symbol_table(void) ++{ ++ int i, dups; ++ ++ do { ++ qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr); ++ for (i = 0, dups = 0; i < sym_table_count; i++) { ++ if (sym_table[i].addr == sym_table[i+1].addr) { ++ sym_table[i+1].addr = -1ll; ++ dups++; ++ } ++ } ++ sym_table_count -= dups; ++ } while(dups); ++} ++ ++static void parse_symbols(void) ++{ ++ struct sym_entry *last; ++ ++ FILE *kallsyms = fopen("/proc/kallsyms", "r"); ++ ++ if (!kallsyms) { ++ printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n"); ++ exit(-1); ++ } ++ ++ while (!feof(kallsyms)) { ++ if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) { ++ sym_table_count++; ++ assert(sym_table_count <= MAX_SYMS); ++ } ++ } ++ ++ sort_symbol_table(); ++ min_ip = sym_table[0].addr; ++ max_ip = sym_table[sym_table_count-1].addr; ++ last = sym_table + sym_table_count++; ++ ++ last->addr = -1ll; ++ last->sym = ""; ++ ++ if (filter_end) { ++ int count; ++ for (count=0; count < sym_table_count; count ++) { ++ if (!strcmp(sym_table[count].sym, sym_filter)) { ++ sym_filter_entry = &sym_table[count]; ++ break; ++ } ++ } ++ } ++ if (dump_symtab) { ++ int i; ++ ++ for (i = 0; i < sym_table_count; i++) ++ fprintf(stderr, "%llx %s\n", ++ sym_table[i].addr, sym_table[i].sym); ++ } ++} ++ ++/* ++ * Source lines ++ */ ++ ++static void parse_vmlinux(char *filename) ++{ ++ FILE *file; ++ char command[PATH_MAX*2]; ++ if (!filename) ++ return; ++ ++ sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename); ++ ++ file = popen(command, "r"); ++ if (!file) ++ return; ++ ++ lines_tail = &lines; ++ while (!feof(file)) { ++ struct source_line *src; ++ size_t dummy = 0; ++ char *c; ++ ++ src = malloc(sizeof(struct source_line)); ++ assert(src != NULL); ++ memset(src, 0, sizeof(struct source_line)); ++ ++ if (getline(&src->line, &dummy, file) < 0) ++ break; ++ if (!src->line) ++ break; ++ ++ c = strchr(src->line, '\n'); ++ if (c) ++ *c = 0; ++ ++ src->next = NULL; ++ *lines_tail = src; ++ lines_tail = &src->next; ++ ++ if (strlen(src->line)>8 && src->line[8] == ':') ++ src->EIP = strtoull(src->line, NULL, 16); ++ if (strlen(src->line)>8 && src->line[16] == ':') ++ src->EIP = strtoull(src->line, NULL, 16); ++ } ++ pclose(file); ++} ++ ++static void record_precise_ip(uint64_t ip) ++{ ++ struct source_line *line; ++ ++ for (line = lines; line; line = line->next) { ++ if (line->EIP == ip) ++ line->count++; ++ if (line->EIP > ip) ++ break; ++ } ++} ++ ++static void lookup_sym_in_vmlinux(struct sym_entry *sym) ++{ ++ struct 
source_line *line; ++ char pattern[PATH_MAX]; ++ sprintf(pattern, "<%s>:", sym->sym); ++ ++ for (line = lines; line; line = line->next) { ++ if (strstr(line->line, pattern)) { ++ sym->source = line; ++ break; ++ } ++ } ++} ++ ++static void show_lines(struct source_line *line_queue, int line_queue_count) ++{ ++ int i; ++ struct source_line *line; ++ ++ line = line_queue; ++ for (i = 0; i < line_queue_count; i++) { ++ printf("%8li\t%s\n", line->count, line->line); ++ line = line->next; ++ } ++} ++ ++#define TRACE_COUNT 3 ++ ++static void show_details(struct sym_entry *sym) ++{ ++ struct source_line *line; ++ struct source_line *line_queue = NULL; ++ int displayed = 0; ++ int line_queue_count = 0; ++ ++ if (!sym->source) ++ lookup_sym_in_vmlinux(sym); ++ if (!sym->source) ++ return; ++ ++ printf("Showing details for %s\n", sym->sym); ++ ++ line = sym->source; ++ while (line) { ++ if (displayed && strstr(line->line, ">:")) ++ break; ++ ++ if (!line_queue_count) ++ line_queue = line; ++ line_queue_count ++; ++ ++ if (line->count >= count_filter) { ++ show_lines(line_queue, line_queue_count); ++ line_queue_count = 0; ++ line_queue = NULL; ++ } else if (line_queue_count > TRACE_COUNT) { ++ line_queue = line_queue->next; ++ line_queue_count --; ++ } ++ ++ line->count = 0; ++ displayed++; ++ if (displayed > 300) ++ break; ++ line = line->next; ++ } ++} ++ ++/* ++ * Binary search in the histogram table and record the hit: ++ */ ++static void record_ip(uint64_t ip, int counter) ++{ ++ int left_idx, middle_idx, right_idx, idx; ++ unsigned long left, middle, right; ++ ++ record_precise_ip(ip); ++ ++ left_idx = 0; ++ right_idx = sym_table_count-1; ++ assert(ip <= max_ip && ip >= min_ip); ++ ++ while (left_idx + 1 < right_idx) { ++ middle_idx = (left_idx + right_idx) / 2; ++ ++ left = sym_table[ left_idx].addr; ++ middle = sym_table[middle_idx].addr; ++ right = sym_table[ right_idx].addr; ++ ++ if (!(left <= middle && middle <= right)) { ++ printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right); ++ printf("%d %d %d\n", left_idx, middle_idx, right_idx); ++ } ++ assert(left <= middle && middle <= right); ++ if (!(left <= ip && ip <= right)) { ++ printf(" left: %016lx\n", left); ++ printf(" ip: %016lx\n", (unsigned long)ip); ++ printf("right: %016lx\n", right); ++ } ++ assert(left <= ip && ip <= right); ++ /* ++ * [ left .... target .... middle .... right ] ++ * => right := middle ++ */ ++ if (ip < middle) { ++ right_idx = middle_idx; ++ continue; ++ } ++ /* ++ * [ left .... middle ... target ... 
right ] ++ * => left := middle ++ */ ++ left_idx = middle_idx; ++ } ++ ++ idx = left_idx; ++ ++ if (!sym_table[idx].skip) ++ sym_table[idx].count[counter]++; ++ else events--; ++} ++ ++static void process_event(uint64_t ip, int counter) ++{ ++ events++; ++ ++ if (ip < min_ip || ip > max_ip) { ++ userspace_events++; ++ return; ++ } ++ ++ record_ip(ip, counter); ++} ++ ++static void process_options(int argc, char *argv[]) ++{ ++ int error = 0, counter; ++ ++ if (strstr(argv[0], "perfstat")) ++ run_perfstat = 1; ++ ++ for (;;) { ++ int option_index = 0; ++ /** Options for getopt */ ++ static struct option long_options[] = { ++ {"count", required_argument, NULL, 'c'}, ++ {"cpu", required_argument, NULL, 'C'}, ++ {"delay", required_argument, NULL, 'd'}, ++ {"dump_symtab", no_argument, NULL, 'D'}, ++ {"event", required_argument, NULL, 'e'}, ++ {"filter", required_argument, NULL, 'f'}, ++ {"group", required_argument, NULL, 'g'}, ++ {"help", no_argument, NULL, 'h'}, ++ {"nmi", required_argument, NULL, 'n'}, ++ {"pid", required_argument, NULL, 'p'}, ++ {"vmlinux", required_argument, NULL, 'x'}, ++ {"symbol", required_argument, NULL, 's'}, ++ {"stat", no_argument, NULL, 'S'}, ++ {"zero", no_argument, NULL, 'z'}, ++ {"mmap_pages", required_argument, NULL, 'm'}, ++ {NULL, 0, NULL, 0 } ++ }; ++ int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z", ++ long_options, &option_index); ++ if (c == -1) ++ break; ++ ++ switch (c) { ++ case 'a': system_wide = 1; break; ++ case 'c': default_interval = atoi(optarg); break; ++ case 'C': ++ /* CPU and PID are mutually exclusive */ ++ if (tid != -1) { ++ printf("WARNING: CPU switch overriding PID\n"); ++ sleep(1); ++ tid = -1; ++ } ++ profile_cpu = atoi(optarg); break; ++ case 'd': delay_secs = atoi(optarg); break; ++ case 'D': dump_symtab = 1; break; ++ ++ case 'e': error = parse_events(optarg); break; ++ ++ case 'f': count_filter = atoi(optarg); break; ++ case 'g': group = atoi(optarg); break; ++ case 'h': display_help(); break; ++ case 'n': nmi = atoi(optarg); break; ++ case 'p': ++ /* CPU and PID are mutually exclusive */ ++ if (profile_cpu != -1) { ++ printf("WARNING: PID switch overriding CPU\n"); ++ sleep(1); ++ profile_cpu = -1; ++ } ++ tid = atoi(optarg); break; ++ case 's': sym_filter = strdup(optarg); break; ++ case 'S': run_perfstat = 1; break; ++ case 'x': vmlinux = strdup(optarg); break; ++ case 'z': zero = 1; break; ++ case 'm': mmap_pages = atoi(optarg); break; ++ default: error = 1; break; ++ } ++ } ++ if (error) ++ display_help(); ++ ++ if (!nr_counters) { ++ if (run_perfstat) ++ nr_counters = 8; ++ else { ++ nr_counters = 1; ++ event_id[0] = 0; ++ } ++ } ++ ++ for (counter = 0; counter < nr_counters; counter++) { ++ if (event_count[counter]) ++ continue; ++ ++ event_count[counter] = default_interval; ++ } ++} ++ ++struct mmap_data { ++ int counter; ++ void *base; ++ unsigned int mask; ++ unsigned int prev; ++}; ++ ++static unsigned int mmap_read_head(struct mmap_data *md) ++{ ++ struct perf_counter_mmap_page *pc = md->base; ++ unsigned int seq, head; ++ ++repeat: ++ rmb(); ++ seq = pc->lock; ++ ++ if (unlikely(seq & 1)) { ++ cpu_relax(); ++ goto repeat; ++ } ++ ++ head = pc->data_head; ++ ++ rmb(); ++ if (pc->lock != seq) ++ goto repeat; ++ ++ return head; ++} ++ ++struct timeval last_read, this_read; ++ ++static void mmap_read(struct mmap_data *md) ++{ ++ unsigned int head = mmap_read_head(md); ++ unsigned int old = md->prev; ++ unsigned char *data = md->base + page_size; ++ int diff; ++ ++ gettimeofday(&this_read, NULL); ++ ++ /* ++ * 
If we're further behind than half the buffer, there's a chance ++ * the writer will bite our tail and screw up the events under us. ++ * ++ * If we somehow ended up ahead of the head, we got messed up. ++ * ++ * In either case, truncate and restart at head. ++ */ ++ diff = head - old; ++ if (diff > md->mask / 2 || diff < 0) { ++ struct timeval iv; ++ unsigned long msecs; ++ ++ timersub(&this_read, &last_read, &iv); ++ msecs = iv.tv_sec*1000 + iv.tv_usec/1000; ++ ++ fprintf(stderr, "WARNING: failed to keep up with mmap data." ++ " Last read %lu msecs ago.\n", msecs); ++ ++ /* ++ * head points to a known good entry, start there. ++ */ ++ old = head; ++ } ++ ++ last_read = this_read; ++ ++ for (; old != head;) { ++ struct event_struct { ++ struct perf_event_header header; ++ __u64 ip; ++ __u32 pid, tid; ++ } *event = (struct event_struct *)&data[old & md->mask]; ++ struct event_struct event_copy; ++ ++ unsigned int size = event->header.size; ++ ++ /* ++ * Event straddles the mmap boundary -- header should always ++ * be inside due to u64 alignment of output. ++ */ ++ if ((old & md->mask) + size != ((old + size) & md->mask)) { ++ unsigned int offset = old; ++ unsigned int len = sizeof(*event), cpy; ++ void *dst = &event_copy; ++ ++ do { ++ cpy = min(md->mask + 1 - (offset & md->mask), len); ++ memcpy(dst, &data[offset & md->mask], cpy); ++ offset += cpy; ++ dst += cpy; ++ len -= cpy; ++ } while (len); ++ ++ event = &event_copy; ++ } ++ ++ old += size; ++ ++ switch (event->header.type) { ++ case PERF_EVENT_IP: ++ case PERF_EVENT_IP | __PERF_EVENT_TID: ++ process_event(event->ip, md->counter); ++ break; ++ } ++ } ++ ++ md->prev = old; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; ++ struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; ++ struct perf_counter_hw_event hw_event; ++ int i, counter, group_fd, nr_poll = 0; ++ unsigned int cpu; ++ int ret; ++ ++ page_size = sysconf(_SC_PAGE_SIZE); ++ ++ process_options(argc, argv); ++ ++ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); ++ assert(nr_cpus <= MAX_NR_CPUS); ++ assert(nr_cpus >= 0); ++ ++ if (run_perfstat) ++ return do_perfstat(argc, argv); ++ ++ if (tid != -1 || profile_cpu != -1) ++ nr_cpus = 1; ++ ++ parse_symbols(); ++ if (vmlinux && sym_filter_entry) ++ parse_vmlinux(vmlinux); ++ ++ for (i = 0; i < nr_cpus; i++) { ++ group_fd = -1; ++ for (counter = 0; counter < nr_counters; counter++) { ++ ++ cpu = profile_cpu; ++ if (tid == -1 && profile_cpu == -1) ++ cpu = i; ++ ++ memset(&hw_event, 0, sizeof(hw_event)); ++ hw_event.config = event_id[counter]; ++ hw_event.irq_period = event_count[counter]; ++ hw_event.record_type = PERF_RECORD_IRQ; ++ hw_event.nmi = nmi; ++ hw_event.include_tid = 1; ++ ++ fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); ++ if (fd[i][counter] < 0) { ++ int err = errno; ++ printf("kerneltop error: syscall returned with %d (%s)\n", ++ fd[i][counter], strerror(err)); ++ if (err == EPERM) ++ printf("Are you root?\n"); ++ exit(-1); ++ } ++ assert(fd[i][counter] >= 0); ++ fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); ++ ++ /* ++ * First counter acts as the group leader: ++ */ ++ if (group && group_fd == -1) ++ group_fd = fd[i][counter]; ++ ++ event_array[nr_poll].fd = fd[i][counter]; ++ event_array[nr_poll].events = POLLIN; ++ nr_poll++; ++ ++ mmap_array[i][counter].counter = counter; ++ mmap_array[i][counter].prev = 0; ++ mmap_array[i][counter].mask = mmap_pages*page_size - 1; ++ mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size, 
++ PROT_READ, MAP_SHARED, fd[i][counter], 0); ++ if (mmap_array[i][counter].base == MAP_FAILED) { ++ printf("kerneltop error: failed to mmap with %d (%s)\n", ++ errno, strerror(errno)); ++ exit(-1); ++ } ++ } ++ } ++ ++ printf("KernelTop refresh period: %d seconds\n", delay_secs); ++ last_refresh = time(NULL); ++ ++ while (1) { ++ int hits = events; ++ ++ for (i = 0; i < nr_cpus; i++) { ++ for (counter = 0; counter < nr_counters; counter++) ++ mmap_read(&mmap_array[i][counter]); ++ } ++ ++ if (time(NULL) >= last_refresh + delay_secs) { ++ print_sym_table(); ++ events = userspace_events = 0; ++ } ++ ++ if (hits == events) ++ ret = poll(event_array, nr_poll, 1000); ++ hits = events; ++ } ++ ++ return 0; ++} +Index: linux-2.6-tip/Documentation/scheduler/00-INDEX +=================================================================== +--- linux-2.6-tip.orig/Documentation/scheduler/00-INDEX ++++ linux-2.6-tip/Documentation/scheduler/00-INDEX +@@ -2,8 +2,6 @@ + - this file. + sched-arch.txt + - CPU Scheduler implementation hints for architecture specific code. +-sched-coding.txt +- - reference for various scheduler-related methods in the O(1) scheduler. + sched-design-CFS.txt + - goals, design and implementation of the Complete Fair Scheduler. + sched-domains.txt +Index: linux-2.6-tip/Documentation/scheduler/sched-coding.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/scheduler/sched-coding.txt ++++ /dev/null +@@ -1,126 +0,0 @@ +- Reference for various scheduler-related methods in the O(1) scheduler +- Robert Love , MontaVista Software +- +- +-Note most of these methods are local to kernel/sched.c - this is by design. +-The scheduler is meant to be self-contained and abstracted away. This document +-is primarily for understanding the scheduler, not interfacing to it. Some of +-the discussed interfaces, however, are general process/scheduling methods. +-They are typically defined in include/linux/sched.h. +- +- +-Main Scheduling Methods +------------------------ +- +-void load_balance(runqueue_t *this_rq, int idle) +- Attempts to pull tasks from one cpu to another to balance cpu usage, +- if needed. This method is called explicitly if the runqueues are +- imbalanced or periodically by the timer tick. Prior to calling, +- the current runqueue must be locked and interrupts disabled. +- +-void schedule() +- The main scheduling function. Upon return, the highest priority +- process will be active. +- +- +-Locking +-------- +- +-Each runqueue has its own lock, rq->lock. When multiple runqueues need +-to be locked, lock acquires must be ordered by ascending &runqueue value. +- +-A specific runqueue is locked via +- +- task_rq_lock(task_t pid, unsigned long *flags) +- +-which disables preemption, disables interrupts, and locks the runqueue pid is +-running on. Likewise, +- +- task_rq_unlock(task_t pid, unsigned long *flags) +- +-unlocks the runqueue pid is running on, restores interrupts to their previous +-state, and reenables preemption. +- +-The routines +- +- double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +- +-and +- +- double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +- +-safely lock and unlock, respectively, the two specified runqueues. They do +-not, however, disable and restore interrupts. Users are required to do so +-manually before and after calls. +- +- +-Values +------- +- +-MAX_PRIO +- The maximum priority of the system, stored in the task as task->prio. +- Lower priorities are higher. 
Normal (non-RT) priorities range from +- MAX_RT_PRIO to (MAX_PRIO - 1). +-MAX_RT_PRIO +- The maximum real-time priority of the system. Valid RT priorities +- range from 0 to (MAX_RT_PRIO - 1). +-MAX_USER_RT_PRIO +- The maximum real-time priority that is exported to user-space. Should +- always be equal to or less than MAX_RT_PRIO. Setting it less allows +- kernel threads to have higher priorities than any user-space task. +-MIN_TIMESLICE +-MAX_TIMESLICE +- Respectively, the minimum and maximum timeslices (quanta) of a process. +- +-Data +----- +- +-struct runqueue +- The main per-CPU runqueue data structure. +-struct task_struct +- The main per-process data structure. +- +- +-General Methods +---------------- +- +-cpu_rq(cpu) +- Returns the runqueue of the specified cpu. +-this_rq() +- Returns the runqueue of the current cpu. +-task_rq(pid) +- Returns the runqueue which holds the specified pid. +-cpu_curr(cpu) +- Returns the task currently running on the given cpu. +-rt_task(pid) +- Returns true if pid is real-time, false if not. +- +- +-Process Control Methods +------------------------ +- +-void set_user_nice(task_t *p, long nice) +- Sets the "nice" value of task p to the given value. +-int setscheduler(pid_t pid, int policy, struct sched_param *param) +- Sets the scheduling policy and parameters for the given pid. +-int set_cpus_allowed(task_t *p, unsigned long new_mask) +- Sets a given task's CPU affinity and migrates it to a proper cpu. +- Callers must have a valid reference to the task and assure the +- task not exit prematurely. No locks can be held during the call. +-set_task_state(tsk, state_value) +- Sets the given task's state to the given value. +-set_current_state(state_value) +- Sets the current task's state to the given value. +-void set_tsk_need_resched(struct task_struct *tsk) +- Sets need_resched in the given task. +-void clear_tsk_need_resched(struct task_struct *tsk) +- Clears need_resched in the given task. +-void set_need_resched() +- Sets need_resched in the current task. +-void clear_need_resched() +- Clears need_resched in the current task. +-int need_resched() +- Returns true if need_resched is set in the current task, false +- otherwise. +-yield() +- Place the current process at the end of the runqueue and call schedule. +Index: linux-2.6-tip/Documentation/sysrq.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/sysrq.txt ++++ linux-2.6-tip/Documentation/sysrq.txt +@@ -113,6 +113,8 @@ On all - write a character to /proc/sys + + 'x' - Used by xmon interface on ppc/powerpc platforms. + ++'z' - Dump the ftrace buffer ++ + '0'-'9' - Sets the console log level, controlling which kernel messages + will be printed to your console. ('0', for example would make + it so that only emergency messages like PANICs or OOPSes would +Index: linux-2.6-tip/Documentation/tracepoints.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/tracepoints.txt ++++ linux-2.6-tip/Documentation/tracepoints.txt +@@ -45,8 +45,8 @@ In include/trace/subsys.h : + #include + + DECLARE_TRACE(subsys_eventname, +- TPPROTO(int firstarg, struct task_struct *p), +- TPARGS(firstarg, p)); ++ TP_PROTO(int firstarg, struct task_struct *p), ++ TP_ARGS(firstarg, p)); + + In subsys/file.c (where the tracing statement must be added) : + +@@ -66,10 +66,10 @@ Where : + - subsys is the name of your subsystem. + - eventname is the name of the event to trace. 
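As an aside on the tracepoints.txt hunk above: it renames the TPPROTO/TPARGS helpers to TP_PROTO/TP_ARGS. A minimal sketch of the documented usage with the renamed macros follows; the subsys/eventname identifiers and the somefct() caller are the placeholder names used by tracepoints.txt itself, not code carried by this patch.

/* include/trace/subsys.h -- declaration (sketch) */
#include <linux/tracepoint.h>

DECLARE_TRACE(subsys_eventname,
	TP_PROTO(int firstarg, struct task_struct *p),
	TP_ARGS(firstarg, p));

/* subsys/file.c -- definition and a call site (sketch) */
#include <trace/subsys.h>

DEFINE_TRACE(subsys_eventname);

void somefct(int firstarg, struct task_struct *p)
{
	/* compiles to a near no-op unless a probe is registered */
	trace_subsys_eventname(firstarg, p);
}
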
+ +-- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the ++- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the + function called by this tracepoint. + +-- TPARGS(firstarg, p) are the parameters names, same as found in the ++- TP_ARGS(firstarg, p) are the parameters names, same as found in the + prototype. + + Connecting a function (probe) to a tracepoint is done by providing a +@@ -103,13 +103,14 @@ used to export the defined tracepoints. + + * Probe / tracepoint example + +-See the example provided in samples/tracepoints/src ++See the example provided in samples/tracepoints + +-Compile them with your kernel. ++Compile them with your kernel. They are built during 'make' (not ++'make modules') when CONFIG_SAMPLE_TRACEPOINTS=m. + + Run, as root : +-modprobe tracepoint-example (insmod order is not important) +-modprobe tracepoint-probe-example +-cat /proc/tracepoint-example (returns an expected error) +-rmmod tracepoint-example tracepoint-probe-example ++modprobe tracepoint-sample (insmod order is not important) ++modprobe tracepoint-probe-sample ++cat /proc/tracepoint-sample (returns an expected error) ++rmmod tracepoint-sample tracepoint-probe-sample + dmesg +Index: linux-2.6-tip/Documentation/vm/kmemtrace.txt +=================================================================== +--- /dev/null ++++ linux-2.6-tip/Documentation/vm/kmemtrace.txt +@@ -0,0 +1,126 @@ ++ kmemtrace - Kernel Memory Tracer ++ ++ by Eduard - Gabriel Munteanu ++ ++ ++I. Introduction ++=============== ++ ++kmemtrace helps kernel developers figure out two things: ++1) how different allocators (SLAB, SLUB etc.) perform ++2) how kernel code allocates memory and how much ++ ++To do this, we trace every allocation and export information to the userspace ++through the relay interface. We export things such as the number of requested ++bytes, the number of bytes actually allocated (i.e. including internal ++fragmentation), whether this is a slab allocation or a plain kmalloc() and so ++on. ++ ++The actual analysis is performed by a userspace tool (see section III for ++details on where to get it from). It logs the data exported by the kernel, ++processes it and (as of writing this) can provide the following information: ++- the total amount of memory allocated and fragmentation per call-site ++- the amount of memory allocated and fragmentation per allocation ++- total memory allocated and fragmentation in the collected dataset ++- number of cross-CPU allocation and frees (makes sense in NUMA environments) ++ ++Moreover, it can potentially find inconsistent and erroneous behavior in ++kernel code, such as using slab free functions on kmalloc'ed memory or ++allocating less memory than requested (but not truly failed allocations). ++ ++kmemtrace also makes provisions for tracing on some arch and analysing the ++data on another. ++ ++II. Design and goals ++==================== ++ ++kmemtrace was designed to handle rather large amounts of data. Thus, it uses ++the relay interface to export whatever is logged to userspace, which then ++stores it. Analysis and reporting is done asynchronously, that is, after the ++data is collected and stored. By design, it allows one to log and analyse ++on different machines and different arches. ++ ++As of writing this, the ABI is not considered stable, though it might not ++change much. However, no guarantees are made about compatibility yet. When ++deemed stable, the ABI should still allow easy extension while maintaining ++backward compatibility. 
This is described further in Documentation/ABI. ++ ++Summary of design goals: ++ - allow logging and analysis to be done across different machines ++ - be fast and anticipate usage in high-load environments (*) ++ - be reasonably extensible ++ - make it possible for GNU/Linux distributions to have kmemtrace ++ included in their repositories ++ ++(*) - one of the reasons Pekka Enberg's original userspace data analysis ++ tool's code was rewritten from Perl to C (although this is more than a ++ simple conversion) ++ ++ ++III. Quick usage guide ++====================== ++ ++1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable ++CONFIG_KMEMTRACE). ++ ++2) Get the userspace tool and build it: ++$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository ++$ cd kmemtrace-user/ ++$ ./autogen.sh ++$ ./configure ++$ make ++ ++3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the ++'single' runlevel (so that relay buffers don't fill up easily), and run ++kmemtrace: ++# '$' does not mean user, but root here. ++$ mount -t debugfs none /sys/kernel/debug ++$ mount -t proc none /proc ++$ cd path/to/kmemtrace-user/ ++$ ./kmemtraced ++Wait a bit, then stop it with CTRL+C. ++$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't ++ # overrun, should ++ # be zero. ++$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to ++ check its correctness] ++$ ./kmemtrace-report ++ ++Now you should have a nice and short summary of how the allocator performs. ++ ++IV. FAQ and known issues ++======================== ++ ++Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix ++this? Should I worry? ++A: If it's non-zero, this affects kmemtrace's accuracy, depending on how ++large the number is. You can fix it by supplying a higher ++'kmemtrace.subbufs=N' kernel parameter. ++--- ++ ++Q: kmemtrace_check reports errors, how do I fix this? Should I worry? ++A: This is a bug and should be reported. It can occur for a variety of ++reasons: ++ - possible bugs in relay code ++ - possible misuse of relay by kmemtrace ++ - timestamps being collected unorderly ++Or you may fix it yourself and send us a patch. ++--- ++ ++Q: kmemtrace_report shows many errors, how do I fix this? Should I worry? ++A: This is a known issue and I'm working on it. These might be true errors ++in kernel code, which may have inconsistent behavior (e.g. allocating memory ++with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed ++out this behavior may work with SLAB, but may fail with other allocators. ++ ++It may also be due to lack of tracing in some unusual allocator functions. ++ ++We don't want bug reports regarding this issue yet. ++--- ++ ++V. 
See also ++=========== ++ ++Documentation/kernel-parameters.txt ++Documentation/ABI/testing/debugfs-kmemtrace ++ +Index: linux-2.6-tip/Documentation/x86/boot.txt +=================================================================== +--- linux-2.6-tip.orig/Documentation/x86/boot.txt ++++ linux-2.6-tip/Documentation/x86/boot.txt +@@ -158,7 +158,7 @@ Offset Proto Name Meaning + 0202/4 2.00+ header Magic signature "HdrS" + 0206/2 2.00+ version Boot protocol version supported + 0208/4 2.00+ realmode_swtch Boot loader hook (see below) +-020C/2 2.00+ start_sys The load-low segment (0x1000) (obsolete) ++020C/2 2.00+ start_sys_seg The load-low segment (0x1000) (obsolete) + 020E/2 2.00+ kernel_version Pointer to kernel version string + 0210/1 2.00+ type_of_loader Boot loader identifier + 0211/1 2.00+ loadflags Boot protocol option flags +@@ -170,10 +170,11 @@ Offset Proto Name Meaning + 0224/2 2.01+ heap_end_ptr Free memory after setup end + 0226/2 N/A pad1 Unused + 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line +-022C/4 2.03+ initrd_addr_max Highest legal initrd address ++022C/4 2.03+ ramdisk_max Highest legal initrd address + 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel + 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not +-0235/3 N/A pad2 Unused ++0235/1 N/A pad2 Unused ++0236/2 N/A pad3 Unused + 0238/4 2.06+ cmdline_size Maximum size of the kernel command line + 023C/4 2.07+ hardware_subarch Hardware subarchitecture + 0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data +@@ -299,14 +300,14 @@ Protocol: 2.00+ + e.g. 0x0204 for version 2.04, and 0x0a11 for a hypothetical version + 10.17. + +-Field name: readmode_swtch ++Field name: realmode_swtch + Type: modify (optional) + Offset/size: 0x208/4 + Protocol: 2.00+ + + Boot loader hook (see ADVANCED BOOT LOADER HOOKS below.) + +-Field name: start_sys ++Field name: start_sys_seg + Type: read + Offset/size: 0x20c/2 + Protocol: 2.00+ +@@ -468,7 +469,7 @@ Protocol: 2.02+ + zero, the kernel will assume that your boot loader does not support + the 2.02+ protocol. + +-Field name: initrd_addr_max ++Field name: ramdisk_max + Type: read + Offset/size: 0x22c/4 + Protocol: 2.03+ +@@ -542,7 +543,10 @@ Protocol: 2.08+ + + The payload may be compressed. The format of both the compressed and + uncompressed data should be determined using the standard magic +- numbers. Currently only gzip compressed ELF is used. ++ numbers. The currently supported compression formats are gzip ++ (magic numbers 1F 8B or 1F 9E), bzip2 (magic number 42 5A) and LZMA ++ (magic number 5D 00). The uncompressed payload is currently always ELF ++ (magic number 7F 45 4C 46). + + Field name: payload_length + Type: read +Index: linux-2.6-tip/Documentation/x86/earlyprintk.txt +=================================================================== +--- /dev/null ++++ linux-2.6-tip/Documentation/x86/earlyprintk.txt +@@ -0,0 +1,101 @@ ++ ++Mini-HOWTO for using the earlyprintk=dbgp boot option with a ++USB2 Debug port key and a debug cable, on x86 systems. ++ ++You need two computers, the 'USB debug key' special gadget and ++and two USB cables, connected like this: ++ ++ [host/target] <-------> [USB debug key] <-------> [client/console] ++ ++1. There are three specific hardware requirements: ++ ++ a.) Host/target system needs to have USB debug port capability. ++ ++ You can check this capability by looking at a 'Debug port' bit in ++ the lspci -vvv output: ++ ++ # lspci -vvv ++ ... 
++ 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI]) ++ Subsystem: Lenovo ThinkPad T61 ++ Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx- ++ Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- SERR- /proc/sysrq-trigger ++ ++ On the host/target system you should see this help line in "dmesg" output: ++ ++ SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z) ++ ++ On the client/console system do: ++ ++ cat /dev/ttyUSB0 ++ ++ And you should see the help line above displayed shortly after you've ++ provoked it on the host system. ++ ++If it does not work then please ask about it on the linux-kernel@vger.kernel.org ++mailing list or contact the x86 maintainers. +Index: linux-2.6-tip/MAINTAINERS +=================================================================== +--- linux-2.6-tip.orig/MAINTAINERS ++++ linux-2.6-tip/MAINTAINERS +@@ -1952,6 +1952,15 @@ L: linux-media@vger.kernel.org + T: git kernel.org:/pub/scm/linux/kernel/git/mchehab/linux-2.6.git + S: Maintained + ++HARDWARE LATENCY DETECTOR ++P: Jon Masters ++M: jcm@jonmasters.org ++W: http://www.kernel.org/pub/linux/kernel/people/jcm/hwlat_detector/ ++S: Supported ++L: linux-kernel@vger.kernel.org ++F: Documentation/hwlat_detector.txt ++F: drivers/misc/hwlat_detector.c ++ + HARDWARE MONITORING + L: lm-sensors@lm-sensors.org + W: http://www.lm-sensors.org/ +@@ -2621,6 +2630,20 @@ M: jason.wessel@windriver.com + L: kgdb-bugreport@lists.sourceforge.net + S: Maintained + ++KMEMCHECK ++P: Vegard Nossum ++M: vegardno@ifi.uio.no ++P Pekka Enberg ++M: penberg@cs.helsinki.fi ++L: linux-kernel@vger.kernel.org ++S: Maintained ++ ++KMEMTRACE ++P: Eduard - Gabriel Munteanu ++M: eduard.munteanu@linux360.ro ++L: linux-kernel@vger.kernel.org ++S: Maintained ++ + KPROBES + P: Ananth N Mavinakayanahalli + M: ananth@in.ibm.com +Index: linux-2.6-tip/Makefile +=================================================================== +--- linux-2.6-tip.orig/Makefile ++++ linux-2.6-tip/Makefile +@@ -533,8 +533,9 @@ KBUILD_CFLAGS += $(call cc-option,-Wfram + endif + + # Force gcc to behave correct even for buggy distributions +-# Arch Makefiles may override this setting ++ifndef CONFIG_CC_STACKPROTECTOR + KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) ++endif + + ifdef CONFIG_FRAME_POINTER + KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls +@@ -551,6 +552,10 @@ ifdef CONFIG_FUNCTION_TRACER + KBUILD_CFLAGS += -pg + endif + ++ifndef CONFIG_ALLOW_WARNINGS ++KBUILD_CFLAGS += -Werror ${WERROR} ++endif ++ + # We trigger additional mismatches with less inlining + ifdef CONFIG_DEBUG_SECTION_MISMATCH + KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once) +Index: linux-2.6-tip/arch/Kconfig +=================================================================== +--- linux-2.6-tip.orig/arch/Kconfig ++++ linux-2.6-tip/arch/Kconfig +@@ -6,6 +6,7 @@ config OPROFILE + tristate "OProfile system profiling (EXPERIMENTAL)" + depends on PROFILING + depends on HAVE_OPROFILE ++ depends on TRACING_SUPPORT + select TRACING + select RING_BUFFER + help +@@ -32,6 +33,11 @@ config OPROFILE_IBS + config HAVE_OPROFILE + bool + ++config PROFILE_NMI ++ bool ++ 
depends on OPROFILE ++ default y ++ + config KPROBES + bool "Kprobes" + depends on KALLSYMS && MODULES +@@ -106,3 +112,5 @@ config HAVE_CLK + The calls support software clock gating and + thus are a key power management tool on many systems. + ++config HAVE_DMA_API_DEBUG ++ bool +Index: linux-2.6-tip/arch/alpha/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/alpha/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/alpha/include/asm/hardirq.h +=================================================================== +--- linux-2.6-tip.orig/arch/alpha/include/asm/hardirq.h ++++ linux-2.6-tip/arch/alpha/include/asm/hardirq.h +@@ -14,17 +14,4 @@ typedef struct { + + void ack_bad_irq(unsigned int irq); + +-#define HARDIRQ_BITS 12 +- +-/* +- * The hardirq mask has to be large enough to have +- * space for potentially nestable IRQ sources in the system +- * to nest on a single CPU. On Alpha, interrupts are masked at the CPU +- * by IPL as well as at the system level. We only have 8 IPLs (UNIX PALcode) +- * so we really only have 8 nestable IRQs, but allow some overhead +- */ +-#if (1 << HARDIRQ_BITS) < 16 +-#error HARDIRQ_BITS is too low! +-#endif +- + #endif /* _ALPHA_HARDIRQ_H */ +Index: linux-2.6-tip/arch/alpha/include/asm/statfs.h +=================================================================== +--- linux-2.6-tip.orig/arch/alpha/include/asm/statfs.h ++++ linux-2.6-tip/arch/alpha/include/asm/statfs.h +@@ -1,6 +1,8 @@ + #ifndef _ALPHA_STATFS_H + #define _ALPHA_STATFS_H + ++#include ++ + /* Alpha is the only 64-bit platform with 32-bit statfs. And doesn't + even seem to implement statfs64 */ + #define __statfs_word __u32 +Index: linux-2.6-tip/arch/alpha/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/alpha/include/asm/swab.h ++++ linux-2.6-tip/arch/alpha/include/asm/swab.h +@@ -1,7 +1,7 @@ + #ifndef _ALPHA_SWAB_H + #define _ALPHA_SWAB_H + +-#include ++#include + #include + #include + +Index: linux-2.6-tip/arch/alpha/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/alpha/kernel/irq.c ++++ linux-2.6-tip/arch/alpha/kernel/irq.c +@@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq + cpu = (cpu < (NR_CPUS-1) ? 
cpu + 1 : 0); + last_cpu = cpu; + +- irq_desc[irq].affinity = cpumask_of_cpu(cpu); ++ cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu)); + irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu)); + return 0; + } +@@ -90,7 +90,7 @@ show_interrupts(struct seq_file *p, void + seq_printf(p, "%10u ", kstat_irqs(irq)); + #else + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[irq]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(irq, j)); + #endif + seq_printf(p, " %14s", irq_desc[irq].chip->typename); + seq_printf(p, " %c%s", +Index: linux-2.6-tip/arch/alpha/kernel/irq_alpha.c +=================================================================== +--- linux-2.6-tip.orig/arch/alpha/kernel/irq_alpha.c ++++ linux-2.6-tip/arch/alpha/kernel/irq_alpha.c +@@ -64,7 +64,7 @@ do_entInt(unsigned long type, unsigned l + smp_percpu_timer_interrupt(regs); + cpu = smp_processor_id(); + if (cpu != boot_cpuid) { +- kstat_cpu(cpu).irqs[RTC_IRQ]++; ++ kstat_incr_irqs_this_cpu(RTC_IRQ, irq_to_desc(RTC_IRQ)); + } else { + handle_irq(RTC_IRQ); + } +Index: linux-2.6-tip/arch/alpha/mm/init.c +=================================================================== +--- linux-2.6-tip.orig/arch/alpha/mm/init.c ++++ linux-2.6-tip/arch/alpha/mm/init.c +@@ -189,9 +189,21 @@ callback_init(void * kernel_end) + + if (alpha_using_srm) { + static struct vm_struct console_remap_vm; +- unsigned long vaddr = VMALLOC_START; ++ unsigned long nr_pages = 0; ++ unsigned long vaddr; + unsigned long i, j; + ++ /* calculate needed size */ ++ for (i = 0; i < crb->map_entries; ++i) ++ nr_pages += crb->map[i].count; ++ ++ /* register the vm area */ ++ console_remap_vm.flags = VM_ALLOC; ++ console_remap_vm.size = nr_pages << PAGE_SHIFT; ++ vm_area_register_early(&console_remap_vm, PAGE_SIZE); ++ ++ vaddr = (unsigned long)console_remap_vm.addr; ++ + /* Set up the third level PTEs and update the virtual + addresses of the CRB entries. */ + for (i = 0; i < crb->map_entries; ++i) { +@@ -213,12 +225,6 @@ callback_init(void * kernel_end) + vaddr += PAGE_SIZE; + } + } +- +- /* Let vmalloc know that we've allocated some space. 
*/ +- console_remap_vm.flags = VM_ALLOC; +- console_remap_vm.addr = (void *) VMALLOC_START; +- console_remap_vm.size = vaddr - VMALLOC_START; +- vmlist = &console_remap_vm; + } + + callback_init_done = 1; +Index: linux-2.6-tip/arch/arm/include/asm/a.out.h +=================================================================== +--- linux-2.6-tip.orig/arch/arm/include/asm/a.out.h ++++ linux-2.6-tip/arch/arm/include/asm/a.out.h +@@ -2,7 +2,7 @@ + #define __ARM_A_OUT_H__ + + #include +-#include ++#include + + struct exec + { +Index: linux-2.6-tip/arch/arm/include/asm/setup.h +=================================================================== +--- linux-2.6-tip.orig/arch/arm/include/asm/setup.h ++++ linux-2.6-tip/arch/arm/include/asm/setup.h +@@ -14,7 +14,7 @@ + #ifndef __ASMARM_SETUP_H + #define __ASMARM_SETUP_H + +-#include ++#include + + #define COMMAND_LINE_SIZE 1024 + +Index: linux-2.6-tip/arch/arm/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/arm/include/asm/swab.h ++++ linux-2.6-tip/arch/arm/include/asm/swab.h +@@ -16,7 +16,7 @@ + #define __ASM_ARM_SWAB_H + + #include +-#include ++#include + + #if !defined(__STRICT_ANSI__) || defined(__KERNEL__) + # define __SWAB_64_THRU_32__ +Index: linux-2.6-tip/arch/arm/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/arm/kernel/irq.c ++++ linux-2.6-tip/arch/arm/kernel/irq.c +@@ -76,7 +76,7 @@ int show_interrupts(struct seq_file *p, + + seq_printf(p, "%3d: ", i); + for_each_present_cpu(cpu) +- seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, cpu)); + seq_printf(p, " %10s", irq_desc[i].chip->name ? : "-"); + seq_printf(p, " %s", action->name); + for (action = action->next; action; action = action->next) +@@ -101,9 +101,14 @@ unlock: + /* Handle bad interrupts */ + static struct irq_desc bad_irq_desc = { + .handle_irq = handle_bad_irq, +- .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock), ++ .lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock), + }; + ++#ifdef CONFIG_CPUMASK_OFFSTACK ++/* We are not allocating bad_irq_desc.affinity or .pending_mask */ ++#error "ARM architecture does not support CONFIG_CPUMASK_OFFSTACK." ++#endif ++ + /* + * do_IRQ handles all hardware IRQ's. Decoded IRQs should not + * come via this function. 
Instead, they should provide their +@@ -161,7 +166,7 @@ void __init init_IRQ(void) + irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE; + + #ifdef CONFIG_SMP +- bad_irq_desc.affinity = CPU_MASK_ALL; ++ cpumask_setall(bad_irq_desc.affinity); + bad_irq_desc.cpu = smp_processor_id(); + #endif + init_arch_irq(); +@@ -191,15 +196,16 @@ void migrate_irqs(void) + struct irq_desc *desc = irq_desc + i; + + if (desc->cpu == cpu) { +- unsigned int newcpu = any_online_cpu(desc->affinity); +- +- if (newcpu == NR_CPUS) { ++ unsigned int newcpu = cpumask_any_and(desc->affinity, ++ cpu_online_mask); ++ if (newcpu >= nr_cpu_ids) { + if (printk_ratelimit()) + printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n", + i, cpu); + +- cpus_setall(desc->affinity); +- newcpu = any_online_cpu(desc->affinity); ++ cpumask_setall(desc->affinity); ++ newcpu = cpumask_any_and(desc->affinity, ++ cpu_online_mask); + } + + route_irq(desc, i, newcpu); +Index: linux-2.6-tip/arch/arm/kernel/vmlinux.lds.S +=================================================================== +--- linux-2.6-tip.orig/arch/arm/kernel/vmlinux.lds.S ++++ linux-2.6-tip/arch/arm/kernel/vmlinux.lds.S +@@ -64,7 +64,9 @@ SECTIONS + __initramfs_end = .; + #endif + . = ALIGN(4096); ++ __per_cpu_load = .; + __per_cpu_start = .; ++ *(.data.percpu.page_aligned) + *(.data.percpu) + *(.data.percpu.shared_aligned) + __per_cpu_end = .; +Index: linux-2.6-tip/arch/arm/mach-ns9xxx/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/arm/mach-ns9xxx/irq.c ++++ linux-2.6-tip/arch/arm/mach-ns9xxx/irq.c +@@ -63,7 +63,6 @@ static struct irq_chip ns9xxx_chip = { + #else + static void handle_prio_irq(unsigned int irq, struct irq_desc *desc) + { +- unsigned int cpu = smp_processor_id(); + struct irqaction *action; + irqreturn_t action_ret; + +@@ -72,7 +71,7 @@ static void handle_prio_irq(unsigned int + BUG_ON(desc->status & IRQ_INPROGRESS); + + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); +- kstat_cpu(cpu).irqs[irq]++; ++ kstat_incr_irqs_this_cpu(irq, desc); + + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) +Index: linux-2.6-tip/arch/arm/oprofile/op_model_mpcore.c +=================================================================== +--- linux-2.6-tip.orig/arch/arm/oprofile/op_model_mpcore.c ++++ linux-2.6-tip/arch/arm/oprofile/op_model_mpcore.c +@@ -263,7 +263,7 @@ static void em_route_irq(int irq, unsign + const struct cpumask *mask = cpumask_of(cpu); + + spin_lock_irq(&desc->lock); +- desc->affinity = *mask; ++ cpumask_copy(desc->affinity, mask); + desc->chip->set_affinity(irq, mask); + spin_unlock_irq(&desc->lock); + } +Index: linux-2.6-tip/arch/avr32/Kconfig +=================================================================== +--- linux-2.6-tip.orig/arch/avr32/Kconfig ++++ linux-2.6-tip/arch/avr32/Kconfig +@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt" + config QUICKLIST + def_bool y + +-config HAVE_ARCH_BOOTMEM_NODE ++config HAVE_ARCH_BOOTMEM + def_bool n + + config ARCH_HAVE_MEMORY_PRESENT +Index: linux-2.6-tip/arch/avr32/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/avr32/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/avr32/include/asm/hardirq.h +=================================================================== +--- linux-2.6-tip.orig/arch/avr32/include/asm/hardirq.h ++++ linux-2.6-tip/arch/avr32/include/asm/hardirq.h +@@ -20,15 +20,4 @@ void ack_bad_irq(unsigned 
int irq); + + #endif /* __ASSEMBLY__ */ + +-#define HARDIRQ_BITS 12 +- +-/* +- * The hardirq mask has to be large enough to have +- * space for potentially all IRQ sources in the system +- * nesting on a single CPU: +- */ +-#if (1 << HARDIRQ_BITS) < NR_IRQS +-# error HARDIRQ_BITS is too low! +-#endif +- + #endif /* __ASM_AVR32_HARDIRQ_H */ +Index: linux-2.6-tip/arch/avr32/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/avr32/include/asm/swab.h ++++ linux-2.6-tip/arch/avr32/include/asm/swab.h +@@ -4,7 +4,7 @@ + #ifndef __ASM_AVR32_SWAB_H + #define __ASM_AVR32_SWAB_H + +-#include ++#include + #include + + #define __SWAB_64_THRU_32__ +Index: linux-2.6-tip/arch/avr32/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/avr32/kernel/irq.c ++++ linux-2.6-tip/arch/avr32/kernel/irq.c +@@ -58,7 +58,7 @@ int show_interrupts(struct seq_file *p, + + seq_printf(p, "%3d: ", i); + for_each_online_cpu(cpu) +- seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, cpu)); + seq_printf(p, " %8s", irq_desc[i].chip->name ? : "-"); + seq_printf(p, " %s", action->name); + for (action = action->next; action; action = action->next) +Index: linux-2.6-tip/arch/blackfin/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/blackfin/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/blackfin/include/asm/percpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/blackfin/include/asm/percpu.h ++++ linux-2.6-tip/arch/blackfin/include/asm/percpu.h +@@ -3,14 +3,4 @@ + + #include + +-#ifdef CONFIG_MODULES +-#define PERCPU_MODULE_RESERVE 8192 +-#else +-#define PERCPU_MODULE_RESERVE 0 +-#endif +- +-#define PERCPU_ENOUGH_ROOM \ +- (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ +- PERCPU_MODULE_RESERVE) +- + #endif /* __ARCH_BLACKFIN_PERCPU__ */ +Index: linux-2.6-tip/arch/blackfin/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/blackfin/include/asm/swab.h ++++ linux-2.6-tip/arch/blackfin/include/asm/swab.h +@@ -1,7 +1,7 @@ + #ifndef _BLACKFIN_SWAB_H + #define _BLACKFIN_SWAB_H + +-#include ++#include + #include + + #if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__) +Index: linux-2.6-tip/arch/blackfin/kernel/irqchip.c +=================================================================== +--- linux-2.6-tip.orig/arch/blackfin/kernel/irqchip.c ++++ linux-2.6-tip/arch/blackfin/kernel/irqchip.c +@@ -70,6 +70,11 @@ static struct irq_desc bad_irq_desc = { + #endif + }; + ++#ifdef CONFIG_CPUMASK_OFFSTACK ++/* We are not allocating a variable-sized bad_irq_desc.affinity */ ++#error "Blackfin architecture does not support CONFIG_CPUMASK_OFFSTACK." 
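Stepping back from the per-architecture hunks in this stretch of the patch: they all replace open-coded kstat_cpu(cpu).irqs[irq] accounting with the kstat_irqs_cpu() helper when printing /proc/interrupts. A condensed sketch of the resulting pattern is below; the show_one_irq() wrapper is illustrative only, the helpers are the ones the hunks themselves use, and the usual seq_file plumbing around it is assumed.

/* sketch: one /proc/interrupts row after the kstat_irqs_cpu() conversion */
#include <linux/irq.h>
#include <linux/kernel_stat.h>
#include <linux/seq_file.h>

static void show_one_irq(struct seq_file *p, int irq)
{
	int cpu;

	seq_printf(p, "%3d: ", irq);
	for_each_online_cpu(cpu)
		/* was: kstat_cpu(cpu).irqs[irq] */
		seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
	seq_printf(p, " %8s\n", irq_desc[irq].chip->name ? : "-");
}
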
++#endif ++ + int show_interrupts(struct seq_file *p, void *v) + { + int i = *(loff_t *) v, j; +@@ -83,7 +88,7 @@ int show_interrupts(struct seq_file *p, + goto skip; + seq_printf(p, "%3d: ", i); + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %8s", irq_desc[i].chip->name); + seq_printf(p, " %s", action->name); + for (action = action->next; action; action = action->next) +Index: linux-2.6-tip/arch/cris/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/cris/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/cris/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/cris/kernel/irq.c ++++ linux-2.6-tip/arch/cris/kernel/irq.c +@@ -66,7 +66,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%10u ", kstat_irqs(i)); + #else + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + #endif + seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %s", action->name); +Index: linux-2.6-tip/arch/frv/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/frv/kernel/irq.c ++++ linux-2.6-tip/arch/frv/kernel/irq.c +@@ -74,7 +74,7 @@ int show_interrupts(struct seq_file *p, + if (action) { + seq_printf(p, "%3d: ", i); + for_each_present_cpu(cpu) +- seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, cpu)); + seq_printf(p, " %10s", irq_desc[i].chip->name ? : "-"); + seq_printf(p, " %s", action->name); + for (action = action->next; +Index: linux-2.6-tip/arch/h8300/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/h8300/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/h8300/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/h8300/include/asm/swab.h ++++ linux-2.6-tip/arch/h8300/include/asm/swab.h +@@ -1,7 +1,7 @@ + #ifndef _H8300_SWAB_H + #define _H8300_SWAB_H + +-#include ++#include + + #if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__) + # define __SWAB_64_THRU_32__ +Index: linux-2.6-tip/arch/h8300/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/h8300/kernel/irq.c ++++ linux-2.6-tip/arch/h8300/kernel/irq.c +@@ -183,7 +183,7 @@ asmlinkage void do_IRQ(int irq) + #if defined(CONFIG_PROC_FS) + int show_interrupts(struct seq_file *p, void *v) + { +- int i = *(loff_t *) v, j; ++ int i = *(loff_t *) v; + struct irqaction * action; + unsigned long flags; + +@@ -196,7 +196,7 @@ int show_interrupts(struct seq_file *p, + if (!action) + goto unlock; + seq_printf(p, "%3d: ",i); +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs(i)); + seq_printf(p, " %14s", irq_desc[i].chip->name); + seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %s", action->name); +Index: linux-2.6-tip/arch/ia64/Kconfig +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/Kconfig ++++ linux-2.6-tip/arch/ia64/Kconfig +@@ -22,6 +22,9 @@ config IA64 + select HAVE_OPROFILE + select HAVE_KPROBES + select HAVE_KRETPROBES ++ select HAVE_FTRACE_MCOUNT_RECORD ++ select 
HAVE_DYNAMIC_FTRACE if (!ITANIUM) ++ select HAVE_FUNCTION_TRACER + select HAVE_DMA_ATTRS + select HAVE_KVM + select HAVE_ARCH_TRACEHOOK +Index: linux-2.6-tip/arch/ia64/dig/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/dig/Makefile ++++ linux-2.6-tip/arch/ia64/dig/Makefile +@@ -7,8 +7,8 @@ + + obj-y := setup.o + ifeq ($(CONFIG_DMAR), y) +-obj-$(CONFIG_IA64_GENERIC) += machvec.o machvec_vtd.o dig_vtd_iommu.o ++obj-$(CONFIG_IA64_GENERIC) += machvec.o machvec_vtd.o + else + obj-$(CONFIG_IA64_GENERIC) += machvec.o + endif +-obj-$(CONFIG_IA64_DIG_VTD) += dig_vtd_iommu.o ++ +Index: linux-2.6-tip/arch/ia64/dig/dig_vtd_iommu.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/dig/dig_vtd_iommu.c ++++ /dev/null +@@ -1,59 +0,0 @@ +-#include +-#include +-#include +-#include +- +-void * +-vtd_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, +- gfp_t flags) +-{ +- return intel_alloc_coherent(dev, size, dma_handle, flags); +-} +-EXPORT_SYMBOL_GPL(vtd_alloc_coherent); +- +-void +-vtd_free_coherent(struct device *dev, size_t size, void *vaddr, +- dma_addr_t dma_handle) +-{ +- intel_free_coherent(dev, size, vaddr, dma_handle); +-} +-EXPORT_SYMBOL_GPL(vtd_free_coherent); +- +-dma_addr_t +-vtd_map_single_attrs(struct device *dev, void *addr, size_t size, +- int dir, struct dma_attrs *attrs) +-{ +- return intel_map_single(dev, (phys_addr_t)addr, size, dir); +-} +-EXPORT_SYMBOL_GPL(vtd_map_single_attrs); +- +-void +-vtd_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, +- int dir, struct dma_attrs *attrs) +-{ +- intel_unmap_single(dev, iova, size, dir); +-} +-EXPORT_SYMBOL_GPL(vtd_unmap_single_attrs); +- +-int +-vtd_map_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents, +- int dir, struct dma_attrs *attrs) +-{ +- return intel_map_sg(dev, sglist, nents, dir); +-} +-EXPORT_SYMBOL_GPL(vtd_map_sg_attrs); +- +-void +-vtd_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, +- int nents, int dir, struct dma_attrs *attrs) +-{ +- intel_unmap_sg(dev, sglist, nents, dir); +-} +-EXPORT_SYMBOL_GPL(vtd_unmap_sg_attrs); +- +-int +-vtd_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +-{ +- return 0; +-} +-EXPORT_SYMBOL_GPL(vtd_dma_mapping_error); +Index: linux-2.6-tip/arch/ia64/hp/common/hwsw_iommu.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/hp/common/hwsw_iommu.c ++++ linux-2.6-tip/arch/ia64/hp/common/hwsw_iommu.c +@@ -13,48 +13,33 @@ + */ + + #include ++#include + #include +- + #include + ++extern struct dma_map_ops sba_dma_ops, swiotlb_dma_ops; ++ + /* swiotlb declarations & definitions: */ + extern int swiotlb_late_init_with_default_size (size_t size); + +-/* hwiommu declarations & definitions: */ +- +-extern ia64_mv_dma_alloc_coherent sba_alloc_coherent; +-extern ia64_mv_dma_free_coherent sba_free_coherent; +-extern ia64_mv_dma_map_single_attrs sba_map_single_attrs; +-extern ia64_mv_dma_unmap_single_attrs sba_unmap_single_attrs; +-extern ia64_mv_dma_map_sg_attrs sba_map_sg_attrs; +-extern ia64_mv_dma_unmap_sg_attrs sba_unmap_sg_attrs; +-extern ia64_mv_dma_supported sba_dma_supported; +-extern ia64_mv_dma_mapping_error sba_dma_mapping_error; +- +-#define hwiommu_alloc_coherent sba_alloc_coherent +-#define hwiommu_free_coherent sba_free_coherent +-#define hwiommu_map_single_attrs sba_map_single_attrs +-#define hwiommu_unmap_single_attrs sba_unmap_single_attrs 
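The wider point of the ia64 rework that starts here is that each machvec now hands back a struct dma_map_ops and the generic wrappers dispatch through it, so driver-facing code is unchanged. A small, hypothetical driver fragment exercising the converted streaming path is sketched below; the buffer handling and the hypothetical_xfer() name are illustrative, not from this patch.

/* sketch: generic streaming DMA usage that now routes through dma_map_ops */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/slab.h>

static int hypothetical_xfer(struct device *dev, size_t len)
{
	void *buf = kmalloc(len, GFP_KERNEL);
	dma_addr_t handle;

	if (!buf)
		return -ENOMEM;

	/* on ia64 this now ends up in platform_dma_get_ops(dev)->map_page() */
	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle)) {
		kfree(buf);
		return -EIO;
	}

	/* ... kick off the transfer and wait for it to finish ... */

	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	kfree(buf);
	return 0;
}
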
+-#define hwiommu_map_sg_attrs sba_map_sg_attrs +-#define hwiommu_unmap_sg_attrs sba_unmap_sg_attrs +-#define hwiommu_dma_supported sba_dma_supported +-#define hwiommu_dma_mapping_error sba_dma_mapping_error +-#define hwiommu_sync_single_for_cpu machvec_dma_sync_single +-#define hwiommu_sync_sg_for_cpu machvec_dma_sync_sg +-#define hwiommu_sync_single_for_device machvec_dma_sync_single +-#define hwiommu_sync_sg_for_device machvec_dma_sync_sg +- +- + /* + * Note: we need to make the determination of whether or not to use + * the sw I/O TLB based purely on the device structure. Anything else + * would be unreliable or would be too intrusive. + */ +-static inline int +-use_swiotlb (struct device *dev) ++static inline int use_swiotlb(struct device *dev) ++{ ++ return dev && dev->dma_mask && ++ !sba_dma_ops.dma_supported(dev, *dev->dma_mask); ++} ++ ++struct dma_map_ops *hwsw_dma_get_ops(struct device *dev) + { +- return dev && dev->dma_mask && !hwiommu_dma_supported(dev, *dev->dma_mask); ++ if (use_swiotlb(dev)) ++ return &swiotlb_dma_ops; ++ return &sba_dma_ops; + } ++EXPORT_SYMBOL(hwsw_dma_get_ops); + + void __init + hwsw_init (void) +@@ -71,125 +56,3 @@ hwsw_init (void) + #endif + } + } +- +-void * +-hwsw_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags) +-{ +- if (use_swiotlb(dev)) +- return swiotlb_alloc_coherent(dev, size, dma_handle, flags); +- else +- return hwiommu_alloc_coherent(dev, size, dma_handle, flags); +-} +- +-void +-hwsw_free_coherent (struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) +-{ +- if (use_swiotlb(dev)) +- swiotlb_free_coherent(dev, size, vaddr, dma_handle); +- else +- hwiommu_free_coherent(dev, size, vaddr, dma_handle); +-} +- +-dma_addr_t +-hwsw_map_single_attrs(struct device *dev, void *addr, size_t size, int dir, +- struct dma_attrs *attrs) +-{ +- if (use_swiotlb(dev)) +- return swiotlb_map_single_attrs(dev, addr, size, dir, attrs); +- else +- return hwiommu_map_single_attrs(dev, addr, size, dir, attrs); +-} +-EXPORT_SYMBOL(hwsw_map_single_attrs); +- +-void +-hwsw_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, +- int dir, struct dma_attrs *attrs) +-{ +- if (use_swiotlb(dev)) +- return swiotlb_unmap_single_attrs(dev, iova, size, dir, attrs); +- else +- return hwiommu_unmap_single_attrs(dev, iova, size, dir, attrs); +-} +-EXPORT_SYMBOL(hwsw_unmap_single_attrs); +- +-int +-hwsw_map_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents, +- int dir, struct dma_attrs *attrs) +-{ +- if (use_swiotlb(dev)) +- return swiotlb_map_sg_attrs(dev, sglist, nents, dir, attrs); +- else +- return hwiommu_map_sg_attrs(dev, sglist, nents, dir, attrs); +-} +-EXPORT_SYMBOL(hwsw_map_sg_attrs); +- +-void +-hwsw_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents, +- int dir, struct dma_attrs *attrs) +-{ +- if (use_swiotlb(dev)) +- return swiotlb_unmap_sg_attrs(dev, sglist, nents, dir, attrs); +- else +- return hwiommu_unmap_sg_attrs(dev, sglist, nents, dir, attrs); +-} +-EXPORT_SYMBOL(hwsw_unmap_sg_attrs); +- +-void +-hwsw_sync_single_for_cpu (struct device *dev, dma_addr_t addr, size_t size, int dir) +-{ +- if (use_swiotlb(dev)) +- swiotlb_sync_single_for_cpu(dev, addr, size, dir); +- else +- hwiommu_sync_single_for_cpu(dev, addr, size, dir); +-} +- +-void +-hwsw_sync_sg_for_cpu (struct device *dev, struct scatterlist *sg, int nelems, int dir) +-{ +- if (use_swiotlb(dev)) +- swiotlb_sync_sg_for_cpu(dev, sg, nelems, dir); +- else +- hwiommu_sync_sg_for_cpu(dev, sg, nelems, 
dir); +-} +- +-void +-hwsw_sync_single_for_device (struct device *dev, dma_addr_t addr, size_t size, int dir) +-{ +- if (use_swiotlb(dev)) +- swiotlb_sync_single_for_device(dev, addr, size, dir); +- else +- hwiommu_sync_single_for_device(dev, addr, size, dir); +-} +- +-void +-hwsw_sync_sg_for_device (struct device *dev, struct scatterlist *sg, int nelems, int dir) +-{ +- if (use_swiotlb(dev)) +- swiotlb_sync_sg_for_device(dev, sg, nelems, dir); +- else +- hwiommu_sync_sg_for_device(dev, sg, nelems, dir); +-} +- +-int +-hwsw_dma_supported (struct device *dev, u64 mask) +-{ +- if (hwiommu_dma_supported(dev, mask)) +- return 1; +- return swiotlb_dma_supported(dev, mask); +-} +- +-int +-hwsw_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +-{ +- return hwiommu_dma_mapping_error(dev, dma_addr) || +- swiotlb_dma_mapping_error(dev, dma_addr); +-} +- +-EXPORT_SYMBOL(hwsw_dma_mapping_error); +-EXPORT_SYMBOL(hwsw_dma_supported); +-EXPORT_SYMBOL(hwsw_alloc_coherent); +-EXPORT_SYMBOL(hwsw_free_coherent); +-EXPORT_SYMBOL(hwsw_sync_single_for_cpu); +-EXPORT_SYMBOL(hwsw_sync_single_for_device); +-EXPORT_SYMBOL(hwsw_sync_sg_for_cpu); +-EXPORT_SYMBOL(hwsw_sync_sg_for_device); +Index: linux-2.6-tip/arch/ia64/hp/common/sba_iommu.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/hp/common/sba_iommu.c ++++ linux-2.6-tip/arch/ia64/hp/common/sba_iommu.c +@@ -36,6 +36,7 @@ + #include /* hweight64() */ + #include + #include ++#include + + #include /* ia64_get_itc() */ + #include +@@ -908,11 +909,13 @@ sba_mark_invalid(struct ioc *ioc, dma_ad + * + * See Documentation/PCI/PCI-DMA-mapping.txt + */ +-dma_addr_t +-sba_map_single_attrs(struct device *dev, void *addr, size_t size, int dir, +- struct dma_attrs *attrs) ++static dma_addr_t sba_map_page(struct device *dev, struct page *page, ++ unsigned long poff, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct ioc *ioc; ++ void *addr = page_address(page) + poff; + dma_addr_t iovp; + dma_addr_t offset; + u64 *pdir_start; +@@ -990,7 +993,14 @@ sba_map_single_attrs(struct device *dev, + #endif + return SBA_IOVA(ioc, iovp, offset); + } +-EXPORT_SYMBOL(sba_map_single_attrs); ++ ++static dma_addr_t sba_map_single_attrs(struct device *dev, void *addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ return sba_map_page(dev, virt_to_page(addr), ++ (unsigned long)addr & ~PAGE_MASK, size, dir, attrs); ++} + + #ifdef ENABLE_MARK_CLEAN + static SBA_INLINE void +@@ -1026,8 +1036,8 @@ sba_mark_clean(struct ioc *ioc, dma_addr + * + * See Documentation/PCI/PCI-DMA-mapping.txt + */ +-void sba_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, +- int dir, struct dma_attrs *attrs) ++static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size, ++ enum dma_data_direction dir, struct dma_attrs *attrs) + { + struct ioc *ioc; + #if DELAYED_RESOURCE_CNT > 0 +@@ -1094,7 +1104,12 @@ void sba_unmap_single_attrs(struct devic + spin_unlock_irqrestore(&ioc->res_lock, flags); + #endif /* DELAYED_RESOURCE_CNT == 0 */ + } +-EXPORT_SYMBOL(sba_unmap_single_attrs); ++ ++void sba_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size, ++ enum dma_data_direction dir, struct dma_attrs *attrs) ++{ ++ sba_unmap_page(dev, iova, size, dir, attrs); ++} + + /** + * sba_alloc_coherent - allocate/map shared mem for DMA +@@ -1104,7 +1119,7 @@ EXPORT_SYMBOL(sba_unmap_single_attrs); + * + * See Documentation/PCI/PCI-DMA-mapping.txt + */ +-void 
* ++static void * + sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags) + { + struct ioc *ioc; +@@ -1167,7 +1182,8 @@ sba_alloc_coherent (struct device *dev, + * + * See Documentation/PCI/PCI-DMA-mapping.txt + */ +-void sba_free_coherent (struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) ++static void sba_free_coherent (struct device *dev, size_t size, void *vaddr, ++ dma_addr_t dma_handle) + { + sba_unmap_single_attrs(dev, dma_handle, size, 0, NULL); + free_pages((unsigned long) vaddr, get_order(size)); +@@ -1422,8 +1438,9 @@ sba_coalesce_chunks(struct ioc *ioc, str + * + * See Documentation/PCI/PCI-DMA-mapping.txt + */ +-int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents, +- int dir, struct dma_attrs *attrs) ++static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist, ++ int nents, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct ioc *ioc; + int coalesced, filled = 0; +@@ -1502,7 +1519,6 @@ int sba_map_sg_attrs(struct device *dev, + + return filled; + } +-EXPORT_SYMBOL(sba_map_sg_attrs); + + /** + * sba_unmap_sg_attrs - unmap Scatter/Gather list +@@ -1514,8 +1530,9 @@ EXPORT_SYMBOL(sba_map_sg_attrs); + * + * See Documentation/PCI/PCI-DMA-mapping.txt + */ +-void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, +- int nents, int dir, struct dma_attrs *attrs) ++static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, ++ int nents, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + #ifdef ASSERT_PDIR_SANITY + struct ioc *ioc; +@@ -1551,7 +1568,6 @@ void sba_unmap_sg_attrs(struct device *d + #endif + + } +-EXPORT_SYMBOL(sba_unmap_sg_attrs); + + /************************************************************** + * +@@ -2064,6 +2080,8 @@ static struct acpi_driver acpi_sba_ioc_d + }, + }; + ++extern struct dma_map_ops swiotlb_dma_ops; ++ + static int __init + sba_init(void) + { +@@ -2077,6 +2095,7 @@ sba_init(void) + * a successful kdump kernel boot is to use the swiotlb. + */ + if (is_kdump_kernel()) { ++ dma_ops = &swiotlb_dma_ops; + if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) + panic("Unable to initialize software I/O TLB:" + " Try machvec=dig boot option"); +@@ -2092,6 +2111,7 @@ sba_init(void) + * If we didn't find something sba_iommu can claim, we + * need to setup the swiotlb and switch to the dig machvec. 
+ */ ++ dma_ops = &swiotlb_dma_ops; + if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) + panic("Unable to find SBA IOMMU or initialize " + "software I/O TLB: Try machvec=dig boot option"); +@@ -2138,15 +2158,13 @@ nosbagart(char *str) + return 1; + } + +-int +-sba_dma_supported (struct device *dev, u64 mask) ++static int sba_dma_supported (struct device *dev, u64 mask) + { + /* make sure it's at least 32bit capable */ + return ((mask & 0xFFFFFFFFUL) == 0xFFFFFFFFUL); + } + +-int +-sba_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) ++static int sba_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + { + return 0; + } +@@ -2176,7 +2194,22 @@ sba_page_override(char *str) + + __setup("sbapagesize=",sba_page_override); + +-EXPORT_SYMBOL(sba_dma_mapping_error); +-EXPORT_SYMBOL(sba_dma_supported); +-EXPORT_SYMBOL(sba_alloc_coherent); +-EXPORT_SYMBOL(sba_free_coherent); ++struct dma_map_ops sba_dma_ops = { ++ .alloc_coherent = sba_alloc_coherent, ++ .free_coherent = sba_free_coherent, ++ .map_page = sba_map_page, ++ .unmap_page = sba_unmap_page, ++ .map_sg = sba_map_sg_attrs, ++ .unmap_sg = sba_unmap_sg_attrs, ++ .sync_single_for_cpu = machvec_dma_sync_single, ++ .sync_sg_for_cpu = machvec_dma_sync_sg, ++ .sync_single_for_device = machvec_dma_sync_single, ++ .sync_sg_for_device = machvec_dma_sync_sg, ++ .dma_supported = sba_dma_supported, ++ .mapping_error = sba_dma_mapping_error, ++}; ++ ++void sba_dma_init(void) ++{ ++ dma_ops = &sba_dma_ops; ++} +Index: linux-2.6-tip/arch/ia64/include/asm/dma-mapping.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/dma-mapping.h ++++ linux-2.6-tip/arch/ia64/include/asm/dma-mapping.h +@@ -11,99 +11,128 @@ + + #define ARCH_HAS_DMA_GET_REQUIRED_MASK + +-struct dma_mapping_ops { +- int (*mapping_error)(struct device *dev, +- dma_addr_t dma_addr); +- void* (*alloc_coherent)(struct device *dev, size_t size, +- dma_addr_t *dma_handle, gfp_t gfp); +- void (*free_coherent)(struct device *dev, size_t size, +- void *vaddr, dma_addr_t dma_handle); +- dma_addr_t (*map_single)(struct device *hwdev, unsigned long ptr, +- size_t size, int direction); +- void (*unmap_single)(struct device *dev, dma_addr_t addr, +- size_t size, int direction); +- void (*sync_single_for_cpu)(struct device *hwdev, +- dma_addr_t dma_handle, size_t size, +- int direction); +- void (*sync_single_for_device)(struct device *hwdev, +- dma_addr_t dma_handle, size_t size, +- int direction); +- void (*sync_single_range_for_cpu)(struct device *hwdev, +- dma_addr_t dma_handle, unsigned long offset, +- size_t size, int direction); +- void (*sync_single_range_for_device)(struct device *hwdev, +- dma_addr_t dma_handle, unsigned long offset, +- size_t size, int direction); +- void (*sync_sg_for_cpu)(struct device *hwdev, +- struct scatterlist *sg, int nelems, +- int direction); +- void (*sync_sg_for_device)(struct device *hwdev, +- struct scatterlist *sg, int nelems, +- int direction); +- int (*map_sg)(struct device *hwdev, struct scatterlist *sg, +- int nents, int direction); +- void (*unmap_sg)(struct device *hwdev, +- struct scatterlist *sg, int nents, +- int direction); +- int (*dma_supported_op)(struct device *hwdev, u64 mask); +- int is_phys; +-}; +- +-extern struct dma_mapping_ops *dma_ops; ++extern struct dma_map_ops *dma_ops; + extern struct ia64_machine_vector ia64_mv; + extern void set_iommu_machvec(void); + +-#define dma_alloc_coherent(dev, size, handle, gfp) \ +- platform_dma_alloc_coherent(dev, size, 
handle, (gfp) | GFP_DMA) ++extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t, ++ enum dma_data_direction); ++extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, ++ enum dma_data_direction); + +-/* coherent mem. is cheap */ +-static inline void * +-dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, +- gfp_t flag) ++static inline void *dma_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t *daddr, gfp_t gfp) + { +- return dma_alloc_coherent(dev, size, dma_handle, flag); ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ return ops->alloc_coherent(dev, size, daddr, gfp); + } +-#define dma_free_coherent platform_dma_free_coherent +-static inline void +-dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr, +- dma_addr_t dma_handle) ++ ++static inline void dma_free_coherent(struct device *dev, size_t size, ++ void *caddr, dma_addr_t daddr) + { +- dma_free_coherent(dev, size, cpu_addr, dma_handle); ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ ops->free_coherent(dev, size, caddr, daddr); + } +-#define dma_map_single_attrs platform_dma_map_single_attrs +-static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, +- size_t size, int dir) ++ ++#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) ++#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) ++ ++static inline dma_addr_t dma_map_single_attrs(struct device *dev, ++ void *caddr, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { +- return dma_map_single_attrs(dev, cpu_addr, size, dir, NULL); ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ return ops->map_page(dev, virt_to_page(caddr), ++ (unsigned long)caddr & ~PAGE_MASK, size, ++ dir, attrs); + } +-#define dma_map_sg_attrs platform_dma_map_sg_attrs +-static inline int dma_map_sg(struct device *dev, struct scatterlist *sgl, +- int nents, int dir) ++ ++static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t daddr, ++ size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { +- return dma_map_sg_attrs(dev, sgl, nents, dir, NULL); ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ ops->unmap_page(dev, daddr, size, dir, attrs); + } +-#define dma_unmap_single_attrs platform_dma_unmap_single_attrs +-static inline void dma_unmap_single(struct device *dev, dma_addr_t cpu_addr, +- size_t size, int dir) ++ ++#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) ++#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL) ++ ++static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, ++ int nents, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { +- return dma_unmap_single_attrs(dev, cpu_addr, size, dir, NULL); ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ return ops->map_sg(dev, sgl, nents, dir, attrs); + } +-#define dma_unmap_sg_attrs platform_dma_unmap_sg_attrs +-static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sgl, +- int nents, int dir) ++ ++static inline void dma_unmap_sg_attrs(struct device *dev, ++ struct scatterlist *sgl, int nents, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { +- return dma_unmap_sg_attrs(dev, sgl, nents, dir, NULL); ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ ops->unmap_sg(dev, sgl, nents, dir, attrs); + } +-#define dma_sync_single_for_cpu platform_dma_sync_single_for_cpu +-#define 
dma_sync_sg_for_cpu platform_dma_sync_sg_for_cpu +-#define dma_sync_single_for_device platform_dma_sync_single_for_device +-#define dma_sync_sg_for_device platform_dma_sync_sg_for_device +-#define dma_mapping_error platform_dma_mapping_error + +-#define dma_map_page(dev, pg, off, size, dir) \ +- dma_map_single(dev, page_address(pg) + (off), (size), (dir)) +-#define dma_unmap_page(dev, dma_addr, size, dir) \ +- dma_unmap_single(dev, dma_addr, size, dir) ++#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) ++#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL) ++ ++static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t daddr, ++ size_t size, ++ enum dma_data_direction dir) ++{ ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ ops->sync_single_for_cpu(dev, daddr, size, dir); ++} ++ ++static inline void dma_sync_sg_for_cpu(struct device *dev, ++ struct scatterlist *sgl, ++ int nents, enum dma_data_direction dir) ++{ ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ ops->sync_sg_for_cpu(dev, sgl, nents, dir); ++} ++ ++static inline void dma_sync_single_for_device(struct device *dev, ++ dma_addr_t daddr, ++ size_t size, ++ enum dma_data_direction dir) ++{ ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ ops->sync_single_for_device(dev, daddr, size, dir); ++} ++ ++static inline void dma_sync_sg_for_device(struct device *dev, ++ struct scatterlist *sgl, ++ int nents, ++ enum dma_data_direction dir) ++{ ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ ops->sync_sg_for_device(dev, sgl, nents, dir); ++} ++ ++static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr) ++{ ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ return ops->mapping_error(dev, daddr); ++} ++ ++static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, ++ size_t offset, size_t size, ++ enum dma_data_direction dir) ++{ ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ return ops->map_page(dev, page, offset, size, dir, NULL); ++} ++ ++static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, ++ size_t size, enum dma_data_direction dir) ++{ ++ dma_unmap_single(dev, addr, size, dir); ++} + + /* + * Rest of this file is part of the "Advanced DMA API". Use at your own risk. +@@ -115,7 +144,11 @@ static inline void dma_unmap_sg(struct d + #define dma_sync_single_range_for_device(dev, dma_handle, offset, size, dir) \ + dma_sync_single_for_device(dev, dma_handle, size, dir) + +-#define dma_supported platform_dma_supported ++static inline int dma_supported(struct device *dev, u64 mask) ++{ ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); ++ return ops->dma_supported(dev, mask); ++} + + static inline int + dma_set_mask (struct device *dev, u64 mask) +@@ -141,11 +174,4 @@ dma_cache_sync (struct device *dev, void + + #define dma_is_consistent(d, h) (1) /* all we do is coherent memory... 
*/ + +-static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) +-{ +- return dma_ops; +-} +- +- +- + #endif /* _ASM_IA64_DMA_MAPPING_H */ +Index: linux-2.6-tip/arch/ia64/include/asm/fpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/fpu.h ++++ linux-2.6-tip/arch/ia64/include/asm/fpu.h +@@ -6,8 +6,6 @@ + * David Mosberger-Tang + */ + +-#include +- + /* floating point status register: */ + #define FPSR_TRAP_VD (1 << 0) /* invalid op trap disabled */ + #define FPSR_TRAP_DD (1 << 1) /* denormal trap disabled */ +Index: linux-2.6-tip/arch/ia64/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/ia64/include/asm/ftrace.h +@@ -0,0 +1,28 @@ ++#ifndef _ASM_IA64_FTRACE_H ++#define _ASM_IA64_FTRACE_H ++ ++#ifdef CONFIG_FUNCTION_TRACER ++#define MCOUNT_INSN_SIZE 32 /* sizeof mcount call */ ++ ++#ifndef __ASSEMBLY__ ++extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0); ++#define mcount _mcount ++ ++#include ++/* In IA64, MCOUNT_ADDR is set in link time, so it's not a constant at compile time */ ++#define MCOUNT_ADDR (((struct fnptr *)mcount)->ip) ++#define FTRACE_ADDR (((struct fnptr *)ftrace_caller)->ip) ++ ++static inline unsigned long ftrace_call_adjust(unsigned long addr) ++{ ++ /* second bundle, insn 2 */ ++ return addr - 0x12; ++} ++ ++struct dyn_arch_ftrace { ++}; ++#endif ++ ++#endif /* CONFIG_FUNCTION_TRACER */ ++ ++#endif /* _ASM_IA64_FTRACE_H */ +Index: linux-2.6-tip/arch/ia64/include/asm/gcc_intrin.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/gcc_intrin.h ++++ linux-2.6-tip/arch/ia64/include/asm/gcc_intrin.h +@@ -6,6 +6,7 @@ + * Copyright (C) 2002,2003 Suresh Siddha + */ + ++#include + #include + + /* define this macro to get some asm stmts included in 'c' files */ +Index: linux-2.6-tip/arch/ia64/include/asm/hardirq.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/hardirq.h ++++ linux-2.6-tip/arch/ia64/include/asm/hardirq.h +@@ -20,16 +20,6 @@ + + #define local_softirq_pending() (local_cpu_data->softirq_pending) + +-#define HARDIRQ_BITS 14 +- +-/* +- * The hardirq mask has to be large enough to have space for potentially all IRQ sources +- * in the system nesting on a single CPU: +- */ +-#if (1 << HARDIRQ_BITS) < NR_IRQS +-# error HARDIRQ_BITS is too low! 
+-#endif +- + extern void __iomem *ipi_base_addr; + + void ack_bad_irq(unsigned int irq); +Index: linux-2.6-tip/arch/ia64/include/asm/intrinsics.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/intrinsics.h ++++ linux-2.6-tip/arch/ia64/include/asm/intrinsics.h +@@ -10,6 +10,7 @@ + + #ifndef __ASSEMBLY__ + ++#include + /* include compiler specific intrinsics */ + #include + #ifdef __INTEL_COMPILER +Index: linux-2.6-tip/arch/ia64/include/asm/kvm.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/kvm.h ++++ linux-2.6-tip/arch/ia64/include/asm/kvm.h +@@ -21,8 +21,7 @@ + * + */ + +-#include +- ++#include + #include + + /* Select x86 specific features in */ +Index: linux-2.6-tip/arch/ia64/include/asm/machvec.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/machvec.h ++++ linux-2.6-tip/arch/ia64/include/asm/machvec.h +@@ -11,7 +11,6 @@ + #define _ASM_IA64_MACHVEC_H + + #include +-#include + + /* forward declarations: */ + struct device; +@@ -45,24 +44,8 @@ typedef void ia64_mv_kernel_launch_event + + /* DMA-mapping interface: */ + typedef void ia64_mv_dma_init (void); +-typedef void *ia64_mv_dma_alloc_coherent (struct device *, size_t, dma_addr_t *, gfp_t); +-typedef void ia64_mv_dma_free_coherent (struct device *, size_t, void *, dma_addr_t); +-typedef dma_addr_t ia64_mv_dma_map_single (struct device *, void *, size_t, int); +-typedef void ia64_mv_dma_unmap_single (struct device *, dma_addr_t, size_t, int); +-typedef int ia64_mv_dma_map_sg (struct device *, struct scatterlist *, int, int); +-typedef void ia64_mv_dma_unmap_sg (struct device *, struct scatterlist *, int, int); +-typedef void ia64_mv_dma_sync_single_for_cpu (struct device *, dma_addr_t, size_t, int); +-typedef void ia64_mv_dma_sync_sg_for_cpu (struct device *, struct scatterlist *, int, int); +-typedef void ia64_mv_dma_sync_single_for_device (struct device *, dma_addr_t, size_t, int); +-typedef void ia64_mv_dma_sync_sg_for_device (struct device *, struct scatterlist *, int, int); +-typedef int ia64_mv_dma_mapping_error(struct device *, dma_addr_t dma_addr); +-typedef int ia64_mv_dma_supported (struct device *, u64); +- +-typedef dma_addr_t ia64_mv_dma_map_single_attrs (struct device *, void *, size_t, int, struct dma_attrs *); +-typedef void ia64_mv_dma_unmap_single_attrs (struct device *, dma_addr_t, size_t, int, struct dma_attrs *); +-typedef int ia64_mv_dma_map_sg_attrs (struct device *, struct scatterlist *, int, int, struct dma_attrs *); +-typedef void ia64_mv_dma_unmap_sg_attrs (struct device *, struct scatterlist *, int, int, struct dma_attrs *); + typedef u64 ia64_mv_dma_get_required_mask (struct device *); ++typedef struct dma_map_ops *ia64_mv_dma_get_ops(struct device *); + + /* + * WARNING: The legacy I/O space is _architected_. 
Platforms are +@@ -114,8 +97,6 @@ machvec_noop_bus (struct pci_bus *bus) + + extern void machvec_setup (char **); + extern void machvec_timer_interrupt (int, void *); +-extern void machvec_dma_sync_single (struct device *, dma_addr_t, size_t, int); +-extern void machvec_dma_sync_sg (struct device *, struct scatterlist *, int, int); + extern void machvec_tlb_migrate_finish (struct mm_struct *); + + # if defined (CONFIG_IA64_HP_SIM) +@@ -148,19 +129,8 @@ extern void machvec_tlb_migrate_finish ( + # define platform_global_tlb_purge ia64_mv.global_tlb_purge + # define platform_tlb_migrate_finish ia64_mv.tlb_migrate_finish + # define platform_dma_init ia64_mv.dma_init +-# define platform_dma_alloc_coherent ia64_mv.dma_alloc_coherent +-# define platform_dma_free_coherent ia64_mv.dma_free_coherent +-# define platform_dma_map_single_attrs ia64_mv.dma_map_single_attrs +-# define platform_dma_unmap_single_attrs ia64_mv.dma_unmap_single_attrs +-# define platform_dma_map_sg_attrs ia64_mv.dma_map_sg_attrs +-# define platform_dma_unmap_sg_attrs ia64_mv.dma_unmap_sg_attrs +-# define platform_dma_sync_single_for_cpu ia64_mv.dma_sync_single_for_cpu +-# define platform_dma_sync_sg_for_cpu ia64_mv.dma_sync_sg_for_cpu +-# define platform_dma_sync_single_for_device ia64_mv.dma_sync_single_for_device +-# define platform_dma_sync_sg_for_device ia64_mv.dma_sync_sg_for_device +-# define platform_dma_mapping_error ia64_mv.dma_mapping_error +-# define platform_dma_supported ia64_mv.dma_supported + # define platform_dma_get_required_mask ia64_mv.dma_get_required_mask ++# define platform_dma_get_ops ia64_mv.dma_get_ops + # define platform_irq_to_vector ia64_mv.irq_to_vector + # define platform_local_vector_to_irq ia64_mv.local_vector_to_irq + # define platform_pci_get_legacy_mem ia64_mv.pci_get_legacy_mem +@@ -203,19 +173,8 @@ struct ia64_machine_vector { + ia64_mv_global_tlb_purge_t *global_tlb_purge; + ia64_mv_tlb_migrate_finish_t *tlb_migrate_finish; + ia64_mv_dma_init *dma_init; +- ia64_mv_dma_alloc_coherent *dma_alloc_coherent; +- ia64_mv_dma_free_coherent *dma_free_coherent; +- ia64_mv_dma_map_single_attrs *dma_map_single_attrs; +- ia64_mv_dma_unmap_single_attrs *dma_unmap_single_attrs; +- ia64_mv_dma_map_sg_attrs *dma_map_sg_attrs; +- ia64_mv_dma_unmap_sg_attrs *dma_unmap_sg_attrs; +- ia64_mv_dma_sync_single_for_cpu *dma_sync_single_for_cpu; +- ia64_mv_dma_sync_sg_for_cpu *dma_sync_sg_for_cpu; +- ia64_mv_dma_sync_single_for_device *dma_sync_single_for_device; +- ia64_mv_dma_sync_sg_for_device *dma_sync_sg_for_device; +- ia64_mv_dma_mapping_error *dma_mapping_error; +- ia64_mv_dma_supported *dma_supported; + ia64_mv_dma_get_required_mask *dma_get_required_mask; ++ ia64_mv_dma_get_ops *dma_get_ops; + ia64_mv_irq_to_vector *irq_to_vector; + ia64_mv_local_vector_to_irq *local_vector_to_irq; + ia64_mv_pci_get_legacy_mem_t *pci_get_legacy_mem; +@@ -254,19 +213,8 @@ struct ia64_machine_vector { + platform_global_tlb_purge, \ + platform_tlb_migrate_finish, \ + platform_dma_init, \ +- platform_dma_alloc_coherent, \ +- platform_dma_free_coherent, \ +- platform_dma_map_single_attrs, \ +- platform_dma_unmap_single_attrs, \ +- platform_dma_map_sg_attrs, \ +- platform_dma_unmap_sg_attrs, \ +- platform_dma_sync_single_for_cpu, \ +- platform_dma_sync_sg_for_cpu, \ +- platform_dma_sync_single_for_device, \ +- platform_dma_sync_sg_for_device, \ +- platform_dma_mapping_error, \ +- platform_dma_supported, \ + platform_dma_get_required_mask, \ ++ platform_dma_get_ops, \ + platform_irq_to_vector, \ + platform_local_vector_to_irq, \ 
+ platform_pci_get_legacy_mem, \ +@@ -302,6 +250,9 @@ extern void machvec_init_from_cmdline(co + # error Unknown configuration. Update arch/ia64/include/asm/machvec.h. + # endif /* CONFIG_IA64_GENERIC */ + ++extern void swiotlb_dma_init(void); ++extern struct dma_map_ops *dma_get_ops(struct device *); ++ + /* + * Define default versions so we can extend machvec for new platforms without having + * to update the machvec files for all existing platforms. +@@ -332,43 +283,10 @@ extern void machvec_init_from_cmdline(co + # define platform_kernel_launch_event machvec_noop + #endif + #ifndef platform_dma_init +-# define platform_dma_init swiotlb_init +-#endif +-#ifndef platform_dma_alloc_coherent +-# define platform_dma_alloc_coherent swiotlb_alloc_coherent +-#endif +-#ifndef platform_dma_free_coherent +-# define platform_dma_free_coherent swiotlb_free_coherent +-#endif +-#ifndef platform_dma_map_single_attrs +-# define platform_dma_map_single_attrs swiotlb_map_single_attrs +-#endif +-#ifndef platform_dma_unmap_single_attrs +-# define platform_dma_unmap_single_attrs swiotlb_unmap_single_attrs +-#endif +-#ifndef platform_dma_map_sg_attrs +-# define platform_dma_map_sg_attrs swiotlb_map_sg_attrs +-#endif +-#ifndef platform_dma_unmap_sg_attrs +-# define platform_dma_unmap_sg_attrs swiotlb_unmap_sg_attrs +-#endif +-#ifndef platform_dma_sync_single_for_cpu +-# define platform_dma_sync_single_for_cpu swiotlb_sync_single_for_cpu +-#endif +-#ifndef platform_dma_sync_sg_for_cpu +-# define platform_dma_sync_sg_for_cpu swiotlb_sync_sg_for_cpu +-#endif +-#ifndef platform_dma_sync_single_for_device +-# define platform_dma_sync_single_for_device swiotlb_sync_single_for_device +-#endif +-#ifndef platform_dma_sync_sg_for_device +-# define platform_dma_sync_sg_for_device swiotlb_sync_sg_for_device +-#endif +-#ifndef platform_dma_mapping_error +-# define platform_dma_mapping_error swiotlb_dma_mapping_error ++# define platform_dma_init swiotlb_dma_init + #endif +-#ifndef platform_dma_supported +-# define platform_dma_supported swiotlb_dma_supported ++#ifndef platform_dma_get_ops ++# define platform_dma_get_ops dma_get_ops + #endif + #ifndef platform_dma_get_required_mask + # define platform_dma_get_required_mask ia64_dma_get_required_mask +Index: linux-2.6-tip/arch/ia64/include/asm/machvec_dig_vtd.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/machvec_dig_vtd.h ++++ linux-2.6-tip/arch/ia64/include/asm/machvec_dig_vtd.h +@@ -2,14 +2,6 @@ + #define _ASM_IA64_MACHVEC_DIG_VTD_h + + extern ia64_mv_setup_t dig_setup; +-extern ia64_mv_dma_alloc_coherent vtd_alloc_coherent; +-extern ia64_mv_dma_free_coherent vtd_free_coherent; +-extern ia64_mv_dma_map_single_attrs vtd_map_single_attrs; +-extern ia64_mv_dma_unmap_single_attrs vtd_unmap_single_attrs; +-extern ia64_mv_dma_map_sg_attrs vtd_map_sg_attrs; +-extern ia64_mv_dma_unmap_sg_attrs vtd_unmap_sg_attrs; +-extern ia64_mv_dma_supported iommu_dma_supported; +-extern ia64_mv_dma_mapping_error vtd_dma_mapping_error; + extern ia64_mv_dma_init pci_iommu_alloc; + + /* +@@ -22,17 +14,5 @@ extern ia64_mv_dma_init pci_iommu_allo + #define platform_name "dig_vtd" + #define platform_setup dig_setup + #define platform_dma_init pci_iommu_alloc +-#define platform_dma_alloc_coherent vtd_alloc_coherent +-#define platform_dma_free_coherent vtd_free_coherent +-#define platform_dma_map_single_attrs vtd_map_single_attrs +-#define platform_dma_unmap_single_attrs vtd_unmap_single_attrs +-#define platform_dma_map_sg_attrs 
vtd_map_sg_attrs +-#define platform_dma_unmap_sg_attrs vtd_unmap_sg_attrs +-#define platform_dma_sync_single_for_cpu machvec_dma_sync_single +-#define platform_dma_sync_sg_for_cpu machvec_dma_sync_sg +-#define platform_dma_sync_single_for_device machvec_dma_sync_single +-#define platform_dma_sync_sg_for_device machvec_dma_sync_sg +-#define platform_dma_supported iommu_dma_supported +-#define platform_dma_mapping_error vtd_dma_mapping_error + + #endif /* _ASM_IA64_MACHVEC_DIG_VTD_h */ +Index: linux-2.6-tip/arch/ia64/include/asm/machvec_hpzx1.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/machvec_hpzx1.h ++++ linux-2.6-tip/arch/ia64/include/asm/machvec_hpzx1.h +@@ -2,14 +2,7 @@ + #define _ASM_IA64_MACHVEC_HPZX1_h + + extern ia64_mv_setup_t dig_setup; +-extern ia64_mv_dma_alloc_coherent sba_alloc_coherent; +-extern ia64_mv_dma_free_coherent sba_free_coherent; +-extern ia64_mv_dma_map_single_attrs sba_map_single_attrs; +-extern ia64_mv_dma_unmap_single_attrs sba_unmap_single_attrs; +-extern ia64_mv_dma_map_sg_attrs sba_map_sg_attrs; +-extern ia64_mv_dma_unmap_sg_attrs sba_unmap_sg_attrs; +-extern ia64_mv_dma_supported sba_dma_supported; +-extern ia64_mv_dma_mapping_error sba_dma_mapping_error; ++extern ia64_mv_dma_init sba_dma_init; + + /* + * This stuff has dual use! +@@ -20,18 +13,6 @@ extern ia64_mv_dma_mapping_error sba_dma + */ + #define platform_name "hpzx1" + #define platform_setup dig_setup +-#define platform_dma_init machvec_noop +-#define platform_dma_alloc_coherent sba_alloc_coherent +-#define platform_dma_free_coherent sba_free_coherent +-#define platform_dma_map_single_attrs sba_map_single_attrs +-#define platform_dma_unmap_single_attrs sba_unmap_single_attrs +-#define platform_dma_map_sg_attrs sba_map_sg_attrs +-#define platform_dma_unmap_sg_attrs sba_unmap_sg_attrs +-#define platform_dma_sync_single_for_cpu machvec_dma_sync_single +-#define platform_dma_sync_sg_for_cpu machvec_dma_sync_sg +-#define platform_dma_sync_single_for_device machvec_dma_sync_single +-#define platform_dma_sync_sg_for_device machvec_dma_sync_sg +-#define platform_dma_supported sba_dma_supported +-#define platform_dma_mapping_error sba_dma_mapping_error ++#define platform_dma_init sba_dma_init + + #endif /* _ASM_IA64_MACHVEC_HPZX1_h */ +Index: linux-2.6-tip/arch/ia64/include/asm/machvec_hpzx1_swiotlb.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/machvec_hpzx1_swiotlb.h ++++ linux-2.6-tip/arch/ia64/include/asm/machvec_hpzx1_swiotlb.h +@@ -2,18 +2,7 @@ + #define _ASM_IA64_MACHVEC_HPZX1_SWIOTLB_h + + extern ia64_mv_setup_t dig_setup; +-extern ia64_mv_dma_alloc_coherent hwsw_alloc_coherent; +-extern ia64_mv_dma_free_coherent hwsw_free_coherent; +-extern ia64_mv_dma_map_single_attrs hwsw_map_single_attrs; +-extern ia64_mv_dma_unmap_single_attrs hwsw_unmap_single_attrs; +-extern ia64_mv_dma_map_sg_attrs hwsw_map_sg_attrs; +-extern ia64_mv_dma_unmap_sg_attrs hwsw_unmap_sg_attrs; +-extern ia64_mv_dma_supported hwsw_dma_supported; +-extern ia64_mv_dma_mapping_error hwsw_dma_mapping_error; +-extern ia64_mv_dma_sync_single_for_cpu hwsw_sync_single_for_cpu; +-extern ia64_mv_dma_sync_sg_for_cpu hwsw_sync_sg_for_cpu; +-extern ia64_mv_dma_sync_single_for_device hwsw_sync_single_for_device; +-extern ia64_mv_dma_sync_sg_for_device hwsw_sync_sg_for_device; ++extern ia64_mv_dma_get_ops hwsw_dma_get_ops; + + /* + * This stuff has dual use! 
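Every machvec touched by these hunks is converted the same way: the long list of per-operation platform_dma_* macros collapses into one dma_map_ops table per platform, reached either through an init hook that installs it in the global dma_ops (sba_dma_init, sn_dma_init, swiotlb_dma_init, pci_iommu_alloc) or through a platform_dma_get_ops hook that returns it directly (hwsw_dma_get_ops). A minimal sketch of that shape follows; it is an illustration only, and the example_* names are placeholders that do not appear anywhere in the patch.

/*
 * Illustrative sketch of the converted-platform pattern -- not part of the
 * patch. All example_* identifiers are hypothetical; only the dma_map_ops
 * fields actually used by the patch (alloc_coherent, map_page, map_sg,
 * sync_*, dma_supported, mapping_error, ...) are referenced.
 */
#include <linux/dma-mapping.h>	/* assumed home of struct dma_map_ops / dma_ops in this tree */

static void *example_alloc_coherent(struct device *dev, size_t size,
				    dma_addr_t *dma_handle, gfp_t gfp)
{
	/* platform-specific coherent allocation would go here */
	return NULL;
}

static struct dma_map_ops example_dma_ops = {
	.alloc_coherent	= example_alloc_coherent,
	/*
	 * .free_coherent, .map_page, .unmap_page, .map_sg, .unmap_sg,
	 * .sync_single/sg_for_cpu/device, .dma_supported, .mapping_error
	 * would be filled in the same way as sba_dma_ops or sn_dma_ops.
	 */
};

/* wired up as platform_dma_init in the platform's machvec header */
void example_dma_init(void)
{
	dma_ops = &example_dma_ops;	/* dma_get_ops() then returns this table */
}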
+@@ -23,20 +12,8 @@ extern ia64_mv_dma_sync_sg_for_device h + * the macros are used directly. + */ + #define platform_name "hpzx1_swiotlb" +- + #define platform_setup dig_setup + #define platform_dma_init machvec_noop +-#define platform_dma_alloc_coherent hwsw_alloc_coherent +-#define platform_dma_free_coherent hwsw_free_coherent +-#define platform_dma_map_single_attrs hwsw_map_single_attrs +-#define platform_dma_unmap_single_attrs hwsw_unmap_single_attrs +-#define platform_dma_map_sg_attrs hwsw_map_sg_attrs +-#define platform_dma_unmap_sg_attrs hwsw_unmap_sg_attrs +-#define platform_dma_supported hwsw_dma_supported +-#define platform_dma_mapping_error hwsw_dma_mapping_error +-#define platform_dma_sync_single_for_cpu hwsw_sync_single_for_cpu +-#define platform_dma_sync_sg_for_cpu hwsw_sync_sg_for_cpu +-#define platform_dma_sync_single_for_device hwsw_sync_single_for_device +-#define platform_dma_sync_sg_for_device hwsw_sync_sg_for_device ++#define platform_dma_get_ops hwsw_dma_get_ops + + #endif /* _ASM_IA64_MACHVEC_HPZX1_SWIOTLB_h */ +Index: linux-2.6-tip/arch/ia64/include/asm/machvec_sn2.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/machvec_sn2.h ++++ linux-2.6-tip/arch/ia64/include/asm/machvec_sn2.h +@@ -55,19 +55,8 @@ extern ia64_mv_readb_t __sn_readb_relaxe + extern ia64_mv_readw_t __sn_readw_relaxed; + extern ia64_mv_readl_t __sn_readl_relaxed; + extern ia64_mv_readq_t __sn_readq_relaxed; +-extern ia64_mv_dma_alloc_coherent sn_dma_alloc_coherent; +-extern ia64_mv_dma_free_coherent sn_dma_free_coherent; +-extern ia64_mv_dma_map_single_attrs sn_dma_map_single_attrs; +-extern ia64_mv_dma_unmap_single_attrs sn_dma_unmap_single_attrs; +-extern ia64_mv_dma_map_sg_attrs sn_dma_map_sg_attrs; +-extern ia64_mv_dma_unmap_sg_attrs sn_dma_unmap_sg_attrs; +-extern ia64_mv_dma_sync_single_for_cpu sn_dma_sync_single_for_cpu; +-extern ia64_mv_dma_sync_sg_for_cpu sn_dma_sync_sg_for_cpu; +-extern ia64_mv_dma_sync_single_for_device sn_dma_sync_single_for_device; +-extern ia64_mv_dma_sync_sg_for_device sn_dma_sync_sg_for_device; +-extern ia64_mv_dma_mapping_error sn_dma_mapping_error; +-extern ia64_mv_dma_supported sn_dma_supported; + extern ia64_mv_dma_get_required_mask sn_dma_get_required_mask; ++extern ia64_mv_dma_init sn_dma_init; + extern ia64_mv_migrate_t sn_migrate; + extern ia64_mv_kernel_launch_event_t sn_kernel_launch_event; + extern ia64_mv_setup_msi_irq_t sn_setup_msi_irq; +@@ -111,20 +100,8 @@ extern ia64_mv_pci_fixup_bus_t sn_pci_f + #define platform_pci_get_legacy_mem sn_pci_get_legacy_mem + #define platform_pci_legacy_read sn_pci_legacy_read + #define platform_pci_legacy_write sn_pci_legacy_write +-#define platform_dma_init machvec_noop +-#define platform_dma_alloc_coherent sn_dma_alloc_coherent +-#define platform_dma_free_coherent sn_dma_free_coherent +-#define platform_dma_map_single_attrs sn_dma_map_single_attrs +-#define platform_dma_unmap_single_attrs sn_dma_unmap_single_attrs +-#define platform_dma_map_sg_attrs sn_dma_map_sg_attrs +-#define platform_dma_unmap_sg_attrs sn_dma_unmap_sg_attrs +-#define platform_dma_sync_single_for_cpu sn_dma_sync_single_for_cpu +-#define platform_dma_sync_sg_for_cpu sn_dma_sync_sg_for_cpu +-#define platform_dma_sync_single_for_device sn_dma_sync_single_for_device +-#define platform_dma_sync_sg_for_device sn_dma_sync_sg_for_device +-#define platform_dma_mapping_error sn_dma_mapping_error +-#define platform_dma_supported sn_dma_supported + #define platform_dma_get_required_mask 
sn_dma_get_required_mask ++#define platform_dma_init sn_dma_init + #define platform_migrate sn_migrate + #define platform_kernel_launch_event sn_kernel_launch_event + #ifdef CONFIG_PCI_MSI +Index: linux-2.6-tip/arch/ia64/include/asm/percpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/percpu.h ++++ linux-2.6-tip/arch/ia64/include/asm/percpu.h +@@ -27,12 +27,12 @@ extern void *per_cpu_init(void); + + #else /* ! SMP */ + +-#define PER_CPU_ATTRIBUTES __attribute__((__section__(".data.percpu"))) +- + #define per_cpu_init() (__phys_per_cpu_start) + + #endif /* SMP */ + ++#define PER_CPU_BASE_SECTION ".data.percpu" ++ + /* + * Be extremely careful when taking the address of this variable! Due to virtual + * remapping, it is different from the canonical address returned by __get_cpu_var(var)! +Index: linux-2.6-tip/arch/ia64/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/swab.h ++++ linux-2.6-tip/arch/ia64/include/asm/swab.h +@@ -6,7 +6,7 @@ + * David Mosberger-Tang , Hewlett-Packard Co. + */ + +-#include ++#include + #include + #include + +Index: linux-2.6-tip/arch/ia64/include/asm/topology.h +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/include/asm/topology.h ++++ linux-2.6-tip/arch/ia64/include/asm/topology.h +@@ -84,7 +84,7 @@ void build_cpu_to_node_map(void); + .child = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ +- .max_interval = 8*(min(num_online_cpus(), 32)), \ ++ .max_interval = 8*(min(num_online_cpus(), 32U)), \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 2, \ +Index: linux-2.6-tip/arch/ia64/include/asm/uv/uv.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/ia64/include/asm/uv/uv.h +@@ -0,0 +1,13 @@ ++#ifndef _ASM_IA64_UV_UV_H ++#define _ASM_IA64_UV_UV_H ++ ++#include ++#include ++ ++static inline int is_uv_system(void) ++{ ++ /* temporary support for running on hardware simulator */ ++ return IS_MEDUSA() || ia64_platform_is("uv"); ++} ++ ++#endif /* _ASM_IA64_UV_UV_H */ +Index: linux-2.6-tip/arch/ia64/kernel/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/Makefile ++++ linux-2.6-tip/arch/ia64/kernel/Makefile +@@ -2,12 +2,16 @@ + # Makefile for the linux kernel. 
+ # + ++ifdef CONFIG_DYNAMIC_FTRACE ++CFLAGS_REMOVE_ftrace.o = -pg ++endif ++ + extra-y := head.o init_task.o vmlinux.lds + + obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ + irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ + salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ +- unwind.o mca.o mca_asm.o topology.o ++ unwind.o mca.o mca_asm.o topology.o dma-mapping.o + + obj-$(CONFIG_IA64_BRL_EMU) += brl_emu.o + obj-$(CONFIG_IA64_GENERIC) += acpi-ext.o +@@ -28,6 +32,7 @@ obj-$(CONFIG_IA64_CYCLONE) += cyclone.o + obj-$(CONFIG_CPU_FREQ) += cpufreq/ + obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o + obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o ++obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o + obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o + obj-$(CONFIG_CRASH_DUMP) += crash_dump.o + obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o +@@ -43,9 +48,7 @@ ifneq ($(CONFIG_IA64_ESI),) + obj-y += esi_stub.o # must be in kernel proper + endif + obj-$(CONFIG_DMAR) += pci-dma.o +-ifeq ($(CONFIG_DMAR), y) + obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +-endif + + # The gate DSO image is built using a special linker script. + targets += gate.so gate-syms.o +Index: linux-2.6-tip/arch/ia64/kernel/acpi.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/acpi.c ++++ linux-2.6-tip/arch/ia64/kernel/acpi.c +@@ -199,6 +199,10 @@ char *__init __acpi_map_table(unsigned l + return __va(phys_addr); + } + ++void __init __acpi_unmap_table(char *map, unsigned long size) ++{ ++} ++ + /* -------------------------------------------------------------------------- + Boot-time Table Parsing + -------------------------------------------------------------------------- */ +Index: linux-2.6-tip/arch/ia64/kernel/dma-mapping.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/ia64/kernel/dma-mapping.c +@@ -0,0 +1,13 @@ ++#include ++ ++/* Set this to 1 if there is a HW IOMMU in the system */ ++int iommu_detected __read_mostly; ++ ++struct dma_map_ops *dma_ops; ++EXPORT_SYMBOL(dma_ops); ++ ++struct dma_map_ops *dma_get_ops(struct device *dev) ++{ ++ return dma_ops; ++} ++EXPORT_SYMBOL(dma_get_ops); +Index: linux-2.6-tip/arch/ia64/kernel/entry.S +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/entry.S ++++ linux-2.6-tip/arch/ia64/kernel/entry.S +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + + #include "minstate.h" + +@@ -1404,6 +1405,105 @@ GLOBAL_ENTRY(unw_init_running) + br.ret.sptk.many rp + END(unw_init_running) + ++#ifdef CONFIG_FUNCTION_TRACER ++#ifdef CONFIG_DYNAMIC_FTRACE ++GLOBAL_ENTRY(_mcount) ++ br ftrace_stub ++END(_mcount) ++ ++.here: ++ br.ret.sptk.many b0 ++ ++GLOBAL_ENTRY(ftrace_caller) ++ alloc out0 = ar.pfs, 8, 0, 4, 0 ++ mov out3 = r0 ++ ;; ++ mov out2 = b0 ++ add r3 = 0x20, r3 ++ mov out1 = r1; ++ br.call.sptk.many b0 = ftrace_patch_gp ++ //this might be called from module, so we must patch gp ++ftrace_patch_gp: ++ movl gp=__gp ++ mov b0 = r3 ++ ;; ++.global ftrace_call; ++ftrace_call: ++{ ++ .mlx ++ nop.m 0x0 ++ movl r3 = .here;; ++} ++ alloc loc0 = ar.pfs, 4, 4, 2, 0 ++ ;; ++ mov loc1 = b0 ++ mov out0 = b0 ++ mov loc2 = r8 ++ mov loc3 = r15 ++ ;; ++ adds out0 = -MCOUNT_INSN_SIZE, out0 ++ mov out1 = in2 ++ mov b6 = r3 ++ ++ br.call.sptk.many b0 = b6 ++ ;; ++ mov ar.pfs = loc0 ++ mov b0 = loc1 ++ mov r8 = loc2 ++ mov r15 
= loc3 ++ br ftrace_stub ++ ;; ++END(ftrace_caller) ++ ++#else ++GLOBAL_ENTRY(_mcount) ++ movl r2 = ftrace_stub ++ movl r3 = ftrace_trace_function;; ++ ld8 r3 = [r3];; ++ ld8 r3 = [r3];; ++ cmp.eq p7,p0 = r2, r3 ++(p7) br.sptk.many ftrace_stub ++ ;; ++ ++ alloc loc0 = ar.pfs, 4, 4, 2, 0 ++ ;; ++ mov loc1 = b0 ++ mov out0 = b0 ++ mov loc2 = r8 ++ mov loc3 = r15 ++ ;; ++ adds out0 = -MCOUNT_INSN_SIZE, out0 ++ mov out1 = in2 ++ mov b6 = r3 ++ ++ br.call.sptk.many b0 = b6 ++ ;; ++ mov ar.pfs = loc0 ++ mov b0 = loc1 ++ mov r8 = loc2 ++ mov r15 = loc3 ++ br ftrace_stub ++ ;; ++END(_mcount) ++#endif ++ ++GLOBAL_ENTRY(ftrace_stub) ++ mov r3 = b0 ++ movl r2 = _mcount_ret_helper ++ ;; ++ mov b6 = r2 ++ mov b7 = r3 ++ br.ret.sptk.many b6 ++ ++_mcount_ret_helper: ++ mov b0 = r42 ++ mov r1 = r41 ++ mov ar.pfs = r40 ++ br b7 ++END(ftrace_stub) ++ ++#endif /* CONFIG_FUNCTION_TRACER */ ++ + .rodata + .align 8 + .globl sys_call_table +Index: linux-2.6-tip/arch/ia64/kernel/ftrace.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/ia64/kernel/ftrace.c +@@ -0,0 +1,206 @@ ++/* ++ * Dynamic function tracing support. ++ * ++ * Copyright (C) 2008 Shaohua Li ++ * ++ * For licencing details, see COPYING. ++ * ++ * Defines low-level handling of mcount calls when the kernel ++ * is compiled with the -pg flag. When using dynamic ftrace, the ++ * mcount call-sites get patched lazily with NOP till they are ++ * enabled. All code mutation routines here take effect atomically. ++ */ ++ ++#include ++#include ++ ++#include ++#include ++ ++/* In IA64, each function will be added below two bundles with -pg option */ ++static unsigned char __attribute__((aligned(8))) ++ftrace_orig_code[MCOUNT_INSN_SIZE] = { ++ 0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */ ++ 0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */ ++ 0x05, 0x00, 0xc4, 0x00, /* mov r42=b0 */ ++ 0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */ ++ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */ ++ 0x08, 0x00, 0x00, 0x50 /* br.call.sptk.many b0 = _mcount;; */ ++}; ++ ++struct ftrace_orig_insn { ++ u64 dummy1, dummy2, dummy3; ++ u64 dummy4:64-41+13; ++ u64 imm20:20; ++ u64 dummy5:3; ++ u64 sign:1; ++ u64 dummy6:4; ++}; ++ ++/* mcount stub will be converted below for nop */ ++static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = { ++ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ ++ 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ ++ 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ ++ 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ ++ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */ ++ 0x00, 0x00, 0x04, 0x00 ++}; ++ ++static unsigned char *ftrace_nop_replace(void) ++{ ++ return ftrace_nop_code; ++} ++ ++/* ++ * mcount stub will be converted below for call ++ * Note: Just the last instruction is changed against nop ++ * */ ++static unsigned char __attribute__((aligned(8))) ++ftrace_call_code[MCOUNT_INSN_SIZE] = { ++ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ ++ 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ ++ 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ ++ 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ ++ 0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/ ++ 0xf8, 0xff, 0xff, 0xc8 ++}; ++ ++struct ftrace_call_insn { ++ u64 dummy1, dummy2; ++ u64 dummy3:48; ++ u64 imm39_l:16; ++ u64 imm39_h:23; ++ u64 dummy4:13; ++ u64 imm20:20; ++ u64 dummy5:3; ++ u64 i:1; ++ u64 dummy6:4; ++}; ++ ++static unsigned char 
*ftrace_call_replace(unsigned long ip, unsigned long addr) ++{ ++ struct ftrace_call_insn *code = (void *)ftrace_call_code; ++ unsigned long offset = addr - (ip + 0x10); ++ ++ code->imm39_l = offset >> 24; ++ code->imm39_h = offset >> 40; ++ code->imm20 = offset >> 4; ++ code->i = offset >> 63; ++ return ftrace_call_code; ++} ++ ++static int ++ftrace_modify_code(unsigned long ip, unsigned char *old_code, ++ unsigned char *new_code, int do_check) ++{ ++ unsigned char replaced[MCOUNT_INSN_SIZE]; ++ ++ /* ++ * Note: Due to modules and __init, code can ++ * disappear and change, we need to protect against faulting ++ * as well as code changing. We do this by using the ++ * probe_kernel_* functions. ++ * ++ * No real locking needed, this code is run through ++ * kstop_machine, or before SMP starts. ++ */ ++ ++ if (!do_check) ++ goto skip_check; ++ ++ /* read the text we want to modify */ ++ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) ++ return -EFAULT; ++ ++ /* Make sure it is what we expect it to be */ ++ if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) ++ return -EINVAL; ++ ++skip_check: ++ /* replace the text with the new text */ ++ if (probe_kernel_write(((void *)ip), new_code, MCOUNT_INSN_SIZE)) ++ return -EPERM; ++ flush_icache_range(ip, ip + MCOUNT_INSN_SIZE); ++ ++ return 0; ++} ++ ++static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr) ++{ ++ unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE]; ++ unsigned long ip = rec->ip; ++ ++ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) ++ return -EFAULT; ++ if (rec->flags & FTRACE_FL_CONVERTED) { ++ struct ftrace_call_insn *call_insn, *tmp_call; ++ ++ call_insn = (void *)ftrace_call_code; ++ tmp_call = (void *)replaced; ++ call_insn->imm39_l = tmp_call->imm39_l; ++ call_insn->imm39_h = tmp_call->imm39_h; ++ call_insn->imm20 = tmp_call->imm20; ++ call_insn->i = tmp_call->i; ++ if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0) ++ return -EINVAL; ++ return 0; ++ } else { ++ struct ftrace_orig_insn *call_insn, *tmp_call; ++ ++ call_insn = (void *)ftrace_orig_code; ++ tmp_call = (void *)replaced; ++ call_insn->sign = tmp_call->sign; ++ call_insn->imm20 = tmp_call->imm20; ++ if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0) ++ return -EINVAL; ++ return 0; ++ } ++} ++ ++int ftrace_make_nop(struct module *mod, ++ struct dyn_ftrace *rec, unsigned long addr) ++{ ++ int ret; ++ char *new; ++ ++ ret = ftrace_make_nop_check(rec, addr); ++ if (ret) ++ return ret; ++ new = ftrace_nop_replace(); ++ return ftrace_modify_code(rec->ip, NULL, new, 0); ++} ++ ++int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) ++{ ++ unsigned long ip = rec->ip; ++ unsigned char *old, *new; ++ ++ old= ftrace_nop_replace(); ++ new = ftrace_call_replace(ip, addr); ++ return ftrace_modify_code(ip, old, new, 1); ++} ++ ++/* in IA64, _mcount can't directly call ftrace_stub. 
Only jump is ok */ ++int ftrace_update_ftrace_func(ftrace_func_t func) ++{ ++ unsigned long ip; ++ unsigned long addr = ((struct fnptr *)ftrace_call)->ip; ++ ++ if (func == ftrace_stub) ++ return 0; ++ ip = ((struct fnptr *)func)->ip; ++ ++ ia64_patch_imm64(addr + 2, ip); ++ ++ flush_icache_range(addr, addr + 16); ++ return 0; ++} ++ ++/* run from kstop_machine */ ++int __init ftrace_dyn_arch_init(void *data) ++{ ++ *(unsigned long *)data = 0; ++ ++ return 0; ++} +Index: linux-2.6-tip/arch/ia64/kernel/ia64_ksyms.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/ia64_ksyms.c ++++ linux-2.6-tip/arch/ia64/kernel/ia64_ksyms.c +@@ -112,3 +112,9 @@ EXPORT_SYMBOL_GPL(esi_call_phys); + #endif + extern char ia64_ivt[]; + EXPORT_SYMBOL(ia64_ivt); ++ ++#include ++#ifdef CONFIG_FUNCTION_TRACER ++/* mcount is defined in assembly */ ++EXPORT_SYMBOL(_mcount); ++#endif +Index: linux-2.6-tip/arch/ia64/kernel/iosapic.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/iosapic.c ++++ linux-2.6-tip/arch/ia64/kernel/iosapic.c +@@ -880,7 +880,7 @@ iosapic_unregister_intr (unsigned int gs + if (iosapic_intr_info[irq].count == 0) { + #ifdef CONFIG_SMP + /* Clear affinity */ +- cpus_setall(idesc->affinity); ++ cpumask_setall(idesc->affinity); + #endif + /* Clear the interrupt information */ + iosapic_intr_info[irq].dest = 0; +Index: linux-2.6-tip/arch/ia64/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/irq.c ++++ linux-2.6-tip/arch/ia64/kernel/irq.c +@@ -80,7 +80,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%10u ", kstat_irqs(i)); + #else + for_each_online_cpu(j) { +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + } + #endif + seq_printf(p, " %14s", irq_desc[i].chip->name); +@@ -103,7 +103,7 @@ static char irq_redir [NR_IRQS]; // = { + void set_irq_affinity_info (unsigned int irq, int hwid, int redir) + { + if (irq < NR_IRQS) { +- cpumask_copy(&irq_desc[irq].affinity, ++ cpumask_copy(irq_desc[irq].affinity, + cpumask_of(cpu_logical_id(hwid))); + irq_redir[irq] = (char) (redir & 0xff); + } +@@ -148,7 +148,7 @@ static void migrate_irqs(void) + if (desc->status == IRQ_PER_CPU) + continue; + +- if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask) ++ if (cpumask_any_and(irq_desc[irq].affinity, cpu_online_mask) + >= nr_cpu_ids) { + /* + * Save it for phase 2 processing +Index: linux-2.6-tip/arch/ia64/kernel/irq_ia64.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/irq_ia64.c ++++ linux-2.6-tip/arch/ia64/kernel/irq_ia64.c +@@ -493,11 +493,13 @@ ia64_handle_irq (ia64_vector vector, str + saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); + ia64_srlz_d(); + while (vector != IA64_SPURIOUS_INT_VECTOR) { ++ struct irq_desc *desc = irq_to_desc(vector); ++ + if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { + smp_local_flush_tlb(); +- kstat_this_cpu.irqs[vector]++; ++ kstat_incr_irqs_this_cpu(vector, desc); + } else if (unlikely(IS_RESCHEDULE(vector))) +- kstat_this_cpu.irqs[vector]++; ++ kstat_incr_irqs_this_cpu(vector, desc); + else { + int irq = local_vector_to_irq(vector); + +@@ -551,11 +553,13 @@ void ia64_process_pending_intr(void) + * Perform normal interrupt style processing + */ + while (vector != IA64_SPURIOUS_INT_VECTOR) { ++ struct irq_desc *desc = irq_to_desc(vector); ++ + if 
(unlikely(IS_LOCAL_TLB_FLUSH(vector))) { + smp_local_flush_tlb(); +- kstat_this_cpu.irqs[vector]++; ++ kstat_incr_irqs_this_cpu(vector, desc); + } else if (unlikely(IS_RESCHEDULE(vector))) +- kstat_this_cpu.irqs[vector]++; ++ kstat_incr_irqs_this_cpu(vector, desc); + else { + struct pt_regs *old_regs = set_irq_regs(NULL); + int irq = local_vector_to_irq(vector); +Index: linux-2.6-tip/arch/ia64/kernel/machvec.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/machvec.c ++++ linux-2.6-tip/arch/ia64/kernel/machvec.c +@@ -1,5 +1,5 @@ + #include +- ++#include + #include + #include + +@@ -75,14 +75,16 @@ machvec_timer_interrupt (int irq, void * + EXPORT_SYMBOL(machvec_timer_interrupt); + + void +-machvec_dma_sync_single (struct device *hwdev, dma_addr_t dma_handle, size_t size, int dir) ++machvec_dma_sync_single(struct device *hwdev, dma_addr_t dma_handle, size_t size, ++ enum dma_data_direction dir) + { + mb(); + } + EXPORT_SYMBOL(machvec_dma_sync_single); + + void +-machvec_dma_sync_sg (struct device *hwdev, struct scatterlist *sg, int n, int dir) ++machvec_dma_sync_sg(struct device *hwdev, struct scatterlist *sg, int n, ++ enum dma_data_direction dir) + { + mb(); + } +Index: linux-2.6-tip/arch/ia64/kernel/msi_ia64.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/msi_ia64.c ++++ linux-2.6-tip/arch/ia64/kernel/msi_ia64.c +@@ -75,7 +75,7 @@ static void ia64_set_msi_irq_affinity(un + msg.data = data; + + write_msi_msg(irq, &msg); +- irq_desc[irq].affinity = cpumask_of_cpu(cpu); ++ cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu)); + } + #endif /* CONFIG_SMP */ + +@@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsign + msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu)); + + dmar_msi_write(irq, &msg); +- irq_desc[irq].affinity = *mask; ++ cpumask_copy(irq_desc[irq].affinity, mask); + } + #endif /* CONFIG_SMP */ + +Index: linux-2.6-tip/arch/ia64/kernel/pci-dma.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/pci-dma.c ++++ linux-2.6-tip/arch/ia64/kernel/pci-dma.c +@@ -32,9 +32,6 @@ int force_iommu __read_mostly = 1; + int force_iommu __read_mostly; + #endif + +-/* Set this to 1 if there is a HW IOMMU in the system */ +-int iommu_detected __read_mostly; +- + /* Dummy device used for NULL arguments (normally ISA). Better would + be probably a smaller DMA mask, but this is bug-to-bug compatible + to i386. */ +@@ -44,18 +41,7 @@ struct device fallback_dev = { + .dma_mask = &fallback_dev.coherent_dma_mask, + }; + +-void __init pci_iommu_alloc(void) +-{ +- /* +- * The order of these functions is important for +- * fall-back/fail-over reasons +- */ +- detect_intel_iommu(); +- +-#ifdef CONFIG_SWIOTLB +- pci_swiotlb_init(); +-#endif +-} ++extern struct dma_map_ops intel_dma_ops; + + static int __init pci_iommu_init(void) + { +@@ -79,15 +65,12 @@ iommu_dma_init(void) + return; + } + +-struct dma_mapping_ops *dma_ops; +-EXPORT_SYMBOL(dma_ops); +- + int iommu_dma_supported(struct device *dev, u64 mask) + { +- struct dma_mapping_ops *ops = get_dma_ops(dev); ++ struct dma_map_ops *ops = platform_dma_get_ops(dev); + +- if (ops->dma_supported_op) +- return ops->dma_supported_op(dev, mask); ++ if (ops->dma_supported) ++ return ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. 
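The inline wrappers added to asm/dma-mapping.h earlier in this patch keep the driver-facing API unchanged: dma_map_single(), dma_mapping_error(), dma_unmap_single() and friends now simply fetch platform_dma_get_ops(dev) and call through whichever dma_map_ops table was installed (swiotlb, sba, intel, sn2 or hwsw). A hypothetical caller, shown for illustration only:

/*
 * Hypothetical driver fragment -- not part of the patch. example_send(),
 * dev and buf are placeholders; the comments note which dma_map_ops hook
 * each wrapper dispatches to after this conversion.
 */
static int example_send(struct device *dev, void *buf, size_t len)
{
	dma_addr_t daddr;

	/* dma_map_single_attrs() -> ops->map_page() */
	daddr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	/* ops->mapping_error() */
	if (dma_mapping_error(dev, daddr))
		return -ENOMEM;

	/* ... hand daddr to the hardware and wait for completion ... */

	/* dma_unmap_single_attrs() -> ops->unmap_page() */
	dma_unmap_single(dev, daddr, len, DMA_TO_DEVICE);
	return 0;
}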
+@@ -116,4 +99,25 @@ int iommu_dma_supported(struct device *d + } + EXPORT_SYMBOL(iommu_dma_supported); + ++void __init pci_iommu_alloc(void) ++{ ++ dma_ops = &intel_dma_ops; ++ ++ dma_ops->sync_single_for_cpu = machvec_dma_sync_single; ++ dma_ops->sync_sg_for_cpu = machvec_dma_sync_sg; ++ dma_ops->sync_single_for_device = machvec_dma_sync_single; ++ dma_ops->sync_sg_for_device = machvec_dma_sync_sg; ++ dma_ops->dma_supported = iommu_dma_supported; ++ ++ /* ++ * The order of these functions is important for ++ * fall-back/fail-over reasons ++ */ ++ detect_intel_iommu(); ++ ++#ifdef CONFIG_SWIOTLB ++ pci_swiotlb_init(); ++#endif ++} ++ + #endif +Index: linux-2.6-tip/arch/ia64/kernel/pci-swiotlb.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/pci-swiotlb.c ++++ linux-2.6-tip/arch/ia64/kernel/pci-swiotlb.c +@@ -13,23 +13,37 @@ + int swiotlb __read_mostly; + EXPORT_SYMBOL(swiotlb); + +-struct dma_mapping_ops swiotlb_dma_ops = { +- .mapping_error = swiotlb_dma_mapping_error, +- .alloc_coherent = swiotlb_alloc_coherent, ++static void *ia64_swiotlb_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t *dma_handle, gfp_t gfp) ++{ ++ if (dev->coherent_dma_mask != DMA_64BIT_MASK) ++ gfp |= GFP_DMA; ++ return swiotlb_alloc_coherent(dev, size, dma_handle, gfp); ++} ++ ++struct dma_map_ops swiotlb_dma_ops = { ++ .alloc_coherent = ia64_swiotlb_alloc_coherent, + .free_coherent = swiotlb_free_coherent, +- .map_single = swiotlb_map_single, +- .unmap_single = swiotlb_unmap_single, ++ .map_page = swiotlb_map_page, ++ .unmap_page = swiotlb_unmap_page, ++ .map_sg = swiotlb_map_sg_attrs, ++ .unmap_sg = swiotlb_unmap_sg_attrs, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, + .sync_single_range_for_device = swiotlb_sync_single_range_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, +- .map_sg = swiotlb_map_sg, +- .unmap_sg = swiotlb_unmap_sg, +- .dma_supported_op = swiotlb_dma_supported, ++ .dma_supported = swiotlb_dma_supported, ++ .mapping_error = swiotlb_dma_mapping_error, + }; + ++void __init swiotlb_dma_init(void) ++{ ++ dma_ops = &swiotlb_dma_ops; ++ swiotlb_init(); ++} ++ + void __init pci_swiotlb_init(void) + { + if (!iommu_detected) { +Index: linux-2.6-tip/arch/ia64/kernel/vmlinux.lds.S +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/kernel/vmlinux.lds.S ++++ linux-2.6-tip/arch/ia64/kernel/vmlinux.lds.S +@@ -213,16 +213,9 @@ SECTIONS + { *(.data.cacheline_aligned) } + + /* Per-cpu data: */ +- percpu : { } :percpu + . = ALIGN(PERCPU_PAGE_SIZE); +- __phys_per_cpu_start = .; +- .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET) +- { +- __per_cpu_start = .; +- *(.data.percpu) +- *(.data.percpu.shared_aligned) +- __per_cpu_end = .; +- } ++ PERCPU_VADDR(PERCPU_ADDR, :percpu) ++ __phys_per_cpu_start = __per_cpu_load; + . 
= __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits + * into percpu page size + */ +Index: linux-2.6-tip/arch/ia64/sn/kernel/msi_sn.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/sn/kernel/msi_sn.c ++++ linux-2.6-tip/arch/ia64/sn/kernel/msi_sn.c +@@ -205,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsi + msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff); + + write_msi_msg(irq, &msg); +- irq_desc[irq].affinity = *cpu_mask; ++ cpumask_copy(irq_desc[irq].affinity, cpu_mask); + } + #endif /* CONFIG_SMP */ + +Index: linux-2.6-tip/arch/ia64/sn/pci/pci_dma.c +=================================================================== +--- linux-2.6-tip.orig/arch/ia64/sn/pci/pci_dma.c ++++ linux-2.6-tip/arch/ia64/sn/pci/pci_dma.c +@@ -10,7 +10,7 @@ + */ + + #include +-#include ++#include + #include + #include + #include +@@ -31,7 +31,7 @@ + * this function. Of course, SN only supports devices that have 32 or more + * address bits when using the PMU. + */ +-int sn_dma_supported(struct device *dev, u64 mask) ++static int sn_dma_supported(struct device *dev, u64 mask) + { + BUG_ON(dev->bus != &pci_bus_type); + +@@ -39,7 +39,6 @@ int sn_dma_supported(struct device *dev, + return 0; + return 1; + } +-EXPORT_SYMBOL(sn_dma_supported); + + /** + * sn_dma_set_mask - set the DMA mask +@@ -75,8 +74,8 @@ EXPORT_SYMBOL(sn_dma_set_mask); + * queue for a SCSI controller). See Documentation/DMA-API.txt for + * more information. + */ +-void *sn_dma_alloc_coherent(struct device *dev, size_t size, +- dma_addr_t * dma_handle, gfp_t flags) ++static void *sn_dma_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t * dma_handle, gfp_t flags) + { + void *cpuaddr; + unsigned long phys_addr; +@@ -124,7 +123,6 @@ void *sn_dma_alloc_coherent(struct devic + + return cpuaddr; + } +-EXPORT_SYMBOL(sn_dma_alloc_coherent); + + /** + * sn_pci_free_coherent - free memory associated with coherent DMAable region +@@ -136,8 +134,8 @@ EXPORT_SYMBOL(sn_dma_alloc_coherent); + * Frees the memory allocated by dma_alloc_coherent(), potentially unmapping + * any associated IOMMU mappings. + */ +-void sn_dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, +- dma_addr_t dma_handle) ++static void sn_dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, ++ dma_addr_t dma_handle) + { + struct pci_dev *pdev = to_pci_dev(dev); + struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); +@@ -147,7 +145,6 @@ void sn_dma_free_coherent(struct device + provider->dma_unmap(pdev, dma_handle, 0); + free_pages((unsigned long)cpu_addr, get_order(size)); + } +-EXPORT_SYMBOL(sn_dma_free_coherent); + + /** + * sn_dma_map_single_attrs - map a single page for DMA +@@ -173,10 +170,12 @@ EXPORT_SYMBOL(sn_dma_free_coherent); + * TODO: simplify our interface; + * figure out how to save dmamap handle so can use two step. 
+ */ +-dma_addr_t sn_dma_map_single_attrs(struct device *dev, void *cpu_addr, +- size_t size, int direction, +- struct dma_attrs *attrs) ++static dma_addr_t sn_dma_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { ++ void *cpu_addr = page_address(page) + offset; + dma_addr_t dma_addr; + unsigned long phys_addr; + struct pci_dev *pdev = to_pci_dev(dev); +@@ -201,7 +200,6 @@ dma_addr_t sn_dma_map_single_attrs(struc + } + return dma_addr; + } +-EXPORT_SYMBOL(sn_dma_map_single_attrs); + + /** + * sn_dma_unmap_single_attrs - unamp a DMA mapped page +@@ -215,21 +213,20 @@ EXPORT_SYMBOL(sn_dma_map_single_attrs); + * by @dma_handle into the coherence domain. On SN, we're always cache + * coherent, so we just need to free any ATEs associated with this mapping. + */ +-void sn_dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr, +- size_t size, int direction, +- struct dma_attrs *attrs) ++static void sn_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct pci_dev *pdev = to_pci_dev(dev); + struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); + + BUG_ON(dev->bus != &pci_bus_type); + +- provider->dma_unmap(pdev, dma_addr, direction); ++ provider->dma_unmap(pdev, dma_addr, dir); + } +-EXPORT_SYMBOL(sn_dma_unmap_single_attrs); + + /** +- * sn_dma_unmap_sg_attrs - unmap a DMA scatterlist ++ * sn_dma_unmap_sg - unmap a DMA scatterlist + * @dev: device to unmap + * @sg: scatterlist to unmap + * @nhwentries: number of scatterlist entries +@@ -238,9 +235,9 @@ EXPORT_SYMBOL(sn_dma_unmap_single_attrs) + * + * Unmap a set of streaming mode DMA translations. + */ +-void sn_dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl, +- int nhwentries, int direction, +- struct dma_attrs *attrs) ++static void sn_dma_unmap_sg(struct device *dev, struct scatterlist *sgl, ++ int nhwentries, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + int i; + struct pci_dev *pdev = to_pci_dev(dev); +@@ -250,15 +247,14 @@ void sn_dma_unmap_sg_attrs(struct device + BUG_ON(dev->bus != &pci_bus_type); + + for_each_sg(sgl, sg, nhwentries, i) { +- provider->dma_unmap(pdev, sg->dma_address, direction); ++ provider->dma_unmap(pdev, sg->dma_address, dir); + sg->dma_address = (dma_addr_t) NULL; + sg->dma_length = 0; + } + } +-EXPORT_SYMBOL(sn_dma_unmap_sg_attrs); + + /** +- * sn_dma_map_sg_attrs - map a scatterlist for DMA ++ * sn_dma_map_sg - map a scatterlist for DMA + * @dev: device to map for + * @sg: scatterlist to map + * @nhwentries: number of entries +@@ -272,8 +268,9 @@ EXPORT_SYMBOL(sn_dma_unmap_sg_attrs); + * + * Maps each entry of @sg for DMA. + */ +-int sn_dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, +- int nhwentries, int direction, struct dma_attrs *attrs) ++static int sn_dma_map_sg(struct device *dev, struct scatterlist *sgl, ++ int nhwentries, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + unsigned long phys_addr; + struct scatterlist *saved_sg = sgl, *sg; +@@ -310,8 +307,7 @@ int sn_dma_map_sg_attrs(struct device *d + * Free any successfully allocated entries. 
+ */ + if (i > 0) +- sn_dma_unmap_sg_attrs(dev, saved_sg, i, +- direction, attrs); ++ sn_dma_unmap_sg(dev, saved_sg, i, dir, attrs); + return 0; + } + +@@ -320,41 +316,36 @@ int sn_dma_map_sg_attrs(struct device *d + + return nhwentries; + } +-EXPORT_SYMBOL(sn_dma_map_sg_attrs); + +-void sn_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, +- size_t size, int direction) ++static void sn_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, ++ size_t size, enum dma_data_direction dir) + { + BUG_ON(dev->bus != &pci_bus_type); + } +-EXPORT_SYMBOL(sn_dma_sync_single_for_cpu); + +-void sn_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, +- size_t size, int direction) ++static void sn_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, ++ size_t size, ++ enum dma_data_direction dir) + { + BUG_ON(dev->bus != &pci_bus_type); + } +-EXPORT_SYMBOL(sn_dma_sync_single_for_device); + +-void sn_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, +- int nelems, int direction) ++static void sn_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir) + { + BUG_ON(dev->bus != &pci_bus_type); + } +-EXPORT_SYMBOL(sn_dma_sync_sg_for_cpu); + +-void sn_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, +- int nelems, int direction) ++static void sn_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir) + { + BUG_ON(dev->bus != &pci_bus_type); + } +-EXPORT_SYMBOL(sn_dma_sync_sg_for_device); + +-int sn_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) ++static int sn_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + { + return 0; + } +-EXPORT_SYMBOL(sn_dma_mapping_error); + + u64 sn_dma_get_required_mask(struct device *dev) + { +@@ -471,3 +462,23 @@ int sn_pci_legacy_write(struct pci_bus * + out: + return ret; + } ++ ++static struct dma_map_ops sn_dma_ops = { ++ .alloc_coherent = sn_dma_alloc_coherent, ++ .free_coherent = sn_dma_free_coherent, ++ .map_page = sn_dma_map_page, ++ .unmap_page = sn_dma_unmap_page, ++ .map_sg = sn_dma_map_sg, ++ .unmap_sg = sn_dma_unmap_sg, ++ .sync_single_for_cpu = sn_dma_sync_single_for_cpu, ++ .sync_sg_for_cpu = sn_dma_sync_sg_for_cpu, ++ .sync_single_for_device = sn_dma_sync_single_for_device, ++ .sync_sg_for_device = sn_dma_sync_sg_for_device, ++ .mapping_error = sn_dma_mapping_error, ++ .dma_supported = sn_dma_supported, ++}; ++ ++void sn_dma_init(void) ++{ ++ dma_ops = &sn_dma_ops; ++} +Index: linux-2.6-tip/arch/m32r/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/m32r/kernel/irq.c ++++ linux-2.6-tip/arch/m32r/kernel/irq.c +@@ -49,7 +49,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%10u ", kstat_irqs(i)); + #else + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + #endif + seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %s", action->name); +Index: linux-2.6-tip/arch/m68k/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/m68k/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/mips/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/mips/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: 
linux-2.6-tip/arch/mips/include/asm/irq.h +=================================================================== +--- linux-2.6-tip.orig/arch/mips/include/asm/irq.h ++++ linux-2.6-tip/arch/mips/include/asm/irq.h +@@ -66,7 +66,7 @@ extern void smtc_forward_irq(unsigned in + */ + #define IRQ_AFFINITY_HOOK(irq) \ + do { \ +- if (!cpu_isset(smp_processor_id(), irq_desc[irq].affinity)) { \ ++ if (!cpumask_test_cpu(smp_processor_id(), irq_desc[irq].affinity)) {\ + smtc_forward_irq(irq); \ + irq_exit(); \ + return; \ +Index: linux-2.6-tip/arch/mips/include/asm/sigcontext.h +=================================================================== +--- linux-2.6-tip.orig/arch/mips/include/asm/sigcontext.h ++++ linux-2.6-tip/arch/mips/include/asm/sigcontext.h +@@ -9,6 +9,7 @@ + #ifndef _ASM_SIGCONTEXT_H + #define _ASM_SIGCONTEXT_H + ++#include + #include + + #if _MIPS_SIM == _MIPS_SIM_ABI32 +Index: linux-2.6-tip/arch/mips/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/mips/include/asm/swab.h ++++ linux-2.6-tip/arch/mips/include/asm/swab.h +@@ -9,7 +9,7 @@ + #define _ASM_SWAB_H + + #include +-#include ++#include + + #define __SWAB_64_THRU_32__ + +Index: linux-2.6-tip/arch/mips/kernel/irq-gic.c +=================================================================== +--- linux-2.6-tip.orig/arch/mips/kernel/irq-gic.c ++++ linux-2.6-tip/arch/mips/kernel/irq-gic.c +@@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned in + set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask); + + } +- irq_desc[irq].affinity = *cpumask; ++ cpumask_copy(irq_desc[irq].affinity, cpumask); + spin_unlock_irqrestore(&gic_lock, flags); + + } +Index: linux-2.6-tip/arch/mips/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/mips/kernel/irq.c ++++ linux-2.6-tip/arch/mips/kernel/irq.c +@@ -108,7 +108,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%10u ", kstat_irqs(i)); + #else + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + #endif + seq_printf(p, " %14s", irq_desc[i].chip->name); + seq_printf(p, " %s", action->name); +Index: linux-2.6-tip/arch/mips/kernel/smtc.c +=================================================================== +--- linux-2.6-tip.orig/arch/mips/kernel/smtc.c ++++ linux-2.6-tip/arch/mips/kernel/smtc.c +@@ -686,7 +686,7 @@ void smtc_forward_irq(unsigned int irq) + * and efficiency, we just pick the easiest one to find. 
+ */ + +- target = first_cpu(irq_desc[irq].affinity); ++ target = cpumask_first(irq_desc[irq].affinity); + + /* + * We depend on the platform code to have correctly processed +@@ -921,11 +921,13 @@ void ipi_decode(struct smtc_ipi *pipi) + struct clock_event_device *cd; + void *arg_copy = pipi->arg; + int type_copy = pipi->type; ++ int irq = MIPS_CPU_IRQ_BASE + 1; ++ + smtc_ipi_nq(&freeIPIq, pipi); + switch (type_copy) { + case SMTC_CLOCK_TICK: + irq_enter(); +- kstat_this_cpu.irqs[MIPS_CPU_IRQ_BASE + 1]++; ++ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + cd = &per_cpu(mips_clockevent_device, cpu); + cd->event_handler(cd); + irq_exit(); +Index: linux-2.6-tip/arch/mips/mti-malta/malta-smtc.c +=================================================================== +--- linux-2.6-tip.orig/arch/mips/mti-malta/malta-smtc.c ++++ linux-2.6-tip/arch/mips/mti-malta/malta-smtc.c +@@ -116,7 +116,7 @@ struct plat_smp_ops msmtc_smp_ops = { + + void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) + { +- cpumask_t tmask = *affinity; ++ cpumask_t tmask; + int cpu = 0; + void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff); + +@@ -139,11 +139,12 @@ void plat_set_irq_affinity(unsigned int + * be made to forward to an offline "CPU". + */ + ++ cpumask_copy(&tmask, affinity); + for_each_cpu(cpu, affinity) { + if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu)) + cpu_clear(cpu, tmask); + } +- irq_desc[irq].affinity = tmask; ++ cpumask_copy(irq_desc[irq].affinity, &tmask); + + if (cpus_empty(tmask)) + /* +Index: linux-2.6-tip/arch/mips/sgi-ip22/ip22-int.c +=================================================================== +--- linux-2.6-tip.orig/arch/mips/sgi-ip22/ip22-int.c ++++ linux-2.6-tip/arch/mips/sgi-ip22/ip22-int.c +@@ -155,7 +155,7 @@ static void indy_buserror_irq(void) + int irq = SGI_BUSERR_IRQ; + + irq_enter(); +- kstat_this_cpu.irqs[irq]++; ++ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + ip22_be_interrupt(irq); + irq_exit(); + } +Index: linux-2.6-tip/arch/mips/sgi-ip22/ip22-time.c +=================================================================== +--- linux-2.6-tip.orig/arch/mips/sgi-ip22/ip22-time.c ++++ linux-2.6-tip/arch/mips/sgi-ip22/ip22-time.c +@@ -122,7 +122,7 @@ void indy_8254timer_irq(void) + char c; + + irq_enter(); +- kstat_this_cpu.irqs[irq]++; ++ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + printk(KERN_ALERT "Oops, got 8254 interrupt.\n"); + ArcRead(0, &c, 1, &cnt); + ArcEnterInteractiveMode(); +Index: linux-2.6-tip/arch/mips/sibyte/bcm1480/smp.c +=================================================================== +--- linux-2.6-tip.orig/arch/mips/sibyte/bcm1480/smp.c ++++ linux-2.6-tip/arch/mips/sibyte/bcm1480/smp.c +@@ -178,9 +178,10 @@ struct plat_smp_ops bcm1480_smp_ops = { + void bcm1480_mailbox_interrupt(void) + { + int cpu = smp_processor_id(); ++ int irq = K_BCM1480_INT_MBOX_0_0; + unsigned int action; + +- kstat_this_cpu.irqs[K_BCM1480_INT_MBOX_0_0]++; ++ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + /* Load the mailbox register to figure out what we're supposed to do */ + action = (__raw_readq(mailbox_0_regs[cpu]) >> 48) & 0xffff; + +Index: linux-2.6-tip/arch/mips/sibyte/sb1250/smp.c +=================================================================== +--- linux-2.6-tip.orig/arch/mips/sibyte/sb1250/smp.c ++++ linux-2.6-tip/arch/mips/sibyte/sb1250/smp.c +@@ -166,9 +166,10 @@ struct plat_smp_ops sb_smp_ops = { + void sb1250_mailbox_interrupt(void) + { + int cpu = smp_processor_id(); ++ int irq = K_INT_MBOX_0; + unsigned int 
action; + +- kstat_this_cpu.irqs[K_INT_MBOX_0]++; ++ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + /* Load the mailbox register to figure out what we're supposed to do */ + action = (____raw_readq(mailbox_regs[cpu]) >> 48) & 0xffff; + +Index: linux-2.6-tip/arch/mn10300/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/mn10300/kernel/irq.c ++++ linux-2.6-tip/arch/mn10300/kernel/irq.c +@@ -221,7 +221,7 @@ int show_interrupts(struct seq_file *p, + if (action) { + seq_printf(p, "%3d: ", i); + for_each_present_cpu(cpu) +- seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, cpu)); + seq_printf(p, " %14s.%u", irq_desc[i].chip->name, + (GxICR(i) & GxICR_LEVEL) >> + GxICR_LEVEL_SHIFT); +Index: linux-2.6-tip/arch/mn10300/kernel/mn10300-watchdog.c +=================================================================== +--- linux-2.6-tip.orig/arch/mn10300/kernel/mn10300-watchdog.c ++++ linux-2.6-tip/arch/mn10300/kernel/mn10300-watchdog.c +@@ -130,6 +130,7 @@ void watchdog_interrupt(struct pt_regs * + * the stack NMI-atomically, it's safe to use smp_processor_id(). + */ + int sum, cpu = smp_processor_id(); ++ int irq = NMIIRQ; + u8 wdt, tmp; + + wdt = WDCTR & ~WDCTR_WDCNE; +@@ -138,7 +139,7 @@ void watchdog_interrupt(struct pt_regs * + NMICR = NMICR_WDIF; + + nmi_count(cpu)++; +- kstat_this_cpu.irqs[NMIIRQ]++; ++ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); + sum = irq_stat[cpu].__irq_count; + + if (last_irq_sums[cpu] == sum) { +Index: linux-2.6-tip/arch/parisc/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/parisc/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/parisc/include/asm/pdc.h +=================================================================== +--- linux-2.6-tip.orig/arch/parisc/include/asm/pdc.h ++++ linux-2.6-tip/arch/parisc/include/asm/pdc.h +@@ -336,10 +336,11 @@ + #define NUM_PDC_RESULT 32 + + #if !defined(__ASSEMBLY__) +-#ifdef __KERNEL__ + + #include + ++#ifdef __KERNEL__ ++ + extern int pdc_type; + + /* Values for pdc_type */ +Index: linux-2.6-tip/arch/parisc/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/parisc/include/asm/swab.h ++++ linux-2.6-tip/arch/parisc/include/asm/swab.h +@@ -1,7 +1,7 @@ + #ifndef _PARISC_SWAB_H + #define _PARISC_SWAB_H + +-#include ++#include + #include + + #define __SWAB_64_THRU_32__ +Index: linux-2.6-tip/arch/parisc/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/parisc/kernel/irq.c ++++ linux-2.6-tip/arch/parisc/kernel/irq.c +@@ -138,7 +138,7 @@ static void cpu_set_affinity_irq(unsigne + if (cpu_dest < 0) + return; + +- cpumask_copy(&irq_desc[irq].affinity, &cpumask_of_cpu(cpu_dest)); ++ cpumask_copy(&irq_desc[irq].affinity, dest); + } + #endif + +@@ -185,7 +185,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%3d: ", i); + #ifdef CONFIG_SMP + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + #else + seq_printf(p, "%10u ", kstat_irqs(i)); + #endif +Index: linux-2.6-tip/arch/powerpc/include/asm/bootx.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/bootx.h ++++ linux-2.6-tip/arch/powerpc/include/asm/bootx.h +@@ -9,7 +9,7 @@ + #ifndef __ASM_BOOTX_H__ + 
#define __ASM_BOOTX_H__ + +-#include ++#include + + #ifdef macintosh + #include +Index: linux-2.6-tip/arch/powerpc/include/asm/elf.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/elf.h ++++ linux-2.6-tip/arch/powerpc/include/asm/elf.h +@@ -7,7 +7,7 @@ + #include + #endif + +-#include ++#include + #include + #include + #include +Index: linux-2.6-tip/arch/powerpc/include/asm/hw_irq.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/hw_irq.h ++++ linux-2.6-tip/arch/powerpc/include/asm/hw_irq.h +@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(un + */ + struct hw_interrupt_type; + ++#ifdef CONFIG_PERF_COUNTERS ++static inline unsigned long get_perf_counter_pending(void) ++{ ++ unsigned long x; ++ ++ asm volatile("lbz %0,%1(13)" ++ : "=r" (x) ++ : "i" (offsetof(struct paca_struct, perf_counter_pending))); ++ return x; ++} ++ ++static inline void set_perf_counter_pending(void) ++{ ++ asm volatile("stb %0,%1(13)" : : ++ "r" (1), ++ "i" (offsetof(struct paca_struct, perf_counter_pending))); ++} ++ ++static inline void clear_perf_counter_pending(void) ++{ ++ asm volatile("stb %0,%1(13)" : : ++ "r" (0), ++ "i" (offsetof(struct paca_struct, perf_counter_pending))); ++} ++ ++extern void perf_counter_do_pending(void); ++ ++#else ++ ++static inline unsigned long get_perf_counter_pending(void) ++{ ++ return 0; ++} ++ ++static inline void set_perf_counter_pending(void) {} ++static inline void clear_perf_counter_pending(void) {} ++static inline void perf_counter_do_pending(void) {} ++#endif /* CONFIG_PERF_COUNTERS */ ++ + #endif /* __KERNEL__ */ + #endif /* _ASM_POWERPC_HW_IRQ_H */ +Index: linux-2.6-tip/arch/powerpc/include/asm/kvm.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/kvm.h ++++ linux-2.6-tip/arch/powerpc/include/asm/kvm.h +@@ -20,7 +20,7 @@ + #ifndef __LINUX_KVM_POWERPC_H + #define __LINUX_KVM_POWERPC_H + +-#include ++#include + + struct kvm_regs { + __u64 pc; +Index: linux-2.6-tip/arch/powerpc/include/asm/mmzone.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/mmzone.h ++++ linux-2.6-tip/arch/powerpc/include/asm/mmzone.h +@@ -8,6 +8,7 @@ + #define _ASM_MMZONE_H_ + #ifdef __KERNEL__ + ++#include + + /* + * generic non-linear memory support: +Index: linux-2.6-tip/arch/powerpc/include/asm/paca.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/paca.h ++++ linux-2.6-tip/arch/powerpc/include/asm/paca.h +@@ -99,6 +99,7 @@ struct paca_struct { + u8 soft_enabled; /* irq soft-enable flag */ + u8 hard_enabled; /* set if irqs are enabled in MSR */ + u8 io_sync; /* writel() needs spin_unlock sync */ ++ u8 perf_counter_pending; /* PM interrupt while soft-disabled */ + + /* Stuff for accurate time accounting */ + u64 user_time; /* accumulated usermode TB ticks */ +Index: linux-2.6-tip/arch/powerpc/include/asm/perf_counter.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/powerpc/include/asm/perf_counter.h +@@ -0,0 +1,72 @@ ++/* ++ * Performance counter support - PowerPC-specific definitions. ++ * ++ * Copyright 2008-2009 Paul Mackerras, IBM Corporation. 
++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++#include ++ ++#define MAX_HWCOUNTERS 8 ++#define MAX_EVENT_ALTERNATIVES 8 ++ ++/* ++ * This struct provides the constants and functions needed to ++ * describe the PMU on a particular POWER-family CPU. ++ */ ++struct power_pmu { ++ int n_counter; ++ int max_alternatives; ++ u64 add_fields; ++ u64 test_adder; ++ int (*compute_mmcr)(unsigned int events[], int n_ev, ++ unsigned int hwc[], u64 mmcr[]); ++ int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp); ++ int (*get_alternatives)(unsigned int event, unsigned int alt[]); ++ void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); ++ int n_generic; ++ int *generic_events; ++}; ++ ++extern struct power_pmu *ppmu; ++ ++/* ++ * The power_pmu.get_constraint function returns a 64-bit value and ++ * a 64-bit mask that express the constraints between this event and ++ * other events. ++ * ++ * The value and mask are divided up into (non-overlapping) bitfields ++ * of three different types: ++ * ++ * Select field: this expresses the constraint that some set of bits ++ * in MMCR* needs to be set to a specific value for this event. For a ++ * select field, the mask contains 1s in every bit of the field, and ++ * the value contains a unique value for each possible setting of the ++ * MMCR* bits. The constraint checking code will ensure that two events ++ * that set the same field in their masks have the same value in their ++ * value dwords. ++ * ++ * Add field: this expresses the constraint that there can be at most ++ * N events in a particular class. A field of k bits can be used for ++ * N <= 2^(k-1) - 1. The mask has the most significant bit of the field ++ * set (and the other bits 0), and the value has only the least significant ++ * bit of the field set. In addition, the 'add_fields' and 'test_adder' ++ * in the struct power_pmu for this processor come into play. The ++ * add_fields value contains 1 in the LSB of the field, and the ++ * test_adder contains 2^(k-1) - 1 - N in the field. ++ * ++ * NAND field: this expresses the constraint that you may not have events ++ * in all of a set of classes. (For example, on PPC970, you can't select ++ * events from the FPU, ISU and IDU simultaneously, although any two are ++ * possible.) For N classes, the field is N+1 bits wide, and each class ++ * is assigned one bit from the least-significant N bits. The mask has ++ * only the most-significant bit set, and the value has only the bit ++ * for the event's class set. The test_adder has the least significant ++ * bit set in the field. ++ * ++ * If an event is not subject to the constraint expressed by a particular ++ * field, then it will have 0 in both the mask and value for that field. 
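++ *
++ * As a purely illustrative example of an add field: with a k = 3 bit
++ * field and N = 3, each event contributes value 0b001 and mask 0b100,
++ * add_fields has 0b001 in the field, and test_adder has
++ * 2^(k-1) - 1 - N = 0 there.  Three events sum to 0b011; a fourth
++ * carries into the mask bit and the constraint check rejects it.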
++ */ +Index: linux-2.6-tip/arch/powerpc/include/asm/ps3fb.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/ps3fb.h ++++ linux-2.6-tip/arch/powerpc/include/asm/ps3fb.h +@@ -19,6 +19,7 @@ + #ifndef _ASM_POWERPC_PS3FB_H_ + #define _ASM_POWERPC_PS3FB_H_ + ++#include + #include + + /* ioctl */ +Index: linux-2.6-tip/arch/powerpc/include/asm/spu_info.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/spu_info.h ++++ linux-2.6-tip/arch/powerpc/include/asm/spu_info.h +@@ -23,9 +23,10 @@ + #ifndef _SPU_INFO_H + #define _SPU_INFO_H + ++#include ++ + #ifdef __KERNEL__ + #include +-#include + #else + struct mfc_cq_sr { + __u64 mfc_cq_data0_RW; +Index: linux-2.6-tip/arch/powerpc/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/swab.h ++++ linux-2.6-tip/arch/powerpc/include/asm/swab.h +@@ -8,7 +8,7 @@ + * 2 of the License, or (at your option) any later version. + */ + +-#include ++#include + #include + + #ifdef __GNUC__ +Index: linux-2.6-tip/arch/powerpc/include/asm/systbl.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/systbl.h ++++ linux-2.6-tip/arch/powerpc/include/asm/systbl.h +@@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1) + SYSCALL_SPU(dup3) + SYSCALL_SPU(pipe2) + SYSCALL(inotify_init1) ++SYSCALL_SPU(perf_counter_open) +Index: linux-2.6-tip/arch/powerpc/include/asm/unistd.h +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/include/asm/unistd.h ++++ linux-2.6-tip/arch/powerpc/include/asm/unistd.h +@@ -341,10 +341,11 @@ + #define __NR_dup3 316 + #define __NR_pipe2 317 + #define __NR_inotify_init1 318 ++#define __NR_perf_counter_open 319 + + #ifdef __KERNEL__ + +-#define __NR_syscalls 319 ++#define __NR_syscalls 320 + + #define __NR__exit __NR_exit + #define NR_syscalls __NR_syscalls +Index: linux-2.6-tip/arch/powerpc/kernel/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/kernel/Makefile ++++ linux-2.6-tip/arch/powerpc/kernel/Makefile +@@ -94,6 +94,8 @@ obj-$(CONFIG_AUDIT) += audit.o + obj64-$(CONFIG_AUDIT) += compat_audit.o + + obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o ++obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \ ++ power5-pmu.o power5+-pmu.o power6-pmu.o + + obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o + +Index: linux-2.6-tip/arch/powerpc/kernel/asm-offsets.c +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/kernel/asm-offsets.c ++++ linux-2.6-tip/arch/powerpc/kernel/asm-offsets.c +@@ -131,6 +131,7 @@ int main(void) + DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); + DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); + DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); ++ DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending)); + DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); + DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); + DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); +Index: linux-2.6-tip/arch/powerpc/kernel/entry_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/kernel/entry_64.S ++++ 
linux-2.6-tip/arch/powerpc/kernel/entry_64.S +@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ + 2: + TRACE_AND_RESTORE_IRQ(r5); + ++#ifdef CONFIG_PERF_COUNTERS ++ /* check paca->perf_counter_pending if we're enabling ints */ ++ lbz r3,PACAPERFPEND(r13) ++ and. r3,r3,r5 ++ beq 27f ++ bl .perf_counter_do_pending ++27: ++#endif /* CONFIG_PERF_COUNTERS */ ++ + /* extract EE bit and use it to restore paca->hard_enabled */ + ld r3,_MSR(r1) + rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ +@@ -616,44 +625,52 @@ do_work: + bne restore + /* here we are preempting the current task */ + 1: ++ /* ++ * preempt_schedule_irq() expects interrupts disabled and returns ++ * with interrupts disabled. No need to check preemption again, ++ * preempt_schedule_irq just did that for us. ++ */ ++ bl .preempt_schedule_irq + #ifdef CONFIG_TRACE_IRQFLAGS + bl .trace_hardirqs_on ++#endif /* CONFIG_TRACE_IRQFLAGS */ ++ + /* Note: we just clobbered r10 which used to contain the previous + * MSR before the hard-disabling done by the caller of do_work. + * We don't have that value anymore, but it doesn't matter as + * we will hard-enable unconditionally, we can just reload the + * current MSR into r10 + */ ++ bl .preempt_schedule_irq + mfmsr r10 +-#endif /* CONFIG_TRACE_IRQFLAGS */ +- li r0,1 +- stb r0,PACASOFTIRQEN(r13) +- stb r0,PACAHARDIRQEN(r13) +- ori r10,r10,MSR_EE +- mtmsrd r10,1 /* reenable interrupts */ +- bl .preempt_schedule +- mfmsr r10 +- clrrdi r9,r1,THREAD_SHIFT +- rldicl r10,r10,48,1 /* disable interrupts again */ +- rotldi r10,r10,16 +- mtmsrd r10,1 +- ld r4,TI_FLAGS(r9) +- andi. r0,r4,_TIF_NEED_RESCHED +- bne 1b ++ clrrdi r9,r1,THREAD_SHIFT ++ rldicl r10,r10,48,1 /* disable interrupts again */ ++ rotldi r10,r10,16 ++ mtmsrd r10,1 ++ ld r4,TI_FLAGS(r9) ++ andi. r0,r4,(_TIF_NEED_RESCHED) ++ bne 1b + b restore + + user_work: + #endif +- /* Enable interrupts */ +- ori r10,r10,MSR_EE +- mtmsrd r10,1 +- + andi. r0,r4,_TIF_NEED_RESCHED + beq 1f +- bl .schedule ++ ++ /* preempt_schedule_irq() expects interrupts disabled. 
*/ ++ bl .preempt_schedule_irq + b .ret_from_except_lite + +-1: bl .save_nvgprs ++ /* here we are preempting the current task */ ++1: li r0,1 ++ stb r0,PACASOFTIRQEN(r13) ++ stb r0,PACAHARDIRQEN(r13) ++ ++ /* Enable interrupts */ ++ ori r10,r10,MSR_EE ++ mtmsrd r10,1 ++ ++ bl .save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl .do_signal + b .ret_from_except +Index: linux-2.6-tip/arch/powerpc/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/kernel/irq.c ++++ linux-2.6-tip/arch/powerpc/kernel/irq.c +@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsig + iseries_handle_interrupts(); + } + ++ if (get_perf_counter_pending()) { ++ clear_perf_counter_pending(); ++ perf_counter_do_pending(); ++ } ++ + /* + * if (get_paca()->hard_enabled) return; + * But again we need to take care that gcc gets hard_enabled directly +@@ -190,7 +195,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%3d: ", i); + #ifdef CONFIG_SMP + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + #else + seq_printf(p, "%10u ", kstat_irqs(i)); + #endif /* CONFIG_SMP */ +@@ -231,7 +236,7 @@ void fixup_irqs(cpumask_t map) + if (irq_desc[irq].status & IRQ_PER_CPU) + continue; + +- cpus_and(mask, irq_desc[irq].affinity, map); ++ cpumask_and(&mask, irq_desc[irq].affinity, &map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; +@@ -438,7 +443,7 @@ void do_softirq(void) + */ + + static LIST_HEAD(irq_hosts); +-static DEFINE_SPINLOCK(irq_big_lock); ++static DEFINE_RAW_SPINLOCK(irq_big_lock); + static unsigned int revmap_trees_allocated; + static DEFINE_MUTEX(revmap_trees_mutex); + struct irq_map_entry irq_map[NR_IRQS]; +Index: linux-2.6-tip/arch/powerpc/kernel/perf_counter.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/powerpc/kernel/perf_counter.c +@@ -0,0 +1,827 @@ ++/* ++ * Performance counter support - powerpc architecture code ++ * ++ * Copyright 2008-2009 Paul Mackerras, IBM Corporation. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct cpu_hw_counters { ++ int n_counters; ++ int n_percpu; ++ int disabled; ++ int n_added; ++ struct perf_counter *counter[MAX_HWCOUNTERS]; ++ unsigned int events[MAX_HWCOUNTERS]; ++ u64 mmcr[3]; ++ u8 pmcs_enabled; ++}; ++DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); ++ ++struct power_pmu *ppmu; ++ ++/* ++ * Normally, to ignore kernel events we set the FCS (freeze counters ++ * in supervisor mode) bit in MMCR0, but if the kernel runs with the ++ * hypervisor bit set in the MSR, or if we are running on a processor ++ * where the hypervisor bit is forced to 1 (as on Apple G5 processors), ++ * then we need to use the FCHV bit to ignore kernel events. ++ */ ++static unsigned int freeze_counters_kernel = MMCR0_FCS; ++ ++void perf_counter_print_debug(void) ++{ ++} ++ ++/* ++ * Read one performance monitor counter (PMC). 
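++ * idx is 1-based, matching SPRN_PMC1..SPRN_PMC8; an out-of-range idx
++ * logs an error and reads back as 0.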
++ */ ++static unsigned long read_pmc(int idx) ++{ ++ unsigned long val; ++ ++ switch (idx) { ++ case 1: ++ val = mfspr(SPRN_PMC1); ++ break; ++ case 2: ++ val = mfspr(SPRN_PMC2); ++ break; ++ case 3: ++ val = mfspr(SPRN_PMC3); ++ break; ++ case 4: ++ val = mfspr(SPRN_PMC4); ++ break; ++ case 5: ++ val = mfspr(SPRN_PMC5); ++ break; ++ case 6: ++ val = mfspr(SPRN_PMC6); ++ break; ++ case 7: ++ val = mfspr(SPRN_PMC7); ++ break; ++ case 8: ++ val = mfspr(SPRN_PMC8); ++ break; ++ default: ++ printk(KERN_ERR "oops trying to read PMC%d\n", idx); ++ val = 0; ++ } ++ return val; ++} ++ ++/* ++ * Write one PMC. ++ */ ++static void write_pmc(int idx, unsigned long val) ++{ ++ switch (idx) { ++ case 1: ++ mtspr(SPRN_PMC1, val); ++ break; ++ case 2: ++ mtspr(SPRN_PMC2, val); ++ break; ++ case 3: ++ mtspr(SPRN_PMC3, val); ++ break; ++ case 4: ++ mtspr(SPRN_PMC4, val); ++ break; ++ case 5: ++ mtspr(SPRN_PMC5, val); ++ break; ++ case 6: ++ mtspr(SPRN_PMC6, val); ++ break; ++ case 7: ++ mtspr(SPRN_PMC7, val); ++ break; ++ case 8: ++ mtspr(SPRN_PMC8, val); ++ break; ++ default: ++ printk(KERN_ERR "oops trying to write PMC%d\n", idx); ++ } ++} ++ ++/* ++ * Check if a set of events can all go on the PMU at once. ++ * If they can't, this will look at alternative codes for the events ++ * and see if any combination of alternative codes is feasible. ++ * The feasible set is returned in event[]. ++ */ ++static int power_check_constraints(unsigned int event[], int n_ev) ++{ ++ u64 mask, value, nv; ++ unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; ++ u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; ++ u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; ++ u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; ++ int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS]; ++ int i, j; ++ u64 addf = ppmu->add_fields; ++ u64 tadd = ppmu->test_adder; ++ ++ if (n_ev > ppmu->n_counter) ++ return -1; ++ ++ /* First see if the events will go on as-is */ ++ for (i = 0; i < n_ev; ++i) { ++ alternatives[i][0] = event[i]; ++ if (ppmu->get_constraint(event[i], &amasks[i][0], ++ &avalues[i][0])) ++ return -1; ++ choice[i] = 0; ++ } ++ value = mask = 0; ++ for (i = 0; i < n_ev; ++i) { ++ nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf); ++ if ((((nv + tadd) ^ value) & mask) != 0 || ++ (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0) ++ break; ++ value = nv; ++ mask |= amasks[i][0]; ++ } ++ if (i == n_ev) ++ return 0; /* all OK */ ++ ++ /* doesn't work, gather alternatives... */ ++ if (!ppmu->get_alternatives) ++ return -1; ++ for (i = 0; i < n_ev; ++i) { ++ n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]); ++ for (j = 1; j < n_alt[i]; ++j) ++ ppmu->get_constraint(alternatives[i][j], ++ &amasks[i][j], &avalues[i][j]); ++ } ++ ++ /* enumerate all possibilities and see if any will work */ ++ i = 0; ++ j = -1; ++ value = mask = nv = 0; ++ while (i < n_ev) { ++ if (j >= 0) { ++ /* we're backtracking, restore context */ ++ value = svalues[i]; ++ mask = smasks[i]; ++ j = choice[i]; ++ } ++ /* ++ * See if any alternative k for event i, ++ * where k > j, will satisfy the constraints. ++ */ ++ while (++j < n_alt[i]) { ++ nv = (value | avalues[i][j]) + ++ (value & avalues[i][j] & addf); ++ if ((((nv + tadd) ^ value) & mask) == 0 && ++ (((nv + tadd) ^ avalues[i][j]) ++ & amasks[i][j]) == 0) ++ break; ++ } ++ if (j >= n_alt[i]) { ++ /* ++ * No feasible alternative, backtrack ++ * to event i-1 and continue enumerating its ++ * alternatives from where we got up to. 
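++ * (Backing up past event 0 means no combination of alternatives can
++ * satisfy the constraints, so we return -1.)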
++ */ ++ if (--i < 0) ++ return -1; ++ } else { ++ /* ++ * Found a feasible alternative for event i, ++ * remember where we got up to with this event, ++ * go on to the next event, and start with ++ * the first alternative for it. ++ */ ++ choice[i] = j; ++ svalues[i] = value; ++ smasks[i] = mask; ++ value = nv; ++ mask |= amasks[i][j]; ++ ++i; ++ j = -1; ++ } ++ } ++ ++ /* OK, we have a feasible combination, tell the caller the solution */ ++ for (i = 0; i < n_ev; ++i) ++ event[i] = alternatives[i][choice[i]]; ++ return 0; ++} ++ ++/* ++ * Check if newly-added counters have consistent settings for ++ * exclude_{user,kernel,hv} with each other and any previously ++ * added counters. ++ */ ++static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) ++{ ++ int eu, ek, eh; ++ int i, n; ++ struct perf_counter *counter; ++ ++ n = n_prev + n_new; ++ if (n <= 1) ++ return 0; ++ ++ eu = ctrs[0]->hw_event.exclude_user; ++ ek = ctrs[0]->hw_event.exclude_kernel; ++ eh = ctrs[0]->hw_event.exclude_hv; ++ if (n_prev == 0) ++ n_prev = 1; ++ for (i = n_prev; i < n; ++i) { ++ counter = ctrs[i]; ++ if (counter->hw_event.exclude_user != eu || ++ counter->hw_event.exclude_kernel != ek || ++ counter->hw_event.exclude_hv != eh) ++ return -EAGAIN; ++ } ++ return 0; ++} ++ ++static void power_perf_read(struct perf_counter *counter) ++{ ++ long val, delta, prev; ++ ++ if (!counter->hw.idx) ++ return; ++ /* ++ * Performance monitor interrupts come even when interrupts ++ * are soft-disabled, as long as interrupts are hard-enabled. ++ * Therefore we treat them like NMIs. ++ */ ++ do { ++ prev = atomic64_read(&counter->hw.prev_count); ++ barrier(); ++ val = read_pmc(counter->hw.idx); ++ } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev); ++ ++ /* The counters are only 32 bits wide */ ++ delta = (val - prev) & 0xfffffffful; ++ atomic64_add(delta, &counter->count); ++ atomic64_sub(delta, &counter->hw.period_left); ++} ++ ++/* ++ * Disable all counters to prevent PMU interrupts and to allow ++ * counters to be added or removed. ++ */ ++u64 hw_perf_save_disable(void) ++{ ++ struct cpu_hw_counters *cpuhw; ++ unsigned long ret; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ cpuhw = &__get_cpu_var(cpu_hw_counters); ++ ++ ret = cpuhw->disabled; ++ if (!ret) { ++ cpuhw->disabled = 1; ++ cpuhw->n_added = 0; ++ ++ /* ++ * Check if we ever enabled the PMU on this cpu. ++ */ ++ if (!cpuhw->pmcs_enabled) { ++ if (ppc_md.enable_pmcs) ++ ppc_md.enable_pmcs(); ++ cpuhw->pmcs_enabled = 1; ++ } ++ ++ /* ++ * Set the 'freeze counters' bit. ++ * The barrier is to make sure the mtspr has been ++ * executed and the PMU has frozen the counters ++ * before we return. ++ */ ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); ++ mb(); ++ } ++ local_irq_restore(flags); ++ return ret; ++} ++ ++/* ++ * Re-enable all counters if disable == 0. ++ * If we were previously disabled and counters were added, then ++ * put the new config on the PMU. ++ */ ++void hw_perf_restore(u64 disable) ++{ ++ struct perf_counter *counter; ++ struct cpu_hw_counters *cpuhw; ++ unsigned long flags; ++ long i; ++ unsigned long val; ++ s64 left; ++ unsigned int hwc_index[MAX_HWCOUNTERS]; ++ ++ if (disable) ++ return; ++ local_irq_save(flags); ++ cpuhw = &__get_cpu_var(cpu_hw_counters); ++ cpuhw->disabled = 0; ++ ++ /* ++ * If we didn't change anything, or only removed counters, ++ * no need to recalculate MMCR* settings and reset the PMCs. 
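++ * (cpuhw->n_added counts the counters added since the PMU was
++ * disabled, so zero here means there is nothing new to program.)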
++ * Just reenable the PMU with the current MMCR* settings ++ * (possibly updated for removal of counters). ++ */ ++ if (!cpuhw->n_added) { ++ mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); ++ mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); ++ mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); ++ if (cpuhw->n_counters == 0) ++ get_lppaca()->pmcregs_in_use = 0; ++ goto out; ++ } ++ ++ /* ++ * Compute MMCR* values for the new set of counters ++ */ ++ if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index, ++ cpuhw->mmcr)) { ++ /* shouldn't ever get here */ ++ printk(KERN_ERR "oops compute_mmcr failed\n"); ++ goto out; ++ } ++ ++ /* ++ * Add in MMCR0 freeze bits corresponding to the ++ * hw_event.exclude_* bits for the first counter. ++ * We have already checked that all counters have the ++ * same values for these bits as the first counter. ++ */ ++ counter = cpuhw->counter[0]; ++ if (counter->hw_event.exclude_user) ++ cpuhw->mmcr[0] |= MMCR0_FCP; ++ if (counter->hw_event.exclude_kernel) ++ cpuhw->mmcr[0] |= freeze_counters_kernel; ++ if (counter->hw_event.exclude_hv) ++ cpuhw->mmcr[0] |= MMCR0_FCHV; ++ ++ /* ++ * Write the new configuration to MMCR* with the freeze ++ * bit set and set the hardware counters to their initial values. ++ * Then unfreeze the counters. ++ */ ++ get_lppaca()->pmcregs_in_use = 1; ++ mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); ++ mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); ++ mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) ++ | MMCR0_FC); ++ ++ /* ++ * Read off any pre-existing counters that need to move ++ * to another PMC. ++ */ ++ for (i = 0; i < cpuhw->n_counters; ++i) { ++ counter = cpuhw->counter[i]; ++ if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { ++ power_perf_read(counter); ++ write_pmc(counter->hw.idx, 0); ++ counter->hw.idx = 0; ++ } ++ } ++ ++ /* ++ * Initialize the PMCs for all the new and moved counters. ++ */ ++ for (i = 0; i < cpuhw->n_counters; ++i) { ++ counter = cpuhw->counter[i]; ++ if (counter->hw.idx) ++ continue; ++ val = 0; ++ if (counter->hw_event.irq_period) { ++ left = atomic64_read(&counter->hw.period_left); ++ if (left < 0x80000000L) ++ val = 0x80000000L - left; ++ } ++ atomic64_set(&counter->hw.prev_count, val); ++ counter->hw.idx = hwc_index[i] + 1; ++ write_pmc(counter->hw.idx, val); ++ perf_counter_update_userpage(counter); ++ } ++ mb(); ++ cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; ++ mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); ++ ++ out: ++ local_irq_restore(flags); ++} ++ ++static int collect_events(struct perf_counter *group, int max_count, ++ struct perf_counter *ctrs[], unsigned int *events) ++{ ++ int n = 0; ++ struct perf_counter *counter; ++ ++ if (!is_software_counter(group)) { ++ if (n >= max_count) ++ return -1; ++ ctrs[n] = group; ++ events[n++] = group->hw.config; ++ } ++ list_for_each_entry(counter, &group->sibling_list, list_entry) { ++ if (!is_software_counter(counter) && ++ counter->state != PERF_COUNTER_STATE_OFF) { ++ if (n >= max_count) ++ return -1; ++ ctrs[n] = counter; ++ events[n++] = counter->hw.config; ++ } ++ } ++ return n; ++} ++ ++static void counter_sched_in(struct perf_counter *counter, int cpu) ++{ ++ counter->state = PERF_COUNTER_STATE_ACTIVE; ++ counter->oncpu = cpu; ++ counter->tstamp_running += counter->ctx->time_now - ++ counter->tstamp_stopped; ++ if (is_software_counter(counter)) ++ counter->hw_ops->enable(counter); ++} ++ ++/* ++ * Called to enable a whole group of counters. ++ * Returns 1 if the group was enabled, or -EAGAIN if it could not be. 
++ * Assumes the caller has disabled interrupts and has ++ * frozen the PMU with hw_perf_save_disable. ++ */ ++int hw_perf_group_sched_in(struct perf_counter *group_leader, ++ struct perf_cpu_context *cpuctx, ++ struct perf_counter_context *ctx, int cpu) ++{ ++ struct cpu_hw_counters *cpuhw; ++ long i, n, n0; ++ struct perf_counter *sub; ++ ++ cpuhw = &__get_cpu_var(cpu_hw_counters); ++ n0 = cpuhw->n_counters; ++ n = collect_events(group_leader, ppmu->n_counter - n0, ++ &cpuhw->counter[n0], &cpuhw->events[n0]); ++ if (n < 0) ++ return -EAGAIN; ++ if (check_excludes(cpuhw->counter, n0, n)) ++ return -EAGAIN; ++ if (power_check_constraints(cpuhw->events, n + n0)) ++ return -EAGAIN; ++ cpuhw->n_counters = n0 + n; ++ cpuhw->n_added += n; ++ ++ /* ++ * OK, this group can go on; update counter states etc., ++ * and enable any software counters ++ */ ++ for (i = n0; i < n0 + n; ++i) ++ cpuhw->counter[i]->hw.config = cpuhw->events[i]; ++ cpuctx->active_oncpu += n; ++ n = 1; ++ counter_sched_in(group_leader, cpu); ++ list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { ++ if (sub->state != PERF_COUNTER_STATE_OFF) { ++ counter_sched_in(sub, cpu); ++ ++n; ++ } ++ } ++ ctx->nr_active += n; ++ ++ return 1; ++} ++ ++/* ++ * Add a counter to the PMU. ++ * If all counters are not already frozen, then we disable and ++ * re-enable the PMU in order to get hw_perf_restore to do the ++ * actual work of reconfiguring the PMU. ++ */ ++static int power_perf_enable(struct perf_counter *counter) ++{ ++ struct cpu_hw_counters *cpuhw; ++ unsigned long flags; ++ u64 pmudis; ++ int n0; ++ int ret = -EAGAIN; ++ ++ local_irq_save(flags); ++ pmudis = hw_perf_save_disable(); ++ ++ /* ++ * Add the counter to the list (if there is room) ++ * and check whether the total set is still feasible. ++ */ ++ cpuhw = &__get_cpu_var(cpu_hw_counters); ++ n0 = cpuhw->n_counters; ++ if (n0 >= ppmu->n_counter) ++ goto out; ++ cpuhw->counter[n0] = counter; ++ cpuhw->events[n0] = counter->hw.config; ++ if (check_excludes(cpuhw->counter, n0, 1)) ++ goto out; ++ if (power_check_constraints(cpuhw->events, n0 + 1)) ++ goto out; ++ ++ counter->hw.config = cpuhw->events[n0]; ++ ++cpuhw->n_counters; ++ ++cpuhw->n_added; ++ ++ ret = 0; ++ out: ++ hw_perf_restore(pmudis); ++ local_irq_restore(flags); ++ return ret; ++} ++ ++/* ++ * Remove a counter from the PMU. 
++ */ ++static void power_perf_disable(struct perf_counter *counter) ++{ ++ struct cpu_hw_counters *cpuhw; ++ long i; ++ u64 pmudis; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ pmudis = hw_perf_save_disable(); ++ ++ power_perf_read(counter); ++ ++ cpuhw = &__get_cpu_var(cpu_hw_counters); ++ for (i = 0; i < cpuhw->n_counters; ++i) { ++ if (counter == cpuhw->counter[i]) { ++ while (++i < cpuhw->n_counters) ++ cpuhw->counter[i-1] = cpuhw->counter[i]; ++ --cpuhw->n_counters; ++ ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); ++ write_pmc(counter->hw.idx, 0); ++ counter->hw.idx = 0; ++ perf_counter_update_userpage(counter); ++ break; ++ } ++ } ++ if (cpuhw->n_counters == 0) { ++ /* disable exceptions if no counters are running */ ++ cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); ++ } ++ ++ hw_perf_restore(pmudis); ++ local_irq_restore(flags); ++} ++ ++struct hw_perf_counter_ops power_perf_ops = { ++ .enable = power_perf_enable, ++ .disable = power_perf_disable, ++ .read = power_perf_read ++}; ++ ++const struct hw_perf_counter_ops * ++hw_perf_counter_init(struct perf_counter *counter) ++{ ++ unsigned long ev; ++ struct perf_counter *ctrs[MAX_HWCOUNTERS]; ++ unsigned int events[MAX_HWCOUNTERS]; ++ int n; ++ ++ if (!ppmu) ++ return NULL; ++ if ((s64)counter->hw_event.irq_period < 0) ++ return NULL; ++ if (!perf_event_raw(&counter->hw_event)) { ++ ev = perf_event_id(&counter->hw_event); ++ if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) ++ return NULL; ++ ev = ppmu->generic_events[ev]; ++ } else { ++ ev = perf_event_config(&counter->hw_event); ++ } ++ counter->hw.config_base = ev; ++ counter->hw.idx = 0; ++ ++ /* ++ * If we are not running on a hypervisor, force the ++ * exclude_hv bit to 0 so that we don't care what ++ * the user set it to. ++ */ ++ if (!firmware_has_feature(FW_FEATURE_LPAR)) ++ counter->hw_event.exclude_hv = 0; ++ ++ /* ++ * If this is in a group, check if it can go on with all the ++ * other hardware counters in the group. We assume the counter ++ * hasn't been linked into its leader's sibling list at this point. ++ */ ++ n = 0; ++ if (counter->group_leader != counter) { ++ n = collect_events(counter->group_leader, ppmu->n_counter - 1, ++ ctrs, events); ++ if (n < 0) ++ return NULL; ++ } ++ events[n] = ev; ++ ctrs[n] = counter; ++ if (check_excludes(ctrs, n, 1)) ++ return NULL; ++ if (power_check_constraints(events, n + 1)) ++ return NULL; ++ ++ counter->hw.config = events[n]; ++ atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); ++ return &power_perf_ops; ++} ++ ++/* ++ * Handle wakeups. ++ */ ++void perf_counter_do_pending(void) ++{ ++ int i; ++ struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); ++ struct perf_counter *counter; ++ ++ for (i = 0; i < cpuhw->n_counters; ++i) { ++ counter = cpuhw->counter[i]; ++ if (counter && counter->wakeup_pending) { ++ counter->wakeup_pending = 0; ++ wake_up(&counter->waitq); ++ } ++ } ++} ++ ++/* ++ * A counter has overflowed; update its count and record ++ * things if requested. Note that interrupts are hard-disabled ++ * here so there is no possibility of being interrupted. 
++ */ ++static void record_and_restart(struct perf_counter *counter, long val, ++ struct pt_regs *regs) ++{ ++ s64 prev, delta, left; ++ int record = 0; ++ ++ /* we don't have to worry about interrupts here */ ++ prev = atomic64_read(&counter->hw.prev_count); ++ delta = (val - prev) & 0xfffffffful; ++ atomic64_add(delta, &counter->count); ++ ++ /* ++ * See if the total period for this counter has expired, ++ * and update for the next period. ++ */ ++ val = 0; ++ left = atomic64_read(&counter->hw.period_left) - delta; ++ if (counter->hw_event.irq_period) { ++ if (left <= 0) { ++ left += counter->hw_event.irq_period; ++ if (left <= 0) ++ left = counter->hw_event.irq_period; ++ record = 1; ++ } ++ if (left < 0x80000000L) ++ val = 0x80000000L - left; ++ } ++ write_pmc(counter->hw.idx, val); ++ atomic64_set(&counter->hw.prev_count, val); ++ atomic64_set(&counter->hw.period_left, left); ++ perf_counter_update_userpage(counter); ++ ++ /* ++ * Finally record data if requested. ++ */ ++ if (record) ++ perf_counter_output(counter, 1, regs); ++} ++ ++/* ++ * Performance monitor interrupt stuff ++ */ ++static void perf_counter_interrupt(struct pt_regs *regs) ++{ ++ int i; ++ struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); ++ struct perf_counter *counter; ++ long val; ++ int need_wakeup = 0, found = 0; ++ ++ for (i = 0; i < cpuhw->n_counters; ++i) { ++ counter = cpuhw->counter[i]; ++ val = read_pmc(counter->hw.idx); ++ if ((int)val < 0) { ++ /* counter has overflowed */ ++ found = 1; ++ record_and_restart(counter, val, regs); ++ } ++ } ++ ++ /* ++ * In case we didn't find and reset the counter that caused ++ * the interrupt, scan all counters and reset any that are ++ * negative, to avoid getting continual interrupts. ++ * Any that we processed in the previous loop will not be negative. ++ */ ++ if (!found) { ++ for (i = 0; i < ppmu->n_counter; ++i) { ++ val = read_pmc(i + 1); ++ if ((int)val < 0) ++ write_pmc(i + 1, 0); ++ } ++ } ++ ++ /* ++ * Reset MMCR0 to its normal value. This will set PMXE and ++ * clear FC (freeze counters) and PMAO (perf mon alert occurred) ++ * and thus allow interrupts to occur again. ++ * XXX might want to use MSR.PM to keep the counters frozen until ++ * we get back out of this interrupt. ++ */ ++ mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); ++ ++ /* ++ * If we need a wakeup, check whether interrupts were soft-enabled ++ * when we took the interrupt. If they were, we can wake stuff up ++ * immediately; otherwise we'll have do the wakeup when interrupts ++ * get soft-enabled. 
++ */ ++ if (get_perf_counter_pending() && regs->softe) { ++ irq_enter(); ++ clear_perf_counter_pending(); ++ perf_counter_do_pending(); ++ irq_exit(); ++ } ++} ++ ++void hw_perf_counter_setup(int cpu) ++{ ++ struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); ++ ++ memset(cpuhw, 0, sizeof(*cpuhw)); ++ cpuhw->mmcr[0] = MMCR0_FC; ++} ++ ++extern struct power_pmu power4_pmu; ++extern struct power_pmu ppc970_pmu; ++extern struct power_pmu power5_pmu; ++extern struct power_pmu power5p_pmu; ++extern struct power_pmu power6_pmu; ++ ++static int init_perf_counters(void) ++{ ++ unsigned long pvr; ++ ++ if (reserve_pmc_hardware(perf_counter_interrupt)) { ++ printk(KERN_ERR "Couldn't init performance monitor subsystem\n"); ++ return -EBUSY; ++ } ++ ++ /* XXX should get this from cputable */ ++ pvr = mfspr(SPRN_PVR); ++ switch (PVR_VER(pvr)) { ++ case PV_POWER4: ++ case PV_POWER4p: ++ ppmu = &power4_pmu; ++ break; ++ case PV_970: ++ case PV_970FX: ++ case PV_970MP: ++ ppmu = &ppc970_pmu; ++ break; ++ case PV_POWER5: ++ ppmu = &power5_pmu; ++ break; ++ case PV_POWER5p: ++ ppmu = &power5p_pmu; ++ break; ++ case 0x3e: ++ ppmu = &power6_pmu; ++ break; ++ } ++ ++ /* ++ * Use FCHV to ignore kernel events if MSR.HV is set. ++ */ ++ if (mfmsr() & MSR_HV) ++ freeze_counters_kernel = MMCR0_FCHV; ++ ++ return 0; ++} ++ ++arch_initcall(init_perf_counters); +Index: linux-2.6-tip/arch/powerpc/kernel/power4-pmu.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/powerpc/kernel/power4-pmu.c +@@ -0,0 +1,557 @@ ++/* ++ * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors. ++ * ++ * Copyright 2009 Paul Mackerras, IBM Corporation. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++#include ++#include ++#include ++ ++/* ++ * Bits in event code for POWER4 ++ */ ++#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ ++#define PM_PMC_MSK 0xf ++#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ ++#define PM_UNIT_MSK 0xf ++#define PM_LOWER_SH 6 ++#define PM_LOWER_MSK 1 ++#define PM_LOWER_MSKS 0x40 ++#define PM_BYTE_SH 4 /* Byte number of event bus to use */ ++#define PM_BYTE_MSK 3 ++#define PM_PMCSEL_MSK 7 ++ ++/* ++ * Unit code values ++ */ ++#define PM_FPU 1 ++#define PM_ISU1 2 ++#define PM_IFU 3 ++#define PM_IDU0 4 ++#define PM_ISU1_ALT 6 ++#define PM_ISU2 7 ++#define PM_IFU_ALT 8 ++#define PM_LSU0 9 ++#define PM_LSU1 0xc ++#define PM_GPS 0xf ++ ++/* ++ * Bits in MMCR0 for POWER4 ++ */ ++#define MMCR0_PMC1SEL_SH 8 ++#define MMCR0_PMC2SEL_SH 1 ++#define MMCR_PMCSEL_MSK 0x1f ++ ++/* ++ * Bits in MMCR1 for POWER4 ++ */ ++#define MMCR1_TTM0SEL_SH 62 ++#define MMCR1_TTC0SEL_SH 61 ++#define MMCR1_TTM1SEL_SH 59 ++#define MMCR1_TTC1SEL_SH 58 ++#define MMCR1_TTM2SEL_SH 56 ++#define MMCR1_TTC2SEL_SH 55 ++#define MMCR1_TTM3SEL_SH 53 ++#define MMCR1_TTC3SEL_SH 52 ++#define MMCR1_TTMSEL_MSK 3 ++#define MMCR1_TD_CP_DBG0SEL_SH 50 ++#define MMCR1_TD_CP_DBG1SEL_SH 48 ++#define MMCR1_TD_CP_DBG2SEL_SH 46 ++#define MMCR1_TD_CP_DBG3SEL_SH 44 ++#define MMCR1_DEBUG0SEL_SH 43 ++#define MMCR1_DEBUG1SEL_SH 42 ++#define MMCR1_DEBUG2SEL_SH 41 ++#define MMCR1_DEBUG3SEL_SH 40 ++#define MMCR1_PMC1_ADDER_SEL_SH 39 ++#define MMCR1_PMC2_ADDER_SEL_SH 38 ++#define MMCR1_PMC6_ADDER_SEL_SH 37 ++#define MMCR1_PMC5_ADDER_SEL_SH 36 ++#define MMCR1_PMC8_ADDER_SEL_SH 35 ++#define MMCR1_PMC7_ADDER_SEL_SH 34 ++#define MMCR1_PMC3_ADDER_SEL_SH 33 ++#define MMCR1_PMC4_ADDER_SEL_SH 32 ++#define MMCR1_PMC3SEL_SH 27 ++#define MMCR1_PMC4SEL_SH 22 ++#define MMCR1_PMC5SEL_SH 17 ++#define MMCR1_PMC6SEL_SH 12 ++#define MMCR1_PMC7SEL_SH 7 ++#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */ ++ ++static short mmcr1_adder_bits[8] = { ++ MMCR1_PMC1_ADDER_SEL_SH, ++ MMCR1_PMC2_ADDER_SEL_SH, ++ MMCR1_PMC3_ADDER_SEL_SH, ++ MMCR1_PMC4_ADDER_SEL_SH, ++ MMCR1_PMC5_ADDER_SEL_SH, ++ MMCR1_PMC6_ADDER_SEL_SH, ++ MMCR1_PMC7_ADDER_SEL_SH, ++ MMCR1_PMC8_ADDER_SEL_SH ++}; ++ ++/* ++ * Bits in MMCRA ++ */ ++#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */ ++ ++/* ++ * Layout of constraint bits: ++ * 6666555555555544444444443333333333222222222211111111110000000000 ++ * 3210987654321098765432109876543210987654321098765432109876543210 ++ * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><> ++ * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 ++ * \SMPL ||\TTC3SEL ++ * |\TTC_IFU_SEL ++ * \TTM2SEL0 ++ * ++ * SMPL - SAMPLE_ENABLE constraint ++ * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000 ++ * ++ * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2 ++ * 55: UC1 error 0x0080_0000_0000_0000 ++ * 54: FPU events needed 0x0040_0000_0000_0000 ++ * 53: ISU1 events needed 0x0020_0000_0000_0000 ++ * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000 ++ * ++ * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0 ++ * 51: UC2 error 0x0008_0000_0000_0000 ++ * 50: FPU events needed 0x0004_0000_0000_0000 ++ * 49: IFU events needed 0x0002_0000_0000_0000 ++ * 48: LSU0 events needed 0x0001_0000_0000_0000 ++ * ++ * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1 ++ * 47: UC3 error 0x8000_0000_0000 ++ * 46: LSU0 events needed 0x4000_0000_0000 ++ * 45: IFU events needed 0x2000_0000_0000 ++ * 44: IDU0|ISU2 events needed 0x1000_0000_0000 ++ * 43: ISU1 events 
needed 0x0800_0000_0000 ++ * ++ * TTM2SEL0 ++ * 42: 0 = IDU0 events needed ++ * 1 = ISU2 events needed 0x0400_0000_0000 ++ * ++ * TTC_IFU_SEL ++ * 41: 0 = IFU.U events needed ++ * 1 = IFU.L events needed 0x0200_0000_0000 ++ * ++ * TTC3SEL ++ * 40: 0 = LSU1.U events needed ++ * 1 = LSU1.L events needed 0x0100_0000_0000 ++ * ++ * PS1 ++ * 39: PS1 error 0x0080_0000_0000 ++ * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 ++ * ++ * PS2 ++ * 35: PS2 error 0x0008_0000_0000 ++ * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 ++ * ++ * B0 ++ * 28-31: Byte 0 event source 0xf000_0000 ++ * 1 = FPU ++ * 2 = ISU1 ++ * 3 = IFU ++ * 4 = IDU0 ++ * 7 = ISU2 ++ * 9 = LSU0 ++ * c = LSU1 ++ * f = GPS ++ * ++ * B1, B2, B3 ++ * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources ++ * ++ * P8 ++ * 15: P8 error 0x8000 ++ * 14-15: Count of events needing PMC8 ++ * ++ * P1..P7 ++ * 0-13: Count of events needing PMC1..PMC7 ++ * ++ * Note: this doesn't allow events using IFU.U to be combined with events ++ * using IFU.L, though that is feasible (using TTM0 and TTM2). However ++ * there are no listed events for IFU.L (they are debug events not ++ * verified for performance monitoring) so this shouldn't cause a ++ * problem. ++ */ ++ ++static struct unitinfo { ++ u64 value, mask; ++ int unit; ++ int lowerbit; ++} p4_unitinfo[16] = { ++ [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 }, ++ [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, ++ [PM_ISU1_ALT] = ++ { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, ++ [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, ++ [PM_IFU_ALT] = ++ { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, ++ [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 }, ++ [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 }, ++ [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 }, ++ [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 }, ++ [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 } ++}; ++ ++static unsigned char direct_marked_event[8] = { ++ (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ ++ (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ ++ (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */ ++ (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ ++ (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */ ++ (1<<3) | (1<<4) | (1<<5), ++ /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ ++ (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ ++ (1<<4), /* PMC8: PM_MRK_LSU_FIN */ ++}; ++ ++/* ++ * Returns 1 if event counts things relating to marked instructions ++ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. ++ */ ++static int p4_marked_instr_event(unsigned int event) ++{ ++ int pmc, psel, unit, byte, bit; ++ unsigned int mask; ++ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ psel = event & PM_PMCSEL_MSK; ++ if (pmc) { ++ if (direct_marked_event[pmc - 1] & (1 << psel)) ++ return 1; ++ if (psel == 0) /* add events */ ++ bit = (pmc <= 4)? 
pmc - 1: 8 - pmc; ++ else if (psel == 6) /* decode events */ ++ bit = 4; ++ else ++ return 0; ++ } else ++ bit = psel; ++ ++ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; ++ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; ++ mask = 0; ++ switch (unit) { ++ case PM_LSU1: ++ if (event & PM_LOWER_MSKS) ++ mask = 1 << 28; /* byte 7 bit 4 */ ++ else ++ mask = 6 << 24; /* byte 3 bits 1 and 2 */ ++ break; ++ case PM_LSU0: ++ /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */ ++ mask = 0x083dff00; ++ } ++ return (mask >> (byte * 8 + bit)) & 1; ++} ++ ++static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp) ++{ ++ int pmc, byte, unit, lower, sh; ++ u64 mask = 0, value = 0; ++ int grp = -1; ++ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc > 8) ++ return -1; ++ sh = (pmc - 1) * 2; ++ mask |= 2 << sh; ++ value |= 1 << sh; ++ grp = ((pmc - 1) >> 1) & 1; ++ } ++ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; ++ if (unit) { ++ lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK; ++ ++ /* ++ * Bus events on bytes 0 and 2 can be counted ++ * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. ++ */ ++ if (!pmc) ++ grp = byte & 1; ++ ++ if (!p4_unitinfo[unit].unit) ++ return -1; ++ mask |= p4_unitinfo[unit].mask; ++ value |= p4_unitinfo[unit].value; ++ sh = p4_unitinfo[unit].lowerbit; ++ if (sh > 1) ++ value |= (u64)lower << sh; ++ else if (lower != sh) ++ return -1; ++ unit = p4_unitinfo[unit].unit; ++ ++ /* Set byte lane select field */ ++ mask |= 0xfULL << (28 - 4 * byte); ++ value |= (u64)unit << (28 - 4 * byte); ++ } ++ if (grp == 0) { ++ /* increment PMC1/2/5/6 field */ ++ mask |= 0x8000000000ull; ++ value |= 0x1000000000ull; ++ } else { ++ /* increment PMC3/4/7/8 field */ ++ mask |= 0x800000000ull; ++ value |= 0x100000000ull; ++ } ++ ++ /* Marked instruction events need sample_enable set */ ++ if (p4_marked_instr_event(event)) { ++ mask |= 1ull << 56; ++ value |= 1ull << 56; ++ } ++ ++ /* PMCSEL=6 decode events on byte 2 need sample_enable clear */ ++ if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2) ++ mask |= 1ull << 56; ++ ++ *maskp = mask; ++ *valp = value; ++ return 0; ++} ++ ++static unsigned int ppc_inst_cmpl[] = { ++ 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 ++}; ++ ++static int p4_get_alternatives(unsigned int event, unsigned int alt[]) ++{ ++ int i, j, na; ++ ++ alt[0] = event; ++ na = 1; ++ ++ /* 2 possibilities for PM_GRP_DISP_REJECT */ ++ if (event == 0x8003 || event == 0x0224) { ++ alt[1] = event ^ (0x8003 ^ 0x0224); ++ return 2; ++ } ++ ++ /* 2 possibilities for PM_ST_MISS_L1 */ ++ if (event == 0x0c13 || event == 0x0c23) { ++ alt[1] = event ^ (0x0c13 ^ 0x0c23); ++ return 2; ++ } ++ ++ /* several possibilities for PM_INST_CMPL */ ++ for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) { ++ if (event == ppc_inst_cmpl[i]) { ++ for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j) ++ if (j != i) ++ alt[na++] = ppc_inst_cmpl[j]; ++ break; ++ } ++ } ++ ++ return na; ++} ++ ++static int p4_compute_mmcr(unsigned int event[], int n_ev, ++ unsigned int hwc[], u64 mmcr[]) ++{ ++ u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; ++ unsigned int pmc, unit, byte, psel, lower; ++ unsigned int ttm, grp; ++ unsigned int pmc_inuse = 0; ++ unsigned int pmc_grp_use[2]; ++ unsigned char busbyte[4]; ++ unsigned char unituse[16]; ++ unsigned int unitlower = 0; ++ int i; ++ ++ if (n_ev > 8) ++ return -1; ++ ++ /* First pass to count resource use */ ++ pmc_grp_use[0] = pmc_grp_use[1] = 0; ++ memset(busbyte, 0, sizeof(busbyte)); ++ memset(unituse, 0, sizeof(unituse)); ++ 
for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc_inuse & (1 << (pmc - 1))) ++ return -1; ++ pmc_inuse |= 1 << (pmc - 1); ++ /* count 1/2/5/6 vs 3/4/7/8 use */ ++ ++pmc_grp_use[((pmc - 1) >> 1) & 1]; ++ } ++ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; ++ lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK; ++ if (unit) { ++ if (!pmc) ++ ++pmc_grp_use[byte & 1]; ++ if (unit == 6 || unit == 8) ++ /* map alt ISU1/IFU codes: 6->2, 8->3 */ ++ unit = (unit >> 1) - 1; ++ if (busbyte[byte] && busbyte[byte] != unit) ++ return -1; ++ busbyte[byte] = unit; ++ lower <<= unit; ++ if (unituse[unit] && lower != (unitlower & lower)) ++ return -1; ++ unituse[unit] = 1; ++ unitlower |= lower; ++ } ++ } ++ if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) ++ return -1; ++ ++ /* ++ * Assign resources and set multiplexer selects. ++ * ++ * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2. ++ * Each TTMx can only select one unit, but since ++ * units 2 and 6 are both ISU1, and 3 and 8 are both IFU, ++ * we have some choices. ++ */ ++ if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) { ++ unituse[6] = 1; /* Move 2 to 6 */ ++ unituse[2] = 0; ++ } ++ if (unituse[3] & (unituse[1] | unituse[2])) { ++ unituse[8] = 1; /* Move 3 to 8 */ ++ unituse[3] = 0; ++ unitlower = (unitlower & ~8) | ((unitlower & 8) << 5); ++ } ++ /* Check only one unit per TTMx */ ++ if (unituse[1] + unituse[2] + unituse[3] > 1 || ++ unituse[4] + unituse[6] + unituse[7] > 1 || ++ unituse[8] + unituse[9] > 1 || ++ (unituse[5] | unituse[10] | unituse[11] | ++ unituse[13] | unituse[14])) ++ return -1; ++ ++ /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */ ++ mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH; ++ mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH; ++ mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH; ++ ++ /* Set TTCxSEL fields. */ ++ if (unitlower & 0xe) ++ mmcr1 |= 1ull << MMCR1_TTC0SEL_SH; ++ if (unitlower & 0xf0) ++ mmcr1 |= 1ull << MMCR1_TTC1SEL_SH; ++ if (unitlower & 0xf00) ++ mmcr1 |= 1ull << MMCR1_TTC2SEL_SH; ++ if (unitlower & 0x7000) ++ mmcr1 |= 1ull << MMCR1_TTC3SEL_SH; ++ ++ /* Set byte lane select fields. 
*/ ++ for (byte = 0; byte < 4; ++byte) { ++ unit = busbyte[byte]; ++ if (!unit) ++ continue; ++ if (unit == 0xf) { ++ /* special case for GPS */ ++ mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte); ++ } else { ++ if (!unituse[unit]) ++ ttm = unit - 1; /* 2->1, 3->2 */ ++ else ++ ttm = unit >> 2; ++ mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte); ++ } ++ } ++ ++ /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ ++ for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; ++ psel = event[i] & PM_PMCSEL_MSK; ++ if (!pmc) { ++ /* Bus event or 00xxx direct event (off or cycles) */ ++ if (unit) ++ psel |= 0x10 | ((byte & 2) << 2); ++ for (pmc = 0; pmc < 8; ++pmc) { ++ if (pmc_inuse & (1 << pmc)) ++ continue; ++ grp = (pmc >> 1) & 1; ++ if (unit) { ++ if (grp == (byte & 1)) ++ break; ++ } else if (pmc_grp_use[grp] < 4) { ++ ++pmc_grp_use[grp]; ++ break; ++ } ++ } ++ pmc_inuse |= 1 << pmc; ++ } else { ++ /* Direct event */ ++ --pmc; ++ if (psel == 0 && (byte & 2)) ++ /* add events on higher-numbered bus */ ++ mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; ++ else if (psel == 6 && byte == 3) ++ /* seem to need to set sample_enable here */ ++ mmcra |= MMCRA_SAMPLE_ENABLE; ++ psel |= 8; ++ } ++ if (pmc <= 1) ++ mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc); ++ else ++ mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); ++ if (pmc == 7) /* PMC8 */ ++ mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH; ++ hwc[i] = pmc; ++ if (p4_marked_instr_event(event[i])) ++ mmcra |= MMCRA_SAMPLE_ENABLE; ++ } ++ ++ if (pmc_inuse & 1) ++ mmcr0 |= MMCR0_PMC1CE; ++ if (pmc_inuse & 0xfe) ++ mmcr0 |= MMCR0_PMCjCE; ++ ++ mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ ++ ++ /* Return MMCRx values */ ++ mmcr[0] = mmcr0; ++ mmcr[1] = mmcr1; ++ mmcr[2] = mmcra; ++ return 0; ++} ++ ++static void p4_disable_pmc(unsigned int pmc, u64 mmcr[]) ++{ ++ /* ++ * Setting the PMCxSEL field to 0 disables PMC x. ++ * (Note that pmc is 0-based here, not 1-based.) ++ */ ++ if (pmc <= 1) { ++ mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc)); ++ } else { ++ mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2))); ++ if (pmc == 7) ++ mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH); ++ } ++} ++ ++static int p4_generic_events[] = { ++ [PERF_COUNT_CPU_CYCLES] = 7, ++ [PERF_COUNT_INSTRUCTIONS] = 0x1001, ++ [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ ++ [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ ++ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ ++ [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ ++}; ++ ++struct power_pmu power4_pmu = { ++ .n_counter = 8, ++ .max_alternatives = 5, ++ .add_fields = 0x0000001100005555ull, ++ .test_adder = 0x0011083300000000ull, ++ .compute_mmcr = p4_compute_mmcr, ++ .get_constraint = p4_get_constraint, ++ .get_alternatives = p4_get_alternatives, ++ .disable_pmc = p4_disable_pmc, ++ .n_generic = ARRAY_SIZE(p4_generic_events), ++ .generic_events = p4_generic_events, ++}; +Index: linux-2.6-tip/arch/powerpc/kernel/power5+-pmu.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/powerpc/kernel/power5+-pmu.c +@@ -0,0 +1,452 @@ ++/* ++ * Performance counter support for POWER5 (not POWER5++) processors. ++ * ++ * Copyright 2009 Paul Mackerras, IBM Corporation. 
++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++#include ++#include ++#include ++ ++/* ++ * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3) ++ */ ++#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ ++#define PM_PMC_MSK 0xf ++#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) ++#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ ++#define PM_UNIT_MSK 0xf ++#define PM_BYTE_SH 12 /* Byte number of event bus to use */ ++#define PM_BYTE_MSK 7 ++#define PM_GRS_SH 8 /* Storage subsystem mux select */ ++#define PM_GRS_MSK 7 ++#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ ++#define PM_PMCSEL_MSK 0x7f ++ ++/* Values in PM_UNIT field */ ++#define PM_FPU 0 ++#define PM_ISU0 1 ++#define PM_IFU 2 ++#define PM_ISU1 3 ++#define PM_IDU 4 ++#define PM_ISU0_ALT 6 ++#define PM_GRS 7 ++#define PM_LSU0 8 ++#define PM_LSU1 0xc ++#define PM_LASTUNIT 0xc ++ ++/* ++ * Bits in MMCR1 for POWER5+ ++ */ ++#define MMCR1_TTM0SEL_SH 62 ++#define MMCR1_TTM1SEL_SH 60 ++#define MMCR1_TTM2SEL_SH 58 ++#define MMCR1_TTM3SEL_SH 56 ++#define MMCR1_TTMSEL_MSK 3 ++#define MMCR1_TD_CP_DBG0SEL_SH 54 ++#define MMCR1_TD_CP_DBG1SEL_SH 52 ++#define MMCR1_TD_CP_DBG2SEL_SH 50 ++#define MMCR1_TD_CP_DBG3SEL_SH 48 ++#define MMCR1_GRS_L2SEL_SH 46 ++#define MMCR1_GRS_L2SEL_MSK 3 ++#define MMCR1_GRS_L3SEL_SH 44 ++#define MMCR1_GRS_L3SEL_MSK 3 ++#define MMCR1_GRS_MCSEL_SH 41 ++#define MMCR1_GRS_MCSEL_MSK 7 ++#define MMCR1_GRS_FABSEL_SH 39 ++#define MMCR1_GRS_FABSEL_MSK 3 ++#define MMCR1_PMC1_ADDER_SEL_SH 35 ++#define MMCR1_PMC2_ADDER_SEL_SH 34 ++#define MMCR1_PMC3_ADDER_SEL_SH 33 ++#define MMCR1_PMC4_ADDER_SEL_SH 32 ++#define MMCR1_PMC1SEL_SH 25 ++#define MMCR1_PMC2SEL_SH 17 ++#define MMCR1_PMC3SEL_SH 9 ++#define MMCR1_PMC4SEL_SH 1 ++#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) ++#define MMCR1_PMCSEL_MSK 0x7f ++ ++/* ++ * Bits in MMCRA ++ */ ++ ++/* ++ * Layout of constraint bits: ++ * 6666555555555544444444443333333333222222222211111111110000000000 ++ * 3210987654321098765432109876543210987654321098765432109876543210 ++ * [ ><><>< ><> <><>[ > < >< >< >< ><><><><> ++ * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P4P3P2P1 ++ * ++ * NC - number of counters ++ * 51: NC error 0x0008_0000_0000_0000 ++ * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 ++ * ++ * G0..G3 - GRS mux constraints ++ * 46-47: GRS_L2SEL value ++ * 44-45: GRS_L3SEL value ++ * 41-44: GRS_MCSEL value ++ * 39-40: GRS_FABSEL value ++ * Note that these match up with their bit positions in MMCR1 ++ * ++ * T0 - TTM0 constraint ++ * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000 ++ * ++ * T1 - TTM1 constraint ++ * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000 ++ * ++ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS ++ * 33: UC3 error 0x02_0000_0000 ++ * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000 ++ * 31: ISU0 events needed 0x01_8000_0000 ++ * 30: IDU|GRS events needed 0x00_4000_0000 ++ * ++ * B0 ++ * 20-23: Byte 0 event source 0x00f0_0000 ++ * Encoding as for the event code ++ * ++ * B1, B2, B3 ++ * 16-19, 12-15, 8-11: Byte 1, 2, 3 event sources ++ * ++ * P4 ++ * 7: P1 error 0x80 ++ * 6-7: Count of events needing PMC4 ++ * ++ * P1..P3 ++ * 0-6: Count of events needing PMC1..PMC3 ++ */ ++ ++static const int grsel_shift[8] = { ++ 
MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, ++ MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, ++ MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH ++}; ++ ++/* Masks and values for using events from the various units */ ++static u64 unit_cons[PM_LASTUNIT+1][2] = { ++ [PM_FPU] = { 0x3200000000ull, 0x0100000000ull }, ++ [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull }, ++ [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull }, ++ [PM_IFU] = { 0x3200000000ull, 0x2100000000ull }, ++ [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull }, ++ [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull }, ++}; ++ ++static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) ++{ ++ int pmc, byte, unit, sh; ++ int bit, fmask; ++ u64 mask = 0, value = 0; ++ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc > 4) ++ return -1; ++ sh = (pmc - 1) * 2; ++ mask |= 2 << sh; ++ value |= 1 << sh; ++ } ++ if (event & PM_BUSEVENT_MSK) { ++ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; ++ if (unit > PM_LASTUNIT) ++ return -1; ++ if (unit == PM_ISU0_ALT) ++ unit = PM_ISU0; ++ mask |= unit_cons[unit][0]; ++ value |= unit_cons[unit][1]; ++ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; ++ if (byte >= 4) { ++ if (unit != PM_LSU1) ++ return -1; ++ /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ ++ ++unit; ++ byte &= 3; ++ } ++ if (unit == PM_GRS) { ++ bit = event & 7; ++ fmask = (bit == 6)? 7: 3; ++ sh = grsel_shift[bit]; ++ mask |= (u64)fmask << sh; ++ value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; ++ } ++ /* Set byte lane select field */ ++ mask |= 0xfULL << (20 - 4 * byte); ++ value |= (u64)unit << (20 - 4 * byte); ++ } ++ mask |= 0x8000000000000ull; ++ value |= 0x1000000000000ull; ++ *maskp = mask; ++ *valp = value; ++ return 0; ++} ++ ++#define MAX_ALT 3 /* at most 3 alternatives for any event */ ++ ++static const unsigned int event_alternatives[][MAX_ALT] = { ++ { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */ ++ { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ ++ { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */ ++ { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */ ++ { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ ++ { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */ ++ { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */ ++ { 0x100009, 0x200009 }, /* PM_INST_CMPL */ ++ { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */ ++ { 0x300009, 0x400009 }, /* PM_INST_DISP */ ++}; ++ ++/* ++ * Scan the alternatives table for a match and return the ++ * index into the alternatives table if found, else -1. ++ */ ++static int find_alternative(unsigned int event) ++{ ++ int i, j; ++ ++ for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { ++ if (event < event_alternatives[i][0]) ++ break; ++ for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) ++ if (event == event_alternatives[i][j]) ++ return i; ++ } ++ return -1; ++} ++ ++static const unsigned char bytedecode_alternatives[4][4] = { ++ /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, ++ /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, ++ /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, ++ /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } ++}; ++ ++/* ++ * Some direct events for decodes of event bus byte 3 have alternative ++ * PMCSEL values on other counters. This returns the alternative ++ * event code for those that do, or -1 otherwise. This also handles ++ * alternative PCMSEL values for add events. 
++ */ ++static int find_alternative_bdecode(unsigned int event) ++{ ++ int pmc, altpmc, pp, j; ++ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc == 0 || pmc > 4) ++ return -1; ++ altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ ++ pp = event & PM_PMCSEL_MSK; ++ for (j = 0; j < 4; ++j) { ++ if (bytedecode_alternatives[pmc - 1][j] == pp) { ++ return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | ++ (altpmc << PM_PMC_SH) | ++ bytedecode_alternatives[altpmc - 1][j]; ++ } ++ } ++ ++ /* new decode alternatives for power5+ */ ++ if (pmc == 1 && (pp == 0x0d || pp == 0x0e)) ++ return event + (2 << PM_PMC_SH) + (0x2e - 0x0d); ++ if (pmc == 3 && (pp == 0x2e || pp == 0x2f)) ++ return event - (2 << PM_PMC_SH) - (0x2e - 0x0d); ++ ++ /* alternative add event encodings */ ++ if (pp == 0x10 || pp == 0x28) ++ return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) | ++ (altpmc << PM_PMC_SH); ++ ++ return -1; ++} ++ ++static int power5p_get_alternatives(unsigned int event, unsigned int alt[]) ++{ ++ int i, j, ae, nalt = 1; ++ ++ alt[0] = event; ++ nalt = 1; ++ i = find_alternative(event); ++ if (i >= 0) { ++ for (j = 0; j < MAX_ALT; ++j) { ++ ae = event_alternatives[i][j]; ++ if (ae && ae != event) ++ alt[nalt++] = ae; ++ } ++ } else { ++ ae = find_alternative_bdecode(event); ++ if (ae > 0) ++ alt[nalt++] = ae; ++ } ++ return nalt; ++} ++ ++static int power5p_compute_mmcr(unsigned int event[], int n_ev, ++ unsigned int hwc[], u64 mmcr[]) ++{ ++ u64 mmcr1 = 0; ++ unsigned int pmc, unit, byte, psel; ++ unsigned int ttm; ++ int i, isbus, bit, grsel; ++ unsigned int pmc_inuse = 0; ++ unsigned char busbyte[4]; ++ unsigned char unituse[16]; ++ int ttmuse; ++ ++ if (n_ev > 4) ++ return -1; ++ ++ /* First pass to count resource use */ ++ memset(busbyte, 0, sizeof(busbyte)); ++ memset(unituse, 0, sizeof(unituse)); ++ for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc > 4) ++ return -1; ++ if (pmc_inuse & (1 << (pmc - 1))) ++ return -1; ++ pmc_inuse |= 1 << (pmc - 1); ++ } ++ if (event[i] & PM_BUSEVENT_MSK) { ++ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; ++ if (unit > PM_LASTUNIT) ++ return -1; ++ if (unit == PM_ISU0_ALT) ++ unit = PM_ISU0; ++ if (byte >= 4) { ++ if (unit != PM_LSU1) ++ return -1; ++ ++unit; ++ byte &= 3; ++ } ++ if (busbyte[byte] && busbyte[byte] != unit) ++ return -1; ++ busbyte[byte] = unit; ++ unituse[unit] = 1; ++ } ++ } ++ ++ /* ++ * Assign resources and set multiplexer selects. ++ * ++ * PM_ISU0 can go either on TTM0 or TTM1, but that's the only ++ * choice we have to deal with. ++ */ ++ if (unituse[PM_ISU0] & ++ (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { ++ unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ ++ unituse[PM_ISU0] = 0; ++ } ++ /* Set TTM[01]SEL fields. */ ++ ttmuse = 0; ++ for (i = PM_FPU; i <= PM_ISU1; ++i) { ++ if (!unituse[i]) ++ continue; ++ if (ttmuse++) ++ return -1; ++ mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; ++ } ++ ttmuse = 0; ++ for (; i <= PM_GRS; ++i) { ++ if (!unituse[i]) ++ continue; ++ if (ttmuse++) ++ return -1; ++ mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; ++ } ++ if (ttmuse > 1) ++ return -1; ++ ++ /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. 
*/ ++ for (byte = 0; byte < 4; ++byte) { ++ unit = busbyte[byte]; ++ if (!unit) ++ continue; ++ if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { ++ /* get ISU0 through TTM1 rather than TTM0 */ ++ unit = PM_ISU0_ALT; ++ } else if (unit == PM_LSU1 + 1) { ++ /* select lower word of LSU1 for this byte */ ++ mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); ++ } ++ ttm = unit >> 2; ++ mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); ++ } ++ ++ /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ ++ for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; ++ psel = event[i] & PM_PMCSEL_MSK; ++ isbus = event[i] & PM_BUSEVENT_MSK; ++ if (!pmc) { ++ /* Bus event or any-PMC direct event */ ++ for (pmc = 0; pmc < 4; ++pmc) { ++ if (!(pmc_inuse & (1 << pmc))) ++ break; ++ } ++ if (pmc >= 4) ++ return -1; ++ pmc_inuse |= 1 << pmc; ++ } else { ++ /* Direct event */ ++ --pmc; ++ if (isbus && (byte & 2) && ++ (psel == 8 || psel == 0x10 || psel == 0x28)) ++ /* add events on higher-numbered bus */ ++ mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); ++ } ++ if (isbus && unit == PM_GRS) { ++ bit = psel & 7; ++ grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; ++ mmcr1 |= (u64)grsel << grsel_shift[bit]; ++ } ++ if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1)) ++ /* select alternate byte lane */ ++ psel |= 0x10; ++ if (pmc <= 3) ++ mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); ++ hwc[i] = pmc; ++ } ++ ++ /* Return MMCRx values */ ++ mmcr[0] = 0; ++ if (pmc_inuse & 1) ++ mmcr[0] = MMCR0_PMC1CE; ++ if (pmc_inuse & 0x3e) ++ mmcr[0] |= MMCR0_PMCjCE; ++ mmcr[1] = mmcr1; ++ mmcr[2] = 0; ++ return 0; ++} ++ ++static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[]) ++{ ++ if (pmc <= 3) ++ mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); ++} ++ ++static int power5p_generic_events[] = { ++ [PERF_COUNT_CPU_CYCLES] = 0xf, ++ [PERF_COUNT_INSTRUCTIONS] = 0x100009, ++ [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ ++ [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ ++ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ ++ [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ ++}; ++ ++struct power_pmu power5p_pmu = { ++ .n_counter = 4, ++ .max_alternatives = MAX_ALT, ++ .add_fields = 0x7000000000055ull, ++ .test_adder = 0x3000040000000ull, ++ .compute_mmcr = power5p_compute_mmcr, ++ .get_constraint = power5p_get_constraint, ++ .get_alternatives = power5p_get_alternatives, ++ .disable_pmc = power5p_disable_pmc, ++ .n_generic = ARRAY_SIZE(power5p_generic_events), ++ .generic_events = power5p_generic_events, ++}; +Index: linux-2.6-tip/arch/powerpc/kernel/power5-pmu.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/powerpc/kernel/power5-pmu.c +@@ -0,0 +1,475 @@ ++/* ++ * Performance counter support for POWER5 (not POWER5++) processors. ++ * ++ * Copyright 2009 Paul Mackerras, IBM Corporation. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++#include ++#include ++#include ++ ++/* ++ * Bits in event code for POWER5 (not POWER5++) ++ */ ++#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ ++#define PM_PMC_MSK 0xf ++#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) ++#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ ++#define PM_UNIT_MSK 0xf ++#define PM_BYTE_SH 12 /* Byte number of event bus to use */ ++#define PM_BYTE_MSK 7 ++#define PM_GRS_SH 8 /* Storage subsystem mux select */ ++#define PM_GRS_MSK 7 ++#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ ++#define PM_PMCSEL_MSK 0x7f ++ ++/* Values in PM_UNIT field */ ++#define PM_FPU 0 ++#define PM_ISU0 1 ++#define PM_IFU 2 ++#define PM_ISU1 3 ++#define PM_IDU 4 ++#define PM_ISU0_ALT 6 ++#define PM_GRS 7 ++#define PM_LSU0 8 ++#define PM_LSU1 0xc ++#define PM_LASTUNIT 0xc ++ ++/* ++ * Bits in MMCR1 for POWER5 ++ */ ++#define MMCR1_TTM0SEL_SH 62 ++#define MMCR1_TTM1SEL_SH 60 ++#define MMCR1_TTM2SEL_SH 58 ++#define MMCR1_TTM3SEL_SH 56 ++#define MMCR1_TTMSEL_MSK 3 ++#define MMCR1_TD_CP_DBG0SEL_SH 54 ++#define MMCR1_TD_CP_DBG1SEL_SH 52 ++#define MMCR1_TD_CP_DBG2SEL_SH 50 ++#define MMCR1_TD_CP_DBG3SEL_SH 48 ++#define MMCR1_GRS_L2SEL_SH 46 ++#define MMCR1_GRS_L2SEL_MSK 3 ++#define MMCR1_GRS_L3SEL_SH 44 ++#define MMCR1_GRS_L3SEL_MSK 3 ++#define MMCR1_GRS_MCSEL_SH 41 ++#define MMCR1_GRS_MCSEL_MSK 7 ++#define MMCR1_GRS_FABSEL_SH 39 ++#define MMCR1_GRS_FABSEL_MSK 3 ++#define MMCR1_PMC1_ADDER_SEL_SH 35 ++#define MMCR1_PMC2_ADDER_SEL_SH 34 ++#define MMCR1_PMC3_ADDER_SEL_SH 33 ++#define MMCR1_PMC4_ADDER_SEL_SH 32 ++#define MMCR1_PMC1SEL_SH 25 ++#define MMCR1_PMC2SEL_SH 17 ++#define MMCR1_PMC3SEL_SH 9 ++#define MMCR1_PMC4SEL_SH 1 ++#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) ++#define MMCR1_PMCSEL_MSK 0x7f ++ ++/* ++ * Bits in MMCRA ++ */ ++ ++/* ++ * Layout of constraint bits: ++ * 6666555555555544444444443333333333222222222211111111110000000000 ++ * 3210987654321098765432109876543210987654321098765432109876543210 ++ * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><> ++ * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1 ++ * ++ * T0 - TTM0 constraint ++ * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000 ++ * ++ * T1 - TTM1 constraint ++ * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000 ++ * ++ * NC - number of counters ++ * 51: NC error 0x0008_0000_0000_0000 ++ * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 ++ * ++ * G0..G3 - GRS mux constraints ++ * 46-47: GRS_L2SEL value ++ * 44-45: GRS_L3SEL value ++ * 41-44: GRS_MCSEL value ++ * 39-40: GRS_FABSEL value ++ * Note that these match up with their bit positions in MMCR1 ++ * ++ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS ++ * 37: UC3 error 0x20_0000_0000 ++ * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000 ++ * 35: ISU0 events needed 0x08_0000_0000 ++ * 34: IDU|GRS events needed 0x04_0000_0000 ++ * ++ * PS1 ++ * 33: PS1 error 0x2_0000_0000 ++ * 31-32: count of events needing PMC1/2 0x1_8000_0000 ++ * ++ * PS2 ++ * 30: PS2 error 0x4000_0000 ++ * 28-29: count of events needing PMC3/4 0x3000_0000 ++ * ++ * B0 ++ * 24-27: Byte 0 event source 0x0f00_0000 ++ * Encoding as for the event code ++ * ++ * B1, B2, B3 ++ * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources ++ * ++ * P1..P6 ++ * 0-11: Count of events needing PMC1..PMC6 ++ */ ++ ++static const int grsel_shift[8] = { ++ MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, ++ MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, ++ 
MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH ++}; ++ ++/* Masks and values for using events from the various units */ ++static u64 unit_cons[PM_LASTUNIT+1][2] = { ++ [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull }, ++ [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull }, ++ [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull }, ++ [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull }, ++ [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull }, ++ [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull }, ++}; ++ ++static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp) ++{ ++ int pmc, byte, unit, sh; ++ int bit, fmask; ++ u64 mask = 0, value = 0; ++ int grp = -1; ++ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc > 6) ++ return -1; ++ sh = (pmc - 1) * 2; ++ mask |= 2 << sh; ++ value |= 1 << sh; ++ if (pmc <= 4) ++ grp = (pmc - 1) >> 1; ++ else if (event != 0x500009 && event != 0x600005) ++ return -1; ++ } ++ if (event & PM_BUSEVENT_MSK) { ++ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; ++ if (unit > PM_LASTUNIT) ++ return -1; ++ if (unit == PM_ISU0_ALT) ++ unit = PM_ISU0; ++ mask |= unit_cons[unit][0]; ++ value |= unit_cons[unit][1]; ++ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; ++ if (byte >= 4) { ++ if (unit != PM_LSU1) ++ return -1; ++ /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ ++ ++unit; ++ byte &= 3; ++ } ++ if (unit == PM_GRS) { ++ bit = event & 7; ++ fmask = (bit == 6)? 7: 3; ++ sh = grsel_shift[bit]; ++ mask |= (u64)fmask << sh; ++ value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; ++ } ++ /* ++ * Bus events on bytes 0 and 2 can be counted ++ * on PMC1/2; bytes 1 and 3 on PMC3/4. ++ */ ++ if (!pmc) ++ grp = byte & 1; ++ /* Set byte lane select field */ ++ mask |= 0xfULL << (24 - 4 * byte); ++ value |= (u64)unit << (24 - 4 * byte); ++ } ++ if (grp == 0) { ++ /* increment PMC1/2 field */ ++ mask |= 0x200000000ull; ++ value |= 0x080000000ull; ++ } else if (grp == 1) { ++ /* increment PMC3/4 field */ ++ mask |= 0x40000000ull; ++ value |= 0x10000000ull; ++ } ++ if (pmc < 5) { ++ /* need a counter from PMC1-4 set */ ++ mask |= 0x8000000000000ull; ++ value |= 0x1000000000000ull; ++ } ++ *maskp = mask; ++ *valp = value; ++ return 0; ++} ++ ++#define MAX_ALT 3 /* at most 3 alternatives for any event */ ++ ++static const unsigned int event_alternatives[][MAX_ALT] = { ++ { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ ++ { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ ++ { 0x100005, 0x600005 }, /* PM_RUN_CYC */ ++ { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */ ++ { 0x300009, 0x400009 }, /* PM_INST_DISP */ ++}; ++ ++/* ++ * Scan the alternatives table for a match and return the ++ * index into the alternatives table if found, else -1. ++ */ ++static int find_alternative(unsigned int event) ++{ ++ int i, j; ++ ++ for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { ++ if (event < event_alternatives[i][0]) ++ break; ++ for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) ++ if (event == event_alternatives[i][j]) ++ return i; ++ } ++ return -1; ++} ++ ++static const unsigned char bytedecode_alternatives[4][4] = { ++ /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, ++ /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, ++ /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, ++ /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } ++}; ++ ++/* ++ * Some direct events for decodes of event bus byte 3 have alternative ++ * PMCSEL values on other counters. This returns the alternative ++ * event code for those that do, or -1 otherwise. 
++ */ ++static int find_alternative_bdecode(unsigned int event) ++{ ++ int pmc, altpmc, pp, j; ++ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc == 0 || pmc > 4) ++ return -1; ++ altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ ++ pp = event & PM_PMCSEL_MSK; ++ for (j = 0; j < 4; ++j) { ++ if (bytedecode_alternatives[pmc - 1][j] == pp) { ++ return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | ++ (altpmc << PM_PMC_SH) | ++ bytedecode_alternatives[altpmc - 1][j]; ++ } ++ } ++ return -1; ++} ++ ++static int power5_get_alternatives(unsigned int event, unsigned int alt[]) ++{ ++ int i, j, ae, nalt = 1; ++ ++ alt[0] = event; ++ nalt = 1; ++ i = find_alternative(event); ++ if (i >= 0) { ++ for (j = 0; j < MAX_ALT; ++j) { ++ ae = event_alternatives[i][j]; ++ if (ae && ae != event) ++ alt[nalt++] = ae; ++ } ++ } else { ++ ae = find_alternative_bdecode(event); ++ if (ae > 0) ++ alt[nalt++] = ae; ++ } ++ return nalt; ++} ++ ++static int power5_compute_mmcr(unsigned int event[], int n_ev, ++ unsigned int hwc[], u64 mmcr[]) ++{ ++ u64 mmcr1 = 0; ++ unsigned int pmc, unit, byte, psel; ++ unsigned int ttm, grp; ++ int i, isbus, bit, grsel; ++ unsigned int pmc_inuse = 0; ++ unsigned int pmc_grp_use[2]; ++ unsigned char busbyte[4]; ++ unsigned char unituse[16]; ++ int ttmuse; ++ ++ if (n_ev > 6) ++ return -1; ++ ++ /* First pass to count resource use */ ++ pmc_grp_use[0] = pmc_grp_use[1] = 0; ++ memset(busbyte, 0, sizeof(busbyte)); ++ memset(unituse, 0, sizeof(unituse)); ++ for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc > 6) ++ return -1; ++ if (pmc_inuse & (1 << (pmc - 1))) ++ return -1; ++ pmc_inuse |= 1 << (pmc - 1); ++ /* count 1/2 vs 3/4 use */ ++ if (pmc <= 4) ++ ++pmc_grp_use[(pmc - 1) >> 1]; ++ } ++ if (event[i] & PM_BUSEVENT_MSK) { ++ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; ++ if (unit > PM_LASTUNIT) ++ return -1; ++ if (unit == PM_ISU0_ALT) ++ unit = PM_ISU0; ++ if (byte >= 4) { ++ if (unit != PM_LSU1) ++ return -1; ++ ++unit; ++ byte &= 3; ++ } ++ if (!pmc) ++ ++pmc_grp_use[byte & 1]; ++ if (busbyte[byte] && busbyte[byte] != unit) ++ return -1; ++ busbyte[byte] = unit; ++ unituse[unit] = 1; ++ } ++ } ++ if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2) ++ return -1; ++ ++ /* ++ * Assign resources and set multiplexer selects. ++ * ++ * PM_ISU0 can go either on TTM0 or TTM1, but that's the only ++ * choice we have to deal with. ++ */ ++ if (unituse[PM_ISU0] & ++ (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { ++ unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ ++ unituse[PM_ISU0] = 0; ++ } ++ /* Set TTM[01]SEL fields. */ ++ ttmuse = 0; ++ for (i = PM_FPU; i <= PM_ISU1; ++i) { ++ if (!unituse[i]) ++ continue; ++ if (ttmuse++) ++ return -1; ++ mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; ++ } ++ ttmuse = 0; ++ for (; i <= PM_GRS; ++i) { ++ if (!unituse[i]) ++ continue; ++ if (ttmuse++) ++ return -1; ++ mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; ++ } ++ if (ttmuse > 1) ++ return -1; ++ ++ /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. 
*/ ++ for (byte = 0; byte < 4; ++byte) { ++ unit = busbyte[byte]; ++ if (!unit) ++ continue; ++ if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { ++ /* get ISU0 through TTM1 rather than TTM0 */ ++ unit = PM_ISU0_ALT; ++ } else if (unit == PM_LSU1 + 1) { ++ /* select lower word of LSU1 for this byte */ ++ mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); ++ } ++ ttm = unit >> 2; ++ mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); ++ } ++ ++ /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ ++ for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; ++ psel = event[i] & PM_PMCSEL_MSK; ++ isbus = event[i] & PM_BUSEVENT_MSK; ++ if (!pmc) { ++ /* Bus event or any-PMC direct event */ ++ for (pmc = 0; pmc < 4; ++pmc) { ++ if (pmc_inuse & (1 << pmc)) ++ continue; ++ grp = (pmc >> 1) & 1; ++ if (isbus) { ++ if (grp == (byte & 1)) ++ break; ++ } else if (pmc_grp_use[grp] < 2) { ++ ++pmc_grp_use[grp]; ++ break; ++ } ++ } ++ pmc_inuse |= 1 << pmc; ++ } else if (pmc <= 4) { ++ /* Direct event */ ++ --pmc; ++ if ((psel == 8 || psel == 0x10) && isbus && (byte & 2)) ++ /* add events on higher-numbered bus */ ++ mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); ++ } else { ++ /* Instructions or run cycles on PMC5/6 */ ++ --pmc; ++ } ++ if (isbus && unit == PM_GRS) { ++ bit = psel & 7; ++ grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; ++ mmcr1 |= (u64)grsel << grsel_shift[bit]; ++ } ++ if (pmc <= 3) ++ mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); ++ hwc[i] = pmc; ++ } ++ ++ /* Return MMCRx values */ ++ mmcr[0] = 0; ++ if (pmc_inuse & 1) ++ mmcr[0] = MMCR0_PMC1CE; ++ if (pmc_inuse & 0x3e) ++ mmcr[0] |= MMCR0_PMCjCE; ++ mmcr[1] = mmcr1; ++ mmcr[2] = 0; ++ return 0; ++} ++ ++static void power5_disable_pmc(unsigned int pmc, u64 mmcr[]) ++{ ++ if (pmc <= 3) ++ mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); ++} ++ ++static int power5_generic_events[] = { ++ [PERF_COUNT_CPU_CYCLES] = 0xf, ++ [PERF_COUNT_INSTRUCTIONS] = 0x100009, ++ [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ ++ [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ ++ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ ++ [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ ++}; ++ ++struct power_pmu power5_pmu = { ++ .n_counter = 6, ++ .max_alternatives = MAX_ALT, ++ .add_fields = 0x7000090000555ull, ++ .test_adder = 0x3000490000000ull, ++ .compute_mmcr = power5_compute_mmcr, ++ .get_constraint = power5_get_constraint, ++ .get_alternatives = power5_get_alternatives, ++ .disable_pmc = power5_disable_pmc, ++ .n_generic = ARRAY_SIZE(power5_generic_events), ++ .generic_events = power5_generic_events, ++}; +Index: linux-2.6-tip/arch/powerpc/kernel/power6-pmu.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/powerpc/kernel/power6-pmu.c +@@ -0,0 +1,283 @@ ++/* ++ * Performance counter support for POWER6 processors. ++ * ++ * Copyright 2008-2009 Paul Mackerras, IBM Corporation. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++#include ++#include ++#include ++ ++/* ++ * Bits in event code for POWER6 ++ */ ++#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ ++#define PM_PMC_MSK 0x7 ++#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) ++#define PM_UNIT_SH 16 /* Unit event comes (TTMxSEL encoding) */ ++#define PM_UNIT_MSK 0xf ++#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH) ++#define PM_LLAV 0x8000 /* Load lookahead match value */ ++#define PM_LLA 0x4000 /* Load lookahead match enable */ ++#define PM_BYTE_SH 12 /* Byte of event bus to use */ ++#define PM_BYTE_MSK 3 ++#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */ ++#define PM_SUBUNIT_MSK 7 ++#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH) ++#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */ ++#define PM_BUSEVENT_MSK 0xf3700 ++ ++/* ++ * Bits in MMCR1 for POWER6 ++ */ ++#define MMCR1_TTM0SEL_SH 60 ++#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4) ++#define MMCR1_TTMSEL_MSK 0xf ++#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK) ++#define MMCR1_NESTSEL_SH 45 ++#define MMCR1_NESTSEL_MSK 0x7 ++#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK) ++#define MMCR1_PMC1_LLA ((u64)1 << 44) ++#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39) ++#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35) ++#define MMCR1_PMC1SEL_SH 24 ++#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) ++#define MMCR1_PMCSEL_MSK 0xff ++ ++/* ++ * Assign PMC numbers and compute MMCR1 value for a set of events ++ */ ++static int p6_compute_mmcr(unsigned int event[], int n_ev, ++ unsigned int hwc[], u64 mmcr[]) ++{ ++ u64 mmcr1 = 0; ++ int i; ++ unsigned int pmc, ev, b, u, s, psel; ++ unsigned int ttmset = 0; ++ unsigned int pmc_inuse = 0; ++ ++ if (n_ev > 4) ++ return -1; ++ for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc_inuse & (1 << (pmc - 1))) ++ return -1; /* collision! 
*/ ++ pmc_inuse |= 1 << (pmc - 1); ++ } ++ } ++ for (i = 0; i < n_ev; ++i) { ++ ev = event[i]; ++ pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ --pmc; ++ } else { ++ /* can go on any PMC; find a free one */ ++ for (pmc = 0; pmc < 4; ++pmc) ++ if (!(pmc_inuse & (1 << pmc))) ++ break; ++ pmc_inuse |= 1 << pmc; ++ } ++ hwc[i] = pmc; ++ psel = ev & PM_PMCSEL_MSK; ++ if (ev & PM_BUSEVENT_MSK) { ++ /* this event uses the event bus */ ++ b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK; ++ u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK; ++ /* check for conflict on this byte of event bus */ ++ if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u) ++ return -1; ++ mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b); ++ ttmset |= 1 << b; ++ if (u == 5) { ++ /* Nest events have a further mux */ ++ s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; ++ if ((ttmset & 0x10) && ++ MMCR1_NESTSEL(mmcr1) != s) ++ return -1; ++ ttmset |= 0x10; ++ mmcr1 |= (u64)s << MMCR1_NESTSEL_SH; ++ } ++ if (0x30 <= psel && psel <= 0x3d) { ++ /* these need the PMCx_ADDR_SEL bits */ ++ if (b >= 2) ++ mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc; ++ } ++ /* bus select values are different for PMC3/4 */ ++ if (pmc >= 2 && (psel & 0x90) == 0x80) ++ psel ^= 0x20; ++ } ++ if (ev & PM_LLA) { ++ mmcr1 |= MMCR1_PMC1_LLA >> pmc; ++ if (ev & PM_LLAV) ++ mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc; ++ } ++ mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); ++ } ++ mmcr[0] = 0; ++ if (pmc_inuse & 1) ++ mmcr[0] = MMCR0_PMC1CE; ++ if (pmc_inuse & 0xe) ++ mmcr[0] |= MMCR0_PMCjCE; ++ mmcr[1] = mmcr1; ++ mmcr[2] = 0; ++ return 0; ++} ++ ++/* ++ * Layout of constraint bits: ++ * ++ * 0-1 add field: number of uses of PMC1 (max 1) ++ * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4 ++ * 8-10 select field: nest (subunit) event selector ++ * 16-19 select field: unit on byte 0 of event bus ++ * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 ++ */ ++static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) ++{ ++ int pmc, byte, sh; ++ unsigned int mask = 0, value = 0; ++ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc > 4) ++ return -1; ++ sh = (pmc - 1) * 2; ++ mask |= 2 << sh; ++ value |= 1 << sh; ++ } ++ if (event & PM_BUSEVENT_MSK) { ++ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; ++ sh = byte * 4; ++ mask |= PM_UNIT_MSKS << sh; ++ value |= (event & PM_UNIT_MSKS) << sh; ++ if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) { ++ mask |= PM_SUBUNIT_MSKS; ++ value |= event & PM_SUBUNIT_MSKS; ++ } ++ } ++ *maskp = mask; ++ *valp = value; ++ return 0; ++} ++ ++#define MAX_ALT 4 /* at most 4 alternatives for any event */ ++ ++static const unsigned int event_alternatives[][MAX_ALT] = { ++ { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */ ++ { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */ ++ { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */ ++ { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */ ++ { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */ ++ { 0x10000e, 0x400010 }, /* PM_PURR */ ++ { 0x100010, 0x4000f8 }, /* PM_FLUSH */ ++ { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */ ++ { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */ ++ { 0x100054, 0x2000f0 }, /* PM_ST_FIN */ ++ { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */ ++ { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */ ++ { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */ ++ { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */ ++ { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */ ++ { 0x200012, 0x300012 }, /* PM_INST_DISP */ ++ { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */ ++ { 0x2000f8, 0x300010 }, /* PM_EXT_INT */ ++ { 0x2000fe, 
0x300056 }, /* PM_DATA_FROM_L2MISS */ ++ { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */ ++ { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */ ++ { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */ ++ { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */ ++}; ++ ++/* ++ * This could be made more efficient with a binary search on ++ * a presorted list, if necessary ++ */ ++static int find_alternatives_list(unsigned int event) ++{ ++ int i, j; ++ unsigned int alt; ++ ++ for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { ++ if (event < event_alternatives[i][0]) ++ return -1; ++ for (j = 0; j < MAX_ALT; ++j) { ++ alt = event_alternatives[i][j]; ++ if (!alt || event < alt) ++ break; ++ if (event == alt) ++ return i; ++ } ++ } ++ return -1; ++} ++ ++static int p6_get_alternatives(unsigned int event, unsigned int alt[]) ++{ ++ int i, j; ++ unsigned int aevent, psel, pmc; ++ unsigned int nalt = 1; ++ ++ alt[0] = event; ++ ++ /* check the alternatives table */ ++ i = find_alternatives_list(event); ++ if (i >= 0) { ++ /* copy out alternatives from list */ ++ for (j = 0; j < MAX_ALT; ++j) { ++ aevent = event_alternatives[i][j]; ++ if (!aevent) ++ break; ++ if (aevent != event) ++ alt[nalt++] = aevent; ++ } ++ ++ } else { ++ /* Check for alternative ways of computing sum events */ ++ /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */ ++ psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc && (psel == 0x32 || psel == 0x34)) ++ alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) | ++ ((5 - pmc) << PM_PMC_SH); ++ ++ /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */ ++ if (pmc && (psel == 0x38 || psel == 0x3a)) ++ alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) | ++ ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH); ++ } ++ ++ return nalt; ++} ++ ++static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) ++{ ++ /* Set PMCxSEL to 0 to disable PMCx */ ++ mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); ++} ++ ++static int power6_generic_events[] = { ++ [PERF_COUNT_CPU_CYCLES] = 0x1e, ++ [PERF_COUNT_INSTRUCTIONS] = 2, ++ [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ ++ [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ ++ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ ++ [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ ++}; ++ ++struct power_pmu power6_pmu = { ++ .n_counter = 4, ++ .max_alternatives = MAX_ALT, ++ .add_fields = 0x55, ++ .test_adder = 0, ++ .compute_mmcr = p6_compute_mmcr, ++ .get_constraint = p6_get_constraint, ++ .get_alternatives = p6_get_alternatives, ++ .disable_pmc = p6_disable_pmc, ++ .n_generic = ARRAY_SIZE(power6_generic_events), ++ .generic_events = power6_generic_events, ++}; +Index: linux-2.6-tip/arch/powerpc/kernel/ppc970-pmu.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/powerpc/kernel/ppc970-pmu.c +@@ -0,0 +1,375 @@ ++/* ++ * Performance counter support for PPC970-family processors. ++ * ++ * Copyright 2008-2009 Paul Mackerras, IBM Corporation. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++#include ++#include ++#include ++ ++/* ++ * Bits in event code for PPC970 ++ */ ++#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ ++#define PM_PMC_MSK 0xf ++#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ ++#define PM_UNIT_MSK 0xf ++#define PM_BYTE_SH 4 /* Byte number of event bus to use */ ++#define PM_BYTE_MSK 3 ++#define PM_PMCSEL_MSK 0xf ++ ++/* Values in PM_UNIT field */ ++#define PM_NONE 0 ++#define PM_FPU 1 ++#define PM_VPU 2 ++#define PM_ISU 3 ++#define PM_IFU 4 ++#define PM_IDU 5 ++#define PM_STS 6 ++#define PM_LSU0 7 ++#define PM_LSU1U 8 ++#define PM_LSU1L 9 ++#define PM_LASTUNIT 9 ++ ++/* ++ * Bits in MMCR0 for PPC970 ++ */ ++#define MMCR0_PMC1SEL_SH 8 ++#define MMCR0_PMC2SEL_SH 1 ++#define MMCR_PMCSEL_MSK 0x1f ++ ++/* ++ * Bits in MMCR1 for PPC970 ++ */ ++#define MMCR1_TTM0SEL_SH 62 ++#define MMCR1_TTM1SEL_SH 59 ++#define MMCR1_TTM3SEL_SH 53 ++#define MMCR1_TTMSEL_MSK 3 ++#define MMCR1_TD_CP_DBG0SEL_SH 50 ++#define MMCR1_TD_CP_DBG1SEL_SH 48 ++#define MMCR1_TD_CP_DBG2SEL_SH 46 ++#define MMCR1_TD_CP_DBG3SEL_SH 44 ++#define MMCR1_PMC1_ADDER_SEL_SH 39 ++#define MMCR1_PMC2_ADDER_SEL_SH 38 ++#define MMCR1_PMC6_ADDER_SEL_SH 37 ++#define MMCR1_PMC5_ADDER_SEL_SH 36 ++#define MMCR1_PMC8_ADDER_SEL_SH 35 ++#define MMCR1_PMC7_ADDER_SEL_SH 34 ++#define MMCR1_PMC3_ADDER_SEL_SH 33 ++#define MMCR1_PMC4_ADDER_SEL_SH 32 ++#define MMCR1_PMC3SEL_SH 27 ++#define MMCR1_PMC4SEL_SH 22 ++#define MMCR1_PMC5SEL_SH 17 ++#define MMCR1_PMC6SEL_SH 12 ++#define MMCR1_PMC7SEL_SH 7 ++#define MMCR1_PMC8SEL_SH 2 ++ ++static short mmcr1_adder_bits[8] = { ++ MMCR1_PMC1_ADDER_SEL_SH, ++ MMCR1_PMC2_ADDER_SEL_SH, ++ MMCR1_PMC3_ADDER_SEL_SH, ++ MMCR1_PMC4_ADDER_SEL_SH, ++ MMCR1_PMC5_ADDER_SEL_SH, ++ MMCR1_PMC6_ADDER_SEL_SH, ++ MMCR1_PMC7_ADDER_SEL_SH, ++ MMCR1_PMC8_ADDER_SEL_SH ++}; ++ ++/* ++ * Bits in MMCRA ++ */ ++ ++/* ++ * Layout of constraint bits: ++ * 6666555555555544444444443333333333222222222211111111110000000000 ++ * 3210987654321098765432109876543210987654321098765432109876543210 ++ * <><>[ >[ >[ >< >< >< >< ><><><><><><><><> ++ * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 ++ * ++ * T0 - TTM0 constraint ++ * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000 ++ * ++ * T1 - TTM1 constraint ++ * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000 ++ * ++ * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS ++ * 43: UC3 error 0x0800_0000_0000 ++ * 42: FPU|IFU|VPU events needed 0x0400_0000_0000 ++ * 41: ISU events needed 0x0200_0000_0000 ++ * 40: IDU|STS events needed 0x0100_0000_0000 ++ * ++ * PS1 ++ * 39: PS1 error 0x0080_0000_0000 ++ * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 ++ * ++ * PS2 ++ * 35: PS2 error 0x0008_0000_0000 ++ * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 ++ * ++ * B0 ++ * 28-31: Byte 0 event source 0xf000_0000 ++ * Encoding as for the event code ++ * ++ * B1, B2, B3 ++ * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources ++ * ++ * P1 ++ * 15: P1 error 0x8000 ++ * 14-15: Count of events needing PMC1 ++ * ++ * P2..P8 ++ * 0-13: Count of events needing PMC2..PMC8 ++ */ ++ ++/* Masks and values for using events from the various units */ ++static u64 unit_cons[PM_LASTUNIT+1][2] = { ++ [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull }, ++ [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull }, ++ [PM_ISU] = { 0x080000000000ull, 0x020000000000ull }, ++ [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull }, ++ [PM_IDU] = { 0x380000000000ull, 0x010000000000ull }, ++ [PM_STS] = { 
0x380000000000ull, 0x310000000000ull }, ++}; ++ ++static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) ++{ ++ int pmc, byte, unit, sh; ++ u64 mask = 0, value = 0; ++ int grp = -1; ++ ++ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc > 8) ++ return -1; ++ sh = (pmc - 1) * 2; ++ mask |= 2 << sh; ++ value |= 1 << sh; ++ grp = ((pmc - 1) >> 1) & 1; ++ } ++ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; ++ if (unit) { ++ if (unit > PM_LASTUNIT) ++ return -1; ++ mask |= unit_cons[unit][0]; ++ value |= unit_cons[unit][1]; ++ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; ++ /* ++ * Bus events on bytes 0 and 2 can be counted ++ * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. ++ */ ++ if (!pmc) ++ grp = byte & 1; ++ /* Set byte lane select field */ ++ mask |= 0xfULL << (28 - 4 * byte); ++ value |= (u64)unit << (28 - 4 * byte); ++ } ++ if (grp == 0) { ++ /* increment PMC1/2/5/6 field */ ++ mask |= 0x8000000000ull; ++ value |= 0x1000000000ull; ++ } else if (grp == 1) { ++ /* increment PMC3/4/7/8 field */ ++ mask |= 0x800000000ull; ++ value |= 0x100000000ull; ++ } ++ *maskp = mask; ++ *valp = value; ++ return 0; ++} ++ ++static int p970_get_alternatives(unsigned int event, unsigned int alt[]) ++{ ++ alt[0] = event; ++ ++ /* 2 alternatives for LSU empty */ ++ if (event == 0x2002 || event == 0x3002) { ++ alt[1] = event ^ 0x1000; ++ return 2; ++ } ++ ++ return 1; ++} ++ ++static int p970_compute_mmcr(unsigned int event[], int n_ev, ++ unsigned int hwc[], u64 mmcr[]) ++{ ++ u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; ++ unsigned int pmc, unit, byte, psel; ++ unsigned int ttm, grp; ++ unsigned int pmc_inuse = 0; ++ unsigned int pmc_grp_use[2]; ++ unsigned char busbyte[4]; ++ unsigned char unituse[16]; ++ unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 }; ++ unsigned char ttmuse[2]; ++ unsigned char pmcsel[8]; ++ int i; ++ ++ if (n_ev > 8) ++ return -1; ++ ++ /* First pass to count resource use */ ++ pmc_grp_use[0] = pmc_grp_use[1] = 0; ++ memset(busbyte, 0, sizeof(busbyte)); ++ memset(unituse, 0, sizeof(unituse)); ++ for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ if (pmc) { ++ if (pmc_inuse & (1 << (pmc - 1))) ++ return -1; ++ pmc_inuse |= 1 << (pmc - 1); ++ /* count 1/2/5/6 vs 3/4/7/8 use */ ++ ++pmc_grp_use[((pmc - 1) >> 1) & 1]; ++ } ++ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; ++ if (unit) { ++ if (unit > PM_LASTUNIT) ++ return -1; ++ if (!pmc) ++ ++pmc_grp_use[byte & 1]; ++ if (busbyte[byte] && busbyte[byte] != unit) ++ return -1; ++ busbyte[byte] = unit; ++ unituse[unit] = 1; ++ } ++ } ++ if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) ++ return -1; ++ ++ /* ++ * Assign resources and set multiplexer selects. ++ * ++ * PM_ISU can go either on TTM0 or TTM1, but that's the only ++ * choice we have to deal with. ++ */ ++ if (unituse[PM_ISU] & ++ (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU])) ++ unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */ ++ /* Set TTM[01]SEL fields. */ ++ ttmuse[0] = ttmuse[1] = 0; ++ for (i = PM_FPU; i <= PM_STS; ++i) { ++ if (!unituse[i]) ++ continue; ++ ttm = unitmap[i]; ++ ++ttmuse[(ttm >> 2) & 1]; ++ mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH; ++ } ++ /* Check only one unit per TTMx */ ++ if (ttmuse[0] > 1 || ttmuse[1] > 1) ++ return -1; ++ ++ /* Set byte lane select fields and TTM3SEL. 
*/ ++ for (byte = 0; byte < 4; ++byte) { ++ unit = busbyte[byte]; ++ if (!unit) ++ continue; ++ if (unit <= PM_STS) ++ ttm = (unitmap[unit] >> 2) & 1; ++ else if (unit == PM_LSU0) ++ ttm = 2; ++ else { ++ ttm = 3; ++ if (unit == PM_LSU1L && byte >= 2) ++ mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); ++ } ++ mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); ++ } ++ ++ /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ ++ memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */ ++ for (i = 0; i < n_ev; ++i) { ++ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; ++ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; ++ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; ++ psel = event[i] & PM_PMCSEL_MSK; ++ if (!pmc) { ++ /* Bus event or any-PMC direct event */ ++ if (unit) ++ psel |= 0x10 | ((byte & 2) << 2); ++ else ++ psel |= 8; ++ for (pmc = 0; pmc < 8; ++pmc) { ++ if (pmc_inuse & (1 << pmc)) ++ continue; ++ grp = (pmc >> 1) & 1; ++ if (unit) { ++ if (grp == (byte & 1)) ++ break; ++ } else if (pmc_grp_use[grp] < 4) { ++ ++pmc_grp_use[grp]; ++ break; ++ } ++ } ++ pmc_inuse |= 1 << pmc; ++ } else { ++ /* Direct event */ ++ --pmc; ++ if (psel == 0 && (byte & 2)) ++ /* add events on higher-numbered bus */ ++ mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; ++ } ++ pmcsel[pmc] = psel; ++ hwc[i] = pmc; ++ } ++ for (pmc = 0; pmc < 2; ++pmc) ++ mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc); ++ for (; pmc < 8; ++pmc) ++ mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); ++ if (pmc_inuse & 1) ++ mmcr0 |= MMCR0_PMC1CE; ++ if (pmc_inuse & 0xfe) ++ mmcr0 |= MMCR0_PMCjCE; ++ ++ mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ ++ ++ /* Return MMCRx values */ ++ mmcr[0] = mmcr0; ++ mmcr[1] = mmcr1; ++ mmcr[2] = mmcra; ++ return 0; ++} ++ ++static void p970_disable_pmc(unsigned int pmc, u64 mmcr[]) ++{ ++ int shift, i; ++ ++ if (pmc <= 1) { ++ shift = MMCR0_PMC1SEL_SH - 7 * pmc; ++ i = 0; ++ } else { ++ shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2); ++ i = 1; ++ } ++ /* ++ * Setting the PMCxSEL field to 0x08 disables PMC x. ++ */ ++ mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift); ++} ++ ++static int ppc970_generic_events[] = { ++ [PERF_COUNT_CPU_CYCLES] = 7, ++ [PERF_COUNT_INSTRUCTIONS] = 1, ++ [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ ++ [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ ++ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ ++ [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ ++}; ++ ++struct power_pmu ppc970_pmu = { ++ .n_counter = 8, ++ .max_alternatives = 2, ++ .add_fields = 0x001100005555ull, ++ .test_adder = 0x013300000000ull, ++ .compute_mmcr = p970_compute_mmcr, ++ .get_constraint = p970_get_constraint, ++ .get_alternatives = p970_get_alternatives, ++ .disable_pmc = p970_disable_pmc, ++ .n_generic = ARRAY_SIZE(ppc970_generic_events), ++ .generic_events = ppc970_generic_events, ++}; +Index: linux-2.6-tip/arch/powerpc/kernel/vmlinux.lds.S +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/kernel/vmlinux.lds.S ++++ linux-2.6-tip/arch/powerpc/kernel/vmlinux.lds.S +@@ -181,13 +181,7 @@ SECTIONS + __initramfs_end = .; + } + #endif +- . = ALIGN(PAGE_SIZE); +- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { +- __per_cpu_start = .; +- *(.data.percpu) +- *(.data.percpu.shared_aligned) +- __per_cpu_end = .; +- } ++ PERCPU(PAGE_SIZE) + + . 
= ALIGN(8); + .machine.desc : AT(ADDR(.machine.desc) - LOAD_OFFSET) { +Index: linux-2.6-tip/arch/powerpc/mm/fault.c +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/mm/fault.c ++++ linux-2.6-tip/arch/powerpc/mm/fault.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -158,7 +159,7 @@ int __kprobes do_page_fault(struct pt_re + } + #endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ + +- if (in_atomic() || mm == NULL) { ++ if (in_atomic() || mm == NULL || current->pagefault_disabled) { + if (!user_mode(regs)) + return SIGSEGV; + /* in_atomic() in user mode is really bad, +@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_re + die("Weird page fault", regs, SIGSEGV); + } + ++ perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); ++ + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an +@@ -321,6 +324,7 @@ good_area: + } + if (ret & VM_FAULT_MAJOR) { + current->maj_flt++; ++ perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + #ifdef CONFIG_PPC_SMLPAR + if (firmware_has_feature(FW_FEATURE_CMO)) { + preempt_disable(); +@@ -328,8 +332,10 @@ good_area: + preempt_enable(); + } + #endif +- } else ++ } else { + current->min_flt++; ++ perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); ++ } + up_read(&mm->mmap_sem); + return 0; + +Index: linux-2.6-tip/arch/powerpc/platforms/Kconfig.cputype +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/platforms/Kconfig.cputype ++++ linux-2.6-tip/arch/powerpc/platforms/Kconfig.cputype +@@ -1,6 +1,7 @@ + config PPC64 + bool "64-bit kernel" + default n ++ select HAVE_PERF_COUNTERS + help + This option selects whether a 32-bit or a 64-bit kernel + will be built. 
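
The per-CPU PMU tables added above (power4, power5+, power5, power6, ppc970) all follow the same pattern: a raw event code packs the PMC number, unit (TTM mux) select, event-bus byte and PMCSEL value into bit fields, and get_constraint()/compute_mmcr() recover them with the PM_*_SH and PM_*_MSK shifts and masks defined at the top of each file. The standalone C sketch below decodes two of the POWER5 generic event codes that way; the constants are copied from power5-pmu.c in this patch, while decode() and main() are illustrative helpers and are not part of the patch.

/*
 * Illustrative, standalone sketch (not part of the patch): decode a raw
 * POWER5 event code into its bit fields, mirroring the PM_*_SH/PM_*_MSK
 * constants defined in power5-pmu.c above.
 */
#include <stdio.h>

#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
#define PM_PMC_MSK	0xf
#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
#define PM_UNIT_MSK	0xf
#define PM_BYTE_SH	12	/* Byte number of event bus to use */
#define PM_BYTE_MSK	7
#define PM_GRS_SH	8	/* Storage subsystem mux select */
#define PM_GRS_MSK	7
#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
#define PM_PMCSEL_MSK	0x7f

static void decode(unsigned int event)
{
	unsigned int pmc  = (event >> PM_PMC_SH) & PM_PMC_MSK;
	unsigned int unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
	unsigned int byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
	unsigned int grs  = (event >> PM_GRS_SH) & PM_GRS_MSK;
	unsigned int psel = event & PM_PMCSEL_MSK;

	printf("event 0x%x: pmc=%u unit=0x%x byte=%u grsel=%u psel=0x%x bus=%s\n",
	       event, pmc, unit, byte, grs, psel,
	       (event & PM_BUSEVENT_MSK) ? "yes" : "no");
}

int main(void)
{
	decode(0x100009);	/* PM_INST_CMPL: direct event on PMC1, PMCSEL 0x09 */
	decode(0x3c1088);	/* LD_MISS_L1: PMC3, unit 0xc (LSU1), byte 1, bus event */
	return 0;
}
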
+Index: linux-2.6-tip/arch/powerpc/platforms/cell/interrupt.c +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/platforms/cell/interrupt.c ++++ linux-2.6-tip/arch/powerpc/platforms/cell/interrupt.c +@@ -237,8 +237,6 @@ extern int noirqdebug; + + static void handle_iic_irq(unsigned int irq, struct irq_desc *desc) + { +- const unsigned int cpu = smp_processor_id(); +- + spin_lock(&desc->lock); + + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); +@@ -254,7 +252,7 @@ static void handle_iic_irq(unsigned int + goto out_eoi; + } + +- kstat_cpu(cpu).irqs[irq]++; ++ kstat_incr_irqs_this_cpu(irq, desc); + + /* Mark the IRQ currently in progress.*/ + desc->status |= IRQ_INPROGRESS; +Index: linux-2.6-tip/arch/powerpc/platforms/cell/spufs/sched.c +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/platforms/cell/spufs/sched.c ++++ linux-2.6-tip/arch/powerpc/platforms/cell/spufs/sched.c +@@ -508,7 +508,7 @@ static void __spu_add_to_rq(struct spu_c + list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]); + set_bit(ctx->prio, spu_prio->bitmap); + if (!spu_prio->nr_waiting++) +- __mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); ++ mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); + } + } + +Index: linux-2.6-tip/arch/powerpc/platforms/pseries/xics.c +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/platforms/pseries/xics.c ++++ linux-2.6-tip/arch/powerpc/platforms/pseries/xics.c +@@ -153,9 +153,10 @@ static int get_irq_server(unsigned int v + { + int server; + /* For the moment only implement delivery to all cpus or one cpu */ +- cpumask_t cpumask = irq_desc[virq].affinity; ++ cpumask_t cpumask; + cpumask_t tmp = CPU_MASK_NONE; + ++ cpumask_copy(&cpumask, irq_desc[virq].affinity); + if (!distribute_irqs) + return default_server; + +@@ -869,7 +870,7 @@ void xics_migrate_irqs_away(void) + virq, cpu); + + /* Reset affinity to all cpus */ +- irq_desc[virq].affinity = CPU_MASK_ALL; ++ cpumask_setall(irq_desc[virq].affinity); + desc->chip->set_affinity(virq, cpu_all_mask); + unlock: + spin_unlock_irqrestore(&desc->lock, flags); +Index: linux-2.6-tip/arch/powerpc/sysdev/mpic.c +=================================================================== +--- linux-2.6-tip.orig/arch/powerpc/sysdev/mpic.c ++++ linux-2.6-tip/arch/powerpc/sysdev/mpic.c +@@ -46,7 +46,7 @@ + + static struct mpic *mpics; + static struct mpic *mpic_primary; +-static DEFINE_SPINLOCK(mpic_lock); ++static DEFINE_RAW_SPINLOCK(mpic_lock); + + #ifdef CONFIG_PPC32 /* XXX for now */ + #ifdef CONFIG_IRQ_ALL_CPUS +@@ -566,9 +566,10 @@ static void __init mpic_scan_ht_pics(str + #ifdef CONFIG_SMP + static int irq_choose_cpu(unsigned int virt_irq) + { +- cpumask_t mask = irq_desc[virt_irq].affinity; ++ cpumask_t mask; + int cpuid; + ++ cpumask_copy(&mask, irq_desc[virt_irq].affinity); + if (cpus_equal(mask, CPU_MASK_ALL)) { + static int irq_rover; + static DEFINE_SPINLOCK(irq_rover_lock); +Index: linux-2.6-tip/arch/s390/include/asm/smp.h +=================================================================== +--- linux-2.6-tip.orig/arch/s390/include/asm/smp.h ++++ linux-2.6-tip/arch/s390/include/asm/smp.h +@@ -97,12 +97,6 @@ extern void arch_send_call_function_ipi( + #endif + + #ifndef CONFIG_SMP +-static inline void smp_send_stop(void) +-{ +- /* Disable all interrupts/machine checks */ +- __load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK); +-} +- + #define hard_smp_processor_id() 0 + #define 
smp_cpu_not_running(cpu) 1 + #endif +Index: linux-2.6-tip/arch/sh/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/sh/kernel/irq.c ++++ linux-2.6-tip/arch/sh/kernel/irq.c +@@ -51,7 +51,7 @@ int show_interrupts(struct seq_file *p, + goto unlock; + seq_printf(p, "%3d: ",i); + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %14s", irq_desc[i].chip->name); + seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %s", action->name); +Index: linux-2.6-tip/arch/sparc/include/asm/mmzone.h +=================================================================== +--- linux-2.6-tip.orig/arch/sparc/include/asm/mmzone.h ++++ linux-2.6-tip/arch/sparc/include/asm/mmzone.h +@@ -3,6 +3,8 @@ + + #ifdef CONFIG_NEED_MULTIPLE_NODES + ++#include ++ + extern struct pglist_data *node_data[]; + + #define NODE_DATA(nid) (node_data[nid]) +Index: linux-2.6-tip/arch/sparc/kernel/irq_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/sparc/kernel/irq_64.c ++++ linux-2.6-tip/arch/sparc/kernel/irq_64.c +@@ -185,7 +185,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%10u ", kstat_irqs(i)); + #else + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + #endif + seq_printf(p, " %9s", irq_desc[i].chip->typename); + seq_printf(p, " %s", action->name); +@@ -252,9 +252,10 @@ struct irq_handler_data { + #ifdef CONFIG_SMP + static int irq_choose_cpu(unsigned int virt_irq) + { +- cpumask_t mask = irq_desc[virt_irq].affinity; ++ cpumask_t mask; + int cpuid; + ++ cpumask_copy(&mask, irq_desc[virt_irq].affinity); + if (cpus_equal(mask, CPU_MASK_ALL)) { + static int irq_rover; + static DEFINE_SPINLOCK(irq_rover_lock); +@@ -805,7 +806,7 @@ void fixup_irqs(void) + !(irq_desc[irq].status & IRQ_PER_CPU)) { + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, +- &irq_desc[irq].affinity); ++ irq_desc[irq].affinity); + } + spin_unlock_irqrestore(&irq_desc[irq].lock, flags); + } +Index: linux-2.6-tip/arch/sparc/kernel/time_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/sparc/kernel/time_64.c ++++ linux-2.6-tip/arch/sparc/kernel/time_64.c +@@ -36,10 +36,10 @@ + #include + #include + #include ++#include + + #include + #include +-#include + #include + #include + #include +@@ -729,7 +729,7 @@ void timer_interrupt(int irq, struct pt_ + + irq_enter(); + +- kstat_this_cpu.irqs[0]++; ++ kstat_incr_irqs_this_cpu(0, irq_to_desc(0)); + + if (unlikely(!evt->event_handler)) { + printk(KERN_WARNING +Index: linux-2.6-tip/arch/um/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/um/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/um/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/um/kernel/irq.c ++++ linux-2.6-tip/arch/um/kernel/irq.c +@@ -42,7 +42,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%10u ", kstat_irqs(i)); + #else + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + #endif + seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %s", action->name); +Index: linux-2.6-tip/arch/x86/Kconfig 
+=================================================================== +--- linux-2.6-tip.orig/arch/x86/Kconfig ++++ linux-2.6-tip/arch/x86/Kconfig +@@ -5,7 +5,7 @@ mainmenu "Linux Kernel Configuration for + config 64BIT + bool "64-bit kernel" if ARCH = "x86" + default ARCH = "x86_64" +- help ++ ---help--- + Say yes to build a 64-bit kernel - formerly known as x86_64 + Say no to build a 32-bit kernel - formerly known as i386 + +@@ -34,12 +34,19 @@ config X86 + select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACE_MCOUNT_TEST +- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) +- select HAVE_ARCH_KGDB if !X86_VOYAGER ++ select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE ++ select HAVE_FTRACE_SYSCALLS ++ select HAVE_KVM ++ select HAVE_ARCH_KGDB + select HAVE_ARCH_TRACEHOOK + select HAVE_GENERIC_DMA_COHERENT if X86_32 + select HAVE_EFFICIENT_UNALIGNED_ACCESS + select USER_STACKTRACE_SUPPORT ++ select HAVE_KERNEL_GZIP ++ select HAVE_KERNEL_BZIP2 ++ select HAVE_KERNEL_LZMA ++ select HAVE_ARCH_KMEMCHECK ++ select HAVE_DMA_API_DEBUG + + config ARCH_DEFCONFIG + string +@@ -108,10 +115,18 @@ config ARCH_MAY_HAVE_PC_FDC + def_bool y + + config RWSEM_GENERIC_SPINLOCK +- def_bool !X86_XADD ++ bool ++ depends on !X86_XADD || PREEMPT_RT ++ default y ++ ++config ASM_SEMAPHORES ++ bool ++ default y + + config RWSEM_XCHGADD_ALGORITHM +- def_bool X86_XADD ++ bool ++ depends on X86_XADD && !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT ++ default y + + config ARCH_HAS_CPU_IDLE_WAIT + def_bool y +@@ -133,18 +148,19 @@ config ARCH_HAS_CACHE_LINE_SIZE + def_bool y + + config HAVE_SETUP_PER_CPU_AREA +- def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER) ++ def_bool y ++ ++config HAVE_DYNAMIC_PER_CPU_AREA ++ def_bool y + + config HAVE_CPUMASK_OF_CPU_MAP + def_bool X86_64_SMP + + config ARCH_HIBERNATION_POSSIBLE + def_bool y +- depends on !SMP || !X86_VOYAGER + + config ARCH_SUSPEND_POSSIBLE + def_bool y +- depends on !X86_VOYAGER + + config ZONE_DMA32 + bool +@@ -165,6 +181,9 @@ config GENERIC_HARDIRQS + bool + default y + ++config GENERIC_HARDIRQS_NO__DO_IRQ ++ def_bool y ++ + config GENERIC_IRQ_PROBE + bool + default y +@@ -174,11 +193,6 @@ config GENERIC_PENDING_IRQ + depends on GENERIC_HARDIRQS && SMP + default y + +-config X86_SMP +- bool +- depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) +- default y +- + config USE_GENERIC_SMP_HELPERS + def_bool y + depends on SMP +@@ -194,19 +208,17 @@ config X86_64_SMP + config X86_HT + bool + depends on SMP +- depends on (X86_32 && !X86_VOYAGER) || X86_64 +- default y +- +-config X86_BIOS_REBOOT +- bool +- depends on !X86_VOYAGER + default y + + config X86_TRAMPOLINE + bool +- depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) ++ depends on SMP || (64BIT && ACPI_SLEEP) + default y + ++config X86_32_LAZY_GS ++ def_bool y ++ depends on X86_32 && !CC_STACKPROTECTOR ++ + config KTIME_SCALAR + def_bool X86_32 + source "init/Kconfig" +@@ -244,14 +256,24 @@ config SMP + + If you don't know what to do here, say N. + +-config X86_HAS_BOOT_CPU_ID +- def_bool y +- depends on X86_VOYAGER ++config X86_X2APIC ++ bool "Support x2apic" ++ depends on X86_LOCAL_APIC && X86_64 ++ ---help--- ++ This enables x2apic support on CPUs that have this feature. ++ ++ This allows 32-bit apic IDs (so it can support very large systems), ++ and accesses the local apic via MSRs not via mmio. ++ ++ ( On certain CPU models you may need to enable INTR_REMAP too, ++ to get functional x2apic mode. 
) ++ ++ If you don't know what to do here, say N. + + config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on PCI_MSI || HT_IRQ +- help ++ ---help--- + This enables support for sparse irqs. This is useful for distro + kernels that want to define a high CONFIG_NR_CPUS value but still + want to have low kernel memory footprint on smaller machines. +@@ -265,114 +287,140 @@ config NUMA_MIGRATE_IRQ_DESC + bool "Move irq desc when changing irq smp_affinity" + depends on SPARSE_IRQ && NUMA + default n +- help ++ ---help--- + This enables moving irq_desc to cpu/node that irq will use handled. + + If you don't know what to do here, say N. + +-config X86_FIND_SMP_CONFIG +- def_bool y +- depends on X86_MPPARSE || X86_VOYAGER +- + config X86_MPPARSE + bool "Enable MPS table" if ACPI + default y + depends on X86_LOCAL_APIC +- help ++ ---help--- + For old smp systems that do not have proper acpi support. Newer systems + (esp with 64bit cpus) with acpi support, MADT and DSDT will override it + +-choice +- prompt "Subarchitecture Type" +- default X86_PC ++config X86_BIGSMP ++ bool "Support for big SMP systems with more than 8 CPUs" ++ depends on X86_32 && SMP ++ ---help--- ++ This option is needed for the systems that have more than 8 CPUs + +-config X86_PC +- bool "PC-compatible" +- help +- Choose this option if your computer is a standard PC or compatible. ++if X86_32 ++config X86_EXTENDED_PLATFORM ++ bool "Support for extended (non-PC) x86 platforms" ++ default y ++ ---help--- ++ If you disable this option then the kernel will only support ++ standard PC platforms. (which covers the vast majority of ++ systems out there.) ++ ++ If you enable this option then you'll be able to select support ++ for the following (non-PC) 32 bit x86 platforms: ++ AMD Elan ++ NUMAQ (IBM/Sequent) ++ RDC R-321x SoC ++ SGI 320/540 (Visual Workstation) ++ Summit/EXA (IBM x440) ++ Unisys ES7000 IA32 series ++ ++ If you have one of these systems, or if you want to build a ++ generic distribution kernel, say Y here - otherwise say N. ++endif ++ ++if X86_64 ++config X86_EXTENDED_PLATFORM ++ bool "Support for extended (non-PC) x86 platforms" ++ default y ++ ---help--- ++ If you disable this option then the kernel will only support ++ standard PC platforms. (which covers the vast majority of ++ systems out there.) ++ ++ If you enable this option then you'll be able to select support ++ for the following (non-PC) 64 bit x86 platforms: ++ ScaleMP vSMP ++ SGI Ultraviolet ++ ++ If you have one of these systems, or if you want to build a ++ generic distribution kernel, say Y here - otherwise say N. ++endif ++# This is an alphabetically sorted list of 64 bit extended platforms ++# Please maintain the alphabetic order if and when there are additions ++ ++config X86_VSMP ++ bool "ScaleMP vSMP" ++ select PARAVIRT ++ depends on X86_64 && PCI ++ depends on X86_EXTENDED_PLATFORM ++ ---help--- ++ Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is ++ supposed to run on these EM64T-based machines. Only choose this option ++ if you have one of these machines. ++ ++config X86_UV ++ bool "SGI Ultraviolet" ++ depends on X86_64 ++ depends on X86_EXTENDED_PLATFORM ++ select X86_X2APIC ++ ---help--- ++ This option is needed in order to support SGI Ultraviolet systems. ++ If you don't have one of these, you should say N here. 
++ ++# Following is an alphabetically sorted list of 32 bit extended platforms ++# Please maintain the alphabetic order if and when there are additions + + config X86_ELAN + bool "AMD Elan" + depends on X86_32 +- help ++ depends on X86_EXTENDED_PLATFORM ++ ---help--- + Select this for an AMD Elan processor. + + Do not use this option for K6/Athlon/Opteron processors! + + If unsure, choose "PC-compatible" instead. + +-config X86_VOYAGER +- bool "Voyager (NCR)" +- depends on X86_32 && (SMP || BROKEN) && !PCI +- help +- Voyager is an MCA-based 32-way capable SMP architecture proprietary +- to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. +- +- *** WARNING *** +- +- If you do not specifically know you have a Voyager based machine, +- say N here, otherwise the kernel you build will not be bootable. +- +-config X86_GENERICARCH +- bool "Generic architecture" ++config X86_RDC321X ++ bool "RDC R-321x SoC" + depends on X86_32 +- help +- This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default ++ depends on X86_EXTENDED_PLATFORM ++ select M486 ++ select X86_REBOOTFIXUPS ++ ---help--- ++ This option is needed for RDC R-321x system-on-chip, also known ++ as R-8610-(G). ++ If you don't have one of these chips, you should say N here. ++ ++config X86_32_NON_STANDARD ++ bool "Support non-standard 32-bit SMP architectures" ++ depends on X86_32 && SMP ++ depends on X86_EXTENDED_PLATFORM ++ ---help--- ++ This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default + subarchitectures. It is intended for a generic binary kernel. + if you select them all, kernel will probe it one by one. and will + fallback to default. + +-if X86_GENERICARCH ++# Alphabetically sorted list of Non standard 32 bit platforms + + config X86_NUMAQ + bool "NUMAQ (IBM/Sequent)" +- depends on SMP && X86_32 && PCI && X86_MPPARSE ++ depends on X86_32_NON_STANDARD + select NUMA +- help ++ select X86_MPPARSE ++ ---help--- + This option is used for getting Linux to run on a NUMAQ (IBM/Sequent) + NUMA multiquad box. This changes the way that processors are + bootstrapped, and uses Clustered Logical APIC addressing mode instead + of Flat Logical. You will need a new lynxer.elf file to flash your + firmware with - send email to . + +-config X86_SUMMIT +- bool "Summit/EXA (IBM x440)" +- depends on X86_32 && SMP +- help +- This option is needed for IBM systems that use the Summit/EXA chipset. +- In particular, it is needed for the x440. +- +-config X86_ES7000 +- bool "Support for Unisys ES7000 IA32 series" +- depends on X86_32 && SMP +- help +- Support for Unisys ES7000 systems. Say 'Y' here if this kernel is +- supposed to run on an IA32-based Unisys ES7000 system. +- +-config X86_BIGSMP +- bool "Support for big SMP systems with more than 8 CPUs" +- depends on X86_32 && SMP +- help +- This option is needed for the systems that have more than 8 CPUs +- and if the system is not of any sub-arch type above. +- +-endif +- +-config X86_VSMP +- bool "Support for ScaleMP vSMP" +- select PARAVIRT +- depends on X86_64 && PCI +- help +- Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is +- supposed to run on these EM64T-based machines. Only choose this option +- if you have one of these machines. 
+- +-endchoice +- + config X86_VISWS + bool "SGI 320/540 (Visual Workstation)" +- depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT +- help ++ depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT ++ depends on X86_32_NON_STANDARD ++ ---help--- + The SGI Visual Workstation series is an IA32-based workstation + based on SGI systems chips with some legacy PC hardware attached. + +@@ -381,21 +429,25 @@ config X86_VISWS + A kernel compiled for the Visual Workstation will run on general + PCs as well. See for details. + +-config X86_RDC321X +- bool "RDC R-321x SoC" +- depends on X86_32 +- select M486 +- select X86_REBOOTFIXUPS +- help +- This option is needed for RDC R-321x system-on-chip, also known +- as R-8610-(G). +- If you don't have one of these chips, you should say N here. ++config X86_SUMMIT ++ bool "Summit/EXA (IBM x440)" ++ depends on X86_32_NON_STANDARD ++ ---help--- ++ This option is needed for IBM systems that use the Summit/EXA chipset. ++ In particular, it is needed for the x440. ++ ++config X86_ES7000 ++ bool "Unisys ES7000 IA32 series" ++ depends on X86_32_NON_STANDARD && X86_BIGSMP ++ ---help--- ++ Support for Unisys ES7000 systems. Say 'Y' here if this kernel is ++ supposed to run on an IA32-based Unisys ES7000 system. + + config SCHED_OMIT_FRAME_POINTER + def_bool y + prompt "Single-depth WCHAN output" + depends on X86 +- help ++ ---help--- + Calculate simpler /proc//wchan values. If this option + is disabled then wchan values will recurse back to the + caller function. This provides more accurate wchan values, +@@ -405,7 +457,7 @@ config SCHED_OMIT_FRAME_POINTER + + menuconfig PARAVIRT_GUEST + bool "Paravirtualized guest support" +- help ++ ---help--- + Say Y here to get to see options related to running Linux under + various hypervisors. This option alone does not add any kernel code. + +@@ -419,8 +471,7 @@ config VMI + bool "VMI Guest support" + select PARAVIRT + depends on X86_32 +- depends on !X86_VOYAGER +- help ++ ---help--- + VMI provides a paravirtualized interface to the VMware ESX server + (it could be used by other hypervisors in theory too, but is not + at the moment), by linking the kernel to a GPL-ed ROM module +@@ -430,8 +481,7 @@ config KVM_CLOCK + bool "KVM paravirtualized clock" + select PARAVIRT + select PARAVIRT_CLOCK +- depends on !X86_VOYAGER +- help ++ ---help--- + Turning on this option will allow you to run a paravirtualized clock + when running over the KVM hypervisor. Instead of relying on a PIT + (or probably other) emulation by the underlying device model, the host +@@ -441,17 +491,15 @@ config KVM_CLOCK + config KVM_GUEST + bool "KVM Guest support" + select PARAVIRT +- depends on !X86_VOYAGER +- help +- This option enables various optimizations for running under the KVM +- hypervisor. ++ ---help--- ++ This option enables various optimizations for running under the KVM ++ hypervisor. + + source "arch/x86/lguest/Kconfig" + + config PARAVIRT + bool "Enable paravirtualization code" +- depends on !X86_VOYAGER +- help ++ ---help--- + This changes the kernel so it can modify itself when it is run + under a hypervisor, potentially improving performance significantly + over full virtualization. However, when run without a hypervisor +@@ -464,51 +512,51 @@ config PARAVIRT_CLOCK + endif + + config PARAVIRT_DEBUG +- bool "paravirt-ops debugging" +- depends on PARAVIRT && DEBUG_KERNEL +- help +- Enable to debug paravirt_ops internals. Specifically, BUG if +- a paravirt_op is missing when it is called. 
++ bool "paravirt-ops debugging" ++ depends on PARAVIRT && DEBUG_KERNEL ++ ---help--- ++ Enable to debug paravirt_ops internals. Specifically, BUG if ++ a paravirt_op is missing when it is called. + + config MEMTEST + bool "Memtest" +- help ++ ---help--- + This option adds a kernel parameter 'memtest', which allows memtest + to be set. +- memtest=0, mean disabled; -- default +- memtest=1, mean do 1 test pattern; +- ... +- memtest=4, mean do 4 test patterns. ++ memtest=0, mean disabled; -- default ++ memtest=1, mean do 1 test pattern; ++ ... ++ memtest=4, mean do 4 test patterns. + If you are unsure how to answer this question, answer N. + + config X86_SUMMIT_NUMA + def_bool y +- depends on X86_32 && NUMA && X86_GENERICARCH ++ depends on X86_32 && NUMA && X86_32_NON_STANDARD + + config X86_CYCLONE_TIMER + def_bool y +- depends on X86_GENERICARCH ++ depends on X86_32_NON_STANDARD + + source "arch/x86/Kconfig.cpu" + + config HPET_TIMER + def_bool X86_64 + prompt "HPET Timer Support" if X86_32 +- help +- Use the IA-PC HPET (High Precision Event Timer) to manage +- time in preference to the PIT and RTC, if a HPET is +- present. +- HPET is the next generation timer replacing legacy 8254s. +- The HPET provides a stable time base on SMP +- systems, unlike the TSC, but it is more expensive to access, +- as it is off-chip. You can find the HPET spec at +- . +- +- You can safely choose Y here. However, HPET will only be +- activated if the platform and the BIOS support this feature. +- Otherwise the 8254 will be used for timing services. ++ ---help--- ++ Use the IA-PC HPET (High Precision Event Timer) to manage ++ time in preference to the PIT and RTC, if a HPET is ++ present. ++ HPET is the next generation timer replacing legacy 8254s. ++ The HPET provides a stable time base on SMP ++ systems, unlike the TSC, but it is more expensive to access, ++ as it is off-chip. You can find the HPET spec at ++ . ++ ++ You can safely choose Y here. However, HPET will only be ++ activated if the platform and the BIOS support this feature. ++ Otherwise the 8254 will be used for timing services. + +- Choose N to continue using the legacy 8254 timer. ++ Choose N to continue using the legacy 8254 timer. + + config HPET_EMULATE_RTC + def_bool y +@@ -519,7 +567,7 @@ config HPET_EMULATE_RTC + config DMI + default y + bool "Enable DMI scanning" if EMBEDDED +- help ++ ---help--- + Enabled scanning of DMI to identify machine quirks. Say Y + here unless you have verified that your setup is not + affected by entries in the DMI blacklist. Required by PNP +@@ -531,7 +579,7 @@ config GART_IOMMU + select SWIOTLB + select AGP + depends on X86_64 && PCI +- help ++ ---help--- + Support for full DMA access of devices with 32bit memory access only + on systems with more than 3GB. This is usually needed for USB, + sound, many IDE/SATA chipsets and some other devices. +@@ -546,7 +594,7 @@ config CALGARY_IOMMU + bool "IBM Calgary IOMMU support" + select SWIOTLB + depends on X86_64 && PCI && EXPERIMENTAL +- help ++ ---help--- + Support for hardware IOMMUs in IBM's xSeries x366 and x460 + systems. Needed to run systems with more than 3GB of memory + properly with 32-bit PCI devices that do not support DAC +@@ -564,7 +612,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT + def_bool y + prompt "Should Calgary be enabled by default?" + depends on CALGARY_IOMMU +- help ++ ---help--- + Should Calgary be enabled by default? if you choose 'y', Calgary + will be used (if it exists). If you choose 'n', Calgary will not be + used even if it exists. 
If you choose 'n' and would like to use +@@ -576,7 +624,7 @@ config AMD_IOMMU + select SWIOTLB + select PCI_MSI + depends on X86_64 && PCI && ACPI +- help ++ ---help--- + With this option you can enable support for AMD IOMMU hardware in + your system. An IOMMU is a hardware component which provides + remapping of DMA memory accesses from devices. With an AMD IOMMU you +@@ -591,7 +639,7 @@ config AMD_IOMMU_STATS + bool "Export AMD IOMMU statistics to debugfs" + depends on AMD_IOMMU + select DEBUG_FS +- help ++ ---help--- + This option enables code in the AMD IOMMU driver to collect various + statistics about whats happening in the driver and exports that + information to userspace via debugfs. +@@ -600,7 +648,7 @@ config AMD_IOMMU_STATS + # need this always selected by IOMMU for the VIA workaround + config SWIOTLB + def_bool y if X86_64 +- help ++ ---help--- + Support for software bounce buffers used on x86-64 systems + which don't have a hardware IOMMU (e.g. the current generation + of Intel's x86-64 CPUs). Using this PCI devices which can only +@@ -615,10 +663,10 @@ config IOMMU_API + + config MAXSMP + bool "Configure Maximum number of SMP Processors and NUMA Nodes" +- depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL ++ depends on 0 && X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + select CPUMASK_OFFSTACK + default n +- help ++ ---help--- + Configure maximum number of CPUS and NUMA Nodes for this architecture. + If unsure, say N. + +@@ -629,7 +677,7 @@ config NR_CPUS + default "4096" if MAXSMP + default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "8" if SMP +- help ++ ---help--- + This allows you to specify the maximum number of CPUs which this + kernel will support. The maximum supported value is 512 and the + minimum value which makes sense is 2. +@@ -640,7 +688,7 @@ config NR_CPUS + config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on X86_HT +- help ++ ---help--- + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say +@@ -650,7 +698,7 @@ config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" + depends on X86_HT +- help ++ ---help--- + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. +@@ -659,8 +707,8 @@ source "kernel/Kconfig.preempt" + + config X86_UP_APIC + bool "Local APIC support on uniprocessors" +- depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH) +- help ++ depends on X86_32 && !SMP && !X86_32_NON_STANDARD ++ ---help--- + A local APIC (Advanced Programmable Interrupt Controller) is an + integrated interrupt controller in the CPU. If you have a single-CPU + system which has a processor with a local APIC, you can say Y here to +@@ -673,7 +721,7 @@ config X86_UP_APIC + config X86_UP_IOAPIC + bool "IO-APIC support on uniprocessors" + depends on X86_UP_APIC +- help ++ ---help--- + An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an + SMP-capable replacement for PC-style interrupt controllers. Most + SMP systems and many recent uniprocessor systems have one. 
+@@ -684,11 +732,12 @@ config X86_UP_IOAPIC + + config X86_LOCAL_APIC + def_bool y +- depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) ++ depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC ++ select HAVE_PERF_COUNTERS if (!M386 && !M486) + + config X86_IO_APIC + def_bool y +- depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) ++ depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + + config X86_VISWS_APIC + def_bool y +@@ -698,7 +747,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC +- help ++ ---help--- + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of +@@ -720,7 +769,6 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + + config X86_MCE + bool "Machine Check Exception" +- depends on !X86_VOYAGER + ---help--- + Machine Check Exception support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, component failure). +@@ -739,7 +787,7 @@ config X86_MCE_INTEL + def_bool y + prompt "Intel MCE features" + depends on X86_64 && X86_MCE && X86_LOCAL_APIC +- help ++ ---help--- + Additional support for intel specific MCE features such as + the thermal monitor. + +@@ -747,14 +795,19 @@ config X86_MCE_AMD + def_bool y + prompt "AMD MCE features" + depends on X86_64 && X86_MCE && X86_LOCAL_APIC +- help ++ ---help--- + Additional support for AMD specific MCE features such as + the DRAM Error Threshold. + ++config X86_MCE_THRESHOLD ++ depends on X86_MCE_AMD || X86_MCE_INTEL ++ bool ++ default y ++ + config X86_MCE_NONFATAL + tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" + depends on X86_32 && X86_MCE +- help ++ ---help--- + Enabling this feature starts a timer that triggers every 5 seconds which + will look at the machine check registers to see if anything happened. + Non-fatal problems automatically get corrected (but still logged). +@@ -767,7 +820,7 @@ config X86_MCE_NONFATAL + config X86_MCE_P4THERMAL + bool "check for P4 thermal throttling interrupt." + depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) +- help ++ ---help--- + Enabling this feature will cause a message to be printed when the P4 + enters thermal throttling. + +@@ -775,11 +828,11 @@ config VM86 + bool "Enable VM86 support" if EMBEDDED + default y + depends on X86_32 +- help +- This option is required by programs like DOSEMU to run 16-bit legacy ++ ---help--- ++ This option is required by programs like DOSEMU to run 16-bit legacy + code on X86 processors. It also may be needed by software like +- XFree86 to initialize some video cards via BIOS. Disabling this +- option saves about 6k. ++ XFree86 to initialize some video cards via BIOS. Disabling this ++ option saves about 6k. + + config TOSHIBA + tristate "Toshiba Laptop support" +@@ -853,33 +906,33 @@ config MICROCODE + module will be called microcode. + + config MICROCODE_INTEL +- bool "Intel microcode patch loading support" +- depends on MICROCODE +- default MICROCODE +- select FW_LOADER +- --help--- +- This options enables microcode patch loading support for Intel +- processors. +- +- For latest news and information on obtaining all the required +- Intel ingredients for this driver, check: +- . 
++ bool "Intel microcode patch loading support" ++ depends on MICROCODE ++ default MICROCODE ++ select FW_LOADER ++ ---help--- ++ This options enables microcode patch loading support for Intel ++ processors. ++ ++ For latest news and information on obtaining all the required ++ Intel ingredients for this driver, check: ++ . + + config MICROCODE_AMD +- bool "AMD microcode patch loading support" +- depends on MICROCODE +- select FW_LOADER +- --help--- +- If you select this option, microcode patch loading support for AMD +- processors will be enabled. ++ bool "AMD microcode patch loading support" ++ depends on MICROCODE ++ select FW_LOADER ++ ---help--- ++ If you select this option, microcode patch loading support for AMD ++ processors will be enabled. + +- config MICROCODE_OLD_INTERFACE ++config MICROCODE_OLD_INTERFACE + def_bool y + depends on MICROCODE + + config X86_MSR + tristate "/dev/cpu/*/msr - Model-specific register support" +- help ++ ---help--- + This device gives privileged processes access to the x86 + Model-Specific Registers (MSRs). It is a character device with + major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. +@@ -888,12 +941,18 @@ config X86_MSR + + config X86_CPUID + tristate "/dev/cpu/*/cpuid - CPU information support" +- help ++ ---help--- + This device gives processes access to the x86 CPUID instruction to + be executed on a specific processor. It is a character device + with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to + /dev/cpu/31/cpuid. + ++config X86_CPU_DEBUG ++ tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support" ++ ---help--- ++ If you select this option, this will provide various x86 CPUs ++ information through debugfs. ++ + choice + prompt "High Memory Support" + default HIGHMEM4G if !X86_NUMAQ +@@ -940,7 +999,7 @@ config NOHIGHMEM + config HIGHMEM4G + bool "4GB" + depends on !X86_NUMAQ +- help ++ ---help--- + Select this if you have a 32-bit processor and between 1 and 4 + gigabytes of physical RAM. + +@@ -948,7 +1007,7 @@ config HIGHMEM64G + bool "64GB" + depends on !M386 && !M486 + select X86_PAE +- help ++ ---help--- + Select this if you have a 32-bit processor and more than 4 + gigabytes of physical RAM. + +@@ -959,7 +1018,7 @@ choice + prompt "Memory split" if EMBEDDED + default VMSPLIT_3G + depends on X86_32 +- help ++ ---help--- + Select the desired split between kernel and user memory. + + If the address range available to the kernel is less than the +@@ -1005,20 +1064,20 @@ config HIGHMEM + config X86_PAE + bool "PAE (Physical Address Extension) Support" + depends on X86_32 && !HIGHMEM4G +- help ++ ---help--- + PAE is required for NX support, and furthermore enables + larger swapspace support for non-overcommit purposes. It + has the cost of more pagetable lookup overhead, and also + consumes more pagetable space per process. + + config ARCH_PHYS_ADDR_T_64BIT +- def_bool X86_64 || X86_PAE ++ def_bool X86_64 || X86_PAE + + config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 +- help ++ ---help--- + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". 
+@@ -1028,9 +1087,8 @@ config NUMA + bool "Numa Memory Allocation and Scheduler Support" + depends on SMP + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) +- default n if X86_PC + default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) +- help ++ ---help--- + Enable NUMA (Non Uniform Memory Access) support. + + The kernel will try to allocate memory used by a CPU on the +@@ -1053,19 +1111,19 @@ config K8_NUMA + def_bool y + prompt "Old style AMD Opteron NUMA detection" + depends on X86_64 && NUMA && PCI +- help +- Enable K8 NUMA node topology detection. You should say Y here if +- you have a multi processor AMD K8 system. This uses an old +- method to read the NUMA configuration directly from the builtin +- Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA +- instead, which also takes priority if both are compiled in. ++ ---help--- ++ Enable K8 NUMA node topology detection. You should say Y here if ++ you have a multi processor AMD K8 system. This uses an old ++ method to read the NUMA configuration directly from the builtin ++ Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA ++ instead, which also takes priority if both are compiled in. + + config X86_64_ACPI_NUMA + def_bool y + prompt "ACPI NUMA detection" + depends on X86_64 && NUMA && ACPI && PCI + select ACPI_NUMA +- help ++ ---help--- + Enable ACPI SRAT based node topology detection. + + # Some NUMA nodes have memory ranges that span +@@ -1080,24 +1138,24 @@ config NODES_SPAN_OTHER_NODES + config NUMA_EMU + bool "NUMA emulation" + depends on X86_64 && NUMA +- help ++ ---help--- + Enable NUMA emulation. A flat machine will be split + into virtual nodes when booted with "numa=fake=N", where N is the + number of nodes. This is only useful for debugging. + + config NODES_SHIFT + int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP +- range 1 9 if X86_64 ++ range 1 9 + default "9" if MAXSMP + default "6" if X86_64 + default "4" if X86_NUMAQ + default "3" + depends on NEED_MULTIPLE_NODES +- help ++ ---help--- + Specify the maximum number of NUMA Nodes available on the target + system. Increases memory reserved to accomodate various tables. + +-config HAVE_ARCH_BOOTMEM_NODE ++config HAVE_ARCH_BOOTMEM + def_bool y + depends on X86_32 && NUMA + +@@ -1131,7 +1189,7 @@ config ARCH_SPARSEMEM_DEFAULT + + config ARCH_SPARSEMEM_ENABLE + def_bool y +- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH ++ depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD + select SPARSEMEM_STATIC if X86_32 + select SPARSEMEM_VMEMMAP_ENABLE if X86_64 + +@@ -1143,66 +1201,71 @@ config ARCH_MEMORY_PROBE + def_bool X86_64 + depends on MEMORY_HOTPLUG + ++config ILLEGAL_POINTER_VALUE ++ hex ++ default 0 if X86_32 ++ default 0xdead000000000000 if X86_64 ++ + source "mm/Kconfig" + + config HIGHPTE + bool "Allocate 3rd-level pagetables from highmem" + depends on X86_32 && (HIGHMEM4G || HIGHMEM64G) +- help ++ ---help--- + The VM uses one page table entry for each page of physical memory. + For systems with a lot of RAM, this can be wasteful of precious + low memory. Setting this option will put user-space page table + entries in high memory. + + config X86_CHECK_BIOS_CORRUPTION +- bool "Check for low memory corruption" +- help +- Periodically check for memory corruption in low memory, which +- is suspected to be caused by BIOS. Even when enabled in the +- configuration, it is disabled at runtime. 
Enable it by +- setting "memory_corruption_check=1" on the kernel command +- line. By default it scans the low 64k of memory every 60 +- seconds; see the memory_corruption_check_size and +- memory_corruption_check_period parameters in +- Documentation/kernel-parameters.txt to adjust this. +- +- When enabled with the default parameters, this option has +- almost no overhead, as it reserves a relatively small amount +- of memory and scans it infrequently. It both detects corruption +- and prevents it from affecting the running system. +- +- It is, however, intended as a diagnostic tool; if repeatable +- BIOS-originated corruption always affects the same memory, +- you can use memmap= to prevent the kernel from using that +- memory. ++ bool "Check for low memory corruption" ++ ---help--- ++ Periodically check for memory corruption in low memory, which ++ is suspected to be caused by BIOS. Even when enabled in the ++ configuration, it is disabled at runtime. Enable it by ++ setting "memory_corruption_check=1" on the kernel command ++ line. By default it scans the low 64k of memory every 60 ++ seconds; see the memory_corruption_check_size and ++ memory_corruption_check_period parameters in ++ Documentation/kernel-parameters.txt to adjust this. ++ ++ When enabled with the default parameters, this option has ++ almost no overhead, as it reserves a relatively small amount ++ of memory and scans it infrequently. It both detects corruption ++ and prevents it from affecting the running system. ++ ++ It is, however, intended as a diagnostic tool; if repeatable ++ BIOS-originated corruption always affects the same memory, ++ you can use memmap= to prevent the kernel from using that ++ memory. + + config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK +- bool "Set the default setting of memory_corruption_check" ++ bool "Set the default setting of memory_corruption_check" + depends on X86_CHECK_BIOS_CORRUPTION + default y +- help +- Set whether the default state of memory_corruption_check is +- on or off. ++ ---help--- ++ Set whether the default state of memory_corruption_check is ++ on or off. + + config X86_RESERVE_LOW_64K +- bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" ++ bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" + default y +- help +- Reserve the first 64K of physical RAM on BIOSes that are known +- to potentially corrupt that memory range. A numbers of BIOSes are +- known to utilize this area during suspend/resume, so it must not +- be used by the kernel. +- +- Set this to N if you are absolutely sure that you trust the BIOS +- to get all its memory reservations and usages right. +- +- If you have doubts about the BIOS (e.g. suspend/resume does not +- work or there's kernel crashes after certain hardware hotplug +- events) and it's not AMI or Phoenix, then you might want to enable +- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical +- corruption patterns. ++ ---help--- ++ Reserve the first 64K of physical RAM on BIOSes that are known ++ to potentially corrupt that memory range. A numbers of BIOSes are ++ known to utilize this area during suspend/resume, so it must not ++ be used by the kernel. ++ ++ Set this to N if you are absolutely sure that you trust the BIOS ++ to get all its memory reservations and usages right. ++ ++ If you have doubts about the BIOS (e.g. 
suspend/resume does not ++ work or there's kernel crashes after certain hardware hotplug ++ events) and it's not AMI or Phoenix, then you might want to enable ++ X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical ++ corruption patterns. + +- Say Y if unsure. ++ Say Y if unsure. + + config MATH_EMULATION + bool +@@ -1268,7 +1331,7 @@ config MTRR_SANITIZER + def_bool y + prompt "MTRR cleanup support" + depends on MTRR +- help ++ ---help--- + Convert MTRR layout from continuous to discrete, so X drivers can + add writeback entries. + +@@ -1283,7 +1346,7 @@ config MTRR_SANITIZER_ENABLE_DEFAULT + range 0 1 + default "0" + depends on MTRR_SANITIZER +- help ++ ---help--- + Enable mtrr cleanup default value + + config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT +@@ -1291,7 +1354,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAU + range 0 7 + default "1" + depends on MTRR_SANITIZER +- help ++ ---help--- + mtrr cleanup spare entries default, it can be changed via + mtrr_spare_reg_nr=N on the kernel command line. + +@@ -1299,7 +1362,7 @@ config X86_PAT + bool + prompt "x86 PAT support" + depends on MTRR +- help ++ ---help--- + Use PAT attributes to setup page level cache control. + + PATs are the modern equivalents of MTRRs and are much more +@@ -1314,20 +1377,20 @@ config EFI + bool "EFI runtime service support" + depends on ACPI + ---help--- +- This enables the kernel to use EFI runtime services that are +- available (such as the EFI variable services). ++ This enables the kernel to use EFI runtime services that are ++ available (such as the EFI variable services). + +- This option is only useful on systems that have EFI firmware. +- In addition, you should use the latest ELILO loader available +- at in order to take advantage +- of EFI runtime services. However, even with this option, the +- resultant kernel should continue to boot on existing non-EFI +- platforms. ++ This option is only useful on systems that have EFI firmware. ++ In addition, you should use the latest ELILO loader available ++ at in order to take advantage ++ of EFI runtime services. However, even with this option, the ++ resultant kernel should continue to boot on existing non-EFI ++ platforms. + + config SECCOMP + def_bool y + prompt "Enable seccomp to safely compute untrusted bytecode" +- help ++ ---help--- + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to +@@ -1340,13 +1403,16 @@ config SECCOMP + + If unsure, say Y. Only embedded should say N here. + ++config CC_STACKPROTECTOR_ALL ++ bool ++ + config CC_STACKPROTECTOR + bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" +- depends on X86_64 && EXPERIMENTAL && BROKEN +- help +- This option turns on the -fstack-protector GCC feature. This +- feature puts, at the beginning of critical functions, a canary +- value on the stack just before the return address, and validates ++ select CC_STACKPROTECTOR_ALL ++ ---help--- ++ This option turns on the -fstack-protector GCC feature. This ++ feature puts, at the beginning of functions, a canary value on ++ the stack just before the return address, and validates + the value just before actually returning. 
Stack based buffer + overflows (that need to overwrite this return address) now also + overwrite the canary, which gets detected and the attack is then +@@ -1354,22 +1420,14 @@ config CC_STACKPROTECTOR + + This feature requires gcc version 4.2 or above, or a distribution + gcc with the feature backported. Older versions are automatically +- detected and for those versions, this configuration option is ignored. +- +-config CC_STACKPROTECTOR_ALL +- bool "Use stack-protector for all functions" +- depends on CC_STACKPROTECTOR +- help +- Normally, GCC only inserts the canary value protection for +- functions that use large-ish on-stack buffers. By enabling +- this option, GCC will be asked to do this for ALL functions. ++ detected and for those versions, this configuration option is ++ ignored. (and a warning is printed during bootup) + + source kernel/Kconfig.hz + + config KEXEC + bool "kexec system call" +- depends on X86_BIOS_REBOOT +- help ++ ---help--- + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot +@@ -1386,7 +1444,7 @@ config KEXEC + config CRASH_DUMP + bool "kernel crash dumps" + depends on X86_64 || (X86_32 && HIGHMEM) +- help ++ ---help--- + Generate crash dump after being started by kexec. + This should be normally only set in special crash dump kernels + which are loaded in the main kernel with kexec-tools into +@@ -1400,8 +1458,8 @@ config CRASH_DUMP + config KEXEC_JUMP + bool "kexec jump (EXPERIMENTAL)" + depends on EXPERIMENTAL +- depends on KEXEC && HIBERNATION && X86_32 +- help ++ depends on KEXEC && HIBERNATION ++ ---help--- + Jump between original kernel and kexeced kernel and invoke + code in physical address mode via KEXEC + +@@ -1410,7 +1468,7 @@ config PHYSICAL_START + default "0x1000000" if X86_NUMAQ + default "0x200000" if X86_64 + default "0x100000" +- help ++ ---help--- + This gives the physical address where the kernel is loaded. + + If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then +@@ -1451,7 +1509,7 @@ config PHYSICAL_START + config RELOCATABLE + bool "Build a relocatable kernel (EXPERIMENTAL)" + depends on EXPERIMENTAL +- help ++ ---help--- + This builds a kernel image that retains relocation information + so it can be loaded someplace besides the default 1MB. + The relocations tend to make the kernel binary about 10% larger, +@@ -1471,7 +1529,7 @@ config PHYSICAL_ALIGN + default "0x100000" if X86_32 + default "0x200000" if X86_64 + range 0x2000 0x400000 +- help ++ ---help--- + This value puts the alignment restrictions on physical address + where kernel is loaded and run from. Kernel is compiled for an + address which meets above alignment restriction. +@@ -1492,7 +1550,7 @@ config PHYSICAL_ALIGN + + config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs" +- depends on SMP && HOTPLUG && !X86_VOYAGER ++ depends on SMP && HOTPLUG + ---help--- + Say Y here to allow turning CPUs off and on. CPUs can be + controlled through /sys/devices/system/cpu. +@@ -1504,7 +1562,7 @@ config COMPAT_VDSO + def_bool y + prompt "Compat VDSO support" + depends on X86_32 || IA32_EMULATION +- help ++ ---help--- + Map the 32-bit VDSO to the predictable old-style address too. 
+ ---help--- + Say N here if you are running a sufficiently recent glibc +@@ -1516,7 +1574,7 @@ config COMPAT_VDSO + config CMDLINE_BOOL + bool "Built-in kernel command line" + default n +- help ++ ---help--- + Allow for specifying boot arguments to the kernel at + build time. On some systems (e.g. embedded ones), it is + necessary or convenient to provide some or all of the +@@ -1534,7 +1592,7 @@ config CMDLINE + string "Built-in kernel command string" + depends on CMDLINE_BOOL + default "" +- help ++ ---help--- + Enter arguments here that should be compiled into the kernel + image and used at boot time. If the boot loader provides a + command line at boot time, it is appended to this string to +@@ -1551,7 +1609,7 @@ config CMDLINE_OVERRIDE + bool "Built-in command line overrides boot loader arguments" + default n + depends on CMDLINE_BOOL +- help ++ ---help--- + Set this option to 'Y' to have the kernel ignore the boot loader + command line, and use ONLY the built-in command line. + +@@ -1572,8 +1630,11 @@ config HAVE_ARCH_EARLY_PFN_TO_NID + def_bool X86_64 + depends on NUMA + ++config HARDIRQS_SW_RESEND ++ bool ++ default y ++ + menu "Power management and ACPI options" +- depends on !X86_VOYAGER + + config ARCH_HIBERNATION_HEADER + def_bool y +@@ -1651,7 +1712,7 @@ if APM + + config APM_IGNORE_USER_SUSPEND + bool "Ignore USER SUSPEND" +- help ++ ---help--- + This option will ignore USER SUSPEND requests. On machines with a + compliant APM BIOS, you want to say N. However, on the NEC Versa M + series notebooks, it is necessary to say Y because of a BIOS bug. +@@ -1675,7 +1736,7 @@ config APM_DO_ENABLE + + config APM_CPU_IDLE + bool "Make CPU Idle calls when idle" +- help ++ ---help--- + Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. + On some machines, this can activate improved power savings, such as + a slowed CPU clock rate, when the machine is idle. These idle calls +@@ -1686,7 +1747,7 @@ config APM_CPU_IDLE + + config APM_DISPLAY_BLANK + bool "Enable console blanking using APM" +- help ++ ---help--- + Enable console blanking using the APM. Some laptops can use this to + turn off the LCD backlight when the screen blanker of the Linux + virtual console blanks the screen. Note that this is only used by +@@ -1699,7 +1760,7 @@ config APM_DISPLAY_BLANK + + config APM_ALLOW_INTS + bool "Allow interrupts during APM BIOS calls" +- help ++ ---help--- + Normally we disable external interrupts while we are making calls to + the APM BIOS as a measure to lessen the effects of a badly behaving + BIOS implementation. The BIOS should reenable interrupts if it +@@ -1724,7 +1785,7 @@ config PCI + bool "PCI support" + default y + select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) +- help ++ ---help--- + Find out whether you have a PCI motherboard. PCI is the name of a + bus system, i.e. the way the CPU talks to the other stuff inside + your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or +@@ -1795,7 +1856,7 @@ config PCI_MMCONFIG + config DMAR + bool "Support for DMA Remapping Devices (EXPERIMENTAL)" + depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL +- help ++ ---help--- + DMA remapping (DMAR) devices support enables independent address + translations for Direct Memory Access (DMA) from devices. 
+ These DMA remapping devices are reported via ACPI tables +@@ -1817,29 +1878,30 @@ config DMAR_GFX_WA + def_bool y + prompt "Support for Graphics workaround" + depends on DMAR +- help +- Current Graphics drivers tend to use physical address +- for DMA and avoid using DMA APIs. Setting this config +- option permits the IOMMU driver to set a unity map for +- all the OS-visible memory. Hence the driver can continue +- to use physical addresses for DMA. ++ ---help--- ++ Current Graphics drivers tend to use physical address ++ for DMA and avoid using DMA APIs. Setting this config ++ option permits the IOMMU driver to set a unity map for ++ all the OS-visible memory. Hence the driver can continue ++ to use physical addresses for DMA. + + config DMAR_FLOPPY_WA + def_bool y + depends on DMAR +- help +- Floppy disk drivers are know to bypass DMA API calls +- thereby failing to work when IOMMU is enabled. This +- workaround will setup a 1:1 mapping for the first +- 16M to make floppy (an ISA device) work. ++ ---help--- ++ Floppy disk drivers are know to bypass DMA API calls ++ thereby failing to work when IOMMU is enabled. This ++ workaround will setup a 1:1 mapping for the first ++ 16M to make floppy (an ISA device) work. + + config INTR_REMAP + bool "Support for Interrupt Remapping (EXPERIMENTAL)" + depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL +- help +- Supports Interrupt remapping for IO-APIC and MSI devices. +- To use x2apic mode in the CPU's which support x2APIC enhancements or +- to support platforms with CPU's having > 8 bit APIC ID, say Y. ++ select X86_X2APIC ++ ---help--- ++ Supports Interrupt remapping for IO-APIC and MSI devices. ++ To use x2apic mode in the CPU's which support x2APIC enhancements or ++ to support platforms with CPU's having > 8 bit APIC ID, say Y. + + source "drivers/pci/pcie/Kconfig" + +@@ -1853,8 +1915,7 @@ if X86_32 + + config ISA + bool "ISA support" +- depends on !X86_VOYAGER +- help ++ ---help--- + Find out whether you have ISA slots on your motherboard. ISA is the + name of a bus system, i.e. the way the CPU talks to the other stuff + inside your box. Other bus systems are PCI, EISA, MicroChannel +@@ -1880,9 +1941,8 @@ config EISA + source "drivers/eisa/Kconfig" + + config MCA +- bool "MCA support" if !X86_VOYAGER +- default y if X86_VOYAGER +- help ++ bool "MCA support" ++ ---help--- + MicroChannel Architecture is found in some IBM PS/2 machines and + laptops. It is a bus system similar to PCI or ISA. See + (and especially the web page given +@@ -1892,8 +1952,7 @@ source "drivers/mca/Kconfig" + + config SCx200 + tristate "NatSemi SCx200 support" +- depends on !X86_VOYAGER +- help ++ ---help--- + This provides basic support for National Semiconductor's + (now AMD's) Geode processors. The driver probes for the + PCI-IDs of several on-chip devices, so its a good dependency +@@ -1905,7 +1964,7 @@ config SCx200HR_TIMER + tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" + depends on SCx200 && GENERIC_TIME + default y +- help ++ ---help--- + This driver provides a clocksource built upon the on-chip + 27MHz high-resolution timer. 
Its also a workaround for + NSC Geode SC-1100's buggy TSC, which loses time when the +@@ -1916,7 +1975,7 @@ config GEODE_MFGPT_TIMER + def_bool y + prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" + depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS +- help ++ ---help--- + This driver provides a clock event source based on the MFGPT + timer(s) in the CS5535 and CS5536 companion chip for the geode. + MFGPTs have a better resolution and max interval than the +@@ -1925,7 +1984,7 @@ config GEODE_MFGPT_TIMER + config OLPC + bool "One Laptop Per Child support" + default n +- help ++ ---help--- + Add support for detecting the unique features of the OLPC + XO hardware. + +@@ -1950,16 +2009,16 @@ config IA32_EMULATION + bool "IA32 Emulation" + depends on X86_64 + select COMPAT_BINFMT_ELF +- help ++ ---help--- + Include code to run 32-bit programs under a 64-bit kernel. You should + likely turn this on, unless you're 100% sure that you don't have any + 32-bit programs left. + + config IA32_AOUT +- tristate "IA32 a.out support" +- depends on IA32_EMULATION +- help +- Support old a.out binaries in the 32bit emulation. ++ tristate "IA32 a.out support" ++ depends on IA32_EMULATION ++ ---help--- ++ Support old a.out binaries in the 32bit emulation. + + config COMPAT + def_bool y +Index: linux-2.6-tip/arch/x86/Kconfig.cpu +=================================================================== +--- linux-2.6-tip.orig/arch/x86/Kconfig.cpu ++++ linux-2.6-tip/arch/x86/Kconfig.cpu +@@ -50,7 +50,7 @@ config M386 + config M486 + bool "486" + depends on X86_32 +- help ++ ---help--- + Select this for a 486 series processor, either Intel or one of the + compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, + DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or +@@ -59,7 +59,7 @@ config M486 + config M586 + bool "586/K5/5x86/6x86/6x86MX" + depends on X86_32 +- help ++ ---help--- + Select this for an 586 or 686 series processor such as the AMD K5, + the Cyrix 5x86, 6x86 and 6x86MX. This choice does not + assume the RDTSC (Read Time Stamp Counter) instruction. +@@ -67,21 +67,21 @@ config M586 + config M586TSC + bool "Pentium-Classic" + depends on X86_32 +- help ++ ---help--- + Select this for a Pentium Classic processor with the RDTSC (Read + Time Stamp Counter) instruction for benchmarking. + + config M586MMX + bool "Pentium-MMX" + depends on X86_32 +- help ++ ---help--- + Select this for a Pentium with the MMX graphics/multimedia + extended instructions. + + config M686 + bool "Pentium-Pro" + depends on X86_32 +- help ++ ---help--- + Select this for Intel Pentium Pro chips. This enables the use of + Pentium Pro extended instructions, and disables the init-time guard + against the f00f bug found in earlier Pentiums. +@@ -89,7 +89,7 @@ config M686 + config MPENTIUMII + bool "Pentium-II/Celeron(pre-Coppermine)" + depends on X86_32 +- help ++ ---help--- + Select this for Intel chips based on the Pentium-II and + pre-Coppermine Celeron core. This option enables an unaligned + copy optimization, compiles the kernel with optimization flags +@@ -99,7 +99,7 @@ config MPENTIUMII + config MPENTIUMIII + bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon" + depends on X86_32 +- help ++ ---help--- + Select this for Intel chips based on the Pentium-III and + Celeron-Coppermine core. 
This option enables use of some + extended prefetch instructions in addition to the Pentium II +@@ -108,14 +108,14 @@ config MPENTIUMIII + config MPENTIUMM + bool "Pentium M" + depends on X86_32 +- help ++ ---help--- + Select this for Intel Pentium M (not Pentium-4 M) + notebook chips. + + config MPENTIUM4 + bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" + depends on X86_32 +- help ++ ---help--- + Select this for Intel Pentium 4 chips. This includes the + Pentium 4, Pentium D, P4-based Celeron and Xeon, and + Pentium-4 M (not Pentium M) chips. This option enables compile +@@ -151,7 +151,7 @@ config MPENTIUM4 + config MK6 + bool "K6/K6-II/K6-III" + depends on X86_32 +- help ++ ---help--- + Select this for an AMD K6-family processor. Enables use of + some extended instructions, and passes appropriate optimization + flags to GCC. +@@ -159,14 +159,14 @@ config MK6 + config MK7 + bool "Athlon/Duron/K7" + depends on X86_32 +- help ++ ---help--- + Select this for an AMD Athlon K7-family processor. Enables use of + some extended instructions, and passes appropriate optimization + flags to GCC. + + config MK8 + bool "Opteron/Athlon64/Hammer/K8" +- help ++ ---help--- + Select this for an AMD Opteron or Athlon64 Hammer-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. +@@ -174,7 +174,7 @@ config MK8 + config MCRUSOE + bool "Crusoe" + depends on X86_32 +- help ++ ---help--- + Select this for a Transmeta Crusoe processor. Treats the processor + like a 586 with TSC, and sets some GCC optimization flags (like a + Pentium Pro with no alignment requirements). +@@ -182,13 +182,13 @@ config MCRUSOE + config MEFFICEON + bool "Efficeon" + depends on X86_32 +- help ++ ---help--- + Select this for a Transmeta Efficeon processor. + + config MWINCHIPC6 + bool "Winchip-C6" + depends on X86_32 +- help ++ ---help--- + Select this for an IDT Winchip C6 chip. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment requirements. +@@ -196,7 +196,7 @@ config MWINCHIPC6 + config MWINCHIP3D + bool "Winchip-2/Winchip-2A/Winchip-3" + depends on X86_32 +- help ++ ---help--- + Select this for an IDT Winchip-2, 2A or 3. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment requirements. Also enable out of order memory +@@ -206,19 +206,19 @@ config MWINCHIP3D + config MGEODEGX1 + bool "GeodeGX1" + depends on X86_32 +- help ++ ---help--- + Select this for a Geode GX1 (Cyrix MediaGX) chip. + + config MGEODE_LX + bool "Geode GX/LX" + depends on X86_32 +- help ++ ---help--- + Select this for AMD Geode GX and LX processors. + + config MCYRIXIII + bool "CyrixIII/VIA-C3" + depends on X86_32 +- help ++ ---help--- + Select this for a Cyrix III or C3 chip. Presently Linux and GCC + treat this chip as a generic 586. Whilst the CPU is 686 class, + it lacks the cmov extension which gcc assumes is present when +@@ -230,7 +230,7 @@ config MCYRIXIII + config MVIAC3_2 + bool "VIA C3-2 (Nehemiah)" + depends on X86_32 +- help ++ ---help--- + Select this for a VIA C3 "Nehemiah". Selecting this enables usage + of SSE and tells gcc to treat the CPU as a 686. + Note, this kernel will not boot on older (pre model 9) C3s. +@@ -238,14 +238,14 @@ config MVIAC3_2 + config MVIAC7 + bool "VIA C7" + depends on X86_32 +- help ++ ---help--- + Select this for a VIA C7. Selecting this uses the correct cache + shift and tells gcc to treat the CPU as a 686. 
+ + config MPSC + bool "Intel P4 / older Netburst based Xeon" + depends on X86_64 +- help ++ ---help--- + Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey + Xeon CPUs with Intel 64bit which is compatible with x86-64. + Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the +@@ -255,7 +255,7 @@ config MPSC + + config MCORE2 + bool "Core 2/newer Xeon" +- help ++ ---help--- + + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and + 53xx) CPUs. You can distinguish newer from older Xeons by the CPU +@@ -265,7 +265,7 @@ config MCORE2 + config GENERIC_CPU + bool "Generic-x86-64" + depends on X86_64 +- help ++ ---help--- + Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. + +@@ -274,7 +274,7 @@ endchoice + config X86_GENERIC + bool "Generic x86 support" + depends on X86_32 +- help ++ ---help--- + Instead of just including optimizations for the selected + x86 variant (e.g. PII, Crusoe or Athlon), include some more + generic optimizations as well. This will make the kernel +@@ -294,25 +294,23 @@ config X86_CPU + # Define implied options from the CPU selection here + config X86_L1_CACHE_BYTES + int +- default "128" if GENERIC_CPU || MPSC +- default "64" if MK8 || MCORE2 +- depends on X86_64 ++ default "128" if MPSC ++ default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32 + + config X86_INTERNODE_CACHE_BYTES + int + default "4096" if X86_VSMP + default X86_L1_CACHE_BYTES if !X86_VSMP +- depends on X86_64 + + config X86_CMPXCHG + def_bool X86_64 || (X86_32 && !M386) + + config X86_L1_CACHE_SHIFT + int +- default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC ++ default "7" if MPENTIUM4 || MPSC + default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX +- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU + + config X86_XADD + def_bool y +@@ -321,7 +319,7 @@ config X86_XADD + config X86_PPRO_FENCE + bool "PentiumPro memory ordering errata workaround" + depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 +- help ++ ---help--- + Old PentiumPro multiprocessor systems had errata that could cause + memory operations to violate the x86 ordering standard in rare cases. + Enabling this option will attempt to work around some (but not all) +@@ -414,14 +412,14 @@ config X86_DEBUGCTLMSR + + menuconfig PROCESSOR_SELECT + bool "Supported processor vendors" if EMBEDDED +- help ++ ---help--- + This lets you choose what x86 vendor support code your kernel + will include. 
+ + config CPU_SUP_INTEL + default y + bool "Support Intel processors" if PROCESSOR_SELECT +- help ++ ---help--- + This enables detection, tunings and quirks for Intel processors + + You need this enabled if you want your kernel to run on an +@@ -435,7 +433,7 @@ config CPU_SUP_CYRIX_32 + default y + bool "Support Cyrix processors" if PROCESSOR_SELECT + depends on !64BIT +- help ++ ---help--- + This enables detection, tunings and quirks for Cyrix processors + + You need this enabled if you want your kernel to run on a +@@ -448,7 +446,7 @@ config CPU_SUP_CYRIX_32 + config CPU_SUP_AMD + default y + bool "Support AMD processors" if PROCESSOR_SELECT +- help ++ ---help--- + This enables detection, tunings and quirks for AMD processors + + You need this enabled if you want your kernel to run on an +@@ -458,25 +456,10 @@ config CPU_SUP_AMD + + If unsure, say N. + +-config CPU_SUP_CENTAUR_32 +- default y +- bool "Support Centaur processors" if PROCESSOR_SELECT +- depends on !64BIT +- help +- This enables detection, tunings and quirks for Centaur processors +- +- You need this enabled if you want your kernel to run on a +- Centaur CPU. Disabling this option on other types of CPUs +- makes the kernel a tiny bit smaller. Disabling it on a Centaur +- CPU might render the kernel unbootable. +- +- If unsure, say N. +- +-config CPU_SUP_CENTAUR_64 ++config CPU_SUP_CENTAUR + default y + bool "Support Centaur processors" if PROCESSOR_SELECT +- depends on 64BIT +- help ++ ---help--- + This enables detection, tunings and quirks for Centaur processors + + You need this enabled if you want your kernel to run on a +@@ -490,7 +473,7 @@ config CPU_SUP_TRANSMETA_32 + default y + bool "Support Transmeta processors" if PROCESSOR_SELECT + depends on !64BIT +- help ++ ---help--- + This enables detection, tunings and quirks for Transmeta processors + + You need this enabled if you want your kernel to run on a +@@ -504,7 +487,7 @@ config CPU_SUP_UMC_32 + default y + bool "Support UMC processors" if PROCESSOR_SELECT + depends on !64BIT +- help ++ ---help--- + This enables detection, tunings and quirks for UMC processors + + You need this enabled if you want your kernel to run on a +@@ -523,8 +506,7 @@ config X86_PTRACE_BTS + bool "Branch Trace Store" + default y + depends on X86_DEBUGCTLMSR +- depends on BROKEN +- help ++ ---help--- + This adds a ptrace interface to the hardware's branch trace store. + + Debuggers may use it to collect an execution trace of the debugged +Index: linux-2.6-tip/arch/x86/Kconfig.debug +=================================================================== +--- linux-2.6-tip.orig/arch/x86/Kconfig.debug ++++ linux-2.6-tip/arch/x86/Kconfig.debug +@@ -7,7 +7,7 @@ source "lib/Kconfig.debug" + + config STRICT_DEVMEM + bool "Filter access to /dev/mem" +- help ++ ---help--- + If this option is disabled, you allow userspace (root) access to all + of memory, including kernel and userspace memory. Accidental + access to this is obviously disastrous, but specific access can +@@ -25,7 +25,7 @@ config STRICT_DEVMEM + config X86_VERBOSE_BOOTUP + bool "Enable verbose x86 bootup info messages" + default y +- help ++ ---help--- + Enables the informational output from the decompression stage + (e.g. bzImage) of the boot. If you disable this you will still + see errors. Disable this if you want silent bootup. 
+@@ -33,7 +33,7 @@ config X86_VERBOSE_BOOTUP + config EARLY_PRINTK + bool "Early printk" if EMBEDDED + default y +- help ++ ---help--- + Write kernel log output directly into the VGA buffer or to a serial + port. + +@@ -47,7 +47,7 @@ config EARLY_PRINTK_DBGP + bool "Early printk via EHCI debug port" + default n + depends on EARLY_PRINTK && PCI +- help ++ ---help--- + Write kernel log output directly into the EHCI debug port. + + This is useful for kernel debugging when your machine crashes very +@@ -59,14 +59,14 @@ config EARLY_PRINTK_DBGP + config DEBUG_STACKOVERFLOW + bool "Check for stack overflows" + depends on DEBUG_KERNEL +- help ++ ---help--- + This option will cause messages to be printed if free stack space + drops below a certain limit. + + config DEBUG_STACK_USAGE + bool "Stack utilization instrumentation" + depends on DEBUG_KERNEL +- help ++ ---help--- + Enables the display of the minimum amount of free stack which each + task has ever had available in the sysrq-T and sysrq-P debug output. + +@@ -75,7 +75,8 @@ config DEBUG_STACK_USAGE + config DEBUG_PAGEALLOC + bool "Debug page memory allocations" + depends on DEBUG_KERNEL +- help ++ depends on !KMEMCHECK ++ ---help--- + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. +@@ -83,9 +84,10 @@ config DEBUG_PAGEALLOC + config DEBUG_PER_CPU_MAPS + bool "Debug access to per_cpu maps" + depends on DEBUG_KERNEL +- depends on X86_SMP ++ depends on SMP ++ depends on !PREEMPT_RT + default n +- help ++ ---help--- + Say Y to verify that the per_cpu map being accessed has + been setup. Adds a fair amount of code to kernel memory + and decreases performance. +@@ -96,7 +98,7 @@ config X86_PTDUMP + bool "Export kernel pagetable layout to userspace via debugfs" + depends on DEBUG_KERNEL + select DEBUG_FS +- help ++ ---help--- + Say Y here if you want to show the kernel pagetable layout in a + debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. +@@ -108,7 +110,7 @@ config DEBUG_RODATA + bool "Write protect kernel read-only data structures" + default y + depends on DEBUG_KERNEL +- help ++ ---help--- + Mark the kernel read-only data as write-protected in the pagetables, + in order to catch accidental (and incorrect) writes to such const + data. This is recommended so that we can catch kernel bugs sooner. +@@ -117,7 +119,8 @@ config DEBUG_RODATA + config DEBUG_RODATA_TEST + bool "Testcase for the DEBUG_RODATA feature" + depends on DEBUG_RODATA +- help ++ default y ++ ---help--- + This option enables a testcase for the DEBUG_RODATA + feature as well as for the change_page_attr() infrastructure. + If in doubt, say "N" +@@ -125,7 +128,7 @@ config DEBUG_RODATA_TEST + config DEBUG_NX_TEST + tristate "Testcase for the NX non-executable stack feature" + depends on DEBUG_KERNEL && m +- help ++ ---help--- + This option enables a testcase for the CPU NX capability + and the software setup of this feature. + If in doubt, say "N" +@@ -133,7 +136,8 @@ config DEBUG_NX_TEST + config 4KSTACKS + bool "Use 4Kb for kernel stacks instead of 8Kb" + depends on X86_32 +- help ++ default y ++ ---help--- + If you say Y here the kernel will use a 4Kb stacksize for the + kernel stack attached to each process/thread. 
This facilitates + running more threads on a system and also reduces the pressure +@@ -144,7 +148,7 @@ config DOUBLEFAULT + default y + bool "Enable doublefault exception handler" if EMBEDDED + depends on X86_32 +- help ++ ---help--- + This option allows trapping of rare doublefault exceptions that + would otherwise cause a system to silently reboot. Disabling this + option saves about 4k and might cause you much additional grey +@@ -154,7 +158,7 @@ config IOMMU_DEBUG + bool "Enable IOMMU debugging" + depends on GART_IOMMU && DEBUG_KERNEL + depends on X86_64 +- help ++ ---help--- + Force the IOMMU to on even when you have less than 4GB of + memory and add debugging code. On overflow always panic. And + allow to enable IOMMU leak tracing. Can be disabled at boot +@@ -170,7 +174,7 @@ config IOMMU_LEAK + bool "IOMMU leak tracing" + depends on DEBUG_KERNEL + depends on IOMMU_DEBUG +- help ++ ---help--- + Add a simple leak tracer to the IOMMU code. This is useful when you + are debugging a buggy device driver that leaks IOMMU mappings. + +@@ -203,25 +207,25 @@ choice + + config IO_DELAY_0X80 + bool "port 0x80 based port-IO delay [recommended]" +- help ++ ---help--- + This is the traditional Linux IO delay used for in/out_p. + It is the most tested hence safest selection here. + + config IO_DELAY_0XED + bool "port 0xed based port-IO delay" +- help ++ ---help--- + Use port 0xed as the IO delay. This frees up port 0x80 which is + often used as a hardware-debug port. + + config IO_DELAY_UDELAY + bool "udelay based port-IO delay" +- help ++ ---help--- + Use udelay(2) as the IO delay method. This provides the delay + while not having any side-effect on the IO port space. + + config IO_DELAY_NONE + bool "no port-IO delay" +- help ++ ---help--- + No port-IO delay. Will break on old boxes that require port-IO + delay for certain operations. Should work on most new machines. + +@@ -255,18 +259,18 @@ config DEBUG_BOOT_PARAMS + bool "Debug boot parameters" + depends on DEBUG_KERNEL + depends on DEBUG_FS +- help ++ ---help--- + This option will cause struct boot_params to be exported via debugfs. + + config CPA_DEBUG + bool "CPA self-test code" + depends on DEBUG_KERNEL +- help ++ ---help--- + Do change_page_attr() self-tests every 30 seconds. + + config OPTIMIZE_INLINING + bool "Allow gcc to uninline functions marked 'inline'" +- help ++ ---help--- + This option determines if the kernel forces gcc to inline the functions + developers have marked 'inline'. Doing so takes away freedom from gcc to + do what it thinks is best, which is desirable for the gcc 3.x series of +@@ -279,4 +283,3 @@ config OPTIMIZE_INLINING + If unsure, say N. 
+ + endmenu +- +Index: linux-2.6-tip/arch/x86/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/Makefile ++++ linux-2.6-tip/arch/x86/Makefile +@@ -70,14 +70,22 @@ else + # this works around some issues with generating unwind tables in older gccs + # newer gccs do it by default + KBUILD_CFLAGS += -maccumulate-outgoing-args ++endif + +- stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh +- stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \ +- "$(CC)" -fstack-protector ) +- stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \ +- "$(CC)" -fstack-protector-all ) ++ifdef CONFIG_CC_STACKPROTECTOR ++ cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh ++ ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y) ++ stackp-y := -fstack-protector ++ stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all ++ KBUILD_CFLAGS += $(stackp-y) ++ else ++ $(warning stack protector enabled but no compiler support) ++ endif ++endif + +- KBUILD_CFLAGS += $(stackp-y) ++# Don't unroll struct assignments with kmemcheck enabled ++ifeq ($(CONFIG_KMEMCHECK),y) ++ KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) + endif + + # Stackpointer is addressed different for 32 bit and 64 bit x86 +@@ -102,29 +110,6 @@ KBUILD_CFLAGS += -fno-asynchronous-unwin + # prevent gcc from generating any FP code by mistake + KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) + +-### +-# Sub architecture support +-# fcore-y is linked before mcore-y files. +- +-# Default subarch .c files +-mcore-y := arch/x86/mach-default/ +- +-# Voyager subarch support +-mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager +-mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ +- +-# generic subarchitecture +-mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic +-fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ +-mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ +- +-# default subarch .h files +-mflags-y += -Iarch/x86/include/asm/mach-default +- +-# 64 bit does not support subarch support - clear sub arch variables +-fcore-$(CONFIG_X86_64) := +-mcore-$(CONFIG_X86_64) := +- + KBUILD_CFLAGS += $(mflags-y) + KBUILD_AFLAGS += $(mflags-y) + +@@ -150,9 +135,6 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/ + core-y += arch/x86/kernel/ + core-y += arch/x86/mm/ + +-# Remaining sub architecture files +-core-y += $(mcore-y) +- + core-y += arch/x86/crypto/ + core-y += arch/x86/vdso/ + core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ +@@ -176,34 +158,23 @@ endif + + boot := arch/x86/boot + +-PHONY += zImage bzImage compressed zlilo bzlilo \ +- zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install ++BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install ++ ++PHONY += bzImage $(BOOT_TARGETS) + + # Default kernel to build + all: bzImage + + # KBUILD_IMAGE specify target image being built +- KBUILD_IMAGE := $(boot)/bzImage +-zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage ++KBUILD_IMAGE := $(boot)/bzImage + +-zImage bzImage: vmlinux ++bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot + $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ + +-compressed: zImage +- +-zlilo bzlilo: vmlinux +- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo +- +-zdisk bzdisk: vmlinux +- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk +- +-fdimage 
fdimage144 fdimage288 isoimage: vmlinux +- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ +- +-install: +- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install ++$(BOOT_TARGETS): vmlinux ++ $(Q)$(MAKE) $(build)=$(boot) $@ + + PHONY += vdso_install + vdso_install: +@@ -228,7 +199,3 @@ define archhelp + echo ' FDARGS="..." arguments for the booted kernel' + echo ' FDINITRD=file initrd for the booted kernel' + endef +- +-CLEAN_FILES += arch/x86/boot/fdimage \ +- arch/x86/boot/image.iso \ +- arch/x86/boot/mtools.conf +Index: linux-2.6-tip/arch/x86/boot/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/Makefile ++++ linux-2.6-tip/arch/x86/boot/Makefile +@@ -6,33 +6,30 @@ + # for more details. + # + # Copyright (C) 1994 by Linus Torvalds ++# Changed by many, many contributors over the years. + # + + # ROOT_DEV specifies the default root-device when making the image. + # This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case + # the default of FLOPPY is used by 'build'. + +-ROOT_DEV := CURRENT ++ROOT_DEV := CURRENT + + # If you want to preset the SVGA mode, uncomment the next line and + # set SVGA_MODE to whatever number you want. + # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. + # The number is the same as you would ordinarily press at bootup. + +-SVGA_MODE := -DSVGA_MODE=NORMAL_VGA ++SVGA_MODE := -DSVGA_MODE=NORMAL_VGA + +-# If you want the RAM disk device, define this to be the size in blocks. +- +-#RAMDISK := -DRAMDISK=512 +- +-targets := vmlinux.bin setup.bin setup.elf zImage bzImage ++targets := vmlinux.bin setup.bin setup.elf bzImage ++targets += fdimage fdimage144 fdimage288 image.iso mtools.conf + subdir- := compressed + + setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o + setup-y += header.o main.o mca.o memory.o pm.o pmjump.o + setup-y += printf.o string.o tty.o video.o video-mode.o version.o + setup-$(CONFIG_X86_APM_BOOT) += apm.o +-setup-$(CONFIG_X86_VOYAGER) += voyager.o + + # The link order of the video-*.o modules can matter. In particular, + # video-vga.o *must* be listed first, followed by video-vesa.o. 
+@@ -72,17 +69,13 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os + KBUILD_CFLAGS += $(call cc-option,-m32) + KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ + +-$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) +-$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ +-$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +-$(obj)/bzImage: BUILDFLAGS := -b ++$(obj)/bzImage: asflags-y := $(SVGA_MODE) + + quiet_cmd_image = BUILD $@ +-cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \ +- $(obj)/vmlinux.bin $(ROOT_DEV) > $@ ++cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ ++ $(ROOT_DEV) > $@ + +-$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ +- $(obj)/vmlinux.bin $(obj)/tools/build FORCE ++$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE + $(call if_changed,image) + @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' + +@@ -117,9 +110,11 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE + $(obj)/compressed/vmlinux: FORCE + $(Q)$(MAKE) $(build)=$(obj)/compressed $@ + +-# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel ++# Set this if you want to pass append arguments to the ++# bzdisk/fdimage/isoimage kernel + FDARGS = +-# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel ++# Set this if you want an initrd included with the ++# bzdisk/fdimage/isoimage kernel + FDINITRD = + + image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) +@@ -128,7 +123,7 @@ $(obj)/mtools.conf: $(src)/mtools.conf.i + sed -e 's|@OBJ@|$(obj)|g' < $< > $@ + + # This requires write access to /dev/fd0 +-zdisk: $(BOOTIMAGE) $(obj)/mtools.conf ++bzdisk: $(obj)/bzImage $(obj)/mtools.conf + MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync + syslinux /dev/fd0 ; sync + echo '$(image_cmdline)' | \ +@@ -136,10 +131,10 @@ zdisk: $(BOOTIMAGE) $(obj)/mtools.conf + if [ -f '$(FDINITRD)' ] ; then \ + MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ + fi +- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync ++ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync + + # These require being root or having syslinux 2.02 or higher installed +-fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf ++fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf + dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 + MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync + syslinux $(obj)/fdimage ; sync +@@ -148,9 +143,9 @@ fdimage fdimage144: $(BOOTIMAGE) $(obj)/ + if [ -f '$(FDINITRD)' ] ; then \ + MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ + fi +- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync ++ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync + +-fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf ++fdimage288: $(obj)/bzImage $(obj)/mtools.conf + dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 + MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync + syslinux $(obj)/fdimage ; sync +@@ -159,9 +154,9 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.c + if [ -f '$(FDINITRD)' ] ; then \ + MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ + fi +- MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync ++ MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync + +-isoimage: $(BOOTIMAGE) ++isoimage: $(obj)/bzImage + -rm -rf $(obj)/isoimage + mkdir $(obj)/isoimage + for i in lib lib64 share end ; do \ +@@ -171,7 +166,7 @@ isoimage: $(BOOTIMAGE) + fi ; \ + if [ $$i = end ] ; then exit 1 ; fi ; \ + done +- cp $(BOOTIMAGE) 
$(obj)/isoimage/linux ++ cp $(obj)/bzImage $(obj)/isoimage/linux + echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg + if [ -f '$(FDINITRD)' ] ; then \ + cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ +@@ -182,12 +177,13 @@ isoimage: $(BOOTIMAGE) + isohybrid $(obj)/image.iso 2>/dev/null || true + rm -rf $(obj)/isoimage + +-zlilo: $(BOOTIMAGE) ++bzlilo: $(obj)/bzImage + if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi + if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi +- cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz ++ cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz + cp System.map $(INSTALL_PATH)/ + if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi + + install: +- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" ++ sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ ++ System.map "$(INSTALL_PATH)" +Index: linux-2.6-tip/arch/x86/boot/a20.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/a20.c ++++ linux-2.6-tip/arch/x86/boot/a20.c +@@ -2,6 +2,7 @@ + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright 2007-2008 rPath, Inc. - All Rights Reserved ++ * Copyright 2009 Intel Corporation + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. +@@ -15,16 +16,23 @@ + #include "boot.h" + + #define MAX_8042_LOOPS 100000 ++#define MAX_8042_FF 32 + + static int empty_8042(void) + { + u8 status; + int loops = MAX_8042_LOOPS; ++ int ffs = MAX_8042_FF; + + while (loops--) { + io_delay(); + + status = inb(0x64); ++ if (status == 0xff) { ++ /* FF is a plausible, but very unlikely status */ ++ if (!--ffs) ++ return -1; /* Assume no KBC present */ ++ } + if (status & 1) { + /* Read and discard input data */ + io_delay(); +@@ -118,44 +126,37 @@ static void enable_a20_fast(void) + + int enable_a20(void) + { +-#if defined(CONFIG_X86_ELAN) +- /* Elan croaks if we try to touch the KBC */ +- enable_a20_fast(); +- while (!a20_test_long()) +- ; +- return 0; +-#elif defined(CONFIG_X86_VOYAGER) +- /* On Voyager, a20_test() is unsafe? */ +- enable_a20_kbc(); +- return 0; +-#else + int loops = A20_ENABLE_LOOPS; +- while (loops--) { +- /* First, check to see if A20 is already enabled +- (legacy free, etc.) */ +- if (a20_test_short()) +- return 0; +- +- /* Next, try the BIOS (INT 0x15, AX=0x2401) */ +- enable_a20_bios(); +- if (a20_test_short()) +- return 0; +- +- /* Try enabling A20 through the keyboard controller */ +- empty_8042(); +- if (a20_test_short()) +- return 0; /* BIOS worked, but with delayed reaction */ ++ int kbc_err; + +- enable_a20_kbc(); +- if (a20_test_long()) +- return 0; +- +- /* Finally, try enabling the "fast A20 gate" */ +- enable_a20_fast(); +- if (a20_test_long()) +- return 0; +- } +- +- return -1; +-#endif ++ while (loops--) { ++ /* First, check to see if A20 is already enabled ++ (legacy free, etc.) 
*/ ++ if (a20_test_short()) ++ return 0; ++ ++ /* Next, try the BIOS (INT 0x15, AX=0x2401) */ ++ enable_a20_bios(); ++ if (a20_test_short()) ++ return 0; ++ ++ /* Try enabling A20 through the keyboard controller */ ++ kbc_err = empty_8042(); ++ ++ if (a20_test_short()) ++ return 0; /* BIOS worked, but with delayed reaction */ ++ ++ if (!kbc_err) { ++ enable_a20_kbc(); ++ if (a20_test_long()) ++ return 0; ++ } ++ ++ /* Finally, try enabling the "fast A20 gate" */ ++ enable_a20_fast(); ++ if (a20_test_long()) ++ return 0; ++ } ++ ++ return -1; + } +Index: linux-2.6-tip/arch/x86/boot/boot.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/boot.h ++++ linux-2.6-tip/arch/x86/boot/boot.h +@@ -302,9 +302,6 @@ void probe_cards(int unsafe); + /* video-vesa.c */ + void vesa_store_edid(void); + +-/* voyager.c */ +-int query_voyager(void); +- + #endif /* __ASSEMBLY__ */ + + #endif /* BOOT_BOOT_H */ +Index: linux-2.6-tip/arch/x86/boot/compressed/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/compressed/Makefile ++++ linux-2.6-tip/arch/x86/boot/compressed/Makefile +@@ -4,7 +4,7 @@ + # create a compressed vmlinux image from the original vmlinux + # + +-targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o ++targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o + + KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 + KBUILD_CFLAGS += -fno-strict-aliasing -fPIC +@@ -47,18 +47,35 @@ ifeq ($(CONFIG_X86_32),y) + ifdef CONFIG_RELOCATABLE + $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE + $(call if_changed,gzip) ++$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE ++ $(call if_changed,bzip2) ++$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE ++ $(call if_changed,lzma) + else + $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) ++$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE ++ $(call if_changed,bzip2) ++$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE ++ $(call if_changed,lzma) + endif + LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T + + else ++ + $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) ++$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE ++ $(call if_changed,bzip2) ++$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE ++ $(call if_changed,lzma) + + LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T + endif + +-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE ++suffix_$(CONFIG_KERNEL_GZIP) = gz ++suffix_$(CONFIG_KERNEL_BZIP2) = bz2 ++suffix_$(CONFIG_KERNEL_LZMA) = lzma ++ ++$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE + $(call if_changed,ld) +Index: linux-2.6-tip/arch/x86/boot/compressed/head_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/compressed/head_32.S ++++ linux-2.6-tip/arch/x86/boot/compressed/head_32.S +@@ -25,14 +25,12 @@ + + #include + #include +-#include ++#include + #include + #include + + .section ".text.head","ax",@progbits +- .globl startup_32 +- +-startup_32: ++ENTRY(startup_32) + cld + /* test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments */ +@@ -113,6 +111,8 @@ startup_32: + */ + leal relocated(%ebx), %eax + jmp *%eax ++ENDPROC(startup_32) ++ + .section ".text" + relocated: + +Index: linux-2.6-tip/arch/x86/boot/compressed/head_64.S 
+=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/compressed/head_64.S ++++ linux-2.6-tip/arch/x86/boot/compressed/head_64.S +@@ -26,8 +26,8 @@ + + #include + #include +-#include +-#include ++#include ++#include + #include + #include + #include +@@ -35,9 +35,7 @@ + + .section ".text.head" + .code32 +- .globl startup_32 +- +-startup_32: ++ENTRY(startup_32) + cld + /* test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments */ +@@ -176,6 +174,7 @@ startup_32: + + /* Jump from 32bit compatibility mode into 64bit mode. */ + lret ++ENDPROC(startup_32) + + no_longmode: + /* This isn't an x86-64 CPU so hang */ +@@ -295,7 +294,6 @@ relocated: + call decompress_kernel + popq %rsi + +- + /* + * Jump to the decompressed kernel. + */ +Index: linux-2.6-tip/arch/x86/boot/compressed/misc.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/compressed/misc.c ++++ linux-2.6-tip/arch/x86/boot/compressed/misc.c +@@ -116,71 +116,13 @@ + /* + * gzip declarations + */ +- +-#define OF(args) args + #define STATIC static + + #undef memset + #undef memcpy + #define memzero(s, n) memset((s), 0, (n)) + +-typedef unsigned char uch; +-typedef unsigned short ush; +-typedef unsigned long ulg; +- +-/* +- * Window size must be at least 32k, and a power of two. +- * We don't actually have a window just a huge output buffer, +- * so we report a 2G window size, as that should always be +- * larger than our output buffer: +- */ +-#define WSIZE 0x80000000 +- +-/* Input buffer: */ +-static unsigned char *inbuf; +- +-/* Sliding window buffer (and final output buffer): */ +-static unsigned char *window; + +-/* Valid bytes in inbuf: */ +-static unsigned insize; +- +-/* Index of next byte to be processed in inbuf: */ +-static unsigned inptr; +- +-/* Bytes in output buffer: */ +-static unsigned outcnt; +- +-/* gzip flag byte */ +-#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ +-#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gz file */ +-#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +-#define ORIG_NAM 0x08 /* bit 3 set: original file name present */ +-#define COMMENT 0x10 /* bit 4 set: file comment present */ +-#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ +-#define RESERVED 0xC0 /* bit 6, 7: reserved */ +- +-#define get_byte() (inptr < insize ? 
inbuf[inptr++] : fill_inbuf()) +- +-/* Diagnostic functions */ +-#ifdef DEBUG +-# define Assert(cond, msg) do { if (!(cond)) error(msg); } while (0) +-# define Trace(x) do { fprintf x; } while (0) +-# define Tracev(x) do { if (verbose) fprintf x ; } while (0) +-# define Tracevv(x) do { if (verbose > 1) fprintf x ; } while (0) +-# define Tracec(c, x) do { if (verbose && (c)) fprintf x ; } while (0) +-# define Tracecv(c, x) do { if (verbose > 1 && (c)) fprintf x ; } while (0) +-#else +-# define Assert(cond, msg) +-# define Trace(x) +-# define Tracev(x) +-# define Tracevv(x) +-# define Tracec(c, x) +-# define Tracecv(c, x) +-#endif +- +-static int fill_inbuf(void); +-static void flush_window(void); + static void error(char *m); + + /* +@@ -189,13 +131,8 @@ static void error(char *m); + static struct boot_params *real_mode; /* Pointer to real-mode data */ + static int quiet; + +-extern unsigned char input_data[]; +-extern int input_len; +- +-static long bytes_out; +- + static void *memset(void *s, int c, unsigned n); +-static void *memcpy(void *dest, const void *src, unsigned n); ++void *memcpy(void *dest, const void *src, unsigned n); + + static void __putstr(int, const char *); + #define putstr(__x) __putstr(0, __x) +@@ -213,7 +150,17 @@ static char *vidmem; + static int vidport; + static int lines, cols; + +-#include "../../../../lib/inflate.c" ++#ifdef CONFIG_KERNEL_GZIP ++#include "../../../../lib/decompress_inflate.c" ++#endif ++ ++#ifdef CONFIG_KERNEL_BZIP2 ++#include "../../../../lib/decompress_bunzip2.c" ++#endif ++ ++#ifdef CONFIG_KERNEL_LZMA ++#include "../../../../lib/decompress_unlzma.c" ++#endif + + static void scroll(void) + { +@@ -282,7 +229,7 @@ static void *memset(void *s, int c, unsi + return s; + } + +-static void *memcpy(void *dest, const void *src, unsigned n) ++void *memcpy(void *dest, const void *src, unsigned n) + { + int i; + const char *s = src; +@@ -293,38 +240,6 @@ static void *memcpy(void *dest, const vo + return dest; + } + +-/* =========================================================================== +- * Fill the input buffer. This is called only when the buffer is empty +- * and at least one byte is really needed. +- */ +-static int fill_inbuf(void) +-{ +- error("ran out of input data"); +- return 0; +-} +- +-/* =========================================================================== +- * Write the output window window[0..outcnt-1] and update crc and bytes_out. +- * (Used for the decompressed data only.) +- */ +-static void flush_window(void) +-{ +- /* With my window equal to my output buffer +- * I only need to compute the crc here. +- */ +- unsigned long c = crc; /* temporary variable */ +- unsigned n; +- unsigned char *in, ch; +- +- in = window; +- for (n = 0; n < outcnt; n++) { +- ch = *in++; +- c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); +- } +- crc = c; +- bytes_out += (unsigned long)outcnt; +- outcnt = 0; +-} + + static void error(char *x) + { +@@ -407,12 +322,8 @@ asmlinkage void decompress_kernel(void * + lines = real_mode->screen_info.orig_video_lines; + cols = real_mode->screen_info.orig_video_cols; + +- window = output; /* Output buffer (Normally at 1M) */ + free_mem_ptr = heap; /* Heap */ + free_mem_end_ptr = heap + BOOT_HEAP_SIZE; +- inbuf = input_data; /* Input buffer */ +- insize = input_len; +- inptr = 0; + + #ifdef CONFIG_X86_64 + if ((unsigned long)output & (__KERNEL_ALIGN - 1)) +@@ -430,10 +341,9 @@ asmlinkage void decompress_kernel(void * + #endif + #endif + +- makecrc(); + if (!quiet) + putstr("\nDecompressing Linux... 
"); +- gunzip(); ++ decompress(input_data, input_len, NULL, NULL, output, NULL, error); + parse_elf(output); + if (!quiet) + putstr("done.\nBooting the kernel.\n"); +Index: linux-2.6-tip/arch/x86/boot/copy.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/copy.S ++++ linux-2.6-tip/arch/x86/boot/copy.S +@@ -8,6 +8,8 @@ + * + * ----------------------------------------------------------------------- */ + ++#include ++ + /* + * Memory copy routines + */ +@@ -15,9 +17,7 @@ + .code16gcc + .text + +- .globl memcpy +- .type memcpy, @function +-memcpy: ++GLOBAL(memcpy) + pushw %si + pushw %di + movw %ax, %di +@@ -31,11 +31,9 @@ memcpy: + popw %di + popw %si + ret +- .size memcpy, .-memcpy ++ENDPROC(memcpy) + +- .globl memset +- .type memset, @function +-memset: ++GLOBAL(memset) + pushw %di + movw %ax, %di + movzbl %dl, %eax +@@ -48,52 +46,42 @@ memset: + rep; stosb + popw %di + ret +- .size memset, .-memset ++ENDPROC(memset) + +- .globl copy_from_fs +- .type copy_from_fs, @function +-copy_from_fs: ++GLOBAL(copy_from_fs) + pushw %ds + pushw %fs + popw %ds + call memcpy + popw %ds + ret +- .size copy_from_fs, .-copy_from_fs ++ENDPROC(copy_from_fs) + +- .globl copy_to_fs +- .type copy_to_fs, @function +-copy_to_fs: ++GLOBAL(copy_to_fs) + pushw %es + pushw %fs + popw %es + call memcpy + popw %es + ret +- .size copy_to_fs, .-copy_to_fs ++ENDPROC(copy_to_fs) + + #if 0 /* Not currently used, but can be enabled as needed */ +- +- .globl copy_from_gs +- .type copy_from_gs, @function +-copy_from_gs: ++GLOBAL(copy_from_gs) + pushw %ds + pushw %gs + popw %ds + call memcpy + popw %ds + ret +- .size copy_from_gs, .-copy_from_gs +- .globl copy_to_gs ++ENDPROC(copy_from_gs) + +- .type copy_to_gs, @function +-copy_to_gs: ++GLOBAL(copy_to_gs) + pushw %es + pushw %gs + popw %es + call memcpy + popw %es + ret +- .size copy_to_gs, .-copy_to_gs +- ++ENDPROC(copy_to_gs) + #endif +Index: linux-2.6-tip/arch/x86/boot/header.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/header.S ++++ linux-2.6-tip/arch/x86/boot/header.S +@@ -19,17 +19,13 @@ + #include + #include + #include +-#include ++#include + #include + #include "boot.h" + #include "offsets.h" + +-SETUPSECTS = 4 /* default nr of setup-sectors */ + BOOTSEG = 0x07C0 /* original address of boot-sector */ +-SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ +-SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ +- /* to be loaded */ +-ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ ++SYSSEG = 0x1000 /* historical load address >> 4 */ + + #ifndef SVGA_MODE + #define SVGA_MODE ASK_VGA +@@ -97,12 +93,12 @@ bugger_off_msg: + .section ".header", "a" + .globl hdr + hdr: +-setup_sects: .byte SETUPSECTS ++setup_sects: .byte 0 /* Filled in by build.c */ + root_flags: .word ROOT_RDONLY +-syssize: .long SYSSIZE +-ram_size: .word RAMDISK ++syssize: .long 0 /* Filled in by build.c */ ++ram_size: .word 0 /* Obsolete */ + vid_mode: .word SVGA_MODE +-root_dev: .word ROOT_DEV ++root_dev: .word 0 /* Filled in by build.c */ + boot_flag: .word 0xAA55 + + # offset 512, entry point +@@ -123,14 +119,15 @@ _start: + # or else old loadlin-1.5 will fail) + .globl realmode_swtch + realmode_swtch: .word 0, 0 # default_switch, SETUPSEG +-start_sys_seg: .word SYSSEG ++start_sys_seg: .word SYSSEG # obsolete and meaningless, but just ++ # in case something decided to "use" it + .word kernel_version-512 # pointing to kernel version string + # above section of header 
is compatible + # with loadlin-1.5 (header v1.5). Don't + # change it. + +-type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, +- # Bootlin, SYSLX, bootsect...) ++type_of_loader: .byte 0 # 0 means ancient bootloader, newer ++ # bootloaders know to change this. + # See Documentation/i386/boot.txt for + # assigned ids + +@@ -142,11 +139,7 @@ CAN_USE_HEAP = 0x80 # If set, the load + # space behind setup.S can be used for + # heap purposes. + # Only the loader knows what is free +-#ifndef __BIG_KERNEL__ +- .byte 0 +-#else + .byte LOADED_HIGH +-#endif + + setup_move_size: .word 0x8000 # size to move, when setup is not + # loaded at 0x90000. We will move setup +@@ -157,11 +150,7 @@ setup_move_size: .word 0x8000 # size t + + code32_start: # here loaders can put a different + # start address for 32-bit code. +-#ifndef __BIG_KERNEL__ +- .long 0x1000 # 0x1000 = default for zImage +-#else + .long 0x100000 # 0x100000 = default for big kernel +-#endif + + ramdisk_image: .long 0 # address of loaded ramdisk image + # Here the loader puts the 32-bit +Index: linux-2.6-tip/arch/x86/boot/main.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/main.c ++++ linux-2.6-tip/arch/x86/boot/main.c +@@ -149,11 +149,6 @@ void main(void) + /* Query MCA information */ + query_mca(); + +- /* Voyager */ +-#ifdef CONFIG_X86_VOYAGER +- query_voyager(); +-#endif +- + /* Query Intel SpeedStep (IST) information */ + query_ist(); + +Index: linux-2.6-tip/arch/x86/boot/pm.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/pm.c ++++ linux-2.6-tip/arch/x86/boot/pm.c +@@ -33,47 +33,6 @@ static void realmode_switch_hook(void) + } + + /* +- * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000. +- * A bzImage kernel is loaded and runs at 0x100000. +- */ +-static void move_kernel_around(void) +-{ +- /* Note: rely on the compile-time option here rather than +- the LOADED_HIGH flag. The Qemu kernel loader unconditionally +- sets the loadflags to zero. */ +-#ifndef __BIG_KERNEL__ +- u16 dst_seg, src_seg; +- u32 syssize; +- +- dst_seg = 0x1000 >> 4; +- src_seg = 0x10000 >> 4; +- syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */ +- +- while (syssize) { +- int paras = (syssize >= 0x1000) ? 0x1000 : syssize; +- int dwords = paras << 2; +- +- asm volatile("pushw %%es ; " +- "pushw %%ds ; " +- "movw %1,%%es ; " +- "movw %2,%%ds ; " +- "xorw %%di,%%di ; " +- "xorw %%si,%%si ; " +- "rep;movsl ; " +- "popw %%ds ; " +- "popw %%es" +- : "+c" (dwords) +- : "r" (dst_seg), "r" (src_seg) +- : "esi", "edi"); +- +- syssize -= paras; +- dst_seg += paras; +- src_seg += paras; +- } +-#endif +-} +- +-/* + * Disable all interrupts at the legacy PIC. 
+ */ + static void mask_all_interrupts(void) +@@ -147,9 +106,6 @@ void go_to_protected_mode(void) + /* Hook before leaving real mode, also disables interrupts */ + realmode_switch_hook(); + +- /* Move the kernel/setup to their final resting places */ +- move_kernel_around(); +- + /* Enable the A20 gate */ + if (enable_a20()) { + puts("A20 gate not responding, unable to boot...\n"); +Index: linux-2.6-tip/arch/x86/boot/pmjump.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/pmjump.S ++++ linux-2.6-tip/arch/x86/boot/pmjump.S +@@ -15,18 +15,15 @@ + #include + #include + #include ++#include + + .text +- +- .globl protected_mode_jump +- .type protected_mode_jump, @function +- + .code16 + + /* + * void protected_mode_jump(u32 entrypoint, u32 bootparams); + */ +-protected_mode_jump: ++GLOBAL(protected_mode_jump) + movl %edx, %esi # Pointer to boot_params table + + xorl %ebx, %ebx +@@ -47,12 +44,11 @@ protected_mode_jump: + .byte 0x66, 0xea # ljmpl opcode + 2: .long in_pm32 # offset + .word __BOOT_CS # segment +- +- .size protected_mode_jump, .-protected_mode_jump ++ENDPROC(protected_mode_jump) + + .code32 +- .type in_pm32, @function +-in_pm32: ++ .section ".text32","ax" ++GLOBAL(in_pm32) + # Set up data segments for flat 32-bit mode + movl %ecx, %ds + movl %ecx, %es +@@ -78,5 +74,4 @@ in_pm32: + lldt %cx + + jmpl *%eax # Jump to the 32-bit entrypoint +- +- .size in_pm32, .-in_pm32 ++ENDPROC(in_pm32) +Index: linux-2.6-tip/arch/x86/boot/setup.ld +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/setup.ld ++++ linux-2.6-tip/arch/x86/boot/setup.ld +@@ -17,7 +17,8 @@ SECTIONS + .header : { *(.header) } + .inittext : { *(.inittext) } + .initdata : { *(.initdata) } +- .text : { *(.text*) } ++ .text : { *(.text) } ++ .text32 : { *(.text32) } + + . = ALIGN(16); + .rodata : { *(.rodata*) } +Index: linux-2.6-tip/arch/x86/boot/tools/build.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/tools/build.c ++++ linux-2.6-tip/arch/x86/boot/tools/build.c +@@ -130,7 +130,7 @@ static void die(const char * str, ...) + + static void usage(void) + { +- die("Usage: build [-b] setup system [rootdev] [> image]"); ++ die("Usage: build setup system [rootdev] [> image]"); + } + + int main(int argc, char ** argv) +@@ -145,11 +145,6 @@ int main(int argc, char ** argv) + void *kernel; + u32 crc = 0xffffffffUL; + +- if (argc > 2 && !strcmp(argv[1], "-b")) +- { +- is_big_kernel = 1; +- argc--, argv++; +- } + if ((argc < 3) || (argc > 4)) + usage(); + if (argc > 3) { +@@ -216,8 +211,6 @@ int main(int argc, char ** argv) + die("Unable to mmap '%s': %m", argv[2]); + /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ + sys_size = (sz + 15 + 4) / 16; +- if (!is_big_kernel && sys_size > DEF_SYSSIZE) +- die("System is too big. Try using bzImage or modules."); + + /* Patch the setup code with the appropriate size parameters */ + buf[0x1f1] = setup_sectors-1; +Index: linux-2.6-tip/arch/x86/boot/video-mode.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/video-mode.c ++++ linux-2.6-tip/arch/x86/boot/video-mode.c +@@ -147,7 +147,7 @@ static void vga_recalc_vertical(void) + int set_mode(u16 mode) + { + int rv; +- u16 real_mode; ++ u16 uninitialized_var(real_mode); + + /* Very special mode numbers... 
*/ + if (mode == VIDEO_CURRENT_MODE) +Index: linux-2.6-tip/arch/x86/boot/video-vga.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/video-vga.c ++++ linux-2.6-tip/arch/x86/boot/video-vga.c +@@ -129,41 +129,45 @@ u16 vga_crtc(void) + return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4; + } + +-static void vga_set_480_scanlines(int end) ++static void vga_set_480_scanlines(int lines) + { +- u16 crtc; +- u8 csel; ++ u16 crtc; /* CRTC base address */ ++ u8 csel; /* CRTC miscellaneous output register */ ++ u8 ovfw; /* CRTC overflow register */ ++ int end = lines-1; + + crtc = vga_crtc(); + ++ ovfw = 0x3c | ((end >> (8-1)) & 0x02) | ((end >> (9-6)) & 0x40); ++ + out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */ + out_idx(0x0b, crtc, 0x06); /* Vertical total */ +- out_idx(0x3e, crtc, 0x07); /* Vertical overflow */ ++ out_idx(ovfw, crtc, 0x07); /* Vertical overflow */ + out_idx(0xea, crtc, 0x10); /* Vertical sync start */ +- out_idx(end, crtc, 0x12); /* Vertical display end */ ++ out_idx(end, crtc, 0x12); /* Vertical display end */ + out_idx(0xe7, crtc, 0x15); /* Vertical blank start */ + out_idx(0x04, crtc, 0x16); /* Vertical blank end */ + csel = inb(0x3cc); + csel &= 0x0d; + csel |= 0xe2; +- outb(csel, 0x3cc); ++ outb(csel, 0x3c2); + } + + static void vga_set_80x30(void) + { +- vga_set_480_scanlines(0xdf); ++ vga_set_480_scanlines(30*16); + } + + static void vga_set_80x34(void) + { + vga_set_14font(); +- vga_set_480_scanlines(0xdb); ++ vga_set_480_scanlines(34*14); + } + + static void vga_set_80x60(void) + { + vga_set_8font(); +- vga_set_480_scanlines(0xdf); ++ vga_set_480_scanlines(60*8); + } + + static int vga_set_mode(struct mode_info *mode) +Index: linux-2.6-tip/arch/x86/boot/voyager.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/boot/voyager.c ++++ /dev/null +@@ -1,40 +0,0 @@ +-/* -*- linux-c -*- ------------------------------------------------------- * +- * +- * Copyright (C) 1991, 1992 Linus Torvalds +- * Copyright 2007 rPath, Inc. - All Rights Reserved +- * +- * This file is part of the Linux kernel, and is made available under +- * the terms of the GNU General Public License version 2. +- * +- * ----------------------------------------------------------------------- */ +- +-/* +- * Get the Voyager config information +- */ +- +-#include "boot.h" +- +-int query_voyager(void) +-{ +- u8 err; +- u16 es, di; +- /* Abuse the apm_bios_info area for this */ +- u8 *data_ptr = (u8 *)&boot_params.apm_bios_info; +- +- data_ptr[0] = 0xff; /* Flag on config not found(?) 
*/ +- +- asm("pushw %%es ; " +- "int $0x15 ; " +- "setc %0 ; " +- "movw %%es, %1 ; " +- "popw %%es" +- : "=q" (err), "=r" (es), "=D" (di) +- : "a" (0xffc0)); +- +- if (err) +- return -1; /* Not Voyager */ +- +- set_fs(es); +- copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */ +- return 0; +-} +Index: linux-2.6-tip/arch/x86/configs/i386_defconfig +=================================================================== +--- linux-2.6-tip.orig/arch/x86/configs/i386_defconfig ++++ linux-2.6-tip/arch/x86/configs/i386_defconfig +@@ -1,14 +1,13 @@ + # + # Automatically generated make config: don't edit +-# Linux kernel version: 2.6.27-rc5 +-# Wed Sep 3 17:23:09 2008 ++# Linux kernel version: 2.6.29-rc4 ++# Tue Feb 24 15:50:58 2009 + # + # CONFIG_64BIT is not set + CONFIG_X86_32=y + # CONFIG_X86_64 is not set + CONFIG_X86=y + CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" +-# CONFIG_GENERIC_LOCKBREAK is not set + CONFIG_GENERIC_TIME=y + CONFIG_GENERIC_CMOS_UPDATE=y + CONFIG_CLOCKSOURCE_WATCHDOG=y +@@ -24,16 +23,14 @@ CONFIG_GENERIC_ISA_DMA=y + CONFIG_GENERIC_IOMAP=y + CONFIG_GENERIC_BUG=y + CONFIG_GENERIC_HWEIGHT=y +-# CONFIG_GENERIC_GPIO is not set + CONFIG_ARCH_MAY_HAVE_PC_FDC=y + # CONFIG_RWSEM_GENERIC_SPINLOCK is not set + CONFIG_RWSEM_XCHGADD_ALGORITHM=y +-# CONFIG_ARCH_HAS_ILOG2_U32 is not set +-# CONFIG_ARCH_HAS_ILOG2_U64 is not set + CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y + CONFIG_GENERIC_CALIBRATE_DELAY=y + # CONFIG_GENERIC_TIME_VSYSCALL is not set + CONFIG_ARCH_HAS_CPU_RELAX=y ++CONFIG_ARCH_HAS_DEFAULT_IDLE=y + CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y + CONFIG_HAVE_SETUP_PER_CPU_AREA=y + # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set +@@ -42,12 +39,12 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y + # CONFIG_ZONE_DMA32 is not set + CONFIG_ARCH_POPULATES_NODE_MAP=y + # CONFIG_AUDIT_ARCH is not set +-CONFIG_ARCH_SUPPORTS_AOUT=y + CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y + CONFIG_GENERIC_HARDIRQS=y + CONFIG_GENERIC_IRQ_PROBE=y + CONFIG_GENERIC_PENDING_IRQ=y + CONFIG_X86_SMP=y ++CONFIG_USE_GENERIC_SMP_HELPERS=y + CONFIG_X86_32_SMP=y + CONFIG_X86_HT=y + CONFIG_X86_BIOS_REBOOT=y +@@ -76,30 +73,44 @@ CONFIG_TASK_IO_ACCOUNTING=y + CONFIG_AUDIT=y + CONFIG_AUDITSYSCALL=y + CONFIG_AUDIT_TREE=y ++ ++# ++# RCU Subsystem ++# ++# CONFIG_CLASSIC_RCU is not set ++CONFIG_TREE_RCU=y ++# CONFIG_PREEMPT_RCU is not set ++# CONFIG_RCU_TRACE is not set ++CONFIG_RCU_FANOUT=32 ++# CONFIG_RCU_FANOUT_EXACT is not set ++# CONFIG_TREE_RCU_TRACE is not set ++# CONFIG_PREEMPT_RCU_TRACE is not set + # CONFIG_IKCONFIG is not set + CONFIG_LOG_BUF_SHIFT=18 +-CONFIG_CGROUPS=y +-# CONFIG_CGROUP_DEBUG is not set +-CONFIG_CGROUP_NS=y +-# CONFIG_CGROUP_DEVICE is not set +-CONFIG_CPUSETS=y + CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + CONFIG_GROUP_SCHED=y + CONFIG_FAIR_GROUP_SCHED=y + # CONFIG_RT_GROUP_SCHED is not set + # CONFIG_USER_SCHED is not set + CONFIG_CGROUP_SCHED=y ++CONFIG_CGROUPS=y ++# CONFIG_CGROUP_DEBUG is not set ++CONFIG_CGROUP_NS=y ++CONFIG_CGROUP_FREEZER=y ++# CONFIG_CGROUP_DEVICE is not set ++CONFIG_CPUSETS=y ++CONFIG_PROC_PID_CPUSET=y + CONFIG_CGROUP_CPUACCT=y + CONFIG_RESOURCE_COUNTERS=y + # CONFIG_CGROUP_MEM_RES_CTLR is not set + # CONFIG_SYSFS_DEPRECATED_V2 is not set +-CONFIG_PROC_PID_CPUSET=y + CONFIG_RELAY=y + CONFIG_NAMESPACES=y + CONFIG_UTS_NS=y + CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y ++CONFIG_NET_NS=y + CONFIG_BLK_DEV_INITRD=y + CONFIG_INITRAMFS_SOURCE="" + CONFIG_CC_OPTIMIZE_FOR_SIZE=y +@@ -124,12 +135,15 @@ CONFIG_SIGNALFD=y + CONFIG_TIMERFD=y + CONFIG_EVENTFD=y + CONFIG_SHMEM=y ++CONFIG_AIO=y + 
CONFIG_VM_EVENT_COUNTERS=y ++CONFIG_PCI_QUIRKS=y + CONFIG_SLUB_DEBUG=y + # CONFIG_SLAB is not set + CONFIG_SLUB=y + # CONFIG_SLOB is not set + CONFIG_PROFILING=y ++CONFIG_TRACEPOINTS=y + CONFIG_MARKERS=y + # CONFIG_OPROFILE is not set + CONFIG_HAVE_OPROFILE=y +@@ -139,15 +153,10 @@ CONFIG_KRETPROBES=y + CONFIG_HAVE_IOREMAP_PROT=y + CONFIG_HAVE_KPROBES=y + CONFIG_HAVE_KRETPROBES=y +-# CONFIG_HAVE_ARCH_TRACEHOOK is not set +-# CONFIG_HAVE_DMA_ATTRS is not set +-CONFIG_USE_GENERIC_SMP_HELPERS=y +-# CONFIG_HAVE_CLK is not set +-CONFIG_PROC_PAGE_MONITOR=y ++CONFIG_HAVE_ARCH_TRACEHOOK=y + CONFIG_HAVE_GENERIC_DMA_COHERENT=y + CONFIG_SLABINFO=y + CONFIG_RT_MUTEXES=y +-# CONFIG_TINY_SHMEM is not set + CONFIG_BASE_SMALL=0 + CONFIG_MODULES=y + # CONFIG_MODULE_FORCE_LOAD is not set +@@ -155,12 +164,10 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_MODVERSIONS is not set + # CONFIG_MODULE_SRCVERSION_ALL is not set +-CONFIG_KMOD=y + CONFIG_STOP_MACHINE=y + CONFIG_BLOCK=y + # CONFIG_LBD is not set + CONFIG_BLK_DEV_IO_TRACE=y +-# CONFIG_LSF is not set + CONFIG_BLK_DEV_BSG=y + # CONFIG_BLK_DEV_INTEGRITY is not set + +@@ -176,7 +183,7 @@ CONFIG_IOSCHED_CFQ=y + CONFIG_DEFAULT_CFQ=y + # CONFIG_DEFAULT_NOOP is not set + CONFIG_DEFAULT_IOSCHED="cfq" +-CONFIG_CLASSIC_RCU=y ++CONFIG_FREEZER=y + + # + # Processor type and features +@@ -186,15 +193,14 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_GENERIC_CLOCKEVENTS_BUILD=y + CONFIG_SMP=y ++CONFIG_SPARSE_IRQ=y + CONFIG_X86_FIND_SMP_CONFIG=y + CONFIG_X86_MPPARSE=y +-CONFIG_X86_PC=y + # CONFIG_X86_ELAN is not set +-# CONFIG_X86_VOYAGER is not set + # CONFIG_X86_GENERICARCH is not set + # CONFIG_X86_VSMP is not set + # CONFIG_X86_RDC321X is not set +-CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y ++CONFIG_SCHED_OMIT_FRAME_POINTER=y + # CONFIG_PARAVIRT_GUEST is not set + # CONFIG_MEMTEST is not set + # CONFIG_M386 is not set +@@ -238,10 +244,19 @@ CONFIG_X86_TSC=y + CONFIG_X86_CMOV=y + CONFIG_X86_MINIMUM_CPU_FAMILY=4 + CONFIG_X86_DEBUGCTLMSR=y ++CONFIG_CPU_SUP_INTEL=y ++CONFIG_CPU_SUP_CYRIX_32=y ++CONFIG_CPU_SUP_AMD=y ++CONFIG_CPU_SUP_CENTAUR_32=y ++CONFIG_CPU_SUP_TRANSMETA_32=y ++CONFIG_CPU_SUP_UMC_32=y ++CONFIG_X86_DS=y ++CONFIG_X86_PTRACE_BTS=y + CONFIG_HPET_TIMER=y + CONFIG_HPET_EMULATE_RTC=y + CONFIG_DMI=y + # CONFIG_IOMMU_HELPER is not set ++# CONFIG_IOMMU_API is not set + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y + CONFIG_SCHED_MC=y +@@ -250,12 +265,17 @@ CONFIG_PREEMPT_VOLUNTARY=y + # CONFIG_PREEMPT is not set + CONFIG_X86_LOCAL_APIC=y + CONFIG_X86_IO_APIC=y +-# CONFIG_X86_MCE is not set ++CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y ++CONFIG_X86_MCE=y ++CONFIG_X86_MCE_NONFATAL=y ++CONFIG_X86_MCE_P4THERMAL=y + CONFIG_VM86=y + # CONFIG_TOSHIBA is not set + # CONFIG_I8K is not set + CONFIG_X86_REBOOTFIXUPS=y + CONFIG_MICROCODE=y ++CONFIG_MICROCODE_INTEL=y ++CONFIG_MICROCODE_AMD=y + CONFIG_MICROCODE_OLD_INTERFACE=y + CONFIG_X86_MSR=y + CONFIG_X86_CPUID=y +@@ -264,6 +284,7 @@ CONFIG_HIGHMEM4G=y + # CONFIG_HIGHMEM64G is not set + CONFIG_PAGE_OFFSET=0xC0000000 + CONFIG_HIGHMEM=y ++# CONFIG_ARCH_PHYS_ADDR_T_64BIT is not set + CONFIG_ARCH_FLATMEM_ENABLE=y + CONFIG_ARCH_SPARSEMEM_ENABLE=y + CONFIG_ARCH_SELECT_MEMORY_MODEL=y +@@ -274,14 +295,17 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_FLATMEM=y + CONFIG_FLAT_NODE_MEM_MAP=y + CONFIG_SPARSEMEM_STATIC=y +-# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set + CONFIG_PAGEFLAGS_EXTENDED=y + CONFIG_SPLIT_PTLOCK_CPUS=4 +-CONFIG_RESOURCES_64BIT=y ++# CONFIG_PHYS_ADDR_T_64BIT is not set + CONFIG_ZONE_DMA_FLAG=1 + 
CONFIG_BOUNCE=y + CONFIG_VIRT_TO_BUS=y ++CONFIG_UNEVICTABLE_LRU=y + CONFIG_HIGHPTE=y ++CONFIG_X86_CHECK_BIOS_CORRUPTION=y ++CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y ++CONFIG_X86_RESERVE_LOW_64K=y + # CONFIG_MATH_EMULATION is not set + CONFIG_MTRR=y + # CONFIG_MTRR_SANITIZER is not set +@@ -302,10 +326,11 @@ CONFIG_PHYSICAL_START=0x1000000 + CONFIG_PHYSICAL_ALIGN=0x200000 + CONFIG_HOTPLUG_CPU=y + # CONFIG_COMPAT_VDSO is not set ++# CONFIG_CMDLINE_BOOL is not set + CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y + + # +-# Power management options ++# Power management and ACPI options + # + CONFIG_PM=y + CONFIG_PM_DEBUG=y +@@ -331,19 +356,13 @@ CONFIG_ACPI_BATTERY=y + CONFIG_ACPI_BUTTON=y + CONFIG_ACPI_FAN=y + CONFIG_ACPI_DOCK=y +-# CONFIG_ACPI_BAY is not set + CONFIG_ACPI_PROCESSOR=y + CONFIG_ACPI_HOTPLUG_CPU=y + CONFIG_ACPI_THERMAL=y +-# CONFIG_ACPI_WMI is not set +-# CONFIG_ACPI_ASUS is not set +-# CONFIG_ACPI_TOSHIBA is not set + # CONFIG_ACPI_CUSTOM_DSDT is not set + CONFIG_ACPI_BLACKLIST_YEAR=0 + # CONFIG_ACPI_DEBUG is not set +-CONFIG_ACPI_EC=y + # CONFIG_ACPI_PCI_SLOT is not set +-CONFIG_ACPI_POWER=y + CONFIG_ACPI_SYSTEM=y + CONFIG_X86_PM_TIMER=y + CONFIG_ACPI_CONTAINER=y +@@ -388,7 +407,6 @@ CONFIG_X86_ACPI_CPUFREQ=y + # + # shared options + # +-# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set + # CONFIG_X86_SPEEDSTEP_LIB is not set + CONFIG_CPU_IDLE=y + CONFIG_CPU_IDLE_GOV_LADDER=y +@@ -415,6 +433,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y + CONFIG_PCI_MSI=y + # CONFIG_PCI_LEGACY is not set + # CONFIG_PCI_DEBUG is not set ++# CONFIG_PCI_STUB is not set + CONFIG_HT_IRQ=y + CONFIG_ISA_DMA_API=y + # CONFIG_ISA is not set +@@ -452,13 +471,17 @@ CONFIG_HOTPLUG_PCI=y + # Executable file formats / Emulations + # + CONFIG_BINFMT_ELF=y ++CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y ++CONFIG_HAVE_AOUT=y + # CONFIG_BINFMT_AOUT is not set + CONFIG_BINFMT_MISC=y ++CONFIG_HAVE_ATOMIC_IOMAP=y + CONFIG_NET=y + + # + # Networking options + # ++CONFIG_COMPAT_NET_DEV_OPS=y + CONFIG_PACKET=y + CONFIG_PACKET_MMAP=y + CONFIG_UNIX=y +@@ -519,7 +542,6 @@ CONFIG_DEFAULT_CUBIC=y + # CONFIG_DEFAULT_RENO is not set + CONFIG_DEFAULT_TCP_CONG="cubic" + CONFIG_TCP_MD5SIG=y +-# CONFIG_IP_VS is not set + CONFIG_IPV6=y + # CONFIG_IPV6_PRIVACY is not set + # CONFIG_IPV6_ROUTER_PREF is not set +@@ -557,19 +579,21 @@ CONFIG_NF_CONNTRACK_IRC=y + CONFIG_NF_CONNTRACK_SIP=y + CONFIG_NF_CT_NETLINK=y + CONFIG_NETFILTER_XTABLES=y ++CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y + CONFIG_NETFILTER_XT_TARGET_MARK=y + CONFIG_NETFILTER_XT_TARGET_NFLOG=y + CONFIG_NETFILTER_XT_TARGET_SECMARK=y +-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y + CONFIG_NETFILTER_XT_TARGET_TCPMSS=y + CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y + CONFIG_NETFILTER_XT_MATCH_MARK=y + CONFIG_NETFILTER_XT_MATCH_POLICY=y + CONFIG_NETFILTER_XT_MATCH_STATE=y ++# CONFIG_IP_VS is not set + + # + # IP: Netfilter Configuration + # ++CONFIG_NF_DEFRAG_IPV4=y + CONFIG_NF_CONNTRACK_IPV4=y + CONFIG_NF_CONNTRACK_PROC_COMPAT=y + CONFIG_IP_NF_IPTABLES=y +@@ -595,8 +619,8 @@ CONFIG_IP_NF_MANGLE=y + CONFIG_NF_CONNTRACK_IPV6=y + CONFIG_IP6_NF_IPTABLES=y + CONFIG_IP6_NF_MATCH_IPV6HEADER=y +-CONFIG_IP6_NF_FILTER=y + CONFIG_IP6_NF_TARGET_LOG=y ++CONFIG_IP6_NF_FILTER=y + CONFIG_IP6_NF_TARGET_REJECT=y + CONFIG_IP6_NF_MANGLE=y + # CONFIG_IP_DCCP is not set +@@ -604,6 +628,7 @@ CONFIG_IP6_NF_MANGLE=y + # CONFIG_TIPC is not set + # CONFIG_ATM is not set + # CONFIG_BRIDGE is not set ++# CONFIG_NET_DSA is not set + # CONFIG_VLAN_8021Q is not set + # CONFIG_DECNET is not set + CONFIG_LLC=y +@@ -623,6 +648,7 @@ 
CONFIG_NET_SCHED=y + # CONFIG_NET_SCH_HTB is not set + # CONFIG_NET_SCH_HFSC is not set + # CONFIG_NET_SCH_PRIO is not set ++# CONFIG_NET_SCH_MULTIQ is not set + # CONFIG_NET_SCH_RED is not set + # CONFIG_NET_SCH_SFQ is not set + # CONFIG_NET_SCH_TEQL is not set +@@ -630,6 +656,7 @@ CONFIG_NET_SCHED=y + # CONFIG_NET_SCH_GRED is not set + # CONFIG_NET_SCH_DSMARK is not set + # CONFIG_NET_SCH_NETEM is not set ++# CONFIG_NET_SCH_DRR is not set + # CONFIG_NET_SCH_INGRESS is not set + + # +@@ -644,6 +671,7 @@ CONFIG_NET_CLS=y + # CONFIG_NET_CLS_RSVP is not set + # CONFIG_NET_CLS_RSVP6 is not set + # CONFIG_NET_CLS_FLOW is not set ++# CONFIG_NET_CLS_CGROUP is not set + CONFIG_NET_EMATCH=y + CONFIG_NET_EMATCH_STACK=32 + # CONFIG_NET_EMATCH_CMP is not set +@@ -659,7 +687,9 @@ CONFIG_NET_CLS_ACT=y + # CONFIG_NET_ACT_NAT is not set + # CONFIG_NET_ACT_PEDIT is not set + # CONFIG_NET_ACT_SIMP is not set ++# CONFIG_NET_ACT_SKBEDIT is not set + CONFIG_NET_SCH_FIFO=y ++# CONFIG_DCB is not set + + # + # Network testing +@@ -676,29 +706,33 @@ CONFIG_HAMRADIO=y + # CONFIG_IRDA is not set + # CONFIG_BT is not set + # CONFIG_AF_RXRPC is not set ++# CONFIG_PHONET is not set + CONFIG_FIB_RULES=y +- +-# +-# Wireless +-# ++CONFIG_WIRELESS=y + CONFIG_CFG80211=y ++# CONFIG_CFG80211_REG_DEBUG is not set + CONFIG_NL80211=y ++CONFIG_WIRELESS_OLD_REGULATORY=y + CONFIG_WIRELESS_EXT=y + CONFIG_WIRELESS_EXT_SYSFS=y ++# CONFIG_LIB80211 is not set + CONFIG_MAC80211=y + + # + # Rate control algorithm selection + # +-CONFIG_MAC80211_RC_PID=y +-CONFIG_MAC80211_RC_DEFAULT_PID=y +-CONFIG_MAC80211_RC_DEFAULT="pid" ++CONFIG_MAC80211_RC_MINSTREL=y ++# CONFIG_MAC80211_RC_DEFAULT_PID is not set ++CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y ++CONFIG_MAC80211_RC_DEFAULT="minstrel" + # CONFIG_MAC80211_MESH is not set + CONFIG_MAC80211_LEDS=y + # CONFIG_MAC80211_DEBUGFS is not set + # CONFIG_MAC80211_DEBUG_MENU is not set +-# CONFIG_IEEE80211 is not set +-# CONFIG_RFKILL is not set ++# CONFIG_WIMAX is not set ++CONFIG_RFKILL=y ++# CONFIG_RFKILL_INPUT is not set ++CONFIG_RFKILL_LEDS=y + # CONFIG_NET_9P is not set + + # +@@ -722,7 +756,7 @@ CONFIG_PROC_EVENTS=y + # CONFIG_MTD is not set + # CONFIG_PARPORT is not set + CONFIG_PNP=y +-# CONFIG_PNP_DEBUG is not set ++CONFIG_PNP_DEBUG_MESSAGES=y + + # + # Protocols +@@ -750,20 +784,19 @@ CONFIG_BLK_DEV_RAM_SIZE=16384 + CONFIG_MISC_DEVICES=y + # CONFIG_IBM_ASM is not set + # CONFIG_PHANTOM is not set +-# CONFIG_EEPROM_93CX6 is not set + # CONFIG_SGI_IOC4 is not set + # CONFIG_TIFM_CORE is not set +-# CONFIG_ACER_WMI is not set +-# CONFIG_ASUS_LAPTOP is not set +-# CONFIG_FUJITSU_LAPTOP is not set +-# CONFIG_TC1100_WMI is not set +-# CONFIG_MSI_LAPTOP is not set +-# CONFIG_COMPAL_LAPTOP is not set +-# CONFIG_SONY_LAPTOP is not set +-# CONFIG_THINKPAD_ACPI is not set +-# CONFIG_INTEL_MENLOW is not set ++# CONFIG_ICS932S401 is not set + # CONFIG_ENCLOSURE_SERVICES is not set + # CONFIG_HP_ILO is not set ++# CONFIG_C2PORT is not set ++ ++# ++# EEPROM support ++# ++# CONFIG_EEPROM_AT24 is not set ++# CONFIG_EEPROM_LEGACY is not set ++# CONFIG_EEPROM_93CX6 is not set + CONFIG_HAVE_IDE=y + # CONFIG_IDE is not set + +@@ -802,7 +835,7 @@ CONFIG_SCSI_WAIT_SCAN=m + # + CONFIG_SCSI_SPI_ATTRS=y + # CONFIG_SCSI_FC_ATTRS is not set +-CONFIG_SCSI_ISCSI_ATTRS=y ++# CONFIG_SCSI_ISCSI_ATTRS is not set + # CONFIG_SCSI_SAS_ATTRS is not set + # CONFIG_SCSI_SAS_LIBSAS is not set + # CONFIG_SCSI_SRP_ATTRS is not set +@@ -875,6 +908,7 @@ CONFIG_PATA_OLDPIIX=y + CONFIG_PATA_SCH=y + CONFIG_MD=y + CONFIG_BLK_DEV_MD=y 
++CONFIG_MD_AUTODETECT=y + # CONFIG_MD_LINEAR is not set + # CONFIG_MD_RAID0 is not set + # CONFIG_MD_RAID1 is not set +@@ -930,6 +964,9 @@ CONFIG_PHYLIB=y + # CONFIG_BROADCOM_PHY is not set + # CONFIG_ICPLUS_PHY is not set + # CONFIG_REALTEK_PHY is not set ++# CONFIG_NATIONAL_PHY is not set ++# CONFIG_STE10XP is not set ++# CONFIG_LSI_ET1011C_PHY is not set + # CONFIG_FIXED_PHY is not set + # CONFIG_MDIO_BITBANG is not set + CONFIG_NET_ETHERNET=y +@@ -953,6 +990,9 @@ CONFIG_NET_TULIP=y + # CONFIG_IBM_NEW_EMAC_RGMII is not set + # CONFIG_IBM_NEW_EMAC_TAH is not set + # CONFIG_IBM_NEW_EMAC_EMAC4 is not set ++# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set ++# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set ++# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set + CONFIG_NET_PCI=y + # CONFIG_PCNET32 is not set + # CONFIG_AMD8111_ETH is not set +@@ -960,7 +1000,6 @@ CONFIG_NET_PCI=y + # CONFIG_B44 is not set + CONFIG_FORCEDETH=y + # CONFIG_FORCEDETH_NAPI is not set +-# CONFIG_EEPRO100 is not set + CONFIG_E100=y + # CONFIG_FEALNX is not set + # CONFIG_NATSEMI is not set +@@ -974,15 +1013,16 @@ CONFIG_8139TOO=y + # CONFIG_R6040 is not set + # CONFIG_SIS900 is not set + # CONFIG_EPIC100 is not set ++# CONFIG_SMSC9420 is not set + # CONFIG_SUNDANCE is not set + # CONFIG_TLAN is not set + # CONFIG_VIA_RHINE is not set + # CONFIG_SC92031 is not set ++# CONFIG_ATL2 is not set + CONFIG_NETDEV_1000=y + # CONFIG_ACENIC is not set + # CONFIG_DL2K is not set + CONFIG_E1000=y +-# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set + CONFIG_E1000E=y + # CONFIG_IP1000 is not set + # CONFIG_IGB is not set +@@ -1000,18 +1040,23 @@ CONFIG_BNX2=y + # CONFIG_QLA3XXX is not set + # CONFIG_ATL1 is not set + # CONFIG_ATL1E is not set ++# CONFIG_JME is not set + CONFIG_NETDEV_10000=y + # CONFIG_CHELSIO_T1 is not set ++CONFIG_CHELSIO_T3_DEPENDS=y + # CONFIG_CHELSIO_T3 is not set ++# CONFIG_ENIC is not set + # CONFIG_IXGBE is not set + # CONFIG_IXGB is not set + # CONFIG_S2IO is not set + # CONFIG_MYRI10GE is not set + # CONFIG_NETXEN_NIC is not set + # CONFIG_NIU is not set ++# CONFIG_MLX4_EN is not set + # CONFIG_MLX4_CORE is not set + # CONFIG_TEHUTI is not set + # CONFIG_BNX2X is not set ++# CONFIG_QLGE is not set + # CONFIG_SFC is not set + CONFIG_TR=y + # CONFIG_IBMOL is not set +@@ -1025,9 +1070,8 @@ CONFIG_TR=y + # CONFIG_WLAN_PRE80211 is not set + CONFIG_WLAN_80211=y + # CONFIG_PCMCIA_RAYCS is not set +-# CONFIG_IPW2100 is not set +-# CONFIG_IPW2200 is not set + # CONFIG_LIBERTAS is not set ++# CONFIG_LIBERTAS_THINFIRM is not set + # CONFIG_AIRO is not set + # CONFIG_HERMES is not set + # CONFIG_ATMEL is not set +@@ -1044,6 +1088,8 @@ CONFIG_WLAN_80211=y + CONFIG_ATH5K=y + # CONFIG_ATH5K_DEBUG is not set + # CONFIG_ATH9K is not set ++# CONFIG_IPW2100 is not set ++# CONFIG_IPW2200 is not set + # CONFIG_IWLCORE is not set + # CONFIG_IWLWIFI_LEDS is not set + # CONFIG_IWLAGN is not set +@@ -1055,6 +1101,10 @@ CONFIG_ATH5K=y + # CONFIG_RT2X00 is not set + + # ++# Enable WiMAX (Networking options) to see the WiMAX drivers ++# ++ ++# + # USB Network Adapters + # + # CONFIG_USB_CATC is not set +@@ -1062,6 +1112,7 @@ CONFIG_ATH5K=y + # CONFIG_USB_PEGASUS is not set + # CONFIG_USB_RTL8150 is not set + # CONFIG_USB_USBNET is not set ++# CONFIG_USB_HSO is not set + CONFIG_NET_PCMCIA=y + # CONFIG_PCMCIA_3C589 is not set + # CONFIG_PCMCIA_3C574 is not set +@@ -1123,6 +1174,7 @@ CONFIG_MOUSE_PS2_LOGIPS2PP=y + CONFIG_MOUSE_PS2_SYNAPTICS=y + CONFIG_MOUSE_PS2_LIFEBOOK=y + CONFIG_MOUSE_PS2_TRACKPOINT=y ++# CONFIG_MOUSE_PS2_ELANTECH is not 
set + # CONFIG_MOUSE_PS2_TOUCHKIT is not set + # CONFIG_MOUSE_SERIAL is not set + # CONFIG_MOUSE_APPLETOUCH is not set +@@ -1160,15 +1212,16 @@ CONFIG_INPUT_TOUCHSCREEN=y + # CONFIG_TOUCHSCREEN_FUJITSU is not set + # CONFIG_TOUCHSCREEN_GUNZE is not set + # CONFIG_TOUCHSCREEN_ELO is not set ++# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set + # CONFIG_TOUCHSCREEN_MTOUCH is not set + # CONFIG_TOUCHSCREEN_INEXIO is not set + # CONFIG_TOUCHSCREEN_MK712 is not set + # CONFIG_TOUCHSCREEN_PENMOUNT is not set + # CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set + # CONFIG_TOUCHSCREEN_TOUCHWIN is not set +-# CONFIG_TOUCHSCREEN_UCB1400 is not set + # CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set + # CONFIG_TOUCHSCREEN_TOUCHIT213 is not set ++# CONFIG_TOUCHSCREEN_TSC2007 is not set + CONFIG_INPUT_MISC=y + # CONFIG_INPUT_PCSPKR is not set + # CONFIG_INPUT_APANEL is not set +@@ -1179,6 +1232,7 @@ CONFIG_INPUT_MISC=y + # CONFIG_INPUT_KEYSPAN_REMOTE is not set + # CONFIG_INPUT_POWERMATE is not set + # CONFIG_INPUT_YEALINK is not set ++# CONFIG_INPUT_CM109 is not set + # CONFIG_INPUT_UINPUT is not set + + # +@@ -1245,6 +1299,7 @@ CONFIG_SERIAL_CORE=y + CONFIG_SERIAL_CORE_CONSOLE=y + # CONFIG_SERIAL_JSM is not set + CONFIG_UNIX98_PTYS=y ++# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set + # CONFIG_LEGACY_PTYS is not set + # CONFIG_IPMI_HANDLER is not set + CONFIG_HW_RANDOM=y +@@ -1279,6 +1334,7 @@ CONFIG_I2C=y + CONFIG_I2C_BOARDINFO=y + # CONFIG_I2C_CHARDEV is not set + CONFIG_I2C_HELPER_AUTO=y ++CONFIG_I2C_ALGOBIT=y + + # + # I2C Hardware Bus support +@@ -1331,8 +1387,6 @@ CONFIG_I2C_I801=y + # Miscellaneous I2C Chip support + # + # CONFIG_DS1682 is not set +-# CONFIG_EEPROM_AT24 is not set +-# CONFIG_EEPROM_LEGACY is not set + # CONFIG_SENSORS_PCF8574 is not set + # CONFIG_PCF8575 is not set + # CONFIG_SENSORS_PCA9539 is not set +@@ -1351,8 +1405,78 @@ CONFIG_POWER_SUPPLY=y + # CONFIG_POWER_SUPPLY_DEBUG is not set + # CONFIG_PDA_POWER is not set + # CONFIG_BATTERY_DS2760 is not set +-# CONFIG_HWMON is not set ++# CONFIG_BATTERY_BQ27x00 is not set ++CONFIG_HWMON=y ++# CONFIG_HWMON_VID is not set ++# CONFIG_SENSORS_ABITUGURU is not set ++# CONFIG_SENSORS_ABITUGURU3 is not set ++# CONFIG_SENSORS_AD7414 is not set ++# CONFIG_SENSORS_AD7418 is not set ++# CONFIG_SENSORS_ADM1021 is not set ++# CONFIG_SENSORS_ADM1025 is not set ++# CONFIG_SENSORS_ADM1026 is not set ++# CONFIG_SENSORS_ADM1029 is not set ++# CONFIG_SENSORS_ADM1031 is not set ++# CONFIG_SENSORS_ADM9240 is not set ++# CONFIG_SENSORS_ADT7462 is not set ++# CONFIG_SENSORS_ADT7470 is not set ++# CONFIG_SENSORS_ADT7473 is not set ++# CONFIG_SENSORS_ADT7475 is not set ++# CONFIG_SENSORS_K8TEMP is not set ++# CONFIG_SENSORS_ASB100 is not set ++# CONFIG_SENSORS_ATXP1 is not set ++# CONFIG_SENSORS_DS1621 is not set ++# CONFIG_SENSORS_I5K_AMB is not set ++# CONFIG_SENSORS_F71805F is not set ++# CONFIG_SENSORS_F71882FG is not set ++# CONFIG_SENSORS_F75375S is not set ++# CONFIG_SENSORS_FSCHER is not set ++# CONFIG_SENSORS_FSCPOS is not set ++# CONFIG_SENSORS_FSCHMD is not set ++# CONFIG_SENSORS_GL518SM is not set ++# CONFIG_SENSORS_GL520SM is not set ++# CONFIG_SENSORS_CORETEMP is not set ++# CONFIG_SENSORS_IT87 is not set ++# CONFIG_SENSORS_LM63 is not set ++# CONFIG_SENSORS_LM75 is not set ++# CONFIG_SENSORS_LM77 is not set ++# CONFIG_SENSORS_LM78 is not set ++# CONFIG_SENSORS_LM80 is not set ++# CONFIG_SENSORS_LM83 is not set ++# CONFIG_SENSORS_LM85 is not set ++# CONFIG_SENSORS_LM87 is not set ++# CONFIG_SENSORS_LM90 is not set ++# CONFIG_SENSORS_LM92 is not set ++# 
CONFIG_SENSORS_LM93 is not set ++# CONFIG_SENSORS_LTC4245 is not set ++# CONFIG_SENSORS_MAX1619 is not set ++# CONFIG_SENSORS_MAX6650 is not set ++# CONFIG_SENSORS_PC87360 is not set ++# CONFIG_SENSORS_PC87427 is not set ++# CONFIG_SENSORS_SIS5595 is not set ++# CONFIG_SENSORS_DME1737 is not set ++# CONFIG_SENSORS_SMSC47M1 is not set ++# CONFIG_SENSORS_SMSC47M192 is not set ++# CONFIG_SENSORS_SMSC47B397 is not set ++# CONFIG_SENSORS_ADS7828 is not set ++# CONFIG_SENSORS_THMC50 is not set ++# CONFIG_SENSORS_VIA686A is not set ++# CONFIG_SENSORS_VT1211 is not set ++# CONFIG_SENSORS_VT8231 is not set ++# CONFIG_SENSORS_W83781D is not set ++# CONFIG_SENSORS_W83791D is not set ++# CONFIG_SENSORS_W83792D is not set ++# CONFIG_SENSORS_W83793 is not set ++# CONFIG_SENSORS_W83L785TS is not set ++# CONFIG_SENSORS_W83L786NG is not set ++# CONFIG_SENSORS_W83627HF is not set ++# CONFIG_SENSORS_W83627EHF is not set ++# CONFIG_SENSORS_HDAPS is not set ++# CONFIG_SENSORS_LIS3LV02D is not set ++# CONFIG_SENSORS_APPLESMC is not set ++# CONFIG_HWMON_DEBUG_CHIP is not set + CONFIG_THERMAL=y ++# CONFIG_THERMAL_HWMON is not set + CONFIG_WATCHDOG=y + # CONFIG_WATCHDOG_NOWAYOUT is not set + +@@ -1372,6 +1496,7 @@ CONFIG_WATCHDOG=y + # CONFIG_I6300ESB_WDT is not set + # CONFIG_ITCO_WDT is not set + # CONFIG_IT8712F_WDT is not set ++# CONFIG_IT87_WDT is not set + # CONFIG_HP_WATCHDOG is not set + # CONFIG_SC1200_WDT is not set + # CONFIG_PC87413_WDT is not set +@@ -1379,9 +1504,11 @@ CONFIG_WATCHDOG=y + # CONFIG_SBC8360_WDT is not set + # CONFIG_SBC7240_WDT is not set + # CONFIG_CPU5_WDT is not set ++# CONFIG_SMSC_SCH311X_WDT is not set + # CONFIG_SMSC37B787_WDT is not set + # CONFIG_W83627HF_WDT is not set + # CONFIG_W83697HF_WDT is not set ++# CONFIG_W83697UG_WDT is not set + # CONFIG_W83877F_WDT is not set + # CONFIG_W83977F_WDT is not set + # CONFIG_MACHZ_WDT is not set +@@ -1397,11 +1524,11 @@ CONFIG_WATCHDOG=y + # USB-based Watchdog Cards + # + # CONFIG_USBPCWATCHDOG is not set ++CONFIG_SSB_POSSIBLE=y + + # + # Sonics Silicon Backplane + # +-CONFIG_SSB_POSSIBLE=y + # CONFIG_SSB is not set + + # +@@ -1410,7 +1537,13 @@ CONFIG_SSB_POSSIBLE=y + # CONFIG_MFD_CORE is not set + # CONFIG_MFD_SM501 is not set + # CONFIG_HTC_PASIC3 is not set ++# CONFIG_TWL4030_CORE is not set + # CONFIG_MFD_TMIO is not set ++# CONFIG_PMIC_DA903X is not set ++# CONFIG_MFD_WM8400 is not set ++# CONFIG_MFD_WM8350_I2C is not set ++# CONFIG_MFD_PCF50633 is not set ++# CONFIG_REGULATOR is not set + + # + # Multimedia devices +@@ -1450,6 +1583,7 @@ CONFIG_DRM=y + # CONFIG_DRM_I810 is not set + # CONFIG_DRM_I830 is not set + CONFIG_DRM_I915=y ++# CONFIG_DRM_I915_KMS is not set + # CONFIG_DRM_MGA is not set + # CONFIG_DRM_SIS is not set + # CONFIG_DRM_VIA is not set +@@ -1459,6 +1593,7 @@ CONFIG_DRM_I915=y + CONFIG_FB=y + # CONFIG_FIRMWARE_EDID is not set + # CONFIG_FB_DDC is not set ++# CONFIG_FB_BOOT_VESA_SUPPORT is not set + CONFIG_FB_CFB_FILLRECT=y + CONFIG_FB_CFB_COPYAREA=y + CONFIG_FB_CFB_IMAGEBLIT=y +@@ -1487,7 +1622,6 @@ CONFIG_FB_TILEBLITTING=y + # CONFIG_FB_UVESA is not set + # CONFIG_FB_VESA is not set + CONFIG_FB_EFI=y +-# CONFIG_FB_IMAC is not set + # CONFIG_FB_N411 is not set + # CONFIG_FB_HGA is not set + # CONFIG_FB_S1D13XXX is not set +@@ -1503,6 +1637,7 @@ CONFIG_FB_EFI=y + # CONFIG_FB_S3 is not set + # CONFIG_FB_SAVAGE is not set + # CONFIG_FB_SIS is not set ++# CONFIG_FB_VIA is not set + # CONFIG_FB_NEOMAGIC is not set + # CONFIG_FB_KYRO is not set + # CONFIG_FB_3DFX is not set +@@ -1515,12 +1650,15 @@ CONFIG_FB_EFI=y + # 
CONFIG_FB_CARMINE is not set + # CONFIG_FB_GEODE is not set + # CONFIG_FB_VIRTUAL is not set ++# CONFIG_FB_METRONOME is not set ++# CONFIG_FB_MB862XX is not set + CONFIG_BACKLIGHT_LCD_SUPPORT=y + # CONFIG_LCD_CLASS_DEVICE is not set + CONFIG_BACKLIGHT_CLASS_DEVICE=y +-# CONFIG_BACKLIGHT_CORGI is not set ++CONFIG_BACKLIGHT_GENERIC=y + # CONFIG_BACKLIGHT_PROGEAR is not set + # CONFIG_BACKLIGHT_MBP_NVIDIA is not set ++# CONFIG_BACKLIGHT_SAHARA is not set + + # + # Display device support +@@ -1540,10 +1678,12 @@ CONFIG_LOGO=y + # CONFIG_LOGO_LINUX_VGA16 is not set + CONFIG_LOGO_LINUX_CLUT224=y + CONFIG_SOUND=y ++CONFIG_SOUND_OSS_CORE=y + CONFIG_SND=y + CONFIG_SND_TIMER=y + CONFIG_SND_PCM=y + CONFIG_SND_HWDEP=y ++CONFIG_SND_JACK=y + CONFIG_SND_SEQUENCER=y + CONFIG_SND_SEQ_DUMMY=y + CONFIG_SND_OSSEMUL=y +@@ -1551,6 +1691,8 @@ CONFIG_SND_MIXER_OSS=y + CONFIG_SND_PCM_OSS=y + CONFIG_SND_PCM_OSS_PLUGINS=y + CONFIG_SND_SEQUENCER_OSS=y ++CONFIG_SND_HRTIMER=y ++CONFIG_SND_SEQ_HRTIMER_DEFAULT=y + CONFIG_SND_DYNAMIC_MINORS=y + CONFIG_SND_SUPPORT_OLD_API=y + CONFIG_SND_VERBOSE_PROCFS=y +@@ -1605,11 +1747,16 @@ CONFIG_SND_PCI=y + # CONFIG_SND_FM801 is not set + CONFIG_SND_HDA_INTEL=y + CONFIG_SND_HDA_HWDEP=y ++# CONFIG_SND_HDA_RECONFIG is not set ++# CONFIG_SND_HDA_INPUT_BEEP is not set + CONFIG_SND_HDA_CODEC_REALTEK=y + CONFIG_SND_HDA_CODEC_ANALOG=y + CONFIG_SND_HDA_CODEC_SIGMATEL=y + CONFIG_SND_HDA_CODEC_VIA=y + CONFIG_SND_HDA_CODEC_ATIHDMI=y ++CONFIG_SND_HDA_CODEC_NVHDMI=y ++CONFIG_SND_HDA_CODEC_INTELHDMI=y ++CONFIG_SND_HDA_ELD=y + CONFIG_SND_HDA_CODEC_CONEXANT=y + CONFIG_SND_HDA_CODEC_CMEDIA=y + CONFIG_SND_HDA_CODEC_SI3054=y +@@ -1643,6 +1790,7 @@ CONFIG_SND_USB=y + # CONFIG_SND_USB_AUDIO is not set + # CONFIG_SND_USB_USX2Y is not set + # CONFIG_SND_USB_CAIAQ is not set ++# CONFIG_SND_USB_US122L is not set + CONFIG_SND_PCMCIA=y + # CONFIG_SND_VXPOCKET is not set + # CONFIG_SND_PDAUDIOCF is not set +@@ -1657,15 +1805,37 @@ CONFIG_HIDRAW=y + # USB Input Devices + # + CONFIG_USB_HID=y +-CONFIG_USB_HIDINPUT_POWERBOOK=y +-CONFIG_HID_FF=y + CONFIG_HID_PID=y ++CONFIG_USB_HIDDEV=y ++ ++# ++# Special HID drivers ++# ++CONFIG_HID_COMPAT=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y + CONFIG_LOGITECH_FF=y + # CONFIG_LOGIRUMBLEPAD2_FF is not set ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_NTRIG=y ++CONFIG_HID_PANTHERLORD=y + CONFIG_PANTHERLORD_FF=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++# CONFIG_GREENASIA_FF is not set ++CONFIG_HID_TOPSEED=y + CONFIG_THRUSTMASTER_FF=y + CONFIG_ZEROPLUS_FF=y +-CONFIG_USB_HIDDEV=y + CONFIG_USB_SUPPORT=y + CONFIG_USB_ARCH_HAS_HCD=y + CONFIG_USB_ARCH_HAS_OHCI=y +@@ -1683,6 +1853,8 @@ CONFIG_USB_DEVICEFS=y + CONFIG_USB_SUSPEND=y + # CONFIG_USB_OTG is not set + CONFIG_USB_MON=y ++# CONFIG_USB_WUSB is not set ++# CONFIG_USB_WUSB_CBAF is not set + + # + # USB Host Controller Drivers +@@ -1691,6 +1863,7 @@ CONFIG_USB_MON=y + CONFIG_USB_EHCI_HCD=y + # CONFIG_USB_EHCI_ROOT_HUB_TT is not set + # CONFIG_USB_EHCI_TT_NEWSCHED is not set ++# CONFIG_USB_OXU210HP_HCD is not set + # CONFIG_USB_ISP116X_HCD is not set + # CONFIG_USB_ISP1760_HCD is not set + CONFIG_USB_OHCI_HCD=y +@@ -1700,6 +1873,8 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y + CONFIG_USB_UHCI_HCD=y + # CONFIG_USB_SL811_HCD is not set + # CONFIG_USB_R8A66597_HCD is not set ++# CONFIG_USB_WHCI_HCD is not set ++# 
CONFIG_USB_HWA_HCD is not set + + # + # USB Device Class drivers +@@ -1707,20 +1882,20 @@ CONFIG_USB_UHCI_HCD=y + # CONFIG_USB_ACM is not set + CONFIG_USB_PRINTER=y + # CONFIG_USB_WDM is not set ++# CONFIG_USB_TMC is not set + + # +-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' ++# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; + # + + # +-# may also be needed; see USB_STORAGE Help for more information ++# see USB_STORAGE Help for more information + # + CONFIG_USB_STORAGE=y + # CONFIG_USB_STORAGE_DEBUG is not set + # CONFIG_USB_STORAGE_DATAFAB is not set + # CONFIG_USB_STORAGE_FREECOM is not set + # CONFIG_USB_STORAGE_ISD200 is not set +-# CONFIG_USB_STORAGE_DPCM is not set + # CONFIG_USB_STORAGE_USBAT is not set + # CONFIG_USB_STORAGE_SDDR09 is not set + # CONFIG_USB_STORAGE_SDDR55 is not set +@@ -1728,7 +1903,6 @@ CONFIG_USB_STORAGE=y + # CONFIG_USB_STORAGE_ALAUDA is not set + # CONFIG_USB_STORAGE_ONETOUCH is not set + # CONFIG_USB_STORAGE_KARMA is not set +-# CONFIG_USB_STORAGE_SIERRA is not set + # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set + CONFIG_USB_LIBUSUAL=y + +@@ -1749,6 +1923,7 @@ CONFIG_USB_LIBUSUAL=y + # CONFIG_USB_EMI62 is not set + # CONFIG_USB_EMI26 is not set + # CONFIG_USB_ADUTUX is not set ++# CONFIG_USB_SEVSEG is not set + # CONFIG_USB_RIO500 is not set + # CONFIG_USB_LEGOTOWER is not set + # CONFIG_USB_LCD is not set +@@ -1766,7 +1941,13 @@ CONFIG_USB_LIBUSUAL=y + # CONFIG_USB_IOWARRIOR is not set + # CONFIG_USB_TEST is not set + # CONFIG_USB_ISIGHTFW is not set ++# CONFIG_USB_VST is not set + # CONFIG_USB_GADGET is not set ++ ++# ++# OTG and related infrastructure ++# ++# CONFIG_UWB is not set + # CONFIG_MMC is not set + # CONFIG_MEMSTICK is not set + CONFIG_NEW_LEDS=y +@@ -1775,6 +1956,7 @@ CONFIG_LEDS_CLASS=y + # + # LED drivers + # ++# CONFIG_LEDS_ALIX2 is not set + # CONFIG_LEDS_PCA9532 is not set + # CONFIG_LEDS_CLEVO_MAIL is not set + # CONFIG_LEDS_PCA955X is not set +@@ -1785,6 +1967,7 @@ CONFIG_LEDS_CLASS=y + CONFIG_LEDS_TRIGGERS=y + # CONFIG_LEDS_TRIGGER_TIMER is not set + # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set ++# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set + # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + # CONFIG_ACCESSIBILITY is not set + # CONFIG_INFINIBAND is not set +@@ -1824,6 +2007,7 @@ CONFIG_RTC_INTF_DEV=y + # CONFIG_RTC_DRV_M41T80 is not set + # CONFIG_RTC_DRV_S35390A is not set + # CONFIG_RTC_DRV_FM3130 is not set ++# CONFIG_RTC_DRV_RX8581 is not set + + # + # SPI RTC drivers +@@ -1833,12 +2017,15 @@ CONFIG_RTC_INTF_DEV=y + # Platform RTC drivers + # + CONFIG_RTC_DRV_CMOS=y ++# CONFIG_RTC_DRV_DS1286 is not set + # CONFIG_RTC_DRV_DS1511 is not set + # CONFIG_RTC_DRV_DS1553 is not set + # CONFIG_RTC_DRV_DS1742 is not set + # CONFIG_RTC_DRV_STK17TA8 is not set + # CONFIG_RTC_DRV_M48T86 is not set ++# CONFIG_RTC_DRV_M48T35 is not set + # CONFIG_RTC_DRV_M48T59 is not set ++# CONFIG_RTC_DRV_BQ4802 is not set + # CONFIG_RTC_DRV_V3020 is not set + + # +@@ -1851,6 +2038,22 @@ CONFIG_DMADEVICES=y + # + # CONFIG_INTEL_IOATDMA is not set + # CONFIG_UIO is not set ++# CONFIG_STAGING is not set ++CONFIG_X86_PLATFORM_DEVICES=y ++# CONFIG_ACER_WMI is not set ++# CONFIG_ASUS_LAPTOP is not set ++# CONFIG_FUJITSU_LAPTOP is not set ++# CONFIG_TC1100_WMI is not set ++# CONFIG_MSI_LAPTOP is not set ++# CONFIG_PANASONIC_LAPTOP is not set ++# CONFIG_COMPAL_LAPTOP is not set ++# CONFIG_SONY_LAPTOP is not set ++# CONFIG_THINKPAD_ACPI is not set ++# CONFIG_INTEL_MENLOW is not set ++CONFIG_EEEPC_LAPTOP=y ++# CONFIG_ACPI_WMI is not set ++# 
CONFIG_ACPI_ASUS is not set ++# CONFIG_ACPI_TOSHIBA is not set + + # + # Firmware Drivers +@@ -1861,8 +2064,7 @@ CONFIG_EFI_VARS=y + # CONFIG_DELL_RBU is not set + # CONFIG_DCDBAS is not set + CONFIG_DMIID=y +-CONFIG_ISCSI_IBFT_FIND=y +-CONFIG_ISCSI_IBFT=y ++# CONFIG_ISCSI_IBFT_FIND is not set + + # + # File systems +@@ -1872,21 +2074,24 @@ CONFIG_EXT3_FS=y + CONFIG_EXT3_FS_XATTR=y + CONFIG_EXT3_FS_POSIX_ACL=y + CONFIG_EXT3_FS_SECURITY=y +-# CONFIG_EXT4DEV_FS is not set ++# CONFIG_EXT4_FS is not set + CONFIG_JBD=y + # CONFIG_JBD_DEBUG is not set + CONFIG_FS_MBCACHE=y + # CONFIG_REISERFS_FS is not set + # CONFIG_JFS_FS is not set + CONFIG_FS_POSIX_ACL=y ++CONFIG_FILE_LOCKING=y + # CONFIG_XFS_FS is not set + # CONFIG_OCFS2_FS is not set ++# CONFIG_BTRFS_FS is not set + CONFIG_DNOTIFY=y + CONFIG_INOTIFY=y + CONFIG_INOTIFY_USER=y + CONFIG_QUOTA=y + CONFIG_QUOTA_NETLINK_INTERFACE=y + # CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QUOTA_TREE=y + # CONFIG_QFMT_V1 is not set + CONFIG_QFMT_V2=y + CONFIG_QUOTACTL=y +@@ -1920,16 +2125,14 @@ CONFIG_PROC_FS=y + CONFIG_PROC_KCORE=y + CONFIG_PROC_VMCORE=y + CONFIG_PROC_SYSCTL=y ++CONFIG_PROC_PAGE_MONITOR=y + CONFIG_SYSFS=y + CONFIG_TMPFS=y + CONFIG_TMPFS_POSIX_ACL=y + CONFIG_HUGETLBFS=y + CONFIG_HUGETLB_PAGE=y + # CONFIG_CONFIGFS_FS is not set +- +-# +-# Miscellaneous filesystems +-# ++CONFIG_MISC_FILESYSTEMS=y + # CONFIG_ADFS_FS is not set + # CONFIG_AFFS_FS is not set + # CONFIG_ECRYPT_FS is not set +@@ -1939,6 +2142,7 @@ CONFIG_HUGETLB_PAGE=y + # CONFIG_BFS_FS is not set + # CONFIG_EFS_FS is not set + # CONFIG_CRAMFS is not set ++# CONFIG_SQUASHFS is not set + # CONFIG_VXFS_FS is not set + # CONFIG_MINIX_FS is not set + # CONFIG_OMFS_FS is not set +@@ -1960,6 +2164,7 @@ CONFIG_NFS_ACL_SUPPORT=y + CONFIG_NFS_COMMON=y + CONFIG_SUNRPC=y + CONFIG_SUNRPC_GSS=y ++# CONFIG_SUNRPC_REGISTER_V4 is not set + CONFIG_RPCSEC_GSS_KRB5=y + # CONFIG_RPCSEC_GSS_SPKM3 is not set + # CONFIG_SMB_FS is not set +@@ -2036,7 +2241,7 @@ CONFIG_NLS_UTF8=y + # + CONFIG_TRACE_IRQFLAGS_SUPPORT=y + CONFIG_PRINTK_TIME=y +-CONFIG_ENABLE_WARN_DEPRECATED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set + CONFIG_ENABLE_MUST_CHECK=y + CONFIG_FRAME_WARN=2048 + CONFIG_MAGIC_SYSRQ=y +@@ -2066,33 +2271,54 @@ CONFIG_TIMER_STATS=y + CONFIG_DEBUG_BUGVERBOSE=y + # CONFIG_DEBUG_INFO is not set + # CONFIG_DEBUG_VM is not set ++# CONFIG_DEBUG_VIRTUAL is not set + # CONFIG_DEBUG_WRITECOUNT is not set + CONFIG_DEBUG_MEMORY_INIT=y + # CONFIG_DEBUG_LIST is not set + # CONFIG_DEBUG_SG is not set ++# CONFIG_DEBUG_NOTIFIERS is not set ++CONFIG_ARCH_WANT_FRAME_POINTERS=y + CONFIG_FRAME_POINTER=y + # CONFIG_BOOT_PRINTK_DELAY is not set + # CONFIG_RCU_TORTURE_TEST is not set ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set + # CONFIG_KPROBES_SANITY_TEST is not set + # CONFIG_BACKTRACE_SELF_TEST is not set ++# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set + # CONFIG_LKDTM is not set + # CONFIG_FAULT_INJECTION is not set + # CONFIG_LATENCYTOP is not set + CONFIG_SYSCTL_SYSCALL_CHECK=y +-CONFIG_HAVE_FTRACE=y ++CONFIG_USER_STACKTRACE_SUPPORT=y ++CONFIG_HAVE_FUNCTION_TRACER=y ++CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y ++CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y + CONFIG_HAVE_DYNAMIC_FTRACE=y +-# CONFIG_FTRACE is not set ++CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y ++CONFIG_HAVE_HW_BRANCH_TRACER=y ++ ++# ++# Tracers ++# ++# CONFIG_FUNCTION_TRACER is not set + # CONFIG_IRQSOFF_TRACER is not set + # CONFIG_SYSPROF_TRACER is not set + # CONFIG_SCHED_TRACER is not set + # CONFIG_CONTEXT_SWITCH_TRACER is not set ++# CONFIG_BOOT_TRACER is not 
set ++# CONFIG_TRACE_BRANCH_PROFILING is not set ++# CONFIG_POWER_TRACER is not set ++# CONFIG_STACK_TRACER is not set ++# CONFIG_HW_BRANCH_TRACER is not set + CONFIG_PROVIDE_OHCI1394_DMA_INIT=y ++# CONFIG_DYNAMIC_PRINTK_DEBUG is not set + # CONFIG_SAMPLES is not set + CONFIG_HAVE_ARCH_KGDB=y + # CONFIG_KGDB is not set + # CONFIG_STRICT_DEVMEM is not set + CONFIG_X86_VERBOSE_BOOTUP=y + CONFIG_EARLY_PRINTK=y ++CONFIG_EARLY_PRINTK_DBGP=y + CONFIG_DEBUG_STACKOVERFLOW=y + CONFIG_DEBUG_STACK_USAGE=y + # CONFIG_DEBUG_PAGEALLOC is not set +@@ -2123,8 +2349,10 @@ CONFIG_OPTIMIZE_INLINING=y + CONFIG_KEYS=y + CONFIG_KEYS_DEBUG_PROC_KEYS=y + CONFIG_SECURITY=y ++# CONFIG_SECURITYFS is not set + CONFIG_SECURITY_NETWORK=y + # CONFIG_SECURITY_NETWORK_XFRM is not set ++# CONFIG_SECURITY_PATH is not set + CONFIG_SECURITY_FILE_CAPABILITIES=y + # CONFIG_SECURITY_ROOTPLUG is not set + CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 +@@ -2135,7 +2363,6 @@ CONFIG_SECURITY_SELINUX_DISABLE=y + CONFIG_SECURITY_SELINUX_DEVELOP=y + CONFIG_SECURITY_SELINUX_AVC_STATS=y + CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 +-# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set + # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set + # CONFIG_SECURITY_SMACK is not set + CONFIG_CRYPTO=y +@@ -2143,11 +2370,18 @@ CONFIG_CRYPTO=y + # + # Crypto core or helper + # ++# CONFIG_CRYPTO_FIPS is not set + CONFIG_CRYPTO_ALGAPI=y ++CONFIG_CRYPTO_ALGAPI2=y + CONFIG_CRYPTO_AEAD=y ++CONFIG_CRYPTO_AEAD2=y + CONFIG_CRYPTO_BLKCIPHER=y ++CONFIG_CRYPTO_BLKCIPHER2=y + CONFIG_CRYPTO_HASH=y ++CONFIG_CRYPTO_HASH2=y ++CONFIG_CRYPTO_RNG2=y + CONFIG_CRYPTO_MANAGER=y ++CONFIG_CRYPTO_MANAGER2=y + # CONFIG_CRYPTO_GF128MUL is not set + # CONFIG_CRYPTO_NULL is not set + # CONFIG_CRYPTO_CRYPTD is not set +@@ -2182,6 +2416,7 @@ CONFIG_CRYPTO_HMAC=y + # Digest + # + # CONFIG_CRYPTO_CRC32C is not set ++# CONFIG_CRYPTO_CRC32C_INTEL is not set + # CONFIG_CRYPTO_MD4 is not set + CONFIG_CRYPTO_MD5=y + # CONFIG_CRYPTO_MICHAEL_MIC is not set +@@ -2222,6 +2457,11 @@ CONFIG_CRYPTO_DES=y + # + # CONFIG_CRYPTO_DEFLATE is not set + # CONFIG_CRYPTO_LZO is not set ++ ++# ++# Random Number Generation ++# ++# CONFIG_CRYPTO_ANSI_CPRNG is not set + CONFIG_CRYPTO_HW=y + # CONFIG_CRYPTO_DEV_PADLOCK is not set + # CONFIG_CRYPTO_DEV_GEODE is not set +@@ -2239,6 +2479,7 @@ CONFIG_VIRTUALIZATION=y + CONFIG_BITREVERSE=y + CONFIG_GENERIC_FIND_FIRST_BIT=y + CONFIG_GENERIC_FIND_NEXT_BIT=y ++CONFIG_GENERIC_FIND_LAST_BIT=y + # CONFIG_CRC_CCITT is not set + # CONFIG_CRC16 is not set + CONFIG_CRC_T10DIF=y +Index: linux-2.6-tip/arch/x86/configs/x86_64_defconfig +=================================================================== +--- linux-2.6-tip.orig/arch/x86/configs/x86_64_defconfig ++++ linux-2.6-tip/arch/x86/configs/x86_64_defconfig +@@ -1,14 +1,13 @@ + # + # Automatically generated make config: don't edit +-# Linux kernel version: 2.6.27-rc5 +-# Wed Sep 3 17:13:39 2008 ++# Linux kernel version: 2.6.29-rc4 ++# Tue Feb 24 15:44:16 2009 + # + CONFIG_64BIT=y + # CONFIG_X86_32 is not set + CONFIG_X86_64=y + CONFIG_X86=y + CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" +-# CONFIG_GENERIC_LOCKBREAK is not set + CONFIG_GENERIC_TIME=y + CONFIG_GENERIC_CMOS_UPDATE=y + CONFIG_CLOCKSOURCE_WATCHDOG=y +@@ -23,17 +22,16 @@ CONFIG_ZONE_DMA=y + CONFIG_GENERIC_ISA_DMA=y + CONFIG_GENERIC_IOMAP=y + CONFIG_GENERIC_BUG=y ++CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y + CONFIG_GENERIC_HWEIGHT=y +-# CONFIG_GENERIC_GPIO is not set + CONFIG_ARCH_MAY_HAVE_PC_FDC=y + CONFIG_RWSEM_GENERIC_SPINLOCK=y + # 
CONFIG_RWSEM_XCHGADD_ALGORITHM is not set +-# CONFIG_ARCH_HAS_ILOG2_U32 is not set +-# CONFIG_ARCH_HAS_ILOG2_U64 is not set + CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y + CONFIG_GENERIC_CALIBRATE_DELAY=y + CONFIG_GENERIC_TIME_VSYSCALL=y + CONFIG_ARCH_HAS_CPU_RELAX=y ++CONFIG_ARCH_HAS_DEFAULT_IDLE=y + CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y + CONFIG_HAVE_SETUP_PER_CPU_AREA=y + CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y +@@ -42,12 +40,12 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y + CONFIG_ZONE_DMA32=y + CONFIG_ARCH_POPULATES_NODE_MAP=y + CONFIG_AUDIT_ARCH=y +-CONFIG_ARCH_SUPPORTS_AOUT=y + CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y + CONFIG_GENERIC_HARDIRQS=y + CONFIG_GENERIC_IRQ_PROBE=y + CONFIG_GENERIC_PENDING_IRQ=y + CONFIG_X86_SMP=y ++CONFIG_USE_GENERIC_SMP_HELPERS=y + CONFIG_X86_64_SMP=y + CONFIG_X86_HT=y + CONFIG_X86_BIOS_REBOOT=y +@@ -76,30 +74,44 @@ CONFIG_TASK_IO_ACCOUNTING=y + CONFIG_AUDIT=y + CONFIG_AUDITSYSCALL=y + CONFIG_AUDIT_TREE=y ++ ++# ++# RCU Subsystem ++# ++# CONFIG_CLASSIC_RCU is not set ++CONFIG_TREE_RCU=y ++# CONFIG_PREEMPT_RCU is not set ++# CONFIG_RCU_TRACE is not set ++CONFIG_RCU_FANOUT=64 ++# CONFIG_RCU_FANOUT_EXACT is not set ++# CONFIG_TREE_RCU_TRACE is not set ++# CONFIG_PREEMPT_RCU_TRACE is not set + # CONFIG_IKCONFIG is not set + CONFIG_LOG_BUF_SHIFT=18 +-CONFIG_CGROUPS=y +-# CONFIG_CGROUP_DEBUG is not set +-CONFIG_CGROUP_NS=y +-# CONFIG_CGROUP_DEVICE is not set +-CONFIG_CPUSETS=y + CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + CONFIG_GROUP_SCHED=y + CONFIG_FAIR_GROUP_SCHED=y + # CONFIG_RT_GROUP_SCHED is not set + # CONFIG_USER_SCHED is not set + CONFIG_CGROUP_SCHED=y ++CONFIG_CGROUPS=y ++# CONFIG_CGROUP_DEBUG is not set ++CONFIG_CGROUP_NS=y ++CONFIG_CGROUP_FREEZER=y ++# CONFIG_CGROUP_DEVICE is not set ++CONFIG_CPUSETS=y ++CONFIG_PROC_PID_CPUSET=y + CONFIG_CGROUP_CPUACCT=y + CONFIG_RESOURCE_COUNTERS=y + # CONFIG_CGROUP_MEM_RES_CTLR is not set + # CONFIG_SYSFS_DEPRECATED_V2 is not set +-CONFIG_PROC_PID_CPUSET=y + CONFIG_RELAY=y + CONFIG_NAMESPACES=y + CONFIG_UTS_NS=y + CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y ++CONFIG_NET_NS=y + CONFIG_BLK_DEV_INITRD=y + CONFIG_INITRAMFS_SOURCE="" + CONFIG_CC_OPTIMIZE_FOR_SIZE=y +@@ -124,12 +136,15 @@ CONFIG_SIGNALFD=y + CONFIG_TIMERFD=y + CONFIG_EVENTFD=y + CONFIG_SHMEM=y ++CONFIG_AIO=y + CONFIG_VM_EVENT_COUNTERS=y ++CONFIG_PCI_QUIRKS=y + CONFIG_SLUB_DEBUG=y + # CONFIG_SLAB is not set + CONFIG_SLUB=y + # CONFIG_SLOB is not set + CONFIG_PROFILING=y ++CONFIG_TRACEPOINTS=y + CONFIG_MARKERS=y + # CONFIG_OPROFILE is not set + CONFIG_HAVE_OPROFILE=y +@@ -139,15 +154,10 @@ CONFIG_KRETPROBES=y + CONFIG_HAVE_IOREMAP_PROT=y + CONFIG_HAVE_KPROBES=y + CONFIG_HAVE_KRETPROBES=y +-# CONFIG_HAVE_ARCH_TRACEHOOK is not set +-# CONFIG_HAVE_DMA_ATTRS is not set +-CONFIG_USE_GENERIC_SMP_HELPERS=y +-# CONFIG_HAVE_CLK is not set +-CONFIG_PROC_PAGE_MONITOR=y ++CONFIG_HAVE_ARCH_TRACEHOOK=y + # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set + CONFIG_SLABINFO=y + CONFIG_RT_MUTEXES=y +-# CONFIG_TINY_SHMEM is not set + CONFIG_BASE_SMALL=0 + CONFIG_MODULES=y + # CONFIG_MODULE_FORCE_LOAD is not set +@@ -155,7 +165,6 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_MODVERSIONS is not set + # CONFIG_MODULE_SRCVERSION_ALL is not set +-CONFIG_KMOD=y + CONFIG_STOP_MACHINE=y + CONFIG_BLOCK=y + CONFIG_BLK_DEV_IO_TRACE=y +@@ -175,7 +184,7 @@ CONFIG_IOSCHED_CFQ=y + CONFIG_DEFAULT_CFQ=y + # CONFIG_DEFAULT_NOOP is not set + CONFIG_DEFAULT_IOSCHED="cfq" +-CONFIG_CLASSIC_RCU=y ++CONFIG_FREEZER=y + + # + # Processor type and features +@@ -185,13 +194,14 @@ CONFIG_NO_HZ=y + 
CONFIG_HIGH_RES_TIMERS=y + CONFIG_GENERIC_CLOCKEVENTS_BUILD=y + CONFIG_SMP=y ++CONFIG_SPARSE_IRQ=y ++# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set + CONFIG_X86_FIND_SMP_CONFIG=y + CONFIG_X86_MPPARSE=y +-CONFIG_X86_PC=y + # CONFIG_X86_ELAN is not set +-# CONFIG_X86_VOYAGER is not set + # CONFIG_X86_GENERICARCH is not set + # CONFIG_X86_VSMP is not set ++CONFIG_SCHED_OMIT_FRAME_POINTER=y + # CONFIG_PARAVIRT_GUEST is not set + # CONFIG_MEMTEST is not set + # CONFIG_M386 is not set +@@ -230,6 +240,11 @@ CONFIG_X86_CMPXCHG64=y + CONFIG_X86_CMOV=y + CONFIG_X86_MINIMUM_CPU_FAMILY=64 + CONFIG_X86_DEBUGCTLMSR=y ++CONFIG_CPU_SUP_INTEL=y ++CONFIG_CPU_SUP_AMD=y ++CONFIG_CPU_SUP_CENTAUR_64=y ++CONFIG_X86_DS=y ++CONFIG_X86_PTRACE_BTS=y + CONFIG_HPET_TIMER=y + CONFIG_HPET_EMULATE_RTC=y + CONFIG_DMI=y +@@ -237,8 +252,11 @@ CONFIG_GART_IOMMU=y + CONFIG_CALGARY_IOMMU=y + CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y + CONFIG_AMD_IOMMU=y ++CONFIG_AMD_IOMMU_STATS=y + CONFIG_SWIOTLB=y + CONFIG_IOMMU_HELPER=y ++CONFIG_IOMMU_API=y ++# CONFIG_MAXSMP is not set + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y + CONFIG_SCHED_MC=y +@@ -247,12 +265,19 @@ CONFIG_PREEMPT_VOLUNTARY=y + # CONFIG_PREEMPT is not set + CONFIG_X86_LOCAL_APIC=y + CONFIG_X86_IO_APIC=y +-# CONFIG_X86_MCE is not set ++CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y ++CONFIG_X86_MCE=y ++CONFIG_X86_MCE_INTEL=y ++CONFIG_X86_MCE_AMD=y + # CONFIG_I8K is not set + CONFIG_MICROCODE=y ++CONFIG_MICROCODE_INTEL=y ++CONFIG_MICROCODE_AMD=y + CONFIG_MICROCODE_OLD_INTERFACE=y + CONFIG_X86_MSR=y + CONFIG_X86_CPUID=y ++CONFIG_ARCH_PHYS_ADDR_T_64BIT=y ++CONFIG_DIRECT_GBPAGES=y + CONFIG_NUMA=y + CONFIG_K8_NUMA=y + CONFIG_X86_64_ACPI_NUMA=y +@@ -269,7 +294,6 @@ CONFIG_SPARSEMEM_MANUAL=y + CONFIG_SPARSEMEM=y + CONFIG_NEED_MULTIPLE_NODES=y + CONFIG_HAVE_MEMORY_PRESENT=y +-# CONFIG_SPARSEMEM_STATIC is not set + CONFIG_SPARSEMEM_EXTREME=y + CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y + CONFIG_SPARSEMEM_VMEMMAP=y +@@ -280,10 +304,14 @@ CONFIG_SPARSEMEM_VMEMMAP=y + CONFIG_PAGEFLAGS_EXTENDED=y + CONFIG_SPLIT_PTLOCK_CPUS=4 + CONFIG_MIGRATION=y +-CONFIG_RESOURCES_64BIT=y ++CONFIG_PHYS_ADDR_T_64BIT=y + CONFIG_ZONE_DMA_FLAG=1 + CONFIG_BOUNCE=y + CONFIG_VIRT_TO_BUS=y ++CONFIG_UNEVICTABLE_LRU=y ++CONFIG_X86_CHECK_BIOS_CORRUPTION=y ++CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y ++CONFIG_X86_RESERVE_LOW_64K=y + CONFIG_MTRR=y + # CONFIG_MTRR_SANITIZER is not set + CONFIG_X86_PAT=y +@@ -302,11 +330,12 @@ CONFIG_PHYSICAL_START=0x1000000 + CONFIG_PHYSICAL_ALIGN=0x200000 + CONFIG_HOTPLUG_CPU=y + # CONFIG_COMPAT_VDSO is not set ++# CONFIG_CMDLINE_BOOL is not set + CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y + CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y + + # +-# Power management options ++# Power management and ACPI options + # + CONFIG_ARCH_HIBERNATION_HEADER=y + CONFIG_PM=y +@@ -333,20 +362,14 @@ CONFIG_ACPI_BATTERY=y + CONFIG_ACPI_BUTTON=y + CONFIG_ACPI_FAN=y + CONFIG_ACPI_DOCK=y +-# CONFIG_ACPI_BAY is not set + CONFIG_ACPI_PROCESSOR=y + CONFIG_ACPI_HOTPLUG_CPU=y + CONFIG_ACPI_THERMAL=y + CONFIG_ACPI_NUMA=y +-# CONFIG_ACPI_WMI is not set +-# CONFIG_ACPI_ASUS is not set +-# CONFIG_ACPI_TOSHIBA is not set + # CONFIG_ACPI_CUSTOM_DSDT is not set + CONFIG_ACPI_BLACKLIST_YEAR=0 + # CONFIG_ACPI_DEBUG is not set +-CONFIG_ACPI_EC=y + # CONFIG_ACPI_PCI_SLOT is not set +-CONFIG_ACPI_POWER=y + CONFIG_ACPI_SYSTEM=y + CONFIG_X86_PM_TIMER=y + CONFIG_ACPI_CONTAINER=y +@@ -381,13 +404,17 @@ CONFIG_X86_ACPI_CPUFREQ=y + # + # shared options + # +-# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set + # CONFIG_X86_SPEEDSTEP_LIB is not set + 
CONFIG_CPU_IDLE=y + CONFIG_CPU_IDLE_GOV_LADDER=y + CONFIG_CPU_IDLE_GOV_MENU=y + + # ++# Memory power savings ++# ++# CONFIG_I7300_IDLE is not set ++ ++# + # Bus options (PCI etc.) + # + CONFIG_PCI=y +@@ -395,8 +422,10 @@ CONFIG_PCI_DIRECT=y + CONFIG_PCI_MMCONFIG=y + CONFIG_PCI_DOMAINS=y + CONFIG_DMAR=y ++# CONFIG_DMAR_DEFAULT_ON is not set + CONFIG_DMAR_GFX_WA=y + CONFIG_DMAR_FLOPPY_WA=y ++# CONFIG_INTR_REMAP is not set + CONFIG_PCIEPORTBUS=y + # CONFIG_HOTPLUG_PCI_PCIE is not set + CONFIG_PCIEAER=y +@@ -405,6 +434,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y + CONFIG_PCI_MSI=y + # CONFIG_PCI_LEGACY is not set + # CONFIG_PCI_DEBUG is not set ++# CONFIG_PCI_STUB is not set + CONFIG_HT_IRQ=y + CONFIG_ISA_DMA_API=y + CONFIG_K8_NB=y +@@ -438,6 +468,8 @@ CONFIG_HOTPLUG_PCI=y + # + CONFIG_BINFMT_ELF=y + CONFIG_COMPAT_BINFMT_ELF=y ++CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y ++# CONFIG_HAVE_AOUT is not set + CONFIG_BINFMT_MISC=y + CONFIG_IA32_EMULATION=y + # CONFIG_IA32_AOUT is not set +@@ -449,6 +481,7 @@ CONFIG_NET=y + # + # Networking options + # ++CONFIG_COMPAT_NET_DEV_OPS=y + CONFIG_PACKET=y + CONFIG_PACKET_MMAP=y + CONFIG_UNIX=y +@@ -509,7 +542,6 @@ CONFIG_DEFAULT_CUBIC=y + # CONFIG_DEFAULT_RENO is not set + CONFIG_DEFAULT_TCP_CONG="cubic" + CONFIG_TCP_MD5SIG=y +-# CONFIG_IP_VS is not set + CONFIG_IPV6=y + # CONFIG_IPV6_PRIVACY is not set + # CONFIG_IPV6_ROUTER_PREF is not set +@@ -547,19 +579,21 @@ CONFIG_NF_CONNTRACK_IRC=y + CONFIG_NF_CONNTRACK_SIP=y + CONFIG_NF_CT_NETLINK=y + CONFIG_NETFILTER_XTABLES=y ++CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y + CONFIG_NETFILTER_XT_TARGET_MARK=y + CONFIG_NETFILTER_XT_TARGET_NFLOG=y + CONFIG_NETFILTER_XT_TARGET_SECMARK=y +-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y + CONFIG_NETFILTER_XT_TARGET_TCPMSS=y + CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y + CONFIG_NETFILTER_XT_MATCH_MARK=y + CONFIG_NETFILTER_XT_MATCH_POLICY=y + CONFIG_NETFILTER_XT_MATCH_STATE=y ++# CONFIG_IP_VS is not set + + # + # IP: Netfilter Configuration + # ++CONFIG_NF_DEFRAG_IPV4=y + CONFIG_NF_CONNTRACK_IPV4=y + CONFIG_NF_CONNTRACK_PROC_COMPAT=y + CONFIG_IP_NF_IPTABLES=y +@@ -585,8 +619,8 @@ CONFIG_IP_NF_MANGLE=y + CONFIG_NF_CONNTRACK_IPV6=y + CONFIG_IP6_NF_IPTABLES=y + CONFIG_IP6_NF_MATCH_IPV6HEADER=y +-CONFIG_IP6_NF_FILTER=y + CONFIG_IP6_NF_TARGET_LOG=y ++CONFIG_IP6_NF_FILTER=y + CONFIG_IP6_NF_TARGET_REJECT=y + CONFIG_IP6_NF_MANGLE=y + # CONFIG_IP_DCCP is not set +@@ -594,6 +628,7 @@ CONFIG_IP6_NF_MANGLE=y + # CONFIG_TIPC is not set + # CONFIG_ATM is not set + # CONFIG_BRIDGE is not set ++# CONFIG_NET_DSA is not set + # CONFIG_VLAN_8021Q is not set + # CONFIG_DECNET is not set + CONFIG_LLC=y +@@ -613,6 +648,7 @@ CONFIG_NET_SCHED=y + # CONFIG_NET_SCH_HTB is not set + # CONFIG_NET_SCH_HFSC is not set + # CONFIG_NET_SCH_PRIO is not set ++# CONFIG_NET_SCH_MULTIQ is not set + # CONFIG_NET_SCH_RED is not set + # CONFIG_NET_SCH_SFQ is not set + # CONFIG_NET_SCH_TEQL is not set +@@ -620,6 +656,7 @@ CONFIG_NET_SCHED=y + # CONFIG_NET_SCH_GRED is not set + # CONFIG_NET_SCH_DSMARK is not set + # CONFIG_NET_SCH_NETEM is not set ++# CONFIG_NET_SCH_DRR is not set + # CONFIG_NET_SCH_INGRESS is not set + + # +@@ -634,6 +671,7 @@ CONFIG_NET_CLS=y + # CONFIG_NET_CLS_RSVP is not set + # CONFIG_NET_CLS_RSVP6 is not set + # CONFIG_NET_CLS_FLOW is not set ++# CONFIG_NET_CLS_CGROUP is not set + CONFIG_NET_EMATCH=y + CONFIG_NET_EMATCH_STACK=32 + # CONFIG_NET_EMATCH_CMP is not set +@@ -649,7 +687,9 @@ CONFIG_NET_CLS_ACT=y + # CONFIG_NET_ACT_NAT is not set + # CONFIG_NET_ACT_PEDIT is not set + # CONFIG_NET_ACT_SIMP is not set ++# 
CONFIG_NET_ACT_SKBEDIT is not set + CONFIG_NET_SCH_FIFO=y ++# CONFIG_DCB is not set + + # + # Network testing +@@ -666,29 +706,33 @@ CONFIG_HAMRADIO=y + # CONFIG_IRDA is not set + # CONFIG_BT is not set + # CONFIG_AF_RXRPC is not set ++# CONFIG_PHONET is not set + CONFIG_FIB_RULES=y +- +-# +-# Wireless +-# ++CONFIG_WIRELESS=y + CONFIG_CFG80211=y ++# CONFIG_CFG80211_REG_DEBUG is not set + CONFIG_NL80211=y ++CONFIG_WIRELESS_OLD_REGULATORY=y + CONFIG_WIRELESS_EXT=y + CONFIG_WIRELESS_EXT_SYSFS=y ++# CONFIG_LIB80211 is not set + CONFIG_MAC80211=y + + # + # Rate control algorithm selection + # +-CONFIG_MAC80211_RC_PID=y +-CONFIG_MAC80211_RC_DEFAULT_PID=y +-CONFIG_MAC80211_RC_DEFAULT="pid" ++CONFIG_MAC80211_RC_MINSTREL=y ++# CONFIG_MAC80211_RC_DEFAULT_PID is not set ++CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y ++CONFIG_MAC80211_RC_DEFAULT="minstrel" + # CONFIG_MAC80211_MESH is not set + CONFIG_MAC80211_LEDS=y + # CONFIG_MAC80211_DEBUGFS is not set + # CONFIG_MAC80211_DEBUG_MENU is not set +-# CONFIG_IEEE80211 is not set +-# CONFIG_RFKILL is not set ++# CONFIG_WIMAX is not set ++CONFIG_RFKILL=y ++# CONFIG_RFKILL_INPUT is not set ++CONFIG_RFKILL_LEDS=y + # CONFIG_NET_9P is not set + + # +@@ -712,7 +756,7 @@ CONFIG_PROC_EVENTS=y + # CONFIG_MTD is not set + # CONFIG_PARPORT is not set + CONFIG_PNP=y +-# CONFIG_PNP_DEBUG is not set ++CONFIG_PNP_DEBUG_MESSAGES=y + + # + # Protocols +@@ -740,21 +784,21 @@ CONFIG_BLK_DEV_RAM_SIZE=16384 + CONFIG_MISC_DEVICES=y + # CONFIG_IBM_ASM is not set + # CONFIG_PHANTOM is not set +-# CONFIG_EEPROM_93CX6 is not set + # CONFIG_SGI_IOC4 is not set + # CONFIG_TIFM_CORE is not set +-# CONFIG_ACER_WMI is not set +-# CONFIG_ASUS_LAPTOP is not set +-# CONFIG_FUJITSU_LAPTOP is not set +-# CONFIG_MSI_LAPTOP is not set +-# CONFIG_COMPAL_LAPTOP is not set +-# CONFIG_SONY_LAPTOP is not set +-# CONFIG_THINKPAD_ACPI is not set +-# CONFIG_INTEL_MENLOW is not set ++# CONFIG_ICS932S401 is not set + # CONFIG_ENCLOSURE_SERVICES is not set + # CONFIG_SGI_XP is not set + # CONFIG_HP_ILO is not set + # CONFIG_SGI_GRU is not set ++# CONFIG_C2PORT is not set ++ ++# ++# EEPROM support ++# ++# CONFIG_EEPROM_AT24 is not set ++# CONFIG_EEPROM_LEGACY is not set ++# CONFIG_EEPROM_93CX6 is not set + CONFIG_HAVE_IDE=y + # CONFIG_IDE is not set + +@@ -793,7 +837,7 @@ CONFIG_SCSI_WAIT_SCAN=m + # + CONFIG_SCSI_SPI_ATTRS=y + # CONFIG_SCSI_FC_ATTRS is not set +-CONFIG_SCSI_ISCSI_ATTRS=y ++# CONFIG_SCSI_ISCSI_ATTRS is not set + # CONFIG_SCSI_SAS_ATTRS is not set + # CONFIG_SCSI_SAS_LIBSAS is not set + # CONFIG_SCSI_SRP_ATTRS is not set +@@ -864,6 +908,7 @@ CONFIG_PATA_OLDPIIX=y + CONFIG_PATA_SCH=y + CONFIG_MD=y + CONFIG_BLK_DEV_MD=y ++CONFIG_MD_AUTODETECT=y + # CONFIG_MD_LINEAR is not set + # CONFIG_MD_RAID0 is not set + # CONFIG_MD_RAID1 is not set +@@ -919,6 +964,9 @@ CONFIG_PHYLIB=y + # CONFIG_BROADCOM_PHY is not set + # CONFIG_ICPLUS_PHY is not set + # CONFIG_REALTEK_PHY is not set ++# CONFIG_NATIONAL_PHY is not set ++# CONFIG_STE10XP is not set ++# CONFIG_LSI_ET1011C_PHY is not set + # CONFIG_FIXED_PHY is not set + # CONFIG_MDIO_BITBANG is not set + CONFIG_NET_ETHERNET=y +@@ -942,6 +990,9 @@ CONFIG_NET_TULIP=y + # CONFIG_IBM_NEW_EMAC_RGMII is not set + # CONFIG_IBM_NEW_EMAC_TAH is not set + # CONFIG_IBM_NEW_EMAC_EMAC4 is not set ++# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set ++# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set ++# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set + CONFIG_NET_PCI=y + # CONFIG_PCNET32 is not set + # CONFIG_AMD8111_ETH is not set +@@ -949,7 +1000,6 @@ CONFIG_NET_PCI=y + # 
CONFIG_B44 is not set + CONFIG_FORCEDETH=y + # CONFIG_FORCEDETH_NAPI is not set +-# CONFIG_EEPRO100 is not set + CONFIG_E100=y + # CONFIG_FEALNX is not set + # CONFIG_NATSEMI is not set +@@ -963,15 +1013,16 @@ CONFIG_8139TOO_PIO=y + # CONFIG_R6040 is not set + # CONFIG_SIS900 is not set + # CONFIG_EPIC100 is not set ++# CONFIG_SMSC9420 is not set + # CONFIG_SUNDANCE is not set + # CONFIG_TLAN is not set + # CONFIG_VIA_RHINE is not set + # CONFIG_SC92031 is not set ++# CONFIG_ATL2 is not set + CONFIG_NETDEV_1000=y + # CONFIG_ACENIC is not set + # CONFIG_DL2K is not set + CONFIG_E1000=y +-# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set + # CONFIG_E1000E is not set + # CONFIG_IP1000 is not set + # CONFIG_IGB is not set +@@ -989,18 +1040,23 @@ CONFIG_TIGON3=y + # CONFIG_QLA3XXX is not set + # CONFIG_ATL1 is not set + # CONFIG_ATL1E is not set ++# CONFIG_JME is not set + CONFIG_NETDEV_10000=y + # CONFIG_CHELSIO_T1 is not set ++CONFIG_CHELSIO_T3_DEPENDS=y + # CONFIG_CHELSIO_T3 is not set ++# CONFIG_ENIC is not set + # CONFIG_IXGBE is not set + # CONFIG_IXGB is not set + # CONFIG_S2IO is not set + # CONFIG_MYRI10GE is not set + # CONFIG_NETXEN_NIC is not set + # CONFIG_NIU is not set ++# CONFIG_MLX4_EN is not set + # CONFIG_MLX4_CORE is not set + # CONFIG_TEHUTI is not set + # CONFIG_BNX2X is not set ++# CONFIG_QLGE is not set + # CONFIG_SFC is not set + CONFIG_TR=y + # CONFIG_IBMOL is not set +@@ -1013,9 +1069,8 @@ CONFIG_TR=y + # CONFIG_WLAN_PRE80211 is not set + CONFIG_WLAN_80211=y + # CONFIG_PCMCIA_RAYCS is not set +-# CONFIG_IPW2100 is not set +-# CONFIG_IPW2200 is not set + # CONFIG_LIBERTAS is not set ++# CONFIG_LIBERTAS_THINFIRM is not set + # CONFIG_AIRO is not set + # CONFIG_HERMES is not set + # CONFIG_ATMEL is not set +@@ -1032,6 +1087,8 @@ CONFIG_WLAN_80211=y + CONFIG_ATH5K=y + # CONFIG_ATH5K_DEBUG is not set + # CONFIG_ATH9K is not set ++# CONFIG_IPW2100 is not set ++# CONFIG_IPW2200 is not set + # CONFIG_IWLCORE is not set + # CONFIG_IWLWIFI_LEDS is not set + # CONFIG_IWLAGN is not set +@@ -1043,6 +1100,10 @@ CONFIG_ATH5K=y + # CONFIG_RT2X00 is not set + + # ++# Enable WiMAX (Networking options) to see the WiMAX drivers ++# ++ ++# + # USB Network Adapters + # + # CONFIG_USB_CATC is not set +@@ -1050,6 +1111,7 @@ CONFIG_ATH5K=y + # CONFIG_USB_PEGASUS is not set + # CONFIG_USB_RTL8150 is not set + # CONFIG_USB_USBNET is not set ++# CONFIG_USB_HSO is not set + CONFIG_NET_PCMCIA=y + # CONFIG_PCMCIA_3C589 is not set + # CONFIG_PCMCIA_3C574 is not set +@@ -1059,6 +1121,7 @@ CONFIG_NET_PCMCIA=y + # CONFIG_PCMCIA_SMC91C92 is not set + # CONFIG_PCMCIA_XIRC2PS is not set + # CONFIG_PCMCIA_AXNET is not set ++# CONFIG_PCMCIA_IBMTR is not set + # CONFIG_WAN is not set + CONFIG_FDDI=y + # CONFIG_DEFXX is not set +@@ -1110,6 +1173,7 @@ CONFIG_MOUSE_PS2_LOGIPS2PP=y + CONFIG_MOUSE_PS2_SYNAPTICS=y + CONFIG_MOUSE_PS2_LIFEBOOK=y + CONFIG_MOUSE_PS2_TRACKPOINT=y ++# CONFIG_MOUSE_PS2_ELANTECH is not set + # CONFIG_MOUSE_PS2_TOUCHKIT is not set + # CONFIG_MOUSE_SERIAL is not set + # CONFIG_MOUSE_APPLETOUCH is not set +@@ -1147,15 +1211,16 @@ CONFIG_INPUT_TOUCHSCREEN=y + # CONFIG_TOUCHSCREEN_FUJITSU is not set + # CONFIG_TOUCHSCREEN_GUNZE is not set + # CONFIG_TOUCHSCREEN_ELO is not set ++# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set + # CONFIG_TOUCHSCREEN_MTOUCH is not set + # CONFIG_TOUCHSCREEN_INEXIO is not set + # CONFIG_TOUCHSCREEN_MK712 is not set + # CONFIG_TOUCHSCREEN_PENMOUNT is not set + # CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set + # CONFIG_TOUCHSCREEN_TOUCHWIN is not set +-# 
CONFIG_TOUCHSCREEN_UCB1400 is not set + # CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set + # CONFIG_TOUCHSCREEN_TOUCHIT213 is not set ++# CONFIG_TOUCHSCREEN_TSC2007 is not set + CONFIG_INPUT_MISC=y + # CONFIG_INPUT_PCSPKR is not set + # CONFIG_INPUT_APANEL is not set +@@ -1165,6 +1230,7 @@ CONFIG_INPUT_MISC=y + # CONFIG_INPUT_KEYSPAN_REMOTE is not set + # CONFIG_INPUT_POWERMATE is not set + # CONFIG_INPUT_YEALINK is not set ++# CONFIG_INPUT_CM109 is not set + # CONFIG_INPUT_UINPUT is not set + + # +@@ -1231,6 +1297,7 @@ CONFIG_SERIAL_CORE=y + CONFIG_SERIAL_CORE_CONSOLE=y + # CONFIG_SERIAL_JSM is not set + CONFIG_UNIX98_PTYS=y ++# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set + # CONFIG_LEGACY_PTYS is not set + # CONFIG_IPMI_HANDLER is not set + CONFIG_HW_RANDOM=y +@@ -1260,6 +1327,7 @@ CONFIG_I2C=y + CONFIG_I2C_BOARDINFO=y + # CONFIG_I2C_CHARDEV is not set + CONFIG_I2C_HELPER_AUTO=y ++CONFIG_I2C_ALGOBIT=y + + # + # I2C Hardware Bus support +@@ -1311,8 +1379,6 @@ CONFIG_I2C_I801=y + # Miscellaneous I2C Chip support + # + # CONFIG_DS1682 is not set +-# CONFIG_EEPROM_AT24 is not set +-# CONFIG_EEPROM_LEGACY is not set + # CONFIG_SENSORS_PCF8574 is not set + # CONFIG_PCF8575 is not set + # CONFIG_SENSORS_PCA9539 is not set +@@ -1331,8 +1397,78 @@ CONFIG_POWER_SUPPLY=y + # CONFIG_POWER_SUPPLY_DEBUG is not set + # CONFIG_PDA_POWER is not set + # CONFIG_BATTERY_DS2760 is not set +-# CONFIG_HWMON is not set ++# CONFIG_BATTERY_BQ27x00 is not set ++CONFIG_HWMON=y ++# CONFIG_HWMON_VID is not set ++# CONFIG_SENSORS_ABITUGURU is not set ++# CONFIG_SENSORS_ABITUGURU3 is not set ++# CONFIG_SENSORS_AD7414 is not set ++# CONFIG_SENSORS_AD7418 is not set ++# CONFIG_SENSORS_ADM1021 is not set ++# CONFIG_SENSORS_ADM1025 is not set ++# CONFIG_SENSORS_ADM1026 is not set ++# CONFIG_SENSORS_ADM1029 is not set ++# CONFIG_SENSORS_ADM1031 is not set ++# CONFIG_SENSORS_ADM9240 is not set ++# CONFIG_SENSORS_ADT7462 is not set ++# CONFIG_SENSORS_ADT7470 is not set ++# CONFIG_SENSORS_ADT7473 is not set ++# CONFIG_SENSORS_ADT7475 is not set ++# CONFIG_SENSORS_K8TEMP is not set ++# CONFIG_SENSORS_ASB100 is not set ++# CONFIG_SENSORS_ATXP1 is not set ++# CONFIG_SENSORS_DS1621 is not set ++# CONFIG_SENSORS_I5K_AMB is not set ++# CONFIG_SENSORS_F71805F is not set ++# CONFIG_SENSORS_F71882FG is not set ++# CONFIG_SENSORS_F75375S is not set ++# CONFIG_SENSORS_FSCHER is not set ++# CONFIG_SENSORS_FSCPOS is not set ++# CONFIG_SENSORS_FSCHMD is not set ++# CONFIG_SENSORS_GL518SM is not set ++# CONFIG_SENSORS_GL520SM is not set ++# CONFIG_SENSORS_CORETEMP is not set ++# CONFIG_SENSORS_IT87 is not set ++# CONFIG_SENSORS_LM63 is not set ++# CONFIG_SENSORS_LM75 is not set ++# CONFIG_SENSORS_LM77 is not set ++# CONFIG_SENSORS_LM78 is not set ++# CONFIG_SENSORS_LM80 is not set ++# CONFIG_SENSORS_LM83 is not set ++# CONFIG_SENSORS_LM85 is not set ++# CONFIG_SENSORS_LM87 is not set ++# CONFIG_SENSORS_LM90 is not set ++# CONFIG_SENSORS_LM92 is not set ++# CONFIG_SENSORS_LM93 is not set ++# CONFIG_SENSORS_LTC4245 is not set ++# CONFIG_SENSORS_MAX1619 is not set ++# CONFIG_SENSORS_MAX6650 is not set ++# CONFIG_SENSORS_PC87360 is not set ++# CONFIG_SENSORS_PC87427 is not set ++# CONFIG_SENSORS_SIS5595 is not set ++# CONFIG_SENSORS_DME1737 is not set ++# CONFIG_SENSORS_SMSC47M1 is not set ++# CONFIG_SENSORS_SMSC47M192 is not set ++# CONFIG_SENSORS_SMSC47B397 is not set ++# CONFIG_SENSORS_ADS7828 is not set ++# CONFIG_SENSORS_THMC50 is not set ++# CONFIG_SENSORS_VIA686A is not set ++# CONFIG_SENSORS_VT1211 is not set ++# CONFIG_SENSORS_VT8231 is 
not set ++# CONFIG_SENSORS_W83781D is not set ++# CONFIG_SENSORS_W83791D is not set ++# CONFIG_SENSORS_W83792D is not set ++# CONFIG_SENSORS_W83793 is not set ++# CONFIG_SENSORS_W83L785TS is not set ++# CONFIG_SENSORS_W83L786NG is not set ++# CONFIG_SENSORS_W83627HF is not set ++# CONFIG_SENSORS_W83627EHF is not set ++# CONFIG_SENSORS_HDAPS is not set ++# CONFIG_SENSORS_LIS3LV02D is not set ++# CONFIG_SENSORS_APPLESMC is not set ++# CONFIG_HWMON_DEBUG_CHIP is not set + CONFIG_THERMAL=y ++# CONFIG_THERMAL_HWMON is not set + CONFIG_WATCHDOG=y + # CONFIG_WATCHDOG_NOWAYOUT is not set + +@@ -1352,15 +1488,18 @@ CONFIG_WATCHDOG=y + # CONFIG_I6300ESB_WDT is not set + # CONFIG_ITCO_WDT is not set + # CONFIG_IT8712F_WDT is not set ++# CONFIG_IT87_WDT is not set + # CONFIG_HP_WATCHDOG is not set + # CONFIG_SC1200_WDT is not set + # CONFIG_PC87413_WDT is not set + # CONFIG_60XX_WDT is not set + # CONFIG_SBC8360_WDT is not set + # CONFIG_CPU5_WDT is not set ++# CONFIG_SMSC_SCH311X_WDT is not set + # CONFIG_SMSC37B787_WDT is not set + # CONFIG_W83627HF_WDT is not set + # CONFIG_W83697HF_WDT is not set ++# CONFIG_W83697UG_WDT is not set + # CONFIG_W83877F_WDT is not set + # CONFIG_W83977F_WDT is not set + # CONFIG_MACHZ_WDT is not set +@@ -1376,11 +1515,11 @@ CONFIG_WATCHDOG=y + # USB-based Watchdog Cards + # + # CONFIG_USBPCWATCHDOG is not set ++CONFIG_SSB_POSSIBLE=y + + # + # Sonics Silicon Backplane + # +-CONFIG_SSB_POSSIBLE=y + # CONFIG_SSB is not set + + # +@@ -1389,7 +1528,13 @@ CONFIG_SSB_POSSIBLE=y + # CONFIG_MFD_CORE is not set + # CONFIG_MFD_SM501 is not set + # CONFIG_HTC_PASIC3 is not set ++# CONFIG_TWL4030_CORE is not set + # CONFIG_MFD_TMIO is not set ++# CONFIG_PMIC_DA903X is not set ++# CONFIG_MFD_WM8400 is not set ++# CONFIG_MFD_WM8350_I2C is not set ++# CONFIG_MFD_PCF50633 is not set ++# CONFIG_REGULATOR is not set + + # + # Multimedia devices +@@ -1423,6 +1568,7 @@ CONFIG_DRM=y + # CONFIG_DRM_I810 is not set + # CONFIG_DRM_I830 is not set + CONFIG_DRM_I915=y ++CONFIG_DRM_I915_KMS=y + # CONFIG_DRM_MGA is not set + # CONFIG_DRM_SIS is not set + # CONFIG_DRM_VIA is not set +@@ -1432,6 +1578,7 @@ CONFIG_DRM_I915=y + CONFIG_FB=y + # CONFIG_FIRMWARE_EDID is not set + # CONFIG_FB_DDC is not set ++# CONFIG_FB_BOOT_VESA_SUPPORT is not set + CONFIG_FB_CFB_FILLRECT=y + CONFIG_FB_CFB_COPYAREA=y + CONFIG_FB_CFB_IMAGEBLIT=y +@@ -1460,7 +1607,6 @@ CONFIG_FB_TILEBLITTING=y + # CONFIG_FB_UVESA is not set + # CONFIG_FB_VESA is not set + CONFIG_FB_EFI=y +-# CONFIG_FB_IMAC is not set + # CONFIG_FB_N411 is not set + # CONFIG_FB_HGA is not set + # CONFIG_FB_S1D13XXX is not set +@@ -1475,6 +1621,7 @@ CONFIG_FB_EFI=y + # CONFIG_FB_S3 is not set + # CONFIG_FB_SAVAGE is not set + # CONFIG_FB_SIS is not set ++# CONFIG_FB_VIA is not set + # CONFIG_FB_NEOMAGIC is not set + # CONFIG_FB_KYRO is not set + # CONFIG_FB_3DFX is not set +@@ -1486,12 +1633,15 @@ CONFIG_FB_EFI=y + # CONFIG_FB_CARMINE is not set + # CONFIG_FB_GEODE is not set + # CONFIG_FB_VIRTUAL is not set ++# CONFIG_FB_METRONOME is not set ++# CONFIG_FB_MB862XX is not set + CONFIG_BACKLIGHT_LCD_SUPPORT=y + # CONFIG_LCD_CLASS_DEVICE is not set + CONFIG_BACKLIGHT_CLASS_DEVICE=y +-# CONFIG_BACKLIGHT_CORGI is not set ++CONFIG_BACKLIGHT_GENERIC=y + # CONFIG_BACKLIGHT_PROGEAR is not set + # CONFIG_BACKLIGHT_MBP_NVIDIA is not set ++# CONFIG_BACKLIGHT_SAHARA is not set + + # + # Display device support +@@ -1511,10 +1661,12 @@ CONFIG_LOGO=y + # CONFIG_LOGO_LINUX_VGA16 is not set + CONFIG_LOGO_LINUX_CLUT224=y + CONFIG_SOUND=y ++CONFIG_SOUND_OSS_CORE=y + 
CONFIG_SND=y + CONFIG_SND_TIMER=y + CONFIG_SND_PCM=y + CONFIG_SND_HWDEP=y ++CONFIG_SND_JACK=y + CONFIG_SND_SEQUENCER=y + CONFIG_SND_SEQ_DUMMY=y + CONFIG_SND_OSSEMUL=y +@@ -1522,6 +1674,8 @@ CONFIG_SND_MIXER_OSS=y + CONFIG_SND_PCM_OSS=y + CONFIG_SND_PCM_OSS_PLUGINS=y + CONFIG_SND_SEQUENCER_OSS=y ++CONFIG_SND_HRTIMER=y ++CONFIG_SND_SEQ_HRTIMER_DEFAULT=y + CONFIG_SND_DYNAMIC_MINORS=y + CONFIG_SND_SUPPORT_OLD_API=y + CONFIG_SND_VERBOSE_PROCFS=y +@@ -1575,11 +1729,16 @@ CONFIG_SND_PCI=y + # CONFIG_SND_FM801 is not set + CONFIG_SND_HDA_INTEL=y + CONFIG_SND_HDA_HWDEP=y ++# CONFIG_SND_HDA_RECONFIG is not set ++# CONFIG_SND_HDA_INPUT_BEEP is not set + CONFIG_SND_HDA_CODEC_REALTEK=y + CONFIG_SND_HDA_CODEC_ANALOG=y + CONFIG_SND_HDA_CODEC_SIGMATEL=y + CONFIG_SND_HDA_CODEC_VIA=y + CONFIG_SND_HDA_CODEC_ATIHDMI=y ++CONFIG_SND_HDA_CODEC_NVHDMI=y ++CONFIG_SND_HDA_CODEC_INTELHDMI=y ++CONFIG_SND_HDA_ELD=y + CONFIG_SND_HDA_CODEC_CONEXANT=y + CONFIG_SND_HDA_CODEC_CMEDIA=y + CONFIG_SND_HDA_CODEC_SI3054=y +@@ -1612,6 +1771,7 @@ CONFIG_SND_USB=y + # CONFIG_SND_USB_AUDIO is not set + # CONFIG_SND_USB_USX2Y is not set + # CONFIG_SND_USB_CAIAQ is not set ++# CONFIG_SND_USB_US122L is not set + CONFIG_SND_PCMCIA=y + # CONFIG_SND_VXPOCKET is not set + # CONFIG_SND_PDAUDIOCF is not set +@@ -1626,15 +1786,37 @@ CONFIG_HIDRAW=y + # USB Input Devices + # + CONFIG_USB_HID=y +-CONFIG_USB_HIDINPUT_POWERBOOK=y +-CONFIG_HID_FF=y + CONFIG_HID_PID=y ++CONFIG_USB_HIDDEV=y ++ ++# ++# Special HID drivers ++# ++CONFIG_HID_COMPAT=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y + CONFIG_LOGITECH_FF=y + # CONFIG_LOGIRUMBLEPAD2_FF is not set ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_NTRIG=y ++CONFIG_HID_PANTHERLORD=y + CONFIG_PANTHERLORD_FF=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++# CONFIG_GREENASIA_FF is not set ++CONFIG_HID_TOPSEED=y + CONFIG_THRUSTMASTER_FF=y + CONFIG_ZEROPLUS_FF=y +-CONFIG_USB_HIDDEV=y + CONFIG_USB_SUPPORT=y + CONFIG_USB_ARCH_HAS_HCD=y + CONFIG_USB_ARCH_HAS_OHCI=y +@@ -1652,6 +1834,8 @@ CONFIG_USB_DEVICEFS=y + CONFIG_USB_SUSPEND=y + # CONFIG_USB_OTG is not set + CONFIG_USB_MON=y ++# CONFIG_USB_WUSB is not set ++# CONFIG_USB_WUSB_CBAF is not set + + # + # USB Host Controller Drivers +@@ -1660,6 +1844,7 @@ CONFIG_USB_MON=y + CONFIG_USB_EHCI_HCD=y + # CONFIG_USB_EHCI_ROOT_HUB_TT is not set + # CONFIG_USB_EHCI_TT_NEWSCHED is not set ++# CONFIG_USB_OXU210HP_HCD is not set + # CONFIG_USB_ISP116X_HCD is not set + # CONFIG_USB_ISP1760_HCD is not set + CONFIG_USB_OHCI_HCD=y +@@ -1669,6 +1854,8 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y + CONFIG_USB_UHCI_HCD=y + # CONFIG_USB_SL811_HCD is not set + # CONFIG_USB_R8A66597_HCD is not set ++# CONFIG_USB_WHCI_HCD is not set ++# CONFIG_USB_HWA_HCD is not set + + # + # USB Device Class drivers +@@ -1676,20 +1863,20 @@ CONFIG_USB_UHCI_HCD=y + # CONFIG_USB_ACM is not set + CONFIG_USB_PRINTER=y + # CONFIG_USB_WDM is not set ++# CONFIG_USB_TMC is not set + + # +-# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' ++# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; + # + + # +-# may also be needed; see USB_STORAGE Help for more information ++# see USB_STORAGE Help for more information + # + CONFIG_USB_STORAGE=y + # CONFIG_USB_STORAGE_DEBUG is not set + # CONFIG_USB_STORAGE_DATAFAB is not set + # CONFIG_USB_STORAGE_FREECOM is not set + # 
CONFIG_USB_STORAGE_ISD200 is not set +-# CONFIG_USB_STORAGE_DPCM is not set + # CONFIG_USB_STORAGE_USBAT is not set + # CONFIG_USB_STORAGE_SDDR09 is not set + # CONFIG_USB_STORAGE_SDDR55 is not set +@@ -1697,7 +1884,6 @@ CONFIG_USB_STORAGE=y + # CONFIG_USB_STORAGE_ALAUDA is not set + # CONFIG_USB_STORAGE_ONETOUCH is not set + # CONFIG_USB_STORAGE_KARMA is not set +-# CONFIG_USB_STORAGE_SIERRA is not set + # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set + CONFIG_USB_LIBUSUAL=y + +@@ -1718,6 +1904,7 @@ CONFIG_USB_LIBUSUAL=y + # CONFIG_USB_EMI62 is not set + # CONFIG_USB_EMI26 is not set + # CONFIG_USB_ADUTUX is not set ++# CONFIG_USB_SEVSEG is not set + # CONFIG_USB_RIO500 is not set + # CONFIG_USB_LEGOTOWER is not set + # CONFIG_USB_LCD is not set +@@ -1735,7 +1922,13 @@ CONFIG_USB_LIBUSUAL=y + # CONFIG_USB_IOWARRIOR is not set + # CONFIG_USB_TEST is not set + # CONFIG_USB_ISIGHTFW is not set ++# CONFIG_USB_VST is not set + # CONFIG_USB_GADGET is not set ++ ++# ++# OTG and related infrastructure ++# ++# CONFIG_UWB is not set + # CONFIG_MMC is not set + # CONFIG_MEMSTICK is not set + CONFIG_NEW_LEDS=y +@@ -1744,6 +1937,7 @@ CONFIG_LEDS_CLASS=y + # + # LED drivers + # ++# CONFIG_LEDS_ALIX2 is not set + # CONFIG_LEDS_PCA9532 is not set + # CONFIG_LEDS_CLEVO_MAIL is not set + # CONFIG_LEDS_PCA955X is not set +@@ -1754,6 +1948,7 @@ CONFIG_LEDS_CLASS=y + CONFIG_LEDS_TRIGGERS=y + # CONFIG_LEDS_TRIGGER_TIMER is not set + # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set ++# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set + # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + # CONFIG_ACCESSIBILITY is not set + # CONFIG_INFINIBAND is not set +@@ -1793,6 +1988,7 @@ CONFIG_RTC_INTF_DEV=y + # CONFIG_RTC_DRV_M41T80 is not set + # CONFIG_RTC_DRV_S35390A is not set + # CONFIG_RTC_DRV_FM3130 is not set ++# CONFIG_RTC_DRV_RX8581 is not set + + # + # SPI RTC drivers +@@ -1802,12 +1998,15 @@ CONFIG_RTC_INTF_DEV=y + # Platform RTC drivers + # + CONFIG_RTC_DRV_CMOS=y ++# CONFIG_RTC_DRV_DS1286 is not set + # CONFIG_RTC_DRV_DS1511 is not set + # CONFIG_RTC_DRV_DS1553 is not set + # CONFIG_RTC_DRV_DS1742 is not set + # CONFIG_RTC_DRV_STK17TA8 is not set + # CONFIG_RTC_DRV_M48T86 is not set ++# CONFIG_RTC_DRV_M48T35 is not set + # CONFIG_RTC_DRV_M48T59 is not set ++# CONFIG_RTC_DRV_BQ4802 is not set + # CONFIG_RTC_DRV_V3020 is not set + + # +@@ -1820,6 +2019,21 @@ CONFIG_DMADEVICES=y + # + # CONFIG_INTEL_IOATDMA is not set + # CONFIG_UIO is not set ++# CONFIG_STAGING is not set ++CONFIG_X86_PLATFORM_DEVICES=y ++# CONFIG_ACER_WMI is not set ++# CONFIG_ASUS_LAPTOP is not set ++# CONFIG_FUJITSU_LAPTOP is not set ++# CONFIG_MSI_LAPTOP is not set ++# CONFIG_PANASONIC_LAPTOP is not set ++# CONFIG_COMPAL_LAPTOP is not set ++# CONFIG_SONY_LAPTOP is not set ++# CONFIG_THINKPAD_ACPI is not set ++# CONFIG_INTEL_MENLOW is not set ++CONFIG_EEEPC_LAPTOP=y ++# CONFIG_ACPI_WMI is not set ++# CONFIG_ACPI_ASUS is not set ++# CONFIG_ACPI_TOSHIBA is not set + + # + # Firmware Drivers +@@ -1830,8 +2044,7 @@ CONFIG_EFI_VARS=y + # CONFIG_DELL_RBU is not set + # CONFIG_DCDBAS is not set + CONFIG_DMIID=y +-CONFIG_ISCSI_IBFT_FIND=y +-CONFIG_ISCSI_IBFT=y ++# CONFIG_ISCSI_IBFT_FIND is not set + + # + # File systems +@@ -1841,22 +2054,25 @@ CONFIG_EXT3_FS=y + CONFIG_EXT3_FS_XATTR=y + CONFIG_EXT3_FS_POSIX_ACL=y + CONFIG_EXT3_FS_SECURITY=y +-# CONFIG_EXT4DEV_FS is not set ++# CONFIG_EXT4_FS is not set + CONFIG_JBD=y + # CONFIG_JBD_DEBUG is not set + CONFIG_FS_MBCACHE=y + # CONFIG_REISERFS_FS is not set + # CONFIG_JFS_FS is not set + CONFIG_FS_POSIX_ACL=y 
++CONFIG_FILE_LOCKING=y + # CONFIG_XFS_FS is not set + # CONFIG_GFS2_FS is not set + # CONFIG_OCFS2_FS is not set ++# CONFIG_BTRFS_FS is not set + CONFIG_DNOTIFY=y + CONFIG_INOTIFY=y + CONFIG_INOTIFY_USER=y + CONFIG_QUOTA=y + CONFIG_QUOTA_NETLINK_INTERFACE=y + # CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QUOTA_TREE=y + # CONFIG_QFMT_V1 is not set + CONFIG_QFMT_V2=y + CONFIG_QUOTACTL=y +@@ -1890,16 +2106,14 @@ CONFIG_PROC_FS=y + CONFIG_PROC_KCORE=y + CONFIG_PROC_VMCORE=y + CONFIG_PROC_SYSCTL=y ++CONFIG_PROC_PAGE_MONITOR=y + CONFIG_SYSFS=y + CONFIG_TMPFS=y + CONFIG_TMPFS_POSIX_ACL=y + CONFIG_HUGETLBFS=y + CONFIG_HUGETLB_PAGE=y + # CONFIG_CONFIGFS_FS is not set +- +-# +-# Miscellaneous filesystems +-# ++CONFIG_MISC_FILESYSTEMS=y + # CONFIG_ADFS_FS is not set + # CONFIG_AFFS_FS is not set + # CONFIG_ECRYPT_FS is not set +@@ -1909,6 +2123,7 @@ CONFIG_HUGETLB_PAGE=y + # CONFIG_BFS_FS is not set + # CONFIG_EFS_FS is not set + # CONFIG_CRAMFS is not set ++# CONFIG_SQUASHFS is not set + # CONFIG_VXFS_FS is not set + # CONFIG_MINIX_FS is not set + # CONFIG_OMFS_FS is not set +@@ -1930,6 +2145,7 @@ CONFIG_NFS_ACL_SUPPORT=y + CONFIG_NFS_COMMON=y + CONFIG_SUNRPC=y + CONFIG_SUNRPC_GSS=y ++# CONFIG_SUNRPC_REGISTER_V4 is not set + CONFIG_RPCSEC_GSS_KRB5=y + # CONFIG_RPCSEC_GSS_SPKM3 is not set + # CONFIG_SMB_FS is not set +@@ -2006,7 +2222,7 @@ CONFIG_NLS_UTF8=y + # + CONFIG_TRACE_IRQFLAGS_SUPPORT=y + CONFIG_PRINTK_TIME=y +-CONFIG_ENABLE_WARN_DEPRECATED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set + CONFIG_ENABLE_MUST_CHECK=y + CONFIG_FRAME_WARN=2048 + CONFIG_MAGIC_SYSRQ=y +@@ -2035,40 +2251,60 @@ CONFIG_TIMER_STATS=y + CONFIG_DEBUG_BUGVERBOSE=y + # CONFIG_DEBUG_INFO is not set + # CONFIG_DEBUG_VM is not set ++# CONFIG_DEBUG_VIRTUAL is not set + # CONFIG_DEBUG_WRITECOUNT is not set + CONFIG_DEBUG_MEMORY_INIT=y + # CONFIG_DEBUG_LIST is not set + # CONFIG_DEBUG_SG is not set ++# CONFIG_DEBUG_NOTIFIERS is not set ++CONFIG_ARCH_WANT_FRAME_POINTERS=y + CONFIG_FRAME_POINTER=y + # CONFIG_BOOT_PRINTK_DELAY is not set + # CONFIG_RCU_TORTURE_TEST is not set ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set + # CONFIG_KPROBES_SANITY_TEST is not set + # CONFIG_BACKTRACE_SELF_TEST is not set ++# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set + # CONFIG_LKDTM is not set + # CONFIG_FAULT_INJECTION is not set + # CONFIG_LATENCYTOP is not set + CONFIG_SYSCTL_SYSCALL_CHECK=y +-CONFIG_HAVE_FTRACE=y ++CONFIG_USER_STACKTRACE_SUPPORT=y ++CONFIG_HAVE_FUNCTION_TRACER=y ++CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y ++CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y + CONFIG_HAVE_DYNAMIC_FTRACE=y +-# CONFIG_FTRACE is not set ++CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y ++CONFIG_HAVE_HW_BRANCH_TRACER=y ++ ++# ++# Tracers ++# ++# CONFIG_FUNCTION_TRACER is not set + # CONFIG_IRQSOFF_TRACER is not set + # CONFIG_SYSPROF_TRACER is not set + # CONFIG_SCHED_TRACER is not set + # CONFIG_CONTEXT_SWITCH_TRACER is not set ++# CONFIG_BOOT_TRACER is not set ++# CONFIG_TRACE_BRANCH_PROFILING is not set ++# CONFIG_POWER_TRACER is not set ++# CONFIG_STACK_TRACER is not set ++# CONFIG_HW_BRANCH_TRACER is not set + CONFIG_PROVIDE_OHCI1394_DMA_INIT=y ++# CONFIG_DYNAMIC_PRINTK_DEBUG is not set + # CONFIG_SAMPLES is not set + CONFIG_HAVE_ARCH_KGDB=y + # CONFIG_KGDB is not set + # CONFIG_STRICT_DEVMEM is not set + CONFIG_X86_VERBOSE_BOOTUP=y + CONFIG_EARLY_PRINTK=y ++CONFIG_EARLY_PRINTK_DBGP=y + CONFIG_DEBUG_STACKOVERFLOW=y + CONFIG_DEBUG_STACK_USAGE=y + # CONFIG_DEBUG_PAGEALLOC is not set + # CONFIG_DEBUG_PER_CPU_MAPS is not set + # CONFIG_X86_PTDUMP is not set + 
CONFIG_DEBUG_RODATA=y +-# CONFIG_DIRECT_GBPAGES is not set + # CONFIG_DEBUG_RODATA_TEST is not set + CONFIG_DEBUG_NX_TEST=m + # CONFIG_IOMMU_DEBUG is not set +@@ -2092,8 +2328,10 @@ CONFIG_OPTIMIZE_INLINING=y + CONFIG_KEYS=y + CONFIG_KEYS_DEBUG_PROC_KEYS=y + CONFIG_SECURITY=y ++# CONFIG_SECURITYFS is not set + CONFIG_SECURITY_NETWORK=y + # CONFIG_SECURITY_NETWORK_XFRM is not set ++# CONFIG_SECURITY_PATH is not set + CONFIG_SECURITY_FILE_CAPABILITIES=y + # CONFIG_SECURITY_ROOTPLUG is not set + CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 +@@ -2104,7 +2342,6 @@ CONFIG_SECURITY_SELINUX_DISABLE=y + CONFIG_SECURITY_SELINUX_DEVELOP=y + CONFIG_SECURITY_SELINUX_AVC_STATS=y + CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 +-# CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT is not set + # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set + # CONFIG_SECURITY_SMACK is not set + CONFIG_CRYPTO=y +@@ -2112,11 +2349,18 @@ CONFIG_CRYPTO=y + # + # Crypto core or helper + # ++# CONFIG_CRYPTO_FIPS is not set + CONFIG_CRYPTO_ALGAPI=y ++CONFIG_CRYPTO_ALGAPI2=y + CONFIG_CRYPTO_AEAD=y ++CONFIG_CRYPTO_AEAD2=y + CONFIG_CRYPTO_BLKCIPHER=y ++CONFIG_CRYPTO_BLKCIPHER2=y + CONFIG_CRYPTO_HASH=y ++CONFIG_CRYPTO_HASH2=y ++CONFIG_CRYPTO_RNG2=y + CONFIG_CRYPTO_MANAGER=y ++CONFIG_CRYPTO_MANAGER2=y + # CONFIG_CRYPTO_GF128MUL is not set + # CONFIG_CRYPTO_NULL is not set + # CONFIG_CRYPTO_CRYPTD is not set +@@ -2151,6 +2395,7 @@ CONFIG_CRYPTO_HMAC=y + # Digest + # + # CONFIG_CRYPTO_CRC32C is not set ++# CONFIG_CRYPTO_CRC32C_INTEL is not set + # CONFIG_CRYPTO_MD4 is not set + CONFIG_CRYPTO_MD5=y + # CONFIG_CRYPTO_MICHAEL_MIC is not set +@@ -2191,6 +2436,11 @@ CONFIG_CRYPTO_DES=y + # + # CONFIG_CRYPTO_DEFLATE is not set + # CONFIG_CRYPTO_LZO is not set ++ ++# ++# Random Number Generation ++# ++# CONFIG_CRYPTO_ANSI_CPRNG is not set + CONFIG_CRYPTO_HW=y + # CONFIG_CRYPTO_DEV_HIFN_795X is not set + CONFIG_HAVE_KVM=y +@@ -2205,6 +2455,7 @@ CONFIG_VIRTUALIZATION=y + CONFIG_BITREVERSE=y + CONFIG_GENERIC_FIND_FIRST_BIT=y + CONFIG_GENERIC_FIND_NEXT_BIT=y ++CONFIG_GENERIC_FIND_LAST_BIT=y + # CONFIG_CRC_CCITT is not set + # CONFIG_CRC16 is not set + CONFIG_CRC_T10DIF=y +Index: linux-2.6-tip/arch/x86/ia32/ia32_signal.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/ia32/ia32_signal.c ++++ linux-2.6-tip/arch/x86/ia32/ia32_signal.c +@@ -33,8 +33,6 @@ + #include + #include + +-#define DEBUG_SIG 0 +- + #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + + #define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ +@@ -46,78 +44,83 @@ void signal_fault(struct pt_regs *regs, + + int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) + { +- int err; ++ int err = 0; + + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) + return -EFAULT; + +- /* If you change siginfo_t structure, please make sure that +- this code is fixed accordingly. +- It should never copy any pad contained in the structure +- to avoid security leaks, but must copy the generic +- 3 ints plus the relevant union member. 
*/ +- err = __put_user(from->si_signo, &to->si_signo); +- err |= __put_user(from->si_errno, &to->si_errno); +- err |= __put_user((short)from->si_code, &to->si_code); +- +- if (from->si_code < 0) { +- err |= __put_user(from->si_pid, &to->si_pid); +- err |= __put_user(from->si_uid, &to->si_uid); +- err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); +- } else { +- /* +- * First 32bits of unions are always present: +- * si_pid === si_band === si_tid === si_addr(LS half) +- */ +- err |= __put_user(from->_sifields._pad[0], +- &to->_sifields._pad[0]); +- switch (from->si_code >> 16) { +- case __SI_FAULT >> 16: +- break; +- case __SI_CHLD >> 16: +- err |= __put_user(from->si_utime, &to->si_utime); +- err |= __put_user(from->si_stime, &to->si_stime); +- err |= __put_user(from->si_status, &to->si_status); +- /* FALL THROUGH */ +- default: +- case __SI_KILL >> 16: +- err |= __put_user(from->si_uid, &to->si_uid); +- break; +- case __SI_POLL >> 16: +- err |= __put_user(from->si_fd, &to->si_fd); +- break; +- case __SI_TIMER >> 16: +- err |= __put_user(from->si_overrun, &to->si_overrun); +- err |= __put_user(ptr_to_compat(from->si_ptr), +- &to->si_ptr); +- break; +- /* This is not generated by the kernel as of now. */ +- case __SI_RT >> 16: +- case __SI_MESGQ >> 16: +- err |= __put_user(from->si_uid, &to->si_uid); +- err |= __put_user(from->si_int, &to->si_int); +- break; ++ put_user_try { ++ /* If you change siginfo_t structure, please make sure that ++ this code is fixed accordingly. ++ It should never copy any pad contained in the structure ++ to avoid security leaks, but must copy the generic ++ 3 ints plus the relevant union member. */ ++ put_user_ex(from->si_signo, &to->si_signo); ++ put_user_ex(from->si_errno, &to->si_errno); ++ put_user_ex((short)from->si_code, &to->si_code); ++ ++ if (from->si_code < 0) { ++ put_user_ex(from->si_pid, &to->si_pid); ++ put_user_ex(from->si_uid, &to->si_uid); ++ put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); ++ } else { ++ /* ++ * First 32bits of unions are always present: ++ * si_pid === si_band === si_tid === si_addr(LS half) ++ */ ++ put_user_ex(from->_sifields._pad[0], ++ &to->_sifields._pad[0]); ++ switch (from->si_code >> 16) { ++ case __SI_FAULT >> 16: ++ break; ++ case __SI_CHLD >> 16: ++ put_user_ex(from->si_utime, &to->si_utime); ++ put_user_ex(from->si_stime, &to->si_stime); ++ put_user_ex(from->si_status, &to->si_status); ++ /* FALL THROUGH */ ++ default: ++ case __SI_KILL >> 16: ++ put_user_ex(from->si_uid, &to->si_uid); ++ break; ++ case __SI_POLL >> 16: ++ put_user_ex(from->si_fd, &to->si_fd); ++ break; ++ case __SI_TIMER >> 16: ++ put_user_ex(from->si_overrun, &to->si_overrun); ++ put_user_ex(ptr_to_compat(from->si_ptr), ++ &to->si_ptr); ++ break; ++ /* This is not generated by the kernel as of now. 
*/ ++ case __SI_RT >> 16: ++ case __SI_MESGQ >> 16: ++ put_user_ex(from->si_uid, &to->si_uid); ++ put_user_ex(from->si_int, &to->si_int); ++ break; ++ } + } +- } ++ } put_user_catch(err); ++ + return err; + } + + int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) + { +- int err; ++ int err = 0; + u32 ptr32; + + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) + return -EFAULT; + +- err = __get_user(to->si_signo, &from->si_signo); +- err |= __get_user(to->si_errno, &from->si_errno); +- err |= __get_user(to->si_code, &from->si_code); +- +- err |= __get_user(to->si_pid, &from->si_pid); +- err |= __get_user(to->si_uid, &from->si_uid); +- err |= __get_user(ptr32, &from->si_ptr); +- to->si_ptr = compat_ptr(ptr32); ++ get_user_try { ++ get_user_ex(to->si_signo, &from->si_signo); ++ get_user_ex(to->si_errno, &from->si_errno); ++ get_user_ex(to->si_code, &from->si_code); ++ ++ get_user_ex(to->si_pid, &from->si_pid); ++ get_user_ex(to->si_uid, &from->si_uid); ++ get_user_ex(ptr32, &from->si_ptr); ++ to->si_ptr = compat_ptr(ptr32); ++ } get_user_catch(err); + + return err; + } +@@ -142,17 +145,23 @@ asmlinkage long sys32_sigaltstack(const + struct pt_regs *regs) + { + stack_t uss, uoss; +- int ret; ++ int ret, err = 0; + mm_segment_t seg; + + if (uss_ptr) { + u32 ptr; + + memset(&uss, 0, sizeof(stack_t)); +- if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)) || +- __get_user(ptr, &uss_ptr->ss_sp) || +- __get_user(uss.ss_flags, &uss_ptr->ss_flags) || +- __get_user(uss.ss_size, &uss_ptr->ss_size)) ++ if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t))) ++ return -EFAULT; ++ ++ get_user_try { ++ get_user_ex(ptr, &uss_ptr->ss_sp); ++ get_user_ex(uss.ss_flags, &uss_ptr->ss_flags); ++ get_user_ex(uss.ss_size, &uss_ptr->ss_size); ++ } get_user_catch(err); ++ ++ if (err) + return -EFAULT; + uss.ss_sp = compat_ptr(ptr); + } +@@ -161,10 +170,16 @@ asmlinkage long sys32_sigaltstack(const + ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp); + set_fs(seg); + if (ret >= 0 && uoss_ptr) { +- if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) || +- __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || +- __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || +- __put_user(uoss.ss_size, &uoss_ptr->ss_size)) ++ if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t))) ++ return -EFAULT; ++ ++ put_user_try { ++ put_user_ex(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp); ++ put_user_ex(uoss.ss_flags, &uoss_ptr->ss_flags); ++ put_user_ex(uoss.ss_size, &uoss_ptr->ss_size); ++ } put_user_catch(err); ++ ++ if (err) + ret = -EFAULT; + } + return ret; +@@ -173,75 +188,78 @@ asmlinkage long sys32_sigaltstack(const + /* + * Do a signal return; undo the signal stack. 
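The hunks above all make the same transformation: instead of accumulating each __put_user()/__get_user() result into err with |=, the accesses are grouped under one put_user_try/get_user_try block after a single access_ok() check, and any fault is collected once by put_user_catch(err)/get_user_catch(err). A minimal sketch of the put_user side, assuming only the try/catch macros this series adds to <asm/uaccess.h>; demo_reply and demo_copy_reply are hypothetical names used for illustration, not code from the patch:

struct demo_reply {
	u32 status;
	u32 flags;
};

static int demo_copy_reply(struct demo_reply __user *to, u32 status, u32 flags)
{
	int err = 0;

	if (!access_ok(VERIFY_WRITE, to, sizeof(*to)))
		return -EFAULT;

	put_user_try {
		put_user_ex(status, &to->status);	/* no per-store return value */
		put_user_ex(flags, &to->flags);
	} put_user_catch(err);				/* err becomes -EFAULT if any store faulted */

	return err;
}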
+ */ ++#define loadsegment_gs(v) load_gs_index(v) ++#define loadsegment_fs(v) loadsegment(fs, v) ++#define loadsegment_ds(v) loadsegment(ds, v) ++#define loadsegment_es(v) loadsegment(es, v) ++ ++#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) ++#define set_user_seg(seg, v) loadsegment_##seg(v) ++ + #define COPY(x) { \ +- err |= __get_user(regs->x, &sc->x); \ ++ get_user_ex(regs->x, &sc->x); \ + } + +-#define COPY_SEG_CPL3(seg) { \ +- unsigned short tmp; \ +- err |= __get_user(tmp, &sc->seg); \ +- regs->seg = tmp | 3; \ +-} ++#define GET_SEG(seg) ({ \ ++ unsigned short tmp; \ ++ get_user_ex(tmp, &sc->seg); \ ++ tmp; \ ++}) ++ ++#define COPY_SEG_CPL3(seg) do { \ ++ regs->seg = GET_SEG(seg) | 3; \ ++} while (0) + + #define RELOAD_SEG(seg) { \ +- unsigned int cur, pre; \ +- err |= __get_user(pre, &sc->seg); \ +- savesegment(seg, cur); \ ++ unsigned int pre = GET_SEG(seg); \ ++ unsigned int cur = get_user_seg(seg); \ + pre |= 3; \ + if (pre != cur) \ +- loadsegment(seg, pre); \ ++ set_user_seg(seg, pre); \ + } + + static int ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_ia32 __user *sc, + unsigned int *pax) + { +- unsigned int tmpflags, gs, oldgs, err = 0; ++ unsigned int tmpflags, err = 0; + void __user *buf; + u32 tmp; + + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + +-#if DEBUG_SIG +- printk(KERN_DEBUG "SIG restore_sigcontext: " +- "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", +- sc, sc->err, sc->ip, sc->cs, sc->flags); +-#endif +- +- /* +- * Reload fs and gs if they have changed in the signal +- * handler. This does not handle long fs/gs base changes in +- * the handler, but does not clobber them at least in the +- * normal case. +- */ +- err |= __get_user(gs, &sc->gs); +- gs |= 3; +- savesegment(gs, oldgs); +- if (gs != oldgs) +- load_gs_index(gs); +- +- RELOAD_SEG(fs); +- RELOAD_SEG(ds); +- RELOAD_SEG(es); +- +- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); +- COPY(dx); COPY(cx); COPY(ip); +- /* Don't touch extended registers */ +- +- COPY_SEG_CPL3(cs); +- COPY_SEG_CPL3(ss); +- +- err |= __get_user(tmpflags, &sc->flags); +- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); +- /* disable syscall checks */ +- regs->orig_ax = -1; +- +- err |= __get_user(tmp, &sc->fpstate); +- buf = compat_ptr(tmp); +- err |= restore_i387_xstate_ia32(buf); ++ get_user_try { ++ /* ++ * Reload fs and gs if they have changed in the signal ++ * handler. This does not handle long fs/gs base changes in ++ * the handler, but does not clobber them at least in the ++ * normal case. 
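The rewritten RELOAD_SEG() above is built from get_user_seg(), which reads the live selector via savesegment(), and set_user_seg(), which reloads it through the matching loadsegment_*() wrapper, and it only touches the register when the saved value (with RPL forced to 3) actually differs. The same logic written out long-hand for one segment, as an illustration only; demo_reload_ds is a hypothetical helper, not part of the patch:

static void demo_reload_ds(unsigned int pre)
{
	unsigned int cur;

	savesegment(ds, cur);		/* current %ds selector */
	pre |= 3;			/* force user RPL, as RELOAD_SEG() does */
	if (pre != cur)
		loadsegment(ds, pre);	/* reload only if it changed */
}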
++ */ ++ RELOAD_SEG(gs); ++ RELOAD_SEG(fs); ++ RELOAD_SEG(ds); ++ RELOAD_SEG(es); ++ ++ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); ++ COPY(dx); COPY(cx); COPY(ip); ++ /* Don't touch extended registers */ ++ ++ COPY_SEG_CPL3(cs); ++ COPY_SEG_CPL3(ss); ++ ++ get_user_ex(tmpflags, &sc->flags); ++ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); ++ /* disable syscall checks */ ++ regs->orig_ax = -1; ++ ++ get_user_ex(tmp, &sc->fpstate); ++ buf = compat_ptr(tmp); ++ err |= restore_i387_xstate_ia32(buf); ++ ++ get_user_ex(*pax, &sc->ax); ++ } get_user_catch(err); + +- err |= __get_user(*pax, &sc->ax); + return err; + } + +@@ -317,38 +335,36 @@ static int ia32_setup_sigcontext(struct + void __user *fpstate, + struct pt_regs *regs, unsigned int mask) + { +- int tmp, err = 0; ++ int err = 0; + +- savesegment(gs, tmp); +- err |= __put_user(tmp, (unsigned int __user *)&sc->gs); +- savesegment(fs, tmp); +- err |= __put_user(tmp, (unsigned int __user *)&sc->fs); +- savesegment(ds, tmp); +- err |= __put_user(tmp, (unsigned int __user *)&sc->ds); +- savesegment(es, tmp); +- err |= __put_user(tmp, (unsigned int __user *)&sc->es); +- +- err |= __put_user(regs->di, &sc->di); +- err |= __put_user(regs->si, &sc->si); +- err |= __put_user(regs->bp, &sc->bp); +- err |= __put_user(regs->sp, &sc->sp); +- err |= __put_user(regs->bx, &sc->bx); +- err |= __put_user(regs->dx, &sc->dx); +- err |= __put_user(regs->cx, &sc->cx); +- err |= __put_user(regs->ax, &sc->ax); +- err |= __put_user(current->thread.trap_no, &sc->trapno); +- err |= __put_user(current->thread.error_code, &sc->err); +- err |= __put_user(regs->ip, &sc->ip); +- err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); +- err |= __put_user(regs->flags, &sc->flags); +- err |= __put_user(regs->sp, &sc->sp_at_signal); +- err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); +- +- err |= __put_user(ptr_to_compat(fpstate), &sc->fpstate); +- +- /* non-iBCS2 extensions.. */ +- err |= __put_user(mask, &sc->oldmask); +- err |= __put_user(current->thread.cr2, &sc->cr2); ++ put_user_try { ++ put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs); ++ put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs); ++ put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds); ++ put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es); ++ ++ put_user_ex(regs->di, &sc->di); ++ put_user_ex(regs->si, &sc->si); ++ put_user_ex(regs->bp, &sc->bp); ++ put_user_ex(regs->sp, &sc->sp); ++ put_user_ex(regs->bx, &sc->bx); ++ put_user_ex(regs->dx, &sc->dx); ++ put_user_ex(regs->cx, &sc->cx); ++ put_user_ex(regs->ax, &sc->ax); ++ put_user_ex(current->thread.trap_no, &sc->trapno); ++ put_user_ex(current->thread.error_code, &sc->err); ++ put_user_ex(regs->ip, &sc->ip); ++ put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); ++ put_user_ex(regs->flags, &sc->flags); ++ put_user_ex(regs->sp, &sc->sp_at_signal); ++ put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); ++ ++ put_user_ex(ptr_to_compat(fpstate), &sc->fpstate); ++ ++ /* non-iBCS2 extensions.. */ ++ put_user_ex(mask, &sc->oldmask); ++ put_user_ex(current->thread.cr2, &sc->cr2); ++ } put_user_catch(err); + + return err; + } +@@ -437,13 +453,17 @@ int ia32_setup_frame(int sig, struct k_s + else + restorer = &frame->retcode; + } +- err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); + +- /* +- * These are actually not used anymore, but left because some +- * gdb versions depend on them as a marker. 
+- */ +- err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); ++ put_user_try { ++ put_user_ex(ptr_to_compat(restorer), &frame->pretcode); ++ ++ /* ++ * These are actually not used anymore, but left because some ++ * gdb versions depend on them as a marker. ++ */ ++ put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); ++ } put_user_catch(err); ++ + if (err) + return -EFAULT; + +@@ -462,11 +482,6 @@ int ia32_setup_frame(int sig, struct k_s + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + +-#if DEBUG_SIG +- printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", +- current->comm, current->pid, frame, regs->ip, frame->pretcode); +-#endif +- + return 0; + } + +@@ -496,41 +511,40 @@ int ia32_setup_rt_frame(int sig, struct + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + +- err |= __put_user(sig, &frame->sig); +- err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); +- err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); +- err |= copy_siginfo_to_user32(&frame->info, info); +- if (err) +- return -EFAULT; ++ put_user_try { ++ put_user_ex(sig, &frame->sig); ++ put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo); ++ put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); ++ err |= copy_siginfo_to_user32(&frame->info, info); ++ ++ /* Create the ucontext. */ ++ if (cpu_has_xsave) ++ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); ++ else ++ put_user_ex(0, &frame->uc.uc_flags); ++ put_user_ex(0, &frame->uc.uc_link); ++ put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); ++ put_user_ex(sas_ss_flags(regs->sp), ++ &frame->uc.uc_stack.ss_flags); ++ put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); ++ err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, ++ regs, set->sig[0]); ++ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + +- /* Create the ucontext. */ +- if (cpu_has_xsave) +- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); +- else +- err |= __put_user(0, &frame->uc.uc_flags); +- err |= __put_user(0, &frame->uc.uc_link); +- err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); +- err |= __put_user(sas_ss_flags(regs->sp), +- &frame->uc.uc_stack.ss_flags); +- err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); +- err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, +- regs, set->sig[0]); +- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); +- if (err) +- return -EFAULT; ++ if (ka->sa.sa_flags & SA_RESTORER) ++ restorer = ka->sa.sa_restorer; ++ else ++ restorer = VDSO32_SYMBOL(current->mm->context.vdso, ++ rt_sigreturn); ++ put_user_ex(ptr_to_compat(restorer), &frame->pretcode); ++ ++ /* ++ * Not actually used anymore, but left because some gdb ++ * versions need it. ++ */ ++ put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); ++ } put_user_catch(err); + +- if (ka->sa.sa_flags & SA_RESTORER) +- restorer = ka->sa.sa_restorer; +- else +- restorer = VDSO32_SYMBOL(current->mm->context.vdso, +- rt_sigreturn); +- err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); +- +- /* +- * Not actually used anymore, but left because some gdb +- * versions need it. 
+- */ +- err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); + if (err) + return -EFAULT; + +@@ -549,10 +563,5 @@ int ia32_setup_rt_frame(int sig, struct + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + +-#if DEBUG_SIG +- printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", +- current->comm, current->pid, frame, regs->ip, frame->pretcode); +-#endif +- + return 0; + } +Index: linux-2.6-tip/arch/x86/ia32/ia32entry.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/ia32/ia32entry.S ++++ linux-2.6-tip/arch/x86/ia32/ia32entry.S +@@ -112,8 +112,8 @@ ENTRY(ia32_sysenter_target) + CFI_DEF_CFA rsp,0 + CFI_REGISTER rsp,rbp + SWAPGS_UNSAFE_STACK +- movq %gs:pda_kernelstack, %rsp +- addq $(PDA_STACKOFFSET),%rsp ++ movq PER_CPU_VAR(kernel_stack), %rsp ++ addq $(KERNEL_STACK_OFFSET),%rsp + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs, here we enable it straight after entry: +@@ -273,13 +273,13 @@ ENDPROC(ia32_sysenter_target) + ENTRY(ia32_cstar_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME +- CFI_DEF_CFA rsp,PDA_STACKOFFSET ++ CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET + CFI_REGISTER rip,rcx + /*CFI_REGISTER rflags,r11*/ + SWAPGS_UNSAFE_STACK + movl %esp,%r8d + CFI_REGISTER rsp,r8 +- movq %gs:pda_kernelstack,%rsp ++ movq PER_CPU_VAR(kernel_stack),%rsp + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: +@@ -825,7 +825,11 @@ ia32_sys_call_table: + .quad compat_sys_signalfd4 + .quad sys_eventfd2 + .quad sys_epoll_create1 +- .quad sys_dup3 /* 330 */ ++ .quad sys_dup3 /* 330 */ + .quad sys_pipe2 + .quad sys_inotify_init1 ++ .quad quiet_ni_syscall /* preadv */ ++ .quad quiet_ni_syscall /* pwritev */ ++ .quad compat_sys_rt_tgsigqueueinfo /* 335 */ ++ .quad sys_perf_counter_open + ia32_syscall_end: +Index: linux-2.6-tip/arch/x86/include/asm/a.out-core.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/a.out-core.h ++++ linux-2.6-tip/arch/x86/include/asm/a.out-core.h +@@ -55,7 +55,7 @@ static inline void aout_dump_thread(stru + dump->regs.ds = (u16)regs->ds; + dump->regs.es = (u16)regs->es; + dump->regs.fs = (u16)regs->fs; +- savesegment(gs, dump->regs.gs); ++ dump->regs.gs = get_user_gs(regs); + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; + dump->regs.cs = (u16)regs->cs; +Index: linux-2.6-tip/arch/x86/include/asm/acpi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/acpi.h ++++ linux-2.6-tip/arch/x86/include/asm/acpi.h +@@ -50,8 +50,8 @@ + + #define ACPI_ASM_MACROS + #define BREAKPOINT3 +-#define ACPI_DISABLE_IRQS() local_irq_disable() +-#define ACPI_ENABLE_IRQS() local_irq_enable() ++#define ACPI_DISABLE_IRQS() local_irq_disable_nort() ++#define ACPI_ENABLE_IRQS() local_irq_enable_nort() + #define ACPI_FLUSH_CPU_CACHE() wbinvd() + + int __acpi_acquire_global_lock(unsigned int *lock); +@@ -102,9 +102,6 @@ static inline void disable_acpi(void) + acpi_noirq = 1; + } + +-/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */ +-#define FIX_ACPI_PAGES 4 +- + extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); + + static inline void acpi_noirq_set(void) { acpi_noirq = 1; } +Index: linux-2.6-tip/arch/x86/include/asm/apic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/apic.h ++++ 
linux-2.6-tip/arch/x86/include/asm/apic.h +@@ -1,15 +1,18 @@ + #ifndef _ASM_X86_APIC_H + #define _ASM_X86_APIC_H + +-#include ++#include + #include ++#include + + #include +-#include +-#include ++#include + #include ++#include ++#include ++#include ++#include + #include +-#include + #include + + #define ARCH_APICTIMER_STOPS_ON_C3 1 +@@ -33,7 +36,13 @@ + } while (0) + + ++#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) + extern void generic_apic_probe(void); ++#else ++static inline void generic_apic_probe(void) ++{ ++} ++#endif + + #ifdef CONFIG_X86_LOCAL_APIC + +@@ -41,6 +50,21 @@ extern unsigned int apic_verbosity; + extern int local_apic_timer_c2_ok; + + extern int disable_apic; ++ ++#ifdef CONFIG_SMP ++extern void __inquire_remote_apic(int apicid); ++#else /* CONFIG_SMP */ ++static inline void __inquire_remote_apic(int apicid) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++static inline void default_inquire_remote_apic(int apicid) ++{ ++ if (apic_verbosity >= APIC_DEBUG) ++ __inquire_remote_apic(apicid); ++} ++ + /* + * Basic functions accessing APICs. + */ +@@ -51,7 +75,14 @@ extern int disable_apic; + #define setup_secondary_clock setup_secondary_APIC_clock + #endif + ++#ifdef CONFIG_X86_VSMP + extern int is_vsmp_box(void); ++#else ++static inline int is_vsmp_box(void) ++{ ++ return 0; ++} ++#endif + extern void xapic_wait_icr_idle(void); + extern u32 safe_xapic_wait_icr_idle(void); + extern void xapic_icr_write(u32, u32); +@@ -71,6 +102,22 @@ static inline u32 native_apic_mem_read(u + return *((volatile u32 *)(APIC_BASE + reg)); + } + ++extern void native_apic_wait_icr_idle(void); ++extern u32 native_safe_apic_wait_icr_idle(void); ++extern void native_apic_icr_write(u32 low, u32 id); ++extern u64 native_apic_icr_read(void); ++ ++#ifdef CONFIG_X86_X2APIC ++/* ++ * Make previous memory operations globally visible before ++ * sending the IPI through x2apic wrmsr. We need a serializing instruction or ++ * mfence for this. 
++ */ ++static inline void x2apic_wrmsr_fence(void) ++{ ++ asm volatile("mfence" : : : "memory"); ++} ++ + static inline void native_apic_msr_write(u32 reg, u32 v) + { + if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || +@@ -91,8 +138,32 @@ static inline u32 native_apic_msr_read(u + return low; + } + +-#ifndef CONFIG_X86_32 +-extern int x2apic; ++static inline void native_x2apic_wait_icr_idle(void) ++{ ++ /* no need to wait for icr idle in x2apic */ ++ return; ++} ++ ++static inline u32 native_safe_x2apic_wait_icr_idle(void) ++{ ++ /* no need to wait for icr idle in x2apic */ ++ return 0; ++} ++ ++static inline void native_x2apic_icr_write(u32 low, u32 id) ++{ ++ wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); ++} ++ ++static inline u64 native_x2apic_icr_read(void) ++{ ++ unsigned long val; ++ ++ rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); ++ return val; ++} ++ ++extern int x2apic, x2apic_phys; + extern void check_x2apic(void); + extern void enable_x2apic(void); + extern void enable_IR_x2apic(void); +@@ -110,30 +181,27 @@ static inline int x2apic_enabled(void) + return 0; + } + #else +-#define x2apic_enabled() 0 +-#endif +- +-struct apic_ops { +- u32 (*read)(u32 reg); +- void (*write)(u32 reg, u32 v); +- u64 (*icr_read)(void); +- void (*icr_write)(u32 low, u32 high); +- void (*wait_icr_idle)(void); +- u32 (*safe_wait_icr_idle)(void); +-}; ++static inline void check_x2apic(void) ++{ ++} ++static inline void enable_x2apic(void) ++{ ++} ++static inline void enable_IR_x2apic(void) ++{ ++} ++static inline int x2apic_enabled(void) ++{ ++ return 0; ++} + +-extern struct apic_ops *apic_ops; ++#define x2apic 0 + +-#define apic_read (apic_ops->read) +-#define apic_write (apic_ops->write) +-#define apic_icr_read (apic_ops->icr_read) +-#define apic_icr_write (apic_ops->icr_write) +-#define apic_wait_icr_idle (apic_ops->wait_icr_idle) +-#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle) ++#endif + + extern int get_physical_broadcast(void); + +-#ifdef CONFIG_X86_64 ++#ifdef CONFIG_X86_X2APIC + static inline void ack_x2APIC_irq(void) + { + /* Docs say use 0 for future compatibility */ +@@ -141,18 +209,6 @@ static inline void ack_x2APIC_irq(void) + } + #endif + +- +-static inline void ack_APIC_irq(void) +-{ +- /* +- * ack_APIC_irq() actually gets compiled as a single instruction +- * ... yummie. +- */ +- +- /* Docs say use 0 for future compatibility */ +- apic_write(APIC_EOI, 0); +-} +- + extern int lapic_get_maxlvt(void); + extern void clear_local_APIC(void); + extern void connect_bsp_APIC(void); +@@ -196,4 +252,329 @@ static inline void disable_local_APIC(vo + + #endif /* !CONFIG_X86_LOCAL_APIC */ + ++#ifdef CONFIG_X86_64 ++#define SET_APIC_ID(x) (apic->set_apic_id(x)) ++#else ++ ++#endif ++ ++/* ++ * Copyright 2004 James Cleverdon, IBM. ++ * Subject to the GNU Public License, v.2 ++ * ++ * Generic APIC sub-arch data struct. ++ * ++ * Hacked for x86-64 by James Cleverdon from i386 architecture code by ++ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and ++ * James Cleverdon. 
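With x2apic the ICR is a single 64-bit MSR: native_x2apic_icr_write() above packs the destination id into the high half and issues one wrmsr, and x2apic_wrmsr_fence() supplies the store fence that its comment asks for before the IPI write. A sketch of how an x2apic IPI path strings these together; demo_x2apic_send_ipi is a hypothetical wrapper, and APIC_DM_FIXED / APIC_DEST_PHYSICAL are assumed to come from <asm/apicdef.h>:

static void demo_x2apic_send_ipi(unsigned int apicid, unsigned int vector)
{
	unsigned long cfg = APIC_DM_FIXED | APIC_DEST_PHYSICAL | vector;

	x2apic_wrmsr_fence();			/* order earlier stores before the IPI */
	native_x2apic_icr_write(cfg, apicid);	/* one wrmsr: id in bits 63:32, cfg below */
}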
++ */ ++struct apic { ++ char *name; ++ ++ int (*probe)(void); ++ int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); ++ int (*apic_id_registered)(void); ++ ++ u32 irq_delivery_mode; ++ u32 irq_dest_mode; ++ ++ const struct cpumask *(*target_cpus)(void); ++ ++ int disable_esr; ++ ++ int dest_logical; ++ unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); ++ unsigned long (*check_apicid_present)(int apicid); ++ ++ void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); ++ void (*init_apic_ldr)(void); ++ ++ physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); ++ ++ void (*setup_apic_routing)(void); ++ int (*multi_timer_check)(int apic, int irq); ++ int (*apicid_to_node)(int logical_apicid); ++ int (*cpu_to_logical_apicid)(int cpu); ++ int (*cpu_present_to_apicid)(int mps_cpu); ++ physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); ++ void (*setup_portio_remap)(void); ++ int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); ++ void (*enable_apic_mode)(void); ++ int (*phys_pkg_id)(int cpuid_apic, int index_msb); ++ ++ /* ++ * When one of the next two hooks returns 1 the apic ++ * is switched to this. Essentially they are additional ++ * probe functions: ++ */ ++ int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid); ++ ++ unsigned int (*get_apic_id)(unsigned long x); ++ unsigned long (*set_apic_id)(unsigned int id); ++ unsigned long apic_id_mask; ++ ++ unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); ++ unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, ++ const struct cpumask *andmask); ++ ++ /* ipi */ ++ void (*send_IPI_mask)(const struct cpumask *mask, int vector); ++ void (*send_IPI_mask_allbutself)(const struct cpumask *mask, ++ int vector); ++ void (*send_IPI_allbutself)(int vector); ++ void (*send_IPI_all)(int vector); ++ void (*send_IPI_self)(int vector); ++ ++ /* wakeup_secondary_cpu */ ++ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); ++ ++ int trampoline_phys_low; ++ int trampoline_phys_high; ++ ++ void (*wait_for_init_deassert)(atomic_t *deassert); ++ void (*smp_callin_clear_local_apic)(void); ++ void (*inquire_remote_apic)(int apicid); ++ ++ /* apic ops */ ++ u32 (*read)(u32 reg); ++ void (*write)(u32 reg, u32 v); ++ u64 (*icr_read)(void); ++ void (*icr_write)(u32 low, u32 high); ++ void (*wait_icr_idle)(void); ++ u32 (*safe_wait_icr_idle)(void); ++}; ++ ++/* ++ * Pointer to the local APIC driver in use on this system (there's ++ * always just one such driver in use - the kernel decides via an ++ * early probing process which one it picks - and then sticks to it): ++ */ ++extern struct apic *apic; ++ ++/* ++ * APIC functionality to boot other CPUs - only used on SMP: ++ */ ++#ifdef CONFIG_SMP ++extern atomic_t init_deasserted; ++extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); ++#endif ++ ++static inline u32 apic_read(u32 reg) ++{ ++ return apic->read(reg); ++} ++ ++static inline void apic_write(u32 reg, u32 val) ++{ ++ apic->write(reg, val); ++} ++ ++static inline u64 apic_icr_read(void) ++{ ++ return apic->icr_read(); ++} ++ ++static inline void apic_icr_write(u32 low, u32 high) ++{ ++ apic->icr_write(low, high); ++} ++ ++static inline void apic_wait_icr_idle(void) ++{ ++ apic->wait_icr_idle(); ++} ++ ++static inline u32 safe_apic_wait_icr_idle(void) ++{ ++ return apic->safe_wait_icr_idle(); ++} ++ ++ ++static inline void ack_APIC_irq(void) ++{ ++#ifdef CONFIG_X86_LOCAL_APIC ++ /* ++ * ack_APIC_irq() actually gets 
compiled as a single instruction ++ * ... yummie. ++ */ ++ ++ /* Docs say use 0 for future compatibility */ ++ apic_write(APIC_EOI, 0); ++#endif ++} ++ ++static inline unsigned default_get_apic_id(unsigned long x) ++{ ++ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); ++ ++ if (APIC_XAPIC(ver)) ++ return (x >> 24) & 0xFF; ++ else ++ return (x >> 24) & 0x0F; ++} ++ ++/* ++ * Warm reset vector default position: ++ */ ++#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467 ++#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 ++ ++#ifdef CONFIG_X86_64 ++extern struct apic apic_flat; ++extern struct apic apic_physflat; ++extern struct apic apic_x2apic_cluster; ++extern struct apic apic_x2apic_phys; ++extern int default_acpi_madt_oem_check(char *, char *); ++ ++extern void apic_send_IPI_self(int vector); ++ ++extern struct apic apic_x2apic_uv_x; ++DECLARE_PER_CPU(int, x2apic_extra_bits); ++ ++extern int default_cpu_present_to_apicid(int mps_cpu); ++extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); ++#endif ++ ++static inline void default_wait_for_init_deassert(atomic_t *deassert) ++{ ++ while (!atomic_read(deassert)) ++ cpu_relax(); ++ return; ++} ++ ++extern void generic_bigsmp_probe(void); ++ ++ ++#ifdef CONFIG_X86_LOCAL_APIC ++ ++#include ++ ++#define APIC_DFR_VALUE (APIC_DFR_FLAT) ++ ++static inline const struct cpumask *default_target_cpus(void) ++{ ++#ifdef CONFIG_SMP ++ return cpu_online_mask; ++#else ++ return cpumask_of(0); ++#endif ++} ++ ++DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); ++ ++ ++static inline unsigned int read_apic_id(void) ++{ ++ unsigned int reg; ++ ++ reg = apic_read(APIC_ID); ++ ++ return apic->get_apic_id(reg); ++} ++ ++extern void default_setup_apic_routing(void); ++ ++#ifdef CONFIG_X86_32 ++/* ++ * Set up the logical destination ID. ++ * ++ * Intel recommends to set DFR, LDR and TPR before enabling ++ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel ++ * document number 292116). So here it goes... 
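Everything that used to be split between genapic and apic_ops is now one struct apic ops table, and the global apic pointer chosen at probe time is the only way the rest of the kernel reaches it; apic_read(), read_apic_id() and friends above are thin wrappers over that pointer. A hypothetical, heavily trimmed driver instance, shown only to illustrate the shape; a real driver such as apic_flat fills in every field and is selected by the early probe code:

static struct apic apic_demo = {
	.name			= "demo",
	.irq_delivery_mode	= dest_Fixed,
	.irq_dest_mode		= 0,			/* physical destinations */
	.read			= native_apic_mem_read,
	.write			= native_apic_mem_write,
	.icr_read		= native_apic_icr_read,
	.icr_write		= native_apic_icr_write,
	.wait_icr_idle		= native_apic_wait_icr_idle,
	.safe_wait_icr_idle	= native_safe_apic_wait_icr_idle,
};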
++ */ ++extern void default_init_apic_ldr(void); ++ ++static inline int default_apic_id_registered(void) ++{ ++ return physid_isset(read_apic_id(), phys_cpu_present_map); ++} ++ ++static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) ++{ ++ return cpuid_apic >> index_msb; ++} ++ ++extern int default_apicid_to_node(int logical_apicid); ++ ++#endif ++ ++static inline unsigned int ++default_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; ++} ++ ++static inline unsigned int ++default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, ++ const struct cpumask *andmask) ++{ ++ unsigned long mask1 = cpumask_bits(cpumask)[0]; ++ unsigned long mask2 = cpumask_bits(andmask)[0]; ++ unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; ++ ++ return (unsigned int)(mask1 & mask2 & mask3); ++} ++ ++static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) ++{ ++ return physid_isset(apicid, bitmap); ++} ++ ++static inline unsigned long default_check_apicid_present(int bit) ++{ ++ return physid_isset(bit, phys_cpu_present_map); ++} ++ ++static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map) ++{ ++ return phys_map; ++} ++ ++/* Mapping from cpu number to logical apicid */ ++static inline int default_cpu_to_logical_apicid(int cpu) ++{ ++ return 1 << cpu; ++} ++ ++static inline int __default_cpu_present_to_apicid(int mps_cpu) ++{ ++ if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) ++ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); ++ else ++ return BAD_APICID; ++} ++ ++static inline int ++__default_check_phys_apicid_present(int boot_cpu_physical_apicid) ++{ ++ return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); ++} ++ ++#ifdef CONFIG_X86_32 ++static inline int default_cpu_present_to_apicid(int mps_cpu) ++{ ++ return __default_cpu_present_to_apicid(mps_cpu); ++} ++ ++static inline int ++default_check_phys_apicid_present(int boot_cpu_physical_apicid) ++{ ++ return __default_check_phys_apicid_present(boot_cpu_physical_apicid); ++} ++#else ++extern int default_cpu_present_to_apicid(int mps_cpu); ++extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); ++#endif ++ ++static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) ++{ ++ return physid_mask_of_physid(phys_apicid); ++} ++ ++#endif /* CONFIG_X86_LOCAL_APIC */ ++ ++#ifdef CONFIG_X86_32 ++extern u8 cpu_2_logical_apicid[NR_CPUS]; ++#endif ++ + #endif /* _ASM_X86_APIC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/apicdef.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/apicdef.h ++++ linux-2.6-tip/arch/x86/include/asm/apicdef.h +@@ -53,6 +53,7 @@ + #define APIC_ESR_SENDILL 0x00020 + #define APIC_ESR_RECVILL 0x00040 + #define APIC_ESR_ILLREGA 0x00080 ++#define APIC_LVTCMCI 0x2f0 + #define APIC_ICR 0x300 + #define APIC_DEST_SELF 0x40000 + #define APIC_DEST_ALLINC 0x80000 +Index: linux-2.6-tip/arch/x86/include/asm/apicnum.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/apicnum.h +@@ -0,0 +1,12 @@ ++#ifndef _ASM_X86_APICNUM_H ++#define _ASM_X86_APICNUM_H ++ ++/* define MAX_IO_APICS */ ++#ifdef CONFIG_X86_32 ++# define MAX_IO_APICS 64 ++#else ++# define MAX_IO_APICS 128 ++# define MAX_LOCAL_APIC 32768 ++#endif ++ ++#endif /* _ASM_X86_APICNUM_H */ +Index: linux-2.6-tip/arch/x86/include/asm/apm.h 
+=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/apm.h +@@ -0,0 +1,73 @@ ++/* ++ * Machine specific APM BIOS functions for generic. ++ * Split out from apm.c by Osamu Tomita ++ */ ++ ++#ifndef _ASM_X86_MACH_DEFAULT_APM_H ++#define _ASM_X86_MACH_DEFAULT_APM_H ++ ++#ifdef APM_ZERO_SEGS ++# define APM_DO_ZERO_SEGS \ ++ "pushl %%ds\n\t" \ ++ "pushl %%es\n\t" \ ++ "xorl %%edx, %%edx\n\t" \ ++ "mov %%dx, %%ds\n\t" \ ++ "mov %%dx, %%es\n\t" \ ++ "mov %%dx, %%fs\n\t" \ ++ "mov %%dx, %%gs\n\t" ++# define APM_DO_POP_SEGS \ ++ "popl %%es\n\t" \ ++ "popl %%ds\n\t" ++#else ++# define APM_DO_ZERO_SEGS ++# define APM_DO_POP_SEGS ++#endif ++ ++static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, ++ u32 *eax, u32 *ebx, u32 *ecx, ++ u32 *edx, u32 *esi) ++{ ++ /* ++ * N.B. We do NOT need a cld after the BIOS call ++ * because we always save and restore the flags. ++ */ ++ __asm__ __volatile__(APM_DO_ZERO_SEGS ++ "pushl %%edi\n\t" ++ "pushl %%ebp\n\t" ++ "lcall *%%cs:apm_bios_entry\n\t" ++ "setc %%al\n\t" ++ "popl %%ebp\n\t" ++ "popl %%edi\n\t" ++ APM_DO_POP_SEGS ++ : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx), ++ "=S" (*esi) ++ : "a" (func), "b" (ebx_in), "c" (ecx_in) ++ : "memory", "cc"); ++} ++ ++static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, ++ u32 ecx_in, u32 *eax) ++{ ++ int cx, dx, si; ++ u8 error; ++ ++ /* ++ * N.B. We do NOT need a cld after the BIOS call ++ * because we always save and restore the flags. ++ */ ++ __asm__ __volatile__(APM_DO_ZERO_SEGS ++ "pushl %%edi\n\t" ++ "pushl %%ebp\n\t" ++ "lcall *%%cs:apm_bios_entry\n\t" ++ "setc %%bl\n\t" ++ "popl %%ebp\n\t" ++ "popl %%edi\n\t" ++ APM_DO_POP_SEGS ++ : "=a" (*eax), "=b" (error), "=c" (cx), "=d" (dx), ++ "=S" (si) ++ : "a" (func), "b" (ebx_in), "c" (ecx_in) ++ : "memory", "cc"); ++ return error; ++} ++ ++#endif /* _ASM_X86_MACH_DEFAULT_APM_H */ +Index: linux-2.6-tip/arch/x86/include/asm/arch_hooks.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/arch_hooks.h ++++ /dev/null +@@ -1,26 +0,0 @@ +-#ifndef _ASM_X86_ARCH_HOOKS_H +-#define _ASM_X86_ARCH_HOOKS_H +- +-#include +- +-/* +- * linux/include/asm/arch_hooks.h +- * +- * define the architecture specific hooks +- */ +- +-/* these aren't arch hooks, they are generic routines +- * that can be used by the hooks */ +-extern void init_ISA_irqs(void); +-extern irqreturn_t timer_interrupt(int irq, void *dev_id); +- +-/* these are the defined hooks */ +-extern void intr_init_hook(void); +-extern void pre_intr_init_hook(void); +-extern void pre_setup_arch_hook(void); +-extern void trap_init_hook(void); +-extern void pre_time_init_hook(void); +-extern void time_init_hook(void); +-extern void mca_nmi_hook(void); +- +-#endif /* _ASM_X86_ARCH_HOOKS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/atomic_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/atomic_32.h ++++ linux-2.6-tip/arch/x86/include/asm/atomic_32.h +@@ -180,10 +180,10 @@ static inline int atomic_add_return(int + + #ifdef CONFIG_M386 + no_xadd: /* Legacy 386 processor */ +- local_irq_save(flags); ++ raw_local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); +- local_irq_restore(flags); ++ raw_local_irq_restore(flags); + return i + __i; + #endif + } +@@ -247,5 +247,223 @@ static inline int atomic_add_unless(atom + #define smp_mb__before_atomic_inc() barrier() + #define 
smp_mb__after_atomic_inc() barrier() + ++/* An 64bit atomic type */ ++ ++typedef struct { ++ unsigned long long counter; ++} atomic64_t; ++ ++#define ATOMIC64_INIT(val) { (val) } ++ ++/** ++ * atomic64_read - read atomic64 variable ++ * @v: pointer of type atomic64_t ++ * ++ * Atomically reads the value of @v. ++ * Doesn't imply a read memory barrier. ++ */ ++#define __atomic64_read(ptr) ((ptr)->counter) ++ ++static inline unsigned long long ++cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) ++{ ++ asm volatile( ++ ++ LOCK_PREFIX "cmpxchg8b (%[ptr])\n" ++ ++ : "=A" (old) ++ ++ : [ptr] "D" (ptr), ++ "A" (old), ++ "b" (ll_low(new)), ++ "c" (ll_high(new)) ++ ++ : "memory"); ++ ++ return old; ++} ++ ++static inline unsigned long long ++atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, ++ unsigned long long new_val) ++{ ++ return cmpxchg8b(&ptr->counter, old_val, new_val); ++} ++ ++/** ++ * atomic64_set - set atomic64 variable ++ * @ptr: pointer to type atomic64_t ++ * @new_val: value to assign ++ * ++ * Atomically sets the value of @ptr to @new_val. ++ */ ++static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) ++{ ++ unsigned long long old_val; ++ ++ do { ++ old_val = atomic_read(ptr); ++ } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); ++} ++ ++/** ++ * atomic64_read - read atomic64 variable ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically reads the value of @ptr and returns it. ++ */ ++static inline unsigned long long atomic64_read(atomic64_t *ptr) ++{ ++ unsigned long long curr_val; ++ ++ do { ++ curr_val = __atomic64_read(ptr); ++ } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); ++ ++ return curr_val; ++} ++ ++/** ++ * atomic64_add_return - add and return ++ * @delta: integer value to add ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically adds @delta to @ptr and returns @delta + *@ptr ++ */ ++static inline unsigned long long ++atomic64_add_return(unsigned long long delta, atomic64_t *ptr) ++{ ++ unsigned long long old_val, new_val; ++ ++ do { ++ old_val = atomic_read(ptr); ++ new_val = old_val + delta; ++ ++ } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); ++ ++ return new_val; ++} ++ ++static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) ++{ ++ return atomic64_add_return(-delta, ptr); ++} ++ ++static inline long atomic64_inc_return(atomic64_t *ptr) ++{ ++ return atomic64_add_return(1, ptr); ++} ++ ++static inline long atomic64_dec_return(atomic64_t *ptr) ++{ ++ return atomic64_sub_return(1, ptr); ++} ++ ++/** ++ * atomic64_add - add integer to atomic64 variable ++ * @delta: integer value to add ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically adds @delta to @ptr. ++ */ ++static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) ++{ ++ atomic64_add_return(delta, ptr); ++} ++ ++/** ++ * atomic64_sub - subtract the atomic64 variable ++ * @delta: integer value to subtract ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically subtracts @delta from @ptr. ++ */ ++static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) ++{ ++ atomic64_add(-delta, ptr); ++} ++ ++/** ++ * atomic64_sub_and_test - subtract value from variable and test result ++ * @delta: integer value to subtract ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically subtracts @delta from @ptr and returns ++ * true if the result is zero, or false for all ++ * other cases. 
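The atomic64_t type being added here gives 32-bit x86 a lock-free 64-bit counter by routing every operation through a cmpxchg8b retry loop. A short usage sketch built only on the helpers defined above; demo_bytes and the two functions are illustrative, not part of the patch:

static atomic64_t demo_bytes = ATOMIC64_INIT(0);

static void demo_account(unsigned long long len)
{
	atomic64_add(len, &demo_bytes);		/* cmpxchg8b loop under the hood */
}

static unsigned long long demo_snapshot(void)
{
	return atomic64_read(&demo_bytes);	/* coherent 64-bit read, no lock */
}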
++ */ ++static inline int ++atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) ++{ ++ unsigned long long old_val = atomic64_sub_return(delta, ptr); ++ ++ return old_val == 0; ++} ++ ++/** ++ * atomic64_inc - increment atomic64 variable ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically increments @ptr by 1. ++ */ ++static inline void atomic64_inc(atomic64_t *ptr) ++{ ++ atomic64_add(1, ptr); ++} ++ ++/** ++ * atomic64_dec - decrement atomic64 variable ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically decrements @ptr by 1. ++ */ ++static inline void atomic64_dec(atomic64_t *ptr) ++{ ++ atomic64_sub(1, ptr); ++} ++ ++/** ++ * atomic64_dec_and_test - decrement and test ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically decrements @ptr by 1 and ++ * returns true if the result is 0, or false for all other ++ * cases. ++ */ ++static inline int atomic64_dec_and_test(atomic64_t *ptr) ++{ ++ return atomic64_sub_and_test(1, ptr); ++} ++ ++/** ++ * atomic64_inc_and_test - increment and test ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically increments @ptr by 1 ++ * and returns true if the result is zero, or false for all ++ * other cases. ++ */ ++static inline int atomic64_inc_and_test(atomic64_t *ptr) ++{ ++ return atomic64_sub_and_test(-1, ptr); ++} ++ ++/** ++ * atomic64_add_negative - add and test if negative ++ * @delta: integer value to add ++ * @ptr: pointer to type atomic64_t ++ * ++ * Atomically adds @delta to @ptr and returns true ++ * if the result is negative, or false when ++ * result is greater than or equal to zero. ++ */ ++static inline int ++atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) ++{ ++ long long old_val = atomic64_add_return(delta, ptr); ++ ++ return old_val < 0; ++} ++ + #include + #endif /* _ASM_X86_ATOMIC_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/bigsmp/apic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/bigsmp/apic.h ++++ /dev/null +@@ -1,155 +0,0 @@ +-#ifndef __ASM_MACH_APIC_H +-#define __ASM_MACH_APIC_H +- +-#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu)) +-#define esr_disable (1) +- +-static inline int apic_id_registered(void) +-{ +- return (1); +-} +- +-static inline const cpumask_t *target_cpus(void) +-{ +-#ifdef CONFIG_SMP +- return &cpu_online_map; +-#else +- return &cpumask_of_cpu(0); +-#endif +-} +- +-#undef APIC_DEST_LOGICAL +-#define APIC_DEST_LOGICAL 0 +-#define APIC_DFR_VALUE (APIC_DFR_FLAT) +-#define INT_DELIVERY_MODE (dest_Fixed) +-#define INT_DEST_MODE (0) /* phys delivery to target proc */ +-#define NO_BALANCE_IRQ (0) +- +-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +-{ +- return (0); +-} +- +-static inline unsigned long check_apicid_present(int bit) +-{ +- return (1); +-} +- +-static inline unsigned long calculate_ldr(int cpu) +-{ +- unsigned long val, id; +- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; +- id = xapic_phys_to_log_apicid(cpu); +- val |= SET_APIC_LOGICAL_ID(id); +- return val; +-} +- +-/* +- * Set up the logical destination ID. +- * +- * Intel recommends to set DFR, LDR and TPR before enabling +- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel +- * document number 292116). So here it goes... 
+- */ +-static inline void init_apic_ldr(void) +-{ +- unsigned long val; +- int cpu = smp_processor_id(); +- +- apic_write(APIC_DFR, APIC_DFR_VALUE); +- val = calculate_ldr(cpu); +- apic_write(APIC_LDR, val); +-} +- +-static inline void setup_apic_routing(void) +-{ +- printk("Enabling APIC mode: %s. Using %d I/O APICs\n", +- "Physflat", nr_ioapics); +-} +- +-static inline int multi_timer_check(int apic, int irq) +-{ +- return (0); +-} +- +-static inline int apicid_to_node(int logical_apicid) +-{ +- return apicid_2_node[hard_smp_processor_id()]; +-} +- +-static inline int cpu_present_to_apicid(int mps_cpu) +-{ +- if (mps_cpu < nr_cpu_ids) +- return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); +- +- return BAD_APICID; +-} +- +-static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) +-{ +- return physid_mask_of_physid(phys_apicid); +-} +- +-extern u8 cpu_2_logical_apicid[]; +-/* Mapping from cpu number to logical apicid */ +-static inline int cpu_to_logical_apicid(int cpu) +-{ +- if (cpu >= nr_cpu_ids) +- return BAD_APICID; +- return cpu_physical_id(cpu); +-} +- +-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +-{ +- /* For clustered we don't have a good way to do this yet - hack */ +- return physids_promote(0xFFL); +-} +- +-static inline void setup_portio_remap(void) +-{ +-} +- +-static inline void enable_apic_mode(void) +-{ +-} +- +-static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +-{ +- return (1); +-} +- +-/* As we are using single CPU as destination, pick only one CPU here */ +-static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) +-{ +- int cpu; +- int apicid; +- +- cpu = first_cpu(*cpumask); +- apicid = cpu_to_logical_apicid(cpu); +- return apicid; +-} +- +-static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, +- const struct cpumask *andmask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one phys APIC ID. +- * May as well be the first. 
+- */ +- for_each_cpu_and(cpu, cpumask, andmask) +- if (cpumask_test_cpu(cpu, cpu_online_mask)) +- break; +- if (cpu < nr_cpu_ids) +- return cpu_to_logical_apicid(cpu); +- +- return BAD_APICID; +-} +- +-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +-{ +- return cpuid_apic >> index_msb; +-} +- +-#endif /* __ASM_MACH_APIC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/bigsmp/apicdef.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/bigsmp/apicdef.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-#ifndef __ASM_MACH_APICDEF_H +-#define __ASM_MACH_APICDEF_H +- +-#define APIC_ID_MASK (0xFF<<24) +- +-static inline unsigned get_apic_id(unsigned long x) +-{ +- return (((x)>>24)&0xFF); +-} +- +-#define GET_APIC_ID(x) get_apic_id(x) +- +-#endif +Index: linux-2.6-tip/arch/x86/include/asm/bigsmp/ipi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/bigsmp/ipi.h ++++ /dev/null +@@ -1,22 +0,0 @@ +-#ifndef __ASM_MACH_IPI_H +-#define __ASM_MACH_IPI_H +- +-void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +- +-static inline void send_IPI_mask(const struct cpumask *mask, int vector) +-{ +- send_IPI_mask_sequence(mask, vector); +-} +- +-static inline void send_IPI_allbutself(int vector) +-{ +- send_IPI_mask_allbutself(cpu_online_mask, vector); +-} +- +-static inline void send_IPI_all(int vector) +-{ +- send_IPI_mask(cpu_online_mask, vector); +-} +- +-#endif /* __ASM_MACH_IPI_H */ +Index: linux-2.6-tip/arch/x86/include/asm/boot.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/boot.h ++++ linux-2.6-tip/arch/x86/include/asm/boot.h +@@ -1,26 +1,36 @@ + #ifndef _ASM_X86_BOOT_H + #define _ASM_X86_BOOT_H + +-/* Don't touch these, unless you really know what you're doing. */ +-#define DEF_SYSSEG 0x1000 +-#define DEF_SYSSIZE 0x7F00 +- + /* Internal svga startup constants */ + #define NORMAL_VGA 0xffff /* 80x25 mode */ + #define EXTENDED_VGA 0xfffe /* 80x50 mode */ + #define ASK_VGA 0xfffd /* ask for it at bootup */ + ++#ifdef __KERNEL__ ++ + /* Physical address where kernel should be loaded. */ + #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + + (CONFIG_PHYSICAL_ALIGN - 1)) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) + ++#ifdef CONFIG_KERNEL_BZIP2 ++#define BOOT_HEAP_SIZE 0x400000 ++#else /* !CONFIG_KERNEL_BZIP2 */ ++ + #ifdef CONFIG_X86_64 + #define BOOT_HEAP_SIZE 0x7000 +-#define BOOT_STACK_SIZE 0x4000 + #else + #define BOOT_HEAP_SIZE 0x4000 ++#endif ++ ++#endif /* !CONFIG_KERNEL_BZIP2 */ ++ ++#ifdef CONFIG_X86_64 ++#define BOOT_STACK_SIZE 0x4000 ++#else + #define BOOT_STACK_SIZE 0x1000 + #endif + ++#endif /* __KERNEL__ */ ++ + #endif /* _ASM_X86_BOOT_H */ +Index: linux-2.6-tip/arch/x86/include/asm/cacheflush.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/cacheflush.h ++++ linux-2.6-tip/arch/x86/include/asm/cacheflush.h +@@ -5,24 +5,43 @@ + #include + + /* Caches aren't brain-dead on the intel. 
*/ +-#define flush_cache_all() do { } while (0) +-#define flush_cache_mm(mm) do { } while (0) +-#define flush_cache_dup_mm(mm) do { } while (0) +-#define flush_cache_range(vma, start, end) do { } while (0) +-#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +-#define flush_dcache_page(page) do { } while (0) +-#define flush_dcache_mmap_lock(mapping) do { } while (0) +-#define flush_dcache_mmap_unlock(mapping) do { } while (0) +-#define flush_icache_range(start, end) do { } while (0) +-#define flush_icache_page(vma, pg) do { } while (0) +-#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) +-#define flush_cache_vmap(start, end) do { } while (0) +-#define flush_cache_vunmap(start, end) do { } while (0) +- +-#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ +- memcpy((dst), (src), (len)) +-#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ +- memcpy((dst), (src), (len)) ++static inline void flush_cache_all(void) { } ++static inline void flush_cache_mm(struct mm_struct *mm) { } ++static inline void flush_cache_dup_mm(struct mm_struct *mm) { } ++static inline void flush_cache_range(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end) { } ++static inline void flush_cache_page(struct vm_area_struct *vma, ++ unsigned long vmaddr, unsigned long pfn) { } ++static inline void flush_dcache_page(struct page *page) { } ++static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } ++static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } ++static inline void flush_icache_range(unsigned long start, ++ unsigned long end) { } ++static inline void flush_icache_page(struct vm_area_struct *vma, ++ struct page *page) { } ++static inline void flush_icache_user_range(struct vm_area_struct *vma, ++ struct page *page, ++ unsigned long addr, ++ unsigned long len) { } ++static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } ++static inline void flush_cache_vunmap(unsigned long start, ++ unsigned long end) { } ++ ++static inline void copy_to_user_page(struct vm_area_struct *vma, ++ struct page *page, unsigned long vaddr, ++ void *dst, const void *src, ++ unsigned long len) ++{ ++ memcpy(dst, src, len); ++} ++ ++static inline void copy_from_user_page(struct vm_area_struct *vma, ++ struct page *page, unsigned long vaddr, ++ void *dst, const void *src, ++ unsigned long len) ++{ ++ memcpy(dst, src, len); ++} + + #define PG_non_WB PG_arch_1 + PAGEFLAG(NonWB, non_WB) +@@ -71,6 +90,9 @@ int set_memory_4k(unsigned long addr, in + int set_memory_array_uc(unsigned long *addr, int addrinarray); + int set_memory_array_wb(unsigned long *addr, int addrinarray); + ++int set_pages_array_uc(struct page **pages, int addrinarray); ++int set_pages_array_wb(struct page **pages, int addrinarray); ++ + /* + * For legacy compatibility with the old APIs, a few functions + * are provided that work on a "struct page". 
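The cacheflush.h hunk above swaps the empty do { } while (0) macros for empty static inline functions; the generated code is identical, but the inline versions still type-check their arguments instead of silently accepting anything. A generic before/after illustration (the demo_* names are hypothetical):

#define demo_flush_macro(page)	do { } while (0)	/* accepts any argument at all */

static inline void demo_flush_inline(struct page *page)	/* argument must be a struct page * */
{
}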
+@@ -104,6 +126,11 @@ void clflush_cache_range(void *addr, uns + #ifdef CONFIG_DEBUG_RODATA + void mark_rodata_ro(void); + extern const int rodata_test_data; ++void set_kernel_text_rw(void); ++void set_kernel_text_ro(void); ++#else ++static inline void set_kernel_text_rw(void) { } ++static inline void set_kernel_text_ro(void) { } + #endif + + #ifdef CONFIG_DEBUG_RODATA_TEST +Index: linux-2.6-tip/arch/x86/include/asm/calling.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/calling.h ++++ linux-2.6-tip/arch/x86/include/asm/calling.h +@@ -1,5 +1,55 @@ + /* +- * Some macros to handle stack frames in assembly. ++ ++ x86 function call convention, 64-bit: ++ ------------------------------------- ++ arguments | callee-saved | extra caller-saved | return ++ [callee-clobbered] | | [callee-clobbered] | ++ --------------------------------------------------------------------------- ++ rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**] ++ ++ ( rsp is obviously invariant across normal function calls. (gcc can 'merge' ++ functions when it sees tail-call optimization possibilities) rflags is ++ clobbered. Leftover arguments are passed over the stack frame.) ++ ++ [*] In the frame-pointers case rbp is fixed to the stack frame. ++ ++ [**] for struct return values wider than 64 bits the return convention is a ++ bit more complex: up to 128 bits width we return small structures ++ straight in rax, rdx. For structures larger than that (3 words or ++ larger) the caller puts a pointer to an on-stack return struct ++ [allocated in the caller's stack frame] into the first argument - i.e. ++ into rdi. All other arguments shift up by one in this case. ++ Fortunately this case is rare in the kernel. ++ ++For 32-bit we have the following conventions - kernel is built with ++-mregparm=3 and -freg-struct-return: ++ ++ x86 function calling convention, 32-bit: ++ ---------------------------------------- ++ arguments | callee-saved | extra caller-saved | return ++ [callee-clobbered] | | [callee-clobbered] | ++ ------------------------------------------------------------------------- ++ eax edx ecx | ebx edi esi ebp [*] | | eax, edx [**] ++ ++ ( here too esp is obviously invariant across normal function calls. eflags ++ is clobbered. Leftover arguments are passed over the stack frame. ) ++ ++ [*] In the frame-pointers case ebp is fixed to the stack frame. ++ ++ [**] We build with -freg-struct-return, which on 32-bit means similar ++ semantics as on 64-bit: edx can be used for a second return value ++ (i.e. covering integer and structure sizes up to 64 bits) - after that ++ it gets more complex and more expensive: 3-word or larger struct returns ++ get done in the caller's frame and the pointer to the return struct goes ++ into regparm0, i.e. eax - the other arguments shift up and the ++ function's register parameters degenerate to regparm=2 in essence. ++ ++*/ ++ ++ ++/* ++ * 64-bit system call stack frame layout defines and helpers, ++ * for assembly code: + */ + + #define R15 0 +@@ -9,7 +59,7 @@ + #define RBP 32 + #define RBX 40 + +-/* arguments: interrupts/non tracing syscalls only save upto here*/ ++/* arguments: interrupts/non tracing syscalls only save up to here: */ + #define R11 48 + #define R10 56 + #define R9 64 +@@ -22,7 +72,7 @@ + #define ORIG_RAX 120 /* + error_code */ + /* end of arguments */ + +-/* cpu exception frame or undefined in case of fast syscall. 
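The block comment added to calling.h above documents the register conventions the kernel is built with; a concrete reading of it, as an illustration only and not code from the patch:

static long demo_sum3(long a, long b, long c)
{
	/*
	 * 64-bit: a in %rdi, b in %rsi, c in %rdx, result in %rax;
	 * %rbx, %rbp, %r12-%r15 must be preserved, %r10/%r11 may be clobbered.
	 * 32-bit (-mregparm=3): a in %eax, b in %edx, c in %ecx, result in %eax;
	 * %ebx, %edi, %esi, %ebp are preserved.
	 */
	return a + b + c;
}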
*/ ++/* cpu exception frame or undefined in case of fast syscall: */ + #define RIP 128 + #define CS 136 + #define EFLAGS 144 +Index: linux-2.6-tip/arch/x86/include/asm/cpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/cpu.h ++++ linux-2.6-tip/arch/x86/include/asm/cpu.h +@@ -7,6 +7,20 @@ + #include + #include + ++#ifdef CONFIG_SMP ++ ++extern void prefill_possible_map(void); ++ ++#else /* CONFIG_SMP */ ++ ++static inline void prefill_possible_map(void) {} ++ ++#define cpu_physical_id(cpu) boot_cpu_physical_apicid ++#define safe_smp_processor_id() 0 ++#define stack_smp_processor_id() 0 ++ ++#endif /* CONFIG_SMP */ ++ + struct x86_cpu { + struct cpu cpu; + }; +@@ -17,4 +31,7 @@ extern void arch_unregister_cpu(int); + #endif + + DECLARE_PER_CPU(int, cpu_state); ++ ++extern unsigned int boot_cpu_id; ++ + #endif /* _ASM_X86_CPU_H */ +Index: linux-2.6-tip/arch/x86/include/asm/cpu_debug.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/cpu_debug.h +@@ -0,0 +1,226 @@ ++#ifndef _ASM_X86_CPU_DEBUG_H ++#define _ASM_X86_CPU_DEBUG_H ++ ++/* ++ * CPU x86 architecture debug ++ * ++ * Copyright(C) 2009 Jaswinder Singh Rajput ++ */ ++ ++/* Register flags */ ++enum cpu_debug_bit { ++/* Model Specific Registers (MSRs) */ ++ CPU_MC_BIT, /* Machine Check */ ++ CPU_MONITOR_BIT, /* Monitor */ ++ CPU_TIME_BIT, /* Time */ ++ CPU_PMC_BIT, /* Performance Monitor */ ++ CPU_PLATFORM_BIT, /* Platform */ ++ CPU_APIC_BIT, /* APIC */ ++ CPU_POWERON_BIT, /* Power-on */ ++ CPU_CONTROL_BIT, /* Control */ ++ CPU_FEATURES_BIT, /* Features control */ ++ CPU_LBRANCH_BIT, /* Last Branch */ ++ CPU_BIOS_BIT, /* BIOS */ ++ CPU_FREQ_BIT, /* Frequency */ ++ CPU_MTTR_BIT, /* MTRR */ ++ CPU_PERF_BIT, /* Performance */ ++ CPU_CACHE_BIT, /* Cache */ ++ CPU_SYSENTER_BIT, /* Sysenter */ ++ CPU_THERM_BIT, /* Thermal */ ++ CPU_MISC_BIT, /* Miscellaneous */ ++ CPU_DEBUG_BIT, /* Debug */ ++ CPU_PAT_BIT, /* PAT */ ++ CPU_VMX_BIT, /* VMX */ ++ CPU_CALL_BIT, /* System Call */ ++ CPU_BASE_BIT, /* BASE Address */ ++ CPU_VER_BIT, /* Version ID */ ++ CPU_CONF_BIT, /* Configuration */ ++ CPU_SMM_BIT, /* System mgmt mode */ ++ CPU_SVM_BIT, /*Secure Virtual Machine*/ ++ CPU_OSVM_BIT, /* OS-Visible Workaround*/ ++/* Standard Registers */ ++ CPU_TSS_BIT, /* Task Stack Segment */ ++ CPU_CR_BIT, /* Control Registers */ ++ CPU_DT_BIT, /* Descriptor Table */ ++/* End of Registers flags */ ++ CPU_REG_ALL_BIT, /* Select all Registers */ ++}; ++ ++#define CPU_REG_ALL (~0) /* Select all Registers */ ++ ++#define CPU_MC (1 << CPU_MC_BIT) ++#define CPU_MONITOR (1 << CPU_MONITOR_BIT) ++#define CPU_TIME (1 << CPU_TIME_BIT) ++#define CPU_PMC (1 << CPU_PMC_BIT) ++#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT) ++#define CPU_APIC (1 << CPU_APIC_BIT) ++#define CPU_POWERON (1 << CPU_POWERON_BIT) ++#define CPU_CONTROL (1 << CPU_CONTROL_BIT) ++#define CPU_FEATURES (1 << CPU_FEATURES_BIT) ++#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT) ++#define CPU_BIOS (1 << CPU_BIOS_BIT) ++#define CPU_FREQ (1 << CPU_FREQ_BIT) ++#define CPU_MTRR (1 << CPU_MTTR_BIT) ++#define CPU_PERF (1 << CPU_PERF_BIT) ++#define CPU_CACHE (1 << CPU_CACHE_BIT) ++#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT) ++#define CPU_THERM (1 << CPU_THERM_BIT) ++#define CPU_MISC (1 << CPU_MISC_BIT) ++#define CPU_DEBUG (1 << CPU_DEBUG_BIT) ++#define CPU_PAT (1 << CPU_PAT_BIT) ++#define CPU_VMX (1 << CPU_VMX_BIT) ++#define CPU_CALL (1 << CPU_CALL_BIT) ++#define CPU_BASE (1 << 
CPU_BASE_BIT) ++#define CPU_VER (1 << CPU_VER_BIT) ++#define CPU_CONF (1 << CPU_CONF_BIT) ++#define CPU_SMM (1 << CPU_SMM_BIT) ++#define CPU_SVM (1 << CPU_SVM_BIT) ++#define CPU_OSVM (1 << CPU_OSVM_BIT) ++#define CPU_TSS (1 << CPU_TSS_BIT) ++#define CPU_CR (1 << CPU_CR_BIT) ++#define CPU_DT (1 << CPU_DT_BIT) ++ ++/* Register file flags */ ++enum cpu_file_bit { ++ CPU_INDEX_BIT, /* index */ ++ CPU_VALUE_BIT, /* value */ ++}; ++ ++#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) ++ ++/* ++ * DisplayFamily_DisplayModel Processor Families/Processor Number Series ++ * -------------------------- ------------------------------------------ ++ * 05_01, 05_02, 05_04 Pentium, Pentium with MMX ++ * ++ * 06_01 Pentium Pro ++ * 06_03, 06_05 Pentium II Xeon, Pentium II ++ * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III ++ * ++ * 06_09, 060D Pentium M ++ * ++ * 06_0E Core Duo, Core Solo ++ * ++ * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series, ++ * Core 2 Quad, Core 2 Extreme, Core 2 Duo, ++ * Pentium dual-core ++ * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650 ++ * ++ * 06_1C Atom ++ * ++ * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4 ++ * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D ++ * ++ * 0F_06 Xeon 7100, 5000 Series, Xeon MP, ++ * Pentium 4, Pentium D ++ */ ++ ++/* Register processors bits */ ++enum cpu_processor_bit { ++ CPU_NONE, ++/* Intel */ ++ CPU_INTEL_PENTIUM_BIT, ++ CPU_INTEL_P6_BIT, ++ CPU_INTEL_PENTIUM_M_BIT, ++ CPU_INTEL_CORE_BIT, ++ CPU_INTEL_CORE2_BIT, ++ CPU_INTEL_ATOM_BIT, ++ CPU_INTEL_XEON_P4_BIT, ++ CPU_INTEL_XEON_MP_BIT, ++/* AMD */ ++ CPU_AMD_K6_BIT, ++ CPU_AMD_K7_BIT, ++ CPU_AMD_K8_BIT, ++ CPU_AMD_0F_BIT, ++ CPU_AMD_10_BIT, ++ CPU_AMD_11_BIT, ++}; ++ ++#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT) ++#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT) ++#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT) ++#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT) ++#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT) ++#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT) ++#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT) ++#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT) ++ ++#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M) ++#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2) ++#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP) ++#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM) ++#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM) ++#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM) ++#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON) ++#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON) ++#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT) ++#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON) ++#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON) ++#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT) ++#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX) ++#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE) ++#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE) ++#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT) ++#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE) ++#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT) ++#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE) ++ ++/* Select all supported Intel CPUs */ ++#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE) ++ ++#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT) ++#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT) ++#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT) ++#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT) ++#define CPU_AMD_10 (1 << CPU_AMD_10_BIT) ++#define CPU_AMD_11 (1 << 
CPU_AMD_11_BIT) ++ ++#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11) ++#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS) ++#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS) ++#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS) ++ ++/* Select all supported AMD CPUs */ ++#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS) ++ ++/* Select all supported CPUs */ ++#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL) ++ ++#define MAX_CPU_FILES 512 ++ ++struct cpu_private { ++ unsigned cpu; ++ unsigned type; ++ unsigned reg; ++ unsigned file; ++}; ++ ++struct cpu_debug_base { ++ char *name; /* Register name */ ++ unsigned flag; /* Register flag */ ++ unsigned write; /* Register write flag */ ++}; ++ ++/* ++ * Currently it looks similar to cpu_debug_base but once we add more files ++ * cpu_file_base will go in different direction ++ */ ++struct cpu_file_base { ++ char *name; /* Register file name */ ++ unsigned flag; /* Register file flag */ ++ unsigned write; /* Register write flag */ ++}; ++ ++struct cpu_cpuX_base { ++ struct dentry *dentry; /* Register dentry */ ++ int init; /* Register index file */ ++}; ++ ++struct cpu_debug_range { ++ unsigned min; /* Register range min */ ++ unsigned max; /* Register range max */ ++ unsigned flag; /* Supported flags */ ++ unsigned model; /* Supported models */ ++}; ++ ++#endif /* _ASM_X86_CPU_DEBUG_H */ +Index: linux-2.6-tip/arch/x86/include/asm/cpumask.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/cpumask.h +@@ -0,0 +1,14 @@ ++#ifndef _ASM_X86_CPUMASK_H ++#define _ASM_X86_CPUMASK_H ++#ifndef __ASSEMBLY__ ++#include ++ ++extern cpumask_var_t cpu_callin_mask; ++extern cpumask_var_t cpu_callout_mask; ++extern cpumask_var_t cpu_initialized_mask; ++extern cpumask_var_t cpu_sibling_setup_mask; ++ ++extern void setup_cpu_local_masks(void); ++ ++#endif /* __ASSEMBLY__ */ ++#endif /* _ASM_X86_CPUMASK_H */ +Index: linux-2.6-tip/arch/x86/include/asm/current.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/current.h ++++ linux-2.6-tip/arch/x86/include/asm/current.h +@@ -1,39 +1,21 @@ + #ifndef _ASM_X86_CURRENT_H + #define _ASM_X86_CURRENT_H + +-#ifdef CONFIG_X86_32 + #include + #include + ++#ifndef __ASSEMBLY__ + struct task_struct; + + DECLARE_PER_CPU(struct task_struct *, current_task); +-static __always_inline struct task_struct *get_current(void) +-{ +- return x86_read_percpu(current_task); +-} +- +-#else /* X86_32 */ +- +-#ifndef __ASSEMBLY__ +-#include +- +-struct task_struct; + + static __always_inline struct task_struct *get_current(void) + { +- return read_pda(pcurrent); ++ return percpu_read(current_task); + } + +-#else /* __ASSEMBLY__ */ +- +-#include +-#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg ++#define current get_current() + + #endif /* __ASSEMBLY__ */ + +-#endif /* X86_32 */ +- +-#define current get_current() +- + #endif /* _ASM_X86_CURRENT_H */ +Index: linux-2.6-tip/arch/x86/include/asm/desc.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/desc.h ++++ linux-2.6-tip/arch/x86/include/asm/desc.h +@@ -91,7 +91,6 @@ static inline int desc_empty(const void + #define store_gdt(dtr) native_store_gdt(dtr) + #define store_idt(dtr) native_store_idt(dtr) + #define store_tr(tr) (tr = native_store_tr()) +-#define store_ldt(ldt) asm("sldt %0":"=m" (ldt)) + + #define load_TLS(t, cpu) native_load_tls(t, cpu) + #define set_ldt native_set_ldt +@@ -112,6 +111,8 @@ static 
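Because the cpu_debug.h selectors above are plain bitmasks, register groups and processor families combine and test with ordinary bit operations. A hedged sketch of that composition (the show_register_group() helper is invented; the CPU_* names come from the header above):

    /* illustrative only: testing the CPU_* selectors from cpu_debug.h */
    #include <asm/cpu_debug.h>

    static int show_register_group(unsigned reg_flag, unsigned cpu_mask)
    {
            /* CPU_CX_AT_XE covers Core, Core2, Atom and both Xeon families */
            if (!(cpu_mask & CPU_CX_AT_XE))
                    return 0;          /* processor not covered by this group */

            /* show only the Machine Check and APIC register groups */
            return (reg_flag & (CPU_MC | CPU_APIC)) != 0;
    }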
inline void paravirt_free_ldt(str + } + #endif /* CONFIG_PARAVIRT */ + ++#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) ++ + static inline void native_write_idt_entry(gate_desc *idt, int entry, + const gate_desc *gate) + { +Index: linux-2.6-tip/arch/x86/include/asm/device.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/device.h ++++ linux-2.6-tip/arch/x86/include/asm/device.h +@@ -6,7 +6,7 @@ struct dev_archdata { + void *acpi_handle; + #endif + #ifdef CONFIG_X86_64 +-struct dma_mapping_ops *dma_ops; ++struct dma_map_ops *dma_ops; + #endif + #ifdef CONFIG_DMAR + void *iommu; /* hook for IOMMU specific extension */ +Index: linux-2.6-tip/arch/x86/include/asm/dma-mapping.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/dma-mapping.h ++++ linux-2.6-tip/arch/x86/include/asm/dma-mapping.h +@@ -6,7 +6,10 @@ + * Documentation/DMA-API.txt for documentation. + */ + ++#include + #include ++#include ++#include + #include + #include + #include +@@ -16,47 +19,9 @@ extern int iommu_merge; + extern struct device x86_dma_fallback_dev; + extern int panic_on_overflow; + +-struct dma_mapping_ops { +- int (*mapping_error)(struct device *dev, +- dma_addr_t dma_addr); +- void* (*alloc_coherent)(struct device *dev, size_t size, +- dma_addr_t *dma_handle, gfp_t gfp); +- void (*free_coherent)(struct device *dev, size_t size, +- void *vaddr, dma_addr_t dma_handle); +- dma_addr_t (*map_single)(struct device *hwdev, phys_addr_t ptr, +- size_t size, int direction); +- void (*unmap_single)(struct device *dev, dma_addr_t addr, +- size_t size, int direction); +- void (*sync_single_for_cpu)(struct device *hwdev, +- dma_addr_t dma_handle, size_t size, +- int direction); +- void (*sync_single_for_device)(struct device *hwdev, +- dma_addr_t dma_handle, size_t size, +- int direction); +- void (*sync_single_range_for_cpu)(struct device *hwdev, +- dma_addr_t dma_handle, unsigned long offset, +- size_t size, int direction); +- void (*sync_single_range_for_device)(struct device *hwdev, +- dma_addr_t dma_handle, unsigned long offset, +- size_t size, int direction); +- void (*sync_sg_for_cpu)(struct device *hwdev, +- struct scatterlist *sg, int nelems, +- int direction); +- void (*sync_sg_for_device)(struct device *hwdev, +- struct scatterlist *sg, int nelems, +- int direction); +- int (*map_sg)(struct device *hwdev, struct scatterlist *sg, +- int nents, int direction); +- void (*unmap_sg)(struct device *hwdev, +- struct scatterlist *sg, int nents, +- int direction); +- int (*dma_supported)(struct device *hwdev, u64 mask); +- int is_phys; +-}; ++extern struct dma_map_ops *dma_ops; + +-extern struct dma_mapping_ops *dma_ops; +- +-static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) ++static inline struct dma_map_ops *get_dma_ops(struct device *dev) + { + #ifdef CONFIG_X86_32 + return dma_ops; +@@ -71,7 +36,7 @@ static inline struct dma_mapping_ops *ge + /* Make sure we keep the same behaviour */ + static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + { +- struct dma_mapping_ops *ops = get_dma_ops(dev); ++ struct dma_map_ops *ops = get_dma_ops(dev); + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); + +@@ -90,137 +55,174 @@ extern void *dma_generic_alloc_coherent( + + static inline dma_addr_t + dma_map_single(struct device *hwdev, void *ptr, size_t size, +- int direction) ++ enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = 
get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); ++ dma_addr_t addr; + +- BUG_ON(!valid_dma_direction(direction)); +- return ops->map_single(hwdev, virt_to_phys(ptr), size, direction); ++ kmemcheck_mark_initialized(ptr, size); ++ BUG_ON(!valid_dma_direction(dir)); ++ addr = ops->map_page(hwdev, virt_to_page(ptr), ++ (unsigned long)ptr & ~PAGE_MASK, size, ++ dir, NULL); ++ debug_dma_map_page(hwdev, virt_to_page(ptr), ++ (unsigned long)ptr & ~PAGE_MASK, size, ++ dir, addr, true); ++ return addr; + } + + static inline void + dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, +- int direction) ++ enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(dev); ++ struct dma_map_ops *ops = get_dma_ops(dev); + +- BUG_ON(!valid_dma_direction(direction)); +- if (ops->unmap_single) +- ops->unmap_single(dev, addr, size, direction); ++ BUG_ON(!valid_dma_direction(dir)); ++ if (ops->unmap_page) ++ ops->unmap_page(dev, addr, size, dir, NULL); ++ debug_dma_unmap_page(dev, addr, size, dir, true); + } + + static inline int + dma_map_sg(struct device *hwdev, struct scatterlist *sg, +- int nents, int direction) ++ int nents, enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); ++ int ents; ++ ++ struct scatterlist *s; ++ int i; ++ ++ for_each_sg(sg, s, nents, i) ++ kmemcheck_mark_initialized(sg_virt(s), s->length); ++ BUG_ON(!valid_dma_direction(dir)); ++ ents = ops->map_sg(hwdev, sg, nents, dir, NULL); ++ debug_dma_map_sg(hwdev, sg, nents, ents, dir); + +- BUG_ON(!valid_dma_direction(direction)); +- return ops->map_sg(hwdev, sg, nents, direction); ++ return ents; + } + + static inline void + dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, +- int direction) ++ enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); + +- BUG_ON(!valid_dma_direction(direction)); ++ BUG_ON(!valid_dma_direction(dir)); ++ debug_dma_unmap_sg(hwdev, sg, nents, dir); + if (ops->unmap_sg) +- ops->unmap_sg(hwdev, sg, nents, direction); ++ ops->unmap_sg(hwdev, sg, nents, dir, NULL); + } + + static inline void + dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle, +- size_t size, int direction) ++ size_t size, enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); + +- BUG_ON(!valid_dma_direction(direction)); ++ BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_cpu) +- ops->sync_single_for_cpu(hwdev, dma_handle, size, direction); ++ ops->sync_single_for_cpu(hwdev, dma_handle, size, dir); ++ debug_dma_sync_single_for_cpu(hwdev, dma_handle, size, dir); + flush_write_buffers(); + } + + static inline void + dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle, +- size_t size, int direction) ++ size_t size, enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); + +- BUG_ON(!valid_dma_direction(direction)); ++ BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_device) +- ops->sync_single_for_device(hwdev, dma_handle, size, direction); ++ ops->sync_single_for_device(hwdev, dma_handle, size, dir); ++ debug_dma_sync_single_for_device(hwdev, dma_handle, size, dir); + flush_write_buffers(); + } + + static inline void + dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle, +- 
unsigned long offset, size_t size, int direction) ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); + +- BUG_ON(!valid_dma_direction(direction)); ++ BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_range_for_cpu) + ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, +- size, direction); ++ size, dir); ++ debug_dma_sync_single_range_for_cpu(hwdev, dma_handle, ++ offset, size, dir); + flush_write_buffers(); + } + + static inline void + dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle, + unsigned long offset, size_t size, +- int direction) ++ enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); + +- BUG_ON(!valid_dma_direction(direction)); ++ BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_range_for_device) + ops->sync_single_range_for_device(hwdev, dma_handle, +- offset, size, direction); ++ offset, size, dir); ++ debug_dma_sync_single_range_for_device(hwdev, dma_handle, ++ offset, size, dir); + flush_write_buffers(); + } + + static inline void + dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, +- int nelems, int direction) ++ int nelems, enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); + +- BUG_ON(!valid_dma_direction(direction)); ++ BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_cpu) +- ops->sync_sg_for_cpu(hwdev, sg, nelems, direction); ++ ops->sync_sg_for_cpu(hwdev, sg, nelems, dir); ++ debug_dma_sync_sg_for_cpu(hwdev, sg, nelems, dir); + flush_write_buffers(); + } + + static inline void + dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, +- int nelems, int direction) ++ int nelems, enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(hwdev); ++ struct dma_map_ops *ops = get_dma_ops(hwdev); + +- BUG_ON(!valid_dma_direction(direction)); ++ BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_device) +- ops->sync_sg_for_device(hwdev, sg, nelems, direction); ++ ops->sync_sg_for_device(hwdev, sg, nelems, dir); ++ debug_dma_sync_sg_for_device(hwdev, sg, nelems, dir); + + flush_write_buffers(); + } + + static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, +- int direction) ++ enum dma_data_direction dir) + { +- struct dma_mapping_ops *ops = get_dma_ops(dev); ++ struct dma_map_ops *ops = get_dma_ops(dev); ++ dma_addr_t addr; + +- BUG_ON(!valid_dma_direction(direction)); +- return ops->map_single(dev, page_to_phys(page) + offset, +- size, direction); ++ kmemcheck_mark_initialized(page_address(page) + offset, size); ++ BUG_ON(!valid_dma_direction(dir)); ++ addr = ops->map_page(dev, page, offset, size, dir, NULL); ++ debug_dma_map_page(dev, page, offset, size, dir, addr, false); ++ ++ return addr; + } + + static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, +- size_t size, int direction) ++ size_t size, enum dma_data_direction dir) + { +- dma_unmap_single(dev, addr, size, direction); ++ struct dma_map_ops *ops = get_dma_ops(dev); ++ ++ BUG_ON(!valid_dma_direction(dir)); ++ if (ops->unmap_page) ++ ops->unmap_page(dev, addr, size, dir, NULL); ++ debug_dma_unmap_page(dev, addr, size, dir, false); + } + + static inline void +@@ -266,7 +268,7 @@ static inline void * + dma_alloc_coherent(struct device *dev, size_t 
size, dma_addr_t *dma_handle, + gfp_t gfp) + { +- struct dma_mapping_ops *ops = get_dma_ops(dev); ++ struct dma_map_ops *ops = get_dma_ops(dev); + void *memory; + + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); +@@ -285,20 +287,24 @@ dma_alloc_coherent(struct device *dev, s + if (!ops->alloc_coherent) + return NULL; + +- return ops->alloc_coherent(dev, size, dma_handle, +- dma_alloc_coherent_gfp_flags(dev, gfp)); ++ memory = ops->alloc_coherent(dev, size, dma_handle, ++ dma_alloc_coherent_gfp_flags(dev, gfp)); ++ debug_dma_alloc_coherent(dev, size, *dma_handle, memory); ++ ++ return memory; + } + + static inline void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus) + { +- struct dma_mapping_ops *ops = get_dma_ops(dev); ++ struct dma_map_ops *ops = get_dma_ops(dev); + + WARN_ON(irqs_disabled()); /* for portability */ + + if (dma_release_from_coherent(dev, get_order(size), vaddr)) + return; + ++ debug_dma_free_coherent(dev, size, vaddr, bus); + if (ops->free_coherent) + ops->free_coherent(dev, size, vaddr, bus); + } +Index: linux-2.6-tip/arch/x86/include/asm/dmi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/dmi.h ++++ linux-2.6-tip/arch/x86/include/asm/dmi.h +@@ -1,22 +1,15 @@ + #ifndef _ASM_X86_DMI_H + #define _ASM_X86_DMI_H + +-#include +- +-#define DMI_MAX_DATA 2048 ++#include ++#include + +-extern int dmi_alloc_index; +-extern char dmi_alloc_data[DMI_MAX_DATA]; ++#include ++#include + +-/* This is so early that there is no good way to allocate dynamic memory. +- Allocate data in an BSS array. */ +-static inline void *dmi_alloc(unsigned len) ++static __always_inline __init void *dmi_alloc(unsigned len) + { +- int idx = dmi_alloc_index; +- if ((dmi_alloc_index + len) > DMI_MAX_DATA) +- return NULL; +- dmi_alloc_index += len; +- return dmi_alloc_data + idx; ++ return extend_brk(len, sizeof(int)); + } + + /* Use early IO mappings for DMI because it's initialized early */ +Index: linux-2.6-tip/arch/x86/include/asm/do_timer.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/do_timer.h +@@ -0,0 +1,16 @@ ++/* defines for inline arch setup functions */ ++#include ++ ++#include ++#include ++ ++/** ++ * do_timer_interrupt_hook - hook into timer tick ++ * ++ * Call the pit clock event handler. 
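The dma-mapping.h rework above replaces the old dma_mapping_ops table with the generic struct dma_map_ops, switches the direction argument to enum dma_data_direction, and funnels dma_map_single() through ->map_page() with dma-debug hooks. Driver-side usage keeps its familiar shape; a hedged sketch of a typical streaming mapping (the function, device pointer and buffer are placeholders):

    /* sketch of streaming DMA usage with the dma_data_direction API above */
    #include <linux/dma-mapping.h>
    #include <linux/errno.h>

    static int send_one_buffer(struct device *dev, void *buf, size_t len)
    {
            dma_addr_t handle;

            /* map for device reads; the page/offset split happens inside */
            handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
            if (dma_mapping_error(dev, handle))
                    return -ENOMEM;

            /* ... hand 'handle' to the hardware and wait for completion ... */

            dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
            return 0;
    }

The kmemcheck_mark_initialized() and debug_dma_*() calls introduced above run inside these wrappers, so callers get the extra checking without any changes of their own.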
see asm/i8253.h ++ **/ ++ ++static inline void do_timer_interrupt_hook(void) ++{ ++ global_clock_event->event_handler(global_clock_event); ++} +Index: linux-2.6-tip/arch/x86/include/asm/e820.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/e820.h ++++ linux-2.6-tip/arch/x86/include/asm/e820.h +@@ -72,7 +72,7 @@ extern int e820_all_mapped(u64 start, u6 + extern void e820_add_region(u64 start, u64 size, int type); + extern void e820_print_map(char *who); + extern int +-sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map); ++sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map); + extern u64 e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type); + extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type, +Index: linux-2.6-tip/arch/x86/include/asm/elf.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/elf.h ++++ linux-2.6-tip/arch/x86/include/asm/elf.h +@@ -112,7 +112,7 @@ extern unsigned int vdso_enabled; + * now struct_user_regs, they are different) + */ + +-#define ELF_CORE_COPY_REGS(pr_reg, regs) \ ++#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \ + do { \ + pr_reg[0] = regs->bx; \ + pr_reg[1] = regs->cx; \ +@@ -124,7 +124,6 @@ do { \ + pr_reg[7] = regs->ds & 0xffff; \ + pr_reg[8] = regs->es & 0xffff; \ + pr_reg[9] = regs->fs & 0xffff; \ +- savesegment(gs, pr_reg[10]); \ + pr_reg[11] = regs->orig_ax; \ + pr_reg[12] = regs->ip; \ + pr_reg[13] = regs->cs & 0xffff; \ +@@ -133,6 +132,18 @@ do { \ + pr_reg[16] = regs->ss & 0xffff; \ + } while (0); + ++#define ELF_CORE_COPY_REGS(pr_reg, regs) \ ++do { \ ++ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ ++ pr_reg[10] = get_user_gs(regs); \ ++} while (0); ++ ++#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \ ++do { \ ++ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ ++ savesegment(gs, pr_reg[10]); \ ++} while (0); ++ + #define ELF_PLATFORM (utsname()->machine) + #define set_personality_64bit() do { } while (0) + +Index: linux-2.6-tip/arch/x86/include/asm/entry_arch.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/entry_arch.h +@@ -0,0 +1,59 @@ ++/* ++ * This file is designed to contain the BUILD_INTERRUPT specifications for ++ * all of the extra named interrupt vectors used by the architecture. 
++ * Usually this is the Inter Process Interrupts (IPIs) ++ */ ++ ++/* ++ * The following vectors are part of the Linux architecture, there ++ * is no hardware IRQ pin equivalent for them, they are triggered ++ * through the ICC by us (IPIs) ++ */ ++#ifdef CONFIG_SMP ++BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) ++BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) ++BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) ++BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) ++ ++BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, ++ smp_invalidate_interrupt) ++BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1, ++ smp_invalidate_interrupt) ++BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2, ++ smp_invalidate_interrupt) ++BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3, ++ smp_invalidate_interrupt) ++BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4, ++ smp_invalidate_interrupt) ++BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5, ++ smp_invalidate_interrupt) ++BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6, ++ smp_invalidate_interrupt) ++BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, ++ smp_invalidate_interrupt) ++#endif ++ ++BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) ++ ++/* ++ * every pentium local APIC has two 'local interrupts', with a ++ * soft-definable vector attached to both interrupts, one of ++ * which is a timer interrupt, the other one is error counter ++ * overflow. Linux uses the local APIC timer interrupt to get ++ * a much simpler SMP time architecture: ++ */ ++#ifdef CONFIG_X86_LOCAL_APIC ++ ++BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) ++BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) ++BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) ++ ++#ifdef CONFIG_PERF_COUNTERS ++BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) ++#endif ++ ++#ifdef CONFIG_X86_MCE_P4THERMAL ++BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) ++#endif ++ ++#endif +Index: linux-2.6-tip/arch/x86/include/asm/es7000/apic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/es7000/apic.h ++++ /dev/null +@@ -1,242 +0,0 @@ +-#ifndef __ASM_ES7000_APIC_H +-#define __ASM_ES7000_APIC_H +- +-#include +- +-#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu) +-#define esr_disable (1) +- +-static inline int apic_id_registered(void) +-{ +- return (1); +-} +- +-static inline const cpumask_t *target_cpus_cluster(void) +-{ +- return &CPU_MASK_ALL; +-} +- +-static inline const cpumask_t *target_cpus(void) +-{ +- return &cpumask_of_cpu(smp_processor_id()); +-} +- +-#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER) +-#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio) +-#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */ +-#define NO_BALANCE_IRQ_CLUSTER (1) +- +-#define APIC_DFR_VALUE (APIC_DFR_FLAT) +-#define INT_DELIVERY_MODE (dest_Fixed) +-#define INT_DEST_MODE (0) /* phys delivery to target procs */ +-#define NO_BALANCE_IRQ (0) +-#undef APIC_DEST_LOGICAL +-#define APIC_DEST_LOGICAL 0x0 +- +-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +-{ +- return 0; +-} +-static inline unsigned long check_apicid_present(int bit) +-{ +- return physid_isset(bit, phys_cpu_present_map); +-} +- +-#define 
apicid_cluster(apicid) (apicid & 0xF0) +- +-static inline unsigned long calculate_ldr(int cpu) +-{ +- unsigned long id; +- id = xapic_phys_to_log_apicid(cpu); +- return (SET_APIC_LOGICAL_ID(id)); +-} +- +-/* +- * Set up the logical destination ID. +- * +- * Intel recommends to set DFR, LdR and TPR before enabling +- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel +- * document number 292116). So here it goes... +- */ +-static inline void init_apic_ldr_cluster(void) +-{ +- unsigned long val; +- int cpu = smp_processor_id(); +- +- apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER); +- val = calculate_ldr(cpu); +- apic_write(APIC_LDR, val); +-} +- +-static inline void init_apic_ldr(void) +-{ +- unsigned long val; +- int cpu = smp_processor_id(); +- +- apic_write(APIC_DFR, APIC_DFR_VALUE); +- val = calculate_ldr(cpu); +- apic_write(APIC_LDR, val); +-} +- +-extern int apic_version [MAX_APICS]; +-static inline void setup_apic_routing(void) +-{ +- int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); +- printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", +- (apic_version[apic] == 0x14) ? +- "Physical Cluster" : "Logical Cluster", +- nr_ioapics, cpus_addr(*target_cpus())[0]); +-} +- +-static inline int multi_timer_check(int apic, int irq) +-{ +- return 0; +-} +- +-static inline int apicid_to_node(int logical_apicid) +-{ +- return 0; +-} +- +- +-static inline int cpu_present_to_apicid(int mps_cpu) +-{ +- if (!mps_cpu) +- return boot_cpu_physical_apicid; +- else if (mps_cpu < nr_cpu_ids) +- return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); +- else +- return BAD_APICID; +-} +- +-static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) +-{ +- static int id = 0; +- physid_mask_t mask; +- mask = physid_mask_of_physid(id); +- ++id; +- return mask; +-} +- +-extern u8 cpu_2_logical_apicid[]; +-/* Mapping from cpu number to logical apicid */ +-static inline int cpu_to_logical_apicid(int cpu) +-{ +-#ifdef CONFIG_SMP +- if (cpu >= nr_cpu_ids) +- return BAD_APICID; +- return (int)cpu_2_logical_apicid[cpu]; +-#else +- return logical_smp_processor_id(); +-#endif +-} +- +-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +-{ +- /* For clustered we don't have a good way to do this yet - hack */ +- return physids_promote(0xff); +-} +- +- +-static inline void setup_portio_remap(void) +-{ +-} +- +-extern unsigned int boot_cpu_physical_apicid; +-static inline int check_phys_apicid_present(int cpu_physical_apicid) +-{ +- boot_cpu_physical_apicid = read_apic_id(); +- return (1); +-} +- +-static inline unsigned int +-cpu_mask_to_apicid_cluster(const struct cpumask *cpumask) +-{ +- int num_bits_set; +- int cpus_found = 0; +- int cpu; +- int apicid; +- +- num_bits_set = cpumask_weight(cpumask); +- /* Return id to all */ +- if (num_bits_set == nr_cpu_ids) +- return 0xFF; +- /* +- * The cpus in the mask must all be on the apic cluster. If are not +- * on the same apicid cluster return default value of TARGET_CPUS. 
+- */ +- cpu = cpumask_first(cpumask); +- apicid = cpu_to_logical_apicid(cpu); +- while (cpus_found < num_bits_set) { +- if (cpumask_test_cpu(cpu, cpumask)) { +- int new_apicid = cpu_to_logical_apicid(cpu); +- if (apicid_cluster(apicid) != +- apicid_cluster(new_apicid)){ +- printk ("%s: Not a valid mask!\n", __func__); +- return 0xFF; +- } +- apicid = new_apicid; +- cpus_found++; +- } +- cpu++; +- } +- return apicid; +-} +- +-static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) +-{ +- int num_bits_set; +- int cpus_found = 0; +- int cpu; +- int apicid; +- +- num_bits_set = cpus_weight(*cpumask); +- /* Return id to all */ +- if (num_bits_set == nr_cpu_ids) +- return cpu_to_logical_apicid(0); +- /* +- * The cpus in the mask must all be on the apic cluster. If are not +- * on the same apicid cluster return default value of TARGET_CPUS. +- */ +- cpu = first_cpu(*cpumask); +- apicid = cpu_to_logical_apicid(cpu); +- while (cpus_found < num_bits_set) { +- if (cpu_isset(cpu, *cpumask)) { +- int new_apicid = cpu_to_logical_apicid(cpu); +- if (apicid_cluster(apicid) != +- apicid_cluster(new_apicid)){ +- printk ("%s: Not a valid mask!\n", __func__); +- return cpu_to_logical_apicid(0); +- } +- apicid = new_apicid; +- cpus_found++; +- } +- cpu++; +- } +- return apicid; +-} +- +- +-static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask, +- const struct cpumask *andmask) +-{ +- int apicid = cpu_to_logical_apicid(0); +- cpumask_var_t cpumask; +- +- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) +- return apicid; +- +- cpumask_and(cpumask, inmask, andmask); +- cpumask_and(cpumask, cpumask, cpu_online_mask); +- apicid = cpu_mask_to_apicid(cpumask); +- +- free_cpumask_var(cpumask); +- return apicid; +-} +- +-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +-{ +- return cpuid_apic >> index_msb; +-} +- +-#endif /* __ASM_ES7000_APIC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/es7000/apicdef.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/es7000/apicdef.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-#ifndef __ASM_ES7000_APICDEF_H +-#define __ASM_ES7000_APICDEF_H +- +-#define APIC_ID_MASK (0xFF<<24) +- +-static inline unsigned get_apic_id(unsigned long x) +-{ +- return (((x)>>24)&0xFF); +-} +- +-#define GET_APIC_ID(x) get_apic_id(x) +- +-#endif +Index: linux-2.6-tip/arch/x86/include/asm/es7000/ipi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/es7000/ipi.h ++++ /dev/null +@@ -1,22 +0,0 @@ +-#ifndef __ASM_ES7000_IPI_H +-#define __ASM_ES7000_IPI_H +- +-void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +- +-static inline void send_IPI_mask(const struct cpumask *mask, int vector) +-{ +- send_IPI_mask_sequence(mask, vector); +-} +- +-static inline void send_IPI_allbutself(int vector) +-{ +- send_IPI_mask_allbutself(cpu_online_mask, vector); +-} +- +-static inline void send_IPI_all(int vector) +-{ +- send_IPI_mask(cpu_online_mask, vector); +-} +- +-#endif /* __ASM_ES7000_IPI_H */ +Index: linux-2.6-tip/arch/x86/include/asm/es7000/mpparse.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/es7000/mpparse.h ++++ /dev/null +@@ -1,29 +0,0 @@ +-#ifndef __ASM_ES7000_MPPARSE_H +-#define __ASM_ES7000_MPPARSE_H +- +-#include +- +-extern int parse_unisys_oem (char *oemptr); +-extern int 
find_unisys_acpi_oem_table(unsigned long *oem_addr); +-extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); +-extern void setup_unisys(void); +- +-#ifndef CONFIG_X86_GENERICARCH +-extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id); +-extern int mps_oem_check(struct mpc_table *mpc, char *oem, char *productid); +-#endif +- +-#ifdef CONFIG_ACPI +- +-static inline int es7000_check_dsdt(void) +-{ +- struct acpi_table_header header; +- +- if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) && +- !strncmp(header.oem_id, "UNISYS", 6)) +- return 1; +- return 0; +-} +-#endif +- +-#endif /* __ASM_MACH_MPPARSE_H */ +Index: linux-2.6-tip/arch/x86/include/asm/es7000/wakecpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/es7000/wakecpu.h ++++ /dev/null +@@ -1,37 +0,0 @@ +-#ifndef __ASM_ES7000_WAKECPU_H +-#define __ASM_ES7000_WAKECPU_H +- +-#define TRAMPOLINE_PHYS_LOW 0x467 +-#define TRAMPOLINE_PHYS_HIGH 0x469 +- +-static inline void wait_for_init_deassert(atomic_t *deassert) +-{ +-#ifndef CONFIG_ES7000_CLUSTERED_APIC +- while (!atomic_read(deassert)) +- cpu_relax(); +-#endif +- return; +-} +- +-/* Nothing to do for most platforms, since cleared by the INIT cycle */ +-static inline void smp_callin_clear_local_apic(void) +-{ +-} +- +-static inline void store_NMI_vector(unsigned short *high, unsigned short *low) +-{ +-} +- +-static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) +-{ +-} +- +-extern void __inquire_remote_apic(int apicid); +- +-static inline void inquire_remote_apic(int apicid) +-{ +- if (apic_verbosity >= APIC_DEBUG) +- __inquire_remote_apic(apicid); +-} +- +-#endif /* __ASM_MACH_WAKECPU_H */ +Index: linux-2.6-tip/arch/x86/include/asm/fixmap.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/fixmap.h ++++ linux-2.6-tip/arch/x86/include/asm/fixmap.h +@@ -1,11 +1,147 @@ ++/* ++ * fixmap.h: compile-time virtual memory allocation ++ * ++ * This file is subject to the terms and conditions of the GNU General Public ++ * License. See the file "COPYING" in the main directory of this archive ++ * for more details. ++ * ++ * Copyright (C) 1998 Ingo Molnar ++ * ++ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 ++ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009 ++ */ ++ + #ifndef _ASM_X86_FIXMAP_H + #define _ASM_X86_FIXMAP_H + ++#ifndef __ASSEMBLY__ ++#include ++#include ++#include ++#include ++#ifdef CONFIG_X86_32 ++#include ++#include ++#else ++#include ++#endif ++ ++/* ++ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall ++ * uses fixmaps that relies on FIXADDR_TOP for proper address calculation. ++ * Because of this, FIXADDR_TOP x86 integration was left as later work. ++ */ ++#ifdef CONFIG_X86_32 ++/* used by vmalloc.c, vsyscall.lds.S. ++ * ++ * Leave one empty page between vmalloc'ed areas and ++ * the start of the fixmap. ++ */ ++extern unsigned long __FIXADDR_TOP; ++#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) ++ ++#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) ++#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) ++#else ++#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) ++ ++/* Only covers 32bit vsyscalls currently. Need another set for 64bit. 
*/ ++#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) ++#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) ++#endif ++ ++ ++/* ++ * Here we define all the compile-time 'special' virtual ++ * addresses. The point is to have a constant address at ++ * compile time, but to set the physical address only ++ * in the boot process. ++ * for x86_32: We allocate these special addresses ++ * from the end of virtual memory (0xfffff000) backwards. ++ * Also this lets us do fail-safe vmalloc(), we ++ * can guarantee that these special addresses and ++ * vmalloc()-ed addresses never overlap. ++ * ++ * These 'compile-time allocated' memory buffers are ++ * fixed-size 4k pages (or larger if used with an increment ++ * higher than 1). Use set_fixmap(idx,phys) to associate ++ * physical memory with fixmap indices. ++ * ++ * TLB entries of such buffers will not be flushed across ++ * task switches. ++ */ ++enum fixed_addresses { + #ifdef CONFIG_X86_32 +-# include "fixmap_32.h" ++ FIX_HOLE, ++ FIX_VDSO, + #else +-# include "fixmap_64.h" ++ VSYSCALL_LAST_PAGE, ++ VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE ++ + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, ++ VSYSCALL_HPET, + #endif ++ FIX_DBGP_BASE, ++ FIX_EARLYCON_MEM_BASE, ++#ifdef CONFIG_X86_LOCAL_APIC ++ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ ++#endif ++#ifdef CONFIG_X86_IO_APIC ++ FIX_IO_APIC_BASE_0, ++ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, ++#endif ++#ifdef CONFIG_X86_VISWS_APIC ++ FIX_CO_CPU, /* Cobalt timer */ ++ FIX_CO_APIC, /* Cobalt APIC Redirection Table */ ++ FIX_LI_PCIA, /* Lithium PCI Bridge A */ ++ FIX_LI_PCIB, /* Lithium PCI Bridge B */ ++#endif ++#ifdef CONFIG_X86_F00F_BUG ++ FIX_F00F_IDT, /* Virtual mapping for IDT */ ++#endif ++#ifdef CONFIG_X86_CYCLONE_TIMER ++ FIX_CYCLONE_TIMER, /*cyclone timer register*/ ++#endif ++#ifdef CONFIG_X86_32 ++ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ ++ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ++#ifdef CONFIG_PCI_MMCONFIG ++ FIX_PCIE_MCFG, ++#endif ++#endif ++#ifdef CONFIG_PARAVIRT ++ FIX_PARAVIRT_BOOTMAP, ++#endif ++ FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ ++ FIX_TEXT_POKE1, ++ __end_of_permanent_fixed_addresses, ++#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT ++ FIX_OHCI1394_BASE, ++#endif ++ /* ++ * 256 temporary boot-time mappings, used by early_ioremap(), ++ * before ioremap() is functional. 
++ * ++ * We round it up to the next 256 pages boundary so that we ++ * can have a single pgd entry and a single pte table: ++ */ ++#define NR_FIX_BTMAPS 64 ++#define FIX_BTMAPS_SLOTS 4 ++ FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - ++ (__end_of_permanent_fixed_addresses & 255), ++ FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, ++#ifdef CONFIG_X86_32 ++ FIX_WP_TEST, ++#endif ++ __end_of_fixed_addresses ++}; ++ ++ ++extern void reserve_top_address(unsigned long reserve); ++ ++#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) ++#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) ++#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) ++#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) + + extern int fixmaps_set; + +@@ -15,11 +151,11 @@ extern pte_t *pkmap_page_table; + + void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); + void native_set_fixmap(enum fixed_addresses idx, +- unsigned long phys, pgprot_t flags); ++ phys_addr_t phys, pgprot_t flags); + + #ifndef CONFIG_PARAVIRT + static inline void __set_fixmap(enum fixed_addresses idx, +- unsigned long phys, pgprot_t flags) ++ phys_addr_t phys, pgprot_t flags) + { + native_set_fixmap(idx, phys, flags); + } +@@ -69,4 +205,5 @@ static inline unsigned long virt_to_fix( + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); + } ++#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_FIXMAP_H */ +Index: linux-2.6-tip/arch/x86/include/asm/fixmap_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/fixmap_32.h ++++ /dev/null +@@ -1,119 +0,0 @@ +-/* +- * fixmap.h: compile-time virtual memory allocation +- * +- * This file is subject to the terms and conditions of the GNU General Public +- * License. See the file "COPYING" in the main directory of this archive +- * for more details. +- * +- * Copyright (C) 1998 Ingo Molnar +- * +- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 +- */ +- +-#ifndef _ASM_X86_FIXMAP_32_H +-#define _ASM_X86_FIXMAP_32_H +- +- +-/* used by vmalloc.c, vsyscall.lds.S. +- * +- * Leave one empty page between vmalloc'ed areas and +- * the start of the fixmap. +- */ +-extern unsigned long __FIXADDR_TOP; +-#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +-#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) +- +-#ifndef __ASSEMBLY__ +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * Here we define all the compile-time 'special' virtual +- * addresses. The point is to have a constant address at +- * compile time, but to set the physical address only +- * in the boot process. We allocate these special addresses +- * from the end of virtual memory (0xfffff000) backwards. +- * Also this lets us do fail-safe vmalloc(), we +- * can guarantee that these special addresses and +- * vmalloc()-ed addresses never overlap. +- * +- * these 'compile-time allocated' memory buffers are +- * fixed-size 4k pages. (or larger if used with an increment +- * highger than 1) use fixmap_set(idx,phys) to associate +- * physical memory with fixmap indices. +- * +- * TLB entries of such buffers will not be flushed across +- * task switches. 
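The unified fixmap.h above keeps the long-standing contract spelled out in its comment: each enum fixed_addresses slot is a compile-time constant virtual address, set_fixmap(idx, phys) binds a physical page to it, and fix_to_virt(idx) returns that address. A hedged sketch of the idiom (the helper is invented, and FIX_EARLYCON_MEM_BASE is borrowed purely as an example slot):

    /* illustrative use of the fixmap API declared above */
    #include <linux/types.h>
    #include <asm/fixmap.h>

    static void __iomem *map_some_mmio(phys_addr_t phys)
    {
            /* bind 'phys' to a permanent slot; the virtual address is a
             * compile-time constant, so no allocation happens here
             */
            set_fixmap(FIX_EARLYCON_MEM_BASE, phys);
            return (void __iomem *)fix_to_virt(FIX_EARLYCON_MEM_BASE);
    }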
+- */ +-enum fixed_addresses { +- FIX_HOLE, +- FIX_VDSO, +- FIX_DBGP_BASE, +- FIX_EARLYCON_MEM_BASE, +-#ifdef CONFIG_X86_LOCAL_APIC +- FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +-#endif +-#ifdef CONFIG_X86_IO_APIC +- FIX_IO_APIC_BASE_0, +- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, +-#endif +-#ifdef CONFIG_X86_VISWS_APIC +- FIX_CO_CPU, /* Cobalt timer */ +- FIX_CO_APIC, /* Cobalt APIC Redirection Table */ +- FIX_LI_PCIA, /* Lithium PCI Bridge A */ +- FIX_LI_PCIB, /* Lithium PCI Bridge B */ +-#endif +-#ifdef CONFIG_X86_F00F_BUG +- FIX_F00F_IDT, /* Virtual mapping for IDT */ +-#endif +-#ifdef CONFIG_X86_CYCLONE_TIMER +- FIX_CYCLONE_TIMER, /*cyclone timer register*/ +-#endif +- FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +-#ifdef CONFIG_PCI_MMCONFIG +- FIX_PCIE_MCFG, +-#endif +-#ifdef CONFIG_PARAVIRT +- FIX_PARAVIRT_BOOTMAP, +-#endif +- __end_of_permanent_fixed_addresses, +- /* +- * 256 temporary boot-time mappings, used by early_ioremap(), +- * before ioremap() is functional. +- * +- * We round it up to the next 256 pages boundary so that we +- * can have a single pgd entry and a single pte table: +- */ +-#define NR_FIX_BTMAPS 64 +-#define FIX_BTMAPS_SLOTS 4 +- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - +- (__end_of_permanent_fixed_addresses & 255), +- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +- FIX_WP_TEST, +-#ifdef CONFIG_ACPI +- FIX_ACPI_BEGIN, +- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, +-#endif +-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT +- FIX_OHCI1394_BASE, +-#endif +- __end_of_fixed_addresses +-}; +- +-extern void reserve_top_address(unsigned long reserve); +- +- +-#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) +- +-#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +-#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +-#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) +-#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) +- +-#endif /* !__ASSEMBLY__ */ +-#endif /* _ASM_X86_FIXMAP_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/fixmap_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/fixmap_64.h ++++ /dev/null +@@ -1,79 +0,0 @@ +-/* +- * fixmap.h: compile-time virtual memory allocation +- * +- * This file is subject to the terms and conditions of the GNU General Public +- * License. See the file "COPYING" in the main directory of this archive +- * for more details. +- * +- * Copyright (C) 1998 Ingo Molnar +- */ +- +-#ifndef _ASM_X86_FIXMAP_64_H +-#define _ASM_X86_FIXMAP_64_H +- +-#include +-#include +-#include +-#include +-#include +- +-/* +- * Here we define all the compile-time 'special' virtual +- * addresses. The point is to have a constant address at +- * compile time, but to set the physical address only +- * in the boot process. +- * +- * These 'compile-time allocated' memory buffers are +- * fixed-size 4k pages (or larger if used with an increment +- * higher than 1). Use set_fixmap(idx,phys) to associate +- * physical memory with fixmap indices. +- * +- * TLB entries of such buffers will not be flushed across +- * task switches. 
+- */ +- +-enum fixed_addresses { +- VSYSCALL_LAST_PAGE, +- VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE +- + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, +- VSYSCALL_HPET, +- FIX_DBGP_BASE, +- FIX_EARLYCON_MEM_BASE, +- FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +- FIX_IO_APIC_BASE_0, +- FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, +-#ifdef CONFIG_PARAVIRT +- FIX_PARAVIRT_BOOTMAP, +-#endif +- __end_of_permanent_fixed_addresses, +-#ifdef CONFIG_ACPI +- FIX_ACPI_BEGIN, +- FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, +-#endif +-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT +- FIX_OHCI1394_BASE, +-#endif +- /* +- * 256 temporary boot-time mappings, used by early_ioremap(), +- * before ioremap() is functional. +- * +- * We round it up to the next 256 pages boundary so that we +- * can have a single pgd entry and a single pte table: +- */ +-#define NR_FIX_BTMAPS 64 +-#define FIX_BTMAPS_SLOTS 4 +- FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - +- (__end_of_permanent_fixed_addresses & 255), +- FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, +- __end_of_fixed_addresses +-}; +- +-#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) +-#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +- +-/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ +-#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) +-#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) +- +-#endif /* _ASM_X86_FIXMAP_64_H */ +Index: linux-2.6-tip/arch/x86/include/asm/ftrace.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/ftrace.h ++++ linux-2.6-tip/arch/x86/include/asm/ftrace.h +@@ -28,6 +28,13 @@ + + #endif + ++/* FIXME: I don't want to stay hardcoded */ ++#ifdef CONFIG_X86_64 ++# define FTRACE_SYSCALL_MAX 296 ++#else ++# define FTRACE_SYSCALL_MAX 333 ++#endif ++ + #ifdef CONFIG_FUNCTION_TRACER + #define MCOUNT_ADDR ((long)(mcount)) + #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ +@@ -55,29 +62,4 @@ struct dyn_arch_ftrace { + #endif /* __ASSEMBLY__ */ + #endif /* CONFIG_FUNCTION_TRACER */ + +-#ifdef CONFIG_FUNCTION_GRAPH_TRACER +- +-#ifndef __ASSEMBLY__ +- +-/* +- * Stack of return addresses for functions +- * of a thread. +- * Used in struct thread_info +- */ +-struct ftrace_ret_stack { +- unsigned long ret; +- unsigned long func; +- unsigned long long calltime; +-}; +- +-/* +- * Primary handler of a function return. +- * It relays on ftrace_return_to_handler. +- * Defined in entry_32/64.S +- */ +-extern void return_to_handler(void); +- +-#endif /* __ASSEMBLY__ */ +-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +- + #endif /* _ASM_X86_FTRACE_H */ +Index: linux-2.6-tip/arch/x86/include/asm/genapic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/genapic.h ++++ linux-2.6-tip/arch/x86/include/asm/genapic.h +@@ -1,5 +1 @@ +-#ifdef CONFIG_X86_32 +-# include "genapic_32.h" +-#else +-# include "genapic_64.h" +-#endif ++#include +Index: linux-2.6-tip/arch/x86/include/asm/genapic_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/genapic_32.h ++++ /dev/null +@@ -1,148 +0,0 @@ +-#ifndef _ASM_X86_GENAPIC_32_H +-#define _ASM_X86_GENAPIC_32_H +- +-#include +-#include +- +-/* +- * Generic APIC driver interface. 
+- * +- * An straight forward mapping of the APIC related parts of the +- * x86 subarchitecture interface to a dynamic object. +- * +- * This is used by the "generic" x86 subarchitecture. +- * +- * Copyright 2003 Andi Kleen, SuSE Labs. +- */ +- +-struct mpc_bus; +-struct mpc_table; +-struct mpc_cpu; +- +-struct genapic { +- char *name; +- int (*probe)(void); +- +- int (*apic_id_registered)(void); +- const struct cpumask *(*target_cpus)(void); +- int int_delivery_mode; +- int int_dest_mode; +- int ESR_DISABLE; +- int apic_destination_logical; +- unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); +- unsigned long (*check_apicid_present)(int apicid); +- int no_balance_irq; +- int no_ioapic_check; +- void (*init_apic_ldr)(void); +- physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); +- +- void (*setup_apic_routing)(void); +- int (*multi_timer_check)(int apic, int irq); +- int (*apicid_to_node)(int logical_apicid); +- int (*cpu_to_logical_apicid)(int cpu); +- int (*cpu_present_to_apicid)(int mps_cpu); +- physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); +- void (*setup_portio_remap)(void); +- int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); +- void (*enable_apic_mode)(void); +- u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb); +- +- /* mpparse */ +- /* When one of the next two hooks returns 1 the genapic +- is switched to this. Essentially they are additional probe +- functions. */ +- int (*mps_oem_check)(struct mpc_table *mpc, char *oem, +- char *productid); +- int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); +- +- unsigned (*get_apic_id)(unsigned long x); +- unsigned long apic_id_mask; +- unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); +- unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, +- const struct cpumask *andmask); +- void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); +- +-#ifdef CONFIG_SMP +- /* ipi */ +- void (*send_IPI_mask)(const struct cpumask *mask, int vector); +- void (*send_IPI_mask_allbutself)(const struct cpumask *mask, +- int vector); +- void (*send_IPI_allbutself)(int vector); +- void (*send_IPI_all)(int vector); +-#endif +- int (*wakeup_cpu)(int apicid, unsigned long start_eip); +- int trampoline_phys_low; +- int trampoline_phys_high; +- void (*wait_for_init_deassert)(atomic_t *deassert); +- void (*smp_callin_clear_local_apic)(void); +- void (*store_NMI_vector)(unsigned short *high, unsigned short *low); +- void (*restore_NMI_vector)(unsigned short *high, unsigned short *low); +- void (*inquire_remote_apic)(int apicid); +-}; +- +-#define APICFUNC(x) .x = x, +- +-/* More functions could be probably marked IPIFUNC and save some space +- in UP GENERICARCH kernels, but I don't have the nerve right now +- to untangle this mess. 
-AK */ +-#ifdef CONFIG_SMP +-#define IPIFUNC(x) APICFUNC(x) +-#else +-#define IPIFUNC(x) +-#endif +- +-#define APIC_INIT(aname, aprobe) \ +-{ \ +- .name = aname, \ +- .probe = aprobe, \ +- .int_delivery_mode = INT_DELIVERY_MODE, \ +- .int_dest_mode = INT_DEST_MODE, \ +- .no_balance_irq = NO_BALANCE_IRQ, \ +- .ESR_DISABLE = esr_disable, \ +- .apic_destination_logical = APIC_DEST_LOGICAL, \ +- APICFUNC(apic_id_registered) \ +- APICFUNC(target_cpus) \ +- APICFUNC(check_apicid_used) \ +- APICFUNC(check_apicid_present) \ +- APICFUNC(init_apic_ldr) \ +- APICFUNC(ioapic_phys_id_map) \ +- APICFUNC(setup_apic_routing) \ +- APICFUNC(multi_timer_check) \ +- APICFUNC(apicid_to_node) \ +- APICFUNC(cpu_to_logical_apicid) \ +- APICFUNC(cpu_present_to_apicid) \ +- APICFUNC(apicid_to_cpu_present) \ +- APICFUNC(setup_portio_remap) \ +- APICFUNC(check_phys_apicid_present) \ +- APICFUNC(mps_oem_check) \ +- APICFUNC(get_apic_id) \ +- .apic_id_mask = APIC_ID_MASK, \ +- APICFUNC(cpu_mask_to_apicid) \ +- APICFUNC(cpu_mask_to_apicid_and) \ +- APICFUNC(vector_allocation_domain) \ +- APICFUNC(acpi_madt_oem_check) \ +- IPIFUNC(send_IPI_mask) \ +- IPIFUNC(send_IPI_allbutself) \ +- IPIFUNC(send_IPI_all) \ +- APICFUNC(enable_apic_mode) \ +- APICFUNC(phys_pkg_id) \ +- .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \ +- .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \ +- APICFUNC(wait_for_init_deassert) \ +- APICFUNC(smp_callin_clear_local_apic) \ +- APICFUNC(store_NMI_vector) \ +- APICFUNC(restore_NMI_vector) \ +- APICFUNC(inquire_remote_apic) \ +-} +- +-extern struct genapic *genapic; +-extern void es7000_update_genapic_to_cluster(void); +- +-enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; +-#define get_uv_system_type() UV_NONE +-#define is_uv_system() 0 +-#define uv_wakeup_secondary(a, b) 1 +-#define uv_system_init() do {} while (0) +- +- +-#endif /* _ASM_X86_GENAPIC_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/genapic_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/genapic_64.h ++++ /dev/null +@@ -1,66 +0,0 @@ +-#ifndef _ASM_X86_GENAPIC_64_H +-#define _ASM_X86_GENAPIC_64_H +- +-#include +- +-/* +- * Copyright 2004 James Cleverdon, IBM. +- * Subject to the GNU Public License, v.2 +- * +- * Generic APIC sub-arch data struct. +- * +- * Hacked for x86-64 by James Cleverdon from i386 architecture code by +- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and +- * James Cleverdon. 
+- */ +- +-struct genapic { +- char *name; +- int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); +- u32 int_delivery_mode; +- u32 int_dest_mode; +- int (*apic_id_registered)(void); +- const struct cpumask *(*target_cpus)(void); +- void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); +- void (*init_apic_ldr)(void); +- /* ipi */ +- void (*send_IPI_mask)(const struct cpumask *mask, int vector); +- void (*send_IPI_mask_allbutself)(const struct cpumask *mask, +- int vector); +- void (*send_IPI_allbutself)(int vector); +- void (*send_IPI_all)(int vector); +- void (*send_IPI_self)(int vector); +- /* */ +- unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); +- unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, +- const struct cpumask *andmask); +- unsigned int (*phys_pkg_id)(int index_msb); +- unsigned int (*get_apic_id)(unsigned long x); +- unsigned long (*set_apic_id)(unsigned int id); +- unsigned long apic_id_mask; +- /* wakeup_secondary_cpu */ +- int (*wakeup_cpu)(int apicid, unsigned long start_eip); +-}; +- +-extern struct genapic *genapic; +- +-extern struct genapic apic_flat; +-extern struct genapic apic_physflat; +-extern struct genapic apic_x2apic_cluster; +-extern struct genapic apic_x2apic_phys; +-extern int acpi_madt_oem_check(char *, char *); +- +-extern void apic_send_IPI_self(int vector); +-enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; +-extern enum uv_system_type get_uv_system_type(void); +-extern int is_uv_system(void); +- +-extern struct genapic apic_x2apic_uv_x; +-DECLARE_PER_CPU(int, x2apic_extra_bits); +-extern void uv_cpu_init(void); +-extern void uv_system_init(void); +-extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip); +- +-extern void setup_apic_routing(void); +- +-#endif /* _ASM_X86_GENAPIC_64_H */ +Index: linux-2.6-tip/arch/x86/include/asm/hardirq.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/hardirq.h ++++ linux-2.6-tip/arch/x86/include/asm/hardirq.h +@@ -1,11 +1,54 @@ +-#ifdef CONFIG_X86_32 +-# include "hardirq_32.h" +-#else +-# include "hardirq_64.h" ++#ifndef _ASM_X86_HARDIRQ_H ++#define _ASM_X86_HARDIRQ_H ++ ++#include ++#include ++ ++typedef struct { ++ unsigned int __softirq_pending; ++ unsigned int __nmi_count; /* arch dependent */ ++ unsigned int irq0_irqs; ++#ifdef CONFIG_X86_LOCAL_APIC ++ unsigned int apic_timer_irqs; /* arch dependent */ ++ unsigned int irq_spurious_count; ++#endif ++ unsigned int generic_irqs; /* arch dependent */ ++ unsigned int apic_perf_irqs; ++#ifdef CONFIG_SMP ++ unsigned int irq_resched_count; ++ unsigned int irq_call_count; ++ unsigned int irq_tlb_count; ++#endif ++#ifdef CONFIG_X86_MCE ++ unsigned int irq_thermal_count; ++# ifdef CONFIG_X86_64 ++ unsigned int irq_threshold_count; ++# endif + #endif ++} ____cacheline_aligned irq_cpustat_t; ++ ++DECLARE_PER_CPU(irq_cpustat_t, irq_stat); ++ ++/* We can have at most NR_VECTORS irqs routed to a cpu at a time */ ++#define MAX_HARDIRQS_PER_CPU NR_VECTORS ++ ++#define __ARCH_IRQ_STAT ++ ++#define inc_irq_stat(member) percpu_add(irq_stat.member, 1) ++ ++#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending) ++ ++#define __ARCH_SET_SOFTIRQ_PENDING ++ ++#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x)) ++#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x)) ++ ++extern void ack_bad_irq(unsigned int irq); + + extern u64 arch_irq_stat_cpu(unsigned int 
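hardirq.h above merges the old 32-bit and 64-bit variants into a single per-CPU irq_cpustat_t, with inc_irq_stat() built on percpu_add() rather than the x86-64 PDA. A hedged sketch of how a handler accounts itself with these helpers (the handler itself is invented; the accessor and field names come from the header above):

    /* sketch: accounting an interrupt in the unified per-CPU irq_cpustat_t */
    #include <linux/interrupt.h>
    #include <asm/hardirq.h>

    static irqreturn_t demo_timer_isr(int irq, void *dev_id)
    {
            /* expands to percpu_add(irq_stat.irq0_irqs, 1) per the header above */
            inc_irq_stat(irq0_irqs);

            /* ... actual device handling would go here ... */
            return IRQ_HANDLED;
    }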
cpu); + #define arch_irq_stat_cpu arch_irq_stat_cpu + + extern u64 arch_irq_stat(void); + #define arch_irq_stat arch_irq_stat ++ ++#endif /* _ASM_X86_HARDIRQ_H */ +Index: linux-2.6-tip/arch/x86/include/asm/hardirq_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/hardirq_32.h ++++ /dev/null +@@ -1,30 +0,0 @@ +-#ifndef _ASM_X86_HARDIRQ_32_H +-#define _ASM_X86_HARDIRQ_32_H +- +-#include +-#include +- +-typedef struct { +- unsigned int __softirq_pending; +- unsigned long idle_timestamp; +- unsigned int __nmi_count; /* arch dependent */ +- unsigned int apic_timer_irqs; /* arch dependent */ +- unsigned int irq0_irqs; +- unsigned int irq_resched_count; +- unsigned int irq_call_count; +- unsigned int irq_tlb_count; +- unsigned int irq_thermal_count; +- unsigned int irq_spurious_count; +-} ____cacheline_aligned irq_cpustat_t; +- +-DECLARE_PER_CPU(irq_cpustat_t, irq_stat); +- +-#define __ARCH_IRQ_STAT +-#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member) +- +-#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++) +- +-void ack_bad_irq(unsigned int irq); +-#include +- +-#endif /* _ASM_X86_HARDIRQ_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/hardirq_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/hardirq_64.h ++++ /dev/null +@@ -1,25 +0,0 @@ +-#ifndef _ASM_X86_HARDIRQ_64_H +-#define _ASM_X86_HARDIRQ_64_H +- +-#include +-#include +-#include +-#include +- +-/* We can have at most NR_VECTORS irqs routed to a cpu at a time */ +-#define MAX_HARDIRQS_PER_CPU NR_VECTORS +- +-#define __ARCH_IRQ_STAT 1 +- +-#define inc_irq_stat(member) add_pda(member, 1) +- +-#define local_softirq_pending() read_pda(__softirq_pending) +- +-#define __ARCH_SET_SOFTIRQ_PENDING 1 +- +-#define set_softirq_pending(x) write_pda(__softirq_pending, (x)) +-#define or_softirq_pending(x) or_pda(__softirq_pending, (x)) +- +-extern void ack_bad_irq(unsigned int irq); +- +-#endif /* _ASM_X86_HARDIRQ_64_H */ +Index: linux-2.6-tip/arch/x86/include/asm/highmem.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/highmem.h ++++ linux-2.6-tip/arch/x86/include/asm/highmem.h +@@ -58,15 +58,28 @@ extern void *kmap_high(struct page *page + extern void kunmap_high(struct page *page); + + void *kmap(struct page *page); ++extern void kunmap_virt(void *ptr); ++extern struct page *kmap_to_page(void *ptr); ++void kunmap(struct page *page); ++ ++void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); ++void *__kmap_atomic(struct page *page, enum km_type type); ++void *__kmap_atomic_direct(struct page *page, enum km_type type); ++void __kunmap_atomic(void *kvaddr, enum km_type type); ++void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type); ++struct page *__kmap_atomic_to_page(void *ptr); ++ + void kunmap(struct page *page); + void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); + void *kmap_atomic(struct page *page, enum km_type type); + void kunmap_atomic(void *kvaddr, enum km_type type); + void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); ++void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); + struct page *kmap_atomic_to_page(void *ptr); + + #ifndef CONFIG_PARAVIRT +-#define kmap_atomic_pte(page, type) kmap_atomic(page, type) ++#define kmap_atomic_pte(page, type) kmap_atomic(page, type) ++#define kmap_atomic_pte_direct(page, type) 
kmap_atomic_direct(page, type) + #endif + + #define flush_cache_kmaps() do { } while (0) +@@ -74,6 +87,27 @@ struct page *kmap_atomic_to_page(void *p + extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); + ++/* ++ * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap(): ++ */ ++#ifdef CONFIG_PREEMPT_RT ++# define kmap_atomic_prot(page, type, prot) ({ pagefault_disable(); kmap(page); }) ++# define kmap_atomic(page, type) ({ pagefault_disable(); kmap(page); }) ++# define kmap_atomic_pfn(pfn, type) kmap(pfn_to_page(pfn)) ++# define kunmap_atomic(kvaddr, type) do { pagefault_enable(); kunmap_virt(kvaddr); } while(0) ++# define kmap_atomic_to_page(kvaddr) kmap_to_page(kvaddr) ++# define kmap_atomic_direct(page, type) __kmap_atomic_direct(page, type) ++# define kunmap_atomic_direct(kvaddr, type) __kunmap_atomic(kvaddr, type) ++#else ++# define kmap_atomic_prot(page, type, prot) __kmap_atomic_prot(page, type, prot) ++# define kmap_atomic(page, type) __kmap_atomic(page, type) ++# define kmap_atomic_pfn(pfn, type) __kmap_atomic_pfn(pfn, type) ++# define kunmap_atomic(kvaddr, type) __kunmap_atomic(kvaddr, type) ++# define kmap_atomic_to_page(kvaddr) __kmap_atomic_to_page(kvaddr) ++# define kmap_atomic_direct(page, type) __kmap_atomic(page, type) ++# define kunmap_atomic_direct(kvaddr, type) __kunmap_atomic(kvaddr, type) ++#endif ++ + #endif /* __KERNEL__ */ + + #endif /* _ASM_X86_HIGHMEM_H */ +Index: linux-2.6-tip/arch/x86/include/asm/hw_irq.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/hw_irq.h ++++ linux-2.6-tip/arch/x86/include/asm/hw_irq.h +@@ -25,11 +25,12 @@ + #include + #include + +-#define platform_legacy_irq(irq) ((irq) < 16) +- + /* Interrupt handlers registered during init_IRQ */ + extern void apic_timer_interrupt(void); ++extern void generic_interrupt(void); + extern void error_interrupt(void); ++extern void perf_counter_interrupt(void); ++ + extern void spurious_interrupt(void); + extern void thermal_interrupt(void); + extern void reschedule_interrupt(void); +@@ -58,7 +59,7 @@ extern void make_8259A_irq(unsigned int + extern void init_8259A(int aeoi); + + /* IOAPIC */ +-#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) ++#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) + extern unsigned long io_apic_irqs; + + extern void init_VISWS_APIC_irqs(void); +@@ -67,15 +68,7 @@ extern void disable_IO_APIC(void); + extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); + extern void setup_ioapic_dest(void); + +-#ifdef CONFIG_X86_64 + extern void enable_IO_APIC(void); +-#endif +- +-/* IPI functions */ +-#ifdef CONFIG_X86_32 +-extern void send_IPI_self(int vector); +-#endif +-extern void send_IPI(int dest, int vector); + + /* Statistics */ + extern atomic_t irq_err_count; +@@ -84,21 +77,11 @@ extern atomic_t irq_mis_count; + /* EISA */ + extern void eisa_set_level_irq(unsigned int irq); + +-/* Voyager functions */ +-extern asmlinkage void vic_cpi_interrupt(void); +-extern asmlinkage void vic_sys_interrupt(void); +-extern asmlinkage void vic_cmn_interrupt(void); +-extern asmlinkage void qic_timer_interrupt(void); +-extern asmlinkage void qic_invalidate_interrupt(void); +-extern asmlinkage void qic_reschedule_interrupt(void); +-extern asmlinkage void qic_enable_irq_interrupt(void); +-extern asmlinkage void qic_call_function_interrupt(void); +- + /* SMP */ + extern void smp_apic_timer_interrupt(struct pt_regs *); + extern 
void smp_spurious_interrupt(struct pt_regs *); + extern void smp_error_interrupt(struct pt_regs *); +-#ifdef CONFIG_X86_SMP ++#ifdef CONFIG_SMP + extern void smp_reschedule_interrupt(struct pt_regs *); + extern void smp_call_function_interrupt(struct pt_regs *); + extern void smp_call_function_single_interrupt(struct pt_regs *); +Index: linux-2.6-tip/arch/x86/include/asm/i8259.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/i8259.h ++++ linux-2.6-tip/arch/x86/include/asm/i8259.h +@@ -24,7 +24,7 @@ extern unsigned int cached_irq_mask; + #define SLAVE_ICW4_DEFAULT 0x01 + #define PIC_ICW4_AEOI 2 + +-extern spinlock_t i8259A_lock; ++extern raw_spinlock_t i8259A_lock; + + extern void init_8259A(int auto_eoi); + extern void enable_8259A_irq(unsigned int irq); +@@ -60,4 +60,8 @@ extern struct irq_chip i8259A_chip; + extern void mask_8259A(void); + extern void unmask_8259A(void); + ++#ifdef CONFIG_X86_32 ++extern void init_ISA_irqs(void); ++#endif ++ + #endif /* _ASM_X86_I8259_H */ +Index: linux-2.6-tip/arch/x86/include/asm/init.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/init.h +@@ -0,0 +1,18 @@ ++#ifndef _ASM_X86_INIT_32_H ++#define _ASM_X86_INIT_32_H ++ ++#ifdef CONFIG_X86_32 ++extern void __init early_ioremap_page_table_range_init(void); ++#endif ++ ++extern unsigned long __init ++kernel_physical_mapping_init(unsigned long start, ++ unsigned long end, ++ unsigned long page_size_mask); ++ ++ ++extern unsigned long __initdata e820_table_start; ++extern unsigned long __meminitdata e820_table_end; ++extern unsigned long __meminitdata e820_table_top; ++ ++#endif /* _ASM_X86_INIT_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/intel_arch_perfmon.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/intel_arch_perfmon.h ++++ /dev/null +@@ -1,31 +0,0 @@ +-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H +-#define _ASM_X86_INTEL_ARCH_PERFMON_H +- +-#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +-#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 +- +-#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +-#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 +- +-#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +-#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +-#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +-#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) +- +-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) +-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ +- (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) +- +-union cpuid10_eax { +- struct { +- unsigned int version_id:8; +- unsigned int num_counters:8; +- unsigned int bit_width:8; +- unsigned int mask_length:8; +- } split; +- unsigned int full; +-}; +- +-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ +Index: linux-2.6-tip/arch/x86/include/asm/io.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/io.h ++++ linux-2.6-tip/arch/x86/include/asm/io.h +@@ -5,6 +5,7 @@ + + #include + #include ++#include + + #define build_mmio_read(name, size, type, reg, barrier) \ + static inline type name(const volatile void __iomem *addr) \ +@@ -80,6 +81,98 @@ static inline void writeq(__u64 val, vol + #define readq readq + #define writeq writeq + ++/** ++ * virt_to_phys - map virtual addresses to physical ++ * @address: address to remap ++ * ++ 
* The returned physical address is the physical (CPU) mapping for ++ * the memory address given. It is only valid to use this function on ++ * addresses directly mapped or allocated via kmalloc. ++ * ++ * This function does not give bus mappings for DMA transfers. In ++ * almost all conceivable cases a device driver should not be using ++ * this function ++ */ ++ ++static inline phys_addr_t virt_to_phys(volatile void *address) ++{ ++ return __pa(address); ++} ++ ++/** ++ * phys_to_virt - map physical address to virtual ++ * @address: address to remap ++ * ++ * The returned virtual address is a current CPU mapping for ++ * the memory address given. It is only valid to use this function on ++ * addresses that have a kernel mapping ++ * ++ * This function does not handle bus mappings for DMA transfers. In ++ * almost all conceivable cases a device driver should not be using ++ * this function ++ */ ++ ++static inline void *phys_to_virt(phys_addr_t address) ++{ ++ return __va(address); ++} ++ ++/* ++ * Change "struct page" to physical address. ++ */ ++#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) ++ ++/* ++ * ISA I/O bus memory addresses are 1:1 with the physical address. ++ * However, we truncate the address to unsigned int to avoid undesirable ++ * promitions in legacy drivers. ++ */ ++static inline unsigned int isa_virt_to_bus(volatile void *address) ++{ ++ return (unsigned int)virt_to_phys(address); ++} ++#define isa_page_to_bus(page) ((unsigned int)page_to_phys(page)) ++#define isa_bus_to_virt phys_to_virt ++ ++/* ++ * However PCI ones are not necessarily 1:1 and therefore these interfaces ++ * are forbidden in portable PCI drivers. ++ * ++ * Allow them on x86 for legacy drivers, though. ++ */ ++#define virt_to_bus virt_to_phys ++#define bus_to_virt phys_to_virt ++ ++/** ++ * ioremap - map bus memory into CPU space ++ * @offset: bus address of the memory ++ * @size: size of the resource to map ++ * ++ * ioremap performs a platform specific sequence of operations to ++ * make bus memory CPU accessible via the readb/readw/readl/writeb/ ++ * writew/writel functions and the other mmio helpers. The returned ++ * address is not guaranteed to be usable directly as a virtual ++ * address. ++ * ++ * If the area you are trying to map is a PCI BAR you should have a ++ * look at pci_iomap(). 
++ */ ++extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); ++extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); ++extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, ++ unsigned long prot_val); ++ ++/* ++ * The default ioremap() behavior is non-cached: ++ */ ++static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) ++{ ++ return ioremap_nocache(offset, size); ++} ++ ++extern void iounmap(volatile void __iomem *addr); ++ ++ + #ifdef CONFIG_X86_32 + # include "io_32.h" + #else +@@ -91,7 +184,7 @@ extern void unxlate_dev_mem_ptr(unsigned + + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, + unsigned long prot_val); +-extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size); ++extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); + + /* + * early_ioremap() and early_iounmap() are for temporary early boot-time +@@ -100,10 +193,12 @@ extern void __iomem *ioremap_wc(unsigned + */ + extern void early_ioremap_init(void); + extern void early_ioremap_reset(void); +-extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); +-extern void __iomem *early_memremap(unsigned long offset, unsigned long size); ++extern void __iomem *early_ioremap(resource_size_t phys_addr, ++ unsigned long size); ++extern void __iomem *early_memremap(resource_size_t phys_addr, ++ unsigned long size); + extern void early_iounmap(void __iomem *addr, unsigned long size); +-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); + ++#define IO_SPACE_LIMIT 0xffff + + #endif /* _ASM_X86_IO_H */ +Index: linux-2.6-tip/arch/x86/include/asm/io_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/io_32.h ++++ linux-2.6-tip/arch/x86/include/asm/io_32.h +@@ -37,8 +37,6 @@ + * - Arnaldo Carvalho de Melo + */ + +-#define IO_SPACE_LIMIT 0xffff +- + #define XQUAD_PORTIO_BASE 0xfe400000 + #define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ + +@@ -53,92 +51,6 @@ + */ + #define xlate_dev_kmem_ptr(p) p + +-/** +- * virt_to_phys - map virtual addresses to physical +- * @address: address to remap +- * +- * The returned physical address is the physical (CPU) mapping for +- * the memory address given. It is only valid to use this function on +- * addresses directly mapped or allocated via kmalloc. +- * +- * This function does not give bus mappings for DMA transfers. In +- * almost all conceivable cases a device driver should not be using +- * this function +- */ +- +-static inline unsigned long virt_to_phys(volatile void *address) +-{ +- return __pa(address); +-} +- +-/** +- * phys_to_virt - map physical address to virtual +- * @address: address to remap +- * +- * The returned virtual address is a current CPU mapping for +- * the memory address given. It is only valid to use this function on +- * addresses that have a kernel mapping +- * +- * This function does not handle bus mappings for DMA transfers. In +- * almost all conceivable cases a device driver should not be using +- * this function +- */ +- +-static inline void *phys_to_virt(unsigned long address) +-{ +- return __va(address); +-} +- +-/* +- * Change "struct page" to physical address. 
+- */ +-#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +- +-/** +- * ioremap - map bus memory into CPU space +- * @offset: bus address of the memory +- * @size: size of the resource to map +- * +- * ioremap performs a platform specific sequence of operations to +- * make bus memory CPU accessible via the readb/readw/readl/writeb/ +- * writew/writel functions and the other mmio helpers. The returned +- * address is not guaranteed to be usable directly as a virtual +- * address. +- * +- * If the area you are trying to map is a PCI BAR you should have a +- * look at pci_iomap(). +- */ +-extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +-extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +-extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, +- unsigned long prot_val); +- +-/* +- * The default ioremap() behavior is non-cached: +- */ +-static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +-{ +- return ioremap_nocache(offset, size); +-} +- +-extern void iounmap(volatile void __iomem *addr); +- +-/* +- * ISA I/O bus memory addresses are 1:1 with the physical address. +- */ +-#define isa_virt_to_bus virt_to_phys +-#define isa_page_to_bus page_to_phys +-#define isa_bus_to_virt phys_to_virt +- +-/* +- * However PCI ones are not necessarily 1:1 and therefore these interfaces +- * are forbidden in portable PCI drivers. +- * +- * Allow them on x86 for legacy drivers, though. +- */ +-#define virt_to_bus virt_to_phys +-#define bus_to_virt phys_to_virt +- + static inline void + memset_io(volatile void __iomem *addr, unsigned char val, int count) + { +Index: linux-2.6-tip/arch/x86/include/asm/io_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/io_64.h ++++ linux-2.6-tip/arch/x86/include/asm/io_64.h +@@ -136,73 +136,12 @@ __OUTS(b) + __OUTS(w) + __OUTS(l) + +-#define IO_SPACE_LIMIT 0xffff +- + #if defined(__KERNEL__) && defined(__x86_64__) + + #include + +-#ifndef __i386__ +-/* +- * Change virtual addresses to physical addresses and vv. +- * These are pretty trivial +- */ +-static inline unsigned long virt_to_phys(volatile void *address) +-{ +- return __pa(address); +-} +- +-static inline void *phys_to_virt(unsigned long address) +-{ +- return __va(address); +-} +-#endif +- +-/* +- * Change "struct page" to physical address. +- */ +-#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +- + #include + +-/* +- * This one maps high address device memory and turns off caching for that area. +- * it's useful if some control registers are in such an area and write combining +- * or read caching is not desirable: +- */ +-extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +-extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +-extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, +- unsigned long prot_val); +- +-/* +- * The default ioremap() behavior is non-cached: +- */ +-static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +-{ +- return ioremap_nocache(offset, size); +-} +- +-extern void iounmap(volatile void __iomem *addr); +- +-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); +- +-/* +- * ISA I/O bus memory addresses are 1:1 with the physical address. 
+- */ +-#define isa_virt_to_bus virt_to_phys +-#define isa_page_to_bus page_to_phys +-#define isa_bus_to_virt phys_to_virt +- +-/* +- * However PCI ones are not necessarily 1:1 and therefore these interfaces +- * are forbidden in portable PCI drivers. +- * +- * Allow them on x86 for legacy drivers, though. +- */ +-#define virt_to_bus virt_to_phys +-#define bus_to_virt phys_to_virt +- + void __memcpy_fromio(void *, unsigned long, unsigned); + void __memcpy_toio(unsigned long, const void *, unsigned); + +Index: linux-2.6-tip/arch/x86/include/asm/io_apic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/io_apic.h ++++ linux-2.6-tip/arch/x86/include/asm/io_apic.h +@@ -114,38 +114,16 @@ struct IR_IO_APIC_route_entry { + extern int nr_ioapics; + extern int nr_ioapic_registers[MAX_IO_APICS]; + +-/* +- * MP-BIOS irq configuration table structures: +- */ +- + #define MP_MAX_IOAPIC_PIN 127 + +-struct mp_config_ioapic { +- unsigned long mp_apicaddr; +- unsigned int mp_apicid; +- unsigned char mp_type; +- unsigned char mp_apicver; +- unsigned char mp_flags; +-}; +- +-struct mp_config_intsrc { +- unsigned int mp_dstapic; +- unsigned char mp_type; +- unsigned char mp_irqtype; +- unsigned short mp_irqflag; +- unsigned char mp_srcbus; +- unsigned char mp_srcbusirq; +- unsigned char mp_dstirq; +-}; +- + /* I/O APIC entries */ +-extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; ++extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; + + /* # of MP IRQ source entries */ + extern int mp_irq_entries; + + /* MP IRQ source entries */ +-extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; ++extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; + + /* non-0 if default (table-less) MP configuration */ + extern int mpc_default_type; +@@ -165,15 +143,6 @@ extern int noioapicreroute; + /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ + extern int timer_through_8259; + +-static inline void disable_ioapic_setup(void) +-{ +-#ifdef CONFIG_PCI +- noioapicquirk = 1; +- noioapicreroute = -1; +-#endif +- skip_ioapic_setup = 1; +-} +- + /* + * If we use the IO-APIC for IRQ routing, disable automatic + * assignment of PCI IRQ's. 
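/*
 * Illustrative sketch (not part of the patch): a minimal driver-style use
 * of the ioremap()/iounmap() and virt_to_phys() interfaces that the io.h
 * hunks above consolidate out of io_32.h/io_64.h.  EXAMPLE_MMIO_BASE,
 * EXAMPLE_MMIO_SIZE and example_map_device() are hypothetical names made
 * up for this example only.
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/io.h>

#define EXAMPLE_MMIO_BASE	0xfed00000UL	/* hypothetical BAR address */
#define EXAMPLE_MMIO_SIZE	0x1000UL

static int example_map_device(void)
{
	void __iomem *regs;
	void *buf;

	/* The default ioremap() is uncached, i.e. ioremap_nocache(). */
	regs = ioremap(EXAMPLE_MMIO_BASE, EXAMPLE_MMIO_SIZE);
	if (!regs)
		return -ENOMEM;

	writel(0x1, regs);	/* MMIO access through the mmio helpers */

	/* virt_to_phys() is only valid for directly-mapped/kmalloc memory. */
	buf = kmalloc(256, GFP_KERNEL);
	if (buf)
		pr_info("buffer at phys 0x%llx\n",
			(unsigned long long)virt_to_phys(buf));

	kfree(buf);
	iounmap(regs);
	return 0;
}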
+@@ -193,13 +162,20 @@ extern int (*ioapic_renumber_irq)(int io + extern void ioapic_init_mappings(void); + + #ifdef CONFIG_X86_64 +-extern int save_mask_IO_APIC_setup(void); ++extern int save_IO_APIC_setup(void); ++extern void mask_IO_APIC_setup(void); + extern void restore_IO_APIC_setup(void); + extern void reinit_intr_remapped_IO_APIC(int); + #endif + + extern void probe_nr_irqs_gsi(void); + ++extern int setup_ioapic_entry(int apic, int irq, ++ struct IO_APIC_route_entry *entry, ++ unsigned int destination, int trigger, ++ int polarity, int vector, int pin); ++extern void ioapic_write_entry(int apic, int pin, ++ struct IO_APIC_route_entry e); + #else /* !CONFIG_X86_IO_APIC */ + #define io_apic_assign_pci_irqs 0 + static const int timer_through_8259 = 0; +Index: linux-2.6-tip/arch/x86/include/asm/iommu.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/iommu.h ++++ linux-2.6-tip/arch/x86/include/asm/iommu.h +@@ -3,7 +3,7 @@ + + extern void pci_iommu_shutdown(void); + extern void no_iommu_init(void); +-extern struct dma_mapping_ops nommu_dma_ops; ++extern struct dma_map_ops nommu_dma_ops; + extern int force_iommu, no_iommu; + extern int iommu_detected; + +Index: linux-2.6-tip/arch/x86/include/asm/ipi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/ipi.h ++++ linux-2.6-tip/arch/x86/include/asm/ipi.h +@@ -1,6 +1,8 @@ + #ifndef _ASM_X86_IPI_H + #define _ASM_X86_IPI_H + ++#ifdef CONFIG_X86_LOCAL_APIC ++ + /* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 +@@ -55,8 +57,8 @@ static inline void __xapic_wait_icr_idle + cpu_relax(); + } + +-static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, +- unsigned int dest) ++static inline void ++__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) + { + /* + * Subtle. In the case of the 'never do double writes' workaround +@@ -87,8 +89,8 @@ static inline void __send_IPI_shortcut(u + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +-static inline void __send_IPI_dest_field(unsigned int mask, int vector, +- unsigned int dest) ++static inline void ++ __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) + { + unsigned long cfg; + +@@ -117,41 +119,44 @@ static inline void __send_IPI_dest_field + native_apic_mem_write(APIC_ICR, cfg); + } + +-static inline void send_IPI_mask_sequence(const struct cpumask *mask, +- int vector) +-{ +- unsigned long flags; +- unsigned long query_cpu; ++extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, ++ int vector); ++extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, ++ int vector); ++extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, ++ int vector); ++extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, ++ int vector); ++ ++/* Avoid include hell */ ++#define NMI_VECTOR 0x02 + +- /* +- * Hack. The clustered APIC addressing mode doesn't allow us to send +- * to an arbitrary mask, so I do a unicast to each CPU instead. 
+- * - mbligh +- */ +- local_irq_save(flags); +- for_each_cpu(query_cpu, mask) { +- __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), +- vector, APIC_DEST_PHYSICAL); +- } +- local_irq_restore(flags); ++extern int no_broadcast; ++ ++static inline void __default_local_send_IPI_allbutself(int vector) ++{ ++ if (no_broadcast || vector == NMI_VECTOR) ++ apic->send_IPI_mask_allbutself(cpu_online_mask, vector); ++ else ++ __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, apic->dest_logical); + } + +-static inline void send_IPI_mask_allbutself(const struct cpumask *mask, +- int vector) ++static inline void __default_local_send_IPI_all(int vector) + { +- unsigned long flags; +- unsigned int query_cpu; +- unsigned int this_cpu = smp_processor_id(); ++ if (no_broadcast || vector == NMI_VECTOR) ++ apic->send_IPI_mask(cpu_online_mask, vector); ++ else ++ __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector, apic->dest_logical); ++} + +- /* See Hack comment above */ ++#ifdef CONFIG_X86_32 ++extern void default_send_IPI_mask_logical(const struct cpumask *mask, ++ int vector); ++extern void default_send_IPI_allbutself(int vector); ++extern void default_send_IPI_all(int vector); ++extern void default_send_IPI_self(int vector); ++#endif + +- local_irq_save(flags); +- for_each_cpu(query_cpu, mask) +- if (query_cpu != this_cpu) +- __send_IPI_dest_field( +- per_cpu(x86_cpu_to_apicid, query_cpu), +- vector, APIC_DEST_PHYSICAL); +- local_irq_restore(flags); +-} ++#endif + + #endif /* _ASM_X86_IPI_H */ +Index: linux-2.6-tip/arch/x86/include/asm/irq.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/irq.h ++++ linux-2.6-tip/arch/x86/include/asm/irq.h +@@ -36,9 +36,12 @@ static inline int irq_canonicalize(int i + extern void fixup_irqs(void); + #endif + +-extern unsigned int do_IRQ(struct pt_regs *regs); ++extern void (*generic_interrupt_extension)(void); + extern void init_IRQ(void); + extern void native_init_IRQ(void); ++extern bool handle_irq(unsigned irq, struct pt_regs *regs); ++ ++extern unsigned int do_IRQ(struct pt_regs *regs); + + /* Interrupt vector management */ + extern DECLARE_BITMAP(used_vectors, NR_VECTORS); +Index: linux-2.6-tip/arch/x86/include/asm/irq_regs.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/irq_regs.h ++++ linux-2.6-tip/arch/x86/include/asm/irq_regs.h +@@ -1,5 +1,31 @@ +-#ifdef CONFIG_X86_32 +-# include "irq_regs_32.h" +-#else +-# include "irq_regs_64.h" +-#endif ++/* ++ * Per-cpu current frame pointer - the location of the last exception frame on ++ * the stack, stored in the per-cpu area. 
++ * ++ * Jeremy Fitzhardinge ++ */ ++#ifndef _ASM_X86_IRQ_REGS_H ++#define _ASM_X86_IRQ_REGS_H ++ ++#include ++ ++#define ARCH_HAS_OWN_IRQ_REGS ++ ++DECLARE_PER_CPU(struct pt_regs *, irq_regs); ++ ++static inline struct pt_regs *get_irq_regs(void) ++{ ++ return percpu_read(irq_regs); ++} ++ ++static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) ++{ ++ struct pt_regs *old_regs; ++ ++ old_regs = get_irq_regs(); ++ percpu_write(irq_regs, new_regs); ++ ++ return old_regs; ++} ++ ++#endif /* _ASM_X86_IRQ_REGS_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/irq_regs_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/irq_regs_32.h ++++ /dev/null +@@ -1,31 +0,0 @@ +-/* +- * Per-cpu current frame pointer - the location of the last exception frame on +- * the stack, stored in the per-cpu area. +- * +- * Jeremy Fitzhardinge +- */ +-#ifndef _ASM_X86_IRQ_REGS_32_H +-#define _ASM_X86_IRQ_REGS_32_H +- +-#include +- +-#define ARCH_HAS_OWN_IRQ_REGS +- +-DECLARE_PER_CPU(struct pt_regs *, irq_regs); +- +-static inline struct pt_regs *get_irq_regs(void) +-{ +- return x86_read_percpu(irq_regs); +-} +- +-static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) +-{ +- struct pt_regs *old_regs; +- +- old_regs = get_irq_regs(); +- x86_write_percpu(irq_regs, new_regs); +- +- return old_regs; +-} +- +-#endif /* _ASM_X86_IRQ_REGS_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/irq_regs_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/irq_regs_64.h ++++ /dev/null +@@ -1 +0,0 @@ +-#include +Index: linux-2.6-tip/arch/x86/include/asm/irq_remapping.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/irq_remapping.h ++++ linux-2.6-tip/arch/x86/include/asm/irq_remapping.h +@@ -1,8 +1,6 @@ + #ifndef _ASM_X86_IRQ_REMAPPING_H + #define _ASM_X86_IRQ_REMAPPING_H + +-extern int x2apic; +- + #define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) + + #endif /* _ASM_X86_IRQ_REMAPPING_H */ +Index: linux-2.6-tip/arch/x86/include/asm/irq_vectors.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/irq_vectors.h ++++ linux-2.6-tip/arch/x86/include/asm/irq_vectors.h +@@ -1,47 +1,69 @@ + #ifndef _ASM_X86_IRQ_VECTORS_H + #define _ASM_X86_IRQ_VECTORS_H + +-#include ++/* ++ * Linux IRQ vector layout. ++ * ++ * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can ++ * be defined by Linux. They are used as a jump table by the CPU when a ++ * given vector is triggered - by a CPU-external, CPU-internal or ++ * software-triggered event. ++ * ++ * Linux sets the kernel code address each entry jumps to early during ++ * bootup, and never changes them. This is the general layout of the ++ * IDT entries: ++ * ++ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events ++ * Vectors 32 ... 127 : device interrupts ++ * Vector 128 : legacy int80 syscall interface ++ * Vectors 129 ... 237 : device interrupts ++ * Vectors 238 ... 255 : special interrupts ++ * ++ * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 
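/*
 * Illustrative sketch (not part of the patch): the usual save/restore
 * pattern for the get_irq_regs()/set_irq_regs() accessors defined in the
 * unified irq_regs.h above.  example_do_IRQ() is a hypothetical entry
 * point; the pattern mirrors what the x86 do_IRQ() path does.
 */
#include <asm/irq_regs.h>
#include <asm/ptrace.h>

unsigned int example_do_IRQ(struct pt_regs *regs)
{
	/* Publish this CPU's exception frame for the duration of the IRQ. */
	struct pt_regs *old_regs = set_irq_regs(regs);

	/*
	 * ... dispatch the interrupt here; any code running on this CPU
	 * (profiling, NMI backtraces, ...) can reach the frame through
	 * get_irq_regs() while we are inside the handler ...
	 */

	/* Restore the previous frame pointer so nested entries keep working. */
	set_irq_regs(old_regs);
	return 1;
}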
++ * ++ * This file enumerates the exact layout of them: ++ */ + +-#define NMI_VECTOR 0x02 ++#define NMI_VECTOR 0x02 + + /* + * IDT vectors usable for external interrupt sources start + * at 0x20: + */ +-#define FIRST_EXTERNAL_VECTOR 0x20 ++#define FIRST_EXTERNAL_VECTOR 0x20 + + #ifdef CONFIG_X86_32 +-# define SYSCALL_VECTOR 0x80 ++# define SYSCALL_VECTOR 0x80 + #else +-# define IA32_SYSCALL_VECTOR 0x80 ++# define IA32_SYSCALL_VECTOR 0x80 + #endif + + /* + * Reserve the lowest usable priority level 0x20 - 0x2f for triggering + * cleanup after irq migration. + */ +-#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR ++#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR + + /* + * Vectors 0x30-0x3f are used for ISA interrupts. + */ +-#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +-#define IRQ1_VECTOR (IRQ0_VECTOR + 1) +-#define IRQ2_VECTOR (IRQ0_VECTOR + 2) +-#define IRQ3_VECTOR (IRQ0_VECTOR + 3) +-#define IRQ4_VECTOR (IRQ0_VECTOR + 4) +-#define IRQ5_VECTOR (IRQ0_VECTOR + 5) +-#define IRQ6_VECTOR (IRQ0_VECTOR + 6) +-#define IRQ7_VECTOR (IRQ0_VECTOR + 7) +-#define IRQ8_VECTOR (IRQ0_VECTOR + 8) +-#define IRQ9_VECTOR (IRQ0_VECTOR + 9) +-#define IRQ10_VECTOR (IRQ0_VECTOR + 10) +-#define IRQ11_VECTOR (IRQ0_VECTOR + 11) +-#define IRQ12_VECTOR (IRQ0_VECTOR + 12) +-#define IRQ13_VECTOR (IRQ0_VECTOR + 13) +-#define IRQ14_VECTOR (IRQ0_VECTOR + 14) +-#define IRQ15_VECTOR (IRQ0_VECTOR + 15) ++#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) ++ ++#define IRQ1_VECTOR (IRQ0_VECTOR + 1) ++#define IRQ2_VECTOR (IRQ0_VECTOR + 2) ++#define IRQ3_VECTOR (IRQ0_VECTOR + 3) ++#define IRQ4_VECTOR (IRQ0_VECTOR + 4) ++#define IRQ5_VECTOR (IRQ0_VECTOR + 5) ++#define IRQ6_VECTOR (IRQ0_VECTOR + 6) ++#define IRQ7_VECTOR (IRQ0_VECTOR + 7) ++#define IRQ8_VECTOR (IRQ0_VECTOR + 8) ++#define IRQ9_VECTOR (IRQ0_VECTOR + 9) ++#define IRQ10_VECTOR (IRQ0_VECTOR + 10) ++#define IRQ11_VECTOR (IRQ0_VECTOR + 11) ++#define IRQ12_VECTOR (IRQ0_VECTOR + 12) ++#define IRQ13_VECTOR (IRQ0_VECTOR + 13) ++#define IRQ14_VECTOR (IRQ0_VECTOR + 14) ++#define IRQ15_VECTOR (IRQ0_VECTOR + 15) + + /* + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff +@@ -49,119 +71,103 @@ + * some of the following vectors are 'rare', they are merged + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. + * TLB, reschedule and local APIC vectors are performance-critical. +- * +- * Vectors 0xf0-0xfa are free (reserved for future Linux use). 
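/*
 * Illustrative arithmetic (not part of the patch): with the definitions
 * above, the 16 ISA IRQs land on vectors 0x30-0x3f, i.e.
 * FIRST_EXTERNAL_VECTOR (0x20) + 0x10 + irq.  A hypothetical compile-time
 * sanity check spelling that out:
 */
#include <linux/kernel.h>
#include <asm/irq_vectors.h>

static inline void example_vector_layout_checks(void)
{
	BUILD_BUG_ON(IRQ0_VECTOR  != 0x30);	/* 0x20 + 0x10      */
	BUILD_BUG_ON(IRQ15_VECTOR != 0x3f);	/* IRQ0_VECTOR + 15 */
	BUILD_BUG_ON(IRQ0_VECTOR  <= IRQ_MOVE_CLEANUP_VECTOR);
}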
+ */ +-#ifdef CONFIG_X86_32 +- +-# define SPURIOUS_APIC_VECTOR 0xff +-# define ERROR_APIC_VECTOR 0xfe +-# define INVALIDATE_TLB_VECTOR 0xfd +-# define RESCHEDULE_VECTOR 0xfc +-# define CALL_FUNCTION_VECTOR 0xfb +-# define CALL_FUNCTION_SINGLE_VECTOR 0xfa +-# define THERMAL_APIC_VECTOR 0xf0 +- +-#else + + #define SPURIOUS_APIC_VECTOR 0xff ++/* ++ * Sanity check ++ */ ++#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) ++# error SPURIOUS_APIC_VECTOR definition error ++#endif ++ + #define ERROR_APIC_VECTOR 0xfe + #define RESCHEDULE_VECTOR 0xfd + #define CALL_FUNCTION_VECTOR 0xfc + #define CALL_FUNCTION_SINGLE_VECTOR 0xfb + #define THERMAL_APIC_VECTOR 0xfa +-#define THRESHOLD_APIC_VECTOR 0xf9 +-#define UV_BAU_MESSAGE 0xf8 +-#define INVALIDATE_TLB_VECTOR_END 0xf7 +-#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ +- +-#define NUM_INVALIDATE_TLB_VECTORS 8 + ++#ifdef CONFIG_X86_32 ++/* 0xf8 - 0xf9 : free */ ++#else ++# define THRESHOLD_APIC_VECTOR 0xf9 ++# define UV_BAU_MESSAGE 0xf8 + #endif + ++/* f0-f7 used for spreading out TLB flushes: */ ++#define INVALIDATE_TLB_VECTOR_END 0xf7 ++#define INVALIDATE_TLB_VECTOR_START 0xf0 ++#define NUM_INVALIDATE_TLB_VECTORS 8 ++ + /* + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. + */ +-#define LOCAL_TIMER_VECTOR 0xef ++#define LOCAL_TIMER_VECTOR 0xef ++ ++/* ++ * Performance monitoring interrupt vector: ++ */ ++#define LOCAL_PERF_VECTOR 0xee ++ ++/* ++ * Generic system vector for platform specific use ++ */ ++#define GENERIC_INTERRUPT_VECTOR 0xed + + /* + * First APIC vector available to drivers: (vectors 0x30-0xee) we + * start at 0x31(0x41) to spread out vectors evenly between priority + * levels. (0x80 is the syscall vector) + */ +-#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) ++#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) + +-#define NR_VECTORS 256 ++#define NR_VECTORS 256 + +-#define FPU_IRQ 13 ++#define FPU_IRQ 13 + +-#define FIRST_VM86_IRQ 3 +-#define LAST_VM86_IRQ 15 +-#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) ++#define FIRST_VM86_IRQ 3 ++#define LAST_VM86_IRQ 15 + +-#define NR_IRQS_LEGACY 16 +- +-#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) +- +-#ifndef CONFIG_SPARSE_IRQ +-# if NR_CPUS < MAX_IO_APICS +-# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) +-# else +-# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +-# endif +-#else +-# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) +-# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) +-# else +-# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +-# endif ++#ifndef __ASSEMBLY__ ++static inline int invalid_vm86_irq(int irq) ++{ ++ return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; ++} + #endif + +-#elif defined(CONFIG_X86_VOYAGER) +- +-# define NR_IRQS 224 ++/* ++ * Size the maximum number of interrupts. ++ * ++ * If the irq_desc[] array has a sparse layout, we can size things ++ * generously - it scales up linearly with the maximum number of CPUs, ++ * and the maximum number of IO-APICs, whichever is higher. ++ * ++ * In other cases we size more conservatively, to not create too large ++ * static arrays. ++ */ + +-#else /* IO_APIC || VOYAGER */ ++#define NR_IRQS_LEGACY 16 + +-# define NR_IRQS 16 ++#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS ) ++#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) + ++#ifdef CONFIG_X86_IO_APIC ++# ifdef CONFIG_SPARSE_IRQ ++# define NR_IRQS \ ++ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? 
\ ++ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ ++ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) ++# else ++# if NR_CPUS < MAX_IO_APICS ++# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) ++# else ++# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) ++# endif ++# endif ++#else /* !CONFIG_X86_IO_APIC: */ ++# define NR_IRQS NR_IRQS_LEGACY + #endif + +-/* Voyager specific defines */ +-/* These define the CPIs we use in linux */ +-#define VIC_CPI_LEVEL0 0 +-#define VIC_CPI_LEVEL1 1 +-/* now the fake CPIs */ +-#define VIC_TIMER_CPI 2 +-#define VIC_INVALIDATE_CPI 3 +-#define VIC_RESCHEDULE_CPI 4 +-#define VIC_ENABLE_IRQ_CPI 5 +-#define VIC_CALL_FUNCTION_CPI 6 +-#define VIC_CALL_FUNCTION_SINGLE_CPI 7 +- +-/* Now the QIC CPIs: Since we don't need the two initial levels, +- * these are 2 less than the VIC CPIs */ +-#define QIC_CPI_OFFSET 1 +-#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET) +-#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET) +-#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET) +-#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET) +-#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET) +-#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET) +- +-#define VIC_START_FAKE_CPI VIC_TIMER_CPI +-#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI +- +-/* this is the SYS_INT CPI. */ +-#define VIC_SYS_INT 8 +-#define VIC_CMN_INT 15 +- +-/* This is the boot CPI for alternate processors. It gets overwritten +- * by the above once the system has activated all available processors */ +-#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0 +-#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8) +- +- + #endif /* _ASM_X86_IRQ_VECTORS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/kexec.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/kexec.h ++++ linux-2.6-tip/arch/x86/include/asm/kexec.h +@@ -10,27 +10,12 @@ + #else + # define PA_CONTROL_PAGE 0 + # define VA_CONTROL_PAGE 1 +-# define PA_PGD 2 +-# define VA_PGD 3 +-# define PA_PUD_0 4 +-# define VA_PUD_0 5 +-# define PA_PMD_0 6 +-# define VA_PMD_0 7 +-# define PA_PTE_0 8 +-# define VA_PTE_0 9 +-# define PA_PUD_1 10 +-# define VA_PUD_1 11 +-# define PA_PMD_1 12 +-# define VA_PMD_1 13 +-# define PA_PTE_1 14 +-# define VA_PTE_1 15 +-# define PA_TABLE_PAGE 16 +-# define PAGES_NR 17 ++# define PA_TABLE_PAGE 2 ++# define PA_SWAP_PAGE 3 ++# define PAGES_NR 4 + #endif + +-#ifdef CONFIG_X86_32 + # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 +-#endif + + #ifndef __ASSEMBLY__ + +@@ -151,15 +136,16 @@ relocate_kernel(unsigned long indirectio + unsigned int has_pae, + unsigned int preserve_context); + #else +-NORET_TYPE void ++unsigned long + relocate_kernel(unsigned long indirection_page, + unsigned long page_list, +- unsigned long start_address) ATTRIB_NORET; ++ unsigned long start_address, ++ unsigned int preserve_context); + #endif + +-#ifdef CONFIG_X86_32 + #define ARCH_HAS_KIMAGE_ARCH + ++#ifdef CONFIG_X86_32 + struct kimage_arch { + pgd_t *pgd; + #ifdef CONFIG_X86_PAE +@@ -169,6 +155,12 @@ struct kimage_arch { + pte_t *pte0; + pte_t *pte1; + }; ++#else ++struct kimage_arch { ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++}; + #endif + + #endif /* __ASSEMBLY__ */ +Index: linux-2.6-tip/arch/x86/include/asm/kmemcheck.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/kmemcheck.h +@@ -0,0 +1,42 @@ ++#ifndef ASM_X86_KMEMCHECK_H ++#define 
ASM_X86_KMEMCHECK_H ++ ++#include ++#include ++ ++#ifdef CONFIG_KMEMCHECK ++bool kmemcheck_active(struct pt_regs *regs); ++ ++void kmemcheck_show(struct pt_regs *regs); ++void kmemcheck_hide(struct pt_regs *regs); ++ ++bool kmemcheck_fault(struct pt_regs *regs, ++ unsigned long address, unsigned long error_code); ++bool kmemcheck_trap(struct pt_regs *regs); ++#else ++static inline bool kmemcheck_active(struct pt_regs *regs) ++{ ++ return false; ++} ++ ++static inline void kmemcheck_show(struct pt_regs *regs) ++{ ++} ++ ++static inline void kmemcheck_hide(struct pt_regs *regs) ++{ ++} ++ ++static inline bool kmemcheck_fault(struct pt_regs *regs, ++ unsigned long address, unsigned long error_code) ++{ ++ return false; ++} ++ ++static inline bool kmemcheck_trap(struct pt_regs *regs) ++{ ++ return false; ++} ++#endif /* CONFIG_KMEMCHECK */ ++ ++#endif +Index: linux-2.6-tip/arch/x86/include/asm/linkage.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/linkage.h ++++ linux-2.6-tip/arch/x86/include/asm/linkage.h +@@ -1,14 +1,11 @@ + #ifndef _ASM_X86_LINKAGE_H + #define _ASM_X86_LINKAGE_H + ++#include ++ + #undef notrace + #define notrace __attribute__((no_instrument_function)) + +-#ifdef CONFIG_X86_64 +-#define __ALIGN .p2align 4,,15 +-#define __ALIGN_STR ".p2align 4,,15" +-#endif +- + #ifdef CONFIG_X86_32 + #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) + /* +@@ -50,72 +47,20 @@ + __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ + "g" (arg4), "g" (arg5), "g" (arg6)) + +-#endif ++#endif /* CONFIG_X86_32 */ + +-#ifdef CONFIG_X86_ALIGNMENT_16 +-#define __ALIGN .align 16,0x90 +-#define __ALIGN_STR ".align 16,0x90" +-#endif ++#ifdef __ASSEMBLY__ + +-/* +- * to check ENTRY_X86/END_X86 and +- * KPROBE_ENTRY_X86/KPROBE_END_X86 +- * unbalanced-missed-mixed appearance +- */ +-#define __set_entry_x86 .set ENTRY_X86_IN, 0 +-#define __unset_entry_x86 .set ENTRY_X86_IN, 1 +-#define __set_kprobe_x86 .set KPROBE_X86_IN, 0 +-#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1 +- +-#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed" +- +-#define __check_entry_x86 \ +- .ifdef ENTRY_X86_IN; \ +- .ifeq ENTRY_X86_IN; \ +- __macro_err_x86; \ +- .abort; \ +- .endif; \ +- .endif +- +-#define __check_kprobe_x86 \ +- .ifdef KPROBE_X86_IN; \ +- .ifeq KPROBE_X86_IN; \ +- __macro_err_x86; \ +- .abort; \ +- .endif; \ +- .endif +- +-#define __check_entry_kprobe_x86 \ +- __check_entry_x86; \ +- __check_kprobe_x86 +- +-#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86 +- +-#define ENTRY_X86(name) \ +- __check_entry_kprobe_x86; \ +- __set_entry_x86; \ +- .globl name; \ +- __ALIGN; \ ++#define GLOBAL(name) \ ++ .globl name; \ + name: + +-#define END_X86(name) \ +- __unset_entry_x86; \ +- __check_entry_kprobe_x86; \ +- .size name, .-name +- +-#define KPROBE_ENTRY_X86(name) \ +- __check_entry_kprobe_x86; \ +- __set_kprobe_x86; \ +- .pushsection .kprobes.text, "ax"; \ +- .globl name; \ +- __ALIGN; \ +- name: ++#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16) ++#define __ALIGN .p2align 4, 0x90 ++#define __ALIGN_STR __stringify(__ALIGN) ++#endif + +-#define KPROBE_END_X86(name) \ +- __unset_kprobe_x86; \ +- __check_entry_kprobe_x86; \ +- .size name, .-name; \ +- .popsection ++#endif /* __ASSEMBLY__ */ + + #endif /* _ASM_X86_LINKAGE_H */ + +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/apm.h +=================================================================== +--- 
linux-2.6-tip.orig/arch/x86/include/asm/mach-default/apm.h ++++ /dev/null +@@ -1,73 +0,0 @@ +-/* +- * Machine specific APM BIOS functions for generic. +- * Split out from apm.c by Osamu Tomita +- */ +- +-#ifndef _ASM_X86_MACH_DEFAULT_APM_H +-#define _ASM_X86_MACH_DEFAULT_APM_H +- +-#ifdef APM_ZERO_SEGS +-# define APM_DO_ZERO_SEGS \ +- "pushl %%ds\n\t" \ +- "pushl %%es\n\t" \ +- "xorl %%edx, %%edx\n\t" \ +- "mov %%dx, %%ds\n\t" \ +- "mov %%dx, %%es\n\t" \ +- "mov %%dx, %%fs\n\t" \ +- "mov %%dx, %%gs\n\t" +-# define APM_DO_POP_SEGS \ +- "popl %%es\n\t" \ +- "popl %%ds\n\t" +-#else +-# define APM_DO_ZERO_SEGS +-# define APM_DO_POP_SEGS +-#endif +- +-static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, +- u32 *eax, u32 *ebx, u32 *ecx, +- u32 *edx, u32 *esi) +-{ +- /* +- * N.B. We do NOT need a cld after the BIOS call +- * because we always save and restore the flags. +- */ +- __asm__ __volatile__(APM_DO_ZERO_SEGS +- "pushl %%edi\n\t" +- "pushl %%ebp\n\t" +- "lcall *%%cs:apm_bios_entry\n\t" +- "setc %%al\n\t" +- "popl %%ebp\n\t" +- "popl %%edi\n\t" +- APM_DO_POP_SEGS +- : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx), +- "=S" (*esi) +- : "a" (func), "b" (ebx_in), "c" (ecx_in) +- : "memory", "cc"); +-} +- +-static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, +- u32 ecx_in, u32 *eax) +-{ +- int cx, dx, si; +- u8 error; +- +- /* +- * N.B. We do NOT need a cld after the BIOS call +- * because we always save and restore the flags. +- */ +- __asm__ __volatile__(APM_DO_ZERO_SEGS +- "pushl %%edi\n\t" +- "pushl %%ebp\n\t" +- "lcall *%%cs:apm_bios_entry\n\t" +- "setc %%bl\n\t" +- "popl %%ebp\n\t" +- "popl %%edi\n\t" +- APM_DO_POP_SEGS +- : "=a" (*eax), "=b" (error), "=c" (cx), "=d" (dx), +- "=S" (si) +- : "a" (func), "b" (ebx_in), "c" (ecx_in) +- : "memory", "cc"); +- return error; +-} +- +-#endif /* _ASM_X86_MACH_DEFAULT_APM_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/do_timer.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/do_timer.h ++++ /dev/null +@@ -1,16 +0,0 @@ +-/* defines for inline arch setup functions */ +-#include +- +-#include +-#include +- +-/** +- * do_timer_interrupt_hook - hook into timer tick +- * +- * Call the pit clock event handler. see asm/i8253.h +- **/ +- +-static inline void do_timer_interrupt_hook(void) +-{ +- global_clock_event->event_handler(global_clock_event); +-} +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/entry_arch.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/entry_arch.h ++++ /dev/null +@@ -1,36 +0,0 @@ +-/* +- * This file is designed to contain the BUILD_INTERRUPT specifications for +- * all of the extra named interrupt vectors used by the architecture. 
+- * Usually this is the Inter Process Interrupts (IPIs) +- */ +- +-/* +- * The following vectors are part of the Linux architecture, there +- * is no hardware IRQ pin equivalent for them, they are triggered +- * through the ICC by us (IPIs) +- */ +-#ifdef CONFIG_X86_SMP +-BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) +-BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) +-BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) +-BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) +-BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) +-#endif +- +-/* +- * every pentium local APIC has two 'local interrupts', with a +- * soft-definable vector attached to both interrupts, one of +- * which is a timer interrupt, the other one is error counter +- * overflow. Linux uses the local APIC timer interrupt to get +- * a much simpler SMP time architecture: +- */ +-#ifdef CONFIG_X86_LOCAL_APIC +-BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) +-BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) +-BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +- +-#ifdef CONFIG_X86_MCE_P4THERMAL +-BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) +-#endif +- +-#endif +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/mach_apic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/mach_apic.h ++++ /dev/null +@@ -1,168 +0,0 @@ +-#ifndef _ASM_X86_MACH_DEFAULT_MACH_APIC_H +-#define _ASM_X86_MACH_DEFAULT_MACH_APIC_H +- +-#ifdef CONFIG_X86_LOCAL_APIC +- +-#include +-#include +- +-#define APIC_DFR_VALUE (APIC_DFR_FLAT) +- +-static inline const struct cpumask *target_cpus(void) +-{ +-#ifdef CONFIG_SMP +- return cpu_online_mask; +-#else +- return cpumask_of(0); +-#endif +-} +- +-#define NO_BALANCE_IRQ (0) +-#define esr_disable (0) +- +-#ifdef CONFIG_X86_64 +-#include +-#define INT_DELIVERY_MODE (genapic->int_delivery_mode) +-#define INT_DEST_MODE (genapic->int_dest_mode) +-#define TARGET_CPUS (genapic->target_cpus()) +-#define apic_id_registered (genapic->apic_id_registered) +-#define init_apic_ldr (genapic->init_apic_ldr) +-#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +-#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and) +-#define phys_pkg_id (genapic->phys_pkg_id) +-#define vector_allocation_domain (genapic->vector_allocation_domain) +-#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) +-#define send_IPI_self (genapic->send_IPI_self) +-#define wakeup_secondary_cpu (genapic->wakeup_cpu) +-extern void setup_apic_routing(void); +-#else +-#define INT_DELIVERY_MODE dest_LowestPrio +-#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ +-#define TARGET_CPUS (target_cpus()) +-#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init +-/* +- * Set up the logical destination ID. +- * +- * Intel recommends to set DFR, LDR and TPR before enabling +- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel +- * document number 292116). So here it goes... 
+- */ +-static inline void init_apic_ldr(void) +-{ +- unsigned long val; +- +- apic_write(APIC_DFR, APIC_DFR_VALUE); +- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; +- val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); +- apic_write(APIC_LDR, val); +-} +- +-static inline int apic_id_registered(void) +-{ +- return physid_isset(read_apic_id(), phys_cpu_present_map); +-} +- +-static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask) +-{ +- return cpumask_bits(cpumask)[0]; +-} +- +-static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, +- const struct cpumask *andmask) +-{ +- unsigned long mask1 = cpumask_bits(cpumask)[0]; +- unsigned long mask2 = cpumask_bits(andmask)[0]; +- unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; +- +- return (unsigned int)(mask1 & mask2 & mask3); +-} +- +-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +-{ +- return cpuid_apic >> index_msb; +-} +- +-static inline void setup_apic_routing(void) +-{ +-#ifdef CONFIG_X86_IO_APIC +- printk("Enabling APIC mode: %s. Using %d I/O APICs\n", +- "Flat", nr_ioapics); +-#endif +-} +- +-static inline int apicid_to_node(int logical_apicid) +-{ +-#ifdef CONFIG_SMP +- return apicid_2_node[hard_smp_processor_id()]; +-#else +- return 0; +-#endif +-} +- +-static inline void vector_allocation_domain(int cpu, struct cpumask *retmask) +-{ +- /* Careful. Some cpus do not strictly honor the set of cpus +- * specified in the interrupt destination when using lowest +- * priority interrupt delivery mode. +- * +- * In particular there was a hyperthreading cpu observed to +- * deliver interrupts to the wrong hyperthread when only one +- * hyperthread was specified in the interrupt desitination. +- */ +- *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } }; +-} +-#endif +- +-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +-{ +- return physid_isset(apicid, bitmap); +-} +- +-static inline unsigned long check_apicid_present(int bit) +-{ +- return physid_isset(bit, phys_cpu_present_map); +-} +- +-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +-{ +- return phys_map; +-} +- +-static inline int multi_timer_check(int apic, int irq) +-{ +- return 0; +-} +- +-/* Mapping from cpu number to logical apicid */ +-static inline int cpu_to_logical_apicid(int cpu) +-{ +- return 1 << cpu; +-} +- +-static inline int cpu_present_to_apicid(int mps_cpu) +-{ +- if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) +- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); +- else +- return BAD_APICID; +-} +- +-static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) +-{ +- return physid_mask_of_physid(phys_apicid); +-} +- +-static inline void setup_portio_remap(void) +-{ +-} +- +-static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +-{ +- return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); +-} +- +-static inline void enable_apic_mode(void) +-{ +-} +-#endif /* CONFIG_X86_LOCAL_APIC */ +-#endif /* _ASM_X86_MACH_DEFAULT_MACH_APIC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/mach_apicdef.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/mach_apicdef.h ++++ /dev/null +@@ -1,24 +0,0 @@ +-#ifndef _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H +-#define _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H +- +-#include +- +-#ifdef CONFIG_X86_64 +-#define APIC_ID_MASK (genapic->apic_id_mask) +-#define GET_APIC_ID(x) (genapic->get_apic_id(x)) +-#define 
SET_APIC_ID(x) (genapic->set_apic_id(x)) +-#else +-#define APIC_ID_MASK (0xF<<24) +-static inline unsigned get_apic_id(unsigned long x) +-{ +- unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); +- if (APIC_XAPIC(ver)) +- return (((x)>>24)&0xFF); +- else +- return (((x)>>24)&0xF); +-} +- +-#define GET_APIC_ID(x) get_apic_id(x) +-#endif +- +-#endif /* _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/mach_ipi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/mach_ipi.h ++++ /dev/null +@@ -1,64 +0,0 @@ +-#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H +-#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H +- +-/* Avoid include hell */ +-#define NMI_VECTOR 0x02 +- +-void send_IPI_mask_bitmask(const struct cpumask *mask, int vector); +-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +-void __send_IPI_shortcut(unsigned int shortcut, int vector); +- +-extern int no_broadcast; +- +-#ifdef CONFIG_X86_64 +-#include +-#define send_IPI_mask (genapic->send_IPI_mask) +-#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself) +-#else +-static inline void send_IPI_mask(const struct cpumask *mask, int vector) +-{ +- send_IPI_mask_bitmask(mask, vector); +-} +-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +-#endif +- +-static inline void __local_send_IPI_allbutself(int vector) +-{ +- if (no_broadcast || vector == NMI_VECTOR) +- send_IPI_mask_allbutself(cpu_online_mask, vector); +- else +- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); +-} +- +-static inline void __local_send_IPI_all(int vector) +-{ +- if (no_broadcast || vector == NMI_VECTOR) +- send_IPI_mask(cpu_online_mask, vector); +- else +- __send_IPI_shortcut(APIC_DEST_ALLINC, vector); +-} +- +-#ifdef CONFIG_X86_64 +-#define send_IPI_allbutself (genapic->send_IPI_allbutself) +-#define send_IPI_all (genapic->send_IPI_all) +-#else +-static inline void send_IPI_allbutself(int vector) +-{ +- /* +- * if there are no other CPUs in the system then we get an APIC send +- * error if we try to broadcast, thus avoid sending IPIs in this case. 
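/*
 * Illustrative sketch (not part of the patch): with the mach-default
 * mach_ipi.h wrappers above removed, IPIs go through the apic driver
 * operations that the asm/ipi.h hunk earlier in this patch already
 * references (apic->send_IPI_mask() and friends).
 * example_send_reschedule() is a hypothetical caller.
 */
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/irq_vectors.h>

static void example_send_reschedule(int cpu)
{
	/* Unicast a reschedule IPI to one CPU via the installed apic driver. */
	apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}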
+- */ +- if (!(num_online_cpus() > 1)) +- return; +- +- __local_send_IPI_allbutself(vector); +- return; +-} +- +-static inline void send_IPI_all(int vector) +-{ +- __local_send_IPI_all(vector); +-} +-#endif +- +-#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/mach_mpparse.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/mach_mpparse.h ++++ /dev/null +@@ -1,17 +0,0 @@ +-#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H +-#define _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H +- +-static inline int +-mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) +-{ +- return 0; +-} +- +-/* Hook from generic ACPI tables.c */ +-static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- return 0; +-} +- +- +-#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/mach_mpspec.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/mach_mpspec.h ++++ /dev/null +@@ -1,12 +0,0 @@ +-#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H +-#define _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H +- +-#define MAX_IRQ_SOURCES 256 +- +-#if CONFIG_BASE_SMALL == 0 +-#define MAX_MP_BUSSES 256 +-#else +-#define MAX_MP_BUSSES 32 +-#endif +- +-#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/mach_timer.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/mach_timer.h ++++ /dev/null +@@ -1,48 +0,0 @@ +-/* +- * Machine specific calibrate_tsc() for generic. +- * Split out from timer_tsc.c by Osamu Tomita +- */ +-/* ------ Calibrate the TSC ------- +- * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). +- * Too much 64-bit arithmetic here to do this cleanly in C, and for +- * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) +- * output busy loop as low as possible. We avoid reading the CTC registers +- * directly because of the awkward 8-bit access mechanism of the 82C54 +- * device. +- */ +-#ifndef _ASM_X86_MACH_DEFAULT_MACH_TIMER_H +-#define _ASM_X86_MACH_DEFAULT_MACH_TIMER_H +- +-#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ +-#define CALIBRATE_LATCH \ +- ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) +- +-static inline void mach_prepare_counter(void) +-{ +- /* Set the Gate high, disable speaker */ +- outb((inb(0x61) & ~0x02) | 0x01, 0x61); +- +- /* +- * Now let's take care of CTC channel 2 +- * +- * Set the Gate high, program CTC channel 2 for mode 0, +- * (interrupt on terminal count mode), binary count, +- * load 5 * LATCH count, (LSB and MSB) to begin countdown. +- * +- * Some devices need a delay here. 
+- */ +- outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */ +- outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */ +- outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */ +-} +- +-static inline void mach_countup(unsigned long *count_p) +-{ +- unsigned long count = 0; +- do { +- count++; +- } while ((inb_p(0x61) & 0x20) == 0); +- *count_p = count; +-} +- +-#endif /* _ASM_X86_MACH_DEFAULT_MACH_TIMER_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/mach_traps.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/mach_traps.h ++++ /dev/null +@@ -1,33 +0,0 @@ +-/* +- * Machine specific NMI handling for generic. +- * Split out from traps.c by Osamu Tomita +- */ +-#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H +-#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H +- +-#include +- +-static inline unsigned char get_nmi_reason(void) +-{ +- return inb(0x61); +-} +- +-static inline void reassert_nmi(void) +-{ +- int old_reg = -1; +- +- if (do_i_have_lock_cmos()) +- old_reg = current_lock_cmos_reg(); +- else +- lock_cmos(0); /* register doesn't matter here */ +- outb(0x8f, 0x70); +- inb(0x71); /* dummy */ +- outb(0x0f, 0x70); +- inb(0x71); /* dummy */ +- if (old_reg >= 0) +- outb(old_reg, 0x70); +- else +- unlock_cmos(); +-} +- +-#endif /* _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/mach_wakecpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/mach_wakecpu.h ++++ /dev/null +@@ -1,41 +0,0 @@ +-#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H +-#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H +- +-#define TRAMPOLINE_PHYS_LOW (0x467) +-#define TRAMPOLINE_PHYS_HIGH (0x469) +- +-static inline void wait_for_init_deassert(atomic_t *deassert) +-{ +- while (!atomic_read(deassert)) +- cpu_relax(); +- return; +-} +- +-/* Nothing to do for most platforms, since cleared by the INIT cycle */ +-static inline void smp_callin_clear_local_apic(void) +-{ +-} +- +-static inline void store_NMI_vector(unsigned short *high, unsigned short *low) +-{ +-} +- +-static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) +-{ +-} +- +-#ifdef CONFIG_SMP +-extern void __inquire_remote_apic(int apicid); +-#else /* CONFIG_SMP */ +-static inline void __inquire_remote_apic(int apicid) +-{ +-} +-#endif /* CONFIG_SMP */ +- +-static inline void inquire_remote_apic(int apicid) +-{ +- if (apic_verbosity >= APIC_DEBUG) +- __inquire_remote_apic(apicid); +-} +- +-#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/pci-functions.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/pci-functions.h ++++ /dev/null +@@ -1,19 +0,0 @@ +-/* +- * PCI BIOS function numbering for conventional PCI BIOS +- * systems +- */ +- +-#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX +-#define PCIBIOS_PCI_BIOS_PRESENT 0xb101 +-#define PCIBIOS_FIND_PCI_DEVICE 0xb102 +-#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103 +-#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106 +-#define PCIBIOS_READ_CONFIG_BYTE 0xb108 +-#define PCIBIOS_READ_CONFIG_WORD 0xb109 +-#define PCIBIOS_READ_CONFIG_DWORD 0xb10a +-#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b +-#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c +-#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d +-#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e +-#define PCIBIOS_SET_PCI_HW_INT 0xb10f +- +Index: 
linux-2.6-tip/arch/x86/include/asm/mach-default/setup_arch.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/setup_arch.h ++++ /dev/null +@@ -1,3 +0,0 @@ +-/* Hook to call BIOS initialisation function */ +- +-/* no action for generic */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-default/smpboot_hooks.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-default/smpboot_hooks.h ++++ /dev/null +@@ -1,61 +0,0 @@ +-/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws +- * which needs to alter them. */ +- +-static inline void smpboot_clear_io_apic_irqs(void) +-{ +-#ifdef CONFIG_X86_IO_APIC +- io_apic_irqs = 0; +-#endif +-} +- +-static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) +-{ +- CMOS_WRITE(0xa, 0xf); +- local_flush_tlb(); +- pr_debug("1.\n"); +- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = +- start_eip >> 4; +- pr_debug("2.\n"); +- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = +- start_eip & 0xf; +- pr_debug("3.\n"); +-} +- +-static inline void smpboot_restore_warm_reset_vector(void) +-{ +- /* +- * Install writable page 0 entry to set BIOS data area. +- */ +- local_flush_tlb(); +- +- /* +- * Paranoid: Set warm reset code and vector here back +- * to default values. +- */ +- CMOS_WRITE(0, 0xf); +- +- *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; +-} +- +-static inline void __init smpboot_setup_io_apic(void) +-{ +-#ifdef CONFIG_X86_IO_APIC +- /* +- * Here we can be sure that there is an IO-APIC in the system. Let's +- * go and set it up: +- */ +- if (!skip_ioapic_setup && nr_ioapics) +- setup_IO_APIC(); +- else { +- nr_ioapics = 0; +- localise_nmi_watchdog(); +- } +-#endif +-} +- +-static inline void smpboot_clear_io_apic(void) +-{ +-#ifdef CONFIG_X86_IO_APIC +- nr_ioapics = 0; +-#endif +-} +Index: linux-2.6-tip/arch/x86/include/asm/mach-generic/gpio.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-generic/gpio.h ++++ /dev/null +@@ -1,15 +0,0 @@ +-#ifndef _ASM_X86_MACH_GENERIC_GPIO_H +-#define _ASM_X86_MACH_GENERIC_GPIO_H +- +-int gpio_request(unsigned gpio, const char *label); +-void gpio_free(unsigned gpio); +-int gpio_direction_input(unsigned gpio); +-int gpio_direction_output(unsigned gpio, int value); +-int gpio_get_value(unsigned gpio); +-void gpio_set_value(unsigned gpio, int value); +-int gpio_to_irq(unsigned gpio); +-int irq_to_gpio(unsigned irq); +- +-#include /* cansleep wrappers */ +- +-#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-generic/mach_apic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-generic/mach_apic.h ++++ /dev/null +@@ -1,35 +0,0 @@ +-#ifndef _ASM_X86_MACH_GENERIC_MACH_APIC_H +-#define _ASM_X86_MACH_GENERIC_MACH_APIC_H +- +-#include +- +-#define esr_disable (genapic->ESR_DISABLE) +-#define NO_BALANCE_IRQ (genapic->no_balance_irq) +-#define INT_DELIVERY_MODE (genapic->int_delivery_mode) +-#define INT_DEST_MODE (genapic->int_dest_mode) +-#undef APIC_DEST_LOGICAL +-#define APIC_DEST_LOGICAL (genapic->apic_destination_logical) +-#define TARGET_CPUS (genapic->target_cpus()) +-#define apic_id_registered (genapic->apic_id_registered) +-#define init_apic_ldr (genapic->init_apic_ldr) +-#define ioapic_phys_id_map 
(genapic->ioapic_phys_id_map) +-#define setup_apic_routing (genapic->setup_apic_routing) +-#define multi_timer_check (genapic->multi_timer_check) +-#define apicid_to_node (genapic->apicid_to_node) +-#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid) +-#define cpu_present_to_apicid (genapic->cpu_present_to_apicid) +-#define apicid_to_cpu_present (genapic->apicid_to_cpu_present) +-#define setup_portio_remap (genapic->setup_portio_remap) +-#define check_apicid_present (genapic->check_apicid_present) +-#define check_phys_apicid_present (genapic->check_phys_apicid_present) +-#define check_apicid_used (genapic->check_apicid_used) +-#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +-#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and) +-#define vector_allocation_domain (genapic->vector_allocation_domain) +-#define enable_apic_mode (genapic->enable_apic_mode) +-#define phys_pkg_id (genapic->phys_pkg_id) +-#define wakeup_secondary_cpu (genapic->wakeup_cpu) +- +-extern void generic_bigsmp_probe(void); +- +-#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-generic/mach_apicdef.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-generic/mach_apicdef.h ++++ /dev/null +@@ -1,11 +0,0 @@ +-#ifndef _ASM_X86_MACH_GENERIC_MACH_APICDEF_H +-#define _ASM_X86_MACH_GENERIC_MACH_APICDEF_H +- +-#ifndef APIC_DEFINITION +-#include +- +-#define GET_APIC_ID (genapic->get_apic_id) +-#define APIC_ID_MASK (genapic->apic_id_mask) +-#endif +- +-#endif /* _ASM_X86_MACH_GENERIC_MACH_APICDEF_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-generic/mach_ipi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-generic/mach_ipi.h ++++ /dev/null +@@ -1,10 +0,0 @@ +-#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H +-#define _ASM_X86_MACH_GENERIC_MACH_IPI_H +- +-#include +- +-#define send_IPI_mask (genapic->send_IPI_mask) +-#define send_IPI_allbutself (genapic->send_IPI_allbutself) +-#define send_IPI_all (genapic->send_IPI_all) +- +-#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-generic/mach_mpparse.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-generic/mach_mpparse.h ++++ /dev/null +@@ -1,9 +0,0 @@ +-#ifndef _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H +-#define _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H +- +- +-extern int mps_oem_check(struct mpc_table *, char *, char *); +- +-extern int acpi_madt_oem_check(char *, char *); +- +-#endif /* _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-generic/mach_mpspec.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-generic/mach_mpspec.h ++++ /dev/null +@@ -1,12 +0,0 @@ +-#ifndef _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H +-#define _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H +- +-#define MAX_IRQ_SOURCES 256 +- +-/* Summit or generic (i.e. installer) kernels need lots of bus entries. */ +-/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. 
*/ +-#define MAX_MP_BUSSES 260 +- +-extern void numaq_mps_oem_check(struct mpc_table *, char *, char *); +- +-#endif /* _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-generic/mach_wakecpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-generic/mach_wakecpu.h ++++ /dev/null +@@ -1,12 +0,0 @@ +-#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H +-#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H +- +-#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low) +-#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high) +-#define wait_for_init_deassert (genapic->wait_for_init_deassert) +-#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic) +-#define store_NMI_vector (genapic->store_NMI_vector) +-#define restore_NMI_vector (genapic->restore_NMI_vector) +-#define inquire_remote_apic (genapic->inquire_remote_apic) +- +-#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-rdc321x/gpio.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-rdc321x/gpio.h ++++ /dev/null +@@ -1,60 +0,0 @@ +-#ifndef _ASM_X86_MACH_RDC321X_GPIO_H +-#define _ASM_X86_MACH_RDC321X_GPIO_H +- +-#include +- +-extern int rdc_gpio_get_value(unsigned gpio); +-extern void rdc_gpio_set_value(unsigned gpio, int value); +-extern int rdc_gpio_direction_input(unsigned gpio); +-extern int rdc_gpio_direction_output(unsigned gpio, int value); +-extern int rdc_gpio_request(unsigned gpio, const char *label); +-extern void rdc_gpio_free(unsigned gpio); +-extern void __init rdc321x_gpio_setup(void); +- +-/* Wrappers for the arch-neutral GPIO API */ +- +-static inline int gpio_request(unsigned gpio, const char *label) +-{ +- return rdc_gpio_request(gpio, label); +-} +- +-static inline void gpio_free(unsigned gpio) +-{ +- might_sleep(); +- rdc_gpio_free(gpio); +-} +- +-static inline int gpio_direction_input(unsigned gpio) +-{ +- return rdc_gpio_direction_input(gpio); +-} +- +-static inline int gpio_direction_output(unsigned gpio, int value) +-{ +- return rdc_gpio_direction_output(gpio, value); +-} +- +-static inline int gpio_get_value(unsigned gpio) +-{ +- return rdc_gpio_get_value(gpio); +-} +- +-static inline void gpio_set_value(unsigned gpio, int value) +-{ +- rdc_gpio_set_value(gpio, value); +-} +- +-static inline int gpio_to_irq(unsigned gpio) +-{ +- return gpio; +-} +- +-static inline int irq_to_gpio(unsigned irq) +-{ +- return irq; +-} +- +-/* For cansleep */ +-#include +- +-#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h ++++ /dev/null +@@ -1,12 +0,0 @@ +-#define PFX "rdc321x: " +- +-/* General purpose configuration and data registers */ +-#define RDC3210_CFGREG_ADDR 0x0CF8 +-#define RDC3210_CFGREG_DATA 0x0CFC +- +-#define RDC321X_GPIO_CTRL_REG1 0x48 +-#define RDC321X_GPIO_CTRL_REG2 0x84 +-#define RDC321X_GPIO_DATA_REG1 0x4c +-#define RDC321X_GPIO_DATA_REG2 0x88 +- +-#define RDC321X_MAX_GPIO 58 +Index: linux-2.6-tip/arch/x86/include/asm/mach-voyager/do_timer.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-voyager/do_timer.h ++++ /dev/null +@@ -1,17 +0,0 @@ +-/* defines for inline arch setup functions */ +-#include +- +-#include 
+-#include +- +-/** +- * do_timer_interrupt_hook - hook into timer tick +- * +- * Call the pit clock event handler. see asm/i8253.h +- **/ +-static inline void do_timer_interrupt_hook(void) +-{ +- global_clock_event->event_handler(global_clock_event); +- voyager_timer_interrupt(); +-} +- +Index: linux-2.6-tip/arch/x86/include/asm/mach-voyager/entry_arch.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-voyager/entry_arch.h ++++ /dev/null +@@ -1,26 +0,0 @@ +-/* -*- mode: c; c-basic-offset: 8 -*- */ +- +-/* Copyright (C) 2002 +- * +- * Author: James.Bottomley@HansenPartnership.com +- * +- * linux/arch/i386/voyager/entry_arch.h +- * +- * This file builds the VIC and QIC CPI gates +- */ +- +-/* initialise the voyager interrupt gates +- * +- * This uses the macros in irq.h to set up assembly jump gates. The +- * calls are then redirected to the same routine with smp_ prefixed */ +-BUILD_INTERRUPT(vic_sys_interrupt, VIC_SYS_INT) +-BUILD_INTERRUPT(vic_cmn_interrupt, VIC_CMN_INT) +-BUILD_INTERRUPT(vic_cpi_interrupt, VIC_CPI_LEVEL0); +- +-/* do all the QIC interrupts */ +-BUILD_INTERRUPT(qic_timer_interrupt, QIC_TIMER_CPI); +-BUILD_INTERRUPT(qic_invalidate_interrupt, QIC_INVALIDATE_CPI); +-BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI); +-BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI); +-BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI); +-BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI); +Index: linux-2.6-tip/arch/x86/include/asm/mach-voyager/setup_arch.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mach-voyager/setup_arch.h ++++ /dev/null +@@ -1,12 +0,0 @@ +-#include +-#include +-#define VOYAGER_BIOS_INFO ((struct voyager_bios_info *) \ +- (&boot_params.apm_bios_info)) +- +-/* Hook to call BIOS initialisation function */ +- +-/* for voyager, pass the voyager BIOS/SUS info area to the detection +- * routines */ +- +-#define ARCH_SETUP voyager_detect(VOYAGER_BIOS_INFO); +- +Index: linux-2.6-tip/arch/x86/include/asm/mach_timer.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/mach_timer.h +@@ -0,0 +1,48 @@ ++/* ++ * Machine specific calibrate_tsc() for generic. ++ * Split out from timer_tsc.c by Osamu Tomita ++ */ ++/* ------ Calibrate the TSC ------- ++ * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). ++ * Too much 64-bit arithmetic here to do this cleanly in C, and for ++ * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) ++ * output busy loop as low as possible. We avoid reading the CTC registers ++ * directly because of the awkward 8-bit access mechanism of the 82C54 ++ * device. ++ */ ++#ifndef _ASM_X86_MACH_DEFAULT_MACH_TIMER_H ++#define _ASM_X86_MACH_DEFAULT_MACH_TIMER_H ++ ++#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ ++#define CALIBRATE_LATCH \ ++ ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) ++ ++static inline void mach_prepare_counter(void) ++{ ++ /* Set the Gate high, disable speaker */ ++ outb((inb(0x61) & ~0x02) | 0x01, 0x61); ++ ++ /* ++ * Now let's take care of CTC channel 2 ++ * ++ * Set the Gate high, program CTC channel 2 for mode 0, ++ * (interrupt on terminal count mode), binary count, ++ * load 5 * LATCH count, (LSB and MSB) to begin countdown. ++ * ++ * Some devices need a delay here. 
++ */ ++ outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */ ++ outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */ ++ outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */ ++} ++ ++static inline void mach_countup(unsigned long *count_p) ++{ ++ unsigned long count = 0; ++ do { ++ count++; ++ } while ((inb_p(0x61) & 0x20) == 0); ++ *count_p = count; ++} ++ ++#endif /* _ASM_X86_MACH_DEFAULT_MACH_TIMER_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mach_traps.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/mach_traps.h +@@ -0,0 +1,33 @@ ++/* ++ * Machine specific NMI handling for generic. ++ * Split out from traps.c by Osamu Tomita ++ */ ++#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H ++#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H ++ ++#include ++ ++static inline unsigned char get_nmi_reason(void) ++{ ++ return inb(0x61); ++} ++ ++static inline void reassert_nmi(void) ++{ ++ int old_reg = -1; ++ ++ if (do_i_have_lock_cmos()) ++ old_reg = current_lock_cmos_reg(); ++ else ++ lock_cmos(0); /* register doesn't matter here */ ++ outb(0x8f, 0x70); ++ inb(0x71); /* dummy */ ++ outb(0x0f, 0x70); ++ inb(0x71); /* dummy */ ++ if (old_reg >= 0) ++ outb(old_reg, 0x70); ++ else ++ unlock_cmos(); ++} ++ ++#endif /* _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mce.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mce.h ++++ linux-2.6-tip/arch/x86/include/asm/mce.h +@@ -11,6 +11,8 @@ + */ + + #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ ++#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ ++#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ + + #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ + #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ +@@ -90,14 +92,29 @@ extern int mce_disabled; + + #include + ++void mce_setup(struct mce *m); + void mce_log(struct mce *m); + DECLARE_PER_CPU(struct sys_device, device_mce); + extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); + ++/* ++ * To support more than 128 would need to escape the predefined ++ * Linux defined extended banks first. 
++ */ ++#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) ++ + #ifdef CONFIG_X86_MCE_INTEL + void mce_intel_feature_init(struct cpuinfo_x86 *c); ++void cmci_clear(void); ++void cmci_reenable(void); ++void cmci_rediscover(int dying); ++void cmci_recheck(void); + #else + static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } ++static inline void cmci_clear(void) {} ++static inline void cmci_reenable(void) {} ++static inline void cmci_rediscover(int dying) {} ++static inline void cmci_recheck(void) {} + #endif + + #ifdef CONFIG_X86_MCE_AMD +@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo + static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } + #endif + +-void mce_log_therm_throt_event(unsigned int cpu, __u64 status); ++extern int mce_available(struct cpuinfo_x86 *c); ++ ++void mce_log_therm_throt_event(__u64 status); + + extern atomic_t mce_entry; + + extern void do_machine_check(struct pt_regs *, long); ++ ++typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); ++DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); ++ ++enum mcp_flags { ++ MCP_TIMESTAMP = (1 << 0), /* log time stamp */ ++ MCP_UC = (1 << 1), /* log uncorrected errors */ ++}; ++extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); ++ + extern int mce_notify_user(void); + + #endif /* !CONFIG_X86_32 */ +@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x + #else + #define mcheck_init(c) do { } while (0) + #endif +-extern void stop_mce(void); +-extern void restart_mce(void); ++ ++extern void (*mce_threshold_vector)(void); + + #endif /* __KERNEL__ */ + #endif /* _ASM_X86_MCE_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mmu_context.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mmu_context.h ++++ linux-2.6-tip/arch/x86/include/asm/mmu_context.h +@@ -21,11 +21,54 @@ static inline void paravirt_activate_mm( + int init_new_context(struct task_struct *tsk, struct mm_struct *mm); + void destroy_context(struct mm_struct *mm); + +-#ifdef CONFIG_X86_32 +-# include "mmu_context_32.h" +-#else +-# include "mmu_context_64.h" ++ ++static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) ++{ ++#ifdef CONFIG_SMP ++ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) ++ percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); ++#endif ++} ++ ++static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, ++ struct task_struct *tsk) ++{ ++ unsigned cpu = smp_processor_id(); ++ ++ if (likely(prev != next)) { ++ /* stop flush ipis for the previous mm */ ++ cpu_clear(cpu, prev->cpu_vm_mask); ++#ifdef CONFIG_SMP ++ percpu_write(cpu_tlbstate.state, TLBSTATE_OK); ++ percpu_write(cpu_tlbstate.active_mm, next); + #endif ++ cpu_set(cpu, next->cpu_vm_mask); ++ ++ /* Re-load page tables */ ++ load_cr3(next->pgd); ++ ++ /* ++ * load the LDT, if the LDT is different: ++ */ ++ if (unlikely(prev->context.ldt != next->context.ldt)) ++ load_LDT_nolock(&next->context); ++ } ++#ifdef CONFIG_SMP ++ else { ++ percpu_write(cpu_tlbstate.state, TLBSTATE_OK); ++ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); ++ ++ if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { ++ /* We were in lazy tlb mode and leave_mm disabled ++ * tlb flush IPI delivery. We must reload CR3 ++ * to make sure to use no freed page tables. 
++ */ ++ load_cr3(next->pgd); ++ load_LDT_nolock(&next->context); ++ } ++ } ++#endif ++} + + #define activate_mm(prev, next) \ + do { \ +@@ -33,5 +76,17 @@ do { \ + switch_mm((prev), (next), NULL); \ + } while (0); + ++#ifdef CONFIG_X86_32 ++#define deactivate_mm(tsk, mm) \ ++do { \ ++ lazy_load_gs(0); \ ++} while (0) ++#else ++#define deactivate_mm(tsk, mm) \ ++do { \ ++ load_gs_index(0); \ ++ loadsegment(fs, 0); \ ++} while (0) ++#endif + + #endif /* _ASM_X86_MMU_CONTEXT_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mmu_context_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mmu_context_32.h ++++ /dev/null +@@ -1,55 +0,0 @@ +-#ifndef _ASM_X86_MMU_CONTEXT_32_H +-#define _ASM_X86_MMU_CONTEXT_32_H +- +-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +-{ +-#ifdef CONFIG_SMP +- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) +- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY); +-#endif +-} +- +-static inline void switch_mm(struct mm_struct *prev, +- struct mm_struct *next, +- struct task_struct *tsk) +-{ +- int cpu = smp_processor_id(); +- +- if (likely(prev != next)) { +- /* stop flush ipis for the previous mm */ +- cpu_clear(cpu, prev->cpu_vm_mask); +-#ifdef CONFIG_SMP +- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); +- x86_write_percpu(cpu_tlbstate.active_mm, next); +-#endif +- cpu_set(cpu, next->cpu_vm_mask); +- +- /* Re-load page tables */ +- load_cr3(next->pgd); +- +- /* +- * load the LDT, if the LDT is different: +- */ +- if (unlikely(prev->context.ldt != next->context.ldt)) +- load_LDT_nolock(&next->context); +- } +-#ifdef CONFIG_SMP +- else { +- x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); +- BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next); +- +- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { +- /* We were in lazy tlb mode and leave_mm disabled +- * tlb flush IPI delivery. We must reload %cr3. +- */ +- load_cr3(next->pgd); +- load_LDT_nolock(&next->context); +- } +- } +-#endif +-} +- +-#define deactivate_mm(tsk, mm) \ +- asm("movl %0,%%gs": :"r" (0)); +- +-#endif /* _ASM_X86_MMU_CONTEXT_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mmu_context_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mmu_context_64.h ++++ /dev/null +@@ -1,54 +0,0 @@ +-#ifndef _ASM_X86_MMU_CONTEXT_64_H +-#define _ASM_X86_MMU_CONTEXT_64_H +- +-#include +- +-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +-{ +-#ifdef CONFIG_SMP +- if (read_pda(mmu_state) == TLBSTATE_OK) +- write_pda(mmu_state, TLBSTATE_LAZY); +-#endif +-} +- +-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, +- struct task_struct *tsk) +-{ +- unsigned cpu = smp_processor_id(); +- if (likely(prev != next)) { +- /* stop flush ipis for the previous mm */ +- cpu_clear(cpu, prev->cpu_vm_mask); +-#ifdef CONFIG_SMP +- write_pda(mmu_state, TLBSTATE_OK); +- write_pda(active_mm, next); +-#endif +- cpu_set(cpu, next->cpu_vm_mask); +- load_cr3(next->pgd); +- +- if (unlikely(next->context.ldt != prev->context.ldt)) +- load_LDT_nolock(&next->context); +- } +-#ifdef CONFIG_SMP +- else { +- write_pda(mmu_state, TLBSTATE_OK); +- if (read_pda(active_mm) != next) +- BUG(); +- if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { +- /* We were in lazy tlb mode and leave_mm disabled +- * tlb flush IPI delivery. We must reload CR3 +- * to make sure to use no freed page tables. 
+- */ +- load_cr3(next->pgd); +- load_LDT_nolock(&next->context); +- } +- } +-#endif +-} +- +-#define deactivate_mm(tsk, mm) \ +-do { \ +- load_gs_index(0); \ +- asm volatile("movl %0,%%fs"::"r"(0)); \ +-} while (0) +- +-#endif /* _ASM_X86_MMU_CONTEXT_64_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mmzone_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mmzone_32.h ++++ linux-2.6-tip/arch/x86/include/asm/mmzone_32.h +@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn) + #endif /* CONFIG_DISCONTIGMEM */ + + #ifdef CONFIG_NEED_MULTIPLE_NODES +- +-/* +- * Following are macros that are specific to this numa platform. +- */ +-#define reserve_bootmem(addr, size, flags) \ +- reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) +-#define alloc_bootmem(x) \ +- __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +-#define alloc_bootmem_nopanic(x) \ +- __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ +- __pa(MAX_DMA_ADDRESS)) +-#define alloc_bootmem_low(x) \ +- __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) +-#define alloc_bootmem_pages(x) \ +- __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +-#define alloc_bootmem_pages_nopanic(x) \ +- __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ +- __pa(MAX_DMA_ADDRESS)) +-#define alloc_bootmem_low_pages(x) \ +- __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) +-#define alloc_bootmem_node(pgdat, x) \ +-({ \ +- struct pglist_data __maybe_unused \ +- *__alloc_bootmem_node__pgdat = (pgdat); \ +- __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ +- __pa(MAX_DMA_ADDRESS)); \ +-}) +-#define alloc_bootmem_pages_node(pgdat, x) \ +-({ \ +- struct pglist_data __maybe_unused \ +- *__alloc_bootmem_node__pgdat = (pgdat); \ +- __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ +- __pa(MAX_DMA_ADDRESS)); \ +-}) +-#define alloc_bootmem_low_pages_node(pgdat, x) \ +-({ \ +- struct pglist_data __maybe_unused \ +- *__alloc_bootmem_node__pgdat = (pgdat); \ +- __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ +-}) ++/* always use node 0 for bootmem on this numa platform */ ++#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ ++ (NODE_DATA(0)->bdata) + #endif /* CONFIG_NEED_MULTIPLE_NODES */ + + #endif /* _ASM_X86_MMZONE_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mpspec.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mpspec.h ++++ linux-2.6-tip/arch/x86/include/asm/mpspec.h +@@ -9,7 +9,18 @@ extern int apic_version[MAX_APICS]; + extern int pic_mode; + + #ifdef CONFIG_X86_32 +-#include ++ ++/* ++ * Summit or generic (i.e. installer) kernels need lots of bus entries. ++ * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. ++ */ ++#if CONFIG_BASE_SMALL == 0 ++# define MAX_MP_BUSSES 260 ++#else ++# define MAX_MP_BUSSES 32 ++#endif ++ ++#define MAX_IRQ_SOURCES 256 + + extern unsigned int def_to_bigsmp; + extern u8 apicid_2_node[]; +@@ -20,15 +31,15 @@ extern int mp_bus_id_to_local[MAX_MP_BUS + extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; + #endif + +-#define MAX_APICID 256 ++#define MAX_APICID 256 + +-#else ++#else /* CONFIG_X86_64: */ + +-#define MAX_MP_BUSSES 256 ++#define MAX_MP_BUSSES 256 + /* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. 
*/ +-#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4) ++#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4) + +-#endif ++#endif /* CONFIG_X86_64 */ + + extern void early_find_smp_config(void); + extern void early_get_smp_config(void); +@@ -45,11 +56,13 @@ extern int smp_found_config; + extern int mpc_default_type; + extern unsigned long mp_lapic_addr; + +-extern void find_smp_config(void); + extern void get_smp_config(void); ++ + #ifdef CONFIG_X86_MPPARSE ++extern void find_smp_config(void); + extern void early_reserve_e820_mpc_new(void); + #else ++static inline void find_smp_config(void) { } + static inline void early_reserve_e820_mpc_new(void) { } + #endif + +@@ -64,6 +77,8 @@ extern int acpi_probe_gsi(void); + #ifdef CONFIG_X86_IO_APIC + extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, + u32 gsi, int triggering, int polarity); ++extern int mp_find_ioapic(int gsi); ++extern int mp_find_ioapic_pin(int ioapic, int gsi); + #else + static inline int + mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, +@@ -148,4 +163,8 @@ static inline void physid_set_mask_of_ph + + extern physid_mask_t phys_cpu_present_map; + ++extern int generic_mps_oem_check(struct mpc_table *, char *, char *); ++ ++extern int default_acpi_madt_oem_check(char *, char *); ++ + #endif /* _ASM_X86_MPSPEC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/mpspec_def.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/mpspec_def.h ++++ linux-2.6-tip/arch/x86/include/asm/mpspec_def.h +@@ -24,17 +24,18 @@ + # endif + #endif + +-struct intel_mp_floating { +- char mpf_signature[4]; /* "_MP_" */ +- unsigned int mpf_physptr; /* Configuration table address */ +- unsigned char mpf_length; /* Our length (paragraphs) */ +- unsigned char mpf_specification;/* Specification version */ +- unsigned char mpf_checksum; /* Checksum (makes sum 0) */ +- unsigned char mpf_feature1; /* Standard or configuration ? */ +- unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */ +- unsigned char mpf_feature3; /* Unused (0) */ +- unsigned char mpf_feature4; /* Unused (0) */ +- unsigned char mpf_feature5; /* Unused (0) */ ++/* Intel MP Floating Pointer Structure */ ++struct mpf_intel { ++ char signature[4]; /* "_MP_" */ ++ unsigned int physptr; /* Configuration table address */ ++ unsigned char length; /* Our length (paragraphs) */ ++ unsigned char specification; /* Specification version */ ++ unsigned char checksum; /* Checksum (makes sum 0) */ ++ unsigned char feature1; /* Standard or configuration ? 
*/ ++ unsigned char feature2; /* Bit7 set for IMCR|PIC */ ++ unsigned char feature3; /* Unused (0) */ ++ unsigned char feature4; /* Unused (0) */ ++ unsigned char feature5; /* Unused (0) */ + }; + + #define MPC_SIGNATURE "PCMP" +Index: linux-2.6-tip/arch/x86/include/asm/msidef.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/msidef.h ++++ linux-2.6-tip/arch/x86/include/asm/msidef.h +@@ -47,6 +47,7 @@ + #define MSI_ADDR_DEST_ID_MASK 0x00ffff0 + #define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ + MSI_ADDR_DEST_ID_MASK) ++#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00) + + #define MSI_ADDR_IR_EXT_INT (1 << 4) + #define MSI_ADDR_IR_SHV (1 << 3) +Index: linux-2.6-tip/arch/x86/include/asm/msr-index.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/msr-index.h ++++ linux-2.6-tip/arch/x86/include/asm/msr-index.h +@@ -77,6 +77,11 @@ + #define MSR_IA32_MC0_ADDR 0x00000402 + #define MSR_IA32_MC0_MISC 0x00000403 + ++/* These are consecutive and not in the normal 4er MCE bank block */ ++#define MSR_IA32_MC0_CTL2 0x00000280 ++#define CMCI_EN (1ULL << 30) ++#define CMCI_THRESHOLD_MASK 0xffffULL ++ + #define MSR_P6_PERFCTR0 0x000000c1 + #define MSR_P6_PERFCTR1 0x000000c2 + #define MSR_P6_EVNTSEL0 0x00000186 +Index: linux-2.6-tip/arch/x86/include/asm/numa_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/numa_32.h ++++ linux-2.6-tip/arch/x86/include/asm/numa_32.h +@@ -4,8 +4,12 @@ + extern int pxm_to_nid(int pxm); + extern void numa_remove_cpu(int cpu); + +-#ifdef CONFIG_NUMA ++#ifdef CONFIG_HIGHMEM + extern void set_highmem_pages_init(void); ++#else ++static inline void set_highmem_pages_init(void) ++{ ++} + #endif + + #endif /* _ASM_X86_NUMA_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/numaq.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/numaq.h ++++ linux-2.6-tip/arch/x86/include/asm/numaq.h +@@ -31,6 +31,8 @@ + extern int found_numaq; + extern int get_memcfg_numaq(void); + ++extern void *xquad_portio; ++ + /* + * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the + */ +Index: linux-2.6-tip/arch/x86/include/asm/numaq/apic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/numaq/apic.h ++++ /dev/null +@@ -1,142 +0,0 @@ +-#ifndef __ASM_NUMAQ_APIC_H +-#define __ASM_NUMAQ_APIC_H +- +-#include +-#include +-#include +- +-#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) +- +-static inline const cpumask_t *target_cpus(void) +-{ +- return &CPU_MASK_ALL; +-} +- +-#define NO_BALANCE_IRQ (1) +-#define esr_disable (1) +- +-#define INT_DELIVERY_MODE dest_LowestPrio +-#define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */ +- +-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +-{ +- return physid_isset(apicid, bitmap); +-} +-static inline unsigned long check_apicid_present(int bit) +-{ +- return physid_isset(bit, phys_cpu_present_map); +-} +-#define apicid_cluster(apicid) (apicid & 0xF0) +- +-static inline int apic_id_registered(void) +-{ +- return 1; +-} +- +-static inline void init_apic_ldr(void) +-{ +- /* Already done in NUMA-Q firmware */ +-} +- +-static inline void setup_apic_routing(void) +-{ +- printk("Enabling APIC mode: %s. 
Using %d I/O APICs\n", +- "NUMA-Q", nr_ioapics); +-} +- +-/* +- * Skip adding the timer int on secondary nodes, which causes +- * a small but painful rift in the time-space continuum. +- */ +-static inline int multi_timer_check(int apic, int irq) +-{ +- return apic != 0 && irq == 0; +-} +- +-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +-{ +- /* We don't have a good way to do this yet - hack */ +- return physids_promote(0xFUL); +-} +- +-/* Mapping from cpu number to logical apicid */ +-extern u8 cpu_2_logical_apicid[]; +-static inline int cpu_to_logical_apicid(int cpu) +-{ +- if (cpu >= nr_cpu_ids) +- return BAD_APICID; +- return (int)cpu_2_logical_apicid[cpu]; +-} +- +-/* +- * Supporting over 60 cpus on NUMA-Q requires a locality-dependent +- * cpu to APIC ID relation to properly interact with the intelligent +- * mode of the cluster controller. +- */ +-static inline int cpu_present_to_apicid(int mps_cpu) +-{ +- if (mps_cpu < 60) +- return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3)); +- else +- return BAD_APICID; +-} +- +-static inline int apicid_to_node(int logical_apicid) +-{ +- return logical_apicid >> 4; +-} +- +-static inline physid_mask_t apicid_to_cpu_present(int logical_apicid) +-{ +- int node = apicid_to_node(logical_apicid); +- int cpu = __ffs(logical_apicid & 0xf); +- +- return physid_mask_of_physid(cpu + 4*node); +-} +- +-extern void *xquad_portio; +- +-static inline void setup_portio_remap(void) +-{ +- int num_quads = num_online_nodes(); +- +- if (num_quads <= 1) +- return; +- +- printk("Remapping cross-quad port I/O for %d quads\n", num_quads); +- xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD); +- printk("xquad_portio vaddr 0x%08lx, len %08lx\n", +- (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); +-} +- +-static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +-{ +- return (1); +-} +- +-static inline void enable_apic_mode(void) +-{ +-} +- +-/* +- * We use physical apicids here, not logical, so just return the default +- * physical broadcast to stop people from breaking us +- */ +-static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) +-{ +- return (int) 0xF; +-} +- +-static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, +- const struct cpumask *andmask) +-{ +- return (int) 0xF; +-} +- +-/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. 
*/ +-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +-{ +- return cpuid_apic >> index_msb; +-} +- +-#endif /* __ASM_NUMAQ_APIC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/numaq/apicdef.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/numaq/apicdef.h ++++ /dev/null +@@ -1,14 +0,0 @@ +-#ifndef __ASM_NUMAQ_APICDEF_H +-#define __ASM_NUMAQ_APICDEF_H +- +- +-#define APIC_ID_MASK (0xF<<24) +- +-static inline unsigned get_apic_id(unsigned long x) +-{ +- return (((x)>>24)&0x0F); +-} +- +-#define GET_APIC_ID(x) get_apic_id(x) +- +-#endif +Index: linux-2.6-tip/arch/x86/include/asm/numaq/ipi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/numaq/ipi.h ++++ /dev/null +@@ -1,22 +0,0 @@ +-#ifndef __ASM_NUMAQ_IPI_H +-#define __ASM_NUMAQ_IPI_H +- +-void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); +- +-static inline void send_IPI_mask(const struct cpumask *mask, int vector) +-{ +- send_IPI_mask_sequence(mask, vector); +-} +- +-static inline void send_IPI_allbutself(int vector) +-{ +- send_IPI_mask_allbutself(cpu_online_mask, vector); +-} +- +-static inline void send_IPI_all(int vector) +-{ +- send_IPI_mask(cpu_online_mask, vector); +-} +- +-#endif /* __ASM_NUMAQ_IPI_H */ +Index: linux-2.6-tip/arch/x86/include/asm/numaq/mpparse.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/numaq/mpparse.h ++++ /dev/null +@@ -1,6 +0,0 @@ +-#ifndef __ASM_NUMAQ_MPPARSE_H +-#define __ASM_NUMAQ_MPPARSE_H +- +-extern void numaq_mps_oem_check(struct mpc_table *, char *, char *); +- +-#endif /* __ASM_NUMAQ_MPPARSE_H */ +Index: linux-2.6-tip/arch/x86/include/asm/numaq/wakecpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/numaq/wakecpu.h ++++ /dev/null +@@ -1,45 +0,0 @@ +-#ifndef __ASM_NUMAQ_WAKECPU_H +-#define __ASM_NUMAQ_WAKECPU_H +- +-/* This file copes with machines that wakeup secondary CPUs by NMIs */ +- +-#define TRAMPOLINE_PHYS_LOW (0x8) +-#define TRAMPOLINE_PHYS_HIGH (0xa) +- +-/* We don't do anything here because we use NMI's to boot instead */ +-static inline void wait_for_init_deassert(atomic_t *deassert) +-{ +-} +- +-/* +- * Because we use NMIs rather than the INIT-STARTUP sequence to +- * bootstrap the CPUs, the APIC may be in a weird state. Kick it. 
+- */ +-static inline void smp_callin_clear_local_apic(void) +-{ +- clear_local_APIC(); +-} +- +-static inline void store_NMI_vector(unsigned short *high, unsigned short *low) +-{ +- printk("Storing NMI vector\n"); +- *high = +- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)); +- *low = +- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)); +-} +- +-static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) +-{ +- printk("Restoring NMI vector\n"); +- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = +- *high; +- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = +- *low; +-} +- +-static inline void inquire_remote_apic(int apicid) +-{ +-} +- +-#endif /* __ASM_NUMAQ_WAKECPU_H */ +Index: linux-2.6-tip/arch/x86/include/asm/page.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/page.h ++++ linux-2.6-tip/arch/x86/include/asm/page.h +@@ -1,42 +1,11 @@ + #ifndef _ASM_X86_PAGE_H + #define _ASM_X86_PAGE_H + +-#include +- +-/* PAGE_SHIFT determines the page size */ +-#define PAGE_SHIFT 12 +-#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +-#define PAGE_MASK (~(PAGE_SIZE-1)) ++#include + + #ifdef __KERNEL__ + +-#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) +-#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) +- +-/* Cast PAGE_MASK to a signed type so that it is sign-extended if +- virtual addresses are 32-bits but physical addresses are larger +- (ie, 32-bit PAE). */ +-#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) +- +-/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +-#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) +- +-/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +-#define PTE_FLAGS_MASK (~PTE_PFN_MASK) +- +-#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +-#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) +- +-#define HPAGE_SHIFT PMD_SHIFT +-#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) +-#define HPAGE_MASK (~(HPAGE_SIZE - 1)) +-#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +- +-#define HUGE_MAX_HSTATE 2 +- +-#ifndef __ASSEMBLY__ +-#include +-#endif ++#include + + #ifdef CONFIG_X86_64 + #include +@@ -44,38 +13,18 @@ + #include + #endif /* CONFIG_X86_64 */ + +-#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) +- +-#define VM_DATA_DEFAULT_FLAGS \ +- (((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0 ) | \ +- VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +- +- + #ifndef __ASSEMBLY__ + +-typedef struct { pgdval_t pgd; } pgd_t; +-typedef struct { pgprotval_t pgprot; } pgprot_t; +- +-extern int page_is_ram(unsigned long pagenr); +-extern int devmem_is_allowed(unsigned long pagenr); +-extern void map_devmem(unsigned long pfn, unsigned long size, +- pgprot_t vma_prot); +-extern void unmap_devmem(unsigned long pfn, unsigned long size, +- pgprot_t vma_prot); +- +-extern unsigned long max_low_pfn_mapped; +-extern unsigned long max_pfn_mapped; +- + struct page; + + static inline void clear_user_page(void *page, unsigned long vaddr, +- struct page *pg) ++ struct page *pg) + { + clear_page(page); + } + + static inline void copy_user_page(void *to, void *from, unsigned long vaddr, +- struct page *topage) ++ struct page *topage) + { + copy_page(to, from); + } +@@ -84,99 +33,6 @@ static inline void copy_user_page(void * + alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) + #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE + +-static inline pgd_t native_make_pgd(pgdval_t val) +-{ +- return (pgd_t) { val }; +-} +- +-static inline pgdval_t native_pgd_val(pgd_t pgd) +-{ +- return pgd.pgd; +-} +- +-#if PAGETABLE_LEVELS >= 3 +-#if PAGETABLE_LEVELS == 4 +-typedef struct { pudval_t pud; } pud_t; +- +-static inline pud_t native_make_pud(pmdval_t val) +-{ +- return (pud_t) { val }; +-} +- +-static inline pudval_t native_pud_val(pud_t pud) +-{ +- return pud.pud; +-} +-#else /* PAGETABLE_LEVELS == 3 */ +-#include +- +-static inline pudval_t native_pud_val(pud_t pud) +-{ +- return native_pgd_val(pud.pgd); +-} +-#endif /* PAGETABLE_LEVELS == 4 */ +- +-typedef struct { pmdval_t pmd; } pmd_t; +- +-static inline pmd_t native_make_pmd(pmdval_t val) +-{ +- return (pmd_t) { val }; +-} +- +-static inline pmdval_t native_pmd_val(pmd_t pmd) +-{ +- return pmd.pmd; +-} +-#else /* PAGETABLE_LEVELS == 2 */ +-#include +- +-static inline pmdval_t native_pmd_val(pmd_t pmd) +-{ +- return native_pgd_val(pmd.pud.pgd); +-} +-#endif /* PAGETABLE_LEVELS >= 3 */ +- +-static inline pte_t native_make_pte(pteval_t val) +-{ +- return (pte_t) { .pte = val }; +-} +- +-static inline pteval_t native_pte_val(pte_t pte) +-{ +- return pte.pte; +-} +- +-static inline pteval_t native_pte_flags(pte_t pte) +-{ +- return native_pte_val(pte) & PTE_FLAGS_MASK; +-} +- +-#define pgprot_val(x) ((x).pgprot) +-#define __pgprot(x) ((pgprot_t) { (x) } ) +- +-#ifdef CONFIG_PARAVIRT +-#include +-#else /* !CONFIG_PARAVIRT */ +- +-#define pgd_val(x) native_pgd_val(x) +-#define __pgd(x) native_make_pgd(x) +- +-#ifndef __PAGETABLE_PUD_FOLDED +-#define pud_val(x) native_pud_val(x) +-#define __pud(x) native_make_pud(x) +-#endif +- +-#ifndef __PAGETABLE_PMD_FOLDED +-#define pmd_val(x) native_pmd_val(x) +-#define __pmd(x) native_make_pmd(x) +-#endif +- +-#define pte_val(x) native_pte_val(x) +-#define pte_flags(x) native_pte_flags(x) +-#define __pte(x) native_make_pte(x) +- +-#endif /* CONFIG_PARAVIRT */ +- + #define __pa(x) __phys_addr((unsigned long)(x)) + #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) + /* __pa_symbol should be used for C visible symbols. +Index: linux-2.6-tip/arch/x86/include/asm/page_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/page_32.h ++++ linux-2.6-tip/arch/x86/include/asm/page_32.h +@@ -1,82 +1,14 @@ + #ifndef _ASM_X86_PAGE_32_H + #define _ASM_X86_PAGE_32_H + +-/* +- * This handles the memory map. 
+- * +- * A __PAGE_OFFSET of 0xC0000000 means that the kernel has +- * a virtual address space of one gigabyte, which limits the +- * amount of physical memory you can use to about 950MB. +- * +- * If you want more physical memory than this then see the CONFIG_HIGHMEM4G +- * and CONFIG_HIGHMEM64G options in the kernel configuration. +- */ +-#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +- +-#ifdef CONFIG_4KSTACKS +-#define THREAD_ORDER 0 +-#else +-#define THREAD_ORDER 1 +-#endif +-#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +- +-#define STACKFAULT_STACK 0 +-#define DOUBLEFAULT_STACK 1 +-#define NMI_STACK 0 +-#define DEBUG_STACK 0 +-#define MCE_STACK 0 +-#define N_EXCEPTION_STACKS 1 +- +-#ifdef CONFIG_X86_PAE +-/* 44=32+12, the limit we can fit into an unsigned long pfn */ +-#define __PHYSICAL_MASK_SHIFT 44 +-#define __VIRTUAL_MASK_SHIFT 32 +-#define PAGETABLE_LEVELS 3 +- +-#ifndef __ASSEMBLY__ +-typedef u64 pteval_t; +-typedef u64 pmdval_t; +-typedef u64 pudval_t; +-typedef u64 pgdval_t; +-typedef u64 pgprotval_t; +- +-typedef union { +- struct { +- unsigned long pte_low, pte_high; +- }; +- pteval_t pte; +-} pte_t; +-#endif /* __ASSEMBLY__ +- */ +-#else /* !CONFIG_X86_PAE */ +-#define __PHYSICAL_MASK_SHIFT 32 +-#define __VIRTUAL_MASK_SHIFT 32 +-#define PAGETABLE_LEVELS 2 +- +-#ifndef __ASSEMBLY__ +-typedef unsigned long pteval_t; +-typedef unsigned long pmdval_t; +-typedef unsigned long pudval_t; +-typedef unsigned long pgdval_t; +-typedef unsigned long pgprotval_t; +- +-typedef union { +- pteval_t pte; +- pteval_t pte_low; +-} pte_t; +- +-#endif /* __ASSEMBLY__ */ +-#endif /* CONFIG_X86_PAE */ ++#include + + #ifndef __ASSEMBLY__ +-typedef struct page *pgtable_t; +-#endif + + #ifdef CONFIG_HUGETLB_PAGE + #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA + #endif + +-#ifndef __ASSEMBLY__ + #define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) + #ifdef CONFIG_DEBUG_VIRTUAL + extern unsigned long __phys_addr(unsigned long); +@@ -89,23 +21,6 @@ extern unsigned long __phys_addr(unsigne + #define pfn_valid(pfn) ((pfn) < max_mapnr) + #endif /* CONFIG_FLATMEM */ + +-extern int nx_enabled; +- +-/* +- * This much address space is reserved for vmalloc() and iomap() +- * as well as fixmap mappings. +- */ +-extern unsigned int __VMALLOC_RESERVE; +-extern int sysctl_legacy_va_layout; +- +-extern void find_low_pfn_range(void); +-extern unsigned long init_memory_mapping(unsigned long start, +- unsigned long end); +-extern void initmem_init(unsigned long, unsigned long); +-extern void free_initmem(void); +-extern void setup_bootmem_allocator(void); +- +- + #ifdef CONFIG_X86_USE_3DNOW + #include + +Index: linux-2.6-tip/arch/x86/include/asm/page_32_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/page_32_types.h +@@ -0,0 +1,65 @@ ++#ifndef _ASM_X86_PAGE_32_DEFS_H ++#define _ASM_X86_PAGE_32_DEFS_H ++ ++#include ++ ++/* ++ * This handles the memory map. ++ * ++ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has ++ * a virtual address space of one gigabyte, which limits the ++ * amount of physical memory you can use to about 950MB. ++ * ++ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G ++ * and CONFIG_HIGHMEM64G options in the kernel configuration. 
++ */ ++#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) ++ ++#ifdef CONFIG_4KSTACKS ++#define THREAD_ORDER 0 ++#else ++#define THREAD_ORDER 1 ++#endif ++#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) ++ ++#define STACKFAULT_STACK 0 ++#define DOUBLEFAULT_STACK 1 ++#define NMI_STACK 0 ++#define DEBUG_STACK 0 ++#define MCE_STACK 0 ++#define N_EXCEPTION_STACKS 1 ++ ++#ifdef CONFIG_X86_PAE ++/* 44=32+12, the limit we can fit into an unsigned long pfn */ ++#define __PHYSICAL_MASK_SHIFT 44 ++#define __VIRTUAL_MASK_SHIFT 32 ++ ++#else /* !CONFIG_X86_PAE */ ++#define __PHYSICAL_MASK_SHIFT 32 ++#define __VIRTUAL_MASK_SHIFT 32 ++#endif /* CONFIG_X86_PAE */ ++ ++/* ++ * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S) ++ */ ++#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) ++ ++#ifndef __ASSEMBLY__ ++ ++/* ++ * This much address space is reserved for vmalloc() and iomap() ++ * as well as fixmap mappings. ++ */ ++extern unsigned int __VMALLOC_RESERVE; ++extern int sysctl_legacy_va_layout; ++ ++extern void find_low_pfn_range(void); ++extern unsigned long init_memory_mapping(unsigned long start, ++ unsigned long end); ++extern void initmem_init(unsigned long, unsigned long); ++extern void free_initmem(void); ++extern void setup_bootmem_allocator(void); ++ ++#endif /* !__ASSEMBLY__ */ ++ ++#endif /* _ASM_X86_PAGE_32_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/page_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/page_64.h ++++ linux-2.6-tip/arch/x86/include/asm/page_64.h +@@ -1,105 +1,6 @@ + #ifndef _ASM_X86_PAGE_64_H + #define _ASM_X86_PAGE_64_H + +-#define PAGETABLE_LEVELS 4 +- +-#define THREAD_ORDER 1 +-#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +-#define CURRENT_MASK (~(THREAD_SIZE - 1)) +- +-#define EXCEPTION_STACK_ORDER 0 +-#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) +- +-#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) +-#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) +- +-#define IRQSTACK_ORDER 2 +-#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) +- +-#define STACKFAULT_STACK 1 +-#define DOUBLEFAULT_STACK 2 +-#define NMI_STACK 3 +-#define DEBUG_STACK 4 +-#define MCE_STACK 5 +-#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +- +-#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +-#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) +- +-/* +- * Set __PAGE_OFFSET to the most negative possible address + +- * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a +- * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's +- * what Xen requires. +- */ +-#define __PAGE_OFFSET _AC(0xffff880000000000, UL) +- +-#define __PHYSICAL_START CONFIG_PHYSICAL_START +-#define __KERNEL_ALIGN 0x200000 +- +-/* +- * Make sure kernel is aligned to 2MB address. Catching it at compile +- * time is better. Change your config file and compile the kernel +- * for a 2MB aligned address (CONFIG_PHYSICAL_START) +- */ +-#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 +-#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" +-#endif +- +-#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) +-#define __START_KERNEL_map _AC(0xffffffff80000000, UL) +- +-/* See Documentation/x86_64/mm.txt for a description of the memory map. 
*/ +-#define __PHYSICAL_MASK_SHIFT 46 +-#define __VIRTUAL_MASK_SHIFT 48 +- +-/* +- * Kernel image size is limited to 512 MB (see level2_kernel_pgt in +- * arch/x86/kernel/head_64.S), and it is mapped here: +- */ +-#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) +-#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) +- +-#ifndef __ASSEMBLY__ +-void clear_page(void *page); +-void copy_page(void *to, void *from); +- +-/* duplicated to the one in bootmem.h */ +-extern unsigned long max_pfn; +-extern unsigned long phys_base; +- +-extern unsigned long __phys_addr(unsigned long); +-#define __phys_reloc_hide(x) (x) +- +-/* +- * These are used to make use of C type-checking.. +- */ +-typedef unsigned long pteval_t; +-typedef unsigned long pmdval_t; +-typedef unsigned long pudval_t; +-typedef unsigned long pgdval_t; +-typedef unsigned long pgprotval_t; +- +-typedef struct page *pgtable_t; +- +-typedef struct { pteval_t pte; } pte_t; +- +-#define vmemmap ((struct page *)VMEMMAP_START) +- +-extern unsigned long init_memory_mapping(unsigned long start, +- unsigned long end); +- +-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); +-extern void free_initmem(void); +- +-extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); +-extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); +- +-#endif /* !__ASSEMBLY__ */ +- +-#ifdef CONFIG_FLATMEM +-#define pfn_valid(pfn) ((pfn) < max_pfn) +-#endif +- ++#include + + #endif /* _ASM_X86_PAGE_64_H */ +Index: linux-2.6-tip/arch/x86/include/asm/page_64_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/page_64_types.h +@@ -0,0 +1,98 @@ ++#ifndef _ASM_X86_PAGE_64_DEFS_H ++#define _ASM_X86_PAGE_64_DEFS_H ++ ++#define THREAD_ORDER 1 ++#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) ++#define CURRENT_MASK (~(THREAD_SIZE - 1)) ++ ++#define EXCEPTION_STACK_ORDER 0 ++#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) ++ ++#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) ++#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) ++ ++#define IRQ_STACK_ORDER 2 ++#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) ++ ++#ifdef CONFIG_PREEMPT_RT ++# define STACKFAULT_STACK 0 ++# define DOUBLEFAULT_STACK 1 ++# define NMI_STACK 2 ++# define DEBUG_STACK 0 ++# define MCE_STACK 3 ++# define N_EXCEPTION_STACKS 3 /* hw limit: 7 */ ++#else ++# define STACKFAULT_STACK 1 ++# define DOUBLEFAULT_STACK 2 ++# define NMI_STACK 3 ++# define DEBUG_STACK 4 ++# define MCE_STACK 5 ++# define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ ++#endif ++ ++#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) ++#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) ++ ++/* ++ * Set __PAGE_OFFSET to the most negative possible address + ++ * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a ++ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's ++ * what Xen requires. ++ */ ++#define __PAGE_OFFSET _AC(0xffff880000000000, UL) ++ ++#define __PHYSICAL_START CONFIG_PHYSICAL_START ++#define __KERNEL_ALIGN 0x200000 ++ ++/* ++ * Make sure kernel is aligned to 2MB address. Catching it at compile ++ * time is better. 
Change your config file and compile the kernel ++ * for a 2MB aligned address (CONFIG_PHYSICAL_START) ++ */ ++#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 ++#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" ++#endif ++ ++#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) ++#define __START_KERNEL_map _AC(0xffffffff80000000, UL) ++ ++/* See Documentation/x86_64/mm.txt for a description of the memory map. */ ++#define __PHYSICAL_MASK_SHIFT 46 ++#define __VIRTUAL_MASK_SHIFT 48 ++ ++/* ++ * Kernel image size is limited to 512 MB (see level2_kernel_pgt in ++ * arch/x86/kernel/head_64.S), and it is mapped here: ++ */ ++#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) ++#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) ++ ++#ifndef __ASSEMBLY__ ++void clear_page(void *page); ++void copy_page(void *to, void *from); ++ ++/* duplicated to the one in bootmem.h */ ++extern unsigned long max_pfn; ++extern unsigned long phys_base; ++ ++extern unsigned long __phys_addr(unsigned long); ++#define __phys_reloc_hide(x) (x) ++ ++#define vmemmap ((struct page *)VMEMMAP_START) ++ ++extern unsigned long init_memory_mapping(unsigned long start, ++ unsigned long end); ++ ++extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); ++extern void free_initmem(void); ++ ++extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); ++extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); ++ ++#endif /* !__ASSEMBLY__ */ ++ ++#ifdef CONFIG_FLATMEM ++#define pfn_valid(pfn) ((pfn) < max_pfn) ++#endif ++ ++#endif /* _ASM_X86_PAGE_64_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/page_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/page_types.h +@@ -0,0 +1,59 @@ ++#ifndef _ASM_X86_PAGE_DEFS_H ++#define _ASM_X86_PAGE_DEFS_H ++ ++#include ++ ++/* PAGE_SHIFT determines the page size */ ++#define PAGE_SHIFT 12 ++#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) ++#define PAGE_MASK (~(PAGE_SIZE-1)) ++ ++#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) ++#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) ++ ++/* Cast PAGE_MASK to a signed type so that it is sign-extended if ++ virtual addresses are 32-bits but physical addresses are larger ++ (ie, 32-bit PAE). */ ++#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) ++ ++#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) ++#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) ++ ++#define HPAGE_SHIFT PMD_SHIFT ++#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) ++#define HPAGE_MASK (~(HPAGE_SIZE - 1)) ++#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) ++ ++#define HUGE_MAX_HSTATE 2 ++ ++#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) ++ ++#define VM_DATA_DEFAULT_FLAGS \ ++ (((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0 ) | \ ++ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) ++ ++#ifdef CONFIG_X86_64 ++#include ++#else ++#include ++#endif /* CONFIG_X86_64 */ ++ ++#ifndef __ASSEMBLY__ ++ ++enum bootmem_state { ++ BEFORE_BOOTMEM, ++ DURING_BOOTMEM, ++ AFTER_BOOTMEM ++}; ++ ++extern enum bootmem_state bootmem_state; ++ ++extern int page_is_ram(unsigned long pagenr); ++extern int devmem_is_allowed(unsigned long pagenr); ++ ++extern unsigned long max_low_pfn_mapped; ++extern unsigned long max_pfn_mapped; ++ ++#endif /* !__ASSEMBLY__ */ ++ ++#endif /* _ASM_X86_PAGE_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/paravirt.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/paravirt.h ++++ linux-2.6-tip/arch/x86/include/asm/paravirt.h +@@ -4,7 +4,7 @@ + * para-virtualization: those hooks are defined here. */ + + #ifdef CONFIG_PARAVIRT +-#include ++#include + #include + + /* Bitmask of what can be clobbered: usually at least eax. */ +@@ -12,21 +12,38 @@ + #define CLBR_EAX (1 << 0) + #define CLBR_ECX (1 << 1) + #define CLBR_EDX (1 << 2) ++#define CLBR_EDI (1 << 3) + +-#ifdef CONFIG_X86_64 +-#define CLBR_RSI (1 << 3) +-#define CLBR_RDI (1 << 4) ++#ifdef CONFIG_X86_32 ++/* CLBR_ANY should match all regs platform has. For i386, that's just it */ ++#define CLBR_ANY ((1 << 4) - 1) ++ ++#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) ++#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) ++#define CLBR_SCRATCH (0) ++#else ++#define CLBR_RAX CLBR_EAX ++#define CLBR_RCX CLBR_ECX ++#define CLBR_RDX CLBR_EDX ++#define CLBR_RDI CLBR_EDI ++#define CLBR_RSI (1 << 4) + #define CLBR_R8 (1 << 5) + #define CLBR_R9 (1 << 6) + #define CLBR_R10 (1 << 7) + #define CLBR_R11 (1 << 8) ++ + #define CLBR_ANY ((1 << 9) - 1) ++ ++#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ ++ CLBR_RCX | CLBR_R8 | CLBR_R9) ++#define CLBR_RET_REG (CLBR_RAX) ++#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) ++ + #include +-#else +-/* CLBR_ANY should match all regs platform has. For i386, that's just it */ +-#define CLBR_ANY ((1 << 3) - 1) + #endif /* X86_64 */ + ++#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) ++ + #ifndef __ASSEMBLY__ + #include + #include +@@ -40,6 +57,14 @@ struct tss_struct; + struct mm_struct; + struct desc_struct; + ++/* ++ * Wrapper type for pointers to code which uses the non-standard ++ * calling convention. See PV_CALL_SAVE_REGS_THUNK below. ++ */ ++struct paravirt_callee_save { ++ void *func; ++}; ++ + /* general info */ + struct pv_info { + unsigned int kernel_rpl; +@@ -189,11 +214,15 @@ struct pv_irq_ops { + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. ++ * ++ * NOTE: These functions callers expect the callee to preserve ++ * more registers than the standard C calling convention. 
+ */ +- unsigned long (*save_fl)(void); +- void (*restore_fl)(unsigned long); +- void (*irq_disable)(void); +- void (*irq_enable)(void); ++ struct paravirt_callee_save save_fl; ++ struct paravirt_callee_save restore_fl; ++ struct paravirt_callee_save irq_disable; ++ struct paravirt_callee_save irq_enable; ++ + void (*safe_halt)(void); + void (*halt)(void); + +@@ -244,7 +273,8 @@ struct pv_mmu_ops { + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_single)(unsigned long addr); +- void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, ++ void (*flush_tlb_others)(const struct cpumask *cpus, ++ struct mm_struct *mm, + unsigned long va); + + /* Hooks for allocating and freeing a pagetable top-level */ +@@ -278,18 +308,15 @@ struct pv_mmu_ops { + void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + +- pteval_t (*pte_val)(pte_t); +- pteval_t (*pte_flags)(pte_t); +- pte_t (*make_pte)(pteval_t pte); ++ struct paravirt_callee_save pte_val; ++ struct paravirt_callee_save make_pte; + +- pgdval_t (*pgd_val)(pgd_t); +- pgd_t (*make_pgd)(pgdval_t pgd); ++ struct paravirt_callee_save pgd_val; ++ struct paravirt_callee_save make_pgd; + + #if PAGETABLE_LEVELS >= 3 + #ifdef CONFIG_X86_PAE + void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); +- void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, +- pte_t *ptep, pte_t pte); + void (*pte_clear)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pmd_clear)(pmd_t *pmdp); +@@ -298,12 +325,12 @@ struct pv_mmu_ops { + + void (*set_pud)(pud_t *pudp, pud_t pudval); + +- pmdval_t (*pmd_val)(pmd_t); +- pmd_t (*make_pmd)(pmdval_t pmd); ++ struct paravirt_callee_save pmd_val; ++ struct paravirt_callee_save make_pmd; + + #if PAGETABLE_LEVELS == 4 +- pudval_t (*pud_val)(pud_t); +- pud_t (*make_pud)(pudval_t pud); ++ struct paravirt_callee_save pud_val; ++ struct paravirt_callee_save make_pud; + + void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); + #endif /* PAGETABLE_LEVELS == 4 */ +@@ -311,6 +338,7 @@ struct pv_mmu_ops { + + #ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); ++ void *(*kmap_atomic_pte_direct)(struct page *page, enum km_type type); + #endif + + struct pv_lazy_ops lazy_mode; +@@ -320,7 +348,7 @@ struct pv_mmu_ops { + /* Sometimes the physical address is a pfn, and sometimes its + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, +- unsigned long phys, pgprot_t flags); ++ phys_addr_t phys, pgprot_t flags); + }; + + struct raw_spinlock; +@@ -360,7 +388,7 @@ extern struct pv_lock_ops pv_lock_ops; + + #define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ +- [paravirt_opptr] "m" (op) ++ [paravirt_opptr] "i" (&(op)) + #define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +@@ -388,6 +416,8 @@ extern struct pv_lock_ops pv_lock_ops; + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + + unsigned paravirt_patch_nop(void); ++unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); ++unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); + unsigned paravirt_patch_ignore(unsigned len); + unsigned paravirt_patch_call(void *insnbuf, + const void *target, u16 tgt_clobbers, +@@ -412,7 +442,7 @@ int paravirt_disable_iospace(void); + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. 
+ */ +-#define PARAVIRT_CALL "call *%[paravirt_opptr];" ++#define PARAVIRT_CALL "call *%c[paravirt_opptr];" + + /* + * These macros are intended to wrap calls through one of the paravirt +@@ -479,25 +509,45 @@ int paravirt_disable_iospace(void); + * makes sure the incoming and outgoing types are always correct. + */ + #ifdef CONFIG_X86_32 +-#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx ++#define PVOP_VCALL_ARGS \ ++ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS ++ ++#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) ++#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) ++#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) ++ + #define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) + #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS ++ ++#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) ++#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS ++ + #define EXTRA_CLOBBERS + #define VEXTRA_CLOBBERS +-#else +-#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx ++#else /* CONFIG_X86_64 */ ++#define PVOP_VCALL_ARGS \ ++ unsigned long __edi = __edi, __esi = __esi, \ ++ __edx = __edx, __ecx = __ecx + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax ++ ++#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) ++#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) ++#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) ++#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) ++ + #define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +- + #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) + ++#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) ++#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS ++ + #define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" + #define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" +-#endif ++#endif /* CONFIG_X86_32 */ + + #ifdef CONFIG_PARAVIRT_DEBUG + #define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +@@ -505,10 +555,11 @@ int paravirt_disable_iospace(void); + #define PVOP_TEST_NULL(op) ((void)op) + #endif + +-#define __PVOP_CALL(rettype, op, pre, post, ...) \ ++#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ ++ pre, post, ...) \ + ({ \ + rettype __ret; \ +- PVOP_CALL_ARGS; \ ++ PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + /* This is 32-bit specific, but is okay in 64-bit */ \ + /* since this condition will never hold */ \ +@@ -516,70 +567,113 @@ int paravirt_disable_iospace(void); + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ +- : PVOP_CALL_CLOBBERS \ ++ : call_clbr \ + : paravirt_type(op), \ +- paravirt_clobber(CLBR_ANY), \ ++ paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ +- : "memory", "cc" EXTRA_CLOBBERS); \ ++ : "memory", "cc" extra_clbr); \ + __ret = (rettype)((((u64)__edx) << 32) | __eax); \ + } else { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ +- : PVOP_CALL_CLOBBERS \ ++ : call_clbr \ + : paravirt_type(op), \ +- paravirt_clobber(CLBR_ANY), \ ++ paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ +- : "memory", "cc" EXTRA_CLOBBERS); \ ++ : "memory", "cc" extra_clbr); \ + __ret = (rettype)__eax; \ + } \ + __ret; \ + }) +-#define __PVOP_VCALL(op, pre, post, ...) \ ++ ++#define __PVOP_CALL(rettype, op, pre, post, ...) \ ++ ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ ++ EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) ++ ++#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) 
\ ++ ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ ++ PVOP_CALLEE_CLOBBERS, , \ ++ pre, post, ##__VA_ARGS__) ++ ++ ++#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ + ({ \ + PVOP_VCALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ +- : PVOP_VCALL_CLOBBERS \ ++ : call_clbr \ + : paravirt_type(op), \ +- paravirt_clobber(CLBR_ANY), \ ++ paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ +- : "memory", "cc" VEXTRA_CLOBBERS); \ ++ : "memory", "cc" extra_clbr); \ + }) + ++#define __PVOP_VCALL(op, pre, post, ...) \ ++ ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ ++ VEXTRA_CLOBBERS, \ ++ pre, post, ##__VA_ARGS__) ++ ++#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ ++ ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ ++ PVOP_VCALLEE_CLOBBERS, , \ ++ pre, post, ##__VA_ARGS__) ++ ++ ++ + #define PVOP_CALL0(rettype, op) \ + __PVOP_CALL(rettype, op, "", "") + #define PVOP_VCALL0(op) \ + __PVOP_VCALL(op, "", "") + ++#define PVOP_CALLEE0(rettype, op) \ ++ __PVOP_CALLEESAVE(rettype, op, "", "") ++#define PVOP_VCALLEE0(op) \ ++ __PVOP_VCALLEESAVE(op, "", "") ++ ++ + #define PVOP_CALL1(rettype, op, arg1) \ +- __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1))) ++ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) + #define PVOP_VCALL1(op, arg1) \ +- __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1))) ++ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) ++ ++#define PVOP_CALLEE1(rettype, op, arg1) \ ++ __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) ++#define PVOP_VCALLEE1(op, arg1) \ ++ __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) ++ + + #define PVOP_CALL2(rettype, op, arg1, arg2) \ +- __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ +- "1" ((unsigned long)(arg2))) ++ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ ++ PVOP_CALL_ARG2(arg2)) + #define PVOP_VCALL2(op, arg1, arg2) \ +- __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ +- "1" ((unsigned long)(arg2))) ++ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ ++ PVOP_CALL_ARG2(arg2)) ++ ++#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ ++ __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ ++ PVOP_CALL_ARG2(arg2)) ++#define PVOP_VCALLEE2(op, arg1, arg2) \ ++ __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ ++ PVOP_CALL_ARG2(arg2)) ++ + + #define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ +- __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ +- "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) ++ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ ++ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + #define PVOP_VCALL3(op, arg1, arg2, arg3) \ +- __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ +- "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) ++ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ ++ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + + /* This is the only difference in x86_64. 
We can make it much simpler */ + #ifdef CONFIG_X86_32 + #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ +- "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ +- "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) ++ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ ++ PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) + #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ +@@ -587,13 +681,13 @@ int paravirt_disable_iospace(void); + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) + #else + #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ +- __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ +- "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ +- "3"((unsigned long)(arg4))) ++ __PVOP_CALL(rettype, op, "", "", \ ++ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ ++ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) + #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ +- __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ +- "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ +- "3"((unsigned long)(arg4))) ++ __PVOP_VCALL(op, "", "", \ ++ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ ++ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) + #endif + + static inline int paravirt_enabled(void) +@@ -984,10 +1078,11 @@ static inline void __flush_tlb_single(un + PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr); + } + +-static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, ++static inline void flush_tlb_others(const struct cpumask *cpumask, ++ struct mm_struct *mm, + unsigned long va) + { +- PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va); ++ PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va); + } + + static inline int paravirt_pgd_alloc(struct mm_struct *mm) +@@ -1040,6 +1135,14 @@ static inline void *kmap_atomic_pte(stru + ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); + return (void *)ret; + } ++ ++static inline void *kmap_atomic_pte_direct(struct page *page, enum km_type type) ++{ ++ unsigned long ret; ++ ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte_direct, ++ page, type); ++ return (void *)ret; ++} + #endif + + static inline void pte_update(struct mm_struct *mm, unsigned long addr, +@@ -1059,13 +1162,13 @@ static inline pte_t __pte(pteval_t val) + pteval_t ret; + + if (sizeof(pteval_t) > sizeof(long)) +- ret = PVOP_CALL2(pteval_t, +- pv_mmu_ops.make_pte, +- val, (u64)val >> 32); ++ ret = PVOP_CALLEE2(pteval_t, ++ pv_mmu_ops.make_pte, ++ val, (u64)val >> 32); + else +- ret = PVOP_CALL1(pteval_t, +- pv_mmu_ops.make_pte, +- val); ++ ret = PVOP_CALLEE1(pteval_t, ++ pv_mmu_ops.make_pte, ++ val); + + return (pte_t) { .pte = ret }; + } +@@ -1075,42 +1178,25 @@ static inline pteval_t pte_val(pte_t pte + pteval_t ret; + + if (sizeof(pteval_t) > sizeof(long)) +- ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val, +- pte.pte, (u64)pte.pte >> 32); ++ ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val, ++ pte.pte, (u64)pte.pte >> 32); + else +- ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val, +- pte.pte); ++ ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val, ++ pte.pte); + + return ret; + } + +-static inline pteval_t pte_flags(pte_t pte) +-{ +- pteval_t ret; +- +- if (sizeof(pteval_t) > sizeof(long)) +- ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags, +- pte.pte, (u64)pte.pte >> 32); +- else +- ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags, +- pte.pte); +- +-#ifdef CONFIG_PARAVIRT_DEBUG +- 
BUG_ON(ret & PTE_PFN_MASK); +-#endif +- return ret; +-} +- + static inline pgd_t __pgd(pgdval_t val) + { + pgdval_t ret; + + if (sizeof(pgdval_t) > sizeof(long)) +- ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd, +- val, (u64)val >> 32); ++ ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd, ++ val, (u64)val >> 32); + else +- ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd, +- val); ++ ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd, ++ val); + + return (pgd_t) { ret }; + } +@@ -1120,11 +1206,11 @@ static inline pgdval_t pgd_val(pgd_t pgd + pgdval_t ret; + + if (sizeof(pgdval_t) > sizeof(long)) +- ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val, +- pgd.pgd, (u64)pgd.pgd >> 32); ++ ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val, ++ pgd.pgd, (u64)pgd.pgd >> 32); + else +- ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val, +- pgd.pgd); ++ ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val, ++ pgd.pgd); + + return ret; + } +@@ -1188,11 +1274,11 @@ static inline pmd_t __pmd(pmdval_t val) + pmdval_t ret; + + if (sizeof(pmdval_t) > sizeof(long)) +- ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd, +- val, (u64)val >> 32); ++ ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd, ++ val, (u64)val >> 32); + else +- ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd, +- val); ++ ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd, ++ val); + + return (pmd_t) { ret }; + } +@@ -1202,11 +1288,11 @@ static inline pmdval_t pmd_val(pmd_t pmd + pmdval_t ret; + + if (sizeof(pmdval_t) > sizeof(long)) +- ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val, +- pmd.pmd, (u64)pmd.pmd >> 32); ++ ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val, ++ pmd.pmd, (u64)pmd.pmd >> 32); + else +- ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val, +- pmd.pmd); ++ ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val, ++ pmd.pmd); + + return ret; + } +@@ -1228,11 +1314,11 @@ static inline pud_t __pud(pudval_t val) + pudval_t ret; + + if (sizeof(pudval_t) > sizeof(long)) +- ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud, +- val, (u64)val >> 32); ++ ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud, ++ val, (u64)val >> 32); + else +- ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud, +- val); ++ ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud, ++ val); + + return (pud_t) { ret }; + } +@@ -1242,11 +1328,11 @@ static inline pudval_t pud_val(pud_t pud + pudval_t ret; + + if (sizeof(pudval_t) > sizeof(long)) +- ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val, +- pud.pud, (u64)pud.pud >> 32); ++ ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val, ++ pud.pud, (u64)pud.pud >> 32); + else +- ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val, +- pud.pud); ++ ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val, ++ pud.pud); + + return ret; + } +@@ -1286,13 +1372,6 @@ static inline void set_pte_atomic(pte_t + pte.pte, pte.pte >> 32); + } + +-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, +- pte_t *ptep, pte_t pte) +-{ +- /* 5 arg words */ +- pv_mmu_ops.set_pte_present(mm, addr, ptep, pte); +-} +- + static inline void pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) + { +@@ -1309,12 +1388,6 @@ static inline void set_pte_atomic(pte_t + set_pte(ptep, pte); + } + +-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, +- pte_t *ptep, pte_t pte) +-{ +- set_pte(ptep, pte); +-} +- + static inline void pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) + { +@@ -1368,15 +1441,16 @@ static inline void arch_leave_lazy_mmu_m + void arch_flush_lazy_mmu_mode(void); + + static inline void __set_fixmap(unsigned /* enum 
fixed_addresses */ idx, +- unsigned long phys, pgprot_t flags) ++ phys_addr_t phys, pgprot_t flags) + { + pv_mmu_ops.set_fixmap(idx, phys, flags); + } + + void _paravirt_nop(void); +-#define paravirt_nop ((void *)_paravirt_nop) ++u32 _paravirt_ident_32(u32); ++u64 _paravirt_ident_64(u64); + +-void paravirt_use_bytelocks(void); ++#define paravirt_nop ((void *)_paravirt_nop) + + #ifdef CONFIG_SMP + +@@ -1426,12 +1500,37 @@ extern struct paravirt_patch_site __para + __parainstructions_end[]; + + #ifdef CONFIG_X86_32 +-#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;" +-#define PV_RESTORE_REGS "popl %%edx; popl %%ecx" ++#define PV_SAVE_REGS "pushl %ecx; pushl %edx;" ++#define PV_RESTORE_REGS "popl %edx; popl %ecx;" ++ ++/* save and restore all caller-save registers, except return value */ ++#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" ++#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" ++ + #define PV_FLAGS_ARG "0" + #define PV_EXTRA_CLOBBERS + #define PV_VEXTRA_CLOBBERS + #else ++/* save and restore all caller-save registers, except return value */ ++#define PV_SAVE_ALL_CALLER_REGS \ ++ "push %rcx;" \ ++ "push %rdx;" \ ++ "push %rsi;" \ ++ "push %rdi;" \ ++ "push %r8;" \ ++ "push %r9;" \ ++ "push %r10;" \ ++ "push %r11;" ++#define PV_RESTORE_ALL_CALLER_REGS \ ++ "pop %r11;" \ ++ "pop %r10;" \ ++ "pop %r9;" \ ++ "pop %r8;" \ ++ "pop %rdi;" \ ++ "pop %rsi;" \ ++ "pop %rdx;" \ ++ "pop %rcx;" ++ + /* We save some registers, but all of them, that's too much. We clobber all + * caller saved registers but the argument parameter */ + #define PV_SAVE_REGS "pushq %%rdi;" +@@ -1441,52 +1540,76 @@ extern struct paravirt_patch_site __para + #define PV_FLAGS_ARG "D" + #endif + ++/* ++ * Generate a thunk around a function which saves all caller-save ++ * registers except for the return value. This allows C functions to ++ * be called from assembler code where fewer than normal registers are ++ * available. It may also help code generation around calls from C ++ * code if the common case doesn't use many registers. ++ * ++ * When a callee is wrapped in a thunk, the caller can assume that all ++ * arg regs and all scratch registers are preserved across the ++ * call. The return value in rax/eax will not be saved, even for void ++ * functions. 
++ */ ++#define PV_CALLEE_SAVE_REGS_THUNK(func) \ ++ extern typeof(func) __raw_callee_save_##func; \ ++ static void *__##func##__ __used = func; \ ++ \ ++ asm(".pushsection .text;" \ ++ "__raw_callee_save_" #func ": " \ ++ PV_SAVE_ALL_CALLER_REGS \ ++ "call " #func ";" \ ++ PV_RESTORE_ALL_CALLER_REGS \ ++ "ret;" \ ++ ".popsection") ++ ++/* Get a reference to a callee-save function */ ++#define PV_CALLEE_SAVE(func) \ ++ ((struct paravirt_callee_save) { __raw_callee_save_##func }) ++ ++/* Promise that "func" already uses the right calling convention */ ++#define __PV_IS_CALLEE_SAVE(func) \ ++ ((struct paravirt_callee_save) { func }) ++ + static inline unsigned long __raw_local_save_flags(void) + { + unsigned long f; + +- asm volatile(paravirt_alt(PV_SAVE_REGS +- PARAVIRT_CALL +- PV_RESTORE_REGS) ++ asm volatile(paravirt_alt(PARAVIRT_CALL) + : "=a"(f) + : paravirt_type(pv_irq_ops.save_fl), + paravirt_clobber(CLBR_EAX) +- : "memory", "cc" PV_VEXTRA_CLOBBERS); ++ : "memory", "cc"); + return f; + } + + static inline void raw_local_irq_restore(unsigned long f) + { +- asm volatile(paravirt_alt(PV_SAVE_REGS +- PARAVIRT_CALL +- PV_RESTORE_REGS) ++ asm volatile(paravirt_alt(PARAVIRT_CALL) + : "=a"(f) + : PV_FLAGS_ARG(f), + paravirt_type(pv_irq_ops.restore_fl), + paravirt_clobber(CLBR_EAX) +- : "memory", "cc" PV_EXTRA_CLOBBERS); ++ : "memory", "cc"); + } + + static inline void raw_local_irq_disable(void) + { +- asm volatile(paravirt_alt(PV_SAVE_REGS +- PARAVIRT_CALL +- PV_RESTORE_REGS) ++ asm volatile(paravirt_alt(PARAVIRT_CALL) + : + : paravirt_type(pv_irq_ops.irq_disable), + paravirt_clobber(CLBR_EAX) +- : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); ++ : "memory", "eax", "cc"); + } + + static inline void raw_local_irq_enable(void) + { +- asm volatile(paravirt_alt(PV_SAVE_REGS +- PARAVIRT_CALL +- PV_RESTORE_REGS) ++ asm volatile(paravirt_alt(PARAVIRT_CALL) + : + : paravirt_type(pv_irq_ops.irq_enable), + paravirt_clobber(CLBR_EAX) +- : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); ++ : "memory", "eax", "cc"); + } + + static inline unsigned long __raw_local_irq_save(void) +@@ -1529,33 +1652,49 @@ static inline unsigned long __raw_local_ + .popsection + + ++#define COND_PUSH(set, mask, reg) \ ++ .if ((~(set)) & mask); push %reg; .endif ++#define COND_POP(set, mask, reg) \ ++ .if ((~(set)) & mask); pop %reg; .endif ++ + #ifdef CONFIG_X86_64 +-#define PV_SAVE_REGS \ +- push %rax; \ +- push %rcx; \ +- push %rdx; \ +- push %rsi; \ +- push %rdi; \ +- push %r8; \ +- push %r9; \ +- push %r10; \ +- push %r11 +-#define PV_RESTORE_REGS \ +- pop %r11; \ +- pop %r10; \ +- pop %r9; \ +- pop %r8; \ +- pop %rdi; \ +- pop %rsi; \ +- pop %rdx; \ +- pop %rcx; \ +- pop %rax ++ ++#define PV_SAVE_REGS(set) \ ++ COND_PUSH(set, CLBR_RAX, rax); \ ++ COND_PUSH(set, CLBR_RCX, rcx); \ ++ COND_PUSH(set, CLBR_RDX, rdx); \ ++ COND_PUSH(set, CLBR_RSI, rsi); \ ++ COND_PUSH(set, CLBR_RDI, rdi); \ ++ COND_PUSH(set, CLBR_R8, r8); \ ++ COND_PUSH(set, CLBR_R9, r9); \ ++ COND_PUSH(set, CLBR_R10, r10); \ ++ COND_PUSH(set, CLBR_R11, r11) ++#define PV_RESTORE_REGS(set) \ ++ COND_POP(set, CLBR_R11, r11); \ ++ COND_POP(set, CLBR_R10, r10); \ ++ COND_POP(set, CLBR_R9, r9); \ ++ COND_POP(set, CLBR_R8, r8); \ ++ COND_POP(set, CLBR_RDI, rdi); \ ++ COND_POP(set, CLBR_RSI, rsi); \ ++ COND_POP(set, CLBR_RDX, rdx); \ ++ COND_POP(set, CLBR_RCX, rcx); \ ++ COND_POP(set, CLBR_RAX, rax) ++ + #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) + #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) + #define 
PARA_INDIRECT(addr) *addr(%rip) + #else +-#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx +-#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax ++#define PV_SAVE_REGS(set) \ ++ COND_PUSH(set, CLBR_EAX, eax); \ ++ COND_PUSH(set, CLBR_EDI, edi); \ ++ COND_PUSH(set, CLBR_ECX, ecx); \ ++ COND_PUSH(set, CLBR_EDX, edx) ++#define PV_RESTORE_REGS(set) \ ++ COND_POP(set, CLBR_EDX, edx); \ ++ COND_POP(set, CLBR_ECX, ecx); \ ++ COND_POP(set, CLBR_EDI, edi); \ ++ COND_POP(set, CLBR_EAX, eax) ++ + #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) + #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) + #define PARA_INDIRECT(addr) *%cs:addr +@@ -1567,15 +1706,15 @@ static inline unsigned long __raw_local_ + + #define DISABLE_INTERRUPTS(clobbers) \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ +- PV_SAVE_REGS; \ ++ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ +- PV_RESTORE_REGS;) \ ++ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) + + #define ENABLE_INTERRUPTS(clobbers) \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ +- PV_SAVE_REGS; \ ++ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ +- PV_RESTORE_REGS;) ++ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) + + #define USERGS_SYSRET32 \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ +@@ -1605,11 +1744,15 @@ static inline unsigned long __raw_local_ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ + swapgs) + ++/* ++ * Note: swapgs is very special, and in practise is either going to be ++ * implemented with a single "swapgs" instruction or something very ++ * special. Either way, we don't need to save any registers for ++ * it. 
++ */ + #define SWAPGS \ + PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ +- PV_SAVE_REGS; \ +- call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \ +- PV_RESTORE_REGS \ ++ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \ + ) + + #define GET_CR2_INTO_RCX \ +Index: linux-2.6-tip/arch/x86/include/asm/pat.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pat.h ++++ linux-2.6-tip/arch/x86/include/asm/pat.h +@@ -2,13 +2,12 @@ + #define _ASM_X86_PAT_H + + #include ++#include + + #ifdef CONFIG_X86_PAT + extern int pat_enabled; +-extern void validate_pat_support(struct cpuinfo_x86 *c); + #else + static const int pat_enabled; +-static inline void validate_pat_support(struct cpuinfo_x86 *c) { } + #endif + + extern void pat_init(void); +@@ -17,6 +16,11 @@ extern int reserve_memtype(u64 start, u6 + unsigned long req_type, unsigned long *ret_type); + extern int free_memtype(u64 start, u64 end); + +-extern void pat_disable(char *reason); ++extern int kernel_map_sync_memtype(u64 base, unsigned long size, ++ unsigned long flag); ++extern void map_devmem(unsigned long pfn, unsigned long size, ++ struct pgprot vma_prot); ++extern void unmap_devmem(unsigned long pfn, unsigned long size, ++ struct pgprot vma_prot); + + #endif /* _ASM_X86_PAT_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pci-functions.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/pci-functions.h +@@ -0,0 +1,19 @@ ++/* ++ * PCI BIOS function numbering for conventional PCI BIOS ++ * systems ++ */ ++ ++#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX ++#define PCIBIOS_PCI_BIOS_PRESENT 0xb101 ++#define PCIBIOS_FIND_PCI_DEVICE 0xb102 ++#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103 ++#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106 ++#define PCIBIOS_READ_CONFIG_BYTE 0xb108 ++#define PCIBIOS_READ_CONFIG_WORD 0xb109 ++#define PCIBIOS_READ_CONFIG_DWORD 0xb10a ++#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b ++#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c ++#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d ++#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e ++#define PCIBIOS_SET_PCI_HW_INT 0xb10f ++ +Index: linux-2.6-tip/arch/x86/include/asm/pci.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pci.h ++++ linux-2.6-tip/arch/x86/include/asm/pci.h +@@ -109,11 +109,6 @@ static inline int __pcibus_to_node(const + return sd->node; + } + +-static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus) +-{ +- return node_to_cpumask(__pcibus_to_node(bus)); +-} +- + static inline const struct cpumask * + cpumask_of_pcibus(const struct pci_bus *bus) + { +Index: linux-2.6-tip/arch/x86/include/asm/pda.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pda.h ++++ /dev/null +@@ -1,137 +0,0 @@ +-#ifndef _ASM_X86_PDA_H +-#define _ASM_X86_PDA_H +- +-#ifndef __ASSEMBLY__ +-#include +-#include +-#include +-#include +- +-/* Per processor datastructure. %gs points to it while the kernel runs */ +-struct x8664_pda { +- struct task_struct *pcurrent; /* 0 Current process */ +- unsigned long data_offset; /* 8 Per cpu data offset from linker +- address */ +- unsigned long kernelstack; /* 16 top of kernel stack for current */ +- unsigned long oldrsp; /* 24 user rsp for system call */ +- int irqcount; /* 32 Irq nesting counter. 
Starts -1 */ +- unsigned int cpunumber; /* 36 Logical CPU number */ +-#ifdef CONFIG_CC_STACKPROTECTOR +- unsigned long stack_canary; /* 40 stack canary value */ +- /* gcc-ABI: this canary MUST be at +- offset 40!!! */ +-#endif +- char *irqstackptr; +- short nodenumber; /* number of current node (32k max) */ +- short in_bootmem; /* pda lives in bootmem */ +- unsigned int __softirq_pending; +- unsigned int __nmi_count; /* number of NMI on this CPUs */ +- short mmu_state; +- short isidle; +- struct mm_struct *active_mm; +- unsigned apic_timer_irqs; +- unsigned irq0_irqs; +- unsigned irq_resched_count; +- unsigned irq_call_count; +- unsigned irq_tlb_count; +- unsigned irq_thermal_count; +- unsigned irq_threshold_count; +- unsigned irq_spurious_count; +-} ____cacheline_aligned_in_smp; +- +-extern struct x8664_pda **_cpu_pda; +-extern void pda_init(int); +- +-#define cpu_pda(i) (_cpu_pda[i]) +- +-/* +- * There is no fast way to get the base address of the PDA, all the accesses +- * have to mention %fs/%gs. So it needs to be done this Torvaldian way. +- */ +-extern void __bad_pda_field(void) __attribute__((noreturn)); +- +-/* +- * proxy_pda doesn't actually exist, but tell gcc it is accessed for +- * all PDA accesses so it gets read/write dependencies right. +- */ +-extern struct x8664_pda _proxy_pda; +- +-#define pda_offset(field) offsetof(struct x8664_pda, field) +- +-#define pda_to_op(op, field, val) \ +-do { \ +- typedef typeof(_proxy_pda.field) T__; \ +- if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \ +- switch (sizeof(_proxy_pda.field)) { \ +- case 2: \ +- asm(op "w %1,%%gs:%c2" : \ +- "+m" (_proxy_pda.field) : \ +- "ri" ((T__)val), \ +- "i"(pda_offset(field))); \ +- break; \ +- case 4: \ +- asm(op "l %1,%%gs:%c2" : \ +- "+m" (_proxy_pda.field) : \ +- "ri" ((T__)val), \ +- "i" (pda_offset(field))); \ +- break; \ +- case 8: \ +- asm(op "q %1,%%gs:%c2": \ +- "+m" (_proxy_pda.field) : \ +- "ri" ((T__)val), \ +- "i"(pda_offset(field))); \ +- break; \ +- default: \ +- __bad_pda_field(); \ +- } \ +-} while (0) +- +-#define pda_from_op(op, field) \ +-({ \ +- typeof(_proxy_pda.field) ret__; \ +- switch (sizeof(_proxy_pda.field)) { \ +- case 2: \ +- asm(op "w %%gs:%c1,%0" : \ +- "=r" (ret__) : \ +- "i" (pda_offset(field)), \ +- "m" (_proxy_pda.field)); \ +- break; \ +- case 4: \ +- asm(op "l %%gs:%c1,%0": \ +- "=r" (ret__): \ +- "i" (pda_offset(field)), \ +- "m" (_proxy_pda.field)); \ +- break; \ +- case 8: \ +- asm(op "q %%gs:%c1,%0": \ +- "=r" (ret__) : \ +- "i" (pda_offset(field)), \ +- "m" (_proxy_pda.field)); \ +- break; \ +- default: \ +- __bad_pda_field(); \ +- } \ +- ret__; \ +-}) +- +-#define read_pda(field) pda_from_op("mov", field) +-#define write_pda(field, val) pda_to_op("mov", field, val) +-#define add_pda(field, val) pda_to_op("add", field, val) +-#define sub_pda(field, val) pda_to_op("sub", field, val) +-#define or_pda(field, val) pda_to_op("or", field, val) +- +-/* This is not atomic against other CPUs -- CPU preemption needs to be off */ +-#define test_and_clear_bit_pda(bit, field) \ +-({ \ +- int old__; \ +- asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \ +- : "=r" (old__), "+m" (_proxy_pda.field) \ +- : "dIr" (bit), "i" (pda_offset(field)) : "memory");\ +- old__; \ +-}) +- +-#endif +- +-#define PDA_STACKOFFSET (5*8) +- +-#endif /* _ASM_X86_PDA_H */ +Index: linux-2.6-tip/arch/x86/include/asm/percpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/percpu.h ++++ linux-2.6-tip/arch/x86/include/asm/percpu.h 
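The pda.h removal above and the percpu.h rework that begins below replace the old read_pda()/write_pda() accessors, which hard-coded %gs-relative moves into the x8664_pda structure, with generic percpu_read()/percpu_write() operations that dispatch on the operand size and, on SMP builds, prefix the memory operand with %gs: (x86-64) or %fs: (i386). The stand-alone sketch below is not part of the patch; it only illustrates that sizeof-dispatch inline-asm pattern on an ordinary variable, with the segment prefix deliberately left out so the example builds and runs as a plain user-space program on x86-64 with gcc.

/* Illustrative sketch only -- not part of the patch.
 * Mimics the percpu_to_op()/percpu_from_op() shape: pick the mov width
 * from sizeof(var) and issue it through inline asm.  The kernel macros
 * additionally prefix the memory operand with %gs: (x86-64) or %fs:
 * (i386) so the access lands at the per-CPU offset; that prefix is
 * omitted here so this compiles and runs in user space.
 */
#include <stdio.h>

#define demo_write(var, val)                                            \
do {                                                                    \
        switch (sizeof(var)) {                                          \
        case 4:                                                         \
                asm("movl %1,%0" : "+m" (var)                           \
                    : "ri" ((unsigned int)(val)));                      \
                break;                                                  \
        case 8:                                                         \
                asm("movq %1,%0" : "+m" (var)                           \
                    : "re" ((unsigned long)(val)));                     \
                break;                                                  \
        }                                                               \
} while (0)

#define demo_read(var)                                                  \
({                                                                      \
        typeof(var) ret__ = 0;                                          \
        switch (sizeof(var)) {                                          \
        case 4:                                                         \
                asm("movl %1,%0" : "=r" (ret__) : "m" (var));           \
                break;                                                  \
        case 8:                                                         \
                asm("movq %1,%0" : "=r" (ret__) : "m" (var));           \
                break;                                                  \
        }                                                               \
        ret__;                                                          \
})

int main(void)
{
        unsigned long counter = 0;

        demo_write(counter, 42UL);              /* movq $42, counter */
        printf("counter = %lu\n", demo_read(counter));
        return 0;
}

The kernel variant reaches the per-CPU copy of a variable purely through the segment override, which is why the same percpu_read()/percpu_write() source works for both the boot-time init_per_cpu area and the regular per-CPU sections once %gs (or %fs) has been pointed at the right base.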
+@@ -2,53 +2,12 @@ + #define _ASM_X86_PERCPU_H + + #ifdef CONFIG_X86_64 +-#include +- +-/* Same as asm-generic/percpu.h, except that we store the per cpu offset +- in the PDA. Longer term the PDA and every per cpu variable +- should be just put into a single section and referenced directly +- from %gs */ +- +-#ifdef CONFIG_SMP +-#include +- +-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset) +-#define __my_cpu_offset read_pda(data_offset) +- +-#define per_cpu_offset(x) (__per_cpu_offset(x)) +- ++#define __percpu_seg gs ++#define __percpu_mov_op movq ++#else ++#define __percpu_seg fs ++#define __percpu_mov_op movl + #endif +-#include +- +-DECLARE_PER_CPU(struct x8664_pda, pda); +- +-/* +- * These are supposed to be implemented as a single instruction which +- * operates on the per-cpu data base segment. x86-64 doesn't have +- * that yet, so this is a fairly inefficient workaround for the +- * meantime. The single instruction is atomic with respect to +- * preemption and interrupts, so we need to explicitly disable +- * interrupts here to achieve the same effect. However, because it +- * can be used from within interrupt-disable/enable, we can't actually +- * disable interrupts; disabling preemption is enough. +- */ +-#define x86_read_percpu(var) \ +- ({ \ +- typeof(per_cpu_var(var)) __tmp; \ +- preempt_disable(); \ +- __tmp = __get_cpu_var(var); \ +- preempt_enable(); \ +- __tmp; \ +- }) +- +-#define x86_write_percpu(var, val) \ +- do { \ +- preempt_disable(); \ +- __get_cpu_var(var) = (val); \ +- preempt_enable(); \ +- } while(0) +- +-#else /* CONFIG_X86_64 */ + + #ifdef __ASSEMBLY__ + +@@ -65,47 +24,48 @@ DECLARE_PER_CPU(struct x8664_pda, pda); + * PER_CPU(cpu_gdt_descr, %ebx) + */ + #ifdef CONFIG_SMP +-#define PER_CPU(var, reg) \ +- movl %fs:per_cpu__##this_cpu_off, reg; \ ++#define PER_CPU(var, reg) \ ++ __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \ + lea per_cpu__##var(reg), reg +-#define PER_CPU_VAR(var) %fs:per_cpu__##var ++#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var + #else /* ! SMP */ +-#define PER_CPU(var, reg) \ +- movl $per_cpu__##var, reg ++#define PER_CPU(var, reg) \ ++ __percpu_mov_op $per_cpu__##var, reg + #define PER_CPU_VAR(var) per_cpu__##var + #endif /* SMP */ + ++#ifdef CONFIG_X86_64_SMP ++#define INIT_PER_CPU_VAR(var) init_per_cpu__##var ++#else ++#define INIT_PER_CPU_VAR(var) per_cpu__##var ++#endif ++ + #else /* ...!ASSEMBLY */ + ++#include ++ ++#ifdef CONFIG_SMP ++#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x ++#define __my_cpu_offset percpu_read(this_cpu_off) ++#else ++#define __percpu_arg(x) "%" #x ++#endif ++ + /* +- * PER_CPU finds an address of a per-cpu variable. ++ * Initialized pointers to per-cpu variables needed for the boot ++ * processor need to use these macros to get the proper address ++ * offset from __per_cpu_load on SMP. + * +- * Args: +- * var - variable name +- * cpu - 32bit register containing the current CPU number +- * +- * The resulting address is stored in the "cpu" argument. 
+- * +- * Example: +- * PER_CPU(cpu_gdt_descr, %ebx) ++ * There also must be an entry in vmlinux_64.lds.S + */ +-#ifdef CONFIG_SMP +- +-#define __my_cpu_offset x86_read_percpu(this_cpu_off) +- +-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */ +-#define __percpu_seg "%%fs:" +- +-#else /* !SMP */ +- +-#define __percpu_seg "" +- +-#endif /* SMP */ +- +-#include ++#define DECLARE_INIT_PER_CPU(var) \ ++ extern typeof(per_cpu_var(var)) init_per_cpu_var(var) + +-/* We can use this directly for local CPU (faster). */ +-DECLARE_PER_CPU(unsigned long, this_cpu_off); ++#ifdef CONFIG_X86_64_SMP ++#define init_per_cpu_var(var) init_per_cpu__##var ++#else ++#define init_per_cpu_var(var) per_cpu_var(var) ++#endif + + /* For arch-specific code, we can use direct single-insn ops (they + * don't give an lvalue though). */ +@@ -120,20 +80,25 @@ do { \ + } \ + switch (sizeof(var)) { \ + case 1: \ +- asm(op "b %1,"__percpu_seg"%0" \ ++ asm(op "b %1,"__percpu_arg(0) \ + : "+m" (var) \ + : "ri" ((T__)val)); \ + break; \ + case 2: \ +- asm(op "w %1,"__percpu_seg"%0" \ ++ asm(op "w %1,"__percpu_arg(0) \ + : "+m" (var) \ + : "ri" ((T__)val)); \ + break; \ + case 4: \ +- asm(op "l %1,"__percpu_seg"%0" \ ++ asm(op "l %1,"__percpu_arg(0) \ + : "+m" (var) \ + : "ri" ((T__)val)); \ + break; \ ++ case 8: \ ++ asm(op "q %1,"__percpu_arg(0) \ ++ : "+m" (var) \ ++ : "re" ((T__)val)); \ ++ break; \ + default: __bad_percpu_size(); \ + } \ + } while (0) +@@ -143,17 +108,22 @@ do { \ + typeof(var) ret__; \ + switch (sizeof(var)) { \ + case 1: \ +- asm(op "b "__percpu_seg"%1,%0" \ ++ asm(op "b "__percpu_arg(1)",%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 2: \ +- asm(op "w "__percpu_seg"%1,%0" \ ++ asm(op "w "__percpu_arg(1)",%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 4: \ +- asm(op "l "__percpu_seg"%1,%0" \ ++ asm(op "l "__percpu_arg(1)",%0" \ ++ : "=r" (ret__) \ ++ : "m" (var)); \ ++ break; \ ++ case 8: \ ++ asm(op "q "__percpu_arg(1)",%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ +@@ -162,13 +132,30 @@ do { \ + ret__; \ + }) + +-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) +-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val) +-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val) +-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val) +-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) ++#define percpu_read(var) percpu_from_op("mov", per_cpu__##var) ++#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) ++#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) ++#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) ++#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) ++#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) ++#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) ++ ++/* This is not atomic against other CPUs -- CPU preemption needs to be off */ ++#define x86_test_and_clear_bit_percpu(bit, var) \ ++({ \ ++ int old__; \ ++ asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ ++ : "=r" (old__), "+m" (per_cpu__##var) \ ++ : "dIr" (bit)); \ ++ old__; \ ++}) ++ ++#include ++ ++/* We can use this directly for local CPU (faster). 
*/ ++DECLARE_PER_CPU(unsigned long, this_cpu_off); ++ + #endif /* !__ASSEMBLY__ */ +-#endif /* !CONFIG_X86_64 */ + + #ifdef CONFIG_SMP + +@@ -195,9 +182,9 @@ do { \ + #define early_per_cpu_ptr(_name) (_name##_early_ptr) + #define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) + #define early_per_cpu(_name, _cpu) \ +- (early_per_cpu_ptr(_name) ? \ +- early_per_cpu_ptr(_name)[_cpu] : \ +- per_cpu(_name, _cpu)) ++ *(early_per_cpu_ptr(_name) ? \ ++ &early_per_cpu_ptr(_name)[_cpu] : \ ++ &per_cpu(_name, _cpu)) + + #else /* !CONFIG_SMP */ + #define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ +Index: linux-2.6-tip/arch/x86/include/asm/perf_counter.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/perf_counter.h +@@ -0,0 +1,98 @@ ++#ifndef _ASM_X86_PERF_COUNTER_H ++#define _ASM_X86_PERF_COUNTER_H ++ ++/* ++ * Performance counter hw details: ++ */ ++ ++#define X86_PMC_MAX_GENERIC 8 ++#define X86_PMC_MAX_FIXED 3 ++ ++#define X86_PMC_IDX_GENERIC 0 ++#define X86_PMC_IDX_FIXED 32 ++#define X86_PMC_IDX_MAX 64 ++ ++#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 ++#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 ++ ++#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 ++#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 ++ ++#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) ++#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) ++#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) ++#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) ++ ++/* ++ * Includes eventsel and unit mask as well: ++ */ ++#define ARCH_PERFMON_EVENT_MASK 0xffff ++ ++#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c ++#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) ++#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 ++#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ ++ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) ++ ++#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 ++ ++/* ++ * Intel "Architectural Performance Monitoring" CPUID ++ * detection/enumeration details: ++ */ ++union cpuid10_eax { ++ struct { ++ unsigned int version_id:8; ++ unsigned int num_counters:8; ++ unsigned int bit_width:8; ++ unsigned int mask_length:8; ++ } split; ++ unsigned int full; ++}; ++ ++union cpuid10_edx { ++ struct { ++ unsigned int num_counters_fixed:4; ++ unsigned int reserved:28; ++ } split; ++ unsigned int full; ++}; ++ ++ ++/* ++ * Fixed-purpose performance counters: ++ */ ++ ++/* ++ * All 3 fixed-mode PMCs are configured via this single MSR: ++ */ ++#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d ++ ++/* ++ * The counts are available in three separate MSRs: ++ */ ++ ++/* Instr_Retired.Any: */ ++#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 ++#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) ++ ++/* CPU_CLK_Unhalted.Core: */ ++#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a ++#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) ++ ++/* CPU_CLK_Unhalted.Ref: */ ++#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b ++#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) ++ ++#define set_perf_counter_pending() \ ++ set_tsk_thread_flag(current, TIF_PERF_COUNTERS); ++ ++#ifdef CONFIG_PERF_COUNTERS ++extern void init_hw_perf_counters(void); ++extern void perf_counters_lapic_init(int nmi); ++#else ++static inline void init_hw_perf_counters(void) { } ++static inline void perf_counters_lapic_init(int nmi) { } ++#endif ++ ++#endif /* _ASM_X86_PERF_COUNTER_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pgtable-2level-defs.h +=================================================================== +--- 
linux-2.6-tip.orig/arch/x86/include/asm/pgtable-2level-defs.h ++++ /dev/null +@@ -1,20 +0,0 @@ +-#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H +-#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H +- +-#define SHARED_KERNEL_PMD 0 +- +-/* +- * traditional i386 two-level paging structure: +- */ +- +-#define PGDIR_SHIFT 22 +-#define PTRS_PER_PGD 1024 +- +-/* +- * the i386 is two-level, so we don't really have any +- * PMD directory physically. +- */ +- +-#define PTRS_PER_PTE 1024 +- +-#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pgtable-2level.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pgtable-2level.h ++++ linux-2.6-tip/arch/x86/include/asm/pgtable-2level.h +@@ -26,13 +26,6 @@ static inline void native_set_pte_atomic + native_set_pte(ptep, pte); + } + +-static inline void native_set_pte_present(struct mm_struct *mm, +- unsigned long addr, +- pte_t *ptep, pte_t pte) +-{ +- native_set_pte(ptep, pte); +-} +- + static inline void native_pmd_clear(pmd_t *pmdp) + { + native_set_pmd(pmdp, __pmd(0)); +@@ -53,8 +46,6 @@ static inline pte_t native_ptep_get_and_ + #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) + #endif + +-#define pte_none(x) (!(x).pte_low) +- + /* + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, + * split up the 29 bits of offset into this range: +Index: linux-2.6-tip/arch/x86/include/asm/pgtable-2level_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/pgtable-2level_types.h +@@ -0,0 +1,37 @@ ++#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H ++#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H ++ ++#ifndef __ASSEMBLY__ ++#include ++ ++typedef unsigned long pteval_t; ++typedef unsigned long pmdval_t; ++typedef unsigned long pudval_t; ++typedef unsigned long pgdval_t; ++typedef unsigned long pgprotval_t; ++ ++typedef union { ++ pteval_t pte; ++ pteval_t pte_low; ++} pte_t; ++#endif /* !__ASSEMBLY__ */ ++ ++#define SHARED_KERNEL_PMD 0 ++#define PAGETABLE_LEVELS 2 ++ ++/* ++ * traditional i386 two-level paging structure: ++ */ ++ ++#define PGDIR_SHIFT 22 ++#define PTRS_PER_PGD 1024 ++ ++ ++/* ++ * the i386 is two-level, so we don't really have any ++ * PMD directory physically. 
++ */ ++ ++#define PTRS_PER_PTE 1024 ++ ++#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pgtable-3level-defs.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pgtable-3level-defs.h ++++ /dev/null +@@ -1,28 +0,0 @@ +-#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H +-#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H +- +-#ifdef CONFIG_PARAVIRT +-#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) +-#else +-#define SHARED_KERNEL_PMD 1 +-#endif +- +-/* +- * PGDIR_SHIFT determines what a top-level page table entry can map +- */ +-#define PGDIR_SHIFT 30 +-#define PTRS_PER_PGD 4 +- +-/* +- * PMD_SHIFT determines the size of the area a middle-level +- * page table can map +- */ +-#define PMD_SHIFT 21 +-#define PTRS_PER_PMD 512 +- +-/* +- * entries per page directory level +- */ +-#define PTRS_PER_PTE 512 +- +-#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pgtable-3level.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pgtable-3level.h ++++ linux-2.6-tip/arch/x86/include/asm/pgtable-3level.h +@@ -18,21 +18,6 @@ + printk("%s:%d: bad pgd %p(%016Lx).\n", \ + __FILE__, __LINE__, &(e), pgd_val(e)) + +-static inline int pud_none(pud_t pud) +-{ +- return pud_val(pud) == 0; +-} +- +-static inline int pud_bad(pud_t pud) +-{ +- return (pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0; +-} +- +-static inline int pud_present(pud_t pud) +-{ +- return pud_val(pud) & _PAGE_PRESENT; +-} +- + /* Rules for using set_pte: the pte being assigned *must* be + * either not present or in a state where the hardware will + * not attempt to update the pte. In places where this is +@@ -46,23 +31,6 @@ static inline void native_set_pte(pte_t + ptep->pte_low = pte.pte_low; + } + +-/* +- * Since this is only called on user PTEs, and the page fault handler +- * must handle the already racy situation of simultaneous page faults, +- * we are justified in merely clearing the PTE present bit, followed +- * by a set. The ordering here is important. +- */ +-static inline void native_set_pte_present(struct mm_struct *mm, +- unsigned long addr, +- pte_t *ptep, pte_t pte) +-{ +- ptep->pte_low = 0; +- smp_wmb(); +- ptep->pte_high = pte.pte_high; +- smp_wmb(); +- ptep->pte_low = pte.pte_low; +-} +- + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) + { + set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); +@@ -103,6 +71,7 @@ static inline void pud_clear(pud_t *pudp + { + unsigned long pgd; + ++ preempt_disable(); + set_pud(pudp, __pud(0)); + + /* +@@ -118,17 +87,9 @@ static inline void pud_clear(pud_t *pudp + if (__pa(pudp) >= pgd && __pa(pudp) < + (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) + write_cr3(pgd); ++ preempt_enable(); + } + +-#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) +- +-#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK)) +- +- +-/* Find an entry in the second-level page table.. 
*/ +-#define pmd_offset(pud, address) ((pmd_t *)pud_page_vaddr(*(pud)) + \ +- pmd_index(address)) +- + #ifdef CONFIG_SMP + static inline pte_t native_ptep_get_and_clear(pte_t *ptep) + { +@@ -145,17 +106,6 @@ static inline pte_t native_ptep_get_and_ + #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) + #endif + +-#define __HAVE_ARCH_PTE_SAME +-static inline int pte_same(pte_t a, pte_t b) +-{ +- return a.pte_low == b.pte_low && a.pte_high == b.pte_high; +-} +- +-static inline int pte_none(pte_t pte) +-{ +- return !pte.pte_low && !pte.pte_high; +-} +- + /* + * Bits 0, 6 and 7 are taken in the low part of the pte, + * put the 32 bits of offset into the high part. +Index: linux-2.6-tip/arch/x86/include/asm/pgtable-3level_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/pgtable-3level_types.h +@@ -0,0 +1,48 @@ ++#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H ++#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H ++ ++#ifndef __ASSEMBLY__ ++#include ++ ++typedef u64 pteval_t; ++typedef u64 pmdval_t; ++typedef u64 pudval_t; ++typedef u64 pgdval_t; ++typedef u64 pgprotval_t; ++ ++typedef union { ++ struct { ++ unsigned long pte_low, pte_high; ++ }; ++ pteval_t pte; ++} pte_t; ++#endif /* !__ASSEMBLY__ */ ++ ++#ifdef CONFIG_PARAVIRT ++#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd) ++#else ++#define SHARED_KERNEL_PMD 1 ++#endif ++ ++#define PAGETABLE_LEVELS 3 ++ ++/* ++ * PGDIR_SHIFT determines what a top-level page table entry can map ++ */ ++#define PGDIR_SHIFT 30 ++#define PTRS_PER_PGD 4 ++ ++/* ++ * PMD_SHIFT determines the size of the area a middle-level ++ * page table can map ++ */ ++#define PMD_SHIFT 21 ++#define PTRS_PER_PMD 512 ++ ++/* ++ * entries per page directory level ++ */ ++#define PTRS_PER_PTE 512 ++ ++ ++#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pgtable.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pgtable.h ++++ linux-2.6-tip/arch/x86/include/asm/pgtable.h +@@ -1,164 +1,9 @@ + #ifndef _ASM_X86_PGTABLE_H + #define _ASM_X86_PGTABLE_H + +-#define FIRST_USER_ADDRESS 0 ++#include + +-#define _PAGE_BIT_PRESENT 0 /* is present */ +-#define _PAGE_BIT_RW 1 /* writeable */ +-#define _PAGE_BIT_USER 2 /* userspace addressable */ +-#define _PAGE_BIT_PWT 3 /* page write through */ +-#define _PAGE_BIT_PCD 4 /* page cache disabled */ +-#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +-#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ +-#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +-#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +-#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +-#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ +-#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ +-#define _PAGE_BIT_UNUSED3 11 +-#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +-#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +-#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +- +-/* If _PAGE_BIT_PRESENT is clear, we use these: */ +-/* - if the user mapped it with PROT_NONE; pte_present gives true */ +-#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +-/* - set: nonlinear file mapping, saved PTE; unset:swap */ +-#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY +- +-#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +-#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +-#define 
_PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +-#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +-#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +-#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +-#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +-#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +-#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +-#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +-#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3) +-#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +-#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +-#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +-#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +-#define __HAVE_ARCH_PTE_SPECIAL +- +-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +-#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +-#else +-#define _PAGE_NX (_AT(pteval_t, 0)) +-#endif +- +-#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +- +-#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ +- _PAGE_ACCESSED | _PAGE_DIRTY) +-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ +- _PAGE_DIRTY) +- +-/* Set of bits not changed in pte_modify */ +-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ +- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) +- +-#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) +-#define _PAGE_CACHE_WB (0) +-#define _PAGE_CACHE_WC (_PAGE_PWT) +-#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +-#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) +- +-#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ +- _PAGE_ACCESSED | _PAGE_NX) +- +-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ +- _PAGE_USER | _PAGE_ACCESSED) +-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ +- _PAGE_ACCESSED | _PAGE_NX) +-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ +- _PAGE_ACCESSED) +-#define PAGE_COPY PAGE_COPY_NOEXEC +-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ +- _PAGE_ACCESSED | _PAGE_NX) +-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ +- _PAGE_ACCESSED) +- +-#define __PAGE_KERNEL_EXEC \ +- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) +-#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) +- +-#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +-#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) +-#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +-#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) +-#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +-#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +-#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) +-#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +-#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) +-#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +- +-#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +-#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +-#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +-#define 
__PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) +- +-#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +-#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +-#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +-#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) +-#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) +-#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +-#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) +-#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) +-#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +-#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) +-#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +-#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) +-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) +- +-#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +-#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +-#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) +-#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) +- +-/* xwr */ +-#define __P000 PAGE_NONE +-#define __P001 PAGE_READONLY +-#define __P010 PAGE_COPY +-#define __P011 PAGE_COPY +-#define __P100 PAGE_READONLY_EXEC +-#define __P101 PAGE_READONLY_EXEC +-#define __P110 PAGE_COPY_EXEC +-#define __P111 PAGE_COPY_EXEC +- +-#define __S000 PAGE_NONE +-#define __S001 PAGE_READONLY +-#define __S010 PAGE_SHARED +-#define __S011 PAGE_SHARED +-#define __S100 PAGE_READONLY_EXEC +-#define __S101 PAGE_READONLY_EXEC +-#define __S110 PAGE_SHARED_EXEC +-#define __S111 PAGE_SHARED_EXEC +- +-/* +- * early identity mapping pte attrib macros. +- */ +-#ifdef CONFIG_X86_64 +-#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +-#else +-/* +- * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection +- * bits are combined, this will alow user to access the high address mapped +- * VDSO in the presence of CONFIG_COMPAT_VDSO +- */ +-#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +-#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +-#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ +-#endif ++#include + + /* + * Macro to mark a page protection value as UC- +@@ -170,9 +15,6 @@ + + #ifndef __ASSEMBLY__ + +-#define pgprot_writecombine pgprot_writecombine +-extern pgprot_t pgprot_writecombine(pgprot_t prot); +- + /* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. 
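The _PAGE_BIT_*/_PAGE_* definitions removed above are single-bit masks that later hunks re-home in pgtable_types.h; composite protections such as _KERNPG_TABLE and _PAGE_TABLE are simply ORs of those bits. As an illustrative aside (not part of the patch), here is a minimal user-space sketch of that composition; pteval_t, the bit numbers and the two composite masks mirror the kernel definitions, while main() and the printf scaffolding are stand-ins added only for the example.

/* Sketch: how the single-bit _PAGE_* masks compose into page-table
 * protections.  Values mirror the kernel's bit layout; the program
 * itself is illustrative scaffolding, not kernel code. */
#include <stdio.h>
#include <inttypes.h>

typedef uint64_t pteval_t;

#define PAGE_BIT_PRESENT   0
#define PAGE_BIT_RW        1
#define PAGE_BIT_USER      2
#define PAGE_BIT_ACCESSED  5
#define PAGE_BIT_DIRTY     6

#define PAGE_PRESENT   ((pteval_t)1 << PAGE_BIT_PRESENT)
#define PAGE_RW        ((pteval_t)1 << PAGE_BIT_RW)
#define PAGE_USER      ((pteval_t)1 << PAGE_BIT_USER)
#define PAGE_ACCESSED  ((pteval_t)1 << PAGE_BIT_ACCESSED)
#define PAGE_DIRTY     ((pteval_t)1 << PAGE_BIT_DIRTY)

/* Kernel page-table pages: present, writable, referenced, dirty - no USER bit. */
#define KERNPG_TABLE   (PAGE_PRESENT | PAGE_RW | PAGE_ACCESSED | PAGE_DIRTY)
/* User page-table pages additionally carry the USER bit. */
#define PAGE_TABLE     (KERNPG_TABLE | PAGE_USER)

int main(void)
{
	printf("_KERNPG_TABLE = %#" PRIx64 "\n", (uint64_t)KERNPG_TABLE); /* 0x63 */
	printf("_PAGE_TABLE   = %#" PRIx64 "\n", (uint64_t)PAGE_TABLE);   /* 0x67 */
	printf("USER bit set in _PAGE_TABLE? %d\n", !!(PAGE_TABLE & PAGE_USER));
	return 0;
}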
+@@ -183,6 +25,64 @@ extern unsigned long empty_zero_page[PAG + extern spinlock_t pgd_lock; + extern struct list_head pgd_list; + ++#ifdef CONFIG_PARAVIRT ++#include ++#else /* !CONFIG_PARAVIRT */ ++#define set_pte(ptep, pte) native_set_pte(ptep, pte) ++#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) ++ ++#define set_pte_atomic(ptep, pte) \ ++ native_set_pte_atomic(ptep, pte) ++ ++#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) ++ ++#ifndef __PAGETABLE_PUD_FOLDED ++#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) ++#define pgd_clear(pgd) native_pgd_clear(pgd) ++#endif ++ ++#ifndef set_pud ++# define set_pud(pudp, pud) native_set_pud(pudp, pud) ++#endif ++ ++#ifndef __PAGETABLE_PMD_FOLDED ++#define pud_clear(pud) native_pud_clear(pud) ++#endif ++ ++#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) ++#define pmd_clear(pmd) native_pmd_clear(pmd) ++ ++#define pte_update(mm, addr, ptep) do { } while (0) ++#define pte_update_defer(mm, addr, ptep) do { } while (0) ++ ++static inline void __init paravirt_pagetable_setup_start(pgd_t *base) ++{ ++ native_pagetable_setup_start(base); ++} ++ ++static inline void __init paravirt_pagetable_setup_done(pgd_t *base) ++{ ++ native_pagetable_setup_done(base); ++} ++ ++#define pgd_val(x) native_pgd_val(x) ++#define __pgd(x) native_make_pgd(x) ++ ++#ifndef __PAGETABLE_PUD_FOLDED ++#define pud_val(x) native_pud_val(x) ++#define __pud(x) native_make_pud(x) ++#endif ++ ++#ifndef __PAGETABLE_PMD_FOLDED ++#define pmd_val(x) native_pmd_val(x) ++#define __pmd(x) native_make_pmd(x) ++#endif ++ ++#define pte_val(x) native_pte_val(x) ++#define __pte(x) native_make_pte(x) ++ ++#endif /* CONFIG_PARAVIRT */ ++ + /* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. 
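The !CONFIG_PARAVIRT block added above routes the generic accessors (set_pte(), pte_val(), __pte(), ...) straight to their native_* counterparts, and the next hunk layers pte_set_flags()/pte_clear_flags() on top of the same native helpers so the pte_mk*() family no longer open-codes __pte(pte_val(pte) | flag). A small stand-alone sketch of that pattern follows (again as an aside, not part of the patch); the helper names mirror the kernel ones, but the types and the two example flag bits are simplified stand-ins.

/* Sketch: the pte_set_flags()/pte_clear_flags() helper pattern over a
 * typed pte wrapper.  Simplified user-space stand-ins for the kernel types. */
#include <assert.h>
#include <stdint.h>

typedef uint64_t pteval_t;
typedef struct { pteval_t pte; } pte_t;

#define PAGE_RW     ((pteval_t)1 << 1)
#define PAGE_DIRTY  ((pteval_t)1 << 6)

static inline pte_t native_make_pte(pteval_t val) { return (pte_t){ .pte = val }; }
static inline pteval_t native_pte_val(pte_t pte)  { return pte.pte; }

static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
	/* OR the requested flag bits into the native value. */
	return native_make_pte(native_pte_val(pte) | set);
}

static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
{
	/* Mask the requested flag bits out of the native value. */
	return native_make_pte(native_pte_val(pte) & ~clear);
}

/* The pte_mk*() family then reduces to one-liners over the two helpers. */
static inline pte_t pte_mkdirty(pte_t pte)   { return pte_set_flags(pte, PAGE_DIRTY); }
static inline pte_t pte_wrprotect(pte_t pte) { return pte_clear_flags(pte, PAGE_RW); }

int main(void)
{
	pte_t pte = native_make_pte(PAGE_RW);

	pte = pte_mkdirty(pte);
	assert(native_pte_val(pte) == (PAGE_RW | PAGE_DIRTY));

	pte = pte_wrprotect(pte);
	assert(native_pte_val(pte) == PAGE_DIRTY);
	return 0;
}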
+@@ -236,72 +136,84 @@ static inline unsigned long pte_pfn(pte_ + + static inline int pmd_large(pmd_t pte) + { +- return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == ++ return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); + } + ++static inline pte_t pte_set_flags(pte_t pte, pteval_t set) ++{ ++ pteval_t v = native_pte_val(pte); ++ ++ return native_make_pte(v | set); ++} ++ ++static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) ++{ ++ pteval_t v = native_pte_val(pte); ++ ++ return native_make_pte(v & ~clear); ++} ++ + static inline pte_t pte_mkclean(pte_t pte) + { +- return __pte(pte_val(pte) & ~_PAGE_DIRTY); ++ return pte_clear_flags(pte, _PAGE_DIRTY); + } + + static inline pte_t pte_mkold(pte_t pte) + { +- return __pte(pte_val(pte) & ~_PAGE_ACCESSED); ++ return pte_clear_flags(pte, _PAGE_ACCESSED); + } + + static inline pte_t pte_wrprotect(pte_t pte) + { +- return __pte(pte_val(pte) & ~_PAGE_RW); ++ return pte_clear_flags(pte, _PAGE_RW); + } + + static inline pte_t pte_mkexec(pte_t pte) + { +- return __pte(pte_val(pte) & ~_PAGE_NX); ++ return pte_clear_flags(pte, _PAGE_NX); + } + + static inline pte_t pte_mkdirty(pte_t pte) + { +- return __pte(pte_val(pte) | _PAGE_DIRTY); ++ return pte_set_flags(pte, _PAGE_DIRTY); + } + + static inline pte_t pte_mkyoung(pte_t pte) + { +- return __pte(pte_val(pte) | _PAGE_ACCESSED); ++ return pte_set_flags(pte, _PAGE_ACCESSED); + } + + static inline pte_t pte_mkwrite(pte_t pte) + { +- return __pte(pte_val(pte) | _PAGE_RW); ++ return pte_set_flags(pte, _PAGE_RW); + } + + static inline pte_t pte_mkhuge(pte_t pte) + { +- return __pte(pte_val(pte) | _PAGE_PSE); ++ return pte_set_flags(pte, _PAGE_PSE); + } + + static inline pte_t pte_clrhuge(pte_t pte) + { +- return __pte(pte_val(pte) & ~_PAGE_PSE); ++ return pte_clear_flags(pte, _PAGE_PSE); + } + + static inline pte_t pte_mkglobal(pte_t pte) + { +- return __pte(pte_val(pte) | _PAGE_GLOBAL); ++ return pte_set_flags(pte, _PAGE_GLOBAL); + } + + static inline pte_t pte_clrglobal(pte_t pte) + { +- return __pte(pte_val(pte) & ~_PAGE_GLOBAL); ++ return pte_clear_flags(pte, _PAGE_GLOBAL); + } + + static inline pte_t pte_mkspecial(pte_t pte) + { +- return __pte(pte_val(pte) | _PAGE_SPECIAL); ++ return pte_set_flags(pte, _PAGE_SPECIAL); + } + +-extern pteval_t __supported_pte_mask; +- + /* + * Mask out unsupported bits in a present pgprot. Non-present pgprots + * can use those bits for other purposes, so leave them be. +@@ -374,82 +286,202 @@ static inline int is_new_memtype_allowed + return 1; + } + +-#ifndef __ASSEMBLY__ +-/* Indicate that x86 has its own track and untrack pfn vma functions */ +-#define __HAVE_PFNMAP_TRACKING +- +-#define __HAVE_PHYS_MEM_ACCESS_PROT +-struct file; +-pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, +- unsigned long size, pgprot_t vma_prot); +-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, +- unsigned long size, pgprot_t *vma_prot); +-#endif +- +-/* Install a pte for a particular vaddr in kernel space. 
*/ +-void set_pte_vaddr(unsigned long vaddr, pte_t pte); ++pmd_t *populate_extra_pmd(unsigned long vaddr); ++pte_t *populate_extra_pte(unsigned long vaddr); ++#endif /* __ASSEMBLY__ */ + + #ifdef CONFIG_X86_32 +-extern void native_pagetable_setup_start(pgd_t *base); +-extern void native_pagetable_setup_done(pgd_t *base); ++# include "pgtable_32.h" + #else +-static inline void native_pagetable_setup_start(pgd_t *base) {} +-static inline void native_pagetable_setup_done(pgd_t *base) {} ++# include "pgtable_64.h" + #endif + +-struct seq_file; +-extern void arch_report_meminfo(struct seq_file *m); ++#ifndef __ASSEMBLY__ ++#include + +-#ifdef CONFIG_PARAVIRT +-#include +-#else /* !CONFIG_PARAVIRT */ +-#define set_pte(ptep, pte) native_set_pte(ptep, pte) +-#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) ++static inline int pte_none(pte_t pte) ++{ ++ return !pte.pte; ++} + +-#define set_pte_present(mm, addr, ptep, pte) \ +- native_set_pte_present(mm, addr, ptep, pte) +-#define set_pte_atomic(ptep, pte) \ +- native_set_pte_atomic(ptep, pte) ++#define __HAVE_ARCH_PTE_SAME ++static inline int pte_same(pte_t a, pte_t b) ++{ ++ return a.pte == b.pte; ++} + +-#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) ++static inline int pte_present(pte_t a) ++{ ++ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); ++} + +-#ifndef __PAGETABLE_PUD_FOLDED +-#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) +-#define pgd_clear(pgd) native_pgd_clear(pgd) +-#endif ++static inline int pmd_present(pmd_t pmd) ++{ ++ return pmd_flags(pmd) & _PAGE_PRESENT; ++} + +-#ifndef set_pud +-# define set_pud(pudp, pud) native_set_pud(pudp, pud) +-#endif ++static inline int pmd_none(pmd_t pmd) ++{ ++ /* Only check low word on 32-bit platforms, since it might be ++ out of sync with upper half. */ ++ return (unsigned long)native_pmd_val(pmd) == 0; ++} + +-#ifndef __PAGETABLE_PMD_FOLDED +-#define pud_clear(pud) native_pud_clear(pud) +-#endif ++static inline unsigned long pmd_page_vaddr(pmd_t pmd) ++{ ++ return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); ++} + +-#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) +-#define pmd_clear(pmd) native_pmd_clear(pmd) ++/* ++ * Currently stuck as a macro due to indirect forward reference to ++ * linux/mmzone.h's __section_mem_map_addr() definition: ++ */ ++#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) + +-#define pte_update(mm, addr, ptep) do { } while (0) +-#define pte_update_defer(mm, addr, ptep) do { } while (0) ++/* ++ * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] ++ * ++ * this macro returns the index of the entry in the pmd page which would ++ * control the given virtual address ++ */ ++static inline unsigned pmd_index(unsigned long address) ++{ ++ return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); ++} + +-static inline void __init paravirt_pagetable_setup_start(pgd_t *base) ++/* ++ * Conversion functions: convert a page and protection to a page entry, ++ * and a page entry and page directory to the page they refer to. 
++ * ++ * (Currently stuck as a macro because of indirect forward reference ++ * to linux/mm.h:page_to_nid()) ++ */ ++#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) ++ ++/* ++ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] ++ * ++ * this function returns the index of the entry in the pte page which would ++ * control the given virtual address ++ */ ++static inline unsigned pte_index(unsigned long address) + { +- native_pagetable_setup_start(base); ++ return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + } + +-static inline void __init paravirt_pagetable_setup_done(pgd_t *base) ++static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) + { +- native_pagetable_setup_done(base); ++ return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); + } +-#endif /* CONFIG_PARAVIRT */ + +-#endif /* __ASSEMBLY__ */ ++static inline int pmd_bad(pmd_t pmd) ++{ ++ return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; ++} + +-#ifdef CONFIG_X86_32 +-# include "pgtable_32.h" ++static inline unsigned long pages_to_mb(unsigned long npg) ++{ ++ return npg >> (20 - PAGE_SHIFT); ++} ++ ++#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ ++ remap_pfn_range(vma, vaddr, pfn, size, prot) ++ ++#if PAGETABLE_LEVELS > 2 ++static inline int pud_none(pud_t pud) ++{ ++ return native_pud_val(pud) == 0; ++} ++ ++static inline int pud_present(pud_t pud) ++{ ++ return pud_flags(pud) & _PAGE_PRESENT; ++} ++ ++static inline unsigned long pud_page_vaddr(pud_t pud) ++{ ++ return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); ++} ++ ++/* ++ * Currently stuck as a macro due to indirect forward reference to ++ * linux/mmzone.h's __section_mem_map_addr() definition: ++ */ ++#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) ++ ++/* Find an entry in the second-level page table.. */ ++static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) ++{ ++ return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); ++} ++ ++static inline unsigned long pmd_pfn(pmd_t pmd) ++{ ++ return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; ++} ++ ++static inline int pud_large(pud_t pud) ++{ ++ return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == ++ (_PAGE_PSE | _PAGE_PRESENT); ++} ++ ++static inline int pud_bad(pud_t pud) ++{ ++ return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; ++} + #else +-# include "pgtable_64.h" +-#endif ++static inline int pud_large(pud_t pud) ++{ ++ return 0; ++} ++#endif /* PAGETABLE_LEVELS > 2 */ ++ ++#if PAGETABLE_LEVELS > 3 ++static inline int pgd_present(pgd_t pgd) ++{ ++ return pgd_flags(pgd) & _PAGE_PRESENT; ++} ++ ++static inline unsigned long pgd_page_vaddr(pgd_t pgd) ++{ ++ return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); ++} ++ ++/* ++ * Currently stuck as a macro due to indirect forward reference to ++ * linux/mmzone.h's __section_mem_map_addr() definition: ++ */ ++#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) ++ ++/* to find an entry in a page-table-directory. 
*/ ++static inline unsigned pud_index(unsigned long address) ++{ ++ return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); ++} ++ ++static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) ++{ ++ return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); ++} ++ ++static inline int pgd_bad(pgd_t pgd) ++{ ++ return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; ++} ++ ++static inline int pgd_none(pgd_t pgd) ++{ ++ return !native_pgd_val(pgd); ++} ++#endif /* PAGETABLE_LEVELS > 3 */ ++ ++static inline int pte_hidden(pte_t pte) ++{ ++ return pte_flags(pte) & _PAGE_HIDDEN; ++} ++ ++#endif /* __ASSEMBLY__ */ + + /* + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] +@@ -476,28 +508,6 @@ static inline void __init paravirt_paget + + #ifndef __ASSEMBLY__ + +-enum { +- PG_LEVEL_NONE, +- PG_LEVEL_4K, +- PG_LEVEL_2M, +- PG_LEVEL_1G, +- PG_LEVEL_NUM +-}; +- +-#ifdef CONFIG_PROC_FS +-extern void update_page_count(int level, unsigned long pages); +-#else +-static inline void update_page_count(int level, unsigned long pages) { } +-#endif +- +-/* +- * Helper function that returns the kernel pagetable entry controlling +- * the virtual address 'address'. NULL means no pagetable entry present. +- * NOTE: the return type is pte_t but if the pmd is PSE then we return it +- * as a pte too. +- */ +-extern pte_t *lookup_address(unsigned long address, unsigned int *level); +- + /* local pte updates need not use xchg for locking */ + static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) + { +Index: linux-2.6-tip/arch/x86/include/asm/pgtable_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pgtable_32.h ++++ linux-2.6-tip/arch/x86/include/asm/pgtable_32.h +@@ -1,6 +1,7 @@ + #ifndef _ASM_X86_PGTABLE_32_H + #define _ASM_X86_PGTABLE_32_H + ++#include + + /* + * The Linux memory management assumes a three-level page table setup. On +@@ -33,47 +34,6 @@ void paging_init(void); + + extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); + +-/* +- * The Linux x86 paging architecture is 'compile-time dual-mode', it +- * implements both the traditional 2-level x86 page tables and the +- * newer 3-level PAE-mode page tables. +- */ +-#ifdef CONFIG_X86_PAE +-# include +-# define PMD_SIZE (1UL << PMD_SHIFT) +-# define PMD_MASK (~(PMD_SIZE - 1)) +-#else +-# include +-#endif +- +-#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +-#define PGDIR_MASK (~(PGDIR_SIZE - 1)) +- +-/* Just any arbitrary offset to the start of the vmalloc VM area: the +- * current 8MB value just means that there will be a 8MB "hole" after the +- * physical memory until the kernel virtual memory starts. That means that +- * any out-of-bounds memory accesses will hopefully be caught. +- * The vmalloc() routines leaves a hole of 4kB between each vmalloced +- * area for the same reason. 
;) +- */ +-#define VMALLOC_OFFSET (8 * 1024 * 1024) +-#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) +-#ifdef CONFIG_X86_PAE +-#define LAST_PKMAP 512 +-#else +-#define LAST_PKMAP 1024 +-#endif +- +-#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ +- & PMD_MASK) +- +-#ifdef CONFIG_HIGHMEM +-# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) +-#else +-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) +-#endif +- +-#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) + + /* + * Define this if things work differently on an i386 and an i486: +@@ -82,58 +42,12 @@ extern void set_pmd_pfn(unsigned long, u + */ + #undef TEST_ACCESS_OK + +-/* The boot page tables (all created as a single array) */ +-extern unsigned long pg0[]; +- +-#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) +- +-/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ +-#define pmd_none(x) (!(unsigned long)pmd_val((x))) +-#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT) +-#define pmd_bad(x) ((pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +- +-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) +- + #ifdef CONFIG_X86_PAE + # include + #else + # include + #endif + +-/* +- * Conversion functions: convert a page and protection to a page entry, +- * and a page entry and page directory to the page they refer to. +- */ +-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +- +- +-static inline int pud_large(pud_t pud) { return 0; } +- +-/* +- * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] +- * +- * this macro returns the index of the entry in the pmd page which would +- * control the given virtual address +- */ +-#define pmd_index(address) \ +- (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) +- +-/* +- * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] +- * +- * this macro returns the index of the entry in the pte page which would +- * control the given virtual address +- */ +-#define pte_index(address) \ +- (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +-#define pte_offset_kernel(dir, address) \ +- ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address))) +- +-#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT)) +- +-#define pmd_page_vaddr(pmd) \ +- ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK)) +- + #if defined(CONFIG_HIGHPTE) + #define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \ +@@ -141,14 +55,20 @@ static inline int pud_large(pud_t pud) { + #define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + pte_index((address))) ++#define pte_offset_map_direct(dir, address) \ ++ ((pte_t *)kmap_atomic_pte_direct(pmd_page(*(dir)), KM_PTE0) + \ ++ pte_index((address))) + #define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) + #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) ++#define pte_unmap_direct(pte) kunmap_atomic_direct((pte), KM_PTE0) + #else + #define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) + #define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) ++#define pte_offset_map_direct(dir, address) pte_offset_map((dir), (address)) + #define pte_unmap(pte) do { } while (0) + #define pte_unmap_nested(pte) do { } while (0) ++#define pte_unmap_direct(pte) do { } while (0) + #endif + + /* Clear a kernel PTE and flush it from the TLB */ +@@ -176,7 +96,4 @@ do { \ + 
#define kern_addr_valid(kaddr) (0) + #endif + +-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ +- remap_pfn_range(vma, vaddr, pfn, size, prot) +- + #endif /* _ASM_X86_PGTABLE_32_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pgtable_32_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/pgtable_32_types.h +@@ -0,0 +1,51 @@ ++#ifndef _ASM_X86_PGTABLE_32_DEFS_H ++#define _ASM_X86_PGTABLE_32_DEFS_H ++ ++/* ++ * The Linux x86 paging architecture is 'compile-time dual-mode', it ++ * implements both the traditional 2-level x86 page tables and the ++ * newer 3-level PAE-mode page tables. ++ */ ++#ifdef CONFIG_X86_PAE ++# include ++# define PMD_SIZE (1UL << PMD_SHIFT) ++# define PMD_MASK (~(PMD_SIZE - 1)) ++#else ++# include ++#endif ++ ++#define PGDIR_SIZE (1UL << PGDIR_SHIFT) ++#define PGDIR_MASK (~(PGDIR_SIZE - 1)) ++ ++/* Just any arbitrary offset to the start of the vmalloc VM area: the ++ * current 8MB value just means that there will be a 8MB "hole" after the ++ * physical memory until the kernel virtual memory starts. That means that ++ * any out-of-bounds memory accesses will hopefully be caught. ++ * The vmalloc() routines leaves a hole of 4kB between each vmalloced ++ * area for the same reason. ;) ++ */ ++#define VMALLOC_OFFSET (8 * 1024 * 1024) ++ ++#ifndef __ASSEMBLER__ ++extern bool __vmalloc_start_set; /* set once high_memory is set */ ++#endif ++ ++#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) ++#ifdef CONFIG_X86_PAE ++#define LAST_PKMAP 512 ++#else ++#define LAST_PKMAP 1024 ++#endif ++ ++#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ ++ & PMD_MASK) ++ ++#ifdef CONFIG_HIGHMEM ++# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) ++#else ++# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) ++#endif ++ ++#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) ++ ++#endif /* _ASM_X86_PGTABLE_32_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pgtable_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pgtable_64.h ++++ linux-2.6-tip/arch/x86/include/asm/pgtable_64.h +@@ -2,6 +2,8 @@ + #define _ASM_X86_PGTABLE_64_H + + #include ++#include ++ + #ifndef __ASSEMBLY__ + + /* +@@ -11,7 +13,6 @@ + #include + #include + #include +-#include + + extern pud_t level3_kernel_pgt[512]; + extern pud_t level3_ident_pgt[512]; +@@ -26,32 +27,6 @@ extern void paging_init(void); + + #endif /* !__ASSEMBLY__ */ + +-#define SHARED_KERNEL_PMD 0 +- +-/* +- * PGDIR_SHIFT determines what a top-level page table entry can map +- */ +-#define PGDIR_SHIFT 39 +-#define PTRS_PER_PGD 512 +- +-/* +- * 3rd level page +- */ +-#define PUD_SHIFT 30 +-#define PTRS_PER_PUD 512 +- +-/* +- * PMD_SHIFT determines the size of the area a middle-level +- * page table can map +- */ +-#define PMD_SHIFT 21 +-#define PTRS_PER_PMD 512 +- +-/* +- * entries per page directory level +- */ +-#define PTRS_PER_PTE 512 +- + #ifndef __ASSEMBLY__ + + #define pte_ERROR(e) \ +@@ -67,9 +42,6 @@ extern void paging_init(void); + printk("%s:%d: bad pgd %p(%016lx).\n", \ + __FILE__, __LINE__, &(e), pgd_val(e)) + +-#define pgd_none(x) (!pgd_val(x)) +-#define pud_none(x) (!pud_val(x)) +- + struct mm_struct; + + void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); +@@ -134,48 +106,6 @@ static inline void native_pgd_clear(pgd_ + native_set_pgd(pgd, native_make_pgd(0)); + } + +-#define pte_same(a, b) ((a).pte == 
(b).pte) +- +-#endif /* !__ASSEMBLY__ */ +- +-#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) +-#define PMD_MASK (~(PMD_SIZE - 1)) +-#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +-#define PUD_MASK (~(PUD_SIZE - 1)) +-#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) +-#define PGDIR_MASK (~(PGDIR_SIZE - 1)) +- +- +-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +-#define VMALLOC_START _AC(0xffffc20000000000, UL) +-#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) +-#define VMEMMAP_START _AC(0xffffe20000000000, UL) +-#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +-#define MODULES_END _AC(0xffffffffff000000, UL) +-#define MODULES_LEN (MODULES_END - MODULES_VADDR) +- +-#ifndef __ASSEMBLY__ +- +-static inline int pgd_bad(pgd_t pgd) +-{ +- return (pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; +-} +- +-static inline int pud_bad(pud_t pud) +-{ +- return (pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; +-} +- +-static inline int pmd_bad(pmd_t pmd) +-{ +- return (pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE; +-} +- +-#define pte_none(x) (!pte_val((x))) +-#define pte_present(x) (pte_val((x)) & (_PAGE_PRESENT | _PAGE_PROTNONE)) +- +-#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ +- + /* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. +@@ -184,41 +114,12 @@ static inline int pmd_bad(pmd_t pmd) + /* + * Level 4 access. + */ +-#define pgd_page_vaddr(pgd) \ +- ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK)) +-#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT)) +-#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT) + static inline int pgd_large(pgd_t pgd) { return 0; } + #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) + + /* PUD - Level3 access */ +-/* to find an entry in a page-table-directory. */ +-#define pud_page_vaddr(pud) \ +- ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK)) +-#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT)) +-#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) +-#define pud_offset(pgd, address) \ +- ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address))) +-#define pud_present(pud) (pud_val((pud)) & _PAGE_PRESENT) +- +-static inline int pud_large(pud_t pte) +-{ +- return (pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == +- (_PAGE_PSE | _PAGE_PRESENT); +-} + + /* PMD - Level 2 access */ +-#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK)) +-#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT)) +- +-#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) +-#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \ +- pmd_index(address)) +-#define pmd_none(x) (!pmd_val((x))) +-#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT) +-#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot)))) +-#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT) +- + #define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) + #define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ + _PAGE_FILE }) +@@ -226,18 +127,13 @@ static inline int pud_large(pud_t pte) + + /* PTE - Level 1 access. 
*/ + +-/* page, protection -> pte */ +-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot)) +- +-#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +-#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \ +- pte_index((address))) +- + /* x86-64 always has all page tables mapped. */ + #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) + #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) +-#define pte_unmap(pte) /* NOP */ +-#define pte_unmap_nested(pte) /* NOP */ ++#define pte_offset_map_direct(dir, address) pte_offset_kernel((dir), (address)) ++#define pte_unmap(pte) do { } while (0) ++#define pte_unmap_nested(pte) do { } while (0) ++#define pte_unmap_direct(pte) do { } while (0) + + #define update_mmu_cache(vma, address, pte) do { } while (0) + +@@ -266,9 +162,6 @@ extern int direct_gbpages; + extern int kern_addr_valid(unsigned long addr); + extern void cleanup_highmap(void); + +-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ +- remap_pfn_range(vma, vaddr, pfn, size, prot) +- + #define HAVE_ARCH_UNMAPPED_AREA + #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + +Index: linux-2.6-tip/arch/x86/include/asm/pgtable_64_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/pgtable_64_types.h +@@ -0,0 +1,63 @@ ++#ifndef _ASM_X86_PGTABLE_64_DEFS_H ++#define _ASM_X86_PGTABLE_64_DEFS_H ++ ++#ifndef __ASSEMBLY__ ++#include ++ ++/* ++ * These are used to make use of C type-checking.. ++ */ ++typedef unsigned long pteval_t; ++typedef unsigned long pmdval_t; ++typedef unsigned long pudval_t; ++typedef unsigned long pgdval_t; ++typedef unsigned long pgprotval_t; ++ ++typedef struct { pteval_t pte; } pte_t; ++ ++#endif /* !__ASSEMBLY__ */ ++ ++#define SHARED_KERNEL_PMD 0 ++#define PAGETABLE_LEVELS 4 ++ ++/* ++ * PGDIR_SHIFT determines what a top-level page table entry can map ++ */ ++#define PGDIR_SHIFT 39 ++#define PTRS_PER_PGD 512 ++ ++/* ++ * 3rd level page ++ */ ++#define PUD_SHIFT 30 ++#define PTRS_PER_PUD 512 ++ ++/* ++ * PMD_SHIFT determines the size of the area a middle-level ++ * page table can map ++ */ ++#define PMD_SHIFT 21 ++#define PTRS_PER_PMD 512 ++ ++/* ++ * entries per page directory level ++ */ ++#define PTRS_PER_PTE 512 ++ ++#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) ++#define PMD_MASK (~(PMD_SIZE - 1)) ++#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) ++#define PUD_MASK (~(PUD_SIZE - 1)) ++#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) ++#define PGDIR_MASK (~(PGDIR_SIZE - 1)) ++ ++ ++#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) ++#define VMALLOC_START _AC(0xffffc20000000000, UL) ++#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) ++#define VMEMMAP_START _AC(0xffffe20000000000, UL) ++#define MODULES_VADDR _AC(0xffffffffa0000000, UL) ++#define MODULES_END _AC(0xffffffffff000000, UL) ++#define MODULES_LEN (MODULES_END - MODULES_VADDR) ++ ++#endif /* _ASM_X86_PGTABLE_64_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/pgtable_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/pgtable_types.h +@@ -0,0 +1,334 @@ ++#ifndef _ASM_X86_PGTABLE_DEFS_H ++#define _ASM_X86_PGTABLE_DEFS_H ++ ++#include ++#include ++ ++#define FIRST_USER_ADDRESS 0 ++ ++#define _PAGE_BIT_PRESENT 0 /* is present */ ++#define _PAGE_BIT_RW 1 /* writeable */ ++#define _PAGE_BIT_USER 2 /* userspace addressable */ ++#define _PAGE_BIT_PWT 3 /* 
page write through */ ++#define _PAGE_BIT_PCD 4 /* page cache disabled */ ++#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ ++#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ ++#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ ++#define _PAGE_BIT_PAT 7 /* on 4KB pages */ ++#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ ++#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ ++#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ ++#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ ++#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ ++#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 ++#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 ++#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ ++ ++/* If _PAGE_BIT_PRESENT is clear, we use these: */ ++/* - if the user mapped it with PROT_NONE; pte_present gives true */ ++#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL ++/* - set: nonlinear file mapping, saved PTE; unset:swap */ ++#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY ++ ++#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) ++#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) ++#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) ++#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) ++#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) ++#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) ++#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) ++#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) ++#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) ++#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) ++#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) ++#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) ++#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) ++#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) ++#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) ++#define __HAVE_ARCH_PTE_SPECIAL ++ ++#ifdef CONFIG_KMEMCHECK ++#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) ++#else ++#define _PAGE_HIDDEN (_AT(pteval_t, 0)) ++#endif ++ ++#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) ++#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) ++#else ++#define _PAGE_NX (_AT(pteval_t, 0)) ++#endif ++ ++#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) ++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) ++ ++#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ ++ _PAGE_ACCESSED | _PAGE_DIRTY) ++#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ ++ _PAGE_DIRTY) ++ ++/* Set of bits not changed in pte_modify */ ++#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ ++ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) ++ ++#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) ++#define _PAGE_CACHE_WB (0) ++#define _PAGE_CACHE_WC (_PAGE_PWT) ++#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) ++#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) ++ ++#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) ++#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ ++ _PAGE_ACCESSED | _PAGE_NX) ++ ++#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ ++ _PAGE_USER | _PAGE_ACCESSED) ++#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ ++ _PAGE_ACCESSED | _PAGE_NX) ++#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ ++ _PAGE_ACCESSED) ++#define PAGE_COPY PAGE_COPY_NOEXEC ++#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ ++ 
_PAGE_ACCESSED | _PAGE_NX) ++#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ ++ _PAGE_ACCESSED) ++ ++#define __PAGE_KERNEL_EXEC \ ++ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) ++#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) ++ ++#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) ++#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) ++#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) ++#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) ++#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) ++#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) ++#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) ++#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) ++#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) ++#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) ++#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) ++ ++#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) ++#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) ++#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) ++#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) ++ ++#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) ++#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) ++#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) ++#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) ++#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) ++#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) ++#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) ++#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) ++#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) ++#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) ++#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) ++#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) ++#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) ++ ++#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) ++#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) ++#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) ++#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) ++ ++/* xwr */ ++#define __P000 PAGE_NONE ++#define __P001 PAGE_READONLY ++#define __P010 PAGE_COPY ++#define __P011 PAGE_COPY ++#define __P100 PAGE_READONLY_EXEC ++#define __P101 PAGE_READONLY_EXEC ++#define __P110 PAGE_COPY_EXEC ++#define __P111 PAGE_COPY_EXEC ++ ++#define __S000 PAGE_NONE ++#define __S001 PAGE_READONLY ++#define __S010 PAGE_SHARED ++#define __S011 PAGE_SHARED ++#define __S100 PAGE_READONLY_EXEC ++#define __S101 PAGE_READONLY_EXEC ++#define __S110 PAGE_SHARED_EXEC ++#define __S111 PAGE_SHARED_EXEC ++ ++/* ++ * early identity mapping pte attrib macros. ++ */ ++#ifdef CONFIG_X86_64 ++#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC ++#else ++/* ++ * For PDE_IDENT_ATTR include USER bit. 
As the PDE and PTE protection ++ * bits are combined, this will alow user to access the high address mapped ++ * VDSO in the presence of CONFIG_COMPAT_VDSO ++ */ ++#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ ++#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ ++#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ ++#endif ++ ++#ifdef CONFIG_X86_32 ++# include "pgtable_32_types.h" ++#else ++# include "pgtable_64_types.h" ++#endif ++ ++#ifndef __ASSEMBLY__ ++ ++#include ++ ++/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ ++#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) ++ ++/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ ++#define PTE_FLAGS_MASK (~PTE_PFN_MASK) ++ ++typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; ++ ++typedef struct { pgdval_t pgd; } pgd_t; ++ ++static inline pgd_t native_make_pgd(pgdval_t val) ++{ ++ return (pgd_t) { val }; ++} ++ ++static inline pgdval_t native_pgd_val(pgd_t pgd) ++{ ++ return pgd.pgd; ++} ++ ++static inline pgdval_t pgd_flags(pgd_t pgd) ++{ ++ return native_pgd_val(pgd) & PTE_FLAGS_MASK; ++} ++ ++#if PAGETABLE_LEVELS > 3 ++typedef struct { pudval_t pud; } pud_t; ++ ++static inline pud_t native_make_pud(pmdval_t val) ++{ ++ return (pud_t) { val }; ++} ++ ++static inline pudval_t native_pud_val(pud_t pud) ++{ ++ return pud.pud; ++} ++#else ++#include ++ ++static inline pudval_t native_pud_val(pud_t pud) ++{ ++ return native_pgd_val(pud.pgd); ++} ++#endif ++ ++#if PAGETABLE_LEVELS > 2 ++typedef struct { pmdval_t pmd; } pmd_t; ++ ++static inline pmd_t native_make_pmd(pmdval_t val) ++{ ++ return (pmd_t) { val }; ++} ++ ++static inline pmdval_t native_pmd_val(pmd_t pmd) ++{ ++ return pmd.pmd; ++} ++#else ++#include ++ ++static inline pmdval_t native_pmd_val(pmd_t pmd) ++{ ++ return native_pgd_val(pmd.pud.pgd); ++} ++#endif ++ ++static inline pudval_t pud_flags(pud_t pud) ++{ ++ return native_pud_val(pud) & PTE_FLAGS_MASK; ++} ++ ++static inline pmdval_t pmd_flags(pmd_t pmd) ++{ ++ return native_pmd_val(pmd) & PTE_FLAGS_MASK; ++} ++ ++static inline pte_t native_make_pte(pteval_t val) ++{ ++ return (pte_t) { .pte = val }; ++} ++ ++static inline pteval_t native_pte_val(pte_t pte) ++{ ++ return pte.pte; ++} ++ ++static inline pteval_t pte_flags(pte_t pte) ++{ ++ return native_pte_val(pte) & PTE_FLAGS_MASK; ++} ++ ++#define pgprot_val(x) ((x).pgprot) ++#define __pgprot(x) ((pgprot_t) { (x) } ) ++ ++ ++typedef struct page *pgtable_t; ++ ++extern pteval_t __supported_pte_mask; ++extern int nx_enabled; ++extern void set_nx(void); ++ ++#define pgprot_writecombine pgprot_writecombine ++extern pgprot_t pgprot_writecombine(pgprot_t prot); ++ ++/* Indicate that x86 has its own track and untrack pfn vma functions */ ++#define __HAVE_PFNMAP_TRACKING ++ ++#define __HAVE_PHYS_MEM_ACCESS_PROT ++struct file; ++pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, ++ unsigned long size, pgprot_t vma_prot); ++int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, ++ unsigned long size, pgprot_t *vma_prot); ++ ++/* Install a pte for a particular vaddr in kernel space. 
*/ ++void set_pte_vaddr(unsigned long vaddr, pte_t pte); ++ ++#ifdef CONFIG_X86_32 ++extern void native_pagetable_setup_start(pgd_t *base); ++extern void native_pagetable_setup_done(pgd_t *base); ++#else ++static inline void native_pagetable_setup_start(pgd_t *base) {} ++static inline void native_pagetable_setup_done(pgd_t *base) {} ++#endif ++ ++struct seq_file; ++extern void arch_report_meminfo(struct seq_file *m); ++ ++enum { ++ PG_LEVEL_NONE, ++ PG_LEVEL_4K, ++ PG_LEVEL_2M, ++ PG_LEVEL_1G, ++ PG_LEVEL_NUM ++}; ++ ++#ifdef CONFIG_PROC_FS ++extern void update_page_count(int level, unsigned long pages); ++#else ++static inline void update_page_count(int level, unsigned long pages) { } ++#endif ++ ++/* ++ * Helper function that returns the kernel pagetable entry controlling ++ * the virtual address 'address'. NULL means no pagetable entry present. ++ * NOTE: the return type is pte_t but if the pmd is PSE then we return it ++ * as a pte too. ++ */ ++extern pte_t *lookup_address(unsigned long address, unsigned int *level); ++ ++#endif /* !__ASSEMBLY__ */ ++ ++#endif /* _ASM_X86_PGTABLE_DEFS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/prctl.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/prctl.h ++++ linux-2.6-tip/arch/x86/include/asm/prctl.h +@@ -6,8 +6,4 @@ + #define ARCH_GET_FS 0x1003 + #define ARCH_GET_GS 0x1004 + +-#ifdef CONFIG_X86_64 +-extern long sys_arch_prctl(int, unsigned long); +-#endif /* CONFIG_X86_64 */ +- + #endif /* _ASM_X86_PRCTL_H */ +Index: linux-2.6-tip/arch/x86/include/asm/processor.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/processor.h ++++ linux-2.6-tip/arch/x86/include/asm/processor.h +@@ -16,6 +16,7 @@ struct mm_struct; + #include + #include + #include ++#include + #include + #include + #include +@@ -73,10 +74,10 @@ struct cpuinfo_x86 { + char pad0; + #else + /* Number of 4K pages in DTLB/ITLB combined(in pages): */ +- int x86_tlbsize; ++ int x86_tlbsize; ++#endif + __u8 x86_virt_bits; + __u8 x86_phys_bits; +-#endif + /* CPUID returned core id bits: */ + __u8 x86_coreid_bits; + /* Max extended CPUID function supported: */ +@@ -93,7 +94,7 @@ struct cpuinfo_x86 { + unsigned long loops_per_jiffy; + #ifdef CONFIG_SMP + /* cpus sharing the last level cache: */ +- cpumask_t llc_shared_map; ++ cpumask_var_t llc_shared_map; + #endif + /* cpuid returned max cores value: */ + u16 x86_max_cores; +@@ -247,7 +248,6 @@ struct x86_hw_tss { + #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) + #define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) + #define INVALID_IO_BITMAP_OFFSET 0x8000 +-#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 + + struct tss_struct { + /* +@@ -262,11 +262,6 @@ struct tss_struct { + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +- /* +- * Cache the current maximum and the last task that used the bitmap: +- */ +- unsigned long io_bitmap_max; +- struct thread_struct *io_bitmap_owner; + + /* + * .. and then another 0x100 bytes for the emergency kernel stack: +@@ -378,9 +373,33 @@ union thread_xstate { + + #ifdef CONFIG_X86_64 + DECLARE_PER_CPU(struct orig_ist, orig_ist); ++ ++union irq_stack_union { ++ char irq_stack[IRQ_STACK_SIZE]; ++ /* ++ * GCC hardcodes the stack canary as %gs:40. Since the ++ * irq_stack is the object at %gs:0, we reserve the bottom ++ * 48 bytes of the irq stack for the canary. 
++ */ ++ struct { ++ char gs_base[40]; ++ unsigned long stack_canary; ++ }; ++}; ++ ++DECLARE_PER_CPU(union irq_stack_union, irq_stack_union); ++DECLARE_INIT_PER_CPU(irq_stack_union); ++ ++DECLARE_PER_CPU(char *, irq_stack_ptr); ++DECLARE_PER_CPU(unsigned int, irq_count); ++extern unsigned long kernel_eflags; ++extern asmlinkage void ignore_sysret(void); ++#else /* X86_64 */ ++#ifdef CONFIG_CC_STACKPROTECTOR ++DECLARE_PER_CPU(unsigned long, stack_canary); + #endif ++#endif /* X86_64 */ + +-extern void print_cpu_info(struct cpuinfo_x86 *); + extern unsigned int xstate_size; + extern void free_thread_xstate(struct task_struct *); + extern struct kmem_cache *task_xstate_cachep; +@@ -717,6 +736,7 @@ static inline void __sti_mwait(unsigned + extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); + + extern void select_idle_routine(const struct cpuinfo_x86 *c); ++extern void init_c1e_mask(void); + + extern unsigned long boot_option_idle_override; + extern unsigned long idle_halt; +@@ -752,9 +772,9 @@ extern int sysenter_setup(void); + extern struct desc_ptr early_gdt_descr; + + extern void cpu_set_gdt(int); +-extern void switch_to_new_gdt(void); ++extern void switch_to_new_gdt(int); ++extern void load_percpu_segment(int); + extern void cpu_init(void); +-extern void init_gdt(int cpu); + + static inline unsigned long get_debugctlmsr(void) + { +@@ -839,6 +859,7 @@ static inline void spin_lock_prefetch(co + * User space process size: 3GB (default). + */ + #define TASK_SIZE PAGE_OFFSET ++#define TASK_SIZE_MAX TASK_SIZE + #define STACK_TOP TASK_SIZE + #define STACK_TOP_MAX STACK_TOP + +@@ -898,7 +919,7 @@ extern unsigned long thread_saved_pc(str + /* + * User space process size. 47bits minus one guard page. + */ +-#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) ++#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) + + /* This decides where the kernel will search for a free chunk of vm + * space during mmap's. +@@ -907,12 +928,12 @@ extern unsigned long thread_saved_pc(str + 0xc0000000 : 0xFFFFe000) + + #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ +- IA32_PAGE_OFFSET : TASK_SIZE64) ++ IA32_PAGE_OFFSET : TASK_SIZE_MAX) + #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ +- IA32_PAGE_OFFSET : TASK_SIZE64) ++ IA32_PAGE_OFFSET : TASK_SIZE_MAX) + + #define STACK_TOP TASK_SIZE +-#define STACK_TOP_MAX TASK_SIZE64 ++#define STACK_TOP_MAX TASK_SIZE_MAX + + #define INIT_THREAD { \ + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +Index: linux-2.6-tip/arch/x86/include/asm/proto.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/proto.h ++++ linux-2.6-tip/arch/x86/include/asm/proto.h +@@ -18,11 +18,7 @@ extern void syscall32_cpu_init(void); + + extern void check_efer(void); + +-#ifdef CONFIG_X86_BIOS_REBOOT + extern int reboot_force; +-#else +-static const int reboot_force = 0; +-#endif + + long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); + +Index: linux-2.6-tip/arch/x86/include/asm/ptrace-abi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/ptrace-abi.h ++++ linux-2.6-tip/arch/x86/include/asm/ptrace-abi.h +@@ -80,8 +80,6 @@ + + #define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ + +-#ifdef CONFIG_X86_PTRACE_BTS +- + #ifndef __ASSEMBLY__ + #include + +@@ -140,6 +138,5 @@ struct ptrace_bts_config { + BTS records are read from oldest to newest. + Returns number of BTS records drained. 
+ */ +-#endif /* CONFIG_X86_PTRACE_BTS */ + + #endif /* _ASM_X86_PTRACE_ABI_H */ +Index: linux-2.6-tip/arch/x86/include/asm/ptrace.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/ptrace.h ++++ linux-2.6-tip/arch/x86/include/asm/ptrace.h +@@ -28,7 +28,7 @@ struct pt_regs { + int xds; + int xes; + int xfs; +- /* int gs; */ ++ int xgs; + long orig_eax; + long eip; + int xcs; +@@ -50,7 +50,7 @@ struct pt_regs { + unsigned long ds; + unsigned long es; + unsigned long fs; +- /* int gs; */ ++ unsigned long gs; + unsigned long orig_ax; + unsigned long ip; + unsigned long cs; +Index: linux-2.6-tip/arch/x86/include/asm/rdc321x_defs.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/rdc321x_defs.h +@@ -0,0 +1,12 @@ ++#define PFX "rdc321x: " ++ ++/* General purpose configuration and data registers */ ++#define RDC3210_CFGREG_ADDR 0x0CF8 ++#define RDC3210_CFGREG_DATA 0x0CFC ++ ++#define RDC321X_GPIO_CTRL_REG1 0x48 ++#define RDC321X_GPIO_CTRL_REG2 0x84 ++#define RDC321X_GPIO_DATA_REG1 0x4c ++#define RDC321X_GPIO_DATA_REG2 0x88 ++ ++#define RDC321X_MAX_GPIO 58 +Index: linux-2.6-tip/arch/x86/include/asm/sections.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/sections.h ++++ linux-2.6-tip/arch/x86/include/asm/sections.h +@@ -1 +1,8 @@ ++#ifndef _ASM_X86_SECTIONS_H ++#define _ASM_X86_SECTIONS_H ++ + #include ++ ++extern char __brk_base[], __brk_limit[]; ++ ++#endif /* _ASM_X86_SECTIONS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/segment.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/segment.h ++++ linux-2.6-tip/arch/x86/include/asm/segment.h +@@ -61,7 +61,7 @@ + * + * 26 - ESPFIX small SS + * 27 - per-cpu [ offset to per-cpu data area ] +- * 28 - unused ++ * 28 - stack_canary-20 [ for stack protector ] + * 29 - unused + * 30 - unused + * 31 - TSS for double fault handler +@@ -95,6 +95,13 @@ + #define __KERNEL_PERCPU 0 + #endif + ++#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16) ++#ifdef CONFIG_CC_STACKPROTECTOR ++#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8) ++#else ++#define __KERNEL_STACK_CANARY 0 ++#endif ++ + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 + + /* +Index: linux-2.6-tip/arch/x86/include/asm/setup.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/setup.h ++++ linux-2.6-tip/arch/x86/include/asm/setup.h +@@ -1,33 +1,19 @@ + #ifndef _ASM_X86_SETUP_H + #define _ASM_X86_SETUP_H + ++#ifdef __KERNEL__ ++ + #define COMMAND_LINE_SIZE 2048 + + #ifndef __ASSEMBLY__ + +-/* Interrupt control for vSMPowered x86_64 systems */ +-void vsmp_init(void); +- +- +-void setup_bios_corruption_check(void); +- +- +-#ifdef CONFIG_X86_VISWS +-extern void visws_early_detect(void); +-extern int is_visws_box(void); +-#else +-static inline void visws_early_detect(void) { } +-static inline int is_visws_box(void) { return 0; } +-#endif +- +-extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); +-extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip); + /* + * Any setup quirks to be performed? 
+ */ + struct mpc_cpu; + struct mpc_bus; + struct mpc_oemtable; ++ + struct x86_quirks { + int (*arch_pre_time_init)(void); + int (*arch_time_init)(void); +@@ -43,20 +29,19 @@ struct x86_quirks { + void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); + void (*mpc_oem_pci_bus)(struct mpc_bus *m); + void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, +- unsigned short oemsize); ++ unsigned short oemsize); + int (*setup_ioapic_ids)(void); +- int (*update_genapic)(void); + }; + +-extern struct x86_quirks *x86_quirks; +-extern unsigned long saved_video_mode; ++extern void x86_quirk_pre_intr_init(void); ++extern void x86_quirk_intr_init(void); + +-#ifndef CONFIG_PARAVIRT +-#define paravirt_post_allocator_init() do {} while (0) +-#endif +-#endif /* __ASSEMBLY__ */ ++extern void x86_quirk_trap_init(void); + +-#ifdef __KERNEL__ ++extern void x86_quirk_pre_time_init(void); ++extern void x86_quirk_time_init(void); ++ ++#endif /* __ASSEMBLY__ */ + + #ifdef __i386__ + +@@ -78,6 +63,30 @@ extern unsigned long saved_video_mode; + #ifndef __ASSEMBLY__ + #include + ++/* Interrupt control for vSMPowered x86_64 systems */ ++#ifdef CONFIG_X86_VSMP ++void vsmp_init(void); ++#else ++static inline void vsmp_init(void) { } ++#endif ++ ++void setup_bios_corruption_check(void); ++ ++#ifdef CONFIG_X86_VISWS ++extern void visws_early_detect(void); ++extern int is_visws_box(void); ++#else ++static inline void visws_early_detect(void) { } ++static inline int is_visws_box(void) { return 0; } ++#endif ++ ++extern struct x86_quirks *x86_quirks; ++extern unsigned long saved_video_mode; ++ ++#ifndef CONFIG_PARAVIRT ++#define paravirt_post_allocator_init() do {} while (0) ++#endif ++ + #ifndef _SETUP + + /* +@@ -91,21 +100,51 @@ extern struct boot_params boot_params; + */ + #define LOWMEMSIZE() (0x9f000) + ++/* exceedingly early brk-like allocator */ ++extern unsigned long _brk_end; ++void *extend_brk(size_t size, size_t align); ++ ++/* ++ * Reserve space in the brk section. The name must be unique within ++ * the file, and somewhat descriptive. The size is in bytes. Must be ++ * used at file scope. ++ * ++ * (This uses a temp function to wrap the asm so we can pass it the ++ * size parameter; otherwise we wouldn't be able to. We can't use a ++ * "section" attribute on a normal variable because it always ends up ++ * being @progbits, which ends up allocating space in the vmlinux ++ * executable.) ++ */ ++#define RESERVE_BRK(name,sz) \ ++ static void __section(.discard) __used \ ++ __brk_reservation_fn_##name##__(void) { \ ++ asm volatile ( \ ++ ".pushsection .brk_reservation,\"aw\",@nobits;" \ ++ ".brk." #name ":" \ ++ " 1:.skip %c0;" \ ++ " .size .brk." #name ", . 
- 1b;" \ ++ " .popsection" \ ++ : : "i" (sz)); \ ++ } ++ + #ifdef __i386__ + + void __init i386_start_kernel(void); + extern void probe_roms(void); + +-extern unsigned long init_pg_tables_start; +-extern unsigned long init_pg_tables_end; +- + #else +-void __init x86_64_init_pda(void); + void __init x86_64_start_kernel(char *real_mode); + void __init x86_64_start_reservations(char *real_mode_data); + + #endif /* __i386__ */ + #endif /* _SETUP */ ++#else ++#define RESERVE_BRK(name,sz) \ ++ .pushsection .brk_reservation,"aw",@nobits; \ ++.brk.name: \ ++1: .skip sz; \ ++ .size .brk.name,.-1b; \ ++ .popsection + #endif /* __ASSEMBLY__ */ + #endif /* __KERNEL__ */ + +Index: linux-2.6-tip/arch/x86/include/asm/setup_arch.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/setup_arch.h +@@ -0,0 +1,3 @@ ++/* Hook to call BIOS initialisation function */ ++ ++/* no action for generic */ +Index: linux-2.6-tip/arch/x86/include/asm/smp.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/smp.h ++++ linux-2.6-tip/arch/x86/include/asm/smp.h +@@ -15,53 +15,25 @@ + # include + # endif + #endif +-#include + #include +- +-#ifdef CONFIG_X86_64 +- +-extern cpumask_var_t cpu_callin_mask; +-extern cpumask_var_t cpu_callout_mask; +-extern cpumask_var_t cpu_initialized_mask; +-extern cpumask_var_t cpu_sibling_setup_mask; +- +-#else /* CONFIG_X86_32 */ +- +-extern cpumask_t cpu_callin_map; +-extern cpumask_t cpu_callout_map; +-extern cpumask_t cpu_initialized; +-extern cpumask_t cpu_sibling_setup_map; +- +-#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map) +-#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map) +-#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized) +-#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map) +- +-#endif /* CONFIG_X86_32 */ +- +-extern void (*mtrr_hook)(void); +-extern void zap_low_mappings(void); +- +-extern int __cpuinit get_local_pda(int cpu); ++#include + + extern int smp_num_siblings; + extern unsigned int num_processors; + +-DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); +-DECLARE_PER_CPU(cpumask_t, cpu_core_map); ++DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); ++DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); + DECLARE_PER_CPU(u16, cpu_llc_id); +-#ifdef CONFIG_X86_32 + DECLARE_PER_CPU(int, cpu_number); +-#endif + + static inline struct cpumask *cpu_sibling_mask(int cpu) + { +- return &per_cpu(cpu_sibling_map, cpu); ++ return per_cpu(cpu_sibling_map, cpu); + } + + static inline struct cpumask *cpu_core_mask(int cpu) + { +- return &per_cpu(cpu_core_map, cpu); ++ return per_cpu(cpu_core_map, cpu); + } + + DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); +@@ -149,9 +121,10 @@ static inline void arch_send_call_functi + smp_ops.send_call_func_single_ipi(cpu); + } + +-static inline void arch_send_call_function_ipi(cpumask_t mask) ++#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask ++static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) + { +- smp_ops.send_call_func_ipi(&mask); ++ smp_ops.send_call_func_ipi(mask); + } + + void cpu_disable_common(void); +@@ -167,8 +140,6 @@ void play_dead_common(void); + void native_send_call_func_ipi(const struct cpumask *mask); + void native_send_call_func_single_ipi(int cpu); + +-extern void prefill_possible_map(void); +- + void smp_store_cpu_info(int id); + #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) + +@@ 
-177,10 +148,6 @@ static inline int num_booting_cpus(void) + { + return cpumask_weight(cpu_callout_mask); + } +-#else +-static inline void prefill_possible_map(void) +-{ +-} + #endif /* CONFIG_SMP */ + + extern unsigned disabled_cpus __cpuinitdata; +@@ -191,11 +158,11 @@ extern unsigned disabled_cpus __cpuinitd + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. + */ +-#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) ++#define raw_smp_processor_id() (percpu_read(cpu_number)) + extern int safe_smp_processor_id(void); + + #elif defined(CONFIG_X86_64_SMP) +-#define raw_smp_processor_id() read_pda(cpunumber) ++#define raw_smp_processor_id() (percpu_read(cpu_number)) + + #define stack_smp_processor_id() \ + ({ \ +@@ -205,10 +172,6 @@ extern int safe_smp_processor_id(void); + }) + #define safe_smp_processor_id() smp_processor_id() + +-#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */ +-#define cpu_physical_id(cpu) boot_cpu_physical_apicid +-#define safe_smp_processor_id() 0 +-#define stack_smp_processor_id() 0 + #endif + + #ifdef CONFIG_X86_LOCAL_APIC +@@ -220,28 +183,9 @@ static inline int logical_smp_processor_ + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); + } + +-#include +-static inline unsigned int read_apic_id(void) +-{ +- unsigned int reg; +- +- reg = *(u32 *)(APIC_BASE + APIC_ID); +- +- return GET_APIC_ID(reg); +-} + #endif + +- +-# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64) + extern int hard_smp_processor_id(void); +-# else +-#include +-static inline int hard_smp_processor_id(void) +-{ +- /* we don't want to mark this access volatile - bad code generation */ +- return read_apic_id(); +-} +-# endif /* APIC_DEFINITION */ + + #else /* CONFIG_X86_LOCAL_APIC */ + +@@ -251,11 +195,5 @@ static inline int hard_smp_processor_id( + + #endif /* CONFIG_X86_LOCAL_APIC */ + +-#ifdef CONFIG_X86_HAS_BOOT_CPU_ID +-extern unsigned char boot_cpu_id; +-#else +-#define boot_cpu_id 0 +-#endif +- + #endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_SMP_H */ +Index: linux-2.6-tip/arch/x86/include/asm/smpboot_hooks.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/smpboot_hooks.h +@@ -0,0 +1,61 @@ ++/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws ++ * which needs to alter them. */ ++ ++static inline void smpboot_clear_io_apic_irqs(void) ++{ ++#ifdef CONFIG_X86_IO_APIC ++ io_apic_irqs = 0; ++#endif ++} ++ ++static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) ++{ ++ CMOS_WRITE(0xa, 0xf); ++ local_flush_tlb(); ++ pr_debug("1.\n"); ++ *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) = ++ start_eip >> 4; ++ pr_debug("2.\n"); ++ *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) = ++ start_eip & 0xf; ++ pr_debug("3.\n"); ++} ++ ++static inline void smpboot_restore_warm_reset_vector(void) ++{ ++ /* ++ * Install writable page 0 entry to set BIOS data area. ++ */ ++ local_flush_tlb(); ++ ++ /* ++ * Paranoid: Set warm reset code and vector here back ++ * to default values. ++ */ ++ CMOS_WRITE(0, 0xf); ++ ++ *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0; ++} ++ ++static inline void __init smpboot_setup_io_apic(void) ++{ ++#ifdef CONFIG_X86_IO_APIC ++ /* ++ * Here we can be sure that there is an IO-APIC in the system. 
Let's ++ * go and set it up: ++ */ ++ if (!skip_ioapic_setup && nr_ioapics) ++ setup_IO_APIC(); ++ else { ++ nr_ioapics = 0; ++ localise_nmi_watchdog(); ++ } ++#endif ++} ++ ++static inline void smpboot_clear_io_apic(void) ++{ ++#ifdef CONFIG_X86_IO_APIC ++ nr_ioapics = 0; ++#endif ++} +Index: linux-2.6-tip/arch/x86/include/asm/spinlock.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/spinlock.h ++++ linux-2.6-tip/arch/x86/include/asm/spinlock.h +@@ -58,7 +58,7 @@ + #if (NR_CPUS < 256) + #define TICKET_SHIFT 8 + +-static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) ++static __always_inline void __ticket_spin_lock(__raw_spinlock_t *lock) + { + short inc = 0x0100; + +@@ -77,7 +77,7 @@ static __always_inline void __ticket_spi + : "memory", "cc"); + } + +-static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) ++static __always_inline int __ticket_spin_trylock(__raw_spinlock_t *lock) + { + int tmp, new; + +@@ -96,7 +96,7 @@ static __always_inline int __ticket_spin + return tmp; + } + +-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) ++static __always_inline void __ticket_spin_unlock(__raw_spinlock_t *lock) + { + asm volatile(UNLOCK_LOCK_PREFIX "incb %0" + : "+m" (lock->slock) +@@ -106,7 +106,7 @@ static __always_inline void __ticket_spi + #else + #define TICKET_SHIFT 16 + +-static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) ++static __always_inline void __ticket_spin_lock(__raw_spinlock_t *lock) + { + int inc = 0x00010000; + int tmp; +@@ -127,7 +127,7 @@ static __always_inline void __ticket_spi + : "memory", "cc"); + } + +-static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) ++static __always_inline int __ticket_spin_trylock(__raw_spinlock_t *lock) + { + int tmp; + int new; +@@ -149,7 +149,7 @@ static __always_inline int __ticket_spin + return tmp; + } + +-static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) ++static __always_inline void __ticket_spin_unlock(__raw_spinlock_t *lock) + { + asm volatile(UNLOCK_LOCK_PREFIX "incw %0" + : "+m" (lock->slock) +@@ -158,119 +158,57 @@ static __always_inline void __ticket_spi + } + #endif + +-static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) ++static inline int __ticket_spin_is_locked(__raw_spinlock_t *lock) + { + int tmp = ACCESS_ONCE(lock->slock); + + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); + } + +-static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) ++static inline int __ticket_spin_is_contended(__raw_spinlock_t *lock) + { + int tmp = ACCESS_ONCE(lock->slock); + + return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; + } + +-#ifdef CONFIG_PARAVIRT +-/* +- * Define virtualization-friendly old-style lock byte lock, for use in +- * pv_lock_ops if desired. +- * +- * This differs from the pre-2.6.24 spinlock by always using xchgb +- * rather than decb to take the lock; this allows it to use a +- * zero-initialized lock structure. It also maintains a 1-byte +- * contention counter, so that we can implement +- * __byte_spin_is_contended. 
+- */ +-struct __byte_spinlock { +- s8 lock; +- s8 spinners; +-}; +- +-static inline int __byte_spin_is_locked(raw_spinlock_t *lock) +-{ +- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; +- return bl->lock != 0; +-} ++#ifndef CONFIG_PARAVIRT + +-static inline int __byte_spin_is_contended(raw_spinlock_t *lock) +-{ +- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; +- return bl->spinners != 0; +-} +- +-static inline void __byte_spin_lock(raw_spinlock_t *lock) +-{ +- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; +- s8 val = 1; +- +- asm("1: xchgb %1, %0\n" +- " test %1,%1\n" +- " jz 3f\n" +- " " LOCK_PREFIX "incb %2\n" +- "2: rep;nop\n" +- " cmpb $1, %0\n" +- " je 2b\n" +- " " LOCK_PREFIX "decb %2\n" +- " jmp 1b\n" +- "3:" +- : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory"); +-} +- +-static inline int __byte_spin_trylock(raw_spinlock_t *lock) +-{ +- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; +- u8 old = 1; +- +- asm("xchgb %1,%0" +- : "+m" (bl->lock), "+q" (old) : : "memory"); +- +- return old == 0; +-} +- +-static inline void __byte_spin_unlock(raw_spinlock_t *lock) +-{ +- struct __byte_spinlock *bl = (struct __byte_spinlock *)lock; +- smp_wmb(); +- bl->lock = 0; +-} +-#else /* !CONFIG_PARAVIRT */ +-static inline int __raw_spin_is_locked(raw_spinlock_t *lock) ++static inline int __raw_spin_is_locked(__raw_spinlock_t *lock) + { + return __ticket_spin_is_locked(lock); + } + +-static inline int __raw_spin_is_contended(raw_spinlock_t *lock) ++static inline int __raw_spin_is_contended(__raw_spinlock_t *lock) + { + return __ticket_spin_is_contended(lock); + } + #define __raw_spin_is_contended __raw_spin_is_contended + +-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) ++static __always_inline void __raw_spin_lock(__raw_spinlock_t *lock) + { + __ticket_spin_lock(lock); + } + +-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) ++static __always_inline int __raw_spin_trylock(__raw_spinlock_t *lock) + { + return __ticket_spin_trylock(lock); + } + +-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) ++static __always_inline void __raw_spin_unlock(__raw_spinlock_t *lock) + { + __ticket_spin_unlock(lock); + } + +-static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, ++static __always_inline void __raw_spin_lock_flags(__raw_spinlock_t *lock, + unsigned long flags) + { + __raw_spin_lock(lock); + } + +-#endif /* CONFIG_PARAVIRT */ ++#endif + +-static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) ++static inline void __raw_spin_unlock_wait(__raw_spinlock_t *lock) + { + while (__raw_spin_is_locked(lock)) + cpu_relax(); +@@ -294,7 +232,7 @@ static inline void __raw_spin_unlock_wai + * read_can_lock - would read_trylock() succeed? + * @lock: the rwlock in question. + */ +-static inline int __raw_read_can_lock(raw_rwlock_t *lock) ++static inline int __raw_read_can_lock(__raw_rwlock_t *lock) + { + return (int)(lock)->lock > 0; + } +@@ -303,12 +241,12 @@ static inline int __raw_read_can_lock(ra + * write_can_lock - would write_trylock() succeed? + * @lock: the rwlock in question. 
+ */ +-static inline int __raw_write_can_lock(raw_rwlock_t *lock) ++static inline int __raw_write_can_lock(__raw_rwlock_t *lock) + { + return (lock)->lock == RW_LOCK_BIAS; + } + +-static inline void __raw_read_lock(raw_rwlock_t *rw) ++static inline void __raw_read_lock(__raw_rwlock_t *rw) + { + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" + "jns 1f\n" +@@ -317,7 +255,7 @@ static inline void __raw_read_lock(raw_r + ::LOCK_PTR_REG (rw) : "memory"); + } + +-static inline void __raw_write_lock(raw_rwlock_t *rw) ++static inline void __raw_write_lock(__raw_rwlock_t *rw) + { + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" + "jz 1f\n" +@@ -326,18 +264,17 @@ static inline void __raw_write_lock(raw_ + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); + } + +-static inline int __raw_read_trylock(raw_rwlock_t *lock) ++static inline int __raw_read_trylock(__raw_rwlock_t *lock) + { + atomic_t *count = (atomic_t *)lock; + +- atomic_dec(count); +- if (atomic_read(count) >= 0) ++ if (atomic_dec_return(count) >= 0) + return 1; + atomic_inc(count); + return 0; + } + +-static inline int __raw_write_trylock(raw_rwlock_t *lock) ++static inline int __raw_write_trylock(__raw_rwlock_t *lock) + { + atomic_t *count = (atomic_t *)lock; + +@@ -347,19 +284,19 @@ static inline int __raw_write_trylock(ra + return 0; + } + +-static inline void __raw_read_unlock(raw_rwlock_t *rw) ++static inline void __raw_read_unlock(__raw_rwlock_t *rw) + { + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); + } + +-static inline void __raw_write_unlock(raw_rwlock_t *rw) ++static inline void __raw_write_unlock(__raw_rwlock_t *rw) + { + asm volatile(LOCK_PREFIX "addl %1, %0" + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); + } + +-#define _raw_spin_relax(lock) cpu_relax() +-#define _raw_read_relax(lock) cpu_relax() +-#define _raw_write_relax(lock) cpu_relax() ++#define __raw_spin_relax(lock) cpu_relax() ++#define __raw_read_relax(lock) cpu_relax() ++#define __raw_write_relax(lock) cpu_relax() + + #endif /* _ASM_X86_SPINLOCK_H */ +Index: linux-2.6-tip/arch/x86/include/asm/stackprotector.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/include/asm/stackprotector.h +@@ -0,0 +1,124 @@ ++/* ++ * GCC stack protector support. ++ * ++ * Stack protector works by putting predefined pattern at the start of ++ * the stack frame and verifying that it hasn't been overwritten when ++ * returning from the function. The pattern is called stack canary ++ * and unfortunately gcc requires it to be at a fixed offset from %gs. ++ * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64 ++ * and x86_32 use segment registers differently and thus handles this ++ * requirement differently. ++ * ++ * On x86_64, %gs is shared by percpu area and stack canary. All ++ * percpu symbols are zero based and %gs points to the base of percpu ++ * area. The first occupant of the percpu area is always ++ * irq_stack_union which contains stack_canary at offset 40. Userland ++ * %gs is always saved and restored on kernel entry and exit using ++ * swapgs, so stack protector doesn't add any complexity there. ++ * ++ * On x86_32, it's slightly more complicated. As in x86_64, %gs is ++ * used for userland TLS. 
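The spinlock.h hunks above rename the arch-level lock type to __raw_spinlock_t (presumably so the shorter raw_* names can be reused by the -rt lock layering elsewhere in this patch) while leaving the ticket-lock algorithm itself untouched: the lock word holds a "next ticket" and an "owner" field, a locker atomically grabs a ticket with a locked xadd and spins until the owner field reaches its ticket. The following is only a minimal userspace sketch of that idea; ticket_lock_t, ticket_lock() and ticket_unlock() are illustrative names and the GCC __sync builtins stand in for the kernel's asm.

/*
 * Userspace sketch of the ticket-lock idea behind __ticket_spin_lock()
 * and __ticket_spin_unlock(); names and builtins are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct {
	volatile uint16_t next;		/* next ticket to hand out */
	volatile uint16_t owner;	/* ticket currently being served */
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *l)
{
	/* atomically take a ticket (the kernel uses lock; xadd for this) */
	uint16_t me = __sync_fetch_and_add(&l->next, 1);

	/* spin until it is our turn -- the pause is the cpu_relax() analogue */
	while (l->owner != me)
		__asm__ __volatile__("pause" ::: "memory");
}

static void ticket_unlock(ticket_lock_t *l)
{
	/* only the current owner writes this field, so a plain increment works */
	__sync_synchronize();
	l->owner++;
}

int main(void)
{
	ticket_lock_t l = { 0, 0 };

	ticket_lock(&l);
	printf("lock taken, serving ticket %u\n", (unsigned)l.owner);
	ticket_unlock(&l);
	printf("lock released, next free ticket %u\n", (unsigned)l.next);
	return 0;
}

The FIFO hand-off is what makes the is_contended test above meaningful: (next - owner) > 1 means at least one CPU is already queued behind the holder.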
Unfortunately, some processors are much ++ * slower at loading segment registers with different value when ++ * entering and leaving the kernel, so the kernel uses %fs for percpu ++ * area and manages %gs lazily so that %gs is switched only when ++ * necessary, usually during task switch. ++ * ++ * As gcc requires the stack canary at %gs:20, %gs can't be managed ++ * lazily if stack protector is enabled, so the kernel saves and ++ * restores userland %gs on kernel entry and exit. This behavior is ++ * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in ++ * system.h to hide the details. ++ */ ++ ++#ifndef _ASM_STACKPROTECTOR_H ++#define _ASM_STACKPROTECTOR_H 1 ++ ++#ifdef CONFIG_CC_STACKPROTECTOR ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * 24 byte read-only segment initializer for stack canary. Linker ++ * can't handle the address bit shifting. Address will be set in ++ * head_32 for boot CPU and setup_per_cpu_areas() for others. ++ */ ++#define GDT_STACK_CANARY_INIT \ ++ [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } }, ++ ++/* ++ * Initialize the stackprotector canary value. ++ * ++ * NOTE: this must only be called from functions that never return, ++ * and it must always be inlined. ++ */ ++static __always_inline void boot_init_stack_canary(void) ++{ ++ u64 canary; ++ u64 tsc; ++ ++#ifdef CONFIG_X86_64 ++ BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); ++#endif ++ /* ++ * We both use the random pool and the current TSC as a source ++ * of randomness. The TSC only matters for very early init, ++ * there it already has some randomness on most systems. Later ++ * on during the bootup the random pool has true entropy too. ++ */ ++ get_random_bytes(&canary, sizeof(canary)); ++ tsc = __native_read_tsc(); ++ canary += tsc + (tsc << 32UL); ++ ++ current->stack_canary = canary; ++#ifdef CONFIG_X86_64 ++ percpu_write(irq_stack_union.stack_canary, canary); ++#else ++ percpu_write(stack_canary, canary); ++#endif ++} ++ ++static inline void setup_stack_canary_segment(int cpu) ++{ ++#ifdef CONFIG_X86_32 ++ unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20; ++ struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); ++ struct desc_struct desc; ++ ++ desc = gdt_table[GDT_ENTRY_STACK_CANARY]; ++ desc.base0 = canary & 0xffff; ++ desc.base1 = (canary >> 16) & 0xff; ++ desc.base2 = (canary >> 24) & 0xff; ++ write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); ++#endif ++} ++ ++static inline void load_stack_canary_segment(void) ++{ ++#ifdef CONFIG_X86_32 ++ asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory"); ++#endif ++} ++ ++#else /* CC_STACKPROTECTOR */ ++ ++#define GDT_STACK_CANARY_INIT ++ ++/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */ ++ ++static inline void setup_stack_canary_segment(int cpu) ++{ } ++ ++static inline void load_stack_canary_segment(void) ++{ ++#ifdef CONFIG_X86_32 ++ asm volatile ("mov %0, %%gs" : : "r" (0)); ++#endif ++} ++ ++#endif /* CC_STACKPROTECTOR */ ++#endif /* _ASM_STACKPROTECTOR_H */ +Index: linux-2.6-tip/arch/x86/include/asm/string_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/string_32.h ++++ linux-2.6-tip/arch/x86/include/asm/string_32.h +@@ -177,10 +177,18 @@ static inline void *__memcpy3d(void *to, + * No 3D Now! + */ + ++#ifndef CONFIG_KMEMCHECK + #define memcpy(t, f, n) \ + (__builtin_constant_p((n)) \ + ? 
__constant_memcpy((t), (f), (n)) \ + : __memcpy((t), (f), (n))) ++#else ++/* ++ * kmemcheck becomes very happy if we use the REP instructions unconditionally, ++ * because it means that we know both memory operands in advance. ++ */ ++#define memcpy(t, f, n) __memcpy((t), (f), (n)) ++#endif + + #endif + +Index: linux-2.6-tip/arch/x86/include/asm/string_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/string_64.h ++++ linux-2.6-tip/arch/x86/include/asm/string_64.h +@@ -27,6 +27,7 @@ static __always_inline void *__inline_me + function. */ + + #define __HAVE_ARCH_MEMCPY 1 ++#ifndef CONFIG_KMEMCHECK + #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 + extern void *memcpy(void *to, const void *from, size_t len); + #else +@@ -42,6 +43,13 @@ extern void *__memcpy(void *to, const vo + __ret; \ + }) + #endif ++#else ++/* ++ * kmemcheck becomes very happy if we use the REP instructions unconditionally, ++ * because it means that we know both memory operands in advance. ++ */ ++#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) ++#endif + + #define __HAVE_ARCH_MEMSET + void *memset(void *s, int c, size_t n); +Index: linux-2.6-tip/arch/x86/include/asm/summit/apic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/summit/apic.h ++++ /dev/null +@@ -1,202 +0,0 @@ +-#ifndef __ASM_SUMMIT_APIC_H +-#define __ASM_SUMMIT_APIC_H +- +-#include +-#include +- +-#define esr_disable (1) +-#define NO_BALANCE_IRQ (0) +- +-/* In clustered mode, the high nibble of APIC ID is a cluster number. +- * The low nibble is a 4-bit bitmap. */ +-#define XAPIC_DEST_CPUS_SHIFT 4 +-#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) +-#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) +- +-#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) +- +-static inline const cpumask_t *target_cpus(void) +-{ +- /* CPU_MASK_ALL (0xff) has undefined behaviour with +- * dest_LowestPrio mode logical clustered apic interrupt routing +- * Just start on cpu 0. IRQ balancing will spread load +- */ +- return &cpumask_of_cpu(0); +-} +- +-#define INT_DELIVERY_MODE (dest_LowestPrio) +-#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ +- +-static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +-{ +- return 0; +-} +- +-/* we don't use the phys_cpu_present_map to indicate apicid presence */ +-static inline unsigned long check_apicid_present(int bit) +-{ +- return 1; +-} +- +-#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK) +- +-extern u8 cpu_2_logical_apicid[]; +- +-static inline void init_apic_ldr(void) +-{ +- unsigned long val, id; +- int count = 0; +- u8 my_id = (u8)hard_smp_processor_id(); +- u8 my_cluster = (u8)apicid_cluster(my_id); +-#ifdef CONFIG_SMP +- u8 lid; +- int i; +- +- /* Create logical APIC IDs by counting CPUs already in cluster. */ +- for (count = 0, i = nr_cpu_ids; --i >= 0; ) { +- lid = cpu_2_logical_apicid[i]; +- if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster) +- ++count; +- } +-#endif +- /* We only have a 4 wide bitmap in cluster mode. If a deranged +- * BIOS puts 5 CPUs in one APIC cluster, we're hosed. 
*/ +- BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); +- id = my_cluster | (1UL << count); +- apic_write(APIC_DFR, APIC_DFR_VALUE); +- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; +- val |= SET_APIC_LOGICAL_ID(id); +- apic_write(APIC_LDR, val); +-} +- +-static inline int multi_timer_check(int apic, int irq) +-{ +- return 0; +-} +- +-static inline int apic_id_registered(void) +-{ +- return 1; +-} +- +-static inline void setup_apic_routing(void) +-{ +- printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", +- nr_ioapics); +-} +- +-static inline int apicid_to_node(int logical_apicid) +-{ +-#ifdef CONFIG_SMP +- return apicid_2_node[hard_smp_processor_id()]; +-#else +- return 0; +-#endif +-} +- +-/* Mapping from cpu number to logical apicid */ +-static inline int cpu_to_logical_apicid(int cpu) +-{ +-#ifdef CONFIG_SMP +- if (cpu >= nr_cpu_ids) +- return BAD_APICID; +- return (int)cpu_2_logical_apicid[cpu]; +-#else +- return logical_smp_processor_id(); +-#endif +-} +- +-static inline int cpu_present_to_apicid(int mps_cpu) +-{ +- if (mps_cpu < nr_cpu_ids) +- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); +- else +- return BAD_APICID; +-} +- +-static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map) +-{ +- /* For clustered we don't have a good way to do this yet - hack */ +- return physids_promote(0x0F); +-} +- +-static inline physid_mask_t apicid_to_cpu_present(int apicid) +-{ +- return physid_mask_of_physid(0); +-} +- +-static inline void setup_portio_remap(void) +-{ +-} +- +-static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +-{ +- return 1; +-} +- +-static inline void enable_apic_mode(void) +-{ +-} +- +-static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) +-{ +- int num_bits_set; +- int cpus_found = 0; +- int cpu; +- int apicid; +- +- num_bits_set = cpus_weight(*cpumask); +- /* Return id to all */ +- if (num_bits_set >= nr_cpu_ids) +- return (int) 0xFF; +- /* +- * The cpus in the mask must all be on the apic cluster. If are not +- * on the same apicid cluster return default value of TARGET_CPUS. +- */ +- cpu = first_cpu(*cpumask); +- apicid = cpu_to_logical_apicid(cpu); +- while (cpus_found < num_bits_set) { +- if (cpu_isset(cpu, *cpumask)) { +- int new_apicid = cpu_to_logical_apicid(cpu); +- if (apicid_cluster(apicid) != +- apicid_cluster(new_apicid)){ +- printk ("%s: Not a valid mask!\n", __func__); +- return 0xFF; +- } +- apicid = apicid | new_apicid; +- cpus_found++; +- } +- cpu++; +- } +- return apicid; +-} +- +-static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask, +- const struct cpumask *andmask) +-{ +- int apicid = cpu_to_logical_apicid(0); +- cpumask_var_t cpumask; +- +- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) +- return apicid; +- +- cpumask_and(cpumask, inmask, andmask); +- cpumask_and(cpumask, cpumask, cpu_online_mask); +- apicid = cpu_mask_to_apicid(cpumask); +- +- free_cpumask_var(cpumask); +- return apicid; +-} +- +-/* cpuid returns the value latched in the HW at reset, not the APIC ID +- * register's value. For any box whose BIOS changes APIC IDs, like +- * clustered APIC systems, we must use hard_smp_processor_id. +- * +- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. 
+- */ +-static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +-{ +- return hard_smp_processor_id() >> index_msb; +-} +- +-#endif /* __ASM_SUMMIT_APIC_H */ +Index: linux-2.6-tip/arch/x86/include/asm/summit/apicdef.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/summit/apicdef.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-#ifndef __ASM_SUMMIT_APICDEF_H +-#define __ASM_SUMMIT_APICDEF_H +- +-#define APIC_ID_MASK (0xFF<<24) +- +-static inline unsigned get_apic_id(unsigned long x) +-{ +- return (x>>24)&0xFF; +-} +- +-#define GET_APIC_ID(x) get_apic_id(x) +- +-#endif +Index: linux-2.6-tip/arch/x86/include/asm/summit/ipi.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/summit/ipi.h ++++ /dev/null +@@ -1,26 +0,0 @@ +-#ifndef __ASM_SUMMIT_IPI_H +-#define __ASM_SUMMIT_IPI_H +- +-void send_IPI_mask_sequence(const cpumask_t *mask, int vector); +-void send_IPI_mask_allbutself(const cpumask_t *mask, int vector); +- +-static inline void send_IPI_mask(const cpumask_t *mask, int vector) +-{ +- send_IPI_mask_sequence(mask, vector); +-} +- +-static inline void send_IPI_allbutself(int vector) +-{ +- cpumask_t mask = cpu_online_map; +- cpu_clear(smp_processor_id(), mask); +- +- if (!cpus_empty(mask)) +- send_IPI_mask(&mask, vector); +-} +- +-static inline void send_IPI_all(int vector) +-{ +- send_IPI_mask(&cpu_online_map, vector); +-} +- +-#endif /* __ASM_SUMMIT_IPI_H */ +Index: linux-2.6-tip/arch/x86/include/asm/summit/mpparse.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/summit/mpparse.h ++++ /dev/null +@@ -1,109 +0,0 @@ +-#ifndef __ASM_SUMMIT_MPPARSE_H +-#define __ASM_SUMMIT_MPPARSE_H +- +-#include +- +-extern int use_cyclone; +- +-#ifdef CONFIG_X86_SUMMIT_NUMA +-extern void setup_summit(void); +-#else +-#define setup_summit() {} +-#endif +- +-static inline int mps_oem_check(struct mpc_table *mpc, char *oem, +- char *productid) +-{ +- if (!strncmp(oem, "IBM ENSW", 8) && +- (!strncmp(productid, "VIGIL SMP", 9) +- || !strncmp(productid, "EXA", 3) +- || !strncmp(productid, "RUTHLESS SMP", 12))){ +- mark_tsc_unstable("Summit based system"); +- use_cyclone = 1; /*enable cyclone-timer*/ +- setup_summit(); +- return 1; +- } +- return 0; +-} +- +-/* Hook from generic ACPI tables.c */ +-static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- if (!strncmp(oem_id, "IBM", 3) && +- (!strncmp(oem_table_id, "SERVIGIL", 8) +- || !strncmp(oem_table_id, "EXA", 3))){ +- mark_tsc_unstable("Summit based system"); +- use_cyclone = 1; /*enable cyclone-timer*/ +- setup_summit(); +- return 1; +- } +- return 0; +-} +- +-struct rio_table_hdr { +- unsigned char version; /* Version number of this data structure */ +- /* Version 3 adds chassis_num & WP_index */ +- unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */ +- unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */ +-} __attribute__((packed)); +- +-struct scal_detail { +- unsigned char node_id; /* Scalability Node ID */ +- unsigned long CBAR; /* Address of 1MB register space */ +- unsigned char port0node; /* Node ID port connected to: 0xFF=None */ +- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ +- unsigned char port1node; /* Node ID port connected to: 0xFF = None */ +- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ +- unsigned char port2node; 
/* Node ID port connected to: 0xFF = None */ +- unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */ +- unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */ +-} __attribute__((packed)); +- +-struct rio_detail { +- unsigned char node_id; /* RIO Node ID */ +- unsigned long BBAR; /* Address of 1MB register space */ +- unsigned char type; /* Type of device */ +- unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/ +- /* For CYC: Node ID of Twister that owns this CYC */ +- unsigned char port0node; /* Node ID port connected to: 0xFF=None */ +- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ +- unsigned char port1node; /* Node ID port connected to: 0xFF=None */ +- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ +- unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */ +- /* For CYC: 0 */ +- unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */ +- /* = 0 : the XAPIC is not used, ie:*/ +- /* ints fwded to another XAPIC */ +- /* Bits1:7 Reserved */ +- /* For CYC: Bits0:7 Reserved */ +- unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */ +- /* lower slot numbers/PCI bus numbers */ +- /* For CYC: No meaning */ +- unsigned char chassis_num; /* 1 based Chassis number */ +- /* For LookOut WPEGs this field indicates the */ +- /* Expansion Chassis #, enumerated from Boot */ +- /* Node WPEG external port, then Boot Node CYC */ +- /* external port, then Next Vigil chassis WPEG */ +- /* external port, etc. */ +- /* Shared Lookouts have only 1 chassis number (the */ +- /* first one assigned) */ +-} __attribute__((packed)); +- +- +-typedef enum { +- CompatTwister = 0, /* Compatibility Twister */ +- AltTwister = 1, /* Alternate Twister of internal 8-way */ +- CompatCyclone = 2, /* Compatibility Cyclone */ +- AltCyclone = 3, /* Alternate Cyclone of internal 8-way */ +- CompatWPEG = 4, /* Compatibility WPEG */ +- AltWPEG = 5, /* Second Planar WPEG */ +- LookOutAWPEG = 6, /* LookOut WPEG */ +- LookOutBWPEG = 7, /* LookOut WPEG */ +-} node_type; +- +-static inline int is_WPEG(struct rio_detail *rio){ +- return (rio->type == CompatWPEG || rio->type == AltWPEG || +- rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); +-} +- +-#endif /* __ASM_SUMMIT_MPPARSE_H */ +Index: linux-2.6-tip/arch/x86/include/asm/syscalls.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/syscalls.h ++++ linux-2.6-tip/arch/x86/include/asm/syscalls.h +@@ -29,21 +29,21 @@ asmlinkage int sys_get_thread_area(struc + /* X86_32 only */ + #ifdef CONFIG_X86_32 + /* kernel/process_32.c */ +-asmlinkage int sys_fork(struct pt_regs); +-asmlinkage int sys_clone(struct pt_regs); +-asmlinkage int sys_vfork(struct pt_regs); +-asmlinkage int sys_execve(struct pt_regs); ++int sys_fork(struct pt_regs *); ++int sys_clone(struct pt_regs *); ++int sys_vfork(struct pt_regs *); ++int sys_execve(struct pt_regs *); + + /* kernel/signal_32.c */ + asmlinkage int sys_sigsuspend(int, int, old_sigset_t); + asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, + struct old_sigaction __user *); +-asmlinkage int sys_sigaltstack(unsigned long); +-asmlinkage unsigned long sys_sigreturn(unsigned long); +-asmlinkage int sys_rt_sigreturn(unsigned long); ++int sys_sigaltstack(struct pt_regs *); ++unsigned long sys_sigreturn(struct pt_regs *); ++long sys_rt_sigreturn(struct pt_regs *); + + /* kernel/ioport.c 
*/ +-asmlinkage long sys_iopl(unsigned long); ++long sys_iopl(struct pt_regs *); + + /* kernel/sys_i386_32.c */ + asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, +@@ -59,8 +59,8 @@ struct oldold_utsname; + asmlinkage int sys_olduname(struct oldold_utsname __user *); + + /* kernel/vm86_32.c */ +-asmlinkage int sys_vm86old(struct pt_regs); +-asmlinkage int sys_vm86(struct pt_regs); ++int sys_vm86old(struct pt_regs *); ++int sys_vm86(struct pt_regs *); + + #else /* CONFIG_X86_32 */ + +@@ -74,6 +74,7 @@ asmlinkage long sys_vfork(struct pt_regs + asmlinkage long sys_execve(char __user *, char __user * __user *, + char __user * __user *, + struct pt_regs *); ++long sys_arch_prctl(int, unsigned long); + + /* kernel/ioport.c */ + asmlinkage long sys_iopl(unsigned int, struct pt_regs *); +@@ -81,7 +82,7 @@ asmlinkage long sys_iopl(unsigned int, s + /* kernel/signal_64.c */ + asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, + struct pt_regs *); +-asmlinkage long sys_rt_sigreturn(struct pt_regs *); ++long sys_rt_sigreturn(struct pt_regs *); + + /* kernel/sys_x86_64.c */ + asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, +Index: linux-2.6-tip/arch/x86/include/asm/system.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/system.h ++++ linux-2.6-tip/arch/x86/include/asm/system.h +@@ -20,9 +20,26 @@ + struct task_struct; /* one of the stranger aspects of C forward declarations */ + struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); ++struct tss_struct; ++void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, ++ struct tss_struct *tss); + + #ifdef CONFIG_X86_32 + ++#ifdef CONFIG_CC_STACKPROTECTOR ++#define __switch_canary \ ++ "movl %P[task_canary](%[next]), %%ebx\n\t" \ ++ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" ++#define __switch_canary_oparam \ ++ , [stack_canary] "=m" (per_cpu_var(stack_canary)) ++#define __switch_canary_iparam \ ++ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) ++#else /* CC_STACKPROTECTOR */ ++#define __switch_canary ++#define __switch_canary_oparam ++#define __switch_canary_iparam ++#endif /* CC_STACKPROTECTOR */ ++ + /* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. 
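The __switch_canary glue just added exists because gcc reads the stack-protector canary from a fixed offset off %gs, so the per-task value cannot stay in the task struct alone: on every context switch the asm copies next->stack_canary into the per-cpu slot (stack_canary on 32-bit, irq_stack_union.stack_canary on 64-bit). The value itself is produced once per task by boot_init_stack_canary() in the stackprotector.h hunk earlier, which mixes random bytes with the TSC. A userspace sketch of that mixing step follows; random_u64() and rdtsc() here are stand-ins for get_random_bytes() and __native_read_tsc(), not kernel interfaces.

/*
 * Userspace sketch of the canary mixing done by boot_init_stack_canary();
 * the entropy sources below are illustrative stand-ins only.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

static uint64_t random_u64(void)
{
	/* toy entropy source; the kernel draws from its random pool */
	uint64_t r = 0;
	FILE *f = fopen("/dev/urandom", "rb");

	if (f) {
		if (fread(&r, sizeof(r), 1, f) != 1)
			r = ((uint64_t)rand() << 32) ^ (uint64_t)rand();
		fclose(f);
	}
	return r;
}

int main(void)
{
	uint64_t canary = random_u64();
	uint64_t tsc = rdtsc();

	/*
	 * Same mixing step as the patch: very early in boot the random
	 * pool has little entropy, so the TSC adds some extra variation.
	 */
	canary += tsc + (tsc << 32);

	printf("per-task canary would be %#llx\n", (unsigned long long)canary);
	return 0;
}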
+@@ -44,6 +61,7 @@ do { \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ ++ __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ +@@ -58,6 +76,8 @@ do { \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ ++ __switch_canary_oparam \ ++ \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ +@@ -66,6 +86,8 @@ do { \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ ++ __switch_canary_iparam \ ++ \ + : /* reloaded segment registers */ \ + "memory"); \ + } while (0) +@@ -86,27 +108,44 @@ do { \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + ++#ifdef CONFIG_CC_STACKPROTECTOR ++#define __switch_canary \ ++ "movq %P[task_canary](%%rsi),%%r8\n\t" \ ++ "movq %%r8,"__percpu_arg([gs_canary])"\n\t" ++#define __switch_canary_oparam \ ++ , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) ++#define __switch_canary_iparam \ ++ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) ++#else /* CC_STACKPROTECTOR */ ++#define __switch_canary ++#define __switch_canary_oparam ++#define __switch_canary_iparam ++#endif /* CC_STACKPROTECTOR */ ++ + /* Save restore flags to clear handle leaking NT */ + #define switch_to(prev, next, last) \ +- asm volatile(SAVE_CONTEXT \ ++ asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + ".globl thread_return\n" \ + "thread_return:\n\t" \ +- "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ ++ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ ++ __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ +- LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ + "movq %%rax,%%rdi\n\t" \ +- "jc ret_from_fork\n\t" \ ++ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ ++ "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ ++ __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ +- [tif_fork] "i" (TIF_FORK), \ ++ [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ +- [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ ++ [current_task] "m" (per_cpu_var(current_task)) \ ++ __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) + #endif + +@@ -165,6 +204,25 @@ extern void native_load_gs_index(unsigne + #define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + ++/* ++ * x86_32 user gs accessors. 
++ */ ++#ifdef CONFIG_X86_32 ++#ifdef CONFIG_X86_32_LAZY_GS ++#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) ++#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) ++#define task_user_gs(tsk) ((tsk)->thread.gs) ++#define lazy_save_gs(v) savesegment(gs, (v)) ++#define lazy_load_gs(v) loadsegment(gs, (v)) ++#else /* X86_32_LAZY_GS */ ++#define get_user_gs(regs) (u16)((regs)->gs) ++#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) ++#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) ++#define lazy_save_gs(v) do { } while (0) ++#define lazy_load_gs(v) do { } while (0) ++#endif /* X86_32_LAZY_GS */ ++#endif /* X86_32 */ ++ + static inline unsigned long get_limit(unsigned long segment) + { + unsigned long __limit; +Index: linux-2.6-tip/arch/x86/include/asm/thread_info.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/thread_info.h ++++ linux-2.6-tip/arch/x86/include/asm/thread_info.h +@@ -40,6 +40,7 @@ struct thread_info { + */ + __u8 supervisor_stack[0]; + #endif ++ int uaccess_err; + }; + + #define INIT_THREAD_INFO(tsk) \ +@@ -82,6 +83,7 @@ struct thread_info { + #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ + #define TIF_SECCOMP 8 /* secure computing */ + #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ ++#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ + #define TIF_NOTSC 16 /* TSC is not accessible in userland */ + #define TIF_IA32 17 /* 32bit process */ + #define TIF_FORK 18 /* ret_from_fork */ +@@ -93,6 +95,7 @@ struct thread_info { + #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ + #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ + #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ ++#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ + + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +@@ -104,6 +107,7 @@ struct thread_info { + #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) + #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) ++#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) + #define _TIF_NOTSC (1 << TIF_NOTSC) + #define _TIF_IA32 (1 << TIF_IA32) + #define _TIF_FORK (1 << TIF_FORK) +@@ -114,15 +118,17 @@ struct thread_info { + #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) + #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) + #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) ++#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) + + /* work to do in syscall_trace_enter() */ + #define _TIF_WORK_SYSCALL_ENTRY \ +- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \ ++ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ + _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) + + /* work to do in syscall_trace_leave() */ + #define _TIF_WORK_SYSCALL_EXIT \ +- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP) ++ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ ++ _TIF_SYSCALL_FTRACE) + + /* work to do on interrupt/exception return */ + #define _TIF_WORK_MASK \ +@@ -131,11 +137,11 @@ struct thread_info { + _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) + + /* work to do on any return to user space */ +-#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) ++#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) + + /* Only used for 64 bit */ + #define _TIF_DO_NOTIFY_MASK \ +- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) ++ 
(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) + + /* flags to check in __switch_to() */ + #define _TIF_WORK_CTXSW \ +@@ -148,9 +154,9 @@ struct thread_info { + + /* thread information allocation */ + #ifdef CONFIG_DEBUG_STACK_USAGE +-#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO) ++#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) + #else +-#define THREAD_FLAGS GFP_KERNEL ++#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK) + #endif + + #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR +@@ -194,25 +200,21 @@ static inline struct thread_info *curren + + #else /* X86_32 */ + +-#include ++#include ++#define KERNEL_STACK_OFFSET (5*8) + + /* + * macros/functions for gaining access to the thread information structure + * preempt_count needs to be 1 initially, until the scheduler is functional. + */ + #ifndef __ASSEMBLY__ +-static inline struct thread_info *current_thread_info(void) +-{ +- struct thread_info *ti; +- ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE); +- return ti; +-} ++DECLARE_PER_CPU(unsigned long, kernel_stack); + +-/* do not use in interrupt context */ +-static inline struct thread_info *stack_thread_info(void) ++static inline struct thread_info *current_thread_info(void) + { + struct thread_info *ti; +- asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1))); ++ ti = (void *)(percpu_read(kernel_stack) + ++ KERNEL_STACK_OFFSET - THREAD_SIZE); + return ti; + } + +@@ -220,8 +222,8 @@ static inline struct thread_info *stack_ + + /* how to get the thread information struct from ASM */ + #define GET_THREAD_INFO(reg) \ +- movq %gs:pda_kernelstack,reg ; \ +- subq $(THREAD_SIZE-PDA_STACKOFFSET),reg ++ movq PER_CPU_VAR(kernel_stack),reg ; \ ++ subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg + + #endif + +Index: linux-2.6-tip/arch/x86/include/asm/timer.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/timer.h ++++ linux-2.6-tip/arch/x86/include/asm/timer.h +@@ -3,6 +3,7 @@ + #include + #include + #include ++#include + + #define TICK_SIZE (tick_nsec / 1000) + +@@ -12,6 +13,7 @@ unsigned long native_calibrate_tsc(void) + #ifdef CONFIG_X86_32 + extern int timer_ack; + extern int recalibrate_cpu_khz(void); ++extern irqreturn_t timer_interrupt(int irq, void *dev_id); + #endif /* CONFIG_X86_32 */ + + extern int no_timer_check; +@@ -56,9 +58,9 @@ static inline unsigned long long cycles_ + unsigned long long ns; + unsigned long flags; + +- local_irq_save(flags); ++ raw_local_irq_save(flags); + ns = __cycles_2_ns(cyc); +- local_irq_restore(flags); ++ raw_local_irq_restore(flags); + + return ns; + } +Index: linux-2.6-tip/arch/x86/include/asm/tlbflush.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/tlbflush.h ++++ linux-2.6-tip/arch/x86/include/asm/tlbflush.h +@@ -7,6 +7,21 @@ + #include + #include + ++/* ++ * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the ++ * following complex race scenario: ++ * ++ * if the current task is lazy-TLB and does a TLB flush and ++ * gets preempted after the movl %%r3, %0 but before the ++ * movl %0, %%cr3 then its ->active_mm might change and it will ++ * install the wrong cr3 when it switches back. This is not a ++ * problem for the lazy-TLB task itself, but if the next task it ++ * switches to has an ->mm that is also the lazy-TLB task's ++ * new ->active_mm, then the scheduler will assume that cr3 is ++ * the new one, while we overwrote it with the old one. 
The result ++ * is the wrong cr3 in the new (non-lazy-TLB) task, which typically ++ * causes an infinite pagefault upon the next userspace access. ++ */ + #ifdef CONFIG_PARAVIRT + #include + #else +@@ -17,7 +32,9 @@ + + static inline void __native_flush_tlb(void) + { ++ preempt_disable(); + write_cr3(read_cr3()); ++ preempt_enable(); + } + + static inline void __native_flush_tlb_global(void) +@@ -95,6 +112,13 @@ static inline void __flush_tlb_one(unsig + + static inline void flush_tlb_mm(struct mm_struct *mm) + { ++ /* ++ * This is safe on PREEMPT_RT because if we preempt ++ * right after the check but before the __flush_tlb(), ++ * and if ->active_mm changes, then we might miss a ++ * TLB flush, but that TLB flush happened already when ++ * ->active_mm was changed: ++ */ + if (mm == current->active_mm) + __flush_tlb(); + } +@@ -113,7 +137,7 @@ static inline void flush_tlb_range(struc + __flush_tlb(); + } + +-static inline void native_flush_tlb_others(const cpumask_t *cpumask, ++static inline void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, + unsigned long va) + { +@@ -142,31 +166,28 @@ static inline void flush_tlb_range(struc + flush_tlb_mm(vma->vm_mm); + } + +-void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm, +- unsigned long va); ++void native_flush_tlb_others(const struct cpumask *cpumask, ++ struct mm_struct *mm, unsigned long va); + + #define TLBSTATE_OK 1 + #define TLBSTATE_LAZY 2 + +-#ifdef CONFIG_X86_32 + struct tlb_state { + struct mm_struct *active_mm; + int state; +- char __cacheline_padding[L1_CACHE_BYTES-8]; + }; + DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); + +-void reset_lazy_tlbstate(void); +-#else + static inline void reset_lazy_tlbstate(void) + { ++ percpu_write(cpu_tlbstate.state, 0); ++ percpu_write(cpu_tlbstate.active_mm, &init_mm); + } +-#endif + + #endif /* SMP */ + + #ifndef CONFIG_PARAVIRT +-#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va) ++#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va) + #endif + + static inline void flush_tlb_kernel_range(unsigned long start, +@@ -175,4 +196,6 @@ static inline void flush_tlb_kernel_rang + flush_tlb_all(); + } + ++extern void zap_low_mappings(void); ++ + #endif /* _ASM_X86_TLBFLUSH_H */ +Index: linux-2.6-tip/arch/x86/include/asm/topology.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/topology.h ++++ linux-2.6-tip/arch/x86/include/asm/topology.h +@@ -44,9 +44,6 @@ + + #ifdef CONFIG_X86_32 + +-/* Mappings between node number and cpus on that node. */ +-extern cpumask_t node_to_cpumask_map[]; +- + /* Mappings between logical cpu number and node number */ + extern int cpu_to_node_map[]; + +@@ -57,39 +54,18 @@ static inline int cpu_to_node(int cpu) + } + #define early_cpu_to_node(cpu) cpu_to_node(cpu) + +-/* Returns a bitmask of CPUs on Node 'node'. +- * +- * Side note: this function creates the returned cpumask on the stack +- * so with a high NR_CPUS count, excessive stack space is used. The +- * cpumask_of_node function should be used whenever possible. +- */ +-static inline cpumask_t node_to_cpumask(int node) +-{ +- return node_to_cpumask_map[node]; +-} +- +-/* Returns a bitmask of CPUs on Node 'node'. */ +-static inline const struct cpumask *cpumask_of_node(int node) +-{ +- return &node_to_cpumask_map[node]; +-} +- + #else /* CONFIG_X86_64 */ + +-/* Mappings between node number and cpus on that node. 
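The tlbflush.h hunk above spells out why, on PREEMPT_RT, __native_flush_tlb() now brackets the cr3 reload with preempt_disable()/preempt_enable(): if a lazy-TLB task is preempted between reading and rewriting cr3, its ->active_mm can change underneath it and the write would reinstall a stale page-table root. The skeleton of that pattern, reduced to a runnable userspace stand-in (fake_cr3 models %cr3, the preempt macros are no-ops here, and flush_tlb_local() is a hypothetical name, not the patched function), looks like this:

/*
 * Skeleton of the preemption-safe TLB flush added above; everything in
 * this sketch is a userspace stand-in for illustration only.
 */
#include <stdio.h>

static unsigned long fake_cr3 = 0x1000;	/* pretend page-table root */

#define preempt_disable()	do { } while (0)	/* kernel: really disables preemption */
#define preempt_enable()	do { } while (0)

static unsigned long read_cr3(void)	{ return fake_cr3; }	/* stand-in */
static void write_cr3(unsigned long v)	{ fake_cr3 = v; }	/* stand-in */

static void flush_tlb_local(void)
{
	preempt_disable();
	/*
	 * Between the read and the write the task must not be preempted or
	 * migrated: a lazy-TLB task preempted here could see ->active_mm
	 * change, and the write below would then reinstall a stale root.
	 */
	write_cr3(read_cr3());
	preempt_enable();
}

int main(void)
{
	flush_tlb_local();
	printf("cr3 reloaded: %#lx\n", fake_cr3);
	return 0;
}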
*/ +-extern cpumask_t *node_to_cpumask_map; +- + /* Mappings between logical cpu number and node number */ + DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); + + /* Returns the number of the current Node. */ +-#define numa_node_id() read_pda(nodenumber) ++DECLARE_PER_CPU(int, node_number); ++#define numa_node_id() percpu_read(node_number) + + #ifdef CONFIG_DEBUG_PER_CPU_MAPS + extern int cpu_to_node(int cpu); + extern int early_cpu_to_node(int cpu); +-extern const cpumask_t *cpumask_of_node(int node); +-extern cpumask_t node_to_cpumask(int node); + + #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +@@ -102,37 +78,27 @@ static inline int cpu_to_node(int cpu) + /* Same function but used if called before per_cpu areas are setup */ + static inline int early_cpu_to_node(int cpu) + { +- if (early_per_cpu_ptr(x86_cpu_to_node_map)) +- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; +- +- return per_cpu(x86_cpu_to_node_map, cpu); ++ return early_per_cpu(x86_cpu_to_node_map, cpu); + } + +-/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ +-static inline const cpumask_t *cpumask_of_node(int node) +-{ +- return &node_to_cpumask_map[node]; +-} ++#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +-/* Returns a bitmask of CPUs on Node 'node'. */ +-static inline cpumask_t node_to_cpumask(int node) ++#endif /* CONFIG_X86_64 */ ++ ++/* Mappings between node number and cpus on that node. */ ++extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; ++ ++#ifdef CONFIG_DEBUG_PER_CPU_MAPS ++extern const struct cpumask *cpumask_of_node(int node); ++#else ++/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ ++static inline const struct cpumask *cpumask_of_node(int node) + { + return node_to_cpumask_map[node]; + } ++#endif + +-#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ +- +-/* +- * Replace default node_to_cpumask_ptr with optimized version +- * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" +- */ +-#define node_to_cpumask_ptr(v, node) \ +- const cpumask_t *v = cpumask_of_node(node) +- +-#define node_to_cpumask_ptr_next(v, node) \ +- v = cpumask_of_node(node) +- +-#endif /* CONFIG_X86_64 */ ++extern void setup_node_to_cpumask_map(void); + + /* + * Returns the number of the node containing Node 'node'. 
This +@@ -141,7 +107,6 @@ static inline cpumask_t node_to_cpumask( + #define parent_node(node) (node) + + #define pcibus_to_node(bus) __pcibus_to_node(bus) +-#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus) + + #ifdef CONFIG_X86_32 + extern unsigned long node_start_pfn[]; +@@ -192,32 +157,32 @@ extern int __node_distance(int, int); + + #else /* !CONFIG_NUMA */ + +-#define numa_node_id() 0 +-#define cpu_to_node(cpu) 0 +-#define early_cpu_to_node(cpu) 0 ++static inline int numa_node_id(void) ++{ ++ return 0; ++} ++ ++static inline int cpu_to_node(int cpu) ++{ ++ return 0; ++} + +-static inline const cpumask_t *cpumask_of_node(int node) ++static inline int early_cpu_to_node(int cpu) + { +- return &cpu_online_map; ++ return 0; + } +-static inline cpumask_t node_to_cpumask(int node) ++ ++static inline const struct cpumask *cpumask_of_node(int node) + { +- return cpu_online_map; ++ return cpu_online_mask; + } + static inline int node_to_first_cpu(int node) + { +- return first_cpu(cpu_online_map); ++ return cpumask_first(cpu_online_mask); + } + +-/* +- * Replace default node_to_cpumask_ptr with optimized version +- * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" +- */ +-#define node_to_cpumask_ptr(v, node) \ +- const cpumask_t *v = cpumask_of_node(node) ++static inline void setup_node_to_cpumask_map(void) { } + +-#define node_to_cpumask_ptr_next(v, node) \ +- v = cpumask_of_node(node) + #endif + + #include +@@ -230,16 +195,13 @@ static inline int node_to_first_cpu(int + } + #endif + +-extern cpumask_t cpu_coregroup_map(int cpu); + extern const struct cpumask *cpu_coregroup_mask(int cpu); + + #ifdef ENABLE_TOPO_DEFINES + #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) + #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +-#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) +-#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) +-#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) +-#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) ++#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) ++#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) + + /* indicates that pointers to the topology cpumask_t maps are valid */ + #define arch_provides_topology_pointers yes +@@ -253,7 +215,7 @@ struct pci_bus; + void set_pci_bus_resources_arch_default(struct pci_bus *b); + + #ifdef CONFIG_SMP +-#define mc_capable() (cpus_weight(per_cpu(cpu_core_map, 0)) != nr_cpu_ids) ++#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) + #define smt_capable() (smp_num_siblings > 1) + #endif + +Index: linux-2.6-tip/arch/x86/include/asm/trampoline.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/trampoline.h ++++ linux-2.6-tip/arch/x86/include/asm/trampoline.h +@@ -13,6 +13,7 @@ extern unsigned char *trampoline_base; + + extern unsigned long init_rsp; + extern unsigned long initial_code; ++extern unsigned long initial_gs; + + #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) + #define TRAMPOLINE_BASE 0x6000 +Index: linux-2.6-tip/arch/x86/include/asm/traps.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/traps.h ++++ linux-2.6-tip/arch/x86/include/asm/traps.h +@@ -41,7 +41,7 @@ dotraplinkage void do_int3(struct pt_reg + dotraplinkage void do_overflow(struct pt_regs *, long); + dotraplinkage void 
do_bounds(struct pt_regs *, long); + dotraplinkage void do_invalid_op(struct pt_regs *, long); +-dotraplinkage void do_device_not_available(struct pt_regs); ++dotraplinkage void do_device_not_available(struct pt_regs *, long); + dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); + dotraplinkage void do_invalid_TSS(struct pt_regs *, long); + dotraplinkage void do_segment_not_present(struct pt_regs *, long); +Index: linux-2.6-tip/arch/x86/include/asm/uaccess.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/uaccess.h ++++ linux-2.6-tip/arch/x86/include/asm/uaccess.h +@@ -121,7 +121,7 @@ extern int __get_user_bad(void); + + #define __get_user_x(size, ret, x, ptr) \ + asm volatile("call __get_user_" #size \ +- : "=a" (ret),"=d" (x) \ ++ : "=a" (ret), "=d" (x) \ + : "0" (ptr)) \ + + /* Careful: we have to cast the result to the type of the pointer +@@ -181,12 +181,12 @@ extern int __get_user_bad(void); + + #define __put_user_x(size, x, ptr, __ret_pu) \ + asm volatile("call __put_user_" #size : "=a" (__ret_pu) \ +- :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") ++ : "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") + + + + #ifdef CONFIG_X86_32 +-#define __put_user_u64(x, addr, err) \ ++#define __put_user_asm_u64(x, addr, err, errret) \ + asm volatile("1: movl %%eax,0(%2)\n" \ + "2: movl %%edx,4(%2)\n" \ + "3:\n" \ +@@ -197,14 +197,24 @@ extern int __get_user_bad(void); + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ + : "=r" (err) \ +- : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err)) ++ : "A" (x), "r" (addr), "i" (errret), "0" (err)) ++ ++#define __put_user_asm_ex_u64(x, addr) \ ++ asm volatile("1: movl %%eax,0(%1)\n" \ ++ "2: movl %%edx,4(%1)\n" \ ++ "3:\n" \ ++ _ASM_EXTABLE(1b, 2b - 1b) \ ++ _ASM_EXTABLE(2b, 3b - 2b) \ ++ : : "A" (x), "r" (addr)) + + #define __put_user_x8(x, ptr, __ret_pu) \ + asm volatile("call __put_user_8" : "=a" (__ret_pu) \ + : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx") + #else +-#define __put_user_u64(x, ptr, retval) \ +- __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT) ++#define __put_user_asm_u64(x, ptr, retval, errret) \ ++ __put_user_asm(x, ptr, retval, "q", "", "Zr", errret) ++#define __put_user_asm_ex_u64(x, addr) \ ++ __put_user_asm_ex(x, addr, "q", "", "Zr") + #define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu) + #endif + +@@ -276,10 +286,32 @@ do { \ + __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \ + break; \ + case 4: \ +- __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\ ++ __put_user_asm(x, ptr, retval, "l", "k", "ir", errret); \ + break; \ + case 8: \ +- __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \ ++ __put_user_asm_u64((__typeof__(*ptr))(x), ptr, retval, \ ++ errret); \ ++ break; \ ++ default: \ ++ __put_user_bad(); \ ++ } \ ++} while (0) ++ ++#define __put_user_size_ex(x, ptr, size) \ ++do { \ ++ __chk_user_ptr(ptr); \ ++ switch (size) { \ ++ case 1: \ ++ __put_user_asm_ex(x, ptr, "b", "b", "iq"); \ ++ break; \ ++ case 2: \ ++ __put_user_asm_ex(x, ptr, "w", "w", "ir"); \ ++ break; \ ++ case 4: \ ++ __put_user_asm_ex(x, ptr, "l", "k", "ir"); \ ++ break; \ ++ case 8: \ ++ __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \ + break; \ + default: \ + __put_user_bad(); \ +@@ -311,9 +343,12 @@ do { \ + + #ifdef CONFIG_X86_32 + #define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad() ++#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() + #else + #define __get_user_asm_u64(x, ptr, retval, 
errret) \ + __get_user_asm(x, ptr, retval, "q", "", "=r", errret) ++#define __get_user_asm_ex_u64(x, ptr) \ ++ __get_user_asm_ex(x, ptr, "q", "", "=r") + #endif + + #define __get_user_size(x, ptr, size, retval, errret) \ +@@ -350,6 +385,33 @@ do { \ + : "=r" (err), ltype(x) \ + : "m" (__m(addr)), "i" (errret), "0" (err)) + ++#define __get_user_size_ex(x, ptr, size) \ ++do { \ ++ __chk_user_ptr(ptr); \ ++ switch (size) { \ ++ case 1: \ ++ __get_user_asm_ex(x, ptr, "b", "b", "=q"); \ ++ break; \ ++ case 2: \ ++ __get_user_asm_ex(x, ptr, "w", "w", "=r"); \ ++ break; \ ++ case 4: \ ++ __get_user_asm_ex(x, ptr, "l", "k", "=r"); \ ++ break; \ ++ case 8: \ ++ __get_user_asm_ex_u64(x, ptr); \ ++ break; \ ++ default: \ ++ (x) = __get_user_bad(); \ ++ } \ ++} while (0) ++ ++#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ ++ asm volatile("1: mov"itype" %1,%"rtype"0\n" \ ++ "2:\n" \ ++ _ASM_EXTABLE(1b, 2b - 1b) \ ++ : ltype(x) : "m" (__m(addr))) ++ + #define __put_user_nocheck(x, ptr, size) \ + ({ \ + int __pu_err; \ +@@ -385,6 +447,26 @@ struct __large_struct { unsigned long bu + _ASM_EXTABLE(1b, 3b) \ + : "=r"(err) \ + : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err)) ++ ++#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ ++ asm volatile("1: mov"itype" %"rtype"0,%1\n" \ ++ "2:\n" \ ++ _ASM_EXTABLE(1b, 2b - 1b) \ ++ : : ltype(x), "m" (__m(addr))) ++ ++/* ++ * uaccess_try and catch ++ */ ++#define uaccess_try do { \ ++ int prev_err = current_thread_info()->uaccess_err; \ ++ current_thread_info()->uaccess_err = 0; \ ++ barrier(); ++ ++#define uaccess_catch(err) \ ++ (err) |= current_thread_info()->uaccess_err; \ ++ current_thread_info()->uaccess_err = prev_err; \ ++} while (0) ++ + /** + * __get_user: - Get a simple variable from user space, with less checking. + * @x: Variable to store result. +@@ -408,6 +490,7 @@ struct __large_struct { unsigned long bu + + #define __get_user(x, ptr) \ + __get_user_nocheck((x), (ptr), sizeof(*(ptr))) ++ + /** + * __put_user: - Write a simple value into user space, with less checking. + * @x: Value to copy to user space. 
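The uaccess_try/uaccess_catch macros above, together with the new __get_user_asm_ex()/__put_user_asm_ex() variants, let a run of user accesses skip per-access error propagation: a faulting access simply resumes at the next instruction, the fault is noted in the uaccess_err field added to thread_info earlier in this patch, and the accumulated result is inspected once at the catch. The next hunk builds get_user_try/get_user_ex on top of this. A plain-C sketch of the same shape follows; copy_field(), copy_three() and thread_err are illustrative stand-ins, not kernel interfaces.

/*
 * Plain-C sketch of the error-accumulation idea behind uaccess_try /
 * uaccess_catch: individual accesses set a per-thread flag on failure
 * instead of returning an error, and the flag is checked once at the end.
 */
#include <stdio.h>

static int thread_err;			/* models current_thread_info()->uaccess_err */

static void copy_field(int *dst, const int *src)
{
	if (!src) {			/* models a faulting user access */
		thread_err = 1;
		return;
	}
	*dst = *src;
}

static int copy_three(int *a, int *b, int *c,
		      const int *ua, const int *ub, const int *uc)
{
	int err, prev = thread_err;

	thread_err = 0;			/* uaccess_try */
	copy_field(a, ua);		/* get_user_ex()-style accesses ... */
	copy_field(b, ub);
	copy_field(c, uc);
	err = thread_err;		/* uaccess_catch(err) */
	thread_err = prev;
	return err ? -14 /* -EFAULT */ : 0;
}

int main(void)
{
	int x = 1, y = 2, z = 3, a, b, c;

	printf("all copies ok:   %d\n", copy_three(&a, &b, &c, &x, &y, &z));
	printf("one copy faults: %d\n", copy_three(&a, &b, &c, &x, NULL, &z));
	return 0;
}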
+@@ -435,6 +518,45 @@ struct __large_struct { unsigned long bu + #define __put_user_unaligned __put_user + + /* ++ * {get|put}_user_try and catch ++ * ++ * get_user_try { ++ * get_user_ex(...); ++ * } get_user_catch(err) ++ */ ++#define get_user_try uaccess_try ++#define get_user_catch(err) uaccess_catch(err) ++ ++#define get_user_ex(x, ptr) do { \ ++ unsigned long __gue_val; \ ++ __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \ ++ (x) = (__force __typeof__(*(ptr)))__gue_val; \ ++} while (0) ++ ++#ifdef CONFIG_X86_WP_WORKS_OK ++ ++#define put_user_try uaccess_try ++#define put_user_catch(err) uaccess_catch(err) ++ ++#define put_user_ex(x, ptr) \ ++ __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) ++ ++#else /* !CONFIG_X86_WP_WORKS_OK */ ++ ++#define put_user_try do { \ ++ int __uaccess_err = 0; ++ ++#define put_user_catch(err) \ ++ (err) |= __uaccess_err; \ ++} while (0) ++ ++#define put_user_ex(x, ptr) do { \ ++ __uaccess_err |= __put_user(x, ptr); \ ++} while (0) ++ ++#endif /* CONFIG_X86_WP_WORKS_OK */ ++ ++/* + * movsl can be slow when source and dest are not both 8-byte aligned + */ + #ifdef CONFIG_X86_INTEL_USERCOPY +Index: linux-2.6-tip/arch/x86/include/asm/uaccess_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/uaccess_64.h ++++ linux-2.6-tip/arch/x86/include/asm/uaccess_64.h +@@ -188,16 +188,16 @@ __copy_to_user_inatomic(void __user *dst + extern long __copy_user_nocache(void *dst, const void __user *src, + unsigned size, int zerorest); + +-static inline int __copy_from_user_nocache(void *dst, const void __user *src, +- unsigned size) ++static inline int ++__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) + { + might_sleep(); + return __copy_user_nocache(dst, src, size, 1); + } + +-static inline int __copy_from_user_inatomic_nocache(void *dst, +- const void __user *src, +- unsigned size) ++static inline int ++__copy_from_user_inatomic_nocache(void *dst, const void __user *src, ++ unsigned size) + { + return __copy_user_nocache(dst, src, size, 0); + } +Index: linux-2.6-tip/arch/x86/include/asm/unistd_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/unistd_32.h ++++ linux-2.6-tip/arch/x86/include/asm/unistd_32.h +@@ -338,6 +338,10 @@ + #define __NR_dup3 330 + #define __NR_pipe2 331 + #define __NR_inotify_init1 332 ++#define __NR_preadv 333 ++#define __NR_pwritev 334 ++#define __NR_rt_tgsigqueueinfo 335 ++#define __NR_perf_counter_open 336 + + #ifdef __KERNEL__ + +Index: linux-2.6-tip/arch/x86/include/asm/unistd_64.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/unistd_64.h ++++ linux-2.6-tip/arch/x86/include/asm/unistd_64.h +@@ -653,7 +653,14 @@ __SYSCALL(__NR_dup3, sys_dup3) + __SYSCALL(__NR_pipe2, sys_pipe2) + #define __NR_inotify_init1 294 + __SYSCALL(__NR_inotify_init1, sys_inotify_init1) +- ++#define __NR_preadv 295 ++__SYSCALL(__NR_preadv, sys_ni_syscall) ++#define __NR_pwritev 296 ++__SYSCALL(__NR_pwritev, sys_ni_syscall) ++#define __NR_rt_tgsigqueueinfo 297 ++__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) ++#define __NR_perf_counter_open 298 ++__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) + + #ifndef __NO_STUBS + #define __ARCH_WANT_OLD_READDIR +Index: linux-2.6-tip/arch/x86/include/asm/uv/uv.h +=================================================================== +--- /dev/null ++++ 
linux-2.6-tip/arch/x86/include/asm/uv/uv.h +@@ -0,0 +1,33 @@ ++#ifndef _ASM_X86_UV_UV_H ++#define _ASM_X86_UV_UV_H ++ ++enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; ++ ++struct cpumask; ++struct mm_struct; ++ ++#ifdef CONFIG_X86_UV ++ ++extern enum uv_system_type get_uv_system_type(void); ++extern int is_uv_system(void); ++extern void uv_cpu_init(void); ++extern void uv_system_init(void); ++extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, ++ struct mm_struct *mm, ++ unsigned long va, ++ unsigned int cpu); ++ ++#else /* X86_UV */ ++ ++static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } ++static inline int is_uv_system(void) { return 0; } ++static inline void uv_cpu_init(void) { } ++static inline void uv_system_init(void) { } ++static inline const struct cpumask * ++uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, ++ unsigned long va, unsigned int cpu) ++{ return cpumask; } ++ ++#endif /* X86_UV */ ++ ++#endif /* _ASM_X86_UV_UV_H */ +Index: linux-2.6-tip/arch/x86/include/asm/uv/uv_bau.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/uv/uv_bau.h ++++ linux-2.6-tip/arch/x86/include/asm/uv/uv_bau.h +@@ -325,7 +325,6 @@ static inline void bau_cpubits_clear(str + #define cpubit_isset(cpu, bau_local_cpumask) \ + test_bit((cpu), (bau_local_cpumask).bits) + +-extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long); + extern void uv_bau_message_intr1(void); + extern void uv_bau_timeout_intr1(void); + +Index: linux-2.6-tip/arch/x86/include/asm/uv/uv_hub.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/uv/uv_hub.h ++++ linux-2.6-tip/arch/x86/include/asm/uv/uv_hub.h +@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __ + #define SCIR_CPU_ACTIVITY 0x02 /* not idle */ + #define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ + ++/* Loop through all installed blades */ ++#define for_each_possible_blade(bid) \ ++ for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++) ++ + /* + * Macros for converting between kernel virtual addresses, socket local physical + * addresses, and UV global physical addresses. +Index: linux-2.6-tip/arch/x86/include/asm/vic.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/vic.h ++++ /dev/null +@@ -1,61 +0,0 @@ +-/* Copyright (C) 1999,2001 +- * +- * Author: J.E.J.Bottomley@HansenPartnership.com +- * +- * Standard include definitions for the NCR Voyager Interrupt Controller */ +- +-/* The eight CPI vectors. To activate a CPI, you write a bit mask +- * corresponding to the processor set to be interrupted into the +- * relevant register. 
That set of CPUs will then be interrupted with +- * the CPI */ +-static const int VIC_CPI_Registers[] = +- {0xFC00, 0xFC01, 0xFC08, 0xFC09, +- 0xFC10, 0xFC11, 0xFC18, 0xFC19 }; +- +-#define VIC_PROC_WHO_AM_I 0xfc29 +-# define QUAD_IDENTIFIER 0xC0 +-# define EIGHT_SLOT_IDENTIFIER 0xE0 +-#define QIC_EXTENDED_PROCESSOR_SELECT 0xFC72 +-#define VIC_CPI_BASE_REGISTER 0xFC41 +-#define VIC_PROCESSOR_ID 0xFC21 +-# define VIC_CPU_MASQUERADE_ENABLE 0x8 +- +-#define VIC_CLAIM_REGISTER_0 0xFC38 +-#define VIC_CLAIM_REGISTER_1 0xFC39 +-#define VIC_REDIRECT_REGISTER_0 0xFC60 +-#define VIC_REDIRECT_REGISTER_1 0xFC61 +-#define VIC_PRIORITY_REGISTER 0xFC20 +- +-#define VIC_PRIMARY_MC_BASE 0xFC48 +-#define VIC_SECONDARY_MC_BASE 0xFC49 +- +-#define QIC_PROCESSOR_ID 0xFC71 +-# define QIC_CPUID_ENABLE 0x08 +- +-#define QIC_VIC_CPI_BASE_REGISTER 0xFC79 +-#define QIC_CPI_BASE_REGISTER 0xFC7A +- +-#define QIC_MASK_REGISTER0 0xFC80 +-/* NOTE: these are masked high, enabled low */ +-# define QIC_PERF_TIMER 0x01 +-# define QIC_LPE 0x02 +-# define QIC_SYS_INT 0x04 +-# define QIC_CMN_INT 0x08 +-/* at the moment, just enable CMN_INT, disable SYS_INT */ +-# define QIC_DEFAULT_MASK0 (~(QIC_CMN_INT /* | VIC_SYS_INT */)) +-#define QIC_MASK_REGISTER1 0xFC81 +-# define QIC_BOOT_CPI_MASK 0xFE +-/* Enable CPI's 1-6 inclusive */ +-# define QIC_CPI_ENABLE 0x81 +- +-#define QIC_INTERRUPT_CLEAR0 0xFC8A +-#define QIC_INTERRUPT_CLEAR1 0xFC8B +- +-/* this is where we place the CPI vectors */ +-#define VIC_DEFAULT_CPI_BASE 0xC0 +-/* this is where we place the QIC CPI vectors */ +-#define QIC_DEFAULT_CPI_BASE 0xD0 +- +-#define VIC_BOOT_INTERRUPT_MASK 0xfe +- +-extern void smp_vic_timer_interrupt(void); +Index: linux-2.6-tip/arch/x86/include/asm/voyager.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/voyager.h ++++ /dev/null +@@ -1,529 +0,0 @@ +-/* Copyright (C) 1999,2001 +- * +- * Author: J.E.J.Bottomley@HansenPartnership.com +- * +- * Standard include definitions for the NCR Voyager system */ +- +-#undef VOYAGER_DEBUG +-#undef VOYAGER_CAT_DEBUG +- +-#ifdef VOYAGER_DEBUG +-#define VDEBUG(x) printk x +-#else +-#define VDEBUG(x) +-#endif +- +-/* There are three levels of voyager machine: 3,4 and 5. The rule is +- * if it's less than 3435 it's a Level 3 except for a 3360 which is +- * a level 4. 
A 3435 or above is a Level 5 */ +-#define VOYAGER_LEVEL5_AND_ABOVE 0x3435 +-#define VOYAGER_LEVEL4 0x3360 +- +-/* The L4 DINO ASIC */ +-#define VOYAGER_DINO 0x43 +- +-/* voyager ports in standard I/O space */ +-#define VOYAGER_MC_SETUP 0x96 +- +- +-#define VOYAGER_CAT_CONFIG_PORT 0x97 +-# define VOYAGER_CAT_DESELECT 0xff +-#define VOYAGER_SSPB_RELOCATION_PORT 0x98 +- +-/* Valid CAT controller commands */ +-/* start instruction register cycle */ +-#define VOYAGER_CAT_IRCYC 0x01 +-/* start data register cycle */ +-#define VOYAGER_CAT_DRCYC 0x02 +-/* move to execute state */ +-#define VOYAGER_CAT_RUN 0x0F +-/* end operation */ +-#define VOYAGER_CAT_END 0x80 +-/* hold in idle state */ +-#define VOYAGER_CAT_HOLD 0x90 +-/* single step an "intest" vector */ +-#define VOYAGER_CAT_STEP 0xE0 +-/* return cat controller to CLEMSON mode */ +-#define VOYAGER_CAT_CLEMSON 0xFF +- +-/* the default cat command header */ +-#define VOYAGER_CAT_HEADER 0x7F +- +-/* the range of possible CAT module ids in the system */ +-#define VOYAGER_MIN_MODULE 0x10 +-#define VOYAGER_MAX_MODULE 0x1f +- +-/* The voyager registers per asic */ +-#define VOYAGER_ASIC_ID_REG 0x00 +-#define VOYAGER_ASIC_TYPE_REG 0x01 +-/* the sub address registers can be made auto incrementing on reads */ +-#define VOYAGER_AUTO_INC_REG 0x02 +-# define VOYAGER_AUTO_INC 0x04 +-# define VOYAGER_NO_AUTO_INC 0xfb +-#define VOYAGER_SUBADDRDATA 0x03 +-#define VOYAGER_SCANPATH 0x05 +-# define VOYAGER_CONNECT_ASIC 0x01 +-# define VOYAGER_DISCONNECT_ASIC 0xfe +-#define VOYAGER_SUBADDRLO 0x06 +-#define VOYAGER_SUBADDRHI 0x07 +-#define VOYAGER_SUBMODSELECT 0x08 +-#define VOYAGER_SUBMODPRESENT 0x09 +- +-#define VOYAGER_SUBADDR_LO 0xff +-#define VOYAGER_SUBADDR_HI 0xffff +- +-/* the maximum size of a scan path -- used to form instructions */ +-#define VOYAGER_MAX_SCAN_PATH 0x100 +-/* the biggest possible register size (in bytes) */ +-#define VOYAGER_MAX_REG_SIZE 4 +- +-/* Total number of possible modules (including submodules) */ +-#define VOYAGER_MAX_MODULES 16 +-/* Largest number of asics per module */ +-#define VOYAGER_MAX_ASICS_PER_MODULE 7 +- +-/* the CAT asic of each module is always the first one */ +-#define VOYAGER_CAT_ID 0 +-#define VOYAGER_PSI 0x1a +- +-/* voyager instruction operations and registers */ +-#define VOYAGER_READ_CONFIG 0x1 +-#define VOYAGER_WRITE_CONFIG 0x2 +-#define VOYAGER_BYPASS 0xff +- +-typedef struct voyager_asic { +- __u8 asic_addr; /* ASIC address; Level 4 */ +- __u8 asic_type; /* ASIC type */ +- __u8 asic_id; /* ASIC id */ +- __u8 jtag_id[4]; /* JTAG id */ +- __u8 asic_location; /* Location within scan path; start w/ 0 */ +- __u8 bit_location; /* Location within bit stream; start w/ 0 */ +- __u8 ireg_length; /* Instruction register length */ +- __u16 subaddr; /* Amount of sub address space */ +- struct voyager_asic *next; /* Next asic in linked list */ +-} voyager_asic_t; +- +-typedef struct voyager_module { +- __u8 module_addr; /* Module address */ +- __u8 scan_path_connected; /* Scan path connected */ +- __u16 ee_size; /* Size of the EEPROM */ +- __u16 num_asics; /* Number of Asics */ +- __u16 inst_bits; /* Instruction bits in the scan path */ +- __u16 largest_reg; /* Largest register in the scan path */ +- __u16 smallest_reg; /* Smallest register in the scan path */ +- voyager_asic_t *asic; /* First ASIC in scan path (CAT_I) */ +- struct voyager_module *submodule; /* Submodule pointer */ +- struct voyager_module *next; /* Next module in linked list */ +-} voyager_module_t; +- +-typedef struct voyager_eeprom_hdr { +- __u8 
module_id[4]; +- __u8 version_id; +- __u8 config_id; +- __u16 boundry_id; /* boundary scan id */ +- __u16 ee_size; /* size of EEPROM */ +- __u8 assembly[11]; /* assembly # */ +- __u8 assembly_rev; /* assembly rev */ +- __u8 tracer[4]; /* tracer number */ +- __u16 assembly_cksum; /* asm checksum */ +- __u16 power_consump; /* pwr requirements */ +- __u16 num_asics; /* number of asics */ +- __u16 bist_time; /* min. bist time */ +- __u16 err_log_offset; /* error log offset */ +- __u16 scan_path_offset;/* scan path offset */ +- __u16 cct_offset; +- __u16 log_length; /* length of err log */ +- __u16 xsum_end; /* offset to end of +- checksum */ +- __u8 reserved[4]; +- __u8 sflag; /* starting sentinal */ +- __u8 part_number[13]; /* prom part number */ +- __u8 version[10]; /* version number */ +- __u8 signature[8]; +- __u16 eeprom_chksum; +- __u32 data_stamp_offset; +- __u8 eflag ; /* ending sentinal */ +-} __attribute__((packed)) voyager_eprom_hdr_t; +- +- +- +-#define VOYAGER_EPROM_SIZE_OFFSET \ +- ((__u16)(&(((voyager_eprom_hdr_t *)0)->ee_size))) +-#define VOYAGER_XSUM_END_OFFSET 0x2a +- +-/* the following three definitions are for internal table layouts +- * in the module EPROMs. We really only care about the IDs and +- * offsets */ +-typedef struct voyager_sp_table { +- __u8 asic_id; +- __u8 bypass_flag; +- __u16 asic_data_offset; +- __u16 config_data_offset; +-} __attribute__((packed)) voyager_sp_table_t; +- +-typedef struct voyager_jtag_table { +- __u8 icode[4]; +- __u8 runbist[4]; +- __u8 intest[4]; +- __u8 samp_preld[4]; +- __u8 ireg_len; +-} __attribute__((packed)) voyager_jtt_t; +- +-typedef struct voyager_asic_data_table { +- __u8 jtag_id[4]; +- __u16 length_bsr; +- __u16 length_bist_reg; +- __u32 bist_clk; +- __u16 subaddr_bits; +- __u16 seed_bits; +- __u16 sig_bits; +- __u16 jtag_offset; +-} __attribute__((packed)) voyager_at_t; +- +-/* Voyager Interrupt Controller (VIC) registers */ +- +-/* Base to add to Cross Processor Interrupts (CPIs) when triggering +- * the CPU IRQ line */ +-/* register defines for the WCBICs (one per processor) */ +-#define VOYAGER_WCBIC0 0x41 /* bus A node P1 processor 0 */ +-#define VOYAGER_WCBIC1 0x49 /* bus A node P1 processor 1 */ +-#define VOYAGER_WCBIC2 0x51 /* bus A node P2 processor 0 */ +-#define VOYAGER_WCBIC3 0x59 /* bus A node P2 processor 1 */ +-#define VOYAGER_WCBIC4 0x61 /* bus B node P1 processor 0 */ +-#define VOYAGER_WCBIC5 0x69 /* bus B node P1 processor 1 */ +-#define VOYAGER_WCBIC6 0x71 /* bus B node P2 processor 0 */ +-#define VOYAGER_WCBIC7 0x79 /* bus B node P2 processor 1 */ +- +- +-/* top of memory registers */ +-#define VOYAGER_WCBIC_TOM_L 0x4 +-#define VOYAGER_WCBIC_TOM_H 0x5 +- +-/* register defines for Voyager Memory Contol (VMC) +- * these are present on L4 machines only */ +-#define VOYAGER_VMC1 0x81 +-#define VOYAGER_VMC2 0x91 +-#define VOYAGER_VMC3 0xa1 +-#define VOYAGER_VMC4 0xb1 +- +-/* VMC Ports */ +-#define VOYAGER_VMC_MEMORY_SETUP 0x9 +-# define VMC_Interleaving 0x01 +-# define VMC_4Way 0x02 +-# define VMC_EvenCacheLines 0x04 +-# define VMC_HighLine 0x08 +-# define VMC_Start0_Enable 0x20 +-# define VMC_Start1_Enable 0x40 +-# define VMC_Vremap 0x80 +-#define VOYAGER_VMC_BANK_DENSITY 0xa +-# define VMC_BANK_EMPTY 0 +-# define VMC_BANK_4MB 1 +-# define VMC_BANK_16MB 2 +-# define VMC_BANK_64MB 3 +-# define VMC_BANK0_MASK 0x03 +-# define VMC_BANK1_MASK 0x0C +-# define VMC_BANK2_MASK 0x30 +-# define VMC_BANK3_MASK 0xC0 +- +-/* Magellan Memory Controller (MMC) defines - present on L5 */ +-#define VOYAGER_MMC_ASIC_ID 1 +-/* the 
two memory modules corresponding to memory cards in the system */ +-#define VOYAGER_MMC_MEMORY0_MODULE 0x14 +-#define VOYAGER_MMC_MEMORY1_MODULE 0x15 +-/* the Magellan Memory Address (MMA) defines */ +-#define VOYAGER_MMA_ASIC_ID 2 +- +-/* Submodule number for the Quad Baseboard */ +-#define VOYAGER_QUAD_BASEBOARD 1 +- +-/* ASIC defines for the Quad Baseboard */ +-#define VOYAGER_QUAD_QDATA0 1 +-#define VOYAGER_QUAD_QDATA1 2 +-#define VOYAGER_QUAD_QABC 3 +- +-/* Useful areas in extended CMOS */ +-#define VOYAGER_PROCESSOR_PRESENT_MASK 0x88a +-#define VOYAGER_MEMORY_CLICKMAP 0xa23 +-#define VOYAGER_DUMP_LOCATION 0xb1a +- +-/* SUS In Control bit - used to tell SUS that we don't need to be +- * babysat anymore */ +-#define VOYAGER_SUS_IN_CONTROL_PORT 0x3ff +-# define VOYAGER_IN_CONTROL_FLAG 0x80 +- +-/* Voyager PSI defines */ +-#define VOYAGER_PSI_STATUS_REG 0x08 +-# define PSI_DC_FAIL 0x01 +-# define PSI_MON 0x02 +-# define PSI_FAULT 0x04 +-# define PSI_ALARM 0x08 +-# define PSI_CURRENT 0x10 +-# define PSI_DVM 0x20 +-# define PSI_PSCFAULT 0x40 +-# define PSI_STAT_CHG 0x80 +- +-#define VOYAGER_PSI_SUPPLY_REG 0x8000 +- /* read */ +-# define PSI_FAIL_DC 0x01 +-# define PSI_FAIL_AC 0x02 +-# define PSI_MON_INT 0x04 +-# define PSI_SWITCH_OFF 0x08 +-# define PSI_HX_OFF 0x10 +-# define PSI_SECURITY 0x20 +-# define PSI_CMOS_BATT_LOW 0x40 +-# define PSI_CMOS_BATT_FAIL 0x80 +- /* write */ +-# define PSI_CLR_SWITCH_OFF 0x13 +-# define PSI_CLR_HX_OFF 0x14 +-# define PSI_CLR_CMOS_BATT_FAIL 0x17 +- +-#define VOYAGER_PSI_MASK 0x8001 +-# define PSI_MASK_MASK 0x10 +- +-#define VOYAGER_PSI_AC_FAIL_REG 0x8004 +-#define AC_FAIL_STAT_CHANGE 0x80 +- +-#define VOYAGER_PSI_GENERAL_REG 0x8007 +- /* read */ +-# define PSI_SWITCH_ON 0x01 +-# define PSI_SWITCH_ENABLED 0x02 +-# define PSI_ALARM_ENABLED 0x08 +-# define PSI_SECURE_ENABLED 0x10 +-# define PSI_COLD_RESET 0x20 +-# define PSI_COLD_START 0x80 +- /* write */ +-# define PSI_POWER_DOWN 0x10 +-# define PSI_SWITCH_DISABLE 0x01 +-# define PSI_SWITCH_ENABLE 0x11 +-# define PSI_CLEAR 0x12 +-# define PSI_ALARM_DISABLE 0x03 +-# define PSI_ALARM_ENABLE 0x13 +-# define PSI_CLEAR_COLD_RESET 0x05 +-# define PSI_SET_COLD_RESET 0x15 +-# define PSI_CLEAR_COLD_START 0x07 +-# define PSI_SET_COLD_START 0x17 +- +- +- +-struct voyager_bios_info { +- __u8 len; +- __u8 major; +- __u8 minor; +- __u8 debug; +- __u8 num_classes; +- __u8 class_1; +- __u8 class_2; +-}; +- +-/* The following structures and definitions are for the Kernel/SUS +- * interface these are needed to find out how SUS initialised any Quad +- * boards in the system */ +- +-#define NUMBER_OF_MC_BUSSES 2 +-#define SLOTS_PER_MC_BUS 8 +-#define MAX_CPUS 16 /* 16 way CPU system */ +-#define MAX_PROCESSOR_BOARDS 4 /* 4 processor slot system */ +-#define MAX_CACHE_LEVELS 4 /* # of cache levels supported */ +-#define MAX_SHARED_CPUS 4 /* # of CPUs that can share a LARC */ +-#define NUMBER_OF_POS_REGS 8 +- +-typedef struct { +- __u8 MC_Slot; +- __u8 POS_Values[NUMBER_OF_POS_REGS]; +-} __attribute__((packed)) MC_SlotInformation_t; +- +-struct QuadDescription { +- __u8 Type; /* for type 0 (DYADIC or MONADIC) all fields +- * will be zero except for slot */ +- __u8 StructureVersion; +- __u32 CPI_BaseAddress; +- __u32 LARC_BankSize; +- __u32 LocalMemoryStateBits; +- __u8 Slot; /* Processor slots 1 - 4 */ +-} __attribute__((packed)); +- +-struct ProcBoardInfo { +- __u8 Type; +- __u8 StructureVersion; +- __u8 NumberOfBoards; +- struct QuadDescription QuadData[MAX_PROCESSOR_BOARDS]; +-} __attribute__((packed)); +- +-struct 
CacheDescription { +- __u8 Level; +- __u32 TotalSize; +- __u16 LineSize; +- __u8 Associativity; +- __u8 CacheType; +- __u8 WriteType; +- __u8 Number_CPUs_SharedBy; +- __u8 Shared_CPUs_Hardware_IDs[MAX_SHARED_CPUS]; +- +-} __attribute__((packed)); +- +-struct CPU_Description { +- __u8 CPU_HardwareId; +- char *FRU_String; +- __u8 NumberOfCacheLevels; +- struct CacheDescription CacheLevelData[MAX_CACHE_LEVELS]; +-} __attribute__((packed)); +- +-struct CPU_Info { +- __u8 Type; +- __u8 StructureVersion; +- __u8 NumberOf_CPUs; +- struct CPU_Description CPU_Data[MAX_CPUS]; +-} __attribute__((packed)); +- +- +-/* +- * This structure will be used by SUS and the OS. +- * The assumption about this structure is that no blank space is +- * packed in it by our friend the compiler. +- */ +-typedef struct { +- __u8 Mailbox_SUS; /* Written to by SUS to give +- commands/response to the OS */ +- __u8 Mailbox_OS; /* Written to by the OS to give +- commands/response to SUS */ +- __u8 SUS_MailboxVersion; /* Tells the OS which iteration of the +- interface SUS supports */ +- __u8 OS_MailboxVersion; /* Tells SUS which iteration of the +- interface the OS supports */ +- __u32 OS_Flags; /* Flags set by the OS as info for +- SUS */ +- __u32 SUS_Flags; /* Flags set by SUS as info +- for the OS */ +- __u32 WatchDogPeriod; /* Watchdog period (in seconds) which +- the DP uses to see if the OS +- is dead */ +- __u32 WatchDogCount; /* Updated by the OS on every tic. */ +- __u32 MemoryFor_SUS_ErrorLog; /* Flat 32 bit address which tells SUS +- where to stuff the SUS error log +- on a dump */ +- MC_SlotInformation_t MC_SlotInfo[NUMBER_OF_MC_BUSSES*SLOTS_PER_MC_BUS]; +- /* Storage for MCA POS data */ +- /* All new SECOND_PASS_INTERFACE fields added from this point */ +- struct ProcBoardInfo *BoardData; +- struct CPU_Info *CPU_Data; +- /* All new fields must be added from this point */ +-} Voyager_KernelSUS_Mbox_t; +- +-/* structure for finding the right memory address to send a QIC CPI to */ +-struct voyager_qic_cpi { +- /* Each cache line (32 bytes) can trigger a cpi. 
The cpi +- * read/write may occur anywhere in the cache line---pick the +- * middle to be safe */ +- struct { +- __u32 pad1[3]; +- __u32 cpi; +- __u32 pad2[4]; +- } qic_cpi[8]; +-}; +- +-struct voyager_status { +- __u32 power_fail:1; +- __u32 switch_off:1; +- __u32 request_from_kernel:1; +-}; +- +-struct voyager_psi_regs { +- __u8 cat_id; +- __u8 cat_dev; +- __u8 cat_control; +- __u8 subaddr; +- __u8 dummy4; +- __u8 checkbit; +- __u8 subaddr_low; +- __u8 subaddr_high; +- __u8 intstatus; +- __u8 stat1; +- __u8 stat3; +- __u8 fault; +- __u8 tms; +- __u8 gen; +- __u8 sysconf; +- __u8 dummy15; +-}; +- +-struct voyager_psi_subregs { +- __u8 supply; +- __u8 mask; +- __u8 present; +- __u8 DCfail; +- __u8 ACfail; +- __u8 fail; +- __u8 UPSfail; +- __u8 genstatus; +-}; +- +-struct voyager_psi { +- struct voyager_psi_regs regs; +- struct voyager_psi_subregs subregs; +-}; +- +-struct voyager_SUS { +-#define VOYAGER_DUMP_BUTTON_NMI 0x1 +-#define VOYAGER_SUS_VALID 0x2 +-#define VOYAGER_SYSINT_COMPLETE 0x3 +- __u8 SUS_mbox; +-#define VOYAGER_NO_COMMAND 0x0 +-#define VOYAGER_IGNORE_DUMP 0x1 +-#define VOYAGER_DO_DUMP 0x2 +-#define VOYAGER_SYSINT_HANDSHAKE 0x3 +-#define VOYAGER_DO_MEM_DUMP 0x4 +-#define VOYAGER_SYSINT_WAS_RECOVERED 0x5 +- __u8 kernel_mbox; +-#define VOYAGER_MAILBOX_VERSION 0x10 +- __u8 SUS_version; +- __u8 kernel_version; +-#define VOYAGER_OS_HAS_SYSINT 0x1 +-#define VOYAGER_OS_IN_PROGRESS 0x2 +-#define VOYAGER_UPDATING_WDPERIOD 0x4 +- __u32 kernel_flags; +-#define VOYAGER_SUS_BOOTING 0x1 +-#define VOYAGER_SUS_IN_PROGRESS 0x2 +- __u32 SUS_flags; +- __u32 watchdog_period; +- __u32 watchdog_count; +- __u32 SUS_errorlog; +- /* lots of system configuration stuff under here */ +-}; +- +-/* Variables exported by voyager_smp */ +-extern __u32 voyager_extended_vic_processors; +-extern __u32 voyager_allowed_boot_processors; +-extern __u32 voyager_quad_processors; +-extern struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS]; +-extern struct voyager_SUS *voyager_SUS; +- +-/* variables exported always */ +-extern struct task_struct *voyager_thread; +-extern int voyager_level; +-extern struct voyager_status voyager_status; +- +-/* functions exported by the voyager and voyager_smp modules */ +-extern int voyager_cat_readb(__u8 module, __u8 asic, int reg); +-extern void voyager_cat_init(void); +-extern void voyager_detect(struct voyager_bios_info *); +-extern void voyager_trap_init(void); +-extern void voyager_setup_irqs(void); +-extern int voyager_memory_detect(int region, __u32 *addr, __u32 *length); +-extern void voyager_smp_intr_init(void); +-extern __u8 voyager_extended_cmos_read(__u16 cmos_address); +-extern void voyager_smp_dump(void); +-extern void voyager_timer_interrupt(void); +-extern void smp_local_timer_interrupt(void); +-extern void voyager_power_off(void); +-extern void smp_voyager_power_off(void *dummy); +-extern void voyager_restart(void); +-extern void voyager_cat_power_off(void); +-extern void voyager_cat_do_common_interrupt(void); +-extern void voyager_handle_nmi(void); +-extern void voyager_smp_intr_init(void); +-/* Commands for the following are */ +-#define VOYAGER_PSI_READ 0 +-#define VOYAGER_PSI_WRITE 1 +-#define VOYAGER_PSI_SUBREAD 2 +-#define VOYAGER_PSI_SUBWRITE 3 +-extern void voyager_cat_psi(__u8, __u16, __u8 *); +Index: linux-2.6-tip/arch/x86/include/asm/xen/events.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/xen/events.h ++++ linux-2.6-tip/arch/x86/include/asm/xen/events.h +@@ -15,10 +15,4 @@ static 
inline int xen_irqs_disabled(stru + return raw_irqs_disabled_flags(regs->flags); + } + +-static inline void xen_do_IRQ(int irq, struct pt_regs *regs) +-{ +- regs->orig_ax = ~irq; +- do_IRQ(regs); +-} +- + #endif /* _ASM_X86_XEN_EVENTS_H */ +Index: linux-2.6-tip/arch/x86/include/asm/xen/hypercall.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/xen/hypercall.h ++++ linux-2.6-tip/arch/x86/include/asm/xen/hypercall.h +@@ -296,6 +296,8 @@ HYPERVISOR_get_debugreg(int reg) + static inline int + HYPERVISOR_update_descriptor(u64 ma, u64 desc) + { ++ if (sizeof(u64) == sizeof(long)) ++ return _hypercall2(int, update_descriptor, ma, desc); + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); + } + +Index: linux-2.6-tip/arch/x86/include/asm/xen/hypervisor.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/xen/hypervisor.h ++++ linux-2.6-tip/arch/x86/include/asm/xen/hypervisor.h +@@ -38,22 +38,30 @@ extern struct shared_info *HYPERVISOR_sh + extern struct start_info *xen_start_info; + + enum xen_domain_type { +- XEN_NATIVE, +- XEN_PV_DOMAIN, +- XEN_HVM_DOMAIN, ++ XEN_NATIVE, /* running on bare hardware */ ++ XEN_PV_DOMAIN, /* running in a PV domain */ ++ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ + }; + +-extern enum xen_domain_type xen_domain_type; +- + #ifdef CONFIG_XEN +-#define xen_domain() (xen_domain_type != XEN_NATIVE) ++extern enum xen_domain_type xen_domain_type; + #else +-#define xen_domain() (0) ++#define xen_domain_type XEN_NATIVE + #endif + +-#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN) +-#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN) +- +-#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) ++#define xen_domain() (xen_domain_type != XEN_NATIVE) ++#define xen_pv_domain() (xen_domain() && \ ++ xen_domain_type == XEN_PV_DOMAIN) ++#define xen_hvm_domain() (xen_domain() && \ ++ xen_domain_type == XEN_HVM_DOMAIN) ++ ++#ifdef CONFIG_XEN_DOM0 ++#include ++ ++#define xen_initial_domain() (xen_pv_domain() && \ ++ xen_start_info->flags & SIF_INITDOMAIN) ++#else /* !CONFIG_XEN_DOM0 */ ++#define xen_initial_domain() (0) ++#endif /* CONFIG_XEN_DOM0 */ + + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ +Index: linux-2.6-tip/arch/x86/include/asm/xen/page.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/xen/page.h ++++ linux-2.6-tip/arch/x86/include/asm/xen/page.h +@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x) + + + xmaddr_t arbitrary_virt_to_machine(void *address); ++unsigned long arbitrary_virt_to_mfn(void *vaddr); + void make_lowmem_page_readonly(void *vaddr); + void make_lowmem_page_readwrite(void *vaddr); + +Index: linux-2.6-tip/arch/x86/include/asm/xor.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/xor.h ++++ linux-2.6-tip/arch/x86/include/asm/xor.h +@@ -1,5 +1,10 @@ ++#ifdef CONFIG_KMEMCHECK ++/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ ++# include ++#else + #ifdef CONFIG_X86_32 + # include "xor_32.h" + #else + # include "xor_64.h" + #endif ++#endif +Index: linux-2.6-tip/arch/x86/kernel/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/Makefile ++++ linux-2.6-tip/arch/x86/kernel/Makefile +@@ -23,11 +23,12 @@ nostackp := $(call 
cc-option, -fno-stack + CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) + CFLAGS_hpet.o := $(nostackp) + CFLAGS_tsc.o := $(nostackp) ++CFLAGS_paravirt.o := $(nostackp) + + obj-y := process_$(BITS).o signal.o entry_$(BITS).o + obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o + obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o +-obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o ++obj-y += setup.o i8259.o irqinit_$(BITS).o + obj-$(CONFIG_X86_VISWS) += visws_quirks.o + obj-$(CONFIG_X86_32) += probe_roms_32.o + obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o +@@ -49,31 +50,28 @@ obj-y += step.o + obj-$(CONFIG_STACKTRACE) += stacktrace.o + obj-y += cpu/ + obj-y += acpi/ +-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o ++obj-y += reboot.o + obj-$(CONFIG_MCA) += mca_32.o + obj-$(CONFIG_X86_MSR) += msr.o + obj-$(CONFIG_X86_CPUID) += cpuid.o + obj-$(CONFIG_PCI) += early-quirks.o + apm-y := apm_32.o + obj-$(CONFIG_APM) += apm.o +-obj-$(CONFIG_X86_SMP) += smp.o +-obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o +-obj-$(CONFIG_X86_32_SMP) += smpcommon.o +-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o ++obj-$(CONFIG_SMP) += smp.o ++obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o ++obj-$(CONFIG_SMP) += setup_percpu.o ++obj-$(CONFIG_X86_64_SMP) += tsc_sync.o + obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o + obj-$(CONFIG_X86_MPPARSE) += mpparse.o +-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o +-obj-$(CONFIG_X86_IO_APIC) += io_apic.o ++obj-y += apic/ + obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o + obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +-obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o ++obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o ++obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o + obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o + obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o + obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +-obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +-obj-$(CONFIG_X86_ES7000) += es7000_32.o +-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o +-obj-y += vsmp_64.o ++obj-$(CONFIG_X86_VSMP) += vsmp_64.o + obj-$(CONFIG_KPROBES) += kprobes.o + obj-$(CONFIG_MODULES) += module_$(BITS).o + obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o +@@ -109,21 +107,18 @@ obj-$(CONFIG_MICROCODE) += microcode.o + + obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o + +-obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 ++obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o + + ### + # 64 bit specific files + ifeq ($(CONFIG_X86_64),y) +- obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o +- obj-y += bios_uv.o uv_irq.o uv_sysfs.o +- obj-y += genx2apic_cluster.o +- obj-y += genx2apic_phys.o +- obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o +- obj-$(CONFIG_AUDIT) += audit_64.o +- +- obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o +- obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o +- obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o ++ obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o ++ obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o ++ obj-$(CONFIG_AUDIT) += audit_64.o ++ ++ obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o ++ obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o ++ obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o + +- obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o ++ obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o + endif +Index: linux-2.6-tip/arch/x86/kernel/acpi/boot.c +=================================================================== +--- 
linux-2.6-tip.orig/arch/x86/kernel/acpi/boot.c ++++ linux-2.6-tip/arch/x86/kernel/acpi/boot.c +@@ -37,15 +37,10 @@ + #include + #include + #include +-#include + #include + #include + #include + +-#ifdef CONFIG_X86_LOCAL_APIC +-# include +-#endif +- + static int __initdata acpi_force = 0; + u32 acpi_rsdt_forced; + #ifdef CONFIG_ACPI +@@ -56,16 +51,7 @@ int acpi_disabled = 1; + EXPORT_SYMBOL(acpi_disabled); + + #ifdef CONFIG_X86_64 +- +-#include +- +-#else /* X86 */ +- +-#ifdef CONFIG_X86_LOCAL_APIC +-#include +-#include +-#endif /* CONFIG_X86_LOCAL_APIC */ +- ++# include + #endif /* X86 */ + + #define BAD_MADT_ENTRY(entry, end) ( \ +@@ -121,35 +107,18 @@ enum acpi_irq_model_id acpi_irq_model = + */ + char *__init __acpi_map_table(unsigned long phys, unsigned long size) + { +- unsigned long base, offset, mapped_size; +- int idx; + + if (!phys || !size) + return NULL; + +- if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT)) +- return __va(phys); +- +- offset = phys & (PAGE_SIZE - 1); +- mapped_size = PAGE_SIZE - offset; +- clear_fixmap(FIX_ACPI_END); +- set_fixmap(FIX_ACPI_END, phys); +- base = fix_to_virt(FIX_ACPI_END); +- +- /* +- * Most cases can be covered by the below. +- */ +- idx = FIX_ACPI_END; +- while (mapped_size < size) { +- if (--idx < FIX_ACPI_BEGIN) +- return NULL; /* cannot handle this */ +- phys += PAGE_SIZE; +- clear_fixmap(idx); +- set_fixmap(idx, phys); +- mapped_size += PAGE_SIZE; +- } ++ return early_ioremap(phys, size); ++} ++void __init __acpi_unmap_table(char *map, unsigned long size) ++{ ++ if (!map || !size) ++ return; + +- return ((unsigned char *)base + offset); ++ early_iounmap(map, size); + } + + #ifdef CONFIG_PCI_MMCONFIG +@@ -239,7 +208,8 @@ static int __init acpi_parse_madt(struct + madt->address); + } + +- acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); ++ default_acpi_madt_oem_check(madt->header.oem_id, ++ madt->header.oem_table_id); + + return 0; + } +@@ -884,7 +854,7 @@ static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); + } mp_ioapic_routing[MAX_IO_APICS]; + +-static int mp_find_ioapic(int gsi) ++int mp_find_ioapic(int gsi) + { + int i = 0; + +@@ -899,6 +869,16 @@ static int mp_find_ioapic(int gsi) + return -1; + } + ++int mp_find_ioapic_pin(int ioapic, int gsi) ++{ ++ if (WARN_ON(ioapic == -1)) ++ return -1; ++ if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end)) ++ return -1; ++ ++ return gsi - mp_ioapic_routing[ioapic].gsi_base; ++} ++ + static u8 __init uniq_ioapic_id(u8 id) + { + #ifdef CONFIG_X86_32 +@@ -912,8 +892,8 @@ static u8 __init uniq_ioapic_id(u8 id) + DECLARE_BITMAP(used, 256); + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { +- struct mp_config_ioapic *ia = &mp_ioapics[i]; +- __set_bit(ia->mp_apicid, used); ++ struct mpc_ioapic *ia = &mp_ioapics[i]; ++ __set_bit(ia->apicid, used); + } + if (!test_bit(id, used)) + return id; +@@ -945,29 +925,29 @@ void __init mp_register_ioapic(int id, u + + idx = nr_ioapics; + +- mp_ioapics[idx].mp_type = MP_IOAPIC; +- mp_ioapics[idx].mp_flags = MPC_APIC_USABLE; +- mp_ioapics[idx].mp_apicaddr = address; ++ mp_ioapics[idx].type = MP_IOAPIC; ++ mp_ioapics[idx].flags = MPC_APIC_USABLE; ++ mp_ioapics[idx].apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); +- mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id); ++ mp_ioapics[idx].apicid = uniq_ioapic_id(id); + #ifdef CONFIG_X86_32 +- mp_ioapics[idx].mp_apicver = io_apic_get_version(idx); ++ mp_ioapics[idx].apicver = io_apic_get_version(idx); + #else +- mp_ioapics[idx].mp_apicver = 
0; ++ mp_ioapics[idx].apicver = 0; + #endif + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ +- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid; ++ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + +- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " +- "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid, +- mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr, ++ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " ++ "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, ++ mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, + mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); + + nr_ioapics++; +@@ -996,19 +976,19 @@ int __init acpi_probe_gsi(void) + return max_gsi + 1; + } + +-static void assign_to_mp_irq(struct mp_config_intsrc *m, +- struct mp_config_intsrc *mp_irq) ++static void assign_to_mp_irq(struct mpc_intsrc *m, ++ struct mpc_intsrc *mp_irq) + { +- memcpy(mp_irq, m, sizeof(struct mp_config_intsrc)); ++ memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); + } + +-static int mp_irq_cmp(struct mp_config_intsrc *mp_irq, +- struct mp_config_intsrc *m) ++static int mp_irq_cmp(struct mpc_intsrc *mp_irq, ++ struct mpc_intsrc *m) + { +- return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc)); ++ return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); + } + +-static void save_mp_irq(struct mp_config_intsrc *m) ++static void save_mp_irq(struct mpc_intsrc *m) + { + int i; + +@@ -1026,7 +1006,7 @@ void __init mp_override_legacy_irq(u8 bu + { + int ioapic; + int pin; +- struct mp_config_intsrc mp_irq; ++ struct mpc_intsrc mp_irq; + + /* + * Convert 'gsi' to 'ioapic.pin'. +@@ -1034,7 +1014,7 @@ void __init mp_override_legacy_irq(u8 bu + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; +- pin = gsi - mp_ioapic_routing[ioapic].gsi_base; ++ pin = mp_find_ioapic_pin(ioapic, gsi); + + /* + * TBD: This check is for faulty timer entries, where the override +@@ -1044,13 +1024,13 @@ void __init mp_override_legacy_irq(u8 bu + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + +- mp_irq.mp_type = MP_INTSRC; +- mp_irq.mp_irqtype = mp_INT; +- mp_irq.mp_irqflag = (trigger << 2) | polarity; +- mp_irq.mp_srcbus = MP_ISA_BUS; +- mp_irq.mp_srcbusirq = bus_irq; /* IRQ */ +- mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ +- mp_irq.mp_dstirq = pin; /* INTIN# */ ++ mp_irq.type = MP_INTSRC; ++ mp_irq.irqtype = mp_INT; ++ mp_irq.irqflag = (trigger << 2) | polarity; ++ mp_irq.srcbus = MP_ISA_BUS; ++ mp_irq.srcbusirq = bus_irq; /* IRQ */ ++ mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ ++ mp_irq.dstirq = pin; /* INTIN# */ + + save_mp_irq(&mp_irq); + } +@@ -1060,7 +1040,7 @@ void __init mp_config_acpi_legacy_irqs(v + int i; + int ioapic; + unsigned int dstapic; +- struct mp_config_intsrc mp_irq; ++ struct mpc_intsrc mp_irq; + + #if defined (CONFIG_MCA) || defined (CONFIG_EISA) + /* +@@ -1085,7 +1065,7 @@ void __init mp_config_acpi_legacy_irqs(v + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; +- dstapic = mp_ioapics[ioapic].mp_apicid; ++ dstapic = mp_ioapics[ioapic].apicid; + + /* + * Use the default configuration for the IRQs 0-15. 
Unless +@@ -1095,16 +1075,14 @@ void __init mp_config_acpi_legacy_irqs(v + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { +- struct mp_config_intsrc *irq = mp_irqs + idx; ++ struct mpc_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ +- if (irq->mp_srcbus == MP_ISA_BUS +- && irq->mp_srcbusirq == i) ++ if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ +- if (irq->mp_dstapic == dstapic && +- irq->mp_dstirq == i) ++ if (irq->dstapic == dstapic && irq->dstirq == i) + break; + } + +@@ -1113,13 +1091,13 @@ void __init mp_config_acpi_legacy_irqs(v + continue; /* IRQ already used */ + } + +- mp_irq.mp_type = MP_INTSRC; +- mp_irq.mp_irqflag = 0; /* Conforming */ +- mp_irq.mp_srcbus = MP_ISA_BUS; +- mp_irq.mp_dstapic = dstapic; +- mp_irq.mp_irqtype = mp_INT; +- mp_irq.mp_srcbusirq = i; /* Identity mapped */ +- mp_irq.mp_dstirq = i; ++ mp_irq.type = MP_INTSRC; ++ mp_irq.irqflag = 0; /* Conforming */ ++ mp_irq.srcbus = MP_ISA_BUS; ++ mp_irq.dstapic = dstapic; ++ mp_irq.irqtype = mp_INT; ++ mp_irq.srcbusirq = i; /* Identity mapped */ ++ mp_irq.dstirq = i; + + save_mp_irq(&mp_irq); + } +@@ -1156,7 +1134,7 @@ int mp_register_gsi(u32 gsi, int trigger + return gsi; + } + +- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; ++ ioapic_pin = mp_find_ioapic_pin(ioapic, gsi); + + #ifdef CONFIG_X86_32 + if (ioapic_renumber_irq) +@@ -1230,22 +1208,22 @@ int mp_config_acpi_gsi(unsigned char num + u32 gsi, int triggering, int polarity) + { + #ifdef CONFIG_X86_MPPARSE +- struct mp_config_intsrc mp_irq; ++ struct mpc_intsrc mp_irq; + int ioapic; + + if (!acpi_ioapic) + return 0; + + /* print the entry should happen on mptable identically */ +- mp_irq.mp_type = MP_INTSRC; +- mp_irq.mp_irqtype = mp_INT; +- mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | ++ mp_irq.type = MP_INTSRC; ++ mp_irq.irqtype = mp_INT; ++ mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); +- mp_irq.mp_srcbus = number; +- mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); ++ mp_irq.srcbus = number; ++ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); +- mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id; +- mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; ++ mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; ++ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + save_mp_irq(&mp_irq); + #endif +@@ -1372,7 +1350,7 @@ static void __init acpi_process_madt(voi + if (!error) { + acpi_lapic = 1; + +-#ifdef CONFIG_X86_GENERICARCH ++#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); + #endif + /* +@@ -1384,9 +1362,8 @@ static void __init acpi_process_madt(voi + acpi_ioapic = 1; + + smp_found_config = 1; +-#ifdef CONFIG_X86_32 +- setup_apic_routing(); +-#endif ++ if (apic->setup_apic_routing) ++ apic->setup_apic_routing(); + } + } + if (error == -EINVAL) { +Index: linux-2.6-tip/arch/x86/kernel/acpi/processor.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/acpi/processor.c ++++ linux-2.6-tip/arch/x86/kernel/acpi/processor.c +@@ -43,6 +43,11 @@ static void init_intel_pdc(struct acpi_p + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = ACPI_PDC_C_CAPABILITY_SMP; ++ /* ++ * If mwait/monitor is unsupported, C2/C3_FFH will be disabled. 
++ */ ++ if (!cpu_has(c, X86_FEATURE_MWAIT)) ++ buf[2] &= ~ACPI_PDC_C_C2C3_FFH; + + /* + * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so +Index: linux-2.6-tip/arch/x86/kernel/acpi/realmode/wakeup.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/acpi/realmode/wakeup.S ++++ linux-2.6-tip/arch/x86/kernel/acpi/realmode/wakeup.S +@@ -3,8 +3,8 @@ + */ + #include + #include +-#include +-#include ++#include ++#include + #include + + .code16 +Index: linux-2.6-tip/arch/x86/kernel/acpi/sleep.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/acpi/sleep.c ++++ linux-2.6-tip/arch/x86/kernel/acpi/sleep.c +@@ -101,6 +101,7 @@ int acpi_save_state_mem(void) + stack_start.sp = temp_stack + sizeof(temp_stack); + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_table(smp_processor_id()); ++ initial_gs = per_cpu_offset(smp_processor_id()); + #endif + initial_code = (unsigned long)wakeup_long64; + saved_magic = 0x123456789abcdef0; +Index: linux-2.6-tip/arch/x86/kernel/acpi/wakeup_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/acpi/wakeup_32.S ++++ linux-2.6-tip/arch/x86/kernel/acpi/wakeup_32.S +@@ -1,7 +1,7 @@ + .section .text.page_aligned + #include + #include +-#include ++#include + + # Copyright 2003, 2008 Pavel Machek , distribute under GPLv2 + +Index: linux-2.6-tip/arch/x86/kernel/acpi/wakeup_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/acpi/wakeup_64.S ++++ linux-2.6-tip/arch/x86/kernel/acpi/wakeup_64.S +@@ -1,8 +1,8 @@ + .text + #include + #include +-#include +-#include ++#include ++#include + #include + #include + +Index: linux-2.6-tip/arch/x86/kernel/alternative.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/alternative.c ++++ linux-2.6-tip/arch/x86/kernel/alternative.c +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -12,7 +13,9 @@ + #include + #include + #include ++#include + #include ++#include + + #define MAX_PATCH_LEN (255-1) + +@@ -226,6 +229,7 @@ static void alternatives_smp_lock(u8 **s + { + u8 **ptr; + ++ mutex_lock(&text_mutex); + for (ptr = start; ptr < end; ptr++) { + if (*ptr < text) + continue; +@@ -234,6 +238,7 @@ static void alternatives_smp_lock(u8 **s + /* turn DS segment override prefix into lock prefix */ + text_poke(*ptr, ((unsigned char []){0xf0}), 1); + }; ++ mutex_unlock(&text_mutex); + } + + static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) +@@ -243,6 +248,7 @@ static void alternatives_smp_unlock(u8 * + if (noreplace_smp) + return; + ++ mutex_lock(&text_mutex); + for (ptr = start; ptr < end; ptr++) { + if (*ptr < text) + continue; +@@ -251,6 +257,7 @@ static void alternatives_smp_unlock(u8 * + /* turn lock prefix into DS segment override prefix */ + text_poke(*ptr, ((unsigned char []){0x3E}), 1); + }; ++ mutex_unlock(&text_mutex); + } + + struct smp_alt_module { +@@ -414,9 +421,17 @@ void __init alternative_instructions(voi + that might execute the to be patched code. + Other CPUs are not running. */ + stop_nmi(); +-#ifdef CONFIG_X86_MCE +- stop_mce(); +-#endif ++ ++ /* ++ * Don't stop machine check exceptions while patching. ++ * MCEs only happen when something got corrupted and in this ++ * case we must do something about the corruption. 
++ * Ignoring it is worse than a unlikely patching race. ++ * Also machine checks tend to be broadcast and if one CPU ++ * goes into machine check the others follow quickly, so we don't ++ * expect a machine check to cause undue problems during to code ++ * patching. ++ */ + + apply_alternatives(__alt_instructions, __alt_instructions_end); + +@@ -456,9 +471,6 @@ void __init alternative_instructions(voi + (unsigned long)__smp_locks_end); + + restart_nmi(); +-#ifdef CONFIG_X86_MCE +- restart_mce(); +-#endif + } + + /** +@@ -495,12 +507,13 @@ void *text_poke_early(void *addr, const + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. ++ * ++ * Note: Must be called under text_mutex. + */ + void *__kprobes text_poke(void *addr, const void *opcode, size_t len) + { + unsigned long flags; + char *vaddr; +- int nr_pages = 2; + struct page *pages[2]; + int i; + +@@ -513,18 +526,21 @@ void *__kprobes text_poke(void *addr, co + pages[1] = virt_to_page(addr + PAGE_SIZE); + } + BUG_ON(!pages[0]); +- if (!pages[1]) +- nr_pages = 1; +- vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); +- BUG_ON(!vaddr); + local_irq_save(flags); ++ set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); ++ if (pages[1]) ++ set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); ++ vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); + memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); +- local_irq_restore(flags); +- vunmap(vaddr); ++ clear_fixmap(FIX_TEXT_POKE0); ++ if (pages[1]) ++ clear_fixmap(FIX_TEXT_POKE1); ++ local_flush_tlb(); + sync_core(); + /* Could also do a CLFLUSH here to speed up CPU recovery; but + that causes hangs on some VIA CPUs. */ + for (i = 0; i < len; i++) + BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); ++ local_irq_restore(flags); + return addr; + } +Index: linux-2.6-tip/arch/x86/kernel/amd_iommu.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/amd_iommu.c ++++ linux-2.6-tip/arch/x86/kernel/amd_iommu.c +@@ -22,10 +22,9 @@ + #include + #include + #include ++#include + #include +-#ifdef CONFIG_IOMMU_API + #include +-#endif + #include + #include + #include +@@ -1297,8 +1296,10 @@ static void __unmap_single(struct amd_io + /* + * The exported map_single function for dma_ops. + */ +-static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, +- size_t size, int dir) ++static dma_addr_t map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + unsigned long flags; + struct amd_iommu *iommu; +@@ -1306,6 +1307,7 @@ static dma_addr_t map_single(struct devi + u16 devid; + dma_addr_t addr; + u64 dma_mask; ++ phys_addr_t paddr = page_to_phys(page) + offset; + + INC_STATS_COUNTER(cnt_map_single); + +@@ -1340,8 +1342,8 @@ out: + /* + * The exported unmap_single function for dma_ops. + */ +-static void unmap_single(struct device *dev, dma_addr_t dma_addr, +- size_t size, int dir) ++static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, ++ enum dma_data_direction dir, struct dma_attrs *attrs) + { + unsigned long flags; + struct amd_iommu *iommu; +@@ -1390,7 +1392,8 @@ static int map_sg_no_iommu(struct device + * lists). 
+ */ + static int map_sg(struct device *dev, struct scatterlist *sglist, +- int nelems, int dir) ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + unsigned long flags; + struct amd_iommu *iommu; +@@ -1457,7 +1460,8 @@ unmap: + * lists). + */ + static void unmap_sg(struct device *dev, struct scatterlist *sglist, +- int nelems, int dir) ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + unsigned long flags; + struct amd_iommu *iommu; +@@ -1644,11 +1648,11 @@ static void prealloc_protection_domains( + } + } + +-static struct dma_mapping_ops amd_iommu_dma_ops = { ++static struct dma_map_ops amd_iommu_dma_ops = { + .alloc_coherent = alloc_coherent, + .free_coherent = free_coherent, +- .map_single = map_single, +- .unmap_single = unmap_single, ++ .map_page = map_page, ++ .unmap_page = unmap_page, + .map_sg = map_sg, + .unmap_sg = unmap_sg, + .dma_supported = amd_iommu_dma_supported, +Index: linux-2.6-tip/arch/x86/kernel/apic.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/apic.c ++++ /dev/null +@@ -1,2223 +0,0 @@ +-/* +- * Local APIC handling, local APIC timers +- * +- * (c) 1999, 2000 Ingo Molnar +- * +- * Fixes +- * Maciej W. Rozycki : Bits for genuine 82489DX APICs; +- * thanks to Eric Gilmore +- * and Rolf G. Tews +- * for testing these extensively. +- * Maciej W. Rozycki : Various updates and fixes. +- * Mikael Pettersson : Power Management for UP-APIC. +- * Pavel Machek and +- * Mikael Pettersson : PM converted to driver model. +- */ +- +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +- +-/* +- * Sanity check +- */ +-#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) +-# error SPURIOUS_APIC_VECTOR definition error +-#endif +- +-#ifdef CONFIG_X86_32 +-/* +- * Knob to control our willingness to enable the local APIC. 
+- * +- * +1=force-enable +- */ +-static int force_enable_local_apic; +-/* +- * APIC command line parameters +- */ +-static int __init parse_lapic(char *arg) +-{ +- force_enable_local_apic = 1; +- return 0; +-} +-early_param("lapic", parse_lapic); +-/* Local APIC was disabled by the BIOS and enabled by the kernel */ +-static int enabled_via_apicbase; +- +-#endif +- +-#ifdef CONFIG_X86_64 +-static int apic_calibrate_pmtmr __initdata; +-static __init int setup_apicpmtimer(char *s) +-{ +- apic_calibrate_pmtmr = 1; +- notsc_setup(NULL); +- return 0; +-} +-__setup("apicpmtimer", setup_apicpmtimer); +-#endif +- +-#ifdef CONFIG_X86_64 +-#define HAVE_X2APIC +-#endif +- +-#ifdef HAVE_X2APIC +-int x2apic; +-/* x2apic enabled before OS handover */ +-static int x2apic_preenabled; +-static int disable_x2apic; +-static __init int setup_nox2apic(char *str) +-{ +- disable_x2apic = 1; +- setup_clear_cpu_cap(X86_FEATURE_X2APIC); +- return 0; +-} +-early_param("nox2apic", setup_nox2apic); +-#endif +- +-unsigned long mp_lapic_addr; +-int disable_apic; +-/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +-static int disable_apic_timer __cpuinitdata; +-/* Local APIC timer works in C2 */ +-int local_apic_timer_c2_ok; +-EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); +- +-int first_system_vector = 0xfe; +- +-/* +- * Debug level, exported for io_apic.c +- */ +-unsigned int apic_verbosity; +- +-int pic_mode; +- +-/* Have we found an MP table */ +-int smp_found_config; +- +-static struct resource lapic_resource = { +- .name = "Local APIC", +- .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +-}; +- +-static unsigned int calibration_result; +- +-static int lapic_next_event(unsigned long delta, +- struct clock_event_device *evt); +-static void lapic_timer_setup(enum clock_event_mode mode, +- struct clock_event_device *evt); +-static void lapic_timer_broadcast(const struct cpumask *mask); +-static void apic_pm_activate(void); +- +-/* +- * The local apic timer can be used for any function which is CPU local. +- */ +-static struct clock_event_device lapic_clockevent = { +- .name = "lapic", +- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT +- | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, +- .shift = 32, +- .set_mode = lapic_timer_setup, +- .set_next_event = lapic_next_event, +- .broadcast = lapic_timer_broadcast, +- .rating = 100, +- .irq = -1, +-}; +-static DEFINE_PER_CPU(struct clock_event_device, lapic_events); +- +-static unsigned long apic_phys; +- +-/* +- * Get the LAPIC version +- */ +-static inline int lapic_get_version(void) +-{ +- return GET_APIC_VERSION(apic_read(APIC_LVR)); +-} +- +-/* +- * Check, if the APIC is integrated or a separate chip +- */ +-static inline int lapic_is_integrated(void) +-{ +-#ifdef CONFIG_X86_64 +- return 1; +-#else +- return APIC_INTEGRATED(lapic_get_version()); +-#endif +-} +- +-/* +- * Check, whether this is a modern or a first generation APIC +- */ +-static int modern_apic(void) +-{ +- /* AMD systems use old APIC versions, so check the CPU */ +- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && +- boot_cpu_data.x86 >= 0xf) +- return 1; +- return lapic_get_version() >= 0x14; +-} +- +-/* +- * Paravirt kernels also might be using these below ops. So we still +- * use generic apic_read()/apic_write(), which might be pointing to different +- * ops in PARAVIRT case. 
+- */ +-void xapic_wait_icr_idle(void) +-{ +- while (apic_read(APIC_ICR) & APIC_ICR_BUSY) +- cpu_relax(); +-} +- +-u32 safe_xapic_wait_icr_idle(void) +-{ +- u32 send_status; +- int timeout; +- +- timeout = 0; +- do { +- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; +- if (!send_status) +- break; +- udelay(100); +- } while (timeout++ < 1000); +- +- return send_status; +-} +- +-void xapic_icr_write(u32 low, u32 id) +-{ +- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); +- apic_write(APIC_ICR, low); +-} +- +-static u64 xapic_icr_read(void) +-{ +- u32 icr1, icr2; +- +- icr2 = apic_read(APIC_ICR2); +- icr1 = apic_read(APIC_ICR); +- +- return icr1 | ((u64)icr2 << 32); +-} +- +-static struct apic_ops xapic_ops = { +- .read = native_apic_mem_read, +- .write = native_apic_mem_write, +- .icr_read = xapic_icr_read, +- .icr_write = xapic_icr_write, +- .wait_icr_idle = xapic_wait_icr_idle, +- .safe_wait_icr_idle = safe_xapic_wait_icr_idle, +-}; +- +-struct apic_ops __read_mostly *apic_ops = &xapic_ops; +-EXPORT_SYMBOL_GPL(apic_ops); +- +-#ifdef HAVE_X2APIC +-static void x2apic_wait_icr_idle(void) +-{ +- /* no need to wait for icr idle in x2apic */ +- return; +-} +- +-static u32 safe_x2apic_wait_icr_idle(void) +-{ +- /* no need to wait for icr idle in x2apic */ +- return 0; +-} +- +-void x2apic_icr_write(u32 low, u32 id) +-{ +- wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +-} +- +-static u64 x2apic_icr_read(void) +-{ +- unsigned long val; +- +- rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); +- return val; +-} +- +-static struct apic_ops x2apic_ops = { +- .read = native_apic_msr_read, +- .write = native_apic_msr_write, +- .icr_read = x2apic_icr_read, +- .icr_write = x2apic_icr_write, +- .wait_icr_idle = x2apic_wait_icr_idle, +- .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +-}; +-#endif +- +-/** +- * enable_NMI_through_LVT0 - enable NMI through local vector table 0 +- */ +-void __cpuinit enable_NMI_through_LVT0(void) +-{ +- unsigned int v; +- +- /* unmask and set to NMI */ +- v = APIC_DM_NMI; +- +- /* Level triggered for 82489DX (32bit mode) */ +- if (!lapic_is_integrated()) +- v |= APIC_LVT_LEVEL_TRIGGER; +- +- apic_write(APIC_LVT0, v); +-} +- +-#ifdef CONFIG_X86_32 +-/** +- * get_physical_broadcast - Get number of physical broadcast IDs +- */ +-int get_physical_broadcast(void) +-{ +- return modern_apic() ? 0xff : 0xf; +-} +-#endif +- +-/** +- * lapic_get_maxlvt - get the maximum number of local vector table entries +- */ +-int lapic_get_maxlvt(void) +-{ +- unsigned int v; +- +- v = apic_read(APIC_LVR); +- /* +- * - we always have APIC integrated on 64bit mode +- * - 82489DXs do not report # of LVT entries +- */ +- return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; +-} +- +-/* +- * Local APIC timer +- */ +- +-/* Clock divisor */ +-#define APIC_DIVISOR 16 +- +-/* +- * This function sets up the local APIC timer, with a timeout of +- * 'clocks' APIC bus clock. During calibration we actually call +- * this function twice on the boot CPU, once with a bogus timeout +- * value, second time for real. The other (noncalibrating) CPUs +- * call this function only once, with the real, calibrated value. +- * +- * We do reads before writes even if unnecessary, to get around the +- * P5 APIC double write bug. 
+- */ +-static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +-{ +- unsigned int lvtt_value, tmp_value; +- +- lvtt_value = LOCAL_TIMER_VECTOR; +- if (!oneshot) +- lvtt_value |= APIC_LVT_TIMER_PERIODIC; +- if (!lapic_is_integrated()) +- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); +- +- if (!irqen) +- lvtt_value |= APIC_LVT_MASKED; +- +- apic_write(APIC_LVTT, lvtt_value); +- +- /* +- * Divide PICLK by 16 +- */ +- tmp_value = apic_read(APIC_TDCR); +- apic_write(APIC_TDCR, +- (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | +- APIC_TDR_DIV_16); +- +- if (!oneshot) +- apic_write(APIC_TMICT, clocks / APIC_DIVISOR); +-} +- +-/* +- * Setup extended LVT, AMD specific (K8, family 10h) +- * +- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and +- * MCE interrupts are supported. Thus MCE offset must be set to 0. +- * +- * If mask=1, the LVT entry does not generate interrupts while mask=0 +- * enables the vector. See also the BKDGs. +- */ +- +-#define APIC_EILVT_LVTOFF_MCE 0 +-#define APIC_EILVT_LVTOFF_IBS 1 +- +-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +-{ +- unsigned long reg = (lvt_off << 4) + APIC_EILVT0; +- unsigned int v = (mask << 16) | (msg_type << 8) | vector; +- +- apic_write(reg, v); +-} +- +-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +-{ +- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); +- return APIC_EILVT_LVTOFF_MCE; +-} +- +-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +-{ +- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); +- return APIC_EILVT_LVTOFF_IBS; +-} +-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); +- +-/* +- * Program the next event, relative to now +- */ +-static int lapic_next_event(unsigned long delta, +- struct clock_event_device *evt) +-{ +- apic_write(APIC_TMICT, delta); +- return 0; +-} +- +-/* +- * Setup the lapic timer in periodic or oneshot mode +- */ +-static void lapic_timer_setup(enum clock_event_mode mode, +- struct clock_event_device *evt) +-{ +- unsigned long flags; +- unsigned int v; +- +- /* Lapic used as dummy for broadcast ? */ +- if (evt->features & CLOCK_EVT_FEAT_DUMMY) +- return; +- +- local_irq_save(flags); +- +- switch (mode) { +- case CLOCK_EVT_MODE_PERIODIC: +- case CLOCK_EVT_MODE_ONESHOT: +- __setup_APIC_LVTT(calibration_result, +- mode != CLOCK_EVT_MODE_PERIODIC, 1); +- break; +- case CLOCK_EVT_MODE_UNUSED: +- case CLOCK_EVT_MODE_SHUTDOWN: +- v = apic_read(APIC_LVTT); +- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); +- apic_write(APIC_LVTT, v); +- apic_write(APIC_TMICT, 0xffffffff); +- break; +- case CLOCK_EVT_MODE_RESUME: +- /* Nothing to do here */ +- break; +- } +- +- local_irq_restore(flags); +-} +- +-/* +- * Local APIC timer broadcast function +- */ +-static void lapic_timer_broadcast(const struct cpumask *mask) +-{ +-#ifdef CONFIG_SMP +- send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +-#endif +-} +- +-/* +- * Setup the local APIC timer for this CPU. Copy the initilized values +- * of the boot CPU and register the clock event in the framework. +- */ +-static void __cpuinit setup_APIC_timer(void) +-{ +- struct clock_event_device *levt = &__get_cpu_var(lapic_events); +- +- memcpy(levt, &lapic_clockevent, sizeof(*levt)); +- levt->cpumask = cpumask_of(smp_processor_id()); +- +- clockevents_register_device(levt); +-} +- +-/* +- * In this functions we calibrate APIC bus clocks to the external timer. +- * +- * We want to do the calibration only once since we want to have local timer +- * irqs syncron. 
CPUs connected by the same APIC bus have the very same bus +- * frequency. +- * +- * This was previously done by reading the PIT/HPET and waiting for a wrap +- * around to find out, that a tick has elapsed. I have a box, where the PIT +- * readout is broken, so it never gets out of the wait loop again. This was +- * also reported by others. +- * +- * Monitoring the jiffies value is inaccurate and the clockevents +- * infrastructure allows us to do a simple substitution of the interrupt +- * handler. +- * +- * The calibration routine also uses the pm_timer when possible, as the PIT +- * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes +- * back to normal later in the boot process). +- */ +- +-#define LAPIC_CAL_LOOPS (HZ/10) +- +-static __initdata int lapic_cal_loops = -1; +-static __initdata long lapic_cal_t1, lapic_cal_t2; +-static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +-static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; +- +-/* +- * Temporary interrupt handler. +- */ +-static void __init lapic_cal_handler(struct clock_event_device *dev) +-{ +- unsigned long long tsc = 0; +- long tapic = apic_read(APIC_TMCCT); +- unsigned long pm = acpi_pm_read_early(); +- +- if (cpu_has_tsc) +- rdtscll(tsc); +- +- switch (lapic_cal_loops++) { +- case 0: +- lapic_cal_t1 = tapic; +- lapic_cal_tsc1 = tsc; +- lapic_cal_pm1 = pm; +- lapic_cal_j1 = jiffies; +- break; +- +- case LAPIC_CAL_LOOPS: +- lapic_cal_t2 = tapic; +- lapic_cal_tsc2 = tsc; +- if (pm < lapic_cal_pm1) +- pm += ACPI_PM_OVRRUN; +- lapic_cal_pm2 = pm; +- lapic_cal_j2 = jiffies; +- break; +- } +-} +- +-static int __init calibrate_by_pmtimer(long deltapm, long *delta) +-{ +- const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; +- const long pm_thresh = pm_100ms / 100; +- unsigned long mult; +- u64 res; +- +-#ifndef CONFIG_X86_PM_TIMER +- return -1; +-#endif +- +- apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); +- +- /* Check, if the PM timer is available */ +- if (!deltapm) +- return -1; +- +- mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); +- +- if (deltapm > (pm_100ms - pm_thresh) && +- deltapm < (pm_100ms + pm_thresh)) { +- apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); +- } else { +- res = (((u64)deltapm) * mult) >> 22; +- do_div(res, 1000000); +- pr_warning("APIC calibration not consistent " +- "with PM Timer: %ldms instead of 100ms\n", +- (long)res); +- /* Correct the lapic counter value */ +- res = (((u64)(*delta)) * pm_100ms); +- do_div(res, deltapm); +- pr_info("APIC delta adjusted to PM-Timer: " +- "%lu (%ld)\n", (unsigned long)res, *delta); +- *delta = (long)res; +- } +- +- return 0; +-} +- +-static int __init calibrate_APIC_clock(void) +-{ +- struct clock_event_device *levt = &__get_cpu_var(lapic_events); +- void (*real_handler)(struct clock_event_device *dev); +- unsigned long deltaj; +- long delta; +- int pm_referenced = 0; +- +- local_irq_disable(); +- +- /* Replace the global interrupt handler */ +- real_handler = global_clock_event->event_handler; +- global_clock_event->event_handler = lapic_cal_handler; +- +- /* +- * Setup the APIC counter to maximum. 
There is no way the lapic +- * can underflow in the 100ms detection time frame +- */ +- __setup_APIC_LVTT(0xffffffff, 0, 0); +- +- /* Let the interrupts run */ +- local_irq_enable(); +- +- while (lapic_cal_loops <= LAPIC_CAL_LOOPS) +- cpu_relax(); +- +- local_irq_disable(); +- +- /* Restore the real event handler */ +- global_clock_event->event_handler = real_handler; +- +- /* Build delta t1-t2 as apic timer counts down */ +- delta = lapic_cal_t1 - lapic_cal_t2; +- apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); +- +- /* we trust the PM based calibration if possible */ +- pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, +- &delta); +- +- /* Calculate the scaled math multiplication factor */ +- lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, +- lapic_clockevent.shift); +- lapic_clockevent.max_delta_ns = +- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); +- lapic_clockevent.min_delta_ns = +- clockevent_delta2ns(0xF, &lapic_clockevent); +- +- calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; +- +- apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); +- apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); +- apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", +- calibration_result); +- +- if (cpu_has_tsc) { +- delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); +- apic_printk(APIC_VERBOSE, "..... CPU clock speed is " +- "%ld.%04ld MHz.\n", +- (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), +- (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); +- } +- +- apic_printk(APIC_VERBOSE, "..... host bus clock speed is " +- "%u.%04u MHz.\n", +- calibration_result / (1000000 / HZ), +- calibration_result % (1000000 / HZ)); +- +- /* +- * Do a sanity check on the APIC calibration result +- */ +- if (calibration_result < (1000000 / HZ)) { +- local_irq_enable(); +- pr_warning("APIC frequency too slow, disabling apic timer\n"); +- return -1; +- } +- +- levt->features &= ~CLOCK_EVT_FEAT_DUMMY; +- +- /* +- * PM timer calibration failed or not turned on +- * so lets try APIC timer based calibration +- */ +- if (!pm_referenced) { +- apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); +- +- /* +- * Setup the apic timer manually +- */ +- levt->event_handler = lapic_cal_handler; +- lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); +- lapic_cal_loops = -1; +- +- /* Let the interrupts run */ +- local_irq_enable(); +- +- while (lapic_cal_loops <= LAPIC_CAL_LOOPS) +- cpu_relax(); +- +- /* Stop the lapic timer */ +- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); +- +- /* Jiffies delta */ +- deltaj = lapic_cal_j2 - lapic_cal_j1; +- apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); +- +- /* Check, if the jiffies result is consistent */ +- if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) +- apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); +- else +- levt->features |= CLOCK_EVT_FEAT_DUMMY; +- } else +- local_irq_enable(); +- +- if (levt->features & CLOCK_EVT_FEAT_DUMMY) { +- pr_warning("APIC timer disabled due to verification failure\n"); +- return -1; +- } +- +- return 0; +-} +- +-/* +- * Setup the boot APIC +- * +- * Calibrate and verify the result. +- */ +-void __init setup_boot_APIC_clock(void) +-{ +- /* +- * The local apic timer can be disabled via the kernel +- * commandline or from the CPU detection code. Register the lapic +- * timer as a dummy clock event source on SMP systems, so the +- * broadcast mechanism is used. On UP systems simply ignore it. 
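A standalone illustration of the arithmetic performed by calibrate_APIC_clock() above: the timer is armed behind the divide-by-16 prescaler, counts down for LAPIC_CAL_LOOPS ticks (HZ/10, roughly 100 ms), and the observed countdown is converted into bus clocks per tick and a bus frequency. The HZ value and the two counter readings below are invented, and the kernel helpers div_sc()/clockevent_delta2ns() are deliberately not reproduced; this sketches the computation only:

#include <stdio.h>

#define HZ              250          /* assumed tick rate for this example */
#define APIC_DIVISOR    16           /* TDCR was programmed to divide-by-16 */
#define LAPIC_CAL_LOOPS (HZ / 10)    /* calibration window: ~100 ms of ticks */

int main(void)
{
	/* Hypothetical APIC_TMCCT readings at the first and last calibration tick */
	long t1 = 0xffffffffL;               /* counter armed to maximum */
	long t2 = 0xffffffffL - 6250000L;    /* value left after ~100 ms */

	/* The timer counts down, so the delta is t1 - t2 */
	long delta = t1 - t2;

	/* Bus clocks per tick: what the kernel keeps in calibration_result */
	unsigned int per_tick = (unsigned int)((delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS);

	printf("lapic delta          = %ld\n", delta);
	printf("calibration result   = %u\n", per_tick);
	printf("host bus clock speed = %u.%04u MHz\n",
	       per_tick / (1000000 / HZ), per_tick % (1000000 / HZ));
	return 0;
}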
+- */ +- if (disable_apic_timer) { +- pr_info("Disabling APIC timer\n"); +- /* No broadcast on UP ! */ +- if (num_possible_cpus() > 1) { +- lapic_clockevent.mult = 1; +- setup_APIC_timer(); +- } +- return; +- } +- +- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" +- "calibrating APIC timer ...\n"); +- +- if (calibrate_APIC_clock()) { +- /* No broadcast on UP ! */ +- if (num_possible_cpus() > 1) +- setup_APIC_timer(); +- return; +- } +- +- /* +- * If nmi_watchdog is set to IO_APIC, we need the +- * PIT/HPET going. Otherwise register lapic as a dummy +- * device. +- */ +- if (nmi_watchdog != NMI_IO_APIC) +- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; +- else +- pr_warning("APIC timer registered as dummy," +- " due to nmi_watchdog=%d!\n", nmi_watchdog); +- +- /* Setup the lapic or request the broadcast */ +- setup_APIC_timer(); +-} +- +-void __cpuinit setup_secondary_APIC_clock(void) +-{ +- setup_APIC_timer(); +-} +- +-/* +- * The guts of the apic timer interrupt +- */ +-static void local_apic_timer_interrupt(void) +-{ +- int cpu = smp_processor_id(); +- struct clock_event_device *evt = &per_cpu(lapic_events, cpu); +- +- /* +- * Normally we should not be here till LAPIC has been initialized but +- * in some cases like kdump, its possible that there is a pending LAPIC +- * timer interrupt from previous kernel's context and is delivered in +- * new kernel the moment interrupts are enabled. +- * +- * Interrupts are enabled early and LAPIC is setup much later, hence +- * its possible that when we get here evt->event_handler is NULL. +- * Check for event_handler being NULL and discard the interrupt as +- * spurious. +- */ +- if (!evt->event_handler) { +- pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); +- /* Switch it off */ +- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); +- return; +- } +- +- /* +- * the NMI deadlock-detector uses this. +- */ +- inc_irq_stat(apic_timer_irqs); +- +- evt->event_handler(evt); +-} +- +-/* +- * Local APIC timer interrupt. This is the most natural way for doing +- * local interrupts, but local timer interrupts can be emulated by +- * broadcast interrupts too. [in case the hw doesn't support APIC timers] +- * +- * [ if a single-CPU system runs an SMP kernel then we call the local +- * interrupt as well. Thus we cannot inline the local irq ... ] +- */ +-void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +-{ +- struct pt_regs *old_regs = set_irq_regs(regs); +- +- /* +- * NOTE! We'd better ACK the irq immediately, +- * because timer handling can be slow. +- */ +- ack_APIC_irq(); +- /* +- * update_process_times() expects us to have done irq_enter(). +- * Besides, if we don't timer interrupts ignore the global +- * interrupt lock, which is the WrongThing (tm) to do. +- */ +- exit_idle(); +- irq_enter(); +- local_apic_timer_interrupt(); +- irq_exit(); +- +- set_irq_regs(old_regs); +-} +- +-int setup_profiling_timer(unsigned int multiplier) +-{ +- return -EINVAL; +-} +- +-/* +- * Local APIC start and shutdown +- */ +- +-/** +- * clear_local_APIC - shutdown the local APIC +- * +- * This is called, when a CPU is disabled and before rebooting, so the state of +- * the local APIC has no dangling leftovers. Also used to cleanout any BIOS +- * leftovers during boot. +- */ +-void clear_local_APIC(void) +-{ +- int maxlvt; +- u32 v; +- +- /* APIC hasn't been mapped yet */ +- if (!apic_phys) +- return; +- +- maxlvt = lapic_get_maxlvt(); +- /* +- * Masking an LVT entry can trigger a local APIC error +- * if the vector is zero. 
Mask LVTERR first to prevent this. +- */ +- if (maxlvt >= 3) { +- v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ +- apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); +- } +- /* +- * Careful: we have to set masks only first to deassert +- * any level-triggered sources. +- */ +- v = apic_read(APIC_LVTT); +- apic_write(APIC_LVTT, v | APIC_LVT_MASKED); +- v = apic_read(APIC_LVT0); +- apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +- v = apic_read(APIC_LVT1); +- apic_write(APIC_LVT1, v | APIC_LVT_MASKED); +- if (maxlvt >= 4) { +- v = apic_read(APIC_LVTPC); +- apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); +- } +- +- /* lets not touch this if we didn't frob it */ +-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +- if (maxlvt >= 5) { +- v = apic_read(APIC_LVTTHMR); +- apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); +- } +-#endif +- /* +- * Clean APIC state for other OSs: +- */ +- apic_write(APIC_LVTT, APIC_LVT_MASKED); +- apic_write(APIC_LVT0, APIC_LVT_MASKED); +- apic_write(APIC_LVT1, APIC_LVT_MASKED); +- if (maxlvt >= 3) +- apic_write(APIC_LVTERR, APIC_LVT_MASKED); +- if (maxlvt >= 4) +- apic_write(APIC_LVTPC, APIC_LVT_MASKED); +- +- /* Integrated APIC (!82489DX) ? */ +- if (lapic_is_integrated()) { +- if (maxlvt > 3) +- /* Clear ESR due to Pentium errata 3AP and 11AP */ +- apic_write(APIC_ESR, 0); +- apic_read(APIC_ESR); +- } +-} +- +-/** +- * disable_local_APIC - clear and disable the local APIC +- */ +-void disable_local_APIC(void) +-{ +- unsigned int value; +- +- /* APIC hasn't been mapped yet */ +- if (!apic_phys) +- return; +- +- clear_local_APIC(); +- +- /* +- * Disable APIC (implies clearing of registers +- * for 82489DX!). +- */ +- value = apic_read(APIC_SPIV); +- value &= ~APIC_SPIV_APIC_ENABLED; +- apic_write(APIC_SPIV, value); +- +-#ifdef CONFIG_X86_32 +- /* +- * When LAPIC was disabled by the BIOS and enabled by the kernel, +- * restore the disabled state. +- */ +- if (enabled_via_apicbase) { +- unsigned int l, h; +- +- rdmsr(MSR_IA32_APICBASE, l, h); +- l &= ~MSR_IA32_APICBASE_ENABLE; +- wrmsr(MSR_IA32_APICBASE, l, h); +- } +-#endif +-} +- +-/* +- * If Linux enabled the LAPIC against the BIOS default disable it down before +- * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and +- * not power-off. Additionally clear all LVT entries before disable_local_APIC +- * for the case where Linux didn't enable the LAPIC. +- */ +-void lapic_shutdown(void) +-{ +- unsigned long flags; +- +- if (!cpu_has_apic) +- return; +- +- local_irq_save(flags); +- +-#ifdef CONFIG_X86_32 +- if (!enabled_via_apicbase) +- clear_local_APIC(); +- else +-#endif +- disable_local_APIC(); +- +- +- local_irq_restore(flags); +-} +- +-/* +- * This is to verify that we're looking at a real local APIC. +- * Check these against your board if the CPUs aren't getting +- * started for no apparent reason. +- */ +-int __init verify_local_APIC(void) +-{ +- unsigned int reg0, reg1; +- +- /* +- * The version register is read-only in a real APIC. +- */ +- reg0 = apic_read(APIC_LVR); +- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); +- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); +- reg1 = apic_read(APIC_LVR); +- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); +- +- /* +- * The two version reads above should print the same +- * numbers. If the second one is different, then we +- * poke at a non-APIC. +- */ +- if (reg1 != reg0) +- return 0; +- +- /* +- * Check if the version looks reasonably. 
+- */ +- reg1 = GET_APIC_VERSION(reg0); +- if (reg1 == 0x00 || reg1 == 0xff) +- return 0; +- reg1 = lapic_get_maxlvt(); +- if (reg1 < 0x02 || reg1 == 0xff) +- return 0; +- +- /* +- * The ID register is read/write in a real APIC. +- */ +- reg0 = apic_read(APIC_ID); +- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); +- apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); +- reg1 = apic_read(APIC_ID); +- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); +- apic_write(APIC_ID, reg0); +- if (reg1 != (reg0 ^ APIC_ID_MASK)) +- return 0; +- +- /* +- * The next two are just to see if we have sane values. +- * They're only really relevant if we're in Virtual Wire +- * compatibility mode, but most boxes are anymore. +- */ +- reg0 = apic_read(APIC_LVT0); +- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); +- reg1 = apic_read(APIC_LVT1); +- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); +- +- return 1; +-} +- +-/** +- * sync_Arb_IDs - synchronize APIC bus arbitration IDs +- */ +-void __init sync_Arb_IDs(void) +-{ +- /* +- * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not +- * needed on AMD. +- */ +- if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) +- return; +- +- /* +- * Wait for idle. +- */ +- apic_wait_icr_idle(); +- +- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); +- apic_write(APIC_ICR, APIC_DEST_ALLINC | +- APIC_INT_LEVELTRIG | APIC_DM_INIT); +-} +- +-/* +- * An initial setup of the virtual wire mode. +- */ +-void __init init_bsp_APIC(void) +-{ +- unsigned int value; +- +- /* +- * Don't do the setup now if we have a SMP BIOS as the +- * through-I/O-APIC virtual wire mode might be active. +- */ +- if (smp_found_config || !cpu_has_apic) +- return; +- +- /* +- * Do not trust the local APIC being empty at bootup. +- */ +- clear_local_APIC(); +- +- /* +- * Enable APIC. +- */ +- value = apic_read(APIC_SPIV); +- value &= ~APIC_VECTOR_MASK; +- value |= APIC_SPIV_APIC_ENABLED; +- +-#ifdef CONFIG_X86_32 +- /* This bit is reserved on P4/Xeon and should be cleared */ +- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && +- (boot_cpu_data.x86 == 15)) +- value &= ~APIC_SPIV_FOCUS_DISABLED; +- else +-#endif +- value |= APIC_SPIV_FOCUS_DISABLED; +- value |= SPURIOUS_APIC_VECTOR; +- apic_write(APIC_SPIV, value); +- +- /* +- * Set up the virtual wire mode. +- */ +- apic_write(APIC_LVT0, APIC_DM_EXTINT); +- value = APIC_DM_NMI; +- if (!lapic_is_integrated()) /* 82489DX */ +- value |= APIC_LVT_LEVEL_TRIGGER; +- apic_write(APIC_LVT1, value); +-} +- +-static void __cpuinit lapic_setup_esr(void) +-{ +- unsigned int oldvalue, value, maxlvt; +- +- if (!lapic_is_integrated()) { +- pr_info("No ESR for 82489DX.\n"); +- return; +- } +- +- if (esr_disable) { +- /* +- * Something untraceable is creating bad interrupts on +- * secondary quads ... for the moment, just leave the +- * ESR disabled - we can't do anything useful with the +- * errors anyway - mbligh +- */ +- pr_info("Leaving ESR disabled.\n"); +- return; +- } +- +- maxlvt = lapic_get_maxlvt(); +- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ +- apic_write(APIC_ESR, 0); +- oldvalue = apic_read(APIC_ESR); +- +- /* enables sending errors */ +- value = ERROR_APIC_VECTOR; +- apic_write(APIC_LVTERR, value); +- +- /* +- * spec says clear errors after enabling vector. 
+- */ +- if (maxlvt > 3) +- apic_write(APIC_ESR, 0); +- value = apic_read(APIC_ESR); +- if (value != oldvalue) +- apic_printk(APIC_VERBOSE, "ESR value before enabling " +- "vector: 0x%08x after: 0x%08x\n", +- oldvalue, value); +-} +- +- +-/** +- * setup_local_APIC - setup the local APIC +- */ +-void __cpuinit setup_local_APIC(void) +-{ +- unsigned int value; +- int i, j; +- +-#ifdef CONFIG_X86_32 +- /* Pound the ESR really hard over the head with a big hammer - mbligh */ +- if (lapic_is_integrated() && esr_disable) { +- apic_write(APIC_ESR, 0); +- apic_write(APIC_ESR, 0); +- apic_write(APIC_ESR, 0); +- apic_write(APIC_ESR, 0); +- } +-#endif +- +- preempt_disable(); +- +- /* +- * Double-check whether this APIC is really registered. +- * This is meaningless in clustered apic mode, so we skip it. +- */ +- if (!apic_id_registered()) +- BUG(); +- +- /* +- * Intel recommends to set DFR, LDR and TPR before enabling +- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel +- * document number 292116). So here it goes... +- */ +- init_apic_ldr(); +- +- /* +- * Set Task Priority to 'accept all'. We never change this +- * later on. +- */ +- value = apic_read(APIC_TASKPRI); +- value &= ~APIC_TPRI_MASK; +- apic_write(APIC_TASKPRI, value); +- +- /* +- * After a crash, we no longer service the interrupts and a pending +- * interrupt from previous kernel might still have ISR bit set. +- * +- * Most probably by now CPU has serviced that pending interrupt and +- * it might not have done the ack_APIC_irq() because it thought, +- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it +- * does not clear the ISR bit and cpu thinks it has already serivced +- * the interrupt. Hence a vector might get locked. It was noticed +- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. +- */ +- for (i = APIC_ISR_NR - 1; i >= 0; i--) { +- value = apic_read(APIC_ISR + i*0x10); +- for (j = 31; j >= 0; j--) { +- if (value & (1< 1) || +- (boot_cpu_data.x86 >= 15)) +- break; +- goto no_apic; +- case X86_VENDOR_INTEL: +- if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || +- (boot_cpu_data.x86 == 5 && cpu_has_apic)) +- break; +- goto no_apic; +- default: +- goto no_apic; +- } +- +- if (!cpu_has_apic) { +- /* +- * Over-ride BIOS and try to enable the local APIC only if +- * "lapic" specified. +- */ +- if (!force_enable_local_apic) { +- pr_info("Local APIC disabled by BIOS -- " +- "you can enable it with \"lapic\"\n"); +- return -1; +- } +- /* +- * Some BIOSes disable the local APIC in the APIC_BASE +- * MSR. This can only be done in software for Intel P6 or later +- * and AMD K7 (Model > 1) or later. 
+- */ +- rdmsr(MSR_IA32_APICBASE, l, h); +- if (!(l & MSR_IA32_APICBASE_ENABLE)) { +- pr_info("Local APIC disabled by BIOS -- reenabling.\n"); +- l &= ~MSR_IA32_APICBASE_BASE; +- l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; +- wrmsr(MSR_IA32_APICBASE, l, h); +- enabled_via_apicbase = 1; +- } +- } +- /* +- * The APIC feature bit should now be enabled +- * in `cpuid' +- */ +- features = cpuid_edx(1); +- if (!(features & (1 << X86_FEATURE_APIC))) { +- pr_warning("Could not enable APIC!\n"); +- return -1; +- } +- set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); +- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +- +- /* The BIOS may have set up the APIC at some other address */ +- rdmsr(MSR_IA32_APICBASE, l, h); +- if (l & MSR_IA32_APICBASE_ENABLE) +- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; +- +- pr_info("Found and enabled local APIC!\n"); +- +- apic_pm_activate(); +- +- return 0; +- +-no_apic: +- pr_info("No local APIC present or hardware disabled\n"); +- return -1; +-} +-#endif +- +-#ifdef CONFIG_X86_64 +-void __init early_init_lapic_mapping(void) +-{ +- unsigned long phys_addr; +- +- /* +- * If no local APIC can be found then go out +- * : it means there is no mpatable and MADT +- */ +- if (!smp_found_config) +- return; +- +- phys_addr = mp_lapic_addr; +- +- set_fixmap_nocache(FIX_APIC_BASE, phys_addr); +- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", +- APIC_BASE, phys_addr); +- +- /* +- * Fetch the APIC ID of the BSP in case we have a +- * default configuration (or the MP table is broken). +- */ +- boot_cpu_physical_apicid = read_apic_id(); +-} +-#endif +- +-/** +- * init_apic_mappings - initialize APIC mappings +- */ +-void __init init_apic_mappings(void) +-{ +-#ifdef HAVE_X2APIC +- if (x2apic) { +- boot_cpu_physical_apicid = read_apic_id(); +- return; +- } +-#endif +- +- /* +- * If no local APIC can be found then set up a fake all +- * zeroes page to simulate the local APIC and another +- * one for the IO-APIC. +- */ +- if (!smp_found_config && detect_init_APIC()) { +- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); +- apic_phys = __pa(apic_phys); +- } else +- apic_phys = mp_lapic_addr; +- +- set_fixmap_nocache(FIX_APIC_BASE, apic_phys); +- apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", +- APIC_BASE, apic_phys); +- +- /* +- * Fetch the APIC ID of the BSP in case we have a +- * default configuration (or the MP table is broken). +- */ +- if (boot_cpu_physical_apicid == -1U) +- boot_cpu_physical_apicid = read_apic_id(); +-} +- +-/* +- * This initializes the IO-APIC and APIC hardware if this is +- * a UP kernel. +- */ +-int apic_version[MAX_APICS]; +- +-int __init APIC_init_uniprocessor(void) +-{ +-#ifdef CONFIG_X86_64 +- if (disable_apic) { +- pr_info("Apic disabled\n"); +- return -1; +- } +- if (!cpu_has_apic) { +- disable_apic = 1; +- pr_info("Apic disabled by BIOS\n"); +- return -1; +- } +-#else +- if (!smp_found_config && !cpu_has_apic) +- return -1; +- +- /* +- * Complain if the BIOS pretends there is one. 
+- */ +- if (!cpu_has_apic && +- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { +- pr_err("BIOS bug, local APIC 0x%x not detected!...\n", +- boot_cpu_physical_apicid); +- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); +- return -1; +- } +-#endif +- +-#ifdef HAVE_X2APIC +- enable_IR_x2apic(); +-#endif +-#ifdef CONFIG_X86_64 +- setup_apic_routing(); +-#endif +- +- verify_local_APIC(); +- connect_bsp_APIC(); +- +-#ifdef CONFIG_X86_64 +- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +-#else +- /* +- * Hack: In case of kdump, after a crash, kernel might be booting +- * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid +- * might be zero if read from MP tables. Get it from LAPIC. +- */ +-# ifdef CONFIG_CRASH_DUMP +- boot_cpu_physical_apicid = read_apic_id(); +-# endif +-#endif +- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); +- setup_local_APIC(); +- +-#ifdef CONFIG_X86_64 +- /* +- * Now enable IO-APICs, actually call clear_IO_APIC +- * We need clear_IO_APIC before enabling vector on BP +- */ +- if (!skip_ioapic_setup && nr_ioapics) +- enable_IO_APIC(); +-#endif +- +-#ifdef CONFIG_X86_IO_APIC +- if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) +-#endif +- localise_nmi_watchdog(); +- end_local_APIC_setup(); +- +-#ifdef CONFIG_X86_IO_APIC +- if (smp_found_config && !skip_ioapic_setup && nr_ioapics) +- setup_IO_APIC(); +-# ifdef CONFIG_X86_64 +- else +- nr_ioapics = 0; +-# endif +-#endif +- +-#ifdef CONFIG_X86_64 +- setup_boot_APIC_clock(); +- check_nmi_watchdog(); +-#else +- setup_boot_clock(); +-#endif +- +- return 0; +-} +- +-/* +- * Local APIC interrupts +- */ +- +-/* +- * This interrupt should _never_ happen with our APIC/SMP architecture +- */ +-void smp_spurious_interrupt(struct pt_regs *regs) +-{ +- u32 v; +- +- exit_idle(); +- irq_enter(); +- /* +- * Check if this really is a spurious interrupt and ACK it +- * if it is a vectored one. Just in case... +- * Spurious interrupts should not be ACKed. +- */ +- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); +- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) +- ack_APIC_irq(); +- +- inc_irq_stat(irq_spurious_count); +- +- /* see sw-dev-man vol 3, chapter 7.4.13.5 */ +- pr_info("spurious APIC interrupt on CPU#%d, " +- "should never happen.\n", smp_processor_id()); +- irq_exit(); +-} +- +-/* +- * This interrupt should never happen with our APIC/SMP architecture +- */ +-void smp_error_interrupt(struct pt_regs *regs) +-{ +- u32 v, v1; +- +- exit_idle(); +- irq_enter(); +- /* First tickle the hardware, only then report what went on. -- REW */ +- v = apic_read(APIC_ESR); +- apic_write(APIC_ESR, 0); +- v1 = apic_read(APIC_ESR); +- ack_APIC_irq(); +- atomic_inc(&irq_err_count); +- +- /* +- * Here is what the APIC error bits mean: +- * 0: Send CS error +- * 1: Receive CS error +- * 2: Send accept error +- * 3: Receive accept error +- * 4: Reserved +- * 5: Send illegal vector +- * 6: Received illegal vector +- * 7: Illegal register address +- */ +- pr_debug("APIC error on CPU%d: %02x(%02x)\n", +- smp_processor_id(), v , v1); +- irq_exit(); +-} +- +-/** +- * connect_bsp_APIC - attach the APIC to the interrupt system +- */ +-void __init connect_bsp_APIC(void) +-{ +-#ifdef CONFIG_X86_32 +- if (pic_mode) { +- /* +- * Do not trust the local APIC being empty at bootup. +- */ +- clear_local_APIC(); +- /* +- * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's +- * local APIC to INT and NMI lines. 
+- */ +- apic_printk(APIC_VERBOSE, "leaving PIC mode, " +- "enabling APIC mode.\n"); +- outb(0x70, 0x22); +- outb(0x01, 0x23); +- } +-#endif +- enable_apic_mode(); +-} +- +-/** +- * disconnect_bsp_APIC - detach the APIC from the interrupt system +- * @virt_wire_setup: indicates, whether virtual wire mode is selected +- * +- * Virtual wire mode is necessary to deliver legacy interrupts even when the +- * APIC is disabled. +- */ +-void disconnect_bsp_APIC(int virt_wire_setup) +-{ +- unsigned int value; +- +-#ifdef CONFIG_X86_32 +- if (pic_mode) { +- /* +- * Put the board back into PIC mode (has an effect only on +- * certain older boards). Note that APIC interrupts, including +- * IPIs, won't work beyond this point! The only exception are +- * INIT IPIs. +- */ +- apic_printk(APIC_VERBOSE, "disabling APIC mode, " +- "entering PIC mode.\n"); +- outb(0x70, 0x22); +- outb(0x00, 0x23); +- return; +- } +-#endif +- +- /* Go back to Virtual Wire compatibility mode */ +- +- /* For the spurious interrupt use vector F, and enable it */ +- value = apic_read(APIC_SPIV); +- value &= ~APIC_VECTOR_MASK; +- value |= APIC_SPIV_APIC_ENABLED; +- value |= 0xf; +- apic_write(APIC_SPIV, value); +- +- if (!virt_wire_setup) { +- /* +- * For LVT0 make it edge triggered, active high, +- * external and enabled +- */ +- value = apic_read(APIC_LVT0); +- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | +- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | +- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); +- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; +- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); +- apic_write(APIC_LVT0, value); +- } else { +- /* Disable LVT0 */ +- apic_write(APIC_LVT0, APIC_LVT_MASKED); +- } +- +- /* +- * For LVT1 make it edge triggered, active high, +- * nmi and enabled +- */ +- value = apic_read(APIC_LVT1); +- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | +- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | +- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); +- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; +- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); +- apic_write(APIC_LVT1, value); +-} +- +-void __cpuinit generic_processor_info(int apicid, int version) +-{ +- int cpu; +- +- /* +- * Validate version +- */ +- if (version == 0x0) { +- pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " +- "fixing up to 0x10. (tell your hw vendor)\n", +- version); +- version = 0x10; +- } +- apic_version[apicid] = version; +- +- if (num_processors >= nr_cpu_ids) { +- int max = nr_cpu_ids; +- int thiscpu = max + disabled_cpus; +- +- pr_warning( +- "ACPI: NR_CPUS/possible_cpus limit of %i reached." +- " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); +- +- disabled_cpus++; +- return; +- } +- +- num_processors++; +- cpu = cpumask_next_zero(-1, cpu_present_mask); +- +- if (version != apic_version[boot_cpu_physical_apicid]) +- WARN_ONCE(1, +- "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n", +- apic_version[boot_cpu_physical_apicid], cpu, version); +- +- physid_set(apicid, phys_cpu_present_map); +- if (apicid == boot_cpu_physical_apicid) { +- /* +- * x86_bios_cpu_apicid is required to have processors listed +- * in same order as logical cpu numbers. Hence the first +- * entry is BSP, and so on. 
+- */ +- cpu = 0; +- } +- if (apicid > max_physical_apicid) +- max_physical_apicid = apicid; +- +-#ifdef CONFIG_X86_32 +- /* +- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y +- * but we need to work other dependencies like SMP_SUSPEND etc +- * before this can be done without some confusion. +- * if (CPU_HOTPLUG_ENABLED || num_processors > 8) +- * - Ashok Raj +- */ +- if (max_physical_apicid >= 8) { +- switch (boot_cpu_data.x86_vendor) { +- case X86_VENDOR_INTEL: +- if (!APIC_XAPIC(version)) { +- def_to_bigsmp = 0; +- break; +- } +- /* If P4 and above fall through */ +- case X86_VENDOR_AMD: +- def_to_bigsmp = 1; +- } +- } +-#endif +- +-#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) +- /* are we being called early in kernel startup? */ +- if (early_per_cpu_ptr(x86_cpu_to_apicid)) { +- u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); +- u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); +- +- cpu_to_apicid[cpu] = apicid; +- bios_cpu_apicid[cpu] = apicid; +- } else { +- per_cpu(x86_cpu_to_apicid, cpu) = apicid; +- per_cpu(x86_bios_cpu_apicid, cpu) = apicid; +- } +-#endif +- +- set_cpu_possible(cpu, true); +- set_cpu_present(cpu, true); +-} +- +-#ifdef CONFIG_X86_64 +-int hard_smp_processor_id(void) +-{ +- return read_apic_id(); +-} +-#endif +- +-/* +- * Power management +- */ +-#ifdef CONFIG_PM +- +-static struct { +- /* +- * 'active' is true if the local APIC was enabled by us and +- * not the BIOS; this signifies that we are also responsible +- * for disabling it before entering apm/acpi suspend +- */ +- int active; +- /* r/w apic fields */ +- unsigned int apic_id; +- unsigned int apic_taskpri; +- unsigned int apic_ldr; +- unsigned int apic_dfr; +- unsigned int apic_spiv; +- unsigned int apic_lvtt; +- unsigned int apic_lvtpc; +- unsigned int apic_lvt0; +- unsigned int apic_lvt1; +- unsigned int apic_lvterr; +- unsigned int apic_tmict; +- unsigned int apic_tdcr; +- unsigned int apic_thmr; +-} apic_pm_state; +- +-static int lapic_suspend(struct sys_device *dev, pm_message_t state) +-{ +- unsigned long flags; +- int maxlvt; +- +- if (!apic_pm_state.active) +- return 0; +- +- maxlvt = lapic_get_maxlvt(); +- +- apic_pm_state.apic_id = apic_read(APIC_ID); +- apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); +- apic_pm_state.apic_ldr = apic_read(APIC_LDR); +- apic_pm_state.apic_dfr = apic_read(APIC_DFR); +- apic_pm_state.apic_spiv = apic_read(APIC_SPIV); +- apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); +- if (maxlvt >= 4) +- apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); +- apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); +- apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); +- apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); +- apic_pm_state.apic_tmict = apic_read(APIC_TMICT); +- apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +- if (maxlvt >= 5) +- apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +-#endif +- +- local_irq_save(flags); +- disable_local_APIC(); +- local_irq_restore(flags); +- return 0; +-} +- +-static int lapic_resume(struct sys_device *dev) +-{ +- unsigned int l, h; +- unsigned long flags; +- int maxlvt; +- +- if (!apic_pm_state.active) +- return 0; +- +- maxlvt = lapic_get_maxlvt(); +- +- local_irq_save(flags); +- +-#ifdef HAVE_X2APIC +- if (x2apic) +- enable_x2apic(); +- else +-#endif +- { +- /* +- * Make sure the APICBASE points to the right address +- * +- * FIXME! This will be wrong if we ever support suspend on +- * SMP! 
We'll need to do this as part of the CPU restore! +- */ +- rdmsr(MSR_IA32_APICBASE, l, h); +- l &= ~MSR_IA32_APICBASE_BASE; +- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; +- wrmsr(MSR_IA32_APICBASE, l, h); +- } +- +- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); +- apic_write(APIC_ID, apic_pm_state.apic_id); +- apic_write(APIC_DFR, apic_pm_state.apic_dfr); +- apic_write(APIC_LDR, apic_pm_state.apic_ldr); +- apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); +- apic_write(APIC_SPIV, apic_pm_state.apic_spiv); +- apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); +- apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) +- if (maxlvt >= 5) +- apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +-#endif +- if (maxlvt >= 4) +- apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); +- apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); +- apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); +- apic_write(APIC_TMICT, apic_pm_state.apic_tmict); +- apic_write(APIC_ESR, 0); +- apic_read(APIC_ESR); +- apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); +- apic_write(APIC_ESR, 0); +- apic_read(APIC_ESR); +- +- local_irq_restore(flags); +- +- return 0; +-} +- +-/* +- * This device has no shutdown method - fully functioning local APICs +- * are needed on every CPU up until machine_halt/restart/poweroff. +- */ +- +-static struct sysdev_class lapic_sysclass = { +- .name = "lapic", +- .resume = lapic_resume, +- .suspend = lapic_suspend, +-}; +- +-static struct sys_device device_lapic = { +- .id = 0, +- .cls = &lapic_sysclass, +-}; +- +-static void __cpuinit apic_pm_activate(void) +-{ +- apic_pm_state.active = 1; +-} +- +-static int __init init_lapic_sysfs(void) +-{ +- int error; +- +- if (!cpu_has_apic) +- return 0; +- /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ +- +- error = sysdev_class_register(&lapic_sysclass); +- if (!error) +- error = sysdev_register(&device_lapic); +- return error; +-} +-device_initcall(init_lapic_sysfs); +- +-#else /* CONFIG_PM */ +- +-static void apic_pm_activate(void) { } +- +-#endif /* CONFIG_PM */ +- +-#ifdef CONFIG_X86_64 +-/* +- * apic_is_clustered_box() -- Check if we can expect good TSC +- * +- * Thus far, the major user of this is IBM's Summit2 series: +- * +- * Clustered boxes may have unsynced TSC problems if they are +- * multi-chassis. Use available data to take a good guess. +- * If in doubt, go HPET. +- */ +-__cpuinit int apic_is_clustered_box(void) +-{ +- int i, clusters, zeros; +- unsigned id; +- u16 *bios_cpu_apicid; +- DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); +- +- /* +- * there is not this kind of box with AMD CPU yet. +- * Some AMD box with quadcore cpu and 8 sockets apicid +- * will be [4, 0x23] or [8, 0x27] could be thought to +- * vsmp box still need checking... +- */ +- if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) +- return 0; +- +- bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); +- bitmap_zero(clustermap, NUM_APIC_CLUSTERS); +- +- for (i = 0; i < nr_cpu_ids; i++) { +- /* are we being called early in kernel startup? */ +- if (bios_cpu_apicid) { +- id = bios_cpu_apicid[i]; +- } else if (i < nr_cpu_ids) { +- if (cpu_present(i)) +- id = per_cpu(x86_bios_cpu_apicid, i); +- else +- continue; +- } else +- break; +- +- if (id != BAD_APICID) +- __set_bit(APIC_CLUSTERID(id), clustermap); +- } +- +- /* Problem: Partially populated chassis may not have CPUs in some of +- * the APIC clusters they have been allocated. 
Only present CPUs have +- * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. +- * Since clusters are allocated sequentially, count zeros only if +- * they are bounded by ones. +- */ +- clusters = 0; +- zeros = 0; +- for (i = 0; i < NUM_APIC_CLUSTERS; i++) { +- if (test_bit(i, clustermap)) { +- clusters += 1 + zeros; +- zeros = 0; +- } else +- ++zeros; +- } +- +- /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are +- * not guaranteed to be synced between boards +- */ +- if (is_vsmp_box() && clusters > 1) +- return 1; +- +- /* +- * If clusters > 2, then should be multi-chassis. +- * May have to revisit this when multi-core + hyperthreaded CPUs come +- * out, but AFAIK this will work even for them. +- */ +- return (clusters > 2); +-} +-#endif +- +-/* +- * APIC command line parameters +- */ +-static int __init setup_disableapic(char *arg) +-{ +- disable_apic = 1; +- setup_clear_cpu_cap(X86_FEATURE_APIC); +- return 0; +-} +-early_param("disableapic", setup_disableapic); +- +-/* same as disableapic, for compatibility */ +-static int __init setup_nolapic(char *arg) +-{ +- return setup_disableapic(arg); +-} +-early_param("nolapic", setup_nolapic); +- +-static int __init parse_lapic_timer_c2_ok(char *arg) +-{ +- local_apic_timer_c2_ok = 1; +- return 0; +-} +-early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); +- +-static int __init parse_disable_apic_timer(char *arg) +-{ +- disable_apic_timer = 1; +- return 0; +-} +-early_param("noapictimer", parse_disable_apic_timer); +- +-static int __init parse_nolapic_timer(char *arg) +-{ +- disable_apic_timer = 1; +- return 0; +-} +-early_param("nolapic_timer", parse_nolapic_timer); +- +-static int __init apic_set_verbosity(char *arg) +-{ +- if (!arg) { +-#ifdef CONFIG_X86_64 +- skip_ioapic_setup = 0; +- return 0; +-#endif +- return -EINVAL; +- } +- +- if (strcmp("debug", arg) == 0) +- apic_verbosity = APIC_DEBUG; +- else if (strcmp("verbose", arg) == 0) +- apic_verbosity = APIC_VERBOSE; +- else { +- pr_warning("APIC Verbosity level %s not recognised" +- " use apic=verbose or apic=debug\n", arg); +- return -EINVAL; +- } +- +- return 0; +-} +-early_param("apic", apic_set_verbosity); +- +-static int __init lapic_insert_resource(void) +-{ +- if (!apic_phys) +- return -1; +- +- /* Put local APIC into the resource map. 
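The cluster-counting loop in apic_is_clustered_box() above implements the rule described in its comment: a run of empty clusters is only added to the total once a populated cluster follows it, so trailing gaps are discarded. A small userspace sketch of that rule, using an invented cluster map:

#include <stdio.h>

/* Same counting rule as the loop above, over a plain array instead of a
 * kernel bitmap; the map contents are made up for illustration. */
static int count_clusters(const unsigned char *map, int nbits)
{
	int i, clusters = 0, zeros = 0;

	for (i = 0; i < nbits; i++) {
		if (map[i]) {                  /* cluster i has a known CPU */
			clusters += 1 + zeros; /* pending gap is bounded by ones */
			zeros = 0;
		} else {
			++zeros;               /* trailing gaps never get added */
		}
	}
	return clusters;
}

int main(void)
{
	/* Clusters 0 and 3 populated, 1 and 2 unreported, 4..7 empty */
	unsigned char clustermap[8] = { 1, 0, 0, 1, 0, 0, 0, 0 };

	printf("clusters = %d\n", count_clusters(clustermap, 8));  /* prints 4 */
	return 0;
}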
*/ +- lapic_resource.start = apic_phys; +- lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; +- insert_resource(&iomem_resource, &lapic_resource); +- +- return 0; +-} +- +-/* +- * need call insert after e820_reserve_resources() +- * that is using request_resource +- */ +-late_initcall(lapic_insert_resource); +Index: linux-2.6-tip/arch/x86/kernel/apic/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/Makefile +@@ -0,0 +1,19 @@ ++# ++# Makefile for local APIC drivers and for the IO-APIC code ++# ++ ++obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o ++obj-$(CONFIG_X86_IO_APIC) += io_apic.o ++obj-$(CONFIG_SMP) += ipi.o ++ ++ifeq ($(CONFIG_X86_64),y) ++obj-y += apic_flat_64.o ++obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o ++obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o ++obj-$(CONFIG_X86_UV) += x2apic_uv_x.o ++endif ++ ++obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o ++obj-$(CONFIG_X86_NUMAQ) += numaq_32.o ++obj-$(CONFIG_X86_ES7000) += es7000_32.o ++obj-$(CONFIG_X86_SUMMIT) += summit_32.o +Index: linux-2.6-tip/arch/x86/kernel/apic/apic.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/apic.c +@@ -0,0 +1,2219 @@ ++/* ++ * Local APIC handling, local APIC timers ++ * ++ * (c) 1999, 2000, 2009 Ingo Molnar ++ * ++ * Fixes ++ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; ++ * thanks to Eric Gilmore ++ * and Rolf G. Tews ++ * for testing these extensively. ++ * Maciej W. Rozycki : Various updates and fixes. ++ * Mikael Pettersson : Power Management for UP-APIC. ++ * Pavel Machek and ++ * Mikael Pettersson : PM converted to driver model. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++unsigned int num_processors; ++ ++unsigned disabled_cpus __cpuinitdata; ++ ++/* Processor that is doing the boot up */ ++unsigned int boot_cpu_physical_apicid = -1U; ++ ++/* ++ * The highest APIC ID seen during enumeration. ++ * ++ * This determines the messaging protocol we can use: if all APIC IDs ++ * are in the 0 ... 7 range, then we can use logical addressing which ++ * has some performance advantages (better broadcasting). ++ * ++ * If there's an APIC ID above 8, we use physical addressing. ++ */ ++unsigned int max_physical_apicid; ++ ++/* ++ * Bitmask of physically existing CPUs: ++ */ ++physid_mask_t phys_cpu_present_map; ++ ++/* ++ * Map cpu index to physical APIC ID ++ */ ++DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); ++DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); ++EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); ++EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); ++ ++#ifdef CONFIG_X86_32 ++/* ++ * Knob to control our willingness to enable the local APIC. 
++ * ++ * +1=force-enable ++ */ ++static int force_enable_local_apic; ++/* ++ * APIC command line parameters ++ */ ++static int __init parse_lapic(char *arg) ++{ ++ force_enable_local_apic = 1; ++ return 0; ++} ++early_param("lapic", parse_lapic); ++/* Local APIC was disabled by the BIOS and enabled by the kernel */ ++static int enabled_via_apicbase; ++ ++#endif ++ ++#ifdef CONFIG_X86_64 ++static int apic_calibrate_pmtmr __initdata; ++static __init int setup_apicpmtimer(char *s) ++{ ++ apic_calibrate_pmtmr = 1; ++ notsc_setup(NULL); ++ return 0; ++} ++__setup("apicpmtimer", setup_apicpmtimer); ++#endif ++ ++#ifdef CONFIG_X86_X2APIC ++int x2apic; ++/* x2apic enabled before OS handover */ ++static int x2apic_preenabled; ++static int disable_x2apic; ++static __init int setup_nox2apic(char *str) ++{ ++ disable_x2apic = 1; ++ setup_clear_cpu_cap(X86_FEATURE_X2APIC); ++ return 0; ++} ++early_param("nox2apic", setup_nox2apic); ++#endif ++ ++unsigned long mp_lapic_addr; ++int disable_apic; ++/* Disable local APIC timer from the kernel commandline or via dmi quirk */ ++static int disable_apic_timer __cpuinitdata; ++/* Local APIC timer works in C2 */ ++int local_apic_timer_c2_ok; ++EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); ++ ++int first_system_vector = 0xfe; ++ ++/* ++ * Debug level, exported for io_apic.c ++ */ ++unsigned int apic_verbosity; ++ ++int pic_mode; ++ ++/* Have we found an MP table */ ++int smp_found_config; ++ ++static struct resource lapic_resource = { ++ .name = "Local APIC", ++ .flags = IORESOURCE_MEM | IORESOURCE_BUSY, ++}; ++ ++static unsigned int calibration_result; ++ ++static int lapic_next_event(unsigned long delta, ++ struct clock_event_device *evt); ++static void lapic_timer_setup(enum clock_event_mode mode, ++ struct clock_event_device *evt); ++static void lapic_timer_broadcast(const struct cpumask *mask); ++static void apic_pm_activate(void); ++ ++/* ++ * The local apic timer can be used for any function which is CPU local. 
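For reference, the handlers registered in this file are consumed from the kernel command line: lapic (the 32-bit force-enable knob documented in the surrounding comment), apicpmtimer and nox2apic here, plus disableapic/nolapic, noapictimer/nolapic_timer, lapic_timer_c2_ok and apic=verbose|debug as seen earlier in the removed copy. A hypothetical boot entry exercising two of them; the bootloader syntax, kernel path and root device are illustrative only:

	kernel /boot/vmlinuz-2.6.29.5-1-rt root=/dev/sda1 ro apic=verbose noapictimer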
++ */ ++static struct clock_event_device lapic_clockevent = { ++ .name = "lapic", ++ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT ++ | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, ++ .shift = 32, ++ .set_mode = lapic_timer_setup, ++ .set_next_event = lapic_next_event, ++ .broadcast = lapic_timer_broadcast, ++ .rating = 100, ++ .irq = -1, ++}; ++static DEFINE_PER_CPU(struct clock_event_device, lapic_events); ++ ++static unsigned long apic_phys; ++ ++/* ++ * Get the LAPIC version ++ */ ++static inline int lapic_get_version(void) ++{ ++ return GET_APIC_VERSION(apic_read(APIC_LVR)); ++} ++ ++/* ++ * Check, if the APIC is integrated or a separate chip ++ */ ++static inline int lapic_is_integrated(void) ++{ ++#ifdef CONFIG_X86_64 ++ return 1; ++#else ++ return APIC_INTEGRATED(lapic_get_version()); ++#endif ++} ++ ++/* ++ * Check, whether this is a modern or a first generation APIC ++ */ ++static int modern_apic(void) ++{ ++ /* AMD systems use old APIC versions, so check the CPU */ ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && ++ boot_cpu_data.x86 >= 0xf) ++ return 1; ++ return lapic_get_version() >= 0x14; ++} ++ ++void native_apic_wait_icr_idle(void) ++{ ++ while (apic_read(APIC_ICR) & APIC_ICR_BUSY) ++ cpu_relax(); ++} ++ ++u32 native_safe_apic_wait_icr_idle(void) ++{ ++ u32 send_status; ++ int timeout; ++ ++ timeout = 0; ++ do { ++ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; ++ if (!send_status) ++ break; ++ udelay(100); ++ } while (timeout++ < 1000); ++ ++ return send_status; ++} ++ ++void native_apic_icr_write(u32 low, u32 id) ++{ ++ apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); ++ apic_write(APIC_ICR, low); ++} ++ ++u64 native_apic_icr_read(void) ++{ ++ u32 icr1, icr2; ++ ++ icr2 = apic_read(APIC_ICR2); ++ icr1 = apic_read(APIC_ICR); ++ ++ return icr1 | ((u64)icr2 << 32); ++} ++ ++/** ++ * enable_NMI_through_LVT0 - enable NMI through local vector table 0 ++ */ ++void __cpuinit enable_NMI_through_LVT0(void) ++{ ++ unsigned int v; ++ ++ /* unmask and set to NMI */ ++ v = APIC_DM_NMI; ++ ++ /* Level triggered for 82489DX (32bit mode) */ ++ if (!lapic_is_integrated()) ++ v |= APIC_LVT_LEVEL_TRIGGER; ++ ++ apic_write(APIC_LVT0, v); ++} ++ ++#ifdef CONFIG_X86_32 ++/** ++ * get_physical_broadcast - Get number of physical broadcast IDs ++ */ ++int get_physical_broadcast(void) ++{ ++ return modern_apic() ? 0xff : 0xf; ++} ++#endif ++ ++/** ++ * lapic_get_maxlvt - get the maximum number of local vector table entries ++ */ ++int lapic_get_maxlvt(void) ++{ ++ unsigned int v; ++ ++ v = apic_read(APIC_LVR); ++ /* ++ * - we always have APIC integrated on 64bit mode ++ * - 82489DXs do not report # of LVT entries ++ */ ++ return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; ++} ++ ++/* ++ * Local APIC timer ++ */ ++ ++/* Clock divisor */ ++#define APIC_DIVISOR 16 ++ ++/* ++ * This function sets up the local APIC timer, with a timeout of ++ * 'clocks' APIC bus clock. During calibration we actually call ++ * this function twice on the boot CPU, once with a bogus timeout ++ * value, second time for real. The other (noncalibrating) CPUs ++ * call this function only once, with the real, calibrated value. ++ * ++ * We do reads before writes even if unnecessary, to get around the ++ * P5 APIC double write bug. 
++ */ ++static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) ++{ ++ unsigned int lvtt_value, tmp_value; ++ ++ lvtt_value = LOCAL_TIMER_VECTOR; ++ if (!oneshot) ++ lvtt_value |= APIC_LVT_TIMER_PERIODIC; ++ if (!lapic_is_integrated()) ++ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); ++ ++ if (!irqen) ++ lvtt_value |= APIC_LVT_MASKED; ++ ++ apic_write(APIC_LVTT, lvtt_value); ++ ++ /* ++ * Divide PICLK by 16 ++ */ ++ tmp_value = apic_read(APIC_TDCR); ++ apic_write(APIC_TDCR, ++ (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | ++ APIC_TDR_DIV_16); ++ ++ if (!oneshot) ++ apic_write(APIC_TMICT, clocks / APIC_DIVISOR); ++} ++ ++/* ++ * Setup extended LVT, AMD specific (K8, family 10h) ++ * ++ * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and ++ * MCE interrupts are supported. Thus MCE offset must be set to 0. ++ * ++ * If mask=1, the LVT entry does not generate interrupts while mask=0 ++ * enables the vector. See also the BKDGs. ++ */ ++ ++#define APIC_EILVT_LVTOFF_MCE 0 ++#define APIC_EILVT_LVTOFF_IBS 1 ++ ++static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) ++{ ++ unsigned long reg = (lvt_off << 4) + APIC_EILVT0; ++ unsigned int v = (mask << 16) | (msg_type << 8) | vector; ++ ++ apic_write(reg, v); ++} ++ ++u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) ++{ ++ setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); ++ return APIC_EILVT_LVTOFF_MCE; ++} ++ ++u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) ++{ ++ setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); ++ return APIC_EILVT_LVTOFF_IBS; ++} ++EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); ++ ++/* ++ * Program the next event, relative to now ++ */ ++static int lapic_next_event(unsigned long delta, ++ struct clock_event_device *evt) ++{ ++ apic_write(APIC_TMICT, delta); ++ return 0; ++} ++ ++/* ++ * Setup the lapic timer in periodic or oneshot mode ++ */ ++static void lapic_timer_setup(enum clock_event_mode mode, ++ struct clock_event_device *evt) ++{ ++ unsigned long flags; ++ unsigned int v; ++ ++ /* Lapic used as dummy for broadcast ? */ ++ if (evt->features & CLOCK_EVT_FEAT_DUMMY) ++ return; ++ ++ local_irq_save(flags); ++ ++ switch (mode) { ++ case CLOCK_EVT_MODE_PERIODIC: ++ case CLOCK_EVT_MODE_ONESHOT: ++ __setup_APIC_LVTT(calibration_result, ++ mode != CLOCK_EVT_MODE_PERIODIC, 1); ++ break; ++ case CLOCK_EVT_MODE_UNUSED: ++ case CLOCK_EVT_MODE_SHUTDOWN: ++ v = apic_read(APIC_LVTT); ++ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); ++ apic_write(APIC_LVTT, v); ++ apic_write(APIC_TMICT, 0xffffffff); ++ break; ++ case CLOCK_EVT_MODE_RESUME: ++ /* Nothing to do here */ ++ break; ++ } ++ ++ local_irq_restore(flags); ++} ++ ++/* ++ * Local APIC timer broadcast function ++ */ ++static void lapic_timer_broadcast(const struct cpumask *mask) ++{ ++#ifdef CONFIG_SMP ++ apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR); ++#endif ++} ++ ++/* ++ * Setup the local APIC timer for this CPU. Copy the initilized values ++ * of the boot CPU and register the clock event in the framework. ++ */ ++static void __cpuinit setup_APIC_timer(void) ++{ ++ struct clock_event_device *levt = &__get_cpu_var(lapic_events); ++ ++ memcpy(levt, &lapic_clockevent, sizeof(*levt)); ++ levt->cpumask = cpumask_of(smp_processor_id()); ++ ++ clockevents_register_device(levt); ++} ++ ++/* ++ * In this functions we calibrate APIC bus clocks to the external timer. ++ * ++ * We want to do the calibration only once since we want to have local timer ++ * irqs syncron. 
CPUs connected by the same APIC bus have the very same bus ++ * frequency. ++ * ++ * This was previously done by reading the PIT/HPET and waiting for a wrap ++ * around to find out, that a tick has elapsed. I have a box, where the PIT ++ * readout is broken, so it never gets out of the wait loop again. This was ++ * also reported by others. ++ * ++ * Monitoring the jiffies value is inaccurate and the clockevents ++ * infrastructure allows us to do a simple substitution of the interrupt ++ * handler. ++ * ++ * The calibration routine also uses the pm_timer when possible, as the PIT ++ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes ++ * back to normal later in the boot process). ++ */ ++ ++#define LAPIC_CAL_LOOPS (HZ/10) ++ ++static __initdata int lapic_cal_loops = -1; ++static __initdata long lapic_cal_t1, lapic_cal_t2; ++static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; ++static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; ++static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; ++ ++/* ++ * Temporary interrupt handler. ++ */ ++static void __init lapic_cal_handler(struct clock_event_device *dev) ++{ ++ unsigned long long tsc = 0; ++ long tapic = apic_read(APIC_TMCCT); ++ unsigned long pm = acpi_pm_read_early(); ++ ++ if (cpu_has_tsc) ++ rdtscll(tsc); ++ ++ switch (lapic_cal_loops++) { ++ case 0: ++ lapic_cal_t1 = tapic; ++ lapic_cal_tsc1 = tsc; ++ lapic_cal_pm1 = pm; ++ lapic_cal_j1 = jiffies; ++ break; ++ ++ case LAPIC_CAL_LOOPS: ++ lapic_cal_t2 = tapic; ++ lapic_cal_tsc2 = tsc; ++ if (pm < lapic_cal_pm1) ++ pm += ACPI_PM_OVRRUN; ++ lapic_cal_pm2 = pm; ++ lapic_cal_j2 = jiffies; ++ break; ++ } ++} ++ ++static int __init ++calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) ++{ ++ const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; ++ const long pm_thresh = pm_100ms / 100; ++ unsigned long mult; ++ u64 res; ++ ++#ifndef CONFIG_X86_PM_TIMER ++ return -1; ++#endif ++ ++ apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm); ++ ++ /* Check, if the PM timer is available */ ++ if (!deltapm) ++ return -1; ++ ++ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); ++ ++ if (deltapm > (pm_100ms - pm_thresh) && ++ deltapm < (pm_100ms + pm_thresh)) { ++ apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n"); ++ return 0; ++ } ++ ++ res = (((u64)deltapm) * mult) >> 22; ++ do_div(res, 1000000); ++ pr_warning("APIC calibration not consistent " ++ "with PM-Timer: %ldms instead of 100ms\n",(long)res); ++ ++ /* Correct the lapic counter value */ ++ res = (((u64)(*delta)) * pm_100ms); ++ do_div(res, deltapm); ++ pr_info("APIC delta adjusted to PM-Timer: " ++ "%lu (%ld)\n", (unsigned long)res, *delta); ++ *delta = (long)res; ++ ++ /* Correct the tsc counter value */ ++ if (cpu_has_tsc) { ++ res = (((u64)(*deltatsc)) * pm_100ms); ++ do_div(res, deltapm); ++ apic_printk(APIC_VERBOSE, "TSC delta adjusted to " ++ "PM-Timer: %lu (%ld) \n", ++ (unsigned long)res, *deltatsc); ++ *deltatsc = (long)res; ++ } ++ ++ return 0; ++} ++ ++static int __init calibrate_APIC_clock(void) ++{ ++ struct clock_event_device *levt = &__get_cpu_var(lapic_events); ++ void (*real_handler)(struct clock_event_device *dev); ++ unsigned long deltaj; ++ long delta, deltatsc; ++ int pm_referenced = 0; ++ ++ local_irq_disable(); ++ ++ /* Replace the global interrupt handler */ ++ real_handler = global_clock_event->event_handler; ++ global_clock_event->event_handler = lapic_cal_handler; ++ ++ /* ++ * Setup the APIC counter to maximum. 
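The calibrate_by_pmtimer() helper above cross-checks the LAPIC measurement against the ACPI PM timer: if the PM timer says the nominal 100 ms window was off by more than 1%, the LAPIC delta (and, in this version, the TSC delta too) is rescaled to a true 100 ms. A standalone sketch of that correction with invented readings; PMTMR_TICKS_PER_SEC matches the usual 3.579545 MHz rate, everything else is assumed:

#include <stdio.h>
#include <stdint.h>

#define PMTMR_TICKS_PER_SEC 3579545UL   /* ACPI PM timer rate, ~3.58 MHz */

int main(void)
{
	const long pm_100ms  = PMTMR_TICKS_PER_SEC / 10;
	const long pm_thresh = pm_100ms / 100;   /* accept +/- 1% around 100 ms */

	long deltapm = 394000;    /* invented: PM timer saw ~110 ms, not 100 ms */
	long delta   = 4400000;   /* invented LAPIC countdown over the same window */

	if (deltapm > pm_100ms - pm_thresh && deltapm < pm_100ms + pm_thresh) {
		printf("PM-Timer result ok, delta = %ld\n", delta);
	} else {
		/* Rescale the LAPIC delta so it corresponds to a true 100 ms;
		 * the kernel applies the same correction to the TSC delta. */
		uint64_t res = (uint64_t)delta * pm_100ms;

		res /= deltapm;
		printf("APIC delta adjusted to PM-Timer: %lu (%ld)\n",
		       (unsigned long)res, delta);
	}
	return 0;
}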
There is no way the lapic ++ * can underflow in the 100ms detection time frame ++ */ ++ __setup_APIC_LVTT(0xffffffff, 0, 0); ++ ++ /* Let the interrupts run */ ++ local_irq_enable(); ++ ++ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) ++ cpu_relax(); ++ ++ local_irq_disable(); ++ ++ /* Restore the real event handler */ ++ global_clock_event->event_handler = real_handler; ++ ++ /* Build delta t1-t2 as apic timer counts down */ ++ delta = lapic_cal_t1 - lapic_cal_t2; ++ apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); ++ ++ deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); ++ ++ /* we trust the PM based calibration if possible */ ++ pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, ++ &delta, &deltatsc); ++ ++ /* Calculate the scaled math multiplication factor */ ++ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, ++ lapic_clockevent.shift); ++ lapic_clockevent.max_delta_ns = ++ clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); ++ lapic_clockevent.min_delta_ns = ++ clockevent_delta2ns(0xF, &lapic_clockevent); ++ ++ calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; ++ ++ apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); ++ apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); ++ apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", ++ calibration_result); ++ ++ if (cpu_has_tsc) { ++ apic_printk(APIC_VERBOSE, "..... CPU clock speed is " ++ "%ld.%04ld MHz.\n", ++ (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), ++ (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); ++ } ++ ++ apic_printk(APIC_VERBOSE, "..... host bus clock speed is " ++ "%u.%04u MHz.\n", ++ calibration_result / (1000000 / HZ), ++ calibration_result % (1000000 / HZ)); ++ ++ /* ++ * Do a sanity check on the APIC calibration result ++ */ ++ if (calibration_result < (1000000 / HZ)) { ++ local_irq_enable(); ++ pr_warning("APIC frequency too slow, disabling apic timer\n"); ++ return -1; ++ } ++ ++ levt->features &= ~CLOCK_EVT_FEAT_DUMMY; ++ ++ /* ++ * PM timer calibration failed or not turned on ++ * so lets try APIC timer based calibration ++ */ ++ if (!pm_referenced) { ++ apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); ++ ++ /* ++ * Setup the apic timer manually ++ */ ++ levt->event_handler = lapic_cal_handler; ++ lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); ++ lapic_cal_loops = -1; ++ ++ /* Let the interrupts run */ ++ local_irq_enable(); ++ ++ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) ++ cpu_relax(); ++ ++ /* Stop the lapic timer */ ++ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); ++ ++ /* Jiffies delta */ ++ deltaj = lapic_cal_j2 - lapic_cal_j1; ++ apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); ++ ++ /* Check, if the jiffies result is consistent */ ++ if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) ++ apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); ++ else ++ levt->features |= CLOCK_EVT_FEAT_DUMMY; ++ } else ++ local_irq_enable(); ++ ++ if (levt->features & CLOCK_EVT_FEAT_DUMMY) { ++ pr_warning("APIC timer disabled due to verification failure\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Setup the boot APIC ++ * ++ * Calibrate and verify the result. ++ */ ++void __init setup_boot_APIC_clock(void) ++{ ++ /* ++ * The local apic timer can be disabled via the kernel ++ * commandline or from the CPU detection code. Register the lapic ++ * timer as a dummy clock event source on SMP systems, so the ++ * broadcast mechanism is used. On UP systems simply ignore it. 
++ */ ++ if (disable_apic_timer) { ++ pr_info("Disabling APIC timer\n"); ++ /* No broadcast on UP ! */ ++ if (num_possible_cpus() > 1) { ++ lapic_clockevent.mult = 1; ++ setup_APIC_timer(); ++ } ++ return; ++ } ++ ++ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" ++ "calibrating APIC timer ...\n"); ++ ++ if (calibrate_APIC_clock()) { ++ /* No broadcast on UP ! */ ++ if (num_possible_cpus() > 1) ++ setup_APIC_timer(); ++ return; ++ } ++ ++ /* ++ * If nmi_watchdog is set to IO_APIC, we need the ++ * PIT/HPET going. Otherwise register lapic as a dummy ++ * device. ++ */ ++ if (nmi_watchdog != NMI_IO_APIC) ++ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; ++ else ++ pr_warning("APIC timer registered as dummy," ++ " due to nmi_watchdog=%d!\n", nmi_watchdog); ++ ++ /* Setup the lapic or request the broadcast */ ++ setup_APIC_timer(); ++} ++ ++void __cpuinit setup_secondary_APIC_clock(void) ++{ ++ setup_APIC_timer(); ++} ++ ++/* ++ * The guts of the apic timer interrupt ++ */ ++static void local_apic_timer_interrupt(void) ++{ ++ int cpu = smp_processor_id(); ++ struct clock_event_device *evt = &per_cpu(lapic_events, cpu); ++ ++ /* ++ * Normally we should not be here till LAPIC has been initialized but ++ * in some cases like kdump, its possible that there is a pending LAPIC ++ * timer interrupt from previous kernel's context and is delivered in ++ * new kernel the moment interrupts are enabled. ++ * ++ * Interrupts are enabled early and LAPIC is setup much later, hence ++ * its possible that when we get here evt->event_handler is NULL. ++ * Check for event_handler being NULL and discard the interrupt as ++ * spurious. ++ */ ++ if (!evt->event_handler) { ++ pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); ++ /* Switch it off */ ++ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); ++ return; ++ } ++ ++ /* ++ * the NMI deadlock-detector uses this. ++ */ ++ inc_irq_stat(apic_timer_irqs); ++ ++ evt->event_handler(evt); ++ ++ perf_counter_unthrottle(); ++} ++ ++/* ++ * Local APIC timer interrupt. This is the most natural way for doing ++ * local interrupts, but local timer interrupts can be emulated by ++ * broadcast interrupts too. [in case the hw doesn't support APIC timers] ++ * ++ * [ if a single-CPU system runs an SMP kernel then we call the local ++ * interrupt as well. Thus we cannot inline the local irq ... ] ++ */ ++void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) ++{ ++ struct pt_regs *old_regs = set_irq_regs(regs); ++ ++ /* ++ * NOTE! We'd better ACK the irq immediately, ++ * because timer handling can be slow. ++ */ ++ ack_APIC_irq(); ++ /* ++ * update_process_times() expects us to have done irq_enter(). ++ * Besides, if we don't timer interrupts ignore the global ++ * interrupt lock, which is the WrongThing (tm) to do. ++ */ ++ exit_idle(); ++ irq_enter(); ++ local_apic_timer_interrupt(); ++ irq_exit(); ++ ++ set_irq_regs(old_regs); ++} ++ ++int setup_profiling_timer(unsigned int multiplier) ++{ ++ return -EINVAL; ++} ++ ++/* ++ * Local APIC start and shutdown ++ */ ++ ++/** ++ * clear_local_APIC - shutdown the local APIC ++ * ++ * This is called, when a CPU is disabled and before rebooting, so the state of ++ * the local APIC has no dangling leftovers. Also used to cleanout any BIOS ++ * leftovers during boot. 
++ */ ++void clear_local_APIC(void) ++{ ++ int maxlvt; ++ u32 v; ++ ++ /* APIC hasn't been mapped yet */ ++ if (!x2apic && !apic_phys) ++ return; ++ ++ maxlvt = lapic_get_maxlvt(); ++ /* ++ * Masking an LVT entry can trigger a local APIC error ++ * if the vector is zero. Mask LVTERR first to prevent this. ++ */ ++ if (maxlvt >= 3) { ++ v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ ++ apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); ++ } ++ /* ++ * Careful: we have to set masks only first to deassert ++ * any level-triggered sources. ++ */ ++ v = apic_read(APIC_LVTT); ++ apic_write(APIC_LVTT, v | APIC_LVT_MASKED); ++ v = apic_read(APIC_LVT0); ++ apic_write(APIC_LVT0, v | APIC_LVT_MASKED); ++ v = apic_read(APIC_LVT1); ++ apic_write(APIC_LVT1, v | APIC_LVT_MASKED); ++ if (maxlvt >= 4) { ++ v = apic_read(APIC_LVTPC); ++ apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); ++ } ++ ++ /* lets not touch this if we didn't frob it */ ++#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) ++ if (maxlvt >= 5) { ++ v = apic_read(APIC_LVTTHMR); ++ apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); ++ } ++#endif ++#ifdef CONFIG_X86_MCE_INTEL ++ if (maxlvt >= 6) { ++ v = apic_read(APIC_LVTCMCI); ++ if (!(v & APIC_LVT_MASKED)) ++ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); ++ } ++#endif ++ ++ /* ++ * Clean APIC state for other OSs: ++ */ ++ apic_write(APIC_LVTT, APIC_LVT_MASKED); ++ apic_write(APIC_LVT0, APIC_LVT_MASKED); ++ apic_write(APIC_LVT1, APIC_LVT_MASKED); ++ if (maxlvt >= 3) ++ apic_write(APIC_LVTERR, APIC_LVT_MASKED); ++ if (maxlvt >= 4) ++ apic_write(APIC_LVTPC, APIC_LVT_MASKED); ++ ++ /* Integrated APIC (!82489DX) ? */ ++ if (lapic_is_integrated()) { ++ if (maxlvt > 3) ++ /* Clear ESR due to Pentium errata 3AP and 11AP */ ++ apic_write(APIC_ESR, 0); ++ apic_read(APIC_ESR); ++ } ++} ++ ++/** ++ * disable_local_APIC - clear and disable the local APIC ++ */ ++void disable_local_APIC(void) ++{ ++ unsigned int value; ++ ++ /* APIC hasn't been mapped yet */ ++ if (!apic_phys) ++ return; ++ ++ clear_local_APIC(); ++ ++ /* ++ * Disable APIC (implies clearing of registers ++ * for 82489DX!). ++ */ ++ value = apic_read(APIC_SPIV); ++ value &= ~APIC_SPIV_APIC_ENABLED; ++ apic_write(APIC_SPIV, value); ++ ++#ifdef CONFIG_X86_32 ++ /* ++ * When LAPIC was disabled by the BIOS and enabled by the kernel, ++ * restore the disabled state. ++ */ ++ if (enabled_via_apicbase) { ++ unsigned int l, h; ++ ++ rdmsr(MSR_IA32_APICBASE, l, h); ++ l &= ~MSR_IA32_APICBASE_ENABLE; ++ wrmsr(MSR_IA32_APICBASE, l, h); ++ } ++#endif ++} ++ ++/* ++ * If Linux enabled the LAPIC against the BIOS default disable it down before ++ * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and ++ * not power-off. Additionally clear all LVT entries before disable_local_APIC ++ * for the case where Linux didn't enable the LAPIC. ++ */ ++void lapic_shutdown(void) ++{ ++ unsigned long flags; ++ ++ if (!cpu_has_apic) ++ return; ++ ++ local_irq_save(flags); ++ ++#ifdef CONFIG_X86_32 ++ if (!enabled_via_apicbase) ++ clear_local_APIC(); ++ else ++#endif ++ disable_local_APIC(); ++ ++ ++ local_irq_restore(flags); ++} ++ ++/* ++ * This is to verify that we're looking at a real local APIC. ++ * Check these against your board if the CPUs aren't getting ++ * started for no apparent reason. ++ */ ++int __init verify_local_APIC(void) ++{ ++ unsigned int reg0, reg1; ++ ++ /* ++ * The version register is read-only in a real APIC. 
++ */ ++ reg0 = apic_read(APIC_LVR); ++ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); ++ apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); ++ reg1 = apic_read(APIC_LVR); ++ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); ++ ++ /* ++ * The two version reads above should print the same ++ * numbers. If the second one is different, then we ++ * poke at a non-APIC. ++ */ ++ if (reg1 != reg0) ++ return 0; ++ ++ /* ++ * Check if the version looks reasonably. ++ */ ++ reg1 = GET_APIC_VERSION(reg0); ++ if (reg1 == 0x00 || reg1 == 0xff) ++ return 0; ++ reg1 = lapic_get_maxlvt(); ++ if (reg1 < 0x02 || reg1 == 0xff) ++ return 0; ++ ++ /* ++ * The ID register is read/write in a real APIC. ++ */ ++ reg0 = apic_read(APIC_ID); ++ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); ++ apic_write(APIC_ID, reg0 ^ apic->apic_id_mask); ++ reg1 = apic_read(APIC_ID); ++ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); ++ apic_write(APIC_ID, reg0); ++ if (reg1 != (reg0 ^ apic->apic_id_mask)) ++ return 0; ++ ++ /* ++ * The next two are just to see if we have sane values. ++ * They're only really relevant if we're in Virtual Wire ++ * compatibility mode, but most boxes are anymore. ++ */ ++ reg0 = apic_read(APIC_LVT0); ++ apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); ++ reg1 = apic_read(APIC_LVT1); ++ apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); ++ ++ return 1; ++} ++ ++/** ++ * sync_Arb_IDs - synchronize APIC bus arbitration IDs ++ */ ++void __init sync_Arb_IDs(void) ++{ ++ /* ++ * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not ++ * needed on AMD. ++ */ ++ if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ++ return; ++ ++ /* ++ * Wait for idle. ++ */ ++ apic_wait_icr_idle(); ++ ++ apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); ++ apic_write(APIC_ICR, APIC_DEST_ALLINC | ++ APIC_INT_LEVELTRIG | APIC_DM_INIT); ++} ++ ++/* ++ * An initial setup of the virtual wire mode. ++ */ ++void __init init_bsp_APIC(void) ++{ ++ unsigned int value; ++ ++ /* ++ * Don't do the setup now if we have a SMP BIOS as the ++ * through-I/O-APIC virtual wire mode might be active. ++ */ ++ if (smp_found_config || !cpu_has_apic) ++ return; ++ ++ /* ++ * Do not trust the local APIC being empty at bootup. ++ */ ++ clear_local_APIC(); ++ ++ /* ++ * Enable APIC. ++ */ ++ value = apic_read(APIC_SPIV); ++ value &= ~APIC_VECTOR_MASK; ++ value |= APIC_SPIV_APIC_ENABLED; ++ ++#ifdef CONFIG_X86_32 ++ /* This bit is reserved on P4/Xeon and should be cleared */ ++ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && ++ (boot_cpu_data.x86 == 15)) ++ value &= ~APIC_SPIV_FOCUS_DISABLED; ++ else ++#endif ++ value |= APIC_SPIV_FOCUS_DISABLED; ++ value |= SPURIOUS_APIC_VECTOR; ++ apic_write(APIC_SPIV, value); ++ ++ /* ++ * Set up the virtual wire mode. ++ */ ++ apic_write(APIC_LVT0, APIC_DM_EXTINT); ++ value = APIC_DM_NMI; ++ if (!lapic_is_integrated()) /* 82489DX */ ++ value |= APIC_LVT_LEVEL_TRIGGER; ++ apic_write(APIC_LVT1, value); ++} ++ ++static void __cpuinit lapic_setup_esr(void) ++{ ++ unsigned int oldvalue, value, maxlvt; ++ ++ if (!lapic_is_integrated()) { ++ pr_info("No ESR for 82489DX.\n"); ++ return; ++ } ++ ++ if (apic->disable_esr) { ++ /* ++ * Something untraceable is creating bad interrupts on ++ * secondary quads ... 
for the moment, just leave the ++ * ESR disabled - we can't do anything useful with the ++ * errors anyway - mbligh ++ */ ++ pr_info("Leaving ESR disabled.\n"); ++ return; ++ } ++ ++ maxlvt = lapic_get_maxlvt(); ++ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ ++ apic_write(APIC_ESR, 0); ++ oldvalue = apic_read(APIC_ESR); ++ ++ /* enables sending errors */ ++ value = ERROR_APIC_VECTOR; ++ apic_write(APIC_LVTERR, value); ++ ++ /* ++ * spec says clear errors after enabling vector. ++ */ ++ if (maxlvt > 3) ++ apic_write(APIC_ESR, 0); ++ value = apic_read(APIC_ESR); ++ if (value != oldvalue) ++ apic_printk(APIC_VERBOSE, "ESR value before enabling " ++ "vector: 0x%08x after: 0x%08x\n", ++ oldvalue, value); ++} ++ ++ ++/** ++ * setup_local_APIC - setup the local APIC ++ */ ++void __cpuinit setup_local_APIC(void) ++{ ++ unsigned int value; ++ int i, j; ++ ++ if (disable_apic) { ++ arch_disable_smp_support(); ++ return; ++ } ++ ++#ifdef CONFIG_X86_32 ++ /* Pound the ESR really hard over the head with a big hammer - mbligh */ ++ if (lapic_is_integrated() && apic->disable_esr) { ++ apic_write(APIC_ESR, 0); ++ apic_write(APIC_ESR, 0); ++ apic_write(APIC_ESR, 0); ++ apic_write(APIC_ESR, 0); ++ } ++#endif ++ perf_counters_lapic_init(0); ++ ++ preempt_disable(); ++ ++ /* ++ * Double-check whether this APIC is really registered. ++ * This is meaningless in clustered apic mode, so we skip it. ++ */ ++ if (!apic->apic_id_registered()) ++ BUG(); ++ ++ /* ++ * Intel recommends to set DFR, LDR and TPR before enabling ++ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel ++ * document number 292116). So here it goes... ++ */ ++ apic->init_apic_ldr(); ++ ++ /* ++ * Set Task Priority to 'accept all'. We never change this ++ * later on. ++ */ ++ value = apic_read(APIC_TASKPRI); ++ value &= ~APIC_TPRI_MASK; ++ apic_write(APIC_TASKPRI, value); ++ ++ /* ++ * After a crash, we no longer service the interrupts and a pending ++ * interrupt from previous kernel might still have ISR bit set. ++ * ++ * Most probably by now CPU has serviced that pending interrupt and ++ * it might not have done the ack_APIC_irq() because it thought, ++ * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it ++ * does not clear the ISR bit and cpu thinks it has already serivced ++ * the interrupt. Hence a vector might get locked. It was noticed ++ * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. ++ */ ++ for (i = APIC_ISR_NR - 1; i >= 0; i--) { ++ value = apic_read(APIC_ISR + i*0x10); ++ for (j = 31; j >= 0; j--) { ++ if (value & (1< 1) || ++ (boot_cpu_data.x86 >= 15)) ++ break; ++ goto no_apic; ++ case X86_VENDOR_INTEL: ++ if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || ++ (boot_cpu_data.x86 == 5 && cpu_has_apic)) ++ break; ++ goto no_apic; ++ default: ++ goto no_apic; ++ } ++ ++ if (!cpu_has_apic) { ++ /* ++ * Over-ride BIOS and try to enable the local APIC only if ++ * "lapic" specified. ++ */ ++ if (!force_enable_local_apic) { ++ pr_info("Local APIC disabled by BIOS -- " ++ "you can enable it with \"lapic\"\n"); ++ return -1; ++ } ++ /* ++ * Some BIOSes disable the local APIC in the APIC_BASE ++ * MSR. This can only be done in software for Intel P6 or later ++ * and AMD K7 (Model > 1) or later. 
++ */ ++ rdmsr(MSR_IA32_APICBASE, l, h); ++ if (!(l & MSR_IA32_APICBASE_ENABLE)) { ++ pr_info("Local APIC disabled by BIOS -- reenabling.\n"); ++ l &= ~MSR_IA32_APICBASE_BASE; ++ l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; ++ wrmsr(MSR_IA32_APICBASE, l, h); ++ enabled_via_apicbase = 1; ++ } ++ } ++ /* ++ * The APIC feature bit should now be enabled ++ * in `cpuid' ++ */ ++ features = cpuid_edx(1); ++ if (!(features & (1 << X86_FEATURE_APIC))) { ++ pr_warning("Could not enable APIC!\n"); ++ return -1; ++ } ++ set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); ++ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; ++ ++ /* The BIOS may have set up the APIC at some other address */ ++ rdmsr(MSR_IA32_APICBASE, l, h); ++ if (l & MSR_IA32_APICBASE_ENABLE) ++ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; ++ ++ pr_info("Found and enabled local APIC!\n"); ++ ++ apic_pm_activate(); ++ ++ return 0; ++ ++no_apic: ++ pr_info("No local APIC present or hardware disabled\n"); ++ return -1; ++} ++#endif ++ ++#ifdef CONFIG_X86_64 ++void __init early_init_lapic_mapping(void) ++{ ++ unsigned long phys_addr; ++ ++ /* ++ * If no local APIC can be found then go out ++ * : it means there is no mpatable and MADT ++ */ ++ if (!smp_found_config) ++ return; ++ ++ phys_addr = mp_lapic_addr; ++ ++ set_fixmap_nocache(FIX_APIC_BASE, phys_addr); ++ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", ++ APIC_BASE, phys_addr); ++ ++ /* ++ * Fetch the APIC ID of the BSP in case we have a ++ * default configuration (or the MP table is broken). ++ */ ++ boot_cpu_physical_apicid = read_apic_id(); ++} ++#endif ++ ++/** ++ * init_apic_mappings - initialize APIC mappings ++ */ ++void __init init_apic_mappings(void) ++{ ++ if (x2apic) { ++ boot_cpu_physical_apicid = read_apic_id(); ++ return; ++ } ++ ++ /* ++ * If no local APIC can be found then set up a fake all ++ * zeroes page to simulate the local APIC and another ++ * one for the IO-APIC. ++ */ ++ if (!smp_found_config && detect_init_APIC()) { ++ apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); ++ apic_phys = __pa(apic_phys); ++ } else ++ apic_phys = mp_lapic_addr; ++ ++ set_fixmap_nocache(FIX_APIC_BASE, apic_phys); ++ apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", ++ APIC_BASE, apic_phys); ++ ++ /* ++ * Fetch the APIC ID of the BSP in case we have a ++ * default configuration (or the MP table is broken). ++ */ ++ if (boot_cpu_physical_apicid == -1U) ++ boot_cpu_physical_apicid = read_apic_id(); ++} ++ ++/* ++ * This initializes the IO-APIC and APIC hardware if this is ++ * a UP kernel. ++ */ ++int apic_version[MAX_APICS]; ++ ++int __init APIC_init_uniprocessor(void) ++{ ++ if (disable_apic) { ++ pr_info("Apic disabled\n"); ++ return -1; ++ } ++#ifdef CONFIG_X86_64 ++ if (!cpu_has_apic) { ++ disable_apic = 1; ++ pr_info("Apic disabled by BIOS\n"); ++ return -1; ++ } ++#else ++ if (!smp_found_config && !cpu_has_apic) ++ return -1; ++ ++ /* ++ * Complain if the BIOS pretends there is one. 
++ */ ++ if (!cpu_has_apic && ++ APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { ++ pr_err("BIOS bug, local APIC 0x%x not detected!...\n", ++ boot_cpu_physical_apicid); ++ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); ++ return -1; ++ } ++#endif ++ ++ enable_IR_x2apic(); ++#ifdef CONFIG_X86_64 ++ default_setup_apic_routing(); ++#endif ++ ++ verify_local_APIC(); ++ connect_bsp_APIC(); ++ ++#ifdef CONFIG_X86_64 ++ apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); ++#else ++ /* ++ * Hack: In case of kdump, after a crash, kernel might be booting ++ * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid ++ * might be zero if read from MP tables. Get it from LAPIC. ++ */ ++# ifdef CONFIG_CRASH_DUMP ++ boot_cpu_physical_apicid = read_apic_id(); ++# endif ++#endif ++ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); ++ setup_local_APIC(); ++ ++#ifdef CONFIG_X86_IO_APIC ++ /* ++ * Now enable IO-APICs, actually call clear_IO_APIC ++ * We need clear_IO_APIC before enabling error vector ++ */ ++ if (!skip_ioapic_setup && nr_ioapics) ++ enable_IO_APIC(); ++#endif ++ ++ end_local_APIC_setup(); ++ ++#ifdef CONFIG_X86_IO_APIC ++ if (smp_found_config && !skip_ioapic_setup && nr_ioapics) ++ setup_IO_APIC(); ++ else { ++ nr_ioapics = 0; ++ localise_nmi_watchdog(); ++ } ++#else ++ localise_nmi_watchdog(); ++#endif ++ ++ setup_boot_clock(); ++#ifdef CONFIG_X86_64 ++ check_nmi_watchdog(); ++#endif ++ ++ return 0; ++} ++ ++/* ++ * Local APIC interrupts ++ */ ++ ++/* ++ * This interrupt should _never_ happen with our APIC/SMP architecture ++ */ ++void smp_spurious_interrupt(struct pt_regs *regs) ++{ ++ u32 v; ++ ++ exit_idle(); ++ irq_enter(); ++ /* ++ * Check if this really is a spurious interrupt and ACK it ++ * if it is a vectored one. Just in case... ++ * Spurious interrupts should not be ACKed. ++ */ ++ v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); ++ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ++ ack_APIC_irq(); ++ ++ inc_irq_stat(irq_spurious_count); ++ ++ /* see sw-dev-man vol 3, chapter 7.4.13.5 */ ++ pr_info("spurious APIC interrupt on CPU#%d, " ++ "should never happen.\n", smp_processor_id()); ++ irq_exit(); ++} ++ ++/* ++ * This interrupt should never happen with our APIC/SMP architecture ++ */ ++void smp_error_interrupt(struct pt_regs *regs) ++{ ++ u32 v, v1; ++ ++ exit_idle(); ++ irq_enter(); ++ /* First tickle the hardware, only then report what went on. -- REW */ ++ v = apic_read(APIC_ESR); ++ apic_write(APIC_ESR, 0); ++ v1 = apic_read(APIC_ESR); ++ ack_APIC_irq(); ++ atomic_inc(&irq_err_count); ++ ++ /* ++ * Here is what the APIC error bits mean: ++ * 0: Send CS error ++ * 1: Receive CS error ++ * 2: Send accept error ++ * 3: Receive accept error ++ * 4: Reserved ++ * 5: Send illegal vector ++ * 6: Received illegal vector ++ * 7: Illegal register address ++ */ ++ pr_debug("APIC error on CPU%d: %02x(%02x)\n", ++ smp_processor_id(), v , v1); ++ irq_exit(); ++} ++ ++/** ++ * connect_bsp_APIC - attach the APIC to the interrupt system ++ */ ++void __init connect_bsp_APIC(void) ++{ ++#ifdef CONFIG_X86_32 ++ if (pic_mode) { ++ /* ++ * Do not trust the local APIC being empty at bootup. ++ */ ++ clear_local_APIC(); ++ /* ++ * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's ++ * local APIC to INT and NMI lines. 
++ */ ++ apic_printk(APIC_VERBOSE, "leaving PIC mode, " ++ "enabling APIC mode.\n"); ++ outb(0x70, 0x22); ++ outb(0x01, 0x23); ++ } ++#endif ++ if (apic->enable_apic_mode) ++ apic->enable_apic_mode(); ++} ++ ++/** ++ * disconnect_bsp_APIC - detach the APIC from the interrupt system ++ * @virt_wire_setup: indicates, whether virtual wire mode is selected ++ * ++ * Virtual wire mode is necessary to deliver legacy interrupts even when the ++ * APIC is disabled. ++ */ ++void disconnect_bsp_APIC(int virt_wire_setup) ++{ ++ unsigned int value; ++ ++#ifdef CONFIG_X86_32 ++ if (pic_mode) { ++ /* ++ * Put the board back into PIC mode (has an effect only on ++ * certain older boards). Note that APIC interrupts, including ++ * IPIs, won't work beyond this point! The only exception are ++ * INIT IPIs. ++ */ ++ apic_printk(APIC_VERBOSE, "disabling APIC mode, " ++ "entering PIC mode.\n"); ++ outb(0x70, 0x22); ++ outb(0x00, 0x23); ++ return; ++ } ++#endif ++ ++ /* Go back to Virtual Wire compatibility mode */ ++ ++ /* For the spurious interrupt use vector F, and enable it */ ++ value = apic_read(APIC_SPIV); ++ value &= ~APIC_VECTOR_MASK; ++ value |= APIC_SPIV_APIC_ENABLED; ++ value |= 0xf; ++ apic_write(APIC_SPIV, value); ++ ++ if (!virt_wire_setup) { ++ /* ++ * For LVT0 make it edge triggered, active high, ++ * external and enabled ++ */ ++ value = apic_read(APIC_LVT0); ++ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | ++ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | ++ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); ++ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; ++ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); ++ apic_write(APIC_LVT0, value); ++ } else { ++ /* Disable LVT0 */ ++ apic_write(APIC_LVT0, APIC_LVT_MASKED); ++ } ++ ++ /* ++ * For LVT1 make it edge triggered, active high, ++ * nmi and enabled ++ */ ++ value = apic_read(APIC_LVT1); ++ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | ++ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | ++ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); ++ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; ++ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); ++ apic_write(APIC_LVT1, value); ++} ++ ++void __cpuinit generic_processor_info(int apicid, int version) ++{ ++ int cpu; ++ ++ /* ++ * Validate version ++ */ ++ if (version == 0x0) { ++ pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " ++ "fixing up to 0x10. (tell your hw vendor)\n", ++ version); ++ version = 0x10; ++ } ++ apic_version[apicid] = version; ++ ++ if (num_processors >= nr_cpu_ids) { ++ int max = nr_cpu_ids; ++ int thiscpu = max + disabled_cpus; ++ ++ pr_warning( ++ "ACPI: NR_CPUS/possible_cpus limit of %i reached." ++ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); ++ ++ disabled_cpus++; ++ return; ++ } ++ ++ num_processors++; ++ cpu = cpumask_next_zero(-1, cpu_present_mask); ++ ++ if (version != apic_version[boot_cpu_physical_apicid]) ++ WARN_ONCE(1, ++ "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n", ++ apic_version[boot_cpu_physical_apicid], cpu, version); ++ ++ physid_set(apicid, phys_cpu_present_map); ++ if (apicid == boot_cpu_physical_apicid) { ++ /* ++ * x86_bios_cpu_apicid is required to have processors listed ++ * in same order as logical cpu numbers. Hence the first ++ * entry is BSP, and so on. 
++ */ ++ cpu = 0; ++ } ++ if (apicid > max_physical_apicid) ++ max_physical_apicid = apicid; ++ ++#ifdef CONFIG_X86_32 ++ /* ++ * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y ++ * but we need to work other dependencies like SMP_SUSPEND etc ++ * before this can be done without some confusion. ++ * if (CPU_HOTPLUG_ENABLED || num_processors > 8) ++ * - Ashok Raj ++ */ ++ if (max_physical_apicid >= 8) { ++ switch (boot_cpu_data.x86_vendor) { ++ case X86_VENDOR_INTEL: ++ if (!APIC_XAPIC(version)) { ++ def_to_bigsmp = 0; ++ break; ++ } ++ /* If P4 and above fall through */ ++ case X86_VENDOR_AMD: ++ def_to_bigsmp = 1; ++ } ++ } ++#endif ++ ++#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) ++ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; ++ early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; ++#endif ++ ++ set_cpu_possible(cpu, true); ++ set_cpu_present(cpu, true); ++} ++ ++int hard_smp_processor_id(void) ++{ ++ return read_apic_id(); ++} ++ ++void default_init_apic_ldr(void) ++{ ++ unsigned long val; ++ ++ apic_write(APIC_DFR, APIC_DFR_VALUE); ++ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; ++ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); ++ apic_write(APIC_LDR, val); ++} ++ ++#ifdef CONFIG_X86_32 ++int default_apicid_to_node(int logical_apicid) ++{ ++#ifdef CONFIG_SMP ++ return apicid_2_node[hard_smp_processor_id()]; ++#else ++ return 0; ++#endif ++} ++#endif ++ ++/* ++ * Power management ++ */ ++#ifdef CONFIG_PM ++ ++static struct { ++ /* ++ * 'active' is true if the local APIC was enabled by us and ++ * not the BIOS; this signifies that we are also responsible ++ * for disabling it before entering apm/acpi suspend ++ */ ++ int active; ++ /* r/w apic fields */ ++ unsigned int apic_id; ++ unsigned int apic_taskpri; ++ unsigned int apic_ldr; ++ unsigned int apic_dfr; ++ unsigned int apic_spiv; ++ unsigned int apic_lvtt; ++ unsigned int apic_lvtpc; ++ unsigned int apic_lvt0; ++ unsigned int apic_lvt1; ++ unsigned int apic_lvterr; ++ unsigned int apic_tmict; ++ unsigned int apic_tdcr; ++ unsigned int apic_thmr; ++} apic_pm_state; ++ ++static int lapic_suspend(struct sys_device *dev, pm_message_t state) ++{ ++ unsigned long flags; ++ int maxlvt; ++ ++ if (!apic_pm_state.active) ++ return 0; ++ ++ maxlvt = lapic_get_maxlvt(); ++ ++ apic_pm_state.apic_id = apic_read(APIC_ID); ++ apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); ++ apic_pm_state.apic_ldr = apic_read(APIC_LDR); ++ apic_pm_state.apic_dfr = apic_read(APIC_DFR); ++ apic_pm_state.apic_spiv = apic_read(APIC_SPIV); ++ apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); ++ if (maxlvt >= 4) ++ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); ++ apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); ++ apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); ++ apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); ++ apic_pm_state.apic_tmict = apic_read(APIC_TMICT); ++ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); ++#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) ++ if (maxlvt >= 5) ++ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); ++#endif ++ ++ local_irq_save(flags); ++ disable_local_APIC(); ++ local_irq_restore(flags); ++ return 0; ++} ++ ++static int lapic_resume(struct sys_device *dev) ++{ ++ unsigned int l, h; ++ unsigned long flags; ++ int maxlvt; ++ ++ if (!apic_pm_state.active) ++ return 0; ++ ++ maxlvt = lapic_get_maxlvt(); ++ ++ local_irq_save(flags); ++ ++ if (x2apic) ++ enable_x2apic(); ++ else { ++ /* ++ * Make sure the APICBASE points to the right address ++ * ++ * FIXME! 
This will be wrong if we ever support suspend on ++ * SMP! We'll need to do this as part of the CPU restore! ++ */ ++ rdmsr(MSR_IA32_APICBASE, l, h); ++ l &= ~MSR_IA32_APICBASE_BASE; ++ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; ++ wrmsr(MSR_IA32_APICBASE, l, h); ++ } ++ ++ apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); ++ apic_write(APIC_ID, apic_pm_state.apic_id); ++ apic_write(APIC_DFR, apic_pm_state.apic_dfr); ++ apic_write(APIC_LDR, apic_pm_state.apic_ldr); ++ apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); ++ apic_write(APIC_SPIV, apic_pm_state.apic_spiv); ++ apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); ++ apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); ++#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) ++ if (maxlvt >= 5) ++ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); ++#endif ++ if (maxlvt >= 4) ++ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); ++ apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); ++ apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); ++ apic_write(APIC_TMICT, apic_pm_state.apic_tmict); ++ apic_write(APIC_ESR, 0); ++ apic_read(APIC_ESR); ++ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); ++ apic_write(APIC_ESR, 0); ++ apic_read(APIC_ESR); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* ++ * This device has no shutdown method - fully functioning local APICs ++ * are needed on every CPU up until machine_halt/restart/poweroff. ++ */ ++ ++static struct sysdev_class lapic_sysclass = { ++ .name = "lapic", ++ .resume = lapic_resume, ++ .suspend = lapic_suspend, ++}; ++ ++static struct sys_device device_lapic = { ++ .id = 0, ++ .cls = &lapic_sysclass, ++}; ++ ++static void __cpuinit apic_pm_activate(void) ++{ ++ apic_pm_state.active = 1; ++} ++ ++static int __init init_lapic_sysfs(void) ++{ ++ int error; ++ ++ if (!cpu_has_apic) ++ return 0; ++ /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ ++ ++ error = sysdev_class_register(&lapic_sysclass); ++ if (!error) ++ error = sysdev_register(&device_lapic); ++ return error; ++} ++device_initcall(init_lapic_sysfs); ++ ++#else /* CONFIG_PM */ ++ ++static void apic_pm_activate(void) { } ++ ++#endif /* CONFIG_PM */ ++ ++#ifdef CONFIG_X86_64 ++/* ++ * apic_is_clustered_box() -- Check if we can expect good TSC ++ * ++ * Thus far, the major user of this is IBM's Summit2 series: ++ * ++ * Clustered boxes may have unsynced TSC problems if they are ++ * multi-chassis. Use available data to take a good guess. ++ * If in doubt, go HPET. ++ */ ++__cpuinit int apic_is_clustered_box(void) ++{ ++ int i, clusters, zeros; ++ unsigned id; ++ u16 *bios_cpu_apicid; ++ DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); ++ ++ /* ++ * there is not this kind of box with AMD CPU yet. ++ * Some AMD box with quadcore cpu and 8 sockets apicid ++ * will be [4, 0x23] or [8, 0x27] could be thought to ++ * vsmp box still need checking... ++ */ ++ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) ++ return 0; ++ ++ bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); ++ bitmap_zero(clustermap, NUM_APIC_CLUSTERS); ++ ++ for (i = 0; i < nr_cpu_ids; i++) { ++ /* are we being called early in kernel startup? 
*/ ++ if (bios_cpu_apicid) { ++ id = bios_cpu_apicid[i]; ++ } else if (i < nr_cpu_ids) { ++ if (cpu_present(i)) ++ id = per_cpu(x86_bios_cpu_apicid, i); ++ else ++ continue; ++ } else ++ break; ++ ++ if (id != BAD_APICID) ++ __set_bit(APIC_CLUSTERID(id), clustermap); ++ } ++ ++ /* Problem: Partially populated chassis may not have CPUs in some of ++ * the APIC clusters they have been allocated. Only present CPUs have ++ * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. ++ * Since clusters are allocated sequentially, count zeros only if ++ * they are bounded by ones. ++ */ ++ clusters = 0; ++ zeros = 0; ++ for (i = 0; i < NUM_APIC_CLUSTERS; i++) { ++ if (test_bit(i, clustermap)) { ++ clusters += 1 + zeros; ++ zeros = 0; ++ } else ++ ++zeros; ++ } ++ ++ /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are ++ * not guaranteed to be synced between boards ++ */ ++ if (is_vsmp_box() && clusters > 1) ++ return 1; ++ ++ /* ++ * If clusters > 2, then should be multi-chassis. ++ * May have to revisit this when multi-core + hyperthreaded CPUs come ++ * out, but AFAIK this will work even for them. ++ */ ++ return (clusters > 2); ++} ++#endif ++ ++/* ++ * APIC command line parameters ++ */ ++static int __init setup_disableapic(char *arg) ++{ ++ disable_apic = 1; ++ setup_clear_cpu_cap(X86_FEATURE_APIC); ++ return 0; ++} ++early_param("disableapic", setup_disableapic); ++ ++/* same as disableapic, for compatibility */ ++static int __init setup_nolapic(char *arg) ++{ ++ return setup_disableapic(arg); ++} ++early_param("nolapic", setup_nolapic); ++ ++static int __init parse_lapic_timer_c2_ok(char *arg) ++{ ++ local_apic_timer_c2_ok = 1; ++ return 0; ++} ++early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); ++ ++static int __init parse_disable_apic_timer(char *arg) ++{ ++ disable_apic_timer = 1; ++ return 0; ++} ++early_param("noapictimer", parse_disable_apic_timer); ++ ++static int __init parse_nolapic_timer(char *arg) ++{ ++ disable_apic_timer = 1; ++ return 0; ++} ++early_param("nolapic_timer", parse_nolapic_timer); ++ ++static int __init apic_set_verbosity(char *arg) ++{ ++ if (!arg) { ++#ifdef CONFIG_X86_64 ++ skip_ioapic_setup = 0; ++ return 0; ++#endif ++ return -EINVAL; ++ } ++ ++ if (strcmp("debug", arg) == 0) ++ apic_verbosity = APIC_DEBUG; ++ else if (strcmp("verbose", arg) == 0) ++ apic_verbosity = APIC_VERBOSE; ++ else { ++ pr_warning("APIC Verbosity level %s not recognised" ++ " use apic=verbose or apic=debug\n", arg); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++early_param("apic", apic_set_verbosity); ++ ++static int __init lapic_insert_resource(void) ++{ ++ if (!apic_phys) ++ return -1; ++ ++ /* Put local APIC into the resource map. */ ++ lapic_resource.start = apic_phys; ++ lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; ++ insert_resource(&iomem_resource, &lapic_resource); ++ ++ return 0; ++} ++ ++/* ++ * need call insert after e820_reserve_resources() ++ * that is using request_resource ++ */ ++late_initcall(lapic_insert_resource); +Index: linux-2.6-tip/arch/x86/kernel/apic/apic_flat_64.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/apic_flat_64.c +@@ -0,0 +1,373 @@ ++/* ++ * Copyright 2004 James Cleverdon, IBM. ++ * Subject to the GNU Public License, v.2 ++ * ++ * Flat APIC subarch code. ++ * ++ * Hacked for x86-64 by James Cleverdon from i386 architecture code by ++ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and ++ * James Cleverdon. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_ACPI ++#include ++#endif ++ ++static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ return 1; ++} ++ ++static const struct cpumask *flat_target_cpus(void) ++{ ++ return cpu_online_mask; ++} ++ ++static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ /* Careful. Some cpus do not strictly honor the set of cpus ++ * specified in the interrupt destination when using lowest ++ * priority interrupt delivery mode. ++ * ++ * In particular there was a hyperthreading cpu observed to ++ * deliver interrupts to the wrong hyperthread when only one ++ * hyperthread was specified in the interrupt desitination. ++ */ ++ cpumask_clear(retmask); ++ cpumask_bits(retmask)[0] = APIC_ALL_CPUS; ++} ++ ++/* ++ * Set up the logical destination ID. ++ * ++ * Intel recommends to set DFR, LDR and TPR before enabling ++ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel ++ * document number 292116). So here it goes... ++ */ ++static void flat_init_apic_ldr(void) ++{ ++ unsigned long val; ++ unsigned long num, id; ++ ++ num = smp_processor_id(); ++ id = 1UL << num; ++ apic_write(APIC_DFR, APIC_DFR_FLAT); ++ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; ++ val |= SET_APIC_LOGICAL_ID(id); ++ apic_write(APIC_LDR, val); ++} ++ ++static inline void _flat_send_IPI_mask(unsigned long mask, int vector) ++{ ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ __default_send_IPI_dest_field(mask, vector, apic->dest_logical); ++ local_irq_restore(flags); ++} ++ ++static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) ++{ ++ unsigned long mask = cpumask_bits(cpumask)[0]; ++ ++ _flat_send_IPI_mask(mask, vector); ++} ++ ++static void ++ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) ++{ ++ unsigned long mask = cpumask_bits(cpumask)[0]; ++ int cpu = smp_processor_id(); ++ ++ if (cpu < BITS_PER_LONG) ++ clear_bit(cpu, &mask); ++ ++ _flat_send_IPI_mask(mask, vector); ++} ++ ++static void flat_send_IPI_allbutself(int vector) ++{ ++ int cpu = smp_processor_id(); ++#ifdef CONFIG_HOTPLUG_CPU ++ int hotplug = 1; ++#else ++ int hotplug = 0; ++#endif ++ if (hotplug || vector == NMI_VECTOR) { ++ if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { ++ unsigned long mask = cpumask_bits(cpu_online_mask)[0]; ++ ++ if (cpu < BITS_PER_LONG) ++ clear_bit(cpu, &mask); ++ ++ _flat_send_IPI_mask(mask, vector); ++ } ++ } else if (num_online_cpus() > 1) { ++ __default_send_IPI_shortcut(APIC_DEST_ALLBUT, ++ vector, apic->dest_logical); ++ } ++} ++ ++static void flat_send_IPI_all(int vector) ++{ ++ if (vector == NMI_VECTOR) { ++ flat_send_IPI_mask(cpu_online_mask, vector); ++ } else { ++ __default_send_IPI_shortcut(APIC_DEST_ALLINC, ++ vector, apic->dest_logical); ++ } ++} ++ ++static unsigned int flat_get_apic_id(unsigned long x) ++{ ++ unsigned int id; ++ ++ id = (((x)>>24) & 0xFFu); ++ ++ return id; ++} ++ ++static unsigned long set_apic_id(unsigned int id) ++{ ++ unsigned long x; ++ ++ x = ((id & 0xFFu)<<24); ++ return x; ++} ++ ++static unsigned int read_xapic_id(void) ++{ ++ unsigned int id; ++ ++ id = flat_get_apic_id(apic_read(APIC_ID)); ++ return id; ++} ++ ++static int flat_apic_id_registered(void) ++{ ++ return physid_isset(read_xapic_id(), phys_cpu_present_map); ++} ++ ++static int flat_phys_pkg_id(int initial_apic_id, int index_msb) ++{ ++ return hard_smp_processor_id() >> index_msb; ++} ++ 
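/*
 * Illustrative sketch (editor's aside, not part of the patch): the flat
 * driver above stores the APIC ID in bits 24-31 of the APIC_ID register
 * (flat_get_apic_id()/set_apic_id(), apic_id_mask = 0xFF << 24) and builds
 * the logical destination as one bit per CPU, which is why flat logical
 * mode tops out at 8 CPUs and larger boxes fall back to physflat/bigsmp.
 * The stand-alone userspace program below only mirrors that bit layout
 * with plain C arithmetic; the demo_* names are hypothetical and nothing
 * here touches real APIC registers.
 */
#include <assert.h>
#include <stdio.h>

static unsigned int demo_get_apic_id(unsigned long x)
{
	return (x >> 24) & 0xFFu;		/* bits 24-31 hold the ID */
}

static unsigned long demo_set_apic_id(unsigned int id)
{
	return (unsigned long)(id & 0xFFu) << 24;	/* pack it back */
}

int main(void)
{
	unsigned int cpu, id = 0x2a;
	unsigned long logical_mask = 0;

	/* pack/extract round trip, as apic_flat's get/set_apic_id do */
	assert(demo_get_apic_id(demo_set_apic_id(id)) == id);

	/* flat logical mode: one destination bit per CPU, 8 CPUs max */
	for (cpu = 0; cpu < 8; cpu++)
		logical_mask |= 1UL << cpu;

	printf("id reg = 0x%08lx, logical mask = 0x%02lx\n",
	       demo_set_apic_id(id), logical_mask);
	return 0;
}
/*
 * Build and run (illustration only):
 *   cc -o flat_demo flat_demo.c && ./flat_demo
 */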
++struct apic apic_flat = { ++ .name = "flat", ++ .probe = NULL, ++ .acpi_madt_oem_check = flat_acpi_madt_oem_check, ++ .apic_id_registered = flat_apic_id_registered, ++ ++ .irq_delivery_mode = dest_LowestPrio, ++ .irq_dest_mode = 1, /* logical */ ++ ++ .target_cpus = flat_target_cpus, ++ .disable_esr = 0, ++ .dest_logical = APIC_DEST_LOGICAL, ++ .check_apicid_used = NULL, ++ .check_apicid_present = NULL, ++ ++ .vector_allocation_domain = flat_vector_allocation_domain, ++ .init_apic_ldr = flat_init_apic_ldr, ++ ++ .ioapic_phys_id_map = NULL, ++ .setup_apic_routing = NULL, ++ .multi_timer_check = NULL, ++ .apicid_to_node = NULL, ++ .cpu_to_logical_apicid = NULL, ++ .cpu_present_to_apicid = default_cpu_present_to_apicid, ++ .apicid_to_cpu_present = NULL, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = default_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = flat_phys_pkg_id, ++ .mps_oem_check = NULL, ++ ++ .get_apic_id = flat_get_apic_id, ++ .set_apic_id = set_apic_id, ++ .apic_id_mask = 0xFFu << 24, ++ ++ .cpu_mask_to_apicid = default_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = flat_send_IPI_mask, ++ .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, ++ .send_IPI_allbutself = flat_send_IPI_allbutself, ++ .send_IPI_all = flat_send_IPI_all, ++ .send_IPI_self = apic_send_IPI_self, ++ ++ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, ++ .wait_for_init_deassert = NULL, ++ .smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = NULL, ++ ++ .read = native_apic_mem_read, ++ .write = native_apic_mem_write, ++ .icr_read = native_apic_icr_read, ++ .icr_write = native_apic_icr_write, ++ .wait_icr_idle = native_apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, ++}; ++ ++/* ++ * Physflat mode is used when there are more than 8 CPUs on a AMD system. ++ * We cannot use logical delivery in this case because the mask ++ * overflows, so use physical mode. ++ */ ++static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++#ifdef CONFIG_ACPI ++ /* ++ * Quirk: some x86_64 machines can only use physical APIC mode ++ * regardless of how many processors are present (x86_64 ES7000 ++ * is an example). ++ */ ++ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && ++ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { ++ printk(KERN_DEBUG "system APIC only can use physical flat"); ++ return 1; ++ } ++#endif ++ ++ return 0; ++} ++ ++static const struct cpumask *physflat_target_cpus(void) ++{ ++ return cpu_online_mask; ++} ++ ++static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ cpumask_clear(retmask); ++ cpumask_set_cpu(cpu, retmask); ++} ++ ++static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) ++{ ++ default_send_IPI_mask_sequence_phys(cpumask, vector); ++} ++ ++static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, ++ int vector) ++{ ++ default_send_IPI_mask_allbutself_phys(cpumask, vector); ++} ++ ++static void physflat_send_IPI_allbutself(int vector) ++{ ++ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); ++} ++ ++static void physflat_send_IPI_all(int vector) ++{ ++ physflat_send_IPI_mask(cpu_online_mask, vector); ++} ++ ++static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ int cpu; ++ ++ /* ++ * We're using fixed IRQ delivery, can only return one phys APIC ID. 
++ * May as well be the first. ++ */ ++ cpu = cpumask_first(cpumask); ++ if ((unsigned)cpu < nr_cpu_ids) ++ return per_cpu(x86_cpu_to_apicid, cpu); ++ else ++ return BAD_APICID; ++} ++ ++static unsigned int ++physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, ++ const struct cpumask *andmask) ++{ ++ int cpu; ++ ++ /* ++ * We're using fixed IRQ delivery, can only return one phys APIC ID. ++ * May as well be the first. ++ */ ++ for_each_cpu_and(cpu, cpumask, andmask) { ++ if (cpumask_test_cpu(cpu, cpu_online_mask)) ++ break; ++ } ++ if (cpu < nr_cpu_ids) ++ return per_cpu(x86_cpu_to_apicid, cpu); ++ ++ return BAD_APICID; ++} ++ ++struct apic apic_physflat = { ++ ++ .name = "physical flat", ++ .probe = NULL, ++ .acpi_madt_oem_check = physflat_acpi_madt_oem_check, ++ .apic_id_registered = flat_apic_id_registered, ++ ++ .irq_delivery_mode = dest_Fixed, ++ .irq_dest_mode = 0, /* physical */ ++ ++ .target_cpus = physflat_target_cpus, ++ .disable_esr = 0, ++ .dest_logical = 0, ++ .check_apicid_used = NULL, ++ .check_apicid_present = NULL, ++ ++ .vector_allocation_domain = physflat_vector_allocation_domain, ++ /* not needed, but shouldn't hurt: */ ++ .init_apic_ldr = flat_init_apic_ldr, ++ ++ .ioapic_phys_id_map = NULL, ++ .setup_apic_routing = NULL, ++ .multi_timer_check = NULL, ++ .apicid_to_node = NULL, ++ .cpu_to_logical_apicid = NULL, ++ .cpu_present_to_apicid = default_cpu_present_to_apicid, ++ .apicid_to_cpu_present = NULL, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = default_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = flat_phys_pkg_id, ++ .mps_oem_check = NULL, ++ ++ .get_apic_id = flat_get_apic_id, ++ .set_apic_id = set_apic_id, ++ .apic_id_mask = 0xFFu << 24, ++ ++ .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = physflat_send_IPI_mask, ++ .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, ++ .send_IPI_allbutself = physflat_send_IPI_allbutself, ++ .send_IPI_all = physflat_send_IPI_all, ++ .send_IPI_self = apic_send_IPI_self, ++ ++ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, ++ .wait_for_init_deassert = NULL, ++ .smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = NULL, ++ ++ .read = native_apic_mem_read, ++ .write = native_apic_mem_write, ++ .icr_read = native_apic_icr_read, ++ .icr_write = native_apic_icr_write, ++ .wait_icr_idle = native_apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, ++}; +Index: linux-2.6-tip/arch/x86/kernel/apic/bigsmp_32.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/bigsmp_32.c +@@ -0,0 +1,267 @@ ++/* ++ * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. ++ * ++ * Drives the local APIC in "clustered mode". 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++static unsigned bigsmp_get_apic_id(unsigned long x) ++{ ++ return (x >> 24) & 0xFF; ++} ++ ++static int bigsmp_apic_id_registered(void) ++{ ++ return 1; ++} ++ ++static const struct cpumask *bigsmp_target_cpus(void) ++{ ++#ifdef CONFIG_SMP ++ return cpu_online_mask; ++#else ++ return cpumask_of(0); ++#endif ++} ++ ++static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) ++{ ++ return 0; ++} ++ ++static unsigned long bigsmp_check_apicid_present(int bit) ++{ ++ return 1; ++} ++ ++static inline unsigned long calculate_ldr(int cpu) ++{ ++ unsigned long val, id; ++ ++ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; ++ id = per_cpu(x86_bios_cpu_apicid, cpu); ++ val |= SET_APIC_LOGICAL_ID(id); ++ ++ return val; ++} ++ ++/* ++ * Set up the logical destination ID. ++ * ++ * Intel recommends to set DFR, LDR and TPR before enabling ++ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel ++ * document number 292116). So here it goes... ++ */ ++static void bigsmp_init_apic_ldr(void) ++{ ++ unsigned long val; ++ int cpu = smp_processor_id(); ++ ++ apic_write(APIC_DFR, APIC_DFR_FLAT); ++ val = calculate_ldr(cpu); ++ apic_write(APIC_LDR, val); ++} ++ ++static void bigsmp_setup_apic_routing(void) ++{ ++ printk(KERN_INFO ++ "Enabling APIC mode: Physflat. Using %d I/O APICs\n", ++ nr_ioapics); ++} ++ ++static int bigsmp_apicid_to_node(int logical_apicid) ++{ ++ return apicid_2_node[hard_smp_processor_id()]; ++} ++ ++static int bigsmp_cpu_present_to_apicid(int mps_cpu) ++{ ++ if (mps_cpu < nr_cpu_ids) ++ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); ++ ++ return BAD_APICID; ++} ++ ++static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid) ++{ ++ return physid_mask_of_physid(phys_apicid); ++} ++ ++/* Mapping from cpu number to logical apicid */ ++static inline int bigsmp_cpu_to_logical_apicid(int cpu) ++{ ++ if (cpu >= nr_cpu_ids) ++ return BAD_APICID; ++ return cpu_physical_id(cpu); ++} ++ ++static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) ++{ ++ /* For clustered we don't have a good way to do this yet - hack */ ++ return physids_promote(0xFFL); ++} ++ ++static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) ++{ ++ return 1; ++} ++ ++/* As we are using single CPU as destination, pick only one CPU here */ ++static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); ++} ++ ++static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, ++ const struct cpumask *andmask) ++{ ++ int cpu; ++ ++ /* ++ * We're using fixed IRQ delivery, can only return one phys APIC ID. ++ * May as well be the first. 
++ */ ++ for_each_cpu_and(cpu, cpumask, andmask) { ++ if (cpumask_test_cpu(cpu, cpu_online_mask)) ++ break; ++ } ++ if (cpu < nr_cpu_ids) ++ return bigsmp_cpu_to_logical_apicid(cpu); ++ ++ return BAD_APICID; ++} ++ ++static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) ++{ ++ return cpuid_apic >> index_msb; ++} ++ ++static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector) ++{ ++ default_send_IPI_mask_sequence_phys(mask, vector); ++} ++ ++static void bigsmp_send_IPI_allbutself(int vector) ++{ ++ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); ++} ++ ++static void bigsmp_send_IPI_all(int vector) ++{ ++ bigsmp_send_IPI_mask(cpu_online_mask, vector); ++} ++ ++static int dmi_bigsmp; /* can be set by dmi scanners */ ++ ++static int hp_ht_bigsmp(const struct dmi_system_id *d) ++{ ++ printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); ++ dmi_bigsmp = 1; ++ ++ return 0; ++} ++ ++ ++static const struct dmi_system_id bigsmp_dmi_table[] = { ++ { hp_ht_bigsmp, "HP ProLiant DL760 G2", ++ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), ++ DMI_MATCH(DMI_BIOS_VERSION, "P44-"), ++ } ++ }, ++ ++ { hp_ht_bigsmp, "HP ProLiant DL740", ++ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), ++ DMI_MATCH(DMI_BIOS_VERSION, "P47-"), ++ } ++ }, ++ { } /* NULL entry stops DMI scanning */ ++}; ++ ++static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ cpumask_clear(retmask); ++ cpumask_set_cpu(cpu, retmask); ++} ++ ++static int probe_bigsmp(void) ++{ ++ if (def_to_bigsmp) ++ dmi_bigsmp = 1; ++ else ++ dmi_check_system(bigsmp_dmi_table); ++ ++ return dmi_bigsmp; ++} ++ ++struct apic apic_bigsmp = { ++ ++ .name = "bigsmp", ++ .probe = probe_bigsmp, ++ .acpi_madt_oem_check = NULL, ++ .apic_id_registered = bigsmp_apic_id_registered, ++ ++ .irq_delivery_mode = dest_Fixed, ++ /* phys delivery to target CPU: */ ++ .irq_dest_mode = 0, ++ ++ .target_cpus = bigsmp_target_cpus, ++ .disable_esr = 1, ++ .dest_logical = 0, ++ .check_apicid_used = bigsmp_check_apicid_used, ++ .check_apicid_present = bigsmp_check_apicid_present, ++ ++ .vector_allocation_domain = bigsmp_vector_allocation_domain, ++ .init_apic_ldr = bigsmp_init_apic_ldr, ++ ++ .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, ++ .setup_apic_routing = bigsmp_setup_apic_routing, ++ .multi_timer_check = NULL, ++ .apicid_to_node = bigsmp_apicid_to_node, ++ .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, ++ .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, ++ .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = bigsmp_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = bigsmp_phys_pkg_id, ++ .mps_oem_check = NULL, ++ ++ .get_apic_id = bigsmp_get_apic_id, ++ .set_apic_id = NULL, ++ .apic_id_mask = 0xFF << 24, ++ ++ .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = bigsmp_send_IPI_mask, ++ .send_IPI_mask_allbutself = NULL, ++ .send_IPI_allbutself = bigsmp_send_IPI_allbutself, ++ .send_IPI_all = bigsmp_send_IPI_all, ++ .send_IPI_self = default_send_IPI_self, ++ ++ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, ++ ++ .wait_for_init_deassert = default_wait_for_init_deassert, ++ ++ .smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = default_inquire_remote_apic, ++ ++ .read = native_apic_mem_read, ++ .write = native_apic_mem_write, ++ .icr_read = 
native_apic_icr_read, ++ .icr_write = native_apic_icr_write, ++ .wait_icr_idle = native_apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, ++}; +Index: linux-2.6-tip/arch/x86/kernel/apic/es7000_32.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/es7000_32.c +@@ -0,0 +1,781 @@ ++/* ++ * Written by: Garry Forsgren, Unisys Corporation ++ * Natalie Protasevich, Unisys Corporation ++ * ++ * This file contains the code to configure and interface ++ * with Unisys ES7000 series hardware system manager. ++ * ++ * Copyright (c) 2003 Unisys Corporation. ++ * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar ++ * ++ * All Rights Reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of version 2 of the GNU General Public License as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it would be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write the Free Software Foundation, Inc., 59 ++ * Temple Place - Suite 330, Boston MA 02111-1307, USA. ++ * ++ * Contact information: Unisys Corporation, Township Line & Union Meeting ++ * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: ++ * ++ * http://www.unisys.com ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * ES7000 chipsets ++ */ ++ ++#define NON_UNISYS 0 ++#define ES7000_CLASSIC 1 ++#define ES7000_ZORRO 2 ++ ++#define MIP_REG 1 ++#define MIP_PSAI_REG 4 ++ ++#define MIP_BUSY 1 ++#define MIP_SPIN 0xf0000 ++#define MIP_VALID 0x0100000000000000ULL ++#define MIP_SW_APIC 0x1020b ++ ++#define MIP_PORT(val) ((val >> 32) & 0xffff) ++ ++#define MIP_RD_LO(val) (val & 0xffffffff) ++ ++struct mip_reg { ++ unsigned long long off_0x00; ++ unsigned long long off_0x08; ++ unsigned long long off_0x10; ++ unsigned long long off_0x18; ++ unsigned long long off_0x20; ++ unsigned long long off_0x28; ++ unsigned long long off_0x30; ++ unsigned long long off_0x38; ++}; ++ ++struct mip_reg_info { ++ unsigned long long mip_info; ++ unsigned long long delivery_info; ++ unsigned long long host_reg; ++ unsigned long long mip_reg; ++}; ++ ++struct psai { ++ unsigned long long entry_type; ++ unsigned long long addr; ++ unsigned long long bep_addr; ++}; ++ ++#ifdef CONFIG_ACPI ++ ++struct es7000_oem_table { ++ struct acpi_table_header Header; ++ u32 OEMTableAddr; ++ u32 OEMTableSize; ++}; ++ ++static unsigned long oem_addrX; ++static unsigned long oem_size; ++ ++#endif ++ ++/* ++ * ES7000 Globals ++ */ ++ ++static volatile unsigned long *psai; ++static struct mip_reg *mip_reg; ++static struct mip_reg *host_reg; ++static int mip_port; ++static unsigned long mip_addr; ++static unsigned long host_addr; ++ ++int es7000_plat; ++ ++/* ++ * GSI override for ES7000 platforms. 
++ */ ++ ++static unsigned int base; ++ ++static int ++es7000_rename_gsi(int ioapic, int gsi) ++{ ++ if (es7000_plat == ES7000_ZORRO) ++ return gsi; ++ ++ if (!base) { ++ int i; ++ for (i = 0; i < nr_ioapics; i++) ++ base += nr_ioapic_registers[i]; ++ } ++ ++ if (!ioapic && (gsi < 16)) ++ gsi += base; ++ ++ return gsi; ++} ++ ++static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) ++{ ++ unsigned long vect = 0, psaival = 0; ++ ++ if (psai == NULL) ++ return -1; ++ ++ vect = ((unsigned long)__pa(eip)/0x1000) << 16; ++ psaival = (0x1000000 | vect | cpu); ++ ++ while (*psai & 0x1000000) ++ ; ++ ++ *psai = psaival; ++ ++ return 0; ++} ++ ++static int es7000_apic_is_cluster(void) ++{ ++ /* MPENTIUMIII */ ++ if (boot_cpu_data.x86 == 6 && ++ (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) ++ return 1; ++ ++ return 0; ++} ++ ++static void setup_unisys(void) ++{ ++ /* ++ * Determine the generation of the ES7000 currently running. ++ * ++ * es7000_plat = 1 if the machine is a 5xx ES7000 box ++ * es7000_plat = 2 if the machine is a x86_64 ES7000 box ++ * ++ */ ++ if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) ++ es7000_plat = ES7000_ZORRO; ++ else ++ es7000_plat = ES7000_CLASSIC; ++ ioapic_renumber_irq = es7000_rename_gsi; ++} ++ ++/* ++ * Parse the OEM Table: ++ */ ++static int parse_unisys_oem(char *oemptr) ++{ ++ int i; ++ int success = 0; ++ unsigned char type, size; ++ unsigned long val; ++ char *tp = NULL; ++ struct psai *psaip = NULL; ++ struct mip_reg_info *mi; ++ struct mip_reg *host, *mip; ++ ++ tp = oemptr; ++ ++ tp += 8; ++ ++ for (i = 0; i <= 6; i++) { ++ type = *tp++; ++ size = *tp++; ++ tp -= 2; ++ switch (type) { ++ case MIP_REG: ++ mi = (struct mip_reg_info *)tp; ++ val = MIP_RD_LO(mi->host_reg); ++ host_addr = val; ++ host = (struct mip_reg *)val; ++ host_reg = __va(host); ++ val = MIP_RD_LO(mi->mip_reg); ++ mip_port = MIP_PORT(mi->mip_info); ++ mip_addr = val; ++ mip = (struct mip_reg *)val; ++ mip_reg = __va(mip); ++ pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", ++ (unsigned long)host_reg); ++ pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", ++ (unsigned long)mip_reg); ++ success++; ++ break; ++ case MIP_PSAI_REG: ++ psaip = (struct psai *)tp; ++ if (tp != NULL) { ++ if (psaip->addr) ++ psai = __va(psaip->addr); ++ else ++ psai = NULL; ++ success++; ++ } ++ break; ++ default: ++ break; ++ } ++ tp += size; ++ } ++ ++ if (success < 2) ++ es7000_plat = NON_UNISYS; ++ else ++ setup_unisys(); ++ ++ return es7000_plat; ++} ++ ++#ifdef CONFIG_ACPI ++static int find_unisys_acpi_oem_table(unsigned long *oem_addr) ++{ ++ struct acpi_table_header *header = NULL; ++ struct es7000_oem_table *table; ++ acpi_size tbl_size; ++ acpi_status ret; ++ int i = 0; ++ ++ for (;;) { ++ ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size); ++ if (!ACPI_SUCCESS(ret)) ++ return -1; ++ ++ if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) ++ break; ++ ++ early_acpi_os_unmap_memory(header, tbl_size); ++ } ++ ++ table = (void *)header; ++ ++ oem_addrX = table->OEMTableAddr; ++ oem_size = table->OEMTableSize; ++ ++ early_acpi_os_unmap_memory(header, tbl_size); ++ ++ *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size); ++ ++ return 0; ++} ++ ++static void unmap_unisys_acpi_oem_table(unsigned long oem_addr) ++{ ++ if (!oem_addr) ++ return; ++ ++ __acpi_unmap_table((char *)oem_addr, oem_size); ++} ++ ++static int es7000_check_dsdt(void) ++{ ++ struct acpi_table_header header; ++ ++ if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, 
&header)) && ++ !strncmp(header.oem_id, "UNISYS", 6)) ++ return 1; ++ return 0; ++} ++ ++static int es7000_acpi_ret; ++ ++/* Hook from generic ACPI tables.c */ ++static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ unsigned long oem_addr = 0; ++ int check_dsdt; ++ int ret = 0; ++ ++ /* check dsdt at first to avoid clear fix_map for oem_addr */ ++ check_dsdt = es7000_check_dsdt(); ++ ++ if (!find_unisys_acpi_oem_table(&oem_addr)) { ++ if (check_dsdt) { ++ ret = parse_unisys_oem((char *)oem_addr); ++ } else { ++ setup_unisys(); ++ ret = 1; ++ } ++ /* ++ * we need to unmap it ++ */ ++ unmap_unisys_acpi_oem_table(oem_addr); ++ } ++ ++ es7000_acpi_ret = ret; ++ ++ return ret && !es7000_apic_is_cluster(); ++} ++ ++static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) ++{ ++ int ret = es7000_acpi_ret; ++ ++ return ret && es7000_apic_is_cluster(); ++} ++ ++#else /* !CONFIG_ACPI: */ ++static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ return 0; ++} ++ ++static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) ++{ ++ return 0; ++} ++#endif /* !CONFIG_ACPI */ ++ ++static void es7000_spin(int n) ++{ ++ int i = 0; ++ ++ while (i++ < n) ++ rep_nop(); ++} ++ ++static int es7000_mip_write(struct mip_reg *mip_reg) ++{ ++ int status = 0; ++ int spin; ++ ++ spin = MIP_SPIN; ++ while ((host_reg->off_0x38 & MIP_VALID) != 0) { ++ if (--spin <= 0) { ++ WARN(1, "Timeout waiting for Host Valid Flag\n"); ++ return -1; ++ } ++ es7000_spin(MIP_SPIN); ++ } ++ ++ memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); ++ outb(1, mip_port); ++ ++ spin = MIP_SPIN; ++ ++ while ((mip_reg->off_0x38 & MIP_VALID) == 0) { ++ if (--spin <= 0) { ++ WARN(1, "Timeout waiting for MIP Valid Flag\n"); ++ return -1; ++ } ++ es7000_spin(MIP_SPIN); ++ } ++ ++ status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48; ++ mip_reg->off_0x38 &= ~MIP_VALID; ++ ++ return status; ++} ++ ++static void es7000_enable_apic_mode(void) ++{ ++ struct mip_reg es7000_mip_reg; ++ int mip_status; ++ ++ if (!es7000_plat) ++ return; ++ ++ printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); ++ memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); ++ es7000_mip_reg.off_0x00 = MIP_SW_APIC; ++ es7000_mip_reg.off_0x38 = MIP_VALID; ++ ++ while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) ++ WARN(1, "Command failed, status = %x\n", mip_status); ++} ++ ++static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ /* Careful. Some cpus do not strictly honor the set of cpus ++ * specified in the interrupt destination when using lowest ++ * priority interrupt delivery mode. ++ * ++ * In particular there was a hyperthreading cpu observed to ++ * deliver interrupts to the wrong hyperthread when only one ++ * hyperthread was specified in the interrupt desitination. 
++ */ ++ cpumask_clear(retmask); ++ cpumask_bits(retmask)[0] = APIC_ALL_CPUS; ++} ++ ++ ++static void es7000_wait_for_init_deassert(atomic_t *deassert) ++{ ++ while (!atomic_read(deassert)) ++ cpu_relax(); ++} ++ ++static unsigned int es7000_get_apic_id(unsigned long x) ++{ ++ return (x >> 24) & 0xFF; ++} ++ ++static void es7000_send_IPI_mask(const struct cpumask *mask, int vector) ++{ ++ default_send_IPI_mask_sequence_phys(mask, vector); ++} ++ ++static void es7000_send_IPI_allbutself(int vector) ++{ ++ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); ++} ++ ++static void es7000_send_IPI_all(int vector) ++{ ++ es7000_send_IPI_mask(cpu_online_mask, vector); ++} ++ ++static int es7000_apic_id_registered(void) ++{ ++ return 1; ++} ++ ++static const struct cpumask *target_cpus_cluster(void) ++{ ++ return cpu_all_mask; ++} ++ ++static const struct cpumask *es7000_target_cpus(void) ++{ ++ return cpumask_of(smp_processor_id()); ++} ++ ++static unsigned long ++es7000_check_apicid_used(physid_mask_t bitmap, int apicid) ++{ ++ return 0; ++} ++static unsigned long es7000_check_apicid_present(int bit) ++{ ++ return physid_isset(bit, phys_cpu_present_map); ++} ++ ++static unsigned long calculate_ldr(int cpu) ++{ ++ unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); ++ ++ return SET_APIC_LOGICAL_ID(id); ++} ++ ++/* ++ * Set up the logical destination ID. ++ * ++ * Intel recommends to set DFR, LdR and TPR before enabling ++ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel ++ * document number 292116). So here it goes... ++ */ ++static void es7000_init_apic_ldr_cluster(void) ++{ ++ unsigned long val; ++ int cpu = smp_processor_id(); ++ ++ apic_write(APIC_DFR, APIC_DFR_CLUSTER); ++ val = calculate_ldr(cpu); ++ apic_write(APIC_LDR, val); ++} ++ ++static void es7000_init_apic_ldr(void) ++{ ++ unsigned long val; ++ int cpu = smp_processor_id(); ++ ++ apic_write(APIC_DFR, APIC_DFR_FLAT); ++ val = calculate_ldr(cpu); ++ apic_write(APIC_LDR, val); ++} ++ ++static void es7000_setup_apic_routing(void) ++{ ++ int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); ++ ++ printk(KERN_INFO ++ "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", ++ (apic_version[apic] == 0x14) ? 
++ "Physical Cluster" : "Logical Cluster", ++ nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); ++} ++ ++static int es7000_apicid_to_node(int logical_apicid) ++{ ++ return 0; ++} ++ ++ ++static int es7000_cpu_present_to_apicid(int mps_cpu) ++{ ++ if (!mps_cpu) ++ return boot_cpu_physical_apicid; ++ else if (mps_cpu < nr_cpu_ids) ++ return per_cpu(x86_bios_cpu_apicid, mps_cpu); ++ else ++ return BAD_APICID; ++} ++ ++static int cpu_id; ++ ++static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) ++{ ++ physid_mask_t mask; ++ ++ mask = physid_mask_of_physid(cpu_id); ++ ++cpu_id; ++ ++ return mask; ++} ++ ++/* Mapping from cpu number to logical apicid */ ++static int es7000_cpu_to_logical_apicid(int cpu) ++{ ++#ifdef CONFIG_SMP ++ if (cpu >= nr_cpu_ids) ++ return BAD_APICID; ++ return cpu_2_logical_apicid[cpu]; ++#else ++ return logical_smp_processor_id(); ++#endif ++} ++ ++static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) ++{ ++ /* For clustered we don't have a good way to do this yet - hack */ ++ return physids_promote(0xff); ++} ++ ++static int es7000_check_phys_apicid_present(int cpu_physical_apicid) ++{ ++ boot_cpu_physical_apicid = read_apic_id(); ++ return 1; ++} ++ ++static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ unsigned int round = 0; ++ int cpu, uninitialized_var(apicid); ++ ++ /* ++ * The cpus in the mask must all be on the apic cluster. ++ */ ++ for_each_cpu(cpu, cpumask) { ++ int new_apicid = es7000_cpu_to_logical_apicid(cpu); ++ ++ if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { ++ WARN(1, "Not a valid mask!"); ++ ++ return BAD_APICID; ++ } ++ apicid = new_apicid; ++ round++; ++ } ++ return apicid; ++} ++ ++static unsigned int ++es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, ++ const struct cpumask *andmask) ++{ ++ int apicid = es7000_cpu_to_logical_apicid(0); ++ cpumask_var_t cpumask; ++ ++ if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) ++ return apicid; ++ ++ cpumask_and(cpumask, inmask, andmask); ++ cpumask_and(cpumask, cpumask, cpu_online_mask); ++ apicid = es7000_cpu_mask_to_apicid(cpumask); ++ ++ free_cpumask_var(cpumask); ++ ++ return apicid; ++} ++ ++static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) ++{ ++ return cpuid_apic >> index_msb; ++} ++ ++static int probe_es7000(void) ++{ ++ /* probed later in mptable/ACPI hooks */ ++ return 0; ++} ++ ++static int es7000_mps_ret; ++static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem, ++ char *productid) ++{ ++ int ret = 0; ++ ++ if (mpc->oemptr) { ++ struct mpc_oemtable *oem_table = ++ (struct mpc_oemtable *)mpc->oemptr; ++ ++ if (!strncmp(oem, "UNISYS", 6)) ++ ret = parse_unisys_oem((char *)oem_table); ++ } ++ ++ es7000_mps_ret = ret; ++ ++ return ret && !es7000_apic_is_cluster(); ++} ++ ++static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, ++ char *productid) ++{ ++ int ret = es7000_mps_ret; ++ ++ return ret && es7000_apic_is_cluster(); ++} ++ ++struct apic apic_es7000_cluster = { ++ ++ .name = "es7000", ++ .probe = probe_es7000, ++ .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, ++ .apic_id_registered = es7000_apic_id_registered, ++ ++ .irq_delivery_mode = dest_LowestPrio, ++ /* logical delivery broadcast to all procs: */ ++ .irq_dest_mode = 1, ++ ++ .target_cpus = target_cpus_cluster, ++ .disable_esr = 1, ++ .dest_logical = 0, ++ .check_apicid_used = es7000_check_apicid_used, ++ .check_apicid_present = es7000_check_apicid_present, ++ ++ .vector_allocation_domain = 
es7000_vector_allocation_domain, ++ .init_apic_ldr = es7000_init_apic_ldr_cluster, ++ ++ .ioapic_phys_id_map = es7000_ioapic_phys_id_map, ++ .setup_apic_routing = es7000_setup_apic_routing, ++ .multi_timer_check = NULL, ++ .apicid_to_node = es7000_apicid_to_node, ++ .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, ++ .cpu_present_to_apicid = es7000_cpu_present_to_apicid, ++ .apicid_to_cpu_present = es7000_apicid_to_cpu_present, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = es7000_check_phys_apicid_present, ++ .enable_apic_mode = es7000_enable_apic_mode, ++ .phys_pkg_id = es7000_phys_pkg_id, ++ .mps_oem_check = es7000_mps_oem_check_cluster, ++ ++ .get_apic_id = es7000_get_apic_id, ++ .set_apic_id = NULL, ++ .apic_id_mask = 0xFF << 24, ++ ++ .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = es7000_send_IPI_mask, ++ .send_IPI_mask_allbutself = NULL, ++ .send_IPI_allbutself = es7000_send_IPI_allbutself, ++ .send_IPI_all = es7000_send_IPI_all, ++ .send_IPI_self = default_send_IPI_self, ++ ++ .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip, ++ ++ .trampoline_phys_low = 0x467, ++ .trampoline_phys_high = 0x469, ++ ++ .wait_for_init_deassert = NULL, ++ ++ /* Nothing to do for most platforms, since cleared by the INIT cycle: */ ++ .smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = default_inquire_remote_apic, ++ ++ .read = native_apic_mem_read, ++ .write = native_apic_mem_write, ++ .icr_read = native_apic_icr_read, ++ .icr_write = native_apic_icr_write, ++ .wait_icr_idle = native_apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, ++}; ++ ++struct apic apic_es7000 = { ++ ++ .name = "es7000", ++ .probe = probe_es7000, ++ .acpi_madt_oem_check = es7000_acpi_madt_oem_check, ++ .apic_id_registered = es7000_apic_id_registered, ++ ++ .irq_delivery_mode = dest_Fixed, ++ /* phys delivery to target CPUs: */ ++ .irq_dest_mode = 0, ++ ++ .target_cpus = es7000_target_cpus, ++ .disable_esr = 1, ++ .dest_logical = 0, ++ .check_apicid_used = es7000_check_apicid_used, ++ .check_apicid_present = es7000_check_apicid_present, ++ ++ .vector_allocation_domain = es7000_vector_allocation_domain, ++ .init_apic_ldr = es7000_init_apic_ldr, ++ ++ .ioapic_phys_id_map = es7000_ioapic_phys_id_map, ++ .setup_apic_routing = es7000_setup_apic_routing, ++ .multi_timer_check = NULL, ++ .apicid_to_node = es7000_apicid_to_node, ++ .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, ++ .cpu_present_to_apicid = es7000_cpu_present_to_apicid, ++ .apicid_to_cpu_present = es7000_apicid_to_cpu_present, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = es7000_check_phys_apicid_present, ++ .enable_apic_mode = es7000_enable_apic_mode, ++ .phys_pkg_id = es7000_phys_pkg_id, ++ .mps_oem_check = es7000_mps_oem_check, ++ ++ .get_apic_id = es7000_get_apic_id, ++ .set_apic_id = NULL, ++ .apic_id_mask = 0xFF << 24, ++ ++ .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = es7000_send_IPI_mask, ++ .send_IPI_mask_allbutself = NULL, ++ .send_IPI_allbutself = es7000_send_IPI_allbutself, ++ .send_IPI_all = es7000_send_IPI_all, ++ .send_IPI_self = default_send_IPI_self, ++ ++ .trampoline_phys_low = 0x467, ++ .trampoline_phys_high = 0x469, ++ ++ .wait_for_init_deassert = es7000_wait_for_init_deassert, ++ ++ /* Nothing to do for most platforms, since cleared by the INIT cycle: */ ++ .smp_callin_clear_local_apic = NULL, ++ 
.inquire_remote_apic = default_inquire_remote_apic, ++ ++ .read = native_apic_mem_read, ++ .write = native_apic_mem_write, ++ .icr_read = native_apic_icr_read, ++ .icr_write = native_apic_icr_write, ++ .wait_icr_idle = native_apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, ++}; +Index: linux-2.6-tip/arch/x86/kernel/apic/io_apic.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/io_apic.c +@@ -0,0 +1,4166 @@ ++/* ++ * Intel IO-APIC support for multi-Pentium hosts. ++ * ++ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo ++ * ++ * Many thanks to Stig Venaas for trying out countless experimental ++ * patches and reporting/debugging problems patiently! ++ * ++ * (c) 1999, Multiple IO-APIC support, developed by ++ * Ken-ichi Yaku and ++ * Hidemi Kishimoto , ++ * further tested and cleaned up by Zach Brown ++ * and Ingo Molnar ++ * ++ * Fixes ++ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; ++ * thanks to Eric Gilmore ++ * and Rolf G. Tews ++ * for testing these extensively ++ * Paul Diefenbaugh : Added full ACPI support ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* time_after() */ ++#ifdef CONFIG_ACPI ++#include ++#endif ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define __apicdebuginit(type) static type __init ++ ++/* ++ * Is the SiS APIC rmw bug present ? ++ * -1 = don't know, 0 = no, 1 = yes ++ */ ++int sis_apic_bug = -1; ++ ++static DEFINE_RAW_SPINLOCK(ioapic_lock); ++static DEFINE_RAW_SPINLOCK(vector_lock); ++ ++/* ++ * # of IRQ routing registers ++ */ ++int nr_ioapic_registers[MAX_IO_APICS]; ++ ++/* I/O APIC entries */ ++struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; ++int nr_ioapics; ++ ++/* MP IRQ source entries */ ++struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; ++ ++/* # of MP IRQ source entries */ ++int mp_irq_entries; ++ ++#if defined (CONFIG_MCA) || defined (CONFIG_EISA) ++int mp_bus_id_to_type[MAX_MP_BUSSES]; ++#endif ++ ++DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); ++ ++int skip_ioapic_setup; ++ ++void arch_disable_smp_support(void) ++{ ++#ifdef CONFIG_PCI ++ noioapicquirk = 1; ++ noioapicreroute = -1; ++#endif ++ skip_ioapic_setup = 1; ++} ++ ++static int __init parse_noapic(char *str) ++{ ++ /* disable IO-APIC */ ++ arch_disable_smp_support(); ++ return 0; ++} ++early_param("noapic", parse_noapic); ++ ++struct irq_pin_list; ++ ++/* ++ * This is performance-critical, we want to do it O(1) ++ * ++ * the indexing order of this array favors 1:1 mappings ++ * between pins and IRQs. 
++ */ ++ ++struct irq_pin_list { ++ int apic, pin; ++ struct irq_pin_list *next; ++}; ++ ++static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) ++{ ++ struct irq_pin_list *pin; ++ int node; ++ ++ node = cpu_to_node(cpu); ++ ++ pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); ++ ++ return pin; ++} ++ ++struct irq_cfg { ++ struct irq_pin_list *irq_2_pin; ++ cpumask_var_t domain; ++ cpumask_var_t old_domain; ++ unsigned move_cleanup_count; ++ u8 vector; ++ u8 move_in_progress : 1; ++#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC ++ u8 move_desc_pending : 1; ++#endif ++}; ++ ++/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ ++#ifdef CONFIG_SPARSE_IRQ ++static struct irq_cfg irq_cfgx[] = { ++#else ++static struct irq_cfg irq_cfgx[NR_IRQS] = { ++#endif ++ [0] = { .vector = IRQ0_VECTOR, }, ++ [1] = { .vector = IRQ1_VECTOR, }, ++ [2] = { .vector = IRQ2_VECTOR, }, ++ [3] = { .vector = IRQ3_VECTOR, }, ++ [4] = { .vector = IRQ4_VECTOR, }, ++ [5] = { .vector = IRQ5_VECTOR, }, ++ [6] = { .vector = IRQ6_VECTOR, }, ++ [7] = { .vector = IRQ7_VECTOR, }, ++ [8] = { .vector = IRQ8_VECTOR, }, ++ [9] = { .vector = IRQ9_VECTOR, }, ++ [10] = { .vector = IRQ10_VECTOR, }, ++ [11] = { .vector = IRQ11_VECTOR, }, ++ [12] = { .vector = IRQ12_VECTOR, }, ++ [13] = { .vector = IRQ13_VECTOR, }, ++ [14] = { .vector = IRQ14_VECTOR, }, ++ [15] = { .vector = IRQ15_VECTOR, }, ++}; ++ ++int __init arch_early_irq_init(void) ++{ ++ struct irq_cfg *cfg; ++ struct irq_desc *desc; ++ int count; ++ int i; ++ ++ cfg = irq_cfgx; ++ count = ARRAY_SIZE(irq_cfgx); ++ ++ for (i = 0; i < count; i++) { ++ desc = irq_to_desc(i); ++ desc->chip_data = &cfg[i]; ++ alloc_bootmem_cpumask_var(&cfg[i].domain); ++ alloc_bootmem_cpumask_var(&cfg[i].old_domain); ++ if (i < NR_IRQS_LEGACY) ++ cpumask_setall(cfg[i].domain); ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SPARSE_IRQ ++static struct irq_cfg *irq_cfg(unsigned int irq) ++{ ++ struct irq_cfg *cfg = NULL; ++ struct irq_desc *desc; ++ ++ desc = irq_to_desc(irq); ++ if (desc) ++ cfg = desc->chip_data; ++ ++ return cfg; ++} ++ ++static struct irq_cfg *get_one_free_irq_cfg(int cpu) ++{ ++ struct irq_cfg *cfg; ++ int node; ++ ++ node = cpu_to_node(cpu); ++ ++ cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); ++ if (cfg) { ++ if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { ++ kfree(cfg); ++ cfg = NULL; ++ } else if (!alloc_cpumask_var_node(&cfg->old_domain, ++ GFP_ATOMIC, node)) { ++ free_cpumask_var(cfg->domain); ++ kfree(cfg); ++ cfg = NULL; ++ } else { ++ cpumask_clear(cfg->domain); ++ cpumask_clear(cfg->old_domain); ++ } ++ } ++ ++ return cfg; ++} ++ ++int arch_init_chip_data(struct irq_desc *desc, int cpu) ++{ ++ struct irq_cfg *cfg; ++ ++ cfg = desc->chip_data; ++ if (!cfg) { ++ desc->chip_data = get_one_free_irq_cfg(cpu); ++ if (!desc->chip_data) { ++ printk(KERN_ERR "can not alloc irq_cfg\n"); ++ BUG_ON(1); ++ } ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC ++ ++static void ++init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) ++{ ++ struct irq_pin_list *old_entry, *head, *tail, *entry; ++ ++ cfg->irq_2_pin = NULL; ++ old_entry = old_cfg->irq_2_pin; ++ if (!old_entry) ++ return; ++ ++ entry = get_one_free_irq_2_pin(cpu); ++ if (!entry) ++ return; ++ ++ entry->apic = old_entry->apic; ++ entry->pin = old_entry->pin; ++ head = entry; ++ tail = entry; ++ old_entry = old_entry->next; ++ while (old_entry) { ++ entry = get_one_free_irq_2_pin(cpu); ++ if (!entry) { ++ entry = head; ++ while (entry) { ++ head = entry->next; ++ 
kfree(entry); ++ entry = head; ++ } ++ /* still use the old one */ ++ return; ++ } ++ entry->apic = old_entry->apic; ++ entry->pin = old_entry->pin; ++ tail->next = entry; ++ tail = entry; ++ old_entry = old_entry->next; ++ } ++ ++ tail->next = NULL; ++ cfg->irq_2_pin = head; ++} ++ ++static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) ++{ ++ struct irq_pin_list *entry, *next; ++ ++ if (old_cfg->irq_2_pin == cfg->irq_2_pin) ++ return; ++ ++ entry = old_cfg->irq_2_pin; ++ ++ while (entry) { ++ next = entry->next; ++ kfree(entry); ++ entry = next; ++ } ++ old_cfg->irq_2_pin = NULL; ++} ++ ++void arch_init_copy_chip_data(struct irq_desc *old_desc, ++ struct irq_desc *desc, int cpu) ++{ ++ struct irq_cfg *cfg; ++ struct irq_cfg *old_cfg; ++ ++ cfg = get_one_free_irq_cfg(cpu); ++ ++ if (!cfg) ++ return; ++ ++ desc->chip_data = cfg; ++ ++ old_cfg = old_desc->chip_data; ++ ++ memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); ++ ++ init_copy_irq_2_pin(old_cfg, cfg, cpu); ++} ++ ++static void free_irq_cfg(struct irq_cfg *old_cfg) ++{ ++ kfree(old_cfg); ++} ++ ++void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) ++{ ++ struct irq_cfg *old_cfg, *cfg; ++ ++ old_cfg = old_desc->chip_data; ++ cfg = desc->chip_data; ++ ++ if (old_cfg == cfg) ++ return; ++ ++ if (old_cfg) { ++ free_irq_2_pin(old_cfg, cfg); ++ free_irq_cfg(old_cfg); ++ old_desc->chip_data = NULL; ++ } ++} ++ ++static void ++set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) ++{ ++ struct irq_cfg *cfg = desc->chip_data; ++ ++ if (!cfg->move_in_progress) { ++ /* it means that domain is not changed */ ++ if (!cpumask_intersects(desc->affinity, mask)) ++ cfg->move_desc_pending = 1; ++ } ++} ++#endif ++ ++#else ++static struct irq_cfg *irq_cfg(unsigned int irq) ++{ ++ return irq < nr_irqs ? irq_cfgx + irq : NULL; ++} ++ ++#endif ++ ++#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC ++static inline void ++set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) ++{ ++} ++#endif ++ ++struct io_apic { ++ unsigned int index; ++ unsigned int unused[3]; ++ unsigned int data; ++ unsigned int unused2[11]; ++ unsigned int eoi; ++}; ++ ++static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) ++{ ++ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) ++ + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); ++} ++ ++static inline void io_apic_eoi(unsigned int apic, unsigned int vector) ++{ ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(vector, &io_apic->eoi); ++} ++ ++static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) ++{ ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(reg, &io_apic->index); ++ return readl(&io_apic->data); ++} ++ ++static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) ++{ ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(reg, &io_apic->index); ++ writel(value, &io_apic->data); ++} ++ ++/* ++ * Re-write a value: to be used for read-modify-write ++ * cycles where the read already set up the index register. 
++ * ++ * Older SiS APIC requires we rewrite the index register ++ */ ++static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) ++{ ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ ++ if (sis_apic_bug) ++ writel(reg, &io_apic->index); ++ writel(value, &io_apic->data); ++} ++ ++static bool io_apic_level_ack_pending(struct irq_cfg *cfg) ++{ ++ struct irq_pin_list *entry; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ entry = cfg->irq_2_pin; ++ for (;;) { ++ unsigned int reg; ++ int pin; ++ ++ if (!entry) ++ break; ++ pin = entry->pin; ++ reg = io_apic_read(entry->apic, 0x10 + pin*2); ++ /* Is the remote IRR bit set? */ ++ if (reg & IO_APIC_REDIR_REMOTE_IRR) { ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ return true; ++ } ++ if (!entry->next) ++ break; ++ entry = entry->next; ++ } ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return false; ++} ++ ++union entry_union { ++ struct { u32 w1, w2; }; ++ struct IO_APIC_route_entry entry; ++}; ++ ++static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) ++{ ++ union entry_union eu; ++ unsigned long flags; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); ++ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ return eu.entry; ++} ++ ++/* ++ * When we write a new IO APIC routing entry, we need to write the high ++ * word first! If the mask bit in the low word is clear, we will enable ++ * the interrupt, and we need to make sure the entry is fully populated ++ * before that happens. ++ */ ++static void ++__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) ++{ ++ union entry_union eu; ++ eu.entry = e; ++ io_apic_write(apic, 0x11 + 2*pin, eu.w2); ++ io_apic_write(apic, 0x10 + 2*pin, eu.w1); ++} ++ ++void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) ++{ ++ unsigned long flags; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __ioapic_write_entry(apic, pin, e); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++/* ++ * When we mask an IO APIC routing entry, we need to write the low ++ * word first, in order to set the mask bit before we change the ++ * high bits! 
++ */ ++static void ioapic_mask_entry(int apic, int pin) ++{ ++ unsigned long flags; ++ union entry_union eu = { .entry.mask = 1 }; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(apic, 0x10 + 2*pin, eu.w1); ++ io_apic_write(apic, 0x11 + 2*pin, eu.w2); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++#ifdef CONFIG_SMP ++static void send_cleanup_vector(struct irq_cfg *cfg) ++{ ++ cpumask_var_t cleanup_mask; ++ ++ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { ++ unsigned int i; ++ cfg->move_cleanup_count = 0; ++ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) ++ cfg->move_cleanup_count++; ++ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) ++ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); ++ } else { ++ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); ++ cfg->move_cleanup_count = cpumask_weight(cleanup_mask); ++ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); ++ free_cpumask_var(cleanup_mask); ++ } ++ cfg->move_in_progress = 0; ++} ++ ++static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) ++{ ++ int apic, pin; ++ struct irq_pin_list *entry; ++ u8 vector = cfg->vector; ++ ++ entry = cfg->irq_2_pin; ++ for (;;) { ++ unsigned int reg; ++ ++ if (!entry) ++ break; ++ ++ apic = entry->apic; ++ pin = entry->pin; ++ /* ++ * With interrupt-remapping, destination information comes ++ * from interrupt-remapping table entry. ++ */ ++ if (!irq_remapped(irq)) ++ io_apic_write(apic, 0x11 + pin*2, dest); ++ reg = io_apic_read(apic, 0x10 + pin*2); ++ reg &= ~IO_APIC_REDIR_VECTOR_MASK; ++ reg |= vector; ++ io_apic_modify(apic, 0x10 + pin*2, reg); ++ if (!entry->next) ++ break; ++ entry = entry->next; ++ } ++} ++ ++static int ++assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); ++ ++/* ++ * Either sets desc->affinity to a valid value, and returns ++ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and ++ * leaves desc->affinity untouched. ++ */ ++static unsigned int ++set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) ++{ ++ struct irq_cfg *cfg; ++ unsigned int irq; ++ ++ if (!cpumask_intersects(mask, cpu_online_mask)) ++ return BAD_APICID; ++ ++ irq = desc->irq; ++ cfg = desc->chip_data; ++ if (assign_irq_vector(irq, cfg, mask)) ++ return BAD_APICID; ++ ++ /* check that before desc->addinity get updated */ ++ set_extra_move_desc(desc, mask); ++ ++ cpumask_copy(desc->affinity, mask); ++ ++ return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); ++} ++ ++static void ++set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) ++{ ++ struct irq_cfg *cfg; ++ unsigned long flags; ++ unsigned int dest; ++ unsigned int irq; ++ ++ irq = desc->irq; ++ cfg = desc->chip_data; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ dest = set_desc_affinity(desc, mask); ++ if (dest != BAD_APICID) { ++ /* Only the high 8 bits are valid. */ ++ dest = SET_APIC_LOGICAL_ID(dest); ++ __target_IO_APIC_irq(irq, dest, cfg); ++ } ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void ++set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) ++{ ++ struct irq_desc *desc; ++ ++ desc = irq_to_desc(irq); ++ ++ set_ioapic_affinity_irq_desc(desc, mask); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are ++ * shared ISA-space IRQs, so we have to support them. We are super ++ * fast in the common case, and fast for shared ISA-space IRQs. 
++ */ ++static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) ++{ ++ struct irq_pin_list *entry; ++ ++ entry = cfg->irq_2_pin; ++ if (!entry) { ++ entry = get_one_free_irq_2_pin(cpu); ++ if (!entry) { ++ printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", ++ apic, pin); ++ return; ++ } ++ cfg->irq_2_pin = entry; ++ entry->apic = apic; ++ entry->pin = pin; ++ return; ++ } ++ ++ while (entry->next) { ++ /* not again, please */ ++ if (entry->apic == apic && entry->pin == pin) ++ return; ++ ++ entry = entry->next; ++ } ++ ++ entry->next = get_one_free_irq_2_pin(cpu); ++ entry = entry->next; ++ entry->apic = apic; ++ entry->pin = pin; ++} ++ ++/* ++ * Reroute an IRQ to a different pin. ++ */ ++static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, ++ int oldapic, int oldpin, ++ int newapic, int newpin) ++{ ++ struct irq_pin_list *entry = cfg->irq_2_pin; ++ int replaced = 0; ++ ++ while (entry) { ++ if (entry->apic == oldapic && entry->pin == oldpin) { ++ entry->apic = newapic; ++ entry->pin = newpin; ++ replaced = 1; ++ /* every one is different, right? */ ++ break; ++ } ++ entry = entry->next; ++ } ++ ++ /* why? call replace before add? */ ++ if (!replaced) ++ add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); ++} ++ ++static inline void io_apic_modify_irq(struct irq_cfg *cfg, ++ int mask_and, int mask_or, ++ void (*final)(struct irq_pin_list *entry)) ++{ ++ int pin; ++ struct irq_pin_list *entry; ++ ++ for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { ++ unsigned int reg; ++ pin = entry->pin; ++ reg = io_apic_read(entry->apic, 0x10 + pin * 2); ++ reg &= mask_and; ++ reg |= mask_or; ++ io_apic_modify(entry->apic, 0x10 + pin * 2, reg); ++ if (final) ++ final(entry); ++ } ++} ++ ++static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) ++{ ++ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); ++} ++ ++#ifdef CONFIG_X86_64 ++static void io_apic_sync(struct irq_pin_list *entry) ++{ ++ /* ++ * Synchronize the IO-APIC and the CPU by doing ++ * a dummy read from the IO-APIC ++ */ ++ struct io_apic __iomem *io_apic; ++ io_apic = io_apic_base(entry->apic); ++ readl(&io_apic->data); ++} ++ ++static void __mask_IO_APIC_irq(struct irq_cfg *cfg) ++{ ++ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); ++} ++#else /* CONFIG_X86_32 */ ++static void __mask_IO_APIC_irq(struct irq_cfg *cfg) ++{ ++ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); ++} ++ ++static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) ++{ ++ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, ++ IO_APIC_REDIR_MASKED, NULL); ++} ++ ++static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) ++{ ++ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, ++ IO_APIC_REDIR_LEVEL_TRIGGER, NULL); ++} ++#endif /* CONFIG_X86_32 */ ++ ++static void mask_IO_APIC_irq_desc(struct irq_desc *desc) ++{ ++ struct irq_cfg *cfg = desc->chip_data; ++ unsigned long flags; ++ ++ BUG_ON(!cfg); ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __mask_IO_APIC_irq(cfg); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) ++{ ++ struct irq_cfg *cfg = desc->chip_data; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __unmask_IO_APIC_irq(cfg); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void mask_IO_APIC_irq(unsigned int irq) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ mask_IO_APIC_irq_desc(desc); ++} ++static void unmask_IO_APIC_irq(unsigned int irq) ++{ 
++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ unmask_IO_APIC_irq_desc(desc); ++} ++ ++static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ /* Check delivery_mode to be sure we're not clearing an SMI pin */ ++ entry = ioapic_read_entry(apic, pin); ++ if (entry.delivery_mode == dest_SMI) ++ return; ++ /* ++ * Disable it in the IO-APIC irq-routing table: ++ */ ++ ioapic_mask_entry(apic, pin); ++} ++ ++static void clear_IO_APIC (void) ++{ ++ int apic, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ++ clear_IO_APIC_pin(apic, pin); ++} ++ ++#ifdef CONFIG_X86_32 ++/* ++ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to ++ * specific CPU-side IRQs. ++ */ ++ ++#define MAX_PIRQS 8 ++static int pirq_entries[MAX_PIRQS] = { ++ [0 ... MAX_PIRQS - 1] = -1 ++}; ++ ++static int __init ioapic_pirq_setup(char *str) ++{ ++ int i, max; ++ int ints[MAX_PIRQS+1]; ++ ++ get_options(str, ARRAY_SIZE(ints), ints); ++ ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "PIRQ redirection, working around broken MP-BIOS.\n"); ++ max = MAX_PIRQS; ++ if (ints[0] < MAX_PIRQS) ++ max = ints[0]; ++ ++ for (i = 0; i < max; i++) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); ++ /* ++ * PIRQs are mapped upside down, usually. ++ */ ++ pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; ++ } ++ return 1; ++} ++ ++__setup("pirq=", ioapic_pirq_setup); ++#endif /* CONFIG_X86_32 */ ++ ++#ifdef CONFIG_INTR_REMAP ++/* I/O APIC RTE contents at the OS boot up */ ++static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; ++ ++/* ++ * Saves all the IO-APIC RTE's ++ */ ++int save_IO_APIC_setup(void) ++{ ++ union IO_APIC_reg_01 reg_01; ++ unsigned long flags; ++ int apic, pin; ++ ++ /* ++ * The number of IO-APIC IRQ registers (== #pins): ++ */ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(apic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ nr_ioapic_registers[apic] = reg_01.bits.entries+1; ++ } ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ early_ioapic_entries[apic] = ++ kzalloc(sizeof(struct IO_APIC_route_entry) * ++ nr_ioapic_registers[apic], GFP_KERNEL); ++ if (!early_ioapic_entries[apic]) ++ goto nomem; ++ } ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ++ early_ioapic_entries[apic][pin] = ++ ioapic_read_entry(apic, pin); ++ ++ return 0; ++ ++nomem: ++ while (apic >= 0) ++ kfree(early_ioapic_entries[apic--]); ++ memset(early_ioapic_entries, 0, ++ ARRAY_SIZE(early_ioapic_entries)); ++ ++ return -ENOMEM; ++} ++ ++void mask_IO_APIC_setup(void) ++{ ++ int apic, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ if (!early_ioapic_entries[apic]) ++ break; ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ struct IO_APIC_route_entry entry; ++ ++ entry = early_ioapic_entries[apic][pin]; ++ if (!entry.mask) { ++ entry.mask = 1; ++ ioapic_write_entry(apic, pin, entry); ++ } ++ } ++ } ++} ++ ++void restore_IO_APIC_setup(void) ++{ ++ int apic, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ if (!early_ioapic_entries[apic]) ++ break; ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ++ ioapic_write_entry(apic, pin, ++ early_ioapic_entries[apic][pin]); ++ kfree(early_ioapic_entries[apic]); ++ early_ioapic_entries[apic] = NULL; ++ } ++} ++ ++void reinit_intr_remapped_IO_APIC(int intr_remapping) 
++{ ++ /* ++ * for now plain restore of previous settings. ++ * TBD: In the case of OS enabling interrupt-remapping, ++ * IO-APIC RTE's need to be setup to point to interrupt-remapping ++ * table entries. for now, do a plain restore, and wait for ++ * the setup_IO_APIC_irqs() to do proper initialization. ++ */ ++ restore_IO_APIC_setup(); ++} ++#endif ++ ++/* ++ * Find the IRQ entry number of a certain pin. ++ */ ++static int find_irq_entry(int apic, int pin, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].irqtype == type && ++ (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || ++ mp_irqs[i].dstapic == MP_APIC_ALL) && ++ mp_irqs[i].dstirq == pin) ++ return i; ++ ++ return -1; ++} ++ ++/* ++ * Find the pin to which IRQ[irq] (ISA) is connected ++ */ ++static int __init find_isa_irq_pin(int irq, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].srcbus; ++ ++ if (test_bit(lbus, mp_bus_not_pci) && ++ (mp_irqs[i].irqtype == type) && ++ (mp_irqs[i].srcbusirq == irq)) ++ ++ return mp_irqs[i].dstirq; ++ } ++ return -1; ++} ++ ++static int __init find_isa_irq_apic(int irq, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].srcbus; ++ ++ if (test_bit(lbus, mp_bus_not_pci) && ++ (mp_irqs[i].irqtype == type) && ++ (mp_irqs[i].srcbusirq == irq)) ++ break; ++ } ++ if (i < mp_irq_entries) { ++ int apic; ++ for(apic = 0; apic < nr_ioapics; apic++) { ++ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) ++ return apic; ++ } ++ } ++ ++ return -1; ++} ++ ++/* ++ * Find a specific PCI IRQ entry. ++ * Not an __init, possibly needed by modules ++ */ ++static int pin_2_irq(int idx, int apic, int pin); ++ ++int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) ++{ ++ int apic, i, best_guess = -1; ++ ++ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", ++ bus, slot, pin); ++ if (test_bit(bus, mp_bus_not_pci)) { ++ apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); ++ return -1; ++ } ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].srcbus; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || ++ mp_irqs[i].dstapic == MP_APIC_ALL) ++ break; ++ ++ if (!test_bit(lbus, mp_bus_not_pci) && ++ !mp_irqs[i].irqtype && ++ (bus == lbus) && ++ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { ++ int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); ++ ++ if (!(apic || IO_APIC_IRQ(irq))) ++ continue; ++ ++ if (pin == (mp_irqs[i].srcbusirq & 3)) ++ return irq; ++ /* ++ * Use the first all-but-pin matching entry as a ++ * best-guess fuzzy result for broken mptables. ++ */ ++ if (best_guess < 0) ++ best_guess = irq; ++ } ++ } ++ return best_guess; ++} ++ ++EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); ++ ++#if defined(CONFIG_EISA) || defined(CONFIG_MCA) ++/* ++ * EISA Edge/Level control register, ELCR ++ */ ++static int EISA_ELCR(unsigned int irq) ++{ ++ if (irq < NR_IRQS_LEGACY) { ++ unsigned int port = 0x4d0 + (irq >> 3); ++ return (inb(port) >> (irq & 7)) & 1; ++ } ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "Broken MPtable reports ISA irq %d\n", irq); ++ return 0; ++} ++ ++#endif ++ ++/* ISA interrupts are always polarity zero edge triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_ISA_trigger(idx) (0) ++#define default_ISA_polarity(idx) (0) ++ ++/* EISA interrupts are always polarity zero and can be edge or level ++ * trigger depending on the ELCR value. 
If an interrupt is listed as ++ * EISA conforming in the MP table, that means its trigger type must ++ * be read in from the ELCR */ ++ ++#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) ++#define default_EISA_polarity(idx) default_ISA_polarity(idx) ++ ++/* PCI interrupts are always polarity one level triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_PCI_trigger(idx) (1) ++#define default_PCI_polarity(idx) (1) ++ ++/* MCA interrupts are always polarity zero level triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_MCA_trigger(idx) (1) ++#define default_MCA_polarity(idx) default_ISA_polarity(idx) ++ ++static int MPBIOS_polarity(int idx) ++{ ++ int bus = mp_irqs[idx].srcbus; ++ int polarity; ++ ++ /* ++ * Determine IRQ line polarity (high active or low active): ++ */ ++ switch (mp_irqs[idx].irqflag & 3) ++ { ++ case 0: /* conforms, ie. bus-type dependent polarity */ ++ if (test_bit(bus, mp_bus_not_pci)) ++ polarity = default_ISA_polarity(idx); ++ else ++ polarity = default_PCI_polarity(idx); ++ break; ++ case 1: /* high active */ ++ { ++ polarity = 0; ++ break; ++ } ++ case 2: /* reserved */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ case 3: /* low active */ ++ { ++ polarity = 1; ++ break; ++ } ++ default: /* invalid */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ } ++ return polarity; ++} ++ ++static int MPBIOS_trigger(int idx) ++{ ++ int bus = mp_irqs[idx].srcbus; ++ int trigger; ++ ++ /* ++ * Determine IRQ trigger mode (edge or level sensitive): ++ */ ++ switch ((mp_irqs[idx].irqflag>>2) & 3) ++ { ++ case 0: /* conforms, ie. bus-type dependent */ ++ if (test_bit(bus, mp_bus_not_pci)) ++ trigger = default_ISA_trigger(idx); ++ else ++ trigger = default_PCI_trigger(idx); ++#if defined(CONFIG_EISA) || defined(CONFIG_MCA) ++ switch (mp_bus_id_to_type[bus]) { ++ case MP_BUS_ISA: /* ISA pin */ ++ { ++ /* set before the switch */ ++ break; ++ } ++ case MP_BUS_EISA: /* EISA pin */ ++ { ++ trigger = default_EISA_trigger(idx); ++ break; ++ } ++ case MP_BUS_PCI: /* PCI pin */ ++ { ++ /* set before the switch */ ++ break; ++ } ++ case MP_BUS_MCA: /* MCA pin */ ++ { ++ trigger = default_MCA_trigger(idx); ++ break; ++ } ++ default: ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 1; ++ break; ++ } ++ } ++#endif ++ break; ++ case 1: /* edge */ ++ { ++ trigger = 0; ++ break; ++ } ++ case 2: /* reserved */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 1; ++ break; ++ } ++ case 3: /* level */ ++ { ++ trigger = 1; ++ break; ++ } ++ default: /* invalid */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 0; ++ break; ++ } ++ } ++ return trigger; ++} ++ ++static inline int irq_polarity(int idx) ++{ ++ return MPBIOS_polarity(idx); ++} ++ ++static inline int irq_trigger(int idx) ++{ ++ return MPBIOS_trigger(idx); ++} ++ ++int (*ioapic_renumber_irq)(int ioapic, int irq); ++static int pin_2_irq(int idx, int apic, int pin) ++{ ++ int irq, i; ++ int bus = mp_irqs[idx].srcbus; ++ ++ /* ++ * Debugging check, we are in big trouble if this message pops up! 
++ */ ++ if (mp_irqs[idx].dstirq != pin) ++ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); ++ ++ if (test_bit(bus, mp_bus_not_pci)) { ++ irq = mp_irqs[idx].srcbusirq; ++ } else { ++ /* ++ * PCI IRQs are mapped in order ++ */ ++ i = irq = 0; ++ while (i < apic) ++ irq += nr_ioapic_registers[i++]; ++ irq += pin; ++ /* ++ * For MPS mode, so far only needed by ES7000 platform ++ */ ++ if (ioapic_renumber_irq) ++ irq = ioapic_renumber_irq(apic, irq); ++ } ++ ++#ifdef CONFIG_X86_32 ++ /* ++ * PCI IRQ command line redirection. Yes, limits are hardcoded. ++ */ ++ if ((pin >= 16) && (pin <= 23)) { ++ if (pirq_entries[pin-16] != -1) { ++ if (!pirq_entries[pin-16]) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "disabling PIRQ%d\n", pin-16); ++ } else { ++ irq = pirq_entries[pin-16]; ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "using PIRQ%d -> IRQ %d\n", ++ pin-16, irq); ++ } ++ } ++ } ++#endif ++ ++ return irq; ++} ++ ++void lock_vector_lock(void) ++{ ++ /* Used to the online set of cpus does not change ++ * during assign_irq_vector. ++ */ ++ spin_lock(&vector_lock); ++} ++ ++void unlock_vector_lock(void) ++{ ++ spin_unlock(&vector_lock); ++} ++ ++static int ++__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) ++{ ++ /* ++ * NOTE! The local APIC isn't very good at handling ++ * multiple interrupts at the same interrupt level. ++ * As the interrupt level is determined by taking the ++ * vector number and shifting that right by 4, we ++ * want to spread these out a bit so that they don't ++ * all fall in the same interrupt level. ++ * ++ * Also, we've got to be careful not to trash gate ++ * 0x80, because int 0x80 is hm, kind of importantish. ;) ++ */ ++ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; ++ unsigned int old_vector; ++ int cpu, err; ++ cpumask_var_t tmp_mask; ++ ++ if ((cfg->move_in_progress) || cfg->move_cleanup_count) ++ return -EBUSY; ++ ++ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) ++ return -ENOMEM; ++ ++ old_vector = cfg->vector; ++ if (old_vector) { ++ cpumask_and(tmp_mask, mask, cpu_online_mask); ++ cpumask_and(tmp_mask, cfg->domain, tmp_mask); ++ if (!cpumask_empty(tmp_mask)) { ++ free_cpumask_var(tmp_mask); ++ return 0; ++ } ++ } ++ ++ /* Only try and allocate irqs on cpus that are present */ ++ err = -ENOSPC; ++ for_each_cpu_and(cpu, mask, cpu_online_mask) { ++ int new_cpu; ++ int vector, offset; ++ ++ apic->vector_allocation_domain(cpu, tmp_mask); ++ ++ vector = current_vector; ++ offset = current_offset; ++next: ++ vector += 8; ++ if (vector >= first_system_vector) { ++ /* If out of vectors on large boxen, must share them. */ ++ offset = (offset + 1) % 8; ++ vector = FIRST_DEVICE_VECTOR + offset; ++ } ++ if (unlikely(current_vector == vector)) ++ continue; ++ ++ if (test_bit(vector, used_vectors)) ++ goto next; ++ ++ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) ++ if (per_cpu(vector_irq, new_cpu)[vector] != -1) ++ goto next; ++ /* Found one! 
*/ ++ current_vector = vector; ++ current_offset = offset; ++ if (old_vector) { ++ cfg->move_in_progress = 1; ++ cpumask_copy(cfg->old_domain, cfg->domain); ++ } ++ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) ++ per_cpu(vector_irq, new_cpu)[vector] = irq; ++ cfg->vector = vector; ++ cpumask_copy(cfg->domain, tmp_mask); ++ err = 0; ++ break; ++ } ++ free_cpumask_var(tmp_mask); ++ return err; ++} ++ ++static int ++assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) ++{ ++ int err; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ err = __assign_irq_vector(irq, cfg, mask); ++ spin_unlock_irqrestore(&vector_lock, flags); ++ return err; ++} ++ ++static void __clear_irq_vector(int irq, struct irq_cfg *cfg) ++{ ++ int cpu, vector; ++ ++ BUG_ON(!cfg->vector); ++ ++ vector = cfg->vector; ++ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) ++ per_cpu(vector_irq, cpu)[vector] = -1; ++ ++ cfg->vector = 0; ++ cpumask_clear(cfg->domain); ++ ++ if (likely(!cfg->move_in_progress)) ++ return; ++ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { ++ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; ++ vector++) { ++ if (per_cpu(vector_irq, cpu)[vector] != irq) ++ continue; ++ per_cpu(vector_irq, cpu)[vector] = -1; ++ break; ++ } ++ } ++ cfg->move_in_progress = 0; ++} ++ ++void __setup_vector_irq(int cpu) ++{ ++ /* Initialize vector_irq on a new cpu */ ++ /* This function must be called with vector_lock held */ ++ int irq, vector; ++ struct irq_cfg *cfg; ++ struct irq_desc *desc; ++ ++ /* Mark the inuse vectors */ ++ for_each_irq_desc(irq, desc) { ++ cfg = desc->chip_data; ++ if (!cpumask_test_cpu(cpu, cfg->domain)) ++ continue; ++ vector = cfg->vector; ++ per_cpu(vector_irq, cpu)[vector] = irq; ++ } ++ /* Mark the free vectors */ ++ for (vector = 0; vector < NR_VECTORS; ++vector) { ++ irq = per_cpu(vector_irq, cpu)[vector]; ++ if (irq < 0) ++ continue; ++ ++ cfg = irq_cfg(irq); ++ if (!cpumask_test_cpu(cpu, cfg->domain)) ++ per_cpu(vector_irq, cpu)[vector] = -1; ++ } ++} ++ ++static struct irq_chip ioapic_chip; ++static struct irq_chip ir_ioapic_chip; ++ ++#define IOAPIC_AUTO -1 ++#define IOAPIC_EDGE 0 ++#define IOAPIC_LEVEL 1 ++ ++#ifdef CONFIG_X86_32 ++static inline int IO_APIC_irq_trigger(int irq) ++{ ++ int apic, idx, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ idx = find_irq_entry(apic, pin, mp_INT); ++ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) ++ return irq_trigger(idx); ++ } ++ } ++ /* ++ * nonexistent IRQs are edge default ++ */ ++ return 0; ++} ++#else ++static inline int IO_APIC_irq_trigger(int irq) ++{ ++ return 1; ++} ++#endif ++ ++static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) ++{ ++ ++ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || ++ trigger == IOAPIC_LEVEL) ++ desc->status |= IRQ_LEVEL; ++ else ++ desc->status &= ~IRQ_LEVEL; ++ ++ if (irq_remapped(irq)) { ++ desc->status |= IRQ_MOVE_PCNTXT; ++ if (trigger) ++ set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, ++ handle_fasteoi_irq, ++ "fasteoi"); ++ else ++ set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, ++ handle_edge_irq, "edge"); ++ return; ++ } ++ ++ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || ++ trigger == IOAPIC_LEVEL) ++ set_irq_chip_and_handler_name(irq, &ioapic_chip, ++ handle_fasteoi_irq, ++ "fasteoi"); ++ else ++ set_irq_chip_and_handler_name(irq, &ioapic_chip, ++ handle_edge_irq, "edge"); ++} ++ 
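The vector allocator in __assign_irq_vector() above steps candidate vectors by 8 and rotates an offset when it wraps, because the local APIC derives an interrupt's priority class from bits 7:4 of the vector number; stepping by 8 keeps successive allocations from piling into a single 16-vector priority class. The standalone user-space sketch below is illustrative only and not part of the patch: the FIRST_DEVICE_VECTOR/FIRST_SYSTEM_VECTOR values are assumptions rather than the kernel's definitions, and the real allocator additionally skips vectors that are already in use on the target CPUs. It simply prints the candidate sequence to show the spreading effect.

#include <stdio.h>

#define FIRST_DEVICE_VECTOR   0x41   /* assumed value, for illustration only */
#define FIRST_SYSTEM_VECTOR   0xef   /* assumed value, for illustration only */

int main(void)
{
	int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
	int i;

	for (i = 0; i < 10; i++) {
		int vector = current_vector + 8;

		if (vector >= FIRST_SYSTEM_VECTOR) {
			/* out of vectors in this pass: rotate the offset and wrap */
			current_offset = (current_offset + 1) % 8;
			vector = FIRST_DEVICE_VECTOR + current_offset;
		}
		current_vector = vector;

		/* the local APIC priority class is the high nibble of the vector */
		printf("allocation %2d: vector 0x%02x -> priority class %u\n",
		       i, (unsigned)vector, (unsigned)vector >> 4);
	}
	return 0;
}

Compiled and run on its own, the sketch shows the priority class advancing every couple of allocations instead of sixteen consecutive vectors sharing one class, which is the behaviour the in-kernel comment about "spreading these out a bit" is describing.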
++int setup_ioapic_entry(int apic_id, int irq, ++ struct IO_APIC_route_entry *entry, ++ unsigned int destination, int trigger, ++ int polarity, int vector, int pin) ++{ ++ /* ++ * add it to the IO-APIC irq-routing table: ++ */ ++ memset(entry,0,sizeof(*entry)); ++ ++ if (intr_remapping_enabled) { ++ struct intel_iommu *iommu = map_ioapic_to_ir(apic_id); ++ struct irte irte; ++ struct IR_IO_APIC_route_entry *ir_entry = ++ (struct IR_IO_APIC_route_entry *) entry; ++ int index; ++ ++ if (!iommu) ++ panic("No mapping iommu for ioapic %d\n", apic_id); ++ ++ index = alloc_irte(iommu, irq, 1); ++ if (index < 0) ++ panic("Failed to allocate IRTE for ioapic %d\n", apic_id); ++ ++ memset(&irte, 0, sizeof(irte)); ++ ++ irte.present = 1; ++ irte.dst_mode = apic->irq_dest_mode; ++ /* ++ * Trigger mode in the IRTE will always be edge, and the ++ * actual level or edge trigger will be setup in the IO-APIC ++ * RTE. This will help simplify level triggered irq migration. ++ * For more details, see the comments above explainig IO-APIC ++ * irq migration in the presence of interrupt-remapping. ++ */ ++ irte.trigger_mode = 0; ++ irte.dlvry_mode = apic->irq_delivery_mode; ++ irte.vector = vector; ++ irte.dest_id = IRTE_DEST(destination); ++ ++ modify_irte(irq, &irte); ++ ++ ir_entry->index2 = (index >> 15) & 0x1; ++ ir_entry->zero = 0; ++ ir_entry->format = 1; ++ ir_entry->index = (index & 0x7fff); ++ /* ++ * IO-APIC RTE will be configured with virtual vector. ++ * irq handler will do the explicit EOI to the io-apic. ++ */ ++ ir_entry->vector = pin; ++ } else { ++ entry->delivery_mode = apic->irq_delivery_mode; ++ entry->dest_mode = apic->irq_dest_mode; ++ entry->dest = destination; ++ entry->vector = vector; ++ } ++ ++ entry->mask = 0; /* enable IRQ */ ++ entry->trigger = trigger; ++ entry->polarity = polarity; ++ ++ /* Mask level triggered irqs. ++ * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
++ */ ++ if (trigger) ++ entry->mask = 1; ++ return 0; ++} ++ ++static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, ++ int trigger, int polarity) ++{ ++ struct irq_cfg *cfg; ++ struct IO_APIC_route_entry entry; ++ unsigned int dest; ++ ++ if (!IO_APIC_IRQ(irq)) ++ return; ++ ++ cfg = desc->chip_data; ++ ++ if (assign_irq_vector(irq, cfg, apic->target_cpus())) ++ return; ++ ++ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); ++ ++ apic_printk(APIC_VERBOSE,KERN_DEBUG ++ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " ++ "IRQ %d Mode:%i Active:%i)\n", ++ apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, ++ irq, trigger, polarity); ++ ++ ++ if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, ++ dest, trigger, polarity, cfg->vector, pin)) { ++ printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", ++ mp_ioapics[apic_id].apicid, pin); ++ __clear_irq_vector(irq, cfg); ++ return; ++ } ++ ++ ioapic_register_intr(irq, desc, trigger); ++ if (irq < NR_IRQS_LEGACY) ++ disable_8259A_irq(irq); ++ ++ ioapic_write_entry(apic_id, pin, entry); ++} ++ ++static void __init setup_IO_APIC_irqs(void) ++{ ++ int apic_id, pin, idx, irq; ++ int notcon = 0; ++ struct irq_desc *desc; ++ struct irq_cfg *cfg; ++ int cpu = boot_cpu_id; ++ ++ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); ++ ++ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { ++ for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { ++ ++ idx = find_irq_entry(apic_id, pin, mp_INT); ++ if (idx == -1) { ++ if (!notcon) { ++ notcon = 1; ++ apic_printk(APIC_VERBOSE, ++ KERN_DEBUG " %d-%d", ++ mp_ioapics[apic_id].apicid, pin); ++ } else ++ apic_printk(APIC_VERBOSE, " %d-%d", ++ mp_ioapics[apic_id].apicid, pin); ++ continue; ++ } ++ if (notcon) { ++ apic_printk(APIC_VERBOSE, ++ " (apicid-pin) not connected\n"); ++ notcon = 0; ++ } ++ ++ irq = pin_2_irq(idx, apic_id, pin); ++ ++ /* ++ * Skip the timer IRQ if there's a quirk handler ++ * installed and if it returns 1: ++ */ ++ if (apic->multi_timer_check && ++ apic->multi_timer_check(apic_id, irq)) ++ continue; ++ ++ desc = irq_to_desc_alloc_cpu(irq, cpu); ++ if (!desc) { ++ printk(KERN_INFO "can not get irq_desc for %d\n", irq); ++ continue; ++ } ++ cfg = desc->chip_data; ++ add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); ++ ++ setup_IO_APIC_irq(apic_id, pin, irq, desc, ++ irq_trigger(idx), irq_polarity(idx)); ++ } ++ } ++ ++ if (notcon) ++ apic_printk(APIC_VERBOSE, ++ " (apicid-pin) not connected\n"); ++} ++ ++/* ++ * Set up the timer pin, possibly with the 8259A-master behind. ++ */ ++static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, ++ int vector) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ if (intr_remapping_enabled) ++ return; ++ ++ memset(&entry, 0, sizeof(entry)); ++ ++ /* ++ * We use logical delivery to get the timer IRQ ++ * to the first CPU. ++ */ ++ entry.dest_mode = apic->irq_dest_mode; ++ entry.mask = 0; /* don't mask IRQ for edge */ ++ entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); ++ entry.delivery_mode = apic->irq_delivery_mode; ++ entry.polarity = 0; ++ entry.trigger = 0; ++ entry.vector = vector; ++ ++ /* ++ * The timer IRQ doesn't have to know that behind the ++ * scene we may have a 8259A-master in AEOI mode ... 
++ */ ++ set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); ++ ++ /* ++ * Add it to the IO-APIC irq-routing table: ++ */ ++ ioapic_write_entry(apic_id, pin, entry); ++} ++ ++ ++__apicdebuginit(void) print_IO_APIC(void) ++{ ++ int apic, i; ++ union IO_APIC_reg_00 reg_00; ++ union IO_APIC_reg_01 reg_01; ++ union IO_APIC_reg_02 reg_02; ++ union IO_APIC_reg_03 reg_03; ++ unsigned long flags; ++ struct irq_cfg *cfg; ++ struct irq_desc *desc; ++ unsigned int irq; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); ++ for (i = 0; i < nr_ioapics; i++) ++ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", ++ mp_ioapics[i].apicid, nr_ioapic_registers[i]); ++ ++ /* ++ * We are a bit conservative about what we expect. We have to ++ * know about every hardware change ASAP. ++ */ ++ printk(KERN_INFO "testing the IO APIC.......................\n"); ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic, 0); ++ reg_01.raw = io_apic_read(apic, 1); ++ if (reg_01.bits.version >= 0x10) ++ reg_02.raw = io_apic_read(apic, 2); ++ if (reg_01.bits.version >= 0x20) ++ reg_03.raw = io_apic_read(apic, 3); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ printk("\n"); ++ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); ++ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); ++ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); ++ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); ++ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); ++ ++ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); ++ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); ++ ++ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); ++ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); ++ ++ /* ++ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, ++ * but the value of reg_02 is read as the previous read register ++ * value, so ignore it if reg_02 == reg_01. ++ */ ++ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { ++ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); ++ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); ++ } ++ ++ /* ++ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 ++ * or reg_03, but the value of reg_0[23] is read as the previous read ++ * register value, so ignore it if reg_03 == reg_0[12]. ++ */ ++ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && ++ reg_03.raw != reg_01.raw) { ++ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); ++ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); ++ } ++ ++ printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); ++ ++ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" ++ " Stat Dmod Deli Vect: \n"); ++ ++ for (i = 0; i <= reg_01.bits.entries; i++) { ++ struct IO_APIC_route_entry entry; ++ ++ entry = ioapic_read_entry(apic, i); ++ ++ printk(KERN_DEBUG " %02x %03X ", ++ i, ++ entry.dest ++ ); ++ ++ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", ++ entry.mask, ++ entry.trigger, ++ entry.irr, ++ entry.polarity, ++ entry.delivery_status, ++ entry.dest_mode, ++ entry.delivery_mode, ++ entry.vector ++ ); ++ } ++ } ++ printk(KERN_DEBUG "IRQ to pin mappings:\n"); ++ for_each_irq_desc(irq, desc) { ++ struct irq_pin_list *entry; ++ ++ cfg = desc->chip_data; ++ entry = cfg->irq_2_pin; ++ if (!entry) ++ continue; ++ printk(KERN_DEBUG "IRQ%d ", irq); ++ for (;;) { ++ printk("-> %d:%d", entry->apic, entry->pin); ++ if (!entry->next) ++ break; ++ entry = entry->next; ++ } ++ printk("\n"); ++ } ++ ++ printk(KERN_INFO ".................................... done.\n"); ++ ++ return; ++} ++ ++__apicdebuginit(void) print_APIC_bitfield(int base) ++{ ++ unsigned int v; ++ int i, j; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); ++ for (i = 0; i < 8; i++) { ++ v = apic_read(base + i*0x10); ++ for (j = 0; j < 32; j++) { ++ if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ ++ apic_write(APIC_ESR, 0); ++ ++ v = apic_read(APIC_ESR); ++ printk(KERN_DEBUG "... APIC ESR: %08x\n", v); ++ } ++ ++ icr = apic_icr_read(); ++ printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); ++ printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); ++ ++ v = apic_read(APIC_LVTT); ++ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); ++ ++ if (maxlvt > 3) { /* PC is LVT#4. */ ++ v = apic_read(APIC_LVTPC); ++ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); ++ } ++ v = apic_read(APIC_LVT0); ++ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); ++ v = apic_read(APIC_LVT1); ++ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); ++ ++ if (maxlvt > 2) { /* ERR is LVT#3. */ ++ v = apic_read(APIC_LVTERR); ++ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); ++ } ++ ++ v = apic_read(APIC_TMICT); ++ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); ++ v = apic_read(APIC_TMCCT); ++ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); ++ v = apic_read(APIC_TDCR); ++ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); ++ printk("\n"); ++} ++ ++__apicdebuginit(void) print_all_local_APICs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_online_cpu(cpu) ++ smp_call_function_single(cpu, print_local_APIC, NULL, 1); ++ preempt_enable(); ++} ++ ++__apicdebuginit(void) print_PIC(void) ++{ ++ unsigned int v; ++ unsigned long flags; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "\nprinting PIC contents\n"); ++ ++ spin_lock_irqsave(&i8259A_lock, flags); ++ ++ v = inb(0xa1) << 8 | inb(0x21); ++ printk(KERN_DEBUG "... PIC IMR: %04x\n", v); ++ ++ v = inb(0xa0) << 8 | inb(0x20); ++ printk(KERN_DEBUG "... PIC IRR: %04x\n", v); ++ ++ outb(0x0b,0xa0); ++ outb(0x0b,0x20); ++ v = inb(0xa0) << 8 | inb(0x20); ++ outb(0x0a,0xa0); ++ outb(0x0a,0x20); ++ ++ spin_unlock_irqrestore(&i8259A_lock, flags); ++ ++ printk(KERN_DEBUG "... PIC ISR: %04x\n", v); ++ ++ v = inb(0x4d1) << 8 | inb(0x4d0); ++ printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v); ++} ++ ++__apicdebuginit(int) print_all_ICs(void) ++{ ++ print_PIC(); ++ print_all_local_APICs(); ++ print_IO_APIC(); ++ ++ return 0; ++} ++ ++fs_initcall(print_all_ICs); ++ ++ ++/* Where if anywhere is the i8259 connect in external int mode */ ++static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; ++ ++void __init enable_IO_APIC(void) ++{ ++ union IO_APIC_reg_01 reg_01; ++ int i8259_apic, i8259_pin; ++ int apic; ++ unsigned long flags; ++ ++ /* ++ * The number of IO-APIC IRQ registers (== #pins): ++ */ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(apic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ nr_ioapic_registers[apic] = reg_01.bits.entries+1; ++ } ++ for(apic = 0; apic < nr_ioapics; apic++) { ++ int pin; ++ /* See if any of the pins is in ExtINT mode */ ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ struct IO_APIC_route_entry entry; ++ entry = ioapic_read_entry(apic, pin); ++ ++ /* If the interrupt line is enabled and in ExtInt mode ++ * I have found the pin where the i8259 is connected. ++ */ ++ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { ++ ioapic_i8259.apic = apic; ++ ioapic_i8259.pin = pin; ++ goto found_i8259; ++ } ++ } ++ } ++ found_i8259: ++ /* Look to see what if the MP table has reported the ExtINT */ ++ /* If we could not find the appropriate pin by looking at the ioapic ++ * the i8259 probably is not connected the ioapic but give the ++ * mptable a chance anyway. ++ */ ++ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); ++ i8259_apic = find_isa_irq_apic(0, mp_ExtINT); ++ /* Trust the MP table if nothing is setup in the hardware */ ++ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { ++ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); ++ ioapic_i8259.pin = i8259_pin; ++ ioapic_i8259.apic = i8259_apic; ++ } ++ /* Complain if the MP table and the hardware disagree */ ++ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && ++ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) ++ { ++ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); ++ } ++ ++ /* ++ * Do not trust the IO-APIC being empty at bootup ++ */ ++ clear_IO_APIC(); ++} ++ ++/* ++ * Not an __init, needed by the reboot code ++ */ ++void disable_IO_APIC(void) ++{ ++ /* ++ * Clear the IO-APIC before rebooting: ++ */ ++ clear_IO_APIC(); ++ ++ /* ++ * If the i8259 is routed through an IOAPIC ++ * Put that IOAPIC in virtual wire mode ++ * so legacy interrupts can be delivered. ++ * ++ * With interrupt-remapping, for now we will use virtual wire A mode, ++ * as virtual wire B is little complex (need to configure both ++ * IOAPIC RTE aswell as interrupt-remapping table entry). ++ * As this gets called during crash dump, keep this simple for now. ++ */ ++ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { ++ struct IO_APIC_route_entry entry; ++ ++ memset(&entry, 0, sizeof(entry)); ++ entry.mask = 0; /* Enabled */ ++ entry.trigger = 0; /* Edge */ ++ entry.irr = 0; ++ entry.polarity = 0; /* High */ ++ entry.delivery_status = 0; ++ entry.dest_mode = 0; /* Physical */ ++ entry.delivery_mode = dest_ExtINT; /* ExtInt */ ++ entry.vector = 0; ++ entry.dest = read_apic_id(); ++ ++ /* ++ * Add it to the IO-APIC irq-routing table: ++ */ ++ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); ++ } ++ ++ /* ++ * Use virtual wire A mode when interrupt remapping is enabled. 
++ */ ++ disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); ++} ++ ++#ifdef CONFIG_X86_32 ++/* ++ * function to set the IO-APIC physical IDs based on the ++ * values stored in the MPC table. ++ * ++ * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 ++ */ ++ ++static void __init setup_ioapic_ids_from_mpc(void) ++{ ++ union IO_APIC_reg_00 reg_00; ++ physid_mask_t phys_id_present_map; ++ int apic_id; ++ int i; ++ unsigned char old_id; ++ unsigned long flags; ++ ++ if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) ++ return; ++ ++ /* ++ * Don't check I/O APIC IDs for xAPIC systems. They have ++ * no meaning without the serial APIC bus. ++ */ ++ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ++ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) ++ return; ++ /* ++ * This is broken; anything with a real cpu count has to ++ * circumvent this idiocy regardless. ++ */ ++ phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); ++ ++ /* ++ * Set the IOAPIC ID to the value stored in the MPC table. ++ */ ++ for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { ++ ++ /* Read the register 0 value */ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic_id, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ old_id = mp_ioapics[apic_id].apicid; ++ ++ if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { ++ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", ++ apic_id, mp_ioapics[apic_id].apicid); ++ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", ++ reg_00.bits.ID); ++ mp_ioapics[apic_id].apicid = reg_00.bits.ID; ++ } ++ ++ /* ++ * Sanity check, is the ID really free? Every APIC in a ++ * system must have a unique ID or we get lots of nice ++ * 'stuck on smp_invalidate_needed IPI wait' messages. ++ */ ++ if (apic->check_apicid_used(phys_id_present_map, ++ mp_ioapics[apic_id].apicid)) { ++ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", ++ apic_id, mp_ioapics[apic_id].apicid); ++ for (i = 0; i < get_physical_broadcast(); i++) ++ if (!physid_isset(i, phys_id_present_map)) ++ break; ++ if (i >= get_physical_broadcast()) ++ panic("Max APIC ID exceeded!\n"); ++ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", ++ i); ++ physid_set(i, phys_id_present_map); ++ mp_ioapics[apic_id].apicid = i; ++ } else { ++ physid_mask_t tmp; ++ tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); ++ apic_printk(APIC_VERBOSE, "Setting %d in the " ++ "phys_id_present_map\n", ++ mp_ioapics[apic_id].apicid); ++ physids_or(phys_id_present_map, phys_id_present_map, tmp); ++ } ++ ++ ++ /* ++ * We need to adjust the IRQ routing table ++ * if the ID changed. ++ */ ++ if (old_id != mp_ioapics[apic_id].apicid) ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].dstapic == old_id) ++ mp_irqs[i].dstapic ++ = mp_ioapics[apic_id].apicid; ++ ++ /* ++ * Read the right value from the MPC table and ++ * write it into the ID register. 
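++		 * (mp_ioapics[apic_id].apicid may already have been fixed up
++		 * above if the BIOS-supplied ID was out of range or in use.)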
++ */ ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "...changing IO-APIC physical APIC ID to %d ...", ++ mp_ioapics[apic_id].apicid); ++ ++ reg_00.bits.ID = mp_ioapics[apic_id].apicid; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(apic_id, 0, reg_00.raw); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ /* ++ * Sanity check ++ */ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic_id, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) ++ printk("could not set ID!\n"); ++ else ++ apic_printk(APIC_VERBOSE, " ok.\n"); ++ } ++} ++#endif ++ ++int no_timer_check __initdata; ++ ++static int __init notimercheck(char *s) ++{ ++ no_timer_check = 1; ++ return 1; ++} ++__setup("no_timer_check", notimercheck); ++ ++/* ++ * There is a nasty bug in some older SMP boards, their mptable lies ++ * about the timer IRQ. We do the following to work around the situation: ++ * ++ * - timer IRQ defaults to IO-APIC IRQ ++ * - if this function detects that timer IRQs are defunct, then we fall ++ * back to ISA timer IRQs ++ */ ++static int __init timer_irq_works(void) ++{ ++ unsigned long t1 = jiffies; ++ unsigned long flags; ++ ++ if (no_timer_check) ++ return 1; ++ ++ local_save_flags(flags); ++ local_irq_enable(); ++ /* Let ten ticks pass... */ ++ mdelay((10 * 1000) / HZ); ++ local_irq_restore(flags); ++ ++ /* ++ * Expect a few ticks at least, to be sure some possible ++ * glue logic does not lock up after one or two first ++ * ticks in a non-ExtINT mode. Also the local APIC ++ * might have cached one ExtINT interrupt. Finally, at ++ * least one tick may be lost due to delays. ++ */ ++ ++ /* jiffies wrap? */ ++ if (time_after(jiffies, t1 + 4) && ++ time_before(jiffies, t1 + 16)) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * In the SMP+IOAPIC case it might happen that there are an unspecified ++ * number of pending IRQ events unhandled. These cases are very rare, ++ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much ++ * better to do it this way as thus we do not have to be aware of ++ * 'pending' interrupts in the IRQ path, except at this point. ++ */ ++/* ++ * Edge triggered needs to resend any interrupt ++ * that was delayed but this is now handled in the device ++ * independent code. ++ */ ++ ++/* ++ * Starting up a edge-triggered IO-APIC interrupt is ++ * nasty - we need to make sure that we get the edge. ++ * If it is already asserted for some reason, we need ++ * return 1 to indicate that is was pending. ++ * ++ * This is not complete - we should be able to fake ++ * an edge even if it isn't on the 8259A... 
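++ * (For now, startup_ioapic_irq() below only checks the 8259A for a
++ * pending interrupt, and only for the legacy IRQs.)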
++ */ ++ ++static unsigned int startup_ioapic_irq(unsigned int irq) ++{ ++ int was_pending = 0; ++ unsigned long flags; ++ struct irq_cfg *cfg; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ if (irq < NR_IRQS_LEGACY) { ++ disable_8259A_irq(irq); ++ if (i8259A_irq_pending(irq)) ++ was_pending = 1; ++ } ++ cfg = irq_cfg(irq); ++ __unmask_IO_APIC_irq(cfg); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return was_pending; ++} ++ ++#ifdef CONFIG_X86_64 ++static int ioapic_retrigger_irq(unsigned int irq) ++{ ++ ++ struct irq_cfg *cfg = irq_cfg(irq); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); ++ spin_unlock_irqrestore(&vector_lock, flags); ++ ++ return 1; ++} ++#else ++static int ioapic_retrigger_irq(unsigned int irq) ++{ ++ apic->send_IPI_self(irq_cfg(irq)->vector); ++ ++ return 1; ++} ++#endif ++ ++/* ++ * Level and edge triggered IO-APIC interrupts need different handling, ++ * so we use two separate IRQ descriptors. Edge triggered IRQs can be ++ * handled with the level-triggered descriptor, but that one has slightly ++ * more overhead. Level-triggered interrupts cannot be handled with the ++ * edge-triggered handler, without risking IRQ storms and other ugly ++ * races. ++ */ ++ ++#ifdef CONFIG_SMP ++ ++#ifdef CONFIG_INTR_REMAP ++ ++/* ++ * Migrate the IO-APIC irq in the presence of intr-remapping. ++ * ++ * For both level and edge triggered, irq migration is a simple atomic ++ * update(of vector and cpu destination) of IRTE and flush the hardware cache. ++ * ++ * For level triggered, we eliminate the io-apic RTE modification (with the ++ * updated vector information), by using a virtual vector (io-apic pin number). ++ * Real vector that is used for interrupting cpu will be coming from ++ * the interrupt-remapping table entry. ++ */ ++static void ++migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) ++{ ++ struct irq_cfg *cfg; ++ struct irte irte; ++ unsigned int dest; ++ unsigned int irq; ++ ++ if (!cpumask_intersects(mask, cpu_online_mask)) ++ return; ++ ++ irq = desc->irq; ++ if (get_irte(irq, &irte)) ++ return; ++ ++ cfg = desc->chip_data; ++ if (assign_irq_vector(irq, cfg, mask)) ++ return; ++ ++ set_extra_move_desc(desc, mask); ++ ++ dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); ++ ++ irte.vector = cfg->vector; ++ irte.dest_id = IRTE_DEST(dest); ++ ++ /* ++ * Modified the IRTE and flushes the Interrupt entry cache. ++ */ ++ modify_irte(irq, &irte); ++ ++ if (cfg->move_in_progress) ++ send_cleanup_vector(cfg); ++ ++ cpumask_copy(desc->affinity, mask); ++} ++ ++/* ++ * Migrates the IRQ destination in the process context. 
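++ * (These are thin wrappers around migrate_ioapic_irq_desc() above and are
++ * used as the ->set_affinity hooks of the remapped IO-APIC irq_chip.)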
++ */ ++static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, ++ const struct cpumask *mask) ++{ ++ migrate_ioapic_irq_desc(desc, mask); ++} ++static void set_ir_ioapic_affinity_irq(unsigned int irq, ++ const struct cpumask *mask) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ set_ir_ioapic_affinity_irq_desc(desc, mask); ++} ++#else ++static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, ++ const struct cpumask *mask) ++{ ++} ++#endif ++ ++asmlinkage void smp_irq_move_cleanup_interrupt(void) ++{ ++ unsigned vector, me; ++ ++ ack_APIC_irq(); ++ exit_idle(); ++ irq_enter(); ++ ++ me = smp_processor_id(); ++ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { ++ unsigned int irq; ++ unsigned int irr; ++ struct irq_desc *desc; ++ struct irq_cfg *cfg; ++ irq = __get_cpu_var(vector_irq)[vector]; ++ ++ if (irq == -1) ++ continue; ++ ++ desc = irq_to_desc(irq); ++ if (!desc) ++ continue; ++ ++ cfg = irq_cfg(irq); ++ spin_lock(&desc->lock); ++ if (!cfg->move_cleanup_count) ++ goto unlock; ++ ++ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) ++ goto unlock; ++ ++ irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); ++ /* ++ * Check if the vector that needs to be cleanedup is ++ * registered at the cpu's IRR. If so, then this is not ++ * the best time to clean it up. Lets clean it up in the ++ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR ++ * to myself. ++ */ ++ if (irr & (1 << (vector % 32))) { ++ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); ++ goto unlock; ++ } ++ __get_cpu_var(vector_irq)[vector] = -1; ++ cfg->move_cleanup_count--; ++unlock: ++ spin_unlock(&desc->lock); ++ } ++ ++ irq_exit(); ++} ++ ++static void irq_complete_move(struct irq_desc **descp) ++{ ++ struct irq_desc *desc = *descp; ++ struct irq_cfg *cfg = desc->chip_data; ++ unsigned vector, me; ++ ++ if (likely(!cfg->move_in_progress)) { ++#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC ++ if (likely(!cfg->move_desc_pending)) ++ return; ++ ++ /* domain has not changed, but affinity did */ ++ me = smp_processor_id(); ++ if (cpumask_test_cpu(me, desc->affinity)) { ++ *descp = desc = move_irq_desc(desc, me); ++ /* get the new one */ ++ cfg = desc->chip_data; ++ cfg->move_desc_pending = 0; ++ } ++#endif ++ return; ++ } ++ ++ vector = ~get_irq_regs()->orig_ax; ++ me = smp_processor_id(); ++ ++ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { ++#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC ++ *descp = desc = move_irq_desc(desc, me); ++ /* get the new one */ ++ cfg = desc->chip_data; ++#endif ++ send_cleanup_vector(cfg); ++ } ++} ++#else ++static inline void irq_complete_move(struct irq_desc **descp) {} ++#endif ++ ++#ifdef CONFIG_INTR_REMAP ++static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) ++{ ++ int apic, pin; ++ struct irq_pin_list *entry; ++ ++ entry = cfg->irq_2_pin; ++ for (;;) { ++ ++ if (!entry) ++ break; ++ ++ apic = entry->apic; ++ pin = entry->pin; ++ io_apic_eoi(apic, pin); ++ entry = entry->next; ++ } ++} ++ ++static void ++eoi_ioapic_irq(struct irq_desc *desc) ++{ ++ struct irq_cfg *cfg; ++ unsigned long flags; ++ unsigned int irq; ++ ++ irq = desc->irq; ++ cfg = desc->chip_data; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __eoi_ioapic_irq(irq, cfg); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void ack_x2apic_level(unsigned int irq) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ ack_x2APIC_irq(); ++ eoi_ioapic_irq(desc); ++} ++ ++static void ack_x2apic_edge(unsigned int irq) ++{ ++ 
ack_x2APIC_irq(); ++} ++ ++#endif ++ ++static void ack_apic_edge(unsigned int irq) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ irq_complete_move(&desc); ++ move_native_irq(irq); ++ ack_APIC_irq(); ++} ++ ++atomic_t irq_mis_count; ++ ++static void ack_apic_level(unsigned int irq) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++#ifdef CONFIG_X86_32 ++ unsigned long v; ++ int i; ++#endif ++ struct irq_cfg *cfg; ++ int do_unmask_irq = 0; ++ ++ irq_complete_move(&desc); ++#ifdef CONFIG_GENERIC_PENDING_IRQ ++ /* If we are moving the irq we need to mask it */ ++ if (unlikely(desc->status & IRQ_MOVE_PENDING) && ++ !(desc->status & IRQ_INPROGRESS)) { ++ do_unmask_irq = 1; ++ mask_IO_APIC_irq_desc(desc); ++ } ++#endif ++ ++#ifdef CONFIG_X86_32 ++ /* ++ * It appears there is an erratum which affects at least version 0x11 ++ * of I/O APIC (that's the 82093AA and cores integrated into various ++ * chipsets). Under certain conditions a level-triggered interrupt is ++ * erroneously delivered as edge-triggered one but the respective IRR ++ * bit gets set nevertheless. As a result the I/O unit expects an EOI ++ * message but it will never arrive and further interrupts are blocked ++ * from the source. The exact reason is so far unknown, but the ++ * phenomenon was observed when two consecutive interrupt requests ++ * from a given source get delivered to the same CPU and the source is ++ * temporarily disabled in between. ++ * ++ * A workaround is to simulate an EOI message manually. We achieve it ++ * by setting the trigger mode to edge and then to level when the edge ++ * trigger mode gets detected in the TMR of a local APIC for a ++ * level-triggered interrupt. We mask the source for the time of the ++ * operation to prevent an edge-triggered interrupt escaping meanwhile. ++ * The idea is from Manfred Spraul. --macro ++ */ ++ cfg = desc->chip_data; ++ i = cfg->vector; ++ ++ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); ++#endif ++ ++ /* ++ * We must acknowledge the irq before we move it or the acknowledge will ++ * not propagate properly. ++ */ ++ ack_APIC_irq(); ++ ++ /* Now we can move and renable the irq */ ++ if (unlikely(do_unmask_irq)) { ++ /* Only migrate the irq if the ack has been received. ++ * ++ * On rare occasions the broadcast level triggered ack gets ++ * delayed going to ioapics, and if we reprogram the ++ * vector while Remote IRR is still set the irq will never ++ * fire again. ++ * ++ * To prevent this scenario we read the Remote IRR bit ++ * of the ioapic. This has two effects. ++ * - On any sane system the read of the ioapic will ++ * flush writes (and acks) going to the ioapic from ++ * this cpu. ++ * - We get to see if the ACK has actually been delivered. ++ * ++ * Based on failed experiments of reprogramming the ++ * ioapic entry from outside of irq context starting ++ * with masking the ioapic entry and then polling until ++ * Remote IRR was clear before reprogramming the ++ * ioapic I don't trust the Remote IRR bit to be ++ * completey accurate. ++ * ++ * However there appears to be no other way to plug ++ * this race, so if the Remote IRR bit is not ++ * accurate and is causing problems then it is a hardware bug ++ * and you can go talk to the chipset vendor about it. 
++ */ ++ cfg = desc->chip_data; ++ if (!io_apic_level_ack_pending(cfg)) ++ move_masked_irq(irq); ++ unmask_IO_APIC_irq_desc(desc); ++ } ++#if (defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)) && \ ++ defined(CONFIG_PREEMPT_HARDIRQS) ++ /* ++ * With threaded interrupts, we always have IRQ_INPROGRESS ++ * when acking. ++ */ ++ else if (unlikely(desc->status & IRQ_MOVE_PENDING)) ++ move_masked_irq(irq); ++#endif ++ ++#ifdef CONFIG_X86_32 ++ if (!(v & (1 << (i & 0x1f)))) { ++ atomic_inc(&irq_mis_count); ++ spin_lock(&ioapic_lock); ++ __mask_and_edge_IO_APIC_irq(cfg); ++ __unmask_and_level_IO_APIC_irq(cfg); ++ spin_unlock(&ioapic_lock); ++ } ++#endif ++} ++ ++static struct irq_chip ioapic_chip __read_mostly = { ++ .name = "IO-APIC", ++ .startup = startup_ioapic_irq, ++ .mask = mask_IO_APIC_irq, ++ .unmask = unmask_IO_APIC_irq, ++ .ack = ack_apic_edge, ++ .eoi = ack_apic_level, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ioapic_affinity_irq, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++static struct irq_chip ir_ioapic_chip __read_mostly = { ++ .name = "IR-IO-APIC", ++ .startup = startup_ioapic_irq, ++ .mask = mask_IO_APIC_irq, ++ .unmask = unmask_IO_APIC_irq, ++#ifdef CONFIG_INTR_REMAP ++ .ack = ack_x2apic_edge, ++ .eoi = ack_x2apic_level, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ir_ioapic_affinity_irq, ++#endif ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++static inline void init_IO_APIC_traps(void) ++{ ++ int irq; ++ struct irq_desc *desc; ++ struct irq_cfg *cfg; ++ ++ /* ++ * NOTE! The local APIC isn't very good at handling ++ * multiple interrupts at the same interrupt level. ++ * As the interrupt level is determined by taking the ++ * vector number and shifting that right by 4, we ++ * want to spread these out a bit so that they don't ++ * all fall in the same interrupt level. ++ * ++ * Also, we've got to be careful not to trash gate ++ * 0x80, because int 0x80 is hm, kind of importantish. ;) ++ */ ++ for_each_irq_desc(irq, desc) { ++ cfg = desc->chip_data; ++ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { ++ /* ++ * Hmm.. We don't have an entry for this, ++ * so default to an old-fashioned 8259 ++ * interrupt if we can.. ++ */ ++ if (irq < NR_IRQS_LEGACY) ++ make_8259A_irq(irq); ++ else ++ /* Strange. Oh, well.. */ ++ desc->chip = &no_irq_chip; ++ } ++ } ++} ++ ++/* ++ * The local APIC irq-chip implementation: ++ */ ++ ++static void mask_lapic_irq(unsigned int irq) ++{ ++ unsigned long v; ++ ++ v = apic_read(APIC_LVT0); ++ apic_write(APIC_LVT0, v | APIC_LVT_MASKED); ++} ++ ++static void unmask_lapic_irq(unsigned int irq) ++{ ++ unsigned long v; ++ ++ v = apic_read(APIC_LVT0); ++ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); ++} ++ ++static void ack_lapic_irq(unsigned int irq) ++{ ++ ack_APIC_irq(); ++} ++ ++static struct irq_chip lapic_chip __read_mostly = { ++ .name = "local-APIC", ++ .mask = mask_lapic_irq, ++ .unmask = unmask_lapic_irq, ++ .ack = ack_lapic_irq, ++}; ++ ++static void lapic_register_intr(int irq, struct irq_desc *desc) ++{ ++ desc->status &= ~IRQ_LEVEL; ++ set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, ++ "edge"); ++} ++ ++static void __init setup_nmi(void) ++{ ++ /* ++ * Dirty trick to enable the NMI watchdog ... ++ * We put the 8259A master into AEOI mode and ++ * unmask on all local APICs LVT0 as NMI. ++ * ++ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') ++ * is from Maciej W. Rozycki - so we do not have to EOI from ++ * the NMI handler or the timer interrupt. 
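++ * (enable_NMI_through_LVT0() below does the actual LVT0 reprogramming.)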
++ */ ++ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); ++ ++ enable_NMI_through_LVT0(); ++ ++ apic_printk(APIC_VERBOSE, " done.\n"); ++} ++ ++/* ++ * This looks a bit hackish but it's about the only one way of sending ++ * a few INTA cycles to 8259As and any associated glue logic. ICR does ++ * not support the ExtINT mode, unfortunately. We need to send these ++ * cycles as some i82489DX-based boards have glue logic that keeps the ++ * 8259A interrupt line asserted until INTA. --macro ++ */ ++static inline void __init unlock_ExtINT_logic(void) ++{ ++ int apic, pin, i; ++ struct IO_APIC_route_entry entry0, entry1; ++ unsigned char save_control, save_freq_select; ++ ++ pin = find_isa_irq_pin(8, mp_INT); ++ if (pin == -1) { ++ WARN_ON_ONCE(1); ++ return; ++ } ++ apic = find_isa_irq_apic(8, mp_INT); ++ if (apic == -1) { ++ WARN_ON_ONCE(1); ++ return; ++ } ++ ++ entry0 = ioapic_read_entry(apic, pin); ++ clear_IO_APIC_pin(apic, pin); ++ ++ memset(&entry1, 0, sizeof(entry1)); ++ ++ entry1.dest_mode = 0; /* physical delivery */ ++ entry1.mask = 0; /* unmask IRQ now */ ++ entry1.dest = hard_smp_processor_id(); ++ entry1.delivery_mode = dest_ExtINT; ++ entry1.polarity = entry0.polarity; ++ entry1.trigger = 0; ++ entry1.vector = 0; ++ ++ ioapic_write_entry(apic, pin, entry1); ++ ++ save_control = CMOS_READ(RTC_CONTROL); ++ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); ++ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, ++ RTC_FREQ_SELECT); ++ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); ++ ++ i = 100; ++ while (i-- > 0) { ++ mdelay(10); ++ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) ++ i -= 10; ++ } ++ ++ CMOS_WRITE(save_control, RTC_CONTROL); ++ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); ++ clear_IO_APIC_pin(apic, pin); ++ ++ ioapic_write_entry(apic, pin, entry0); ++} ++ ++static int disable_timer_pin_1 __initdata; ++/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ ++static int __init disable_timer_pin_setup(char *arg) ++{ ++ disable_timer_pin_1 = 1; ++ return 0; ++} ++early_param("disable_timer_pin_1", disable_timer_pin_setup); ++ ++int timer_through_8259 __initdata; ++ ++/* ++ * This code may look a bit paranoid, but it's supposed to cooperate with ++ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ ++ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast ++ * fanatically on his truly buggy board. ++ * ++ * FIXME: really need to revamp this for all platforms. ++ */ ++static inline void __init check_timer(void) ++{ ++ struct irq_desc *desc = irq_to_desc(0); ++ struct irq_cfg *cfg = desc->chip_data; ++ int cpu = boot_cpu_id; ++ int apic1, pin1, apic2, pin2; ++ unsigned long flags; ++ int no_pin1 = 0; ++ ++ local_irq_save(flags); ++ ++ /* ++ * get/set the timer IRQ vector: ++ */ ++ disable_8259A_irq(0); ++ assign_irq_vector(0, cfg, apic->target_cpus()); ++ ++ /* ++ * As IRQ0 is to be enabled in the 8259A, the virtual ++ * wire has to be disabled in the local APIC. Also ++ * timer interrupts need to be acknowledged manually in ++ * the 8259A for the i82489DX when using the NMI ++ * watchdog as that APIC treats NMIs as level-triggered. ++ * The AEOI mode will finish them in the 8259A ++ * automatically. 
++ */ ++ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); ++ init_8259A(1); ++#ifdef CONFIG_X86_32 ++ { ++ unsigned int ver; ++ ++ ver = apic_read(APIC_LVR); ++ ver = GET_APIC_VERSION(ver); ++ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); ++ } ++#endif ++ ++ pin1 = find_isa_irq_pin(0, mp_INT); ++ apic1 = find_isa_irq_apic(0, mp_INT); ++ pin2 = ioapic_i8259.pin; ++ apic2 = ioapic_i8259.apic; ++ ++ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " ++ "apic1=%d pin1=%d apic2=%d pin2=%d\n", ++ cfg->vector, apic1, pin1, apic2, pin2); ++ ++ /* ++ * Some BIOS writers are clueless and report the ExtINTA ++ * I/O APIC input from the cascaded 8259A as the timer ++ * interrupt input. So just in case, if only one pin ++ * was found above, try it both directly and through the ++ * 8259A. ++ */ ++ if (pin1 == -1) { ++ if (intr_remapping_enabled) ++ panic("BIOS bug: timer not connected to IO-APIC"); ++ pin1 = pin2; ++ apic1 = apic2; ++ no_pin1 = 1; ++ } else if (pin2 == -1) { ++ pin2 = pin1; ++ apic2 = apic1; ++ } ++ ++ if (pin1 != -1) { ++ /* ++ * Ok, does IRQ0 through the IOAPIC work? ++ */ ++ if (no_pin1) { ++ add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); ++ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); ++ } else { ++ /* for edge trigger, setup_IO_APIC_irq already ++ * leave it unmasked. ++ * so only need to unmask if it is level-trigger ++ * do we really have level trigger timer? ++ */ ++ int idx; ++ idx = find_irq_entry(apic1, pin1, mp_INT); ++ if (idx != -1 && irq_trigger(idx)) ++ unmask_IO_APIC_irq_desc(desc); ++ } ++ if (timer_irq_works()) { ++ if (nmi_watchdog == NMI_IO_APIC) { ++ setup_nmi(); ++ enable_8259A_irq(0); ++ } ++ if (disable_timer_pin_1 > 0) ++ clear_IO_APIC_pin(0, pin1); ++ goto out; ++ } ++ if (intr_remapping_enabled) ++ panic("timer doesn't work through Interrupt-remapped IO-APIC"); ++ local_irq_disable(); ++ clear_IO_APIC_pin(apic1, pin1); ++ if (!no_pin1) ++ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " ++ "8254 timer not connected to IO-APIC\n"); ++ ++ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " ++ "(IRQ0) through the 8259A ...\n"); ++ apic_printk(APIC_QUIET, KERN_INFO ++ "..... (found apic %d pin %d) ...\n", apic2, pin2); ++ /* ++ * legacy devices should be connected to IO APIC #0 ++ */ ++ replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); ++ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); ++ enable_8259A_irq(0); ++ if (timer_irq_works()) { ++ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); ++ timer_through_8259 = 1; ++ if (nmi_watchdog == NMI_IO_APIC) { ++ disable_8259A_irq(0); ++ setup_nmi(); ++ enable_8259A_irq(0); ++ } ++ goto out; ++ } ++ /* ++ * Cleanup, just in case ... ++ */ ++ local_irq_disable(); ++ disable_8259A_irq(0); ++ clear_IO_APIC_pin(apic2, pin2); ++ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); ++ } ++ ++ if (nmi_watchdog == NMI_IO_APIC) { ++ apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " ++ "through the IO-APIC - disabling NMI Watchdog!\n"); ++ nmi_watchdog = NMI_NONE; ++ } ++#ifdef CONFIG_X86_32 ++ timer_ack = 0; ++#endif ++ ++ apic_printk(APIC_QUIET, KERN_INFO ++ "...trying to set up timer as Virtual Wire IRQ...\n"); ++ ++ lapic_register_intr(0, desc); ++ apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ ++ enable_8259A_irq(0); ++ ++ if (timer_irq_works()) { ++ apic_printk(APIC_QUIET, KERN_INFO "..... 
works.\n"); ++ goto out; ++ } ++ local_irq_disable(); ++ disable_8259A_irq(0); ++ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); ++ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); ++ ++ apic_printk(APIC_QUIET, KERN_INFO ++ "...trying to set up timer as ExtINT IRQ...\n"); ++ ++ init_8259A(0); ++ make_8259A_irq(0); ++ apic_write(APIC_LVT0, APIC_DM_EXTINT); ++ ++ unlock_ExtINT_logic(); ++ ++ if (timer_irq_works()) { ++ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); ++ goto out; ++ } ++ local_irq_disable(); ++ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); ++ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " ++ "report. Then try booting with the 'noapic' option.\n"); ++out: ++ local_irq_restore(flags); ++} ++ ++/* ++ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available ++ * to devices. However there may be an I/O APIC pin available for ++ * this interrupt regardless. The pin may be left unconnected, but ++ * typically it will be reused as an ExtINT cascade interrupt for ++ * the master 8259A. In the MPS case such a pin will normally be ++ * reported as an ExtINT interrupt in the MP table. With ACPI ++ * there is no provision for ExtINT interrupts, and in the absence ++ * of an override it would be treated as an ordinary ISA I/O APIC ++ * interrupt, that is edge-triggered and unmasked by default. We ++ * used to do this, but it caused problems on some systems because ++ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using ++ * the same ExtINT cascade interrupt to drive the local APIC of the ++ * bootstrap processor. Therefore we refrain from routing IRQ2 to ++ * the I/O APIC in all cases now. No actual device should request ++ * it anyway. --macro ++ */ ++#define PIC_IRQS (1 << PIC_CASCADE_IR) ++ ++void __init setup_IO_APIC(void) ++{ ++ ++ /* ++ * calling enable_IO_APIC() is moved to setup_local_APIC for BP ++ */ ++ ++ io_apic_irqs = ~PIC_IRQS; ++ ++ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); ++ /* ++ * Set up IO-APIC IRQ routing. ++ */ ++#ifdef CONFIG_X86_32 ++ if (!acpi_ioapic) ++ setup_ioapic_ids_from_mpc(); ++#endif ++ sync_Arb_IDs(); ++ setup_IO_APIC_irqs(); ++ init_IO_APIC_traps(); ++ check_timer(); ++} ++ ++/* ++ * Called after all the initialization is done. 
If we didnt find any ++ * APIC bugs then we can allow the modify fast path ++ */ ++ ++static int __init io_apic_bug_finalize(void) ++{ ++ if (sis_apic_bug == -1) ++ sis_apic_bug = 0; ++ return 0; ++} ++ ++late_initcall(io_apic_bug_finalize); ++ ++struct sysfs_ioapic_data { ++ struct sys_device dev; ++ struct IO_APIC_route_entry entry[0]; ++}; ++static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; ++ ++static int ioapic_suspend(struct sys_device *dev, pm_message_t state) ++{ ++ struct IO_APIC_route_entry *entry; ++ struct sysfs_ioapic_data *data; ++ int i; ++ ++ data = container_of(dev, struct sysfs_ioapic_data, dev); ++ entry = data->entry; ++ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) ++ *entry = ioapic_read_entry(dev->id, i); ++ ++ return 0; ++} ++ ++static int ioapic_resume(struct sys_device *dev) ++{ ++ struct IO_APIC_route_entry *entry; ++ struct sysfs_ioapic_data *data; ++ unsigned long flags; ++ union IO_APIC_reg_00 reg_00; ++ int i; ++ ++ data = container_of(dev, struct sysfs_ioapic_data, dev); ++ entry = data->entry; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(dev->id, 0); ++ if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { ++ reg_00.bits.ID = mp_ioapics[dev->id].apicid; ++ io_apic_write(dev->id, 0, reg_00.raw); ++ } ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ++ ioapic_write_entry(dev->id, i, entry[i]); ++ ++ return 0; ++} ++ ++static struct sysdev_class ioapic_sysdev_class = { ++ .name = "ioapic", ++ .suspend = ioapic_suspend, ++ .resume = ioapic_resume, ++}; ++ ++static int __init ioapic_init_sysfs(void) ++{ ++ struct sys_device * dev; ++ int i, size, error; ++ ++ error = sysdev_class_register(&ioapic_sysdev_class); ++ if (error) ++ return error; ++ ++ for (i = 0; i < nr_ioapics; i++ ) { ++ size = sizeof(struct sys_device) + nr_ioapic_registers[i] ++ * sizeof(struct IO_APIC_route_entry); ++ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); ++ if (!mp_ioapic_data[i]) { ++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); ++ continue; ++ } ++ dev = &mp_ioapic_data[i]->dev; ++ dev->id = i; ++ dev->cls = &ioapic_sysdev_class; ++ error = sysdev_register(dev); ++ if (error) { ++ kfree(mp_ioapic_data[i]); ++ mp_ioapic_data[i] = NULL; ++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); ++ continue; ++ } ++ } ++ ++ return 0; ++} ++ ++device_initcall(ioapic_init_sysfs); ++ ++static int nr_irqs_gsi = NR_IRQS_LEGACY; ++/* ++ * Dynamic irq allocate and deallocation ++ */ ++unsigned int create_irq_nr(unsigned int irq_want) ++{ ++ /* Allocate an unused irq */ ++ unsigned int irq; ++ unsigned int new; ++ unsigned long flags; ++ struct irq_cfg *cfg_new = NULL; ++ int cpu = boot_cpu_id; ++ struct irq_desc *desc_new = NULL; ++ ++ irq = 0; ++ if (irq_want < nr_irqs_gsi) ++ irq_want = nr_irqs_gsi; ++ ++ for (new = irq_want; new < nr_irqs; new++) { ++ desc_new = irq_to_desc_alloc_cpu(new, cpu); ++ if (!desc_new) { ++ printk(KERN_INFO "can not get irq_desc for %d\n", new); ++ continue; ++ } ++ cfg_new = desc_new->chip_data; ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ if (cfg_new->vector != 0) { ++ spin_unlock_irqrestore(&vector_lock, flags); ++ continue; ++ } ++ if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) ++ irq = new; ++ spin_unlock_irqrestore(&vector_lock, flags); ++ break; ++ } ++ ++ if (irq > 0) { ++ dynamic_irq_init(irq); ++ /* restore it, in case dynamic_irq_init clear it */ ++ if (desc_new) ++ desc_new->chip_data = cfg_new; ++ } ++ 
return irq; ++} ++ ++int create_irq(void) ++{ ++ unsigned int irq_want; ++ int irq; ++ ++ irq_want = nr_irqs_gsi; ++ irq = create_irq_nr(irq_want); ++ ++ if (irq == 0) ++ irq = -1; ++ ++ return irq; ++} ++ ++void destroy_irq(unsigned int irq) ++{ ++ unsigned long flags; ++ struct irq_cfg *cfg; ++ struct irq_desc *desc; ++ ++ /* store it, in case dynamic_irq_cleanup clear it */ ++ desc = irq_to_desc(irq); ++ cfg = desc->chip_data; ++ dynamic_irq_cleanup(irq); ++ /* connect back irq_cfg */ ++ if (desc) ++ desc->chip_data = cfg; ++ ++ free_irte(irq); ++ spin_lock_irqsave(&vector_lock, flags); ++ __clear_irq_vector(irq, cfg); ++ spin_unlock_irqrestore(&vector_lock, flags); ++} ++ ++/* ++ * MSI message composition ++ */ ++#ifdef CONFIG_PCI_MSI ++static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) ++{ ++ struct irq_cfg *cfg; ++ int err; ++ unsigned dest; ++ ++ if (disable_apic) ++ return -ENXIO; ++ ++ cfg = irq_cfg(irq); ++ err = assign_irq_vector(irq, cfg, apic->target_cpus()); ++ if (err) ++ return err; ++ ++ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); ++ ++ if (irq_remapped(irq)) { ++ struct irte irte; ++ int ir_index; ++ u16 sub_handle; ++ ++ ir_index = map_irq_to_irte_handle(irq, &sub_handle); ++ BUG_ON(ir_index == -1); ++ ++ memset (&irte, 0, sizeof(irte)); ++ ++ irte.present = 1; ++ irte.dst_mode = apic->irq_dest_mode; ++ irte.trigger_mode = 0; /* edge */ ++ irte.dlvry_mode = apic->irq_delivery_mode; ++ irte.vector = cfg->vector; ++ irte.dest_id = IRTE_DEST(dest); ++ ++ modify_irte(irq, &irte); ++ ++ msg->address_hi = MSI_ADDR_BASE_HI; ++ msg->data = sub_handle; ++ msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | ++ MSI_ADDR_IR_SHV | ++ MSI_ADDR_IR_INDEX1(ir_index) | ++ MSI_ADDR_IR_INDEX2(ir_index); ++ } else { ++ if (x2apic_enabled()) ++ msg->address_hi = MSI_ADDR_BASE_HI | ++ MSI_ADDR_EXT_DEST_ID(dest); ++ else ++ msg->address_hi = MSI_ADDR_BASE_HI; ++ ++ msg->address_lo = ++ MSI_ADDR_BASE_LO | ++ ((apic->irq_dest_mode == 0) ? ++ MSI_ADDR_DEST_MODE_PHYSICAL: ++ MSI_ADDR_DEST_MODE_LOGICAL) | ++ ((apic->irq_delivery_mode != dest_LowestPrio) ? ++ MSI_ADDR_REDIRECTION_CPU: ++ MSI_ADDR_REDIRECTION_LOWPRI) | ++ MSI_ADDR_DEST_ID(dest); ++ ++ msg->data = ++ MSI_DATA_TRIGGER_EDGE | ++ MSI_DATA_LEVEL_ASSERT | ++ ((apic->irq_delivery_mode != dest_LowestPrio) ? ++ MSI_DATA_DELIVERY_FIXED: ++ MSI_DATA_DELIVERY_LOWPRI) | ++ MSI_DATA_VECTOR(cfg->vector); ++ } ++ return err; ++} ++ ++#ifdef CONFIG_SMP ++static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ struct irq_cfg *cfg; ++ struct msi_msg msg; ++ unsigned int dest; ++ ++ dest = set_desc_affinity(desc, mask); ++ if (dest == BAD_APICID) ++ return; ++ ++ cfg = desc->chip_data; ++ ++ read_msi_msg_desc(desc, &msg); ++ ++ msg.data &= ~MSI_DATA_VECTOR_MASK; ++ msg.data |= MSI_DATA_VECTOR(cfg->vector); ++ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++ msg.address_lo |= MSI_ADDR_DEST_ID(dest); ++ ++ write_msi_msg_desc(desc, &msg); ++} ++#ifdef CONFIG_INTR_REMAP ++/* ++ * Migrate the MSI irq to another cpumask. This migration is ++ * done in the process context using interrupt-remapping hardware. 
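++ * (Unlike set_msi_irq_affinity() above, only the IRTE is rewritten; the
++ * MSI address/data registers of the device are left untouched.)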
++ */ ++static void ++ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ struct irq_cfg *cfg = desc->chip_data; ++ unsigned int dest; ++ struct irte irte; ++ ++ if (get_irte(irq, &irte)) ++ return; ++ ++ dest = set_desc_affinity(desc, mask); ++ if (dest == BAD_APICID) ++ return; ++ ++ irte.vector = cfg->vector; ++ irte.dest_id = IRTE_DEST(dest); ++ ++ /* ++ * atomically update the IRTE with the new destination and vector. ++ */ ++ modify_irte(irq, &irte); ++ ++ /* ++ * After this point, all the interrupts will start arriving ++ * at the new destination. So, time to cleanup the previous ++ * vector allocation. ++ */ ++ if (cfg->move_in_progress) ++ send_cleanup_vector(cfg); ++} ++ ++#endif ++#endif /* CONFIG_SMP */ ++ ++/* ++ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, ++ * which implement the MSI or MSI-X Capability Structure. ++ */ ++static struct irq_chip msi_chip = { ++ .name = "PCI-MSI", ++ .unmask = unmask_msi_irq, ++ .mask = mask_msi_irq, ++ .ack = ack_apic_edge, ++#ifdef CONFIG_SMP ++ .set_affinity = set_msi_irq_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++static struct irq_chip msi_ir_chip = { ++ .name = "IR-PCI-MSI", ++ .unmask = unmask_msi_irq, ++ .mask = mask_msi_irq, ++#ifdef CONFIG_INTR_REMAP ++ .ack = ack_x2apic_edge, ++#ifdef CONFIG_SMP ++ .set_affinity = ir_set_msi_irq_affinity, ++#endif ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++/* ++ * Map the PCI dev to the corresponding remapping hardware unit ++ * and allocate 'nvec' consecutive interrupt-remapping table entries ++ * in it. ++ */ ++static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) ++{ ++ struct intel_iommu *iommu; ++ int index; ++ ++ iommu = map_dev_to_ir(dev); ++ if (!iommu) { ++ printk(KERN_ERR ++ "Unable to map PCI %s to iommu\n", pci_name(dev)); ++ return -ENOENT; ++ } ++ ++ index = alloc_irte(iommu, irq, nvec); ++ if (index < 0) { ++ printk(KERN_ERR ++ "Unable to allocate %d IRTE for PCI %s\n", nvec, ++ pci_name(dev)); ++ return -ENOSPC; ++ } ++ return index; ++} ++ ++static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) ++{ ++ int ret; ++ struct msi_msg msg; ++ ++ ret = msi_compose_msg(dev, irq, &msg); ++ if (ret < 0) ++ return ret; ++ ++ set_irq_msi(irq, msidesc); ++ write_msi_msg(irq, &msg); ++ ++ if (irq_remapped(irq)) { ++ struct irq_desc *desc = irq_to_desc(irq); ++ /* ++ * irq migration in process context ++ */ ++ desc->status |= IRQ_MOVE_PCNTXT; ++ set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); ++ } else ++ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); ++ ++ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); ++ ++ return 0; ++} ++ ++int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ++{ ++ unsigned int irq; ++ int ret, sub_handle; ++ struct msi_desc *msidesc; ++ unsigned int irq_want; ++ struct intel_iommu *iommu = NULL; ++ int index = 0; ++ ++ irq_want = nr_irqs_gsi; ++ sub_handle = 0; ++ list_for_each_entry(msidesc, &dev->msi_list, list) { ++ irq = create_irq_nr(irq_want); ++ if (irq == 0) ++ return -1; ++ irq_want = irq + 1; ++ if (!intr_remapping_enabled) ++ goto no_ir; ++ ++ if (!sub_handle) { ++ /* ++ * allocate the consecutive block of IRTE's ++ * for 'nvec' ++ */ ++ index = msi_alloc_irte(dev, irq, nvec); ++ if (index < 0) { ++ ret = index; ++ goto error; ++ } ++ } else { ++ iommu = map_dev_to_ir(dev); ++ if (!iommu) { ++ ret = -ENOENT; ++ goto error; ++ } ++ /* 
++ * setup the mapping between the irq and the IRTE ++ * base index, the sub_handle pointing to the ++ * appropriate interrupt remap table entry. ++ */ ++ set_irte_irq(irq, iommu, index, sub_handle); ++ } ++no_ir: ++ ret = setup_msi_irq(dev, msidesc, irq); ++ if (ret < 0) ++ goto error; ++ sub_handle++; ++ } ++ return 0; ++ ++error: ++ destroy_irq(irq); ++ return ret; ++} ++ ++void arch_teardown_msi_irq(unsigned int irq) ++{ ++ destroy_irq(irq); ++} ++ ++#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) ++#ifdef CONFIG_SMP ++static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ struct irq_cfg *cfg; ++ struct msi_msg msg; ++ unsigned int dest; ++ ++ dest = set_desc_affinity(desc, mask); ++ if (dest == BAD_APICID) ++ return; ++ ++ cfg = desc->chip_data; ++ ++ dmar_msi_read(irq, &msg); ++ ++ msg.data &= ~MSI_DATA_VECTOR_MASK; ++ msg.data |= MSI_DATA_VECTOR(cfg->vector); ++ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++ msg.address_lo |= MSI_ADDR_DEST_ID(dest); ++ ++ dmar_msi_write(irq, &msg); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++struct irq_chip dmar_msi_type = { ++ .name = "DMAR_MSI", ++ .unmask = dmar_msi_unmask, ++ .mask = dmar_msi_mask, ++ .ack = ack_apic_edge, ++#ifdef CONFIG_SMP ++ .set_affinity = dmar_msi_set_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_dmar_msi(unsigned int irq) ++{ ++ int ret; ++ struct msi_msg msg; ++ ++ ret = msi_compose_msg(NULL, irq, &msg); ++ if (ret < 0) ++ return ret; ++ dmar_msi_write(irq, &msg); ++ set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, ++ "edge"); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_HPET_TIMER ++ ++#ifdef CONFIG_SMP ++static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ struct irq_cfg *cfg; ++ struct msi_msg msg; ++ unsigned int dest; ++ ++ dest = set_desc_affinity(desc, mask); ++ if (dest == BAD_APICID) ++ return; ++ ++ cfg = desc->chip_data; ++ ++ hpet_msi_read(irq, &msg); ++ ++ msg.data &= ~MSI_DATA_VECTOR_MASK; ++ msg.data |= MSI_DATA_VECTOR(cfg->vector); ++ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++ msg.address_lo |= MSI_ADDR_DEST_ID(dest); ++ ++ hpet_msi_write(irq, &msg); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++static struct irq_chip hpet_msi_type = { ++ .name = "HPET_MSI", ++ .unmask = hpet_msi_unmask, ++ .mask = hpet_msi_mask, ++ .ack = ack_apic_edge, ++#ifdef CONFIG_SMP ++ .set_affinity = hpet_msi_set_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_hpet_msi(unsigned int irq) ++{ ++ int ret; ++ struct msi_msg msg; ++ ++ ret = msi_compose_msg(NULL, irq, &msg); ++ if (ret < 0) ++ return ret; ++ ++ hpet_msi_write(irq, &msg); ++ set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, ++ "edge"); ++ ++ return 0; ++} ++#endif ++ ++#endif /* CONFIG_PCI_MSI */ ++/* ++ * Hypertransport interrupt support ++ */ ++#ifdef CONFIG_HT_IRQ ++ ++#ifdef CONFIG_SMP ++ ++static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) ++{ ++ struct ht_irq_msg msg; ++ fetch_ht_irq_msg(irq, &msg); ++ ++ msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); ++ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); ++ ++ msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); ++ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); ++ ++ write_ht_irq_msg(irq, &msg); ++} ++ ++static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) ++{ ++ struct 
irq_desc *desc = irq_to_desc(irq); ++ struct irq_cfg *cfg; ++ unsigned int dest; ++ ++ dest = set_desc_affinity(desc, mask); ++ if (dest == BAD_APICID) ++ return; ++ ++ cfg = desc->chip_data; ++ ++ target_ht_irq(irq, dest, cfg->vector); ++} ++ ++#endif ++ ++static struct irq_chip ht_irq_chip = { ++ .name = "PCI-HT", ++ .mask = mask_ht_irq, ++ .unmask = unmask_ht_irq, ++ .ack = ack_apic_edge, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ht_irq_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) ++{ ++ struct irq_cfg *cfg; ++ int err; ++ ++ if (disable_apic) ++ return -ENXIO; ++ ++ cfg = irq_cfg(irq); ++ err = assign_irq_vector(irq, cfg, apic->target_cpus()); ++ if (!err) { ++ struct ht_irq_msg msg; ++ unsigned dest; ++ ++ dest = apic->cpu_mask_to_apicid_and(cfg->domain, ++ apic->target_cpus()); ++ ++ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); ++ ++ msg.address_lo = ++ HT_IRQ_LOW_BASE | ++ HT_IRQ_LOW_DEST_ID(dest) | ++ HT_IRQ_LOW_VECTOR(cfg->vector) | ++ ((apic->irq_dest_mode == 0) ? ++ HT_IRQ_LOW_DM_PHYSICAL : ++ HT_IRQ_LOW_DM_LOGICAL) | ++ HT_IRQ_LOW_RQEOI_EDGE | ++ ((apic->irq_delivery_mode != dest_LowestPrio) ? ++ HT_IRQ_LOW_MT_FIXED : ++ HT_IRQ_LOW_MT_ARBITRATED) | ++ HT_IRQ_LOW_IRQ_MASKED; ++ ++ write_ht_irq_msg(irq, &msg); ++ ++ set_irq_chip_and_handler_name(irq, &ht_irq_chip, ++ handle_edge_irq, "edge"); ++ ++ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); ++ } ++ return err; ++} ++#endif /* CONFIG_HT_IRQ */ ++ ++#ifdef CONFIG_X86_UV ++/* ++ * Re-target the irq to the specified CPU and enable the specified MMR located ++ * on the specified blade to allow the sending of MSIs to the specified CPU. ++ */ ++int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, ++ unsigned long mmr_offset) ++{ ++ const struct cpumask *eligible_cpu = cpumask_of(cpu); ++ struct irq_cfg *cfg; ++ int mmr_pnode; ++ unsigned long mmr_value; ++ struct uv_IO_APIC_route_entry *entry; ++ unsigned long flags; ++ int err; ++ ++ cfg = irq_cfg(irq); ++ ++ err = assign_irq_vector(irq, cfg, eligible_cpu); ++ if (err != 0) ++ return err; ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, ++ irq_name); ++ spin_unlock_irqrestore(&vector_lock, flags); ++ ++ mmr_value = 0; ++ entry = (struct uv_IO_APIC_route_entry *)&mmr_value; ++ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); ++ ++ entry->vector = cfg->vector; ++ entry->delivery_mode = apic->irq_delivery_mode; ++ entry->dest_mode = apic->irq_dest_mode; ++ entry->polarity = 0; ++ entry->trigger = 0; ++ entry->mask = 0; ++ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); ++ ++ mmr_pnode = uv_blade_to_pnode(mmr_blade); ++ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); ++ ++ return irq; ++} ++ ++/* ++ * Disable the specified MMR located on the specified blade so that MSIs are ++ * longer allowed to be sent. 
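++ * (Only the mask bit of the routing entry is set; the remaining fields of
++ * the MMR value are left cleared.)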
++ */ ++void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) ++{ ++ unsigned long mmr_value; ++ struct uv_IO_APIC_route_entry *entry; ++ int mmr_pnode; ++ ++ mmr_value = 0; ++ entry = (struct uv_IO_APIC_route_entry *)&mmr_value; ++ BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); ++ ++ entry->mask = 1; ++ ++ mmr_pnode = uv_blade_to_pnode(mmr_blade); ++ uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); ++} ++#endif /* CONFIG_X86_64 */ ++ ++int __init io_apic_get_redir_entries (int ioapic) ++{ ++ union IO_APIC_reg_01 reg_01; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(ioapic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return reg_01.bits.entries; ++} ++ ++void __init probe_nr_irqs_gsi(void) ++{ ++ int nr = 0; ++ ++ nr = acpi_probe_gsi(); ++ if (nr > nr_irqs_gsi) { ++ nr_irqs_gsi = nr; ++ } else { ++ /* for acpi=off or acpi is not compiled in */ ++ int idx; ++ ++ nr = 0; ++ for (idx = 0; idx < nr_ioapics; idx++) ++ nr += io_apic_get_redir_entries(idx) + 1; ++ ++ if (nr > nr_irqs_gsi) ++ nr_irqs_gsi = nr; ++ } ++ ++ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); ++} ++ ++#ifdef CONFIG_SPARSE_IRQ ++int __init arch_probe_nr_irqs(void) ++{ ++ int nr; ++ ++ if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) ++ nr_irqs = NR_VECTORS * nr_cpu_ids; ++ ++ nr = nr_irqs_gsi + 8 * nr_cpu_ids; ++#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) ++ /* ++ * for MSI and HT dyn irq ++ */ ++ nr += nr_irqs_gsi * 16; ++#endif ++ if (nr < nr_irqs) ++ nr_irqs = nr; ++ ++ return 0; ++} ++#endif ++ ++/* -------------------------------------------------------------------------- ++ ACPI-based IOAPIC Configuration ++ -------------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_ACPI ++ ++#ifdef CONFIG_X86_32 ++int __init io_apic_get_unique_id(int ioapic, int apic_id) ++{ ++ union IO_APIC_reg_00 reg_00; ++ static physid_mask_t apic_id_map = PHYSID_MASK_NONE; ++ physid_mask_t tmp; ++ unsigned long flags; ++ int i = 0; ++ ++ /* ++ * The P4 platform supports up to 256 APIC IDs on two separate APIC ++ * buses (one for LAPICs, one for IOAPICs), where predecessors only ++ * supports up to 16 on one shared APIC bus. ++ * ++ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full ++ * advantage of new APIC bus architecture. ++ */ ++ ++ if (physids_empty(apic_id_map)) ++ apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(ioapic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ if (apic_id >= get_physical_broadcast()) { ++ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " ++ "%d\n", ioapic, apic_id, reg_00.bits.ID); ++ apic_id = reg_00.bits.ID; ++ } ++ ++ /* ++ * Every APIC in a system must have a unique ID or we get lots of nice ++ * 'stuck on smp_invalidate_needed IPI wait' messages. 
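++	 * (If the requested ID is already in use, fall back to the first
++	 * unused physical ID, mirroring setup_ioapic_ids_from_mpc() above.)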
++ */ ++ if (apic->check_apicid_used(apic_id_map, apic_id)) { ++ ++ for (i = 0; i < get_physical_broadcast(); i++) { ++ if (!apic->check_apicid_used(apic_id_map, i)) ++ break; ++ } ++ ++ if (i == get_physical_broadcast()) ++ panic("Max apic_id exceeded!\n"); ++ ++ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " ++ "trying %d\n", ioapic, apic_id, i); ++ ++ apic_id = i; ++ } ++ ++ tmp = apic->apicid_to_cpu_present(apic_id); ++ physids_or(apic_id_map, apic_id_map, tmp); ++ ++ if (reg_00.bits.ID != apic_id) { ++ reg_00.bits.ID = apic_id; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(ioapic, 0, reg_00.raw); ++ reg_00.raw = io_apic_read(ioapic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ /* Sanity check */ ++ if (reg_00.bits.ID != apic_id) { ++ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); ++ return -1; ++ } ++ } ++ ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); ++ ++ return apic_id; ++} ++ ++int __init io_apic_get_version(int ioapic) ++{ ++ union IO_APIC_reg_01 reg_01; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(ioapic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return reg_01.bits.version; ++} ++#endif ++ ++int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) ++{ ++ struct irq_desc *desc; ++ struct irq_cfg *cfg; ++ int cpu = boot_cpu_id; ++ ++ if (!IO_APIC_IRQ(irq)) { ++ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ++ ioapic); ++ return -EINVAL; ++ } ++ ++ desc = irq_to_desc_alloc_cpu(irq, cpu); ++ if (!desc) { ++ printk(KERN_INFO "can not get irq_desc %d\n", irq); ++ return 0; ++ } ++ ++ /* ++ * IRQs < 16 are already in the irq_2_pin[] map ++ */ ++ if (irq >= NR_IRQS_LEGACY) { ++ cfg = desc->chip_data; ++ add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); ++ } ++ ++ setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); ++ ++ return 0; ++} ++ ++ ++int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) ++{ ++ int i; ++ ++ if (skip_ioapic_setup) ++ return -1; ++ ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].irqtype == mp_INT && ++ mp_irqs[i].srcbusirq == bus_irq) ++ break; ++ if (i >= mp_irq_entries) ++ return -1; ++ ++ *trigger = irq_trigger(i); ++ *polarity = irq_polarity(i); ++ return 0; ++} ++ ++#endif /* CONFIG_ACPI */ ++ ++/* ++ * This function currently is only a helper for the i386 smp boot process where ++ * we need to reprogram the ioredtbls to cater for the cpus which have come online ++ * so mask in all cases should simply be apic->target_cpus() ++ */ ++#ifdef CONFIG_SMP ++void __init setup_ioapic_dest(void) ++{ ++ int pin, ioapic, irq, irq_entry; ++ struct irq_desc *desc; ++ struct irq_cfg *cfg; ++ const struct cpumask *mask; ++ ++ if (skip_ioapic_setup == 1) ++ return; ++ ++ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { ++ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { ++ irq_entry = find_irq_entry(ioapic, pin, mp_INT); ++ if (irq_entry == -1) ++ continue; ++ irq = pin_2_irq(irq_entry, ioapic, pin); ++ ++ /* setup_IO_APIC_irqs could fail to get vector for some device ++ * when you have too many devices, because at that time only boot ++ * cpu is online. 
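++			 * (Such entries still have cfg->vector == 0 and are
++			 * set up again below, now that the other cpus are online.)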
++ */ ++ desc = irq_to_desc(irq); ++ cfg = desc->chip_data; ++ if (!cfg->vector) { ++ setup_IO_APIC_irq(ioapic, pin, irq, desc, ++ irq_trigger(irq_entry), ++ irq_polarity(irq_entry)); ++ continue; ++ ++ } ++ ++ /* ++ * Honour affinities which have been set in early boot ++ */ ++ if (desc->status & ++ (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) ++ mask = desc->affinity; ++ else ++ mask = apic->target_cpus(); ++ ++ if (intr_remapping_enabled) ++ set_ir_ioapic_affinity_irq_desc(desc, mask); ++ else ++ set_ioapic_affinity_irq_desc(desc, mask); ++ } ++ ++ } ++} ++#endif ++ ++#define IOAPIC_RESOURCE_NAME_SIZE 11 ++ ++static struct resource *ioapic_resources; ++ ++static struct resource * __init ioapic_setup_resources(void) ++{ ++ unsigned long n; ++ struct resource *res; ++ char *mem; ++ int i; ++ ++ if (nr_ioapics <= 0) ++ return NULL; ++ ++ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); ++ n *= nr_ioapics; ++ ++ mem = alloc_bootmem(n); ++ res = (void *)mem; ++ ++ if (mem != NULL) { ++ mem += sizeof(struct resource) * nr_ioapics; ++ ++ for (i = 0; i < nr_ioapics; i++) { ++ res[i].name = mem; ++ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; ++ sprintf(mem, "IOAPIC %u", i); ++ mem += IOAPIC_RESOURCE_NAME_SIZE; ++ } ++ } ++ ++ ioapic_resources = res; ++ ++ return res; ++} ++ ++void __init ioapic_init_mappings(void) ++{ ++ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; ++ struct resource *ioapic_res; ++ int i; ++ ++ ioapic_res = ioapic_setup_resources(); ++ for (i = 0; i < nr_ioapics; i++) { ++ if (smp_found_config) { ++ ioapic_phys = mp_ioapics[i].apicaddr; ++#ifdef CONFIG_X86_32 ++ if (!ioapic_phys) { ++ printk(KERN_ERR ++ "WARNING: bogus zero IO-APIC " ++ "address found in MPTABLE, " ++ "disabling IO/APIC support!\n"); ++ smp_found_config = 0; ++ skip_ioapic_setup = 1; ++ goto fake_ioapic_page; ++ } ++#endif ++ } else { ++#ifdef CONFIG_X86_32 ++fake_ioapic_page: ++#endif ++ ioapic_phys = (unsigned long) ++ alloc_bootmem_pages(PAGE_SIZE); ++ ioapic_phys = __pa(ioapic_phys); ++ } ++ set_fixmap_nocache(idx, ioapic_phys); ++ apic_printk(APIC_VERBOSE, ++ "mapped IOAPIC to %08lx (%08lx)\n", ++ __fix_to_virt(idx), ioapic_phys); ++ idx++; ++ ++ if (ioapic_res != NULL) { ++ ioapic_res->start = ioapic_phys; ++ ioapic_res->end = ioapic_phys + (4 * 1024) - 1; ++ ioapic_res++; ++ } ++ } ++} ++ ++static int __init ioapic_insert_resources(void) ++{ ++ int i; ++ struct resource *r = ioapic_resources; ++ ++ if (!r) { ++ if (nr_ioapics > 0) { ++ printk(KERN_ERR ++ "IO APIC resources couldn't be allocated.\n"); ++ return -1; ++ } ++ return 0; ++ } ++ ++ for (i = 0; i < nr_ioapics; i++) { ++ insert_resource(&iomem_resource, r); ++ r++; ++ } ++ ++ return 0; ++} ++ ++/* Insert the IO APIC resources after PCI initialization has occured to handle ++ * IO APICS that are mapped in on a BAR in PCI space. */ ++late_initcall(ioapic_insert_resources); +Index: linux-2.6-tip/arch/x86/kernel/apic/ipi.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/ipi.c +@@ -0,0 +1,164 @@ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) ++{ ++ unsigned long query_cpu; ++ unsigned long flags; ++ ++ /* ++ * Hack. 
The clustered APIC addressing mode doesn't allow us to send ++ * to an arbitrary mask, so I do a unicast to each CPU instead. ++ * - mbligh ++ */ ++ local_irq_save(flags); ++ for_each_cpu(query_cpu, mask) { ++ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, ++ query_cpu), vector, APIC_DEST_PHYSICAL); ++ } ++ local_irq_restore(flags); ++} ++ ++void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, ++ int vector) ++{ ++ unsigned int this_cpu = smp_processor_id(); ++ unsigned int query_cpu; ++ unsigned long flags; ++ ++ /* See Hack comment above */ ++ ++ local_irq_save(flags); ++ for_each_cpu(query_cpu, mask) { ++ if (query_cpu == this_cpu) ++ continue; ++ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, ++ query_cpu), vector, APIC_DEST_PHYSICAL); ++ } ++ local_irq_restore(flags); ++} ++ ++void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, ++ int vector) ++{ ++ unsigned long flags; ++ unsigned int query_cpu; ++ ++ /* ++ * Hack. The clustered APIC addressing mode doesn't allow us to send ++ * to an arbitrary mask, so I do a unicasts to each CPU instead. This ++ * should be modified to do 1 message per cluster ID - mbligh ++ */ ++ ++ local_irq_save(flags); ++ for_each_cpu(query_cpu, mask) ++ __default_send_IPI_dest_field( ++ apic->cpu_to_logical_apicid(query_cpu), vector, ++ apic->dest_logical); ++ local_irq_restore(flags); ++} ++ ++void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, ++ int vector) ++{ ++ unsigned long flags; ++ unsigned int query_cpu; ++ unsigned int this_cpu = smp_processor_id(); ++ ++ /* See Hack comment above */ ++ ++ local_irq_save(flags); ++ for_each_cpu(query_cpu, mask) { ++ if (query_cpu == this_cpu) ++ continue; ++ __default_send_IPI_dest_field( ++ apic->cpu_to_logical_apicid(query_cpu), vector, ++ apic->dest_logical); ++ } ++ local_irq_restore(flags); ++} ++ ++#ifdef CONFIG_X86_32 ++ ++/* ++ * This is only used on smaller machines. ++ */ ++void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) ++{ ++ unsigned long mask = cpumask_bits(cpumask)[0]; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); ++ __default_send_IPI_dest_field(mask, vector, apic->dest_logical); ++ local_irq_restore(flags); ++} ++ ++void default_send_IPI_allbutself(int vector) ++{ ++ /* ++ * if there are no other CPUs in the system then we get an APIC send ++ * error if we try to broadcast, thus avoid sending IPIs in this case. ++ */ ++ if (!(num_online_cpus() > 1)) ++ return; ++ ++ __default_local_send_IPI_allbutself(vector); ++} ++ ++void default_send_IPI_all(int vector) ++{ ++ __default_local_send_IPI_all(vector); ++} ++ ++void default_send_IPI_self(int vector) ++{ ++ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical); ++} ++ ++/* must come after the send_IPI functions above for inlining */ ++static int convert_apicid_to_cpu(int apic_id) ++{ ++ int i; ++ ++ for_each_possible_cpu(i) { ++ if (per_cpu(x86_cpu_to_apicid, i) == apic_id) ++ return i; ++ } ++ return -1; ++} ++ ++int safe_smp_processor_id(void) ++{ ++ int apicid, cpuid; ++ ++ if (!boot_cpu_has(X86_FEATURE_APIC)) ++ return 0; ++ ++ apicid = hard_smp_processor_id(); ++ if (apicid == BAD_APICID) ++ return 0; ++ ++ cpuid = convert_apicid_to_cpu(apicid); ++ ++ return cpuid >= 0 ? 
cpuid : 0; ++} ++#endif +Index: linux-2.6-tip/arch/x86/kernel/apic/nmi.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/nmi.c +@@ -0,0 +1,567 @@ ++/* ++ * NMI watchdog support on APIC systems ++ * ++ * Started by Ingo Molnar ++ * ++ * Fixes: ++ * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. ++ * Mikael Pettersson : Power Management for local APIC NMI watchdog. ++ * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. ++ * Pavel Machek and ++ * Mikael Pettersson : PM converted to driver model. Disable/enable API. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++int unknown_nmi_panic; ++int nmi_watchdog_enabled; ++ ++static cpumask_var_t backtrace_mask; ++ ++/* nmi_active: ++ * >0: the lapic NMI watchdog is active, but can be disabled ++ * <0: the lapic NMI watchdog has not been set up, and cannot ++ * be enabled ++ * 0: the lapic NMI watchdog is disabled, but can be enabled ++ */ ++atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ ++EXPORT_SYMBOL(nmi_active); ++ ++unsigned int nmi_watchdog = NMI_NONE; ++EXPORT_SYMBOL(nmi_watchdog); ++ ++static int panic_on_timeout; ++ ++static unsigned int nmi_hz = HZ; ++static DEFINE_PER_CPU(short, wd_enabled); ++static int endflag __initdata; ++ ++static inline unsigned int get_nmi_count(int cpu) ++{ ++ return per_cpu(irq_stat, cpu).__nmi_count; ++} ++ ++static inline int mce_in_progress(void) ++{ ++#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) ++ return atomic_read(&mce_entry) > 0; ++#endif ++ return 0; ++} ++ ++/* ++ * Take the local apic timer and PIT/HPET into account. We don't ++ * know which one is active, when we have highres/dyntick on ++ */ ++static inline unsigned int get_timer_irqs(int cpu) ++{ ++ return per_cpu(irq_stat, cpu).apic_timer_irqs + ++ per_cpu(irq_stat, cpu).irq0_irqs; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * The performance counters used by NMI_LOCAL_APIC don't trigger when ++ * the CPU is idle. To make sure the NMI watchdog really ticks on all ++ * CPUs during the test make them busy. ++ */ ++static __init void nmi_cpu_busy(void *data) ++{ ++#ifndef CONFIG_PREEMPT_RT ++ local_irq_enable_in_hardirq(); ++#endif ++ /* ++ * Intentionally don't use cpu_relax here. This is ++ * to make sure that the performance counter really ticks, ++ * even if there is a simulator or similar that catches the ++ * pause instruction. On a real HT machine this is fine because ++ * all other CPUs are busy with "useless" delay loops and don't ++ * care if they get somewhat less cycles. 
++ */ ++ while (endflag == 0) ++ mb(); ++} ++#endif ++ ++static void report_broken_nmi(int cpu, int *prev_nmi_count) ++{ ++ printk(KERN_CONT "\n"); ++ ++ printk(KERN_WARNING ++ "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", ++ cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); ++ ++ printk(KERN_WARNING ++ "Please report this to bugzilla.kernel.org,\n"); ++ printk(KERN_WARNING ++ "and attach the output of the 'dmesg' command.\n"); ++ ++ per_cpu(wd_enabled, cpu) = 0; ++ atomic_dec(&nmi_active); ++} ++ ++static void __acpi_nmi_disable(void *__unused) ++{ ++ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); ++} ++ ++int __init check_nmi_watchdog(void) ++{ ++ unsigned int *prev_nmi_count; ++ int cpu; ++ ++ if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) ++ return 0; ++ ++ prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); ++ if (!prev_nmi_count) ++ goto error; ++ ++ alloc_cpumask_var(&backtrace_mask, GFP_KERNEL); ++ printk(KERN_INFO "Testing NMI watchdog ... "); ++ ++#ifdef CONFIG_SMP ++ if (nmi_watchdog == NMI_LOCAL_APIC) ++ smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); ++#endif ++ ++ for_each_possible_cpu(cpu) ++ prev_nmi_count[cpu] = get_nmi_count(cpu); ++ local_irq_enable(); ++ mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ ++ ++ for_each_online_cpu(cpu) { ++ if (!per_cpu(wd_enabled, cpu)) ++ continue; ++ if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) ++ report_broken_nmi(cpu, prev_nmi_count); ++ } ++ endflag = 1; ++ if (!atomic_read(&nmi_active)) { ++ kfree(prev_nmi_count); ++ atomic_set(&nmi_active, -1); ++ goto error; ++ } ++ printk("OK.\n"); ++ ++ /* ++ * now that we know it works we can reduce NMI frequency to ++ * something more reasonable; makes a difference in some configs ++ */ ++ if (nmi_watchdog == NMI_LOCAL_APIC) ++ nmi_hz = lapic_adjust_nmi_hz(1); ++ ++ kfree(prev_nmi_count); ++ return 0; ++error: ++ if (nmi_watchdog == NMI_IO_APIC) { ++ if (!timer_through_8259) ++ disable_8259A_irq(0); ++ on_each_cpu(__acpi_nmi_disable, NULL, 1); ++ } ++ ++#ifdef CONFIG_X86_32 ++ timer_ack = 0; ++#endif ++ return -1; ++} ++ ++static int __init setup_nmi_watchdog(char *str) ++{ ++ unsigned int nmi; ++ ++ if (!strncmp(str, "panic", 5)) { ++ panic_on_timeout = 1; ++ str = strchr(str, ','); ++ if (!str) ++ return 1; ++ ++str; ++ } ++ ++ if (!strncmp(str, "lapic", 5)) ++ nmi_watchdog = NMI_LOCAL_APIC; ++ else if (!strncmp(str, "ioapic", 6)) ++ nmi_watchdog = NMI_IO_APIC; ++ else { ++ get_option(&str, &nmi); ++ if (nmi >= NMI_INVALID) ++ return 0; ++ nmi_watchdog = nmi; ++ } ++ ++ return 1; ++} ++__setup("nmi_watchdog=", setup_nmi_watchdog); ++ ++/* ++ * Suspend/resume support ++ */ ++#ifdef CONFIG_PM ++ ++static int nmi_pm_active; /* nmi_active before suspend */ ++ ++static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) ++{ ++ /* only CPU0 goes here, other CPUs should be offline */ ++ nmi_pm_active = atomic_read(&nmi_active); ++ stop_apic_nmi_watchdog(NULL); ++ BUG_ON(atomic_read(&nmi_active) != 0); ++ return 0; ++} ++ ++static int lapic_nmi_resume(struct sys_device *dev) ++{ ++ /* only CPU0 goes here, other CPUs should be offline */ ++ if (nmi_pm_active > 0) { ++ setup_apic_nmi_watchdog(NULL); ++ touch_nmi_watchdog(); ++ } ++ return 0; ++} ++ ++static struct sysdev_class nmi_sysclass = { ++ .name = "lapic_nmi", ++ .resume = lapic_nmi_resume, ++ .suspend = lapic_nmi_suspend, ++}; ++ ++static struct sys_device device_lapic_nmi = { ++ .id = 0, ++ .cls = &nmi_sysclass, ++}; ++ ++static int __init init_lapic_nmi_sysfs(void) ++{ ++ int 
error; ++ ++ /* ++ * should really be a BUG_ON but b/c this is an ++ * init call, it just doesn't work. -dcz ++ */ ++ if (nmi_watchdog != NMI_LOCAL_APIC) ++ return 0; ++ ++ if (atomic_read(&nmi_active) < 0) ++ return 0; ++ ++ error = sysdev_class_register(&nmi_sysclass); ++ if (!error) ++ error = sysdev_register(&device_lapic_nmi); ++ return error; ++} ++ ++/* must come after the local APIC's device_initcall() */ ++late_initcall(init_lapic_nmi_sysfs); ++ ++#endif /* CONFIG_PM */ ++ ++static void __acpi_nmi_enable(void *__unused) ++{ ++ apic_write(APIC_LVT0, APIC_DM_NMI); ++} ++ ++/* ++ * Enable timer based NMIs on all CPUs: ++ */ ++void acpi_nmi_enable(void) ++{ ++ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) ++ on_each_cpu(__acpi_nmi_enable, NULL, 1); ++} ++ ++/* ++ * Disable timer based NMIs on all CPUs: ++ */ ++void acpi_nmi_disable(void) ++{ ++ if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) ++ on_each_cpu(__acpi_nmi_disable, NULL, 1); ++} ++ ++/* ++ * This function is called as soon the LAPIC NMI watchdog driver has everything ++ * in place and it's ready to check if the NMIs belong to the NMI watchdog ++ */ ++void cpu_nmi_set_wd_enabled(void) ++{ ++ __get_cpu_var(wd_enabled) = 1; ++} ++ ++void setup_apic_nmi_watchdog(void *unused) ++{ ++ if (__get_cpu_var(wd_enabled)) ++ return; ++ ++ /* cheap hack to support suspend/resume */ ++ /* if cpu0 is not active neither should the other cpus */ ++ if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) ++ return; ++ ++ switch (nmi_watchdog) { ++ case NMI_LOCAL_APIC: ++ if (lapic_watchdog_init(nmi_hz) < 0) { ++ __get_cpu_var(wd_enabled) = 0; ++ return; ++ } ++ /* FALL THROUGH */ ++ case NMI_IO_APIC: ++ __get_cpu_var(wd_enabled) = 1; ++ atomic_inc(&nmi_active); ++ } ++} ++ ++void stop_apic_nmi_watchdog(void *unused) ++{ ++ /* only support LOCAL and IO APICs for now */ ++ if (!nmi_watchdog_active()) ++ return; ++ if (__get_cpu_var(wd_enabled) == 0) ++ return; ++ if (nmi_watchdog == NMI_LOCAL_APIC) ++ lapic_watchdog_stop(); ++ else ++ __acpi_nmi_disable(NULL); ++ __get_cpu_var(wd_enabled) = 0; ++ atomic_dec(&nmi_active); ++} ++ ++/* ++ * the best way to detect whether a CPU has a 'hard lockup' problem ++ * is to check it's local APIC timer IRQ counts. If they are not ++ * changing then that CPU has some problem. ++ * ++ * as these watchdog NMI IRQs are generated on every CPU, we only ++ * have to check the current processor. ++ * ++ * since NMIs don't listen to _any_ locks, we have to be extremely ++ * careful not to rely on unsafe variables. The printk might lock ++ * up though, so we have to break up any console locks first ... ++ * [when there will be more tty-related locks, break them up here too!] ++ */ ++ ++static DEFINE_PER_CPU(unsigned, last_irq_sum); ++static DEFINE_PER_CPU(local_t, alert_counter); ++static DEFINE_PER_CPU(int, nmi_touch); ++ ++void touch_nmi_watchdog(void) ++{ ++ if (nmi_watchdog_active()) { ++ unsigned cpu; ++ ++ /* ++ * Tell other CPUs to reset their alert counters. We cannot ++ * do it ourselves because the alert count increase is not ++ * atomic. 
++ */ ++ for_each_present_cpu(cpu) { ++ if (per_cpu(nmi_touch, cpu) != 1) ++ per_cpu(nmi_touch, cpu) = 1; ++ } ++ } ++ ++ /* ++ * Tickle the softlockup detector too: ++ */ ++ touch_softlockup_watchdog(); ++} ++EXPORT_SYMBOL(touch_nmi_watchdog); ++ ++notrace __kprobes int ++nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) ++{ ++ /* ++ * Since current_thread_info()-> is always on the stack, and we ++ * always switch the stack NMI-atomically, it's safe to use ++ * smp_processor_id(). ++ */ ++ unsigned int sum; ++ int touched = 0; ++ int cpu = smp_processor_id(); ++ int rc = 0; ++ ++ /* check for other users first */ ++ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) ++ == NOTIFY_STOP) { ++ rc = 1; ++ touched = 1; ++ } ++ ++ sum = get_timer_irqs(cpu); ++ ++ if (__get_cpu_var(nmi_touch)) { ++ __get_cpu_var(nmi_touch) = 0; ++ touched = 1; ++ } ++ ++ if (cpumask_test_cpu(cpu, backtrace_mask)) { ++ static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ ++ ++ spin_lock(&lock); ++ printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); ++ dump_stack(); ++ spin_unlock(&lock); ++ cpumask_clear_cpu(cpu, backtrace_mask); ++ } ++ ++ /* Could check oops_in_progress here too, but it's safer not to */ ++ if (mce_in_progress()) ++ touched = 1; ++ ++ /* if the none of the timers isn't firing, this cpu isn't doing much */ ++ if (!touched && __get_cpu_var(last_irq_sum) == sum) { ++ /* ++ * Ayiee, looks like this CPU is stuck ... ++ * wait a few IRQs (5 seconds) before doing the oops ... ++ */ ++ local_inc(&__get_cpu_var(alert_counter)); ++ if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) ++ /* ++ * die_nmi will return ONLY if NOTIFY_STOP happens.. ++ */ ++ die_nmi("BUG: NMI Watchdog detected LOCKUP", ++ regs, panic_on_timeout); ++ } else { ++ __get_cpu_var(last_irq_sum) = sum; ++ local_set(&__get_cpu_var(alert_counter), 0); ++ } ++ ++ /* see if the nmi watchdog went off */ ++ if (!__get_cpu_var(wd_enabled)) ++ return rc; ++ switch (nmi_watchdog) { ++ case NMI_LOCAL_APIC: ++ rc |= lapic_wd_event(nmi_hz); ++ break; ++ case NMI_IO_APIC: ++ /* ++ * don't know how to accurately check for this. ++ * just assume it was a watchdog timer interrupt ++ * This matches the old behaviour. ++ */ ++ rc = 1; ++ break; ++ } ++ return rc; ++} ++ ++#ifdef CONFIG_SYSCTL ++ ++static void enable_ioapic_nmi_watchdog_single(void *unused) ++{ ++ __get_cpu_var(wd_enabled) = 1; ++ atomic_inc(&nmi_active); ++ __acpi_nmi_enable(NULL); ++} ++ ++static void enable_ioapic_nmi_watchdog(void) ++{ ++ on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); ++ touch_nmi_watchdog(); ++} ++ ++static void disable_ioapic_nmi_watchdog(void) ++{ ++ on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); ++} ++ ++static int __init setup_unknown_nmi_panic(char *str) ++{ ++ unknown_nmi_panic = 1; ++ return 1; ++} ++__setup("unknown_nmi_panic", setup_unknown_nmi_panic); ++ ++static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) ++{ ++ unsigned char reason = get_nmi_reason(); ++ char buf[64]; ++ ++ sprintf(buf, "NMI received for unknown reason %02x\n", reason); ++ die_nmi(buf, regs, 1); /* Always panic here */ ++ return 0; ++} ++ ++/* ++ * proc handler for /proc/sys/kernel/nmi ++ */ ++int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, ++ void __user *buffer, size_t *length, loff_t *ppos) ++{ ++ int old_state; ++ ++ nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 
1 : 0; ++ old_state = nmi_watchdog_enabled; ++ proc_dointvec(table, write, file, buffer, length, ppos); ++ if (!!old_state == !!nmi_watchdog_enabled) ++ return 0; ++ ++ if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { ++ printk(KERN_WARNING ++ "NMI watchdog is permanently disabled\n"); ++ return -EIO; ++ } ++ ++ if (nmi_watchdog == NMI_LOCAL_APIC) { ++ if (nmi_watchdog_enabled) ++ enable_lapic_nmi_watchdog(); ++ else ++ disable_lapic_nmi_watchdog(); ++ } else if (nmi_watchdog == NMI_IO_APIC) { ++ if (nmi_watchdog_enabled) ++ enable_ioapic_nmi_watchdog(); ++ else ++ disable_ioapic_nmi_watchdog(); ++ } else { ++ printk(KERN_WARNING ++ "NMI watchdog doesn't know what hardware to touch\n"); ++ return -EIO; ++ } ++ return 0; ++} ++ ++#endif /* CONFIG_SYSCTL */ ++ ++int do_nmi_callback(struct pt_regs *regs, int cpu) ++{ ++#ifdef CONFIG_SYSCTL ++ if (unknown_nmi_panic) ++ return unknown_nmi_panic_callback(regs, cpu); ++#endif ++ return 0; ++} ++ ++void __trigger_all_cpu_backtrace(void) ++{ ++ int i; ++ ++ cpumask_copy(backtrace_mask, cpu_online_mask); ++ /* Wait for up to 10 seconds for all CPUs to do the backtrace */ ++ for (i = 0; i < 10 * 1000; i++) { ++ if (cpumask_empty(backtrace_mask)) ++ break; ++ mdelay(1); ++ } ++} +Index: linux-2.6-tip/arch/x86/kernel/apic/numaq_32.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/numaq_32.c +@@ -0,0 +1,558 @@ ++/* ++ * Written by: Patricia Gaughen, IBM Corporation ++ * ++ * Copyright (C) 2002, IBM Corp. ++ * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar ++ * ++ * All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or ++ * NON INFRINGEMENT. See the GNU General Public License for more ++ * details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ++ * ++ * Send feedback to ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) ++ ++int found_numaq; ++ ++/* ++ * Have to match translation table entries to main table entries by counter ++ * hence the mpc_record variable .... can't see a less disgusting way of ++ * doing this .... 
++ */ ++struct mpc_trans { ++ unsigned char mpc_type; ++ unsigned char trans_len; ++ unsigned char trans_type; ++ unsigned char trans_quad; ++ unsigned char trans_global; ++ unsigned char trans_local; ++ unsigned short trans_reserved; ++}; ++ ++/* x86_quirks member */ ++static int mpc_record; ++ ++static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; ++ ++int mp_bus_id_to_node[MAX_MP_BUSSES]; ++int mp_bus_id_to_local[MAX_MP_BUSSES]; ++int quad_local_to_mp_bus_id[NR_CPUS/4][4]; ++ ++ ++static inline void numaq_register_node(int node, struct sys_cfg_data *scd) ++{ ++ struct eachquadmem *eq = scd->eq + node; ++ ++ node_set_online(node); ++ ++ /* Convert to pages */ ++ node_start_pfn[node] = ++ MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size); ++ ++ node_end_pfn[node] = ++ MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); ++ ++ e820_register_active_regions(node, node_start_pfn[node], ++ node_end_pfn[node]); ++ ++ memory_present(node, node_start_pfn[node], node_end_pfn[node]); ++ ++ node_remap_size[node] = node_memmap_size_bytes(node, ++ node_start_pfn[node], ++ node_end_pfn[node]); ++} ++ ++/* ++ * Function: smp_dump_qct() ++ * ++ * Description: gets memory layout from the quad config table. This ++ * function also updates node_online_map with the nodes (quads) present. ++ */ ++static void __init smp_dump_qct(void) ++{ ++ struct sys_cfg_data *scd; ++ int node; ++ ++ scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); ++ ++ nodes_clear(node_online_map); ++ for_each_node(node) { ++ if (scd->quads_present31_0 & (1 << node)) ++ numaq_register_node(node, scd); ++ } ++} ++ ++void __cpuinit numaq_tsc_disable(void) ++{ ++ if (!found_numaq) ++ return; ++ ++ if (num_online_nodes() > 1) { ++ printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); ++ setup_clear_cpu_cap(X86_FEATURE_TSC); ++ } ++} ++ ++static int __init numaq_pre_time_init(void) ++{ ++ numaq_tsc_disable(); ++ return 0; ++} ++ ++static inline int generate_logical_apicid(int quad, int phys_apicid) ++{ ++ return (quad << 4) + (phys_apicid ? 
phys_apicid << 1 : 1); ++} ++ ++/* x86_quirks member */ ++static int mpc_apic_id(struct mpc_cpu *m) ++{ ++ int quad = translation_table[mpc_record]->trans_quad; ++ int logical_apicid = generate_logical_apicid(quad, m->apicid); ++ ++ printk(KERN_DEBUG ++ "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", ++ m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, ++ (m->cpufeature & CPU_MODEL_MASK) >> 4, ++ m->apicver, quad, logical_apicid); ++ ++ return logical_apicid; ++} ++ ++/* x86_quirks member */ ++static void mpc_oem_bus_info(struct mpc_bus *m, char *name) ++{ ++ int quad = translation_table[mpc_record]->trans_quad; ++ int local = translation_table[mpc_record]->trans_local; ++ ++ mp_bus_id_to_node[m->busid] = quad; ++ mp_bus_id_to_local[m->busid] = local; ++ ++ printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad); ++} ++ ++/* x86_quirks member */ ++static void mpc_oem_pci_bus(struct mpc_bus *m) ++{ ++ int quad = translation_table[mpc_record]->trans_quad; ++ int local = translation_table[mpc_record]->trans_local; ++ ++ quad_local_to_mp_bus_id[quad][local] = m->busid; ++} ++ ++static void __init MP_translation_info(struct mpc_trans *m) ++{ ++ printk(KERN_INFO ++ "Translation: record %d, type %d, quad %d, global %d, local %d\n", ++ mpc_record, m->trans_type, m->trans_quad, m->trans_global, ++ m->trans_local); ++ ++ if (mpc_record >= MAX_MPC_ENTRY) ++ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); ++ else ++ translation_table[mpc_record] = m; /* stash this for later */ ++ ++ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) ++ node_set_online(m->trans_quad); ++} ++ ++static int __init mpf_checksum(unsigned char *mp, int len) ++{ ++ int sum = 0; ++ ++ while (len--) ++ sum += *mp++; ++ ++ return sum & 0xFF; ++} ++ ++/* ++ * Read/parse the MPC oem tables ++ */ ++static void __init ++ smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) ++{ ++ int count = sizeof(*oemtable); /* the header size */ ++ unsigned char *oemptr = ((unsigned char *)oemtable) + count; ++ ++ mpc_record = 0; ++ printk(KERN_INFO ++ "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); ++ ++ if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { ++ printk(KERN_WARNING ++ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", ++ oemtable->signature[0], oemtable->signature[1], ++ oemtable->signature[2], oemtable->signature[3]); ++ return; ++ } ++ ++ if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) { ++ printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); ++ return; ++ } ++ ++ while (count < oemtable->length) { ++ switch (*oemptr) { ++ case MP_TRANSLATION: ++ { ++ struct mpc_trans *m = (void *)oemptr; ++ ++ MP_translation_info(m); ++ oemptr += sizeof(*m); ++ count += sizeof(*m); ++ ++mpc_record; ++ break; ++ } ++ default: ++ printk(KERN_WARNING ++ "Unrecognised OEM table entry type! 
- %d\n", ++ (int)*oemptr); ++ return; ++ } ++ } ++} ++ ++static int __init numaq_setup_ioapic_ids(void) ++{ ++ /* so can skip it */ ++ return 1; ++} ++ ++static struct x86_quirks numaq_x86_quirks __initdata = { ++ .arch_pre_time_init = numaq_pre_time_init, ++ .arch_time_init = NULL, ++ .arch_pre_intr_init = NULL, ++ .arch_memory_setup = NULL, ++ .arch_intr_init = NULL, ++ .arch_trap_init = NULL, ++ .mach_get_smp_config = NULL, ++ .mach_find_smp_config = NULL, ++ .mpc_record = &mpc_record, ++ .mpc_apic_id = mpc_apic_id, ++ .mpc_oem_bus_info = mpc_oem_bus_info, ++ .mpc_oem_pci_bus = mpc_oem_pci_bus, ++ .smp_read_mpc_oem = smp_read_mpc_oem, ++ .setup_ioapic_ids = numaq_setup_ioapic_ids, ++}; ++ ++static __init void early_check_numaq(void) ++{ ++ /* ++ * Find possible boot-time SMP configuration: ++ */ ++ early_find_smp_config(); ++ ++ /* ++ * get boot-time SMP configuration: ++ */ ++ if (smp_found_config) ++ early_get_smp_config(); ++ ++ if (found_numaq) ++ x86_quirks = &numaq_x86_quirks; ++} ++ ++int __init get_memcfg_numaq(void) ++{ ++ early_check_numaq(); ++ if (!found_numaq) ++ return 0; ++ smp_dump_qct(); ++ ++ return 1; ++} ++ ++#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) ++ ++static inline unsigned int numaq_get_apic_id(unsigned long x) ++{ ++ return (x >> 24) & 0x0F; ++} ++ ++static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector) ++{ ++ default_send_IPI_mask_sequence_logical(mask, vector); ++} ++ ++static inline void numaq_send_IPI_allbutself(int vector) ++{ ++ default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); ++} ++ ++static inline void numaq_send_IPI_all(int vector) ++{ ++ numaq_send_IPI_mask(cpu_online_mask, vector); ++} ++ ++#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8) ++#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa) ++ ++/* ++ * Because we use NMIs rather than the INIT-STARTUP sequence to ++ * bootstrap the CPUs, the APIC may be in a weird state. Kick it: ++ */ ++static inline void numaq_smp_callin_clear_local_apic(void) ++{ ++ clear_local_APIC(); ++} ++ ++static inline const struct cpumask *numaq_target_cpus(void) ++{ ++ return cpu_all_mask; ++} ++ ++static inline unsigned long ++numaq_check_apicid_used(physid_mask_t bitmap, int apicid) ++{ ++ return physid_isset(apicid, bitmap); ++} ++ ++static inline unsigned long numaq_check_apicid_present(int bit) ++{ ++ return physid_isset(bit, phys_cpu_present_map); ++} ++ ++static inline int numaq_apic_id_registered(void) ++{ ++ return 1; ++} ++ ++static inline void numaq_init_apic_ldr(void) ++{ ++ /* Already done in NUMA-Q firmware */ ++} ++ ++static inline void numaq_setup_apic_routing(void) ++{ ++ printk(KERN_INFO ++ "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n", ++ nr_ioapics); ++} ++ ++/* ++ * Skip adding the timer int on secondary nodes, which causes ++ * a small but painful rift in the time-space continuum. ++ */ ++static inline int numaq_multi_timer_check(int apic, int irq) ++{ ++ return apic != 0 && irq == 0; ++} ++ ++static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) ++{ ++ /* We don't have a good way to do this yet - hack */ ++ return physids_promote(0xFUL); ++} ++ ++static inline int numaq_cpu_to_logical_apicid(int cpu) ++{ ++ if (cpu >= nr_cpu_ids) ++ return BAD_APICID; ++ return cpu_2_logical_apicid[cpu]; ++} ++ ++/* ++ * Supporting over 60 cpus on NUMA-Q requires a locality-dependent ++ * cpu to APIC ID relation to properly interact with the intelligent ++ * mode of the cluster controller. 
++ */ ++static inline int numaq_cpu_present_to_apicid(int mps_cpu) ++{ ++ if (mps_cpu < 60) ++ return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3)); ++ else ++ return BAD_APICID; ++} ++ ++static inline int numaq_apicid_to_node(int logical_apicid) ++{ ++ return logical_apicid >> 4; ++} ++ ++static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) ++{ ++ int node = numaq_apicid_to_node(logical_apicid); ++ int cpu = __ffs(logical_apicid & 0xf); ++ ++ return physid_mask_of_physid(cpu + 4*node); ++} ++ ++/* Where the IO area was mapped on multiquad, always 0 otherwise */ ++void *xquad_portio; ++ ++static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) ++{ ++ return 1; ++} ++ ++/* ++ * We use physical apicids here, not logical, so just return the default ++ * physical broadcast to stop people from breaking us ++ */ ++static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ return 0x0F; ++} ++ ++static inline unsigned int ++numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, ++ const struct cpumask *andmask) ++{ ++ return 0x0F; ++} ++ ++/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ ++static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb) ++{ ++ return cpuid_apic >> index_msb; ++} ++ ++static int ++numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) ++{ ++ if (strncmp(oem, "IBM NUMA", 8)) ++ printk(KERN_ERR "Warning! Not a NUMA-Q system!\n"); ++ else ++ found_numaq = 1; ++ ++ return found_numaq; ++} ++ ++static int probe_numaq(void) ++{ ++ /* already know from get_memcfg_numaq() */ ++ return found_numaq; ++} ++ ++static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ /* Careful. Some cpus do not strictly honor the set of cpus ++ * specified in the interrupt destination when using lowest ++ * priority interrupt delivery mode. ++ * ++ * In particular there was a hyperthreading cpu observed to ++ * deliver interrupts to the wrong hyperthread when only one ++ * hyperthread was specified in the interrupt desitination. 
++ */ ++ cpumask_clear(retmask); ++ cpumask_bits(retmask)[0] = APIC_ALL_CPUS; ++} ++ ++static void numaq_setup_portio_remap(void) ++{ ++ int num_quads = num_online_nodes(); ++ ++ if (num_quads <= 1) ++ return; ++ ++ printk(KERN_INFO ++ "Remapping cross-quad port I/O for %d quads\n", num_quads); ++ ++ xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD); ++ ++ printk(KERN_INFO ++ "xquad_portio vaddr 0x%08lx, len %08lx\n", ++ (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); ++} ++ ++struct apic apic_numaq = { ++ ++ .name = "NUMAQ", ++ .probe = probe_numaq, ++ .acpi_madt_oem_check = NULL, ++ .apic_id_registered = numaq_apic_id_registered, ++ ++ .irq_delivery_mode = dest_LowestPrio, ++ /* physical delivery on LOCAL quad: */ ++ .irq_dest_mode = 0, ++ ++ .target_cpus = numaq_target_cpus, ++ .disable_esr = 1, ++ .dest_logical = APIC_DEST_LOGICAL, ++ .check_apicid_used = numaq_check_apicid_used, ++ .check_apicid_present = numaq_check_apicid_present, ++ ++ .vector_allocation_domain = numaq_vector_allocation_domain, ++ .init_apic_ldr = numaq_init_apic_ldr, ++ ++ .ioapic_phys_id_map = numaq_ioapic_phys_id_map, ++ .setup_apic_routing = numaq_setup_apic_routing, ++ .multi_timer_check = numaq_multi_timer_check, ++ .apicid_to_node = numaq_apicid_to_node, ++ .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid, ++ .cpu_present_to_apicid = numaq_cpu_present_to_apicid, ++ .apicid_to_cpu_present = numaq_apicid_to_cpu_present, ++ .setup_portio_remap = numaq_setup_portio_remap, ++ .check_phys_apicid_present = numaq_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = numaq_phys_pkg_id, ++ .mps_oem_check = numaq_mps_oem_check, ++ ++ .get_apic_id = numaq_get_apic_id, ++ .set_apic_id = NULL, ++ .apic_id_mask = 0x0F << 24, ++ ++ .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = numaq_send_IPI_mask, ++ .send_IPI_mask_allbutself = NULL, ++ .send_IPI_allbutself = numaq_send_IPI_allbutself, ++ .send_IPI_all = numaq_send_IPI_all, ++ .send_IPI_self = default_send_IPI_self, ++ ++ .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi, ++ .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH, ++ ++ /* We don't do anything here because we use NMI's to boot instead */ ++ .wait_for_init_deassert = NULL, ++ ++ .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic, ++ .inquire_remote_apic = NULL, ++ ++ .read = native_apic_mem_read, ++ .write = native_apic_mem_write, ++ .icr_read = native_apic_icr_read, ++ .icr_write = native_apic_icr_write, ++ .wait_icr_idle = native_apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, ++}; +Index: linux-2.6-tip/arch/x86/kernel/apic/probe_32.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/probe_32.c +@@ -0,0 +1,285 @@ ++/* ++ * Default generic APIC driver. This handles up to 8 CPUs. ++ * ++ * Copyright 2003 Andi Kleen, SuSE Labs. ++ * Subject to the GNU Public License, v.2 ++ * ++ * Generic x86 APIC driver probe layer. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_HOTPLUG_CPU ++#define DEFAULT_SEND_IPI (1) ++#else ++#define DEFAULT_SEND_IPI (0) ++#endif ++ ++int no_broadcast = DEFAULT_SEND_IPI; ++ ++static __init int no_ipi_broadcast(char *str) ++{ ++ get_option(&str, &no_broadcast); ++ pr_info("Using %s mode\n", ++ no_broadcast ? "No IPI Broadcast" : "IPI Broadcast"); ++ return 1; ++} ++__setup("no_ipi_broadcast=", no_ipi_broadcast); ++ ++static int __init print_ipi_mode(void) ++{ ++ pr_info("Using IPI %s mode\n", ++ no_broadcast ? "No-Shortcut" : "Shortcut"); ++ return 0; ++} ++late_initcall(print_ipi_mode); ++ ++void default_setup_apic_routing(void) ++{ ++#ifdef CONFIG_X86_IO_APIC ++ printk(KERN_INFO ++ "Enabling APIC mode: Flat. Using %d I/O APICs\n", ++ nr_ioapics); ++#endif ++} ++ ++static void default_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ /* ++ * Careful. Some cpus do not strictly honor the set of cpus ++ * specified in the interrupt destination when using lowest ++ * priority interrupt delivery mode. ++ * ++ * In particular there was a hyperthreading cpu observed to ++ * deliver interrupts to the wrong hyperthread when only one ++ * hyperthread was specified in the interrupt desitination. ++ */ ++ cpumask_clear(retmask); ++ cpumask_bits(retmask)[0] = APIC_ALL_CPUS; ++} ++ ++/* should be called last. */ ++static int probe_default(void) ++{ ++ return 1; ++} ++ ++struct apic apic_default = { ++ ++ .name = "default", ++ .probe = probe_default, ++ .acpi_madt_oem_check = NULL, ++ .apic_id_registered = default_apic_id_registered, ++ ++ .irq_delivery_mode = dest_LowestPrio, ++ /* logical delivery broadcast to all CPUs: */ ++ .irq_dest_mode = 1, ++ ++ .target_cpus = default_target_cpus, ++ .disable_esr = 0, ++ .dest_logical = APIC_DEST_LOGICAL, ++ .check_apicid_used = default_check_apicid_used, ++ .check_apicid_present = default_check_apicid_present, ++ ++ .vector_allocation_domain = default_vector_allocation_domain, ++ .init_apic_ldr = default_init_apic_ldr, ++ ++ .ioapic_phys_id_map = default_ioapic_phys_id_map, ++ .setup_apic_routing = default_setup_apic_routing, ++ .multi_timer_check = NULL, ++ .apicid_to_node = default_apicid_to_node, ++ .cpu_to_logical_apicid = default_cpu_to_logical_apicid, ++ .cpu_present_to_apicid = default_cpu_present_to_apicid, ++ .apicid_to_cpu_present = default_apicid_to_cpu_present, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = default_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = default_phys_pkg_id, ++ .mps_oem_check = NULL, ++ ++ .get_apic_id = default_get_apic_id, ++ .set_apic_id = NULL, ++ .apic_id_mask = 0x0F << 24, ++ ++ .cpu_mask_to_apicid = default_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = default_send_IPI_mask_logical, ++ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, ++ .send_IPI_allbutself = default_send_IPI_allbutself, ++ .send_IPI_all = default_send_IPI_all, ++ .send_IPI_self = default_send_IPI_self, ++ ++ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, ++ ++ .wait_for_init_deassert = default_wait_for_init_deassert, ++ ++ 
.smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = default_inquire_remote_apic, ++ ++ .read = native_apic_mem_read, ++ .write = native_apic_mem_write, ++ .icr_read = native_apic_icr_read, ++ .icr_write = native_apic_icr_write, ++ .wait_icr_idle = native_apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, ++}; ++ ++extern struct apic apic_numaq; ++extern struct apic apic_summit; ++extern struct apic apic_bigsmp; ++extern struct apic apic_es7000; ++extern struct apic apic_es7000_cluster; ++extern struct apic apic_default; ++ ++struct apic *apic = &apic_default; ++EXPORT_SYMBOL_GPL(apic); ++ ++static struct apic *apic_probe[] __initdata = { ++#ifdef CONFIG_X86_NUMAQ ++ &apic_numaq, ++#endif ++#ifdef CONFIG_X86_SUMMIT ++ &apic_summit, ++#endif ++#ifdef CONFIG_X86_BIGSMP ++ &apic_bigsmp, ++#endif ++#ifdef CONFIG_X86_ES7000 ++ &apic_es7000, ++ &apic_es7000_cluster, ++#endif ++ &apic_default, /* must be last */ ++ NULL, ++}; ++ ++static int cmdline_apic __initdata; ++static int __init parse_apic(char *arg) ++{ ++ int i; ++ ++ if (!arg) ++ return -EINVAL; ++ ++ for (i = 0; apic_probe[i]; i++) { ++ if (!strcmp(apic_probe[i]->name, arg)) { ++ apic = apic_probe[i]; ++ cmdline_apic = 1; ++ return 0; ++ } ++ } ++ ++ /* Parsed again by __setup for debug/verbose */ ++ return 0; ++} ++early_param("apic", parse_apic); ++ ++void __init generic_bigsmp_probe(void) ++{ ++#ifdef CONFIG_X86_BIGSMP ++ /* ++ * This routine is used to switch to bigsmp mode when ++ * - There is no apic= option specified by the user ++ * - generic_apic_probe() has chosen apic_default as the sub_arch ++ * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support ++ */ ++ ++ if (!cmdline_apic && apic == &apic_default) { ++ if (apic_bigsmp.probe()) { ++ apic = &apic_bigsmp; ++ printk(KERN_INFO "Overriding APIC driver with %s\n", ++ apic->name); ++ } ++ } ++#endif ++} ++ ++void __init generic_apic_probe(void) ++{ ++ if (!cmdline_apic) { ++ int i; ++ for (i = 0; apic_probe[i]; i++) { ++ if (apic_probe[i]->probe()) { ++ apic = apic_probe[i]; ++ break; ++ } ++ } ++ /* Not visible without early console */ ++ if (!apic_probe[i]) ++ panic("Didn't find an APIC driver"); ++ } ++ printk(KERN_INFO "Using APIC driver %s\n", apic->name); ++} ++ ++/* These functions can switch the APIC even after the initial ->probe() */ ++ ++int __init ++generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) ++{ ++ int i; ++ ++ for (i = 0; apic_probe[i]; ++i) { ++ if (!apic_probe[i]->mps_oem_check) ++ continue; ++ if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) ++ continue; ++ ++ if (!cmdline_apic) { ++ apic = apic_probe[i]; ++ printk(KERN_INFO "Switched to APIC driver `%s'.\n", ++ apic->name); ++ } ++ return 1; ++ } ++ return 0; ++} ++ ++int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ int i; ++ ++ for (i = 0; apic_probe[i]; ++i) { ++ if (!apic_probe[i]->acpi_madt_oem_check) ++ continue; ++ if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) ++ continue; ++ ++ if (!cmdline_apic) { ++ apic = apic_probe[i]; ++ printk(KERN_INFO "Switched to APIC driver `%s'.\n", ++ apic->name); ++ } ++ return 1; ++ } ++ return 0; ++} +Index: linux-2.6-tip/arch/x86/kernel/apic/probe_64.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/probe_64.c +@@ -0,0 +1,100 @@ ++/* ++ * Copyright 2004 James Cleverdon, IBM. ++ * Subject to the GNU Public License, v.2 ++ * ++ * Generic APIC sub-arch probe layer. 
++ * ++ * Hacked for x86-64 by James Cleverdon from i386 architecture code by ++ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and ++ * James Cleverdon. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++extern struct apic apic_flat; ++extern struct apic apic_physflat; ++extern struct apic apic_x2xpic_uv_x; ++extern struct apic apic_x2apic_phys; ++extern struct apic apic_x2apic_cluster; ++ ++struct apic __read_mostly *apic = &apic_flat; ++EXPORT_SYMBOL_GPL(apic); ++ ++static struct apic *apic_probe[] __initdata = { ++#ifdef CONFIG_X86_UV ++ &apic_x2apic_uv_x, ++#endif ++#ifdef CONFIG_X86_X2APIC ++ &apic_x2apic_phys, ++ &apic_x2apic_cluster, ++#endif ++ &apic_physflat, ++ NULL, ++}; ++ ++/* ++ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. ++ */ ++void __init default_setup_apic_routing(void) ++{ ++#ifdef CONFIG_X86_X2APIC ++ if (x2apic && (apic != &apic_x2apic_phys && ++#ifdef CONFIG_X86_UV ++ apic != &apic_x2apic_uv_x && ++#endif ++ apic != &apic_x2apic_cluster)) { ++ if (x2apic_phys) ++ apic = &apic_x2apic_phys; ++ else ++ apic = &apic_x2apic_cluster; ++ printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); ++ } ++#endif ++ ++ if (apic == &apic_flat) { ++ if (max_physical_apicid >= 8) ++ apic = &apic_physflat; ++ printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); ++ } ++ ++ /* ++ * Now that apic routing model is selected, configure the ++ * fault handling for intr remapping. ++ */ ++ if (intr_remapping_enabled) ++ enable_drhd_fault_handling(); ++} ++ ++/* Same for both flat and physical. */ ++ ++void apic_send_IPI_self(int vector) ++{ ++ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); ++} ++ ++int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ int i; ++ ++ for (i = 0; apic_probe[i]; ++i) { ++ if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { ++ apic = apic_probe[i]; ++ printk(KERN_INFO "Setting APIC routing to %s.\n", ++ apic->name); ++ return 1; ++ } ++ } ++ return 0; ++} +Index: linux-2.6-tip/arch/x86/kernel/apic/summit_32.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/summit_32.c +@@ -0,0 +1,576 @@ ++/* ++ * IBM Summit-Specific Code ++ * ++ * Written By: Matthew Dobson, IBM Corporation ++ * ++ * Copyright (c) 2003 IBM Corp. ++ * ++ * All rights reserved. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or (at ++ * your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or ++ * NON INFRINGEMENT. See the GNU General Public License for more ++ * details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. ++ * ++ * Send feedback to ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * APIC driver for the IBM "Summit" chipset. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static unsigned summit_get_apic_id(unsigned long x) ++{ ++ return (x >> 24) & 0xFF; ++} ++ ++static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector) ++{ ++ default_send_IPI_mask_sequence_logical(mask, vector); ++} ++ ++static void summit_send_IPI_allbutself(int vector) ++{ ++ default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); ++} ++ ++static void summit_send_IPI_all(int vector) ++{ ++ summit_send_IPI_mask(cpu_online_mask, vector); ++} ++ ++#include ++ ++extern int use_cyclone; ++ ++#ifdef CONFIG_X86_SUMMIT_NUMA ++static void setup_summit(void); ++#else ++static inline void setup_summit(void) {} ++#endif ++ ++static int summit_mps_oem_check(struct mpc_table *mpc, char *oem, ++ char *productid) ++{ ++ if (!strncmp(oem, "IBM ENSW", 8) && ++ (!strncmp(productid, "VIGIL SMP", 9) ++ || !strncmp(productid, "EXA", 3) ++ || !strncmp(productid, "RUTHLESS SMP", 12))){ ++ mark_tsc_unstable("Summit based system"); ++ use_cyclone = 1; /*enable cyclone-timer*/ ++ setup_summit(); ++ return 1; ++ } ++ return 0; ++} ++ ++/* Hook from generic ACPI tables.c */ ++static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ if (!strncmp(oem_id, "IBM", 3) && ++ (!strncmp(oem_table_id, "SERVIGIL", 8) ++ || !strncmp(oem_table_id, "EXA", 3))){ ++ mark_tsc_unstable("Summit based system"); ++ use_cyclone = 1; /*enable cyclone-timer*/ ++ setup_summit(); ++ return 1; ++ } ++ return 0; ++} ++ ++struct rio_table_hdr { ++ unsigned char version; /* Version number of this data structure */ ++ /* Version 3 adds chassis_num & WP_index */ ++ unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */ ++ unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */ ++} __attribute__((packed)); ++ ++struct scal_detail { ++ unsigned char node_id; /* Scalability Node ID */ ++ unsigned long CBAR; /* Address of 1MB register space */ ++ unsigned char port0node; /* Node ID port connected to: 0xFF=None */ ++ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ ++ unsigned char port1node; /* Node ID port connected to: 0xFF = None */ ++ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ ++ unsigned char port2node; /* Node ID port connected to: 0xFF = None */ ++ unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */ ++ unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */ ++} __attribute__((packed)); ++ ++struct rio_detail { ++ unsigned char node_id; /* RIO Node ID */ ++ unsigned long BBAR; /* Address of 1MB register space */ ++ unsigned char type; /* Type of device */ ++ unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/ ++ /* For CYC: Node ID of Twister that owns this CYC */ ++ unsigned char port0node; /* Node ID port connected to: 0xFF=None */ ++ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */ ++ unsigned char port1node; /* Node ID port connected to: 0xFF=None */ ++ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */ ++ unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */ ++ /* For CYC: 0 */ ++ unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */ ++ /* = 0 : the XAPIC is not used, ie:*/ ++ /* ints fwded to another XAPIC */ ++ /* Bits1:7 Reserved */ ++ /* For CYC: Bits0:7 
Reserved */ ++ unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */ ++ /* lower slot numbers/PCI bus numbers */ ++ /* For CYC: No meaning */ ++ unsigned char chassis_num; /* 1 based Chassis number */ ++ /* For LookOut WPEGs this field indicates the */ ++ /* Expansion Chassis #, enumerated from Boot */ ++ /* Node WPEG external port, then Boot Node CYC */ ++ /* external port, then Next Vigil chassis WPEG */ ++ /* external port, etc. */ ++ /* Shared Lookouts have only 1 chassis number (the */ ++ /* first one assigned) */ ++} __attribute__((packed)); ++ ++ ++typedef enum { ++ CompatTwister = 0, /* Compatibility Twister */ ++ AltTwister = 1, /* Alternate Twister of internal 8-way */ ++ CompatCyclone = 2, /* Compatibility Cyclone */ ++ AltCyclone = 3, /* Alternate Cyclone of internal 8-way */ ++ CompatWPEG = 4, /* Compatibility WPEG */ ++ AltWPEG = 5, /* Second Planar WPEG */ ++ LookOutAWPEG = 6, /* LookOut WPEG */ ++ LookOutBWPEG = 7, /* LookOut WPEG */ ++} node_type; ++ ++static inline int is_WPEG(struct rio_detail *rio){ ++ return (rio->type == CompatWPEG || rio->type == AltWPEG || ++ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); ++} ++ ++ ++/* In clustered mode, the high nibble of APIC ID is a cluster number. ++ * The low nibble is a 4-bit bitmap. */ ++#define XAPIC_DEST_CPUS_SHIFT 4 ++#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) ++#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) ++ ++#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) ++ ++static const struct cpumask *summit_target_cpus(void) ++{ ++ /* CPU_MASK_ALL (0xff) has undefined behaviour with ++ * dest_LowestPrio mode logical clustered apic interrupt routing ++ * Just start on cpu 0. IRQ balancing will spread load ++ */ ++ return cpumask_of(0); ++} ++ ++static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) ++{ ++ return 0; ++} ++ ++/* we don't use the phys_cpu_present_map to indicate apicid presence */ ++static unsigned long summit_check_apicid_present(int bit) ++{ ++ return 1; ++} ++ ++static void summit_init_apic_ldr(void) ++{ ++ unsigned long val, id; ++ int count = 0; ++ u8 my_id = (u8)hard_smp_processor_id(); ++ u8 my_cluster = APIC_CLUSTER(my_id); ++#ifdef CONFIG_SMP ++ u8 lid; ++ int i; ++ ++ /* Create logical APIC IDs by counting CPUs already in cluster. */ ++ for (count = 0, i = nr_cpu_ids; --i >= 0; ) { ++ lid = cpu_2_logical_apicid[i]; ++ if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) ++ ++count; ++ } ++#endif ++ /* We only have a 4 wide bitmap in cluster mode. If a deranged ++ * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ ++ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); ++ id = my_cluster | (1UL << count); ++ apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); ++ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; ++ val |= SET_APIC_LOGICAL_ID(id); ++ apic_write(APIC_LDR, val); ++} ++ ++static int summit_apic_id_registered(void) ++{ ++ return 1; ++} ++ ++static void summit_setup_apic_routing(void) ++{ ++ printk("Enabling APIC mode: Summit. 
Using %d I/O APICs\n", ++ nr_ioapics); ++} ++ ++static int summit_apicid_to_node(int logical_apicid) ++{ ++#ifdef CONFIG_SMP ++ return apicid_2_node[hard_smp_processor_id()]; ++#else ++ return 0; ++#endif ++} ++ ++/* Mapping from cpu number to logical apicid */ ++static inline int summit_cpu_to_logical_apicid(int cpu) ++{ ++#ifdef CONFIG_SMP ++ if (cpu >= nr_cpu_ids) ++ return BAD_APICID; ++ return cpu_2_logical_apicid[cpu]; ++#else ++ return logical_smp_processor_id(); ++#endif ++} ++ ++static int summit_cpu_present_to_apicid(int mps_cpu) ++{ ++ if (mps_cpu < nr_cpu_ids) ++ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); ++ else ++ return BAD_APICID; ++} ++ ++static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) ++{ ++ /* For clustered we don't have a good way to do this yet - hack */ ++ return physids_promote(0x0F); ++} ++ ++static physid_mask_t summit_apicid_to_cpu_present(int apicid) ++{ ++ return physid_mask_of_physid(0); ++} ++ ++static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) ++{ ++ return 1; ++} ++ ++static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ unsigned int round = 0; ++ int cpu, apicid = 0; ++ ++ /* ++ * The cpus in the mask must all be on the apic cluster. ++ */ ++ for_each_cpu(cpu, cpumask) { ++ int new_apicid = summit_cpu_to_logical_apicid(cpu); ++ ++ if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { ++ printk("%s: Not a valid mask!\n", __func__); ++ return BAD_APICID; ++ } ++ apicid |= new_apicid; ++ round++; ++ } ++ return apicid; ++} ++ ++static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, ++ const struct cpumask *andmask) ++{ ++ int apicid = summit_cpu_to_logical_apicid(0); ++ cpumask_var_t cpumask; ++ ++ if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) ++ return apicid; ++ ++ cpumask_and(cpumask, inmask, andmask); ++ cpumask_and(cpumask, cpumask, cpu_online_mask); ++ apicid = summit_cpu_mask_to_apicid(cpumask); ++ ++ free_cpumask_var(cpumask); ++ ++ return apicid; ++} ++ ++/* ++ * cpuid returns the value latched in the HW at reset, not the APIC ID ++ * register's value. For any box whose BIOS changes APIC IDs, like ++ * clustered APIC systems, we must use hard_smp_processor_id. ++ * ++ * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. ++ */ ++static int summit_phys_pkg_id(int cpuid_apic, int index_msb) ++{ ++ return hard_smp_processor_id() >> index_msb; ++} ++ ++static int probe_summit(void) ++{ ++ /* probed later in mptable/ACPI hooks */ ++ return 0; ++} ++ ++static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ /* Careful. Some cpus do not strictly honor the set of cpus ++ * specified in the interrupt destination when using lowest ++ * priority interrupt delivery mode. ++ * ++ * In particular there was a hyperthreading cpu observed to ++ * deliver interrupts to the wrong hyperthread when only one ++ * hyperthread was specified in the interrupt desitination. 
++ */ ++ cpumask_clear(retmask); ++ cpumask_bits(retmask)[0] = APIC_ALL_CPUS; ++} ++ ++#ifdef CONFIG_X86_SUMMIT_NUMA ++static struct rio_table_hdr *rio_table_hdr; ++static struct scal_detail *scal_devs[MAX_NUMNODES]; ++static struct rio_detail *rio_devs[MAX_NUMNODES*4]; ++ ++#ifndef CONFIG_X86_NUMAQ ++static int mp_bus_id_to_node[MAX_MP_BUSSES]; ++#endif ++ ++static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) ++{ ++ int twister = 0, node = 0; ++ int i, bus, num_buses; ++ ++ for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { ++ if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) { ++ twister = rio_devs[i]->owner_id; ++ break; ++ } ++ } ++ if (i == rio_table_hdr->num_rio_dev) { ++ printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); ++ return last_bus; ++ } ++ ++ for (i = 0; i < rio_table_hdr->num_scal_dev; i++) { ++ if (scal_devs[i]->node_id == twister) { ++ node = scal_devs[i]->node_id; ++ break; ++ } ++ } ++ if (i == rio_table_hdr->num_scal_dev) { ++ printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); ++ return last_bus; ++ } ++ ++ switch (rio_devs[wpeg_num]->type) { ++ case CompatWPEG: ++ /* ++ * The Compatibility Winnipeg controls the 2 legacy buses, ++ * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case ++ * a PCI-PCI bridge card is used in either slot: total 5 buses. ++ */ ++ num_buses = 5; ++ break; ++ case AltWPEG: ++ /* ++ * The Alternate Winnipeg controls the 2 133MHz buses [1 slot ++ * each], their 2 "extra" buses, the 100MHz bus [2 slots] and ++ * the "extra" buses for each of those slots: total 7 buses. ++ */ ++ num_buses = 7; ++ break; ++ case LookOutAWPEG: ++ case LookOutBWPEG: ++ /* ++ * A Lookout Winnipeg controls 3 100MHz buses [2 slots each] ++ * & the "extra" buses for each of those slots: total 9 buses. ++ */ ++ num_buses = 9; ++ break; ++ default: ++ printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); ++ return last_bus; ++ } ++ ++ for (bus = last_bus; bus < last_bus + num_buses; bus++) ++ mp_bus_id_to_node[bus] = node; ++ return bus; ++} ++ ++static int build_detail_arrays(void) ++{ ++ unsigned long ptr; ++ int i, scal_detail_size, rio_detail_size; ++ ++ if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { ++ printk(KERN_WARNING "%s: MAX_NUMNODES too low! 
Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); ++ return 0; ++ } ++ ++ switch (rio_table_hdr->version) { ++ default: ++ printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); ++ return 0; ++ case 2: ++ scal_detail_size = 11; ++ rio_detail_size = 13; ++ break; ++ case 3: ++ scal_detail_size = 12; ++ rio_detail_size = 15; ++ break; ++ } ++ ++ ptr = (unsigned long)rio_table_hdr + 3; ++ for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size) ++ scal_devs[i] = (struct scal_detail *)ptr; ++ ++ for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size) ++ rio_devs[i] = (struct rio_detail *)ptr; ++ ++ return 1; ++} ++ ++void setup_summit(void) ++{ ++ unsigned long ptr; ++ unsigned short offset; ++ int i, next_wpeg, next_bus = 0; ++ ++ /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ ++ ptr = get_bios_ebda(); ++ ptr = (unsigned long)phys_to_virt(ptr); ++ ++ rio_table_hdr = NULL; ++ offset = 0x180; ++ while (offset) { ++ /* The block id is stored in the 2nd word */ ++ if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) { ++ /* set the pointer past the offset & block id */ ++ rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); ++ break; ++ } ++ /* The next offset is stored in the 1st word. 0 means no more */ ++ offset = *((unsigned short *)(ptr + offset)); ++ } ++ if (!rio_table_hdr) { ++ printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); ++ return; ++ } ++ ++ if (!build_detail_arrays()) ++ return; ++ ++ /* The first Winnipeg we're looking for has an index of 0 */ ++ next_wpeg = 0; ++ do { ++ for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { ++ if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) { ++ /* It's the Winnipeg we're looking for! */ ++ next_bus = setup_pci_node_map_for_wpeg(i, next_bus); ++ next_wpeg++; ++ break; ++ } ++ } ++ /* ++ * If we go through all Rio devices and don't find one with ++ * the next index, it means we've found all the Winnipegs, ++ * and thus all the PCI buses. 
++ */ ++ if (i == rio_table_hdr->num_rio_dev) ++ next_wpeg = 0; ++ } while (next_wpeg != 0); ++} ++#endif ++ ++struct apic apic_summit = { ++ ++ .name = "summit", ++ .probe = probe_summit, ++ .acpi_madt_oem_check = summit_acpi_madt_oem_check, ++ .apic_id_registered = summit_apic_id_registered, ++ ++ .irq_delivery_mode = dest_LowestPrio, ++ /* logical delivery broadcast to all CPUs: */ ++ .irq_dest_mode = 1, ++ ++ .target_cpus = summit_target_cpus, ++ .disable_esr = 1, ++ .dest_logical = APIC_DEST_LOGICAL, ++ .check_apicid_used = summit_check_apicid_used, ++ .check_apicid_present = summit_check_apicid_present, ++ ++ .vector_allocation_domain = summit_vector_allocation_domain, ++ .init_apic_ldr = summit_init_apic_ldr, ++ ++ .ioapic_phys_id_map = summit_ioapic_phys_id_map, ++ .setup_apic_routing = summit_setup_apic_routing, ++ .multi_timer_check = NULL, ++ .apicid_to_node = summit_apicid_to_node, ++ .cpu_to_logical_apicid = summit_cpu_to_logical_apicid, ++ .cpu_present_to_apicid = summit_cpu_present_to_apicid, ++ .apicid_to_cpu_present = summit_apicid_to_cpu_present, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = summit_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = summit_phys_pkg_id, ++ .mps_oem_check = summit_mps_oem_check, ++ ++ .get_apic_id = summit_get_apic_id, ++ .set_apic_id = NULL, ++ .apic_id_mask = 0xFF << 24, ++ ++ .cpu_mask_to_apicid = summit_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = summit_send_IPI_mask, ++ .send_IPI_mask_allbutself = NULL, ++ .send_IPI_allbutself = summit_send_IPI_allbutself, ++ .send_IPI_all = summit_send_IPI_all, ++ .send_IPI_self = default_send_IPI_self, ++ ++ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, ++ ++ .wait_for_init_deassert = default_wait_for_init_deassert, ++ ++ .smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = default_inquire_remote_apic, ++ ++ .read = native_apic_mem_read, ++ .write = native_apic_mem_write, ++ .icr_read = native_apic_icr_read, ++ .icr_write = native_apic_icr_write, ++ .wait_icr_idle = native_apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, ++}; +Index: linux-2.6-tip/arch/x86/kernel/apic/x2apic_cluster.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/x2apic_cluster.c +@@ -0,0 +1,245 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); ++ ++static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ return x2apic_enabled(); ++} ++ ++/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ ++ ++static const struct cpumask *x2apic_target_cpus(void) ++{ ++ return cpumask_of(0); ++} ++ ++/* ++ * for now each logical cpu is in its own vector allocation domain. ++ */ ++static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ cpumask_clear(retmask); ++ cpumask_set_cpu(cpu, retmask); ++} ++ ++static void ++ __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) ++{ ++ unsigned long cfg; ++ ++ cfg = __prepare_ICR(0, vector, dest); ++ ++ /* ++ * send the IPI. ++ */ ++ native_x2apic_icr_write(cfg, apicid); ++} ++ ++/* ++ * for now, we send the IPI's one by one in the cpumask. 
++ * TBD: Based on the cpu mask, we can send the IPI's to the cluster group ++ * at once. We have 16 cpu's in a cluster. This will minimize IPI register ++ * writes. ++ */ ++static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) ++{ ++ unsigned long query_cpu; ++ unsigned long flags; ++ ++ x2apic_wrmsr_fence(); ++ ++ local_irq_save(flags); ++ for_each_cpu(query_cpu, mask) { ++ __x2apic_send_IPI_dest( ++ per_cpu(x86_cpu_to_logical_apicid, query_cpu), ++ vector, apic->dest_logical); ++ } ++ local_irq_restore(flags); ++} ++ ++static void ++ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) ++{ ++ unsigned long this_cpu = smp_processor_id(); ++ unsigned long query_cpu; ++ unsigned long flags; ++ ++ x2apic_wrmsr_fence(); ++ ++ local_irq_save(flags); ++ for_each_cpu(query_cpu, mask) { ++ if (query_cpu == this_cpu) ++ continue; ++ __x2apic_send_IPI_dest( ++ per_cpu(x86_cpu_to_logical_apicid, query_cpu), ++ vector, apic->dest_logical); ++ } ++ local_irq_restore(flags); ++} ++ ++static void x2apic_send_IPI_allbutself(int vector) ++{ ++ unsigned long this_cpu = smp_processor_id(); ++ unsigned long query_cpu; ++ unsigned long flags; ++ ++ x2apic_wrmsr_fence(); ++ ++ local_irq_save(flags); ++ for_each_online_cpu(query_cpu) { ++ if (query_cpu == this_cpu) ++ continue; ++ __x2apic_send_IPI_dest( ++ per_cpu(x86_cpu_to_logical_apicid, query_cpu), ++ vector, apic->dest_logical); ++ } ++ local_irq_restore(flags); ++} ++ ++static void x2apic_send_IPI_all(int vector) ++{ ++ x2apic_send_IPI_mask(cpu_online_mask, vector); ++} ++ ++static int x2apic_apic_id_registered(void) ++{ ++ return 1; ++} ++ ++static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ /* ++ * We're using fixed IRQ delivery, can only return one logical APIC ID. ++ * May as well be the first. ++ */ ++ int cpu = cpumask_first(cpumask); ++ ++ if ((unsigned)cpu < nr_cpu_ids) ++ return per_cpu(x86_cpu_to_logical_apicid, cpu); ++ else ++ return BAD_APICID; ++} ++ ++static unsigned int ++x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, ++ const struct cpumask *andmask) ++{ ++ int cpu; ++ ++ /* ++ * We're using fixed IRQ delivery, can only return one logical APIC ID. ++ * May as well be the first. 
++ */ ++ for_each_cpu_and(cpu, cpumask, andmask) { ++ if (cpumask_test_cpu(cpu, cpu_online_mask)) ++ break; ++ } ++ ++ if (cpu < nr_cpu_ids) ++ return per_cpu(x86_cpu_to_logical_apicid, cpu); ++ ++ return BAD_APICID; ++} ++ ++static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) ++{ ++ unsigned int id; ++ ++ id = x; ++ return id; ++} ++ ++static unsigned long set_apic_id(unsigned int id) ++{ ++ unsigned long x; ++ ++ x = id; ++ return x; ++} ++ ++static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) ++{ ++ return current_cpu_data.initial_apicid >> index_msb; ++} ++ ++static void x2apic_send_IPI_self(int vector) ++{ ++ apic_write(APIC_SELF_IPI, vector); ++} ++ ++static void init_x2apic_ldr(void) ++{ ++ int cpu = smp_processor_id(); ++ ++ per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); ++} ++ ++struct apic apic_x2apic_cluster = { ++ ++ .name = "cluster x2apic", ++ .probe = NULL, ++ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, ++ .apic_id_registered = x2apic_apic_id_registered, ++ ++ .irq_delivery_mode = dest_LowestPrio, ++ .irq_dest_mode = 1, /* logical */ ++ ++ .target_cpus = x2apic_target_cpus, ++ .disable_esr = 0, ++ .dest_logical = APIC_DEST_LOGICAL, ++ .check_apicid_used = NULL, ++ .check_apicid_present = NULL, ++ ++ .vector_allocation_domain = x2apic_vector_allocation_domain, ++ .init_apic_ldr = init_x2apic_ldr, ++ ++ .ioapic_phys_id_map = NULL, ++ .setup_apic_routing = NULL, ++ .multi_timer_check = NULL, ++ .apicid_to_node = NULL, ++ .cpu_to_logical_apicid = NULL, ++ .cpu_present_to_apicid = default_cpu_present_to_apicid, ++ .apicid_to_cpu_present = NULL, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = default_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = x2apic_cluster_phys_pkg_id, ++ .mps_oem_check = NULL, ++ ++ .get_apic_id = x2apic_cluster_phys_get_apic_id, ++ .set_apic_id = set_apic_id, ++ .apic_id_mask = 0xFFFFFFFFu, ++ ++ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = x2apic_send_IPI_mask, ++ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, ++ .send_IPI_allbutself = x2apic_send_IPI_allbutself, ++ .send_IPI_all = x2apic_send_IPI_all, ++ .send_IPI_self = x2apic_send_IPI_self, ++ ++ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, ++ .wait_for_init_deassert = NULL, ++ .smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = NULL, ++ ++ .read = native_apic_msr_read, ++ .write = native_apic_msr_write, ++ .icr_read = native_x2apic_icr_read, ++ .icr_write = native_x2apic_icr_write, ++ .wait_icr_idle = native_x2apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, ++}; +Index: linux-2.6-tip/arch/x86/kernel/apic/x2apic_phys.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/x2apic_phys.c +@@ -0,0 +1,234 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++int x2apic_phys; ++ ++static int set_x2apic_phys_mode(char *arg) ++{ ++ x2apic_phys = 1; ++ return 0; ++} ++early_param("x2apic_phys", set_x2apic_phys_mode); ++ ++static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ if (x2apic_phys) ++ return x2apic_enabled(); ++ else ++ return 0; ++} ++ ++/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. 
*/ ++ ++static const struct cpumask *x2apic_target_cpus(void) ++{ ++ return cpumask_of(0); ++} ++ ++static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ cpumask_clear(retmask); ++ cpumask_set_cpu(cpu, retmask); ++} ++ ++static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, ++ unsigned int dest) ++{ ++ unsigned long cfg; ++ ++ cfg = __prepare_ICR(0, vector, dest); ++ ++ /* ++ * send the IPI. ++ */ ++ native_x2apic_icr_write(cfg, apicid); ++} ++ ++static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) ++{ ++ unsigned long query_cpu; ++ unsigned long flags; ++ ++ x2apic_wrmsr_fence(); ++ ++ local_irq_save(flags); ++ for_each_cpu(query_cpu, mask) { ++ __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), ++ vector, APIC_DEST_PHYSICAL); ++ } ++ local_irq_restore(flags); ++} ++ ++static void ++ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) ++{ ++ unsigned long this_cpu = smp_processor_id(); ++ unsigned long query_cpu; ++ unsigned long flags; ++ ++ x2apic_wrmsr_fence(); ++ ++ local_irq_save(flags); ++ for_each_cpu(query_cpu, mask) { ++ if (query_cpu != this_cpu) ++ __x2apic_send_IPI_dest( ++ per_cpu(x86_cpu_to_apicid, query_cpu), ++ vector, APIC_DEST_PHYSICAL); ++ } ++ local_irq_restore(flags); ++} ++ ++static void x2apic_send_IPI_allbutself(int vector) ++{ ++ unsigned long this_cpu = smp_processor_id(); ++ unsigned long query_cpu; ++ unsigned long flags; ++ ++ x2apic_wrmsr_fence(); ++ ++ local_irq_save(flags); ++ for_each_online_cpu(query_cpu) { ++ if (query_cpu == this_cpu) ++ continue; ++ __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), ++ vector, APIC_DEST_PHYSICAL); ++ } ++ local_irq_restore(flags); ++} ++ ++static void x2apic_send_IPI_all(int vector) ++{ ++ x2apic_send_IPI_mask(cpu_online_mask, vector); ++} ++ ++static int x2apic_apic_id_registered(void) ++{ ++ return 1; ++} ++ ++static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ /* ++ * We're using fixed IRQ delivery, can only return one phys APIC ID. ++ * May as well be the first. ++ */ ++ int cpu = cpumask_first(cpumask); ++ ++ if ((unsigned)cpu < nr_cpu_ids) ++ return per_cpu(x86_cpu_to_apicid, cpu); ++ else ++ return BAD_APICID; ++} ++ ++static unsigned int ++x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, ++ const struct cpumask *andmask) ++{ ++ int cpu; ++ ++ /* ++ * We're using fixed IRQ delivery, can only return one phys APIC ID. ++ * May as well be the first. 
++ */ ++ for_each_cpu_and(cpu, cpumask, andmask) { ++ if (cpumask_test_cpu(cpu, cpu_online_mask)) ++ break; ++ } ++ ++ if (cpu < nr_cpu_ids) ++ return per_cpu(x86_cpu_to_apicid, cpu); ++ ++ return BAD_APICID; ++} ++ ++static unsigned int x2apic_phys_get_apic_id(unsigned long x) ++{ ++ return x; ++} ++ ++static unsigned long set_apic_id(unsigned int id) ++{ ++ return id; ++} ++ ++static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) ++{ ++ return current_cpu_data.initial_apicid >> index_msb; ++} ++ ++static void x2apic_send_IPI_self(int vector) ++{ ++ apic_write(APIC_SELF_IPI, vector); ++} ++ ++static void init_x2apic_ldr(void) ++{ ++} ++ ++struct apic apic_x2apic_phys = { ++ ++ .name = "physical x2apic", ++ .probe = NULL, ++ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, ++ .apic_id_registered = x2apic_apic_id_registered, ++ ++ .irq_delivery_mode = dest_Fixed, ++ .irq_dest_mode = 0, /* physical */ ++ ++ .target_cpus = x2apic_target_cpus, ++ .disable_esr = 0, ++ .dest_logical = 0, ++ .check_apicid_used = NULL, ++ .check_apicid_present = NULL, ++ ++ .vector_allocation_domain = x2apic_vector_allocation_domain, ++ .init_apic_ldr = init_x2apic_ldr, ++ ++ .ioapic_phys_id_map = NULL, ++ .setup_apic_routing = NULL, ++ .multi_timer_check = NULL, ++ .apicid_to_node = NULL, ++ .cpu_to_logical_apicid = NULL, ++ .cpu_present_to_apicid = default_cpu_present_to_apicid, ++ .apicid_to_cpu_present = NULL, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = default_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = x2apic_phys_pkg_id, ++ .mps_oem_check = NULL, ++ ++ .get_apic_id = x2apic_phys_get_apic_id, ++ .set_apic_id = set_apic_id, ++ .apic_id_mask = 0xFFFFFFFFu, ++ ++ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = x2apic_send_IPI_mask, ++ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, ++ .send_IPI_allbutself = x2apic_send_IPI_allbutself, ++ .send_IPI_all = x2apic_send_IPI_all, ++ .send_IPI_self = x2apic_send_IPI_self, ++ ++ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, ++ .wait_for_init_deassert = NULL, ++ .smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = NULL, ++ ++ .read = native_apic_msr_read, ++ .write = native_apic_msr_write, ++ .icr_read = native_x2apic_icr_read, ++ .icr_write = native_x2apic_icr_write, ++ .wait_icr_idle = native_x2apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, ++}; +Index: linux-2.6-tip/arch/x86/kernel/apic/x2apic_uv_x.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/apic/x2apic_uv_x.c +@@ -0,0 +1,648 @@ ++/* ++ * This file is subject to the terms and conditions of the GNU General Public ++ * License. See the file "COPYING" in the main directory of this archive ++ * for more details. ++ * ++ * SGI UV APIC functions (note: not an Intel compatible APIC) ++ * ++ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++DEFINE_PER_CPU(int, x2apic_extra_bits); ++ ++static enum uv_system_type uv_system_type; ++ ++static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++ if (!strcmp(oem_id, "SGI")) { ++ if (!strcmp(oem_table_id, "UVL")) ++ uv_system_type = UV_LEGACY_APIC; ++ else if (!strcmp(oem_table_id, "UVX")) ++ uv_system_type = UV_X2APIC; ++ else if (!strcmp(oem_table_id, "UVH")) { ++ uv_system_type = UV_NON_UNIQUE_APIC; ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++enum uv_system_type get_uv_system_type(void) ++{ ++ return uv_system_type; ++} ++ ++int is_uv_system(void) ++{ ++ return uv_system_type != UV_NONE; ++} ++EXPORT_SYMBOL_GPL(is_uv_system); ++ ++DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); ++EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); ++ ++struct uv_blade_info *uv_blade_info; ++EXPORT_SYMBOL_GPL(uv_blade_info); ++ ++short *uv_node_to_blade; ++EXPORT_SYMBOL_GPL(uv_node_to_blade); ++ ++short *uv_cpu_to_blade; ++EXPORT_SYMBOL_GPL(uv_cpu_to_blade); ++ ++short uv_possible_blades; ++EXPORT_SYMBOL_GPL(uv_possible_blades); ++ ++unsigned long sn_rtc_cycles_per_second; ++EXPORT_SYMBOL(sn_rtc_cycles_per_second); ++ ++/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ ++ ++static const struct cpumask *uv_target_cpus(void) ++{ ++ return cpumask_of(0); ++} ++ ++static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) ++{ ++ cpumask_clear(retmask); ++ cpumask_set_cpu(cpu, retmask); ++} ++ ++static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) ++{ ++#ifdef CONFIG_SMP ++ unsigned long val; ++ int pnode; ++ ++ pnode = uv_apicid_to_pnode(phys_apicid); ++ val = (1UL << UVH_IPI_INT_SEND_SHFT) | ++ (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ++ ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | ++ APIC_DM_INIT; ++ uv_write_global_mmr64(pnode, UVH_IPI_INT, val); ++ mdelay(10); ++ ++ val = (1UL << UVH_IPI_INT_SEND_SHFT) | ++ (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ++ ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | ++ APIC_DM_STARTUP; ++ uv_write_global_mmr64(pnode, UVH_IPI_INT, val); ++ ++ atomic_set(&init_deasserted, 1); ++#endif ++ return 0; ++} ++ ++static void uv_send_IPI_one(int cpu, int vector) ++{ ++ unsigned long val, apicid; ++ int pnode; ++ ++ apicid = per_cpu(x86_cpu_to_apicid, cpu); ++ pnode = uv_apicid_to_pnode(apicid); ++ ++ val = (1UL << UVH_IPI_INT_SEND_SHFT) | ++ (apicid << UVH_IPI_INT_APIC_ID_SHFT) | ++ (vector << UVH_IPI_INT_VECTOR_SHFT); ++ ++ uv_write_global_mmr64(pnode, UVH_IPI_INT, val); ++} ++ ++static void uv_send_IPI_mask(const struct cpumask *mask, int vector) ++{ ++ unsigned int cpu; ++ ++ for_each_cpu(cpu, mask) ++ uv_send_IPI_one(cpu, vector); ++} ++ ++static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) ++{ ++ unsigned int this_cpu = smp_processor_id(); ++ unsigned int cpu; ++ ++ for_each_cpu(cpu, mask) { ++ if (cpu != this_cpu) ++ uv_send_IPI_one(cpu, vector); ++ } ++} ++ ++static void uv_send_IPI_allbutself(int vector) ++{ ++ unsigned int this_cpu = smp_processor_id(); ++ unsigned int cpu; ++ ++ for_each_online_cpu(cpu) { ++ if (cpu != this_cpu) ++ uv_send_IPI_one(cpu, vector); ++ } ++} ++ ++static void uv_send_IPI_all(int vector) ++{ ++ uv_send_IPI_mask(cpu_online_mask, vector); ++} ++ ++static int uv_apic_id_registered(void) ++{ 
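/*
 * [Editorial note, not part of the upstream patch.] The ->apic_id_registered()
 * hook traditionally checks that the boot CPU's APIC ID shows up in the
 * physical APIC ID present map; on SGI UV there appears to be no such
 * table-based check to perform, so the function below simply reports success.
 */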
++ return 1; ++} ++ ++static void uv_init_apic_ldr(void) ++{ ++} ++ ++static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) ++{ ++ /* ++ * We're using fixed IRQ delivery, can only return one phys APIC ID. ++ * May as well be the first. ++ */ ++ int cpu = cpumask_first(cpumask); ++ ++ if ((unsigned)cpu < nr_cpu_ids) ++ return per_cpu(x86_cpu_to_apicid, cpu); ++ else ++ return BAD_APICID; ++} ++ ++static unsigned int ++uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, ++ const struct cpumask *andmask) ++{ ++ int cpu; ++ ++ /* ++ * We're using fixed IRQ delivery, can only return one phys APIC ID. ++ * May as well be the first. ++ */ ++ for_each_cpu_and(cpu, cpumask, andmask) { ++ if (cpumask_test_cpu(cpu, cpu_online_mask)) ++ break; ++ } ++ if (cpu < nr_cpu_ids) ++ return per_cpu(x86_cpu_to_apicid, cpu); ++ ++ return BAD_APICID; ++} ++ ++static unsigned int x2apic_get_apic_id(unsigned long x) ++{ ++ unsigned int id; ++ ++ WARN_ON(preemptible() && num_online_cpus() > 1); ++ id = x | __get_cpu_var(x2apic_extra_bits); ++ ++ return id; ++} ++ ++static unsigned long set_apic_id(unsigned int id) ++{ ++ unsigned long x; ++ ++ /* maskout x2apic_extra_bits ? */ ++ x = id; ++ return x; ++} ++ ++static unsigned int uv_read_apic_id(void) ++{ ++ ++ return x2apic_get_apic_id(apic_read(APIC_ID)); ++} ++ ++static int uv_phys_pkg_id(int initial_apicid, int index_msb) ++{ ++ return uv_read_apic_id() >> index_msb; ++} ++ ++static void uv_send_IPI_self(int vector) ++{ ++ apic_write(APIC_SELF_IPI, vector); ++} ++ ++struct apic apic_x2apic_uv_x = { ++ ++ .name = "UV large system", ++ .probe = NULL, ++ .acpi_madt_oem_check = uv_acpi_madt_oem_check, ++ .apic_id_registered = uv_apic_id_registered, ++ ++ .irq_delivery_mode = dest_Fixed, ++ .irq_dest_mode = 1, /* logical */ ++ ++ .target_cpus = uv_target_cpus, ++ .disable_esr = 0, ++ .dest_logical = APIC_DEST_LOGICAL, ++ .check_apicid_used = NULL, ++ .check_apicid_present = NULL, ++ ++ .vector_allocation_domain = uv_vector_allocation_domain, ++ .init_apic_ldr = uv_init_apic_ldr, ++ ++ .ioapic_phys_id_map = NULL, ++ .setup_apic_routing = NULL, ++ .multi_timer_check = NULL, ++ .apicid_to_node = NULL, ++ .cpu_to_logical_apicid = NULL, ++ .cpu_present_to_apicid = default_cpu_present_to_apicid, ++ .apicid_to_cpu_present = NULL, ++ .setup_portio_remap = NULL, ++ .check_phys_apicid_present = default_check_phys_apicid_present, ++ .enable_apic_mode = NULL, ++ .phys_pkg_id = uv_phys_pkg_id, ++ .mps_oem_check = NULL, ++ ++ .get_apic_id = x2apic_get_apic_id, ++ .set_apic_id = set_apic_id, ++ .apic_id_mask = 0xFFFFFFFFu, ++ ++ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, ++ .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, ++ ++ .send_IPI_mask = uv_send_IPI_mask, ++ .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, ++ .send_IPI_allbutself = uv_send_IPI_allbutself, ++ .send_IPI_all = uv_send_IPI_all, ++ .send_IPI_self = uv_send_IPI_self, ++ ++ .wakeup_secondary_cpu = uv_wakeup_secondary, ++ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, ++ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, ++ .wait_for_init_deassert = NULL, ++ .smp_callin_clear_local_apic = NULL, ++ .inquire_remote_apic = NULL, ++ ++ .read = native_apic_msr_read, ++ .write = native_apic_msr_write, ++ .icr_read = native_x2apic_icr_read, ++ .icr_write = native_x2apic_icr_write, ++ .wait_icr_idle = native_x2apic_wait_icr_idle, ++ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, ++}; ++ ++static __cpuinit void set_x2apic_extra_bits(int pnode) ++{ ++ 
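/*
 * [Editorial note, not part of the upstream patch.] On UV systems of type
 * UV_NON_UNIQUE_APIC (see uv_cpu_init() further down), the hub's pnode is
 * stashed per-CPU here and OR-ed back into the hardware value by
 * x2apic_get_apic_id() above, keeping APIC IDs unique across blades. The
 * shift of 6 presumably leaves room for up to 64 local APIC IDs per pnode.
 */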
__get_cpu_var(x2apic_extra_bits) = (pnode << 6); ++} ++ ++/* ++ * Called on boot cpu. ++ */ ++static __init int boot_pnode_to_blade(int pnode) ++{ ++ int blade; ++ ++ for (blade = 0; blade < uv_num_possible_blades(); blade++) ++ if (pnode == uv_blade_info[blade].pnode) ++ return blade; ++ ++ panic("x2apic_uv: bad pnode!"); ++} ++ ++struct redir_addr { ++ unsigned long redirect; ++ unsigned long alias; ++}; ++ ++#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT ++ ++static __initdata struct redir_addr redir_addrs[] = { ++ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, ++ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, ++ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, ++}; ++ ++static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) ++{ ++ union uvh_si_alias0_overlay_config_u alias; ++ union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { ++ alias.v = uv_read_local_mmr(redir_addrs[i].alias); ++ if (alias.s.base == 0) { ++ *size = (1UL << alias.s.m_alias); ++ redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); ++ *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; ++ return; ++ } ++ } ++ panic("get_lowmem_redirect: no match!"); ++} ++ ++static __init void map_low_mmrs(void) ++{ ++ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); ++ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); ++} ++ ++enum map_type {map_wb, map_uc}; ++ ++static __init void map_high(char *id, unsigned long base, int shift, ++ int max_pnode, enum map_type map_type) ++{ ++ unsigned long bytes, paddr; ++ ++ paddr = base << shift; ++ bytes = (1UL << shift) * (max_pnode + 1); ++ printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, ++ paddr + bytes); ++ if (map_type == map_uc) ++ init_extra_mapping_uc(paddr, bytes); ++ else ++ init_extra_mapping_wb(paddr, bytes); ++ ++} ++static __init void map_gru_high(int max_pnode) ++{ ++ union uvh_rh_gam_gru_overlay_config_mmr_u gru; ++ int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; ++ ++ gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); ++ if (gru.s.enable) ++ map_high("GRU", gru.s.base, shift, max_pnode, map_wb); ++} ++ ++static __init void map_config_high(int max_pnode) ++{ ++ union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; ++ int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; ++ ++ cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); ++ if (cfg.s.enable) ++ map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); ++} ++ ++static __init void map_mmr_high(int max_pnode) ++{ ++ union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; ++ int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; ++ ++ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); ++ if (mmr.s.enable) ++ map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); ++} ++ ++static __init void map_mmioh_high(int max_pnode) ++{ ++ union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; ++ int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; ++ ++ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); ++ if (mmioh.s.enable) ++ map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); ++} ++ ++static __init void uv_rtc_init(void) ++{ ++ long status; ++ u64 ticks_per_sec; ++ ++ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, ++ &ticks_per_sec); ++ if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { ++ 
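/*
 * [Editorial note, not part of the upstream patch.] The fallback below works
 * out to 1000000000000 / 30000 = 33,333,333 cycles per second, i.e. the RTC
 * is assumed to tick at roughly 33.3 MHz whenever the BIOS query fails or
 * reports an implausibly low rate (< 100000 ticks/sec per the check above).
 */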
printk(KERN_WARNING ++ "unable to determine platform RTC clock frequency, " ++ "guessing.\n"); ++ /* BIOS gives wrong value for clock freq. so guess */ ++ sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; ++ } else ++ sn_rtc_cycles_per_second = ticks_per_sec; ++} ++ ++/* ++ * percpu heartbeat timer ++ */ ++static void uv_heartbeat(unsigned long ignored) ++{ ++ struct timer_list *timer = &uv_hub_info->scir.timer; ++ unsigned char bits = uv_hub_info->scir.state; ++ ++ /* flip heartbeat bit */ ++ bits ^= SCIR_CPU_HEARTBEAT; ++ ++ /* is this cpu idle? */ ++ if (idle_cpu(raw_smp_processor_id())) ++ bits &= ~SCIR_CPU_ACTIVITY; ++ else ++ bits |= SCIR_CPU_ACTIVITY; ++ ++ /* update system controller interface reg */ ++ uv_set_scir_bits(bits); ++ ++ /* enable next timer period */ ++ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); ++} ++ ++static void __cpuinit uv_heartbeat_enable(int cpu) ++{ ++ if (!uv_cpu_hub_info(cpu)->scir.enabled) { ++ struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; ++ ++ uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); ++ setup_timer(timer, uv_heartbeat, cpu); ++ timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; ++ add_timer_on(timer, cpu); ++ uv_cpu_hub_info(cpu)->scir.enabled = 1; ++ } ++ ++ /* check boot cpu */ ++ if (!uv_cpu_hub_info(0)->scir.enabled) ++ uv_heartbeat_enable(0); ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void __cpuinit uv_heartbeat_disable(int cpu) ++{ ++ if (uv_cpu_hub_info(cpu)->scir.enabled) { ++ uv_cpu_hub_info(cpu)->scir.enabled = 0; ++ del_timer(&uv_cpu_hub_info(cpu)->scir.timer); ++ } ++ uv_set_cpu_scir_bits(cpu, 0xff); ++} ++ ++/* ++ * cpu hotplug notifier ++ */ ++static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, ++ unsigned long action, void *hcpu) ++{ ++ long cpu = (long)hcpu; ++ ++ switch (action) { ++ case CPU_ONLINE: ++ uv_heartbeat_enable(cpu); ++ break; ++ case CPU_DOWN_PREPARE: ++ uv_heartbeat_disable(cpu); ++ break; ++ default: ++ break; ++ } ++ return NOTIFY_OK; ++} ++ ++static __init void uv_scir_register_cpu_notifier(void) ++{ ++ hotcpu_notifier(uv_scir_cpu_notify, 0); ++} ++ ++#else /* !CONFIG_HOTPLUG_CPU */ ++ ++static __init void uv_scir_register_cpu_notifier(void) ++{ ++} ++ ++static __init int uv_init_heartbeat(void) ++{ ++ int cpu; ++ ++ if (is_uv_system()) ++ for_each_online_cpu(cpu) ++ uv_heartbeat_enable(cpu); ++ return 0; ++} ++ ++late_initcall(uv_init_heartbeat); ++ ++#endif /* !CONFIG_HOTPLUG_CPU */ ++ ++/* ++ * Called on each cpu to initialize the per_cpu UV data area. ++ * FIXME: hotplug not supported yet ++ */ ++void __cpuinit uv_cpu_init(void) ++{ ++ /* CPU 0 initilization will be done via uv_system_init. 
*/ ++ if (!uv_blade_info) ++ return; ++ ++ uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; ++ ++ if (get_uv_system_type() == UV_NON_UNIQUE_APIC) ++ set_x2apic_extra_bits(uv_hub_info->pnode); ++} ++ ++ ++void __init uv_system_init(void) ++{ ++ union uvh_si_addr_map_config_u m_n_config; ++ union uvh_node_id_u node_id; ++ unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; ++ int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; ++ int max_pnode = 0; ++ unsigned long mmr_base, present; ++ ++ map_low_mmrs(); ++ ++ m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); ++ m_val = m_n_config.s.m_skt; ++ n_val = m_n_config.s.n_skt; ++ mmr_base = ++ uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ++ ~UV_MMR_ENABLE; ++ printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); ++ ++ for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) ++ uv_possible_blades += ++ hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); ++ printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); ++ ++ bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); ++ uv_blade_info = kmalloc(bytes, GFP_KERNEL); ++ ++ get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); ++ ++ bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); ++ uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); ++ memset(uv_node_to_blade, 255, bytes); ++ ++ bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); ++ uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); ++ memset(uv_cpu_to_blade, 255, bytes); ++ ++ blade = 0; ++ for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { ++ present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); ++ for (j = 0; j < 64; j++) { ++ if (!test_bit(j, &present)) ++ continue; ++ uv_blade_info[blade].pnode = (i * 64 + j); ++ uv_blade_info[blade].nr_possible_cpus = 0; ++ uv_blade_info[blade].nr_online_cpus = 0; ++ blade++; ++ } ++ } ++ ++ node_id.v = uv_read_local_mmr(UVH_NODE_ID); ++ gnode_upper = (((unsigned long)node_id.s.node_id) & ++ ~((1 << n_val) - 1)) << m_val; ++ ++ uv_bios_init(); ++ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, ++ &sn_coherency_id, &sn_region_size); ++ uv_rtc_init(); ++ ++ for_each_present_cpu(cpu) { ++ nid = cpu_to_node(cpu); ++ pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); ++ blade = boot_pnode_to_blade(pnode); ++ lcpu = uv_blade_info[blade].nr_possible_cpus; ++ uv_blade_info[blade].nr_possible_cpus++; ++ ++ uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; ++ uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; ++ uv_cpu_hub_info(cpu)->m_val = m_val; ++ uv_cpu_hub_info(cpu)->n_val = m_val; ++ uv_cpu_hub_info(cpu)->numa_blade_id = blade; ++ uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; ++ uv_cpu_hub_info(cpu)->pnode = pnode; ++ uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1; ++ uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; ++ uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; ++ uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; ++ uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; ++ uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; ++ uv_node_to_blade[nid] = blade; ++ uv_cpu_to_blade[cpu] = blade; ++ max_pnode = max(pnode, max_pnode); ++ ++ printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " ++ "lcpu %d, blade %d\n", ++ cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, ++ lcpu, blade); ++ } ++ ++ map_gru_high(max_pnode); ++ map_mmr_high(max_pnode); ++ map_config_high(max_pnode); ++ map_mmioh_high(max_pnode); ++ ++ uv_cpu_init(); 
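/*
 * [Editorial note, not part of the upstream patch.] uv_cpu_init() bails out
 * early while uv_blade_info is still NULL, so the boot CPU's slot is only
 * accounted for by this explicit call, made once the blade tables above have
 * been populated; secondary CPUs take the normal "called on each cpu" path
 * described at uv_cpu_init().
 */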
++ uv_scir_register_cpu_notifier(); ++ proc_mkdir("sgi_uv", NULL); ++} +Index: linux-2.6-tip/arch/x86/kernel/apm_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/apm_32.c ++++ linux-2.6-tip/arch/x86/kernel/apm_32.c +@@ -301,7 +301,7 @@ extern int (*console_blank_hook)(int); + */ + #define APM_ZERO_SEGS + +-#include "apm.h" ++#include + + /* + * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. +@@ -466,7 +466,7 @@ static const lookup_t error_table[] = { + * @err: APM BIOS return code + * + * Write a meaningful log entry to the kernel log in the event of +- * an APM error. ++ * an APM error. Note that this also handles (negative) kernel errors. + */ + + static void apm_error(char *str, int err) +@@ -478,43 +478,14 @@ static void apm_error(char *str, int err + break; + if (i < ERROR_COUNT) + printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); ++ else if (err < 0) ++ printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); + else + printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", + str, err); + } + + /* +- * Lock APM functionality to physical CPU 0 +- */ +- +-#ifdef CONFIG_SMP +- +-static cpumask_t apm_save_cpus(void) +-{ +- cpumask_t x = current->cpus_allowed; +- /* Some bioses don't like being called from CPU != 0 */ +- set_cpus_allowed(current, cpumask_of_cpu(0)); +- BUG_ON(smp_processor_id() != 0); +- return x; +-} +- +-static inline void apm_restore_cpus(cpumask_t mask) +-{ +- set_cpus_allowed(current, mask); +-} +- +-#else +- +-/* +- * No CPU lockdown needed on a uniprocessor +- */ +- +-#define apm_save_cpus() (current->cpus_allowed) +-#define apm_restore_cpus(x) (void)(x) +- +-#endif +- +-/* + * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and + * apm_info.allow_ints, we are being really paranoid here! Not only + * are interrupts disabled, but all the segment registers (except SS) +@@ -568,16 +539,23 @@ static inline void apm_irq_restore(unsig + # define APM_DO_RESTORE_SEGS + #endif + ++struct apm_bios_call { ++ u32 func; ++ /* In and out */ ++ u32 ebx; ++ u32 ecx; ++ /* Out only */ ++ u32 eax; ++ u32 edx; ++ u32 esi; ++ ++ /* Error: -ENOMEM, or bits 8-15 of eax */ ++ int err; ++}; ++ + /** +- * apm_bios_call - Make an APM BIOS 32bit call +- * @func: APM function to execute +- * @ebx_in: EBX register for call entry +- * @ecx_in: ECX register for call entry +- * @eax: EAX register return +- * @ebx: EBX register return +- * @ecx: ECX register return +- * @edx: EDX register return +- * @esi: ESI register return ++ * __apm_bios_call - Make an APM BIOS 32bit call ++ * @_call: pointer to struct apm_bios_call. + * + * Make an APM call using the 32bit protected mode interface. The + * caller is responsible for knowing if APM BIOS is configured and +@@ -586,80 +564,142 @@ static inline void apm_irq_restore(unsig + * flag is loaded into AL. If there is an error, then the error + * code is returned in AH (bits 8-15 of eax) and this function + * returns non-zero. ++ * ++ * Note: this makes the call on the current CPU. 
+ */ +- +-static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, +- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) ++static long __apm_bios_call(void *_call) + { + APM_DECL_SEGS + unsigned long flags; +- cpumask_t cpus; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; +- +- cpus = apm_save_cpus(); ++ struct apm_bios_call *call = _call; + + cpu = get_cpu(); ++ BUG_ON(cpu != 0); + gdt = get_cpu_gdt_table(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + APM_DO_SAVE_SEGS; +- apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); ++ apm_bios_call_asm(call->func, call->ebx, call->ecx, ++ &call->eax, &call->ebx, &call->ecx, &call->edx, ++ &call->esi); + APM_DO_RESTORE_SEGS; + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); +- apm_restore_cpus(cpus); + +- return *eax & 0xff; ++ return call->eax & 0xff; ++} ++ ++/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */ ++static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call) ++{ ++ int ret; ++ ++ /* Don't bother with work_on_cpu in the common case, so we don't ++ * have to worry about OOM or overhead. */ ++ if (get_cpu() == 0) { ++ ret = fn(call); ++ put_cpu(); ++ } else { ++ put_cpu(); ++ ret = work_on_cpu(0, fn, call); ++ } ++ ++ /* work_on_cpu can fail with -ENOMEM */ ++ if (ret < 0) ++ call->err = ret; ++ else ++ call->err = (call->eax >> 8) & 0xff; ++ ++ return ret; + } + + /** +- * apm_bios_call_simple - make a simple APM BIOS 32bit call +- * @func: APM function to invoke +- * @ebx_in: EBX register value for BIOS call +- * @ecx_in: ECX register value for BIOS call +- * @eax: EAX register on return from the BIOS call ++ * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0) ++ * @call: the apm_bios_call registers. ++ * ++ * If there is an error, it is returned in @call.err. ++ */ ++static int apm_bios_call(struct apm_bios_call *call) ++{ ++ return on_cpu0(__apm_bios_call, call); ++} ++ ++/** ++ * __apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0) ++ * @_call: pointer to struct apm_bios_call. + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in AH +- * (bits 8-15 of eax) and this function returns non-zero. This is +- * used for simpler BIOS operations. This call may hold interrupts +- * off for a long time on some laptops. ++ * (bits 8-15 of eax) and this function returns non-zero (it can ++ * also return -ENOMEM). This is used for simpler BIOS operations. ++ * This call may hold interrupts off for a long time on some laptops. ++ * ++ * Note: this makes the call on the current CPU. 
+ */ +- +-static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) ++static long __apm_bios_call_simple(void *_call) + { + u8 error; + APM_DECL_SEGS + unsigned long flags; +- cpumask_t cpus; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; +- +- cpus = apm_save_cpus(); ++ struct apm_bios_call *call = _call; + + cpu = get_cpu(); ++ BUG_ON(cpu != 0); + gdt = get_cpu_gdt_table(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + APM_DO_SAVE_SEGS; +- error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); ++ error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, ++ &call->eax); + APM_DO_RESTORE_SEGS; + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); +- apm_restore_cpus(cpus); + return error; + } + + /** ++ * apm_bios_call_simple - make a simple APM BIOS 32bit call ++ * @func: APM function to invoke ++ * @ebx_in: EBX register value for BIOS call ++ * @ecx_in: ECX register value for BIOS call ++ * @eax: EAX register on return from the BIOS call ++ * @err: bits ++ * ++ * Make a BIOS call that returns one value only, or just status. ++ * If there is an error, then the error code is returned in @err ++ * and this function returns non-zero. This is used for simpler ++ * BIOS operations. This call may hold interrupts off for a long ++ * time on some laptops. ++ */ ++static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax, ++ int *err) ++{ ++ struct apm_bios_call call; ++ int ret; ++ ++ call.func = func; ++ call.ebx = ebx_in; ++ call.ecx = ecx_in; ++ ++ ret = on_cpu0(__apm_bios_call_simple, &call); ++ *eax = call.eax; ++ *err = call.err; ++ return ret; ++} ++ ++/** + * apm_driver_version - APM driver version + * @val: loaded with the APM version on return + * +@@ -678,9 +718,10 @@ static u8 apm_bios_call_simple(u32 func, + static int apm_driver_version(u_short *val) + { + u32 eax; ++ int err; + +- if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) +- return (eax >> 8) & 0xff; ++ if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err)) ++ return err; + *val = eax; + return APM_SUCCESS; + } +@@ -701,22 +742,21 @@ static int apm_driver_version(u_short *v + * that APM 1.2 is in use. If no messges are pending the value 0x80 + * is returned (No power management events pending). 
+ */ +- + static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) + { +- u32 eax; +- u32 ebx; +- u32 ecx; +- u32 dummy; ++ struct apm_bios_call call; + +- if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, +- &dummy, &dummy)) +- return (eax >> 8) & 0xff; +- *event = ebx; ++ call.func = APM_FUNC_GET_EVENT; ++ call.ebx = call.ecx = 0; ++ ++ if (apm_bios_call(&call)) ++ return call.err; ++ ++ *event = call.ebx; + if (apm_info.connection_version < 0x0102) + *info = ~0; /* indicate info not valid */ + else +- *info = ecx; ++ *info = call.ecx; + return APM_SUCCESS; + } + +@@ -737,9 +777,10 @@ static int apm_get_event(apm_event_t *ev + static int set_power_state(u_short what, u_short state) + { + u32 eax; ++ int err; + +- if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) +- return (eax >> 8) & 0xff; ++ if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err)) ++ return err; + return APM_SUCCESS; + } + +@@ -770,6 +811,7 @@ static int apm_do_idle(void) + u8 ret = 0; + int idled = 0; + int polling; ++ int err; + + polling = !!(current_thread_info()->status & TS_POLLING); + if (polling) { +@@ -782,7 +824,7 @@ static int apm_do_idle(void) + } + if (!need_resched()) { + idled = 1; +- ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); ++ ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); + } + if (polling) + current_thread_info()->status |= TS_POLLING; +@@ -797,8 +839,7 @@ static int apm_do_idle(void) + * Only report the failure the first 5 times. + */ + if (++t < 5) { +- printk(KERN_DEBUG "apm_do_idle failed (%d)\n", +- (eax >> 8) & 0xff); ++ printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err); + t = jiffies; + } + return -1; +@@ -816,9 +857,10 @@ static int apm_do_idle(void) + static void apm_do_busy(void) + { + u32 dummy; ++ int err; + + if (clock_slowed || ALWAYS_CALL_BUSY) { +- (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); ++ (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err); + clock_slowed = 0; + } + } +@@ -937,7 +979,7 @@ static void apm_power_off(void) + + /* Some bioses don't like being called from CPU != 0 */ + if (apm_info.realmode_power_off) { +- (void)apm_save_cpus(); ++ set_cpus_allowed_ptr(current, cpumask_of(0)); + machine_real_restart(po_bios_call, sizeof(po_bios_call)); + } else { + (void)set_system_power_state(APM_STATE_OFF); +@@ -956,12 +998,13 @@ static void apm_power_off(void) + static int apm_enable_power_management(int enable) + { + u32 eax; ++ int err; + + if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) + return APM_NOT_ENGAGED; + if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, +- enable, &eax)) +- return (eax >> 8) & 0xff; ++ enable, &eax, &err)) ++ return err; + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISABLED; + else +@@ -986,24 +1029,23 @@ static int apm_enable_power_management(i + + static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) + { +- u32 eax; +- u32 ebx; +- u32 ecx; +- u32 edx; +- u32 dummy; ++ struct apm_bios_call call; ++ ++ call.func = APM_FUNC_GET_STATUS; ++ call.ebx = APM_DEVICE_ALL; ++ call.ecx = 0; + + if (apm_info.get_power_status_broken) + return APM_32_UNSUPPORTED; +- if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, +- &eax, &ebx, &ecx, &edx, &dummy)) +- return (eax >> 8) & 0xff; +- *status = ebx; +- *bat = ecx; ++ if (apm_bios_call(&call)) ++ return call.err; ++ *status = call.ebx; ++ *bat = call.ecx; + if (apm_info.get_power_status_swabinminutes) { +- *life = swab16((u16)edx); ++ *life = 
swab16((u16)call.edx); + *life |= 0x8000; + } else +- *life = edx; ++ *life = call.edx; + return APM_SUCCESS; + } + +@@ -1048,12 +1090,14 @@ static int apm_get_battery_status(u_shor + static int apm_engage_power_management(u_short device, int enable) + { + u32 eax; ++ int err; + + if ((enable == 0) && (device == APM_DEVICE_ALL) + && (apm_info.bios.flags & APM_BIOS_DISABLED)) + return APM_DISABLED; +- if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax)) +- return (eax >> 8) & 0xff; ++ if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, ++ &eax, &err)) ++ return err; + if (device == APM_DEVICE_ALL) { + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; +@@ -1682,16 +1726,14 @@ static int apm(void *unused) + char *power_stat; + char *bat_stat; + +-#ifdef CONFIG_SMP + /* 2002/08/01 - WT + * This is to avoid random crashes at boot time during initialization + * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. + * Some bioses don't like being called from CPU != 0. + * Method suggested by Ingo Molnar. + */ +- set_cpus_allowed(current, cpumask_of_cpu(0)); ++ set_cpus_allowed_ptr(current, cpumask_of(0)); + BUG_ON(smp_processor_id() != 0); +-#endif + + if (apm_info.connection_version == 0) { + apm_info.connection_version = apm_info.bios.version; +Index: linux-2.6-tip/arch/x86/kernel/asm-offsets_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/asm-offsets_32.c ++++ linux-2.6-tip/arch/x86/kernel/asm-offsets_32.c +@@ -75,6 +75,7 @@ void foo(void) + OFFSET(PT_DS, pt_regs, ds); + OFFSET(PT_ES, pt_regs, es); + OFFSET(PT_FS, pt_regs, fs); ++ OFFSET(PT_GS, pt_regs, gs); + OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); + OFFSET(PT_EIP, pt_regs, ip); + OFFSET(PT_CS, pt_regs, cs); +Index: linux-2.6-tip/arch/x86/kernel/asm-offsets_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/asm-offsets_64.c ++++ linux-2.6-tip/arch/x86/kernel/asm-offsets_64.c +@@ -11,7 +11,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -48,16 +47,6 @@ int main(void) + #endif + BLANK(); + #undef ENTRY +-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) +- ENTRY(kernelstack); +- ENTRY(oldrsp); +- ENTRY(pcurrent); +- ENTRY(irqcount); +- ENTRY(cpunumber); +- ENTRY(irqstackptr); +- ENTRY(data_offset); +- BLANK(); +-#undef ENTRY + #ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); +Index: linux-2.6-tip/arch/x86/kernel/check.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/check.c ++++ linux-2.6-tip/arch/x86/kernel/check.c +@@ -83,7 +83,7 @@ void __init setup_bios_corruption_check( + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + +- if (addr == 0) ++ if (!(addr + 1)) + break; + + if (addr >= corruption_check_size) +Index: linux-2.6-tip/arch/x86/kernel/cpu/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/Makefile ++++ linux-2.6-tip/arch/x86/kernel/cpu/Makefile +@@ -1,5 +1,5 @@ + # +-# Makefile for x86-compatible CPU details and quirks ++# Makefile for x86-compatible CPU details, features and quirks + # + + # Don't trace early stages of a secondary CPU boot +@@ -14,19 +14,22 @@ obj-y += vmware.o hypervisor.o + obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o + obj-$(CONFIG_X86_64) += bugs_64.o + ++obj-$(CONFIG_X86_CPU_DEBUG) += 
cpu_debug.o ++ + obj-$(CONFIG_CPU_SUP_INTEL) += intel.o + obj-$(CONFIG_CPU_SUP_AMD) += amd.o + obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o +-obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o +-obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o ++obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o + obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o + obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o + +-obj-$(CONFIG_X86_MCE) += mcheck/ +-obj-$(CONFIG_MTRR) += mtrr/ +-obj-$(CONFIG_CPU_FREQ) += cpufreq/ ++obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ++ ++obj-$(CONFIG_X86_MCE) += mcheck/ ++obj-$(CONFIG_MTRR) += mtrr/ ++obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o ++obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o + + quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ +Index: linux-2.6-tip/arch/x86/kernel/cpu/addon_cpuid_features.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/addon_cpuid_features.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/addon_cpuid_features.c +@@ -7,7 +7,7 @@ + #include + #include + +-#include ++#include + + struct cpuid_bit { + u16 feature; +@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_feat + u32 regs[4]; + const struct cpuid_bit *cb; + +- static const struct cpuid_bit cpuid_bits[] = { ++ static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, + { 0, 0, 0, 0 } + }; +@@ -69,7 +69,7 @@ void __cpuinit init_scattered_cpuid_feat + */ + void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) + { +-#ifdef CONFIG_X86_SMP ++#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; +@@ -116,22 +116,14 @@ void __cpuinit detect_extended_topology( + + core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + +-#ifdef CONFIG_X86_32 +- c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) ++ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) + & core_select_mask; +- c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); ++ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ +- c->apicid = phys_pkg_id(c->initial_apicid, 0); +-#else +- c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; +- c->phys_proc_id = phys_pkg_id(core_plus_mask_width); +- /* +- * Reinit the apicid, now that we have extended initial_apicid. +- */ +- c->apicid = phys_pkg_id(0); +-#endif ++ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); ++ + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + + +@@ -143,37 +135,3 @@ void __cpuinit detect_extended_topology( + return; + #endif + } +- +-#ifdef CONFIG_X86_PAT +-void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) +-{ +- if (!cpu_has_pat) +- pat_disable("PAT not supported by CPU."); +- +- switch (c->x86_vendor) { +- case X86_VENDOR_INTEL: +- /* +- * There is a known erratum on Pentium III and Core Solo +- * and Core Duo CPUs. +- * " Page with PAT set to WC while associated MTRR is UC +- * may consolidate to UC " +- * Because of this erratum, it is better to stick with +- * setting WC in MTRR rather than using PAT on these CPUs. +- * +- * Enable PAT WC only on P4, Core 2 or later CPUs. 
+- */ +- if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15)) +- return; +- +- pat_disable("PAT WC disabled due to known CPU erratum."); +- return; +- +- case X86_VENDOR_AMD: +- case X86_VENDOR_CENTAUR: +- case X86_VENDOR_TRANSMETA: +- return; +- } +- +- pat_disable("PAT disabled. Not yet verified on this CPU type."); +-} +-#endif +Index: linux-2.6-tip/arch/x86/kernel/cpu/amd.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/amd.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/amd.c +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_X86_64 + # include +@@ -12,8 +13,6 @@ + # include + #endif + +-#include +- + #include "cpu.h" + + #ifdef CONFIG_X86_32 +@@ -143,6 +142,55 @@ static void __cpuinit init_amd_k6(struct + } + } + ++static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) ++{ ++#ifdef CONFIG_SMP ++ /* calling is from identify_secondary_cpu() ? */ ++ if (c->cpu_index == boot_cpu_id) ++ return; ++ ++ /* ++ * Certain Athlons might work (for various values of 'work') in SMP ++ * but they are not certified as MP capable. ++ */ ++ /* Athlon 660/661 is valid. */ ++ if ((c->x86_model == 6) && ((c->x86_mask == 0) || ++ (c->x86_mask == 1))) ++ goto valid_k7; ++ ++ /* Duron 670 is valid */ ++ if ((c->x86_model == 7) && (c->x86_mask == 0)) ++ goto valid_k7; ++ ++ /* ++ * Athlon 662, Duron 671, and Athlon >model 7 have capability ++ * bit. It's worth noting that the A5 stepping (662) of some ++ * Athlon XP's have the MP bit set. ++ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for ++ * more. ++ */ ++ if (((c->x86_model == 6) && (c->x86_mask >= 2)) || ++ ((c->x86_model == 7) && (c->x86_mask >= 1)) || ++ (c->x86_model > 7)) ++ if (cpu_has_mp) ++ goto valid_k7; ++ ++ /* If we get here, not a certified SMP capable AMD system. */ ++ ++ /* ++ * Don't taint if we are running SMP kernel on a single non-MP ++ * approved Athlon ++ */ ++ WARN_ONCE(1, "WARNING: This combination of AMD" ++ "processors is not suitable for SMP.\n"); ++ if (!test_taint(TAINT_UNSAFE_SMP)) ++ add_taint(TAINT_UNSAFE_SMP); ++ ++valid_k7: ++ ; ++#endif ++} ++ + static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) + { + u32 l, h; +@@ -177,6 +225,8 @@ static void __cpuinit init_amd_k7(struct + } + + set_cpu_cap(c, X86_FEATURE_K7); ++ ++ amd_k7_smp_check(c); + } + #endif + +@@ -370,6 +420,10 @@ static void __cpuinit init_amd(struct cp + if (c->x86 >= 6) + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + ++ /* Enable Performance counter for K7 and later */ ++ if (c->x86 > 6 && c->x86 <= 0x11) ++ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); ++ + if (!c->x86_model_id[0]) { + switch (c->x86) { + case 0xf: +@@ -452,7 +506,7 @@ static unsigned int __cpuinit amd_size_c + } + #endif + +-static struct cpu_dev amd_cpu_dev __cpuinitdata = { ++static const struct cpu_dev __cpuinitconst amd_cpu_dev = { + .c_vendor = "AMD", + .c_ident = { "AuthenticAMD" }, + #ifdef CONFIG_X86_32 +Index: linux-2.6-tip/arch/x86/kernel/cpu/centaur.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/centaur.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/centaur.c +@@ -1,11 +1,11 @@ ++#include + #include + #include +-#include + + #include +-#include + #include + #include ++#include + + #include "cpu.h" + +@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpu + */ + c->x86_capability[5] = cpuid_edx(0xC0000001); + } +- ++#ifdef CONFIG_X86_32 + /* Cyrix III family needs CX8 & PGE explicitly enabled. 
*/ + if (c->x86_model >= 6 && c->x86_model <= 9) { + rdmsr(MSR_VIA_FCR, lo, hi); +@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpu + /* Before Nehemiah, the C3's had 3dNOW! */ + if (c->x86_model >= 6 && c->x86_model < 9) + set_cpu_cap(c, X86_FEATURE_3DNOW); ++#endif ++ if (c->x86 == 0x6 && c->x86_model >= 0xf) { ++ c->x86_cache_alignment = c->x86_clflush_size * 2; ++ set_cpu_cap(c, X86_FEATURE_REP_GOOD); ++ } + + display_cacheinfo(c); + } +@@ -316,16 +321,25 @@ enum { + static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) + { + switch (c->x86) { ++#ifdef CONFIG_X86_32 + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; ++#endif ++ case 6: ++ if (c->x86_model >= 0xf) ++ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); ++ break; + } ++#ifdef CONFIG_X86_64 ++ set_cpu_cap(c, X86_FEATURE_SYSENTER32); ++#endif + } + + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) + { +- ++#ifdef CONFIG_X86_32 + char *name; + u32 fcr_set = 0; + u32 fcr_clr = 0; +@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struc + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); +- ++#endif ++ early_init_centaur(c); + switch (c->x86) { ++#ifdef CONFIG_X86_32 + case 5: + switch (c->x86_model) { + case 4: +@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struc + } + sprintf(c->x86_model_id, "WinChip %s", name); + break; +- ++#endif + case 6: + init_c3(c); + break; + } ++#ifdef CONFIG_X86_64 ++ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); ++#endif + } + + static unsigned int __cpuinit + centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) + { ++#ifdef CONFIG_X86_32 + /* VIA C3 CPUs (670-68F) need further shifting. */ + if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) + size >>= 8; +@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c + if ((c->x86 == 6) && (c->x86_model == 9) && + (c->x86_mask == 1) && (size == 65)) + size -= 1; +- ++#endif + return size; + } + +-static struct cpu_dev centaur_cpu_dev __cpuinitdata = { ++static const struct cpu_dev __cpuinitconst centaur_cpu_dev = { + .c_vendor = "Centaur", + .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, +Index: linux-2.6-tip/arch/x86/kernel/cpu/centaur_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/centaur_64.c ++++ /dev/null +@@ -1,37 +0,0 @@ +-#include +-#include +- +-#include +-#include +- +-#include "cpu.h" +- +-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +-{ +- if (c->x86 == 0x6 && c->x86_model >= 0xf) +- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +- +- set_cpu_cap(c, X86_FEATURE_SYSENTER32); +-} +- +-static void __cpuinit init_centaur(struct cpuinfo_x86 *c) +-{ +- early_init_centaur(c); +- +- if (c->x86 == 0x6 && c->x86_model >= 0xf) { +- c->x86_cache_alignment = c->x86_clflush_size * 2; +- set_cpu_cap(c, X86_FEATURE_REP_GOOD); +- } +- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +-} +- +-static struct cpu_dev centaur_cpu_dev __cpuinitdata = { +- .c_vendor = "Centaur", +- .c_ident = { "CentaurHauls" }, +- .c_early_init = early_init_centaur, +- .c_init = init_centaur, +- .c_x86_vendor = X86_VENDOR_CENTAUR, +-}; +- +-cpu_dev_register(centaur_cpu_dev); +- +Index: linux-2.6-tip/arch/x86/kernel/cpu/common.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/common.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/common.c +@@ -1,118 +1,118 @@ 
+-#include +-#include +-#include +-#include + #include ++#include + #include ++#include + #include +-#include +-#include ++#include ++#include + #include ++#include ++#include ++#include + #include +-#include +-#include +-#include +-#include +-#include ++#include ++ ++#include ++#include + #include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include + #include ++#include ++#include ++#include + #include ++#include + #include +-#include +-#include + #include ++ + #ifdef CONFIG_X86_LOCAL_APIC +-#include +-#include +-#include +-#include ++#include + #endif + +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- + #include "cpu.h" + +-#ifdef CONFIG_X86_64 +- + /* all of these masks are initialized in setup_cpu_local_masks() */ +-cpumask_var_t cpu_callin_mask; +-cpumask_var_t cpu_callout_mask; + cpumask_var_t cpu_initialized_mask; ++cpumask_var_t cpu_callout_mask; ++cpumask_var_t cpu_callin_mask; + + /* representing cpus for which sibling maps can be computed */ + cpumask_var_t cpu_sibling_setup_mask; + +-#else /* CONFIG_X86_32 */ +- +-cpumask_t cpu_callin_map; +-cpumask_t cpu_callout_map; +-cpumask_t cpu_initialized; +-cpumask_t cpu_sibling_setup_map; +- +-#endif /* CONFIG_X86_32 */ +- ++/* correctly size the local cpu masks */ ++void __init setup_cpu_local_masks(void) ++{ ++ alloc_bootmem_cpumask_var(&cpu_initialized_mask); ++ alloc_bootmem_cpumask_var(&cpu_callin_mask); ++ alloc_bootmem_cpumask_var(&cpu_callout_mask); ++ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); ++} + +-static struct cpu_dev *this_cpu __cpuinitdata; ++static const struct cpu_dev *this_cpu __cpuinitdata; + ++DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { + #ifdef CONFIG_X86_64 +-/* We need valid kernel segments for data and code in long mode too +- * IRET will check the segment types kkeil 2000/10/28 +- * Also sysret mandates a special GDT layout +- */ +-/* The TLS descriptors are currently at a different place compared to i386. +- Hopefully nobody expects them at a fixed place (Wine?) */ +-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { +- [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, +- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, +- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, +- [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, +- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, +- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, +-} }; ++ /* ++ * We need valid kernel segments for data and code in long mode too ++ * IRET will check the segment types kkeil 2000/10/28 ++ * Also sysret mandates a special GDT layout ++ * ++ * TLS descriptors are currently at a different place compared to i386. ++ * Hopefully nobody expects them at a fixed place (Wine?) 
++ */ ++ [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, ++ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, ++ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, ++ [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, ++ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, ++ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + #else +-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +- [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, +- [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, +- [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, +- [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, ++ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, ++ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, ++ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, ++ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + /* 32-bit code */ +- [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, ++ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ +- [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, ++ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + /* 16-bit data */ +- [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, ++ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + /* 16-bit data */ +- [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, ++ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + /* 16-bit data */ +- [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, ++ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + /* 32-bit code */ +- [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, ++ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ +- [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, ++ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + /* data */ +- [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, ++ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + +- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, +- [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, +-} }; ++ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, ++ [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, ++ GDT_STACK_CANARY_INIT + #endif ++} }; + EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + + #ifdef CONFIG_X86_32 +@@ -153,16 +153,17 @@ static inline int flag_is_changeable_p(u + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. 
+ */ +- asm volatile ("pushfl\n\t" +- "pushfl\n\t" +- "popl %0\n\t" +- "movl %0,%1\n\t" +- "xorl %2,%0\n\t" +- "pushl %0\n\t" +- "popfl\n\t" +- "pushfl\n\t" +- "popl %0\n\t" +- "popfl\n\t" ++ asm volatile ("pushfl \n\t" ++ "pushfl \n\t" ++ "popl %0 \n\t" ++ "movl %0, %1 \n\t" ++ "xorl %2, %0 \n\t" ++ "pushl %0 \n\t" ++ "popfl \n\t" ++ "pushfl \n\t" ++ "popl %0 \n\t" ++ "popfl \n\t" ++ + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + +@@ -177,18 +178,22 @@ static int __cpuinit have_cpuid_p(void) + + static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) + { +- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { +- /* Disable processor serial number */ +- unsigned long lo, hi; +- rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); +- lo |= 0x200000; +- wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); +- printk(KERN_NOTICE "CPU serial number disabled.\n"); +- clear_cpu_cap(c, X86_FEATURE_PN); ++ unsigned long lo, hi; + +- /* Disabling the serial number may affect the cpuid level */ +- c->cpuid_level = cpuid_eax(0); +- } ++ if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) ++ return; ++ ++ /* Disable processor serial number: */ ++ ++ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); ++ lo |= 0x200000; ++ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); ++ ++ printk(KERN_NOTICE "CPU serial number disabled.\n"); ++ clear_cpu_cap(c, X86_FEATURE_PN); ++ ++ /* Disabling the serial number may affect the cpuid level */ ++ c->cpuid_level = cpuid_eax(0); + } + + static int __init x86_serial_nr_setup(char *s) +@@ -213,16 +218,64 @@ static inline void squash_the_stupid_ser + #endif + + /* ++ * Some CPU features depend on higher CPUID levels, which may not always ++ * be available due to CPUID level capping or broken virtualization ++ * software. Add those features to this table to auto-disable them. ++ */ ++struct cpuid_dependent_feature { ++ u32 feature; ++ u32 level; ++}; ++ ++static const struct cpuid_dependent_feature __cpuinitconst ++cpuid_dependent_features[] = { ++ { X86_FEATURE_MWAIT, 0x00000005 }, ++ { X86_FEATURE_DCA, 0x00000009 }, ++ { X86_FEATURE_XSAVE, 0x0000000d }, ++ { 0, 0 } ++}; ++ ++static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) ++{ ++ const struct cpuid_dependent_feature *df; ++ ++ for (df = cpuid_dependent_features; df->feature; df++) { ++ ++ if (!cpu_has(c, df->feature)) ++ continue; ++ /* ++ * Note: cpuid_level is set to -1 if unavailable, but ++ * extended_extended_level is set to 0 if unavailable ++ * and the legitimate extended levels are all negative ++ * when signed; hence the weird messing around with ++ * signs here... ++ */ ++ if (!((s32)df->level < 0 ? ++ (u32)df->level > (u32)c->extended_cpuid_level : ++ (s32)df->level > (s32)c->cpuid_level)) ++ continue; ++ ++ clear_cpu_cap(c, df->feature); ++ if (!warn) ++ continue; ++ ++ printk(KERN_WARNING ++ "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", ++ x86_cap_flags[df->feature], df->level); ++ } ++} ++ ++/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; +- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used +- * ++ * in particular, if CPUID levels 0x80000002..4 are supported, this ++ * isn't used + */ + + /* Look up CPU names by table lookup. 
*/ +-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) ++static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) + { +- struct cpu_model_info *info; ++ const struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ +@@ -242,21 +295,34 @@ static char __cpuinit *table_lookup_mode + + __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + +-/* Current gdt points %fs at the "master" per-cpu area: after this, +- * it's on the real one. */ +-void switch_to_new_gdt(void) ++void load_percpu_segment(int cpu) ++{ ++#ifdef CONFIG_X86_32 ++ loadsegment(fs, __KERNEL_PERCPU); ++#else ++ loadsegment(gs, 0); ++ wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); ++#endif ++ load_stack_canary_segment(); ++} ++ ++/* ++ * Current gdt points %fs at the "master" per-cpu area: after this, ++ * it's on the real one. ++ */ ++void switch_to_new_gdt(int cpu) + { + struct desc_ptr gdt_descr; + +- gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); ++ gdt_descr.address = (long)get_cpu_gdt_table(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +-#ifdef CONFIG_X86_32 +- asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +-#endif ++ /* Reload the per-cpu base */ ++ ++ load_percpu_segment(cpu); + } + +-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; ++static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; + + static void __cpuinit default_init(struct cpuinfo_x86 *c) + { +@@ -275,7 +341,7 @@ static void __cpuinit default_init(struc + #endif + } + +-static struct cpu_dev __cpuinitdata default_cpu = { ++static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +@@ -289,22 +355,24 @@ static void __cpuinit get_model_name(str + if (c->extended_cpuid_level < 0x80000004) + return; + +- v = (unsigned int *) c->x86_model_id; ++ v = (unsigned int *)c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + +- /* Intel chips right-justify this string for some dumb reason; +- undo that brain damage */ ++ /* ++ * Intel chips right-justify this string for some dumb reason; ++ * undo that brain damage: ++ */ + p = q = &c->x86_model_id[0]; + while (*p == ' ') +- p++; ++ p++; + if (p != q) { +- while (*p) +- *q++ = *p++; +- while (q <= &c->x86_model_id[48]) +- *q++ = '\0'; /* Zero-pad the rest */ ++ while (*p) ++ *q++ = *p++; ++ while (q <= &c->x86_model_id[48]) ++ *q++ = '\0'; /* Zero-pad the rest */ + } + } + +@@ -373,36 +441,30 @@ void __cpuinit detect_ht(struct cpuinfo_ + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); +- } else if (smp_num_siblings > 1) { ++ goto out; ++ } + +- if (smp_num_siblings > nr_cpu_ids) { +- printk(KERN_WARNING "CPU: Unsupported number of siblings %d", +- smp_num_siblings); +- smp_num_siblings = 1; +- return; +- } ++ if (smp_num_siblings <= 1) ++ goto out; + +- index_msb = get_count_order(smp_num_siblings); +-#ifdef CONFIG_X86_64 +- c->phys_proc_id = phys_pkg_id(index_msb); +-#else +- c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); +-#endif ++ if (smp_num_siblings > nr_cpu_ids) { ++ pr_warning("CPU: Unsupported number of siblings %d", ++ smp_num_siblings); ++ smp_num_siblings = 1; ++ return; ++ } + +- smp_num_siblings = smp_num_siblings / c->x86_max_cores; ++ index_msb = 
get_count_order(smp_num_siblings); ++ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + +- index_msb = get_count_order(smp_num_siblings); ++ smp_num_siblings = smp_num_siblings / c->x86_max_cores; + +- core_bits = get_count_order(c->x86_max_cores); ++ index_msb = get_count_order(smp_num_siblings); + +-#ifdef CONFIG_X86_64 +- c->cpu_core_id = phys_pkg_id(index_msb) & +- ((1 << core_bits) - 1); +-#else +- c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & +- ((1 << core_bits) - 1); +-#endif +- } ++ core_bits = get_count_order(c->x86_max_cores); ++ ++ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & ++ ((1 << core_bits) - 1); + + out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { +@@ -417,8 +479,8 @@ out: + static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) + { + char *v = c->x86_vendor_id; +- int i; + static int printed; ++ int i; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (!cpu_devs[i]) +@@ -427,6 +489,7 @@ static void __cpuinit get_cpu_vendor(str + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { ++ + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; +@@ -435,7 +498,9 @@ static void __cpuinit get_cpu_vendor(str + + if (!printed) { + printed++; +- printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); ++ printk(KERN_ERR ++ "CPU: vendor_id '%s' unknown, using generic init.\n", v); ++ + printk(KERN_ERR "CPU: Your system may be unstable.\n"); + } + +@@ -455,14 +520,17 @@ void __cpuinit cpu_detect(struct cpuinfo + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; ++ + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; ++ + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xf) << 4; ++ + if (cap0 & (1<<19)) { + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; +@@ -478,6 +546,7 @@ static void __cpuinit get_cpu_cap(struct + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; ++ + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; +@@ -486,6 +555,7 @@ static void __cpuinit get_cpu_cap(struct + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; ++ + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); +@@ -493,13 +563,15 @@ static void __cpuinit get_cpu_cap(struct + } + } + +-#ifdef CONFIG_X86_64 + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } ++#ifdef CONFIG_X86_32 ++ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) ++ c->x86_phys_bits = 36; + #endif + + if (c->extended_cpuid_level >= 0x80000007) +@@ -546,8 +618,12 @@ static void __init early_identify_cpu(st + { + #ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; ++ c->x86_phys_bits = 36; ++ c->x86_virt_bits = 48; + #else + c->x86_clflush_size = 32; ++ c->x86_phys_bits = 32; ++ c->x86_virt_bits = 32; + #endif + c->x86_cache_alignment = c->x86_clflush_size; + +@@ -570,21 +646,20 @@ static void __init early_identify_cpu(st + if 
(this_cpu->c_early_init) + this_cpu->c_early_init(c); + +- validate_pat_support(c); +- + #ifdef CONFIG_SMP + c->cpu_index = boot_cpu_id; + #endif ++ filter_cpuid_features(c, false); + } + + void __init early_cpu_init(void) + { +- struct cpu_dev **cdev; ++ const struct cpu_dev *const *cdev; + int count = 0; + +- printk("KERNEL supported cpus:\n"); ++ printk(KERN_INFO "KERNEL supported cpus:\n"); + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { +- struct cpu_dev *cpudev = *cdev; ++ const struct cpu_dev *cpudev = *cdev; + unsigned int j; + + if (count >= X86_VENDOR_NUM) +@@ -595,7 +670,7 @@ void __init early_cpu_init(void) + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; +- printk(" %s %s\n", cpudev->c_vendor, ++ printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } +@@ -637,7 +712,7 @@ static void __cpuinit generic_identify(s + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; + #ifdef CONFIG_X86_32 + # ifdef CONFIG_X86_HT +- c->apicid = phys_pkg_id(c->initial_apicid, 0); ++ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + # else + c->apicid = c->initial_apicid; + # endif +@@ -671,9 +746,13 @@ static void __cpuinit identify_cpu(struc + c->x86_coreid_bits = 0; + #ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; ++ c->x86_phys_bits = 36; ++ c->x86_virt_bits = 48; + #else + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_clflush_size = 32; ++ c->x86_phys_bits = 32; ++ c->x86_virt_bits = 32; + #endif + c->x86_cache_alignment = c->x86_clflush_size; + memset(&c->x86_capability, 0, sizeof c->x86_capability); +@@ -684,7 +763,7 @@ static void __cpuinit identify_cpu(struc + this_cpu->c_identify(c); + + #ifdef CONFIG_X86_64 +- c->apicid = phys_pkg_id(0); ++ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + #endif + + /* +@@ -704,13 +783,16 @@ static void __cpuinit identify_cpu(struc + squash_the_stupid_serial_number(c); + + /* +- * The vendor-specific functions might have changed features. Now +- * we do "generic changes." ++ * The vendor-specific functions might have changed features. ++ * Now we do "generic changes." + */ + ++ /* Filter out anything that depends on CPUID levels we don't have */ ++ filter_cpuid_features(c, true); ++ + /* If the model name is still unset, do table lookup. 
*/ + if (!c->x86_model_id[0]) { +- char *p; ++ const char *p; + p = table_lookup_model(c); + if (p) + strcpy(c->x86_model_id, p); +@@ -766,12 +848,14 @@ static void vgetcpu_set_mode(void) + void __init identify_boot_cpu(void) + { + identify_cpu(&boot_cpu_data); ++ init_c1e_mask(); + #ifdef CONFIG_X86_32 + sysenter_setup(); + enable_sep_cpu(); + #else + vgetcpu_set_mode(); + #endif ++ init_hw_perf_counters(); + } + + void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +@@ -785,11 +869,11 @@ void __cpuinit identify_secondary_cpu(st + } + + struct msr_range { +- unsigned min; +- unsigned max; ++ unsigned min; ++ unsigned max; + }; + +-static struct msr_range msr_range_array[] __cpuinitdata = { ++static const struct msr_range msr_range_array[] __cpuinitconst = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, +@@ -798,14 +882,15 @@ static struct msr_range msr_range_array[ + + static void __cpuinit print_cpu_msr(void) + { ++ unsigned index_min, index_max; + unsigned index; + u64 val; + int i; +- unsigned index_min, index_max; + + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; ++ + for (index = index_min; index < index_max; index++) { + if (rdmsrl_amd_safe(index, &val)) + continue; +@@ -815,6 +900,7 @@ static void __cpuinit print_cpu_msr(void + } + + static int show_msr __cpuinitdata; ++ + static __init int setup_show_msr(char *arg) + { + int num; +@@ -836,12 +922,14 @@ __setup("noclflush", setup_noclflush); + + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) + { +- char *vendor = NULL; ++ const char *vendor = NULL; + +- if (c->x86_vendor < X86_VENDOR_NUM) ++ if (c->x86_vendor < X86_VENDOR_NUM) { + vendor = this_cpu->c_vendor; +- else if (c->cpuid_level >= 0) +- vendor = c->x86_vendor_id; ++ } else { ++ if (c->cpuid_level >= 0) ++ vendor = c->x86_vendor_id; ++ } + + if (vendor && !strstr(c->x86_model_id, vendor)) + printk(KERN_CONT "%s ", vendor); +@@ -868,65 +956,47 @@ void __cpuinit print_cpu_info(struct cpu + static __init int setup_disablecpuid(char *arg) + { + int bit; ++ + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; ++ + return 1; + } + __setup("clearcpuid=", setup_disablecpuid); + + #ifdef CONFIG_X86_64 +-struct x8664_pda **_cpu_pda __read_mostly; +-EXPORT_SYMBOL(_cpu_pda); +- + struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; + +-static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; ++DEFINE_PER_CPU_FIRST(union irq_stack_union, ++ irq_stack_union) __aligned(PAGE_SIZE); + +-void __cpuinit pda_init(int cpu) +-{ +- struct x8664_pda *pda = cpu_pda(cpu); ++DEFINE_PER_CPU(char *, irq_stack_ptr) = ++ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + +- /* Setup up data that may be needed in __get_free_pages early */ +- loadsegment(fs, 0); +- loadsegment(gs, 0); +- /* Memory clobbers used to order PDA accessed */ +- mb(); +- wrmsrl(MSR_GS_BASE, pda); +- mb(); +- +- pda->cpunumber = cpu; +- pda->irqcount = -1; +- pda->kernelstack = (unsigned long)stack_thread_info() - +- PDA_STACKOFFSET + THREAD_SIZE; +- pda->active_mm = &init_mm; +- pda->mmu_state = 0; +- +- if (cpu == 0) { +- /* others are initialized in smpboot.c */ +- pda->pcurrent = &init_task; +- pda->irqstackptr = boot_cpu_stack; +- pda->irqstackptr += IRQSTACKSIZE - 64; +- } else { +- if (!pda->irqstackptr) { +- pda->irqstackptr = (char *) +- __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); +- if 
(!pda->irqstackptr) +- panic("cannot allocate irqstack for cpu %d", +- cpu); +- pda->irqstackptr += IRQSTACKSIZE - 64; +- } ++DEFINE_PER_CPU(unsigned long, kernel_stack) = ++ (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; ++EXPORT_PER_CPU_SYMBOL(kernel_stack); + +- if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) +- pda->nodenumber = cpu_to_node(cpu); +- } +-} ++DEFINE_PER_CPU(unsigned int, irq_count) = -1; + +-static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + +- DEBUG_STKSZ] __page_aligned_bss; ++/* ++ * Special IST stacks which the CPU switches to when it calls ++ * an IST-marked descriptor entry. Up to 7 stacks (hardware ++ * limit), all of them are 4K, except the debug stack which ++ * is 8K. ++ */ ++static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { ++ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, ++#if DEBUG_STACK > 0 ++ [DEBUG_STACK - 1] = DEBUG_STKSZ ++#endif ++}; + +-extern asmlinkage void ignore_sysret(void); ++static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks ++ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) ++ __aligned(PAGE_SIZE); + + /* May not be marked __init: used by software suspend */ + void syscall_init(void) +@@ -957,16 +1027,38 @@ unsigned long kernel_eflags; + */ + DEFINE_PER_CPU(struct orig_ist, orig_ist); + +-#else ++#else /* CONFIG_X86_64 */ ++ ++#ifdef CONFIG_CC_STACKPROTECTOR ++DEFINE_PER_CPU(unsigned long, stack_canary); ++#endif + +-/* Make sure %fs is initialized properly in idle threads */ ++/* Make sure %fs and %gs are initialized properly in idle threads */ + struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) + { + memset(regs, 0, sizeof(struct pt_regs)); + regs->fs = __KERNEL_PERCPU; ++ regs->gs = __KERNEL_STACK_CANARY; ++ + return regs; + } +-#endif ++#endif /* CONFIG_X86_64 */ ++ ++/* ++ * Clear all 6 debug registers: ++ */ ++static void clear_all_debug_regs(void) ++{ ++ int i; ++ ++ for (i = 0; i < 8; i++) { ++ /* Ignore db4, db5 */ ++ if ((i == 4) || (i == 5)) ++ continue; ++ ++ set_debugreg(0, i); ++ } ++} + + /* + * cpu_init() initializes state that is per-CPU. 
Some data is already +@@ -976,21 +1068,25 @@ struct pt_regs * __cpuinit idle_regs(str + * A lot of state is already set up in PDA init for 64 bit + */ + #ifdef CONFIG_X86_64 ++ + void __cpuinit cpu_init(void) + { +- int cpu = stack_smp_processor_id(); +- struct tss_struct *t = &per_cpu(init_tss, cpu); +- struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); +- unsigned long v; +- char *estacks = NULL; ++ struct orig_ist *orig_ist; + struct task_struct *me; ++ struct tss_struct *t; ++ unsigned long v; ++ int cpu; + int i; + +- /* CPU 0 is initialised in head64.c */ +- if (cpu != 0) +- pda_init(cpu); +- else +- estacks = boot_exception_stacks; ++ cpu = stack_smp_processor_id(); ++ t = &per_cpu(init_tss, cpu); ++ orig_ist = &per_cpu(orig_ist, cpu); ++ ++#ifdef CONFIG_NUMA ++ if (cpu != 0 && percpu_read(node_number) == 0 && ++ cpu_to_node(cpu) != NUMA_NO_NODE) ++ percpu_write(node_number, cpu_to_node(cpu)); ++#endif + + me = current; + +@@ -1006,7 +1102,9 @@ void __cpuinit cpu_init(void) + * and set up the GDT descriptor: + */ + +- switch_to_new_gdt(); ++ switch_to_new_gdt(cpu); ++ loadsegment(fs, 0); ++ + load_idt((const struct desc_ptr *)&idt_descr); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); +@@ -1017,31 +1115,24 @@ void __cpuinit cpu_init(void) + barrier(); + + check_efer(); +- if (cpu != 0 && x2apic) ++ if (cpu != 0) + enable_x2apic(); + + /* + * set up and load the per-CPU TSS + */ + if (!orig_ist->ist[0]) { +- static const unsigned int order[N_EXCEPTION_STACKS] = { +- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, +- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER +- }; ++ char *estacks = per_cpu(exception_stacks, cpu); ++ + for (v = 0; v < N_EXCEPTION_STACKS; v++) { +- if (cpu) { +- estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); +- if (!estacks) +- panic("Cannot allocate exception " +- "stack %ld %d\n", v, cpu); +- } +- estacks += PAGE_SIZE << order[v]; ++ estacks += exception_stack_sizes[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; + } + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); ++ + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. +@@ -1051,8 +1142,7 @@ void __cpuinit cpu_init(void) + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; +- if (me->mm) +- BUG(); ++ BUG_ON(me->mm); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, ¤t->thread); +@@ -1069,22 +1159,9 @@ void __cpuinit cpu_init(void) + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); +- else { +-#endif +- /* +- * Clear all 6 debug registers: +- */ +- +- set_debugreg(0UL, 0); +- set_debugreg(0UL, 1); +- set_debugreg(0UL, 2); +- set_debugreg(0UL, 3); +- set_debugreg(0UL, 6); +- set_debugreg(0UL, 7); +-#ifdef CONFIG_KGDB +- /* If the kgdb is connected no debug regs should be altered. 
*/ +- } ++ else + #endif ++ clear_all_debug_regs(); + + fpu_init(); + +@@ -1105,7 +1182,8 @@ void __cpuinit cpu_init(void) + + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); +- for (;;) local_irq_enable(); ++ for (;;) ++ local_irq_enable(); + } + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); +@@ -1114,15 +1192,14 @@ void __cpuinit cpu_init(void) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + load_idt(&idt_descr); +- switch_to_new_gdt(); ++ switch_to_new_gdt(cpu); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + curr->active_mm = &init_mm; +- if (curr->mm) +- BUG(); ++ BUG_ON(curr->mm); + enter_lazy_tlb(&init_mm, curr); + + load_sp0(t, thread); +@@ -1130,21 +1207,14 @@ void __cpuinit cpu_init(void) + load_TR_desc(); + load_LDT(&init_mm.context); + ++ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); ++ + #ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); + #endif + +- /* Clear %gs. */ +- asm volatile ("mov %0, %%gs" : : "r" (0)); +- +- /* Clear all 6 debug registers: */ +- set_debugreg(0, 0); +- set_debugreg(0, 1); +- set_debugreg(0, 2); +- set_debugreg(0, 3); +- set_debugreg(0, 6); +- set_debugreg(0, 7); ++ clear_all_debug_regs(); + + /* + * Force FPU initialization: +@@ -1164,6 +1234,4 @@ void __cpuinit cpu_init(void) + + xsave_init(); + } +- +- + #endif +Index: linux-2.6-tip/arch/x86/kernel/cpu/cpu.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/cpu.h ++++ linux-2.6-tip/arch/x86/kernel/cpu/cpu.h +@@ -3,33 +3,34 @@ + #define ARCH_X86_CPU_H + + struct cpu_model_info { +- int vendor; +- int family; +- char *model_names[16]; ++ int vendor; ++ int family; ++ const char *model_names[16]; + }; + + /* attempt to consolidate cpu attributes */ + struct cpu_dev { +- char * c_vendor; ++ const char *c_vendor; + + /* some have two possibilities for cpuid string */ +- char * c_ident[2]; ++ const char *c_ident[2]; + + struct cpu_model_info c_models[4]; + +- void (*c_early_init)(struct cpuinfo_x86 *c); +- void (*c_init)(struct cpuinfo_x86 * c); +- void (*c_identify)(struct cpuinfo_x86 * c); +- unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); +- int c_x86_vendor; ++ void (*c_early_init)(struct cpuinfo_x86 *); ++ void (*c_init)(struct cpuinfo_x86 *); ++ void (*c_identify)(struct cpuinfo_x86 *); ++ unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); ++ int c_x86_vendor; + }; + + #define cpu_dev_register(cpu_devX) \ +- static struct cpu_dev *__cpu_dev_##cpu_devX __used \ ++ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ + __attribute__((__section__(".x86_cpu_dev.init"))) = \ + &cpu_devX; + +-extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; ++extern const struct cpu_dev *const __x86_cpu_dev_start[], ++ *const __x86_cpu_dev_end[]; + + extern void display_cacheinfo(struct cpuinfo_x86 *c); + +Index: linux-2.6-tip/arch/x86/kernel/cpu/cpu_debug.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/cpu/cpu_debug.c +@@ -0,0 +1,901 @@ ++/* ++ * CPU x86 architecture debug code ++ * ++ * Copyright(C) 2009 Jaswinder Singh Rajput ++ * ++ * For licencing details see kernel-base/COPYING ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include 
++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); ++static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); ++static DEFINE_PER_CPU(unsigned, cpu_modelflag); ++static DEFINE_PER_CPU(int, cpu_priv_count); ++static DEFINE_PER_CPU(unsigned, cpu_model); ++ ++static DEFINE_MUTEX(cpu_debug_lock); ++ ++static struct dentry *cpu_debugfs_dir; ++ ++static struct cpu_debug_base cpu_base[] = { ++ { "mc", CPU_MC, 0 }, ++ { "monitor", CPU_MONITOR, 0 }, ++ { "time", CPU_TIME, 0 }, ++ { "pmc", CPU_PMC, 1 }, ++ { "platform", CPU_PLATFORM, 0 }, ++ { "apic", CPU_APIC, 0 }, ++ { "poweron", CPU_POWERON, 0 }, ++ { "control", CPU_CONTROL, 0 }, ++ { "features", CPU_FEATURES, 0 }, ++ { "lastbranch", CPU_LBRANCH, 0 }, ++ { "bios", CPU_BIOS, 0 }, ++ { "freq", CPU_FREQ, 0 }, ++ { "mtrr", CPU_MTRR, 0 }, ++ { "perf", CPU_PERF, 0 }, ++ { "cache", CPU_CACHE, 0 }, ++ { "sysenter", CPU_SYSENTER, 0 }, ++ { "therm", CPU_THERM, 0 }, ++ { "misc", CPU_MISC, 0 }, ++ { "debug", CPU_DEBUG, 0 }, ++ { "pat", CPU_PAT, 0 }, ++ { "vmx", CPU_VMX, 0 }, ++ { "call", CPU_CALL, 0 }, ++ { "base", CPU_BASE, 0 }, ++ { "ver", CPU_VER, 0 }, ++ { "conf", CPU_CONF, 0 }, ++ { "smm", CPU_SMM, 0 }, ++ { "svm", CPU_SVM, 0 }, ++ { "osvm", CPU_OSVM, 0 }, ++ { "tss", CPU_TSS, 0 }, ++ { "cr", CPU_CR, 0 }, ++ { "dt", CPU_DT, 0 }, ++ { "registers", CPU_REG_ALL, 0 }, ++}; ++ ++static struct cpu_file_base cpu_file[] = { ++ { "index", CPU_REG_ALL, 0 }, ++ { "value", CPU_REG_ALL, 1 }, ++}; ++ ++/* Intel Registers Range */ ++static struct cpu_debug_range cpu_intel_range[] = { ++ { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, ++ { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, ++ { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, ++ { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, ++ { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, ++ { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, ++ ++ { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, ++ { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, ++ { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, ++ { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, ++ ++ { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, ++ { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, ++ { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, ++ { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, ++ ++ { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, ++ { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, ++ { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, ++ { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, ++ ++ { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, ++ { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, ++ { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, ++ { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, ++ ++ { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, ++ { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, ++ { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, ++ { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, ++ { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, ++ ++ { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, ++ { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, ++ { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, ++ { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, ++ { 0x00000198, 0x00000199, 
CPU_PERF, CPU_PM_CX_AT_XE }, ++ { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, ++ { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, ++ { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, ++ ++ { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, ++ { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, ++ { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, ++ { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, ++ { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, ++ { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, ++ { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, ++ { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, ++ ++ { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, ++ { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, ++ { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, ++ { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, ++ { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, ++ { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, ++ ++ { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, ++ { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, ++ { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, ++ { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, ++ { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, ++ { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, ++ { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, ++ { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, ++ { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, ++ { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, ++ { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, ++ { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, ++ ++ { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, ++ { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, ++ { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, ++ { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, ++ { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, ++ { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, ++ { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, ++ { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, ++ { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, ++ { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, ++ { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, ++ ++ { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, ++ { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, ++ { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, ++ ++ { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, ++ ++ { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, ++ { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, ++ { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, ++ { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, ++}; ++ ++/* AMD Registers Range */ ++static struct cpu_debug_range cpu_amd_range[] = { ++ { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, ++ { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, ++ { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, ++ { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, ++ { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, ++ { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, ++ ++ { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, ++ { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, ++ { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, ++ { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, ++ ++ { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, ++ { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, ++ { 0x00000258, 0x00000259, CPU_MTRR, 
CPU_K8_PLUS, }, ++ { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, ++ { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, ++ { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, ++ ++ { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, ++ ++ { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, ++ { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, ++ { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, ++ { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, ++ ++ { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, ++ { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, ++ { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, ++ { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, ++ { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, ++ { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, ++ { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, ++ { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, ++ { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, ++ { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, ++ { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, ++ { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, ++ { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, ++ { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, ++ { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, ++ { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, ++ { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, ++ { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, ++}; ++ ++ ++/* Intel */ ++static int get_intel_modelflag(unsigned model) ++{ ++ int flag; ++ ++ switch (model) { ++ case 0x0501: ++ case 0x0502: ++ case 0x0504: ++ flag = CPU_INTEL_PENTIUM; ++ break; ++ case 0x0601: ++ case 0x0603: ++ case 0x0605: ++ case 0x0607: ++ case 0x0608: ++ case 0x060A: ++ case 0x060B: ++ flag = CPU_INTEL_P6; ++ break; ++ case 0x0609: ++ case 0x060D: ++ flag = CPU_INTEL_PENTIUM_M; ++ break; ++ case 0x060E: ++ flag = CPU_INTEL_CORE; ++ break; ++ case 0x060F: ++ case 0x0617: ++ flag = CPU_INTEL_CORE2; ++ break; ++ case 0x061C: ++ flag = CPU_INTEL_ATOM; ++ break; ++ case 0x0F00: ++ case 0x0F01: ++ case 0x0F02: ++ case 0x0F03: ++ case 0x0F04: ++ flag = CPU_INTEL_XEON_P4; ++ break; ++ case 0x0F06: ++ flag = CPU_INTEL_XEON_MP; ++ break; ++ default: ++ flag = CPU_NONE; ++ break; ++ } ++ ++ return flag; ++} ++ ++/* AMD */ ++static int get_amd_modelflag(unsigned model) ++{ ++ int flag; ++ ++ switch (model >> 8) { ++ case 0x6: ++ flag = CPU_AMD_K6; ++ break; ++ case 0x7: ++ flag = CPU_AMD_K7; ++ break; ++ case 0x8: ++ flag = CPU_AMD_K8; ++ break; ++ case 0xf: ++ flag = CPU_AMD_0F; ++ break; ++ case 0x10: ++ flag = CPU_AMD_10; ++ break; ++ case 0x11: ++ flag = CPU_AMD_11; ++ break; ++ default: ++ flag = CPU_NONE; ++ break; ++ } ++ ++ return flag; ++} ++ ++static int get_cpu_modelflag(unsigned cpu) ++{ ++ int flag; ++ ++ flag = per_cpu(cpu_model, cpu); ++ ++ switch (flag >> 16) { ++ case X86_VENDOR_INTEL: ++ flag = get_intel_modelflag(flag); ++ break; ++ case X86_VENDOR_AMD: ++ flag = get_amd_modelflag(flag & 0xffff); ++ break; ++ default: ++ flag = CPU_NONE; ++ break; ++ } ++ ++ return flag; ++} ++ ++static int get_cpu_range_count(unsigned cpu) ++{ ++ int index; ++ ++ switch (per_cpu(cpu_model, cpu) >> 16) { ++ case X86_VENDOR_INTEL: ++ index = ARRAY_SIZE(cpu_intel_range); ++ break; ++ case X86_VENDOR_AMD: ++ index = ARRAY_SIZE(cpu_amd_range); ++ break; ++ default: ++ index = 0; ++ break; ++ } ++ ++ return index; ++} ++ ++static int is_typeflag_valid(unsigned cpu, unsigned flag) ++{ ++ unsigned vendor, modelflag; ++ int i, index; ++ ++ /* Standard 
Registers should be always valid */ ++ if (flag >= CPU_TSS) ++ return 1; ++ ++ modelflag = per_cpu(cpu_modelflag, cpu); ++ vendor = per_cpu(cpu_model, cpu) >> 16; ++ index = get_cpu_range_count(cpu); ++ ++ for (i = 0; i < index; i++) { ++ switch (vendor) { ++ case X86_VENDOR_INTEL: ++ if ((cpu_intel_range[i].model & modelflag) && ++ (cpu_intel_range[i].flag & flag)) ++ return 1; ++ break; ++ case X86_VENDOR_AMD: ++ if ((cpu_amd_range[i].model & modelflag) && ++ (cpu_amd_range[i].flag & flag)) ++ return 1; ++ break; ++ } ++ } ++ ++ /* Invalid */ ++ return 0; ++} ++ ++static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, ++ int index, unsigned flag) ++{ ++ unsigned modelflag; ++ ++ modelflag = per_cpu(cpu_modelflag, cpu); ++ *max = 0; ++ switch (per_cpu(cpu_model, cpu) >> 16) { ++ case X86_VENDOR_INTEL: ++ if ((cpu_intel_range[index].model & modelflag) && ++ (cpu_intel_range[index].flag & flag)) { ++ *min = cpu_intel_range[index].min; ++ *max = cpu_intel_range[index].max; ++ } ++ break; ++ case X86_VENDOR_AMD: ++ if ((cpu_amd_range[index].model & modelflag) && ++ (cpu_amd_range[index].flag & flag)) { ++ *min = cpu_amd_range[index].min; ++ *max = cpu_amd_range[index].max; ++ } ++ break; ++ } ++ ++ return *max; ++} ++ ++/* This function can also be called with seq = NULL for printk */ ++static void print_cpu_data(struct seq_file *seq, unsigned type, ++ u32 low, u32 high) ++{ ++ struct cpu_private *priv; ++ u64 val = high; ++ ++ if (seq) { ++ priv = seq->private; ++ if (priv->file) { ++ val = (val << 32) | low; ++ seq_printf(seq, "0x%llx\n", val); ++ } else ++ seq_printf(seq, " %08x: %08x_%08x\n", ++ type, high, low); ++ } else ++ printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); ++} ++ ++/* This function can also be called with seq = NULL for printk */ ++static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) ++{ ++ unsigned msr, msr_min, msr_max; ++ struct cpu_private *priv; ++ u32 low, high; ++ int i, range; ++ ++ if (seq) { ++ priv = seq->private; ++ if (priv->file) { ++ if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, ++ &low, &high)) ++ print_cpu_data(seq, priv->reg, low, high); ++ return; ++ } ++ } ++ ++ range = get_cpu_range_count(cpu); ++ ++ for (i = 0; i < range; i++) { ++ if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) ++ continue; ++ ++ for (msr = msr_min; msr <= msr_max; msr++) { ++ if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) ++ continue; ++ print_cpu_data(seq, msr, low, high); ++ } ++ } ++} ++ ++static void print_tss(void *arg) ++{ ++ struct pt_regs *regs = task_pt_regs(current); ++ struct seq_file *seq = arg; ++ unsigned int seg; ++ ++ seq_printf(seq, " RAX\t: %016lx\n", regs->ax); ++ seq_printf(seq, " RBX\t: %016lx\n", regs->bx); ++ seq_printf(seq, " RCX\t: %016lx\n", regs->cx); ++ seq_printf(seq, " RDX\t: %016lx\n", regs->dx); ++ ++ seq_printf(seq, " RSI\t: %016lx\n", regs->si); ++ seq_printf(seq, " RDI\t: %016lx\n", regs->di); ++ seq_printf(seq, " RBP\t: %016lx\n", regs->bp); ++ seq_printf(seq, " ESP\t: %016lx\n", regs->sp); ++ ++#ifdef CONFIG_X86_64 ++ seq_printf(seq, " R08\t: %016lx\n", regs->r8); ++ seq_printf(seq, " R09\t: %016lx\n", regs->r9); ++ seq_printf(seq, " R10\t: %016lx\n", regs->r10); ++ seq_printf(seq, " R11\t: %016lx\n", regs->r11); ++ seq_printf(seq, " R12\t: %016lx\n", regs->r12); ++ seq_printf(seq, " R13\t: %016lx\n", regs->r13); ++ seq_printf(seq, " R14\t: %016lx\n", regs->r14); ++ seq_printf(seq, " R15\t: %016lx\n", regs->r15); ++#endif ++ ++ asm("movl %%cs,%0" : "=r" (seg)); ++ seq_printf(seq, " CS\t: 
%04x\n", seg); ++ asm("movl %%ds,%0" : "=r" (seg)); ++ seq_printf(seq, " DS\t: %04x\n", seg); ++ seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); ++ asm("movl %%es,%0" : "=r" (seg)); ++ seq_printf(seq, " ES\t: %04x\n", seg); ++ asm("movl %%fs,%0" : "=r" (seg)); ++ seq_printf(seq, " FS\t: %04x\n", seg); ++ asm("movl %%gs,%0" : "=r" (seg)); ++ seq_printf(seq, " GS\t: %04x\n", seg); ++ ++ seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); ++ ++ seq_printf(seq, " EIP\t: %016lx\n", regs->ip); ++} ++ ++static void print_cr(void *arg) ++{ ++ struct seq_file *seq = arg; ++ ++ seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); ++ seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); ++ seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); ++ seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); ++#ifdef CONFIG_X86_64 ++ seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); ++#endif ++} ++ ++static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) ++{ ++ seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); ++} ++ ++static void print_dt(void *seq) ++{ ++ struct desc_ptr dt; ++ unsigned long ldt; ++ ++ /* IDT */ ++ store_idt((struct desc_ptr *)&dt); ++ print_desc_ptr("IDT", seq, dt); ++ ++ /* GDT */ ++ store_gdt((struct desc_ptr *)&dt); ++ print_desc_ptr("GDT", seq, dt); ++ ++ /* LDT */ ++ store_ldt(ldt); ++ seq_printf(seq, " LDT\t: %016lx\n", ldt); ++ ++ /* TR */ ++ store_tr(ldt); ++ seq_printf(seq, " TR\t: %016lx\n", ldt); ++} ++ ++static void print_dr(void *arg) ++{ ++ struct seq_file *seq = arg; ++ unsigned long dr; ++ int i; ++ ++ for (i = 0; i < 8; i++) { ++ /* Ignore db4, db5 */ ++ if ((i == 4) || (i == 5)) ++ continue; ++ get_debugreg(dr, i); ++ seq_printf(seq, " dr%d\t: %016lx\n", i, dr); ++ } ++ ++ seq_printf(seq, "\n MSR\t:\n"); ++} ++ ++static void print_apic(void *arg) ++{ ++ struct seq_file *seq = arg; ++ ++#ifdef CONFIG_X86_LOCAL_APIC ++ seq_printf(seq, " LAPIC\t:\n"); ++ seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); ++ seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR)); ++ seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); ++ seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); ++ seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); ++ seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); ++ seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); ++ seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); ++ seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); ++ seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); ++ seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); ++ seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); ++ seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); ++ seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); ++ seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); ++ seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); ++ seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); ++ seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); ++ seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); ++ seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); ++ seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); ++#endif /* CONFIG_X86_LOCAL_APIC */ ++ ++ seq_printf(seq, "\n MSR\t:\n"); ++} ++ ++static int cpu_seq_show(struct seq_file *seq, void *v) ++{ ++ struct cpu_private *priv = seq->private; ++ ++ if (priv == NULL) ++ return -EINVAL; ++ ++ switch (cpu_base[priv->type].flag) { ++ 
case CPU_TSS: ++ smp_call_function_single(priv->cpu, print_tss, seq, 1); ++ break; ++ case CPU_CR: ++ smp_call_function_single(priv->cpu, print_cr, seq, 1); ++ break; ++ case CPU_DT: ++ smp_call_function_single(priv->cpu, print_dt, seq, 1); ++ break; ++ case CPU_DEBUG: ++ if (priv->file == CPU_INDEX_BIT) ++ smp_call_function_single(priv->cpu, print_dr, seq, 1); ++ print_msr(seq, priv->cpu, cpu_base[priv->type].flag); ++ break; ++ case CPU_APIC: ++ if (priv->file == CPU_INDEX_BIT) ++ smp_call_function_single(priv->cpu, print_apic, seq, 1); ++ print_msr(seq, priv->cpu, cpu_base[priv->type].flag); ++ break; ++ ++ default: ++ print_msr(seq, priv->cpu, cpu_base[priv->type].flag); ++ break; ++ } ++ seq_printf(seq, "\n"); ++ ++ return 0; ++} ++ ++static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) ++{ ++ if (*pos == 0) /* One time is enough ;-) */ ++ return seq; ++ ++ return NULL; ++} ++ ++static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ (*pos)++; ++ ++ return cpu_seq_start(seq, pos); ++} ++ ++static void cpu_seq_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static const struct seq_operations cpu_seq_ops = { ++ .start = cpu_seq_start, ++ .next = cpu_seq_next, ++ .stop = cpu_seq_stop, ++ .show = cpu_seq_show, ++}; ++ ++static int cpu_seq_open(struct inode *inode, struct file *file) ++{ ++ struct cpu_private *priv = inode->i_private; ++ struct seq_file *seq; ++ int err; ++ ++ err = seq_open(file, &cpu_seq_ops); ++ if (!err) { ++ seq = file->private_data; ++ seq->private = priv; ++ } ++ ++ return err; ++} ++ ++static int write_msr(struct cpu_private *priv, u64 val) ++{ ++ u32 low, high; ++ ++ high = (val >> 32) & 0xffffffff; ++ low = val & 0xffffffff; ++ ++ if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) ++ return 0; ++ ++ return -EPERM; ++} ++ ++static int write_cpu_register(struct cpu_private *priv, const char *buf) ++{ ++ int ret = -EPERM; ++ u64 val; ++ ++ ret = strict_strtoull(buf, 0, &val); ++ if (ret < 0) ++ return ret; ++ ++ /* Supporting only MSRs */ ++ if (priv->type < CPU_TSS_BIT) ++ return write_msr(priv, val); ++ ++ return ret; ++} ++ ++static ssize_t cpu_write(struct file *file, const char __user *ubuf, ++ size_t count, loff_t *off) ++{ ++ struct seq_file *seq = file->private_data; ++ struct cpu_private *priv = seq->private; ++ char buf[19]; ++ ++ if ((priv == NULL) || (count >= sizeof(buf))) ++ return -EINVAL; ++ ++ if (copy_from_user(&buf, ubuf, count)) ++ return -EFAULT; ++ ++ buf[count] = 0; ++ ++ if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) ++ if (!write_cpu_register(priv, buf)) ++ return count; ++ ++ return -EACCES; ++} ++ ++static const struct file_operations cpu_fops = { ++ .owner = THIS_MODULE, ++ .open = cpu_seq_open, ++ .read = seq_read, ++ .write = cpu_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, ++ unsigned file, struct dentry *dentry) ++{ ++ struct cpu_private *priv = NULL; ++ ++ /* Already intialized */ ++ if (file == CPU_INDEX_BIT) ++ if (per_cpu(cpu_arr[type].init, cpu)) ++ return 0; ++ ++ priv = kzalloc(sizeof(*priv), GFP_KERNEL); ++ if (priv == NULL) ++ return -ENOMEM; ++ ++ priv->cpu = cpu; ++ priv->type = type; ++ priv->reg = reg; ++ priv->file = file; ++ mutex_lock(&cpu_debug_lock); ++ per_cpu(priv_arr[type], cpu) = priv; ++ per_cpu(cpu_priv_count, cpu)++; ++ mutex_unlock(&cpu_debug_lock); ++ ++ if (file) ++ debugfs_create_file(cpu_file[file].name, S_IRUGO, ++ dentry, (void *)priv, &cpu_fops); ++ else { 
++ debugfs_create_file(cpu_base[type].name, S_IRUGO, ++ per_cpu(cpu_arr[type].dentry, cpu), ++ (void *)priv, &cpu_fops); ++ mutex_lock(&cpu_debug_lock); ++ per_cpu(cpu_arr[type].init, cpu) = 1; ++ mutex_unlock(&cpu_debug_lock); ++ } ++ ++ return 0; ++} ++ ++static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, ++ struct dentry *dentry) ++{ ++ unsigned file; ++ int err = 0; ++ ++ for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { ++ err = cpu_create_file(cpu, type, reg, file, dentry); ++ if (err) ++ return err; ++ } ++ ++ return err; ++} ++ ++static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) ++{ ++ struct dentry *cpu_dentry = NULL; ++ unsigned reg, reg_min, reg_max; ++ int i, range, err = 0; ++ char reg_dir[12]; ++ u32 low, high; ++ ++ range = get_cpu_range_count(cpu); ++ ++ for (i = 0; i < range; i++) { ++ if (!get_cpu_range(cpu, ®_min, ®_max, i, ++ cpu_base[type].flag)) ++ continue; ++ ++ for (reg = reg_min; reg <= reg_max; reg++) { ++ if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) ++ continue; ++ ++ sprintf(reg_dir, "0x%x", reg); ++ cpu_dentry = debugfs_create_dir(reg_dir, dentry); ++ err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); ++ if (err) ++ return err; ++ } ++ } ++ ++ return err; ++} ++ ++static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) ++{ ++ struct dentry *cpu_dentry = NULL; ++ unsigned type; ++ int err = 0; ++ ++ for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { ++ if (!is_typeflag_valid(cpu, cpu_base[type].flag)) ++ continue; ++ cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); ++ per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry; ++ ++ if (type < CPU_TSS_BIT) ++ err = cpu_init_msr(cpu, type, cpu_dentry); ++ else ++ err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, ++ cpu_dentry); ++ if (err) ++ return err; ++ } ++ ++ return err; ++} ++ ++static int cpu_init_cpu(void) ++{ ++ struct dentry *cpu_dentry = NULL; ++ struct cpuinfo_x86 *cpui; ++ char cpu_dir[12]; ++ unsigned cpu; ++ int err = 0; ++ ++ for (cpu = 0; cpu < nr_cpu_ids; cpu++) { ++ cpui = &cpu_data(cpu); ++ if (!cpu_has(cpui, X86_FEATURE_MSR)) ++ continue; ++ per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | ++ (cpui->x86 << 8) | ++ (cpui->x86_model)); ++ per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); ++ ++ sprintf(cpu_dir, "cpu%d", cpu); ++ cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); ++ err = cpu_init_allreg(cpu, cpu_dentry); ++ ++ pr_info("cpu%d(%d) debug files %d\n", ++ cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu)); ++ if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) { ++ pr_err("Register files count %d exceeds limit %d\n", ++ per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES); ++ per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES; ++ err = -ENFILE; ++ } ++ if (err) ++ return err; ++ } ++ ++ return err; ++} ++ ++static int __init cpu_debug_init(void) ++{ ++ cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); ++ ++ return cpu_init_cpu(); ++} ++ ++static void __exit cpu_debug_exit(void) ++{ ++ int i, cpu; ++ ++ if (cpu_debugfs_dir) ++ debugfs_remove_recursive(cpu_debugfs_dir); ++ ++ for (cpu = 0; cpu < nr_cpu_ids; cpu++) ++ for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++) ++ kfree(per_cpu(priv_arr[i], cpu)); ++} ++ ++module_init(cpu_debug_init); ++module_exit(cpu_debug_exit); ++ ++MODULE_AUTHOR("Jaswinder Singh Rajput"); ++MODULE_DESCRIPTION("CPU Debug module"); ++MODULE_LICENSE("GPL"); +Index: linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 
+=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +@@ -33,7 +33,7 @@ + #include + #include + #include +-#include ++#include + + #include + #include +@@ -70,6 +70,8 @@ struct acpi_cpufreq_data { + + static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); + ++DEFINE_TRACE(power_mark); ++ + /* acpi_perf_data is a pointer to percpu data. */ + static struct acpi_processor_performance *acpi_perf_data; + +@@ -601,7 +603,7 @@ static int acpi_cpufreq_cpu_init(struct + if (!data) + return -ENOMEM; + +- data->acpi_data = percpu_ptr(acpi_perf_data, cpu); ++ data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); + per_cpu(drv_data, cpu) = data; + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) +Index: linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/cpufreq/e_powersaver.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +@@ -204,12 +204,12 @@ static int eps_cpu_init(struct cpufreq_p + } + /* Enable Enhanced PowerSaver */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); +- if (!(val & 1 << 16)) { +- val |= 1 << 16; ++ if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { ++ val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; + wrmsrl(MSR_IA32_MISC_ENABLE, val); + /* Can be locked at 0 */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); +- if (!(val & 1 << 16)) { ++ if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); + return -ENODEV; + } +Index: linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +@@ -203,7 +203,7 @@ static int cpufreq_p4_cpu_init(struct cp + unsigned int i; + + #ifdef CONFIG_SMP +- cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); ++ cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); + #endif + + /* Errata workaround */ +Index: linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/cpufreq/powernow-k8.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +@@ -56,7 +56,10 @@ static DEFINE_PER_CPU(struct powernow_k8 + static int cpu_family = CPU_OPTERON; + + #ifndef CONFIG_SMP +-DEFINE_PER_CPU(cpumask_t, cpu_core_map); ++static inline const struct cpumask *cpu_core_mask(int cpu) ++{ ++ return cpumask_of(0); ++} + #endif + + /* Return a frequency in MHz, given an input fid */ +@@ -654,7 +657,7 @@ static int fill_powernow_table(struct po + + dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); + data->powernow_table = powernow_table; +- if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) ++ if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) + print_basics(data); + + for (j = 0; j < data->numps; j++) +@@ -808,7 +811,7 @@ static int powernow_k8_cpu_init_acpi(str + + /* fill in data */ + data->numps = data->acpi_data.state_count; +- if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) ++ if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) + print_basics(data); + powernow_k8_acpi_pst_values(data, 0); + +@@ -1224,7 +1227,7 @@ static int __cpuinit powernowk8_cpu_init + if (cpu_family == 
CPU_HW_PSTATE) + cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); + else +- cpumask_copy(pol->cpus, &per_cpu(cpu_core_map, pol->cpu)); ++ cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); + data->available_cores = pol->cpus; + + if (cpu_family == CPU_HW_PSTATE) +@@ -1286,7 +1289,7 @@ static unsigned int powernowk8_get (unsi + unsigned int khz = 0; + unsigned int first; + +- first = first_cpu(per_cpu(cpu_core_map, cpu)); ++ first = cpumask_first(cpu_core_mask(cpu)); + data = per_cpu(powernow_data, first); + + if (!data) +Index: linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +@@ -390,14 +390,14 @@ static int centrino_cpu_init(struct cpuf + enable it if not. */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + +- if (!(l & (1<<16))) { +- l |= (1<<16); ++ if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { ++ l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; + dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); + wrmsr(MSR_IA32_MISC_ENABLE, l, h); + + /* check to see if it stuck */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); +- if (!(l & (1<<16))) { ++ if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + printk(KERN_INFO PFX + "couldn't enable Enhanced SpeedStep\n"); + return -ENODEV; +Index: linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +@@ -322,7 +322,7 @@ static int speedstep_cpu_init(struct cpu + + /* only run on CPU to be set, or on its sibling */ + #ifdef CONFIG_SMP +- cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); ++ cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); + #endif + + cpus_allowed = current->cpus_allowed; +Index: linux-2.6-tip/arch/x86/kernel/cpu/cyrix.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/cyrix.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/cyrix.c +@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(uns + */ + static unsigned char Cx86_dir0_msb __cpuinitdata = 0; + +-static char Cx86_model[][9] __cpuinitdata = { ++static const char __cpuinitconst Cx86_model[][9] = { + "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", + "M II ", "Unknown" + }; +-static char Cx486_name[][5] __cpuinitdata = { ++static const char __cpuinitconst Cx486_name[][5] = { + "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", + "SRx2", "DRx2" + }; +-static char Cx486S_name[][4] __cpuinitdata = { ++static const char __cpuinitconst Cx486S_name[][4] = { + "S", "S2", "Se", "S2e" + }; +-static char Cx486D_name[][4] __cpuinitdata = { ++static const char __cpuinitconst Cx486D_name[][4] = { + "DX", "DX2", "?", "?", "?", "DX4" + }; + static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; +-static char cyrix_model_mult1[] __cpuinitdata = "12??43"; +-static char cyrix_model_mult2[] __cpuinitdata = "12233445"; ++static const char __cpuinitconst cyrix_model_mult1[] = "12??43"; ++static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; + + /* + * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old +@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(str + } + } + +-static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { ++static const struct cpu_dev 
__cpuinitconst cyrix_cpu_dev = { + .c_vendor = "Cyrix", + .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, +@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cp + + cpu_dev_register(cyrix_cpu_dev); + +-static struct cpu_dev nsc_cpu_dev __cpuinitdata = { ++static const struct cpu_dev __cpuinitconst nsc_cpu_dev = { + .c_vendor = "NSC", + .c_ident = { "Geode by NSC" }, + .c_init = init_nsc, +Index: linux-2.6-tip/arch/x86/kernel/cpu/intel.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/intel.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/intel.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -13,6 +14,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_X86_64 + #include +@@ -24,7 +26,6 @@ + #ifdef CONFIG_X86_LOCAL_APIC + #include + #include +-#include + #endif + + static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +@@ -54,15 +55,60 @@ static void __cpuinit early_init_intel(s + c->x86_cache_alignment = 128; + #endif + ++ /* CPUID workaround for 0F33/0F34 CPU */ ++ if (c->x86 == 0xF && c->x86_model == 0x3 ++ && (c->x86_mask == 0x3 || c->x86_mask == 0x4)) ++ c->x86_phys_bits = 36; ++ + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate +- * with P/T states and does not stop in deep C-states ++ * with P/T states and does not stop in deep C-states. ++ * ++ * It is also reliable across cores and sockets. (but not across ++ * cabinets - we turn it off in that case explicitly.) + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); ++ set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); ++ sched_clock_stable = 1; + } + ++ /* ++ * There is a known erratum on Pentium III and Core Solo ++ * and Core Duo CPUs. ++ * " Page with PAT set to WC while associated MTRR is UC ++ * may consolidate to UC " ++ * Because of this erratum, it is better to stick with ++ * setting WC in MTRR rather than using PAT on these CPUs. ++ * ++ * Enable PAT WC only on P4, Core 2 or later CPUs. ++ */ ++ if (c->x86 == 6 && c->x86_model < 15) ++ clear_cpu_cap(c, X86_FEATURE_PAT); ++ ++#ifdef CONFIG_KMEMCHECK ++ /* ++ * P4s have a "fast strings" feature which causes single- ++ * stepping REP instructions to only generate a #DB on ++ * cache-line boundaries. ++ * ++ * Ingo Molnar reported a Pentium D (model 6) and a Xeon ++ * (model 2) with the same problem. ++ */ ++ if (c->x86 == 15) { ++ u64 misc_enable; ++ ++ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); ++ ++ if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { ++ printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); ++ ++ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; ++ wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); ++ } ++ } ++#endif + } + + #ifdef CONFIG_X86_32 +@@ -99,6 +145,28 @@ static void __cpuinit trap_init_f00f_bug + } + #endif + ++static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) ++{ ++#ifdef CONFIG_SMP ++ /* calling is from identify_secondary_cpu() ? 
*/ ++ if (c->cpu_index == boot_cpu_id) ++ return; ++ ++ /* ++ * Mask B, Pentium, but not Pentium MMX ++ */ ++ if (c->x86 == 5 && ++ c->x86_mask >= 1 && c->x86_mask <= 4 && ++ c->x86_model <= 3) { ++ /* ++ * Remember we have B step Pentia with bugs ++ */ ++ WARN_ONCE(1, "WARNING: SMP operation may be unreliable" ++ "with B stepping processors.\n"); ++ } ++#endif ++} ++ + static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) + { + unsigned long lo, hi; +@@ -135,10 +203,10 @@ static void __cpuinit intel_workarounds( + */ + if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { + rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); +- if ((lo & (1<<9)) == 0) { ++ if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { + printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); + printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); +- lo |= (1<<9); /* Disable hw prefetching */ ++ lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; + wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); + } + } +@@ -175,6 +243,8 @@ static void __cpuinit intel_workarounds( + #ifdef CONFIG_X86_NUMAQ + numaq_tsc_disable(); + #endif ++ ++ intel_smp_check(c); + } + #else + static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +@@ -374,7 +444,7 @@ static unsigned int __cpuinit intel_size + } + #endif + +-static struct cpu_dev intel_cpu_dev __cpuinitdata = { ++static const struct cpu_dev __cpuinitconst intel_cpu_dev = { + .c_vendor = "Intel", + .c_ident = { "GenuineIntel" }, + #ifdef CONFIG_X86_32 +Index: linux-2.6-tip/arch/x86/kernel/cpu/intel_cacheinfo.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/intel_cacheinfo.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/intel_cacheinfo.c +@@ -32,7 +32,7 @@ struct _cache_table + }; + + /* all the cache descriptor types we care about (no TLB or trace cache entries) */ +-static struct _cache_table cache_table[] __cpuinitdata = ++static const struct _cache_table __cpuinitconst cache_table[] = + { + { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ + { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ +@@ -147,10 +147,19 @@ struct _cpuid4_info { + union _cpuid4_leaf_ecx ecx; + unsigned long size; + unsigned long can_disable; +- cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ ++ DECLARE_BITMAP(shared_cpu_map, NR_CPUS); + }; + +-#ifdef CONFIG_PCI ++/* subset of above _cpuid4_info w/o shared_cpu_map */ ++struct _cpuid4_info_regs { ++ union _cpuid4_leaf_eax eax; ++ union _cpuid4_leaf_ebx ebx; ++ union _cpuid4_leaf_ecx ecx; ++ unsigned long size; ++ unsigned long can_disable; ++}; ++ ++#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) + static struct pci_device_id k8_nb_id[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, +@@ -197,15 +206,15 @@ union l3_cache { + unsigned val; + }; + +-static unsigned short assocs[] __cpuinitdata = { ++static const unsigned short __cpuinitconst assocs[] = { + [1] = 1, [2] = 2, [4] = 4, [6] = 8, + [8] = 16, [0xa] = 32, [0xb] = 48, + [0xc] = 64, + [0xf] = 0xffff // ?? 
+ }; + +-static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; +-static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; ++static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; ++static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 }; + + static void __cpuinit + amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, +@@ -278,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_ + } + + static void __cpuinit +-amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) ++amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) + { + if (index < 3) + return; +@@ -286,7 +295,8 @@ amd_check_l3_disable(int index, struct _ + } + + static int +-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) ++__cpuinit cpuid4_cache_lookup_regs(int index, ++ struct _cpuid4_info_regs *this_leaf) + { + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; +@@ -353,11 +363,10 @@ unsigned int __cpuinit init_intel_cachei + * parameters cpuid leaf to find the cache details + */ + for (i = 0; i < num_cache_leaves; i++) { +- struct _cpuid4_info this_leaf; +- ++ struct _cpuid4_info_regs this_leaf; + int retval; + +- retval = cpuid4_cache_lookup(i, &this_leaf); ++ retval = cpuid4_cache_lookup_regs(i, &this_leaf); + if (retval >= 0) { + switch(this_leaf.eax.split.level) { + case 1: +@@ -490,6 +499,8 @@ unsigned int __cpuinit init_intel_cachei + return l2; + } + ++#ifdef CONFIG_SYSFS ++ + /* pointer to _cpuid4_info array (for each cache leaf) */ + static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); + #define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) +@@ -506,17 +517,20 @@ static void __cpuinit cache_shared_cpu_m + num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; + + if (num_threads_sharing == 1) +- cpu_set(cpu, this_leaf->shared_cpu_map); ++ cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); + else { + index_msb = get_count_order(num_threads_sharing); + + for_each_online_cpu(i) { + if (cpu_data(i).apicid >> index_msb == + c->apicid >> index_msb) { +- cpu_set(i, this_leaf->shared_cpu_map); ++ cpumask_set_cpu(i, ++ to_cpumask(this_leaf->shared_cpu_map)); + if (i != cpu && per_cpu(cpuid4_info, i)) { +- sibling_leaf = CPUID4_INFO_IDX(i, index); +- cpu_set(cpu, sibling_leaf->shared_cpu_map); ++ sibling_leaf = ++ CPUID4_INFO_IDX(i, index); ++ cpumask_set_cpu(cpu, to_cpumask( ++ sibling_leaf->shared_cpu_map)); + } + } + } +@@ -528,9 +542,10 @@ static void __cpuinit cache_remove_share + int sibling; + + this_leaf = CPUID4_INFO_IDX(cpu, index); +- for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { ++ for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) { + sibling_leaf = CPUID4_INFO_IDX(sibling, index); +- cpu_clear(cpu, sibling_leaf->shared_cpu_map); ++ cpumask_clear_cpu(cpu, ++ to_cpumask(sibling_leaf->shared_cpu_map)); + } + } + #else +@@ -549,6 +564,15 @@ static void __cpuinit free_cache_attribu + per_cpu(cpuid4_info, cpu) = NULL; + } + ++static int ++__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) ++{ ++ struct _cpuid4_info_regs *leaf_regs = ++ (struct _cpuid4_info_regs *)this_leaf; ++ ++ return cpuid4_cache_lookup_regs(index, leaf_regs); ++} ++ + static void __cpuinit get_cpu_leaves(void *_retval) + { + int j, *retval = _retval, cpu = smp_processor_id(); +@@ -590,8 +614,6 @@ static int __cpuinit detect_cache_attrib + return retval; + } + +-#ifdef CONFIG_SYSFS +- + #include + #include + +@@ -635,8 +657,9 @@ static ssize_t show_shared_cpu_map_func( + int n = 0; + 
+ if (len > 1) { +- cpumask_t *mask = &this_leaf->shared_cpu_map; ++ const struct cpumask *mask; + ++ mask = to_cpumask(this_leaf->shared_cpu_map); + n = type? + cpulist_scnprintf(buf, len-2, mask) : + cpumask_scnprintf(buf, len-2, mask); +@@ -699,7 +722,8 @@ static struct pci_dev *get_k8_northbridg + + static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) + { +- int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); ++ const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); ++ int node = cpu_to_node(cpumask_first(mask)); + struct pci_dev *dev = NULL; + ssize_t ret = 0; + int i; +@@ -733,7 +757,8 @@ static ssize_t + store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, + size_t count) + { +- int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); ++ const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); ++ int node = cpu_to_node(cpumask_first(mask)); + struct pci_dev *dev = NULL; + unsigned int ret, index, val; + +@@ -878,7 +903,7 @@ err_out: + return -ENOMEM; + } + +-static cpumask_t cache_dev_map = CPU_MASK_NONE; ++static DECLARE_BITMAP(cache_dev_map, NR_CPUS); + + /* Add/Remove cache interface for CPU device */ + static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +@@ -918,7 +943,7 @@ static int __cpuinit cache_add_dev(struc + } + kobject_uevent(&(this_object->kobj), KOBJ_ADD); + } +- cpu_set(cpu, cache_dev_map); ++ cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); + + kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); + return 0; +@@ -931,9 +956,9 @@ static void __cpuinit cache_remove_dev(s + + if (per_cpu(cpuid4_info, cpu) == NULL) + return; +- if (!cpu_isset(cpu, cache_dev_map)) ++ if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) + return; +- cpu_clear(cpu, cache_dev_map); ++ cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); + + for (i = 0; i < num_cache_leaves; i++) + kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); +Index: linux-2.6-tip/arch/x86/kernel/cpu/mcheck/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mcheck/Makefile ++++ linux-2.6-tip/arch/x86/kernel/cpu/mcheck/Makefile +@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o + obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o + obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o + obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o ++obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o +Index: linux-2.6-tip/arch/x86/kernel/cpu/mcheck/mce_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mcheck/mce_32.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/mcheck/mce_32.c +@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c) + } + } + +-static unsigned long old_cr4 __initdata; +- +-void __init stop_mce(void) +-{ +- old_cr4 = read_cr4(); +- clear_in_cr4(X86_CR4_MCE); +-} +- +-void __init restart_mce(void) +-{ +- if (old_cr4 & X86_CR4_MCE) +- set_in_cr4(X86_CR4_MCE); +-} +- + static int __init mcheck_disable(char *str) + { + mce_disabled = 1; +Index: linux-2.6-tip/arch/x86/kernel/cpu/mcheck/mce_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mcheck/mce_64.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/mcheck/mce_64.c +@@ -3,6 +3,8 @@ + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. 
++ * Copyright 2008 Intel Corporation ++ * Author: Andi Kleen + */ + + #include +@@ -24,6 +26,9 @@ + #include + #include + #include ++#include ++#include ++#include + #include + #include + #include +@@ -32,7 +37,6 @@ + #include + + #define MISC_MCELOG_MINOR 227 +-#define NR_SYSFS_BANKS 6 + + atomic_t mce_entry; + +@@ -47,7 +51,7 @@ static int mce_dont_init; + */ + static int tolerant = 1; + static int banks; +-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; ++static u64 *bank; + static unsigned long notify_user; + static int rip_msr; + static int mce_bootlog = -1; +@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger + + static DECLARE_WAIT_QUEUE_HEAD(mce_wait); + ++/* MCA banks polled by the period polling timer for corrected events */ ++DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { ++ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL ++}; ++ ++/* Do initial initialization of a struct mce */ ++void mce_setup(struct mce *m) ++{ ++ memset(m, 0, sizeof(struct mce)); ++ m->cpu = smp_processor_id(); ++ rdtscll(m->tsc); ++} ++ + /* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also +@@ -119,11 +136,11 @@ static void print_mce(struct mce *m) + print_symbol("{%s}", m->ip); + printk("\n"); + } +- printk(KERN_EMERG "TSC %Lx ", m->tsc); ++ printk(KERN_EMERG "TSC %llx ", m->tsc); + if (m->addr) +- printk("ADDR %Lx ", m->addr); ++ printk("ADDR %llx ", m->addr); + if (m->misc) +- printk("MISC %Lx ", m->misc); ++ printk("MISC %llx ", m->misc); + printk("\n"); + printk(KERN_EMERG "This is not a software problem!\n"); + printk(KERN_EMERG "Run through mcelog --ascii to decode " +@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct + panic(msg); + } + +-static int mce_available(struct cpuinfo_x86 *c) ++int mce_available(struct cpuinfo_x86 *c) + { ++ if (mce_dont_init) ++ return 0; + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); + } + +@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mc + } + + /* +- * The actual machine check handler ++ * Poll for corrected events or events that happened before reset. ++ * Those are just logged through /dev/mcelog. ++ * ++ * This is executed in standard interrupt context. ++ */ ++void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) ++{ ++ struct mce m; ++ int i; ++ ++ mce_setup(&m); ++ ++ rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); ++ for (i = 0; i < banks; i++) { ++ if (!bank[i] || !test_bit(i, *b)) ++ continue; ++ ++ m.misc = 0; ++ m.addr = 0; ++ m.bank = i; ++ m.tsc = 0; ++ ++ barrier(); ++ rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); ++ if (!(m.status & MCI_STATUS_VAL)) ++ continue; ++ ++ /* ++ * Uncorrected events are handled by the exception handler ++ * when it is enabled. But when the exception is disabled log ++ * everything. ++ * ++ * TBD do the same check for MCI_STATUS_EN here? ++ */ ++ if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) ++ continue; ++ ++ if (m.status & MCI_STATUS_MISCV) ++ rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); ++ if (m.status & MCI_STATUS_ADDRV) ++ rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); ++ ++ if (!(flags & MCP_TIMESTAMP)) ++ m.tsc = 0; ++ /* ++ * Don't get the IP here because it's unlikely to ++ * have anything to do with the actual error location. ++ */ ++ ++ mce_log(&m); ++ add_taint(TAINT_MACHINE_CHECK); ++ ++ /* ++ * Clear state for this bank. ++ */ ++ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); ++ } ++ ++ /* ++ * Don't clear MCG_STATUS here because it's only defined for ++ * exceptions. 
++ */ ++} ++ ++/* ++ * The actual machine check handler. This only handles real ++ * exceptions when something got corrupted coming in through int 18. ++ * ++ * This is executed in NMI context not subject to normal locking rules. This ++ * implies that most kernel services cannot be safely used. Don't even ++ * think about putting a printk in there! + */ + void do_machine_check(struct pt_regs * regs, long error_code) + { +@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * r + * error. + */ + int kill_it = 0; ++ DECLARE_BITMAP(toclear, MAX_NR_BANKS); + + atomic_inc(&mce_entry); + +- if ((regs +- && notify_die(DIE_NMI, "machine check", regs, error_code, ++ if (notify_die(DIE_NMI, "machine check", regs, error_code, + 18, SIGKILL) == NOTIFY_STOP) +- || !banks) + goto out2; ++ if (!banks) ++ goto out2; ++ ++ mce_setup(&m); + +- memset(&m, 0, sizeof(struct mce)); +- m.cpu = smp_processor_id(); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) +@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * r + barrier(); + + for (i = 0; i < banks; i++) { +- if (i < NR_SYSFS_BANKS && !bank[i]) ++ __clear_bit(i, toclear); ++ if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; +- m.tsc = 0; + + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + ++ /* ++ * Non uncorrected errors are handled by machine_check_poll ++ * Leave them alone. ++ */ ++ if ((m.status & MCI_STATUS_UC) == 0) ++ continue; ++ ++ /* ++ * Set taint even when machine check was not enabled. ++ */ ++ add_taint(TAINT_MACHINE_CHECK); ++ ++ __set_bit(i, toclear); ++ + if (m.status & MCI_STATUS_EN) { + /* if PCC was set, there's no way out */ + no_way_out |= !!(m.status & MCI_STATUS_PCC); +@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * r + no_way_out = 1; + kill_it = 1; + } ++ } else { ++ /* ++ * Machine check event was not enabled. Clear, but ++ * ignore. ++ */ ++ continue; + } + + if (m.status & MCI_STATUS_MISCV) +@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * r + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + mce_get_rip(&m, regs); +- if (error_code >= 0) +- rdtscll(m.tsc); +- if (error_code != -2) +- mce_log(&m); ++ mce_log(&m); + + /* Did this bank cause the exception? */ + /* Assume that the bank with uncorrectable errors did it, +@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * r + panicm = m; + panicm_found = 1; + } +- +- add_taint(TAINT_MACHINE_CHECK); + } + +- /* Never do anything final in the polling timer */ +- if (!regs) +- goto out; +- + /* If we didn't find an uncorrectable error, pick + the last one (shouldn't happen, just being safe). */ + if (!panicm_found) +@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * r + /* notify userspace ASAP */ + set_thread_flag(TIF_MCE_NOTIFY); + +- out: + /* the last thing we do is clear state */ +- for (i = 0; i < banks; i++) +- wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); ++ for (i = 0; i < banks; i++) { ++ if (test_bit(i, toclear)) ++ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); ++ } + wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); +@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * r + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. 
+ */ +-void mce_log_therm_throt_event(unsigned int cpu, __u64 status) ++void mce_log_therm_throt_event(__u64 status) + { + struct mce m; + +- memset(&m, 0, sizeof(m)); +- m.cpu = cpu; ++ mce_setup(&m); + m.bank = MCE_THERMAL_BANK; + m.status = status; +- rdtscll(m.tsc); + mce_log(&m); + } + #endif /* CONFIG_X86_MCE_INTEL */ +@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned + + static int check_interval = 5 * 60; /* 5 minutes */ + static int next_interval; /* in jiffies */ +-static void mcheck_timer(struct work_struct *work); +-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); ++static void mcheck_timer(unsigned long); ++static DEFINE_PER_CPU(struct timer_list, mce_timer); + +-static void mcheck_check_cpu(void *info) ++static void mcheck_timer(unsigned long data) + { +- if (mce_available(¤t_cpu_data)) +- do_machine_check(NULL, 0); +-} ++ struct timer_list *t = &per_cpu(mce_timer, data); + +-static void mcheck_timer(struct work_struct *work) +-{ +- on_each_cpu(mcheck_check_cpu, NULL, 1); ++ WARN_ON(smp_processor_id() != data); ++ ++ if (mce_available(¤t_cpu_data)) ++ machine_check_poll(MCP_TIMESTAMP, ++ &__get_cpu_var(mce_poll_banks)); + + /* + * Alert userspace if needed. If we logged an MCE, reduce the +@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_str + (int)round_jiffies_relative(check_interval*HZ)); + } + +- schedule_delayed_work(&mcheck_work, next_interval); ++ t->expires = jiffies + next_interval; ++ add_timer(t); ++} ++ ++static void mce_do_trigger(struct work_struct *work) ++{ ++ call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); + } + ++static DECLARE_WORK(mce_trigger_work, mce_do_trigger); ++ + /* +- * This is only called from process context. This is where we do +- * anything we need to alert userspace about new MCEs. This is called +- * directly from the poller and also from entry.S and idle, thanks to +- * TIF_MCE_NOTIFY. ++ * Notify the user(s) about new machine check events. ++ * Can be called from interrupt context, but not from machine check/NMI ++ * context. + */ + int mce_notify_user(void) + { ++ /* Not more than two messages every minute */ ++ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); ++ + clear_thread_flag(TIF_MCE_NOTIFY); + if (test_and_clear_bit(0, ¬ify_user)) { +- static unsigned long last_print; +- unsigned long now = jiffies; +- + wake_up_interruptible(&mce_wait); +- if (trigger[0]) +- call_usermodehelper(trigger, trigger_argv, NULL, +- UMH_NO_WAIT); + +- if (time_after_eq(now, last_print + (check_interval*HZ))) { +- last_print = now; ++ /* ++ * There is no risk of missing notifications because ++ * work_pending is always cleared before the function is ++ * executed. ++ */ ++ if (trigger[0] && !work_pending(&mce_trigger_work)) ++ schedule_work(&mce_trigger_work); ++ ++ if (__ratelimit(&ratelimit)) + printk(KERN_INFO "Machine check events logged\n"); +- } + + return 1; + } +@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_no + + static __init int periodic_mcheck_init(void) + { +- next_interval = check_interval * HZ; +- if (next_interval) +- schedule_delayed_work(&mcheck_work, +- round_jiffies_relative(next_interval)); +- idle_notifier_register(&mce_idle_notifier); +- return 0; ++ idle_notifier_register(&mce_idle_notifier); ++ return 0; + } + __initcall(periodic_mcheck_init); + +- + /* + * Initialize Machine Checks for a CPU. 
+ */ +-static void mce_init(void *dummy) ++static int mce_cap_init(void) + { + u64 cap; +- int i; ++ unsigned b; + + rdmsrl(MSR_IA32_MCG_CAP, cap); +- banks = cap & 0xff; +- if (banks > MCE_EXTENDED_BANK) { +- banks = MCE_EXTENDED_BANK; +- printk(KERN_INFO "MCE: warning: using only %d banks\n", +- MCE_EXTENDED_BANK); ++ b = cap & 0xff; ++ if (b > MAX_NR_BANKS) { ++ printk(KERN_WARNING ++ "MCE: Using only %u machine check banks out of %u\n", ++ MAX_NR_BANKS, b); ++ b = MAX_NR_BANKS; ++ } ++ ++ /* Don't support asymmetric configurations today */ ++ WARN_ON(banks != 0 && b != banks); ++ banks = b; ++ if (!bank) { ++ bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); ++ if (!bank) ++ return -ENOMEM; ++ memset(bank, 0xff, banks * sizeof(u64)); + } ++ + /* Use accurate RIP reporting if available. */ + if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + rip_msr = MSR_IA32_MCG_EIP; + +- /* Log the machine checks left over from the previous reset. +- This also clears all registers */ +- do_machine_check(NULL, mce_bootlog ? -1 : -2); ++ return 0; ++} ++ ++static void mce_init(void *dummy) ++{ ++ u64 cap; ++ int i; ++ mce_banks_t all_banks; ++ ++ /* ++ * Log the machine checks left over from the previous reset. ++ */ ++ bitmap_fill(all_banks, MAX_NR_BANKS); ++ machine_check_poll(MCP_UC, &all_banks); + + set_in_cr4(X86_CR4_MCE); + ++ rdmsrl(MSR_IA32_MCG_CAP, cap); + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = 0; i < banks; i++) { +- if (i < NR_SYSFS_BANKS) +- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); +- else +- wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); +- ++ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + } + + /* Add per CPU specific workarounds here */ +-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) ++static void mce_cpu_quirks(struct cpuinfo_x86 *c) + { + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD) { +- if(c->x86 == 15) ++ if (c->x86 == 15 && banks > 4) + /* disable GART TBL walk error reporting, which trips off + incorrectly with the IOMMU & 3ware & Cerberus. */ +- clear_bit(10, &bank[4]); ++ clear_bit(10, (unsigned long *)&bank[4]); + if(c->x86 <= 17 && mce_bootlog < 0) + /* Lots of broken BIOS around that don't clear them + by default and leave crap in there. Don't log. */ +@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpui + } + } + ++static void mce_init_timer(void) ++{ ++ struct timer_list *t = &__get_cpu_var(mce_timer); ++ ++ /* data race harmless because everyone sets to the same value */ ++ if (!next_interval) ++ next_interval = check_interval * HZ; ++ if (!next_interval) ++ return; ++ setup_timer(t, mcheck_timer, smp_processor_id()); ++ t->expires = round_jiffies(jiffies + next_interval); ++ add_timer(t); ++} ++ + /* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off. 
+ */ + void __cpuinit mcheck_init(struct cpuinfo_x86 *c) + { +- mce_cpu_quirks(c); ++ if (!mce_available(c)) ++ return; + +- if (mce_dont_init || +- !mce_available(c)) ++ if (mce_cap_init() < 0) { ++ mce_dont_init = 1; + return; ++ } ++ mce_cpu_quirks(c); + + mce_init(NULL); + mce_cpu_features(c); ++ mce_init_timer(); + } + + /* +@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *fil + { + unsigned long *cpu_tsc; + static DEFINE_MUTEX(mce_read_mutex); +- unsigned next; ++ unsigned prev, next; + char __user *buf = ubuf; + int i, err; + +@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *fil + } + + err = 0; +- for (i = 0; i < next; i++) { +- unsigned long start = jiffies; +- +- while (!mcelog.entry[i].finished) { +- if (time_after_eq(jiffies, start + 2)) { +- memset(mcelog.entry + i,0, sizeof(struct mce)); +- goto timeout; ++ prev = 0; ++ do { ++ for (i = prev; i < next; i++) { ++ unsigned long start = jiffies; ++ ++ while (!mcelog.entry[i].finished) { ++ if (time_after_eq(jiffies, start + 2)) { ++ memset(mcelog.entry + i, 0, ++ sizeof(struct mce)); ++ goto timeout; ++ } ++ cpu_relax(); + } +- cpu_relax(); ++ smp_rmb(); ++ err |= copy_to_user(buf, mcelog.entry + i, ++ sizeof(struct mce)); ++ buf += sizeof(struct mce); ++timeout: ++ ; + } +- smp_rmb(); +- err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); +- buf += sizeof(struct mce); +- timeout: +- ; +- } + +- memset(mcelog.entry, 0, next * sizeof(struct mce)); +- mcelog.next = 0; ++ memset(mcelog.entry + prev, 0, ++ (next - prev) * sizeof(struct mce)); ++ prev = next; ++ next = cmpxchg(&mcelog.next, prev, 0); ++ } while (next != prev); + + synchronize_sched(); + +@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device + &mce_chrdev_ops, + }; + +-static unsigned long old_cr4 __initdata; +- +-void __init stop_mce(void) +-{ +- old_cr4 = read_cr4(); +- clear_in_cr4(X86_CR4_MCE); +-} +- +-void __init restart_mce(void) +-{ +- if (old_cr4 & X86_CR4_MCE) +- set_in_cr4(X86_CR4_MCE); +-} +- + /* + * Old style boot options parsing. Only for compatibility. + */ +@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *s + return 1; + } + +-/* mce=off disables machine check. Note you can re-enable it later +- using sysfs. ++/* mce=off disables machine check. + mce=TOLERANCELEVEL (number, see above) + mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + mce=nobootlog Don't log MCEs from before booting. */ +@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable); + * Sysfs support + */ + ++/* ++ * Disable machine checks on suspend and shutdown. We can't really handle ++ * them later. ++ */ ++static int mce_disable(void) ++{ ++ int i; ++ ++ for (i = 0; i < banks; i++) ++ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); ++ return 0; ++} ++ ++static int mce_suspend(struct sys_device *dev, pm_message_t state) ++{ ++ return mce_disable(); ++} ++ ++static int mce_shutdown(struct sys_device *dev) ++{ ++ return mce_disable(); ++} ++ + /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. + Only one CPU is active at this time, the others get readded later using + CPU hotplug. 
*/ +@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device + return 0; + } + ++static void mce_cpu_restart(void *data) ++{ ++ del_timer_sync(&__get_cpu_var(mce_timer)); ++ if (mce_available(¤t_cpu_data)) ++ mce_init(NULL); ++ mce_init_timer(); ++} ++ + /* Reinit MCEs after user configuration changes */ + static void mce_restart(void) + { +- if (next_interval) +- cancel_delayed_work(&mcheck_work); +- /* Timer race is harmless here */ +- on_each_cpu(mce_init, NULL, 1); + next_interval = check_interval * HZ; +- if (next_interval) +- schedule_delayed_work(&mcheck_work, +- round_jiffies_relative(next_interval)); ++ on_each_cpu(mce_cpu_restart, NULL, 1); + } + + static struct sysdev_class mce_sysclass = { ++ .suspend = mce_suspend, ++ .shutdown = mce_shutdown, + .resume = mce_resume, + .name = "machinecheck", + }; +@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned + } \ + static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); + +-/* +- * TBD should generate these dynamically based on number of available banks. +- * Have only 6 contol banks in /sysfs until then. +- */ +-ACCESSOR(bank0ctl,bank[0],mce_restart()) +-ACCESSOR(bank1ctl,bank[1],mce_restart()) +-ACCESSOR(bank2ctl,bank[2],mce_restart()) +-ACCESSOR(bank3ctl,bank[3],mce_restart()) +-ACCESSOR(bank4ctl,bank[4],mce_restart()) +-ACCESSOR(bank5ctl,bank[5],mce_restart()) ++static struct sysdev_attribute *bank_attrs; ++ ++static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, ++ char *buf) ++{ ++ u64 b = bank[attr - bank_attrs]; ++ return sprintf(buf, "%llx\n", b); ++} ++ ++static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, ++ const char *buf, size_t siz) ++{ ++ char *end; ++ u64 new = simple_strtoull(buf, &end, 0); ++ if (end == buf) ++ return -EINVAL; ++ bank[attr - bank_attrs] = new; ++ mce_restart(); ++ return end-buf; ++} + + static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +@@ -814,13 +986,11 @@ static SYSDEV_ATTR(trigger, 0644, show_t + static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); + ACCESSOR(check_interval,check_interval,mce_restart()) + static struct sysdev_attribute *mce_attributes[] = { +- &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, +- &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, + &attr_tolerant.attr, &attr_check_interval, &attr_trigger, + NULL + }; + +-static cpumask_t mce_device_initialized = CPU_MASK_NONE; ++static cpumask_var_t mce_device_initialized; + + /* Per cpu sysdev init. 
All of the cpus still share the same ctl bank */ + static __cpuinit int mce_create_device(unsigned int cpu) +@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(u + if (err) + goto error; + } +- cpu_set(cpu, mce_device_initialized); ++ for (i = 0; i < banks; i++) { ++ err = sysdev_create_file(&per_cpu(device_mce, cpu), ++ &bank_attrs[i]); ++ if (err) ++ goto error2; ++ } ++ cpumask_set_cpu(cpu, mce_device_initialized); + + return 0; ++error2: ++ while (--i >= 0) { ++ sysdev_remove_file(&per_cpu(device_mce, cpu), ++ &bank_attrs[i]); ++ } + error: +- while (i--) { ++ while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce,cpu), + mce_attributes[i]); + } +@@ -862,14 +1043,44 @@ static __cpuinit void mce_remove_device( + { + int i; + +- if (!cpu_isset(cpu, mce_device_initialized)) ++ if (!cpumask_test_cpu(cpu, mce_device_initialized)) + return; + + for (i = 0; mce_attributes[i]; i++) + sysdev_remove_file(&per_cpu(device_mce,cpu), + mce_attributes[i]); ++ for (i = 0; i < banks; i++) ++ sysdev_remove_file(&per_cpu(device_mce, cpu), ++ &bank_attrs[i]); + sysdev_unregister(&per_cpu(device_mce,cpu)); +- cpu_clear(cpu, mce_device_initialized); ++ cpumask_clear_cpu(cpu, mce_device_initialized); ++} ++ ++/* Make sure there are no machine checks on offlined CPUs. */ ++static void mce_disable_cpu(void *h) ++{ ++ int i; ++ unsigned long action = *(unsigned long *)h; ++ ++ if (!mce_available(¤t_cpu_data)) ++ return; ++ if (!(action & CPU_TASKS_FROZEN)) ++ cmci_clear(); ++ for (i = 0; i < banks; i++) ++ wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); ++} ++ ++static void mce_reenable_cpu(void *h) ++{ ++ int i; ++ unsigned long action = *(unsigned long *)h; ++ ++ if (!mce_available(¤t_cpu_data)) ++ return; ++ if (!(action & CPU_TASKS_FROZEN)) ++ cmci_reenable(); ++ for (i = 0; i < banks; i++) ++ wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + } + + /* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ +@@ -877,6 +1088,7 @@ static int __cpuinit mce_cpu_callback(st + unsigned long action, void *hcpu) + { + unsigned int cpu = (unsigned long)hcpu; ++ struct timer_list *t = &per_cpu(mce_timer, cpu); + + switch (action) { + case CPU_ONLINE: +@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(st + threshold_cpu_callback(action, cpu); + mce_remove_device(cpu); + break; ++ case CPU_DOWN_PREPARE: ++ case CPU_DOWN_PREPARE_FROZEN: ++ del_timer_sync(t); ++ smp_call_function_single(cpu, mce_disable_cpu, &action, 1); ++ break; ++ case CPU_DOWN_FAILED: ++ case CPU_DOWN_FAILED_FROZEN: ++ t->expires = round_jiffies(jiffies + next_interval); ++ add_timer_on(t, cpu); ++ smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); ++ break; ++ case CPU_POST_DEAD: ++ /* intentionally ignoring frozen here */ ++ cmci_rediscover(cpu); ++ break; + } + return NOTIFY_OK; + } +@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_not + .notifier_call = mce_cpu_callback, + }; + ++static __init int mce_init_banks(void) ++{ ++ int i; ++ ++ bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, ++ GFP_KERNEL); ++ if (!bank_attrs) ++ return -ENOMEM; ++ ++ for (i = 0; i < banks; i++) { ++ struct sysdev_attribute *a = &bank_attrs[i]; ++ a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); ++ if (!a->attr.name) ++ goto nomem; ++ a->attr.mode = 0644; ++ a->show = show_bank; ++ a->store = set_bank; ++ } ++ return 0; ++ ++nomem: ++ while (--i >= 0) ++ kfree(bank_attrs[i].attr.name); ++ kfree(bank_attrs); ++ bank_attrs = NULL; ++ return -ENOMEM; ++} ++ + static __init int mce_init_device(void) + { + int err; +@@ -906,6 +1161,13 @@ static __init int mce_init_device(void) + + if (!mce_available(&boot_cpu_data)) + return -EIO; ++ ++ alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); ++ ++ err = mce_init_banks(); ++ if (err) ++ return err; ++ + err = sysdev_class_register(&mce_sysclass); + if (err) + return err; +Index: linux-2.6-tip/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mcheck/mce_amd_64.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +@@ -67,7 +67,7 @@ static struct threshold_block threshold_ + struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; +- cpumask_t cpus; ++ cpumask_var_t cpus; + }; + static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + +@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANK + + static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ + ++static void amd_threshold_interrupt(void); ++ + /* + * CPU Initialization + */ +@@ -90,7 +92,8 @@ struct thresh_restart { + }; + + /* must be called with correct cpu affinity */ +-static long threshold_restart_bank(void *_tr) ++/* Called via smp_call_function_single() */ ++static void threshold_restart_bank(void *_tr) + { + struct thresh_restart *tr = _tr; + u32 mci_misc_hi, mci_misc_lo; +@@ -117,7 +120,6 @@ static long threshold_restart_bank(void + + mci_misc_hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); +- return 0; + } + + /* cpu init entry point, called from mce.c with preempt off */ +@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); ++ ++ mce_threshold_vector = amd_threshold_interrupt; + } + } + } +@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo + * the interrupt goes off when error_count reaches threshold_limit. 
+ * the handler will simply log mcelog w/ software defined bank number. + */ +-asmlinkage void mce_threshold_interrupt(void) ++static void amd_threshold_interrupt(void) + { + unsigned int bank, block; + struct mce m; + u32 low = 0, high = 0, address = 0; + +- ack_APIC_irq(); +- exit_idle(); +- irq_enter(); +- +- memset(&m, 0, sizeof(m)); +- rdtscll(m.tsc); +- m.cpu = smp_processor_id(); ++ mce_setup(&m); + + /* assume first bank caused it */ + for (bank = 0; bank < NR_BANKS; ++bank) { +@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt( + + /* Log the machine check that caused the threshold + event. */ +- do_machine_check(NULL, 0); ++ machine_check_poll(MCP_TIMESTAMP, ++ &__get_cpu_var(mce_poll_banks)); + + if (high & MASK_OVERFLOW_HI) { + rdmsrl(address, m.misc); +@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt( + + bank * NR_BLOCKS + + block; + mce_log(&m); +- goto out; ++ return; + } + } + } +-out: +- inc_irq_stat(irq_threshold_count); +- irq_exit(); + } + + /* +@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(st + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; +- work_on_cpu(b->cpu, threshold_restart_bank, &tr); ++ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return end - buf; + } +@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(str + tr.b = b; + tr.reset = 0; + +- work_on_cpu(b->cpu, threshold_restart_bank, &tr); ++ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return end - buf; + } + +-static long local_error_count(void *_b) ++struct threshold_block_cross_cpu { ++ struct threshold_block *tb; ++ long retval; ++}; ++ ++static void local_error_count_handler(void *_tbcc) + { +- struct threshold_block *b = _b; ++ struct threshold_block_cross_cpu *tbcc = _tbcc; ++ struct threshold_block *b = tbcc->tb; + u32 low, high; + + rdmsr(b->address, low, high); +- return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); ++ tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); + } + + static ssize_t show_error_count(struct threshold_block *b, char *buf) + { +- return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); ++ struct threshold_block_cross_cpu tbcc = { .tb = b, }; ++ ++ smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); ++ return sprintf(buf, "%lx\n", tbcc.retval); + } + + static ssize_t store_error_count(struct threshold_block *b, +@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct + { + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + +- work_on_cpu(b->cpu, threshold_restart_bank, &tr); ++ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + return 1; + } + +@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_ + if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + return 0; + +- if (rdmsr_safe(address, &low, &high)) ++ if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) + return 0; + + if (!(high & MASK_VALID_HI)) { +@@ -462,12 +467,11 @@ out_free: + return err; + } + +-static __cpuinit long local_allocate_threshold_blocks(void *_bank) ++static __cpuinit long ++local_allocate_threshold_blocks(int cpu, unsigned int bank) + { +- unsigned int *bank = _bank; +- +- return allocate_threshold_blocks(smp_processor_id(), *bank, 0, +- MSR_IA32_MC0_MISC + *bank * 4); ++ return allocate_threshold_blocks(cpu, bank, 0, ++ MSR_IA32_MC0_MISC + bank * 4); + } + + /* symlinks sibling shared banks to first core. first core owns dir/files. 
*/ +@@ -481,7 +485,7 @@ static __cpuinit int threshold_create_ba + + #ifdef CONFIG_SMP + if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ +- i = first_cpu(per_cpu(cpu_core_map, cpu)); ++ i = cpumask_first(cpu_core_mask(cpu)); + + /* first core not up yet */ + if (cpu_data(i).cpu_core_id) +@@ -501,7 +505,7 @@ static __cpuinit int threshold_create_ba + if (err) + goto out; + +- b->cpus = per_cpu(cpu_core_map, cpu); ++ cpumask_copy(b->cpus, cpu_core_mask(cpu)); + per_cpu(threshold_banks, cpu)[bank] = b; + goto out; + } +@@ -512,24 +516,29 @@ static __cpuinit int threshold_create_ba + err = -ENOMEM; + goto out; + } ++ if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { ++ kfree(b); ++ err = -ENOMEM; ++ goto out; ++ } + + b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); + if (!b->kobj) + goto out_free; + + #ifndef CONFIG_SMP +- b->cpus = CPU_MASK_ALL; ++ cpumask_setall(b->cpus); + #else +- b->cpus = per_cpu(cpu_core_map, cpu); ++ cpumask_copy(b->cpus, cpu_core_mask(cpu)); + #endif + + per_cpu(threshold_banks, cpu)[bank] = b; + +- err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); ++ err = local_allocate_threshold_blocks(cpu, bank); + if (err) + goto out_free; + +- for_each_cpu_mask_nr(i, b->cpus) { ++ for_each_cpu(i, b->cpus) { + if (i == cpu) + continue; + +@@ -545,6 +554,7 @@ static __cpuinit int threshold_create_ba + + out_free: + per_cpu(threshold_banks, cpu)[bank] = NULL; ++ free_cpumask_var(b->cpus); + kfree(b); + out: + return err; +@@ -619,7 +629,7 @@ static void threshold_remove_bank(unsign + #endif + + /* remove all sibling symlinks before unregistering */ +- for_each_cpu_mask_nr(i, b->cpus) { ++ for_each_cpu(i, b->cpus) { + if (i == cpu) + continue; + +@@ -632,6 +642,7 @@ static void threshold_remove_bank(unsign + free_out: + kobject_del(b->kobj); + kobject_put(b->kobj); ++ free_cpumask_var(b->cpus); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; + } +Index: linux-2.6-tip/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mcheck/mce_intel_64.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +@@ -1,17 +1,21 @@ + /* + * Intel specific MCE features. 
+ * Copyright 2004 Zwane Mwaikambo ++ * Copyright (C) 2008, 2009 Intel Corporation ++ * Author: Andi Kleen + */ + + #include + #include + #include + #include ++#include + #include + #include + #include + #include + #include ++#include + + asmlinkage void smp_thermal_interrupt(void) + { +@@ -24,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(vo + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & 1)) +- mce_log_therm_throt_event(smp_processor_id(), msr_val); ++ mce_log_therm_throt_event(msr_val); + + inc_irq_stat(irq_thermal_count); + irq_exit(); +@@ -48,13 +52,13 @@ static void intel_init_thermal(struct cp + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); +- if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { ++ if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + +- if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) ++ if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) + tm2 = 1; + + if (h & APIC_VECTOR_MASK) { +@@ -72,7 +76,7 @@ static void intel_init_thermal(struct cp + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); +- wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); ++ wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); +@@ -84,7 +88,209 @@ static void intel_init_thermal(struct cp + return; + } + ++/* ++ * Support for Intel Correct Machine Check Interrupts. This allows ++ * the CPU to raise an interrupt when a corrected machine check happened. ++ * Normally we pick those up using a regular polling timer. ++ * Also supports reliable discovery of shared banks. ++ */ ++ ++static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); ++ ++/* ++ * cmci_discover_lock protects against parallel discovery attempts ++ * which could race against each other. ++ */ ++static DEFINE_SPINLOCK(cmci_discover_lock); ++ ++#define CMCI_THRESHOLD 1 ++ ++static int cmci_supported(int *banks) ++{ ++ u64 cap; ++ ++ /* ++ * Vendor check is not strictly needed, but the initial ++ * initialization is vendor keyed and this ++ * makes sure none of the backdoors are entered otherwise. ++ */ ++ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ++ return 0; ++ if (!cpu_has_apic || lapic_get_maxlvt() < 6) ++ return 0; ++ rdmsrl(MSR_IA32_MCG_CAP, cap); ++ *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); ++ return !!(cap & MCG_CMCI_P); ++} ++ ++/* ++ * The interrupt handler. This is called on every event. ++ * Just call the poller directly to log any events. ++ * This could in theory increase the threshold under high load, ++ * but doesn't for now. ++ */ ++static void intel_threshold_interrupt(void) ++{ ++ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); ++ mce_notify_user(); ++} ++ ++static void print_update(char *type, int *hdr, int num) ++{ ++ if (*hdr == 0) ++ printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); ++ *hdr = 1; ++ printk(KERN_CONT " %s:%d", type, num); ++} ++ ++/* ++ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks ++ * on this CPU. Use the algorithm recommended in the SDM to discover shared ++ * banks. 
++ */ ++static void cmci_discover(int banks, int boot) ++{ ++ unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); ++ int hdr = 0; ++ int i; ++ ++ spin_lock(&cmci_discover_lock); ++ for (i = 0; i < banks; i++) { ++ u64 val; ++ ++ if (test_bit(i, owned)) ++ continue; ++ ++ rdmsrl(MSR_IA32_MC0_CTL2 + i, val); ++ ++ /* Already owned by someone else? */ ++ if (val & CMCI_EN) { ++ if (test_and_clear_bit(i, owned) || boot) ++ print_update("SHD", &hdr, i); ++ __clear_bit(i, __get_cpu_var(mce_poll_banks)); ++ continue; ++ } ++ ++ val |= CMCI_EN | CMCI_THRESHOLD; ++ wrmsrl(MSR_IA32_MC0_CTL2 + i, val); ++ rdmsrl(MSR_IA32_MC0_CTL2 + i, val); ++ ++ /* Did the enable bit stick? -- the bank supports CMCI */ ++ if (val & CMCI_EN) { ++ if (!test_and_set_bit(i, owned) || boot) ++ print_update("CMCI", &hdr, i); ++ __clear_bit(i, __get_cpu_var(mce_poll_banks)); ++ } else { ++ WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); ++ } ++ } ++ spin_unlock(&cmci_discover_lock); ++ if (hdr) ++ printk(KERN_CONT "\n"); ++} ++ ++/* ++ * Just in case we missed an event during initialization check ++ * all the CMCI owned banks. ++ */ ++void cmci_recheck(void) ++{ ++ unsigned long flags; ++ int banks; ++ ++ if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) ++ return; ++ local_irq_save(flags); ++ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); ++ local_irq_restore(flags); ++} ++ ++/* ++ * Disable CMCI on this CPU for all banks it owns when it goes down. ++ * This allows other CPUs to claim the banks on rediscovery. ++ */ ++void cmci_clear(void) ++{ ++ int i; ++ int banks; ++ u64 val; ++ ++ if (!cmci_supported(&banks)) ++ return; ++ spin_lock(&cmci_discover_lock); ++ for (i = 0; i < banks; i++) { ++ if (!test_bit(i, __get_cpu_var(mce_banks_owned))) ++ continue; ++ /* Disable CMCI */ ++ rdmsrl(MSR_IA32_MC0_CTL2 + i, val); ++ val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); ++ wrmsrl(MSR_IA32_MC0_CTL2 + i, val); ++ __clear_bit(i, __get_cpu_var(mce_banks_owned)); ++ } ++ spin_unlock(&cmci_discover_lock); ++} ++ ++/* ++ * After a CPU went down cycle through all the others and rediscover ++ * Must run in process context. ++ */ ++void cmci_rediscover(int dying) ++{ ++ int banks; ++ int cpu; ++ cpumask_var_t old; ++ ++ if (!cmci_supported(&banks)) ++ return; ++ if (!alloc_cpumask_var(&old, GFP_KERNEL)) ++ return; ++ cpumask_copy(old, ¤t->cpus_allowed); ++ ++ for_each_online_cpu (cpu) { ++ if (cpu == dying) ++ continue; ++ if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) ++ continue; ++ /* Recheck banks in case CPUs don't all have the same */ ++ if (cmci_supported(&banks)) ++ cmci_discover(banks, 0); ++ } ++ ++ set_cpus_allowed_ptr(current, old); ++ free_cpumask_var(old); ++} ++ ++/* ++ * Reenable CMCI on this CPU in case a CPU down failed. ++ */ ++void cmci_reenable(void) ++{ ++ int banks; ++ if (cmci_supported(&banks)) ++ cmci_discover(banks, 0); ++} ++ ++static void intel_init_cmci(void) ++{ ++ int banks; ++ ++ if (!cmci_supported(&banks)) ++ return; ++ ++ mce_threshold_vector = intel_threshold_interrupt; ++ cmci_discover(banks, 1); ++ /* ++ * For CPU #0 this runs with still disabled APIC, but that's ++ * ok because only the vector is set up. We still do another ++ * check for the banks later for CPU #0 just to make sure ++ * to not miss any events. 
++ */ ++ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); ++ cmci_recheck(); ++} ++ + void mce_intel_feature_init(struct cpuinfo_x86 *c) + { + intel_init_thermal(c); ++ intel_init_cmci(); + } +Index: linux-2.6-tip/arch/x86/kernel/cpu/mcheck/p4.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mcheck/p4.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/mcheck/p4.c +@@ -85,7 +85,7 @@ static void intel_init_thermal(struct cp + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); +- if ((l & (1<<3)) && (h & APIC_DM_SMI)) { ++ if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", + cpu); + return; /* -EBUSY */ +@@ -111,7 +111,7 @@ static void intel_init_thermal(struct cp + vendor_thermal_interrupt = intel_thermal_interrupt; + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); +- wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); ++ wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); +Index: linux-2.6-tip/arch/x86/kernel/cpu/mcheck/threshold.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/cpu/mcheck/threshold.c +@@ -0,0 +1,29 @@ ++/* ++ * Common corrected MCE threshold handler code: ++ */ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++static void default_threshold_interrupt(void) ++{ ++ printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", ++ THRESHOLD_APIC_VECTOR); ++} ++ ++void (*mce_threshold_vector)(void) = default_threshold_interrupt; ++ ++asmlinkage void mce_threshold_interrupt(void) ++{ ++ exit_idle(); ++ irq_enter(); ++ inc_irq_stat(irq_threshold_count); ++ mce_threshold_vector(); ++ irq_exit(); ++ /* Ack only at the end to avoid potential reentry */ ++ ack_APIC_irq(); ++} +Index: linux-2.6-tip/arch/x86/kernel/cpu/mtrr/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mtrr/Makefile ++++ linux-2.6-tip/arch/x86/kernel/cpu/mtrr/Makefile +@@ -1,3 +1,3 @@ +-obj-y := main.o if.o generic.o state.o ++obj-y := main.o if.o generic.o state.o cleanup.o + obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o + +Index: linux-2.6-tip/arch/x86/kernel/cpu/mtrr/cleanup.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/cpu/mtrr/cleanup.c +@@ -0,0 +1,1101 @@ ++/* MTRR (Memory Type Range Register) cleanup ++ ++ Copyright (C) 2009 Yinghai Lu ++ ++ This library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Library General Public ++ License as published by the Free Software Foundation; either ++ version 2 of the License, or (at your option) any later version. ++ ++ This library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Library General Public License for more details. ++ ++ You should have received a copy of the GNU Library General Public ++ License along with this library; if not, write to the Free ++ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "mtrr.h" ++ ++/* should be related to MTRR_VAR_RANGES nums */ ++#define RANGE_NUM 256 ++ ++struct res_range { ++ unsigned long start; ++ unsigned long end; ++}; ++ ++static int __init ++add_range(struct res_range *range, int nr_range, unsigned long start, ++ unsigned long end) ++{ ++ /* out of slots */ ++ if (nr_range >= RANGE_NUM) ++ return nr_range; ++ ++ range[nr_range].start = start; ++ range[nr_range].end = end; ++ ++ nr_range++; ++ ++ return nr_range; ++} ++ ++static int __init ++add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, ++ unsigned long end) ++{ ++ int i; ++ ++ /* try to merge it with old one */ ++ for (i = 0; i < nr_range; i++) { ++ unsigned long final_start, final_end; ++ unsigned long common_start, common_end; ++ ++ if (!range[i].end) ++ continue; ++ ++ common_start = max(range[i].start, start); ++ common_end = min(range[i].end, end); ++ if (common_start > common_end + 1) ++ continue; ++ ++ final_start = min(range[i].start, start); ++ final_end = max(range[i].end, end); ++ ++ range[i].start = final_start; ++ range[i].end = final_end; ++ return nr_range; ++ } ++ ++ /* need to add that */ ++ return add_range(range, nr_range, start, end); ++} ++ ++static void __init ++subtract_range(struct res_range *range, unsigned long start, unsigned long end) ++{ ++ int i, j; ++ ++ for (j = 0; j < RANGE_NUM; j++) { ++ if (!range[j].end) ++ continue; ++ ++ if (start <= range[j].start && end >= range[j].end) { ++ range[j].start = 0; ++ range[j].end = 0; ++ continue; ++ } ++ ++ if (start <= range[j].start && end < range[j].end && ++ range[j].start < end + 1) { ++ range[j].start = end + 1; ++ continue; ++ } ++ ++ ++ if (start > range[j].start && end >= range[j].end && ++ range[j].end > start - 1) { ++ range[j].end = start - 1; ++ continue; ++ } ++ ++ if (start > range[j].start && end < range[j].end) { ++ /* find the new spare */ ++ for (i = 0; i < RANGE_NUM; i++) { ++ if (range[i].end == 0) ++ break; ++ } ++ if (i < RANGE_NUM) { ++ range[i].end = range[j].end; ++ range[i].start = end + 1; ++ } else { ++ printk(KERN_ERR "run of slot in ranges\n"); ++ } ++ range[j].end = start - 1; ++ continue; ++ } ++ } ++} ++ ++static int __init cmp_range(const void *x1, const void *x2) ++{ ++ const struct res_range *r1 = x1; ++ const struct res_range *r2 = x2; ++ long start1, start2; ++ ++ start1 = r1->start; ++ start2 = r2->start; ++ ++ return start1 - start2; ++} ++ ++struct var_mtrr_range_state { ++ unsigned long base_pfn; ++ unsigned long size_pfn; ++ mtrr_type type; ++}; ++ ++static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; ++static int __initdata debug_print; ++ ++static int __init ++x86_get_mtrr_mem_range(struct res_range *range, int nr_range, ++ unsigned long extra_remove_base, ++ unsigned long extra_remove_size) ++{ ++ unsigned long base, size; ++ mtrr_type type; ++ int i; ++ ++ for (i = 0; i < num_var_ranges; i++) { ++ type = range_state[i].type; ++ if (type != MTRR_TYPE_WRBACK) ++ continue; ++ base = range_state[i].base_pfn; ++ size = range_state[i].size_pfn; ++ nr_range = add_range_with_merge(range, nr_range, base, ++ base + size - 1); ++ } ++ if (debug_print) { ++ printk(KERN_DEBUG "After WB checking\n"); ++ for (i = 0; i < nr_range; i++) ++ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", ++ range[i].start, range[i].end + 1); ++ } ++ ++ /* take out UC ranges */ ++ for (i = 0; 
i < num_var_ranges; i++) { ++ type = range_state[i].type; ++ if (type != MTRR_TYPE_UNCACHABLE && ++ type != MTRR_TYPE_WRPROT) ++ continue; ++ size = range_state[i].size_pfn; ++ if (!size) ++ continue; ++ base = range_state[i].base_pfn; ++ if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && ++ (mtrr_state.enabled & 1)) { ++ /* Var MTRR contains UC entry below 1M? Skip it: */ ++ printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " ++ "contains strange UC entry under 1M, check " ++ "with your system vendor!\n", i); ++ if (base + size <= (1<<(20-PAGE_SHIFT))) ++ continue; ++ size -= (1<<(20-PAGE_SHIFT)) - base; ++ base = 1<<(20-PAGE_SHIFT); ++ } ++ subtract_range(range, base, base + size - 1); ++ } ++ if (extra_remove_size) ++ subtract_range(range, extra_remove_base, ++ extra_remove_base + extra_remove_size - 1); ++ ++ /* get new range num */ ++ nr_range = 0; ++ for (i = 0; i < RANGE_NUM; i++) { ++ if (!range[i].end) ++ continue; ++ nr_range++; ++ } ++ if (debug_print) { ++ printk(KERN_DEBUG "After UC checking\n"); ++ for (i = 0; i < nr_range; i++) ++ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", ++ range[i].start, range[i].end + 1); ++ } ++ ++ /* sort the ranges */ ++ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); ++ if (debug_print) { ++ printk(KERN_DEBUG "After sorting\n"); ++ for (i = 0; i < nr_range; i++) ++ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", ++ range[i].start, range[i].end + 1); ++ } ++ ++ /* clear those is not used */ ++ for (i = nr_range; i < RANGE_NUM; i++) ++ memset(&range[i], 0, sizeof(range[i])); ++ ++ return nr_range; ++} ++ ++static struct res_range __initdata range[RANGE_NUM]; ++static int __initdata nr_range; ++ ++#ifdef CONFIG_MTRR_SANITIZER ++ ++static unsigned long __init sum_ranges(struct res_range *range, int nr_range) ++{ ++ unsigned long sum; ++ int i; ++ ++ sum = 0; ++ for (i = 0; i < nr_range; i++) ++ sum += range[i].end + 1 - range[i].start; ++ ++ return sum; ++} ++ ++static int enable_mtrr_cleanup __initdata = ++ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; ++ ++static int __init disable_mtrr_cleanup_setup(char *str) ++{ ++ enable_mtrr_cleanup = 0; ++ return 0; ++} ++early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); ++ ++static int __init enable_mtrr_cleanup_setup(char *str) ++{ ++ enable_mtrr_cleanup = 1; ++ return 0; ++} ++early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); ++ ++static int __init mtrr_cleanup_debug_setup(char *str) ++{ ++ debug_print = 1; ++ return 0; ++} ++early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); ++ ++struct var_mtrr_state { ++ unsigned long range_startk; ++ unsigned long range_sizek; ++ unsigned long chunk_sizek; ++ unsigned long gran_sizek; ++ unsigned int reg; ++}; ++ ++static void __init ++set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, ++ unsigned char type, unsigned int address_bits) ++{ ++ u32 base_lo, base_hi, mask_lo, mask_hi; ++ u64 base, mask; ++ ++ if (!sizek) { ++ fill_mtrr_var_range(reg, 0, 0, 0, 0); ++ return; ++ } ++ ++ mask = (1ULL << address_bits) - 1; ++ mask &= ~((((u64)sizek) << 10) - 1); ++ ++ base = ((u64)basek) << 10; ++ ++ base |= type; ++ mask |= 0x800; ++ ++ base_lo = base & ((1ULL<<32) - 1); ++ base_hi = base >> 32; ++ ++ mask_lo = mask & ((1ULL<<32) - 1); ++ mask_hi = mask >> 32; ++ ++ fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); ++} ++ ++static void __init ++save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, ++ unsigned char type) ++{ ++ range_state[reg].base_pfn 
= basek >> (PAGE_SHIFT - 10); ++ range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); ++ range_state[reg].type = type; ++} ++ ++static void __init ++set_var_mtrr_all(unsigned int address_bits) ++{ ++ unsigned long basek, sizek; ++ unsigned char type; ++ unsigned int reg; ++ ++ for (reg = 0; reg < num_var_ranges; reg++) { ++ basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); ++ sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); ++ type = range_state[reg].type; ++ ++ set_var_mtrr(reg, basek, sizek, type, address_bits); ++ } ++} ++ ++static unsigned long to_size_factor(unsigned long sizek, char *factorp) ++{ ++ char factor; ++ unsigned long base = sizek; ++ ++ if (base & ((1<<10) - 1)) { ++ /* not MB alignment */ ++ factor = 'K'; ++ } else if (base & ((1<<20) - 1)) { ++ factor = 'M'; ++ base >>= 10; ++ } else { ++ factor = 'G'; ++ base >>= 20; ++ } ++ ++ *factorp = factor; ++ ++ return base; ++} ++ ++static unsigned int __init ++range_to_mtrr(unsigned int reg, unsigned long range_startk, ++ unsigned long range_sizek, unsigned char type) ++{ ++ if (!range_sizek || (reg >= num_var_ranges)) ++ return reg; ++ ++ while (range_sizek) { ++ unsigned long max_align, align; ++ unsigned long sizek; ++ ++ /* Compute the maximum size I can make a range */ ++ if (range_startk) ++ max_align = ffs(range_startk) - 1; ++ else ++ max_align = 32; ++ align = fls(range_sizek) - 1; ++ if (align > max_align) ++ align = max_align; ++ ++ sizek = 1 << align; ++ if (debug_print) { ++ char start_factor = 'K', size_factor = 'K'; ++ unsigned long start_base, size_base; ++ ++ start_base = to_size_factor(range_startk, ++ &start_factor), ++ size_base = to_size_factor(sizek, &size_factor), ++ ++ printk(KERN_DEBUG "Setting variable MTRR %d, " ++ "base: %ld%cB, range: %ld%cB, type %s\n", ++ reg, start_base, start_factor, ++ size_base, size_factor, ++ (type == MTRR_TYPE_UNCACHABLE) ? "UC" : ++ ((type == MTRR_TYPE_WRBACK) ? 
"WB" : "Other") ++ ); ++ } ++ save_var_mtrr(reg++, range_startk, sizek, type); ++ range_startk += sizek; ++ range_sizek -= sizek; ++ if (reg >= num_var_ranges) ++ break; ++ } ++ return reg; ++} ++ ++static unsigned __init ++range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, ++ unsigned long sizek) ++{ ++ unsigned long hole_basek, hole_sizek; ++ unsigned long second_basek, second_sizek; ++ unsigned long range0_basek, range0_sizek; ++ unsigned long range_basek, range_sizek; ++ unsigned long chunk_sizek; ++ unsigned long gran_sizek; ++ ++ hole_basek = 0; ++ hole_sizek = 0; ++ second_basek = 0; ++ second_sizek = 0; ++ chunk_sizek = state->chunk_sizek; ++ gran_sizek = state->gran_sizek; ++ ++ /* align with gran size, prevent small block used up MTRRs */ ++ range_basek = ALIGN(state->range_startk, gran_sizek); ++ if ((range_basek > basek) && basek) ++ return second_sizek; ++ state->range_sizek -= (range_basek - state->range_startk); ++ range_sizek = ALIGN(state->range_sizek, gran_sizek); ++ ++ while (range_sizek > state->range_sizek) { ++ range_sizek -= gran_sizek; ++ if (!range_sizek) ++ return 0; ++ } ++ state->range_sizek = range_sizek; ++ ++ /* try to append some small hole */ ++ range0_basek = state->range_startk; ++ range0_sizek = ALIGN(state->range_sizek, chunk_sizek); ++ ++ /* no increase */ ++ if (range0_sizek == state->range_sizek) { ++ if (debug_print) ++ printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", ++ range0_basek<<10, ++ (range0_basek + state->range_sizek)<<10); ++ state->reg = range_to_mtrr(state->reg, range0_basek, ++ state->range_sizek, MTRR_TYPE_WRBACK); ++ return 0; ++ } ++ ++ /* only cut back, when it is not the last */ ++ if (sizek) { ++ while (range0_basek + range0_sizek > (basek + sizek)) { ++ if (range0_sizek >= chunk_sizek) ++ range0_sizek -= chunk_sizek; ++ else ++ range0_sizek = 0; ++ ++ if (!range0_sizek) ++ break; ++ } ++ } ++ ++second_try: ++ range_basek = range0_basek + range0_sizek; ++ ++ /* one hole in the middle */ ++ if (range_basek > basek && range_basek <= (basek + sizek)) ++ second_sizek = range_basek - basek; ++ ++ if (range0_sizek > state->range_sizek) { ++ ++ /* one hole in middle or at end */ ++ hole_sizek = range0_sizek - state->range_sizek - second_sizek; ++ ++ /* hole size should be less than half of range0 size */ ++ if (hole_sizek >= (range0_sizek >> 1) && ++ range0_sizek >= chunk_sizek) { ++ range0_sizek -= chunk_sizek; ++ second_sizek = 0; ++ hole_sizek = 0; ++ ++ goto second_try; ++ } ++ } ++ ++ if (range0_sizek) { ++ if (debug_print) ++ printk(KERN_DEBUG "range0: %016lx - %016lx\n", ++ range0_basek<<10, ++ (range0_basek + range0_sizek)<<10); ++ state->reg = range_to_mtrr(state->reg, range0_basek, ++ range0_sizek, MTRR_TYPE_WRBACK); ++ } ++ ++ if (range0_sizek < state->range_sizek) { ++ /* need to handle left over */ ++ range_sizek = state->range_sizek - range0_sizek; ++ ++ if (debug_print) ++ printk(KERN_DEBUG "range: %016lx - %016lx\n", ++ range_basek<<10, ++ (range_basek + range_sizek)<<10); ++ state->reg = range_to_mtrr(state->reg, range_basek, ++ range_sizek, MTRR_TYPE_WRBACK); ++ } ++ ++ if (hole_sizek) { ++ hole_basek = range_basek - hole_sizek - second_sizek; ++ if (debug_print) ++ printk(KERN_DEBUG "hole: %016lx - %016lx\n", ++ hole_basek<<10, ++ (hole_basek + hole_sizek)<<10); ++ state->reg = range_to_mtrr(state->reg, hole_basek, ++ hole_sizek, MTRR_TYPE_UNCACHABLE); ++ } ++ ++ return second_sizek; ++} ++ ++static void __init ++set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, ++ unsigned 
long size_pfn) ++{ ++ unsigned long basek, sizek; ++ unsigned long second_sizek = 0; ++ ++ if (state->reg >= num_var_ranges) ++ return; ++ ++ basek = base_pfn << (PAGE_SHIFT - 10); ++ sizek = size_pfn << (PAGE_SHIFT - 10); ++ ++ /* See if I can merge with the last range */ ++ if ((basek <= 1024) || ++ (state->range_startk + state->range_sizek == basek)) { ++ unsigned long endk = basek + sizek; ++ state->range_sizek = endk - state->range_startk; ++ return; ++ } ++ /* Write the range mtrrs */ ++ if (state->range_sizek != 0) ++ second_sizek = range_to_mtrr_with_hole(state, basek, sizek); ++ ++ /* Allocate an msr */ ++ state->range_startk = basek + second_sizek; ++ state->range_sizek = sizek - second_sizek; ++} ++ ++/* mininum size of mtrr block that can take hole */ ++static u64 mtrr_chunk_size __initdata = (256ULL<<20); ++ ++static int __init parse_mtrr_chunk_size_opt(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ mtrr_chunk_size = memparse(p, &p); ++ return 0; ++} ++early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); ++ ++/* granity of mtrr of block */ ++static u64 mtrr_gran_size __initdata; ++ ++static int __init parse_mtrr_gran_size_opt(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ mtrr_gran_size = memparse(p, &p); ++ return 0; ++} ++early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); ++ ++static int nr_mtrr_spare_reg __initdata = ++ CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; ++ ++static int __init parse_mtrr_spare_reg(char *arg) ++{ ++ if (arg) ++ nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); ++ return 0; ++} ++ ++early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); ++ ++static int __init ++x86_setup_var_mtrrs(struct res_range *range, int nr_range, ++ u64 chunk_size, u64 gran_size) ++{ ++ struct var_mtrr_state var_state; ++ int i; ++ int num_reg; ++ ++ var_state.range_startk = 0; ++ var_state.range_sizek = 0; ++ var_state.reg = 0; ++ var_state.chunk_sizek = chunk_size >> 10; ++ var_state.gran_sizek = gran_size >> 10; ++ ++ memset(range_state, 0, sizeof(range_state)); ++ ++ /* Write the range etc */ ++ for (i = 0; i < nr_range; i++) ++ set_var_mtrr_range(&var_state, range[i].start, ++ range[i].end - range[i].start + 1); ++ ++ /* Write the last range */ ++ if (var_state.range_sizek != 0) ++ range_to_mtrr_with_hole(&var_state, 0, 0); ++ ++ num_reg = var_state.reg; ++ /* Clear out the extra MTRR's */ ++ while (var_state.reg < num_var_ranges) { ++ save_var_mtrr(var_state.reg, 0, 0, 0); ++ var_state.reg++; ++ } ++ ++ return num_reg; ++} ++ ++struct mtrr_cleanup_result { ++ unsigned long gran_sizek; ++ unsigned long chunk_sizek; ++ unsigned long lose_cover_sizek; ++ unsigned int num_reg; ++ int bad; ++}; ++ ++/* ++ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G ++ * chunk size: gran_size, ..., 2G ++ * so we need (1+16)*8 ++ */ ++#define NUM_RESULT 136 ++#define PSHIFT (PAGE_SHIFT - 10) ++ ++static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; ++static unsigned long __initdata min_loss_pfn[RANGE_NUM]; ++ ++static void __init print_out_mtrr_range_state(void) ++{ ++ int i; ++ char start_factor = 'K', size_factor = 'K'; ++ unsigned long start_base, size_base; ++ mtrr_type type; ++ ++ for (i = 0; i < num_var_ranges; i++) { ++ ++ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); ++ if (!size_base) ++ continue; ++ ++ size_base = to_size_factor(size_base, &size_factor), ++ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); ++ start_base = to_size_factor(start_base, &start_factor), ++ type = range_state[i].type; ++ ++ printk(KERN_DEBUG "reg %d, base: 
%ld%cB, range: %ld%cB, type %s\n", ++ i, start_base, start_factor, ++ size_base, size_factor, ++ (type == MTRR_TYPE_UNCACHABLE) ? "UC" : ++ ((type == MTRR_TYPE_WRPROT) ? "WP" : ++ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) ++ ); ++ } ++} ++ ++static int __init mtrr_need_cleanup(void) ++{ ++ int i; ++ mtrr_type type; ++ unsigned long size; ++ /* extra one for all 0 */ ++ int num[MTRR_NUM_TYPES + 1]; ++ ++ /* check entries number */ ++ memset(num, 0, sizeof(num)); ++ for (i = 0; i < num_var_ranges; i++) { ++ type = range_state[i].type; ++ size = range_state[i].size_pfn; ++ if (type >= MTRR_NUM_TYPES) ++ continue; ++ if (!size) ++ type = MTRR_NUM_TYPES; ++ if (type == MTRR_TYPE_WRPROT) ++ type = MTRR_TYPE_UNCACHABLE; ++ num[type]++; ++ } ++ ++ /* check if we got UC entries */ ++ if (!num[MTRR_TYPE_UNCACHABLE]) ++ return 0; ++ ++ /* check if we only had WB and UC */ ++ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != ++ num_var_ranges - num[MTRR_NUM_TYPES]) ++ return 0; ++ ++ return 1; ++} ++ ++static unsigned long __initdata range_sums; ++static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, ++ unsigned long extra_remove_base, ++ unsigned long extra_remove_size, ++ int i) ++{ ++ int num_reg; ++ static struct res_range range_new[RANGE_NUM]; ++ static int nr_range_new; ++ unsigned long range_sums_new; ++ ++ /* convert ranges to var ranges state */ ++ num_reg = x86_setup_var_mtrrs(range, nr_range, ++ chunk_size, gran_size); ++ ++ /* we got new setting in range_state, check it */ ++ memset(range_new, 0, sizeof(range_new)); ++ nr_range_new = x86_get_mtrr_mem_range(range_new, 0, ++ extra_remove_base, extra_remove_size); ++ range_sums_new = sum_ranges(range_new, nr_range_new); ++ ++ result[i].chunk_sizek = chunk_size >> 10; ++ result[i].gran_sizek = gran_size >> 10; ++ result[i].num_reg = num_reg; ++ if (range_sums < range_sums_new) { ++ result[i].lose_cover_sizek = ++ (range_sums_new - range_sums) << PSHIFT; ++ result[i].bad = 1; ++ } else ++ result[i].lose_cover_sizek = ++ (range_sums - range_sums_new) << PSHIFT; ++ ++ /* double check it */ ++ if (!result[i].bad && !result[i].lose_cover_sizek) { ++ if (nr_range_new != nr_range || ++ memcmp(range, range_new, sizeof(range))) ++ result[i].bad = 1; ++ } ++ ++ if (!result[i].bad && (range_sums - range_sums_new < ++ min_loss_pfn[num_reg])) { ++ min_loss_pfn[num_reg] = ++ range_sums - range_sums_new; ++ } ++} ++ ++static void __init mtrr_print_out_one_result(int i) ++{ ++ char gran_factor, chunk_factor, lose_factor; ++ unsigned long gran_base, chunk_base, lose_base; ++ ++ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), ++ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), ++ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), ++ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", ++ result[i].bad ? "*BAD*" : " ", ++ gran_base, gran_factor, chunk_base, chunk_factor); ++ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", ++ result[i].num_reg, result[i].bad ? 
"-" : "", ++ lose_base, lose_factor); ++} ++ ++static int __init mtrr_search_optimal_index(void) ++{ ++ int i; ++ int num_reg_good; ++ int index_good; ++ ++ if (nr_mtrr_spare_reg >= num_var_ranges) ++ nr_mtrr_spare_reg = num_var_ranges - 1; ++ num_reg_good = -1; ++ for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { ++ if (!min_loss_pfn[i]) ++ num_reg_good = i; ++ } ++ ++ index_good = -1; ++ if (num_reg_good != -1) { ++ for (i = 0; i < NUM_RESULT; i++) { ++ if (!result[i].bad && ++ result[i].num_reg == num_reg_good && ++ !result[i].lose_cover_sizek) { ++ index_good = i; ++ break; ++ } ++ } ++ } ++ ++ return index_good; ++} ++ ++ ++int __init mtrr_cleanup(unsigned address_bits) ++{ ++ unsigned long extra_remove_base, extra_remove_size; ++ unsigned long base, size, def, dummy; ++ mtrr_type type; ++ u64 chunk_size, gran_size; ++ int index_good; ++ int i; ++ ++ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) ++ return 0; ++ rdmsr(MTRRdefType_MSR, def, dummy); ++ def &= 0xff; ++ if (def != MTRR_TYPE_UNCACHABLE) ++ return 0; ++ ++ /* get it and store it aside */ ++ memset(range_state, 0, sizeof(range_state)); ++ for (i = 0; i < num_var_ranges; i++) { ++ mtrr_if->get(i, &base, &size, &type); ++ range_state[i].base_pfn = base; ++ range_state[i].size_pfn = size; ++ range_state[i].type = type; ++ } ++ ++ /* check if we need handle it and can handle it */ ++ if (!mtrr_need_cleanup()) ++ return 0; ++ ++ /* print original var MTRRs at first, for debugging: */ ++ printk(KERN_DEBUG "original variable MTRRs\n"); ++ print_out_mtrr_range_state(); ++ ++ memset(range, 0, sizeof(range)); ++ extra_remove_size = 0; ++ extra_remove_base = 1 << (32 - PAGE_SHIFT); ++ if (mtrr_tom2) ++ extra_remove_size = ++ (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; ++ nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, ++ extra_remove_size); ++ /* ++ * [0, 1M) should always be coverred by var mtrr with WB ++ * and fixed mtrrs should take effective before var mtrr for it ++ */ ++ nr_range = add_range_with_merge(range, nr_range, 0, ++ (1ULL<<(20 - PAGE_SHIFT)) - 1); ++ /* sort the ranges */ ++ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); ++ ++ range_sums = sum_ranges(range, nr_range); ++ printk(KERN_INFO "total RAM coverred: %ldM\n", ++ range_sums >> (20 - PAGE_SHIFT)); ++ ++ if (mtrr_chunk_size && mtrr_gran_size) { ++ i = 0; ++ mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, ++ extra_remove_base, extra_remove_size, i); ++ ++ mtrr_print_out_one_result(i); ++ ++ if (!result[i].bad) { ++ set_var_mtrr_all(address_bits); ++ printk(KERN_DEBUG "New variable MTRRs\n"); ++ print_out_mtrr_range_state(); ++ return 1; ++ } ++ printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " ++ "will find optimal one\n"); ++ } ++ ++ i = 0; ++ memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); ++ memset(result, 0, sizeof(result)); ++ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { ++ ++ for (chunk_size = gran_size; chunk_size < (1ULL<<32); ++ chunk_size <<= 1) { ++ ++ if (i >= NUM_RESULT) ++ continue; ++ ++ mtrr_calc_range_state(chunk_size, gran_size, ++ extra_remove_base, extra_remove_size, i); ++ if (debug_print) { ++ mtrr_print_out_one_result(i); ++ printk(KERN_INFO "\n"); ++ } ++ ++ i++; ++ } ++ } ++ ++ /* try to find the optimal index */ ++ index_good = mtrr_search_optimal_index(); ++ ++ if (index_good != -1) { ++ printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); ++ i = index_good; ++ mtrr_print_out_one_result(i); ++ ++ /* convert ranges to var ranges state */ 
++ chunk_size = result[i].chunk_sizek; ++ chunk_size <<= 10; ++ gran_size = result[i].gran_sizek; ++ gran_size <<= 10; ++ x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); ++ set_var_mtrr_all(address_bits); ++ printk(KERN_DEBUG "New variable MTRRs\n"); ++ print_out_mtrr_range_state(); ++ return 1; ++ } else { ++ /* print out all */ ++ for (i = 0; i < NUM_RESULT; i++) ++ mtrr_print_out_one_result(i); ++ } ++ ++ printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); ++ printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); ++ ++ return 0; ++} ++#else ++int __init mtrr_cleanup(unsigned address_bits) ++{ ++ return 0; ++} ++#endif ++ ++static int disable_mtrr_trim; ++ ++static int __init disable_mtrr_trim_setup(char *str) ++{ ++ disable_mtrr_trim = 1; ++ return 0; ++} ++early_param("disable_mtrr_trim", disable_mtrr_trim_setup); ++ ++/* ++ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB ++ * for memory >4GB. Check for that here. ++ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't ++ * apply to are wrong, but so far we don't know of any such case in the wild. ++ */ ++#define Tom2Enabled (1U << 21) ++#define Tom2ForceMemTypeWB (1U << 22) ++ ++int __init amd_special_default_mtrr(void) ++{ ++ u32 l, h; ++ ++ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ++ return 0; ++ if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) ++ return 0; ++ /* In case some hypervisor doesn't pass SYSCFG through */ ++ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) ++ return 0; ++ /* ++ * Memory between 4GB and top of mem is forced WB by this magic bit. ++ * Reserved before K8RevF, but should be zero there. ++ */ ++ if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == ++ (Tom2Enabled | Tom2ForceMemTypeWB)) ++ return 1; ++ return 0; ++} ++ ++static u64 __init real_trim_memory(unsigned long start_pfn, ++ unsigned long limit_pfn) ++{ ++ u64 trim_start, trim_size; ++ trim_start = start_pfn; ++ trim_start <<= PAGE_SHIFT; ++ trim_size = limit_pfn; ++ trim_size <<= PAGE_SHIFT; ++ trim_size -= trim_start; ++ ++ return e820_update_range(trim_start, trim_size, E820_RAM, ++ E820_RESERVED); ++} ++/** ++ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs ++ * @end_pfn: ending page frame number ++ * ++ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain ++ * memory configurations. This routine checks that the highest MTRR matches ++ * the end of memory, to make sure the MTRRs having a write back type cover ++ * all of the memory the kernel is intending to use. If not, it'll trim any ++ * memory off the end by adjusting end_pfn, removing it from the kernel's ++ * allocation pools, warning the user with an obnoxious message. 
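++ * Returns 1 if memory was trimmed and the e820 map updated, 0 otherwise.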
++ */ ++int __init mtrr_trim_uncached_memory(unsigned long end_pfn) ++{ ++ unsigned long i, base, size, highest_pfn = 0, def, dummy; ++ mtrr_type type; ++ u64 total_trim_size; ++ ++ /* extra one for all 0 */ ++ int num[MTRR_NUM_TYPES + 1]; ++ /* ++ * Make sure we only trim uncachable memory on machines that ++ * support the Intel MTRR architecture: ++ */ ++ if (!is_cpu(INTEL) || disable_mtrr_trim) ++ return 0; ++ rdmsr(MTRRdefType_MSR, def, dummy); ++ def &= 0xff; ++ if (def != MTRR_TYPE_UNCACHABLE) ++ return 0; ++ ++ /* get it and store it aside */ ++ memset(range_state, 0, sizeof(range_state)); ++ for (i = 0; i < num_var_ranges; i++) { ++ mtrr_if->get(i, &base, &size, &type); ++ range_state[i].base_pfn = base; ++ range_state[i].size_pfn = size; ++ range_state[i].type = type; ++ } ++ ++ /* Find highest cached pfn */ ++ for (i = 0; i < num_var_ranges; i++) { ++ type = range_state[i].type; ++ if (type != MTRR_TYPE_WRBACK) ++ continue; ++ base = range_state[i].base_pfn; ++ size = range_state[i].size_pfn; ++ if (highest_pfn < base + size) ++ highest_pfn = base + size; ++ } ++ ++ /* kvm/qemu doesn't have mtrr set right, don't trim them all */ ++ if (!highest_pfn) { ++ printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); ++ return 0; ++ } ++ ++ /* check entries number */ ++ memset(num, 0, sizeof(num)); ++ for (i = 0; i < num_var_ranges; i++) { ++ type = range_state[i].type; ++ if (type >= MTRR_NUM_TYPES) ++ continue; ++ size = range_state[i].size_pfn; ++ if (!size) ++ type = MTRR_NUM_TYPES; ++ num[type]++; ++ } ++ ++ /* no entry for WB? */ ++ if (!num[MTRR_TYPE_WRBACK]) ++ return 0; ++ ++ /* check if we only had WB and UC */ ++ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != ++ num_var_ranges - num[MTRR_NUM_TYPES]) ++ return 0; ++ ++ memset(range, 0, sizeof(range)); ++ nr_range = 0; ++ if (mtrr_tom2) { ++ range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); ++ range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; ++ if (highest_pfn < range[nr_range].end + 1) ++ highest_pfn = range[nr_range].end + 1; ++ nr_range++; ++ } ++ nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); ++ ++ total_trim_size = 0; ++ /* check the head */ ++ if (range[0].start) ++ total_trim_size += real_trim_memory(0, range[0].start); ++ /* check the holes */ ++ for (i = 0; i < nr_range - 1; i++) { ++ if (range[i].end + 1 < range[i+1].start) ++ total_trim_size += real_trim_memory(range[i].end + 1, ++ range[i+1].start); ++ } ++ /* check the top */ ++ i = nr_range - 1; ++ if (range[i].end + 1 < end_pfn) ++ total_trim_size += real_trim_memory(range[i].end + 1, ++ end_pfn); ++ ++ if (total_trim_size) { ++ printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" ++ " all of memory, losing %lluMB of RAM.\n", ++ total_trim_size >> 20); ++ ++ if (!changed_by_mtrr_cleanup) ++ WARN_ON(1); ++ ++ printk(KERN_INFO "update e820 for mtrr\n"); ++ update_e820(); ++ ++ return 1; ++ } ++ ++ return 0; ++} ++ +Index: linux-2.6-tip/arch/x86/kernel/cpu/mtrr/generic.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mtrr/generic.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/mtrr/generic.c +@@ -33,14 +33,6 @@ u64 mtrr_tom2; + struct mtrr_state_type mtrr_state = {}; + EXPORT_SYMBOL_GPL(mtrr_state); + +-static int __initdata mtrr_show; +-static int __init mtrr_debug(char *opt) +-{ +- mtrr_show = 1; +- return 0; +-} +-early_param("mtrr.show", mtrr_debug); +- + /** + * BIOS is expected to clear MtrrFixDramModEn bit, see for example + * "BIOS and Kernel Developer's 
Guide for the AMD Athlon 64 and AMD +@@ -216,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info) + get_fixed_ranges(mtrr_state.fixed_ranges); + } + +-static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) ++static unsigned __initdata last_fixed_start; ++static unsigned __initdata last_fixed_end; ++static mtrr_type __initdata last_fixed_type; ++ ++static void __init print_fixed_last(void) ++{ ++ if (!last_fixed_end) ++ return; ++ ++ printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, ++ last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); ++ ++ last_fixed_end = 0; ++} ++ ++static void __init update_fixed_last(unsigned base, unsigned end, ++ mtrr_type type) ++{ ++ last_fixed_start = base; ++ last_fixed_end = end; ++ last_fixed_type = type; ++} ++ ++static void __init print_fixed(unsigned base, unsigned step, ++ const mtrr_type *types) + { + unsigned i; + +- for (i = 0; i < 8; ++i, ++types, base += step) +- printk(KERN_INFO "MTRR %05X-%05X %s\n", +- base, base + step - 1, mtrr_attrib_to_str(*types)); ++ for (i = 0; i < 8; ++i, ++types, base += step) { ++ if (last_fixed_end == 0) { ++ update_fixed_last(base, base + step, *types); ++ continue; ++ } ++ if (last_fixed_end == base && last_fixed_type == *types) { ++ last_fixed_end = base + step; ++ continue; ++ } ++ /* new segments: gap or different type */ ++ print_fixed_last(); ++ update_fixed_last(base, base + step, *types); ++ } + } + + static void prepare_set(void); + static void post_set(void); + ++static void __init print_mtrr_state(void) ++{ ++ unsigned int i; ++ int high_width; ++ ++ printk(KERN_DEBUG "MTRR default type: %s\n", ++ mtrr_attrib_to_str(mtrr_state.def_type)); ++ if (mtrr_state.have_fixed) { ++ printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", ++ mtrr_state.enabled & 1 ? "en" : "dis"); ++ print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); ++ for (i = 0; i < 2; ++i) ++ print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); ++ for (i = 0; i < 8; ++i) ++ print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); ++ ++ /* tail */ ++ print_fixed_last(); ++ } ++ printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", ++ mtrr_state.enabled & 2 ? "en" : "dis"); ++ high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; ++ for (i = 0; i < num_var_ranges; ++i) { ++ if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) ++ printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", ++ i, ++ high_width, ++ mtrr_state.var_ranges[i].base_hi, ++ mtrr_state.var_ranges[i].base_lo >> 12, ++ high_width, ++ mtrr_state.var_ranges[i].mask_hi, ++ mtrr_state.var_ranges[i].mask_lo >> 12, ++ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); ++ else ++ printk(KERN_DEBUG " %u disabled\n", i); ++ } ++ if (mtrr_tom2) { ++ printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", ++ mtrr_tom2, mtrr_tom2>>20); ++ } ++} ++ + /* Grab all of the MTRR state for this CPU into *state */ + void __init get_mtrr_state(void) + { +@@ -259,41 +327,9 @@ void __init get_mtrr_state(void) + mtrr_tom2 |= low; + mtrr_tom2 &= 0xffffff800000ULL; + } +- if (mtrr_show) { +- int high_width; + +- printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); +- if (mtrr_state.have_fixed) { +- printk(KERN_INFO "MTRR fixed ranges %sabled:\n", +- mtrr_state.enabled & 1 ? 
"en" : "dis"); +- print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); +- for (i = 0; i < 2; ++i) +- print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); +- for (i = 0; i < 8; ++i) +- print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); +- } +- printk(KERN_INFO "MTRR variable ranges %sabled:\n", +- mtrr_state.enabled & 2 ? "en" : "dis"); +- high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; +- for (i = 0; i < num_var_ranges; ++i) { +- if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) +- printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n", +- i, +- high_width, +- mtrr_state.var_ranges[i].base_hi, +- mtrr_state.var_ranges[i].base_lo >> 12, +- high_width, +- mtrr_state.var_ranges[i].mask_hi, +- mtrr_state.var_ranges[i].mask_lo >> 12, +- mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); +- else +- printk(KERN_INFO "MTRR %u disabled\n", i); +- } +- if (mtrr_tom2) { +- printk(KERN_INFO "TOM2: %016llx aka %lldM\n", +- mtrr_tom2, mtrr_tom2>>20); +- } +- } ++ print_mtrr_state(); ++ + mtrr_state_set = 1; + + /* PAT setup for BP. We need to go through sync steps here */ +@@ -383,22 +419,31 @@ static void generic_get_mtrr(unsigned in + { + unsigned int mask_lo, mask_hi, base_lo, base_hi; + unsigned int tmp, hi; ++ int cpu; ++ ++ /* ++ * get_mtrr doesn't need to update mtrr_state, also it could be called ++ * from any cpu, so try to print it out directly. ++ */ ++ cpu = get_cpu(); + + rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); ++ + if ((mask_lo & 0x800) == 0) { + /* Invalid (i.e. free) range */ + *base = 0; + *size = 0; + *type = 0; +- return; ++ goto out_put_cpu; + } + + rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); + +- /* Work out the shifted address mask. */ ++ /* Work out the shifted address mask: */ + tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | tmp; +- /* Expand tmp with high bits to all 1s*/ ++ ++ /* Expand tmp with high bits to all 1s: */ + hi = fls(tmp); + if (hi > 0) { + tmp |= ~((1<<(hi - 1)) - 1); +@@ -409,11 +454,19 @@ static void generic_get_mtrr(unsigned in + } + } + +- /* This works correctly if size is a power of two, i.e. a +- contiguous range. */ ++ /* ++ * This works correctly if size is a power of two, i.e. 
a ++ * contiguous range: ++ */ + *size = -mask_lo; + *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *type = base_lo & 0xff; ++ ++ printk(KERN_DEBUG " get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n", ++ cpu, reg, *base, *size, ++ mtrr_attrib_to_str(*type & 0xff)); ++out_put_cpu: ++ put_cpu(); + } + + /** +@@ -495,7 +548,7 @@ static unsigned long set_mtrr_state(void + + + static unsigned long cr4 = 0; +-static DEFINE_SPINLOCK(set_atomicity_lock); ++static DEFINE_RAW_SPINLOCK(set_atomicity_lock); + + /* + * Since we are disabling the cache don't allow any interrupts - they +Index: linux-2.6-tip/arch/x86/kernel/cpu/mtrr/main.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mtrr/main.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/mtrr/main.c +@@ -574,7 +574,7 @@ struct mtrr_value { + unsigned long lsize; + }; + +-static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; ++static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; + + static int mtrr_save(struct sys_device * sysdev, pm_message_t state) + { +@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, +- &mtrr_state[i].lbase, +- &mtrr_state[i].lsize, +- &mtrr_state[i].ltype); ++ &mtrr_value[i].lbase, ++ &mtrr_value[i].lsize, ++ &mtrr_value[i].ltype); + } + return 0; + } +@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_devic + int i; + + for (i = 0; i < num_var_ranges; i++) { +- if (mtrr_state[i].lsize) ++ if (mtrr_value[i].lsize) + set_mtrr(i, +- mtrr_state[i].lbase, +- mtrr_state[i].lsize, +- mtrr_state[i].ltype); ++ mtrr_value[i].lbase, ++ mtrr_value[i].lsize, ++ mtrr_value[i].ltype); + } + return 0; + } +@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_ + .resume = mtrr_restore, + }; + +-/* should be related to MTRR_VAR_RANGES nums */ +-#define RANGE_NUM 256 +- +-struct res_range { +- unsigned long start; +- unsigned long end; +-}; +- +-static int __init +-add_range(struct res_range *range, int nr_range, unsigned long start, +- unsigned long end) +-{ +- /* out of slots */ +- if (nr_range >= RANGE_NUM) +- return nr_range; +- +- range[nr_range].start = start; +- range[nr_range].end = end; +- +- nr_range++; +- +- return nr_range; +-} +- +-static int __init +-add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, +- unsigned long end) +-{ +- int i; +- +- /* try to merge it with old one */ +- for (i = 0; i < nr_range; i++) { +- unsigned long final_start, final_end; +- unsigned long common_start, common_end; +- +- if (!range[i].end) +- continue; +- +- common_start = max(range[i].start, start); +- common_end = min(range[i].end, end); +- if (common_start > common_end + 1) +- continue; +- +- final_start = min(range[i].start, start); +- final_end = max(range[i].end, end); +- +- range[i].start = final_start; +- range[i].end = final_end; +- return nr_range; +- } +- +- /* need to add that */ +- return add_range(range, nr_range, start, end); +-} +- +-static void __init +-subtract_range(struct res_range *range, unsigned long start, unsigned long end) +-{ +- int i, j; +- +- for (j = 0; j < RANGE_NUM; j++) { +- if (!range[j].end) +- continue; +- +- if (start <= range[j].start && end >= range[j].end) { +- range[j].start = 0; +- range[j].end = 0; +- continue; +- } +- +- if (start <= range[j].start && end < range[j].end && +- range[j].start < end + 1) { +- range[j].start = end + 1; +- continue; +- } +- +- +- if (start > range[j].start && end >= range[j].end 
&& +- range[j].end > start - 1) { +- range[j].end = start - 1; +- continue; +- } +- +- if (start > range[j].start && end < range[j].end) { +- /* find the new spare */ +- for (i = 0; i < RANGE_NUM; i++) { +- if (range[i].end == 0) +- break; +- } +- if (i < RANGE_NUM) { +- range[i].end = range[j].end; +- range[i].start = end + 1; +- } else { +- printk(KERN_ERR "run of slot in ranges\n"); +- } +- range[j].end = start - 1; +- continue; +- } +- } +-} +- +-static int __init cmp_range(const void *x1, const void *x2) +-{ +- const struct res_range *r1 = x1; +- const struct res_range *r2 = x2; +- long start1, start2; +- +- start1 = r1->start; +- start2 = r2->start; +- +- return start1 - start2; +-} +- +-struct var_mtrr_range_state { +- unsigned long base_pfn; +- unsigned long size_pfn; +- mtrr_type type; +-}; +- +-static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +-static int __initdata debug_print; +- +-static int __init +-x86_get_mtrr_mem_range(struct res_range *range, int nr_range, +- unsigned long extra_remove_base, +- unsigned long extra_remove_size) +-{ +- unsigned long i, base, size; +- mtrr_type type; +- +- for (i = 0; i < num_var_ranges; i++) { +- type = range_state[i].type; +- if (type != MTRR_TYPE_WRBACK) +- continue; +- base = range_state[i].base_pfn; +- size = range_state[i].size_pfn; +- nr_range = add_range_with_merge(range, nr_range, base, +- base + size - 1); +- } +- if (debug_print) { +- printk(KERN_DEBUG "After WB checking\n"); +- for (i = 0; i < nr_range; i++) +- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", +- range[i].start, range[i].end + 1); +- } +- +- /* take out UC ranges */ +- for (i = 0; i < num_var_ranges; i++) { +- type = range_state[i].type; +- if (type != MTRR_TYPE_UNCACHABLE && +- type != MTRR_TYPE_WRPROT) +- continue; +- size = range_state[i].size_pfn; +- if (!size) +- continue; +- base = range_state[i].base_pfn; +- subtract_range(range, base, base + size - 1); +- } +- if (extra_remove_size) +- subtract_range(range, extra_remove_base, +- extra_remove_base + extra_remove_size - 1); +- +- /* get new range num */ +- nr_range = 0; +- for (i = 0; i < RANGE_NUM; i++) { +- if (!range[i].end) +- continue; +- nr_range++; +- } +- if (debug_print) { +- printk(KERN_DEBUG "After UC checking\n"); +- for (i = 0; i < nr_range; i++) +- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", +- range[i].start, range[i].end + 1); +- } +- +- /* sort the ranges */ +- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); +- if (debug_print) { +- printk(KERN_DEBUG "After sorting\n"); +- for (i = 0; i < nr_range; i++) +- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", +- range[i].start, range[i].end + 1); +- } +- +- /* clear those is not used */ +- for (i = nr_range; i < RANGE_NUM; i++) +- memset(&range[i], 0, sizeof(range[i])); +- +- return nr_range; +-} +- +-static struct res_range __initdata range[RANGE_NUM]; +-static int __initdata nr_range; +- +-#ifdef CONFIG_MTRR_SANITIZER +- +-static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +-{ +- unsigned long sum; +- int i; +- +- sum = 0; +- for (i = 0; i < nr_range; i++) +- sum += range[i].end + 1 - range[i].start; +- +- return sum; +-} +- +-static int enable_mtrr_cleanup __initdata = +- CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; +- +-static int __init disable_mtrr_cleanup_setup(char *str) +-{ +- enable_mtrr_cleanup = 0; +- return 0; +-} +-early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); +- +-static int __init enable_mtrr_cleanup_setup(char *str) +-{ +- 
enable_mtrr_cleanup = 1; +- return 0; +-} +-early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); +- +-static int __init mtrr_cleanup_debug_setup(char *str) +-{ +- debug_print = 1; +- return 0; +-} +-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); +- +-struct var_mtrr_state { +- unsigned long range_startk; +- unsigned long range_sizek; +- unsigned long chunk_sizek; +- unsigned long gran_sizek; +- unsigned int reg; +-}; +- +-static void __init +-set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, +- unsigned char type, unsigned int address_bits) +-{ +- u32 base_lo, base_hi, mask_lo, mask_hi; +- u64 base, mask; +- +- if (!sizek) { +- fill_mtrr_var_range(reg, 0, 0, 0, 0); +- return; +- } +- +- mask = (1ULL << address_bits) - 1; +- mask &= ~((((u64)sizek) << 10) - 1); +- +- base = ((u64)basek) << 10; +- +- base |= type; +- mask |= 0x800; +- +- base_lo = base & ((1ULL<<32) - 1); +- base_hi = base >> 32; +- +- mask_lo = mask & ((1ULL<<32) - 1); +- mask_hi = mask >> 32; +- +- fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); +-} +- +-static void __init +-save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, +- unsigned char type) +-{ +- range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); +- range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); +- range_state[reg].type = type; +-} +- +-static void __init +-set_var_mtrr_all(unsigned int address_bits) +-{ +- unsigned long basek, sizek; +- unsigned char type; +- unsigned int reg; +- +- for (reg = 0; reg < num_var_ranges; reg++) { +- basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); +- sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); +- type = range_state[reg].type; +- +- set_var_mtrr(reg, basek, sizek, type, address_bits); +- } +-} +- +-static unsigned long to_size_factor(unsigned long sizek, char *factorp) +-{ +- char factor; +- unsigned long base = sizek; +- +- if (base & ((1<<10) - 1)) { +- /* not MB alignment */ +- factor = 'K'; +- } else if (base & ((1<<20) - 1)){ +- factor = 'M'; +- base >>= 10; +- } else { +- factor = 'G'; +- base >>= 20; +- } +- +- *factorp = factor; +- +- return base; +-} +- +-static unsigned int __init +-range_to_mtrr(unsigned int reg, unsigned long range_startk, +- unsigned long range_sizek, unsigned char type) +-{ +- if (!range_sizek || (reg >= num_var_ranges)) +- return reg; +- +- while (range_sizek) { +- unsigned long max_align, align; +- unsigned long sizek; +- +- /* Compute the maximum size I can make a range */ +- if (range_startk) +- max_align = ffs(range_startk) - 1; +- else +- max_align = 32; +- align = fls(range_sizek) - 1; +- if (align > max_align) +- align = max_align; +- +- sizek = 1 << align; +- if (debug_print) { +- char start_factor = 'K', size_factor = 'K'; +- unsigned long start_base, size_base; +- +- start_base = to_size_factor(range_startk, &start_factor), +- size_base = to_size_factor(sizek, &size_factor), +- +- printk(KERN_DEBUG "Setting variable MTRR %d, " +- "base: %ld%cB, range: %ld%cB, type %s\n", +- reg, start_base, start_factor, +- size_base, size_factor, +- (type == MTRR_TYPE_UNCACHABLE)?"UC": +- ((type == MTRR_TYPE_WRBACK)?"WB":"Other") +- ); +- } +- save_var_mtrr(reg++, range_startk, sizek, type); +- range_startk += sizek; +- range_sizek -= sizek; +- if (reg >= num_var_ranges) +- break; +- } +- return reg; +-} +- +-static unsigned __init +-range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, +- unsigned long sizek) +-{ +- unsigned long hole_basek, hole_sizek; +- unsigned long 
second_basek, second_sizek; +- unsigned long range0_basek, range0_sizek; +- unsigned long range_basek, range_sizek; +- unsigned long chunk_sizek; +- unsigned long gran_sizek; +- +- hole_basek = 0; +- hole_sizek = 0; +- second_basek = 0; +- second_sizek = 0; +- chunk_sizek = state->chunk_sizek; +- gran_sizek = state->gran_sizek; +- +- /* align with gran size, prevent small block used up MTRRs */ +- range_basek = ALIGN(state->range_startk, gran_sizek); +- if ((range_basek > basek) && basek) +- return second_sizek; +- state->range_sizek -= (range_basek - state->range_startk); +- range_sizek = ALIGN(state->range_sizek, gran_sizek); +- +- while (range_sizek > state->range_sizek) { +- range_sizek -= gran_sizek; +- if (!range_sizek) +- return 0; +- } +- state->range_sizek = range_sizek; +- +- /* try to append some small hole */ +- range0_basek = state->range_startk; +- range0_sizek = ALIGN(state->range_sizek, chunk_sizek); +- +- /* no increase */ +- if (range0_sizek == state->range_sizek) { +- if (debug_print) +- printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", +- range0_basek<<10, +- (range0_basek + state->range_sizek)<<10); +- state->reg = range_to_mtrr(state->reg, range0_basek, +- state->range_sizek, MTRR_TYPE_WRBACK); +- return 0; +- } +- +- /* only cut back, when it is not the last */ +- if (sizek) { +- while (range0_basek + range0_sizek > (basek + sizek)) { +- if (range0_sizek >= chunk_sizek) +- range0_sizek -= chunk_sizek; +- else +- range0_sizek = 0; +- +- if (!range0_sizek) +- break; +- } +- } +- +-second_try: +- range_basek = range0_basek + range0_sizek; +- +- /* one hole in the middle */ +- if (range_basek > basek && range_basek <= (basek + sizek)) +- second_sizek = range_basek - basek; +- +- if (range0_sizek > state->range_sizek) { +- +- /* one hole in middle or at end */ +- hole_sizek = range0_sizek - state->range_sizek - second_sizek; +- +- /* hole size should be less than half of range0 size */ +- if (hole_sizek >= (range0_sizek >> 1) && +- range0_sizek >= chunk_sizek) { +- range0_sizek -= chunk_sizek; +- second_sizek = 0; +- hole_sizek = 0; +- +- goto second_try; +- } +- } +- +- if (range0_sizek) { +- if (debug_print) +- printk(KERN_DEBUG "range0: %016lx - %016lx\n", +- range0_basek<<10, +- (range0_basek + range0_sizek)<<10); +- state->reg = range_to_mtrr(state->reg, range0_basek, +- range0_sizek, MTRR_TYPE_WRBACK); +- } +- +- if (range0_sizek < state->range_sizek) { +- /* need to handle left over */ +- range_sizek = state->range_sizek - range0_sizek; +- +- if (debug_print) +- printk(KERN_DEBUG "range: %016lx - %016lx\n", +- range_basek<<10, +- (range_basek + range_sizek)<<10); +- state->reg = range_to_mtrr(state->reg, range_basek, +- range_sizek, MTRR_TYPE_WRBACK); +- } +- +- if (hole_sizek) { +- hole_basek = range_basek - hole_sizek - second_sizek; +- if (debug_print) +- printk(KERN_DEBUG "hole: %016lx - %016lx\n", +- hole_basek<<10, +- (hole_basek + hole_sizek)<<10); +- state->reg = range_to_mtrr(state->reg, hole_basek, +- hole_sizek, MTRR_TYPE_UNCACHABLE); +- } +- +- return second_sizek; +-} +- +-static void __init +-set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, +- unsigned long size_pfn) +-{ +- unsigned long basek, sizek; +- unsigned long second_sizek = 0; +- +- if (state->reg >= num_var_ranges) +- return; +- +- basek = base_pfn << (PAGE_SHIFT - 10); +- sizek = size_pfn << (PAGE_SHIFT - 10); +- +- /* See if I can merge with the last range */ +- if ((basek <= 1024) || +- (state->range_startk + state->range_sizek == basek)) { +- unsigned long endk = 
basek + sizek; +- state->range_sizek = endk - state->range_startk; +- return; +- } +- /* Write the range mtrrs */ +- if (state->range_sizek != 0) +- second_sizek = range_to_mtrr_with_hole(state, basek, sizek); +- +- /* Allocate an msr */ +- state->range_startk = basek + second_sizek; +- state->range_sizek = sizek - second_sizek; +-} +- +-/* mininum size of mtrr block that can take hole */ +-static u64 mtrr_chunk_size __initdata = (256ULL<<20); +- +-static int __init parse_mtrr_chunk_size_opt(char *p) +-{ +- if (!p) +- return -EINVAL; +- mtrr_chunk_size = memparse(p, &p); +- return 0; +-} +-early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); +- +-/* granity of mtrr of block */ +-static u64 mtrr_gran_size __initdata; +- +-static int __init parse_mtrr_gran_size_opt(char *p) +-{ +- if (!p) +- return -EINVAL; +- mtrr_gran_size = memparse(p, &p); +- return 0; +-} +-early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); +- +-static int nr_mtrr_spare_reg __initdata = +- CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; +- +-static int __init parse_mtrr_spare_reg(char *arg) +-{ +- if (arg) +- nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); +- return 0; +-} +- +-early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); +- +-static int __init +-x86_setup_var_mtrrs(struct res_range *range, int nr_range, +- u64 chunk_size, u64 gran_size) +-{ +- struct var_mtrr_state var_state; +- int i; +- int num_reg; +- +- var_state.range_startk = 0; +- var_state.range_sizek = 0; +- var_state.reg = 0; +- var_state.chunk_sizek = chunk_size >> 10; +- var_state.gran_sizek = gran_size >> 10; +- +- memset(range_state, 0, sizeof(range_state)); +- +- /* Write the range etc */ +- for (i = 0; i < nr_range; i++) +- set_var_mtrr_range(&var_state, range[i].start, +- range[i].end - range[i].start + 1); +- +- /* Write the last range */ +- if (var_state.range_sizek != 0) +- range_to_mtrr_with_hole(&var_state, 0, 0); +- +- num_reg = var_state.reg; +- /* Clear out the extra MTRR's */ +- while (var_state.reg < num_var_ranges) { +- save_var_mtrr(var_state.reg, 0, 0, 0); +- var_state.reg++; +- } +- +- return num_reg; +-} +- +-struct mtrr_cleanup_result { +- unsigned long gran_sizek; +- unsigned long chunk_sizek; +- unsigned long lose_cover_sizek; +- unsigned int num_reg; +- int bad; +-}; +- +-/* +- * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G +- * chunk size: gran_size, ..., 2G +- * so we need (1+16)*8 +- */ +-#define NUM_RESULT 136 +-#define PSHIFT (PAGE_SHIFT - 10) +- +-static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; +-static unsigned long __initdata min_loss_pfn[RANGE_NUM]; +- +-static void __init print_out_mtrr_range_state(void) +-{ +- int i; +- char start_factor = 'K', size_factor = 'K'; +- unsigned long start_base, size_base; +- mtrr_type type; +- +- for (i = 0; i < num_var_ranges; i++) { +- +- size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); +- if (!size_base) +- continue; +- +- size_base = to_size_factor(size_base, &size_factor), +- start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); +- start_base = to_size_factor(start_base, &start_factor), +- type = range_state[i].type; +- +- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", +- i, start_base, start_factor, +- size_base, size_factor, +- (type == MTRR_TYPE_UNCACHABLE) ? "UC" : +- ((type == MTRR_TYPE_WRPROT) ? "WP" : +- ((type == MTRR_TYPE_WRBACK) ? 
"WB" : "Other")) +- ); +- } +-} +- +-static int __init mtrr_need_cleanup(void) +-{ +- int i; +- mtrr_type type; +- unsigned long size; +- /* extra one for all 0 */ +- int num[MTRR_NUM_TYPES + 1]; +- +- /* check entries number */ +- memset(num, 0, sizeof(num)); +- for (i = 0; i < num_var_ranges; i++) { +- type = range_state[i].type; +- size = range_state[i].size_pfn; +- if (type >= MTRR_NUM_TYPES) +- continue; +- if (!size) +- type = MTRR_NUM_TYPES; +- if (type == MTRR_TYPE_WRPROT) +- type = MTRR_TYPE_UNCACHABLE; +- num[type]++; +- } +- +- /* check if we got UC entries */ +- if (!num[MTRR_TYPE_UNCACHABLE]) +- return 0; +- +- /* check if we only had WB and UC */ +- if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != +- num_var_ranges - num[MTRR_NUM_TYPES]) +- return 0; +- +- return 1; +-} +- +-static unsigned long __initdata range_sums; +-static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, +- unsigned long extra_remove_base, +- unsigned long extra_remove_size, +- int i) +-{ +- int num_reg; +- static struct res_range range_new[RANGE_NUM]; +- static int nr_range_new; +- unsigned long range_sums_new; +- +- /* convert ranges to var ranges state */ +- num_reg = x86_setup_var_mtrrs(range, nr_range, +- chunk_size, gran_size); +- +- /* we got new setting in range_state, check it */ +- memset(range_new, 0, sizeof(range_new)); +- nr_range_new = x86_get_mtrr_mem_range(range_new, 0, +- extra_remove_base, extra_remove_size); +- range_sums_new = sum_ranges(range_new, nr_range_new); +- +- result[i].chunk_sizek = chunk_size >> 10; +- result[i].gran_sizek = gran_size >> 10; +- result[i].num_reg = num_reg; +- if (range_sums < range_sums_new) { +- result[i].lose_cover_sizek = +- (range_sums_new - range_sums) << PSHIFT; +- result[i].bad = 1; +- } else +- result[i].lose_cover_sizek = +- (range_sums - range_sums_new) << PSHIFT; +- +- /* double check it */ +- if (!result[i].bad && !result[i].lose_cover_sizek) { +- if (nr_range_new != nr_range || +- memcmp(range, range_new, sizeof(range))) +- result[i].bad = 1; +- } +- +- if (!result[i].bad && (range_sums - range_sums_new < +- min_loss_pfn[num_reg])) { +- min_loss_pfn[num_reg] = +- range_sums - range_sums_new; +- } +-} +- +-static void __init mtrr_print_out_one_result(int i) +-{ +- char gran_factor, chunk_factor, lose_factor; +- unsigned long gran_base, chunk_base, lose_base; +- +- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), +- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), +- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), +- printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", +- result[i].bad ? "*BAD*" : " ", +- gran_base, gran_factor, chunk_base, chunk_factor); +- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", +- result[i].num_reg, result[i].bad ? 
"-" : "", +- lose_base, lose_factor); +-} +- +-static int __init mtrr_search_optimal_index(void) +-{ +- int i; +- int num_reg_good; +- int index_good; +- +- if (nr_mtrr_spare_reg >= num_var_ranges) +- nr_mtrr_spare_reg = num_var_ranges - 1; +- num_reg_good = -1; +- for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { +- if (!min_loss_pfn[i]) +- num_reg_good = i; +- } +- +- index_good = -1; +- if (num_reg_good != -1) { +- for (i = 0; i < NUM_RESULT; i++) { +- if (!result[i].bad && +- result[i].num_reg == num_reg_good && +- !result[i].lose_cover_sizek) { +- index_good = i; +- break; +- } +- } +- } +- +- return index_good; +-} +- +- +-static int __init mtrr_cleanup(unsigned address_bits) +-{ +- unsigned long extra_remove_base, extra_remove_size; +- unsigned long base, size, def, dummy; +- mtrr_type type; +- u64 chunk_size, gran_size; +- int index_good; +- int i; +- +- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) +- return 0; +- rdmsr(MTRRdefType_MSR, def, dummy); +- def &= 0xff; +- if (def != MTRR_TYPE_UNCACHABLE) +- return 0; +- +- /* get it and store it aside */ +- memset(range_state, 0, sizeof(range_state)); +- for (i = 0; i < num_var_ranges; i++) { +- mtrr_if->get(i, &base, &size, &type); +- range_state[i].base_pfn = base; +- range_state[i].size_pfn = size; +- range_state[i].type = type; +- } +- +- /* check if we need handle it and can handle it */ +- if (!mtrr_need_cleanup()) +- return 0; +- +- /* print original var MTRRs at first, for debugging: */ +- printk(KERN_DEBUG "original variable MTRRs\n"); +- print_out_mtrr_range_state(); +- +- memset(range, 0, sizeof(range)); +- extra_remove_size = 0; +- extra_remove_base = 1 << (32 - PAGE_SHIFT); +- if (mtrr_tom2) +- extra_remove_size = +- (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; +- nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, +- extra_remove_size); +- /* +- * [0, 1M) should always be coverred by var mtrr with WB +- * and fixed mtrrs should take effective before var mtrr for it +- */ +- nr_range = add_range_with_merge(range, nr_range, 0, +- (1ULL<<(20 - PAGE_SHIFT)) - 1); +- /* sort the ranges */ +- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); +- +- range_sums = sum_ranges(range, nr_range); +- printk(KERN_INFO "total RAM coverred: %ldM\n", +- range_sums >> (20 - PAGE_SHIFT)); +- +- if (mtrr_chunk_size && mtrr_gran_size) { +- i = 0; +- mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, +- extra_remove_base, extra_remove_size, i); +- +- mtrr_print_out_one_result(i); +- +- if (!result[i].bad) { +- set_var_mtrr_all(address_bits); +- return 1; +- } +- printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " +- "will find optimal one\n"); +- } +- +- i = 0; +- memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); +- memset(result, 0, sizeof(result)); +- for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { +- +- for (chunk_size = gran_size; chunk_size < (1ULL<<32); +- chunk_size <<= 1) { +- +- if (i >= NUM_RESULT) +- continue; +- +- mtrr_calc_range_state(chunk_size, gran_size, +- extra_remove_base, extra_remove_size, i); +- if (debug_print) { +- mtrr_print_out_one_result(i); +- printk(KERN_INFO "\n"); +- } +- +- i++; +- } +- } +- +- /* try to find the optimal index */ +- index_good = mtrr_search_optimal_index(); +- +- if (index_good != -1) { +- printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); +- i = index_good; +- mtrr_print_out_one_result(i); +- +- /* convert ranges to var ranges state */ +- chunk_size = result[i].chunk_sizek; +- chunk_size <<= 10; +- 
gran_size = result[i].gran_sizek; +- gran_size <<= 10; +- x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); +- set_var_mtrr_all(address_bits); +- printk(KERN_DEBUG "New variable MTRRs\n"); +- print_out_mtrr_range_state(); +- return 1; +- } else { +- /* print out all */ +- for (i = 0; i < NUM_RESULT; i++) +- mtrr_print_out_one_result(i); +- } +- +- printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); +- printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); +- +- return 0; +-} +-#else +-static int __init mtrr_cleanup(unsigned address_bits) +-{ +- return 0; +-} +-#endif +- +-static int __initdata changed_by_mtrr_cleanup; +- +-static int disable_mtrr_trim; +- +-static int __init disable_mtrr_trim_setup(char *str) +-{ +- disable_mtrr_trim = 1; +- return 0; +-} +-early_param("disable_mtrr_trim", disable_mtrr_trim_setup); +- +-/* +- * Newer AMD K8s and later CPUs have a special magic MSR way to force WB +- * for memory >4GB. Check for that here. +- * Note this won't check if the MTRRs < 4GB where the magic bit doesn't +- * apply to are wrong, but so far we don't know of any such case in the wild. +- */ +-#define Tom2Enabled (1U << 21) +-#define Tom2ForceMemTypeWB (1U << 22) +- +-int __init amd_special_default_mtrr(void) +-{ +- u32 l, h; +- +- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) +- return 0; +- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) +- return 0; +- /* In case some hypervisor doesn't pass SYSCFG through */ +- if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) +- return 0; +- /* +- * Memory between 4GB and top of mem is forced WB by this magic bit. +- * Reserved before K8RevF, but should be zero there. +- */ +- if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == +- (Tom2Enabled | Tom2ForceMemTypeWB)) +- return 1; +- return 0; +-} +- +-static u64 __init real_trim_memory(unsigned long start_pfn, +- unsigned long limit_pfn) +-{ +- u64 trim_start, trim_size; +- trim_start = start_pfn; +- trim_start <<= PAGE_SHIFT; +- trim_size = limit_pfn; +- trim_size <<= PAGE_SHIFT; +- trim_size -= trim_start; +- +- return e820_update_range(trim_start, trim_size, E820_RAM, +- E820_RESERVED); +-} +-/** +- * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs +- * @end_pfn: ending page frame number +- * +- * Some buggy BIOSes don't setup the MTRRs properly for systems with certain +- * memory configurations. This routine checks that the highest MTRR matches +- * the end of memory, to make sure the MTRRs having a write back type cover +- * all of the memory the kernel is intending to use. If not, it'll trim any +- * memory off the end by adjusting end_pfn, removing it from the kernel's +- * allocation pools, warning the user with an obnoxious message. 
+- */ +-int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +-{ +- unsigned long i, base, size, highest_pfn = 0, def, dummy; +- mtrr_type type; +- u64 total_trim_size; +- +- /* extra one for all 0 */ +- int num[MTRR_NUM_TYPES + 1]; +- /* +- * Make sure we only trim uncachable memory on machines that +- * support the Intel MTRR architecture: +- */ +- if (!is_cpu(INTEL) || disable_mtrr_trim) +- return 0; +- rdmsr(MTRRdefType_MSR, def, dummy); +- def &= 0xff; +- if (def != MTRR_TYPE_UNCACHABLE) +- return 0; +- +- /* get it and store it aside */ +- memset(range_state, 0, sizeof(range_state)); +- for (i = 0; i < num_var_ranges; i++) { +- mtrr_if->get(i, &base, &size, &type); +- range_state[i].base_pfn = base; +- range_state[i].size_pfn = size; +- range_state[i].type = type; +- } +- +- /* Find highest cached pfn */ +- for (i = 0; i < num_var_ranges; i++) { +- type = range_state[i].type; +- if (type != MTRR_TYPE_WRBACK) +- continue; +- base = range_state[i].base_pfn; +- size = range_state[i].size_pfn; +- if (highest_pfn < base + size) +- highest_pfn = base + size; +- } +- +- /* kvm/qemu doesn't have mtrr set right, don't trim them all */ +- if (!highest_pfn) { +- printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); +- return 0; +- } +- +- /* check entries number */ +- memset(num, 0, sizeof(num)); +- for (i = 0; i < num_var_ranges; i++) { +- type = range_state[i].type; +- if (type >= MTRR_NUM_TYPES) +- continue; +- size = range_state[i].size_pfn; +- if (!size) +- type = MTRR_NUM_TYPES; +- num[type]++; +- } +- +- /* no entry for WB? */ +- if (!num[MTRR_TYPE_WRBACK]) +- return 0; +- +- /* check if we only had WB and UC */ +- if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != +- num_var_ranges - num[MTRR_NUM_TYPES]) +- return 0; +- +- memset(range, 0, sizeof(range)); +- nr_range = 0; +- if (mtrr_tom2) { +- range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); +- range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; +- if (highest_pfn < range[nr_range].end + 1) +- highest_pfn = range[nr_range].end + 1; +- nr_range++; +- } +- nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); +- +- total_trim_size = 0; +- /* check the head */ +- if (range[0].start) +- total_trim_size += real_trim_memory(0, range[0].start); +- /* check the holes */ +- for (i = 0; i < nr_range - 1; i++) { +- if (range[i].end + 1 < range[i+1].start) +- total_trim_size += real_trim_memory(range[i].end + 1, +- range[i+1].start); +- } +- /* check the top */ +- i = nr_range - 1; +- if (range[i].end + 1 < end_pfn) +- total_trim_size += real_trim_memory(range[i].end + 1, +- end_pfn); +- +- if (total_trim_size) { +- printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" +- " all of memory, losing %lluMB of RAM.\n", +- total_trim_size >> 20); +- +- if (!changed_by_mtrr_cleanup) +- WARN_ON(1); +- +- printk(KERN_INFO "update e820 for mtrr\n"); +- update_e820(); +- +- return 1; +- } +- +- return 0; +-} ++int __initdata changed_by_mtrr_cleanup; + + /** + * mtrr_bp_init - initialize mtrrs on the boot CPU +Index: linux-2.6-tip/arch/x86/kernel/cpu/mtrr/mtrr.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/mtrr/mtrr.h ++++ linux-2.6-tip/arch/x86/kernel/cpu/mtrr/mtrr.h +@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if; + + extern unsigned int num_var_ranges; + extern u64 mtrr_tom2; ++extern struct mtrr_state_type mtrr_state; + + void mtrr_state_warn(void); + const char *mtrr_attrib_to_str(int x); +@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, 
unsi + int amd_init_mtrr(void); + int cyrix_init_mtrr(void); + int centaur_init_mtrr(void); ++ ++extern int changed_by_mtrr_cleanup; ++extern int mtrr_cleanup(unsigned address_bits); +Index: linux-2.6-tip/arch/x86/kernel/cpu/perf_counter.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/cpu/perf_counter.c +@@ -0,0 +1,989 @@ ++/* ++ * Performance counter x86 architecture code ++ * ++ * Copyright(C) 2008 Thomas Gleixner ++ * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar ++ * Copyright(C) 2009 Jaswinder Singh Rajput ++ * ++ * For licencing details see kernel-base/COPYING ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++static bool perf_counters_initialized __read_mostly; ++ ++/* ++ * Number of (generic) HW counters: ++ */ ++static int nr_counters_generic __read_mostly; ++static u64 perf_counter_mask __read_mostly; ++static u64 counter_value_mask __read_mostly; ++static int counter_value_bits __read_mostly; ++ ++static int nr_counters_fixed __read_mostly; ++ ++struct cpu_hw_counters { ++ struct perf_counter *counters[X86_PMC_IDX_MAX]; ++ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; ++ unsigned long interrupts; ++ u64 throttle_ctrl; ++ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; ++ int enabled; ++}; ++ ++/* ++ * struct pmc_x86_ops - performance counter x86 ops ++ */ ++struct pmc_x86_ops { ++ u64 (*save_disable_all)(void); ++ void (*restore_all)(u64); ++ u64 (*get_status)(u64); ++ void (*ack_status)(u64); ++ void (*enable)(int, u64); ++ void (*disable)(int, u64); ++ unsigned eventsel; ++ unsigned perfctr; ++ u64 (*event_map)(int); ++ u64 (*raw_event)(u64); ++ int max_events; ++}; ++ ++static struct pmc_x86_ops *pmc_ops __read_mostly; ++ ++static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { ++ .enabled = 1, ++}; ++ ++static __read_mostly int intel_perfmon_version; ++ ++/* ++ * Intel PerfMon v3. Used on Core2 and later. ++ */ ++static const u64 intel_perfmon_event_map[] = ++{ ++ [PERF_COUNT_CPU_CYCLES] = 0x003c, ++ [PERF_COUNT_INSTRUCTIONS] = 0x00c0, ++ [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, ++ [PERF_COUNT_CACHE_MISSES] = 0x412e, ++ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, ++ [PERF_COUNT_BRANCH_MISSES] = 0x00c5, ++ [PERF_COUNT_BUS_CYCLES] = 0x013c, ++}; ++ ++static u64 pmc_intel_event_map(int event) ++{ ++ return intel_perfmon_event_map[event]; ++} ++ ++static u64 pmc_intel_raw_event(u64 event) ++{ ++#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL ++#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL ++#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL ++ ++#define CORE_EVNTSEL_MASK \ ++ (CORE_EVNTSEL_EVENT_MASK | \ ++ CORE_EVNTSEL_UNIT_MASK | \ ++ CORE_EVNTSEL_COUNTER_MASK) ++ ++ return event & CORE_EVNTSEL_MASK; ++} ++ ++/* ++ * AMD Performance Monitor K7 and later. 
++ */ ++static const u64 amd_perfmon_event_map[] = ++{ ++ [PERF_COUNT_CPU_CYCLES] = 0x0076, ++ [PERF_COUNT_INSTRUCTIONS] = 0x00c0, ++ [PERF_COUNT_CACHE_REFERENCES] = 0x0080, ++ [PERF_COUNT_CACHE_MISSES] = 0x0081, ++ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, ++ [PERF_COUNT_BRANCH_MISSES] = 0x00c5, ++}; ++ ++static u64 pmc_amd_event_map(int event) ++{ ++ return amd_perfmon_event_map[event]; ++} ++ ++static u64 pmc_amd_raw_event(u64 event) ++{ ++#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL ++#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL ++#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL ++ ++#define K7_EVNTSEL_MASK \ ++ (K7_EVNTSEL_EVENT_MASK | \ ++ K7_EVNTSEL_UNIT_MASK | \ ++ K7_EVNTSEL_COUNTER_MASK) ++ ++ return event & K7_EVNTSEL_MASK; ++} ++ ++/* ++ * Propagate counter elapsed time into the generic counter. ++ * Can only be executed on the CPU where the counter is active. ++ * Returns the delta events processed. ++ */ ++static void ++x86_perf_counter_update(struct perf_counter *counter, ++ struct hw_perf_counter *hwc, int idx) ++{ ++ u64 prev_raw_count, new_raw_count, delta; ++ ++ /* ++ * Careful: an NMI might modify the previous counter value. ++ * ++ * Our tactic to handle this is to first atomically read and ++ * exchange a new raw count - then add that new-prev delta ++ * count to the generic counter atomically: ++ */ ++again: ++ prev_raw_count = atomic64_read(&hwc->prev_count); ++ rdmsrl(hwc->counter_base + idx, new_raw_count); ++ ++ if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, ++ new_raw_count) != prev_raw_count) ++ goto again; ++ ++ /* ++ * Now we have the new raw value and have updated the prev ++ * timestamp already. We can now calculate the elapsed delta ++ * (counter-)time and add that to the generic counter. ++ * ++ * Careful, not all hw sign-extends above the physical width ++ * of the count, so we do that by clipping the delta to 32 bits: ++ */ ++ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); ++ ++ atomic64_add(delta, &counter->count); ++ atomic64_sub(delta, &hwc->period_left); ++} ++ ++/* ++ * Setup the hardware configuration for a given hw_event_type ++ */ ++static int __hw_perf_counter_init(struct perf_counter *counter) ++{ ++ struct perf_counter_hw_event *hw_event = &counter->hw_event; ++ struct hw_perf_counter *hwc = &counter->hw; ++ ++ if (unlikely(!perf_counters_initialized)) ++ return -EINVAL; ++ ++ /* ++ * Generate PMC IRQs: ++ * (keep 'enabled' bit clear for now) ++ */ ++ hwc->config = ARCH_PERFMON_EVENTSEL_INT; ++ ++ /* ++ * Count user and OS events unless requested not to. 
++ */ ++ if (!hw_event->exclude_user) ++ hwc->config |= ARCH_PERFMON_EVENTSEL_USR; ++ if (!hw_event->exclude_kernel) ++ hwc->config |= ARCH_PERFMON_EVENTSEL_OS; ++ ++ /* ++ * If privileged enough, allow NMI events: ++ */ ++ hwc->nmi = 0; ++ if (capable(CAP_SYS_ADMIN) && hw_event->nmi) ++ hwc->nmi = 1; ++ ++ hwc->irq_period = hw_event->irq_period; ++ /* ++ * Intel PMCs cannot be accessed sanely above 32 bit width, ++ * so we install an artificial 1<<31 period regardless of ++ * the generic counter period: ++ */ ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ++ if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) ++ hwc->irq_period = 0x7FFFFFFF; ++ ++ atomic64_set(&hwc->period_left, hwc->irq_period); ++ ++ /* ++ * Raw event type provide the config in the event structure ++ */ ++ if (perf_event_raw(hw_event)) { ++ hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); ++ } else { ++ if (perf_event_id(hw_event) >= pmc_ops->max_events) ++ return -EINVAL; ++ /* ++ * The generic map: ++ */ ++ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); ++ } ++ counter->wakeup_pending = 0; ++ ++ return 0; ++} ++ ++static u64 pmc_intel_save_disable_all(void) ++{ ++ u64 ctrl; ++ ++ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); ++ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ++ ++ return ctrl; ++} ++ ++static u64 pmc_amd_save_disable_all(void) ++{ ++ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); ++ int enabled, idx; ++ ++ enabled = cpuc->enabled; ++ cpuc->enabled = 0; ++ /* ++ * ensure we write the disable before we start disabling the ++ * counters proper, so that pcm_amd_enable() does the right thing. ++ */ ++ barrier(); ++ ++ for (idx = 0; idx < nr_counters_generic; idx++) { ++ u64 val; ++ ++ rdmsrl(MSR_K7_EVNTSEL0 + idx, val); ++ if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { ++ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; ++ wrmsrl(MSR_K7_EVNTSEL0 + idx, val); ++ } ++ } ++ ++ return enabled; ++} ++ ++u64 hw_perf_save_disable(void) ++{ ++ if (unlikely(!perf_counters_initialized)) ++ return 0; ++ ++ return pmc_ops->save_disable_all(); ++} ++/* ++ * Exported because of ACPI idle ++ */ ++EXPORT_SYMBOL_GPL(hw_perf_save_disable); ++ ++static void pmc_intel_restore_all(u64 ctrl) ++{ ++ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); ++} ++ ++static void pmc_amd_restore_all(u64 ctrl) ++{ ++ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); ++ int idx; ++ ++ cpuc->enabled = ctrl; ++ barrier(); ++ if (!ctrl) ++ return; ++ ++ for (idx = 0; idx < nr_counters_generic; idx++) { ++ if (test_bit(idx, cpuc->active_mask)) { ++ u64 val; ++ ++ rdmsrl(MSR_K7_EVNTSEL0 + idx, val); ++ val |= ARCH_PERFMON_EVENTSEL0_ENABLE; ++ wrmsrl(MSR_K7_EVNTSEL0 + idx, val); ++ } ++ } ++} ++ ++void hw_perf_restore(u64 ctrl) ++{ ++ if (unlikely(!perf_counters_initialized)) ++ return; ++ ++ pmc_ops->restore_all(ctrl); ++} ++/* ++ * Exported because of ACPI idle ++ */ ++EXPORT_SYMBOL_GPL(hw_perf_restore); ++ ++static u64 pmc_intel_get_status(u64 mask) ++{ ++ u64 status; ++ ++ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); ++ ++ return status; ++} ++ ++static u64 pmc_amd_get_status(u64 mask) ++{ ++ u64 status = 0; ++ int idx; ++ ++ for (idx = 0; idx < nr_counters_generic; idx++) { ++ s64 val; ++ ++ if (!(mask & (1 << idx))) ++ continue; ++ ++ rdmsrl(MSR_K7_PERFCTR0 + idx, val); ++ val <<= (64 - counter_value_bits); ++ if (val >= 0) ++ status |= (1 << idx); ++ } ++ ++ return status; ++} ++ ++static u64 hw_perf_get_status(u64 mask) ++{ ++ if (unlikely(!perf_counters_initialized)) ++ return 0; ++ ++ return 
pmc_ops->get_status(mask); ++} ++ ++static void pmc_intel_ack_status(u64 ack) ++{ ++ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); ++} ++ ++static void pmc_amd_ack_status(u64 ack) ++{ ++} ++ ++static void hw_perf_ack_status(u64 ack) ++{ ++ if (unlikely(!perf_counters_initialized)) ++ return; ++ ++ pmc_ops->ack_status(ack); ++} ++ ++static void pmc_intel_enable(int idx, u64 config) ++{ ++ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, ++ config | ARCH_PERFMON_EVENTSEL0_ENABLE); ++} ++ ++static void pmc_amd_enable(int idx, u64 config) ++{ ++ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); ++ ++ set_bit(idx, cpuc->active_mask); ++ if (cpuc->enabled) ++ config |= ARCH_PERFMON_EVENTSEL0_ENABLE; ++ ++ wrmsrl(MSR_K7_EVNTSEL0 + idx, config); ++} ++ ++static void hw_perf_enable(int idx, u64 config) ++{ ++ if (unlikely(!perf_counters_initialized)) ++ return; ++ ++ pmc_ops->enable(idx, config); ++} ++ ++static void pmc_intel_disable(int idx, u64 config) ++{ ++ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); ++} ++ ++static void pmc_amd_disable(int idx, u64 config) ++{ ++ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); ++ ++ clear_bit(idx, cpuc->active_mask); ++ wrmsrl(MSR_K7_EVNTSEL0 + idx, config); ++ ++} ++ ++static void hw_perf_disable(int idx, u64 config) ++{ ++ if (unlikely(!perf_counters_initialized)) ++ return; ++ ++ pmc_ops->disable(idx, config); ++} ++ ++static inline void ++__pmc_fixed_disable(struct perf_counter *counter, ++ struct hw_perf_counter *hwc, unsigned int __idx) ++{ ++ int idx = __idx - X86_PMC_IDX_FIXED; ++ u64 ctrl_val, mask; ++ int err; ++ ++ mask = 0xfULL << (idx * 4); ++ ++ rdmsrl(hwc->config_base, ctrl_val); ++ ctrl_val &= ~mask; ++ err = checking_wrmsrl(hwc->config_base, ctrl_val); ++} ++ ++static inline void ++__pmc_generic_disable(struct perf_counter *counter, ++ struct hw_perf_counter *hwc, unsigned int idx) ++{ ++ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) ++ __pmc_fixed_disable(counter, hwc, idx); ++ else ++ hw_perf_disable(idx, hwc->config); ++} ++ ++static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); ++ ++/* ++ * Set the next IRQ period, based on the hwc->period_left value. 
++ * To be called with the counter disabled in hw: ++ */ ++static void ++__hw_perf_counter_set_period(struct perf_counter *counter, ++ struct hw_perf_counter *hwc, int idx) ++{ ++ s64 left = atomic64_read(&hwc->period_left); ++ s64 period = hwc->irq_period; ++ int err; ++ ++ /* ++ * If we are way outside a reasoable range then just skip forward: ++ */ ++ if (unlikely(left <= -period)) { ++ left = period; ++ atomic64_set(&hwc->period_left, left); ++ } ++ ++ if (unlikely(left <= 0)) { ++ left += period; ++ atomic64_set(&hwc->period_left, left); ++ } ++ ++ per_cpu(prev_left[idx], smp_processor_id()) = left; ++ ++ /* ++ * The hw counter starts counting from this counter offset, ++ * mark it to be able to extra future deltas: ++ */ ++ atomic64_set(&hwc->prev_count, (u64)-left); ++ ++ err = checking_wrmsrl(hwc->counter_base + idx, ++ (u64)(-left) & counter_value_mask); ++} ++ ++static inline void ++__pmc_fixed_enable(struct perf_counter *counter, ++ struct hw_perf_counter *hwc, unsigned int __idx) ++{ ++ int idx = __idx - X86_PMC_IDX_FIXED; ++ u64 ctrl_val, bits, mask; ++ int err; ++ ++ /* ++ * Enable IRQ generation (0x8), ++ * and enable ring-3 counting (0x2) and ring-0 counting (0x1) ++ * if requested: ++ */ ++ bits = 0x8ULL; ++ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) ++ bits |= 0x2; ++ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) ++ bits |= 0x1; ++ bits <<= (idx * 4); ++ mask = 0xfULL << (idx * 4); ++ ++ rdmsrl(hwc->config_base, ctrl_val); ++ ctrl_val &= ~mask; ++ ctrl_val |= bits; ++ err = checking_wrmsrl(hwc->config_base, ctrl_val); ++} ++ ++static void ++__pmc_generic_enable(struct perf_counter *counter, ++ struct hw_perf_counter *hwc, int idx) ++{ ++ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) ++ __pmc_fixed_enable(counter, hwc, idx); ++ else ++ hw_perf_enable(idx, hwc->config); ++} ++ ++static int ++fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) ++{ ++ unsigned int event; ++ ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ++ return -1; ++ ++ if (unlikely(hwc->nmi)) ++ return -1; ++ ++ event = hwc->config & ARCH_PERFMON_EVENT_MASK; ++ ++ if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) ++ return X86_PMC_IDX_FIXED_INSTRUCTIONS; ++ if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) ++ return X86_PMC_IDX_FIXED_CPU_CYCLES; ++ if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) ++ return X86_PMC_IDX_FIXED_BUS_CYCLES; ++ ++ return -1; ++} ++ ++/* ++ * Find a PMC slot for the freshly enabled / scheduled in counter: ++ */ ++static int pmc_generic_enable(struct perf_counter *counter) ++{ ++ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); ++ struct hw_perf_counter *hwc = &counter->hw; ++ int idx; ++ ++ idx = fixed_mode_idx(counter, hwc); ++ if (idx >= 0) { ++ /* ++ * Try to get the fixed counter, if that is already taken ++ * then try to get a generic counter: ++ */ ++ if (test_and_set_bit(idx, cpuc->used)) ++ goto try_generic; ++ ++ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ++ /* ++ * We set it so that counter_base + idx in wrmsr/rdmsr maps to ++ * MSR_ARCH_PERFMON_FIXED_CTR0 ... 
CTR2: ++ */ ++ hwc->counter_base = ++ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; ++ hwc->idx = idx; ++ } else { ++ idx = hwc->idx; ++ /* Try to get the previous generic counter again */ ++ if (test_and_set_bit(idx, cpuc->used)) { ++try_generic: ++ idx = find_first_zero_bit(cpuc->used, nr_counters_generic); ++ if (idx == nr_counters_generic) ++ return -EAGAIN; ++ ++ set_bit(idx, cpuc->used); ++ hwc->idx = idx; ++ } ++ hwc->config_base = pmc_ops->eventsel; ++ hwc->counter_base = pmc_ops->perfctr; ++ } ++ ++ perf_counters_lapic_init(hwc->nmi); ++ ++ __pmc_generic_disable(counter, hwc, idx); ++ ++ cpuc->counters[idx] = counter; ++ /* ++ * Make it visible before enabling the hw: ++ */ ++ smp_wmb(); ++ ++ __hw_perf_counter_set_period(counter, hwc, idx); ++ __pmc_generic_enable(counter, hwc, idx); ++ ++ return 0; ++} ++ ++void perf_counter_print_debug(void) ++{ ++ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; ++ struct cpu_hw_counters *cpuc; ++ int cpu, idx; ++ ++ if (!nr_counters_generic) ++ return; ++ ++ local_irq_disable(); ++ ++ cpu = smp_processor_id(); ++ cpuc = &per_cpu(cpu_hw_counters, cpu); ++ ++ if (intel_perfmon_version >= 2) { ++ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); ++ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); ++ rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); ++ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); ++ ++ pr_info("\n"); ++ pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); ++ pr_info("CPU#%d: status: %016llx\n", cpu, status); ++ pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); ++ pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); ++ } ++ pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); ++ ++ for (idx = 0; idx < nr_counters_generic; idx++) { ++ rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); ++ rdmsrl(pmc_ops->perfctr + idx, pmc_count); ++ ++ prev_left = per_cpu(prev_left[idx], cpu); ++ ++ pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", ++ cpu, idx, pmc_ctrl); ++ pr_info("CPU#%d: gen-PMC%d count: %016llx\n", ++ cpu, idx, pmc_count); ++ pr_info("CPU#%d: gen-PMC%d left: %016llx\n", ++ cpu, idx, prev_left); ++ } ++ for (idx = 0; idx < nr_counters_fixed; idx++) { ++ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); ++ ++ pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", ++ cpu, idx, pmc_count); ++ } ++ local_irq_enable(); ++} ++ ++static void pmc_generic_disable(struct perf_counter *counter) ++{ ++ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); ++ struct hw_perf_counter *hwc = &counter->hw; ++ unsigned int idx = hwc->idx; ++ ++ __pmc_generic_disable(counter, hwc, idx); ++ ++ clear_bit(idx, cpuc->used); ++ cpuc->counters[idx] = NULL; ++ /* ++ * Make sure the cleared pointer becomes visible before we ++ * (potentially) free the counter: ++ */ ++ smp_wmb(); ++ ++ /* ++ * Drain the remaining delta count out of a counter ++ * that we are disabling: ++ */ ++ x86_perf_counter_update(counter, hwc, idx); ++} ++ ++/* ++ * Save and restart an expired counter. 
Called by NMI contexts, ++ * so it has to be careful about preempting normal counter ops: ++ */ ++static void perf_save_and_restart(struct perf_counter *counter) ++{ ++ struct hw_perf_counter *hwc = &counter->hw; ++ int idx = hwc->idx; ++ ++ x86_perf_counter_update(counter, hwc, idx); ++ __hw_perf_counter_set_period(counter, hwc, idx); ++ ++ if (counter->state == PERF_COUNTER_STATE_ACTIVE) ++ __pmc_generic_enable(counter, hwc, idx); ++} ++ ++/* ++ * Maximum interrupt frequency of 100KHz per CPU ++ */ ++#define PERFMON_MAX_INTERRUPTS (100000/HZ) ++ ++/* ++ * This handler is triggered by the local APIC, so the APIC IRQ handling ++ * rules apply: ++ */ ++static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) ++{ ++ int bit, cpu = smp_processor_id(); ++ u64 ack, status; ++ struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); ++ int ret = 0; ++ ++ cpuc->throttle_ctrl = hw_perf_save_disable(); ++ ++ status = hw_perf_get_status(cpuc->throttle_ctrl); ++ if (!status) ++ goto out; ++ ++ ret = 1; ++again: ++ inc_irq_stat(apic_perf_irqs); ++ ack = status; ++ for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { ++ struct perf_counter *counter = cpuc->counters[bit]; ++ ++ clear_bit(bit, (unsigned long *) &status); ++ if (!counter) ++ continue; ++ ++ perf_save_and_restart(counter); ++ perf_counter_output(counter, nmi, regs); ++ } ++ ++ hw_perf_ack_status(ack); ++ ++ /* ++ * Repeat if there is more work to be done: ++ */ ++ status = hw_perf_get_status(cpuc->throttle_ctrl); ++ if (status) ++ goto again; ++out: ++ /* ++ * Restore - do not reenable when global enable is off or throttled: ++ */ ++ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) ++ hw_perf_restore(cpuc->throttle_ctrl); ++ ++ return ret; ++} ++ ++void perf_counter_unthrottle(void) ++{ ++ struct cpu_hw_counters *cpuc; ++ ++ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) ++ return; ++ ++ if (unlikely(!perf_counters_initialized)) ++ return; ++ ++ cpuc = &__get_cpu_var(cpu_hw_counters); ++ if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { ++ if (printk_ratelimit()) ++ printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); ++ hw_perf_restore(cpuc->throttle_ctrl); ++ } ++ cpuc->interrupts = 0; ++} ++ ++void smp_perf_counter_interrupt(struct pt_regs *regs) ++{ ++ irq_enter(); ++ apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ++ ack_APIC_irq(); ++ __smp_perf_counter_interrupt(regs, 0); ++ irq_exit(); ++} ++ ++/* ++ * This handler is triggered by NMI contexts: ++ */ ++void perf_counter_notify(struct pt_regs *regs) ++{ ++ struct cpu_hw_counters *cpuc; ++ unsigned long flags; ++ int bit, cpu; ++ ++ local_irq_save(flags); ++ cpu = smp_processor_id(); ++ cpuc = &per_cpu(cpu_hw_counters, cpu); ++ ++ for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { ++ struct perf_counter *counter = cpuc->counters[bit]; ++ ++ if (!counter) ++ continue; ++ ++ if (counter->wakeup_pending) { ++ counter->wakeup_pending = 0; ++ wake_up(&counter->waitq); ++ } ++ } ++ ++ local_irq_restore(flags); ++} ++ ++void perf_counters_lapic_init(int nmi) ++{ ++ u32 apic_val; ++ ++ if (!perf_counters_initialized) ++ return; ++ /* ++ * Enable the performance counter vector in the APIC LVT: ++ */ ++ apic_val = apic_read(APIC_LVTERR); ++ ++ apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); ++ if (nmi) ++ apic_write(APIC_LVTPC, APIC_DM_NMI); ++ else ++ apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ++ apic_write(APIC_LVTERR, apic_val); ++} ++ ++static int __kprobes ++perf_counter_nmi_handler(struct notifier_block *self, ++ unsigned long cmd, void 
*__args) ++{ ++ struct die_args *args = __args; ++ struct pt_regs *regs; ++ int ret; ++ ++ switch (cmd) { ++ case DIE_NMI: ++ case DIE_NMI_IPI: ++ break; ++ ++ default: ++ return NOTIFY_DONE; ++ } ++ ++ regs = args->regs; ++ ++ apic_write(APIC_LVTPC, APIC_DM_NMI); ++ ret = __smp_perf_counter_interrupt(regs, 1); ++ ++ return ret ? NOTIFY_STOP : NOTIFY_OK; ++} ++ ++static __read_mostly struct notifier_block perf_counter_nmi_notifier = { ++ .notifier_call = perf_counter_nmi_handler, ++ .next = NULL, ++ .priority = 1 ++}; ++ ++static struct pmc_x86_ops pmc_intel_ops = { ++ .save_disable_all = pmc_intel_save_disable_all, ++ .restore_all = pmc_intel_restore_all, ++ .get_status = pmc_intel_get_status, ++ .ack_status = pmc_intel_ack_status, ++ .enable = pmc_intel_enable, ++ .disable = pmc_intel_disable, ++ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, ++ .perfctr = MSR_ARCH_PERFMON_PERFCTR0, ++ .event_map = pmc_intel_event_map, ++ .raw_event = pmc_intel_raw_event, ++ .max_events = ARRAY_SIZE(intel_perfmon_event_map), ++}; ++ ++static struct pmc_x86_ops pmc_amd_ops = { ++ .save_disable_all = pmc_amd_save_disable_all, ++ .restore_all = pmc_amd_restore_all, ++ .get_status = pmc_amd_get_status, ++ .ack_status = pmc_amd_ack_status, ++ .enable = pmc_amd_enable, ++ .disable = pmc_amd_disable, ++ .eventsel = MSR_K7_EVNTSEL0, ++ .perfctr = MSR_K7_PERFCTR0, ++ .event_map = pmc_amd_event_map, ++ .raw_event = pmc_amd_raw_event, ++ .max_events = ARRAY_SIZE(amd_perfmon_event_map), ++}; ++ ++static struct pmc_x86_ops *pmc_intel_init(void) ++{ ++ union cpuid10_edx edx; ++ union cpuid10_eax eax; ++ unsigned int unused; ++ unsigned int ebx; ++ ++ /* ++ * Check whether the Architectural PerfMon supports ++ * Branch Misses Retired Event or not. ++ */ ++ cpuid(10, &eax.full, &ebx, &unused, &edx.full); ++ if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) ++ return NULL; ++ ++ intel_perfmon_version = eax.split.version_id; ++ if (intel_perfmon_version < 2) ++ return NULL; ++ ++ pr_info("Intel Performance Monitoring support detected.\n"); ++ pr_info("... version: %d\n", intel_perfmon_version); ++ pr_info("... bit width: %d\n", eax.split.bit_width); ++ pr_info("... mask length: %d\n", eax.split.mask_length); ++ ++ nr_counters_generic = eax.split.num_counters; ++ nr_counters_fixed = edx.split.num_counters_fixed; ++ counter_value_mask = (1ULL << eax.split.bit_width) - 1; ++ ++ return &pmc_intel_ops; ++} ++ ++static struct pmc_x86_ops *pmc_amd_init(void) ++{ ++ nr_counters_generic = 4; ++ nr_counters_fixed = 0; ++ counter_value_mask = 0x0000FFFFFFFFFFFFULL; ++ counter_value_bits = 48; ++ ++ pr_info("AMD Performance Monitoring support detected.\n"); ++ ++ return &pmc_amd_ops; ++} ++ ++void __init init_hw_perf_counters(void) ++{ ++ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) ++ return; ++ ++ switch (boot_cpu_data.x86_vendor) { ++ case X86_VENDOR_INTEL: ++ pmc_ops = pmc_intel_init(); ++ break; ++ case X86_VENDOR_AMD: ++ pmc_ops = pmc_amd_init(); ++ break; ++ } ++ if (!pmc_ops) ++ return; ++ ++ pr_info("... num counters: %d\n", nr_counters_generic); ++ if (nr_counters_generic > X86_PMC_MAX_GENERIC) { ++ nr_counters_generic = X86_PMC_MAX_GENERIC; ++ WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", ++ nr_counters_generic, X86_PMC_MAX_GENERIC); ++ } ++ perf_counter_mask = (1 << nr_counters_generic) - 1; ++ perf_max_counters = nr_counters_generic; ++ ++ pr_info("... 
value mask: %016Lx\n", counter_value_mask); ++ ++ if (nr_counters_fixed > X86_PMC_MAX_FIXED) { ++ nr_counters_fixed = X86_PMC_MAX_FIXED; ++ WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", ++ nr_counters_fixed, X86_PMC_MAX_FIXED); ++ } ++ pr_info("... fixed counters: %d\n", nr_counters_fixed); ++ ++ perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; ++ ++ pr_info("... counter mask: %016Lx\n", perf_counter_mask); ++ perf_counters_initialized = true; ++ ++ perf_counters_lapic_init(0); ++ register_die_notifier(&perf_counter_nmi_notifier); ++} ++ ++static void pmc_generic_read(struct perf_counter *counter) ++{ ++ x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); ++} ++ ++static const struct hw_perf_counter_ops x86_perf_counter_ops = { ++ .enable = pmc_generic_enable, ++ .disable = pmc_generic_disable, ++ .read = pmc_generic_read, ++}; ++ ++const struct hw_perf_counter_ops * ++hw_perf_counter_init(struct perf_counter *counter) ++{ ++ int err; ++ ++ err = __hw_perf_counter_init(counter); ++ if (err) ++ return NULL; ++ ++ return &x86_perf_counter_ops; ++} +Index: linux-2.6-tip/arch/x86/kernel/cpu/perfctr-watchdog.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/perfctr-watchdog.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/perfctr-watchdog.c +@@ -20,7 +20,7 @@ + #include + + #include +-#include ++#include + + struct nmi_watchdog_ctlblk { + unsigned int cccr_msr; +Index: linux-2.6-tip/arch/x86/kernel/cpu/proc.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/proc.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/proc.c +@@ -7,15 +7,14 @@ + /* + * Get CPU information for use by the procfs. + */ +-#ifdef CONFIG_X86_32 + static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, + unsigned int cpu) + { +-#ifdef CONFIG_X86_HT ++#ifdef CONFIG_SMP + if (c->x86_max_cores * smp_num_siblings > 1) { + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "siblings\t: %d\n", +- cpus_weight(per_cpu(cpu_core_map, cpu))); ++ cpumask_weight(cpu_sibling_mask(cpu))); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); + seq_printf(m, "apicid\t\t: %d\n", c->apicid); +@@ -24,6 +23,7 @@ static void show_cpuinfo_core(struct seq + #endif + } + ++#ifdef CONFIG_X86_32 + static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) + { + /* +@@ -50,22 +50,6 @@ static void show_cpuinfo_misc(struct seq + c->wp_works_ok ? 
"yes" : "no"); + } + #else +-static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, +- unsigned int cpu) +-{ +-#ifdef CONFIG_SMP +- if (c->x86_max_cores * smp_num_siblings > 1) { +- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); +- seq_printf(m, "siblings\t: %d\n", +- cpus_weight(per_cpu(cpu_core_map, cpu))); +- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); +- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); +- seq_printf(m, "apicid\t\t: %d\n", c->apicid); +- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); +- } +-#endif +-} +- + static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) + { + seq_printf(m, +@@ -159,9 +143,9 @@ static int show_cpuinfo(struct seq_file + static void *c_start(struct seq_file *m, loff_t *pos) + { + if (*pos == 0) /* just in case, cpu 0 is not the first */ +- *pos = first_cpu(cpu_online_map); ++ *pos = cpumask_first(cpu_online_mask); + else +- *pos = next_cpu_nr(*pos - 1, cpu_online_map); ++ *pos = cpumask_next(*pos - 1, cpu_online_mask); + if ((*pos) < nr_cpu_ids) + return &cpu_data(*pos); + return NULL; +Index: linux-2.6-tip/arch/x86/kernel/cpu/transmeta.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/transmeta.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/transmeta.c +@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(str + #endif + } + +-static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { ++static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = { + .c_vendor = "Transmeta", + .c_ident = { "GenuineTMx86", "TransmetaCPU" }, + .c_early_init = early_init_transmeta, +Index: linux-2.6-tip/arch/x86/kernel/cpu/umc.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/cpu/umc.c ++++ linux-2.6-tip/arch/x86/kernel/cpu/umc.c +@@ -8,7 +8,7 @@ + * so no special init takes place. 
+ */ + +-static struct cpu_dev umc_cpu_dev __cpuinitdata = { ++static const struct cpu_dev __cpuinitconst umc_cpu_dev = { + .c_vendor = "UMC", + .c_ident = { "UMC UMC UMC" }, + .c_models = { +Index: linux-2.6-tip/arch/x86/kernel/crash.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/crash.c ++++ linux-2.6-tip/arch/x86/kernel/crash.c +@@ -24,12 +24,10 @@ + #include + #include + #include +-#include ++#include + #include + #include + +-#include +- + + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + +Index: linux-2.6-tip/arch/x86/kernel/dumpstack.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/dumpstack.c ++++ linux-2.6-tip/arch/x86/kernel/dumpstack.c +@@ -10,10 +10,12 @@ + #include + #include + #include ++#include + #include + #include + #include + #include ++#include + + #include + +@@ -99,7 +101,7 @@ print_context_stack(struct thread_info * + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { +- ops->address(data, addr, bp == 0); ++ ops->address(data, addr, 0); + } + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } +@@ -186,7 +188,7 @@ void dump_stack(void) + } + EXPORT_SYMBOL(dump_stack); + +-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; ++static raw_spinlock_t die_lock = RAW_SPIN_LOCK_UNLOCKED(die_lock); + static int die_owner = -1; + static unsigned int die_nest_count; + +@@ -195,16 +197,21 @@ unsigned __kprobes long oops_begin(void) + int cpu; + unsigned long flags; + ++ /* notify the hw-branch tracer so it may disable tracing and ++ add the last trace to the trace buffer - ++ the earlier this happens, the more useful the trace. */ ++ trace_hw_branch_oops(); ++ + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); +- if (!__raw_spin_trylock(&die_lock)) { ++ if (!spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else +- __raw_spin_lock(&die_lock); ++ spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; +@@ -224,7 +231,7 @@ void __kprobes oops_end(unsigned long fl + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ +- __raw_spin_unlock(&die_lock); ++ spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + +Index: linux-2.6-tip/arch/x86/kernel/dumpstack_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/dumpstack_64.c ++++ linux-2.6-tip/arch/x86/kernel/dumpstack_64.c +@@ -23,10 +23,14 @@ static unsigned long *in_exception_stack + unsigned *usedp, char **idp) + { + static char ids[][8] = { ++#if DEBUG_STACK > 0 + [DEBUG_STACK - 1] = "#DB", ++#endif + [NMI_STACK - 1] = "NMI", + [DOUBLEFAULT_STACK - 1] = "#DF", ++#if STACKFAULT_STACK > 0 + [STACKFAULT_STACK - 1] = "#SS", ++#endif + [MCE_STACK - 1] = "#MC", + #if DEBUG_STKSZ > EXCEPTION_STKSZ + [N_EXCEPTION_STACKS ... 
+@@ -106,7 +110,8 @@ void dump_trace(struct task_struct *task + const struct stacktrace_ops *ops, void *data) + { + const unsigned cpu = get_cpu(); +- unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; ++ unsigned long *irq_stack_end = ++ (unsigned long *)per_cpu(irq_stack_ptr, cpu); + unsigned used = 0; + struct thread_info *tinfo; + int graph = 0; +@@ -160,23 +165,23 @@ void dump_trace(struct task_struct *task + stack = (unsigned long *) estack_end[-2]; + continue; + } +- if (irqstack_end) { +- unsigned long *irqstack; +- irqstack = irqstack_end - +- (IRQSTACKSIZE - 64) / sizeof(*irqstack); ++ if (irq_stack_end) { ++ unsigned long *irq_stack; ++ irq_stack = irq_stack_end - ++ (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); + +- if (stack >= irqstack && stack < irqstack_end) { ++ if (stack >= irq_stack && stack < irq_stack_end) { + if (ops->stack(data, "IRQ") < 0) + break; + bp = print_context_stack(tinfo, stack, bp, +- ops, data, irqstack_end, &graph); ++ ops, data, irq_stack_end, &graph); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ +- stack = (unsigned long *) (irqstack_end[-1]); +- irqstack_end = NULL; ++ stack = (unsigned long *) (irq_stack_end[-1]); ++ irq_stack_end = NULL; + ops->stack(data, "EOI"); + continue; + } +@@ -199,10 +204,10 @@ show_stack_log_lvl(struct task_struct *t + unsigned long *stack; + int i; + const int cpu = smp_processor_id(); +- unsigned long *irqstack_end = +- (unsigned long *) (cpu_pda(cpu)->irqstackptr); +- unsigned long *irqstack = +- (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); ++ unsigned long *irq_stack_end = ++ (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); ++ unsigned long *irq_stack = ++ (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + + /* + * debugging aid: "show_stack(NULL, NULL);" prints the +@@ -218,9 +223,9 @@ show_stack_log_lvl(struct task_struct *t + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { +- if (stack >= irqstack && stack <= irqstack_end) { +- if (stack == irqstack_end) { +- stack = (unsigned long *) (irqstack_end[-1]); ++ if (stack >= irq_stack && stack <= irq_stack_end) { ++ if (stack == irq_stack_end) { ++ stack = (unsigned long *) (irq_stack_end[-1]); + printk(" "); + } + } else { +@@ -241,7 +246,7 @@ void show_registers(struct pt_regs *regs + int i; + unsigned long sp; + const int cpu = smp_processor_id(); +- struct task_struct *cur = cpu_pda(cpu)->pcurrent; ++ struct task_struct *cur = current; + + sp = regs->sp; + printk("CPU %d ", cpu); +Index: linux-2.6-tip/arch/x86/kernel/e820.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/e820.c ++++ linux-2.6-tip/arch/x86/kernel/e820.c +@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u6 + /* + * Add a memory region to the kernel e820 map. + */ +-void __init e820_add_region(u64 start, u64 size, int type) ++static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, ++ int type) + { +- int x = e820.nr_map; ++ int x = e820x->nr_map; + +- if (x == ARRAY_SIZE(e820.map)) { ++ if (x == ARRAY_SIZE(e820x->map)) { + printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); + return; + } + +- e820.map[x].addr = start; +- e820.map[x].size = size; +- e820.map[x].type = type; +- e820.nr_map++; ++ e820x->map[x].addr = start; ++ e820x->map[x].size = size; ++ e820x->map[x].type = type; ++ e820x->nr_map++; ++} ++ ++void __init e820_add_region(u64 start, u64 size, int type) ++{ ++ __e820_add_region(&e820, start, size, type); ++} ++ ++static void __init e820_print_type(u32 type) ++{ ++ switch (type) { ++ case E820_RAM: ++ case E820_RESERVED_KERN: ++ printk(KERN_CONT "(usable)"); ++ break; ++ case E820_RESERVED: ++ printk(KERN_CONT "(reserved)"); ++ break; ++ case E820_ACPI: ++ printk(KERN_CONT "(ACPI data)"); ++ break; ++ case E820_NVS: ++ printk(KERN_CONT "(ACPI NVS)"); ++ break; ++ case E820_UNUSABLE: ++ printk(KERN_CONT "(unusable)"); ++ break; ++ default: ++ printk(KERN_CONT "type %u", type); ++ break; ++ } + } + + void __init e820_print_map(char *who) +@@ -134,27 +165,8 @@ void __init e820_print_map(char *who) + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); +- switch (e820.map[i].type) { +- case E820_RAM: +- case E820_RESERVED_KERN: +- printk(KERN_CONT "(usable)\n"); +- break; +- case E820_RESERVED: +- printk(KERN_CONT "(reserved)\n"); +- break; +- case E820_ACPI: +- printk(KERN_CONT "(ACPI data)\n"); +- break; +- case E820_NVS: +- printk(KERN_CONT "(ACPI NVS)\n"); +- break; +- case E820_UNUSABLE: +- printk("(unusable)\n"); +- break; +- default: +- printk(KERN_CONT "type %u\n", e820.map[i].type); +- break; +- } ++ e820_print_type(e820.map[i].type); ++ printk(KERN_CONT "\n"); + } + } + +@@ -221,7 +233,7 @@ void __init e820_print_map(char *who) + */ + + int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, +- int *pnr_map) ++ u32 *pnr_map) + { + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ +@@ -417,11 +429,12 @@ static int __init append_e820_map(struct + return __append_e820_map(biosmap, nr_map); + } + +-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start, ++static u64 __init __e820_update_range(struct e820map *e820x, u64 start, + u64 size, unsigned old_type, + unsigned new_type) + { +- int i; ++ u64 end; ++ unsigned int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); +@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map( + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + +- for (i = 0; i < e820.nr_map; i++) { ++ end = start + size; ++ printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", ++ (unsigned long long) start, ++ (unsigned long long) end); ++ e820_print_type(old_type); ++ printk(KERN_CONT " ==> "); ++ e820_print_type(new_type); ++ printk(KERN_CONT "\n"); ++ ++ for (i = 0; i < e820x->nr_map; i++) { + struct e820entry *ei = &e820x->map[i]; + u64 final_start, final_end; ++ u64 ei_end; ++ + if (ei->type != old_type) + continue; +- /* totally covered? */ +- if (ei->addr >= start && +- (ei->addr + ei->size) <= (start + size)) { ++ ++ ei_end = ei->addr + ei->size; ++ /* totally covered by new range? */ ++ if (ei->addr >= start && ei_end <= end) { + ei->type = new_type; + real_updated_size += ei->size; + continue; + } ++ ++ /* new range is totally covered? 
*/ ++ if (ei->addr < start && ei_end > end) { ++ __e820_add_region(e820x, start, size, new_type); ++ __e820_add_region(e820x, end, ei_end - end, ei->type); ++ ei->size = start - ei->addr; ++ real_updated_size += size; ++ continue; ++ } ++ + /* partially covered */ + final_start = max(start, ei->addr); +- final_end = min(start + size, ei->addr + ei->size); ++ final_end = min(end, ei_end); + if (final_start >= final_end) + continue; +- e820_add_region(final_start, final_end - final_start, +- new_type); ++ ++ __e820_add_region(e820x, final_start, final_end - final_start, ++ new_type); ++ + real_updated_size += final_end - final_start; + ++ /* ++ * left range could be head or tail, so need to update ++ * size at first. ++ */ + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; +@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map( + u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) + { +- return e820_update_range_map(&e820, start, size, old_type, new_type); ++ return __e820_update_range(&e820, start, size, old_type, new_type); + } + + static u64 __init e820_update_range_saved(u64 start, u64 size, + unsigned old_type, unsigned new_type) + { +- return e820_update_range_map(&e820_saved, start, size, old_type, ++ return __e820_update_range(&e820_saved, start, size, old_type, + new_type); + } + +@@ -511,7 +552,7 @@ u64 __init e820_remove_range(u64 start, + + void __init update_e820(void) + { +- int nr_map; ++ u32 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) +@@ -522,7 +563,7 @@ void __init update_e820(void) + } + static void __init update_e820_saved(void) + { +- int nr_map; ++ u32 nr_map; + + nr_map = e820_saved.nr_map; + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) +@@ -858,6 +899,9 @@ void __init reserve_early_overlap_ok(u64 + */ + void __init reserve_early(u64 start, u64 end, char *name) + { ++ if (start >= end) ++ return; ++ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); + } +@@ -1017,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start + continue; + return addr; + } +- return -1UL; + ++ return -1ULL; + } + + /* +@@ -1031,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt + u64 start; + + start = startt; +- while (size < sizet) ++ while (size < sizet && (start + 1)) + start = find_e820_area_size(start, &size, align); + + if (size < sizet) + return 0; + ++#ifdef CONFIG_X86_32 ++ if (start >= MAXMEM) ++ return 0; ++ if (start + size > MAXMEM) ++ size = MAXMEM - start; ++#endif ++ + addr = round_down(start + size - sizet, align); ++ if (addr < start) ++ return 0; + e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); + e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820 for early_reserve_e820\n"); +@@ -1250,7 +1303,7 @@ early_param("memmap", parse_memmap_opt); + void __init finish_e820_parsing(void) + { + if (userdef) { +- int nr = e820.nr_map; ++ u32 nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) + early_panic("Invalid user supplied memory map"); +@@ -1333,7 +1386,7 @@ void __init e820_reserve_resources_late( + char *__init default_machine_specific_memory_setup(void) + { + char *who = "BIOS-e820"; +- int new_nr; ++ u32 new_nr; + /* + * Try to copy the BIOS-supplied E820-map. 
+ * +Index: linux-2.6-tip/arch/x86/kernel/early-quirks.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/early-quirks.c ++++ linux-2.6-tip/arch/x86/kernel/early-quirks.c +@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, + } + + #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) ++#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) + static u32 __init ati_ixp4x0_rev(int num, int slot, int func) + { + u32 d; +@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num + d &= 0xff; + return d; + } ++#endif + + static void __init ati_bugs(int num, int slot, int func) + { +Index: linux-2.6-tip/arch/x86/kernel/early_printk.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/early_printk.c ++++ linux-2.6-tip/arch/x86/kernel/early_printk.c +@@ -13,8 +13,8 @@ + #include + #include + #include +-#include + #include ++#include + #include + + /* Simple VGA output */ +@@ -59,7 +59,7 @@ static void early_vga_write(struct conso + static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, +- .flags = CON_PRINTBUFFER, ++ .flags = CON_PRINTBUFFER | CON_ATOMIC, + .index = -1, + }; + +@@ -156,7 +156,7 @@ static __init void early_serial_init(cha + static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, +- .flags = CON_PRINTBUFFER, ++ .flags = CON_PRINTBUFFER | CON_ATOMIC, + .index = -1, + }; + +@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void + return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); + } + +-static void dbgp_mdelay(int ms) ++static void __init dbgp_mdelay(int ms) + { + int i; + +@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *bu + writel(hi, &ehci_debug->data47); + } + +-static void dbgp_get_data(void *buf, int size) ++static void __init dbgp_get_data(void *buf, int size) + { + unsigned char *bytes = buf; + u32 lo, hi; +@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devn + return ret; + } + +-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, ++static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, + int size) + { + u32 pids, addr, ctrl; +@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnu + return ret; + } + +-static int dbgp_control_msg(unsigned devnum, int requesttype, int request, +- int value, int index, void *data, int size) ++static int __init dbgp_control_msg(unsigned devnum, int requesttype, ++ int request, int value, int index, void *data, int size) + { + u32 pids, addr, ctrl; + struct usb_ctrlrequest req; +@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num + return 0; + } + +-static int ehci_reset_port(int port) ++static int __init ehci_reset_port(int port) + { + u32 portsc; + u32 delay_time, delay; +@@ -532,7 +532,7 @@ static int ehci_reset_port(int port) + return -EBUSY; + } + +-static int ehci_wait_for_port(int port) ++static int __init ehci_wait_for_port(int port) + { + u32 status; + int ret, reps; +@@ -557,13 +557,13 @@ static inline void dbgp_printk(const cha + + typedef void (*set_debug_port_t)(int port); + +-static void default_set_debug_port(int port) ++static void __init default_set_debug_port(int port) + { + } + +-static set_debug_port_t set_debug_port = default_set_debug_port; ++static set_debug_port_t __initdata set_debug_port = default_set_debug_port; + +-static void nvidia_set_debug_port(int port) ++static void __init nvidia_set_debug_port(int 
port) + { + u32 dword; + dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, +@@ -881,7 +881,7 @@ static int __initdata early_console_init + + asmlinkage void early_printk(const char *fmt, ...) + { +- char buf[512]; ++ static char buf[512]; + int n; + va_list ap; + +Index: linux-2.6-tip/arch/x86/kernel/efi.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/efi.c ++++ linux-2.6-tip/arch/x86/kernel/efi.c +@@ -366,10 +366,12 @@ void __init efi_init(void) + SMBIOS_TABLE_GUID)) { + efi.smbios = config_tables[i].table; + printk(" SMBIOS=0x%lx ", config_tables[i].table); ++#ifdef CONFIG_X86_UV + } else if (!efi_guidcmp(config_tables[i].guid, + UV_SYSTEM_TABLE_GUID)) { + efi.uv_systab = config_tables[i].table; + printk(" UVsystab=0x%lx ", config_tables[i].table); ++#endif + } else if (!efi_guidcmp(config_tables[i].guid, + HCDP_TABLE_GUID)) { + efi.hcdp = config_tables[i].table; +Index: linux-2.6-tip/arch/x86/kernel/efi_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/efi_64.c ++++ linux-2.6-tip/arch/x86/kernel/efi_64.c +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + static pgd_t save_pgd __initdata; + static unsigned long efi_flags __initdata; +Index: linux-2.6-tip/arch/x86/kernel/efi_stub_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/efi_stub_32.S ++++ linux-2.6-tip/arch/x86/kernel/efi_stub_32.S +@@ -6,7 +6,7 @@ + */ + + #include +-#include ++#include + + /* + * efi_call_phys(void *, ...) is a function with variable parameters. +@@ -113,6 +113,7 @@ ENTRY(efi_call_phys) + movl (%edx), %ecx + pushl %ecx + ret ++ENDPROC(efi_call_phys) + .previous + + .data +Index: linux-2.6-tip/arch/x86/kernel/efi_stub_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/efi_stub_64.S ++++ linux-2.6-tip/arch/x86/kernel/efi_stub_64.S +@@ -41,6 +41,7 @@ ENTRY(efi_call0) + addq $32, %rsp + RESTORE_XMM + ret ++ENDPROC(efi_call0) + + ENTRY(efi_call1) + SAVE_XMM +@@ -50,6 +51,7 @@ ENTRY(efi_call1) + addq $32, %rsp + RESTORE_XMM + ret ++ENDPROC(efi_call1) + + ENTRY(efi_call2) + SAVE_XMM +@@ -59,6 +61,7 @@ ENTRY(efi_call2) + addq $32, %rsp + RESTORE_XMM + ret ++ENDPROC(efi_call2) + + ENTRY(efi_call3) + SAVE_XMM +@@ -69,6 +72,7 @@ ENTRY(efi_call3) + addq $32, %rsp + RESTORE_XMM + ret ++ENDPROC(efi_call3) + + ENTRY(efi_call4) + SAVE_XMM +@@ -80,6 +84,7 @@ ENTRY(efi_call4) + addq $32, %rsp + RESTORE_XMM + ret ++ENDPROC(efi_call4) + + ENTRY(efi_call5) + SAVE_XMM +@@ -92,6 +97,7 @@ ENTRY(efi_call5) + addq $48, %rsp + RESTORE_XMM + ret ++ENDPROC(efi_call5) + + ENTRY(efi_call6) + SAVE_XMM +@@ -107,3 +113,4 @@ ENTRY(efi_call6) + addq $48, %rsp + RESTORE_XMM + ret ++ENDPROC(efi_call6) +Index: linux-2.6-tip/arch/x86/kernel/entry_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/entry_32.S ++++ linux-2.6-tip/arch/x86/kernel/entry_32.S +@@ -30,12 +30,13 @@ + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs +- * 28(%esp) - orig_eax +- * 2C(%esp) - %eip +- * 30(%esp) - %cs +- * 34(%esp) - %eflags +- * 38(%esp) - %oldesp +- * 3C(%esp) - %oldss ++ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS ++ * 2C(%esp) - orig_eax ++ * 30(%esp) - %eip ++ * 34(%esp) - %cs ++ * 38(%esp) - %eflags ++ * 3C(%esp) - %oldesp ++ * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow 
entries. + */ +@@ -46,7 +47,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -101,121 +102,221 @@ + #define resume_userspace_sig resume_userspace + #endif + +-#define SAVE_ALL \ +- cld; \ +- pushl %fs; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- /*CFI_REL_OFFSET fs, 0;*/\ +- pushl %es; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- /*CFI_REL_OFFSET es, 0;*/\ +- pushl %ds; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- /*CFI_REL_OFFSET ds, 0;*/\ +- pushl %eax; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- CFI_REL_OFFSET eax, 0;\ +- pushl %ebp; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- CFI_REL_OFFSET ebp, 0;\ +- pushl %edi; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- CFI_REL_OFFSET edi, 0;\ +- pushl %esi; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- CFI_REL_OFFSET esi, 0;\ +- pushl %edx; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- CFI_REL_OFFSET edx, 0;\ +- pushl %ecx; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- CFI_REL_OFFSET ecx, 0;\ +- pushl %ebx; \ +- CFI_ADJUST_CFA_OFFSET 4;\ +- CFI_REL_OFFSET ebx, 0;\ +- movl $(__USER_DS), %edx; \ +- movl %edx, %ds; \ +- movl %edx, %es; \ +- movl $(__KERNEL_PERCPU), %edx; \ ++/* ++ * User gs save/restore ++ * ++ * %gs is used for userland TLS and kernel only uses it for stack ++ * canary which is required to be at %gs:20 by gcc. Read the comment ++ * at the top of stackprotector.h for more info. ++ * ++ * Local labels 98 and 99 are used. ++ */ ++#ifdef CONFIG_X86_32_LAZY_GS ++ ++ /* unfortunately push/pop can't be no-op */ ++.macro PUSH_GS ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++.endm ++.macro POP_GS pop=0 ++ addl $(4 + \pop), %esp ++ CFI_ADJUST_CFA_OFFSET -(4 + \pop) ++.endm ++.macro POP_GS_EX ++.endm ++ ++ /* all the rest are no-op */ ++.macro PTGS_TO_GS ++.endm ++.macro PTGS_TO_GS_EX ++.endm ++.macro GS_TO_REG reg ++.endm ++.macro REG_TO_PTGS reg ++.endm ++.macro SET_KERNEL_GS reg ++.endm ++ ++#else /* CONFIG_X86_32_LAZY_GS */ ++ ++.macro PUSH_GS ++ pushl %gs ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET gs, 0*/ ++.endm ++ ++.macro POP_GS pop=0 ++98: popl %gs ++ CFI_ADJUST_CFA_OFFSET -4 ++ /*CFI_RESTORE gs*/ ++ .if \pop <> 0 ++ add $\pop, %esp ++ CFI_ADJUST_CFA_OFFSET -\pop ++ .endif ++.endm ++.macro POP_GS_EX ++.pushsection .fixup, "ax" ++99: movl $0, (%esp) ++ jmp 98b ++.section __ex_table, "a" ++ .align 4 ++ .long 98b, 99b ++.popsection ++.endm ++ ++.macro PTGS_TO_GS ++98: mov PT_GS(%esp), %gs ++.endm ++.macro PTGS_TO_GS_EX ++.pushsection .fixup, "ax" ++99: movl $0, PT_GS(%esp) ++ jmp 98b ++.section __ex_table, "a" ++ .align 4 ++ .long 98b, 99b ++.popsection ++.endm ++ ++.macro GS_TO_REG reg ++ movl %gs, \reg ++ /*CFI_REGISTER gs, \reg*/ ++.endm ++.macro REG_TO_PTGS reg ++ movl \reg, PT_GS(%esp) ++ /*CFI_REL_OFFSET gs, PT_GS*/ ++.endm ++.macro SET_KERNEL_GS reg ++ movl $(__KERNEL_STACK_CANARY), \reg ++ movl \reg, %gs ++.endm ++ ++#endif /* CONFIG_X86_32_LAZY_GS */ ++ ++.macro SAVE_ALL ++ cld ++ PUSH_GS ++ pushl %fs ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET fs, 0;*/ ++ pushl %es ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET es, 0;*/ ++ pushl %ds ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET ds, 0;*/ ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET eax, 0 ++ pushl %ebp ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ebp, 0 ++ pushl %edi ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET edi, 0 ++ pushl %esi ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET esi, 0 ++ pushl %edx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET edx, 0 ++ pushl %ecx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ecx, 0 ++ pushl %ebx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ebx, 0 ++ movl $(__USER_DS), %edx ++ movl 
%edx, %ds ++ movl %edx, %es ++ movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs ++ SET_KERNEL_GS %edx ++.endm + +-#define RESTORE_INT_REGS \ +- popl %ebx; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- CFI_RESTORE ebx;\ +- popl %ecx; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- CFI_RESTORE ecx;\ +- popl %edx; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- CFI_RESTORE edx;\ +- popl %esi; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- CFI_RESTORE esi;\ +- popl %edi; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- CFI_RESTORE edi;\ +- popl %ebp; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- CFI_RESTORE ebp;\ +- popl %eax; \ +- CFI_ADJUST_CFA_OFFSET -4;\ ++.macro RESTORE_INT_REGS ++ popl %ebx ++ CFI_ADJUST_CFA_OFFSET -4 ++ CFI_RESTORE ebx ++ popl %ecx ++ CFI_ADJUST_CFA_OFFSET -4 ++ CFI_RESTORE ecx ++ popl %edx ++ CFI_ADJUST_CFA_OFFSET -4 ++ CFI_RESTORE edx ++ popl %esi ++ CFI_ADJUST_CFA_OFFSET -4 ++ CFI_RESTORE esi ++ popl %edi ++ CFI_ADJUST_CFA_OFFSET -4 ++ CFI_RESTORE edi ++ popl %ebp ++ CFI_ADJUST_CFA_OFFSET -4 ++ CFI_RESTORE ebp ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE eax ++.endm + +-#define RESTORE_REGS \ +- RESTORE_INT_REGS; \ +-1: popl %ds; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- /*CFI_RESTORE ds;*/\ +-2: popl %es; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- /*CFI_RESTORE es;*/\ +-3: popl %fs; \ +- CFI_ADJUST_CFA_OFFSET -4;\ +- /*CFI_RESTORE fs;*/\ +-.pushsection .fixup,"ax"; \ +-4: movl $0,(%esp); \ +- jmp 1b; \ +-5: movl $0,(%esp); \ +- jmp 2b; \ +-6: movl $0,(%esp); \ +- jmp 3b; \ +-.section __ex_table,"a";\ +- .align 4; \ +- .long 1b,4b; \ +- .long 2b,5b; \ +- .long 3b,6b; \ ++.macro RESTORE_REGS pop=0 ++ RESTORE_INT_REGS ++1: popl %ds ++ CFI_ADJUST_CFA_OFFSET -4 ++ /*CFI_RESTORE ds;*/ ++2: popl %es ++ CFI_ADJUST_CFA_OFFSET -4 ++ /*CFI_RESTORE es;*/ ++3: popl %fs ++ CFI_ADJUST_CFA_OFFSET -4 ++ /*CFI_RESTORE fs;*/ ++ POP_GS \pop ++.pushsection .fixup, "ax" ++4: movl $0, (%esp) ++ jmp 1b ++5: movl $0, (%esp) ++ jmp 2b ++6: movl $0, (%esp) ++ jmp 3b ++.section __ex_table, "a" ++ .align 4 ++ .long 1b, 4b ++ .long 2b, 5b ++ .long 3b, 6b + .popsection ++ POP_GS_EX ++.endm + +-#define RING0_INT_FRAME \ +- CFI_STARTPROC simple;\ +- CFI_SIGNAL_FRAME;\ +- CFI_DEF_CFA esp, 3*4;\ +- /*CFI_OFFSET cs, -2*4;*/\ ++.macro RING0_INT_FRAME ++ CFI_STARTPROC simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA esp, 3*4 ++ /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 ++.endm + +-#define RING0_EC_FRAME \ +- CFI_STARTPROC simple;\ +- CFI_SIGNAL_FRAME;\ +- CFI_DEF_CFA esp, 4*4;\ +- /*CFI_OFFSET cs, -2*4;*/\ ++.macro RING0_EC_FRAME ++ CFI_STARTPROC simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA esp, 4*4 ++ /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 ++.endm + +-#define RING0_PTREGS_FRAME \ +- CFI_STARTPROC simple;\ +- CFI_SIGNAL_FRAME;\ +- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ +- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ +- CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ +- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ +- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ +- CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ +- CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ +- CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ +- CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ +- CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ +- CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ ++.macro RING0_PTREGS_FRAME ++ CFI_STARTPROC simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX ++ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ ++ CFI_OFFSET eip, PT_EIP-PT_OLDESP ++ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ ++ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ ++ CFI_OFFSET eax, PT_EAX-PT_OLDESP ++ CFI_OFFSET ebp, PT_EBP-PT_OLDESP ++ CFI_OFFSET edi, PT_EDI-PT_OLDESP ++ CFI_OFFSET esi, PT_ESI-PT_OLDESP ++ CFI_OFFSET edx, PT_EDX-PT_OLDESP 
++ CFI_OFFSET ecx, PT_ECX-PT_OLDESP + CFI_OFFSET ebx, PT_EBX-PT_OLDESP ++.endm + + ENTRY(ret_from_fork) + CFI_STARTPROC +@@ -270,14 +371,18 @@ END(ret_from_exception) + #ifdef CONFIG_PREEMPT + ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) ++ cmpl $0, kernel_preemption ++ jz restore_nocheck + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_nocheck + need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl +- jz restore_all ++ jz restore_nocheck + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? +- jz restore_all ++ jz restore_nocheck ++ DISABLE_INTERRUPTS(CLBR_ANY) ++ + call preempt_schedule_irq + jmp need_resched + END(resume_kernel) +@@ -341,8 +446,7 @@ sysenter_past_esp: + + GET_THREAD_INFO(%ebp) + +- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ +- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) ++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit + sysenter_do_call: + cmpl $(nr_syscalls), %eax +@@ -353,7 +457,7 @@ sysenter_do_call: + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx +- testw $_TIF_ALLWORK_MASK, %cx ++ testl $_TIF_ALLWORK_MASK, %ecx + jne sysexit_audit + sysenter_exit: + /* if something modifies registers it must also disable sysexit */ +@@ -362,11 +466,12 @@ sysenter_exit: + xorl %ebp,%ebp + TRACE_IRQS_ON + 1: mov PT_FS(%esp), %fs ++ PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + + #ifdef CONFIG_AUDITSYSCALL + sysenter_audit: +- testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) ++ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 +@@ -383,7 +488,7 @@ sysenter_audit: + jmp sysenter_do_call + + sysexit_audit: +- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx ++ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) +@@ -396,7 +501,7 @@ sysexit_audit: + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx +- testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx ++ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +@@ -410,6 +515,7 @@ sysexit_audit: + .align 4 + .long 1b,2b + .popsection ++ PTGS_TO_GS_EX + ENDPROC(ia32_sysenter_target) + + # system call handler stub +@@ -420,8 +526,7 @@ ENTRY(system_call) + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation +- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ +- testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) ++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +@@ -435,7 +540,7 @@ syscall_exit: + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx +- testw $_TIF_ALLWORK_MASK, %cx # current->work ++ testl $_TIF_ALLWORK_MASK, %ecx # current->work + jne syscall_exit_work + + restore_all: +@@ -452,8 +557,7 @@ restore_all: + restore_nocheck: + TRACE_IRQS_IRET + restore_nocheck_notrace: +- RESTORE_REGS +- addl $4, %esp # skip orig_eax/error_code ++ RESTORE_REGS 4 # skip orig_eax/error_code + CFI_ADJUST_CFA_OFFSET -4 + irq_return: + INTERRUPT_RETURN +@@ -513,20 +617,19 @@ ENDPROC(system_call) + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway + work_pending: +- testb $_TIF_NEED_RESCHED, %cl ++ testl 
$(_TIF_NEED_RESCHED), %ecx + jz work_notifysig + work_resched: +- call schedule ++ call __schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret +- TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all +- testb $_TIF_NEED_RESCHED, %cl ++ testl $(_TIF_NEED_RESCHED), %ecx + jnz work_resched + + work_notifysig: # deal with pending signals and +@@ -571,7 +674,7 @@ END(syscall_trace_entry) + # perform syscall exit tracing + ALIGN + syscall_exit_work: +- testb $_TIF_WORK_SYSCALL_EXIT, %cl ++ testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call +@@ -595,28 +698,50 @@ syscall_badsys: + END(syscall_badsys) + CFI_ENDPROC + +-#define FIXUP_ESPFIX_STACK \ +- /* since we are on a wrong stack, we cant make it a C code :( */ \ +- PER_CPU(gdt_page, %ebx); \ +- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ +- addl %esp, %eax; \ +- pushl $__KERNEL_DS; \ +- CFI_ADJUST_CFA_OFFSET 4; \ +- pushl %eax; \ +- CFI_ADJUST_CFA_OFFSET 4; \ +- lss (%esp), %esp; \ +- CFI_ADJUST_CFA_OFFSET -8; +-#define UNWIND_ESPFIX_STACK \ +- movl %ss, %eax; \ +- /* see if on espfix stack */ \ +- cmpw $__ESPFIX_SS, %ax; \ +- jne 27f; \ +- movl $__KERNEL_DS, %eax; \ +- movl %eax, %ds; \ +- movl %eax, %es; \ +- /* switch to normal stack */ \ +- FIXUP_ESPFIX_STACK; \ +-27:; ++/* ++ * System calls that need a pt_regs pointer. ++ */ ++#define PTREGSCALL(name) \ ++ ALIGN; \ ++ptregs_##name: \ ++ leal 4(%esp),%eax; \ ++ jmp sys_##name; ++ ++PTREGSCALL(iopl) ++PTREGSCALL(fork) ++PTREGSCALL(clone) ++PTREGSCALL(vfork) ++PTREGSCALL(execve) ++PTREGSCALL(sigaltstack) ++PTREGSCALL(sigreturn) ++PTREGSCALL(rt_sigreturn) ++PTREGSCALL(vm86) ++PTREGSCALL(vm86old) ++ ++.macro FIXUP_ESPFIX_STACK ++ /* since we are on a wrong stack, we cant make it a C code :( */ ++ PER_CPU(gdt_page, %ebx) ++ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) ++ addl %esp, %eax ++ pushl $__KERNEL_DS ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ lss (%esp), %esp ++ CFI_ADJUST_CFA_OFFSET -8 ++.endm ++.macro UNWIND_ESPFIX_STACK ++ movl %ss, %eax ++ /* see if on espfix stack */ ++ cmpw $__ESPFIX_SS, %ax ++ jne 27f ++ movl $__KERNEL_DS, %eax ++ movl %eax, %ds ++ movl %eax, %es ++ /* switch to normal stack */ ++ FIXUP_ESPFIX_STACK ++27: ++.endm + + /* + * Build the entry stubs and pointer table with some assembler magic. +@@ -672,7 +797,7 @@ common_interrupt: + ENDPROC(common_interrupt) + CFI_ENDPROC + +-#define BUILD_INTERRUPT(name, nr) \ ++#define BUILD_INTERRUPT3(name, nr, fn) \ + ENTRY(name) \ + RING0_INT_FRAME; \ + pushl $~(nr); \ +@@ -680,13 +805,15 @@ ENTRY(name) \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ +- call smp_##name; \ ++ call fn; \ + jmp ret_from_intr; \ + CFI_ENDPROC; \ + ENDPROC(name) + ++#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) ++ + /* The include is where all of the SMP etc. 
interrupts come from */ +-#include "entry_arch.h" ++#include + + ENTRY(coprocessor_error) + RING0_INT_FRAME +@@ -1068,7 +1195,10 @@ ENTRY(page_fault) + CFI_ADJUST_CFA_OFFSET 4 + ALIGN + error_code: +- /* the function address is in %fs's slot on the stack */ ++ /* the function address is in %gs's slot on the stack */ ++ pushl %fs ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET fs, 0*/ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ +@@ -1097,20 +1227,15 @@ error_code: + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld +- pushl %fs +- CFI_ADJUST_CFA_OFFSET 4 +- /*CFI_REL_OFFSET fs, 0*/ + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK +- popl %ecx +- CFI_ADJUST_CFA_OFFSET -4 +- /*CFI_REGISTER es, ecx*/ +- movl PT_FS(%esp), %edi # get the function address ++ GS_TO_REG %ecx ++ movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart +- mov %ecx, PT_FS(%esp) +- /*CFI_REL_OFFSET fs, ES*/ ++ REG_TO_PTGS %ecx ++ SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es +@@ -1134,26 +1259,27 @@ END(page_fault) + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +-#define FIX_STACK(offset, ok, label) \ +- cmpw $__KERNEL_CS,4(%esp); \ +- jne ok; \ +-label: \ +- movl TSS_sysenter_sp0+offset(%esp),%esp; \ +- CFI_DEF_CFA esp, 0; \ +- CFI_UNDEFINED eip; \ +- pushfl; \ +- CFI_ADJUST_CFA_OFFSET 4; \ +- pushl $__KERNEL_CS; \ +- CFI_ADJUST_CFA_OFFSET 4; \ +- pushl $sysenter_past_esp; \ +- CFI_ADJUST_CFA_OFFSET 4; \ ++.macro FIX_STACK offset ok label ++ cmpw $__KERNEL_CS, 4(%esp) ++ jne \ok ++\label: ++ movl TSS_sysenter_sp0 + \offset(%esp), %esp ++ CFI_DEF_CFA esp, 0 ++ CFI_UNDEFINED eip ++ pushfl ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $__KERNEL_CS ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $sysenter_past_esp ++ CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 ++.endm + + ENTRY(debug) + RING0_INT_FRAME + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct +- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) ++ FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn + debug_stack_correct: + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 +@@ -1211,7 +1337,7 @@ nmi_stack_correct: + + nmi_stack_fixup: + RING0_INT_FRAME +- FIX_STACK(12,nmi_stack_correct, 1) ++ FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + + nmi_debug_stack_check: +@@ -1222,7 +1348,7 @@ nmi_debug_stack_check: + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct +- FIX_STACK(24,nmi_stack_correct, 1) ++ FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + + nmi_espfix_stack: +@@ -1234,7 +1360,7 @@ nmi_espfix_stack: + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 +- addw $4, (%esp) ++ addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) +Index: linux-2.6-tip/arch/x86/kernel/entry_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/entry_64.S ++++ linux-2.6-tip/arch/x86/kernel/entry_64.S +@@ -48,10 +48,11 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include ++#include + + /* Avoid __ASSEMBLER__'ifying just for this. 
*/ + #include +@@ -76,20 +77,17 @@ ENTRY(ftrace_caller) + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + +-.globl ftrace_call +-ftrace_call: ++GLOBAL(ftrace_call) + call ftrace_stub + + MCOUNT_RESTORE_FRAME + + #ifdef CONFIG_FUNCTION_GRAPH_TRACER +-.globl ftrace_graph_call +-ftrace_graph_call: ++GLOBAL(ftrace_graph_call) + jmp ftrace_stub + #endif + +-.globl ftrace_stub +-ftrace_stub: ++GLOBAL(ftrace_stub) + retq + END(ftrace_caller) + +@@ -109,8 +107,7 @@ ENTRY(mcount) + jnz ftrace_graph_caller + #endif + +-.globl ftrace_stub +-ftrace_stub: ++GLOBAL(ftrace_stub) + retq + + trace: +@@ -147,9 +144,7 @@ ENTRY(ftrace_graph_caller) + retq + END(ftrace_graph_caller) + +- +-.globl return_to_handler +-return_to_handler: ++GLOBAL(return_to_handler) + subq $80, %rsp + + movq %rax, (%rsp) +@@ -187,6 +182,7 @@ return_to_handler: + ENTRY(native_usergs_sysret64) + swapgs + sysretq ++ENDPROC(native_usergs_sysret64) + #endif /* CONFIG_PARAVIRT */ + + +@@ -209,7 +205,7 @@ ENTRY(native_usergs_sysret64) + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 +- movq %gs:pda_oldrsp,\tmp ++ movq PER_CPU_VAR(old_rsp),\tmp + movq \tmp,RSP+\offset(%rsp) + movq $__USER_DS,SS+\offset(%rsp) + movq $__USER_CS,CS+\offset(%rsp) +@@ -220,7 +216,7 @@ ENTRY(native_usergs_sysret64) + + .macro RESTORE_TOP_OF_STACK tmp offset=0 + movq RSP+\offset(%rsp),\tmp +- movq \tmp,%gs:pda_oldrsp ++ movq \tmp,PER_CPU_VAR(old_rsp) + movq EFLAGS+\offset(%rsp),\tmp + movq \tmp,R11+\offset(%rsp) + .endm +@@ -336,15 +332,15 @@ ENTRY(save_args) + je 1f + SWAPGS + /* +- * irqcount is used to check if a CPU is already on an interrupt stack ++ * irq_count is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +-1: incl %gs:pda_irqcount ++1: incl PER_CPU_VAR(irq_count) + jne 2f + popq_cfi %rax /* move return address... */ +- mov %gs:pda_irqstackptr,%rsp ++ mov PER_CPU_VAR(irq_stack_ptr),%rsp + EMPTY_FRAME 0 + pushq_cfi %rbp /* backlink for unwinder */ + pushq_cfi %rax /* ... to the new stack */ +@@ -372,6 +368,7 @@ ENTRY(save_rest) + END(save_rest) + + /* save complete stack frame */ ++ .pushsection .kprobes.text, "ax" + ENTRY(save_paranoid) + XCPT_FRAME 1 RDI+8 + cld +@@ -400,6 +397,7 @@ ENTRY(save_paranoid) + 1: ret + CFI_ENDPROC + END(save_paranoid) ++ .popsection + + /* + * A newly forked process directly context switches into this address. +@@ -409,6 +407,8 @@ END(save_paranoid) + ENTRY(ret_from_fork) + DEFAULT_FRAME + ++ LOCK ; btr $TIF_FORK,TI_flags(%r8) ++ + push kernel_eflags(%rip) + CFI_ADJUST_CFA_OFFSET 8 + popf # reset kernel eflags +@@ -418,7 +418,6 @@ ENTRY(ret_from_fork) + + GET_THREAD_INFO(%rcx) + +- CFI_REMEMBER_STATE + RESTORE_REST + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 
+@@ -430,7 +429,6 @@ ENTRY(ret_from_fork) + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + jmp ret_from_sys_call # go to the SYSRET fastpath + +- CFI_RESTORE_STATE + CFI_ENDPROC + END(ret_from_fork) + +@@ -468,7 +466,7 @@ END(ret_from_fork) + ENTRY(system_call) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME +- CFI_DEF_CFA rsp,PDA_STACKOFFSET ++ CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET + CFI_REGISTER rip,rcx + /*CFI_REGISTER rflags,r11*/ + SWAPGS_UNSAFE_STACK +@@ -479,8 +477,8 @@ ENTRY(system_call) + */ + ENTRY(system_call_after_swapgs) + +- movq %rsp,%gs:pda_oldrsp +- movq %gs:pda_kernelstack,%rsp ++ movq %rsp,PER_CPU_VAR(old_rsp) ++ movq PER_CPU_VAR(kernel_stack),%rsp + /* + * No need to follow this irqs off/on section - it's straight + * and short: +@@ -523,7 +521,7 @@ sysret_check: + CFI_REGISTER rip,rcx + RESTORE_ARGS 0,-ARG_SKIP,1 + /*CFI_REGISTER rflags,r11*/ +- movq %gs:pda_oldrsp, %rsp ++ movq PER_CPU_VAR(old_rsp), %rsp + USERGS_SYSRET64 + + CFI_RESTORE_STATE +@@ -630,16 +628,14 @@ tracesys: + * Syscall return path ending with IRET. + * Has correct top of stack, but partial stack frame. + */ +- .globl int_ret_from_sys_call +- .globl int_with_check +-int_ret_from_sys_call: ++GLOBAL(int_ret_from_sys_call) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $3,CS-ARGOFFSET(%rsp) + je retint_restore_args + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +-int_with_check: ++GLOBAL(int_with_check) + LOCKDEP_SYS_EXIT_IRQ + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%edx +@@ -833,11 +829,11 @@ common_interrupt: + XCPT_FRAME + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + interrupt do_IRQ +- /* 0(%rsp): oldrsp-ARGOFFSET */ ++ /* 0(%rsp): old_rsp-ARGOFFSET */ + ret_from_intr: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF +- decl %gs:pda_irqcount ++ decl PER_CPU_VAR(irq_count) + leaveq + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 +@@ -982,10 +978,14 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt + #endif + ++#ifdef CONFIG_X86_UV + apicinterrupt UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt ++#endif + apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt ++apicinterrupt GENERIC_INTERRUPT_VECTOR \ ++ generic_interrupt smp_generic_interrupt + + #ifdef CONFIG_SMP + apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ +@@ -1025,6 +1025,11 @@ apicinterrupt ERROR_APIC_VECTOR \ + apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt + ++#ifdef CONFIG_PERF_COUNTERS ++apicinterrupt LOCAL_PERF_VECTOR \ ++ perf_counter_interrupt smp_perf_counter_interrupt ++#endif ++ + /* + * Exception entry points. 
+ */ +@@ -1073,10 +1078,10 @@ ENTRY(\sym) + TRACE_IRQS_OFF + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ +- movq %gs:pda_data_offset, %rbp +- subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) ++ PER_CPU(init_tss, %rbp) ++ subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + call \do_sym +- addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) ++ addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + jmp paranoid_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC + END(\sym) +@@ -1138,7 +1143,7 @@ ENTRY(native_load_gs_index) + CFI_STARTPROC + pushf + CFI_ADJUST_CFA_OFFSET 8 +- DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) ++ DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + SWAPGS + gs_change: + movl %edi,%gs +@@ -1260,14 +1265,14 @@ ENTRY(call_softirq) + CFI_REL_OFFSET rbp,0 + mov %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp +- incl %gs:pda_irqcount +- cmove %gs:pda_irqstackptr,%rsp ++ incl PER_CPU_VAR(irq_count) ++ cmove PER_CPU_VAR(irq_stack_ptr),%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 +- decl %gs:pda_irqcount ++ decl PER_CPU_VAR(irq_count) + ret + CFI_ENDPROC + END(call_softirq) +@@ -1297,15 +1302,15 @@ ENTRY(xen_do_hypervisor_callback) # do + movq %rdi, %rsp # we don't return, adjust the stack frame + CFI_ENDPROC + DEFAULT_FRAME +-11: incl %gs:pda_irqcount ++11: incl PER_CPU_VAR(irq_count) + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp +- cmovzq %gs:pda_irqstackptr,%rsp ++ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + CFI_DEF_CFA_REGISTER rsp +- decl %gs:pda_irqcount ++ decl PER_CPU_VAR(irq_count) + jmp error_exit + CFI_ENDPROC + END(do_hypervisor_callback) +Index: linux-2.6-tip/arch/x86/kernel/es7000_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/es7000_32.c ++++ /dev/null +@@ -1,378 +0,0 @@ +-/* +- * Written by: Garry Forsgren, Unisys Corporation +- * Natalie Protasevich, Unisys Corporation +- * This file contains the code to configure and interface +- * with Unisys ES7000 series hardware system manager. +- * +- * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. +- * +- * This program is free software; you can redistribute it and/or modify it +- * under the terms of version 2 of the GNU General Public License as +- * published by the Free Software Foundation. +- * +- * This program is distributed in the hope that it would be useful, but +- * WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +- * +- * You should have received a copy of the GNU General Public License along +- * with this program; if not, write the Free Software Foundation, Inc., 59 +- * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+- * +- * Contact information: Unisys Corporation, Township Line & Union Meeting +- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: +- * +- * http://www.unisys.com +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * ES7000 chipsets +- */ +- +-#define NON_UNISYS 0 +-#define ES7000_CLASSIC 1 +-#define ES7000_ZORRO 2 +- +- +-#define MIP_REG 1 +-#define MIP_PSAI_REG 4 +- +-#define MIP_BUSY 1 +-#define MIP_SPIN 0xf0000 +-#define MIP_VALID 0x0100000000000000ULL +-#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) +- +-#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) +- +-struct mip_reg_info { +- unsigned long long mip_info; +- unsigned long long delivery_info; +- unsigned long long host_reg; +- unsigned long long mip_reg; +-}; +- +-struct part_info { +- unsigned char type; +- unsigned char length; +- unsigned char part_id; +- unsigned char apic_mode; +- unsigned long snum; +- char ptype[16]; +- char sname[64]; +- char pname[64]; +-}; +- +-struct psai { +- unsigned long long entry_type; +- unsigned long long addr; +- unsigned long long bep_addr; +-}; +- +-struct es7000_mem_info { +- unsigned char type; +- unsigned char length; +- unsigned char resv[6]; +- unsigned long long start; +- unsigned long long size; +-}; +- +-struct es7000_oem_table { +- unsigned long long hdr; +- struct mip_reg_info mip; +- struct part_info pif; +- struct es7000_mem_info shm; +- struct psai psai; +-}; +- +-#ifdef CONFIG_ACPI +- +-struct oem_table { +- struct acpi_table_header Header; +- u32 OEMTableAddr; +- u32 OEMTableSize; +-}; +- +-extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +-extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); +-#endif +- +-struct mip_reg { +- unsigned long long off_0; +- unsigned long long off_8; +- unsigned long long off_10; +- unsigned long long off_18; +- unsigned long long off_20; +- unsigned long long off_28; +- unsigned long long off_30; +- unsigned long long off_38; +-}; +- +-#define MIP_SW_APIC 0x1020b +-#define MIP_FUNC(VALUE) (VALUE & 0xff) +- +-/* +- * ES7000 Globals +- */ +- +-static volatile unsigned long *psai = NULL; +-static struct mip_reg *mip_reg; +-static struct mip_reg *host_reg; +-static int mip_port; +-static unsigned long mip_addr, host_addr; +- +-int es7000_plat; +- +-/* +- * GSI override for ES7000 platforms. 
+- */ +- +-static unsigned int base; +- +-static int +-es7000_rename_gsi(int ioapic, int gsi) +-{ +- if (es7000_plat == ES7000_ZORRO) +- return gsi; +- +- if (!base) { +- int i; +- for (i = 0; i < nr_ioapics; i++) +- base += nr_ioapic_registers[i]; +- } +- +- if (!ioapic && (gsi < 16)) +- gsi += base; +- return gsi; +-} +- +-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +-{ +- unsigned long vect = 0, psaival = 0; +- +- if (psai == NULL) +- return -1; +- +- vect = ((unsigned long)__pa(eip)/0x1000) << 16; +- psaival = (0x1000000 | vect | cpu); +- +- while (*psai & 0x1000000) +- ; +- +- *psai = psaival; +- +- return 0; +-} +- +-static void noop_wait_for_deassert(atomic_t *deassert_not_used) +-{ +-} +- +-static int __init es7000_update_genapic(void) +-{ +- genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; +- +- /* MPENTIUMIII */ +- if (boot_cpu_data.x86 == 6 && +- (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { +- es7000_update_genapic_to_cluster(); +- genapic->wait_for_init_deassert = noop_wait_for_deassert; +- genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; +- } +- +- return 0; +-} +- +-void __init +-setup_unisys(void) +-{ +- /* +- * Determine the generation of the ES7000 currently running. +- * +- * es7000_plat = 1 if the machine is a 5xx ES7000 box +- * es7000_plat = 2 if the machine is a x86_64 ES7000 box +- * +- */ +- if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) +- es7000_plat = ES7000_ZORRO; +- else +- es7000_plat = ES7000_CLASSIC; +- ioapic_renumber_irq = es7000_rename_gsi; +- +- x86_quirks->update_genapic = es7000_update_genapic; +-} +- +-/* +- * Parse the OEM Table +- */ +- +-int __init +-parse_unisys_oem (char *oemptr) +-{ +- int i; +- int success = 0; +- unsigned char type, size; +- unsigned long val; +- char *tp = NULL; +- struct psai *psaip = NULL; +- struct mip_reg_info *mi; +- struct mip_reg *host, *mip; +- +- tp = oemptr; +- +- tp += 8; +- +- for (i=0; i <= 6; i++) { +- type = *tp++; +- size = *tp++; +- tp -= 2; +- switch (type) { +- case MIP_REG: +- mi = (struct mip_reg_info *)tp; +- val = MIP_RD_LO(mi->host_reg); +- host_addr = val; +- host = (struct mip_reg *)val; +- host_reg = __va(host); +- val = MIP_RD_LO(mi->mip_reg); +- mip_port = MIP_PORT(mi->mip_info); +- mip_addr = val; +- mip = (struct mip_reg *)val; +- mip_reg = __va(mip); +- pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", +- (unsigned long)host_reg); +- pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", +- (unsigned long)mip_reg); +- success++; +- break; +- case MIP_PSAI_REG: +- psaip = (struct psai *)tp; +- if (tp != NULL) { +- if (psaip->addr) +- psai = __va(psaip->addr); +- else +- psai = NULL; +- success++; +- } +- break; +- default: +- break; +- } +- tp += size; +- } +- +- if (success < 2) { +- es7000_plat = NON_UNISYS; +- } else +- setup_unisys(); +- return es7000_plat; +-} +- +-#ifdef CONFIG_ACPI +-static unsigned long oem_addrX; +-static unsigned long oem_size; +-int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) +-{ +- struct acpi_table_header *header = NULL; +- int i = 0; +- +- while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { +- if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { +- struct oem_table *t = (struct oem_table *)header; +- +- oem_addrX = t->OEMTableAddr; +- oem_size = t->OEMTableSize; +- +- *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, +- oem_size); +- return 0; +- } +- } +- return -1; +-} +- +-void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) +-{ +-} +-#endif +- +-static void 
+-es7000_spin(int n) +-{ +- int i = 0; +- +- while (i++ < n) +- rep_nop(); +-} +- +-static int __init +-es7000_mip_write(struct mip_reg *mip_reg) +-{ +- int status = 0; +- int spin; +- +- spin = MIP_SPIN; +- while (((unsigned long long)host_reg->off_38 & +- (unsigned long long)MIP_VALID) != 0) { +- if (--spin <= 0) { +- printk("es7000_mip_write: Timeout waiting for Host Valid Flag"); +- return -1; +- } +- es7000_spin(MIP_SPIN); +- } +- +- memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); +- outb(1, mip_port); +- +- spin = MIP_SPIN; +- +- while (((unsigned long long)mip_reg->off_38 & +- (unsigned long long)MIP_VALID) == 0) { +- if (--spin <= 0) { +- printk("es7000_mip_write: Timeout waiting for MIP Valid Flag"); +- return -1; +- } +- es7000_spin(MIP_SPIN); +- } +- +- status = ((unsigned long long)mip_reg->off_0 & +- (unsigned long long)0xffff0000000000ULL) >> 48; +- mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 & +- (unsigned long long)~MIP_VALID); +- return status; +-} +- +-void __init +-es7000_sw_apic(void) +-{ +- if (es7000_plat) { +- int mip_status; +- struct mip_reg es7000_mip_reg; +- +- printk("ES7000: Enabling APIC mode.\n"); +- memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); +- es7000_mip_reg.off_0 = MIP_SW_APIC; +- es7000_mip_reg.off_38 = (MIP_VALID); +- while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) +- printk("es7000_sw_apic: command failed, status = %x\n", +- mip_status); +- return; +- } +-} +Index: linux-2.6-tip/arch/x86/kernel/ftrace.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/ftrace.c ++++ linux-2.6-tip/arch/x86/kernel/ftrace.c +@@ -18,6 +18,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -26,6 +27,18 @@ + + #ifdef CONFIG_DYNAMIC_FTRACE + ++int ftrace_arch_code_modify_prepare(void) ++{ ++ set_kernel_text_rw(); ++ return 0; ++} ++ ++int ftrace_arch_code_modify_post_process(void) ++{ ++ set_kernel_text_ro(); ++ return 0; ++} ++ + union ftrace_code_union { + char code[MCOUNT_INSN_SIZE]; + struct { +@@ -66,11 +79,11 @@ static unsigned char *ftrace_call_replac + * + * 1) Put the instruction pointer into the IP buffer + * and the new code into the "code" buffer. +- * 2) Set a flag that says we are modifying code +- * 3) Wait for any running NMIs to finish. +- * 4) Write the code +- * 5) clear the flag. +- * 6) Wait for any running NMIs to finish. ++ * 2) Wait for any running NMIs to finish and set a flag that says ++ * we are modifying code, it is done in an atomic operation. ++ * 3) Write the code ++ * 4) clear the flag. ++ * 5) Wait for any running NMIs to finish. + * + * If an NMI is executed, the first thing it does is to call + * "ftrace_nmi_enter". This will check if the flag is set to write +@@ -82,9 +95,9 @@ static unsigned char *ftrace_call_replac + * are the same as what exists. 
+ */ + +-static atomic_t in_nmi = ATOMIC_INIT(0); ++#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ ++static atomic_t nmi_running = ATOMIC_INIT(0); + static int mod_code_status; /* holds return value of text write */ +-static int mod_code_write; /* set when NMI should do the write */ + static void *mod_code_ip; /* holds the IP to write to */ + static void *mod_code_newcode; /* holds the text to write to the IP */ + +@@ -101,6 +114,20 @@ int ftrace_arch_read_dyn_info(char *buf, + return r; + } + ++static void clear_mod_flag(void) ++{ ++ int old = atomic_read(&nmi_running); ++ ++ for (;;) { ++ int new = old & ~MOD_CODE_WRITE_FLAG; ++ ++ if (old == new) ++ break; ++ ++ old = atomic_cmpxchg(&nmi_running, old, new); ++ } ++} ++ + static void ftrace_mod_code(void) + { + /* +@@ -111,37 +138,52 @@ static void ftrace_mod_code(void) + */ + mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, + MCOUNT_INSN_SIZE); ++ ++ /* if we fail, then kill any new writers */ ++ if (mod_code_status) ++ clear_mod_flag(); + } + + void ftrace_nmi_enter(void) + { +- atomic_inc(&in_nmi); +- /* Must have in_nmi seen before reading write flag */ +- smp_mb(); +- if (mod_code_write) { ++ if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { ++ smp_rmb(); + ftrace_mod_code(); + atomic_inc(&nmi_update_count); + } ++ /* Must have previous changes seen before executions */ ++ smp_mb(); + } + + void ftrace_nmi_exit(void) + { +- /* Finish all executions before clearing in_nmi */ +- smp_wmb(); +- atomic_dec(&in_nmi); ++ /* Finish all executions before clearing nmi_running */ ++ smp_mb(); ++ atomic_dec(&nmi_running); ++} ++ ++static void wait_for_nmi_and_set_mod_flag(void) ++{ ++ if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) ++ return; ++ ++ do { ++ cpu_relax(); ++ } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); ++ ++ nmi_wait_count++; + } + + static void wait_for_nmi(void) + { +- int waited = 0; ++ if (!atomic_read(&nmi_running)) ++ return; + +- while (atomic_read(&in_nmi)) { +- waited = 1; ++ do { + cpu_relax(); +- } ++ } while (atomic_read(&nmi_running)); + +- if (waited) +- nmi_wait_count++; ++ nmi_wait_count++; + } + + static int +@@ -151,14 +193,9 @@ do_ftrace_mod_code(unsigned long ip, voi + mod_code_newcode = new_code; + + /* The buffers need to be visible before we let NMIs write them */ +- smp_wmb(); +- +- mod_code_write = 1; +- +- /* Make sure write bit is visible before we wait on NMIs */ + smp_mb(); + +- wait_for_nmi(); ++ wait_for_nmi_and_set_mod_flag(); + + /* Make sure all running NMIs have finished before we write the code */ + smp_mb(); +@@ -166,13 +203,9 @@ do_ftrace_mod_code(unsigned long ip, voi + ftrace_mod_code(); + + /* Make sure the write happens before clearing the bit */ +- smp_wmb(); +- +- mod_code_write = 0; +- +- /* make sure NMIs see the cleared bit */ + smp_mb(); + ++ clear_mod_flag(); + wait_for_nmi(); + + return mod_code_status; +@@ -368,100 +401,8 @@ int ftrace_disable_ftrace_graph_caller(v + return ftrace_mod_jmp(ip, old_offset, new_offset); + } + +-#else /* CONFIG_DYNAMIC_FTRACE */ +- +-/* +- * These functions are picked from those used on +- * this page for dynamic ftrace. They have been +- * simplified to ignore all traces in NMI context. 
+- */ +-static atomic_t in_nmi; +- +-void ftrace_nmi_enter(void) +-{ +- atomic_inc(&in_nmi); +-} +- +-void ftrace_nmi_exit(void) +-{ +- atomic_dec(&in_nmi); +-} +- + #endif /* !CONFIG_DYNAMIC_FTRACE */ + +-/* Add a function return address to the trace stack on thread info.*/ +-static int push_return_trace(unsigned long ret, unsigned long long time, +- unsigned long func, int *depth) +-{ +- int index; +- +- if (!current->ret_stack) +- return -EBUSY; +- +- /* The return trace stack is full */ +- if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { +- atomic_inc(¤t->trace_overrun); +- return -EBUSY; +- } +- +- index = ++current->curr_ret_stack; +- barrier(); +- current->ret_stack[index].ret = ret; +- current->ret_stack[index].func = func; +- current->ret_stack[index].calltime = time; +- *depth = index; +- +- return 0; +-} +- +-/* Retrieve a function return address to the trace stack on thread info.*/ +-static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +-{ +- int index; +- +- index = current->curr_ret_stack; +- +- if (unlikely(index < 0)) { +- ftrace_graph_stop(); +- WARN_ON(1); +- /* Might as well panic, otherwise we have no where to go */ +- *ret = (unsigned long)panic; +- return; +- } +- +- *ret = current->ret_stack[index].ret; +- trace->func = current->ret_stack[index].func; +- trace->calltime = current->ret_stack[index].calltime; +- trace->overrun = atomic_read(¤t->trace_overrun); +- trace->depth = index; +- barrier(); +- current->curr_ret_stack--; +- +-} +- +-/* +- * Send the trace to the ring-buffer. +- * @return the original return address. +- */ +-unsigned long ftrace_return_to_handler(void) +-{ +- struct ftrace_graph_ret trace; +- unsigned long ret; +- +- pop_return_trace(&trace, &ret); +- trace.rettime = cpu_clock(raw_smp_processor_id()); +- ftrace_graph_return(&trace); +- +- if (unlikely(!ret)) { +- ftrace_graph_stop(); +- WARN_ON(1); +- /* Might as well panic. What else to do? */ +- ret = (unsigned long)panic; +- } +- +- return ret; +-} +- + /* + * Hook the return address and push it in the stack of return addrs + * in current thread info. 
+@@ -469,14 +410,13 @@ unsigned long ftrace_return_to_handler(v + void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) + { + unsigned long old; +- unsigned long long calltime; + int faulted; + struct ftrace_graph_ent trace; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ +- if (unlikely(atomic_read(&in_nmi))) ++ if (unlikely(in_nmi())) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) +@@ -512,17 +452,7 @@ void prepare_ftrace_return(unsigned long + return; + } + +- if (unlikely(!__kernel_text_address(old))) { +- ftrace_graph_stop(); +- *parent = old; +- WARN_ON(1); +- return; +- } +- +- calltime = cpu_clock(raw_smp_processor_id()); +- +- if (push_return_trace(old, calltime, +- self_addr, &trace.depth) == -EBUSY) { ++ if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { + *parent = old; + return; + } +@@ -536,3 +466,66 @@ void prepare_ftrace_return(unsigned long + } + } + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ ++ ++#ifdef CONFIG_FTRACE_SYSCALLS ++ ++extern unsigned long __start_syscalls_metadata[]; ++extern unsigned long __stop_syscalls_metadata[]; ++extern unsigned long *sys_call_table; ++ ++static struct syscall_metadata **syscalls_metadata; ++ ++static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) ++{ ++ struct syscall_metadata *start; ++ struct syscall_metadata *stop; ++ char str[KSYM_SYMBOL_LEN]; ++ ++ ++ start = (struct syscall_metadata *)__start_syscalls_metadata; ++ stop = (struct syscall_metadata *)__stop_syscalls_metadata; ++ kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); ++ ++ for ( ; start < stop; start++) { ++ if (start->name && !strcmp(start->name, str)) ++ return start; ++ } ++ return NULL; ++} ++ ++struct syscall_metadata *syscall_nr_to_meta(int nr) ++{ ++ if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) ++ return NULL; ++ ++ return syscalls_metadata[nr]; ++} ++ ++void arch_init_ftrace_syscalls(void) ++{ ++ int i; ++ struct syscall_metadata *meta; ++ unsigned long **psys_syscall_table = &sys_call_table; ++ static atomic_t refs; ++ ++ if (atomic_inc_return(&refs) != 1) ++ goto end; ++ ++ syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * ++ FTRACE_SYSCALL_MAX, GFP_KERNEL); ++ if (!syscalls_metadata) { ++ WARN_ON(1); ++ return; ++ } ++ ++ for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { ++ meta = find_syscall_meta(psys_syscall_table[i]); ++ syscalls_metadata[i] = meta; ++ } ++ return; ++ ++ /* Paranoid: avoid overflow */ ++end: ++ atomic_dec(&refs); ++} ++#endif +Index: linux-2.6-tip/arch/x86/kernel/genapic_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/genapic_64.c ++++ /dev/null +@@ -1,82 +0,0 @@ +-/* +- * Copyright 2004 James Cleverdon, IBM. +- * Subject to the GNU Public License, v.2 +- * +- * Generic APIC sub-arch probe layer. +- * +- * Hacked for x86-64 by James Cleverdon from i386 architecture code by +- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and +- * James Cleverdon. 
+- */ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +- +-extern struct genapic apic_flat; +-extern struct genapic apic_physflat; +-extern struct genapic apic_x2xpic_uv_x; +-extern struct genapic apic_x2apic_phys; +-extern struct genapic apic_x2apic_cluster; +- +-struct genapic __read_mostly *genapic = &apic_flat; +- +-static struct genapic *apic_probe[] __initdata = { +- &apic_x2apic_uv_x, +- &apic_x2apic_phys, +- &apic_x2apic_cluster, +- &apic_physflat, +- NULL, +-}; +- +-/* +- * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. +- */ +-void __init setup_apic_routing(void) +-{ +- if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) { +- if (!intr_remapping_enabled) +- genapic = &apic_flat; +- } +- +- if (genapic == &apic_flat) { +- if (max_physical_apicid >= 8) +- genapic = &apic_physflat; +- printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); +- } +- +- if (x86_quirks->update_genapic) +- x86_quirks->update_genapic(); +-} +- +-/* Same for both flat and physical. */ +- +-void apic_send_IPI_self(int vector) +-{ +- __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +-} +- +-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- int i; +- +- for (i = 0; apic_probe[i]; ++i) { +- if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { +- genapic = apic_probe[i]; +- printk(KERN_INFO "Setting APIC routing to %s.\n", +- genapic->name); +- return 1; +- } +- } +- return 0; +-} +Index: linux-2.6-tip/arch/x86/kernel/genapic_flat_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/genapic_flat_64.c ++++ /dev/null +@@ -1,307 +0,0 @@ +-/* +- * Copyright 2004 James Cleverdon, IBM. +- * Subject to the GNU Public License, v.2 +- * +- * Flat APIC subarch code. +- * +- * Hacked for x86-64 by James Cleverdon from i386 architecture code by +- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and +- * James Cleverdon. +- */ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef CONFIG_ACPI +-#include +-#endif +- +-static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- return 1; +-} +- +-static const struct cpumask *flat_target_cpus(void) +-{ +- return cpu_online_mask; +-} +- +-static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) +-{ +- /* Careful. Some cpus do not strictly honor the set of cpus +- * specified in the interrupt destination when using lowest +- * priority interrupt delivery mode. +- * +- * In particular there was a hyperthreading cpu observed to +- * deliver interrupts to the wrong hyperthread when only one +- * hyperthread was specified in the interrupt desitination. +- */ +- cpumask_clear(retmask); +- cpumask_bits(retmask)[0] = APIC_ALL_CPUS; +-} +- +-/* +- * Set up the logical destination ID. +- * +- * Intel recommends to set DFR, LDR and TPR before enabling +- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel +- * document number 292116). So here it goes... 
+- */ +-static void flat_init_apic_ldr(void) +-{ +- unsigned long val; +- unsigned long num, id; +- +- num = smp_processor_id(); +- id = 1UL << num; +- apic_write(APIC_DFR, APIC_DFR_FLAT); +- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; +- val |= SET_APIC_LOGICAL_ID(id); +- apic_write(APIC_LDR, val); +-} +- +-static inline void _flat_send_IPI_mask(unsigned long mask, int vector) +-{ +- unsigned long flags; +- +- local_irq_save(flags); +- __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); +- local_irq_restore(flags); +-} +- +-static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) +-{ +- unsigned long mask = cpumask_bits(cpumask)[0]; +- +- _flat_send_IPI_mask(mask, vector); +-} +- +-static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, +- int vector) +-{ +- unsigned long mask = cpumask_bits(cpumask)[0]; +- int cpu = smp_processor_id(); +- +- if (cpu < BITS_PER_LONG) +- clear_bit(cpu, &mask); +- _flat_send_IPI_mask(mask, vector); +-} +- +-static void flat_send_IPI_allbutself(int vector) +-{ +- int cpu = smp_processor_id(); +-#ifdef CONFIG_HOTPLUG_CPU +- int hotplug = 1; +-#else +- int hotplug = 0; +-#endif +- if (hotplug || vector == NMI_VECTOR) { +- if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { +- unsigned long mask = cpumask_bits(cpu_online_mask)[0]; +- +- if (cpu < BITS_PER_LONG) +- clear_bit(cpu, &mask); +- +- _flat_send_IPI_mask(mask, vector); +- } +- } else if (num_online_cpus() > 1) { +- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); +- } +-} +- +-static void flat_send_IPI_all(int vector) +-{ +- if (vector == NMI_VECTOR) +- flat_send_IPI_mask(cpu_online_mask, vector); +- else +- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +-} +- +-static unsigned int get_apic_id(unsigned long x) +-{ +- unsigned int id; +- +- id = (((x)>>24) & 0xFFu); +- return id; +-} +- +-static unsigned long set_apic_id(unsigned int id) +-{ +- unsigned long x; +- +- x = ((id & 0xFFu)<<24); +- return x; +-} +- +-static unsigned int read_xapic_id(void) +-{ +- unsigned int id; +- +- id = get_apic_id(apic_read(APIC_ID)); +- return id; +-} +- +-static int flat_apic_id_registered(void) +-{ +- return physid_isset(read_xapic_id(), phys_cpu_present_map); +-} +- +-static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask) +-{ +- return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; +-} +- +-static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +- const struct cpumask *andmask) +-{ +- unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; +- unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS; +- +- return mask1 & mask2; +-} +- +-static unsigned int phys_pkg_id(int index_msb) +-{ +- return hard_smp_processor_id() >> index_msb; +-} +- +-struct genapic apic_flat = { +- .name = "flat", +- .acpi_madt_oem_check = flat_acpi_madt_oem_check, +- .int_delivery_mode = dest_LowestPrio, +- .int_dest_mode = (APIC_DEST_LOGICAL != 0), +- .target_cpus = flat_target_cpus, +- .vector_allocation_domain = flat_vector_allocation_domain, +- .apic_id_registered = flat_apic_id_registered, +- .init_apic_ldr = flat_init_apic_ldr, +- .send_IPI_all = flat_send_IPI_all, +- .send_IPI_allbutself = flat_send_IPI_allbutself, +- .send_IPI_mask = flat_send_IPI_mask, +- .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, +- .send_IPI_self = apic_send_IPI_self, +- .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, +- .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, +- .phys_pkg_id = phys_pkg_id, +- .get_apic_id 
= get_apic_id, +- .set_apic_id = set_apic_id, +- .apic_id_mask = (0xFFu<<24), +-}; +- +-/* +- * Physflat mode is used when there are more than 8 CPUs on a AMD system. +- * We cannot use logical delivery in this case because the mask +- * overflows, so use physical mode. +- */ +-static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +-#ifdef CONFIG_ACPI +- /* +- * Quirk: some x86_64 machines can only use physical APIC mode +- * regardless of how many processors are present (x86_64 ES7000 +- * is an example). +- */ +- if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && +- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { +- printk(KERN_DEBUG "system APIC only can use physical flat"); +- return 1; +- } +-#endif +- +- return 0; +-} +- +-static const struct cpumask *physflat_target_cpus(void) +-{ +- return cpu_online_mask; +-} +- +-static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) +-{ +- cpumask_clear(retmask); +- cpumask_set_cpu(cpu, retmask); +-} +- +-static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) +-{ +- send_IPI_mask_sequence(cpumask, vector); +-} +- +-static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, +- int vector) +-{ +- send_IPI_mask_allbutself(cpumask, vector); +-} +- +-static void physflat_send_IPI_allbutself(int vector) +-{ +- send_IPI_mask_allbutself(cpu_online_mask, vector); +-} +- +-static void physflat_send_IPI_all(int vector) +-{ +- physflat_send_IPI_mask(cpu_online_mask, vector); +-} +- +-static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one phys APIC ID. +- * May as well be the first. +- */ +- cpu = cpumask_first(cpumask); +- if ((unsigned)cpu < nr_cpu_ids) +- return per_cpu(x86_cpu_to_apicid, cpu); +- else +- return BAD_APICID; +-} +- +-static unsigned int +-physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +- const struct cpumask *andmask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one phys APIC ID. +- * May as well be the first. 
+- */ +- for_each_cpu_and(cpu, cpumask, andmask) +- if (cpumask_test_cpu(cpu, cpu_online_mask)) +- break; +- if (cpu < nr_cpu_ids) +- return per_cpu(x86_cpu_to_apicid, cpu); +- return BAD_APICID; +-} +- +-struct genapic apic_physflat = { +- .name = "physical flat", +- .acpi_madt_oem_check = physflat_acpi_madt_oem_check, +- .int_delivery_mode = dest_Fixed, +- .int_dest_mode = (APIC_DEST_PHYSICAL != 0), +- .target_cpus = physflat_target_cpus, +- .vector_allocation_domain = physflat_vector_allocation_domain, +- .apic_id_registered = flat_apic_id_registered, +- .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ +- .send_IPI_all = physflat_send_IPI_all, +- .send_IPI_allbutself = physflat_send_IPI_allbutself, +- .send_IPI_mask = physflat_send_IPI_mask, +- .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, +- .send_IPI_self = apic_send_IPI_self, +- .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, +- .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, +- .phys_pkg_id = phys_pkg_id, +- .get_apic_id = get_apic_id, +- .set_apic_id = set_apic_id, +- .apic_id_mask = (0xFFu<<24), +-}; +Index: linux-2.6-tip/arch/x86/kernel/genx2apic_cluster.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/genx2apic_cluster.c ++++ /dev/null +@@ -1,198 +0,0 @@ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +- +-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +- +-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- if (cpu_has_x2apic) +- return 1; +- +- return 0; +-} +- +-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ +- +-static const struct cpumask *x2apic_target_cpus(void) +-{ +- return cpumask_of(0); +-} +- +-/* +- * for now each logical cpu is in its own vector allocation domain. +- */ +-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) +-{ +- cpumask_clear(retmask); +- cpumask_set_cpu(cpu, retmask); +-} +- +-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, +- unsigned int dest) +-{ +- unsigned long cfg; +- +- cfg = __prepare_ICR(0, vector, dest); +- +- /* +- * send the IPI. +- */ +- x2apic_icr_write(cfg, apicid); +-} +- +-/* +- * for now, we send the IPI's one by one in the cpumask. +- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group +- * at once. We have 16 cpu's in a cluster. This will minimize IPI register +- * writes. 
+- */ +-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +-{ +- unsigned long flags; +- unsigned long query_cpu; +- +- local_irq_save(flags); +- for_each_cpu(query_cpu, mask) +- __x2apic_send_IPI_dest( +- per_cpu(x86_cpu_to_logical_apicid, query_cpu), +- vector, APIC_DEST_LOGICAL); +- local_irq_restore(flags); +-} +- +-static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, +- int vector) +-{ +- unsigned long flags; +- unsigned long query_cpu; +- unsigned long this_cpu = smp_processor_id(); +- +- local_irq_save(flags); +- for_each_cpu(query_cpu, mask) +- if (query_cpu != this_cpu) +- __x2apic_send_IPI_dest( +- per_cpu(x86_cpu_to_logical_apicid, query_cpu), +- vector, APIC_DEST_LOGICAL); +- local_irq_restore(flags); +-} +- +-static void x2apic_send_IPI_allbutself(int vector) +-{ +- unsigned long flags; +- unsigned long query_cpu; +- unsigned long this_cpu = smp_processor_id(); +- +- local_irq_save(flags); +- for_each_online_cpu(query_cpu) +- if (query_cpu != this_cpu) +- __x2apic_send_IPI_dest( +- per_cpu(x86_cpu_to_logical_apicid, query_cpu), +- vector, APIC_DEST_LOGICAL); +- local_irq_restore(flags); +-} +- +-static void x2apic_send_IPI_all(int vector) +-{ +- x2apic_send_IPI_mask(cpu_online_mask, vector); +-} +- +-static int x2apic_apic_id_registered(void) +-{ +- return 1; +-} +- +-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one logical APIC ID. +- * May as well be the first. +- */ +- cpu = cpumask_first(cpumask); +- if ((unsigned)cpu < nr_cpu_ids) +- return per_cpu(x86_cpu_to_logical_apicid, cpu); +- else +- return BAD_APICID; +-} +- +-static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +- const struct cpumask *andmask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one logical APIC ID. +- * May as well be the first. 
+- */ +- for_each_cpu_and(cpu, cpumask, andmask) +- if (cpumask_test_cpu(cpu, cpu_online_mask)) +- break; +- if (cpu < nr_cpu_ids) +- return per_cpu(x86_cpu_to_logical_apicid, cpu); +- return BAD_APICID; +-} +- +-static unsigned int get_apic_id(unsigned long x) +-{ +- unsigned int id; +- +- id = x; +- return id; +-} +- +-static unsigned long set_apic_id(unsigned int id) +-{ +- unsigned long x; +- +- x = id; +- return x; +-} +- +-static unsigned int phys_pkg_id(int index_msb) +-{ +- return current_cpu_data.initial_apicid >> index_msb; +-} +- +-static void x2apic_send_IPI_self(int vector) +-{ +- apic_write(APIC_SELF_IPI, vector); +-} +- +-static void init_x2apic_ldr(void) +-{ +- int cpu = smp_processor_id(); +- +- per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); +- return; +-} +- +-struct genapic apic_x2apic_cluster = { +- .name = "cluster x2apic", +- .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, +- .int_delivery_mode = dest_LowestPrio, +- .int_dest_mode = (APIC_DEST_LOGICAL != 0), +- .target_cpus = x2apic_target_cpus, +- .vector_allocation_domain = x2apic_vector_allocation_domain, +- .apic_id_registered = x2apic_apic_id_registered, +- .init_apic_ldr = init_x2apic_ldr, +- .send_IPI_all = x2apic_send_IPI_all, +- .send_IPI_allbutself = x2apic_send_IPI_allbutself, +- .send_IPI_mask = x2apic_send_IPI_mask, +- .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, +- .send_IPI_self = x2apic_send_IPI_self, +- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, +- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, +- .phys_pkg_id = phys_pkg_id, +- .get_apic_id = get_apic_id, +- .set_apic_id = set_apic_id, +- .apic_id_mask = (0xFFFFFFFFu), +-}; +Index: linux-2.6-tip/arch/x86/kernel/genx2apic_phys.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/genx2apic_phys.c ++++ /dev/null +@@ -1,194 +0,0 @@ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +- +-static int x2apic_phys; +- +-static int set_x2apic_phys_mode(char *arg) +-{ +- x2apic_phys = 1; +- return 0; +-} +-early_param("x2apic_phys", set_x2apic_phys_mode); +- +-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- if (cpu_has_x2apic && x2apic_phys) +- return 1; +- +- return 0; +-} +- +-/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ +- +-static const struct cpumask *x2apic_target_cpus(void) +-{ +- return cpumask_of(0); +-} +- +-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) +-{ +- cpumask_clear(retmask); +- cpumask_set_cpu(cpu, retmask); +-} +- +-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, +- unsigned int dest) +-{ +- unsigned long cfg; +- +- cfg = __prepare_ICR(0, vector, dest); +- +- /* +- * send the IPI. 
+- */ +- x2apic_icr_write(cfg, apicid); +-} +- +-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +-{ +- unsigned long flags; +- unsigned long query_cpu; +- +- local_irq_save(flags); +- for_each_cpu(query_cpu, mask) { +- __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), +- vector, APIC_DEST_PHYSICAL); +- } +- local_irq_restore(flags); +-} +- +-static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, +- int vector) +-{ +- unsigned long flags; +- unsigned long query_cpu; +- unsigned long this_cpu = smp_processor_id(); +- +- local_irq_save(flags); +- for_each_cpu(query_cpu, mask) { +- if (query_cpu != this_cpu) +- __x2apic_send_IPI_dest( +- per_cpu(x86_cpu_to_apicid, query_cpu), +- vector, APIC_DEST_PHYSICAL); +- } +- local_irq_restore(flags); +-} +- +-static void x2apic_send_IPI_allbutself(int vector) +-{ +- unsigned long flags; +- unsigned long query_cpu; +- unsigned long this_cpu = smp_processor_id(); +- +- local_irq_save(flags); +- for_each_online_cpu(query_cpu) +- if (query_cpu != this_cpu) +- __x2apic_send_IPI_dest( +- per_cpu(x86_cpu_to_apicid, query_cpu), +- vector, APIC_DEST_PHYSICAL); +- local_irq_restore(flags); +-} +- +-static void x2apic_send_IPI_all(int vector) +-{ +- x2apic_send_IPI_mask(cpu_online_mask, vector); +-} +- +-static int x2apic_apic_id_registered(void) +-{ +- return 1; +-} +- +-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one phys APIC ID. +- * May as well be the first. +- */ +- cpu = cpumask_first(cpumask); +- if ((unsigned)cpu < nr_cpu_ids) +- return per_cpu(x86_cpu_to_apicid, cpu); +- else +- return BAD_APICID; +-} +- +-static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +- const struct cpumask *andmask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one phys APIC ID. +- * May as well be the first. 
+- */ +- for_each_cpu_and(cpu, cpumask, andmask) +- if (cpumask_test_cpu(cpu, cpu_online_mask)) +- break; +- if (cpu < nr_cpu_ids) +- return per_cpu(x86_cpu_to_apicid, cpu); +- return BAD_APICID; +-} +- +-static unsigned int get_apic_id(unsigned long x) +-{ +- unsigned int id; +- +- id = x; +- return id; +-} +- +-static unsigned long set_apic_id(unsigned int id) +-{ +- unsigned long x; +- +- x = id; +- return x; +-} +- +-static unsigned int phys_pkg_id(int index_msb) +-{ +- return current_cpu_data.initial_apicid >> index_msb; +-} +- +-static void x2apic_send_IPI_self(int vector) +-{ +- apic_write(APIC_SELF_IPI, vector); +-} +- +-static void init_x2apic_ldr(void) +-{ +- return; +-} +- +-struct genapic apic_x2apic_phys = { +- .name = "physical x2apic", +- .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, +- .int_delivery_mode = dest_Fixed, +- .int_dest_mode = (APIC_DEST_PHYSICAL != 0), +- .target_cpus = x2apic_target_cpus, +- .vector_allocation_domain = x2apic_vector_allocation_domain, +- .apic_id_registered = x2apic_apic_id_registered, +- .init_apic_ldr = init_x2apic_ldr, +- .send_IPI_all = x2apic_send_IPI_all, +- .send_IPI_allbutself = x2apic_send_IPI_allbutself, +- .send_IPI_mask = x2apic_send_IPI_mask, +- .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, +- .send_IPI_self = x2apic_send_IPI_self, +- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, +- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, +- .phys_pkg_id = phys_pkg_id, +- .get_apic_id = get_apic_id, +- .set_apic_id = set_apic_id, +- .apic_id_mask = (0xFFFFFFFFu), +-}; +Index: linux-2.6-tip/arch/x86/kernel/genx2apic_uv_x.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/genx2apic_uv_x.c ++++ /dev/null +@@ -1,600 +0,0 @@ +-/* +- * This file is subject to the terms and conditions of the GNU General Public +- * License. See the file "COPYING" in the main directory of this archive +- * for more details. +- * +- * SGI UV APIC functions (note: not an Intel compatible APIC) +- * +- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-DEFINE_PER_CPU(int, x2apic_extra_bits); +- +-static enum uv_system_type uv_system_type; +- +-static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- if (!strcmp(oem_id, "SGI")) { +- if (!strcmp(oem_table_id, "UVL")) +- uv_system_type = UV_LEGACY_APIC; +- else if (!strcmp(oem_table_id, "UVX")) +- uv_system_type = UV_X2APIC; +- else if (!strcmp(oem_table_id, "UVH")) { +- uv_system_type = UV_NON_UNIQUE_APIC; +- return 1; +- } +- } +- return 0; +-} +- +-enum uv_system_type get_uv_system_type(void) +-{ +- return uv_system_type; +-} +- +-int is_uv_system(void) +-{ +- return uv_system_type != UV_NONE; +-} +-EXPORT_SYMBOL_GPL(is_uv_system); +- +-DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); +-EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); +- +-struct uv_blade_info *uv_blade_info; +-EXPORT_SYMBOL_GPL(uv_blade_info); +- +-short *uv_node_to_blade; +-EXPORT_SYMBOL_GPL(uv_node_to_blade); +- +-short *uv_cpu_to_blade; +-EXPORT_SYMBOL_GPL(uv_cpu_to_blade); +- +-short uv_possible_blades; +-EXPORT_SYMBOL_GPL(uv_possible_blades); +- +-unsigned long sn_rtc_cycles_per_second; +-EXPORT_SYMBOL(sn_rtc_cycles_per_second); +- +-/* Start with all IRQs pointing to boot CPU. 
IRQ balancing will shift them. */ +- +-static const struct cpumask *uv_target_cpus(void) +-{ +- return cpumask_of(0); +-} +- +-static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) +-{ +- cpumask_clear(retmask); +- cpumask_set_cpu(cpu, retmask); +-} +- +-int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) +-{ +- unsigned long val; +- int pnode; +- +- pnode = uv_apicid_to_pnode(phys_apicid); +- val = (1UL << UVH_IPI_INT_SEND_SHFT) | +- (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | +- (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | +- APIC_DM_INIT; +- uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +- mdelay(10); +- +- val = (1UL << UVH_IPI_INT_SEND_SHFT) | +- (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | +- (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | +- APIC_DM_STARTUP; +- uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +- return 0; +-} +- +-static void uv_send_IPI_one(int cpu, int vector) +-{ +- unsigned long val, apicid, lapicid; +- int pnode; +- +- apicid = per_cpu(x86_cpu_to_apicid, cpu); +- lapicid = apicid & 0x3f; /* ZZZ macro needed */ +- pnode = uv_apicid_to_pnode(apicid); +- val = +- (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << +- UVH_IPI_INT_APIC_ID_SHFT) | +- (vector << UVH_IPI_INT_VECTOR_SHFT); +- uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +-} +- +-static void uv_send_IPI_mask(const struct cpumask *mask, int vector) +-{ +- unsigned int cpu; +- +- for_each_cpu(cpu, mask) +- uv_send_IPI_one(cpu, vector); +-} +- +-static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +-{ +- unsigned int cpu; +- unsigned int this_cpu = smp_processor_id(); +- +- for_each_cpu(cpu, mask) +- if (cpu != this_cpu) +- uv_send_IPI_one(cpu, vector); +-} +- +-static void uv_send_IPI_allbutself(int vector) +-{ +- unsigned int cpu; +- unsigned int this_cpu = smp_processor_id(); +- +- for_each_online_cpu(cpu) +- if (cpu != this_cpu) +- uv_send_IPI_one(cpu, vector); +-} +- +-static void uv_send_IPI_all(int vector) +-{ +- uv_send_IPI_mask(cpu_online_mask, vector); +-} +- +-static int uv_apic_id_registered(void) +-{ +- return 1; +-} +- +-static void uv_init_apic_ldr(void) +-{ +-} +- +-static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one phys APIC ID. +- * May as well be the first. +- */ +- cpu = cpumask_first(cpumask); +- if ((unsigned)cpu < nr_cpu_ids) +- return per_cpu(x86_cpu_to_apicid, cpu); +- else +- return BAD_APICID; +-} +- +-static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +- const struct cpumask *andmask) +-{ +- int cpu; +- +- /* +- * We're using fixed IRQ delivery, can only return one phys APIC ID. +- * May as well be the first. +- */ +- for_each_cpu_and(cpu, cpumask, andmask) +- if (cpumask_test_cpu(cpu, cpu_online_mask)) +- break; +- if (cpu < nr_cpu_ids) +- return per_cpu(x86_cpu_to_apicid, cpu); +- return BAD_APICID; +-} +- +-static unsigned int get_apic_id(unsigned long x) +-{ +- unsigned int id; +- +- WARN_ON(preemptible() && num_online_cpus() > 1); +- id = x | __get_cpu_var(x2apic_extra_bits); +- +- return id; +-} +- +-static unsigned long set_apic_id(unsigned int id) +-{ +- unsigned long x; +- +- /* maskout x2apic_extra_bits ? 
*/ +- x = id; +- return x; +-} +- +-static unsigned int uv_read_apic_id(void) +-{ +- +- return get_apic_id(apic_read(APIC_ID)); +-} +- +-static unsigned int phys_pkg_id(int index_msb) +-{ +- return uv_read_apic_id() >> index_msb; +-} +- +-static void uv_send_IPI_self(int vector) +-{ +- apic_write(APIC_SELF_IPI, vector); +-} +- +-struct genapic apic_x2apic_uv_x = { +- .name = "UV large system", +- .acpi_madt_oem_check = uv_acpi_madt_oem_check, +- .int_delivery_mode = dest_Fixed, +- .int_dest_mode = (APIC_DEST_PHYSICAL != 0), +- .target_cpus = uv_target_cpus, +- .vector_allocation_domain = uv_vector_allocation_domain, +- .apic_id_registered = uv_apic_id_registered, +- .init_apic_ldr = uv_init_apic_ldr, +- .send_IPI_all = uv_send_IPI_all, +- .send_IPI_allbutself = uv_send_IPI_allbutself, +- .send_IPI_mask = uv_send_IPI_mask, +- .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, +- .send_IPI_self = uv_send_IPI_self, +- .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, +- .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, +- .phys_pkg_id = phys_pkg_id, +- .get_apic_id = get_apic_id, +- .set_apic_id = set_apic_id, +- .apic_id_mask = (0xFFFFFFFFu), +-}; +- +-static __cpuinit void set_x2apic_extra_bits(int pnode) +-{ +- __get_cpu_var(x2apic_extra_bits) = (pnode << 6); +-} +- +-/* +- * Called on boot cpu. +- */ +-static __init int boot_pnode_to_blade(int pnode) +-{ +- int blade; +- +- for (blade = 0; blade < uv_num_possible_blades(); blade++) +- if (pnode == uv_blade_info[blade].pnode) +- return blade; +- BUG(); +-} +- +-struct redir_addr { +- unsigned long redirect; +- unsigned long alias; +-}; +- +-#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT +- +-static __initdata struct redir_addr redir_addrs[] = { +- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, +- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, +- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, +-}; +- +-static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) +-{ +- union uvh_si_alias0_overlay_config_u alias; +- union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; +- int i; +- +- for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { +- alias.v = uv_read_local_mmr(redir_addrs[i].alias); +- if (alias.s.base == 0) { +- *size = (1UL << alias.s.m_alias); +- redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); +- *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; +- return; +- } +- } +- BUG(); +-} +- +-static __init void map_low_mmrs(void) +-{ +- init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); +- init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +-} +- +-enum map_type {map_wb, map_uc}; +- +-static __init void map_high(char *id, unsigned long base, int shift, +- int max_pnode, enum map_type map_type) +-{ +- unsigned long bytes, paddr; +- +- paddr = base << shift; +- bytes = (1UL << shift) * (max_pnode + 1); +- printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, +- paddr + bytes); +- if (map_type == map_uc) +- init_extra_mapping_uc(paddr, bytes); +- else +- init_extra_mapping_wb(paddr, bytes); +- +-} +-static __init void map_gru_high(int max_pnode) +-{ +- union uvh_rh_gam_gru_overlay_config_mmr_u gru; +- int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; +- +- gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); +- if (gru.s.enable) +- map_high("GRU", gru.s.base, shift, max_pnode, map_wb); +-} +- +-static __init void map_config_high(int 
max_pnode) +-{ +- union uvh_rh_gam_cfg_overlay_config_mmr_u cfg; +- int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT; +- +- cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); +- if (cfg.s.enable) +- map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); +-} +- +-static __init void map_mmr_high(int max_pnode) +-{ +- union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; +- int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; +- +- mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); +- if (mmr.s.enable) +- map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); +-} +- +-static __init void map_mmioh_high(int max_pnode) +-{ +- union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; +- int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; +- +- mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); +- if (mmioh.s.enable) +- map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); +-} +- +-static __init void uv_rtc_init(void) +-{ +- long status; +- u64 ticks_per_sec; +- +- status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, +- &ticks_per_sec); +- if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { +- printk(KERN_WARNING +- "unable to determine platform RTC clock frequency, " +- "guessing.\n"); +- /* BIOS gives wrong value for clock freq. so guess */ +- sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; +- } else +- sn_rtc_cycles_per_second = ticks_per_sec; +-} +- +-/* +- * percpu heartbeat timer +- */ +-static void uv_heartbeat(unsigned long ignored) +-{ +- struct timer_list *timer = &uv_hub_info->scir.timer; +- unsigned char bits = uv_hub_info->scir.state; +- +- /* flip heartbeat bit */ +- bits ^= SCIR_CPU_HEARTBEAT; +- +- /* is this cpu idle? */ +- if (idle_cpu(raw_smp_processor_id())) +- bits &= ~SCIR_CPU_ACTIVITY; +- else +- bits |= SCIR_CPU_ACTIVITY; +- +- /* update system controller interface reg */ +- uv_set_scir_bits(bits); +- +- /* enable next timer period */ +- mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); +-} +- +-static void __cpuinit uv_heartbeat_enable(int cpu) +-{ +- if (!uv_cpu_hub_info(cpu)->scir.enabled) { +- struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; +- +- uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); +- setup_timer(timer, uv_heartbeat, cpu); +- timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; +- add_timer_on(timer, cpu); +- uv_cpu_hub_info(cpu)->scir.enabled = 1; +- } +- +- /* check boot cpu */ +- if (!uv_cpu_hub_info(0)->scir.enabled) +- uv_heartbeat_enable(0); +-} +- +-#ifdef CONFIG_HOTPLUG_CPU +-static void __cpuinit uv_heartbeat_disable(int cpu) +-{ +- if (uv_cpu_hub_info(cpu)->scir.enabled) { +- uv_cpu_hub_info(cpu)->scir.enabled = 0; +- del_timer(&uv_cpu_hub_info(cpu)->scir.timer); +- } +- uv_set_cpu_scir_bits(cpu, 0xff); +-} +- +-/* +- * cpu hotplug notifier +- */ +-static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, +- unsigned long action, void *hcpu) +-{ +- long cpu = (long)hcpu; +- +- switch (action) { +- case CPU_ONLINE: +- uv_heartbeat_enable(cpu); +- break; +- case CPU_DOWN_PREPARE: +- uv_heartbeat_disable(cpu); +- break; +- default: +- break; +- } +- return NOTIFY_OK; +-} +- +-static __init void uv_scir_register_cpu_notifier(void) +-{ +- hotcpu_notifier(uv_scir_cpu_notify, 0); +-} +- +-#else /* !CONFIG_HOTPLUG_CPU */ +- +-static __init void uv_scir_register_cpu_notifier(void) +-{ +-} +- +-static __init int uv_init_heartbeat(void) +-{ +- int cpu; +- +- if (is_uv_system()) +- for_each_online_cpu(cpu) +- uv_heartbeat_enable(cpu); +- return 0; +-} 
+- +-late_initcall(uv_init_heartbeat); +- +-#endif /* !CONFIG_HOTPLUG_CPU */ +- +-/* +- * Called on each cpu to initialize the per_cpu UV data area. +- * ZZZ hotplug not supported yet +- */ +-void __cpuinit uv_cpu_init(void) +-{ +- /* CPU 0 initilization will be done via uv_system_init. */ +- if (!uv_blade_info) +- return; +- +- uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; +- +- if (get_uv_system_type() == UV_NON_UNIQUE_APIC) +- set_x2apic_extra_bits(uv_hub_info->pnode); +-} +- +- +-void __init uv_system_init(void) +-{ +- union uvh_si_addr_map_config_u m_n_config; +- union uvh_node_id_u node_id; +- unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; +- int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; +- int max_pnode = 0; +- unsigned long mmr_base, present; +- +- map_low_mmrs(); +- +- m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); +- m_val = m_n_config.s.m_skt; +- n_val = m_n_config.s.n_skt; +- mmr_base = +- uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & +- ~UV_MMR_ENABLE; +- printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); +- +- for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) +- uv_possible_blades += +- hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); +- printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); +- +- bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); +- uv_blade_info = kmalloc(bytes, GFP_KERNEL); +- +- get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); +- +- bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); +- uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); +- memset(uv_node_to_blade, 255, bytes); +- +- bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); +- uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); +- memset(uv_cpu_to_blade, 255, bytes); +- +- blade = 0; +- for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { +- present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); +- for (j = 0; j < 64; j++) { +- if (!test_bit(j, &present)) +- continue; +- uv_blade_info[blade].pnode = (i * 64 + j); +- uv_blade_info[blade].nr_possible_cpus = 0; +- uv_blade_info[blade].nr_online_cpus = 0; +- blade++; +- } +- } +- +- node_id.v = uv_read_local_mmr(UVH_NODE_ID); +- gnode_upper = (((unsigned long)node_id.s.node_id) & +- ~((1 << n_val) - 1)) << m_val; +- +- uv_bios_init(); +- uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, +- &sn_coherency_id, &sn_region_size); +- uv_rtc_init(); +- +- for_each_present_cpu(cpu) { +- nid = cpu_to_node(cpu); +- pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); +- blade = boot_pnode_to_blade(pnode); +- lcpu = uv_blade_info[blade].nr_possible_cpus; +- uv_blade_info[blade].nr_possible_cpus++; +- +- uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; +- uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; +- uv_cpu_hub_info(cpu)->m_val = m_val; +- uv_cpu_hub_info(cpu)->n_val = m_val; +- uv_cpu_hub_info(cpu)->numa_blade_id = blade; +- uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; +- uv_cpu_hub_info(cpu)->pnode = pnode; +- uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1; +- uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; +- uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; +- uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; +- uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; +- uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; +- uv_node_to_blade[nid] = blade; +- uv_cpu_to_blade[cpu] = blade; +- max_pnode = max(pnode, max_pnode); +- +- printk(KERN_DEBUG 
"UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " +- "lcpu %d, blade %d\n", +- cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, +- lcpu, blade); +- } +- +- map_gru_high(max_pnode); +- map_mmr_high(max_pnode); +- map_config_high(max_pnode); +- map_mmioh_high(max_pnode); +- +- uv_cpu_init(); +- uv_scir_register_cpu_notifier(); +- proc_mkdir("sgi_uv", NULL); +-} +Index: linux-2.6-tip/arch/x86/kernel/head32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/head32.c ++++ linux-2.6-tip/arch/x86/kernel/head32.c +@@ -18,7 +18,7 @@ void __init i386_start_kernel(void) + { + reserve_trampoline_memory(); + +- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); ++ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + + #ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD */ +@@ -29,9 +29,6 @@ void __init i386_start_kernel(void) + reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); + } + #endif +- reserve_early(init_pg_tables_start, init_pg_tables_end, +- "INIT_PG_TABLE"); +- + reserve_ebda_region(); + + /* +Index: linux-2.6-tip/arch/x86/kernel/head64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/head64.c ++++ linux-2.6-tip/arch/x86/kernel/head64.c +@@ -26,32 +26,15 @@ + #include + #include + +-/* boot cpu pda */ +-static struct x8664_pda _boot_cpu_pda; +- +-#ifdef CONFIG_SMP +-/* +- * We install an empty cpu_pda pointer table to indicate to early users +- * (numa_set_node) that the cpu_pda pointer table for cpus other than +- * the boot cpu is not yet setup. +- */ +-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; +-#else +-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; +-#endif +- +-void __init x86_64_init_pda(void) +-{ +- _cpu_pda = __cpu_pda; +- cpu_pda(0) = &_boot_cpu_pda; +- pda_init(0); +-} +- + static void __init zap_identity_mappings(void) + { + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); +- __flush_tlb_all(); ++ /* ++ * preempt_disable/enable does not work this early in the ++ * bootup yet: ++ */ ++ write_cr3(read_cr3()); + } + + /* Don't add a printk in there. printk relies on the PDA which is not initialized +@@ -112,8 +95,6 @@ void __init x86_64_start_kernel(char * r + if (console_loglevel == 10) + early_printk("Kernel alive\n"); + +- x86_64_init_pda(); +- + x86_64_start_reservations(real_mode_data); + } + +@@ -123,7 +104,7 @@ void __init x86_64_start_reservations(ch + + reserve_trampoline_memory(); + +- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); ++ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + + #ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD */ +Index: linux-2.6-tip/arch/x86/kernel/head_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/head_32.S ++++ linux-2.6-tip/arch/x86/kernel/head_32.S +@@ -11,14 +11,15 @@ + #include + #include + #include +-#include +-#include ++#include ++#include + #include + #include + #include + #include + #include + #include ++#include + + /* Physical address */ + #define pa(X) ((X) - __PAGE_OFFSET) +@@ -37,42 +38,40 @@ + #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + + /* +- * This is how much memory *in addition to the memory covered up to +- * and including _end* we need mapped initially. ++ * This is how much memory in addition to the memory covered up to ++ * and including _end we need mapped initially. 
+ * We need: +- * - one bit for each possible page, but only in low memory, which means +- * 2^32/4096/8 = 128K worst case (4G/4G split.) +- * - enough space to map all low memory, which means +- * (2^32/4096) / 1024 pages (worst case, non PAE) +- * (2^32/4096) / 512 + 4 pages (worst case for PAE) +- * - a few pages for allocator use before the kernel pagetable has +- * been set up ++ * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) ++ * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) + * + * Modulo rounding, each megabyte assigned here requires a kilobyte of + * memory, which is currently unreclaimed. + * + * This should be a multiple of a page. ++ * ++ * KERNEL_IMAGE_SIZE should be greater than pa(_end) ++ * and small than max_low_pfn, otherwise will waste some page table entries + */ +-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) +- +-/* +- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate +- * pagetables from above the 16MB DMA limit, so we'll have to set +- * up pagetables 16MB more (worst-case): +- */ +-#ifdef CONFIG_DEBUG_PAGEALLOC +-LOW_PAGES = LOW_PAGES + 0x1000000 +-#endif + + #if PTRS_PER_PMD > 1 +-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD ++#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) + #else +-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) ++#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) + #endif +-BOOTBITMAP_SIZE = LOW_PAGES / 8 +-ALLOCATOR_SLOP = 4 + +-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm ++/* Enough space to fit pagetables for the low memory linear map */ ++MAPPING_BEYOND_END = \ ++ PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT ++ ++/* ++ * Worst-case size of the kernel mapping we need to make: ++ * the worst-case size of the kernel itself, plus the extra we need ++ * to map for the linear map. ++ */ ++KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT ++ ++INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm ++RESERVE_BRK(pagetables, INIT_MAP_SIZE) + + /* + * 32-bit kernel entrypoint; only used by the boot CPU. On entry, +@@ -165,10 +164,10 @@ num_subarch_entries = (. - subarch_entri + + /* + * Initialize page tables. This creates a PDE and a set of page +- * tables, which are located immediately beyond _end. The variable +- * init_pg_tables_end is set up to point to the first "safe" location. ++ * tables, which are located immediately beyond __brk_base. The variable ++ * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) +- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. ++ * and PAGE_OFFSET for up to _end. + * + * Note that the stack is not yet set up! + */ +@@ -189,8 +188,7 @@ default_entry: + + xorl %ebx,%ebx /* %ebx is kept at zero */ + +- movl $pa(pg0), %edi +- movl %edi, pa(init_pg_tables_start) ++ movl $pa(__brk_base), %edi + movl $pa(swapper_pg_pmd), %edx + movl $PTE_IDENT_ATTR, %eax + 10: +@@ -208,14 +206,14 @@ default_entry: + loop 11b + + /* +- * End condition: we must map up to and including INIT_MAP_BEYOND_END +- * bytes beyond the end of our own page tables. ++ * End condition: we must map up to the end + MAPPING_BEYOND_END. 
+ */ +- leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp ++ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp + cmpl %ebp,%eax + jb 10b + 1: +- movl %edi,pa(init_pg_tables_end) ++ addl $__PAGE_OFFSET, %edi ++ movl %edi, pa(_brk_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) + +@@ -226,8 +224,7 @@ default_entry: + + page_pde_offset = (__PAGE_OFFSET >> 20); + +- movl $pa(pg0), %edi +- movl %edi, pa(init_pg_tables_start) ++ movl $pa(__brk_base), %edi + movl $pa(swapper_pg_dir), %edx + movl $PTE_IDENT_ATTR, %eax + 10: +@@ -241,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20); + addl $0x1000,%eax + loop 11b + /* +- * End condition: we must map up to and including INIT_MAP_BEYOND_END +- * bytes beyond the end of our own page tables; the +0x007 is +- * the attribute bits ++ * End condition: we must map up to the end + MAPPING_BEYOND_END. + */ +- leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp ++ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp + cmpl %ebp,%eax + jb 10b +- movl %edi,pa(init_pg_tables_end) ++ addl $__PAGE_OFFSET, %edi ++ movl %edi, pa(_brk_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) + +@@ -429,14 +425,34 @@ is386: movl $2,%ecx # set MP + ljmp $(__KERNEL_CS),$1f + 1: movl $(__KERNEL_DS),%eax # reload all the segment registers + movl %eax,%ss # after changing gdt. +- movl %eax,%fs # gets reset once there's real percpu + + movl $(__USER_DS),%eax # DS/ES contains default USER segment + movl %eax,%ds + movl %eax,%es + +- xorl %eax,%eax # Clear GS and LDT ++ movl $(__KERNEL_PERCPU), %eax ++ movl %eax,%fs # set this cpu's percpu ++ ++#ifdef CONFIG_CC_STACKPROTECTOR ++ /* ++ * The linker can't handle this by relocation. Manually set ++ * base address in stack canary segment descriptor. ++ */ ++ cmpb $0,ready ++ jne 1f ++ movl $per_cpu__gdt_page,%eax ++ movl $per_cpu__stack_canary,%ecx ++ subl $20, %ecx ++ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) ++ shrl $16, %ecx ++ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) ++ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) ++1: ++#endif ++ movl $(__KERNEL_STACK_CANARY),%eax + movl %eax,%gs ++ ++ xorl %eax,%eax # Clear LDT + lldt %ax + + cld # gcc2 wants the direction flag cleared at all times +@@ -446,8 +462,6 @@ is386: movl $2,%ecx # set MP + movb $1, ready + cmpb $0,%cl # the first CPU calls start_kernel + je 1f +- movl $(__KERNEL_PERCPU), %eax +- movl %eax,%fs # set this cpu's percpu + movl (stack_start), %esp + 1: + #endif /* CONFIG_SMP */ +@@ -548,12 +562,8 @@ early_fault: + pushl %eax + pushl %edx /* trapno */ + pushl $fault_msg +-#ifdef CONFIG_EARLY_PRINTK +- call early_printk +-#else + call printk + #endif +-#endif + call dump_stack + hlt_loop: + hlt +@@ -580,12 +590,12 @@ ignore_int: + pushl 32(%esp) + pushl 40(%esp) + pushl $int_msg +-#ifdef CONFIG_EARLY_PRINTK +- call early_printk +-#else + call printk +-#endif ++ ++ call dump_stack ++ + addl $(5*4),%esp ++ call dump_stack + popl %ds + popl %es + popl %edx +@@ -622,6 +632,7 @@ swapper_pg_fixmap: + .fill 1024,4,0 + ENTRY(empty_zero_page) + .fill 4096,1,0 ++ + /* + * This starts the data section. 
+ */ +@@ -660,7 +671,7 @@ early_recursion_flag: + .long 0 + + int_msg: +- .asciz "Unknown interrupt or fault at EIP %p %p %p\n" ++ .asciz "Unknown interrupt or fault at: %p %p %p\n" + + fault_msg: + /* fault info: */ +Index: linux-2.6-tip/arch/x86/kernel/head_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/head_64.S ++++ linux-2.6-tip/arch/x86/kernel/head_64.S +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_PARAVIRT + #include +@@ -226,12 +227,15 @@ ENTRY(secondary_startup_64) + movl %eax,%fs + movl %eax,%gs + +- /* +- * Setup up a dummy PDA. this is just for some early bootup code +- * that does in_interrupt() +- */ ++ /* Set up %gs. ++ * ++ * The base of %gs always points to the bottom of the irqstack ++ * union. If the stack protector canary is enabled, it is ++ * located at %gs:40. Note that, on SMP, the boot cpu uses ++ * init data section till per cpu areas are set up. ++ */ + movl $MSR_GS_BASE,%ecx +- movq $empty_zero_page,%rax ++ movq initial_gs(%rip),%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr +@@ -257,6 +261,8 @@ ENTRY(secondary_startup_64) + .align 8 + ENTRY(initial_code) + .quad x86_64_start_kernel ++ ENTRY(initial_gs) ++ .quad INIT_PER_CPU_VAR(irq_stack_union) + __FINITDATA + + ENTRY(stack_start) +@@ -323,8 +329,6 @@ early_idt_ripmsg: + #endif /* CONFIG_EARLY_PRINTK */ + .previous + +-.balign PAGE_SIZE +- + #define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ + ENTRY(name) +@@ -401,7 +405,8 @@ NEXT_PAGE(level2_spare_pgt) + .globl early_gdt_descr + early_gdt_descr: + .word GDT_ENTRIES*8-1 +- .quad per_cpu__gdt_page ++early_gdt_descr_base: ++ .quad INIT_PER_CPU_VAR(gdt_page) + + ENTRY(phys_base) + /* This must match the first entry in level2_kernel_pgt */ +@@ -412,7 +417,7 @@ ENTRY(phys_base) + .section .bss, "aw", @nobits + .align L1_CACHE_BYTES + ENTRY(idt_table) +- .skip 256 * 16 ++ .skip IDT_ENTRIES * 16 + + .section .bss.page_aligned, "aw", @nobits + .align PAGE_SIZE +Index: linux-2.6-tip/arch/x86/kernel/hpet.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/hpet.c ++++ linux-2.6-tip/arch/x86/kernel/hpet.c +@@ -80,6 +80,7 @@ static inline void hpet_clear_mapping(vo + */ + static int boot_hpet_disable; + int hpet_force_user; ++static int hpet_verbose; + + static int __init hpet_setup(char *str) + { +@@ -88,6 +89,8 @@ static int __init hpet_setup(char *str) + boot_hpet_disable = 1; + if (!strncmp("force", str, 5)) + hpet_force_user = 1; ++ if (!strncmp("verbose", str, 7)) ++ hpet_verbose = 1; + } + return 1; + } +@@ -119,6 +122,43 @@ int is_hpet_enabled(void) + } + EXPORT_SYMBOL_GPL(is_hpet_enabled); + ++static void _hpet_print_config(const char *function, int line) ++{ ++ u32 i, timers, l, h; ++ printk(KERN_INFO "hpet: %s(%d):\n", function, line); ++ l = hpet_readl(HPET_ID); ++ h = hpet_readl(HPET_PERIOD); ++ timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; ++ printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h); ++ l = hpet_readl(HPET_CFG); ++ h = hpet_readl(HPET_STATUS); ++ printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h); ++ l = hpet_readl(HPET_COUNTER); ++ h = hpet_readl(HPET_COUNTER+4); ++ printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); ++ ++ for (i = 0; i < timers; i++) { ++ l = hpet_readl(HPET_Tn_CFG(i)); ++ h = hpet_readl(HPET_Tn_CFG(i)+4); ++ printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", ++ i, l, h); ++ l = hpet_readl(HPET_Tn_CMP(i)); ++ h = 
hpet_readl(HPET_Tn_CMP(i)+4); ++ printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", ++ i, l, h); ++ l = hpet_readl(HPET_Tn_ROUTE(i)); ++ h = hpet_readl(HPET_Tn_ROUTE(i)+4); ++ printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", ++ i, l, h); ++ } ++} ++ ++#define hpet_print_config() \ ++do { \ ++ if (hpet_verbose) \ ++ _hpet_print_config(__FUNCTION__, __LINE__); \ ++} while (0) ++ + /* + * When the hpet driver (/dev/hpet) is enabled, we need to reserve + * timer 0 and timer 1 in case of RTC emulation. +@@ -301,6 +341,7 @@ static void hpet_set_mode(enum clock_eve + */ + hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); + hpet_start_counter(); ++ hpet_print_config(); + break; + + case CLOCK_EVT_MODE_ONESHOT: +@@ -327,6 +368,7 @@ static void hpet_set_mode(enum clock_eve + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); + enable_irq(hdev->irq); + } ++ hpet_print_config(); + break; + } + } +@@ -468,7 +510,8 @@ static int hpet_setup_irq(struct hpet_de + { + + if (request_irq(dev->irq, hpet_interrupt_handler, +- IRQF_DISABLED|IRQF_NOBALANCING, dev->name, dev)) ++ IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, ++ dev->name, dev)) + return -1; + + disable_irq(dev->irq); +@@ -545,6 +588,7 @@ static void hpet_msi_capability_lookup(u + + num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + num_timers++; /* Value read out starts from 0 */ ++ hpet_print_config(); + + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); + if (!hpet_devs) +@@ -812,6 +856,7 @@ int __init hpet_enable(void) + * information and the number of channels + */ + id = hpet_readl(HPET_ID); ++ hpet_print_config(); + + #ifdef CONFIG_HPET_EMULATE_RTC + /* +@@ -864,6 +909,7 @@ static __init int hpet_late_init(void) + return -ENODEV; + + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); ++ hpet_print_config(); + + for_each_online_cpu(cpu) { + hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); +Index: linux-2.6-tip/arch/x86/kernel/i8253.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/i8253.c ++++ linux-2.6-tip/arch/x86/kernel/i8253.c +@@ -3,19 +3,19 @@ + * + */ + #include +-#include + #include ++#include + #include + #include +-#include ++#include ++#include ++#include + +-#include +-#include + #include +-#include + #include ++#include + +-DEFINE_SPINLOCK(i8253_lock); ++DEFINE_RAW_SPINLOCK(i8253_lock); + EXPORT_SYMBOL(i8253_lock); + + #ifdef CONFIG_X86_32 +@@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_ev + { + spin_lock(&i8253_lock); + +- switch(mode) { ++ switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_pit(0x34, PIT_MODE); +@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long + * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - + * !using_apic_timer decisions in do_timer_interrupt_hook() + */ +-static struct clock_event_device pit_clockevent = { ++static struct clock_event_device pit_ce = { + .name = "pit", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = init_pit_timer, +@@ -114,15 +114,13 @@ void __init setup_pit_timer(void) + * Start pit with the boot cpu mask and make it global after the + * IO_APIC has been initialized. 
+ */ +- pit_clockevent.cpumask = cpumask_of(smp_processor_id()); +- pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, +- pit_clockevent.shift); +- pit_clockevent.max_delta_ns = +- clockevent_delta2ns(0x7FFF, &pit_clockevent); +- pit_clockevent.min_delta_ns = +- clockevent_delta2ns(0xF, &pit_clockevent); +- clockevents_register_device(&pit_clockevent); +- global_clock_event = &pit_clockevent; ++ pit_ce.cpumask = cpumask_of(smp_processor_id()); ++ pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift); ++ pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce); ++ pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce); ++ ++ clockevents_register_device(&pit_ce); ++ global_clock_event = &pit_ce; + } + + #ifndef CONFIG_X86_64 +@@ -133,11 +131,11 @@ void __init setup_pit_timer(void) + */ + static cycle_t pit_read(void) + { ++ static int old_count; ++ static u32 old_jifs; + unsigned long flags; + int count; + u32 jifs; +- static int old_count; +- static u32 old_jifs; + + spin_lock_irqsave(&i8253_lock, flags); + /* +@@ -179,9 +177,9 @@ static cycle_t pit_read(void) + * Previous attempts to handle these cases intelligently were + * buggy, so we just do the simple thing now. + */ +- if (count > old_count && jifs == old_jifs) { ++ if (count > old_count && jifs == old_jifs) + count = old_count; +- } ++ + old_count = count; + old_jifs = jifs; + +@@ -192,13 +190,13 @@ static cycle_t pit_read(void) + return (cycle_t)(jifs * LATCH) + count; + } + +-static struct clocksource clocksource_pit = { +- .name = "pit", +- .rating = 110, +- .read = pit_read, +- .mask = CLOCKSOURCE_MASK(32), +- .mult = 0, +- .shift = 20, ++static struct clocksource pit_cs = { ++ .name = "pit", ++ .rating = 110, ++ .read = pit_read, ++ .mask = CLOCKSOURCE_MASK(32), ++ .mult = 0, ++ .shift = 20, + }; + + static void pit_disable_clocksource(void) +@@ -206,9 +204,9 @@ static void pit_disable_clocksource(void + /* + * Use mult to check whether it is registered or not + */ +- if (clocksource_pit.mult) { +- clocksource_unregister(&clocksource_pit); +- clocksource_pit.mult = 0; ++ if (pit_cs.mult) { ++ clocksource_unregister(&pit_cs); ++ pit_cs.mult = 0; + } + } + +@@ -222,13 +220,13 @@ static int __init init_pit_clocksource(v + * - when local APIC timer is active (PIT is switched off) + */ + if (num_possible_cpus() > 1 || is_hpet_enabled() || +- pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) ++ pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) + return 0; + +- clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, +- clocksource_pit.shift); +- return clocksource_register(&clocksource_pit); ++ pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); ++ ++ return clocksource_register(&pit_cs); + } + arch_initcall(init_pit_clocksource); + +-#endif ++#endif /* !CONFIG_X86_64 */ +Index: linux-2.6-tip/arch/x86/kernel/i8259.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/i8259.c ++++ linux-2.6-tip/arch/x86/kernel/i8259.c +@@ -22,7 +22,6 @@ + #include + #include + #include +-#include + #include + + /* +@@ -33,8 +32,8 @@ + */ + + static int i8259A_auto_eoi; +-DEFINE_SPINLOCK(i8259A_lock); + static void mask_and_ack_8259A(unsigned int); ++DEFINE_RAW_SPINLOCK(i8259A_lock); + + struct irq_chip i8259A_chip = { + .name = "XT-PIC", +@@ -169,6 +168,8 @@ static void mask_and_ack_8259A(unsigned + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; ++ if (irq & 8) ++ outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to slave */ + cached_irq_mask |= 
irqmask; + + handle_real_irq: +@@ -329,10 +330,10 @@ void init_8259A(int auto_eoi) + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); + +- if (auto_eoi) /* master does Auto EOI */ +- outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); +- else /* master expects normal EOI */ +- outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); ++ if (!auto_eoi) /* master expects normal EOI */ ++ outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); ++ else /* master does Auto EOI */ ++ outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + +Index: linux-2.6-tip/arch/x86/kernel/io_apic.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/io_apic.c ++++ /dev/null +@@ -1,4182 +0,0 @@ +-/* +- * Intel IO-APIC support for multi-Pentium hosts. +- * +- * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo +- * +- * Many thanks to Stig Venaas for trying out countless experimental +- * patches and reporting/debugging problems patiently! +- * +- * (c) 1999, Multiple IO-APIC support, developed by +- * Ken-ichi Yaku and +- * Hidemi Kishimoto , +- * further tested and cleaned up by Zach Brown +- * and Ingo Molnar +- * +- * Fixes +- * Maciej W. Rozycki : Bits for genuine 82489DX APICs; +- * thanks to Eric Gilmore +- * and Rolf G. Tews +- * for testing these extensively +- * Paul Diefenbaugh : Added full ACPI support +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include /* time_after() */ +-#ifdef CONFIG_ACPI +-#include +-#endif +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +- +-#define __apicdebuginit(type) static type __init +- +-/* +- * Is the SiS APIC rmw bug present ? +- * -1 = don't know, 0 = no, 1 = yes +- */ +-int sis_apic_bug = -1; +- +-static DEFINE_SPINLOCK(ioapic_lock); +-static DEFINE_SPINLOCK(vector_lock); +- +-/* +- * # of IRQ routing registers +- */ +-int nr_ioapic_registers[MAX_IO_APICS]; +- +-/* I/O APIC entries */ +-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; +-int nr_ioapics; +- +-/* MP IRQ source entries */ +-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; +- +-/* # of MP IRQ source entries */ +-int mp_irq_entries; +- +-#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +-int mp_bus_id_to_type[MAX_MP_BUSSES]; +-#endif +- +-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); +- +-int skip_ioapic_setup; +- +-static int __init parse_noapic(char *str) +-{ +- /* disable IO-APIC */ +- disable_ioapic_setup(); +- return 0; +-} +-early_param("noapic", parse_noapic); +- +-struct irq_pin_list; +- +-/* +- * This is performance-critical, we want to do it O(1) +- * +- * the indexing order of this array favors 1:1 mappings +- * between pins and IRQs. 
+- */ +- +-struct irq_pin_list { +- int apic, pin; +- struct irq_pin_list *next; +-}; +- +-static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +-{ +- struct irq_pin_list *pin; +- int node; +- +- node = cpu_to_node(cpu); +- +- pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); +- +- return pin; +-} +- +-struct irq_cfg { +- struct irq_pin_list *irq_2_pin; +- cpumask_var_t domain; +- cpumask_var_t old_domain; +- unsigned move_cleanup_count; +- u8 vector; +- u8 move_in_progress : 1; +-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC +- u8 move_desc_pending : 1; +-#endif +-}; +- +-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +-#ifdef CONFIG_SPARSE_IRQ +-static struct irq_cfg irq_cfgx[] = { +-#else +-static struct irq_cfg irq_cfgx[NR_IRQS] = { +-#endif +- [0] = { .vector = IRQ0_VECTOR, }, +- [1] = { .vector = IRQ1_VECTOR, }, +- [2] = { .vector = IRQ2_VECTOR, }, +- [3] = { .vector = IRQ3_VECTOR, }, +- [4] = { .vector = IRQ4_VECTOR, }, +- [5] = { .vector = IRQ5_VECTOR, }, +- [6] = { .vector = IRQ6_VECTOR, }, +- [7] = { .vector = IRQ7_VECTOR, }, +- [8] = { .vector = IRQ8_VECTOR, }, +- [9] = { .vector = IRQ9_VECTOR, }, +- [10] = { .vector = IRQ10_VECTOR, }, +- [11] = { .vector = IRQ11_VECTOR, }, +- [12] = { .vector = IRQ12_VECTOR, }, +- [13] = { .vector = IRQ13_VECTOR, }, +- [14] = { .vector = IRQ14_VECTOR, }, +- [15] = { .vector = IRQ15_VECTOR, }, +-}; +- +-int __init arch_early_irq_init(void) +-{ +- struct irq_cfg *cfg; +- struct irq_desc *desc; +- int count; +- int i; +- +- cfg = irq_cfgx; +- count = ARRAY_SIZE(irq_cfgx); +- +- for (i = 0; i < count; i++) { +- desc = irq_to_desc(i); +- desc->chip_data = &cfg[i]; +- alloc_bootmem_cpumask_var(&cfg[i].domain); +- alloc_bootmem_cpumask_var(&cfg[i].old_domain); +- if (i < NR_IRQS_LEGACY) +- cpumask_setall(cfg[i].domain); +- } +- +- return 0; +-} +- +-#ifdef CONFIG_SPARSE_IRQ +-static struct irq_cfg *irq_cfg(unsigned int irq) +-{ +- struct irq_cfg *cfg = NULL; +- struct irq_desc *desc; +- +- desc = irq_to_desc(irq); +- if (desc) +- cfg = desc->chip_data; +- +- return cfg; +-} +- +-static struct irq_cfg *get_one_free_irq_cfg(int cpu) +-{ +- struct irq_cfg *cfg; +- int node; +- +- node = cpu_to_node(cpu); +- +- cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); +- if (cfg) { +- if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { +- kfree(cfg); +- cfg = NULL; +- } else if (!alloc_cpumask_var_node(&cfg->old_domain, +- GFP_ATOMIC, node)) { +- free_cpumask_var(cfg->domain); +- kfree(cfg); +- cfg = NULL; +- } else { +- cpumask_clear(cfg->domain); +- cpumask_clear(cfg->old_domain); +- } +- } +- +- return cfg; +-} +- +-int arch_init_chip_data(struct irq_desc *desc, int cpu) +-{ +- struct irq_cfg *cfg; +- +- cfg = desc->chip_data; +- if (!cfg) { +- desc->chip_data = get_one_free_irq_cfg(cpu); +- if (!desc->chip_data) { +- printk(KERN_ERR "can not alloc irq_cfg\n"); +- BUG_ON(1); +- } +- } +- +- return 0; +-} +- +-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC +- +-static void +-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +-{ +- struct irq_pin_list *old_entry, *head, *tail, *entry; +- +- cfg->irq_2_pin = NULL; +- old_entry = old_cfg->irq_2_pin; +- if (!old_entry) +- return; +- +- entry = get_one_free_irq_2_pin(cpu); +- if (!entry) +- return; +- +- entry->apic = old_entry->apic; +- entry->pin = old_entry->pin; +- head = entry; +- tail = entry; +- old_entry = old_entry->next; +- while (old_entry) { +- entry = get_one_free_irq_2_pin(cpu); +- if (!entry) { +- entry = head; +- while (entry) { +- head = entry->next; +- 
kfree(entry); +- entry = head; +- } +- /* still use the old one */ +- return; +- } +- entry->apic = old_entry->apic; +- entry->pin = old_entry->pin; +- tail->next = entry; +- tail = entry; +- old_entry = old_entry->next; +- } +- +- tail->next = NULL; +- cfg->irq_2_pin = head; +-} +- +-static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) +-{ +- struct irq_pin_list *entry, *next; +- +- if (old_cfg->irq_2_pin == cfg->irq_2_pin) +- return; +- +- entry = old_cfg->irq_2_pin; +- +- while (entry) { +- next = entry->next; +- kfree(entry); +- entry = next; +- } +- old_cfg->irq_2_pin = NULL; +-} +- +-void arch_init_copy_chip_data(struct irq_desc *old_desc, +- struct irq_desc *desc, int cpu) +-{ +- struct irq_cfg *cfg; +- struct irq_cfg *old_cfg; +- +- cfg = get_one_free_irq_cfg(cpu); +- +- if (!cfg) +- return; +- +- desc->chip_data = cfg; +- +- old_cfg = old_desc->chip_data; +- +- memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); +- +- init_copy_irq_2_pin(old_cfg, cfg, cpu); +-} +- +-static void free_irq_cfg(struct irq_cfg *old_cfg) +-{ +- kfree(old_cfg); +-} +- +-void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) +-{ +- struct irq_cfg *old_cfg, *cfg; +- +- old_cfg = old_desc->chip_data; +- cfg = desc->chip_data; +- +- if (old_cfg == cfg) +- return; +- +- if (old_cfg) { +- free_irq_2_pin(old_cfg, cfg); +- free_irq_cfg(old_cfg); +- old_desc->chip_data = NULL; +- } +-} +- +-static void +-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +-{ +- struct irq_cfg *cfg = desc->chip_data; +- +- if (!cfg->move_in_progress) { +- /* it means that domain is not changed */ +- if (!cpumask_intersects(&desc->affinity, mask)) +- cfg->move_desc_pending = 1; +- } +-} +-#endif +- +-#else +-static struct irq_cfg *irq_cfg(unsigned int irq) +-{ +- return irq < nr_irqs ? irq_cfgx + irq : NULL; +-} +- +-#endif +- +-#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC +-static inline void +-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +-{ +-} +-#endif +- +-struct io_apic { +- unsigned int index; +- unsigned int unused[3]; +- unsigned int data; +-}; +- +-static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +-{ +- return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) +- + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); +-} +- +-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +-{ +- struct io_apic __iomem *io_apic = io_apic_base(apic); +- writel(reg, &io_apic->index); +- return readl(&io_apic->data); +-} +- +-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +-{ +- struct io_apic __iomem *io_apic = io_apic_base(apic); +- writel(reg, &io_apic->index); +- writel(value, &io_apic->data); +-} +- +-/* +- * Re-write a value: to be used for read-modify-write +- * cycles where the read already set up the index register. 
+- * +- * Older SiS APIC requires we rewrite the index register +- */ +-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +-{ +- struct io_apic __iomem *io_apic = io_apic_base(apic); +- +- if (sis_apic_bug) +- writel(reg, &io_apic->index); +- writel(value, &io_apic->data); +-} +- +-static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +-{ +- struct irq_pin_list *entry; +- unsigned long flags; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- entry = cfg->irq_2_pin; +- for (;;) { +- unsigned int reg; +- int pin; +- +- if (!entry) +- break; +- pin = entry->pin; +- reg = io_apic_read(entry->apic, 0x10 + pin*2); +- /* Is the remote IRR bit set? */ +- if (reg & IO_APIC_REDIR_REMOTE_IRR) { +- spin_unlock_irqrestore(&ioapic_lock, flags); +- return true; +- } +- if (!entry->next) +- break; +- entry = entry->next; +- } +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- return false; +-} +- +-union entry_union { +- struct { u32 w1, w2; }; +- struct IO_APIC_route_entry entry; +-}; +- +-static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +-{ +- union entry_union eu; +- unsigned long flags; +- spin_lock_irqsave(&ioapic_lock, flags); +- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); +- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- return eu.entry; +-} +- +-/* +- * When we write a new IO APIC routing entry, we need to write the high +- * word first! If the mask bit in the low word is clear, we will enable +- * the interrupt, and we need to make sure the entry is fully populated +- * before that happens. +- */ +-static void +-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +-{ +- union entry_union eu; +- eu.entry = e; +- io_apic_write(apic, 0x11 + 2*pin, eu.w2); +- io_apic_write(apic, 0x10 + 2*pin, eu.w1); +-} +- +-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +-{ +- unsigned long flags; +- spin_lock_irqsave(&ioapic_lock, flags); +- __ioapic_write_entry(apic, pin, e); +- spin_unlock_irqrestore(&ioapic_lock, flags); +-} +- +-/* +- * When we mask an IO APIC routing entry, we need to write the low +- * word first, in order to set the mask bit before we change the +- * high bits! 
+- */ +-static void ioapic_mask_entry(int apic, int pin) +-{ +- unsigned long flags; +- union entry_union eu = { .entry.mask = 1 }; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- io_apic_write(apic, 0x10 + 2*pin, eu.w1); +- io_apic_write(apic, 0x11 + 2*pin, eu.w2); +- spin_unlock_irqrestore(&ioapic_lock, flags); +-} +- +-#ifdef CONFIG_SMP +-static void send_cleanup_vector(struct irq_cfg *cfg) +-{ +- cpumask_var_t cleanup_mask; +- +- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { +- unsigned int i; +- cfg->move_cleanup_count = 0; +- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) +- cfg->move_cleanup_count++; +- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) +- send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); +- } else { +- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); +- cfg->move_cleanup_count = cpumask_weight(cleanup_mask); +- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); +- free_cpumask_var(cleanup_mask); +- } +- cfg->move_in_progress = 0; +-} +- +-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +-{ +- int apic, pin; +- struct irq_pin_list *entry; +- u8 vector = cfg->vector; +- +- entry = cfg->irq_2_pin; +- for (;;) { +- unsigned int reg; +- +- if (!entry) +- break; +- +- apic = entry->apic; +- pin = entry->pin; +-#ifdef CONFIG_INTR_REMAP +- /* +- * With interrupt-remapping, destination information comes +- * from interrupt-remapping table entry. +- */ +- if (!irq_remapped(irq)) +- io_apic_write(apic, 0x11 + pin*2, dest); +-#else +- io_apic_write(apic, 0x11 + pin*2, dest); +-#endif +- reg = io_apic_read(apic, 0x10 + pin*2); +- reg &= ~IO_APIC_REDIR_VECTOR_MASK; +- reg |= vector; +- io_apic_modify(apic, 0x10 + pin*2, reg); +- if (!entry->next) +- break; +- entry = entry->next; +- } +-} +- +-static int +-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); +- +-/* +- * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid +- * of that, or returns BAD_APICID and leaves desc->affinity untouched. +- */ +-static unsigned int +-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +-{ +- struct irq_cfg *cfg; +- unsigned int irq; +- +- if (!cpumask_intersects(mask, cpu_online_mask)) +- return BAD_APICID; +- +- irq = desc->irq; +- cfg = desc->chip_data; +- if (assign_irq_vector(irq, cfg, mask)) +- return BAD_APICID; +- +- cpumask_and(&desc->affinity, cfg->domain, mask); +- set_extra_move_desc(desc, mask); +- return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); +-} +- +-static void +-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +-{ +- struct irq_cfg *cfg; +- unsigned long flags; +- unsigned int dest; +- unsigned int irq; +- +- irq = desc->irq; +- cfg = desc->chip_data; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- dest = set_desc_affinity(desc, mask); +- if (dest != BAD_APICID) { +- /* Only the high 8 bits are valid. */ +- dest = SET_APIC_LOGICAL_ID(dest); +- __target_IO_APIC_irq(irq, dest, cfg); +- } +- spin_unlock_irqrestore(&ioapic_lock, flags); +-} +- +-static void +-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +-{ +- struct irq_desc *desc; +- +- desc = irq_to_desc(irq); +- +- set_ioapic_affinity_irq_desc(desc, mask); +-} +-#endif /* CONFIG_SMP */ +- +-/* +- * The common case is 1:1 IRQ<->pin mappings. Sometimes there are +- * shared ISA-space IRQs, so we have to support them. We are super +- * fast in the common case, and fast for shared ISA-space IRQs. 
+- */ +-static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +-{ +- struct irq_pin_list *entry; +- +- entry = cfg->irq_2_pin; +- if (!entry) { +- entry = get_one_free_irq_2_pin(cpu); +- if (!entry) { +- printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", +- apic, pin); +- return; +- } +- cfg->irq_2_pin = entry; +- entry->apic = apic; +- entry->pin = pin; +- return; +- } +- +- while (entry->next) { +- /* not again, please */ +- if (entry->apic == apic && entry->pin == pin) +- return; +- +- entry = entry->next; +- } +- +- entry->next = get_one_free_irq_2_pin(cpu); +- entry = entry->next; +- entry->apic = apic; +- entry->pin = pin; +-} +- +-/* +- * Reroute an IRQ to a different pin. +- */ +-static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +- int oldapic, int oldpin, +- int newapic, int newpin) +-{ +- struct irq_pin_list *entry = cfg->irq_2_pin; +- int replaced = 0; +- +- while (entry) { +- if (entry->apic == oldapic && entry->pin == oldpin) { +- entry->apic = newapic; +- entry->pin = newpin; +- replaced = 1; +- /* every one is different, right? */ +- break; +- } +- entry = entry->next; +- } +- +- /* why? call replace before add? */ +- if (!replaced) +- add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); +-} +- +-static inline void io_apic_modify_irq(struct irq_cfg *cfg, +- int mask_and, int mask_or, +- void (*final)(struct irq_pin_list *entry)) +-{ +- int pin; +- struct irq_pin_list *entry; +- +- for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { +- unsigned int reg; +- pin = entry->pin; +- reg = io_apic_read(entry->apic, 0x10 + pin * 2); +- reg &= mask_and; +- reg |= mask_or; +- io_apic_modify(entry->apic, 0x10 + pin * 2, reg); +- if (final) +- final(entry); +- } +-} +- +-static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) +-{ +- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); +-} +- +-#ifdef CONFIG_X86_64 +-static void io_apic_sync(struct irq_pin_list *entry) +-{ +- /* +- * Synchronize the IO-APIC and the CPU by doing +- * a dummy read from the IO-APIC +- */ +- struct io_apic __iomem *io_apic; +- io_apic = io_apic_base(entry->apic); +- readl(&io_apic->data); +-} +- +-static void __mask_IO_APIC_irq(struct irq_cfg *cfg) +-{ +- io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); +-} +-#else /* CONFIG_X86_32 */ +-static void __mask_IO_APIC_irq(struct irq_cfg *cfg) +-{ +- io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); +-} +- +-static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) +-{ +- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, +- IO_APIC_REDIR_MASKED, NULL); +-} +- +-static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) +-{ +- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, +- IO_APIC_REDIR_LEVEL_TRIGGER, NULL); +-} +-#endif /* CONFIG_X86_32 */ +- +-static void mask_IO_APIC_irq_desc(struct irq_desc *desc) +-{ +- struct irq_cfg *cfg = desc->chip_data; +- unsigned long flags; +- +- BUG_ON(!cfg); +- +- spin_lock_irqsave(&ioapic_lock, flags); +- __mask_IO_APIC_irq(cfg); +- spin_unlock_irqrestore(&ioapic_lock, flags); +-} +- +-static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) +-{ +- struct irq_cfg *cfg = desc->chip_data; +- unsigned long flags; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- __unmask_IO_APIC_irq(cfg); +- spin_unlock_irqrestore(&ioapic_lock, flags); +-} +- +-static void mask_IO_APIC_irq(unsigned int irq) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- +- mask_IO_APIC_irq_desc(desc); +-} +-static void unmask_IO_APIC_irq(unsigned int irq) +-{ 
+- struct irq_desc *desc = irq_to_desc(irq); +- +- unmask_IO_APIC_irq_desc(desc); +-} +- +-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +-{ +- struct IO_APIC_route_entry entry; +- +- /* Check delivery_mode to be sure we're not clearing an SMI pin */ +- entry = ioapic_read_entry(apic, pin); +- if (entry.delivery_mode == dest_SMI) +- return; +- /* +- * Disable it in the IO-APIC irq-routing table: +- */ +- ioapic_mask_entry(apic, pin); +-} +- +-static void clear_IO_APIC (void) +-{ +- int apic, pin; +- +- for (apic = 0; apic < nr_ioapics; apic++) +- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) +- clear_IO_APIC_pin(apic, pin); +-} +- +-#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) +-void send_IPI_self(int vector) +-{ +- unsigned int cfg; +- +- /* +- * Wait for idle. +- */ +- apic_wait_icr_idle(); +- cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; +- /* +- * Send the IPI. The write to APIC_ICR fires this off. +- */ +- apic_write(APIC_ICR, cfg); +-} +-#endif /* !CONFIG_SMP && CONFIG_X86_32*/ +- +-#ifdef CONFIG_X86_32 +-/* +- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to +- * specific CPU-side IRQs. +- */ +- +-#define MAX_PIRQS 8 +-static int pirq_entries [MAX_PIRQS]; +-static int pirqs_enabled; +- +-static int __init ioapic_pirq_setup(char *str) +-{ +- int i, max; +- int ints[MAX_PIRQS+1]; +- +- get_options(str, ARRAY_SIZE(ints), ints); +- +- for (i = 0; i < MAX_PIRQS; i++) +- pirq_entries[i] = -1; +- +- pirqs_enabled = 1; +- apic_printk(APIC_VERBOSE, KERN_INFO +- "PIRQ redirection, working around broken MP-BIOS.\n"); +- max = MAX_PIRQS; +- if (ints[0] < MAX_PIRQS) +- max = ints[0]; +- +- for (i = 0; i < max; i++) { +- apic_printk(APIC_VERBOSE, KERN_DEBUG +- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); +- /* +- * PIRQs are mapped upside down, usually. 
+- */ +- pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; +- } +- return 1; +-} +- +-__setup("pirq=", ioapic_pirq_setup); +-#endif /* CONFIG_X86_32 */ +- +-#ifdef CONFIG_INTR_REMAP +-/* I/O APIC RTE contents at the OS boot up */ +-static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; +- +-/* +- * Saves and masks all the unmasked IO-APIC RTE's +- */ +-int save_mask_IO_APIC_setup(void) +-{ +- union IO_APIC_reg_01 reg_01; +- unsigned long flags; +- int apic, pin; +- +- /* +- * The number of IO-APIC IRQ registers (== #pins): +- */ +- for (apic = 0; apic < nr_ioapics; apic++) { +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_01.raw = io_apic_read(apic, 1); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- nr_ioapic_registers[apic] = reg_01.bits.entries+1; +- } +- +- for (apic = 0; apic < nr_ioapics; apic++) { +- early_ioapic_entries[apic] = +- kzalloc(sizeof(struct IO_APIC_route_entry) * +- nr_ioapic_registers[apic], GFP_KERNEL); +- if (!early_ioapic_entries[apic]) +- goto nomem; +- } +- +- for (apic = 0; apic < nr_ioapics; apic++) +- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { +- struct IO_APIC_route_entry entry; +- +- entry = early_ioapic_entries[apic][pin] = +- ioapic_read_entry(apic, pin); +- if (!entry.mask) { +- entry.mask = 1; +- ioapic_write_entry(apic, pin, entry); +- } +- } +- +- return 0; +- +-nomem: +- while (apic >= 0) +- kfree(early_ioapic_entries[apic--]); +- memset(early_ioapic_entries, 0, +- ARRAY_SIZE(early_ioapic_entries)); +- +- return -ENOMEM; +-} +- +-void restore_IO_APIC_setup(void) +-{ +- int apic, pin; +- +- for (apic = 0; apic < nr_ioapics; apic++) { +- if (!early_ioapic_entries[apic]) +- break; +- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) +- ioapic_write_entry(apic, pin, +- early_ioapic_entries[apic][pin]); +- kfree(early_ioapic_entries[apic]); +- early_ioapic_entries[apic] = NULL; +- } +-} +- +-void reinit_intr_remapped_IO_APIC(int intr_remapping) +-{ +- /* +- * for now plain restore of previous settings. +- * TBD: In the case of OS enabling interrupt-remapping, +- * IO-APIC RTE's need to be setup to point to interrupt-remapping +- * table entries. for now, do a plain restore, and wait for +- * the setup_IO_APIC_irqs() to do proper initialization. +- */ +- restore_IO_APIC_setup(); +-} +-#endif +- +-/* +- * Find the IRQ entry number of a certain pin. 
+- */ +-static int find_irq_entry(int apic, int pin, int type) +-{ +- int i; +- +- for (i = 0; i < mp_irq_entries; i++) +- if (mp_irqs[i].mp_irqtype == type && +- (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || +- mp_irqs[i].mp_dstapic == MP_APIC_ALL) && +- mp_irqs[i].mp_dstirq == pin) +- return i; +- +- return -1; +-} +- +-/* +- * Find the pin to which IRQ[irq] (ISA) is connected +- */ +-static int __init find_isa_irq_pin(int irq, int type) +-{ +- int i; +- +- for (i = 0; i < mp_irq_entries; i++) { +- int lbus = mp_irqs[i].mp_srcbus; +- +- if (test_bit(lbus, mp_bus_not_pci) && +- (mp_irqs[i].mp_irqtype == type) && +- (mp_irqs[i].mp_srcbusirq == irq)) +- +- return mp_irqs[i].mp_dstirq; +- } +- return -1; +-} +- +-static int __init find_isa_irq_apic(int irq, int type) +-{ +- int i; +- +- for (i = 0; i < mp_irq_entries; i++) { +- int lbus = mp_irqs[i].mp_srcbus; +- +- if (test_bit(lbus, mp_bus_not_pci) && +- (mp_irqs[i].mp_irqtype == type) && +- (mp_irqs[i].mp_srcbusirq == irq)) +- break; +- } +- if (i < mp_irq_entries) { +- int apic; +- for(apic = 0; apic < nr_ioapics; apic++) { +- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) +- return apic; +- } +- } +- +- return -1; +-} +- +-/* +- * Find a specific PCI IRQ entry. +- * Not an __init, possibly needed by modules +- */ +-static int pin_2_irq(int idx, int apic, int pin); +- +-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +-{ +- int apic, i, best_guess = -1; +- +- apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", +- bus, slot, pin); +- if (test_bit(bus, mp_bus_not_pci)) { +- apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); +- return -1; +- } +- for (i = 0; i < mp_irq_entries; i++) { +- int lbus = mp_irqs[i].mp_srcbus; +- +- for (apic = 0; apic < nr_ioapics; apic++) +- if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || +- mp_irqs[i].mp_dstapic == MP_APIC_ALL) +- break; +- +- if (!test_bit(lbus, mp_bus_not_pci) && +- !mp_irqs[i].mp_irqtype && +- (bus == lbus) && +- (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { +- int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); +- +- if (!(apic || IO_APIC_IRQ(irq))) +- continue; +- +- if (pin == (mp_irqs[i].mp_srcbusirq & 3)) +- return irq; +- /* +- * Use the first all-but-pin matching entry as a +- * best-guess fuzzy result for broken mptables. +- */ +- if (best_guess < 0) +- best_guess = irq; +- } +- } +- return best_guess; +-} +- +-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); +- +-#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +-/* +- * EISA Edge/Level control register, ELCR +- */ +-static int EISA_ELCR(unsigned int irq) +-{ +- if (irq < NR_IRQS_LEGACY) { +- unsigned int port = 0x4d0 + (irq >> 3); +- return (inb(port) >> (irq & 7)) & 1; +- } +- apic_printk(APIC_VERBOSE, KERN_INFO +- "Broken MPtable reports ISA irq %d\n", irq); +- return 0; +-} +- +-#endif +- +-/* ISA interrupts are always polarity zero edge triggered, +- * when listed as conforming in the MP table. */ +- +-#define default_ISA_trigger(idx) (0) +-#define default_ISA_polarity(idx) (0) +- +-/* EISA interrupts are always polarity zero and can be edge or level +- * trigger depending on the ELCR value. 
If an interrupt is listed as +- * EISA conforming in the MP table, that means its trigger type must +- * be read in from the ELCR */ +- +-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +-#define default_EISA_polarity(idx) default_ISA_polarity(idx) +- +-/* PCI interrupts are always polarity one level triggered, +- * when listed as conforming in the MP table. */ +- +-#define default_PCI_trigger(idx) (1) +-#define default_PCI_polarity(idx) (1) +- +-/* MCA interrupts are always polarity zero level triggered, +- * when listed as conforming in the MP table. */ +- +-#define default_MCA_trigger(idx) (1) +-#define default_MCA_polarity(idx) default_ISA_polarity(idx) +- +-static int MPBIOS_polarity(int idx) +-{ +- int bus = mp_irqs[idx].mp_srcbus; +- int polarity; +- +- /* +- * Determine IRQ line polarity (high active or low active): +- */ +- switch (mp_irqs[idx].mp_irqflag & 3) +- { +- case 0: /* conforms, ie. bus-type dependent polarity */ +- if (test_bit(bus, mp_bus_not_pci)) +- polarity = default_ISA_polarity(idx); +- else +- polarity = default_PCI_polarity(idx); +- break; +- case 1: /* high active */ +- { +- polarity = 0; +- break; +- } +- case 2: /* reserved */ +- { +- printk(KERN_WARNING "broken BIOS!!\n"); +- polarity = 1; +- break; +- } +- case 3: /* low active */ +- { +- polarity = 1; +- break; +- } +- default: /* invalid */ +- { +- printk(KERN_WARNING "broken BIOS!!\n"); +- polarity = 1; +- break; +- } +- } +- return polarity; +-} +- +-static int MPBIOS_trigger(int idx) +-{ +- int bus = mp_irqs[idx].mp_srcbus; +- int trigger; +- +- /* +- * Determine IRQ trigger mode (edge or level sensitive): +- */ +- switch ((mp_irqs[idx].mp_irqflag>>2) & 3) +- { +- case 0: /* conforms, ie. bus-type dependent */ +- if (test_bit(bus, mp_bus_not_pci)) +- trigger = default_ISA_trigger(idx); +- else +- trigger = default_PCI_trigger(idx); +-#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +- switch (mp_bus_id_to_type[bus]) { +- case MP_BUS_ISA: /* ISA pin */ +- { +- /* set before the switch */ +- break; +- } +- case MP_BUS_EISA: /* EISA pin */ +- { +- trigger = default_EISA_trigger(idx); +- break; +- } +- case MP_BUS_PCI: /* PCI pin */ +- { +- /* set before the switch */ +- break; +- } +- case MP_BUS_MCA: /* MCA pin */ +- { +- trigger = default_MCA_trigger(idx); +- break; +- } +- default: +- { +- printk(KERN_WARNING "broken BIOS!!\n"); +- trigger = 1; +- break; +- } +- } +-#endif +- break; +- case 1: /* edge */ +- { +- trigger = 0; +- break; +- } +- case 2: /* reserved */ +- { +- printk(KERN_WARNING "broken BIOS!!\n"); +- trigger = 1; +- break; +- } +- case 3: /* level */ +- { +- trigger = 1; +- break; +- } +- default: /* invalid */ +- { +- printk(KERN_WARNING "broken BIOS!!\n"); +- trigger = 0; +- break; +- } +- } +- return trigger; +-} +- +-static inline int irq_polarity(int idx) +-{ +- return MPBIOS_polarity(idx); +-} +- +-static inline int irq_trigger(int idx) +-{ +- return MPBIOS_trigger(idx); +-} +- +-int (*ioapic_renumber_irq)(int ioapic, int irq); +-static int pin_2_irq(int idx, int apic, int pin) +-{ +- int irq, i; +- int bus = mp_irqs[idx].mp_srcbus; +- +- /* +- * Debugging check, we are in big trouble if this message pops up! 
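MPBIOS_polarity()/MPBIOS_trigger() above decode the low four bits of mp_irqflag: bits 0-1 select polarity (0 = bus default, 1 = active high, 3 = active low) and bits 2-3 select trigger (0 = bus default, 1 = edge, 3 = level), with the bus defaults being edge/active-high for ISA and level/active-low for PCI. A compact standalone restatement of that decode, using the same encoding as the code above (1 = active low / level, 0 = active high / edge):

#include <stdbool.h>
#include <stdio.h>

static void decode_irqflag(unsigned irqflag, bool bus_is_pci,
                           int *polarity, int *trigger)
{
    switch (irqflag & 3) {              /* bits 0-1: polarity */
    case 0:  *polarity = bus_is_pci ? 1 : 0; break;  /* bus default */
    case 1:  *polarity = 0; break;                   /* active high */
    case 3:  *polarity = 1; break;                   /* active low */
    default: *polarity = 1; break;                   /* reserved */
    }
    switch ((irqflag >> 2) & 3) {       /* bits 2-3: trigger */
    case 0:  *trigger = bus_is_pci ? 1 : 0; break;   /* bus default */
    case 1:  *trigger = 0; break;                    /* edge */
    case 3:  *trigger = 1; break;                    /* level */
    default: *trigger = 1; break;                    /* reserved */
    }
}

int main(void)
{
    int pol, trig;

    decode_irqflag(0x0, true, &pol, &trig);   /* "conforms" on a PCI bus */
    printf("PCI default: polarity=%d trigger=%d\n", pol, trig);   /* 1 1 */
    return 0;
}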
+- */ +- if (mp_irqs[idx].mp_dstirq != pin) +- printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); +- +- if (test_bit(bus, mp_bus_not_pci)) { +- irq = mp_irqs[idx].mp_srcbusirq; +- } else { +- /* +- * PCI IRQs are mapped in order +- */ +- i = irq = 0; +- while (i < apic) +- irq += nr_ioapic_registers[i++]; +- irq += pin; +- /* +- * For MPS mode, so far only needed by ES7000 platform +- */ +- if (ioapic_renumber_irq) +- irq = ioapic_renumber_irq(apic, irq); +- } +- +-#ifdef CONFIG_X86_32 +- /* +- * PCI IRQ command line redirection. Yes, limits are hardcoded. +- */ +- if ((pin >= 16) && (pin <= 23)) { +- if (pirq_entries[pin-16] != -1) { +- if (!pirq_entries[pin-16]) { +- apic_printk(APIC_VERBOSE, KERN_DEBUG +- "disabling PIRQ%d\n", pin-16); +- } else { +- irq = pirq_entries[pin-16]; +- apic_printk(APIC_VERBOSE, KERN_DEBUG +- "using PIRQ%d -> IRQ %d\n", +- pin-16, irq); +- } +- } +- } +-#endif +- +- return irq; +-} +- +-void lock_vector_lock(void) +-{ +- /* Used to the online set of cpus does not change +- * during assign_irq_vector. +- */ +- spin_lock(&vector_lock); +-} +- +-void unlock_vector_lock(void) +-{ +- spin_unlock(&vector_lock); +-} +- +-static int +-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +-{ +- /* +- * NOTE! The local APIC isn't very good at handling +- * multiple interrupts at the same interrupt level. +- * As the interrupt level is determined by taking the +- * vector number and shifting that right by 4, we +- * want to spread these out a bit so that they don't +- * all fall in the same interrupt level. +- * +- * Also, we've got to be careful not to trash gate +- * 0x80, because int 0x80 is hm, kind of importantish. ;) +- */ +- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; +- unsigned int old_vector; +- int cpu, err; +- cpumask_var_t tmp_mask; +- +- if ((cfg->move_in_progress) || cfg->move_cleanup_count) +- return -EBUSY; +- +- if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) +- return -ENOMEM; +- +- old_vector = cfg->vector; +- if (old_vector) { +- cpumask_and(tmp_mask, mask, cpu_online_mask); +- cpumask_and(tmp_mask, cfg->domain, tmp_mask); +- if (!cpumask_empty(tmp_mask)) { +- free_cpumask_var(tmp_mask); +- return 0; +- } +- } +- +- /* Only try and allocate irqs on cpus that are present */ +- err = -ENOSPC; +- for_each_cpu_and(cpu, mask, cpu_online_mask) { +- int new_cpu; +- int vector, offset; +- +- vector_allocation_domain(cpu, tmp_mask); +- +- vector = current_vector; +- offset = current_offset; +-next: +- vector += 8; +- if (vector >= first_system_vector) { +- /* If out of vectors on large boxen, must share them. */ +- offset = (offset + 1) % 8; +- vector = FIRST_DEVICE_VECTOR + offset; +- } +- if (unlikely(current_vector == vector)) +- continue; +- +- if (test_bit(vector, used_vectors)) +- goto next; +- +- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) +- if (per_cpu(vector_irq, new_cpu)[vector] != -1) +- goto next; +- /* Found one! 
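The search in __assign_irq_vector() above steps through candidate vectors eight at a time so consecutive IRQs land in different interrupt priority levels (the level is the vector shifted right by four), and wraps back to the start with a rotating offset when it runs off the end. A standalone sketch of just that search over a boolean in-use table (the start vector, table size and stride are illustrative constants, not the kernel's; the termination counter replaces the kernel's own loop-exit checks):

#include <stdbool.h>
#include <stdio.h>

#define FIRST_VECTOR    0x20    /* illustrative, not the kernel's value */
#define NR_VECTORS      256
#define STRIDE          8       /* spreads vectors across priority levels */

static bool used[NR_VECTORS];

/* Returns an unused vector, or -1 if the table is exhausted. */
static int alloc_vector(void)
{
    static int current_vector = FIRST_VECTOR, current_offset = 0;
    int vector = current_vector;
    int offset = current_offset;
    int tried = 0;

    while (tried++ < NR_VECTORS) {
        vector += STRIDE;
        if (vector >= NR_VECTORS) {
            /* out of room: rotate the offset and wrap around */
            offset = (offset + 1) % STRIDE;
            vector = FIRST_VECTOR + offset;
        }
        if (used[vector])
            continue;
        used[vector] = true;
        current_vector = vector;
        current_offset = offset;
        return vector;
    }
    return -1;
}

int main(void)
{
    int a = alloc_vector();
    int b = alloc_vector();

    printf("first two vectors: 0x%x 0x%x\n", a, b);   /* 0x28 0x30 */
    return 0;
}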
*/ +- current_vector = vector; +- current_offset = offset; +- if (old_vector) { +- cfg->move_in_progress = 1; +- cpumask_copy(cfg->old_domain, cfg->domain); +- } +- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) +- per_cpu(vector_irq, new_cpu)[vector] = irq; +- cfg->vector = vector; +- cpumask_copy(cfg->domain, tmp_mask); +- err = 0; +- break; +- } +- free_cpumask_var(tmp_mask); +- return err; +-} +- +-static int +-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +-{ +- int err; +- unsigned long flags; +- +- spin_lock_irqsave(&vector_lock, flags); +- err = __assign_irq_vector(irq, cfg, mask); +- spin_unlock_irqrestore(&vector_lock, flags); +- return err; +-} +- +-static void __clear_irq_vector(int irq, struct irq_cfg *cfg) +-{ +- int cpu, vector; +- +- BUG_ON(!cfg->vector); +- +- vector = cfg->vector; +- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) +- per_cpu(vector_irq, cpu)[vector] = -1; +- +- cfg->vector = 0; +- cpumask_clear(cfg->domain); +- +- if (likely(!cfg->move_in_progress)) +- return; +- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { +- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; +- vector++) { +- if (per_cpu(vector_irq, cpu)[vector] != irq) +- continue; +- per_cpu(vector_irq, cpu)[vector] = -1; +- break; +- } +- } +- cfg->move_in_progress = 0; +-} +- +-void __setup_vector_irq(int cpu) +-{ +- /* Initialize vector_irq on a new cpu */ +- /* This function must be called with vector_lock held */ +- int irq, vector; +- struct irq_cfg *cfg; +- struct irq_desc *desc; +- +- /* Mark the inuse vectors */ +- for_each_irq_desc(irq, desc) { +- cfg = desc->chip_data; +- if (!cpumask_test_cpu(cpu, cfg->domain)) +- continue; +- vector = cfg->vector; +- per_cpu(vector_irq, cpu)[vector] = irq; +- } +- /* Mark the free vectors */ +- for (vector = 0; vector < NR_VECTORS; ++vector) { +- irq = per_cpu(vector_irq, cpu)[vector]; +- if (irq < 0) +- continue; +- +- cfg = irq_cfg(irq); +- if (!cpumask_test_cpu(cpu, cfg->domain)) +- per_cpu(vector_irq, cpu)[vector] = -1; +- } +-} +- +-static struct irq_chip ioapic_chip; +-#ifdef CONFIG_INTR_REMAP +-static struct irq_chip ir_ioapic_chip; +-#endif +- +-#define IOAPIC_AUTO -1 +-#define IOAPIC_EDGE 0 +-#define IOAPIC_LEVEL 1 +- +-#ifdef CONFIG_X86_32 +-static inline int IO_APIC_irq_trigger(int irq) +-{ +- int apic, idx, pin; +- +- for (apic = 0; apic < nr_ioapics; apic++) { +- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { +- idx = find_irq_entry(apic, pin, mp_INT); +- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) +- return irq_trigger(idx); +- } +- } +- /* +- * nonexistent IRQs are edge default +- */ +- return 0; +-} +-#else +-static inline int IO_APIC_irq_trigger(int irq) +-{ +- return 1; +-} +-#endif +- +-static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) +-{ +- +- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || +- trigger == IOAPIC_LEVEL) +- desc->status |= IRQ_LEVEL; +- else +- desc->status &= ~IRQ_LEVEL; +- +-#ifdef CONFIG_INTR_REMAP +- if (irq_remapped(irq)) { +- desc->status |= IRQ_MOVE_PCNTXT; +- if (trigger) +- set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, +- handle_fasteoi_irq, +- "fasteoi"); +- else +- set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, +- handle_edge_irq, "edge"); +- return; +- } +-#endif +- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || +- trigger == IOAPIC_LEVEL) +- set_irq_chip_and_handler_name(irq, &ioapic_chip, +- handle_fasteoi_irq, +- "fasteoi"); +- else +- 
set_irq_chip_and_handler_name(irq, &ioapic_chip, +- handle_edge_irq, "edge"); +-} +- +-static int setup_ioapic_entry(int apic, int irq, +- struct IO_APIC_route_entry *entry, +- unsigned int destination, int trigger, +- int polarity, int vector) +-{ +- /* +- * add it to the IO-APIC irq-routing table: +- */ +- memset(entry,0,sizeof(*entry)); +- +-#ifdef CONFIG_INTR_REMAP +- if (intr_remapping_enabled) { +- struct intel_iommu *iommu = map_ioapic_to_ir(apic); +- struct irte irte; +- struct IR_IO_APIC_route_entry *ir_entry = +- (struct IR_IO_APIC_route_entry *) entry; +- int index; +- +- if (!iommu) +- panic("No mapping iommu for ioapic %d\n", apic); +- +- index = alloc_irte(iommu, irq, 1); +- if (index < 0) +- panic("Failed to allocate IRTE for ioapic %d\n", apic); +- +- memset(&irte, 0, sizeof(irte)); +- +- irte.present = 1; +- irte.dst_mode = INT_DEST_MODE; +- irte.trigger_mode = trigger; +- irte.dlvry_mode = INT_DELIVERY_MODE; +- irte.vector = vector; +- irte.dest_id = IRTE_DEST(destination); +- +- modify_irte(irq, &irte); +- +- ir_entry->index2 = (index >> 15) & 0x1; +- ir_entry->zero = 0; +- ir_entry->format = 1; +- ir_entry->index = (index & 0x7fff); +- } else +-#endif +- { +- entry->delivery_mode = INT_DELIVERY_MODE; +- entry->dest_mode = INT_DEST_MODE; +- entry->dest = destination; +- } +- +- entry->mask = 0; /* enable IRQ */ +- entry->trigger = trigger; +- entry->polarity = polarity; +- entry->vector = vector; +- +- /* Mask level triggered irqs. +- * Use IRQ_DELAYED_DISABLE for edge triggered irqs. +- */ +- if (trigger) +- entry->mask = 1; +- return 0; +-} +- +-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, +- int trigger, int polarity) +-{ +- struct irq_cfg *cfg; +- struct IO_APIC_route_entry entry; +- unsigned int dest; +- +- if (!IO_APIC_IRQ(irq)) +- return; +- +- cfg = desc->chip_data; +- +- if (assign_irq_vector(irq, cfg, TARGET_CPUS)) +- return; +- +- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); +- +- apic_printk(APIC_VERBOSE,KERN_DEBUG +- "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " +- "IRQ %d Mode:%i Active:%i)\n", +- apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, +- irq, trigger, polarity); +- +- +- if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, +- dest, trigger, polarity, cfg->vector)) { +- printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", +- mp_ioapics[apic].mp_apicid, pin); +- __clear_irq_vector(irq, cfg); +- return; +- } +- +- ioapic_register_intr(irq, desc, trigger); +- if (irq < NR_IRQS_LEGACY) +- disable_8259A_irq(irq); +- +- ioapic_write_entry(apic, pin, entry); +-} +- +-static void __init setup_IO_APIC_irqs(void) +-{ +- int apic, pin, idx, irq; +- int notcon = 0; +- struct irq_desc *desc; +- struct irq_cfg *cfg; +- int cpu = boot_cpu_id; +- +- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); +- +- for (apic = 0; apic < nr_ioapics; apic++) { +- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { +- +- idx = find_irq_entry(apic, pin, mp_INT); +- if (idx == -1) { +- if (!notcon) { +- notcon = 1; +- apic_printk(APIC_VERBOSE, +- KERN_DEBUG " %d-%d", +- mp_ioapics[apic].mp_apicid, +- pin); +- } else +- apic_printk(APIC_VERBOSE, " %d-%d", +- mp_ioapics[apic].mp_apicid, +- pin); +- continue; +- } +- if (notcon) { +- apic_printk(APIC_VERBOSE, +- " (apicid-pin) not connected\n"); +- notcon = 0; +- } +- +- irq = pin_2_irq(idx, apic, pin); +-#ifdef CONFIG_X86_32 +- if (multi_timer_check(apic, irq)) +- continue; +-#endif +- desc = irq_to_desc_alloc_cpu(irq, cpu); +- if 
(!desc) { +- printk(KERN_INFO "can not get irq_desc for %d\n", irq); +- continue; +- } +- cfg = desc->chip_data; +- add_pin_to_irq_cpu(cfg, cpu, apic, pin); +- +- setup_IO_APIC_irq(apic, pin, irq, desc, +- irq_trigger(idx), irq_polarity(idx)); +- } +- } +- +- if (notcon) +- apic_printk(APIC_VERBOSE, +- " (apicid-pin) not connected\n"); +-} +- +-/* +- * Set up the timer pin, possibly with the 8259A-master behind. +- */ +-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, +- int vector) +-{ +- struct IO_APIC_route_entry entry; +- +-#ifdef CONFIG_INTR_REMAP +- if (intr_remapping_enabled) +- return; +-#endif +- +- memset(&entry, 0, sizeof(entry)); +- +- /* +- * We use logical delivery to get the timer IRQ +- * to the first CPU. +- */ +- entry.dest_mode = INT_DEST_MODE; +- entry.mask = 1; /* mask IRQ now */ +- entry.dest = cpu_mask_to_apicid(TARGET_CPUS); +- entry.delivery_mode = INT_DELIVERY_MODE; +- entry.polarity = 0; +- entry.trigger = 0; +- entry.vector = vector; +- +- /* +- * The timer IRQ doesn't have to know that behind the +- * scene we may have a 8259A-master in AEOI mode ... +- */ +- set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); +- +- /* +- * Add it to the IO-APIC irq-routing table: +- */ +- ioapic_write_entry(apic, pin, entry); +-} +- +- +-__apicdebuginit(void) print_IO_APIC(void) +-{ +- int apic, i; +- union IO_APIC_reg_00 reg_00; +- union IO_APIC_reg_01 reg_01; +- union IO_APIC_reg_02 reg_02; +- union IO_APIC_reg_03 reg_03; +- unsigned long flags; +- struct irq_cfg *cfg; +- struct irq_desc *desc; +- unsigned int irq; +- +- if (apic_verbosity == APIC_QUIET) +- return; +- +- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); +- for (i = 0; i < nr_ioapics; i++) +- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", +- mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); +- +- /* +- * We are a bit conservative about what we expect. We have to +- * know about every hardware change ASAP. +- */ +- printk(KERN_INFO "testing the IO APIC.......................\n"); +- +- for (apic = 0; apic < nr_ioapics; apic++) { +- +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_00.raw = io_apic_read(apic, 0); +- reg_01.raw = io_apic_read(apic, 1); +- if (reg_01.bits.version >= 0x10) +- reg_02.raw = io_apic_read(apic, 2); +- if (reg_01.bits.version >= 0x20) +- reg_03.raw = io_apic_read(apic, 3); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- printk("\n"); +- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); +- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); +- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); +- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); +- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); +- +- printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); +- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); +- +- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); +- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); +- +- /* +- * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, +- * but the value of reg_02 is read as the previous read register +- * value, so ignore it if reg_02 == reg_01. +- */ +- if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { +- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); +- printk(KERN_DEBUG "....... 
: arbitration: %02X\n", reg_02.bits.arbitration); +- } +- +- /* +- * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 +- * or reg_03, but the value of reg_0[23] is read as the previous read +- * register value, so ignore it if reg_03 == reg_0[12]. +- */ +- if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && +- reg_03.raw != reg_01.raw) { +- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); +- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); +- } +- +- printk(KERN_DEBUG ".... IRQ redirection table:\n"); +- +- printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" +- " Stat Dmod Deli Vect: \n"); +- +- for (i = 0; i <= reg_01.bits.entries; i++) { +- struct IO_APIC_route_entry entry; +- +- entry = ioapic_read_entry(apic, i); +- +- printk(KERN_DEBUG " %02x %03X ", +- i, +- entry.dest +- ); +- +- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", +- entry.mask, +- entry.trigger, +- entry.irr, +- entry.polarity, +- entry.delivery_status, +- entry.dest_mode, +- entry.delivery_mode, +- entry.vector +- ); +- } +- } +- printk(KERN_DEBUG "IRQ to pin mappings:\n"); +- for_each_irq_desc(irq, desc) { +- struct irq_pin_list *entry; +- +- cfg = desc->chip_data; +- entry = cfg->irq_2_pin; +- if (!entry) +- continue; +- printk(KERN_DEBUG "IRQ%d ", irq); +- for (;;) { +- printk("-> %d:%d", entry->apic, entry->pin); +- if (!entry->next) +- break; +- entry = entry->next; +- } +- printk("\n"); +- } +- +- printk(KERN_INFO ".................................... done.\n"); +- +- return; +-} +- +-__apicdebuginit(void) print_APIC_bitfield(int base) +-{ +- unsigned int v; +- int i, j; +- +- if (apic_verbosity == APIC_QUIET) +- return; +- +- printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); +- for (i = 0; i < 8; i++) { +- v = apic_read(base + i*0x10); +- for (j = 0; j < 32; j++) { +- if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ +- apic_write(APIC_ESR, 0); +- +- v = apic_read(APIC_ESR); +- printk(KERN_DEBUG "... APIC ESR: %08x\n", v); +- } +- +- icr = apic_icr_read(); +- printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); +- printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); +- +- v = apic_read(APIC_LVTT); +- printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); +- +- if (maxlvt > 3) { /* PC is LVT#4. */ +- v = apic_read(APIC_LVTPC); +- printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); +- } +- v = apic_read(APIC_LVT0); +- printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); +- v = apic_read(APIC_LVT1); +- printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); +- +- if (maxlvt > 2) { /* ERR is LVT#3. */ +- v = apic_read(APIC_LVTERR); +- printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); +- } +- +- v = apic_read(APIC_TMICT); +- printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); +- v = apic_read(APIC_TMCCT); +- printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); +- v = apic_read(APIC_TDCR); +- printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); +- printk("\n"); +-} +- +-__apicdebuginit(void) print_all_local_APICs(void) +-{ +- int cpu; +- +- preempt_disable(); +- for_each_online_cpu(cpu) +- smp_call_function_single(cpu, print_local_APIC, NULL, 1); +- preempt_enable(); +-} +- +-__apicdebuginit(void) print_PIC(void) +-{ +- unsigned int v; +- unsigned long flags; +- +- if (apic_verbosity == APIC_QUIET) +- return; +- +- printk(KERN_DEBUG "\nprinting PIC contents\n"); +- +- spin_lock_irqsave(&i8259A_lock, flags); +- +- v = inb(0xa1) << 8 | inb(0x21); +- printk(KERN_DEBUG "... PIC IMR: %04x\n", v); +- +- v = inb(0xa0) << 8 | inb(0x20); +- printk(KERN_DEBUG "... 
PIC IRR: %04x\n", v); +- +- outb(0x0b,0xa0); +- outb(0x0b,0x20); +- v = inb(0xa0) << 8 | inb(0x20); +- outb(0x0a,0xa0); +- outb(0x0a,0x20); +- +- spin_unlock_irqrestore(&i8259A_lock, flags); +- +- printk(KERN_DEBUG "... PIC ISR: %04x\n", v); +- +- v = inb(0x4d1) << 8 | inb(0x4d0); +- printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +-} +- +-__apicdebuginit(int) print_all_ICs(void) +-{ +- print_PIC(); +- print_all_local_APICs(); +- print_IO_APIC(); +- +- return 0; +-} +- +-fs_initcall(print_all_ICs); +- +- +-/* Where if anywhere is the i8259 connect in external int mode */ +-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; +- +-void __init enable_IO_APIC(void) +-{ +- union IO_APIC_reg_01 reg_01; +- int i8259_apic, i8259_pin; +- int apic; +- unsigned long flags; +- +-#ifdef CONFIG_X86_32 +- int i; +- if (!pirqs_enabled) +- for (i = 0; i < MAX_PIRQS; i++) +- pirq_entries[i] = -1; +-#endif +- +- /* +- * The number of IO-APIC IRQ registers (== #pins): +- */ +- for (apic = 0; apic < nr_ioapics; apic++) { +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_01.raw = io_apic_read(apic, 1); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- nr_ioapic_registers[apic] = reg_01.bits.entries+1; +- } +- for(apic = 0; apic < nr_ioapics; apic++) { +- int pin; +- /* See if any of the pins is in ExtINT mode */ +- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { +- struct IO_APIC_route_entry entry; +- entry = ioapic_read_entry(apic, pin); +- +- /* If the interrupt line is enabled and in ExtInt mode +- * I have found the pin where the i8259 is connected. +- */ +- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { +- ioapic_i8259.apic = apic; +- ioapic_i8259.pin = pin; +- goto found_i8259; +- } +- } +- } +- found_i8259: +- /* Look to see what if the MP table has reported the ExtINT */ +- /* If we could not find the appropriate pin by looking at the ioapic +- * the i8259 probably is not connected the ioapic but give the +- * mptable a chance anyway. +- */ +- i8259_pin = find_isa_irq_pin(0, mp_ExtINT); +- i8259_apic = find_isa_irq_apic(0, mp_ExtINT); +- /* Trust the MP table if nothing is setup in the hardware */ +- if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { +- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); +- ioapic_i8259.pin = i8259_pin; +- ioapic_i8259.apic = i8259_apic; +- } +- /* Complain if the MP table and the hardware disagree */ +- if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && +- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) +- { +- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); +- } +- +- /* +- * Do not trust the IO-APIC being empty at bootup +- */ +- clear_IO_APIC(); +-} +- +-/* +- * Not an __init, needed by the reboot code +- */ +-void disable_IO_APIC(void) +-{ +- /* +- * Clear the IO-APIC before rebooting: +- */ +- clear_IO_APIC(); +- +- /* +- * If the i8259 is routed through an IOAPIC +- * Put that IOAPIC in virtual wire mode +- * so legacy interrupts can be delivered. 
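enable_IO_APIC() above discovers where the legacy 8259A hangs off an IO-APIC by scanning every pin for an entry that is both unmasked and in ExtINT delivery mode, and only falls back to the MP table if the scan finds nothing. A small standalone sketch of that scan over mock routing entries (struct layout, sizes and the ExtINT constant are placeholders):

#include <stdio.h>

#define DELIVERY_EXTINT 7   /* placeholder for dest_ExtINT */
#define NR_APICS        2
#define NR_PINS         24

struct mock_rte { int mask; int delivery_mode; };

static struct mock_rte rte[NR_APICS][NR_PINS];

/* Find the first unmasked ExtINT pin; -1/-1 if the 8259A is not wired up. */
static void find_i8259_pin(int *apic_out, int *pin_out)
{
    int apic, pin;

    *apic_out = *pin_out = -1;
    for (apic = 0; apic < NR_APICS; apic++) {
        for (pin = 0; pin < NR_PINS; pin++) {
            if (rte[apic][pin].mask == 0 &&
                rte[apic][pin].delivery_mode == DELIVERY_EXTINT) {
                *apic_out = apic;
                *pin_out = pin;
                return;
            }
        }
    }
}

int main(void)
{
    int apic, pin;

    rte[0][0].delivery_mode = DELIVERY_EXTINT;  /* pretend pin 0.0 is ExtINT */
    find_i8259_pin(&apic, &pin);
    printf("i8259 at apic %d pin %d\n", apic, pin);
    return 0;
}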
+- */ +- if (ioapic_i8259.pin != -1) { +- struct IO_APIC_route_entry entry; +- +- memset(&entry, 0, sizeof(entry)); +- entry.mask = 0; /* Enabled */ +- entry.trigger = 0; /* Edge */ +- entry.irr = 0; +- entry.polarity = 0; /* High */ +- entry.delivery_status = 0; +- entry.dest_mode = 0; /* Physical */ +- entry.delivery_mode = dest_ExtINT; /* ExtInt */ +- entry.vector = 0; +- entry.dest = read_apic_id(); +- +- /* +- * Add it to the IO-APIC irq-routing table: +- */ +- ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); +- } +- +- disconnect_bsp_APIC(ioapic_i8259.pin != -1); +-} +- +-#ifdef CONFIG_X86_32 +-/* +- * function to set the IO-APIC physical IDs based on the +- * values stored in the MPC table. +- * +- * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 +- */ +- +-static void __init setup_ioapic_ids_from_mpc(void) +-{ +- union IO_APIC_reg_00 reg_00; +- physid_mask_t phys_id_present_map; +- int apic; +- int i; +- unsigned char old_id; +- unsigned long flags; +- +- if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) +- return; +- +- /* +- * Don't check I/O APIC IDs for xAPIC systems. They have +- * no meaning without the serial APIC bus. +- */ +- if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) +- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) +- return; +- /* +- * This is broken; anything with a real cpu count has to +- * circumvent this idiocy regardless. +- */ +- phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); +- +- /* +- * Set the IOAPIC ID to the value stored in the MPC table. +- */ +- for (apic = 0; apic < nr_ioapics; apic++) { +- +- /* Read the register 0 value */ +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_00.raw = io_apic_read(apic, 0); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- old_id = mp_ioapics[apic].mp_apicid; +- +- if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { +- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", +- apic, mp_ioapics[apic].mp_apicid); +- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", +- reg_00.bits.ID); +- mp_ioapics[apic].mp_apicid = reg_00.bits.ID; +- } +- +- /* +- * Sanity check, is the ID really free? Every APIC in a +- * system must have a unique ID or we get lots of nice +- * 'stuck on smp_invalidate_needed IPI wait' messages. +- */ +- if (check_apicid_used(phys_id_present_map, +- mp_ioapics[apic].mp_apicid)) { +- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", +- apic, mp_ioapics[apic].mp_apicid); +- for (i = 0; i < get_physical_broadcast(); i++) +- if (!physid_isset(i, phys_id_present_map)) +- break; +- if (i >= get_physical_broadcast()) +- panic("Max APIC ID exceeded!\n"); +- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", +- i); +- physid_set(i, phys_id_present_map); +- mp_ioapics[apic].mp_apicid = i; +- } else { +- physid_mask_t tmp; +- tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); +- apic_printk(APIC_VERBOSE, "Setting %d in the " +- "phys_id_present_map\n", +- mp_ioapics[apic].mp_apicid); +- physids_or(phys_id_present_map, phys_id_present_map, tmp); +- } +- +- +- /* +- * We need to adjust the IRQ routing table +- * if the ID changed. +- */ +- if (old_id != mp_ioapics[apic].mp_apicid) +- for (i = 0; i < mp_irq_entries; i++) +- if (mp_irqs[i].mp_dstapic == old_id) +- mp_irqs[i].mp_dstapic +- = mp_ioapics[apic].mp_apicid; +- +- /* +- * Read the right value from the MPC table and +- * write it into the ID register. 
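When setup_ioapic_ids_from_mpc() above finds that an IO-APIC's ID from the MPC table is already taken, it scans the physical-ID present map for the first clear bit and claims that ID instead. The same first-free-bit search as a standalone sketch over a plain bitmask (the 15-ID limit is an assumption for the example, standing in for get_physical_broadcast()):

#include <stdio.h>

#define MAX_ID  15  /* assumed broadcast limit for the sketch */

/* Return the lowest ID not set in *map and mark it used, or -1 if full. */
static int claim_free_id(unsigned int *map)
{
    int id;

    for (id = 0; id < MAX_ID; id++) {
        if (!(*map & (1u << id))) {
            *map |= 1u << id;
            return id;
        }
    }
    return -1;  /* the "Max APIC ID exceeded" case in the code above */
}

int main(void)
{
    unsigned int present = 0x0b;    /* IDs 0, 1 and 3 already in use */

    printf("fixed up to %d\n", claim_free_id(&present));   /* prints 2 */
    return 0;
}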
+- */ +- apic_printk(APIC_VERBOSE, KERN_INFO +- "...changing IO-APIC physical APIC ID to %d ...", +- mp_ioapics[apic].mp_apicid); +- +- reg_00.bits.ID = mp_ioapics[apic].mp_apicid; +- spin_lock_irqsave(&ioapic_lock, flags); +- io_apic_write(apic, 0, reg_00.raw); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- /* +- * Sanity check +- */ +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_00.raw = io_apic_read(apic, 0); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) +- printk("could not set ID!\n"); +- else +- apic_printk(APIC_VERBOSE, " ok.\n"); +- } +-} +-#endif +- +-int no_timer_check __initdata; +- +-static int __init notimercheck(char *s) +-{ +- no_timer_check = 1; +- return 1; +-} +-__setup("no_timer_check", notimercheck); +- +-/* +- * There is a nasty bug in some older SMP boards, their mptable lies +- * about the timer IRQ. We do the following to work around the situation: +- * +- * - timer IRQ defaults to IO-APIC IRQ +- * - if this function detects that timer IRQs are defunct, then we fall +- * back to ISA timer IRQs +- */ +-static int __init timer_irq_works(void) +-{ +- unsigned long t1 = jiffies; +- unsigned long flags; +- +- if (no_timer_check) +- return 1; +- +- local_save_flags(flags); +- local_irq_enable(); +- /* Let ten ticks pass... */ +- mdelay((10 * 1000) / HZ); +- local_irq_restore(flags); +- +- /* +- * Expect a few ticks at least, to be sure some possible +- * glue logic does not lock up after one or two first +- * ticks in a non-ExtINT mode. Also the local APIC +- * might have cached one ExtINT interrupt. Finally, at +- * least one tick may be lost due to delays. +- */ +- +- /* jiffies wrap? */ +- if (time_after(jiffies, t1 + 4)) +- return 1; +- return 0; +-} +- +-/* +- * In the SMP+IOAPIC case it might happen that there are an unspecified +- * number of pending IRQ events unhandled. These cases are very rare, +- * so we 'resend' these IRQs via IPIs, to the same CPU. It's much +- * better to do it this way as thus we do not have to be aware of +- * 'pending' interrupts in the IRQ path, except at this point. +- */ +-/* +- * Edge triggered needs to resend any interrupt +- * that was delayed but this is now handled in the device +- * independent code. +- */ +- +-/* +- * Starting up a edge-triggered IO-APIC interrupt is +- * nasty - we need to make sure that we get the edge. +- * If it is already asserted for some reason, we need +- * return 1 to indicate that is was pending. +- * +- * This is not complete - we should be able to fake +- * an edge even if it isn't on the 8259A... 
+- */ +- +-static unsigned int startup_ioapic_irq(unsigned int irq) +-{ +- int was_pending = 0; +- unsigned long flags; +- struct irq_cfg *cfg; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- if (irq < NR_IRQS_LEGACY) { +- disable_8259A_irq(irq); +- if (i8259A_irq_pending(irq)) +- was_pending = 1; +- } +- cfg = irq_cfg(irq); +- __unmask_IO_APIC_irq(cfg); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- return was_pending; +-} +- +-#ifdef CONFIG_X86_64 +-static int ioapic_retrigger_irq(unsigned int irq) +-{ +- +- struct irq_cfg *cfg = irq_cfg(irq); +- unsigned long flags; +- +- spin_lock_irqsave(&vector_lock, flags); +- send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); +- spin_unlock_irqrestore(&vector_lock, flags); +- +- return 1; +-} +-#else +-static int ioapic_retrigger_irq(unsigned int irq) +-{ +- send_IPI_self(irq_cfg(irq)->vector); +- +- return 1; +-} +-#endif +- +-/* +- * Level and edge triggered IO-APIC interrupts need different handling, +- * so we use two separate IRQ descriptors. Edge triggered IRQs can be +- * handled with the level-triggered descriptor, but that one has slightly +- * more overhead. Level-triggered interrupts cannot be handled with the +- * edge-triggered handler, without risking IRQ storms and other ugly +- * races. +- */ +- +-#ifdef CONFIG_SMP +- +-#ifdef CONFIG_INTR_REMAP +-static void ir_irq_migration(struct work_struct *work); +- +-static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); +- +-/* +- * Migrate the IO-APIC irq in the presence of intr-remapping. +- * +- * For edge triggered, irq migration is a simple atomic update(of vector +- * and cpu destination) of IRTE and flush the hardware cache. +- * +- * For level triggered, we need to modify the io-apic RTE aswell with the update +- * vector information, along with modifying IRTE with vector and destination. +- * So irq migration for level triggered is little bit more complex compared to +- * edge triggered migration. But the good news is, we use the same algorithm +- * for level triggered migration as we have today, only difference being, +- * we now initiate the irq migration from process context instead of the +- * interrupt context. +- * +- * In future, when we do a directed EOI (combined with cpu EOI broadcast +- * suppression) to the IO-APIC, level triggered irq migration will also be +- * as simple as edge triggered migration and we can do the irq migration +- * with a simple atomic update to IO-APIC RTE. +- */ +-static void +-migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +-{ +- struct irq_cfg *cfg; +- struct irte irte; +- int modify_ioapic_rte; +- unsigned int dest; +- unsigned long flags; +- unsigned int irq; +- +- if (!cpumask_intersects(mask, cpu_online_mask)) +- return; +- +- irq = desc->irq; +- if (get_irte(irq, &irte)) +- return; +- +- cfg = desc->chip_data; +- if (assign_irq_vector(irq, cfg, mask)) +- return; +- +- set_extra_move_desc(desc, mask); +- +- dest = cpu_mask_to_apicid_and(cfg->domain, mask); +- +- modify_ioapic_rte = desc->status & IRQ_LEVEL; +- if (modify_ioapic_rte) { +- spin_lock_irqsave(&ioapic_lock, flags); +- __target_IO_APIC_irq(irq, dest, cfg); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- } +- +- irte.vector = cfg->vector; +- irte.dest_id = IRTE_DEST(dest); +- +- /* +- * Modified the IRTE and flushes the Interrupt entry cache. 
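timer_irq_works(), shown a little earlier, decides the timer is alive if more than a handful of ticks elapsed while interrupts were briefly enabled; the comparison has to survive the tick counter wrapping, which is why it goes through time_after() rather than a plain '>'. A standalone sketch of that wrap-safe comparison (the 4-tick threshold mirrors the code above; the signed-difference trick is the usual way such helpers are written, stated here as an assumption rather than a quote of the kernel header):

#include <stdbool.h>
#include <stdio.h>

/* True if tick counter a is later than b, even if the counter wrapped. */
static bool ticks_after(unsigned long a, unsigned long b)
{
    return (long)(b - a) < 0;
}

/* The shape of the check in timer_irq_works(): did we see more than 4 ticks? */
static bool timer_looks_alive(unsigned long t1, unsigned long now)
{
    return ticks_after(now, t1 + 4);
}

int main(void)
{
    /* Near the wrap point: now has wrapped past zero, t1 has not. */
    unsigned long t1 = (unsigned long)-3, now = 7;

    printf("%d\n", timer_looks_alive(t1, now));   /* prints 1 */
    return 0;
}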
+- */ +- modify_irte(irq, &irte); +- +- if (cfg->move_in_progress) +- send_cleanup_vector(cfg); +- +- cpumask_copy(&desc->affinity, mask); +-} +- +-static int migrate_irq_remapped_level_desc(struct irq_desc *desc) +-{ +- int ret = -1; +- struct irq_cfg *cfg = desc->chip_data; +- +- mask_IO_APIC_irq_desc(desc); +- +- if (io_apic_level_ack_pending(cfg)) { +- /* +- * Interrupt in progress. Migrating irq now will change the +- * vector information in the IO-APIC RTE and that will confuse +- * the EOI broadcast performed by cpu. +- * So, delay the irq migration to the next instance. +- */ +- schedule_delayed_work(&ir_migration_work, 1); +- goto unmask; +- } +- +- /* everthing is clear. we have right of way */ +- migrate_ioapic_irq_desc(desc, &desc->pending_mask); +- +- ret = 0; +- desc->status &= ~IRQ_MOVE_PENDING; +- cpumask_clear(&desc->pending_mask); +- +-unmask: +- unmask_IO_APIC_irq_desc(desc); +- +- return ret; +-} +- +-static void ir_irq_migration(struct work_struct *work) +-{ +- unsigned int irq; +- struct irq_desc *desc; +- +- for_each_irq_desc(irq, desc) { +- if (desc->status & IRQ_MOVE_PENDING) { +- unsigned long flags; +- +- spin_lock_irqsave(&desc->lock, flags); +- if (!desc->chip->set_affinity || +- !(desc->status & IRQ_MOVE_PENDING)) { +- desc->status &= ~IRQ_MOVE_PENDING; +- spin_unlock_irqrestore(&desc->lock, flags); +- continue; +- } +- +- desc->chip->set_affinity(irq, &desc->pending_mask); +- spin_unlock_irqrestore(&desc->lock, flags); +- } +- } +-} +- +-/* +- * Migrates the IRQ destination in the process context. +- */ +-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +- const struct cpumask *mask) +-{ +- if (desc->status & IRQ_LEVEL) { +- desc->status |= IRQ_MOVE_PENDING; +- cpumask_copy(&desc->pending_mask, mask); +- migrate_irq_remapped_level_desc(desc); +- return; +- } +- +- migrate_ioapic_irq_desc(desc, mask); +-} +-static void set_ir_ioapic_affinity_irq(unsigned int irq, +- const struct cpumask *mask) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- +- set_ir_ioapic_affinity_irq_desc(desc, mask); +-} +-#endif +- +-asmlinkage void smp_irq_move_cleanup_interrupt(void) +-{ +- unsigned vector, me; +- +- ack_APIC_irq(); +- exit_idle(); +- irq_enter(); +- +- me = smp_processor_id(); +- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { +- unsigned int irq; +- unsigned int irr; +- struct irq_desc *desc; +- struct irq_cfg *cfg; +- irq = __get_cpu_var(vector_irq)[vector]; +- +- if (irq == -1) +- continue; +- +- desc = irq_to_desc(irq); +- if (!desc) +- continue; +- +- cfg = irq_cfg(irq); +- spin_lock(&desc->lock); +- if (!cfg->move_cleanup_count) +- goto unlock; +- +- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) +- goto unlock; +- +- irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); +- /* +- * Check if the vector that needs to be cleanedup is +- * registered at the cpu's IRR. If so, then this is not +- * the best time to clean it up. Lets clean it up in the +- * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR +- * to myself. 
+- */ +- if (irr & (1 << (vector % 32))) { +- send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); +- goto unlock; +- } +- __get_cpu_var(vector_irq)[vector] = -1; +- cfg->move_cleanup_count--; +-unlock: +- spin_unlock(&desc->lock); +- } +- +- irq_exit(); +-} +- +-static void irq_complete_move(struct irq_desc **descp) +-{ +- struct irq_desc *desc = *descp; +- struct irq_cfg *cfg = desc->chip_data; +- unsigned vector, me; +- +- if (likely(!cfg->move_in_progress)) { +-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC +- if (likely(!cfg->move_desc_pending)) +- return; +- +- /* domain has not changed, but affinity did */ +- me = smp_processor_id(); +- if (cpu_isset(me, desc->affinity)) { +- *descp = desc = move_irq_desc(desc, me); +- /* get the new one */ +- cfg = desc->chip_data; +- cfg->move_desc_pending = 0; +- } +-#endif +- return; +- } +- +- vector = ~get_irq_regs()->orig_ax; +- me = smp_processor_id(); +- +- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { +-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC +- *descp = desc = move_irq_desc(desc, me); +- /* get the new one */ +- cfg = desc->chip_data; +-#endif +- send_cleanup_vector(cfg); +- } +-} +-#else +-static inline void irq_complete_move(struct irq_desc **descp) {} +-#endif +- +-#ifdef CONFIG_INTR_REMAP +-static void ack_x2apic_level(unsigned int irq) +-{ +- ack_x2APIC_irq(); +-} +- +-static void ack_x2apic_edge(unsigned int irq) +-{ +- ack_x2APIC_irq(); +-} +- +-#endif +- +-static void ack_apic_edge(unsigned int irq) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- +- irq_complete_move(&desc); +- move_native_irq(irq); +- ack_APIC_irq(); +-} +- +-atomic_t irq_mis_count; +- +-static void ack_apic_level(unsigned int irq) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- +-#ifdef CONFIG_X86_32 +- unsigned long v; +- int i; +-#endif +- struct irq_cfg *cfg; +- int do_unmask_irq = 0; +- +- irq_complete_move(&desc); +-#ifdef CONFIG_GENERIC_PENDING_IRQ +- /* If we are moving the irq we need to mask it */ +- if (unlikely(desc->status & IRQ_MOVE_PENDING)) { +- do_unmask_irq = 1; +- mask_IO_APIC_irq_desc(desc); +- } +-#endif +- +-#ifdef CONFIG_X86_32 +- /* +- * It appears there is an erratum which affects at least version 0x11 +- * of I/O APIC (that's the 82093AA and cores integrated into various +- * chipsets). Under certain conditions a level-triggered interrupt is +- * erroneously delivered as edge-triggered one but the respective IRR +- * bit gets set nevertheless. As a result the I/O unit expects an EOI +- * message but it will never arrive and further interrupts are blocked +- * from the source. The exact reason is so far unknown, but the +- * phenomenon was observed when two consecutive interrupt requests +- * from a given source get delivered to the same CPU and the source is +- * temporarily disabled in between. +- * +- * A workaround is to simulate an EOI message manually. We achieve it +- * by setting the trigger mode to edge and then to level when the edge +- * trigger mode gets detected in the TMR of a local APIC for a +- * level-triggered interrupt. We mask the source for the time of the +- * operation to prevent an edge-triggered interrupt escaping meanwhile. +- * The idea is from Manfred Spraul. --macro +- */ +- cfg = desc->chip_data; +- i = cfg->vector; +- +- v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); +-#endif +- +- /* +- * We must acknowledge the irq before we move it or the acknowledge will +- * not propagate properly. 
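The 82093AA erratum workaround above needs to know whether a given vector's bit is set in the local APIC's TMR, which is laid out as a row of 32-bit registers spaced 0x10 apart; the expression APIC_TMR + ((i & ~0x1f) >> 1) plus the bit 1 << (i & 0x1f) is exactly that index arithmetic. A standalone sketch of the same vector-to-(register, bit) mapping (APIC_TMR_BASE is a placeholder offset for the example):

#include <stdio.h>

#define APIC_TMR_BASE   0x180   /* placeholder register offset for the sketch */

/* 256 vector bits live in eight 32-bit registers spaced 0x10 apart. */
static unsigned int tmr_reg_offset(unsigned int vector)
{
    return APIC_TMR_BASE + ((vector & ~0x1fu) >> 1);
}

static unsigned int tmr_bit(unsigned int vector)
{
    return 1u << (vector & 0x1f);
}

int main(void)
{
    unsigned int v = 0x31;  /* vector 49: second register, bit 17 */

    printf("offset=0x%x bit=0x%x\n", tmr_reg_offset(v), tmr_bit(v));
    return 0;
}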
+- */ +- ack_APIC_irq(); +- +- /* Now we can move and renable the irq */ +- if (unlikely(do_unmask_irq)) { +- /* Only migrate the irq if the ack has been received. +- * +- * On rare occasions the broadcast level triggered ack gets +- * delayed going to ioapics, and if we reprogram the +- * vector while Remote IRR is still set the irq will never +- * fire again. +- * +- * To prevent this scenario we read the Remote IRR bit +- * of the ioapic. This has two effects. +- * - On any sane system the read of the ioapic will +- * flush writes (and acks) going to the ioapic from +- * this cpu. +- * - We get to see if the ACK has actually been delivered. +- * +- * Based on failed experiments of reprogramming the +- * ioapic entry from outside of irq context starting +- * with masking the ioapic entry and then polling until +- * Remote IRR was clear before reprogramming the +- * ioapic I don't trust the Remote IRR bit to be +- * completey accurate. +- * +- * However there appears to be no other way to plug +- * this race, so if the Remote IRR bit is not +- * accurate and is causing problems then it is a hardware bug +- * and you can go talk to the chipset vendor about it. +- */ +- cfg = desc->chip_data; +- if (!io_apic_level_ack_pending(cfg)) +- move_masked_irq(irq); +- unmask_IO_APIC_irq_desc(desc); +- } +- +-#ifdef CONFIG_X86_32 +- if (!(v & (1 << (i & 0x1f)))) { +- atomic_inc(&irq_mis_count); +- spin_lock(&ioapic_lock); +- __mask_and_edge_IO_APIC_irq(cfg); +- __unmask_and_level_IO_APIC_irq(cfg); +- spin_unlock(&ioapic_lock); +- } +-#endif +-} +- +-static struct irq_chip ioapic_chip __read_mostly = { +- .name = "IO-APIC", +- .startup = startup_ioapic_irq, +- .mask = mask_IO_APIC_irq, +- .unmask = unmask_IO_APIC_irq, +- .ack = ack_apic_edge, +- .eoi = ack_apic_level, +-#ifdef CONFIG_SMP +- .set_affinity = set_ioapic_affinity_irq, +-#endif +- .retrigger = ioapic_retrigger_irq, +-}; +- +-#ifdef CONFIG_INTR_REMAP +-static struct irq_chip ir_ioapic_chip __read_mostly = { +- .name = "IR-IO-APIC", +- .startup = startup_ioapic_irq, +- .mask = mask_IO_APIC_irq, +- .unmask = unmask_IO_APIC_irq, +- .ack = ack_x2apic_edge, +- .eoi = ack_x2apic_level, +-#ifdef CONFIG_SMP +- .set_affinity = set_ir_ioapic_affinity_irq, +-#endif +- .retrigger = ioapic_retrigger_irq, +-}; +-#endif +- +-static inline void init_IO_APIC_traps(void) +-{ +- int irq; +- struct irq_desc *desc; +- struct irq_cfg *cfg; +- +- /* +- * NOTE! The local APIC isn't very good at handling +- * multiple interrupts at the same interrupt level. +- * As the interrupt level is determined by taking the +- * vector number and shifting that right by 4, we +- * want to spread these out a bit so that they don't +- * all fall in the same interrupt level. +- * +- * Also, we've got to be careful not to trash gate +- * 0x80, because int 0x80 is hm, kind of importantish. ;) +- */ +- for_each_irq_desc(irq, desc) { +- cfg = desc->chip_data; +- if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { +- /* +- * Hmm.. We don't have an entry for this, +- * so default to an old-fashioned 8259 +- * interrupt if we can.. +- */ +- if (irq < NR_IRQS_LEGACY) +- make_8259A_irq(irq); +- else +- /* Strange. Oh, well.. 
*/ +- desc->chip = &no_irq_chip; +- } +- } +-} +- +-/* +- * The local APIC irq-chip implementation: +- */ +- +-static void mask_lapic_irq(unsigned int irq) +-{ +- unsigned long v; +- +- v = apic_read(APIC_LVT0); +- apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +-} +- +-static void unmask_lapic_irq(unsigned int irq) +-{ +- unsigned long v; +- +- v = apic_read(APIC_LVT0); +- apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +-} +- +-static void ack_lapic_irq(unsigned int irq) +-{ +- ack_APIC_irq(); +-} +- +-static struct irq_chip lapic_chip __read_mostly = { +- .name = "local-APIC", +- .mask = mask_lapic_irq, +- .unmask = unmask_lapic_irq, +- .ack = ack_lapic_irq, +-}; +- +-static void lapic_register_intr(int irq, struct irq_desc *desc) +-{ +- desc->status &= ~IRQ_LEVEL; +- set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, +- "edge"); +-} +- +-static void __init setup_nmi(void) +-{ +- /* +- * Dirty trick to enable the NMI watchdog ... +- * We put the 8259A master into AEOI mode and +- * unmask on all local APICs LVT0 as NMI. +- * +- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') +- * is from Maciej W. Rozycki - so we do not have to EOI from +- * the NMI handler or the timer interrupt. +- */ +- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); +- +- enable_NMI_through_LVT0(); +- +- apic_printk(APIC_VERBOSE, " done.\n"); +-} +- +-/* +- * This looks a bit hackish but it's about the only one way of sending +- * a few INTA cycles to 8259As and any associated glue logic. ICR does +- * not support the ExtINT mode, unfortunately. We need to send these +- * cycles as some i82489DX-based boards have glue logic that keeps the +- * 8259A interrupt line asserted until INTA. --macro +- */ +-static inline void __init unlock_ExtINT_logic(void) +-{ +- int apic, pin, i; +- struct IO_APIC_route_entry entry0, entry1; +- unsigned char save_control, save_freq_select; +- +- pin = find_isa_irq_pin(8, mp_INT); +- if (pin == -1) { +- WARN_ON_ONCE(1); +- return; +- } +- apic = find_isa_irq_apic(8, mp_INT); +- if (apic == -1) { +- WARN_ON_ONCE(1); +- return; +- } +- +- entry0 = ioapic_read_entry(apic, pin); +- clear_IO_APIC_pin(apic, pin); +- +- memset(&entry1, 0, sizeof(entry1)); +- +- entry1.dest_mode = 0; /* physical delivery */ +- entry1.mask = 0; /* unmask IRQ now */ +- entry1.dest = hard_smp_processor_id(); +- entry1.delivery_mode = dest_ExtINT; +- entry1.polarity = entry0.polarity; +- entry1.trigger = 0; +- entry1.vector = 0; +- +- ioapic_write_entry(apic, pin, entry1); +- +- save_control = CMOS_READ(RTC_CONTROL); +- save_freq_select = CMOS_READ(RTC_FREQ_SELECT); +- CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, +- RTC_FREQ_SELECT); +- CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); +- +- i = 100; +- while (i-- > 0) { +- mdelay(10); +- if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) +- i -= 10; +- } +- +- CMOS_WRITE(save_control, RTC_CONTROL); +- CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); +- clear_IO_APIC_pin(apic, pin); +- +- ioapic_write_entry(apic, pin, entry0); +-} +- +-static int disable_timer_pin_1 __initdata; +-/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +-static int __init disable_timer_pin_setup(char *arg) +-{ +- disable_timer_pin_1 = 1; +- return 0; +-} +-early_param("disable_timer_pin_1", disable_timer_pin_setup); +- +-int timer_through_8259 __initdata; +- +-/* +- * This code may look a bit paranoid, but it's supposed to cooperate with +- * a wide range of boards and BIOS bugs. 
Fortunately only the timer IRQ +- * is so screwy. Thanks to Brian Perkins for testing/hacking this beast +- * fanatically on his truly buggy board. +- * +- * FIXME: really need to revamp this for all platforms. +- */ +-static inline void __init check_timer(void) +-{ +- struct irq_desc *desc = irq_to_desc(0); +- struct irq_cfg *cfg = desc->chip_data; +- int cpu = boot_cpu_id; +- int apic1, pin1, apic2, pin2; +- unsigned long flags; +- unsigned int ver; +- int no_pin1 = 0; +- +- local_irq_save(flags); +- +- ver = apic_read(APIC_LVR); +- ver = GET_APIC_VERSION(ver); +- +- /* +- * get/set the timer IRQ vector: +- */ +- disable_8259A_irq(0); +- assign_irq_vector(0, cfg, TARGET_CPUS); +- +- /* +- * As IRQ0 is to be enabled in the 8259A, the virtual +- * wire has to be disabled in the local APIC. Also +- * timer interrupts need to be acknowledged manually in +- * the 8259A for the i82489DX when using the NMI +- * watchdog as that APIC treats NMIs as level-triggered. +- * The AEOI mode will finish them in the 8259A +- * automatically. +- */ +- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); +- init_8259A(1); +-#ifdef CONFIG_X86_32 +- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +-#endif +- +- pin1 = find_isa_irq_pin(0, mp_INT); +- apic1 = find_isa_irq_apic(0, mp_INT); +- pin2 = ioapic_i8259.pin; +- apic2 = ioapic_i8259.apic; +- +- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " +- "apic1=%d pin1=%d apic2=%d pin2=%d\n", +- cfg->vector, apic1, pin1, apic2, pin2); +- +- /* +- * Some BIOS writers are clueless and report the ExtINTA +- * I/O APIC input from the cascaded 8259A as the timer +- * interrupt input. So just in case, if only one pin +- * was found above, try it both directly and through the +- * 8259A. +- */ +- if (pin1 == -1) { +-#ifdef CONFIG_INTR_REMAP +- if (intr_remapping_enabled) +- panic("BIOS bug: timer not connected to IO-APIC"); +-#endif +- pin1 = pin2; +- apic1 = apic2; +- no_pin1 = 1; +- } else if (pin2 == -1) { +- pin2 = pin1; +- apic2 = apic1; +- } +- +- if (pin1 != -1) { +- /* +- * Ok, does IRQ0 through the IOAPIC work? +- */ +- if (no_pin1) { +- add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); +- setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); +- } +- unmask_IO_APIC_irq_desc(desc); +- if (timer_irq_works()) { +- if (nmi_watchdog == NMI_IO_APIC) { +- setup_nmi(); +- enable_8259A_irq(0); +- } +- if (disable_timer_pin_1 > 0) +- clear_IO_APIC_pin(0, pin1); +- goto out; +- } +-#ifdef CONFIG_INTR_REMAP +- if (intr_remapping_enabled) +- panic("timer doesn't work through Interrupt-remapped IO-APIC"); +-#endif +- clear_IO_APIC_pin(apic1, pin1); +- if (!no_pin1) +- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " +- "8254 timer not connected to IO-APIC\n"); +- +- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " +- "(IRQ0) through the 8259A ...\n"); +- apic_printk(APIC_QUIET, KERN_INFO +- "..... (found apic %d pin %d) ...\n", apic2, pin2); +- /* +- * legacy devices should be connected to IO APIC #0 +- */ +- replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); +- setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); +- unmask_IO_APIC_irq_desc(desc); +- enable_8259A_irq(0); +- if (timer_irq_works()) { +- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); +- timer_through_8259 = 1; +- if (nmi_watchdog == NMI_IO_APIC) { +- disable_8259A_irq(0); +- setup_nmi(); +- enable_8259A_irq(0); +- } +- goto out; +- } +- /* +- * Cleanup, just in case ... 
+- */ +- disable_8259A_irq(0); +- clear_IO_APIC_pin(apic2, pin2); +- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); +- } +- +- if (nmi_watchdog == NMI_IO_APIC) { +- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " +- "through the IO-APIC - disabling NMI Watchdog!\n"); +- nmi_watchdog = NMI_NONE; +- } +-#ifdef CONFIG_X86_32 +- timer_ack = 0; +-#endif +- +- apic_printk(APIC_QUIET, KERN_INFO +- "...trying to set up timer as Virtual Wire IRQ...\n"); +- +- lapic_register_intr(0, desc); +- apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ +- enable_8259A_irq(0); +- +- if (timer_irq_works()) { +- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); +- goto out; +- } +- disable_8259A_irq(0); +- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); +- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); +- +- apic_printk(APIC_QUIET, KERN_INFO +- "...trying to set up timer as ExtINT IRQ...\n"); +- +- init_8259A(0); +- make_8259A_irq(0); +- apic_write(APIC_LVT0, APIC_DM_EXTINT); +- +- unlock_ExtINT_logic(); +- +- if (timer_irq_works()) { +- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); +- goto out; +- } +- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); +- panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " +- "report. Then try booting with the 'noapic' option.\n"); +-out: +- local_irq_restore(flags); +-} +- +-/* +- * Traditionally ISA IRQ2 is the cascade IRQ, and is not available +- * to devices. However there may be an I/O APIC pin available for +- * this interrupt regardless. The pin may be left unconnected, but +- * typically it will be reused as an ExtINT cascade interrupt for +- * the master 8259A. In the MPS case such a pin will normally be +- * reported as an ExtINT interrupt in the MP table. With ACPI +- * there is no provision for ExtINT interrupts, and in the absence +- * of an override it would be treated as an ordinary ISA I/O APIC +- * interrupt, that is edge-triggered and unmasked by default. We +- * used to do this, but it caused problems on some systems because +- * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using +- * the same ExtINT cascade interrupt to drive the local APIC of the +- * bootstrap processor. Therefore we refrain from routing IRQ2 to +- * the I/O APIC in all cases now. No actual device should request +- * it anyway. --macro +- */ +-#define PIC_IRQS (1 << PIC_CASCADE_IR) +- +-void __init setup_IO_APIC(void) +-{ +- +-#ifdef CONFIG_X86_32 +- enable_IO_APIC(); +-#else +- /* +- * calling enable_IO_APIC() is moved to setup_local_APIC for BP +- */ +-#endif +- +- io_apic_irqs = ~PIC_IRQS; +- +- apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); +- /* +- * Set up IO-APIC IRQ routing. +- */ +-#ifdef CONFIG_X86_32 +- if (!acpi_ioapic) +- setup_ioapic_ids_from_mpc(); +-#endif +- sync_Arb_IDs(); +- setup_IO_APIC_irqs(); +- init_IO_APIC_traps(); +- check_timer(); +-} +- +-/* +- * Called after all the initialization is done. 
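check_timer() above is essentially a fixed fallback chain: try IRQ0 through the IO-APIC pin reported by the MP table, then through the 8259A cascade pin, then as a local-APIC virtual-wire IRQ, then as raw ExtINT, and panic only when every attempt fails. The control flow reads more clearly as a strategy table; a standalone sketch of that restructuring (the strategy bodies are stubs, not the real setup steps):

#include <stdbool.h>
#include <stdio.h>

/* Each stub stands in for one of the setups tried by check_timer(). */
static bool try_ioapic_pin(void)   { return false; }
static bool try_through_8259(void) { return false; }
static bool try_virtual_wire(void) { return true;  }
static bool try_extint(void)       { return false; }

static const struct {
    const char *name;
    bool (*attempt)(void);
} timer_strategies[] = {
    { "IO-APIC pin",      try_ioapic_pin   },
    { "through 8259A",    try_through_8259 },
    { "virtual wire IRQ", try_virtual_wire },
    { "ExtINT",           try_extint       },
};

int main(void)
{
    size_t i;

    for (i = 0; i < sizeof(timer_strategies) / sizeof(timer_strategies[0]); i++) {
        printf("...trying timer as %s\n", timer_strategies[i].name);
        if (timer_strategies[i].attempt()) {
            printf("..... works.\n");
            return 0;
        }
        printf("..... failed.\n");
    }
    fprintf(stderr, "IO-APIC + timer doesn't work!\n");
    return 1;
}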
If we didnt find any +- * APIC bugs then we can allow the modify fast path +- */ +- +-static int __init io_apic_bug_finalize(void) +-{ +- if (sis_apic_bug == -1) +- sis_apic_bug = 0; +- return 0; +-} +- +-late_initcall(io_apic_bug_finalize); +- +-struct sysfs_ioapic_data { +- struct sys_device dev; +- struct IO_APIC_route_entry entry[0]; +-}; +-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; +- +-static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +-{ +- struct IO_APIC_route_entry *entry; +- struct sysfs_ioapic_data *data; +- int i; +- +- data = container_of(dev, struct sysfs_ioapic_data, dev); +- entry = data->entry; +- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) +- *entry = ioapic_read_entry(dev->id, i); +- +- return 0; +-} +- +-static int ioapic_resume(struct sys_device *dev) +-{ +- struct IO_APIC_route_entry *entry; +- struct sysfs_ioapic_data *data; +- unsigned long flags; +- union IO_APIC_reg_00 reg_00; +- int i; +- +- data = container_of(dev, struct sysfs_ioapic_data, dev); +- entry = data->entry; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_00.raw = io_apic_read(dev->id, 0); +- if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { +- reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; +- io_apic_write(dev->id, 0, reg_00.raw); +- } +- spin_unlock_irqrestore(&ioapic_lock, flags); +- for (i = 0; i < nr_ioapic_registers[dev->id]; i++) +- ioapic_write_entry(dev->id, i, entry[i]); +- +- return 0; +-} +- +-static struct sysdev_class ioapic_sysdev_class = { +- .name = "ioapic", +- .suspend = ioapic_suspend, +- .resume = ioapic_resume, +-}; +- +-static int __init ioapic_init_sysfs(void) +-{ +- struct sys_device * dev; +- int i, size, error; +- +- error = sysdev_class_register(&ioapic_sysdev_class); +- if (error) +- return error; +- +- for (i = 0; i < nr_ioapics; i++ ) { +- size = sizeof(struct sys_device) + nr_ioapic_registers[i] +- * sizeof(struct IO_APIC_route_entry); +- mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); +- if (!mp_ioapic_data[i]) { +- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); +- continue; +- } +- dev = &mp_ioapic_data[i]->dev; +- dev->id = i; +- dev->cls = &ioapic_sysdev_class; +- error = sysdev_register(dev); +- if (error) { +- kfree(mp_ioapic_data[i]); +- mp_ioapic_data[i] = NULL; +- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); +- continue; +- } +- } +- +- return 0; +-} +- +-device_initcall(ioapic_init_sysfs); +- +-/* +- * Dynamic irq allocate and deallocation +- */ +-unsigned int create_irq_nr(unsigned int irq_want) +-{ +- /* Allocate an unused irq */ +- unsigned int irq; +- unsigned int new; +- unsigned long flags; +- struct irq_cfg *cfg_new = NULL; +- int cpu = boot_cpu_id; +- struct irq_desc *desc_new = NULL; +- +- irq = 0; +- spin_lock_irqsave(&vector_lock, flags); +- for (new = irq_want; new < NR_IRQS; new++) { +- if (platform_legacy_irq(new)) +- continue; +- +- desc_new = irq_to_desc_alloc_cpu(new, cpu); +- if (!desc_new) { +- printk(KERN_INFO "can not get irq_desc for %d\n", new); +- continue; +- } +- cfg_new = desc_new->chip_data; +- +- if (cfg_new->vector != 0) +- continue; +- if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) +- irq = new; +- break; +- } +- spin_unlock_irqrestore(&vector_lock, flags); +- +- if (irq > 0) { +- dynamic_irq_init(irq); +- /* restore it, in case dynamic_irq_init clear it */ +- if (desc_new) +- desc_new->chip_data = cfg_new; +- } +- return irq; +-} +- +-static int nr_irqs_gsi = NR_IRQS_LEGACY; +-int create_irq(void) +-{ +- unsigned int irq_want; +- 
int irq; +- +- irq_want = nr_irqs_gsi; +- irq = create_irq_nr(irq_want); +- +- if (irq == 0) +- irq = -1; +- +- return irq; +-} +- +-void destroy_irq(unsigned int irq) +-{ +- unsigned long flags; +- struct irq_cfg *cfg; +- struct irq_desc *desc; +- +- /* store it, in case dynamic_irq_cleanup clear it */ +- desc = irq_to_desc(irq); +- cfg = desc->chip_data; +- dynamic_irq_cleanup(irq); +- /* connect back irq_cfg */ +- if (desc) +- desc->chip_data = cfg; +- +-#ifdef CONFIG_INTR_REMAP +- free_irte(irq); +-#endif +- spin_lock_irqsave(&vector_lock, flags); +- __clear_irq_vector(irq, cfg); +- spin_unlock_irqrestore(&vector_lock, flags); +-} +- +-/* +- * MSI message composition +- */ +-#ifdef CONFIG_PCI_MSI +-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +-{ +- struct irq_cfg *cfg; +- int err; +- unsigned dest; +- +- cfg = irq_cfg(irq); +- err = assign_irq_vector(irq, cfg, TARGET_CPUS); +- if (err) +- return err; +- +- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); +- +-#ifdef CONFIG_INTR_REMAP +- if (irq_remapped(irq)) { +- struct irte irte; +- int ir_index; +- u16 sub_handle; +- +- ir_index = map_irq_to_irte_handle(irq, &sub_handle); +- BUG_ON(ir_index == -1); +- +- memset (&irte, 0, sizeof(irte)); +- +- irte.present = 1; +- irte.dst_mode = INT_DEST_MODE; +- irte.trigger_mode = 0; /* edge */ +- irte.dlvry_mode = INT_DELIVERY_MODE; +- irte.vector = cfg->vector; +- irte.dest_id = IRTE_DEST(dest); +- +- modify_irte(irq, &irte); +- +- msg->address_hi = MSI_ADDR_BASE_HI; +- msg->data = sub_handle; +- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | +- MSI_ADDR_IR_SHV | +- MSI_ADDR_IR_INDEX1(ir_index) | +- MSI_ADDR_IR_INDEX2(ir_index); +- } else +-#endif +- { +- msg->address_hi = MSI_ADDR_BASE_HI; +- msg->address_lo = +- MSI_ADDR_BASE_LO | +- ((INT_DEST_MODE == 0) ? +- MSI_ADDR_DEST_MODE_PHYSICAL: +- MSI_ADDR_DEST_MODE_LOGICAL) | +- ((INT_DELIVERY_MODE != dest_LowestPrio) ? +- MSI_ADDR_REDIRECTION_CPU: +- MSI_ADDR_REDIRECTION_LOWPRI) | +- MSI_ADDR_DEST_ID(dest); +- +- msg->data = +- MSI_DATA_TRIGGER_EDGE | +- MSI_DATA_LEVEL_ASSERT | +- ((INT_DELIVERY_MODE != dest_LowestPrio) ? +- MSI_DATA_DELIVERY_FIXED: +- MSI_DATA_DELIVERY_LOWPRI) | +- MSI_DATA_VECTOR(cfg->vector); +- } +- return err; +-} +- +-#ifdef CONFIG_SMP +-static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- struct irq_cfg *cfg; +- struct msi_msg msg; +- unsigned int dest; +- +- dest = set_desc_affinity(desc, mask); +- if (dest == BAD_APICID) +- return; +- +- cfg = desc->chip_data; +- +- read_msi_msg_desc(desc, &msg); +- +- msg.data &= ~MSI_DATA_VECTOR_MASK; +- msg.data |= MSI_DATA_VECTOR(cfg->vector); +- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; +- msg.address_lo |= MSI_ADDR_DEST_ID(dest); +- +- write_msi_msg_desc(desc, &msg); +-} +-#ifdef CONFIG_INTR_REMAP +-/* +- * Migrate the MSI irq to another cpumask. This migration is +- * done in the process context using interrupt-remapping hardware. +- */ +-static void +-ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- struct irq_cfg *cfg = desc->chip_data; +- unsigned int dest; +- struct irte irte; +- +- if (get_irte(irq, &irte)) +- return; +- +- dest = set_desc_affinity(desc, mask); +- if (dest == BAD_APICID) +- return; +- +- irte.vector = cfg->vector; +- irte.dest_id = IRTE_DEST(dest); +- +- /* +- * atomically update the IRTE with the new destination and vector. 
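set_msi_irq_affinity() above retargets an already-composed MSI message by clearing the vector field in msg.data and the destination-ID field in msg.address_lo with their masks, then OR-ing in the new values, leaving every other bit untouched. A standalone sketch of that read-modify-write on a mock message (the mask and shift values are invented for the example, not the real MSI layout):

#include <stdio.h>
#include <stdint.h>

/* Invented field layout, for the sketch only. */
#define DATA_VECTOR_MASK    0x000000ffu
#define ADDR_DEST_ID_MASK   0x000ff000u
#define ADDR_DEST_ID_SHIFT  12

struct mock_msi_msg { uint32_t address_lo; uint32_t data; };

/* Point an existing message at a new vector/destination, preserving
 * every other bit (trigger mode, delivery mode, base address, ...). */
static void retarget(struct mock_msi_msg *msg, uint8_t vector, uint8_t dest)
{
    msg->data &= ~DATA_VECTOR_MASK;
    msg->data |= vector;
    msg->address_lo &= ~ADDR_DEST_ID_MASK;
    msg->address_lo |= (uint32_t)dest << ADDR_DEST_ID_SHIFT;
}

int main(void)
{
    struct mock_msi_msg msg = { .address_lo = 0xfee00000u, .data = 0x4031u };

    retarget(&msg, 0x41, 0x02);
    printf("address_lo=%#x data=%#x\n", msg.address_lo, msg.data);
    return 0;
}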
+- */ +- modify_irte(irq, &irte); +- +- /* +- * After this point, all the interrupts will start arriving +- * at the new destination. So, time to cleanup the previous +- * vector allocation. +- */ +- if (cfg->move_in_progress) +- send_cleanup_vector(cfg); +-} +- +-#endif +-#endif /* CONFIG_SMP */ +- +-/* +- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, +- * which implement the MSI or MSI-X Capability Structure. +- */ +-static struct irq_chip msi_chip = { +- .name = "PCI-MSI", +- .unmask = unmask_msi_irq, +- .mask = mask_msi_irq, +- .ack = ack_apic_edge, +-#ifdef CONFIG_SMP +- .set_affinity = set_msi_irq_affinity, +-#endif +- .retrigger = ioapic_retrigger_irq, +-}; +- +-#ifdef CONFIG_INTR_REMAP +-static struct irq_chip msi_ir_chip = { +- .name = "IR-PCI-MSI", +- .unmask = unmask_msi_irq, +- .mask = mask_msi_irq, +- .ack = ack_x2apic_edge, +-#ifdef CONFIG_SMP +- .set_affinity = ir_set_msi_irq_affinity, +-#endif +- .retrigger = ioapic_retrigger_irq, +-}; +- +-/* +- * Map the PCI dev to the corresponding remapping hardware unit +- * and allocate 'nvec' consecutive interrupt-remapping table entries +- * in it. +- */ +-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +-{ +- struct intel_iommu *iommu; +- int index; +- +- iommu = map_dev_to_ir(dev); +- if (!iommu) { +- printk(KERN_ERR +- "Unable to map PCI %s to iommu\n", pci_name(dev)); +- return -ENOENT; +- } +- +- index = alloc_irte(iommu, irq, nvec); +- if (index < 0) { +- printk(KERN_ERR +- "Unable to allocate %d IRTE for PCI %s\n", nvec, +- pci_name(dev)); +- return -ENOSPC; +- } +- return index; +-} +-#endif +- +-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) +-{ +- int ret; +- struct msi_msg msg; +- +- ret = msi_compose_msg(dev, irq, &msg); +- if (ret < 0) +- return ret; +- +- set_irq_msi(irq, msidesc); +- write_msi_msg(irq, &msg); +- +-#ifdef CONFIG_INTR_REMAP +- if (irq_remapped(irq)) { +- struct irq_desc *desc = irq_to_desc(irq); +- /* +- * irq migration in process context +- */ +- desc->status |= IRQ_MOVE_PCNTXT; +- set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); +- } else +-#endif +- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); +- +- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); +- +- return 0; +-} +- +-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) +-{ +- unsigned int irq; +- int ret; +- unsigned int irq_want; +- +- irq_want = nr_irqs_gsi; +- irq = create_irq_nr(irq_want); +- if (irq == 0) +- return -1; +- +-#ifdef CONFIG_INTR_REMAP +- if (!intr_remapping_enabled) +- goto no_ir; +- +- ret = msi_alloc_irte(dev, irq, 1); +- if (ret < 0) +- goto error; +-no_ir: +-#endif +- ret = setup_msi_irq(dev, msidesc, irq); +- if (ret < 0) { +- destroy_irq(irq); +- return ret; +- } +- return 0; +- +-#ifdef CONFIG_INTR_REMAP +-error: +- destroy_irq(irq); +- return ret; +-#endif +-} +- +-int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +-{ +- unsigned int irq; +- int ret, sub_handle; +- struct msi_desc *msidesc; +- unsigned int irq_want; +- +-#ifdef CONFIG_INTR_REMAP +- struct intel_iommu *iommu = 0; +- int index = 0; +-#endif +- +- irq_want = nr_irqs_gsi; +- sub_handle = 0; +- list_for_each_entry(msidesc, &dev->msi_list, list) { +- irq = create_irq_nr(irq_want); +- irq_want++; +- if (irq == 0) +- return -1; +-#ifdef CONFIG_INTR_REMAP +- if (!intr_remapping_enabled) +- goto no_ir; +- +- if (!sub_handle) { +- /* +- * allocate the consecutive block of IRTE's +- * for 'nvec' +- */ +- 
index = msi_alloc_irte(dev, irq, nvec); +- if (index < 0) { +- ret = index; +- goto error; +- } +- } else { +- iommu = map_dev_to_ir(dev); +- if (!iommu) { +- ret = -ENOENT; +- goto error; +- } +- /* +- * setup the mapping between the irq and the IRTE +- * base index, the sub_handle pointing to the +- * appropriate interrupt remap table entry. +- */ +- set_irte_irq(irq, iommu, index, sub_handle); +- } +-no_ir: +-#endif +- ret = setup_msi_irq(dev, msidesc, irq); +- if (ret < 0) +- goto error; +- sub_handle++; +- } +- return 0; +- +-error: +- destroy_irq(irq); +- return ret; +-} +- +-void arch_teardown_msi_irq(unsigned int irq) +-{ +- destroy_irq(irq); +-} +- +-#ifdef CONFIG_DMAR +-#ifdef CONFIG_SMP +-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- struct irq_cfg *cfg; +- struct msi_msg msg; +- unsigned int dest; +- +- dest = set_desc_affinity(desc, mask); +- if (dest == BAD_APICID) +- return; +- +- cfg = desc->chip_data; +- +- dmar_msi_read(irq, &msg); +- +- msg.data &= ~MSI_DATA_VECTOR_MASK; +- msg.data |= MSI_DATA_VECTOR(cfg->vector); +- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; +- msg.address_lo |= MSI_ADDR_DEST_ID(dest); +- +- dmar_msi_write(irq, &msg); +-} +- +-#endif /* CONFIG_SMP */ +- +-struct irq_chip dmar_msi_type = { +- .name = "DMAR_MSI", +- .unmask = dmar_msi_unmask, +- .mask = dmar_msi_mask, +- .ack = ack_apic_edge, +-#ifdef CONFIG_SMP +- .set_affinity = dmar_msi_set_affinity, +-#endif +- .retrigger = ioapic_retrigger_irq, +-}; +- +-int arch_setup_dmar_msi(unsigned int irq) +-{ +- int ret; +- struct msi_msg msg; +- +- ret = msi_compose_msg(NULL, irq, &msg); +- if (ret < 0) +- return ret; +- dmar_msi_write(irq, &msg); +- set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, +- "edge"); +- return 0; +-} +-#endif +- +-#ifdef CONFIG_HPET_TIMER +- +-#ifdef CONFIG_SMP +-static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- struct irq_cfg *cfg; +- struct msi_msg msg; +- unsigned int dest; +- +- dest = set_desc_affinity(desc, mask); +- if (dest == BAD_APICID) +- return; +- +- cfg = desc->chip_data; +- +- hpet_msi_read(irq, &msg); +- +- msg.data &= ~MSI_DATA_VECTOR_MASK; +- msg.data |= MSI_DATA_VECTOR(cfg->vector); +- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; +- msg.address_lo |= MSI_ADDR_DEST_ID(dest); +- +- hpet_msi_write(irq, &msg); +-} +- +-#endif /* CONFIG_SMP */ +- +-struct irq_chip hpet_msi_type = { +- .name = "HPET_MSI", +- .unmask = hpet_msi_unmask, +- .mask = hpet_msi_mask, +- .ack = ack_apic_edge, +-#ifdef CONFIG_SMP +- .set_affinity = hpet_msi_set_affinity, +-#endif +- .retrigger = ioapic_retrigger_irq, +-}; +- +-int arch_setup_hpet_msi(unsigned int irq) +-{ +- int ret; +- struct msi_msg msg; +- +- ret = msi_compose_msg(NULL, irq, &msg); +- if (ret < 0) +- return ret; +- +- hpet_msi_write(irq, &msg); +- set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, +- "edge"); +- +- return 0; +-} +-#endif +- +-#endif /* CONFIG_PCI_MSI */ +-/* +- * Hypertransport interrupt support +- */ +-#ifdef CONFIG_HT_IRQ +- +-#ifdef CONFIG_SMP +- +-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +-{ +- struct ht_irq_msg msg; +- fetch_ht_irq_msg(irq, &msg); +- +- msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); +- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); +- +- msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); +- msg.address_hi |= 
HT_IRQ_HIGH_DEST_ID(dest); +- +- write_ht_irq_msg(irq, &msg); +-} +- +-static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- struct irq_cfg *cfg; +- unsigned int dest; +- +- dest = set_desc_affinity(desc, mask); +- if (dest == BAD_APICID) +- return; +- +- cfg = desc->chip_data; +- +- target_ht_irq(irq, dest, cfg->vector); +-} +- +-#endif +- +-static struct irq_chip ht_irq_chip = { +- .name = "PCI-HT", +- .mask = mask_ht_irq, +- .unmask = unmask_ht_irq, +- .ack = ack_apic_edge, +-#ifdef CONFIG_SMP +- .set_affinity = set_ht_irq_affinity, +-#endif +- .retrigger = ioapic_retrigger_irq, +-}; +- +-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +-{ +- struct irq_cfg *cfg; +- int err; +- +- cfg = irq_cfg(irq); +- err = assign_irq_vector(irq, cfg, TARGET_CPUS); +- if (!err) { +- struct ht_irq_msg msg; +- unsigned dest; +- +- dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); +- +- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); +- +- msg.address_lo = +- HT_IRQ_LOW_BASE | +- HT_IRQ_LOW_DEST_ID(dest) | +- HT_IRQ_LOW_VECTOR(cfg->vector) | +- ((INT_DEST_MODE == 0) ? +- HT_IRQ_LOW_DM_PHYSICAL : +- HT_IRQ_LOW_DM_LOGICAL) | +- HT_IRQ_LOW_RQEOI_EDGE | +- ((INT_DELIVERY_MODE != dest_LowestPrio) ? +- HT_IRQ_LOW_MT_FIXED : +- HT_IRQ_LOW_MT_ARBITRATED) | +- HT_IRQ_LOW_IRQ_MASKED; +- +- write_ht_irq_msg(irq, &msg); +- +- set_irq_chip_and_handler_name(irq, &ht_irq_chip, +- handle_edge_irq, "edge"); +- +- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); +- } +- return err; +-} +-#endif /* CONFIG_HT_IRQ */ +- +-#ifdef CONFIG_X86_64 +-/* +- * Re-target the irq to the specified CPU and enable the specified MMR located +- * on the specified blade to allow the sending of MSIs to the specified CPU. +- */ +-int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, +- unsigned long mmr_offset) +-{ +- const struct cpumask *eligible_cpu = cpumask_of(cpu); +- struct irq_cfg *cfg; +- int mmr_pnode; +- unsigned long mmr_value; +- struct uv_IO_APIC_route_entry *entry; +- unsigned long flags; +- int err; +- +- cfg = irq_cfg(irq); +- +- err = assign_irq_vector(irq, cfg, eligible_cpu); +- if (err != 0) +- return err; +- +- spin_lock_irqsave(&vector_lock, flags); +- set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, +- irq_name); +- spin_unlock_irqrestore(&vector_lock, flags); +- +- mmr_value = 0; +- entry = (struct uv_IO_APIC_route_entry *)&mmr_value; +- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); +- +- entry->vector = cfg->vector; +- entry->delivery_mode = INT_DELIVERY_MODE; +- entry->dest_mode = INT_DEST_MODE; +- entry->polarity = 0; +- entry->trigger = 0; +- entry->mask = 0; +- entry->dest = cpu_mask_to_apicid(eligible_cpu); +- +- mmr_pnode = uv_blade_to_pnode(mmr_blade); +- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +- +- return irq; +-} +- +-/* +- * Disable the specified MMR located on the specified blade so that MSIs are +- * longer allowed to be sent. 
+- */ +-void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) +-{ +- unsigned long mmr_value; +- struct uv_IO_APIC_route_entry *entry; +- int mmr_pnode; +- +- mmr_value = 0; +- entry = (struct uv_IO_APIC_route_entry *)&mmr_value; +- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); +- +- entry->mask = 1; +- +- mmr_pnode = uv_blade_to_pnode(mmr_blade); +- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +-} +-#endif /* CONFIG_X86_64 */ +- +-int __init io_apic_get_redir_entries (int ioapic) +-{ +- union IO_APIC_reg_01 reg_01; +- unsigned long flags; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_01.raw = io_apic_read(ioapic, 1); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- return reg_01.bits.entries; +-} +- +-void __init probe_nr_irqs_gsi(void) +-{ +- int nr = 0; +- +- nr = acpi_probe_gsi(); +- if (nr > nr_irqs_gsi) { +- nr_irqs_gsi = nr; +- } else { +- /* for acpi=off or acpi is not compiled in */ +- int idx; +- +- nr = 0; +- for (idx = 0; idx < nr_ioapics; idx++) +- nr += io_apic_get_redir_entries(idx) + 1; +- +- if (nr > nr_irqs_gsi) +- nr_irqs_gsi = nr; +- } +- +- printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); +-} +- +-/* -------------------------------------------------------------------------- +- ACPI-based IOAPIC Configuration +- -------------------------------------------------------------------------- */ +- +-#ifdef CONFIG_ACPI +- +-#ifdef CONFIG_X86_32 +-int __init io_apic_get_unique_id(int ioapic, int apic_id) +-{ +- union IO_APIC_reg_00 reg_00; +- static physid_mask_t apic_id_map = PHYSID_MASK_NONE; +- physid_mask_t tmp; +- unsigned long flags; +- int i = 0; +- +- /* +- * The P4 platform supports up to 256 APIC IDs on two separate APIC +- * buses (one for LAPICs, one for IOAPICs), where predecessors only +- * supports up to 16 on one shared APIC bus. +- * +- * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full +- * advantage of new APIC bus architecture. +- */ +- +- if (physids_empty(apic_id_map)) +- apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); +- +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_00.raw = io_apic_read(ioapic, 0); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- if (apic_id >= get_physical_broadcast()) { +- printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " +- "%d\n", ioapic, apic_id, reg_00.bits.ID); +- apic_id = reg_00.bits.ID; +- } +- +- /* +- * Every APIC in a system must have a unique ID or we get lots of nice +- * 'stuck on smp_invalidate_needed IPI wait' messages. 
+- */ +- if (check_apicid_used(apic_id_map, apic_id)) { +- +- for (i = 0; i < get_physical_broadcast(); i++) { +- if (!check_apicid_used(apic_id_map, i)) +- break; +- } +- +- if (i == get_physical_broadcast()) +- panic("Max apic_id exceeded!\n"); +- +- printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " +- "trying %d\n", ioapic, apic_id, i); +- +- apic_id = i; +- } +- +- tmp = apicid_to_cpu_present(apic_id); +- physids_or(apic_id_map, apic_id_map, tmp); +- +- if (reg_00.bits.ID != apic_id) { +- reg_00.bits.ID = apic_id; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- io_apic_write(ioapic, 0, reg_00.raw); +- reg_00.raw = io_apic_read(ioapic, 0); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- /* Sanity check */ +- if (reg_00.bits.ID != apic_id) { +- printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); +- return -1; +- } +- } +- +- apic_printk(APIC_VERBOSE, KERN_INFO +- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); +- +- return apic_id; +-} +- +-int __init io_apic_get_version(int ioapic) +-{ +- union IO_APIC_reg_01 reg_01; +- unsigned long flags; +- +- spin_lock_irqsave(&ioapic_lock, flags); +- reg_01.raw = io_apic_read(ioapic, 1); +- spin_unlock_irqrestore(&ioapic_lock, flags); +- +- return reg_01.bits.version; +-} +-#endif +- +-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +-{ +- struct irq_desc *desc; +- struct irq_cfg *cfg; +- int cpu = boot_cpu_id; +- +- if (!IO_APIC_IRQ(irq)) { +- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", +- ioapic); +- return -EINVAL; +- } +- +- desc = irq_to_desc_alloc_cpu(irq, cpu); +- if (!desc) { +- printk(KERN_INFO "can not get irq_desc %d\n", irq); +- return 0; +- } +- +- /* +- * IRQs < 16 are already in the irq_2_pin[] map +- */ +- if (irq >= NR_IRQS_LEGACY) { +- cfg = desc->chip_data; +- add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); +- } +- +- setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); +- +- return 0; +-} +- +- +-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) +-{ +- int i; +- +- if (skip_ioapic_setup) +- return -1; +- +- for (i = 0; i < mp_irq_entries; i++) +- if (mp_irqs[i].mp_irqtype == mp_INT && +- mp_irqs[i].mp_srcbusirq == bus_irq) +- break; +- if (i >= mp_irq_entries) +- return -1; +- +- *trigger = irq_trigger(i); +- *polarity = irq_polarity(i); +- return 0; +-} +- +-#endif /* CONFIG_ACPI */ +- +-/* +- * This function currently is only a helper for the i386 smp boot process where +- * we need to reprogram the ioredtbls to cater for the cpus which have come online +- * so mask in all cases should simply be TARGET_CPUS +- */ +-#ifdef CONFIG_SMP +-void __init setup_ioapic_dest(void) +-{ +- int pin, ioapic, irq, irq_entry; +- struct irq_desc *desc; +- struct irq_cfg *cfg; +- const struct cpumask *mask; +- +- if (skip_ioapic_setup == 1) +- return; +- +- for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { +- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { +- irq_entry = find_irq_entry(ioapic, pin, mp_INT); +- if (irq_entry == -1) +- continue; +- irq = pin_2_irq(irq_entry, ioapic, pin); +- +- /* setup_IO_APIC_irqs could fail to get vector for some device +- * when you have too many devices, because at that time only boot +- * cpu is online. 
+- */ +- desc = irq_to_desc(irq); +- cfg = desc->chip_data; +- if (!cfg->vector) { +- setup_IO_APIC_irq(ioapic, pin, irq, desc, +- irq_trigger(irq_entry), +- irq_polarity(irq_entry)); +- continue; +- +- } +- +- /* +- * Honour affinities which have been set in early boot +- */ +- if (desc->status & +- (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) +- mask = &desc->affinity; +- else +- mask = TARGET_CPUS; +- +-#ifdef CONFIG_INTR_REMAP +- if (intr_remapping_enabled) +- set_ir_ioapic_affinity_irq_desc(desc, mask); +- else +-#endif +- set_ioapic_affinity_irq_desc(desc, mask); +- } +- +- } +-} +-#endif +- +-#define IOAPIC_RESOURCE_NAME_SIZE 11 +- +-static struct resource *ioapic_resources; +- +-static struct resource * __init ioapic_setup_resources(void) +-{ +- unsigned long n; +- struct resource *res; +- char *mem; +- int i; +- +- if (nr_ioapics <= 0) +- return NULL; +- +- n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); +- n *= nr_ioapics; +- +- mem = alloc_bootmem(n); +- res = (void *)mem; +- +- if (mem != NULL) { +- mem += sizeof(struct resource) * nr_ioapics; +- +- for (i = 0; i < nr_ioapics; i++) { +- res[i].name = mem; +- res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; +- sprintf(mem, "IOAPIC %u", i); +- mem += IOAPIC_RESOURCE_NAME_SIZE; +- } +- } +- +- ioapic_resources = res; +- +- return res; +-} +- +-void __init ioapic_init_mappings(void) +-{ +- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; +- struct resource *ioapic_res; +- int i; +- +- ioapic_res = ioapic_setup_resources(); +- for (i = 0; i < nr_ioapics; i++) { +- if (smp_found_config) { +- ioapic_phys = mp_ioapics[i].mp_apicaddr; +-#ifdef CONFIG_X86_32 +- if (!ioapic_phys) { +- printk(KERN_ERR +- "WARNING: bogus zero IO-APIC " +- "address found in MPTABLE, " +- "disabling IO/APIC support!\n"); +- smp_found_config = 0; +- skip_ioapic_setup = 1; +- goto fake_ioapic_page; +- } +-#endif +- } else { +-#ifdef CONFIG_X86_32 +-fake_ioapic_page: +-#endif +- ioapic_phys = (unsigned long) +- alloc_bootmem_pages(PAGE_SIZE); +- ioapic_phys = __pa(ioapic_phys); +- } +- set_fixmap_nocache(idx, ioapic_phys); +- apic_printk(APIC_VERBOSE, +- "mapped IOAPIC to %08lx (%08lx)\n", +- __fix_to_virt(idx), ioapic_phys); +- idx++; +- +- if (ioapic_res != NULL) { +- ioapic_res->start = ioapic_phys; +- ioapic_res->end = ioapic_phys + (4 * 1024) - 1; +- ioapic_res++; +- } +- } +-} +- +-static int __init ioapic_insert_resources(void) +-{ +- int i; +- struct resource *r = ioapic_resources; +- +- if (!r) { +- printk(KERN_ERR +- "IO APIC resources could be not be allocated.\n"); +- return -1; +- } +- +- for (i = 0; i < nr_ioapics; i++) { +- insert_resource(&iomem_resource, r); +- r++; +- } +- +- return 0; +-} +- +-/* Insert the IO APIC resources after PCI initialization has occured to handle +- * IO APICS that are mapped in on a BAR in PCI space. 
*/ +-late_initcall(ioapic_insert_resources); +Index: linux-2.6-tip/arch/x86/kernel/io_delay.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/io_delay.c ++++ linux-2.6-tip/arch/x86/kernel/io_delay.c +@@ -7,10 +7,10 @@ + */ + #include + #include +-#include + #include ++#include + #include +-#include ++#include + + int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; + +@@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay); + static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) + { + if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { +- printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", +- id->ident); ++ pr_notice("%s: using 0xed I/O delay port\n", id->ident); + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + } + +@@ -64,40 +63,40 @@ static struct dmi_system_id __initdata i + .callback = dmi_io_delay_0xed_port, + .ident = "Compaq Presario V6000", + .matches = { +- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), +- DMI_MATCH(DMI_BOARD_NAME, "30B7") ++ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), ++ DMI_MATCH(DMI_BOARD_NAME, "30B7") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv9000z", + .matches = { +- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), +- DMI_MATCH(DMI_BOARD_NAME, "30B9") ++ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), ++ DMI_MATCH(DMI_BOARD_NAME, "30B9") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv6000", + .matches = { +- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), +- DMI_MATCH(DMI_BOARD_NAME, "30B8") ++ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), ++ DMI_MATCH(DMI_BOARD_NAME, "30B8") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion tx1000", + .matches = { +- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), +- DMI_MATCH(DMI_BOARD_NAME, "30BF") ++ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), ++ DMI_MATCH(DMI_BOARD_NAME, "30BF") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { +- DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), +- DMI_MATCH(DMI_BOARD_NAME, "30D3") ++ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), ++ DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, + { } +Index: linux-2.6-tip/arch/x86/kernel/ioport.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/ioport.c ++++ linux-2.6-tip/arch/x86/kernel/ioport.c +@@ -85,19 +85,8 @@ asmlinkage long sys_ioperm(unsigned long + + t->io_bitmap_max = bytes; + +-#ifdef CONFIG_X86_32 +- /* +- * Sets the lazy trigger so that the next I/O operation will +- * reload the correct bitmap. +- * Reset the owner so that a process switch will not set +- * tss->io_bitmap_base to IO_BITMAP_OFFSET. 
+- */ +- tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; +- tss->io_bitmap_owner = NULL; +-#else + /* Update the TSS: */ + memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); +-#endif + + put_cpu(); + +@@ -131,9 +120,8 @@ static int do_iopl(unsigned int level, s + } + + #ifdef CONFIG_X86_32 +-asmlinkage long sys_iopl(unsigned long regsp) ++long sys_iopl(struct pt_regs *regs) + { +- struct pt_regs *regs = (struct pt_regs *)®sp; + unsigned int level = regs->bx; + struct thread_struct *t = ¤t->thread; + int rc; +Index: linux-2.6-tip/arch/x86/kernel/ipi.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/ipi.c ++++ /dev/null +@@ -1,190 +0,0 @@ +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef CONFIG_X86_32 +-#include +-#include +- +-/* +- * the following functions deal with sending IPIs between CPUs. +- * +- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. +- */ +- +-static inline int __prepare_ICR(unsigned int shortcut, int vector) +-{ +- unsigned int icr = shortcut | APIC_DEST_LOGICAL; +- +- switch (vector) { +- default: +- icr |= APIC_DM_FIXED | vector; +- break; +- case NMI_VECTOR: +- icr |= APIC_DM_NMI; +- break; +- } +- return icr; +-} +- +-static inline int __prepare_ICR2(unsigned int mask) +-{ +- return SET_APIC_DEST_FIELD(mask); +-} +- +-void __send_IPI_shortcut(unsigned int shortcut, int vector) +-{ +- /* +- * Subtle. In the case of the 'never do double writes' workaround +- * we have to lock out interrupts to be safe. As we don't care +- * of the value read we use an atomic rmw access to avoid costly +- * cli/sti. Otherwise we use an even cheaper single atomic write +- * to the APIC. +- */ +- unsigned int cfg; +- +- /* +- * Wait for idle. +- */ +- apic_wait_icr_idle(); +- +- /* +- * No need to touch the target chip field +- */ +- cfg = __prepare_ICR(shortcut, vector); +- +- /* +- * Send the IPI. The write to APIC_ICR fires this off. +- */ +- apic_write(APIC_ICR, cfg); +-} +- +-void send_IPI_self(int vector) +-{ +- __send_IPI_shortcut(APIC_DEST_SELF, vector); +-} +- +-/* +- * This is used to send an IPI with no shorthand notation (the destination is +- * specified in bits 56 to 63 of the ICR). +- */ +-static inline void __send_IPI_dest_field(unsigned long mask, int vector) +-{ +- unsigned long cfg; +- +- /* +- * Wait for idle. +- */ +- if (unlikely(vector == NMI_VECTOR)) +- safe_apic_wait_icr_idle(); +- else +- apic_wait_icr_idle(); +- +- /* +- * prepare target chip field +- */ +- cfg = __prepare_ICR2(mask); +- apic_write(APIC_ICR2, cfg); +- +- /* +- * program the ICR +- */ +- cfg = __prepare_ICR(0, vector); +- +- /* +- * Send the IPI. The write to APIC_ICR fires this off. +- */ +- apic_write(APIC_ICR, cfg); +-} +- +-/* +- * This is only used on smaller machines. +- */ +-void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) +-{ +- unsigned long mask = cpumask_bits(cpumask)[0]; +- unsigned long flags; +- +- local_irq_save(flags); +- WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); +- __send_IPI_dest_field(mask, vector); +- local_irq_restore(flags); +-} +- +-void send_IPI_mask_sequence(const struct cpumask *mask, int vector) +-{ +- unsigned long flags; +- unsigned int query_cpu; +- +- /* +- * Hack. The clustered APIC addressing mode doesn't allow us to send +- * to an arbitrary mask, so I do a unicasts to each CPU instead. 
This +- * should be modified to do 1 message per cluster ID - mbligh +- */ +- +- local_irq_save(flags); +- for_each_cpu(query_cpu, mask) +- __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); +- local_irq_restore(flags); +-} +- +-void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +-{ +- unsigned long flags; +- unsigned int query_cpu; +- unsigned int this_cpu = smp_processor_id(); +- +- /* See Hack comment above */ +- +- local_irq_save(flags); +- for_each_cpu(query_cpu, mask) +- if (query_cpu != this_cpu) +- __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), +- vector); +- local_irq_restore(flags); +-} +- +-/* must come after the send_IPI functions above for inlining */ +-static int convert_apicid_to_cpu(int apic_id) +-{ +- int i; +- +- for_each_possible_cpu(i) { +- if (per_cpu(x86_cpu_to_apicid, i) == apic_id) +- return i; +- } +- return -1; +-} +- +-int safe_smp_processor_id(void) +-{ +- int apicid, cpuid; +- +- if (!boot_cpu_has(X86_FEATURE_APIC)) +- return 0; +- +- apicid = hard_smp_processor_id(); +- if (apicid == BAD_APICID) +- return 0; +- +- cpuid = convert_apicid_to_cpu(apicid); +- +- return cpuid >= 0 ? cpuid : 0; +-} +-#endif +Index: linux-2.6-tip/arch/x86/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/irq.c ++++ linux-2.6-tip/arch/x86/kernel/irq.c +@@ -6,13 +6,18 @@ + #include + #include + #include ++#include + + #include + #include + #include ++#include + + atomic_t irq_err_count; + ++/* Function pointer for generic interrupt vector handling */ ++void (*generic_interrupt_extension)(void) = NULL; ++ + /* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. +@@ -36,63 +41,68 @@ void ack_bad_irq(unsigned int irq) + #endif + } + +-#ifdef CONFIG_X86_32 +-# define irq_stats(x) (&per_cpu(irq_stat, x)) +-#else +-# define irq_stats(x) cpu_pda(x) +-#endif ++#define irq_stats(x) (&per_cpu(irq_stat, x)) + /* + * /proc/interrupts printing: + */ +-static int show_other_interrupts(struct seq_file *p) ++static int show_other_interrupts(struct seq_file *p, int prec) + { + int j; + +- seq_printf(p, "NMI: "); ++ seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); + #ifdef CONFIG_X86_LOCAL_APIC +- seq_printf(p, "LOC: "); ++ seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); ++ ++ seq_printf(p, "%*s: ", prec, "SPU"); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); ++ seq_printf(p, " Spurious interrupts\n"); ++ seq_printf(p, "CNT: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); ++ seq_printf(p, " Performance counter interrupts\n"); + #endif ++ if (generic_interrupt_extension) { ++ seq_printf(p, "PLT: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); ++ seq_printf(p, " Platform interrupts\n"); ++ } + #ifdef CONFIG_SMP +- seq_printf(p, "RES: "); ++ seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); +- seq_printf(p, "CAL: "); ++ seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call 
interrupts\n"); +- seq_printf(p, "TLB: "); ++ seq_printf(p, "%*s: ", prec, "TLB"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); + #endif + #ifdef CONFIG_X86_MCE +- seq_printf(p, "TRM: "); ++ seq_printf(p, "%*s: ", prec, "TRM"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); + # ifdef CONFIG_X86_64 +- seq_printf(p, "THR: "); ++ seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); + # endif + #endif +-#ifdef CONFIG_X86_LOCAL_APIC +- seq_printf(p, "SPU: "); +- for_each_online_cpu(j) +- seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); +- seq_printf(p, " Spurious interrupts\n"); +-#endif +- seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); ++ seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); + #if defined(CONFIG_X86_IO_APIC) +- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); ++ seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); + #endif + return 0; + } +@@ -100,19 +110,22 @@ static int show_other_interrupts(struct + int show_interrupts(struct seq_file *p, void *v) + { + unsigned long flags, any_count = 0; +- int i = *(loff_t *) v, j; ++ int i = *(loff_t *) v, j, prec; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + ++ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) ++ j *= 10; ++ + if (i == nr_irqs) +- return show_other_interrupts(p); ++ return show_other_interrupts(p, prec); + + /* print header */ + if (i == 0) { +- seq_printf(p, " "); ++ seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); +@@ -123,23 +136,15 @@ int show_interrupts(struct seq_file *p, + return 0; + + spin_lock_irqsave(&desc->lock, flags); +-#ifndef CONFIG_SMP +- any_count = kstat_irqs(i); +-#else + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); +-#endif + action = desc->action; + if (!action && !any_count) + goto out; + +- seq_printf(p, "%3d: ", i); +-#ifndef CONFIG_SMP +- seq_printf(p, "%10u ", kstat_irqs(i)); +-#else ++ seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); +-#endif + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); + +@@ -164,7 +169,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu) + + #ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; ++ sum += irq_stats(cpu)->irq_spurious_count; ++ sum += irq_stats(cpu)->apic_perf_irqs; + #endif ++ if (generic_interrupt_extension) ++ sum += irq_stats(cpu)->generic_irqs; + #ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; +@@ -176,9 +185,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu) + sum += irq_stats(cpu)->irq_threshold_count; + #endif + #endif +-#ifdef CONFIG_X86_LOCAL_APIC +- sum += irq_stats(cpu)->irq_spurious_count; +-#endif + return sum; + } + +@@ -192,4 +198,63 @@ u64 arch_irq_stat(void) + return sum; + } + ++ ++/* ++ * do_IRQ handles all normal device IRQ's (the special ++ * SMP cross-CPU interrupts have their own specific ++ * handlers). 
++ */ ++unsigned int __irq_entry do_IRQ(struct pt_regs *regs) ++{ ++ struct pt_regs *old_regs = set_irq_regs(regs); ++ ++ /* high bit used in ret_from_ code */ ++ unsigned vector = ~regs->orig_ax; ++ unsigned irq; ++ ++ exit_idle(); ++ irq_enter(); ++ ++ irq = __get_cpu_var(vector_irq)[vector]; ++ ++ if (!handle_irq(irq, regs)) { ++#ifdef CONFIG_X86_64 ++ if (!disable_apic) ++ ack_APIC_irq(); ++#endif ++ ++ if (printk_ratelimit()) ++ printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", ++ __func__, smp_processor_id(), vector, irq); ++ } ++ ++ irq_exit(); ++ ++ set_irq_regs(old_regs); ++ return 1; ++} ++ ++/* ++ * Handler for GENERIC_INTERRUPT_VECTOR. ++ */ ++void smp_generic_interrupt(struct pt_regs *regs) ++{ ++ struct pt_regs *old_regs = set_irq_regs(regs); ++ ++ ack_APIC_irq(); ++ ++ exit_idle(); ++ ++ irq_enter(); ++ ++ inc_irq_stat(generic_irqs); ++ ++ if (generic_interrupt_extension) ++ generic_interrupt_extension(); ++ ++ irq_exit(); ++ ++ set_irq_regs(old_regs); ++} ++ + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); +Index: linux-2.6-tip/arch/x86/kernel/irq_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/irq_32.c ++++ linux-2.6-tip/arch/x86/kernel/irq_32.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #include + +@@ -55,13 +56,13 @@ static inline void print_stack_overflow( + union irq_ctx { + struct thread_info tinfo; + u32 stack[THREAD_SIZE/sizeof(u32)]; +-}; ++} __attribute__((aligned(PAGE_SIZE))); + +-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; +-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; ++static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); ++static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); + +-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; ++static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); ++static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); + + static void call_on_stack(void *func, void *stack) + { +@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struc + u32 *isp, arg1, arg2; + + curctx = (union irq_ctx *) current_thread_info(); +- irqctx = hardirq_ctx[smp_processor_id()]; ++ irqctx = __get_cpu_var(hardirq_ctx); + + /* + * this is where we switch to the IRQ stack. 
However, if we are +@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu) + { + union irq_ctx *irqctx; + +- if (hardirq_ctx[cpu]) ++ if (per_cpu(hardirq_ctx, cpu)) + return; + +- irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; ++ irqctx = &per_cpu(hardirq_stack, cpu); + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + +- hardirq_ctx[cpu] = irqctx; ++ per_cpu(hardirq_ctx, cpu) = irqctx; + +- irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; ++ irqctx = &per_cpu(softirq_stack, cpu); + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = 0; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + +- softirq_ctx[cpu] = irqctx; ++ per_cpu(softirq_ctx, cpu) = irqctx; + + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", +- cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); ++ cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); + } + + void irq_ctx_exit(int cpu) + { +- hardirq_ctx[cpu] = NULL; ++ per_cpu(hardirq_ctx, cpu) = NULL; + } + + asmlinkage void do_softirq(void) +@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void) + + if (local_softirq_pending()) { + curctx = current_thread_info(); +- irqctx = softirq_ctx[smp_processor_id()]; ++ irqctx = __get_cpu_var(softirq_ctx); + irqctx->tinfo.task = curctx->task; + irqctx->tinfo.previous_esp = current_stack_pointer; + +@@ -191,33 +192,16 @@ static inline int + execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } + #endif + +-/* +- * do_IRQ handles all normal device IRQ's (the special +- * SMP cross-CPU interrupts have their own specific +- * handlers). +- */ +-unsigned int do_IRQ(struct pt_regs *regs) ++bool handle_irq(unsigned irq, struct pt_regs *regs) + { +- struct pt_regs *old_regs; +- /* high bit used in ret_from_ code */ +- int overflow; +- unsigned vector = ~regs->orig_ax; + struct irq_desc *desc; +- unsigned irq; +- +- +- old_regs = set_irq_regs(regs); +- irq_enter(); +- irq = __get_cpu_var(vector_irq)[vector]; ++ int overflow; + + overflow = check_stack_overflow(); + + desc = irq_to_desc(irq); +- if (unlikely(!desc)) { +- printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", +- __func__, irq, vector, smp_processor_id()); +- BUG(); +- } ++ if (unlikely(!desc)) ++ return false; + + if (!execute_on_irq_stack(overflow, desc, irq)) { + if (unlikely(overflow)) +@@ -225,13 +209,10 @@ unsigned int do_IRQ(struct pt_regs *regs + desc->handle_irq(irq, desc); + } + +- irq_exit(); +- set_irq_regs(old_regs); +- return 1; ++ return true; + } + + #ifdef CONFIG_HOTPLUG_CPU +-#include + + /* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
*/ + void fixup_irqs(void) +@@ -248,7 +229,7 @@ void fixup_irqs(void) + if (irq == 2) + continue; + +- affinity = &desc->affinity; ++ affinity = desc->affinity; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + printk("Breaking affinity for irq %i\n", irq); + affinity = cpu_all_mask; +Index: linux-2.6-tip/arch/x86/kernel/irq_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/irq_64.c ++++ linux-2.6-tip/arch/x86/kernel/irq_64.c +@@ -18,6 +18,13 @@ + #include + #include + #include ++#include ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); ++EXPORT_PER_CPU_SYMBOL(irq_stat); ++ ++DEFINE_PER_CPU(struct pt_regs *, irq_regs); ++EXPORT_PER_CPU_SYMBOL(irq_regs); + + /* + * Probabilistic stack overflow check: +@@ -41,42 +48,18 @@ static inline void stack_overflow_check( + #endif + } + +-/* +- * do_IRQ handles all normal device IRQ's (the special +- * SMP cross-CPU interrupts have their own specific +- * handlers). +- */ +-asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) ++bool handle_irq(unsigned irq, struct pt_regs *regs) + { +- struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; + +- /* high bit used in ret_from_ code */ +- unsigned vector = ~regs->orig_ax; +- unsigned irq; +- +- exit_idle(); +- irq_enter(); +- irq = __get_cpu_var(vector_irq)[vector]; +- + stack_overflow_check(regs); + + desc = irq_to_desc(irq); +- if (likely(desc)) +- generic_handle_irq_desc(irq, desc); +- else { +- if (!disable_apic) +- ack_APIC_irq(); +- +- if (printk_ratelimit()) +- printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", +- __func__, smp_processor_id(), vector); +- } +- +- irq_exit(); ++ if (unlikely(!desc)) ++ return false; + +- set_irq_regs(old_regs); +- return 1; ++ generic_handle_irq_desc(irq, desc); ++ return true; + } + + #ifdef CONFIG_HOTPLUG_CPU +@@ -100,7 +83,7 @@ void fixup_irqs(void) + /* interrupt's are disabled at this point */ + spin_lock(&desc->lock); + +- affinity = &desc->affinity; ++ affinity = desc->affinity; + if (!irq_has_action(irq) || + cpumask_equal(affinity, cpu_online_mask)) { + spin_unlock(&desc->lock); +Index: linux-2.6-tip/arch/x86/kernel/irqinit_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/irqinit_32.c ++++ linux-2.6-tip/arch/x86/kernel/irqinit_32.c +@@ -18,7 +18,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + +@@ -50,6 +50,7 @@ static irqreturn_t math_error_irq(int cp + */ + static struct irqaction fpu_irq = { + .handler = math_error_irq, ++ .flags = IRQF_NODELAY, + .mask = CPU_MASK_NONE, + .name = "fpu", + }; +@@ -78,6 +79,16 @@ void __init init_ISA_irqs(void) + } + } + ++/* ++ * IRQ2 is cascade interrupt to second interrupt controller ++ */ ++static struct irqaction irq2 = { ++ .handler = no_action, ++ .flags = IRQF_NODELAY, ++ .mask = CPU_MASK_NONE, ++ .name = "cascade", ++}; ++ + DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, +@@ -111,28 +122,8 @@ int vector_used_by_percpu_irq(unsigned i + return 0; + } + +-/* Overridden in paravirt.c */ +-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +- +-void __init native_init_IRQ(void) ++static void __init smp_intr_init(void) + { +- int i; +- +- /* all the set up before the call gates are initialised */ +- pre_intr_init_hook(); +- +- /* +- * Cover the whole vector space, no vector can escape +- * us. 
(some of these will be overridden and become +- * 'special' SMP interrupts) +- */ +- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { +- /* SYSCALL_VECTOR was reserved in trap_init. */ +- if (i != SYSCALL_VECTOR) +- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); +- } +- +- + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper +@@ -140,8 +131,15 @@ void __init native_init_IRQ(void) + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + +- /* IPI for invalidation */ +- alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); ++ /* IPIs for invalidation */ ++ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); ++ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); ++ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); ++ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); ++ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); ++ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); ++ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); ++ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); +@@ -154,25 +152,65 @@ void __init native_init_IRQ(void) + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); + #endif ++} ++ ++static void __init apic_intr_init(void) ++{ ++ smp_intr_init(); + + #ifdef CONFIG_X86_LOCAL_APIC + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + ++ /* generic IPI for platform specific use */ ++ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); ++ + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +-#endif ++# ifdef CONFIG_PERF_COUNTERS ++ alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); ++# endif + +-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) ++# ifdef CONFIG_X86_MCE_P4THERMAL + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); ++# endif + #endif ++} ++ ++/* Overridden in paravirt.c */ ++void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); ++ ++void __init native_init_IRQ(void) ++{ ++ int i; ++ ++ /* Execute any quirks before the call gates are initialised: */ ++ x86_quirk_pre_intr_init(); + +- /* setup after call gates are initialised (usually add in +- * the architecture specific gates) ++ apic_intr_init(); ++ ++ /* ++ * Cover the whole vector space, no vector can escape ++ * us. (some of these will be overridden and become ++ * 'special' SMP interrupts) ++ */ ++ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { ++ int vector = FIRST_EXTERNAL_VECTOR + i; ++ /* SYSCALL_VECTOR was reserved in trap_init. */ ++ if (!test_bit(vector, used_vectors)) ++ set_intr_gate(vector, interrupt[i]); ++ } ++ ++ if (!acpi_ioapic) ++ setup_irq(2, &irq2); ++ ++ /* ++ * Call quirks after call gates are initialised (usually add in ++ * the architecture specific gates): + */ +- intr_init_hook(); ++ x86_quirk_intr_init(); + + /* + * External FPU? 
Set up irq13 if so, for +Index: linux-2.6-tip/arch/x86/kernel/irqinit_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/irqinit_64.c ++++ linux-2.6-tip/arch/x86/kernel/irqinit_64.c +@@ -147,9 +147,17 @@ static void __init apic_intr_init(void) + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + ++ /* generic IPI for platform specific use */ ++ alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); ++ + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); ++ ++ /* Performance monitoring interrupt: */ ++#ifdef CONFIG_PERF_COUNTERS ++ alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); ++#endif + } + + void __init native_init_IRQ(void) +@@ -157,6 +165,9 @@ void __init native_init_IRQ(void) + int i; + + init_ISA_irqs(); ++ ++ apic_intr_init(); ++ + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become +@@ -164,12 +175,10 @@ void __init native_init_IRQ(void) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; +- if (vector != IA32_SYSCALL_VECTOR) ++ if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } + +- apic_intr_init(); +- + if (!acpi_ioapic) + setup_irq(2, &irq2); + } +Index: linux-2.6-tip/arch/x86/kernel/kdebugfs.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/kdebugfs.c ++++ linux-2.6-tip/arch/x86/kernel/kdebugfs.c +@@ -8,11 +8,11 @@ + */ + #include + #include +-#include ++#include + #include ++#include + #include + #include +-#include + + #include + +@@ -26,9 +26,8 @@ struct setup_data_node { + u32 len; + }; + +-static ssize_t +-setup_data_read(struct file *file, char __user *user_buf, size_t count, +- loff_t *ppos) ++static ssize_t setup_data_read(struct file *file, char __user *user_buf, ++ size_t count, loff_t *ppos) + { + struct setup_data_node *node = file->private_data; + unsigned long remain; +@@ -39,20 +38,21 @@ setup_data_read(struct file *file, char + + if (pos < 0) + return -EINVAL; ++ + if (pos >= node->len) + return 0; + + if (count > node->len - pos) + count = node->len - pos; ++ + pa = node->paddr + sizeof(struct setup_data) + pos; + pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); + if (PageHighMem(pg)) { + p = ioremap_cache(pa, count); + if (!p) + return -ENXIO; +- } else { ++ } else + p = __va(pa); +- } + + remain = copy_to_user(user_buf, p, count); + +@@ -70,12 +70,13 @@ setup_data_read(struct file *file, char + static int setup_data_open(struct inode *inode, struct file *file) + { + file->private_data = inode->i_private; ++ + return 0; + } + + static const struct file_operations fops_setup_data = { +- .read = setup_data_read, +- .open = setup_data_open, ++ .read = setup_data_read, ++ .open = setup_data_open, + }; + + static int __init +@@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *pa + { + struct dentry *d, *type, *data; + char buf[16]; +- int error; + + sprintf(buf, "%d", no); + d = debugfs_create_dir(buf, parent); +- if (!d) { +- error = -ENOMEM; +- goto err_return; +- } ++ if (!d) ++ return -ENOMEM; ++ + type = debugfs_create_x32("type", S_IRUGO, d, &node->type); +- if (!type) { +- error = -ENOMEM; ++ if (!type) + goto err_dir; +- } ++ + data = debugfs_create_file("data", S_IRUGO, d, node, 
&fops_setup_data); +- if (!data) { +- error = -ENOMEM; ++ if (!data) + goto err_type; +- } ++ + return 0; + + err_type: + debugfs_remove(type); + err_dir: + debugfs_remove(d); +-err_return: +- return error; ++ return -ENOMEM; + } + + static int __init create_setup_data_nodes(struct dentry *parent) + { + struct setup_data_node *node; + struct setup_data *data; +- int error, no = 0; ++ int error = -ENOMEM; + struct dentry *d; + struct page *pg; + u64 pa_data; ++ int no = 0; + + d = debugfs_create_dir("setup_data", parent); +- if (!d) { +- error = -ENOMEM; +- goto err_return; +- } ++ if (!d) ++ return -ENOMEM; + + pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + node = kmalloc(sizeof(*node), GFP_KERNEL); +- if (!node) { +- error = -ENOMEM; ++ if (!node) + goto err_dir; +- } ++ + pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); + if (PageHighMem(pg)) { + data = ioremap_cache(pa_data, sizeof(*data)); +@@ -143,9 +137,8 @@ static int __init create_setup_data_node + error = -ENXIO; + goto err_dir; + } +- } else { ++ } else + data = __va(pa_data); +- } + + node->paddr = pa_data; + node->type = data->type; +@@ -159,11 +152,11 @@ static int __init create_setup_data_node + goto err_dir; + no++; + } ++ + return 0; + + err_dir: + debugfs_remove(d); +-err_return: + return error; + } + +@@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_ + static int __init boot_params_kdebugfs_init(void) + { + struct dentry *dbp, *version, *data; +- int error; ++ int error = -ENOMEM; + + dbp = debugfs_create_dir("boot_params", NULL); +- if (!dbp) { +- error = -ENOMEM; +- goto err_return; +- } ++ if (!dbp) ++ return -ENOMEM; ++ + version = debugfs_create_x16("version", S_IRUGO, dbp, + &boot_params.hdr.version); +- if (!version) { +- error = -ENOMEM; ++ if (!version) + goto err_dir; +- } ++ + data = debugfs_create_blob("data", S_IRUGO, dbp, + &boot_params_blob); +- if (!data) { +- error = -ENOMEM; ++ if (!data) + goto err_version; +- } ++ + error = create_setup_data_nodes(dbp); + if (error) + goto err_data; ++ + return 0; + + err_data: +@@ -205,10 +196,9 @@ err_version: + debugfs_remove(version); + err_dir: + debugfs_remove(dbp); +-err_return: + return error; + } +-#endif ++#endif /* CONFIG_DEBUG_BOOT_PARAMS */ + + static int __init arch_kdebugfs_init(void) + { +Index: linux-2.6-tip/arch/x86/kernel/kgdb.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/kgdb.c ++++ linux-2.6-tip/arch/x86/kernel/kgdb.c +@@ -46,7 +46,7 @@ + #include + #include + +-#include ++#include + + /* + * Put the error code here just in case the user cares: +@@ -347,7 +347,7 @@ void kgdb_post_primary_code(struct pt_re + */ + void kgdb_roundup_cpus(unsigned long flags) + { +- send_IPI_allbutself(APIC_DM_NMI); ++ apic->send_IPI_allbutself(APIC_DM_NMI); + } + #endif + +Index: linux-2.6-tip/arch/x86/kernel/kprobes.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/kprobes.c ++++ linux-2.6-tip/arch/x86/kernel/kprobes.c +@@ -193,7 +193,7 @@ static int __kprobes can_boost(kprobe_op + kprobe_opcode_t opcode; + kprobe_opcode_t *orig_opcodes = opcodes; + +- if (search_exception_tables(opcodes)) ++ if (search_exception_tables((unsigned long)opcodes)) + return 0; /* Page fault may occur on this address. 
*/ + + retry: +@@ -454,7 +454,7 @@ static void __kprobes setup_singlestep(s + /* Boost up -- we can execute copied instructions directly */ + reset_current_kprobe(); + regs->ip = (unsigned long)p->ainsn.insn; +- preempt_enable_no_resched(); ++ preempt_enable(); + return; + } + #endif +@@ -480,7 +480,7 @@ static int __kprobes reenter_kprobe(stru + arch_disarm_kprobe(p); + regs->ip = (unsigned long)p->addr; + reset_current_kprobe(); +- preempt_enable_no_resched(); ++ preempt_enable(); + break; + #endif + case KPROBE_HIT_ACTIVE: +@@ -576,7 +576,7 @@ static int __kprobes kprobe_handler(stru + } + } /* else: not a kprobe fault; let the kernel handle it */ + +- preempt_enable_no_resched(); ++ preempt_enable(); + return 0; + } + +@@ -638,13 +638,13 @@ static void __used __kprobes kretprobe_t + #else + " pushf\n" + /* +- * Skip cs, ip, orig_ax. ++ * Skip cs, ip, orig_ax and gs. + * trampoline_handler() will plug in these values + */ +- " subl $12, %esp\n" ++ " subl $16, %esp\n" + " pushl %fs\n" +- " pushl %ds\n" + " pushl %es\n" ++ " pushl %ds\n" + " pushl %eax\n" + " pushl %ebp\n" + " pushl %edi\n" +@@ -655,10 +655,10 @@ static void __used __kprobes kretprobe_t + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Move flags to cs */ +- " movl 52(%esp), %edx\n" +- " movl %edx, 48(%esp)\n" ++ " movl 56(%esp), %edx\n" ++ " movl %edx, 52(%esp)\n" + /* Replace saved flags with true return address. */ +- " movl %eax, 52(%esp)\n" ++ " movl %eax, 56(%esp)\n" + " popl %ebx\n" + " popl %ecx\n" + " popl %edx\n" +@@ -666,8 +666,8 @@ static void __used __kprobes kretprobe_t + " popl %edi\n" + " popl %ebp\n" + " popl %eax\n" +- /* Skip ip, orig_ax, es, ds, fs */ +- " addl $20, %esp\n" ++ /* Skip ds, es, fs, gs, orig_ax and ip */ ++ " addl $24, %esp\n" + " popf\n" + #endif + " ret\n"); +@@ -691,6 +691,7 @@ static __used __kprobes void *trampoline + regs->cs = __KERNEL_CS; + #else + regs->cs = __KERNEL_CS | get_kernel_rpl(); ++ regs->gs = 0; + #endif + regs->ip = trampoline_address; + regs->orig_ax = ~0UL; +@@ -875,7 +876,7 @@ static int __kprobes post_kprobe_handler + } + reset_current_kprobe(); + out: +- preempt_enable_no_resched(); ++ preempt_enable(); + + /* + * if somebody else is singlestepping across a probe point, flags +@@ -909,7 +910,7 @@ int __kprobes kprobe_fault_handler(struc + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); +- preempt_enable_no_resched(); ++ preempt_enable(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: +@@ -1050,7 +1051,7 @@ int __kprobes longjmp_break_handler(stru + memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), + kcb->jprobes_stack, + MIN_STACK_SIZE(kcb->jprobe_saved_sp)); +- preempt_enable_no_resched(); ++ preempt_enable(); + return 1; + } + return 0; +Index: linux-2.6-tip/arch/x86/kernel/kvm.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/kvm.c ++++ linux-2.6-tip/arch/x86/kernel/kvm.c +@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *pt + kvm_mmu_write(ptep, pte_val(pte)); + } + +-static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr, +- pte_t *ptep, pte_t pte) +-{ +- kvm_mmu_write(ptep, pte_val(pte)); +-} +- + static void kvm_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) + { +@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void) + #if PAGETABLE_LEVELS >= 3 + #ifdef CONFIG_X86_PAE + pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; +- pv_mmu_ops.set_pte_present = kvm_set_pte_present; + pv_mmu_ops.pte_clear = kvm_pte_clear; + 
pv_mmu_ops.pmd_clear = kvm_pmd_clear; + #endif +Index: linux-2.6-tip/arch/x86/kernel/kvmclock.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/kvmclock.c ++++ linux-2.6-tip/arch/x86/kernel/kvmclock.c +@@ -19,7 +19,6 @@ + #include + #include + #include +-#include + #include + #include + #include +Index: linux-2.6-tip/arch/x86/kernel/machine_kexec_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/machine_kexec_32.c ++++ linux-2.6-tip/arch/x86/kernel/machine_kexec_32.c +@@ -14,12 +14,12 @@ + #include + #include + #include ++#include + + #include + #include + #include + #include +-#include + #include + #include + #include +@@ -63,7 +63,7 @@ static void load_segments(void) + "\tmovl %%eax,%%fs\n" + "\tmovl %%eax,%%gs\n" + "\tmovl %%eax,%%ss\n" +- ::: "eax", "memory"); ++ : : : "eax", "memory"); + #undef STR + #undef __STR + } +@@ -121,7 +121,7 @@ static void machine_kexec_page_table_set + static void machine_kexec_prepare_page_tables(struct kimage *image) + { + void *control_page; +- pmd_t *pmd = 0; ++ pmd_t *pmd = NULL; + + control_page = page_address(image->control_code_page); + #ifdef CONFIG_X86_PAE +@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image) + + if (image->preserve_context) { + #ifdef CONFIG_X86_IO_APIC +- /* We need to put APICs in legacy mode so that we can ++ /* ++ * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need +@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + +- /* The segment registers are funny things, they have both a ++ /* ++ * The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the +@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image) + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); +- /* The gdt & idt are now invalid. ++ /* ++ * The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. 
+ */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); ++ set_gdt(phys_to_virt(0), 0); ++ set_idt(phys_to_virt(0), 0); + + /* now call it */ + image->start = relocate_kernel_ptr((unsigned long)image->head, +Index: linux-2.6-tip/arch/x86/kernel/machine_kexec_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/machine_kexec_64.c ++++ linux-2.6-tip/arch/x86/kernel/machine_kexec_64.c +@@ -12,20 +12,47 @@ + #include + #include + #include ++#include ++#include + + #include + #include + #include +-#include + +-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +-static u64 kexec_pgd[512] PAGE_ALIGNED; +-static u64 kexec_pud0[512] PAGE_ALIGNED; +-static u64 kexec_pmd0[512] PAGE_ALIGNED; +-static u64 kexec_pte0[512] PAGE_ALIGNED; +-static u64 kexec_pud1[512] PAGE_ALIGNED; +-static u64 kexec_pmd1[512] PAGE_ALIGNED; +-static u64 kexec_pte1[512] PAGE_ALIGNED; ++static int init_one_level2_page(struct kimage *image, pgd_t *pgd, ++ unsigned long addr) ++{ ++ pud_t *pud; ++ pmd_t *pmd; ++ struct page *page; ++ int result = -ENOMEM; ++ ++ addr &= PMD_MASK; ++ pgd += pgd_index(addr); ++ if (!pgd_present(*pgd)) { ++ page = kimage_alloc_control_pages(image, 0); ++ if (!page) ++ goto out; ++ pud = (pud_t *)page_address(page); ++ memset(pud, 0, PAGE_SIZE); ++ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); ++ } ++ pud = pud_offset(pgd, addr); ++ if (!pud_present(*pud)) { ++ page = kimage_alloc_control_pages(image, 0); ++ if (!page) ++ goto out; ++ pmd = (pmd_t *)page_address(page); ++ memset(pmd, 0, PAGE_SIZE); ++ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); ++ } ++ pmd = pmd_offset(pud, addr); ++ if (!pmd_present(*pmd)) ++ set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); ++ result = 0; ++out: ++ return result; ++} + + static void init_level2_page(pmd_t *level2p, unsigned long addr) + { +@@ -92,9 +119,8 @@ static int init_level4_page(struct kimag + } + level3p = (pud_t *)page_address(page); + result = init_level3_page(image, level3p, addr, last_addr); +- if (result) { ++ if (result) + goto out; +- } + set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + addr += PGDIR_SIZE; + } +@@ -107,12 +133,72 @@ out: + return result; + } + ++static void free_transition_pgtable(struct kimage *image) ++{ ++ free_page((unsigned long)image->arch.pud); ++ free_page((unsigned long)image->arch.pmd); ++ free_page((unsigned long)image->arch.pte); ++} ++ ++static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) ++{ ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long vaddr, paddr; ++ int result = -ENOMEM; ++ ++ vaddr = (unsigned long)relocate_kernel; ++ paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); ++ pgd += pgd_index(vaddr); ++ if (!pgd_present(*pgd)) { ++ pud = (pud_t *)get_zeroed_page(GFP_KERNEL); ++ if (!pud) ++ goto err; ++ image->arch.pud = pud; ++ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); ++ } ++ pud = pud_offset(pgd, vaddr); ++ if (!pud_present(*pud)) { ++ pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); ++ if (!pmd) ++ goto err; ++ image->arch.pmd = pmd; ++ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); ++ } ++ pmd = pmd_offset(pud, vaddr); ++ if (!pmd_present(*pmd)) { ++ pte = (pte_t *)get_zeroed_page(GFP_KERNEL); ++ if (!pte) ++ goto err; ++ image->arch.pte = pte; ++ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); ++ } ++ pte = pte_offset_kernel(pmd, vaddr); ++ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); ++ return 0; ++err: ++ free_transition_pgtable(image); ++ 
return result; ++} ++ + + static int init_pgtable(struct kimage *image, unsigned long start_pgtable) + { + pgd_t *level4p; ++ int result; + level4p = (pgd_t *)__va(start_pgtable); +- return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); ++ result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); ++ if (result) ++ return result; ++ /* ++ * image->start may be outside 0 ~ max_pfn, for example when ++ * jump back to original kernel from kexeced kernel ++ */ ++ result = init_one_level2_page(image, level4p, image->start); ++ if (result) ++ return result; ++ return init_transition_pgtable(image, level4p); + } + + static void set_idt(void *newidt, u16 limit) +@@ -174,7 +260,7 @@ int machine_kexec_prepare(struct kimage + + void machine_kexec_cleanup(struct kimage *image) + { +- return; ++ free_transition_pgtable(image); + } + + /* +@@ -185,36 +271,45 @@ void machine_kexec(struct kimage *image) + { + unsigned long page_list[PAGES_NR]; + void *control_page; ++ int save_ftrace_enabled; + +- tracer_disable(); ++#ifdef CONFIG_KEXEC_JUMP ++ if (kexec_image->preserve_context) ++ save_processor_state(); ++#endif ++ ++ save_ftrace_enabled = __ftrace_enabled_save(); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + ++ if (image->preserve_context) { ++#ifdef CONFIG_X86_IO_APIC ++ /* ++ * We need to put APICs in legacy mode so that we can ++ * get timer interrupts in second kernel. kexec/kdump ++ * paths already have calls to disable_IO_APIC() in ++ * one form or other. kexec jump path also need ++ * one. ++ */ ++ disable_IO_APIC(); ++#endif ++ } ++ + control_page = page_address(image->control_code_page) + PAGE_SIZE; +- memcpy(control_page, relocate_kernel, PAGE_SIZE); ++ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + + page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); +- page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; +- page_list[PA_PGD] = virt_to_phys(&kexec_pgd); +- page_list[VA_PGD] = (unsigned long)kexec_pgd; +- page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0); +- page_list[VA_PUD_0] = (unsigned long)kexec_pud0; +- page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0); +- page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; +- page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0); +- page_list[VA_PTE_0] = (unsigned long)kexec_pte0; +- page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1); +- page_list[VA_PUD_1] = (unsigned long)kexec_pud1; +- page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1); +- page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; +- page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1); +- page_list[VA_PTE_1] = (unsigned long)kexec_pte1; +- ++ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); + +- /* The segment registers are funny things, they have both a ++ if (image->type == KEXEC_TYPE_DEFAULT) ++ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) ++ << PAGE_SHIFT); ++ ++ /* ++ * The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the +@@ -224,15 +319,25 @@ void machine_kexec(struct kimage *image) + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); +- /* The gdt & idt are now invalid. ++ /* ++ * The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. 
+ */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); ++ set_gdt(phys_to_virt(0), 0); ++ set_idt(phys_to_virt(0), 0); + + /* now call it */ +- relocate_kernel((unsigned long)image->head, (unsigned long)page_list, +- image->start); ++ image->start = relocate_kernel((unsigned long)image->head, ++ (unsigned long)page_list, ++ image->start, ++ image->preserve_context); ++ ++#ifdef CONFIG_KEXEC_JUMP ++ if (kexec_image->preserve_context) ++ restore_processor_state(); ++#endif ++ ++ __ftrace_enabled_restore(save_ftrace_enabled); + } + + void arch_crash_save_vmcoreinfo(void) +Index: linux-2.6-tip/arch/x86/kernel/mca_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/mca_32.c ++++ linux-2.6-tip/arch/x86/kernel/mca_32.c +@@ -51,7 +51,6 @@ + #include + #include + #include +-#include + + static unsigned char which_scsi; + +@@ -474,6 +473,4 @@ void __kprobes mca_handle_nmi(void) + * adapter was responsible for the error. + */ + bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); +- +- mca_nmi_hook(); +-} /* mca_handle_nmi */ ++} +Index: linux-2.6-tip/arch/x86/kernel/microcode_amd.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/microcode_amd.c ++++ linux-2.6-tip/arch/x86/kernel/microcode_amd.c +@@ -12,31 +12,30 @@ + * + * Licensed under the terms of the GNU General Public + * License version 2. See file COPYING for details. +-*/ +- ++ */ ++#include + #include +-#include +-#include +-#include +-#include +-#include +-#include +-#include + #include ++#include + #include +-#include +-#include ++#include ++#include ++#include ++#include ++#include ++#include + #include ++#include ++#include ++#include + #include +-#include +-#include + #include +-#include +-#include ++#include ++#include + +-#include +-#include + #include ++#include ++#include + + MODULE_DESCRIPTION("AMD Microcode Update Driver"); + MODULE_AUTHOR("Peter Oruba"); +@@ -72,8 +71,8 @@ struct microcode_header_amd { + } __attribute__((packed)); + + struct microcode_amd { +- struct microcode_header_amd hdr; +- unsigned int mpb[0]; ++ struct microcode_header_amd hdr; ++ unsigned int mpb[0]; + }; + + #define UCODE_MAX_SIZE 2048 +@@ -81,7 +80,7 @@ struct microcode_amd { + #define UCODE_CONTAINER_HEADER_SIZE 12 + + /* serialize access to the physical write */ +-static DEFINE_SPINLOCK(microcode_update_lock); ++static DEFINE_RAW_SPINLOCK(microcode_update_lock); + + static struct equiv_cpu_entry *equiv_cpu_table; + +@@ -184,8 +183,8 @@ static int get_ucode_data(void *to, cons + return 0; + } + +-static void *get_next_ucode(const u8 *buf, unsigned int size, +- unsigned int *mc_size) ++static void * ++get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) + { + unsigned int total_size; + u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; +@@ -223,7 +222,6 @@ static void *get_next_ucode(const u8 *bu + return mc; + } + +- + static int install_equiv_cpu_table(const u8 *buf) + { + u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; +@@ -372,4 +370,3 @@ struct microcode_ops * __init init_amd_m + { + return µcode_amd_ops; + } +- +Index: linux-2.6-tip/arch/x86/kernel/microcode_core.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/microcode_core.c ++++ linux-2.6-tip/arch/x86/kernel/microcode_core.c +@@ -70,67 +70,78 @@ + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. 
+ */ ++#include + #include +-#include +-#include +-#include ++#include ++#include + #include ++#include + #include +-#include +-#include ++#include + #include +-#include +-#include +-#include +-#include ++#include ++#include + #include ++#include ++#include ++#include + #include +-#include +-#include ++#include ++#include + +-#include +-#include +-#include + #include ++#include ++#include + + MODULE_DESCRIPTION("Microcode Update Driver"); + MODULE_AUTHOR("Tigran Aivazian "); + MODULE_LICENSE("GPL"); + +-#define MICROCODE_VERSION "2.00" ++#define MICROCODE_VERSION "2.00" + +-static struct microcode_ops *microcode_ops; ++static struct microcode_ops *microcode_ops; + + /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ + static DEFINE_MUTEX(microcode_mutex); + +-struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; ++struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; + EXPORT_SYMBOL_GPL(ucode_cpu_info); + + #ifdef CONFIG_MICROCODE_OLD_INTERFACE ++struct update_for_cpu { ++ const void __user *buf; ++ size_t size; ++}; ++ ++static long update_for_cpu(void *_ufc) ++{ ++ struct update_for_cpu *ufc = _ufc; ++ int error; ++ ++ error = microcode_ops->request_microcode_user(smp_processor_id(), ++ ufc->buf, ufc->size); ++ if (error < 0) ++ return error; ++ if (!error) ++ microcode_ops->apply_microcode(smp_processor_id()); ++ return error; ++} ++ + static int do_microcode_update(const void __user *buf, size_t size) + { +- cpumask_t old; + int error = 0; + int cpu; +- +- old = current->cpus_allowed; ++ struct update_for_cpu ufc = { .buf = buf, .size = size }; + + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; +- +- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); +- error = microcode_ops->request_microcode_user(cpu, buf, size); ++ error = work_on_cpu(cpu, update_for_cpu, &ufc); + if (error < 0) +- goto out; +- if (!error) +- microcode_ops->apply_microcode(cpu); ++ break; + } +-out: +- set_cpus_allowed_ptr(current, &old); + return error; + } + +@@ -198,18 +209,33 @@ static void microcode_dev_exit(void) + + MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); + #else +-#define microcode_dev_init() 0 +-#define microcode_dev_exit() do { } while (0) ++#define microcode_dev_init() 0 ++#define microcode_dev_exit() do { } while (0) + #endif + + /* fake device for request_firmware */ +-static struct platform_device *microcode_pdev; ++static struct platform_device *microcode_pdev; ++ ++static long reload_for_cpu(void *unused) ++{ ++ struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); ++ int err = 0; ++ ++ mutex_lock(µcode_mutex); ++ if (uci->valid) { ++ err = microcode_ops->request_microcode_fw(smp_processor_id(), ++ µcode_pdev->dev); ++ if (!err) ++ microcode_ops->apply_microcode(smp_processor_id()); ++ } ++ mutex_unlock(µcode_mutex); ++ return err; ++} + + static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) + { +- struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; +@@ -218,21 +244,9 @@ static ssize_t reload_store(struct sys_d + if (end == buf) + return -EINVAL; + if (val == 1) { +- cpumask_t old = current->cpus_allowed; +- + get_online_cpus(); +- if (cpu_online(cpu)) { +- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); +- mutex_lock(µcode_mutex); +- if (uci->valid) { +- err = microcode_ops->request_microcode_fw(cpu, +- µcode_pdev->dev); +- if (!err) +- microcode_ops->apply_microcode(cpu); 
+- } +- mutex_unlock(µcode_mutex); +- set_cpus_allowed_ptr(current, &old); +- } ++ if (cpu_online(cpu)) ++ err = work_on_cpu(cpu, reload_for_cpu, NULL); + put_online_cpus(); + } + if (err) +@@ -268,8 +282,8 @@ static struct attribute *mc_default_attr + }; + + static struct attribute_group mc_attr_group = { +- .attrs = mc_default_attrs, +- .name = "microcode", ++ .attrs = mc_default_attrs, ++ .name = "microcode", + }; + + static void __microcode_fini_cpu(int cpu) +@@ -328,9 +342,9 @@ static int microcode_resume_cpu(int cpu) + return 0; + } + +-static void microcode_update_cpu(int cpu) ++static long microcode_update_cpu(void *unused) + { +- struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + int err = 0; + + /* +@@ -338,30 +352,27 @@ static void microcode_update_cpu(int cpu + * otherwise just request a firmware: + */ + if (uci->valid) { +- err = microcode_resume_cpu(cpu); +- } else { +- collect_cpu_info(cpu); ++ err = microcode_resume_cpu(smp_processor_id()); ++ } else { ++ collect_cpu_info(smp_processor_id()); + if (uci->valid && system_state == SYSTEM_RUNNING) +- err = microcode_ops->request_microcode_fw(cpu, ++ err = microcode_ops->request_microcode_fw( ++ smp_processor_id(), + µcode_pdev->dev); + } + if (!err) +- microcode_ops->apply_microcode(cpu); ++ microcode_ops->apply_microcode(smp_processor_id()); ++ return err; + } + +-static void microcode_init_cpu(int cpu) ++static int microcode_init_cpu(int cpu) + { +- cpumask_t old = current->cpus_allowed; +- +- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); +- /* We should bind the task to the CPU */ +- BUG_ON(raw_smp_processor_id() != cpu); +- ++ int err; + mutex_lock(µcode_mutex); +- microcode_update_cpu(cpu); ++ err = work_on_cpu(cpu, microcode_update_cpu, NULL); + mutex_unlock(µcode_mutex); + +- set_cpus_allowed_ptr(current, &old); ++ return err; + } + + static int mc_sysdev_add(struct sys_device *sys_dev) +@@ -379,8 +390,18 @@ static int mc_sysdev_add(struct sys_devi + if (err) + return err; + +- microcode_init_cpu(cpu); +- return 0; ++ err = microcode_init_cpu(cpu); ++#if 0 ++ /* ++ * While it looks correct, it's broken as we remove the sysfs ++ * entry in sysdev_remove below again. The error handling in ++ * this file is completely wreckaged and we have multiple ++ * hotplug handling via notifier and sysdev as well. Sigh. 
++ */ ++ if (err) ++ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); ++#endif ++ return err; + } + + static int mc_sysdev_remove(struct sys_device *sys_dev) +@@ -404,14 +425,14 @@ static int mc_sysdev_resume(struct sys_d + return 0; + + /* only CPU 0 will apply ucode here */ +- microcode_update_cpu(0); ++ microcode_update_cpu(NULL); + return 0; + } + + static struct sysdev_driver mc_sysdev_driver = { +- .add = mc_sysdev_add, +- .remove = mc_sysdev_remove, +- .resume = mc_sysdev_resume, ++ .add = mc_sysdev_add, ++ .remove = mc_sysdev_remove, ++ .resume = mc_sysdev_resume, + }; + + static __cpuinit int +@@ -424,7 +445,9 @@ mc_cpu_callback(struct notifier_block *n + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: +- microcode_init_cpu(cpu); ++ if (microcode_init_cpu(cpu)) ++ printk(KERN_ERR "microcode: failed to init CPU%d\n", ++ cpu); + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + pr_debug("microcode: CPU%d added\n", cpu); +@@ -448,7 +471,7 @@ mc_cpu_callback(struct notifier_block *n + } + + static struct notifier_block __refdata mc_cpu_notifier = { +- .notifier_call = mc_cpu_callback, ++ .notifier_call = mc_cpu_callback, + }; + + static int __init microcode_init(void) +Index: linux-2.6-tip/arch/x86/kernel/microcode_intel.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/microcode_intel.c ++++ linux-2.6-tip/arch/x86/kernel/microcode_intel.c +@@ -70,28 +70,28 @@ + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ ++#include + #include +-#include +-#include +-#include ++#include ++#include + #include ++#include + #include +-#include +-#include ++#include + #include +-#include +-#include +-#include +-#include ++#include ++#include + #include ++#include ++#include ++#include + #include +-#include +-#include ++#include ++#include + +-#include +-#include +-#include + #include ++#include ++#include + + MODULE_DESCRIPTION("Microcode Update Driver"); + MODULE_AUTHOR("Tigran Aivazian "); +@@ -129,12 +129,13 @@ struct extended_sigtable { + struct extended_signature sigs[0]; + }; + +-#define DEFAULT_UCODE_DATASIZE (2000) ++#define DEFAULT_UCODE_DATASIZE (2000) + #define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) + #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) + #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) + #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) + #define DWSIZE (sizeof(u32)) ++ + #define get_totalsize(mc) \ + (((struct microcode_intel *)mc)->hdr.totalsize ? \ + ((struct microcode_intel *)mc)->hdr.totalsize : \ +@@ -150,7 +151,7 @@ struct extended_sigtable { + #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) + + /* serialize access to the physical write to MSR 0x79 */ +-static DEFINE_SPINLOCK(microcode_update_lock); ++static DEFINE_RAW_SPINLOCK(microcode_update_lock); + + static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) + { +@@ -196,31 +197,32 @@ static inline int update_match_cpu(struc + return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; + } + +-static inline int +-update_match_revision(struct microcode_header_intel *mc_header, int rev) ++static inline int ++update_match_revision(struct microcode_header_intel *mc_header, int rev) + { + return (mc_header->rev <= rev) ? 
0 : 1; + } + + static int microcode_sanity_check(void *mc) + { ++ unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; +- struct extended_signature *ext_sig; +- unsigned long total_size, data_size, ext_table_size; + int sum, orig_sum, ext_sigcount = 0, i; ++ struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); ++ + if (data_size + MC_HEADER_SIZE > total_size) { + printk(KERN_ERR "microcode: error! " +- "Bad data size in microcode data file\n"); ++ "Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + printk(KERN_ERR "microcode: error! " +- "Unknown microcode update format\n"); ++ "Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); +@@ -318,11 +320,15 @@ get_matching_microcode(struct cpu_signat + + static void apply_microcode(int cpu) + { ++ struct microcode_intel *mc_intel; ++ struct ucode_cpu_info *uci; + unsigned long flags; + unsigned int val[2]; +- int cpu_num = raw_smp_processor_id(); +- struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +- struct microcode_intel *mc_intel = uci->mc; ++ int cpu_num; ++ ++ cpu_num = raw_smp_processor_id(); ++ uci = ucode_cpu_info + cpu; ++ mc_intel = uci->mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); +@@ -348,15 +354,17 @@ static void apply_microcode(int cpu) + spin_unlock_irqrestore(µcode_update_lock, flags); + if (val[1] != mc_intel->hdr.rev) { + printk(KERN_ERR "microcode: CPU%d update from revision " +- "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); ++ "0x%x to 0x%x failed\n", ++ cpu_num, uci->cpu_sig.rev, val[1]); + return; + } + printk(KERN_INFO "microcode: CPU%d updated from revision " +- "0x%x to 0x%x, date = %04x-%02x-%02x \n", ++ "0x%x to 0x%x, date = %04x-%02x-%02x \n", + cpu_num, uci->cpu_sig.rev, val[1], + mc_intel->hdr.date & 0xffff, + mc_intel->hdr.date >> 24, + (mc_intel->hdr.date >> 16) & 0xff); ++ + uci->cpu_sig.rev = val[1]; + } + +@@ -404,18 +412,23 @@ static int generic_load_microcode(int cp + leftover -= mc_size; + } + +- if (new_mc) { +- if (!leftover) { +- if (uci->mc) +- vfree(uci->mc); +- uci->mc = (struct microcode_intel *)new_mc; +- pr_debug("microcode: CPU%d found a matching microcode update with" +- " version 0x%x (current=0x%x)\n", +- cpu, new_rev, uci->cpu_sig.rev); +- } else +- vfree(new_mc); ++ if (!new_mc) ++ goto out; ++ ++ if (leftover) { ++ vfree(new_mc); ++ goto out; + } + ++ if (uci->mc) ++ vfree(uci->mc); ++ uci->mc = (struct microcode_intel *)new_mc; ++ ++ pr_debug("microcode: CPU%d found a matching microcode update with" ++ " version 0x%x (current=0x%x)\n", ++ cpu, new_rev, uci->cpu_sig.rev); ++ ++ out: + return (int)leftover; + } + +@@ -442,8 +455,8 @@ static int request_microcode_fw(int cpu, + return ret; + } + +- ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, +- &get_ucode_fw); ++ ret = generic_load_microcode(cpu, (void *)firmware->data, ++ firmware->size, &get_ucode_fw); + + release_firmware(firmware); + +@@ -460,7 +473,7 @@ static int request_microcode_user(int cp + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + +- return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); ++ return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); + } + + static void microcode_fini_cpu(int 
cpu) +Index: linux-2.6-tip/arch/x86/kernel/mmconf-fam10h_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/mmconf-fam10h_64.c ++++ linux-2.6-tip/arch/x86/kernel/mmconf-fam10h_64.c +@@ -226,7 +226,7 @@ static int __devinit set_check_enable_am + return 0; + } + +-static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { ++static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { + { + .callback = set_check_enable_amd_mmconf, + .ident = "Sun Microsystems Machine", +Index: linux-2.6-tip/arch/x86/kernel/module_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/module_32.c ++++ linux-2.6-tip/arch/x86/kernel/module_32.c +@@ -42,7 +42,7 @@ void module_free(struct module *mod, voi + { + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception +- table entries. */ ++ table entries. */ + } + + /* We don't need anything special. */ +@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr, + *para = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + +- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { ++ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) +- locks= s; ++ locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + } +Index: linux-2.6-tip/arch/x86/kernel/module_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/module_64.c ++++ linux-2.6-tip/arch/x86/kernel/module_64.c +@@ -30,14 +30,14 @@ + #include + #include + +-#define DEBUGP(fmt...) ++#define DEBUGP(fmt...) + + #ifndef CONFIG_UML + void module_free(struct module *mod, void *module_region) + { + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception +- table entries. */ ++ table entries. 
*/ + } + + void *module_alloc(unsigned long size) +@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechd + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; +- u64 val; ++ u64 val; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); +@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechd + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + +- DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", +- (int)ELF64_R_TYPE(rel[i].r_info), +- sym->st_value, rel[i].r_addend, (u64)loc); ++ DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", ++ (int)ELF64_R_TYPE(rel[i].r_info), ++ sym->st_value, rel[i].r_addend, (u64)loc); + +- val = sym->st_value + rel[i].r_addend; ++ val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: +@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechd + if ((s64)val != *(s32 *)loc) + goto overflow; + break; +- case R_X86_64_PC32: ++ case R_X86_64_PC32: + val -= (u64)loc; + *(u32 *)loc = val; + #if 0 + if ((s64)val != *(s32 *)loc) +- goto overflow; ++ goto overflow; + #endif + break; + default: +- printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", ++ printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } +@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechd + return 0; + + overflow: +- printk(KERN_ERR "overflow in relocation type %d val %Lx\n", ++ printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + me->name); +@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs, + unsigned int relsec, + struct module *me) + { +- printk("non add relocation not supported\n"); ++ printk(KERN_ERR "non add relocation not supported\n"); + return -ENOSYS; +-} ++} + + int module_finalize(const Elf_Ehdr *hdr, +- const Elf_Shdr *sechdrs, +- struct module *me) ++ const Elf_Shdr *sechdrs, ++ struct module *me) + { + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; +@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr, + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) +- locks= s; ++ locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + } +Index: linux-2.6-tip/arch/x86/kernel/mpparse.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/mpparse.c ++++ linux-2.6-tip/arch/x86/kernel/mpparse.c +@@ -3,7 +3,7 @@ + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 +- * (c) 1998, 1999, 2000 Ingo Molnar ++ * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * (c) 2008 Alexey Starikovskiy + */ + +@@ -29,12 +29,7 @@ + #include + #include + +-#include +-#ifdef CONFIG_X86_32 +-#include +-#include +-#endif +- ++#include + /* + * Checksum an MP configuration block. 
+ */ +@@ -114,9 +109,6 @@ static void __init MP_bus_info(struct mp + } else + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); + } +-#endif +- +-#ifdef CONFIG_X86_IO_APIC + + static int bad_ioapic(unsigned long address) + { +@@ -144,11 +136,11 @@ static void __init MP_ioapic_info(struct + if (bad_ioapic(m->apicaddr)) + return; + +- mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; +- mp_ioapics[nr_ioapics].mp_apicid = m->apicid; +- mp_ioapics[nr_ioapics].mp_type = m->type; +- mp_ioapics[nr_ioapics].mp_apicver = m->apicver; +- mp_ioapics[nr_ioapics].mp_flags = m->flags; ++ mp_ioapics[nr_ioapics].apicaddr = m->apicaddr; ++ mp_ioapics[nr_ioapics].apicid = m->apicid; ++ mp_ioapics[nr_ioapics].type = m->type; ++ mp_ioapics[nr_ioapics].apicver = m->apicver; ++ mp_ioapics[nr_ioapics].flags = m->flags; + nr_ioapics++; + } + +@@ -160,55 +152,55 @@ static void print_MP_intsrc_info(struct + m->srcbusirq, m->dstapic, m->dstirq); + } + +-static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) ++static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) + { + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", +- mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, +- (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, +- mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); ++ mp_irq->irqtype, mp_irq->irqflag & 3, ++ (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, ++ mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); + } + + static void __init assign_to_mp_irq(struct mpc_intsrc *m, +- struct mp_config_intsrc *mp_irq) ++ struct mpc_intsrc *mp_irq) + { +- mp_irq->mp_dstapic = m->dstapic; +- mp_irq->mp_type = m->type; +- mp_irq->mp_irqtype = m->irqtype; +- mp_irq->mp_irqflag = m->irqflag; +- mp_irq->mp_srcbus = m->srcbus; +- mp_irq->mp_srcbusirq = m->srcbusirq; +- mp_irq->mp_dstirq = m->dstirq; ++ mp_irq->dstapic = m->dstapic; ++ mp_irq->type = m->type; ++ mp_irq->irqtype = m->irqtype; ++ mp_irq->irqflag = m->irqflag; ++ mp_irq->srcbus = m->srcbus; ++ mp_irq->srcbusirq = m->srcbusirq; ++ mp_irq->dstirq = m->dstirq; + } + +-static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, ++static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) + { +- m->dstapic = mp_irq->mp_dstapic; +- m->type = mp_irq->mp_type; +- m->irqtype = mp_irq->mp_irqtype; +- m->irqflag = mp_irq->mp_irqflag; +- m->srcbus = mp_irq->mp_srcbus; +- m->srcbusirq = mp_irq->mp_srcbusirq; +- m->dstirq = mp_irq->mp_dstirq; ++ m->dstapic = mp_irq->dstapic; ++ m->type = mp_irq->type; ++ m->irqtype = mp_irq->irqtype; ++ m->irqflag = mp_irq->irqflag; ++ m->srcbus = mp_irq->srcbus; ++ m->srcbusirq = mp_irq->srcbusirq; ++ m->dstirq = mp_irq->dstirq; + } + +-static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, ++static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) + { +- if (mp_irq->mp_dstapic != m->dstapic) ++ if (mp_irq->dstapic != m->dstapic) + return 1; +- if (mp_irq->mp_type != m->type) ++ if (mp_irq->type != m->type) + return 2; +- if (mp_irq->mp_irqtype != m->irqtype) ++ if (mp_irq->irqtype != m->irqtype) + return 3; +- if (mp_irq->mp_irqflag != m->irqflag) ++ if (mp_irq->irqflag != m->irqflag) + return 4; +- if (mp_irq->mp_srcbus != m->srcbus) ++ if (mp_irq->srcbus != m->srcbus) + return 5; +- if (mp_irq->mp_srcbusirq != m->srcbusirq) ++ if (mp_irq->srcbusirq != m->srcbusirq) + return 6; +- if (mp_irq->mp_dstirq != m->dstirq) ++ if (mp_irq->dstirq != 
m->dstirq) + return 7; + + return 0; +@@ -229,8 +221,12 @@ static void __init MP_intsrc_info(struct + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); + } ++#else /* CONFIG_X86_IO_APIC */ ++static inline void __init MP_bus_info(struct mpc_bus *m) {} ++static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} ++static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {} ++#endif /* CONFIG_X86_IO_APIC */ + +-#endif + + static void __init MP_lintsrc_info(struct mpc_lintsrc *m) + { +@@ -280,6 +276,20 @@ static int __init smp_check_mpc(struct m + return 1; + } + ++static void skip_entry(unsigned char **ptr, int *count, int size) ++{ ++ *ptr += size; ++ *count += size; ++} ++ ++static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) ++{ ++ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" ++ "type %x\n", *mpt); ++ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, ++ 1, mpc, mpc->length, 1); ++} ++ + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) + { + char str[16]; +@@ -292,16 +302,7 @@ static int __init smp_read_mpc(struct mp + return 0; + + #ifdef CONFIG_X86_32 +- /* +- * need to make sure summit and es7000's mps_oem_check is safe to be +- * called early via genericarch 's mps_oem_check +- */ +- if (early) { +-#ifdef CONFIG_X86_NUMAQ +- numaq_mps_oem_check(mpc, oem, str); +-#endif +- } else +- mps_oem_check(mpc, oem, str); ++ generic_mps_oem_check(mpc, oem, str); + #endif + /* save the local APIC address, it might be non-default */ + if (!acpi_lapic) +@@ -324,61 +325,30 @@ static int __init smp_read_mpc(struct mp + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: +- { +- struct mpc_cpu *m = (struct mpc_cpu *)mpt; +- /* ACPI may have already provided this data */ +- if (!acpi_lapic) +- MP_processor_info(m); +- mpt += sizeof(*m); +- count += sizeof(*m); +- break; +- } ++ /* ACPI may have already provided this data */ ++ if (!acpi_lapic) ++ MP_processor_info((struct mpc_cpu *)mpt); ++ skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); ++ break; + case MP_BUS: +- { +- struct mpc_bus *m = (struct mpc_bus *)mpt; +-#ifdef CONFIG_X86_IO_APIC +- MP_bus_info(m); +-#endif +- mpt += sizeof(*m); +- count += sizeof(*m); +- break; +- } ++ MP_bus_info((struct mpc_bus *)mpt); ++ skip_entry(&mpt, &count, sizeof(struct mpc_bus)); ++ break; + case MP_IOAPIC: +- { +-#ifdef CONFIG_X86_IO_APIC +- struct mpc_ioapic *m = (struct mpc_ioapic *)mpt; +- MP_ioapic_info(m); +-#endif +- mpt += sizeof(struct mpc_ioapic); +- count += sizeof(struct mpc_ioapic); +- break; +- } ++ MP_ioapic_info((struct mpc_ioapic *)mpt); ++ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); ++ break; + case MP_INTSRC: +- { +-#ifdef CONFIG_X86_IO_APIC +- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; +- +- MP_intsrc_info(m); +-#endif +- mpt += sizeof(struct mpc_intsrc); +- count += sizeof(struct mpc_intsrc); +- break; +- } ++ MP_intsrc_info((struct mpc_intsrc *)mpt); ++ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); ++ break; + case MP_LINTSRC: +- { +- struct mpc_lintsrc *m = +- (struct mpc_lintsrc *)mpt; +- MP_lintsrc_info(m); +- mpt += sizeof(*m); +- count += sizeof(*m); +- break; +- } ++ MP_lintsrc_info((struct mpc_lintsrc *)mpt); ++ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); ++ break; + default: + /* wrong mptable */ +- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); +- printk(KERN_ERR "type %x\n", *mpt); +- print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, +- 
1, mpc, mpc->length, 1); ++ smp_dump_mptable(mpc, mpt); + count = mpc->length; + break; + } +@@ -386,13 +356,13 @@ static int __init smp_read_mpc(struct mp + (*x86_quirks->mpc_record)++; + } + +-#ifdef CONFIG_X86_GENERICARCH +- generic_bigsmp_probe(); ++#ifdef CONFIG_X86_BIGSMP ++ generic_bigsmp_probe(); + #endif + +-#ifdef CONFIG_X86_32 +- setup_apic_routing(); +-#endif ++ if (apic->setup_apic_routing) ++ apic->setup_apic_routing(); ++ + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +@@ -417,7 +387,7 @@ static void __init construct_default_ioi + intsrc.type = MP_INTSRC; + intsrc.irqflag = 0; /* conforming */ + intsrc.srcbus = 0; +- intsrc.dstapic = mp_ioapics[0].mp_apicid; ++ intsrc.dstapic = mp_ioapics[0].apicid; + + intsrc.irqtype = mp_INT; + +@@ -570,14 +540,76 @@ static inline void __init construct_defa + } + } + +-static struct intel_mp_floating *mpf_found; ++static struct mpf_intel *mpf_found; ++ ++static unsigned long __init get_mpc_size(unsigned long physptr) ++{ ++ struct mpc_table *mpc; ++ unsigned long size; ++ ++ mpc = early_ioremap(physptr, PAGE_SIZE); ++ size = mpc->length; ++ early_iounmap(mpc, PAGE_SIZE); ++ apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); ++ ++ return size; ++} ++ ++static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) ++{ ++ struct mpc_table *mpc; ++ unsigned long size; ++ ++ size = get_mpc_size(mpf->physptr); ++ mpc = early_ioremap(mpf->physptr, size); ++ /* ++ * Read the physical hardware table. Anything here will ++ * override the defaults. ++ */ ++ if (!smp_read_mpc(mpc, early)) { ++#ifdef CONFIG_X86_LOCAL_APIC ++ smp_found_config = 0; ++#endif ++ printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" ++ "... disabling SMP support. (tell your hw vendor)\n"); ++ early_iounmap(mpc, size); ++ return -1; ++ } ++ early_iounmap(mpc, size); ++ ++ if (early) ++ return -1; ++ ++#ifdef CONFIG_X86_IO_APIC ++ /* ++ * If there are no explicit MP IRQ entries, then we are ++ * broken. We set up most of the low 16 IO-APIC pins to ++ * ISA defaults and hope it will work. ++ */ ++ if (!mp_irq_entries) { ++ struct mpc_bus bus; ++ ++ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " ++ "using default mptable. (tell your hw vendor)\n"); ++ ++ bus.type = MP_BUS; ++ bus.busid = 0; ++ memcpy(bus.bustype, "ISA ", 6); ++ MP_bus_info(&bus); ++ ++ construct_default_ioirq_mptable(0); ++ } ++#endif ++ ++ return 0; ++} + + /* + * Scan the memory blocks for an SMP configuration block. + */ + static void __init __get_smp_config(unsigned int early) + { +- struct intel_mp_floating *mpf = mpf_found; ++ struct mpf_intel *mpf = mpf_found; + + if (!mpf) + return; +@@ -598,9 +630,9 @@ static void __init __get_smp_config(unsi + } + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", +- mpf->mpf_specification); ++ mpf->specification); + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) +- if (mpf->mpf_feature2 & (1 << 7)) { ++ if (mpf->feature2 & (1 << 7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { +@@ -611,7 +643,7 @@ static void __init __get_smp_config(unsi + /* + * Now see if we need to read further. 
+ */ +- if (mpf->mpf_feature1 != 0) { ++ if (mpf->feature1 != 0) { + if (early) { + /* + * local APIC has default address +@@ -621,49 +653,12 @@ static void __init __get_smp_config(unsi + } + + printk(KERN_INFO "Default MP configuration #%d\n", +- mpf->mpf_feature1); +- construct_default_ISA_mptable(mpf->mpf_feature1); +- +- } else if (mpf->mpf_physptr) { ++ mpf->feature1); ++ construct_default_ISA_mptable(mpf->feature1); + +- /* +- * Read the physical hardware table. Anything here will +- * override the defaults. +- */ +- if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { +-#ifdef CONFIG_X86_LOCAL_APIC +- smp_found_config = 0; +-#endif +- printk(KERN_ERR +- "BIOS bug, MP table errors detected!...\n"); +- printk(KERN_ERR "... disabling SMP support. " +- "(tell your hw vendor)\n"); +- return; +- } +- +- if (early) ++ } else if (mpf->physptr) { ++ if (check_physptr(mpf, early)) + return; +-#ifdef CONFIG_X86_IO_APIC +- /* +- * If there are no explicit MP IRQ entries, then we are +- * broken. We set up most of the low 16 IO-APIC pins to +- * ISA defaults and hope it will work. +- */ +- if (!mp_irq_entries) { +- struct mpc_bus bus; +- +- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " +- "using default mptable. " +- "(tell your hw vendor)\n"); +- +- bus.type = MP_BUS; +- bus.busid = 0; +- memcpy(bus.bustype, "ISA ", 6); +- MP_bus_info(&bus); +- +- construct_default_ioirq_mptable(0); +- } +-#endif + } else + BUG(); + +@@ -684,54 +679,62 @@ void __init get_smp_config(void) + __get_smp_config(0); + } + ++static void smp_reserve_bootmem(struct mpf_intel *mpf) ++{ ++ unsigned long size = get_mpc_size(mpf->physptr); ++#ifdef CONFIG_X86_32 ++ /* ++ * We cannot access to MPC table to compute table size yet, ++ * as only few megabytes from the bottom is mapped now. ++ * PC-9800's MPC table places on the very last of physical ++ * memory; so that simply reserving PAGE_SIZE from mpf->physptr ++ * yields BUG() in reserve_bootmem. 
++ * also need to make sure physptr is below than max_low_pfn ++ * we don't need reserve the area above max_low_pfn ++ */ ++ unsigned long end = max_low_pfn * PAGE_SIZE; ++ ++ if (mpf->physptr < end) { ++ if (mpf->physptr + size > end) ++ size = end - mpf->physptr; ++ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); ++ } ++#else ++ reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT); ++#endif ++} ++ + static int __init smp_scan_config(unsigned long base, unsigned long length, + unsigned reserve) + { + unsigned int *bp = phys_to_virt(base); +- struct intel_mp_floating *mpf; ++ struct mpf_intel *mpf; + + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", + bp, length); + BUILD_BUG_ON(sizeof(*mpf) != 16); + + while (length > 0) { +- mpf = (struct intel_mp_floating *)bp; ++ mpf = (struct mpf_intel *)bp; + if ((*bp == SMP_MAGIC_IDENT) && +- (mpf->mpf_length == 1) && ++ (mpf->length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && +- ((mpf->mpf_specification == 1) +- || (mpf->mpf_specification == 4))) { ++ ((mpf->specification == 1) ++ || (mpf->specification == 4))) { + #ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; + #endif + mpf_found = mpf; + +- printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", +- mpf, virt_to_phys(mpf)); ++ printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", ++ mpf, (u64)virt_to_phys(mpf)); + + if (!reserve) + return 1; +- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, +- BOOTMEM_DEFAULT); +- if (mpf->mpf_physptr) { +- unsigned long size = PAGE_SIZE; +-#ifdef CONFIG_X86_32 +- /* +- * We cannot access to MPC table to compute +- * table size yet, as only few megabytes from +- * the bottom is mapped now. +- * PC-9800's MPC table places on the very last +- * of physical memory; so that simply reserving +- * PAGE_SIZE from mpg->mpf_physptr yields BUG() +- * in reserve_bootmem. 
+- */ +- unsigned long end = max_low_pfn * PAGE_SIZE; +- if (mpf->mpf_physptr + size > end) +- size = end - mpf->mpf_physptr; +-#endif +- reserve_bootmem_generic(mpf->mpf_physptr, size, ++ reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), + BOOTMEM_DEFAULT); +- } ++ if (mpf->physptr) ++ smp_reserve_bootmem(mpf); + + return 1; + } +@@ -809,15 +812,15 @@ static int __init get_MP_intsrc_index(s + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { +- if (mp_irqs[i].mp_irqtype != mp_INT) ++ if (mp_irqs[i].irqtype != mp_INT) + continue; + +- if (mp_irqs[i].mp_irqflag != 0x0f) ++ if (mp_irqs[i].irqflag != 0x0f) + continue; + +- if (mp_irqs[i].mp_srcbus != m->srcbus) ++ if (mp_irqs[i].srcbus != m->srcbus) + continue; +- if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) ++ if (mp_irqs[i].srcbusirq != m->srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ +@@ -834,7 +837,57 @@ static int __init get_MP_intsrc_index(s + #define SPARE_SLOT_NUM 20 + + static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; +-#endif ++ ++static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) ++{ ++ int i; ++ ++ apic_printk(APIC_VERBOSE, "OLD "); ++ print_MP_intsrc_info(m); ++ ++ i = get_MP_intsrc_index(m); ++ if (i > 0) { ++ assign_to_mpc_intsrc(&mp_irqs[i], m); ++ apic_printk(APIC_VERBOSE, "NEW "); ++ print_mp_irq_info(&mp_irqs[i]); ++ return; ++ } ++ if (!i) { ++ /* legacy, do nothing */ ++ return; ++ } ++ if (*nr_m_spare < SPARE_SLOT_NUM) { ++ /* ++ * not found (-1), or duplicated (-2) are invalid entries, ++ * we need to use the slot later ++ */ ++ m_spare[*nr_m_spare] = m; ++ *nr_m_spare += 1; ++ } ++} ++#else /* CONFIG_X86_IO_APIC */ ++static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} ++#endif /* CONFIG_X86_IO_APIC */ ++ ++static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, ++ int count) ++{ ++ if (!mpc_new_phys) { ++ pr_info("No spare slots, try to append...take your risk, " ++ "new mpc_length %x\n", count); ++ } else { ++ if (count <= mpc_new_length) ++ pr_info("No spare slots, try to append..., " ++ "new mpc_length %x\n", count); ++ else { ++ pr_err("mpc_new_length %lx is too small\n", ++ mpc_new_length); ++ return -1; ++ } ++ } ++ ++ return 0; ++} + + static int __init replace_intsrc_all(struct mpc_table *mpc, + unsigned long mpc_new_phys, +@@ -842,77 +895,33 @@ static int __init replace_intsrc_all(st + { + #ifdef CONFIG_X86_IO_APIC + int i; +- int nr_m_spare = 0; + #endif +- + int count = sizeof(*mpc); ++ int nr_m_spare = 0; + unsigned char *mpt = ((unsigned char *)mpc) + count; + + printk(KERN_INFO "mpc_length %x\n", mpc->length); + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: +- { +- struct mpc_cpu *m = (struct mpc_cpu *)mpt; +- mpt += sizeof(*m); +- count += sizeof(*m); +- break; +- } ++ skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); ++ break; + case MP_BUS: +- { +- struct mpc_bus *m = (struct mpc_bus *)mpt; +- mpt += sizeof(*m); +- count += sizeof(*m); +- break; +- } ++ skip_entry(&mpt, &count, sizeof(struct mpc_bus)); ++ break; + case MP_IOAPIC: +- { +- mpt += sizeof(struct mpc_ioapic); +- count += sizeof(struct mpc_ioapic); +- break; +- } ++ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); ++ break; + case MP_INTSRC: +- { +-#ifdef CONFIG_X86_IO_APIC +- struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; +- +- printk(KERN_INFO "OLD "); +- print_MP_intsrc_info(m); +- i = get_MP_intsrc_index(m); +- if (i > 0) { +- assign_to_mpc_intsrc(&mp_irqs[i], m); +- printk(KERN_INFO "NEW 
"); +- print_mp_irq_info(&mp_irqs[i]); +- } else if (!i) { +- /* legacy, do nothing */ +- } else if (nr_m_spare < SPARE_SLOT_NUM) { +- /* +- * not found (-1), or duplicated (-2) +- * are invalid entries, +- * we need to use the slot later +- */ +- m_spare[nr_m_spare] = m; +- nr_m_spare++; +- } +-#endif +- mpt += sizeof(struct mpc_intsrc); +- count += sizeof(struct mpc_intsrc); +- break; +- } ++ check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); ++ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); ++ break; + case MP_LINTSRC: +- { +- struct mpc_lintsrc *m = +- (struct mpc_lintsrc *)mpt; +- mpt += sizeof(*m); +- count += sizeof(*m); +- break; +- } ++ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); ++ break; + default: + /* wrong mptable */ +- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); +- printk(KERN_ERR "type %x\n", *mpt); +- print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, +- 1, mpc, mpc->length, 1); ++ smp_dump_mptable(mpc, mpt); + goto out; + } + } +@@ -922,30 +931,22 @@ static int __init replace_intsrc_all(st + if (irq_used[i]) + continue; + +- if (mp_irqs[i].mp_irqtype != mp_INT) ++ if (mp_irqs[i].irqtype != mp_INT) + continue; + +- if (mp_irqs[i].mp_irqflag != 0x0f) ++ if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (nr_m_spare > 0) { +- printk(KERN_INFO "*NEW* found "); ++ apic_printk(APIC_VERBOSE, "*NEW* found\n"); + nr_m_spare--; + assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); +- if (!mpc_new_phys) { +- printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count); +- } else { +- if (count <= mpc_new_length) +- printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count); +- else { +- printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length); +- goto out; +- } +- } ++ if (!check_slot(mpc_new_phys, mpc_new_length, count)) ++ goto out; + assign_to_mpc_intsrc(&mp_irqs[i], m); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); +@@ -1001,7 +1002,7 @@ static int __init update_mp_table(void) + { + char str[16]; + char oem[10]; +- struct intel_mp_floating *mpf; ++ struct mpf_intel *mpf; + struct mpc_table *mpc, *mpc_new; + + if (!enable_update_mptable) +@@ -1014,19 +1015,19 @@ static int __init update_mp_table(void) + /* + * Now see if we need to go further. 
+ */ +- if (mpf->mpf_feature1 != 0) ++ if (mpf->feature1 != 0) + return 0; + +- if (!mpf->mpf_physptr) ++ if (!mpf->physptr) + return 0; + +- mpc = phys_to_virt(mpf->mpf_physptr); ++ mpc = phys_to_virt(mpf->physptr); + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + +- printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); +- printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); ++ printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf)); ++ printk(KERN_INFO "physptr: %x\n", mpf->physptr); + + if (mpc_new_phys && mpc->length > mpc_new_length) { + mpc_new_phys = 0; +@@ -1047,23 +1048,23 @@ static int __init update_mp_table(void) + } + printk(KERN_INFO "use in-positon replacing\n"); + } else { +- mpf->mpf_physptr = mpc_new_phys; ++ mpf->physptr = mpc_new_phys; + mpc_new = phys_to_virt(mpc_new_phys); + memcpy(mpc_new, mpc, mpc->length); + mpc = mpc_new; + /* check if we can modify that */ +- if (mpc_new_phys - mpf->mpf_physptr) { +- struct intel_mp_floating *mpf_new; ++ if (mpc_new_phys - mpf->physptr) { ++ struct mpf_intel *mpf_new; + /* steal 16 bytes from [0, 1k) */ + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + mpf_new = phys_to_virt(0x400 - 16); + memcpy(mpf_new, mpf, 16); + mpf = mpf_new; +- mpf->mpf_physptr = mpc_new_phys; ++ mpf->physptr = mpc_new_phys; + } +- mpf->mpf_checksum = 0; +- mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); +- printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); ++ mpf->checksum = 0; ++ mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); ++ printk(KERN_INFO "physptr new: %x\n", mpf->physptr); + } + + /* +Index: linux-2.6-tip/arch/x86/kernel/msr.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/msr.c ++++ linux-2.6-tip/arch/x86/kernel/msr.c +@@ -35,10 +35,10 @@ + #include + #include + #include ++#include + + #include + #include +-#include + #include + + static struct class *msr_class; +Index: linux-2.6-tip/arch/x86/kernel/nmi.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/nmi.c ++++ /dev/null +@@ -1,572 +0,0 @@ +-/* +- * NMI watchdog support on APIC systems +- * +- * Started by Ingo Molnar +- * +- * Fixes: +- * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. +- * Mikael Pettersson : Power Management for local APIC NMI watchdog. +- * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. +- * Pavel Machek and +- * Mikael Pettersson : PM converted to driver model. Disable/enable API. 
+- */ +- +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +- +-#include +- +-#include +- +-int unknown_nmi_panic; +-int nmi_watchdog_enabled; +- +-static cpumask_t backtrace_mask = CPU_MASK_NONE; +- +-/* nmi_active: +- * >0: the lapic NMI watchdog is active, but can be disabled +- * <0: the lapic NMI watchdog has not been set up, and cannot +- * be enabled +- * 0: the lapic NMI watchdog is disabled, but can be enabled +- */ +-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +-EXPORT_SYMBOL(nmi_active); +- +-unsigned int nmi_watchdog = NMI_NONE; +-EXPORT_SYMBOL(nmi_watchdog); +- +-static int panic_on_timeout; +- +-static unsigned int nmi_hz = HZ; +-static DEFINE_PER_CPU(short, wd_enabled); +-static int endflag __initdata; +- +-static inline unsigned int get_nmi_count(int cpu) +-{ +-#ifdef CONFIG_X86_64 +- return cpu_pda(cpu)->__nmi_count; +-#else +- return nmi_count(cpu); +-#endif +-} +- +-static inline int mce_in_progress(void) +-{ +-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +- return atomic_read(&mce_entry) > 0; +-#endif +- return 0; +-} +- +-/* +- * Take the local apic timer and PIT/HPET into account. We don't +- * know which one is active, when we have highres/dyntick on +- */ +-static inline unsigned int get_timer_irqs(int cpu) +-{ +-#ifdef CONFIG_X86_64 +- return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); +-#else +- return per_cpu(irq_stat, cpu).apic_timer_irqs + +- per_cpu(irq_stat, cpu).irq0_irqs; +-#endif +-} +- +-#ifdef CONFIG_SMP +-/* +- * The performance counters used by NMI_LOCAL_APIC don't trigger when +- * the CPU is idle. To make sure the NMI watchdog really ticks on all +- * CPUs during the test make them busy. +- */ +-static __init void nmi_cpu_busy(void *data) +-{ +- local_irq_enable_in_hardirq(); +- /* +- * Intentionally don't use cpu_relax here. This is +- * to make sure that the performance counter really ticks, +- * even if there is a simulator or similar that catches the +- * pause instruction. On a real HT machine this is fine because +- * all other CPUs are busy with "useless" delay loops and don't +- * care if they get somewhat less cycles. +- */ +- while (endflag == 0) +- mb(); +-} +-#endif +- +-static void report_broken_nmi(int cpu, int *prev_nmi_count) +-{ +- printk(KERN_CONT "\n"); +- +- printk(KERN_WARNING +- "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", +- cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); +- +- printk(KERN_WARNING +- "Please report this to bugzilla.kernel.org,\n"); +- printk(KERN_WARNING +- "and attach the output of the 'dmesg' command.\n"); +- +- per_cpu(wd_enabled, cpu) = 0; +- atomic_dec(&nmi_active); +-} +- +-static void __acpi_nmi_disable(void *__unused) +-{ +- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +-} +- +-int __init check_nmi_watchdog(void) +-{ +- unsigned int *prev_nmi_count; +- int cpu; +- +- if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) +- return 0; +- +- prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); +- if (!prev_nmi_count) +- goto error; +- +- printk(KERN_INFO "Testing NMI watchdog ... 
"); +- +-#ifdef CONFIG_SMP +- if (nmi_watchdog == NMI_LOCAL_APIC) +- smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); +-#endif +- +- for_each_possible_cpu(cpu) +- prev_nmi_count[cpu] = get_nmi_count(cpu); +- local_irq_enable(); +- mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ +- +- for_each_online_cpu(cpu) { +- if (!per_cpu(wd_enabled, cpu)) +- continue; +- if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) +- report_broken_nmi(cpu, prev_nmi_count); +- } +- endflag = 1; +- if (!atomic_read(&nmi_active)) { +- kfree(prev_nmi_count); +- atomic_set(&nmi_active, -1); +- goto error; +- } +- printk("OK.\n"); +- +- /* +- * now that we know it works we can reduce NMI frequency to +- * something more reasonable; makes a difference in some configs +- */ +- if (nmi_watchdog == NMI_LOCAL_APIC) +- nmi_hz = lapic_adjust_nmi_hz(1); +- +- kfree(prev_nmi_count); +- return 0; +-error: +- if (nmi_watchdog == NMI_IO_APIC) { +- if (!timer_through_8259) +- disable_8259A_irq(0); +- on_each_cpu(__acpi_nmi_disable, NULL, 1); +- } +- +-#ifdef CONFIG_X86_32 +- timer_ack = 0; +-#endif +- return -1; +-} +- +-static int __init setup_nmi_watchdog(char *str) +-{ +- unsigned int nmi; +- +- if (!strncmp(str, "panic", 5)) { +- panic_on_timeout = 1; +- str = strchr(str, ','); +- if (!str) +- return 1; +- ++str; +- } +- +- if (!strncmp(str, "lapic", 5)) +- nmi_watchdog = NMI_LOCAL_APIC; +- else if (!strncmp(str, "ioapic", 6)) +- nmi_watchdog = NMI_IO_APIC; +- else { +- get_option(&str, &nmi); +- if (nmi >= NMI_INVALID) +- return 0; +- nmi_watchdog = nmi; +- } +- +- return 1; +-} +-__setup("nmi_watchdog=", setup_nmi_watchdog); +- +-/* +- * Suspend/resume support +- */ +-#ifdef CONFIG_PM +- +-static int nmi_pm_active; /* nmi_active before suspend */ +- +-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) +-{ +- /* only CPU0 goes here, other CPUs should be offline */ +- nmi_pm_active = atomic_read(&nmi_active); +- stop_apic_nmi_watchdog(NULL); +- BUG_ON(atomic_read(&nmi_active) != 0); +- return 0; +-} +- +-static int lapic_nmi_resume(struct sys_device *dev) +-{ +- /* only CPU0 goes here, other CPUs should be offline */ +- if (nmi_pm_active > 0) { +- setup_apic_nmi_watchdog(NULL); +- touch_nmi_watchdog(); +- } +- return 0; +-} +- +-static struct sysdev_class nmi_sysclass = { +- .name = "lapic_nmi", +- .resume = lapic_nmi_resume, +- .suspend = lapic_nmi_suspend, +-}; +- +-static struct sys_device device_lapic_nmi = { +- .id = 0, +- .cls = &nmi_sysclass, +-}; +- +-static int __init init_lapic_nmi_sysfs(void) +-{ +- int error; +- +- /* +- * should really be a BUG_ON but b/c this is an +- * init call, it just doesn't work. 
-dcz +- */ +- if (nmi_watchdog != NMI_LOCAL_APIC) +- return 0; +- +- if (atomic_read(&nmi_active) < 0) +- return 0; +- +- error = sysdev_class_register(&nmi_sysclass); +- if (!error) +- error = sysdev_register(&device_lapic_nmi); +- return error; +-} +- +-/* must come after the local APIC's device_initcall() */ +-late_initcall(init_lapic_nmi_sysfs); +- +-#endif /* CONFIG_PM */ +- +-static void __acpi_nmi_enable(void *__unused) +-{ +- apic_write(APIC_LVT0, APIC_DM_NMI); +-} +- +-/* +- * Enable timer based NMIs on all CPUs: +- */ +-void acpi_nmi_enable(void) +-{ +- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) +- on_each_cpu(__acpi_nmi_enable, NULL, 1); +-} +- +-/* +- * Disable timer based NMIs on all CPUs: +- */ +-void acpi_nmi_disable(void) +-{ +- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) +- on_each_cpu(__acpi_nmi_disable, NULL, 1); +-} +- +-/* +- * This function is called as soon the LAPIC NMI watchdog driver has everything +- * in place and it's ready to check if the NMIs belong to the NMI watchdog +- */ +-void cpu_nmi_set_wd_enabled(void) +-{ +- __get_cpu_var(wd_enabled) = 1; +-} +- +-void setup_apic_nmi_watchdog(void *unused) +-{ +- if (__get_cpu_var(wd_enabled)) +- return; +- +- /* cheap hack to support suspend/resume */ +- /* if cpu0 is not active neither should the other cpus */ +- if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) +- return; +- +- switch (nmi_watchdog) { +- case NMI_LOCAL_APIC: +- if (lapic_watchdog_init(nmi_hz) < 0) { +- __get_cpu_var(wd_enabled) = 0; +- return; +- } +- /* FALL THROUGH */ +- case NMI_IO_APIC: +- __get_cpu_var(wd_enabled) = 1; +- atomic_inc(&nmi_active); +- } +-} +- +-void stop_apic_nmi_watchdog(void *unused) +-{ +- /* only support LOCAL and IO APICs for now */ +- if (!nmi_watchdog_active()) +- return; +- if (__get_cpu_var(wd_enabled) == 0) +- return; +- if (nmi_watchdog == NMI_LOCAL_APIC) +- lapic_watchdog_stop(); +- else +- __acpi_nmi_disable(NULL); +- __get_cpu_var(wd_enabled) = 0; +- atomic_dec(&nmi_active); +-} +- +-/* +- * the best way to detect whether a CPU has a 'hard lockup' problem +- * is to check it's local APIC timer IRQ counts. If they are not +- * changing then that CPU has some problem. +- * +- * as these watchdog NMI IRQs are generated on every CPU, we only +- * have to check the current processor. +- * +- * since NMIs don't listen to _any_ locks, we have to be extremely +- * careful not to rely on unsafe variables. The printk might lock +- * up though, so we have to break up any console locks first ... +- * [when there will be more tty-related locks, break them up here too!] +- */ +- +-static DEFINE_PER_CPU(unsigned, last_irq_sum); +-static DEFINE_PER_CPU(local_t, alert_counter); +-static DEFINE_PER_CPU(int, nmi_touch); +- +-void touch_nmi_watchdog(void) +-{ +- if (nmi_watchdog_active()) { +- unsigned cpu; +- +- /* +- * Tell other CPUs to reset their alert counters. We cannot +- * do it ourselves because the alert count increase is not +- * atomic. +- */ +- for_each_present_cpu(cpu) { +- if (per_cpu(nmi_touch, cpu) != 1) +- per_cpu(nmi_touch, cpu) = 1; +- } +- } +- +- /* +- * Tickle the softlockup detector too: +- */ +- touch_softlockup_watchdog(); +-} +-EXPORT_SYMBOL(touch_nmi_watchdog); +- +-notrace __kprobes int +-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) +-{ +- /* +- * Since current_thread_info()-> is always on the stack, and we +- * always switch the stack NMI-atomically, it's safe to use +- * smp_processor_id(). 
+- */ +- unsigned int sum; +- int touched = 0; +- int cpu = smp_processor_id(); +- int rc = 0; +- +- /* check for other users first */ +- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) +- == NOTIFY_STOP) { +- rc = 1; +- touched = 1; +- } +- +- sum = get_timer_irqs(cpu); +- +- if (__get_cpu_var(nmi_touch)) { +- __get_cpu_var(nmi_touch) = 0; +- touched = 1; +- } +- +- if (cpu_isset(cpu, backtrace_mask)) { +- static DEFINE_SPINLOCK(lock); /* Serialise the printks */ +- +- spin_lock(&lock); +- printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); +- dump_stack(); +- spin_unlock(&lock); +- cpu_clear(cpu, backtrace_mask); +- } +- +- /* Could check oops_in_progress here too, but it's safer not to */ +- if (mce_in_progress()) +- touched = 1; +- +- /* if the none of the timers isn't firing, this cpu isn't doing much */ +- if (!touched && __get_cpu_var(last_irq_sum) == sum) { +- /* +- * Ayiee, looks like this CPU is stuck ... +- * wait a few IRQs (5 seconds) before doing the oops ... +- */ +- local_inc(&__get_cpu_var(alert_counter)); +- if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) +- /* +- * die_nmi will return ONLY if NOTIFY_STOP happens.. +- */ +- die_nmi("BUG: NMI Watchdog detected LOCKUP", +- regs, panic_on_timeout); +- } else { +- __get_cpu_var(last_irq_sum) = sum; +- local_set(&__get_cpu_var(alert_counter), 0); +- } +- +- /* see if the nmi watchdog went off */ +- if (!__get_cpu_var(wd_enabled)) +- return rc; +- switch (nmi_watchdog) { +- case NMI_LOCAL_APIC: +- rc |= lapic_wd_event(nmi_hz); +- break; +- case NMI_IO_APIC: +- /* +- * don't know how to accurately check for this. +- * just assume it was a watchdog timer interrupt +- * This matches the old behaviour. +- */ +- rc = 1; +- break; +- } +- return rc; +-} +- +-#ifdef CONFIG_SYSCTL +- +-static void enable_ioapic_nmi_watchdog_single(void *unused) +-{ +- __get_cpu_var(wd_enabled) = 1; +- atomic_inc(&nmi_active); +- __acpi_nmi_enable(NULL); +-} +- +-static void enable_ioapic_nmi_watchdog(void) +-{ +- on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); +- touch_nmi_watchdog(); +-} +- +-static void disable_ioapic_nmi_watchdog(void) +-{ +- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); +-} +- +-static int __init setup_unknown_nmi_panic(char *str) +-{ +- unknown_nmi_panic = 1; +- return 1; +-} +-__setup("unknown_nmi_panic", setup_unknown_nmi_panic); +- +-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +-{ +- unsigned char reason = get_nmi_reason(); +- char buf[64]; +- +- sprintf(buf, "NMI received for unknown reason %02x\n", reason); +- die_nmi(buf, regs, 1); /* Always panic here */ +- return 0; +-} +- +-/* +- * proc handler for /proc/sys/kernel/nmi +- */ +-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, +- void __user *buffer, size_t *length, loff_t *ppos) +-{ +- int old_state; +- +- nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 
1 : 0; +- old_state = nmi_watchdog_enabled; +- proc_dointvec(table, write, file, buffer, length, ppos); +- if (!!old_state == !!nmi_watchdog_enabled) +- return 0; +- +- if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { +- printk(KERN_WARNING +- "NMI watchdog is permanently disabled\n"); +- return -EIO; +- } +- +- if (nmi_watchdog == NMI_LOCAL_APIC) { +- if (nmi_watchdog_enabled) +- enable_lapic_nmi_watchdog(); +- else +- disable_lapic_nmi_watchdog(); +- } else if (nmi_watchdog == NMI_IO_APIC) { +- if (nmi_watchdog_enabled) +- enable_ioapic_nmi_watchdog(); +- else +- disable_ioapic_nmi_watchdog(); +- } else { +- printk(KERN_WARNING +- "NMI watchdog doesn't know what hardware to touch\n"); +- return -EIO; +- } +- return 0; +-} +- +-#endif /* CONFIG_SYSCTL */ +- +-int do_nmi_callback(struct pt_regs *regs, int cpu) +-{ +-#ifdef CONFIG_SYSCTL +- if (unknown_nmi_panic) +- return unknown_nmi_panic_callback(regs, cpu); +-#endif +- return 0; +-} +- +-void __trigger_all_cpu_backtrace(void) +-{ +- int i; +- +- backtrace_mask = cpu_online_map; +- /* Wait for up to 10 seconds for all CPUs to do the backtrace */ +- for (i = 0; i < 10 * 1000; i++) { +- if (cpus_empty(backtrace_mask)) +- break; +- mdelay(1); +- } +-} +Index: linux-2.6-tip/arch/x86/kernel/numaq_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/numaq_32.c ++++ /dev/null +@@ -1,293 +0,0 @@ +-/* +- * Written by: Patricia Gaughen, IBM Corporation +- * +- * Copyright (C) 2002, IBM Corp. +- * +- * All rights reserved. +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License as published by +- * the Free Software Foundation; either version 2 of the License, or +- * (at your option) any later version. +- * +- * This program is distributed in the hope that it will be useful, but +- * WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or +- * NON INFRINGEMENT. See the GNU General Public License for more +- * details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program; if not, write to the Free Software +- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +- * +- * Send feedback to +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) +- +-/* +- * Function: smp_dump_qct() +- * +- * Description: gets memory layout from the quad config table. This +- * function also updates node_online_map with the nodes (quads) present. 
+- */ +-static void __init smp_dump_qct(void) +-{ +- int node; +- struct eachquadmem *eq; +- struct sys_cfg_data *scd = +- (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR); +- +- nodes_clear(node_online_map); +- for_each_node(node) { +- if (scd->quads_present31_0 & (1 << node)) { +- node_set_online(node); +- eq = &scd->eq[node]; +- /* Convert to pages */ +- node_start_pfn[node] = MB_TO_PAGES( +- eq->hi_shrd_mem_start - eq->priv_mem_size); +- node_end_pfn[node] = MB_TO_PAGES( +- eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); +- +- e820_register_active_regions(node, node_start_pfn[node], +- node_end_pfn[node]); +- memory_present(node, +- node_start_pfn[node], node_end_pfn[node]); +- node_remap_size[node] = node_memmap_size_bytes(node, +- node_start_pfn[node], +- node_end_pfn[node]); +- } +- } +-} +- +- +-void __cpuinit numaq_tsc_disable(void) +-{ +- if (!found_numaq) +- return; +- +- if (num_online_nodes() > 1) { +- printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); +- setup_clear_cpu_cap(X86_FEATURE_TSC); +- } +-} +- +-static int __init numaq_pre_time_init(void) +-{ +- numaq_tsc_disable(); +- return 0; +-} +- +-int found_numaq; +-/* +- * Have to match translation table entries to main table entries by counter +- * hence the mpc_record variable .... can't see a less disgusting way of +- * doing this .... +- */ +-struct mpc_config_translation { +- unsigned char mpc_type; +- unsigned char trans_len; +- unsigned char trans_type; +- unsigned char trans_quad; +- unsigned char trans_global; +- unsigned char trans_local; +- unsigned short trans_reserved; +-}; +- +-/* x86_quirks member */ +-static int mpc_record; +-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] +- __cpuinitdata; +- +-static inline int generate_logical_apicid(int quad, int phys_apicid) +-{ +- return (quad << 4) + (phys_apicid ? 
phys_apicid << 1 : 1); +-} +- +-/* x86_quirks member */ +-static int mpc_apic_id(struct mpc_cpu *m) +-{ +- int quad = translation_table[mpc_record]->trans_quad; +- int logical_apicid = generate_logical_apicid(quad, m->apicid); +- +- printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", +- m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, +- (m->cpufeature & CPU_MODEL_MASK) >> 4, +- m->apicver, quad, logical_apicid); +- return logical_apicid; +-} +- +-int mp_bus_id_to_node[MAX_MP_BUSSES]; +- +-int mp_bus_id_to_local[MAX_MP_BUSSES]; +- +-/* x86_quirks member */ +-static void mpc_oem_bus_info(struct mpc_bus *m, char *name) +-{ +- int quad = translation_table[mpc_record]->trans_quad; +- int local = translation_table[mpc_record]->trans_local; +- +- mp_bus_id_to_node[m->busid] = quad; +- mp_bus_id_to_local[m->busid] = local; +- printk(KERN_INFO "Bus #%d is %s (node %d)\n", +- m->busid, name, quad); +-} +- +-int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +- +-/* x86_quirks member */ +-static void mpc_oem_pci_bus(struct mpc_bus *m) +-{ +- int quad = translation_table[mpc_record]->trans_quad; +- int local = translation_table[mpc_record]->trans_local; +- +- quad_local_to_mp_bus_id[quad][local] = m->busid; +-} +- +-static void __init MP_translation_info(struct mpc_config_translation *m) +-{ +- printk(KERN_INFO +- "Translation: record %d, type %d, quad %d, global %d, local %d\n", +- mpc_record, m->trans_type, m->trans_quad, m->trans_global, +- m->trans_local); +- +- if (mpc_record >= MAX_MPC_ENTRY) +- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); +- else +- translation_table[mpc_record] = m; /* stash this for later */ +- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) +- node_set_online(m->trans_quad); +-} +- +-static int __init mpf_checksum(unsigned char *mp, int len) +-{ +- int sum = 0; +- +- while (len--) +- sum += *mp++; +- +- return sum & 0xFF; +-} +- +-/* +- * Read/parse the MPC oem tables +- */ +- +-static void __init smp_read_mpc_oem(struct mpc_oemtable *oemtable, +- unsigned short oemsize) +-{ +- int count = sizeof(*oemtable); /* the header size */ +- unsigned char *oemptr = ((unsigned char *)oemtable) + count; +- +- mpc_record = 0; +- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", +- oemtable); +- if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { +- printk(KERN_WARNING +- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", +- oemtable->signature[0], oemtable->signature[1], +- oemtable->signature[2], oemtable->signature[3]); +- return; +- } +- if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) { +- printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); +- return; +- } +- while (count < oemtable->length) { +- switch (*oemptr) { +- case MP_TRANSLATION: +- { +- struct mpc_config_translation *m = +- (struct mpc_config_translation *)oemptr; +- MP_translation_info(m); +- oemptr += sizeof(*m); +- count += sizeof(*m); +- ++mpc_record; +- break; +- } +- default: +- { +- printk(KERN_WARNING +- "Unrecognised OEM table entry type! 
- %d\n", +- (int)*oemptr); +- return; +- } +- } +- } +-} +- +-static int __init numaq_setup_ioapic_ids(void) +-{ +- /* so can skip it */ +- return 1; +-} +- +-static int __init numaq_update_genapic(void) +-{ +- genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; +- +- return 0; +-} +- +-static struct x86_quirks numaq_x86_quirks __initdata = { +- .arch_pre_time_init = numaq_pre_time_init, +- .arch_time_init = NULL, +- .arch_pre_intr_init = NULL, +- .arch_memory_setup = NULL, +- .arch_intr_init = NULL, +- .arch_trap_init = NULL, +- .mach_get_smp_config = NULL, +- .mach_find_smp_config = NULL, +- .mpc_record = &mpc_record, +- .mpc_apic_id = mpc_apic_id, +- .mpc_oem_bus_info = mpc_oem_bus_info, +- .mpc_oem_pci_bus = mpc_oem_pci_bus, +- .smp_read_mpc_oem = smp_read_mpc_oem, +- .setup_ioapic_ids = numaq_setup_ioapic_ids, +- .update_genapic = numaq_update_genapic, +-}; +- +-void numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) +-{ +- if (strncmp(oem, "IBM NUMA", 8)) +- printk("Warning! Not a NUMA-Q system!\n"); +- else +- found_numaq = 1; +-} +- +-static __init void early_check_numaq(void) +-{ +- /* +- * Find possible boot-time SMP configuration: +- */ +- early_find_smp_config(); +- /* +- * get boot-time SMP configuration: +- */ +- if (smp_found_config) +- early_get_smp_config(); +- +- if (found_numaq) +- x86_quirks = &numaq_x86_quirks; +-} +- +-int __init get_memcfg_numaq(void) +-{ +- early_check_numaq(); +- if (!found_numaq) +- return 0; +- smp_dump_qct(); +- return 1; +-} +Index: linux-2.6-tip/arch/x86/kernel/paravirt-spinlocks.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/paravirt-spinlocks.c ++++ linux-2.6-tip/arch/x86/kernel/paravirt-spinlocks.c +@@ -8,7 +8,7 @@ + #include + + static inline void +-default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) ++default_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) + { + __raw_spin_lock(lock); + } +@@ -26,13 +26,3 @@ struct pv_lock_ops pv_lock_ops = { + }; + EXPORT_SYMBOL(pv_lock_ops); + +-void __init paravirt_use_bytelocks(void) +-{ +-#ifdef CONFIG_SMP +- pv_lock_ops.spin_is_locked = __byte_spin_is_locked; +- pv_lock_ops.spin_is_contended = __byte_spin_is_contended; +- pv_lock_ops.spin_lock = __byte_spin_lock; +- pv_lock_ops.spin_trylock = __byte_spin_trylock; +- pv_lock_ops.spin_unlock = __byte_spin_unlock; +-#endif +-} +Index: linux-2.6-tip/arch/x86/kernel/paravirt.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/paravirt.c ++++ linux-2.6-tip/arch/x86/kernel/paravirt.c +@@ -28,7 +28,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -44,6 +43,17 @@ void _paravirt_nop(void) + { + } + ++/* identity function, which can be inlined */ ++u32 _paravirt_ident_32(u32 x) ++{ ++ return x; ++} ++ ++u64 _paravirt_ident_64(u64 x) ++{ ++ return x; ++} ++ + static void __init default_banner(void) + { + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", +@@ -138,9 +148,16 @@ unsigned paravirt_patch_default(u8 type, + if (opfunc == NULL) + /* If there's no function, patch it with a ud2a (BUG) */ + ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); +- else if (opfunc == paravirt_nop) ++ else if (opfunc == _paravirt_nop) + /* If the operation is a nop, then nop the callsite */ + ret = paravirt_patch_nop(); ++ ++ /* identity functions just return their single argument */ ++ else if (opfunc == _paravirt_ident_32) ++ ret = 
paravirt_patch_ident_32(insnbuf, len); ++ else if (opfunc == _paravirt_ident_64) ++ ret = paravirt_patch_ident_64(insnbuf, len); ++ + else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || + type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || +@@ -318,10 +335,10 @@ struct pv_time_ops pv_time_ops = { + + struct pv_irq_ops pv_irq_ops = { + .init_IRQ = native_init_IRQ, +- .save_fl = native_save_fl, +- .restore_fl = native_restore_fl, +- .irq_disable = native_irq_disable, +- .irq_enable = native_irq_enable, ++ .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), ++ .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), ++ .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), ++ .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .safe_halt = native_safe_halt, + .halt = native_halt, + #ifdef CONFIG_X86_64 +@@ -399,6 +416,28 @@ struct pv_apic_ops pv_apic_ops = { + #endif + }; + ++#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) ++/* 32-bit pagetable entries */ ++#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) ++#else ++/* 64-bit pagetable entries */ ++#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) ++#endif ++ ++#ifdef CONFIG_HIGHPTE ++/* ++ * kmap_atomic() might be an inline or a macro: ++ */ ++static void *kmap_atomic_func(struct page *page, enum km_type idx) ++{ ++ return kmap_atomic(page, idx); ++} ++static void *kmap_atomic_direct_func(struct page *page, enum km_type idx) ++{ ++ return kmap_atomic_direct(page, idx); ++} ++#endif ++ + struct pv_mmu_ops pv_mmu_ops = { + #ifndef CONFIG_X86_64 + .pagetable_setup_start = native_pagetable_setup_start, +@@ -439,33 +478,34 @@ struct pv_mmu_ops pv_mmu_ops = { + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + + #ifdef CONFIG_HIGHPTE +- .kmap_atomic_pte = kmap_atomic, ++ .kmap_atomic_pte = kmap_atomic_func, ++ .kmap_atomic_pte_direct = kmap_atomic_direct_func, + #endif + + #if PAGETABLE_LEVELS >= 3 + #ifdef CONFIG_X86_PAE + .set_pte_atomic = native_set_pte_atomic, +- .set_pte_present = native_set_pte_present, + .pte_clear = native_pte_clear, + .pmd_clear = native_pmd_clear, + #endif + .set_pud = native_set_pud, +- .pmd_val = native_pmd_val, +- .make_pmd = native_make_pmd, ++ ++ .pmd_val = PTE_IDENT, ++ .make_pmd = PTE_IDENT, + + #if PAGETABLE_LEVELS == 4 +- .pud_val = native_pud_val, +- .make_pud = native_make_pud, ++ .pud_val = PTE_IDENT, ++ .make_pud = PTE_IDENT, ++ + .set_pgd = native_set_pgd, + #endif + #endif /* PAGETABLE_LEVELS >= 3 */ + +- .pte_val = native_pte_val, +- .pte_flags = native_pte_flags, +- .pgd_val = native_pgd_val, ++ .pte_val = PTE_IDENT, ++ .pgd_val = PTE_IDENT, + +- .make_pte = native_make_pte, +- .make_pgd = native_make_pgd, ++ .make_pte = PTE_IDENT, ++ .make_pgd = PTE_IDENT, + + .dup_mmap = paravirt_nop, + .exit_mmap = paravirt_nop, +Index: linux-2.6-tip/arch/x86/kernel/paravirt_patch_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/paravirt_patch_32.c ++++ linux-2.6-tip/arch/x86/kernel/paravirt_patch_32.c +@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %c + DEF_NATIVE(pv_cpu_ops, clts, "clts"); + DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + ++unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) ++{ ++ /* arg in %eax, return in %eax */ ++ return 0; ++} ++ ++unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) ++{ ++ /* arg in %edx:%eax, return in %edx:%eax */ ++ return 0; ++} ++ + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + 
unsigned long addr, unsigned len) + { +Index: linux-2.6-tip/arch/x86/kernel/paravirt_patch_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/paravirt_patch_64.c ++++ linux-2.6-tip/arch/x86/kernel/paravirt_patch_64.c +@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, + DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); + DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); + ++DEF_NATIVE(, mov32, "mov %edi, %eax"); ++DEF_NATIVE(, mov64, "mov %rdi, %rax"); ++ ++unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) ++{ ++ return paravirt_patch_insns(insnbuf, len, ++ start__mov32, end__mov32); ++} ++ ++unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) ++{ ++ return paravirt_patch_insns(insnbuf, len, ++ start__mov64, end__mov64); ++} ++ + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) + { +Index: linux-2.6-tip/arch/x86/kernel/pci-calgary_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/pci-calgary_64.c ++++ linux-2.6-tip/arch/x86/kernel/pci-calgary_64.c +@@ -380,8 +380,9 @@ static inline struct iommu_table *find_i + return tbl; + } + +-static void calgary_unmap_sg(struct device *dev, +- struct scatterlist *sglist, int nelems, int direction) ++static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, ++ int nelems,enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct iommu_table *tbl = find_iommu_table(dev); + struct scatterlist *s; +@@ -404,7 +405,8 @@ static void calgary_unmap_sg(struct devi + } + + static int calgary_map_sg(struct device *dev, struct scatterlist *sg, +- int nelems, int direction) ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct iommu_table *tbl = find_iommu_table(dev); + struct scatterlist *s; +@@ -429,15 +431,14 @@ static int calgary_map_sg(struct device + s->dma_address = (entry << PAGE_SHIFT) | s->offset; + + /* insert into HW table */ +- tce_build(tbl, entry, npages, vaddr & PAGE_MASK, +- direction); ++ tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir); + + s->dma_length = s->length; + } + + return nelems; + error: +- calgary_unmap_sg(dev, sg, nelems, direction); ++ calgary_unmap_sg(dev, sg, nelems, dir, NULL); + for_each_sg(sg, s, nelems, i) { + sg->dma_address = bad_dma_address; + sg->dma_length = 0; +@@ -445,10 +446,12 @@ error: + return 0; + } + +-static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, +- size_t size, int direction) ++static dma_addr_t calgary_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { +- void *vaddr = phys_to_virt(paddr); ++ void *vaddr = page_address(page) + offset; + unsigned long uaddr; + unsigned int npages; + struct iommu_table *tbl = find_iommu_table(dev); +@@ -456,17 +459,18 @@ static dma_addr_t calgary_map_single(str + uaddr = (unsigned long)vaddr; + npages = iommu_num_pages(uaddr, size, PAGE_SIZE); + +- return iommu_alloc(dev, tbl, vaddr, npages, direction); ++ return iommu_alloc(dev, tbl, vaddr, npages, dir); + } + +-static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, +- size_t size, int direction) ++static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct iommu_table *tbl = find_iommu_table(dev); + unsigned int npages; + 
+- npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); +- iommu_free(tbl, dma_handle, npages); ++ npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); ++ iommu_free(tbl, dma_addr, npages); + } + + static void* calgary_alloc_coherent(struct device *dev, size_t size, +@@ -515,13 +519,13 @@ static void calgary_free_coherent(struct + free_pages((unsigned long)vaddr, get_order(size)); + } + +-static struct dma_mapping_ops calgary_dma_ops = { ++static struct dma_map_ops calgary_dma_ops = { + .alloc_coherent = calgary_alloc_coherent, + .free_coherent = calgary_free_coherent, +- .map_single = calgary_map_single, +- .unmap_single = calgary_unmap_single, + .map_sg = calgary_map_sg, + .unmap_sg = calgary_unmap_sg, ++ .map_page = calgary_map_page, ++ .unmap_page = calgary_unmap_page, + }; + + static inline void __iomem * busno_to_bbar(unsigned char num) +Index: linux-2.6-tip/arch/x86/kernel/pci-dma.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/pci-dma.c ++++ linux-2.6-tip/arch/x86/kernel/pci-dma.c +@@ -1,4 +1,5 @@ + #include ++#include + #include + #include + #include +@@ -12,7 +13,7 @@ + + static int forbid_dac __read_mostly; + +-struct dma_mapping_ops *dma_ops; ++struct dma_map_ops *dma_ops; + EXPORT_SYMBOL(dma_ops); + + static int iommu_sac_force __read_mostly; +@@ -44,6 +45,9 @@ struct device x86_dma_fallback_dev = { + }; + EXPORT_SYMBOL(x86_dma_fallback_dev); + ++/* Number of entries preallocated for DMA-API debugging */ ++#define PREALLOC_DMA_DEBUG_ENTRIES 32768 ++ + int dma_set_mask(struct device *dev, u64 mask) + { + if (!dev->dma_mask || !dma_supported(dev, mask)) +@@ -224,7 +228,7 @@ early_param("iommu", iommu_setup); + + int dma_supported(struct device *dev, u64 mask) + { +- struct dma_mapping_ops *ops = get_dma_ops(dev); ++ struct dma_map_ops *ops = get_dma_ops(dev); + + #ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { +@@ -265,6 +269,12 @@ EXPORT_SYMBOL(dma_supported); + + static int __init pci_iommu_init(void) + { ++ dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); ++ ++#ifdef CONFIG_PCI ++ dma_debug_add_bus(&pci_bus_type); ++#endif ++ + calgary_iommu_init(); + + intel_iommu_init(); +Index: linux-2.6-tip/arch/x86/kernel/pci-gart_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/pci-gart_64.c ++++ linux-2.6-tip/arch/x86/kernel/pci-gart_64.c +@@ -255,10 +255,13 @@ static dma_addr_t dma_map_area(struct de + } + + /* Map a single area into the IOMMU */ +-static dma_addr_t +-gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) ++static dma_addr_t gart_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + unsigned long bus; ++ phys_addr_t paddr = page_to_phys(page) + offset; + + if (!dev) + dev = &x86_dma_fallback_dev; +@@ -275,8 +278,9 @@ gart_map_single(struct device *dev, phys + /* + * Free a DMA mapping. + */ +-static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, +- size_t size, int direction) ++static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + unsigned long iommu_page; + int npages; +@@ -298,8 +302,8 @@ static void gart_unmap_single(struct dev + /* + * Wrapper for pci_unmap_single working with scatterlists. 
+ */ +-static void +-gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) ++static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, ++ enum dma_data_direction dir, struct dma_attrs *attrs) + { + struct scatterlist *s; + int i; +@@ -307,7 +311,7 @@ gart_unmap_sg(struct device *dev, struct + for_each_sg(sg, s, nents, i) { + if (!s->dma_length || !s->length) + break; +- gart_unmap_single(dev, s->dma_address, s->dma_length, dir); ++ gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); + } + } + +@@ -329,7 +333,7 @@ static int dma_map_sg_nonforce(struct de + addr = dma_map_area(dev, addr, s->length, dir, 0); + if (addr == bad_dma_address) { + if (i > 0) +- gart_unmap_sg(dev, sg, i, dir); ++ gart_unmap_sg(dev, sg, i, dir, NULL); + nents = 0; + sg[0].dma_length = 0; + break; +@@ -400,8 +404,8 @@ dma_map_cont(struct device *dev, struct + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +-static int +-gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) ++static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, ++ enum dma_data_direction dir, struct dma_attrs *attrs) + { + struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; +@@ -468,7 +472,7 @@ gart_map_sg(struct device *dev, struct s + + error: + flush_gart(); +- gart_unmap_sg(dev, sg, out, dir); ++ gart_unmap_sg(dev, sg, out, dir, NULL); + + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { +@@ -521,7 +525,7 @@ static void + gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) + { +- gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL); ++ gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); + free_pages((unsigned long)vaddr, get_order(size)); + } + +@@ -707,11 +711,11 @@ static __init int init_k8_gatt(struct ag + return -1; + } + +-static struct dma_mapping_ops gart_dma_ops = { +- .map_single = gart_map_single, +- .unmap_single = gart_unmap_single, ++static struct dma_map_ops gart_dma_ops = { + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, ++ .map_page = gart_map_page, ++ .unmap_page = gart_unmap_page, + .alloc_coherent = gart_alloc_coherent, + .free_coherent = gart_free_coherent, + }; +Index: linux-2.6-tip/arch/x86/kernel/pci-nommu.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/pci-nommu.c ++++ linux-2.6-tip/arch/x86/kernel/pci-nommu.c +@@ -1,14 +1,14 @@ + /* Fallback functions when the main IOMMU code is not compiled in. This + code is roughly equivalent to i386. 
*/ +-#include +-#include +-#include +-#include + #include + #include ++#include ++#include ++#include ++#include + +-#include + #include ++#include + #include + + static int +@@ -25,19 +25,19 @@ check_addr(char *name, struct device *hw + return 1; + } + +-static dma_addr_t +-nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, +- int direction) ++static dma_addr_t nommu_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { +- dma_addr_t bus = paddr; ++ dma_addr_t bus = page_to_phys(page) + offset; + WARN_ON(size == 0); +- if (!check_addr("map_single", hwdev, bus, size)) +- return bad_dma_address; ++ if (!check_addr("map_single", dev, bus, size)) ++ return bad_dma_address; + flush_write_buffers(); + return bus; + } + +- + /* Map a set of buffers described by scatterlist in streaming + * mode for DMA. This is the scatter-gather version of the + * above pci_map_single interface. Here the scatter gather list +@@ -54,7 +54,8 @@ nommu_map_single(struct device *hwdev, p + * the same here. + */ + static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, +- int nents, int direction) ++ int nents, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct scatterlist *s; + int i; +@@ -78,12 +79,12 @@ static void nommu_free_coherent(struct d + free_pages((unsigned long)vaddr, get_order(size)); + } + +-struct dma_mapping_ops nommu_dma_ops = { +- .alloc_coherent = dma_generic_alloc_coherent, +- .free_coherent = nommu_free_coherent, +- .map_single = nommu_map_single, +- .map_sg = nommu_map_sg, +- .is_phys = 1, ++struct dma_map_ops nommu_dma_ops = { ++ .alloc_coherent = dma_generic_alloc_coherent, ++ .free_coherent = nommu_free_coherent, ++ .map_sg = nommu_map_sg, ++ .map_page = nommu_map_page, ++ .is_phys = 1, + }; + + void __init no_iommu_init(void) +Index: linux-2.6-tip/arch/x86/kernel/pci-swiotlb.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/pci-swiotlb.c +@@ -0,0 +1,84 @@ ++/* Glue code to lib/swiotlb.c */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++int swiotlb __read_mostly; ++ ++void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) ++{ ++ return alloc_bootmem_low_pages(size); ++} ++ ++void *swiotlb_alloc(unsigned order, unsigned long nslabs) ++{ ++ return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); ++} ++ ++dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) ++{ ++ return paddr; ++} ++ ++phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) ++{ ++ return baddr; ++} ++ ++int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) ++{ ++ return 0; ++} ++ ++static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, ++ dma_addr_t *dma_handle, gfp_t flags) ++{ ++ void *vaddr; ++ ++ vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags); ++ if (vaddr) ++ return vaddr; ++ ++ return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); ++} ++ ++struct dma_map_ops swiotlb_dma_ops = { ++ .mapping_error = swiotlb_dma_mapping_error, ++ .alloc_coherent = x86_swiotlb_alloc_coherent, ++ .free_coherent = swiotlb_free_coherent, ++ .sync_single_for_cpu = swiotlb_sync_single_for_cpu, ++ .sync_single_for_device = swiotlb_sync_single_for_device, ++ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, ++ .sync_single_range_for_device = 
swiotlb_sync_single_range_for_device, ++ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, ++ .sync_sg_for_device = swiotlb_sync_sg_for_device, ++ .map_sg = swiotlb_map_sg_attrs, ++ .unmap_sg = swiotlb_unmap_sg_attrs, ++ .map_page = swiotlb_map_page, ++ .unmap_page = swiotlb_unmap_page, ++ .dma_supported = NULL, ++}; ++ ++void __init pci_swiotlb_init(void) ++{ ++ /* don't initialize swiotlb if iommu=off (no_iommu=1) */ ++#ifdef CONFIG_X86_64 ++ if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ++ swiotlb = 1; ++#endif ++ if (swiotlb_force) ++ swiotlb = 1; ++ if (swiotlb) { ++ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); ++ swiotlb_init(); ++ dma_ops = &swiotlb_dma_ops; ++ } ++} +Index: linux-2.6-tip/arch/x86/kernel/pci-swiotlb_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/pci-swiotlb_64.c ++++ /dev/null +@@ -1,91 +0,0 @@ +-/* Glue code to lib/swiotlb.c */ +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +- +-int swiotlb __read_mostly; +- +-void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) +-{ +- return alloc_bootmem_low_pages(size); +-} +- +-void *swiotlb_alloc(unsigned order, unsigned long nslabs) +-{ +- return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); +-} +- +-dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) +-{ +- return paddr; +-} +- +-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +-{ +- return baddr; +-} +- +-int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +-{ +- return 0; +-} +- +-static dma_addr_t +-swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, +- int direction) +-{ +- return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); +-} +- +-static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, +- dma_addr_t *dma_handle, gfp_t flags) +-{ +- void *vaddr; +- +- vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags); +- if (vaddr) +- return vaddr; +- +- return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); +-} +- +-struct dma_mapping_ops swiotlb_dma_ops = { +- .mapping_error = swiotlb_dma_mapping_error, +- .alloc_coherent = x86_swiotlb_alloc_coherent, +- .free_coherent = swiotlb_free_coherent, +- .map_single = swiotlb_map_single_phys, +- .unmap_single = swiotlb_unmap_single, +- .sync_single_for_cpu = swiotlb_sync_single_for_cpu, +- .sync_single_for_device = swiotlb_sync_single_for_device, +- .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, +- .sync_single_range_for_device = swiotlb_sync_single_range_for_device, +- .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, +- .sync_sg_for_device = swiotlb_sync_sg_for_device, +- .map_sg = swiotlb_map_sg, +- .unmap_sg = swiotlb_unmap_sg, +- .dma_supported = NULL, +-}; +- +-void __init pci_swiotlb_init(void) +-{ +- /* don't initialize swiotlb if iommu=off (no_iommu=1) */ +-#ifdef CONFIG_X86_64 +- if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) +- swiotlb = 1; +-#endif +- if (swiotlb_force) +- swiotlb = 1; +- if (swiotlb) { +- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); +- swiotlb_init(); +- dma_ops = &swiotlb_dma_ops; +- } +-} +Index: linux-2.6-tip/arch/x86/kernel/probe_roms_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/probe_roms_32.c ++++ linux-2.6-tip/arch/x86/kernel/probe_roms_32.c +@@ -18,7 +18,7 @@ 
+ #include + #include + #include +-#include ++#include + + static struct resource system_rom_resource = { + .name = "System ROM", +Index: linux-2.6-tip/arch/x86/kernel/process.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/process.c ++++ linux-2.6-tip/arch/x86/kernel/process.c +@@ -1,16 +1,19 @@ + #include + #include + #include +-#include + #include ++#include + #include + #include + #include + #include + #include +-#include ++#include + #include + #include ++#include ++#include ++#include + + unsigned long idle_halt; + EXPORT_SYMBOL(idle_halt); +@@ -19,6 +22,9 @@ EXPORT_SYMBOL(idle_nomwait); + + struct kmem_cache *task_xstate_cachep; + ++DEFINE_TRACE(power_start); ++DEFINE_TRACE(power_end); ++ + int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) + { + *dst = *src; +@@ -52,10 +58,197 @@ void arch_task_cache_init(void) + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), +- SLAB_PANIC, NULL); ++ SLAB_PANIC | SLAB_NOTRACK, NULL); + } + + /* ++ * Free current thread data structures etc.. ++ */ ++void exit_thread(void) ++{ ++ struct task_struct *me = current; ++ struct thread_struct *t = &me->thread; ++ unsigned long *bp = t->io_bitmap_ptr; ++ ++ if (bp) { ++ struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); ++ ++ t->io_bitmap_ptr = NULL; ++ clear_thread_flag(TIF_IO_BITMAP); ++ /* ++ * Careful, clear this in the TSS too: ++ */ ++ memset(tss->io_bitmap, 0xff, t->io_bitmap_max); ++ t->io_bitmap_max = 0; ++ put_cpu(); ++ kfree(bp); ++ } ++ ++ ds_exit_thread(current); ++} ++ ++void flush_thread(void) ++{ ++ struct task_struct *tsk = current; ++ ++#ifdef CONFIG_X86_64 ++ if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { ++ clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); ++ if (test_tsk_thread_flag(tsk, TIF_IA32)) { ++ clear_tsk_thread_flag(tsk, TIF_IA32); ++ } else { ++ set_tsk_thread_flag(tsk, TIF_IA32); ++ current_thread_info()->status |= TS_COMPAT; ++ } ++ } ++#endif ++ ++ clear_tsk_thread_flag(tsk, TIF_DEBUG); ++ ++ tsk->thread.debugreg0 = 0; ++ tsk->thread.debugreg1 = 0; ++ tsk->thread.debugreg2 = 0; ++ tsk->thread.debugreg3 = 0; ++ tsk->thread.debugreg6 = 0; ++ tsk->thread.debugreg7 = 0; ++ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); ++ /* ++ * Forget coprocessor state.. ++ */ ++ tsk->fpu_counter = 0; ++ clear_fpu(tsk); ++ clear_used_math(); ++} ++ ++static void hard_disable_TSC(void) ++{ ++ write_cr4(read_cr4() | X86_CR4_TSD); ++} ++ ++void disable_TSC(void) ++{ ++ preempt_disable(); ++ if (!test_and_set_thread_flag(TIF_NOTSC)) ++ /* ++ * Must flip the CPU state synchronously with ++ * TIF_NOTSC in the current running context. ++ */ ++ hard_disable_TSC(); ++ preempt_enable(); ++} ++ ++static void hard_enable_TSC(void) ++{ ++ write_cr4(read_cr4() & ~X86_CR4_TSD); ++} ++ ++static void enable_TSC(void) ++{ ++ preempt_disable(); ++ if (test_and_clear_thread_flag(TIF_NOTSC)) ++ /* ++ * Must flip the CPU state synchronously with ++ * TIF_NOTSC in the current running context. 
++ */ ++ hard_enable_TSC(); ++ preempt_enable(); ++} ++ ++int get_tsc_mode(unsigned long adr) ++{ ++ unsigned int val; ++ ++ if (test_thread_flag(TIF_NOTSC)) ++ val = PR_TSC_SIGSEGV; ++ else ++ val = PR_TSC_ENABLE; ++ ++ return put_user(val, (unsigned int __user *)adr); ++} ++ ++int set_tsc_mode(unsigned int val) ++{ ++ if (val == PR_TSC_SIGSEGV) ++ disable_TSC(); ++ else if (val == PR_TSC_ENABLE) ++ enable_TSC(); ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++ ++void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, ++ struct tss_struct *tss) ++{ ++ struct thread_struct *prev, *next; ++ ++ prev = &prev_p->thread; ++ next = &next_p->thread; ++ ++ if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || ++ test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) ++ ds_switch_to(prev_p, next_p); ++ else if (next->debugctlmsr != prev->debugctlmsr) ++ update_debugctlmsr(next->debugctlmsr); ++ ++ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { ++ set_debugreg(next->debugreg0, 0); ++ set_debugreg(next->debugreg1, 1); ++ set_debugreg(next->debugreg2, 2); ++ set_debugreg(next->debugreg3, 3); ++ /* no 4 and 5 */ ++ set_debugreg(next->debugreg6, 6); ++ set_debugreg(next->debugreg7, 7); ++ } ++ ++ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ ++ test_tsk_thread_flag(next_p, TIF_NOTSC)) { ++ /* prev and next are different */ ++ if (test_tsk_thread_flag(next_p, TIF_NOTSC)) ++ hard_disable_TSC(); ++ else ++ hard_enable_TSC(); ++ } ++ ++ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { ++ /* ++ * Copy the relevant range of the IO bitmap. ++ * Normally this is 128 bytes or less: ++ */ ++ memcpy(tss->io_bitmap, next->io_bitmap_ptr, ++ max(prev->io_bitmap_max, next->io_bitmap_max)); ++ } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { ++ /* ++ * Clear any possible leftover bits: ++ */ ++ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); ++ } ++} ++ ++int sys_fork(struct pt_regs *regs) ++{ ++ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); ++} ++ ++/* ++ * This is trivial, and on the face of it looks like it ++ * could equally well be done in user mode. ++ * ++ * Not so, for quite unobvious reasons - register pressure. ++ * In user mode vfork() cannot have a stack frame, and if ++ * done by calling the "clone()" system call directly, you ++ * do not have enough call-clobbered registers to hold all ++ * the information you need. ++ */ ++int sys_vfork(struct pt_regs *regs) ++{ ++ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, ++ NULL, NULL); ++} ++ ++ ++/* + * Idle related variables and functions + */ + unsigned long boot_option_idle_override = 0; +@@ -135,7 +328,7 @@ void stop_this_cpu(void *dummy) + /* + * Remove this CPU: + */ +- cpu_clear(smp_processor_id(), cpu_online_map); ++ set_cpu_online(smp_processor_id(), false); + disable_local_APIC(); + + for (;;) { +@@ -285,12 +478,13 @@ static int __cpuinit check_c1e_idle(cons + return 1; + } + +-static cpumask_t c1e_mask = CPU_MASK_NONE; ++static cpumask_var_t c1e_mask; + static int c1e_detected; + + void c1e_remove_cpu(int cpu) + { +- cpu_clear(cpu, c1e_mask); ++ if (c1e_mask != NULL) ++ cpumask_clear_cpu(cpu, c1e_mask); + } + + /* +@@ -319,8 +513,8 @@ static void c1e_idle(void) + if (c1e_detected) { + int cpu = smp_processor_id(); + +- if (!cpu_isset(cpu, c1e_mask)) { +- cpu_set(cpu, c1e_mask); ++ if (!cpumask_test_cpu(cpu, c1e_mask)) { ++ cpumask_set_cpu(cpu, c1e_mask); + /* + * Force broadcast so ACPI can not interfere. 
Needs + * to run with interrupts enabled as it uses +@@ -350,7 +544,7 @@ static void c1e_idle(void) + + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) + { +-#ifdef CONFIG_X86_SMP ++#ifdef CONFIG_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); +@@ -372,6 +566,15 @@ void __cpuinit select_idle_routine(const + pm_idle = default_idle; + } + ++void __init init_c1e_mask(void) ++{ ++ /* If we're using c1e_idle, we need to allocate c1e_mask. */ ++ if (pm_idle == c1e_idle) { ++ alloc_cpumask_var(&c1e_mask, GFP_KERNEL); ++ cpumask_clear(c1e_mask); ++ } ++} ++ + static int __init idle_setup(char *str) + { + if (!str) +Index: linux-2.6-tip/arch/x86/kernel/process_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/process_32.c ++++ linux-2.6-tip/arch/x86/kernel/process_32.c +@@ -11,6 +11,7 @@ + + #include + ++#include + #include + #include + #include +@@ -66,9 +67,6 @@ asmlinkage void ret_from_fork(void) __as + DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; + EXPORT_PER_CPU_SYMBOL(current_task); + +-DEFINE_PER_CPU(int, cpu_number); +-EXPORT_PER_CPU_SYMBOL(cpu_number); +- + /* + * Return saved PC of a blocked thread. + */ +@@ -94,6 +92,15 @@ void cpu_idle(void) + { + int cpu = smp_processor_id(); + ++ /* ++ * If we're the non-boot CPU, nothing set the stack canary up ++ * for us. CPU0 already has it initialized but no harm in ++ * doing it again. This is a good place for updating it, as ++ * we wont ever return from this function (so the invalid ++ * canaries already on the stack wont ever trigger). ++ */ ++ boot_init_stack_canary(); ++ + current_thread_info()->status |= TS_POLLING; + + /* endless idle loop with no priority at all */ +@@ -101,23 +108,23 @@ void cpu_idle(void) + tick_nohz_stop_sched_tick(1); + while (!need_resched()) { + +- check_pgt_cache(); + rmb(); + + if (cpu_is_offline(cpu)) + play_dead(); + + local_irq_disable(); +- __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); + pm_idle(); + start_critical_timings(); + } ++ local_irq_disable(); + tick_nohz_restart_sched_tick(); +- preempt_enable_no_resched(); +- schedule(); ++ __preempt_enable_no_resched(); ++ __schedule(); + preempt_disable(); ++ local_irq_enable(); + } + } + +@@ -132,7 +139,7 @@ void __show_regs(struct pt_regs *regs, i + if (user_mode_vm(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; +- savesegment(gs, gs); ++ gs = get_user_gs(regs); + } else { + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); +@@ -159,8 +166,10 @@ void __show_regs(struct pt_regs *regs, i + regs->ax, regs->bx, regs->cx, regs->dx); + printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + regs->si, regs->di, regs->bp, sp); +- printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", +- (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); ++ printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x" ++ " preempt:%08x\n", ++ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, ++ preempt_count()); + + if (!all) + return; +@@ -213,6 +222,7 @@ int kernel_thread(int (*fn)(void *), voi + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; ++ regs.gs = __KERNEL_STACK_CANARY; + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); +@@ -223,55 +233,6 @@ int kernel_thread(int (*fn)(void *), voi + } + 
EXPORT_SYMBOL(kernel_thread); + +-/* +- * Free current thread data structures etc.. +- */ +-void exit_thread(void) +-{ +- /* The process may have allocated an io port bitmap... nuke it. */ +- if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { +- struct task_struct *tsk = current; +- struct thread_struct *t = &tsk->thread; +- int cpu = get_cpu(); +- struct tss_struct *tss = &per_cpu(init_tss, cpu); +- +- kfree(t->io_bitmap_ptr); +- t->io_bitmap_ptr = NULL; +- clear_thread_flag(TIF_IO_BITMAP); +- /* +- * Careful, clear this in the TSS too: +- */ +- memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); +- t->io_bitmap_max = 0; +- tss->io_bitmap_owner = NULL; +- tss->io_bitmap_max = 0; +- tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; +- put_cpu(); +- } +- +- ds_exit_thread(current); +-} +- +-void flush_thread(void) +-{ +- struct task_struct *tsk = current; +- +- tsk->thread.debugreg0 = 0; +- tsk->thread.debugreg1 = 0; +- tsk->thread.debugreg2 = 0; +- tsk->thread.debugreg3 = 0; +- tsk->thread.debugreg6 = 0; +- tsk->thread.debugreg7 = 0; +- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); +- clear_tsk_thread_flag(tsk, TIF_DEBUG); +- /* +- * Forget coprocessor state.. +- */ +- tsk->fpu_counter = 0; +- clear_fpu(tsk); +- clear_used_math(); +-} +- + void release_thread(struct task_struct *dead_task) + { + BUG_ON(dead_task->mm); +@@ -305,7 +266,7 @@ int copy_thread(int nr, unsigned long cl + + p->thread.ip = (unsigned long) ret_from_fork; + +- savesegment(gs, p->thread.gs); ++ task_user_gs(p) = get_user_gs(regs); + + tsk = current; + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { +@@ -343,7 +304,7 @@ int copy_thread(int nr, unsigned long cl + void + start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) + { +- __asm__("movl %0, %%gs" : : "r"(0)); ++ set_user_gs(regs, 0); + regs->fs = 0; + set_fs(USER_DS); + regs->ds = __USER_DS; +@@ -359,127 +320,6 @@ start_thread(struct pt_regs *regs, unsig + } + EXPORT_SYMBOL_GPL(start_thread); + +-static void hard_disable_TSC(void) +-{ +- write_cr4(read_cr4() | X86_CR4_TSD); +-} +- +-void disable_TSC(void) +-{ +- preempt_disable(); +- if (!test_and_set_thread_flag(TIF_NOTSC)) +- /* +- * Must flip the CPU state synchronously with +- * TIF_NOTSC in the current running context. +- */ +- hard_disable_TSC(); +- preempt_enable(); +-} +- +-static void hard_enable_TSC(void) +-{ +- write_cr4(read_cr4() & ~X86_CR4_TSD); +-} +- +-static void enable_TSC(void) +-{ +- preempt_disable(); +- if (test_and_clear_thread_flag(TIF_NOTSC)) +- /* +- * Must flip the CPU state synchronously with +- * TIF_NOTSC in the current running context. 
+- */ +- hard_enable_TSC(); +- preempt_enable(); +-} +- +-int get_tsc_mode(unsigned long adr) +-{ +- unsigned int val; +- +- if (test_thread_flag(TIF_NOTSC)) +- val = PR_TSC_SIGSEGV; +- else +- val = PR_TSC_ENABLE; +- +- return put_user(val, (unsigned int __user *)adr); +-} +- +-int set_tsc_mode(unsigned int val) +-{ +- if (val == PR_TSC_SIGSEGV) +- disable_TSC(); +- else if (val == PR_TSC_ENABLE) +- enable_TSC(); +- else +- return -EINVAL; +- +- return 0; +-} +- +-static noinline void +-__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, +- struct tss_struct *tss) +-{ +- struct thread_struct *prev, *next; +- +- prev = &prev_p->thread; +- next = &next_p->thread; +- +- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || +- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) +- ds_switch_to(prev_p, next_p); +- else if (next->debugctlmsr != prev->debugctlmsr) +- update_debugctlmsr(next->debugctlmsr); +- +- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { +- set_debugreg(next->debugreg0, 0); +- set_debugreg(next->debugreg1, 1); +- set_debugreg(next->debugreg2, 2); +- set_debugreg(next->debugreg3, 3); +- /* no 4 and 5 */ +- set_debugreg(next->debugreg6, 6); +- set_debugreg(next->debugreg7, 7); +- } +- +- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ +- test_tsk_thread_flag(next_p, TIF_NOTSC)) { +- /* prev and next are different */ +- if (test_tsk_thread_flag(next_p, TIF_NOTSC)) +- hard_disable_TSC(); +- else +- hard_enable_TSC(); +- } +- +- if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +- /* +- * Disable the bitmap via an invalid offset. We still cache +- * the previous bitmap owner and the IO bitmap contents: +- */ +- tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; +- return; +- } +- +- if (likely(next == tss->io_bitmap_owner)) { +- /* +- * Previous owner of the bitmap (hence the bitmap content) +- * matches the next task, we dont have to do anything but +- * to set a valid offset in the TSS: +- */ +- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; +- return; +- } +- /* +- * Lazy TSS's I/O bitmap copy. We set an invalid offset here +- * and we let the task to get a GPF in case an I/O instruction +- * is performed. The handler of the GPF will verify that the +- * faulting task has a valid I/O bitmap and, it true, does the +- * real copy and restart the instruction. This will save us +- * redundant copies when the currently switched task does not +- * perform any I/O during its timeslice. +- */ +- tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; +-} + + /* + * switch_to(x,yn) should switch tasks from x to y. +@@ -540,7 +380,7 @@ __switch_to(struct task_struct *prev_p, + * used %fs or %gs (it does not today), or if the kernel is + * running inside of a hypervisor layer. + */ +- savesegment(gs, prev->gs); ++ lazy_save_gs(prev->gs); + + /* + * Load the per-thread Thread-Local Storage descriptor. 
+@@ -586,64 +426,44 @@ __switch_to(struct task_struct *prev_p, + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) +- loadsegment(gs, next->gs); ++ lazy_load_gs(next->gs); + +- x86_write_percpu(current_task, next_p); ++ percpu_write(current_task, next_p); + + return prev_p; + } + +-asmlinkage int sys_fork(struct pt_regs regs) +-{ +- return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL); +-} +- +-asmlinkage int sys_clone(struct pt_regs regs) ++int sys_clone(struct pt_regs *regs) + { + unsigned long clone_flags; + unsigned long newsp; + int __user *parent_tidptr, *child_tidptr; + +- clone_flags = regs.bx; +- newsp = regs.cx; +- parent_tidptr = (int __user *)regs.dx; +- child_tidptr = (int __user *)regs.di; ++ clone_flags = regs->bx; ++ newsp = regs->cx; ++ parent_tidptr = (int __user *)regs->dx; ++ child_tidptr = (int __user *)regs->di; + if (!newsp) +- newsp = regs.sp; +- return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); +-} +- +-/* +- * This is trivial, and on the face of it looks like it +- * could equally well be done in user mode. +- * +- * Not so, for quite unobvious reasons - register pressure. +- * In user mode vfork() cannot have a stack frame, and if +- * done by calling the "clone()" system call directly, you +- * do not have enough call-clobbered registers to hold all +- * the information you need. +- */ +-asmlinkage int sys_vfork(struct pt_regs regs) +-{ +- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL); ++ newsp = regs->sp; ++ return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); + } + + /* + * sys_execve() executes a new program. + */ +-asmlinkage int sys_execve(struct pt_regs regs) ++int sys_execve(struct pt_regs *regs) + { + int error; + char *filename; + +- filename = getname((char __user *) regs.bx); ++ filename = getname((char __user *) regs->bx); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + goto out; + error = do_execve(filename, +- (char __user * __user *) regs.cx, +- (char __user * __user *) regs.dx, +- ®s); ++ (char __user * __user *) regs->cx, ++ (char __user * __user *) regs->dx, ++ regs); + if (error == 0) { + /* Make sure we don't return using sysenter.. 
*/ + set_thread_flag(TIF_IRET); +Index: linux-2.6-tip/arch/x86/kernel/process_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/process_64.c ++++ linux-2.6-tip/arch/x86/kernel/process_64.c +@@ -16,6 +16,7 @@ + + #include + ++#include + #include + #include + #include +@@ -47,7 +48,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -58,6 +58,12 @@ + + asmlinkage extern void ret_from_fork(void); + ++DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; ++EXPORT_PER_CPU_SYMBOL(current_task); ++ ++DEFINE_PER_CPU(unsigned long, old_rsp); ++static DEFINE_PER_CPU(unsigned char, is_idle); ++ + unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + + static ATOMIC_NOTIFIER_HEAD(idle_notifier); +@@ -76,13 +82,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregist + + void enter_idle(void) + { +- write_pda(isidle, 1); ++ percpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); + } + + static void __exit_idle(void) + { +- if (test_and_clear_bit_pda(0, isidle) == 0) ++ if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); + } +@@ -112,6 +118,16 @@ static inline void play_dead(void) + void cpu_idle(void) + { + current_thread_info()->status |= TS_POLLING; ++ ++ /* ++ * If we're the non-boot CPU, nothing set the stack canary up ++ * for us. CPU0 already has it initialized but no harm in ++ * doing it again. This is a good place for updating it, as ++ * we wont ever return from this function (so the invalid ++ * canaries already on the stack wont ever trigger). ++ */ ++ boot_init_stack_canary(); ++ + /* endless idle loop with no priority at all */ + while (1) { + tick_nohz_stop_sched_tick(1); +@@ -139,9 +155,11 @@ void cpu_idle(void) + } + + tick_nohz_restart_sched_tick(); +- preempt_enable_no_resched(); +- schedule(); ++ local_irq_disable(); ++ __preempt_enable_no_resched(); ++ __schedule(); + preempt_disable(); ++ local_irq_enable(); + } + } + +@@ -221,61 +239,6 @@ void show_regs(struct pt_regs *regs) + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); + } + +-/* +- * Free current thread data structures etc.. +- */ +-void exit_thread(void) +-{ +- struct task_struct *me = current; +- struct thread_struct *t = &me->thread; +- +- if (me->thread.io_bitmap_ptr) { +- struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); +- +- kfree(t->io_bitmap_ptr); +- t->io_bitmap_ptr = NULL; +- clear_thread_flag(TIF_IO_BITMAP); +- /* +- * Careful, clear this in the TSS too: +- */ +- memset(tss->io_bitmap, 0xff, t->io_bitmap_max); +- t->io_bitmap_max = 0; +- put_cpu(); +- } +- +- ds_exit_thread(current); +-} +- +-void flush_thread(void) +-{ +- struct task_struct *tsk = current; +- +- if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { +- clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); +- if (test_tsk_thread_flag(tsk, TIF_IA32)) { +- clear_tsk_thread_flag(tsk, TIF_IA32); +- } else { +- set_tsk_thread_flag(tsk, TIF_IA32); +- current_thread_info()->status |= TS_COMPAT; +- } +- } +- clear_tsk_thread_flag(tsk, TIF_DEBUG); +- +- tsk->thread.debugreg0 = 0; +- tsk->thread.debugreg1 = 0; +- tsk->thread.debugreg2 = 0; +- tsk->thread.debugreg3 = 0; +- tsk->thread.debugreg6 = 0; +- tsk->thread.debugreg7 = 0; +- memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); +- /* +- * Forget coprocessor state.. 
+- */ +- tsk->fpu_counter = 0; +- clear_fpu(tsk); +- clear_used_math(); +-} +- + void release_thread(struct task_struct *dead_task) + { + if (dead_task->mm) { +@@ -397,7 +360,7 @@ start_thread(struct pt_regs *regs, unsig + load_gs_index(0); + regs->ip = new_ip; + regs->sp = new_sp; +- write_pda(oldrsp, new_sp); ++ percpu_write(old_rsp, new_sp); + regs->cs = __USER_CS; + regs->ss = __USER_DS; + regs->flags = 0x200; +@@ -409,118 +372,6 @@ start_thread(struct pt_regs *regs, unsig + } + EXPORT_SYMBOL_GPL(start_thread); + +-static void hard_disable_TSC(void) +-{ +- write_cr4(read_cr4() | X86_CR4_TSD); +-} +- +-void disable_TSC(void) +-{ +- preempt_disable(); +- if (!test_and_set_thread_flag(TIF_NOTSC)) +- /* +- * Must flip the CPU state synchronously with +- * TIF_NOTSC in the current running context. +- */ +- hard_disable_TSC(); +- preempt_enable(); +-} +- +-static void hard_enable_TSC(void) +-{ +- write_cr4(read_cr4() & ~X86_CR4_TSD); +-} +- +-static void enable_TSC(void) +-{ +- preempt_disable(); +- if (test_and_clear_thread_flag(TIF_NOTSC)) +- /* +- * Must flip the CPU state synchronously with +- * TIF_NOTSC in the current running context. +- */ +- hard_enable_TSC(); +- preempt_enable(); +-} +- +-int get_tsc_mode(unsigned long adr) +-{ +- unsigned int val; +- +- if (test_thread_flag(TIF_NOTSC)) +- val = PR_TSC_SIGSEGV; +- else +- val = PR_TSC_ENABLE; +- +- return put_user(val, (unsigned int __user *)adr); +-} +- +-int set_tsc_mode(unsigned int val) +-{ +- if (val == PR_TSC_SIGSEGV) +- disable_TSC(); +- else if (val == PR_TSC_ENABLE) +- enable_TSC(); +- else +- return -EINVAL; +- +- return 0; +-} +- +-/* +- * This special macro can be used to load a debugging register +- */ +-#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) +- +-static inline void __switch_to_xtra(struct task_struct *prev_p, +- struct task_struct *next_p, +- struct tss_struct *tss) +-{ +- struct thread_struct *prev, *next; +- +- prev = &prev_p->thread, +- next = &next_p->thread; +- +- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || +- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) +- ds_switch_to(prev_p, next_p); +- else if (next->debugctlmsr != prev->debugctlmsr) +- update_debugctlmsr(next->debugctlmsr); +- +- if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { +- loaddebug(next, 0); +- loaddebug(next, 1); +- loaddebug(next, 2); +- loaddebug(next, 3); +- /* no 4 and 5 */ +- loaddebug(next, 6); +- loaddebug(next, 7); +- } +- +- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ +- test_tsk_thread_flag(next_p, TIF_NOTSC)) { +- /* prev and next are different */ +- if (test_tsk_thread_flag(next_p, TIF_NOTSC)) +- hard_disable_TSC(); +- else +- hard_enable_TSC(); +- } +- +- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +- /* +- * Copy the relevant range of the IO bitmap. +- * Normally this is 128 bytes or less: +- */ +- memcpy(tss->io_bitmap, next->io_bitmap_ptr, +- max(prev->io_bitmap_max, next->io_bitmap_max)); +- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { +- /* +- * Clear any possible leftover bits: +- */ +- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); +- } +-} +- + /* + * switch_to(x,y) should switch tasks from x to y. + * +@@ -618,21 +469,13 @@ __switch_to(struct task_struct *prev_p, + /* + * Switch the PDA and FPU contexts. 
+ */ +- prev->usersp = read_pda(oldrsp); +- write_pda(oldrsp, next->usersp); +- write_pda(pcurrent, next_p); ++ prev->usersp = percpu_read(old_rsp); ++ percpu_write(old_rsp, next->usersp); ++ percpu_write(current_task, next_p); + +- write_pda(kernelstack, ++ percpu_write(kernel_stack, + (unsigned long)task_stack_page(next_p) + +- THREAD_SIZE - PDA_STACKOFFSET); +-#ifdef CONFIG_CC_STACKPROTECTOR +- write_pda(stack_canary, next_p->stack_canary); +- /* +- * Build time only check to make sure the stack_canary is at +- * offset 40 in the pda; this is a gcc ABI requirement +- */ +- BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); +-#endif ++ THREAD_SIZE - KERNEL_STACK_OFFSET); + + /* + * Now maybe reload the debug registers and handle I/O bitmaps +@@ -686,11 +529,6 @@ void set_personality_64bit(void) + current->personality &= ~READ_IMPLIES_EXEC; + } + +-asmlinkage long sys_fork(struct pt_regs *regs) +-{ +- return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); +-} +- + asmlinkage long + sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +@@ -700,22 +538,6 @@ sys_clone(unsigned long clone_flags, uns + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); + } + +-/* +- * This is trivial, and on the face of it looks like it +- * could equally well be done in user mode. +- * +- * Not so, for quite unobvious reasons - register pressure. +- * In user mode vfork() cannot have a stack frame, and if +- * done by calling the "clone()" system call directly, you +- * do not have enough call-clobbered registers to hold all +- * the information you need. +- */ +-asmlinkage long sys_vfork(struct pt_regs *regs) +-{ +- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, +- NULL, NULL); +-} +- + unsigned long get_wchan(struct task_struct *p) + { + unsigned long stack; +Index: linux-2.6-tip/arch/x86/kernel/ptrace.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/ptrace.c ++++ linux-2.6-tip/arch/x86/kernel/ptrace.c +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -75,10 +76,7 @@ static inline bool invalid_selector(u16 + static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) + { + BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); +- regno >>= 2; +- if (regno > FS) +- --regno; +- return ®s->bx + regno; ++ return ®s->bx + (regno >> 2); + } + + static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +@@ -90,9 +88,10 @@ static u16 get_segment_reg(struct task_s + if (offset != offsetof(struct user_regs_struct, gs)) + retval = *pt_regs_access(task_pt_regs(task), offset); + else { +- retval = task->thread.gs; + if (task == current) +- savesegment(gs, retval); ++ retval = get_user_gs(task_pt_regs(task)); ++ else ++ retval = task_user_gs(task); + } + return retval; + } +@@ -126,13 +125,10 @@ static int set_segment_reg(struct task_s + break; + + case offsetof(struct user_regs_struct, gs): +- task->thread.gs = value; + if (task == current) +- /* +- * The user-mode %gs is not affected by +- * kernel entry, so we must update the CPU. 
+- */ +- loadsegment(gs, value); ++ set_user_gs(task_pt_regs(task), value); ++ else ++ task_user_gs(task) = value; + } + + return 0; +@@ -273,7 +269,7 @@ static unsigned long debugreg_addr_limit + if (test_tsk_thread_flag(task, TIF_IA32)) + return IA32_PAGE_OFFSET - 3; + #endif +- return TASK_SIZE64 - 7; ++ return TASK_SIZE_MAX - 7; + } + + #endif /* CONFIG_X86_32 */ +@@ -1420,6 +1416,9 @@ asmregparm long syscall_trace_enter(stru + tracehook_report_syscall_entry(regs)) + ret = -1L; + ++ if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) ++ ftrace_syscall_enter(regs); ++ + if (unlikely(current->audit_context)) { + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, +@@ -1443,6 +1442,9 @@ asmregparm void syscall_trace_leave(stru + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + ++ if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) ++ ftrace_syscall_exit(regs); ++ + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, 0); + +Index: linux-2.6-tip/arch/x86/kernel/quirks.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/quirks.c ++++ linux-2.6-tip/arch/x86/kernel/quirks.c +@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void) + if (!force_hpet_address) + return; + +- if (rcba_base == NULL) +- BUG(); ++ BUG_ON(rcba_base == NULL); + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); +@@ -172,7 +171,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I + ich_force_enable_hpet); + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); +- ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */ ++ ich_force_enable_hpet); + + static struct pci_dev *cached_dev; + +Index: linux-2.6-tip/arch/x86/kernel/reboot.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/reboot.c ++++ linux-2.6-tip/arch/x86/kernel/reboot.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_X86_32 + # include +@@ -23,8 +24,6 @@ + # include + #endif + +-#include +- + /* + * Power off function, if any + */ +@@ -658,7 +657,7 @@ static int crash_nmi_callback(struct not + + static void smp_send_nmi_allbutself(void) + { +- send_IPI_allbutself(NMI_VECTOR); ++ apic->send_IPI_allbutself(NMI_VECTOR); + } + + static struct notifier_block crash_nmi_nb = { +Index: linux-2.6-tip/arch/x86/kernel/relocate_kernel_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/relocate_kernel_32.S ++++ linux-2.6-tip/arch/x86/kernel/relocate_kernel_32.S +@@ -7,7 +7,7 @@ + */ + + #include +-#include ++#include + #include + #include + +@@ -17,7 +17,8 @@ + + #define PTR(x) (x << 2) + +-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE ++/* ++ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +@@ -76,8 +77,10 @@ relocate_kernel: + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + +- /* get physical address of control page now */ +- /* this is impossible after page table switch */ ++ /* ++ * get physical address of control page now ++ * this is impossible after page table switch ++ */ + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi + + /* switch to new set of page tables */ +@@ -97,7 +100,8 @@ identity_mapped: + /* store the start address on the stack */ + pushl %edx + +- /* Set cr0 to a 
known state: ++ /* ++ * Set cr0 to a known state: + * - Paging disabled + * - Alignment check disabled + * - Write protect disabled +@@ -113,7 +117,8 @@ identity_mapped: + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f +- /* Set cr4 to a known state: ++ /* ++ * Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + xorl %eax, %eax +@@ -132,15 +137,18 @@ identity_mapped: + call swap_pages + addl $8, %esp + +- /* To be certain of avoiding problems with self-modifying code ++ /* ++ * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + +- /* set all of the registers to known values */ +- /* leave %esp alone */ ++ /* ++ * set all of the registers to known values ++ * leave %esp alone ++ */ + + testl %esi, %esi + jnz 1f +Index: linux-2.6-tip/arch/x86/kernel/relocate_kernel_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/relocate_kernel_64.S ++++ linux-2.6-tip/arch/x86/kernel/relocate_kernel_64.S +@@ -7,10 +7,10 @@ + */ + + #include +-#include ++#include + #include + #include +-#include ++#include + + /* + * Must be relocatable PIC code callable as a C function +@@ -19,145 +19,76 @@ + #define PTR(x) (x << 3) + #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) + ++/* ++ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE ++ * ~ control_page + PAGE_SIZE are used as data storage and stack for ++ * jumping back ++ */ ++#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) ++ ++/* Minimal CPU state */ ++#define RSP DATA(0x0) ++#define CR0 DATA(0x8) ++#define CR3 DATA(0x10) ++#define CR4 DATA(0x18) ++ ++/* other data */ ++#define CP_PA_TABLE_PAGE DATA(0x20) ++#define CP_PA_SWAP_PAGE DATA(0x28) ++#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) ++ + .text + .align PAGE_SIZE + .code64 + .globl relocate_kernel + relocate_kernel: +- /* %rdi indirection_page ++ /* ++ * %rdi indirection_page + * %rsi page_list + * %rdx start address ++ * %rcx preserve_context + */ + +- /* map the control page at its virtual address */ +- +- movq $0x0000ff8000000000, %r10 /* mask */ +- mov $(39 - 3), %cl /* bits to shift */ +- movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ +- +- movq %r11, %r9 +- andq %r10, %r9 +- shrq %cl, %r9 +- +- movq PTR(VA_PGD)(%rsi), %r8 +- addq %r8, %r9 +- movq PTR(PA_PUD_0)(%rsi), %r8 +- orq $PAGE_ATTR, %r8 +- movq %r8, (%r9) +- +- shrq $9, %r10 +- sub $9, %cl +- +- movq %r11, %r9 +- andq %r10, %r9 +- shrq %cl, %r9 +- +- movq PTR(VA_PUD_0)(%rsi), %r8 +- addq %r8, %r9 +- movq PTR(PA_PMD_0)(%rsi), %r8 +- orq $PAGE_ATTR, %r8 +- movq %r8, (%r9) +- +- shrq $9, %r10 +- sub $9, %cl +- +- movq %r11, %r9 +- andq %r10, %r9 +- shrq %cl, %r9 +- +- movq PTR(VA_PMD_0)(%rsi), %r8 +- addq %r8, %r9 +- movq PTR(PA_PTE_0)(%rsi), %r8 +- orq $PAGE_ATTR, %r8 +- movq %r8, (%r9) +- +- shrq $9, %r10 +- sub $9, %cl +- +- movq %r11, %r9 +- andq %r10, %r9 +- shrq %cl, %r9 +- +- movq PTR(VA_PTE_0)(%rsi), %r8 +- addq %r8, %r9 +- movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 +- orq $PAGE_ATTR, %r8 +- movq %r8, (%r9) +- +- /* identity map the control page at its physical address */ +- +- movq $0x0000ff8000000000, %r10 /* mask */ +- mov $(39 - 3), %cl /* bits to shift */ +- movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ +- +- movq %r11, %r9 +- andq %r10, %r9 +- shrq %cl, %r9 +- +- movq PTR(VA_PGD)(%rsi), %r8 +- addq %r8, %r9 +- movq 
PTR(PA_PUD_1)(%rsi), %r8 +- orq $PAGE_ATTR, %r8 +- movq %r8, (%r9) +- +- shrq $9, %r10 +- sub $9, %cl +- +- movq %r11, %r9 +- andq %r10, %r9 +- shrq %cl, %r9 +- +- movq PTR(VA_PUD_1)(%rsi), %r8 +- addq %r8, %r9 +- movq PTR(PA_PMD_1)(%rsi), %r8 +- orq $PAGE_ATTR, %r8 +- movq %r8, (%r9) +- +- shrq $9, %r10 +- sub $9, %cl +- +- movq %r11, %r9 +- andq %r10, %r9 +- shrq %cl, %r9 +- +- movq PTR(VA_PMD_1)(%rsi), %r8 +- addq %r8, %r9 +- movq PTR(PA_PTE_1)(%rsi), %r8 +- orq $PAGE_ATTR, %r8 +- movq %r8, (%r9) +- +- shrq $9, %r10 +- sub $9, %cl +- +- movq %r11, %r9 +- andq %r10, %r9 +- shrq %cl, %r9 ++ /* Save the CPU context, used for jumping back */ ++ pushq %rbx ++ pushq %rbp ++ pushq %r12 ++ pushq %r13 ++ pushq %r14 ++ pushq %r15 ++ pushf + +- movq PTR(VA_PTE_1)(%rsi), %r8 +- addq %r8, %r9 +- movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 +- orq $PAGE_ATTR, %r8 +- movq %r8, (%r9) +- +-relocate_new_kernel: +- /* %rdi indirection_page +- * %rsi page_list +- * %rdx start address +- */ ++ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 ++ movq %rsp, RSP(%r11) ++ movq %cr0, %rax ++ movq %rax, CR0(%r11) ++ movq %cr3, %rax ++ movq %rax, CR3(%r11) ++ movq %cr4, %rax ++ movq %rax, CR4(%r11) + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + +- /* get physical address of control page now */ +- /* this is impossible after page table switch */ ++ /* ++ * get physical address of control page now ++ * this is impossible after page table switch ++ */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ +- movq PTR(PA_TABLE_PAGE)(%rsi), %rcx ++ movq PTR(PA_TABLE_PAGE)(%rsi), %r9 ++ ++ /* get physical address of swap page now */ ++ movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + +- /* switch to new set of page tables */ +- movq PTR(PA_PGD)(%rsi), %r9 ++ /* save some information for jumping back */ ++ movq %r9, CP_PA_TABLE_PAGE(%r11) ++ movq %r10, CP_PA_SWAP_PAGE(%r11) ++ movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) ++ ++ /* Switch to the identity mapped page tables */ + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ +@@ -172,7 +103,8 @@ identity_mapped: + /* store the start address on the stack */ + pushq %rdx + +- /* Set cr0 to a known state: ++ /* ++ * Set cr0 to a known state: + * - Paging enabled + * - Alignment check disabled + * - Write protect disabled +@@ -185,7 +117,8 @@ identity_mapped: + orl $(X86_CR0_PG | X86_CR0_PE), %eax + movq %rax, %cr0 + +- /* Set cr4 to a known state: ++ /* ++ * Set cr4 to a known state: + * - physical address extension enabled + */ + movq $X86_CR4_PAE, %rax +@@ -194,12 +127,88 @@ identity_mapped: + jmp 1f + 1: + +- /* Switch to the identity mapped page tables, +- * and flush the TLB. +- */ +- movq %rcx, %cr3 ++ /* Flush the TLB (needed?) */ ++ movq %r9, %cr3 ++ ++ movq %rcx, %r11 ++ call swap_pages ++ ++ /* ++ * To be certain of avoiding problems with self-modifying code ++ * I need to execute a serializing instruction here. ++ * So I flush the TLB by reloading %cr3 here, it's handy, ++ * and not processor dependent. 
++ */ ++ movq %cr3, %rax ++ movq %rax, %cr3 ++ ++ /* ++ * set all of the registers to known values ++ * leave %rsp alone ++ */ ++ ++ testq %r11, %r11 ++ jnz 1f ++ xorq %rax, %rax ++ xorq %rbx, %rbx ++ xorq %rcx, %rcx ++ xorq %rdx, %rdx ++ xorq %rsi, %rsi ++ xorq %rdi, %rdi ++ xorq %rbp, %rbp ++ xorq %r8, %r8 ++ xorq %r9, %r9 ++ xorq %r10, %r9 ++ xorq %r11, %r11 ++ xorq %r12, %r12 ++ xorq %r13, %r13 ++ xorq %r14, %r14 ++ xorq %r15, %r15 ++ ++ ret ++ ++1: ++ popq %rdx ++ leaq PAGE_SIZE(%r10), %rsp ++ call *%rdx ++ ++ /* get the re-entry point of the peer system */ ++ movq 0(%rsp), %rbp ++ call 1f ++1: ++ popq %r8 ++ subq $(1b - relocate_kernel), %r8 ++ movq CP_PA_SWAP_PAGE(%r8), %r10 ++ movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi ++ movq CP_PA_TABLE_PAGE(%r8), %rax ++ movq %rax, %cr3 ++ lea PAGE_SIZE(%r8), %rsp ++ call swap_pages ++ movq $virtual_mapped, %rax ++ pushq %rax ++ ret ++ ++virtual_mapped: ++ movq RSP(%r8), %rsp ++ movq CR4(%r8), %rax ++ movq %rax, %cr4 ++ movq CR3(%r8), %rax ++ movq CR0(%r8), %r8 ++ movq %rax, %cr3 ++ movq %r8, %cr0 ++ movq %rbp, %rax ++ ++ popf ++ popq %r15 ++ popq %r14 ++ popq %r13 ++ popq %r12 ++ popq %rbp ++ popq %rbx ++ ret + + /* Do the copies */ ++swap_pages: + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorq %rdi, %rdi + xorq %rsi, %rsi +@@ -231,36 +240,27 @@ identity_mapped: + movq %rcx, %rsi /* For ever source page do a copy */ + andq $0xfffffffffffff000, %rsi + ++ movq %rdi, %rdx ++ movq %rsi, %rax ++ ++ movq %r10, %rdi + movq $512, %rcx + rep ; movsq +- jmp 0b +-3: +- +- /* To be certain of avoiding problems with self-modifying code +- * I need to execute a serializing instruction here. +- * So I flush the TLB by reloading %cr3 here, it's handy, +- * and not processor dependent. +- */ +- movq %cr3, %rax +- movq %rax, %cr3 + +- /* set all of the registers to known values */ +- /* leave %rsp alone */ ++ movq %rax, %rdi ++ movq %rdx, %rsi ++ movq $512, %rcx ++ rep ; movsq + +- xorq %rax, %rax +- xorq %rbx, %rbx +- xorq %rcx, %rcx +- xorq %rdx, %rdx +- xorq %rsi, %rsi +- xorq %rdi, %rdi +- xorq %rbp, %rbp +- xorq %r8, %r8 +- xorq %r9, %r9 +- xorq %r10, %r9 +- xorq %r11, %r11 +- xorq %r12, %r12 +- xorq %r13, %r13 +- xorq %r14, %r14 +- xorq %r15, %r15 ++ movq %rdx, %rdi ++ movq %r10, %rsi ++ movq $512, %rcx ++ rep ; movsq + ++ lea PAGE_SIZE(%rax), %rsi ++ jmp 0b ++3: + ret ++ ++ .globl kexec_control_code_size ++.set kexec_control_code_size, . - relocate_kernel +Index: linux-2.6-tip/arch/x86/kernel/rtc.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/rtc.c ++++ linux-2.6-tip/arch/x86/kernel/rtc.c +@@ -1,14 +1,14 @@ + /* + * RTC related functions + */ ++#include ++#include + #include + #include +-#include +-#include + #include + +-#include + #include ++#include + + #ifdef CONFIG_X86_32 + /* +@@ -16,9 +16,9 @@ + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. 
+ */ +-volatile unsigned long cmos_lock = 0; ++volatile unsigned long cmos_lock; + EXPORT_SYMBOL(cmos_lock); +-#endif ++#endif /* CONFIG_X86_32 */ + + /* For two digit years assume time is always after that */ + #define CMOS_YEARS_OFFS 2000 +@@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock); + */ + int mach_set_rtc_mmss(unsigned long nowtime) + { +- int retval = 0; + int real_seconds, real_minutes, cmos_minutes; + unsigned char save_control, save_freq_select; ++ int retval = 0; + + /* tell the clock it's being set */ + save_control = CMOS_READ(RTC_CONTROL); +@@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowt + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); + } +- CMOS_WRITE(real_seconds,RTC_SECONDS); +- CMOS_WRITE(real_minutes,RTC_MINUTES); ++ CMOS_WRITE(real_seconds, RTC_SECONDS); ++ CMOS_WRITE(real_minutes, RTC_MINUTES); + } else { + printk(KERN_WARNING + "set_rtc_mmss: can't update from %d to %d\n", +@@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned cha + outb(addr, RTC_PORT(0)); + val = inb(RTC_PORT(1)); + lock_cmos_suffix(addr); ++ + return val; + } + EXPORT_SYMBOL(rtc_cmos_read); +@@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write); + + static int set_rtc_mmss(unsigned long nowtime) + { +- int retval; + unsigned long flags; ++ int retval; + + spin_lock_irqsave(&rtc_lock, flags); + retval = set_wallclock(nowtime); +@@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void) + platform_device_register(&rtc_device); + dev_info(&rtc_device.dev, + "registered platform RTC device (no PNP device found)\n"); ++ + return 0; + } + device_initcall(add_rtc_cmos); +Index: linux-2.6-tip/arch/x86/kernel/scx200_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/scx200_32.c ++++ linux-2.6-tip/arch/x86/kernel/scx200_32.c +@@ -78,8 +78,10 @@ static int __devinit scx200_probe(struct + if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { + scx200_cb_base = SCx200_CB_BASE_FIXED; + } else { +- pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); +- if (scx200_cb_probe(base)) { ++ int err; ++ ++ err = pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); ++ if (!err && scx200_cb_probe(base)) { + scx200_cb_base = base; + } else { + printk(KERN_WARNING NAME ": Configuration Block not found\n"); +Index: linux-2.6-tip/arch/x86/kernel/setup.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/setup.c ++++ linux-2.6-tip/arch/x86/kernel/setup.c +@@ -74,14 +74,15 @@ + #include + #include + #include +-#include + #include ++#include ++#include + #include + #include + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -89,7 +90,7 @@ + + #include + #include +-#include ++#include + #include + #include + #include +@@ -97,7 +98,6 @@ + #include + #include + +-#include + #include + #include + +@@ -112,6 +112,25 @@ + #define ARCH_SETUP + #endif + ++RESERVE_BRK(dmi_alloc, 65536); ++ ++unsigned int boot_cpu_id __read_mostly; ++ ++static __initdata unsigned long _brk_start = (unsigned long)__brk_base; ++unsigned long _brk_end = (unsigned long)__brk_base; ++ ++#ifdef CONFIG_X86_64 ++int default_cpu_present_to_apicid(int mps_cpu) ++{ ++ return __default_cpu_present_to_apicid(mps_cpu); ++} ++ ++int default_check_phys_apicid_present(int boot_cpu_physical_apicid) ++{ ++ return __default_check_phys_apicid_present(boot_cpu_physical_apicid); ++} ++#endif ++ + #ifndef CONFIG_DEBUG_BOOT_PARAMS + struct boot_params __initdata boot_params; + #else +@@ 
-144,12 +163,6 @@ static struct resource bss_resource = { + + + #ifdef CONFIG_X86_32 +-/* This value is set up by the early boot code to point to the value +- immediately after the boot time page tables. It contains a *physical* +- address, and must not be in the .bss segment! */ +-unsigned long init_pg_tables_start __initdata = ~0UL; +-unsigned long init_pg_tables_end __initdata = ~0UL; +- + static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, +@@ -188,7 +201,9 @@ struct ist_info ist_info; + #endif + + #else +-struct cpuinfo_x86 boot_cpu_data __read_mostly; ++struct cpuinfo_x86 boot_cpu_data __read_mostly = { ++ .x86_phys_bits = MAX_PHYSMEM_BITS, ++}; + EXPORT_SYMBOL(boot_cpu_data); + #endif + +@@ -203,12 +218,6 @@ unsigned long mmu_cr4_features = X86_CR4 + int bootloader_type; + + /* +- * Early DMI memory +- */ +-int dmi_alloc_index; +-char dmi_alloc_data[DMI_MAX_DATA]; +- +-/* + * Setup options + */ + struct screen_info screen_info; +@@ -253,6 +262,35 @@ static inline void copy_edd(void) + } + #endif + ++void * __init extend_brk(size_t size, size_t align) ++{ ++ size_t mask = align - 1; ++ void *ret; ++ ++ BUG_ON(_brk_start == 0); ++ BUG_ON(align & mask); ++ ++ _brk_end = (_brk_end + mask) & ~mask; ++ BUG_ON((char *)(_brk_end + size) > __brk_limit); ++ ++ ret = (void *)_brk_end; ++ _brk_end += size; ++ ++ memset(ret, 0, size); ++ ++ return ret; ++} ++ ++static void __init reserve_brk(void) ++{ ++ if (_brk_end > _brk_start) ++ reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); ++ ++ /* Mark brk area as locked down and no longer taking any ++ new allocations */ ++ _brk_start = 0; ++} ++ + #ifdef CONFIG_BLK_DEV_INITRD + + #ifdef CONFIG_X86_32 +@@ -586,20 +624,7 @@ static int __init setup_elfcorehdr(char + early_param("elfcorehdr", setup_elfcorehdr); + #endif + +-static int __init default_update_genapic(void) +-{ +-#ifdef CONFIG_X86_SMP +-# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) +- genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; +-# endif +-#endif +- +- return 0; +-} +- +-static struct x86_quirks default_x86_quirks __initdata = { +- .update_genapic = default_update_genapic, +-}; ++static struct x86_quirks default_x86_quirks __initdata; + + struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; + +@@ -656,7 +681,6 @@ void __init setup_arch(char **cmdline_p) + #ifdef CONFIG_X86_32 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + visws_early_detect(); +- pre_setup_arch_hook(); + #else + printk(KERN_INFO "Command line: %s\n", boot_command_line); + #endif +@@ -715,11 +739,7 @@ void __init setup_arch(char **cmdline_p) + init_mm.start_code = (unsigned long) _text; + init_mm.end_code = (unsigned long) _etext; + init_mm.end_data = (unsigned long) _edata; +-#ifdef CONFIG_X86_32 +- init_mm.brk = init_pg_tables_end + PAGE_OFFSET; +-#else +- init_mm.brk = (unsigned long) &_end; +-#endif ++ init_mm.brk = _brk_end; + + code_resource.start = virt_to_phys(_text); + code_resource.end = virt_to_phys(_etext)-1; +@@ -824,8 +844,7 @@ void __init setup_arch(char **cmdline_p) + #else + num_physpages = max_pfn; + +- if (cpu_has_x2apic) +- check_x2apic(); ++ check_x2apic(); + + /* How many end-of-memory variables you have, grandma! 
*/ + /* need this before calling reserve_initrd */ +@@ -841,6 +860,8 @@ void __init setup_arch(char **cmdline_p) + setup_bios_corruption_check(); + #endif + ++ reserve_brk(); ++ + /* max_pfn_mapped is updated here */ + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<arch_pre_intr_init) { ++ if (x86_quirks->arch_pre_intr_init()) ++ return; ++ } ++ init_ISA_irqs(); ++} + ++/** ++ * x86_quirk_intr_init - post gate setup interrupt initialisation ++ * ++ * Description: ++ * Fill in any interrupts that may have been left out by the general ++ * init_IRQ() routine. interrupts having to do with the machine rather ++ * than the devices on the I/O bus (like APIC interrupts in intel MP ++ * systems) are started here. ++ **/ ++void __init x86_quirk_intr_init(void) ++{ ++ if (x86_quirks->arch_intr_init) { ++ if (x86_quirks->arch_intr_init()) ++ return; ++ } ++} ++ ++/** ++ * x86_quirk_trap_init - initialise system specific traps ++ * ++ * Description: ++ * Called as the final act of trap_init(). Used in VISWS to initialise ++ * the various board specific APIC traps. ++ **/ ++void __init x86_quirk_trap_init(void) ++{ ++ if (x86_quirks->arch_trap_init) { ++ if (x86_quirks->arch_trap_init()) ++ return; ++ } ++} ++ ++static struct irqaction irq0 = { ++ .handler = timer_interrupt, ++ .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, ++ .mask = CPU_MASK_NONE, ++ .name = "timer" ++}; ++ ++/** ++ * x86_quirk_pre_time_init - do any specific initialisations before. ++ * ++ **/ ++void __init x86_quirk_pre_time_init(void) ++{ ++ if (x86_quirks->arch_pre_time_init) ++ x86_quirks->arch_pre_time_init(); ++} ++ ++/** ++ * x86_quirk_time_init - do any specific initialisations for the system timer. ++ * ++ * Description: ++ * Must plug the system timer interrupt source at HZ into the IRQ listed ++ * in irq_vectors.h:TIMER_IRQ ++ **/ ++void __init x86_quirk_time_init(void) ++{ ++ if (x86_quirks->arch_time_init) { ++ /* ++ * A nonzero return code does not mean failure, it means ++ * that the architecture quirk does not want any ++ * generic (timer) setup to be performed after this: ++ */ ++ if (x86_quirks->arch_time_init()) ++ return; ++ } ++ ++ irq0.mask = cpumask_of_cpu(0); ++ setup_irq(0, &irq0); ++} ++#endif /* CONFIG_X86_32 */ +Index: linux-2.6-tip/arch/x86/kernel/setup_percpu.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/setup_percpu.c ++++ linux-2.6-tip/arch/x86/kernel/setup_percpu.c +@@ -7,402 +7,439 @@ + #include + #include + #include ++#include + #include + #include + #include + #include + #include + #include ++#include ++#include ++#include ++#include + +-#ifdef CONFIG_X86_LOCAL_APIC +-unsigned int num_processors; +-unsigned disabled_cpus __cpuinitdata; +-/* Processor that is doing the boot up */ +-unsigned int boot_cpu_physical_apicid = -1U; +-EXPORT_SYMBOL(boot_cpu_physical_apicid); +-unsigned int max_physical_apicid; +- +-/* Bitmask of physically existing CPUs */ +-physid_mask_t phys_cpu_present_map; +-#endif +- +-/* map cpu index to physical APIC ID */ +-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); +-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); +- +-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +-#define X86_64_NUMA 1 +- +-/* map cpu index to node index */ +-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); +- +-/* 
which logical CPUs are on which nodes */ +-cpumask_t *node_to_cpumask_map; +-EXPORT_SYMBOL(node_to_cpumask_map); ++#ifdef CONFIG_DEBUG_PER_CPU_MAPS ++# define DBG(x...) printk(KERN_DEBUG x) ++#else ++# define DBG(x...) ++#endif + +-/* setup node_to_cpumask_map */ +-static void __init setup_node_to_cpumask_map(void); ++DEFINE_PER_CPU(int, cpu_number); ++EXPORT_PER_CPU_SYMBOL(cpu_number); + ++#ifdef CONFIG_X86_64 ++#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) + #else +-static inline void setup_node_to_cpumask_map(void) { } ++#define BOOT_PERCPU_OFFSET 0 + #endif + +-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) ++DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; ++EXPORT_PER_CPU_SYMBOL(this_cpu_off); ++ ++unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { ++ [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, ++}; ++EXPORT_SYMBOL(__per_cpu_offset); ++ + /* +- * Copy data used in early init routines from the initial arrays to the +- * per cpu data areas. These arrays then become expendable and the +- * *_early_ptr's are zeroed indicating that the static arrays are gone. ++ * On x86_64 symbols referenced from code should be reachable using ++ * 32bit relocations. Reserve space for static percpu variables in ++ * modules so that they are always served from the first chunk which ++ * is located at the percpu segment base. On x86_32, anything can ++ * address anywhere. No need to reserve space in the first chunk. + */ +-static void __init setup_per_cpu_maps(void) ++#ifdef CONFIG_X86_64 ++#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE ++#else ++#define PERCPU_FIRST_CHUNK_RESERVE 0 ++#endif ++ ++/** ++ * pcpu_need_numa - determine percpu allocation needs to consider NUMA ++ * ++ * If NUMA is not configured or there is only one NUMA node available, ++ * there is no reason to consider NUMA. This function determines ++ * whether percpu allocation should consider NUMA or not. ++ * ++ * RETURNS: ++ * true if NUMA should be considered; otherwise, false. ++ */ ++static bool __init pcpu_need_numa(void) + { +- int cpu; ++#ifdef CONFIG_NEED_MULTIPLE_NODES ++ pg_data_t *last = NULL; ++ unsigned int cpu; + + for_each_possible_cpu(cpu) { +- per_cpu(x86_cpu_to_apicid, cpu) = +- early_per_cpu_map(x86_cpu_to_apicid, cpu); +- per_cpu(x86_bios_cpu_apicid, cpu) = +- early_per_cpu_map(x86_bios_cpu_apicid, cpu); +-#ifdef X86_64_NUMA +- per_cpu(x86_cpu_to_node_map, cpu) = +- early_per_cpu_map(x86_cpu_to_node_map, cpu); +-#endif +- } ++ int node = early_cpu_to_node(cpu); + +- /* indicate the early static arrays will soon be gone */ +- early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; +- early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +-#ifdef X86_64_NUMA +- early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; ++ if (node_online(node) && NODE_DATA(node) && ++ last && last != NODE_DATA(node)) ++ return true; ++ ++ last = NODE_DATA(node); ++ } + #endif ++ return false; + } + +-#ifdef CONFIG_X86_32 +-/* +- * Great future not-so-futuristic plan: make i386 and x86_64 do it +- * the same way +- */ +-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +-EXPORT_SYMBOL(__per_cpu_offset); +-static inline void setup_cpu_pda_map(void) { } +- +-#elif !defined(CONFIG_SMP) +-static inline void setup_cpu_pda_map(void) { } +- +-#else /* CONFIG_SMP && CONFIG_X86_64 */ +- +-/* +- * Allocate cpu_pda pointer table and array via alloc_bootmem. 
++/** ++ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu ++ * @cpu: cpu to allocate for ++ * @size: size allocation in bytes ++ * @align: alignment ++ * ++ * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper ++ * does the right thing for NUMA regardless of the current ++ * configuration. ++ * ++ * RETURNS: ++ * Pointer to the allocated area on success, NULL on failure. + */ +-static void __init setup_cpu_pda_map(void) ++static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, ++ unsigned long align) + { +- char *pda; +- struct x8664_pda **new_cpu_pda; +- unsigned long size; +- int cpu; +- +- size = roundup(sizeof(struct x8664_pda), cache_line_size()); +- +- /* allocate cpu_pda array and pointer table */ +- { +- unsigned long tsize = nr_cpu_ids * sizeof(void *); +- unsigned long asize = size * (nr_cpu_ids - 1); +- +- tsize = roundup(tsize, cache_line_size()); +- new_cpu_pda = alloc_bootmem(tsize + asize); +- pda = (char *)new_cpu_pda + tsize; ++ const unsigned long goal = __pa(MAX_DMA_ADDRESS); ++#ifdef CONFIG_NEED_MULTIPLE_NODES ++ int node = early_cpu_to_node(cpu); ++ void *ptr; ++ ++ if (!node_online(node) || !NODE_DATA(node)) { ++ ptr = __alloc_bootmem_nopanic(size, align, goal); ++ pr_info("cpu %d has no node %d or node-local memory\n", ++ cpu, node); ++ pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", ++ cpu, size, __pa(ptr)); ++ } else { ++ ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), ++ size, align, goal); ++ pr_debug("per cpu data for cpu%d %lu bytes on node%d at " ++ "%016lx\n", cpu, size, node, __pa(ptr)); + } +- +- /* initialize pointer table to static pda's */ +- for_each_possible_cpu(cpu) { +- if (cpu == 0) { +- /* leave boot cpu pda in place */ +- new_cpu_pda[0] = cpu_pda(0); +- continue; +- } +- new_cpu_pda[cpu] = (struct x8664_pda *)pda; +- new_cpu_pda[cpu]->in_bootmem = 1; +- pda += size; +- } +- +- /* point to new pointer table */ +- _cpu_pda = new_cpu_pda; ++ return ptr; ++#else ++ return __alloc_bootmem_nopanic(size, align, goal); ++#endif + } + +-#endif /* CONFIG_SMP && CONFIG_X86_64 */ +- +-#ifdef CONFIG_X86_64 ++/* ++ * Remap allocator ++ * ++ * This allocator uses PMD page as unit. A PMD page is allocated for ++ * each cpu and each is remapped into vmalloc area using PMD mapping. ++ * As PMD page is quite large, only part of it is used for the first ++ * chunk. Unused part is returned to the bootmem allocator. ++ * ++ * So, the PMD pages are mapped twice - once to the physical mapping ++ * and to the vmalloc area for the first percpu chunk. The double ++ * mapping does add one more PMD TLB entry pressure but still is much ++ * better than only using 4k mappings while still being NUMA friendly. 
++ */ ++#ifdef CONFIG_NEED_MULTIPLE_NODES ++static size_t pcpur_size __initdata; ++static void **pcpur_ptrs __initdata; + +-/* correctly size the local cpu masks */ +-static void __init setup_cpu_local_masks(void) ++static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) + { +- alloc_bootmem_cpumask_var(&cpu_initialized_mask); +- alloc_bootmem_cpumask_var(&cpu_callin_mask); +- alloc_bootmem_cpumask_var(&cpu_callout_mask); +- alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +-} ++ size_t off = (size_t)pageno << PAGE_SHIFT; + +-#else /* CONFIG_X86_32 */ ++ if (off >= pcpur_size) ++ return NULL; + +-static inline void setup_cpu_local_masks(void) +-{ ++ return virt_to_page(pcpur_ptrs[cpu] + off); + } + +-#endif /* CONFIG_X86_32 */ +- +-/* +- * Great future plan: +- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. +- * Always point %gs to its beginning +- */ +-void __init setup_per_cpu_areas(void) ++static ssize_t __init setup_pcpu_remap(size_t static_size) + { +- ssize_t size, old_size; +- char *ptr; +- int cpu; +- unsigned long align = 1; +- +- /* Setup cpu_pda map */ +- setup_cpu_pda_map(); +- +- /* Copy section for each CPU (we discard the original) */ +- old_size = PERCPU_ENOUGH_ROOM; +- align = max_t(unsigned long, PAGE_SIZE, align); +- size = roundup(old_size, align); ++ static struct vm_struct vm; ++ pg_data_t *last; ++ size_t ptrs_size, dyn_size; ++ unsigned int cpu; ++ ssize_t ret; + +- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", +- NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); +- +- pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); ++ /* ++ * If large page isn't supported, there's no benefit in doing ++ * this. Also, on non-NUMA, embedding is better. ++ */ ++ if (!cpu_has_pse || pcpu_need_numa()) ++ return -EINVAL; + ++ last = NULL; + for_each_possible_cpu(cpu) { +-#ifndef CONFIG_NEED_MULTIPLE_NODES +- ptr = __alloc_bootmem(size, align, +- __pa(MAX_DMA_ADDRESS)); +-#else + int node = early_cpu_to_node(cpu); +- if (!node_online(node) || !NODE_DATA(node)) { +- ptr = __alloc_bootmem(size, align, +- __pa(MAX_DMA_ADDRESS)); +- pr_info("cpu %d has no node %d or node-local memory\n", +- cpu, node); +- pr_debug("per cpu data for cpu%d at %016lx\n", +- cpu, __pa(ptr)); +- } else { +- ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, +- __pa(MAX_DMA_ADDRESS)); +- pr_debug("per cpu data for cpu%d on node%d at %016lx\n", +- cpu, node, __pa(ptr)); +- } +-#endif +- per_cpu_offset(cpu) = ptr - __per_cpu_start; +- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); +- } + +- /* Setup percpu data maps */ +- setup_per_cpu_maps(); ++ if (node_online(node) && NODE_DATA(node) && ++ last && last != NODE_DATA(node)) ++ goto proceed; ++ ++ last = NODE_DATA(node); ++ } ++ return -EINVAL; ++ ++proceed: ++ /* ++ * Currently supports only single page. Supporting multiple ++ * pages won't be too difficult if it ever becomes necessary. 
++ */ ++ pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + ++ PERCPU_DYNAMIC_RESERVE); ++ if (pcpur_size > PMD_SIZE) { ++ pr_warning("PERCPU: static data is larger than large page, " ++ "can't use large page\n"); ++ return -EINVAL; ++ } ++ dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; ++ ++ /* allocate pointer array and alloc large pages */ ++ ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); ++ pcpur_ptrs = alloc_bootmem(ptrs_size); + +- /* Setup node to cpumask map */ +- setup_node_to_cpumask_map(); ++ for_each_possible_cpu(cpu) { ++ pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); ++ if (!pcpur_ptrs[cpu]) ++ goto enomem; ++ ++ /* ++ * Only use pcpur_size bytes and give back the rest. ++ * ++ * Ingo: The 2MB up-rounding bootmem is needed to make ++ * sure the partial 2MB page is still fully RAM - it's ++ * not well-specified to have a PAT-incompatible area ++ * (unmapped RAM, device memory, etc.) in that hole. ++ */ ++ free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), ++ PMD_SIZE - pcpur_size); ++ ++ memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); ++ } ++ ++ /* allocate address and map */ ++ vm.flags = VM_ALLOC; ++ vm.size = num_possible_cpus() * PMD_SIZE; ++ vm_area_register_early(&vm, PMD_SIZE); + +- /* Setup cpu initialized, callin, callout masks */ +- setup_cpu_local_masks(); +-} ++ for_each_possible_cpu(cpu) { ++ pmd_t *pmd; + ++ pmd = populate_extra_pmd((unsigned long)vm.addr ++ + cpu * PMD_SIZE); ++ set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), ++ PAGE_KERNEL_LARGE)); ++ } ++ ++ /* we're ready, commit */ ++ pr_info("PERCPU: Remapped at %p with large pages, static data " ++ "%zu bytes\n", vm.addr, static_size); ++ ++ ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, ++ PERCPU_FIRST_CHUNK_RESERVE, dyn_size, ++ PMD_SIZE, vm.addr, NULL); ++ goto out_free_ar; ++ ++enomem: ++ for_each_possible_cpu(cpu) ++ if (pcpur_ptrs[cpu]) ++ free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); ++ ret = -ENOMEM; ++out_free_ar: ++ free_bootmem(__pa(pcpur_ptrs), ptrs_size); ++ return ret; ++} ++#else ++static ssize_t __init setup_pcpu_remap(size_t static_size) ++{ ++ return -EINVAL; ++} + #endif + +-#ifdef X86_64_NUMA +- + /* +- * Allocate node_to_cpumask_map based on number of available nodes +- * Requires node_possible_map to be valid. ++ * Embedding allocator + * +- * Note: node_to_cpumask() is not valid until after this is done. ++ * The first chunk is sized to just contain the static area plus ++ * module and dynamic reserves and embedded into linear physical ++ * mapping so that it can use PMD mapping without additional TLB ++ * pressure. + */ +-static void __init setup_node_to_cpumask_map(void) ++static ssize_t __init setup_pcpu_embed(size_t static_size) + { +- unsigned int node, num = 0; +- cpumask_t *map; +- +- /* setup nr_node_ids if not done yet */ +- if (nr_node_ids == MAX_NUMNODES) { +- for_each_node_mask(node, node_possible_map) +- num = node; +- nr_node_ids = num + 1; +- } +- +- /* allocate the map */ +- map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); ++ size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + +- pr_debug("Node to cpumask map at %p for %d nodes\n", +- map, nr_node_ids); ++ /* ++ * If large page isn't supported, there's no benefit in doing ++ * this. Also, embedding allocation doesn't play well with ++ * NUMA. 
++ */ ++ if (!cpu_has_pse || pcpu_need_numa()) ++ return -EINVAL; + +- /* node_to_cpumask() will now work */ +- node_to_cpumask_map = map; ++ return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, ++ reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); + } + +-void __cpuinit numa_set_node(int cpu, int node) +-{ +- int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); +- +- if (cpu_pda(cpu) && node != NUMA_NO_NODE) +- cpu_pda(cpu)->nodenumber = node; +- +- if (cpu_to_node_map) +- cpu_to_node_map[cpu] = node; +- +- else if (per_cpu_offset(cpu)) +- per_cpu(x86_cpu_to_node_map, cpu) = node; +- +- else +- pr_debug("Setting node for non-present cpu %d\n", cpu); +-} ++/* ++ * 4k page allocator ++ * ++ * This is the basic allocator. Static percpu area is allocated ++ * page-by-page and most of initialization is done by the generic ++ * setup function. ++ */ ++static struct page **pcpu4k_pages __initdata; ++static int pcpu4k_nr_static_pages __initdata; + +-void __cpuinit numa_clear_node(int cpu) ++static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) + { +- numa_set_node(cpu, NUMA_NO_NODE); ++ if (pageno < pcpu4k_nr_static_pages) ++ return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; ++ return NULL; + } + +-#ifndef CONFIG_DEBUG_PER_CPU_MAPS +- +-void __cpuinit numa_add_cpu(int cpu) ++static void __init pcpu4k_populate_pte(unsigned long addr) + { +- cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); ++ populate_extra_pte(addr); + } + +-void __cpuinit numa_remove_cpu(int cpu) ++static ssize_t __init setup_pcpu_4k(size_t static_size) + { +- cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); +-} ++ size_t pages_size; ++ unsigned int cpu; ++ int i, j; ++ ssize_t ret; + +-#else /* CONFIG_DEBUG_PER_CPU_MAPS */ ++ pcpu4k_nr_static_pages = PFN_UP(static_size); + +-/* +- * --------- debug versions of the numa functions --------- +- */ +-static void __cpuinit numa_set_cpumask(int cpu, int enable) +-{ +- int node = cpu_to_node(cpu); +- cpumask_t *mask; +- char buf[64]; +- +- if (node_to_cpumask_map == NULL) { +- printk(KERN_ERR "node_to_cpumask_map NULL\n"); +- dump_stack(); +- return; +- } ++ /* unaligned allocations can't be freed, round up to page size */ ++ pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() ++ * sizeof(pcpu4k_pages[0])); ++ pcpu4k_pages = alloc_bootmem(pages_size); + +- mask = &node_to_cpumask_map[node]; +- if (enable) +- cpu_set(cpu, *mask); +- else +- cpu_clear(cpu, *mask); ++ /* allocate and copy */ ++ j = 0; ++ for_each_possible_cpu(cpu) ++ for (i = 0; i < pcpu4k_nr_static_pages; i++) { ++ void *ptr; + +- cpulist_scnprintf(buf, sizeof(buf), mask); +- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", +- enable ? 
"numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); +-} ++ ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); ++ if (!ptr) ++ goto enomem; + +-void __cpuinit numa_add_cpu(int cpu) +-{ +- numa_set_cpumask(cpu, 1); +-} ++ memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); ++ pcpu4k_pages[j++] = virt_to_page(ptr); ++ } + +-void __cpuinit numa_remove_cpu(int cpu) +-{ +- numa_set_cpumask(cpu, 0); ++ /* we're ready, commit */ ++ pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", ++ pcpu4k_nr_static_pages, static_size); ++ ++ ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, ++ PERCPU_FIRST_CHUNK_RESERVE, -1, ++ -1, NULL, pcpu4k_populate_pte); ++ goto out_free_ar; ++ ++enomem: ++ while (--j >= 0) ++ free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); ++ ret = -ENOMEM; ++out_free_ar: ++ free_bootmem(__pa(pcpu4k_pages), pages_size); ++ return ret; + } + +-int cpu_to_node(int cpu) ++static inline void setup_percpu_segment(int cpu) + { +- if (early_per_cpu_ptr(x86_cpu_to_node_map)) { +- printk(KERN_WARNING +- "cpu_to_node(%d): usage too early!\n", cpu); +- dump_stack(); +- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; +- } +- return per_cpu(x86_cpu_to_node_map, cpu); ++#ifdef CONFIG_X86_32 ++ struct desc_struct gdt; ++ ++ pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, ++ 0x2 | DESCTYPE_S, 0x8); ++ gdt.s = 1; ++ write_gdt_entry(get_cpu_gdt_table(cpu), ++ GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); ++#endif + } +-EXPORT_SYMBOL(cpu_to_node); + + /* +- * Same function as cpu_to_node() but used if called before the +- * per_cpu areas are setup. ++ * Great future plan: ++ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. ++ * Always point %gs to its beginning + */ +-int early_cpu_to_node(int cpu) ++void __init setup_per_cpu_areas(void) + { +- if (early_per_cpu_ptr(x86_cpu_to_node_map)) +- return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; +- +- if (!per_cpu_offset(cpu)) { +- printk(KERN_WARNING +- "early_cpu_to_node(%d): no per_cpu area!\n", cpu); +- dump_stack(); +- return NUMA_NO_NODE; +- } +- return per_cpu(x86_cpu_to_node_map, cpu); +-} ++ size_t static_size = __per_cpu_end - __per_cpu_start; ++ unsigned int cpu; ++ unsigned long delta; ++ size_t pcpu_unit_size; ++ ssize_t ret; + ++ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", ++ NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + +-/* empty cpumask */ +-static const cpumask_t cpu_mask_none; ++ /* ++ * Allocate percpu area. If PSE is supported, try to make use ++ * of large page mappings. Please read comments on top of ++ * each allocator for details. ++ */ ++ ret = setup_pcpu_remap(static_size); ++ if (ret < 0) ++ ret = setup_pcpu_embed(static_size); ++ if (ret < 0) ++ ret = setup_pcpu_4k(static_size); ++ if (ret < 0) ++ panic("cannot allocate static percpu area (%zu bytes, err=%zd)", ++ static_size, ret); + +-/* +- * Returns a pointer to the bitmask of CPUs on Node 'node'. +- */ +-const cpumask_t *cpumask_of_node(int node) +-{ +- if (node_to_cpumask_map == NULL) { +- printk(KERN_WARNING +- "cpumask_of_node(%d): no node_to_cpumask_map!\n", +- node); +- dump_stack(); +- return (const cpumask_t *)&cpu_online_map; +- } +- if (node >= nr_node_ids) { +- printk(KERN_WARNING +- "cpumask_of_node(%d): node > nr_node_ids(%d)\n", +- node, nr_node_ids); +- dump_stack(); +- return &cpu_mask_none; +- } +- return &node_to_cpumask_map[node]; +-} +-EXPORT_SYMBOL(cpumask_of_node); ++ pcpu_unit_size = ret; + +-/* +- * Returns a bitmask of CPUs on Node 'node'. 
+- * +- * Side note: this function creates the returned cpumask on the stack +- * so with a high NR_CPUS count, excessive stack space is used. The +- * node_to_cpumask_ptr function should be used whenever possible. +- */ +-cpumask_t node_to_cpumask(int node) +-{ +- if (node_to_cpumask_map == NULL) { +- printk(KERN_WARNING +- "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); +- dump_stack(); +- return cpu_online_map; +- } +- if (node >= nr_node_ids) { +- printk(KERN_WARNING +- "node_to_cpumask(%d): node > nr_node_ids(%d)\n", +- node, nr_node_ids); +- dump_stack(); +- return cpu_mask_none; ++ /* alrighty, percpu areas up and running */ ++ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; ++ for_each_possible_cpu(cpu) { ++ per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; ++ per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); ++ per_cpu(cpu_number, cpu) = cpu; ++ setup_percpu_segment(cpu); ++ setup_stack_canary_segment(cpu); ++ /* ++ * Copy data used in early init routines from the ++ * initial arrays to the per cpu data areas. These ++ * arrays then become expendable and the *_early_ptr's ++ * are zeroed indicating that the static arrays are ++ * gone. ++ */ ++#ifdef CONFIG_X86_LOCAL_APIC ++ per_cpu(x86_cpu_to_apicid, cpu) = ++ early_per_cpu_map(x86_cpu_to_apicid, cpu); ++ per_cpu(x86_bios_cpu_apicid, cpu) = ++ early_per_cpu_map(x86_bios_cpu_apicid, cpu); ++#endif ++#ifdef CONFIG_X86_64 ++ per_cpu(irq_stack_ptr, cpu) = ++ per_cpu(irq_stack_union.irq_stack, cpu) + ++ IRQ_STACK_SIZE - 64; ++#ifdef CONFIG_NUMA ++ per_cpu(x86_cpu_to_node_map, cpu) = ++ early_per_cpu_map(x86_cpu_to_node_map, cpu); ++#endif ++#endif ++ /* ++ * Up to this point, the boot CPU has been using .data.init ++ * area. Reload any changed state for the boot CPU. 
++ */ ++ if (cpu == boot_cpu_id) ++ switch_to_new_gdt(cpu); + } +- return node_to_cpumask_map[node]; +-} +-EXPORT_SYMBOL(node_to_cpumask); + +-/* +- * --------- end of debug versions of the numa functions --------- +- */ +- +-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ ++ /* indicate the early static arrays will soon be gone */ ++#ifdef CONFIG_X86_LOCAL_APIC ++ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; ++ early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; ++#endif ++#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) ++ early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; ++#endif + +-#endif /* X86_64_NUMA */ ++ /* Setup node to cpumask map */ ++ setup_node_to_cpumask_map(); + ++ /* Setup cpu initialized, callin, callout masks */ ++ setup_cpu_local_masks(); ++} +Index: linux-2.6-tip/arch/x86/kernel/signal.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/signal.c ++++ linux-2.6-tip/arch/x86/kernel/signal.c +@@ -6,7 +6,7 @@ + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-2002 x86-64 support by Andi Kleen + */ +- ++#include + #include + #include + #include +@@ -50,27 +50,23 @@ + # define FIX_EFLAGS __FIX_EFLAGS + #endif + +-#define COPY(x) { \ +- err |= __get_user(regs->x, &sc->x); \ +-} +- +-#define COPY_SEG(seg) { \ +- unsigned short tmp; \ +- err |= __get_user(tmp, &sc->seg); \ +- regs->seg = tmp; \ +-} +- +-#define COPY_SEG_CPL3(seg) { \ +- unsigned short tmp; \ +- err |= __get_user(tmp, &sc->seg); \ +- regs->seg = tmp | 3; \ +-} +- +-#define GET_SEG(seg) { \ +- unsigned short tmp; \ +- err |= __get_user(tmp, &sc->seg); \ +- loadsegment(seg, tmp); \ +-} ++#define COPY(x) do { \ ++ get_user_ex(regs->x, &sc->x); \ ++} while (0) ++ ++#define GET_SEG(seg) ({ \ ++ unsigned short tmp; \ ++ get_user_ex(tmp, &sc->seg); \ ++ tmp; \ ++}) ++ ++#define COPY_SEG(seg) do { \ ++ regs->seg = GET_SEG(seg); \ ++} while (0) ++ ++#define COPY_SEG_CPL3(seg) do { \ ++ regs->seg = GET_SEG(seg) | 3; \ ++} while (0) + + static int + restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, +@@ -83,45 +79,49 @@ restore_sigcontext(struct pt_regs *regs, + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + ++ get_user_try { ++ + #ifdef CONFIG_X86_32 +- GET_SEG(gs); +- COPY_SEG(fs); +- COPY_SEG(es); +- COPY_SEG(ds); ++ set_user_gs(regs, GET_SEG(gs)); ++ COPY_SEG(fs); ++ COPY_SEG(es); ++ COPY_SEG(ds); + #endif /* CONFIG_X86_32 */ + +- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); +- COPY(dx); COPY(cx); COPY(ip); ++ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); ++ COPY(dx); COPY(cx); COPY(ip); + + #ifdef CONFIG_X86_64 +- COPY(r8); +- COPY(r9); +- COPY(r10); +- COPY(r11); +- COPY(r12); +- COPY(r13); +- COPY(r14); +- COPY(r15); ++ COPY(r8); ++ COPY(r9); ++ COPY(r10); ++ COPY(r11); ++ COPY(r12); ++ COPY(r13); ++ COPY(r14); ++ COPY(r15); + #endif /* CONFIG_X86_64 */ + + #ifdef CONFIG_X86_32 +- COPY_SEG_CPL3(cs); +- COPY_SEG_CPL3(ss); ++ COPY_SEG_CPL3(cs); ++ COPY_SEG_CPL3(ss); + #else /* !CONFIG_X86_32 */ +- /* Kernel saves and restores only the CS segment register on signals, +- * which is the bare minimum needed to allow mixed 32/64-bit code. +- * App's signal handler can save/restore other segments if needed. */ +- COPY_SEG_CPL3(cs); ++ /* Kernel saves and restores only the CS segment register on signals, ++ * which is the bare minimum needed to allow mixed 32/64-bit code. ++ * App's signal handler can save/restore other segments if needed. 
*/ ++ COPY_SEG_CPL3(cs); + #endif /* CONFIG_X86_32 */ + +- err |= __get_user(tmpflags, &sc->flags); +- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); +- regs->orig_ax = -1; /* disable syscall checks */ ++ get_user_ex(tmpflags, &sc->flags); ++ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); ++ regs->orig_ax = -1; /* disable syscall checks */ ++ ++ get_user_ex(buf, &sc->fpstate); ++ err |= restore_i387_xstate(buf); + +- err |= __get_user(buf, &sc->fpstate); +- err |= restore_i387_xstate(buf); ++ get_user_ex(*pax, &sc->ax); ++ } get_user_catch(err); + +- err |= __get_user(*pax, &sc->ax); + return err; + } + +@@ -131,57 +131,55 @@ setup_sigcontext(struct sigcontext __use + { + int err = 0; + +-#ifdef CONFIG_X86_32 +- { +- unsigned int tmp; ++ put_user_try { + +- savesegment(gs, tmp); +- err |= __put_user(tmp, (unsigned int __user *)&sc->gs); +- } +- err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); +- err |= __put_user(regs->es, (unsigned int __user *)&sc->es); +- err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); +-#endif /* CONFIG_X86_32 */ +- +- err |= __put_user(regs->di, &sc->di); +- err |= __put_user(regs->si, &sc->si); +- err |= __put_user(regs->bp, &sc->bp); +- err |= __put_user(regs->sp, &sc->sp); +- err |= __put_user(regs->bx, &sc->bx); +- err |= __put_user(regs->dx, &sc->dx); +- err |= __put_user(regs->cx, &sc->cx); +- err |= __put_user(regs->ax, &sc->ax); ++#ifdef CONFIG_X86_32 ++ put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); ++ put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); ++ put_user_ex(regs->es, (unsigned int __user *)&sc->es); ++ put_user_ex(regs->ds, (unsigned int __user *)&sc->ds); ++#endif /* CONFIG_X86_32 */ ++ ++ put_user_ex(regs->di, &sc->di); ++ put_user_ex(regs->si, &sc->si); ++ put_user_ex(regs->bp, &sc->bp); ++ put_user_ex(regs->sp, &sc->sp); ++ put_user_ex(regs->bx, &sc->bx); ++ put_user_ex(regs->dx, &sc->dx); ++ put_user_ex(regs->cx, &sc->cx); ++ put_user_ex(regs->ax, &sc->ax); + #ifdef CONFIG_X86_64 +- err |= __put_user(regs->r8, &sc->r8); +- err |= __put_user(regs->r9, &sc->r9); +- err |= __put_user(regs->r10, &sc->r10); +- err |= __put_user(regs->r11, &sc->r11); +- err |= __put_user(regs->r12, &sc->r12); +- err |= __put_user(regs->r13, &sc->r13); +- err |= __put_user(regs->r14, &sc->r14); +- err |= __put_user(regs->r15, &sc->r15); ++ put_user_ex(regs->r8, &sc->r8); ++ put_user_ex(regs->r9, &sc->r9); ++ put_user_ex(regs->r10, &sc->r10); ++ put_user_ex(regs->r11, &sc->r11); ++ put_user_ex(regs->r12, &sc->r12); ++ put_user_ex(regs->r13, &sc->r13); ++ put_user_ex(regs->r14, &sc->r14); ++ put_user_ex(regs->r15, &sc->r15); + #endif /* CONFIG_X86_64 */ + +- err |= __put_user(current->thread.trap_no, &sc->trapno); +- err |= __put_user(current->thread.error_code, &sc->err); +- err |= __put_user(regs->ip, &sc->ip); +-#ifdef CONFIG_X86_32 +- err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); +- err |= __put_user(regs->flags, &sc->flags); +- err |= __put_user(regs->sp, &sc->sp_at_signal); +- err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); ++ put_user_ex(current->thread.trap_no, &sc->trapno); ++ put_user_ex(current->thread.error_code, &sc->err); ++ put_user_ex(regs->ip, &sc->ip); ++#ifdef CONFIG_X86_32 ++ put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); ++ put_user_ex(regs->flags, &sc->flags); ++ put_user_ex(regs->sp, &sc->sp_at_signal); ++ put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); + #else /* !CONFIG_X86_32 */ +- err |= __put_user(regs->flags, 
&sc->flags); +- err |= __put_user(regs->cs, &sc->cs); +- err |= __put_user(0, &sc->gs); +- err |= __put_user(0, &sc->fs); ++ put_user_ex(regs->flags, &sc->flags); ++ put_user_ex(regs->cs, &sc->cs); ++ put_user_ex(0, &sc->gs); ++ put_user_ex(0, &sc->fs); + #endif /* CONFIG_X86_32 */ + +- err |= __put_user(fpstate, &sc->fpstate); +- +- /* non-iBCS2 extensions.. */ +- err |= __put_user(mask, &sc->oldmask); +- err |= __put_user(current->thread.cr2, &sc->cr2); ++ put_user_ex(fpstate, &sc->fpstate); ++ ++ /* non-iBCS2 extensions.. */ ++ put_user_ex(mask, &sc->oldmask); ++ put_user_ex(current->thread.cr2, &sc->cr2); ++ } put_user_catch(err); + + return err; + } +@@ -189,6 +187,77 @@ setup_sigcontext(struct sigcontext __use + /* + * Set up a signal frame. + */ ++ ++/* ++ * Determine which stack to use.. ++ */ ++static unsigned long align_sigframe(unsigned long sp) ++{ ++#ifdef CONFIG_X86_32 ++ /* ++ * Align the stack pointer according to the i386 ABI, ++ * i.e. so that on function entry ((sp + 4) & 15) == 0. ++ */ ++ sp = ((sp + 4) & -16ul) - 4; ++#else /* !CONFIG_X86_32 */ ++ sp = round_down(sp, 16) - 8; ++#endif ++ return sp; ++} ++ ++static inline void __user * ++get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, ++ void __user **fpstate) ++{ ++ /* Default to using normal stack */ ++ unsigned long sp = regs->sp; ++ int onsigstack = on_sig_stack(sp); ++ ++#ifdef CONFIG_X86_64 ++ /* redzone */ ++ sp -= 128; ++#endif /* CONFIG_X86_64 */ ++ ++ if (!onsigstack) { ++ /* This is the X/Open sanctioned signal stack switching. */ ++ if (ka->sa.sa_flags & SA_ONSTACK) { ++ if (sas_ss_flags(sp) == 0) ++ sp = current->sas_ss_sp + current->sas_ss_size; ++ } else { ++#ifdef CONFIG_X86_32 ++ /* This is the legacy signal stack switching. */ ++ if ((regs->ss & 0xffff) != __USER_DS && ++ !(ka->sa.sa_flags & SA_RESTORER) && ++ ka->sa.sa_restorer) ++ sp = (unsigned long) ka->sa.sa_restorer; ++#endif /* CONFIG_X86_32 */ ++ } ++ } ++ ++ if (used_math()) { ++ sp -= sig_xstate_size; ++#ifdef CONFIG_X86_64 ++ sp = round_down(sp, 64); ++#endif /* CONFIG_X86_64 */ ++ *fpstate = (void __user *)sp; ++ } ++ ++ sp = align_sigframe(sp - frame_size); ++ ++ /* ++ * If we are on the alternate signal stack and would overflow it, don't. ++ * Return an always-bogus address instead so we will die with SIGSEGV. ++ */ ++ if (onsigstack && !likely(on_sig_stack(sp))) ++ return (void __user *)-1L; ++ ++ /* save i387 state */ ++ if (used_math() && save_i387_xstate(*fpstate) < 0) ++ return (void __user *)-1L; ++ ++ return (void __user *)sp; ++} ++ + #ifdef CONFIG_X86_32 + static const struct { + u16 poplmovl; +@@ -212,54 +281,6 @@ static const struct { + 0 + }; + +-/* +- * Determine which stack to use.. +- */ +-static inline void __user * +-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, +- void **fpstate) +-{ +- unsigned long sp; +- +- /* Default to using normal stack */ +- sp = regs->sp; +- +- /* +- * If we are on the alternate signal stack and would overflow it, don't. +- * Return an always-bogus address instead so we will die with SIGSEGV. +- */ +- if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) +- return (void __user *) -1L; +- +- /* This is the X/Open sanctioned signal stack switching. */ +- if (ka->sa.sa_flags & SA_ONSTACK) { +- if (sas_ss_flags(sp) == 0) +- sp = current->sas_ss_sp + current->sas_ss_size; +- } else { +- /* This is the legacy signal stack switching. 
*/ +- if ((regs->ss & 0xffff) != __USER_DS && +- !(ka->sa.sa_flags & SA_RESTORER) && +- ka->sa.sa_restorer) +- sp = (unsigned long) ka->sa.sa_restorer; +- } +- +- if (used_math()) { +- sp = sp - sig_xstate_size; +- *fpstate = (struct _fpstate *) sp; +- if (save_i387_xstate(*fpstate) < 0) +- return (void __user *)-1L; +- } +- +- sp -= frame_size; +- /* +- * Align the stack pointer according to the i386 ABI, +- * i.e. so that on function entry ((sp + 4) & 15) == 0. +- */ +- sp = ((sp + 4) & -16ul) - 4; +- +- return (void __user *) sp; +-} +- + static int + __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, + struct pt_regs *regs) +@@ -336,43 +357,41 @@ static int __setup_rt_frame(int sig, str + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + +- err |= __put_user(sig, &frame->sig); +- err |= __put_user(&frame->info, &frame->pinfo); +- err |= __put_user(&frame->uc, &frame->puc); +- err |= copy_siginfo_to_user(&frame->info, info); +- if (err) +- return -EFAULT; +- +- /* Create the ucontext. */ +- if (cpu_has_xsave) +- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); +- else +- err |= __put_user(0, &frame->uc.uc_flags); +- err |= __put_user(0, &frame->uc.uc_link); +- err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); +- err |= __put_user(sas_ss_flags(regs->sp), +- &frame->uc.uc_stack.ss_flags); +- err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); +- err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, +- regs, set->sig[0]); +- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); +- if (err) +- return -EFAULT; +- +- /* Set up to return from userspace. */ +- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); +- if (ka->sa.sa_flags & SA_RESTORER) +- restorer = ka->sa.sa_restorer; +- err |= __put_user(restorer, &frame->pretcode); ++ put_user_try { ++ put_user_ex(sig, &frame->sig); ++ put_user_ex(&frame->info, &frame->pinfo); ++ put_user_ex(&frame->uc, &frame->puc); ++ err |= copy_siginfo_to_user(&frame->info, info); ++ ++ /* Create the ucontext. */ ++ if (cpu_has_xsave) ++ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); ++ else ++ put_user_ex(0, &frame->uc.uc_flags); ++ put_user_ex(0, &frame->uc.uc_link); ++ put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); ++ put_user_ex(sas_ss_flags(regs->sp), ++ &frame->uc.uc_stack.ss_flags); ++ put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); ++ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, ++ regs, set->sig[0]); ++ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); ++ ++ /* Set up to return from userspace. */ ++ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); ++ if (ka->sa.sa_flags & SA_RESTORER) ++ restorer = ka->sa.sa_restorer; ++ put_user_ex(restorer, &frame->pretcode); + +- /* +- * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 +- * +- * WE DO NOT USE IT ANY MORE! It's only left here for historical +- * reasons and because gdb uses it as a signature to notice +- * signal handler stack frames. +- */ +- err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); ++ /* ++ * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 ++ * ++ * WE DO NOT USE IT ANY MORE! It's only left here for historical ++ * reasons and because gdb uses it as a signature to notice ++ * signal handler stack frames. 
++ */ ++ put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); ++ } put_user_catch(err); + + if (err) + return -EFAULT; +@@ -392,24 +411,6 @@ static int __setup_rt_frame(int sig, str + return 0; + } + #else /* !CONFIG_X86_32 */ +-/* +- * Determine which stack to use.. +- */ +-static void __user * +-get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) +-{ +- /* Default to using normal stack - redzone*/ +- sp -= 128; +- +- /* This is the X/Open sanctioned signal stack switching. */ +- if (ka->sa.sa_flags & SA_ONSTACK) { +- if (sas_ss_flags(sp) == 0) +- sp = current->sas_ss_sp + current->sas_ss_size; +- } +- +- return (void __user *)round_down(sp - size, 64); +-} +- + static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) + { +@@ -418,15 +419,7 @@ static int __setup_rt_frame(int sig, str + int err = 0; + struct task_struct *me = current; + +- if (used_math()) { +- fp = get_stack(ka, regs->sp, sig_xstate_size); +- frame = (void __user *)round_down( +- (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; +- +- if (save_i387_xstate(fp) < 0) +- return -EFAULT; +- } else +- frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8; ++ frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; +@@ -436,28 +429,30 @@ static int __setup_rt_frame(int sig, str + return -EFAULT; + } + +- /* Create the ucontext. */ +- if (cpu_has_xsave) +- err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); +- else +- err |= __put_user(0, &frame->uc.uc_flags); +- err |= __put_user(0, &frame->uc.uc_link); +- err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); +- err |= __put_user(sas_ss_flags(regs->sp), +- &frame->uc.uc_stack.ss_flags); +- err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); +- err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); +- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); +- +- /* Set up to return from userspace. If provided, use a stub +- already in userspace. */ +- /* x86-64 should always use SA_RESTORER. */ +- if (ka->sa.sa_flags & SA_RESTORER) { +- err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); +- } else { +- /* could use a vstub here */ +- return -EFAULT; +- } ++ put_user_try { ++ /* Create the ucontext. */ ++ if (cpu_has_xsave) ++ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); ++ else ++ put_user_ex(0, &frame->uc.uc_flags); ++ put_user_ex(0, &frame->uc.uc_link); ++ put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); ++ put_user_ex(sas_ss_flags(regs->sp), ++ &frame->uc.uc_stack.ss_flags); ++ put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); ++ err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); ++ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); ++ ++ /* Set up to return from userspace. If provided, use a stub ++ already in userspace. */ ++ /* x86-64 should always use SA_RESTORER. 
*/ ++ if (ka->sa.sa_flags & SA_RESTORER) { ++ put_user_ex(ka->sa.sa_restorer, &frame->pretcode); ++ } else { ++ /* could use a vstub here */ ++ err |= -EFAULT; ++ } ++ } put_user_catch(err); + + if (err) + return -EFAULT; +@@ -509,31 +504,41 @@ sys_sigaction(int sig, const struct old_ + struct old_sigaction __user *oact) + { + struct k_sigaction new_ka, old_ka; +- int ret; ++ int ret = 0; + + if (act) { + old_sigset_t mask; + +- if (!access_ok(VERIFY_READ, act, sizeof(*act)) || +- __get_user(new_ka.sa.sa_handler, &act->sa_handler) || +- __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) ++ if (!access_ok(VERIFY_READ, act, sizeof(*act))) + return -EFAULT; + +- __get_user(new_ka.sa.sa_flags, &act->sa_flags); +- __get_user(mask, &act->sa_mask); ++ get_user_try { ++ get_user_ex(new_ka.sa.sa_handler, &act->sa_handler); ++ get_user_ex(new_ka.sa.sa_flags, &act->sa_flags); ++ get_user_ex(mask, &act->sa_mask); ++ get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer); ++ } get_user_catch(ret); ++ ++ if (ret) ++ return -EFAULT; + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { +- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || +- __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || +- __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) ++ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact))) + return -EFAULT; + +- __put_user(old_ka.sa.sa_flags, &oact->sa_flags); +- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); ++ put_user_try { ++ put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler); ++ put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags); ++ put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); ++ put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer); ++ } put_user_catch(ret); ++ ++ if (ret) ++ return -EFAULT; + } + + return ret; +@@ -541,14 +546,9 @@ sys_sigaction(int sig, const struct old_ + #endif /* CONFIG_X86_32 */ + + #ifdef CONFIG_X86_32 +-asmlinkage int sys_sigaltstack(unsigned long bx) ++int sys_sigaltstack(struct pt_regs *regs) + { +- /* +- * This is needed to make gcc realize it doesn't own the +- * "struct pt_regs" +- */ +- struct pt_regs *regs = (struct pt_regs *)&bx; +- const stack_t __user *uss = (const stack_t __user *)bx; ++ const stack_t __user *uss = (const stack_t __user *)regs->bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; + + return do_sigaltstack(uss, uoss, regs->sp); +@@ -566,14 +566,12 @@ sys_sigaltstack(const stack_t __user *us + * Do a signal return; undo the signal stack. 
+ */ + #ifdef CONFIG_X86_32 +-asmlinkage unsigned long sys_sigreturn(unsigned long __unused) ++unsigned long sys_sigreturn(struct pt_regs *regs) + { + struct sigframe __user *frame; +- struct pt_regs *regs; + unsigned long ax; + sigset_t set; + +- regs = (struct pt_regs *) &__unused; + frame = (struct sigframe __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) +@@ -600,7 +598,7 @@ badframe: + } + #endif /* CONFIG_X86_32 */ + +-static long do_rt_sigreturn(struct pt_regs *regs) ++long sys_rt_sigreturn(struct pt_regs *regs) + { + struct rt_sigframe __user *frame; + unsigned long ax; +@@ -631,25 +629,6 @@ badframe: + return 0; + } + +-#ifdef CONFIG_X86_32 +-/* +- * Note: do not pass in pt_regs directly as with tail-call optimization +- * GCC will incorrectly stomp on the caller's frame and corrupt user-space +- * register state: +- */ +-asmlinkage int sys_rt_sigreturn(unsigned long __unused) +-{ +- struct pt_regs *regs = (struct pt_regs *)&__unused; +- +- return do_rt_sigreturn(regs); +-} +-#else /* !CONFIG_X86_32 */ +-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +-{ +- return do_rt_sigreturn(regs); +-} +-#endif /* CONFIG_X86_32 */ +- + /* + * OK, we're invoking a handler: + */ +@@ -804,6 +783,13 @@ static void do_signal(struct pt_regs *re + int signr; + sigset_t *oldset; + ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * Fully-preemptible kernel does not need interrupts disabled: ++ */ ++ local_irq_enable(); ++ preempt_check_resched(); ++#endif + /* + * We want the common case to go fast, which is why we may in certain + * cases get here from kernel mode. Just return without doing anything +@@ -893,6 +879,11 @@ do_notify_resume(struct pt_regs *regs, v + tracehook_notify_resume(regs); + } + ++ if (thread_info_flags & _TIF_PERF_COUNTERS) { ++ clear_thread_flag(TIF_PERF_COUNTERS); ++ perf_counter_notify(regs); ++ } ++ + #ifdef CONFIG_X86_32 + clear_thread_flag(TIF_IRET); + #endif /* CONFIG_X86_32 */ +Index: linux-2.6-tip/arch/x86/kernel/smp.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/smp.c ++++ linux-2.6-tip/arch/x86/kernel/smp.c +@@ -2,7 +2,7 @@ + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 +- * (c) 1998-99, 2000 Ingo Molnar ++ * (c) 1998-99, 2000, 2009 Ingo Molnar + * (c) 2002,2003 Andi Kleen, SuSE Labs. + * + * i386 and x86_64 integration by Glauber Costa +@@ -26,8 +26,7 @@ + #include + #include + #include +-#include +-#include ++#include + /* + * Some notes on x86 processor bugs affecting SMP operation: + * +@@ -118,12 +117,22 @@ static void native_smp_send_reschedule(i + WARN_ON(1); + return; + } +- send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); ++ apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); ++} ++ ++/* ++ * this function sends a 'reschedule' IPI to all other CPUs. 
++ * This is used when RT tasks are starving and other CPUs ++ * might be able to run them: ++ */ ++void smp_send_reschedule_allbutself(void) ++{ ++ apic->send_IPI_allbutself(RESCHEDULE_VECTOR); + } + + void native_send_call_func_single_ipi(int cpu) + { +- send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); ++ apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); + } + + void native_send_call_func_ipi(const struct cpumask *mask) +@@ -131,7 +140,7 @@ void native_send_call_func_ipi(const str + cpumask_var_t allbutself; + + if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { +- send_IPI_mask(mask, CALL_FUNCTION_VECTOR); ++ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + return; + } + +@@ -140,9 +149,9 @@ void native_send_call_func_ipi(const str + + if (cpumask_equal(mask, allbutself) && + cpumask_equal(cpu_online_mask, cpu_callout_mask)) +- send_IPI_allbutself(CALL_FUNCTION_VECTOR); ++ apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); + else +- send_IPI_mask(mask, CALL_FUNCTION_VECTOR); ++ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + + free_cpumask_var(allbutself); + } +Index: linux-2.6-tip/arch/x86/kernel/smpboot.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/smpboot.c ++++ linux-2.6-tip/arch/x86/kernel/smpboot.c +@@ -2,7 +2,7 @@ + * x86 SMP booting functions + * + * (c) 1995 Alan Cox, Building #3 +- * (c) 1998, 1999, 2000 Ingo Molnar ++ * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * Copyright 2001 Andi Kleen, SuSE Labs. + * + * Much of the core SMP work is based on previous work by Thomas Radke, to +@@ -53,7 +53,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -61,13 +60,12 @@ + #include + #include + #include +-#include ++#include + #include ++#include + #include + +-#include +-#include +-#include ++#include + + #ifdef CONFIG_X86_32 + u8 apicid_2_node[MAX_APICID]; +@@ -103,29 +101,20 @@ EXPORT_SYMBOL(smp_num_siblings); + DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; + + /* representing HT siblings of each logical CPU */ +-DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); ++DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); + EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); + + /* representing HT and core siblings of each logical CPU */ +-DEFINE_PER_CPU(cpumask_t, cpu_core_map); ++DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); + EXPORT_PER_CPU_SYMBOL(cpu_core_map); + + /* Per CPU bogomips and other parameters */ + DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); + EXPORT_PER_CPU_SYMBOL(cpu_info); + +-static atomic_t init_deasserted; +- +- +-/* Set if we find a B stepping CPU */ +-static int __cpuinitdata smp_b_stepping; ++atomic_t init_deasserted; + + #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) +- +-/* which logical CPUs are on which nodes */ +-cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly = +- { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; +-EXPORT_SYMBOL(node_to_cpumask_map); + /* which node each logical CPU is on */ + int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = 0 }; + EXPORT_SYMBOL(cpu_to_node_map); +@@ -134,7 +123,7 @@ EXPORT_SYMBOL(cpu_to_node_map); + static void map_cpu_to_node(int cpu, int node) + { + printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); +- cpumask_set_cpu(cpu, &node_to_cpumask_map[node]); ++ cpumask_set_cpu(cpu, node_to_cpumask_map[node]); + cpu_to_node_map[cpu] = node; + } + +@@ -145,7 +134,7 @@ static void unmap_cpu_to_node(int cpu) + + printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); + for (node = 0; node < MAX_NUMNODES; node++) +- cpumask_clear_cpu(cpu, &node_to_cpumask_map[node]); ++ cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); + cpu_to_node_map[cpu] = 0; + } + #else /* !(CONFIG_NUMA && CONFIG_X86_32) */ +@@ -163,7 +152,7 @@ static void map_cpu_to_logical_apicid(vo + { + int cpu = smp_processor_id(); + int apicid = logical_smp_processor_id(); +- int node = apicid_to_node(apicid); ++ int node = apic->apicid_to_node(apicid); + + if (!node_online(node)) + node = first_online_node; +@@ -196,7 +185,8 @@ static void __cpuinit smp_callin(void) + * our local APIC. We have to wait for the IPI or we'll + * lock up on an APIC access. + */ +- wait_for_init_deassert(&init_deasserted); ++ if (apic->wait_for_init_deassert) ++ apic->wait_for_init_deassert(&init_deasserted); + + /* + * (This works even if the APIC is not enabled.) +@@ -243,7 +233,8 @@ static void __cpuinit smp_callin(void) + */ + + pr_debug("CALLIN, before setup_local_APIC().\n"); +- smp_callin_clear_local_apic(); ++ if (apic->smp_callin_clear_local_apic) ++ apic->smp_callin_clear_local_apic(); + setup_local_APIC(); + end_local_APIC_setup(); + map_cpu_to_logical_apicid(); +@@ -271,8 +262,6 @@ static void __cpuinit smp_callin(void) + cpumask_set_cpu(cpuid, cpu_callin_mask); + } + +-static int __cpuinitdata unsafe_smp; +- + /* + * Activate a secondary processor. + */ +@@ -307,7 +296,7 @@ notrace static void __cpuinit start_seco + __flush_tlb_all(); + #endif + +- /* This must be done before setting cpu_online_map */ ++ /* This must be done before setting cpu_online_mask */ + set_cpu_sibling_map(raw_smp_processor_id()); + wmb(); + +@@ -340,75 +329,22 @@ notrace static void __cpuinit start_seco + cpu_idle(); + } + +-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) +-{ +- /* +- * Mask B, Pentium, but not Pentium MMX +- */ +- if (c->x86_vendor == X86_VENDOR_INTEL && +- c->x86 == 5 && +- c->x86_mask >= 1 && c->x86_mask <= 4 && +- c->x86_model <= 3) +- /* +- * Remember we have B step Pentia with bugs +- */ +- smp_b_stepping = 1; +- +- /* +- * Certain Athlons might work (for various values of 'work') in SMP +- * but they are not certified as MP capable. +- */ +- if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { +- +- if (num_possible_cpus() == 1) +- goto valid_k7; +- +- /* Athlon 660/661 is valid. */ +- if ((c->x86_model == 6) && ((c->x86_mask == 0) || +- (c->x86_mask == 1))) +- goto valid_k7; +- +- /* Duron 670 is valid */ +- if ((c->x86_model == 7) && (c->x86_mask == 0)) +- goto valid_k7; +- +- /* +- * Athlon 662, Duron 671, and Athlon >model 7 have capability +- * bit. It's worth noting that the A5 stepping (662) of some +- * Athlon XP's have the MP bit set. +- * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for +- * more. +- */ +- if (((c->x86_model == 6) && (c->x86_mask >= 2)) || +- ((c->x86_model == 7) && (c->x86_mask >= 1)) || +- (c->x86_model > 7)) +- if (cpu_has_mp) +- goto valid_k7; +- +- /* If we get here, not a certified SMP capable AMD system. 
*/ +- unsafe_smp = 1; +- } +- +-valid_k7: +- ; ++#ifdef CONFIG_CPUMASK_OFFSTACK ++/* In this case, llc_shared_map is a pointer to a cpumask. */ ++static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, ++ const struct cpuinfo_x86 *src) ++{ ++ struct cpumask *llc = dst->llc_shared_map; ++ *dst = *src; ++ dst->llc_shared_map = llc; + } +- +-static void __cpuinit smp_checks(void) ++#else ++static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, ++ const struct cpuinfo_x86 *src) + { +- if (smp_b_stepping) +- printk(KERN_WARNING "WARNING: SMP operation may be unreliable" +- "with B stepping processors.\n"); +- +- /* +- * Don't taint if we are running SMP kernel on a single non-MP +- * approved Athlon +- */ +- if (unsafe_smp && num_online_cpus() > 1) { +- printk(KERN_INFO "WARNING: This combination of AMD" +- "processors is not suitable for SMP.\n"); +- add_taint(TAINT_UNSAFE_SMP); +- } ++ *dst = *src; + } ++#endif /* CONFIG_CPUMASK_OFFSTACK */ + + /* + * The bootstrap kernel entry code has set these up. Save them for +@@ -419,11 +355,10 @@ void __cpuinit smp_store_cpu_info(int id + { + struct cpuinfo_x86 *c = &cpu_data(id); + +- *c = boot_cpu_data; ++ copy_cpuinfo_x86(c, &boot_cpu_data); + c->cpu_index = id; + if (id != 0) + identify_secondary_cpu(c); +- smp_apply_quirks(c); + } + + +@@ -444,15 +379,15 @@ void __cpuinit set_cpu_sibling_map(int c + cpumask_set_cpu(cpu, cpu_sibling_mask(i)); + cpumask_set_cpu(i, cpu_core_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(i)); +- cpumask_set_cpu(i, &c->llc_shared_map); +- cpumask_set_cpu(cpu, &o->llc_shared_map); ++ cpumask_set_cpu(i, c->llc_shared_map); ++ cpumask_set_cpu(cpu, o->llc_shared_map); + } + } + } else { + cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + } + +- cpumask_set_cpu(cpu, &c->llc_shared_map); ++ cpumask_set_cpu(cpu, c->llc_shared_map); + + if (current_cpu_data.x86_max_cores == 1) { + cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); +@@ -463,8 +398,8 @@ void __cpuinit set_cpu_sibling_map(int c + for_each_cpu(i, cpu_sibling_setup_mask) { + if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && + per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { +- cpumask_set_cpu(i, &c->llc_shared_map); +- cpumask_set_cpu(cpu, &cpu_data(i).llc_shared_map); ++ cpumask_set_cpu(i, c->llc_shared_map); ++ cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); + } + if (c->phys_proc_id == cpu_data(i).phys_proc_id) { + cpumask_set_cpu(i, cpu_core_mask(cpu)); +@@ -502,12 +437,7 @@ const struct cpumask *cpu_coregroup_mask + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_mask(cpu); + else +- return &c->llc_shared_map; +-} +- +-cpumask_t cpu_coregroup_map(int cpu) +-{ +- return *cpu_coregroup_mask(cpu); ++ return c->llc_shared_map; + } + + static void impress_friends(void) +@@ -583,7 +513,7 @@ wakeup_secondary_cpu_via_nmi(int logical + /* Target chip */ + /* Boot on the stack */ + /* Kick the second */ +- apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); ++ apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); +@@ -614,12 +544,6 @@ wakeup_secondary_cpu_via_init(int phys_a + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; + +- if (get_uv_system_type() == UV_NON_UNIQUE_APIC) { +- send_status = uv_wakeup_secondary(phys_apicid, start_eip); +- atomic_set(&init_deasserted, 1); +- return send_status; +- } +- + maxlvt = lapic_get_maxlvt(); + + /* +@@ -745,78 +669,23 @@ static void __cpuinit 
do_fork_idle(struc + complete(&c_idle->done); + } + +-#ifdef CONFIG_X86_64 +- +-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */ +-static void __ref free_bootmem_pda(struct x8664_pda *oldpda) +-{ +- if (!after_bootmem) +- free_bootmem((unsigned long)oldpda, sizeof(*oldpda)); +-} +- +-/* +- * Allocate node local memory for the AP pda. +- * +- * Must be called after the _cpu_pda pointer table is initialized. +- */ +-int __cpuinit get_local_pda(int cpu) +-{ +- struct x8664_pda *oldpda, *newpda; +- unsigned long size = sizeof(struct x8664_pda); +- int node = cpu_to_node(cpu); +- +- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) +- return 0; +- +- oldpda = cpu_pda(cpu); +- newpda = kmalloc_node(size, GFP_ATOMIC, node); +- if (!newpda) { +- printk(KERN_ERR "Could not allocate node local PDA " +- "for CPU %d on node %d\n", cpu, node); +- +- if (oldpda) +- return 0; /* have a usable pda */ +- else +- return -1; +- } +- +- if (oldpda) { +- memcpy(newpda, oldpda, size); +- free_bootmem_pda(oldpda); +- } +- +- newpda->in_bootmem = 0; +- cpu_pda(cpu) = newpda; +- return 0; +-} +-#endif /* CONFIG_X86_64 */ +- +-static int __cpuinit do_boot_cpu(int apicid, int cpu) + /* + * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad + * (ie clustered apic addressing mode), this is a LOGICAL apic ID. +- * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. ++ * Returns zero if CPU booted OK, else error code from ++ * ->wakeup_secondary_cpu. + */ ++static int __cpuinit do_boot_cpu(int apicid, int cpu) + { + unsigned long boot_error = 0; +- int timeout; + unsigned long start_ip; +- unsigned short nmi_high = 0, nmi_low = 0; ++ int timeout; + struct create_idle c_idle = { +- .cpu = cpu, +- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), ++ .cpu = cpu, ++ .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), + }; +- INIT_WORK(&c_idle.work, do_fork_idle); + +-#ifdef CONFIG_X86_64 +- /* Allocate node local memory for AP pdas */ +- if (cpu > 0) { +- boot_error = get_local_pda(cpu); +- if (boot_error) +- goto restore_state; +- /* if can't get pda memory, can't start cpu */ +- } +-#endif ++ INIT_WORK(&c_idle.work, do_fork_idle); + + alternatives_smp_switch(1); + +@@ -847,14 +716,16 @@ static int __cpuinit do_boot_cpu(int api + + set_idle_for_cpu(cpu, c_idle.idle); + do_rest: +-#ifdef CONFIG_X86_32 + per_cpu(current_task, cpu) = c_idle.idle; +- init_gdt(cpu); ++#ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + irq_ctx_init(cpu); + #else +- cpu_pda(cpu)->pcurrent = c_idle.idle; + clear_tsk_thread_flag(c_idle.idle, TIF_FORK); ++ initial_gs = per_cpu_offset(cpu); ++ per_cpu(kernel_stack, cpu) = ++ (unsigned long)task_stack_page(c_idle.idle) - ++ KERNEL_STACK_OFFSET + THREAD_SIZE; + #endif + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + initial_code = (unsigned long)start_secondary; +@@ -878,8 +749,6 @@ do_rest: + + pr_debug("Setting warm reset code and vector.\n"); + +- store_NMI_vector(&nmi_high, &nmi_low); +- + smpboot_setup_warm_reset_vector(start_ip); + /* + * Be paranoid about clearing APIC errors. +@@ -891,9 +760,13 @@ do_rest: + } + + /* +- * Starting actual IPI sequence... ++ * Kick the secondary CPU. 
Use the method in the APIC driver ++ * if it's defined - or use an INIT boot APIC message otherwise: + */ +- boot_error = wakeup_secondary_cpu(apicid, start_ip); ++ if (apic->wakeup_secondary_cpu) ++ boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); ++ else ++ boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + + if (!boot_error) { + /* +@@ -927,13 +800,11 @@ do_rest: + else + /* trampoline code not run */ + printk(KERN_ERR "Not responding.\n"); +- if (get_uv_system_type() != UV_NON_UNIQUE_APIC) +- inquire_remote_apic(apicid); ++ if (apic->inquire_remote_apic) ++ apic->inquire_remote_apic(apicid); + } + } +-#ifdef CONFIG_X86_64 +-restore_state: +-#endif ++ + if (boot_error) { + /* Try to put things back the way they were before ... */ + numa_remove_cpu(cpu); /* was set by numa_add_cpu */ +@@ -961,7 +832,7 @@ restore_state: + + int __cpuinit native_cpu_up(unsigned int cpu) + { +- int apicid = cpu_present_to_apicid(cpu); ++ int apicid = apic->cpu_present_to_apicid(cpu); + unsigned long flags; + int err; + +@@ -1033,9 +904,8 @@ int __cpuinit native_cpu_up(unsigned int + */ + static __init void disable_smp(void) + { +- /* use the read/write pointers to the present and possible maps */ +- cpumask_copy(&cpu_present_map, cpumask_of(0)); +- cpumask_copy(&cpu_possible_map, cpumask_of(0)); ++ init_cpu_present(cpumask_of(0)); ++ init_cpu_possible(cpumask_of(0)); + smpboot_clear_io_apic_irqs(); + + if (smp_found_config) +@@ -1054,14 +924,14 @@ static int __init smp_sanity_check(unsig + { + preempt_disable(); + +-#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) ++#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32) + if (def_to_bigsmp && nr_cpu_ids > 8) { + unsigned int cpu; + unsigned nr; + + printk(KERN_WARNING + "More than 8 CPUs detected - skipping them.\n" +- "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); ++ "Use CONFIG_X86_BIGSMP.\n"); + + nr = 0; + for_each_present_cpu(cpu) { +@@ -1107,7 +977,7 @@ static int __init smp_sanity_check(unsig + * Should not be necessary because the MP table should list the boot + * CPU too, but we do it for the sake of robustness anyway. + */ +- if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { ++ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { + printk(KERN_NOTICE + "weird, boot CPU (#%d) not listed by the BIOS.\n", + boot_cpu_physical_apicid); +@@ -1125,6 +995,7 @@ static int __init smp_sanity_check(unsig + printk(KERN_ERR "... forcing use of dummy APIC emulation." + "(tell your hw vendor)\n"); + smpboot_clear_io_apic(); ++ arch_disable_smp_support(); + return -1; + } + +@@ -1166,6 +1037,8 @@ static void __init smp_cpu_index_default + */ + void __init native_smp_prepare_cpus(unsigned int max_cpus) + { ++ unsigned int i; ++ + preempt_disable(); + smp_cpu_index_default(); + current_cpu_data = boot_cpu_data; +@@ -1179,11 +1052,19 @@ void __init native_smp_prepare_cpus(unsi + boot_cpu_logical_apicid = logical_smp_processor_id(); + #endif + current_thread_info()->cpu = 0; /* needed? 
*/ ++ for_each_possible_cpu(i) { ++ alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); ++ alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); ++ alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); ++ cpumask_clear(per_cpu(cpu_core_map, i)); ++ cpumask_clear(per_cpu(cpu_sibling_map, i)); ++ cpumask_clear(cpu_data(i).llc_shared_map); ++ } + set_cpu_sibling_map(0); + +-#ifdef CONFIG_X86_64 + enable_IR_x2apic(); +- setup_apic_routing(); ++#ifdef CONFIG_X86_64 ++ default_setup_apic_routing(); + #endif + + if (smp_sanity_check(max_cpus) < 0) { +@@ -1207,18 +1088,18 @@ void __init native_smp_prepare_cpus(unsi + */ + setup_local_APIC(); + +-#ifdef CONFIG_X86_64 + /* + * Enable IO APIC before setting up error vector + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); +-#endif ++ + end_local_APIC_setup(); + + map_cpu_to_logical_apicid(); + +- setup_portio_remap(); ++ if (apic->setup_portio_remap) ++ apic->setup_portio_remap(); + + smpboot_setup_io_apic(); + /* +@@ -1240,10 +1121,7 @@ out: + void __init native_smp_prepare_boot_cpu(void) + { + int me = smp_processor_id(); +-#ifdef CONFIG_X86_32 +- init_gdt(me); +-#endif +- switch_to_new_gdt(); ++ switch_to_new_gdt(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ + cpumask_set_cpu(me, cpu_callout_mask); + per_cpu(cpu_state, me) = CPU_ONLINE; +@@ -1254,7 +1132,6 @@ void __init native_smp_cpus_done(unsigne + pr_debug("Boot done.\n"); + + impress_friends(); +- smp_checks(); + #ifdef CONFIG_X86_IO_APIC + setup_ioapic_dest(); + #endif +@@ -1271,11 +1148,11 @@ early_param("possible_cpus", _setup_poss + + + /* +- * cpu_possible_map should be static, it cannot change as cpu's ++ * cpu_possible_mask should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. +- * cpu_present_map on the other hand can change dynamically. ++ * cpu_present_mask on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * - Ashok Raj +Index: linux-2.6-tip/arch/x86/kernel/smpcommon.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/smpcommon.c ++++ /dev/null +@@ -1,30 +0,0 @@ +-/* +- * SMP stuff which is common to all sub-architectures. +- */ +-#include +-#include +- +-#ifdef CONFIG_X86_32 +-DEFINE_PER_CPU(unsigned long, this_cpu_off); +-EXPORT_PER_CPU_SYMBOL(this_cpu_off); +- +-/* +- * Initialize the CPU's GDT. This is either the boot CPU doing itself +- * (still using the master per-cpu area), or a CPU doing it for a +- * secondary which will soon come up. 
+- */ +-__cpuinit void init_gdt(int cpu) +-{ +- struct desc_struct gdt; +- +- pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF, +- 0x2 | DESCTYPE_S, 0x8); +- gdt.s = 1; +- +- write_gdt_entry(get_cpu_gdt_table(cpu), +- GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); +- +- per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; +- per_cpu(cpu_number, cpu) = cpu; +-} +-#endif +Index: linux-2.6-tip/arch/x86/kernel/stacktrace.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/stacktrace.c ++++ linux-2.6-tip/arch/x86/kernel/stacktrace.c +@@ -1,7 +1,7 @@ + /* + * Stack trace management functions + * +- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar ++ * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar + */ + #include + #include +@@ -77,6 +77,13 @@ void save_stack_trace(struct stack_trace + } + EXPORT_SYMBOL_GPL(save_stack_trace); + ++void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) ++{ ++ dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); ++ if (trace->nr_entries < trace->max_entries) ++ trace->entries[trace->nr_entries++] = ULONG_MAX; ++} ++ + void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) + { + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); +Index: linux-2.6-tip/arch/x86/kernel/summit_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/summit_32.c ++++ /dev/null +@@ -1,188 +0,0 @@ +-/* +- * IBM Summit-Specific Code +- * +- * Written By: Matthew Dobson, IBM Corporation +- * +- * Copyright (c) 2003 IBM Corp. +- * +- * All rights reserved. +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License as published by +- * the Free Software Foundation; either version 2 of the License, or (at +- * your option) any later version. +- * +- * This program is distributed in the hope that it will be useful, but +- * WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or +- * NON INFRINGEMENT. See the GNU General Public License for more +- * details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program; if not, write to the Free Software +- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+- * +- * Send feedback to +- * +- */ +- +-#include +-#include +-#include +-#include +-#include +- +-static struct rio_table_hdr *rio_table_hdr __initdata; +-static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; +-static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata; +- +-#ifndef CONFIG_X86_NUMAQ +-static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata; +-#endif +- +-static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) +-{ +- int twister = 0, node = 0; +- int i, bus, num_buses; +- +- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { +- if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) { +- twister = rio_devs[i]->owner_id; +- break; +- } +- } +- if (i == rio_table_hdr->num_rio_dev) { +- printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); +- return last_bus; +- } +- +- for (i = 0; i < rio_table_hdr->num_scal_dev; i++) { +- if (scal_devs[i]->node_id == twister) { +- node = scal_devs[i]->node_id; +- break; +- } +- } +- if (i == rio_table_hdr->num_scal_dev) { +- printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); +- return last_bus; +- } +- +- switch (rio_devs[wpeg_num]->type) { +- case CompatWPEG: +- /* +- * The Compatibility Winnipeg controls the 2 legacy buses, +- * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case +- * a PCI-PCI bridge card is used in either slot: total 5 buses. +- */ +- num_buses = 5; +- break; +- case AltWPEG: +- /* +- * The Alternate Winnipeg controls the 2 133MHz buses [1 slot +- * each], their 2 "extra" buses, the 100MHz bus [2 slots] and +- * the "extra" buses for each of those slots: total 7 buses. +- */ +- num_buses = 7; +- break; +- case LookOutAWPEG: +- case LookOutBWPEG: +- /* +- * A Lookout Winnipeg controls 3 100MHz buses [2 slots each] +- * & the "extra" buses for each of those slots: total 9 buses. +- */ +- num_buses = 9; +- break; +- default: +- printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); +- return last_bus; +- } +- +- for (bus = last_bus; bus < last_bus + num_buses; bus++) +- mp_bus_id_to_node[bus] = node; +- return bus; +-} +- +-static int __init build_detail_arrays(void) +-{ +- unsigned long ptr; +- int i, scal_detail_size, rio_detail_size; +- +- if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { +- printk(KERN_WARNING "%s: MAX_NUMNODES too low! 
Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); +- return 0; +- } +- +- switch (rio_table_hdr->version) { +- default: +- printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); +- return 0; +- case 2: +- scal_detail_size = 11; +- rio_detail_size = 13; +- break; +- case 3: +- scal_detail_size = 12; +- rio_detail_size = 15; +- break; +- } +- +- ptr = (unsigned long)rio_table_hdr + 3; +- for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size) +- scal_devs[i] = (struct scal_detail *)ptr; +- +- for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size) +- rio_devs[i] = (struct rio_detail *)ptr; +- +- return 1; +-} +- +-void __init setup_summit(void) +-{ +- unsigned long ptr; +- unsigned short offset; +- int i, next_wpeg, next_bus = 0; +- +- /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ +- ptr = get_bios_ebda(); +- ptr = (unsigned long)phys_to_virt(ptr); +- +- rio_table_hdr = NULL; +- offset = 0x180; +- while (offset) { +- /* The block id is stored in the 2nd word */ +- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) { +- /* set the pointer past the offset & block id */ +- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); +- break; +- } +- /* The next offset is stored in the 1st word. 0 means no more */ +- offset = *((unsigned short *)(ptr + offset)); +- } +- if (!rio_table_hdr) { +- printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); +- return; +- } +- +- if (!build_detail_arrays()) +- return; +- +- /* The first Winnipeg we're looking for has an index of 0 */ +- next_wpeg = 0; +- do { +- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { +- if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) { +- /* It's the Winnipeg we're looking for! */ +- next_bus = setup_pci_node_map_for_wpeg(i, next_bus); +- next_wpeg++; +- break; +- } +- } +- /* +- * If we go through all Rio devices and don't find one with +- * the next index, it means we've found all the Winnipegs, +- * and thus all the PCI buses. 
+- */ +- if (i == rio_table_hdr->num_rio_dev) +- next_wpeg = 0; +- } while (next_wpeg != 0); +-} +Index: linux-2.6-tip/arch/x86/kernel/syscall_table_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/syscall_table_32.S ++++ linux-2.6-tip/arch/x86/kernel/syscall_table_32.S +@@ -1,7 +1,7 @@ + ENTRY(sys_call_table) + .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ + .long sys_exit +- .long sys_fork ++ .long ptregs_fork + .long sys_read + .long sys_write + .long sys_open /* 5 */ +@@ -10,7 +10,7 @@ ENTRY(sys_call_table) + .long sys_creat + .long sys_link + .long sys_unlink /* 10 */ +- .long sys_execve ++ .long ptregs_execve + .long sys_chdir + .long sys_time + .long sys_mknod +@@ -109,17 +109,17 @@ ENTRY(sys_call_table) + .long sys_newlstat + .long sys_newfstat + .long sys_uname +- .long sys_iopl /* 110 */ ++ .long ptregs_iopl /* 110 */ + .long sys_vhangup + .long sys_ni_syscall /* old "idle" system call */ +- .long sys_vm86old ++ .long ptregs_vm86old + .long sys_wait4 + .long sys_swapoff /* 115 */ + .long sys_sysinfo + .long sys_ipc + .long sys_fsync +- .long sys_sigreturn +- .long sys_clone /* 120 */ ++ .long ptregs_sigreturn ++ .long ptregs_clone /* 120 */ + .long sys_setdomainname + .long sys_newuname + .long sys_modify_ldt +@@ -165,14 +165,14 @@ ENTRY(sys_call_table) + .long sys_mremap + .long sys_setresuid16 + .long sys_getresuid16 /* 165 */ +- .long sys_vm86 ++ .long ptregs_vm86 + .long sys_ni_syscall /* Old sys_query_module */ + .long sys_poll + .long sys_nfsservctl + .long sys_setresgid16 /* 170 */ + .long sys_getresgid16 + .long sys_prctl +- .long sys_rt_sigreturn ++ .long ptregs_rt_sigreturn + .long sys_rt_sigaction + .long sys_rt_sigprocmask /* 175 */ + .long sys_rt_sigpending +@@ -185,11 +185,11 @@ ENTRY(sys_call_table) + .long sys_getcwd + .long sys_capget + .long sys_capset /* 185 */ +- .long sys_sigaltstack ++ .long ptregs_sigaltstack + .long sys_sendfile + .long sys_ni_syscall /* reserved for streams1 */ + .long sys_ni_syscall /* reserved for streams2 */ +- .long sys_vfork /* 190 */ ++ .long ptregs_vfork /* 190 */ + .long sys_getrlimit + .long sys_mmap2 + .long sys_truncate64 +@@ -332,3 +332,8 @@ ENTRY(sys_call_table) + .long sys_dup3 /* 330 */ + .long sys_pipe2 + .long sys_inotify_init1 ++ .long sys_ni_syscall /* preadv */ ++ .long sys_ni_syscall /* pwritev */ ++ .long sys_rt_tgsigqueueinfo /* 335 */ ++ .long sys_perf_counter_open ++ +Index: linux-2.6-tip/arch/x86/kernel/time_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/time_32.c ++++ linux-2.6-tip/arch/x86/kernel/time_32.c +@@ -33,12 +33,12 @@ + #include + #include + +-#include ++#include + #include + #include + #include + +-#include "do_timer.h" ++#include + + int timer_ack; + +@@ -118,7 +118,7 @@ void __init hpet_time_init(void) + { + if (!hpet_enable()) + setup_pit_timer(); +- time_init_hook(); ++ x86_quirk_time_init(); + } + + /* +@@ -131,7 +131,7 @@ void __init hpet_time_init(void) + */ + void __init time_init(void) + { +- pre_time_init_hook(); ++ x86_quirk_pre_time_init(); + tsc_init(); + late_time_init = choose_time_init(); + } +Index: linux-2.6-tip/arch/x86/kernel/tlb_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/tlb_32.c ++++ /dev/null +@@ -1,256 +0,0 @@ +-#include +-#include +-#include +- +-#include +- +-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) +- ____cacheline_aligned = { 
&init_mm, 0, }; +- +-/* must come after the send_IPI functions above for inlining */ +-#include +- +-/* +- * Smarter SMP flushing macros. +- * c/o Linus Torvalds. +- * +- * These mean you can really definitely utterly forget about +- * writing to user space from interrupts. (Its not allowed anyway). +- * +- * Optimizations Manfred Spraul +- */ +- +-static cpumask_t flush_cpumask; +-static struct mm_struct *flush_mm; +-static unsigned long flush_va; +-static DEFINE_SPINLOCK(tlbstate_lock); +- +-/* +- * We cannot call mmdrop() because we are in interrupt context, +- * instead update mm->cpu_vm_mask. +- * +- * We need to reload %cr3 since the page tables may be going +- * away from under us.. +- */ +-void leave_mm(int cpu) +-{ +- BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); +- cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); +- load_cr3(swapper_pg_dir); +-} +-EXPORT_SYMBOL_GPL(leave_mm); +- +-/* +- * +- * The flush IPI assumes that a thread switch happens in this order: +- * [cpu0: the cpu that switches] +- * 1) switch_mm() either 1a) or 1b) +- * 1a) thread switch to a different mm +- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); +- * Stop ipi delivery for the old mm. This is not synchronized with +- * the other cpus, but smp_invalidate_interrupt ignore flush ipis +- * for the wrong mm, and in the worst case we perform a superfluous +- * tlb flush. +- * 1a2) set cpu_tlbstate to TLBSTATE_OK +- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 +- * was in lazy tlb mode. +- * 1a3) update cpu_tlbstate[].active_mm +- * Now cpu0 accepts tlb flushes for the new mm. +- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); +- * Now the other cpus will send tlb flush ipis. +- * 1a4) change cr3. +- * 1b) thread switch without mm change +- * cpu_tlbstate[].active_mm is correct, cpu0 already handles +- * flush ipis. +- * 1b1) set cpu_tlbstate to TLBSTATE_OK +- * 1b2) test_and_set the cpu bit in cpu_vm_mask. +- * Atomically set the bit [other cpus will start sending flush ipis], +- * and test the bit. +- * 1b3) if the bit was 0: leave_mm was called, flush the tlb. +- * 2) switch %%esp, ie current +- * +- * The interrupt must handle 2 special cases: +- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. +- * - the cpu performs speculative tlb reads, i.e. even if the cpu only +- * runs in kernel space, the cpu could load tlb entries for user space +- * pages. +- * +- * The good news is that cpu_tlbstate is local to each cpu, no +- * write/read ordering problems. +- */ +- +-/* +- * TLB flush IPI: +- * +- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. +- * 2) Leave the mm if we are in the lazy tlb mode. 
+- */ +- +-void smp_invalidate_interrupt(struct pt_regs *regs) +-{ +- unsigned long cpu; +- +- cpu = get_cpu(); +- +- if (!cpu_isset(cpu, flush_cpumask)) +- goto out; +- /* +- * This was a BUG() but until someone can quote me the +- * line from the intel manual that guarantees an IPI to +- * multiple CPUs is retried _only_ on the erroring CPUs +- * its staying as a return +- * +- * BUG(); +- */ +- +- if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { +- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { +- if (flush_va == TLB_FLUSH_ALL) +- local_flush_tlb(); +- else +- __flush_tlb_one(flush_va); +- } else +- leave_mm(cpu); +- } +- ack_APIC_irq(); +- smp_mb__before_clear_bit(); +- cpu_clear(cpu, flush_cpumask); +- smp_mb__after_clear_bit(); +-out: +- put_cpu_no_resched(); +- inc_irq_stat(irq_tlb_count); +-} +- +-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, +- unsigned long va) +-{ +- cpumask_t cpumask = *cpumaskp; +- +- /* +- * A couple of (to be removed) sanity checks: +- * +- * - current CPU must not be in mask +- * - mask must exist :) +- */ +- BUG_ON(cpus_empty(cpumask)); +- BUG_ON(cpu_isset(smp_processor_id(), cpumask)); +- BUG_ON(!mm); +- +-#ifdef CONFIG_HOTPLUG_CPU +- /* If a CPU which we ran on has gone down, OK. */ +- cpus_and(cpumask, cpumask, cpu_online_map); +- if (unlikely(cpus_empty(cpumask))) +- return; +-#endif +- +- /* +- * i'm not happy about this global shared spinlock in the +- * MM hot path, but we'll see how contended it is. +- * AK: x86-64 has a faster method that could be ported. +- */ +- spin_lock(&tlbstate_lock); +- +- flush_mm = mm; +- flush_va = va; +- cpus_or(flush_cpumask, cpumask, flush_cpumask); +- +- /* +- * Make the above memory operations globally visible before +- * sending the IPI. +- */ +- smp_mb(); +- /* +- * We have to send the IPI only to +- * CPUs affected. +- */ +- send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); +- +- while (!cpus_empty(flush_cpumask)) +- /* nothing. 
lockup detection does not belong here */ +- cpu_relax(); +- +- flush_mm = NULL; +- flush_va = 0; +- spin_unlock(&tlbstate_lock); +-} +- +-void flush_tlb_current_task(void) +-{ +- struct mm_struct *mm = current->mm; +- cpumask_t cpu_mask; +- +- preempt_disable(); +- cpu_mask = mm->cpu_vm_mask; +- cpu_clear(smp_processor_id(), cpu_mask); +- +- local_flush_tlb(); +- if (!cpus_empty(cpu_mask)) +- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); +- preempt_enable(); +-} +- +-void flush_tlb_mm(struct mm_struct *mm) +-{ +- cpumask_t cpu_mask; +- +- preempt_disable(); +- cpu_mask = mm->cpu_vm_mask; +- cpu_clear(smp_processor_id(), cpu_mask); +- +- if (current->active_mm == mm) { +- if (current->mm) +- local_flush_tlb(); +- else +- leave_mm(smp_processor_id()); +- } +- if (!cpus_empty(cpu_mask)) +- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); +- +- preempt_enable(); +-} +- +-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +-{ +- struct mm_struct *mm = vma->vm_mm; +- cpumask_t cpu_mask; +- +- preempt_disable(); +- cpu_mask = mm->cpu_vm_mask; +- cpu_clear(smp_processor_id(), cpu_mask); +- +- if (current->active_mm == mm) { +- if (current->mm) +- __flush_tlb_one(va); +- else +- leave_mm(smp_processor_id()); +- } +- +- if (!cpus_empty(cpu_mask)) +- flush_tlb_others(cpu_mask, mm, va); +- +- preempt_enable(); +-} +-EXPORT_SYMBOL(flush_tlb_page); +- +-static void do_flush_tlb_all(void *info) +-{ +- unsigned long cpu = smp_processor_id(); +- +- __flush_tlb_all(); +- if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) +- leave_mm(cpu); +-} +- +-void flush_tlb_all(void) +-{ +- on_each_cpu(do_flush_tlb_all, NULL, 1); +-} +- +-void reset_lazy_tlbstate(void) +-{ +- int cpu = raw_smp_processor_id(); +- +- per_cpu(cpu_tlbstate, cpu).state = 0; +- per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +-} +- +Index: linux-2.6-tip/arch/x86/kernel/tlb_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/tlb_64.c ++++ /dev/null +@@ -1,284 +0,0 @@ +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-/* +- * Smarter SMP flushing macros. +- * c/o Linus Torvalds. +- * +- * These mean you can really definitely utterly forget about +- * writing to user space from interrupts. (Its not allowed anyway). +- * +- * Optimizations Manfred Spraul +- * +- * More scalable flush, from Andi Kleen +- * +- * To avoid global state use 8 different call vectors. +- * Each CPU uses a specific vector to trigger flushes on other +- * CPUs. Depending on the received vector the target CPUs look into +- * the right per cpu variable for the flush data. +- * +- * With more than 8 CPUs they are hashed to the 8 available +- * vectors. The limited global vector space forces us to this right now. +- * In future when interrupts are split into per CPU domains this could be +- * fixed, at the cost of triggering multiple IPIs in some cases. +- */ +- +-union smp_flush_state { +- struct { +- cpumask_t flush_cpumask; +- struct mm_struct *flush_mm; +- unsigned long flush_va; +- spinlock_t tlbstate_lock; +- }; +- char pad[SMP_CACHE_BYTES]; +-} ____cacheline_aligned; +- +-/* State is put into the per CPU data section, but padded +- to a full cache line because other CPUs can access it and we don't +- want false sharing in the per cpu data segment. 
*/ +-static DEFINE_PER_CPU(union smp_flush_state, flush_state); +- +-/* +- * We cannot call mmdrop() because we are in interrupt context, +- * instead update mm->cpu_vm_mask. +- */ +-void leave_mm(int cpu) +-{ +- if (read_pda(mmu_state) == TLBSTATE_OK) +- BUG(); +- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); +- load_cr3(swapper_pg_dir); +-} +-EXPORT_SYMBOL_GPL(leave_mm); +- +-/* +- * +- * The flush IPI assumes that a thread switch happens in this order: +- * [cpu0: the cpu that switches] +- * 1) switch_mm() either 1a) or 1b) +- * 1a) thread switch to a different mm +- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); +- * Stop ipi delivery for the old mm. This is not synchronized with +- * the other cpus, but smp_invalidate_interrupt ignore flush ipis +- * for the wrong mm, and in the worst case we perform a superfluous +- * tlb flush. +- * 1a2) set cpu mmu_state to TLBSTATE_OK +- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 +- * was in lazy tlb mode. +- * 1a3) update cpu active_mm +- * Now cpu0 accepts tlb flushes for the new mm. +- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); +- * Now the other cpus will send tlb flush ipis. +- * 1a4) change cr3. +- * 1b) thread switch without mm change +- * cpu active_mm is correct, cpu0 already handles +- * flush ipis. +- * 1b1) set cpu mmu_state to TLBSTATE_OK +- * 1b2) test_and_set the cpu bit in cpu_vm_mask. +- * Atomically set the bit [other cpus will start sending flush ipis], +- * and test the bit. +- * 1b3) if the bit was 0: leave_mm was called, flush the tlb. +- * 2) switch %%esp, ie current +- * +- * The interrupt must handle 2 special cases: +- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. +- * - the cpu performs speculative tlb reads, i.e. even if the cpu only +- * runs in kernel space, the cpu could load tlb entries for user space +- * pages. +- * +- * The good news is that cpu mmu_state is local to each cpu, no +- * write/read ordering problems. +- */ +- +-/* +- * TLB flush IPI: +- * +- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. +- * 2) Leave the mm if we are in the lazy tlb mode. +- * +- * Interrupts are disabled. +- */ +- +-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) +-{ +- int cpu; +- int sender; +- union smp_flush_state *f; +- +- cpu = smp_processor_id(); +- /* +- * orig_rax contains the negated interrupt vector. +- * Use that to determine where the sender put the data. 
+- */ +- sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; +- f = &per_cpu(flush_state, sender); +- +- if (!cpu_isset(cpu, f->flush_cpumask)) +- goto out; +- /* +- * This was a BUG() but until someone can quote me the +- * line from the intel manual that guarantees an IPI to +- * multiple CPUs is retried _only_ on the erroring CPUs +- * its staying as a return +- * +- * BUG(); +- */ +- +- if (f->flush_mm == read_pda(active_mm)) { +- if (read_pda(mmu_state) == TLBSTATE_OK) { +- if (f->flush_va == TLB_FLUSH_ALL) +- local_flush_tlb(); +- else +- __flush_tlb_one(f->flush_va); +- } else +- leave_mm(cpu); +- } +-out: +- ack_APIC_irq(); +- cpu_clear(cpu, f->flush_cpumask); +- inc_irq_stat(irq_tlb_count); +-} +- +-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, +- unsigned long va) +-{ +- int sender; +- union smp_flush_state *f; +- cpumask_t cpumask = *cpumaskp; +- +- if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va)) +- return; +- +- /* Caller has disabled preemption */ +- sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; +- f = &per_cpu(flush_state, sender); +- +- /* +- * Could avoid this lock when +- * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is +- * probably not worth checking this for a cache-hot lock. +- */ +- spin_lock(&f->tlbstate_lock); +- +- f->flush_mm = mm; +- f->flush_va = va; +- cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); +- +- /* +- * Make the above memory operations globally visible before +- * sending the IPI. +- */ +- smp_mb(); +- /* +- * We have to send the IPI only to +- * CPUs affected. +- */ +- send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); +- +- while (!cpus_empty(f->flush_cpumask)) +- cpu_relax(); +- +- f->flush_mm = NULL; +- f->flush_va = 0; +- spin_unlock(&f->tlbstate_lock); +-} +- +-static int __cpuinit init_smp_flush(void) +-{ +- int i; +- +- for_each_possible_cpu(i) +- spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); +- +- return 0; +-} +-core_initcall(init_smp_flush); +- +-void flush_tlb_current_task(void) +-{ +- struct mm_struct *mm = current->mm; +- cpumask_t cpu_mask; +- +- preempt_disable(); +- cpu_mask = mm->cpu_vm_mask; +- cpu_clear(smp_processor_id(), cpu_mask); +- +- local_flush_tlb(); +- if (!cpus_empty(cpu_mask)) +- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); +- preempt_enable(); +-} +- +-void flush_tlb_mm(struct mm_struct *mm) +-{ +- cpumask_t cpu_mask; +- +- preempt_disable(); +- cpu_mask = mm->cpu_vm_mask; +- cpu_clear(smp_processor_id(), cpu_mask); +- +- if (current->active_mm == mm) { +- if (current->mm) +- local_flush_tlb(); +- else +- leave_mm(smp_processor_id()); +- } +- if (!cpus_empty(cpu_mask)) +- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); +- +- preempt_enable(); +-} +- +-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +-{ +- struct mm_struct *mm = vma->vm_mm; +- cpumask_t cpu_mask; +- +- preempt_disable(); +- cpu_mask = mm->cpu_vm_mask; +- cpu_clear(smp_processor_id(), cpu_mask); +- +- if (current->active_mm == mm) { +- if (current->mm) +- __flush_tlb_one(va); +- else +- leave_mm(smp_processor_id()); +- } +- +- if (!cpus_empty(cpu_mask)) +- flush_tlb_others(cpu_mask, mm, va); +- +- preempt_enable(); +-} +- +-static void do_flush_tlb_all(void *info) +-{ +- unsigned long cpu = smp_processor_id(); +- +- __flush_tlb_all(); +- if (read_pda(mmu_state) == TLBSTATE_LAZY) +- leave_mm(cpu); +-} +- +-void flush_tlb_all(void) +-{ +- on_each_cpu(do_flush_tlb_all, NULL, 1); +-} +Index: linux-2.6-tip/arch/x86/kernel/tlb_uv.c 
+=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/tlb_uv.c ++++ linux-2.6-tip/arch/x86/kernel/tlb_uv.c +@@ -11,16 +11,15 @@ + #include + + #include ++#include + #include + #include + #include +-#include ++#include + #include + #include + #include + +-#include +- + static struct bau_control **uv_bau_table_bases __read_mostly; + static int uv_bau_retry_limit __read_mostly; + +@@ -210,14 +209,15 @@ static int uv_wait_completion(struct bau + * + * Send a broadcast and wait for a broadcast message to complete. + * +- * The cpumaskp mask contains the cpus the broadcast was sent to. ++ * The flush_mask contains the cpus the broadcast was sent to. + * +- * Returns 1 if all remote flushing was done. The mask is zeroed. +- * Returns 0 if some remote flushing remains to be done. The mask is left +- * unchanged. +- */ +-int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, +- cpumask_t *cpumaskp) ++ * Returns NULL if all remote flushing was done. The mask is zeroed. ++ * Returns @flush_mask if some remote flushing remains to be done. The ++ * mask will have some bits still set. ++ */ ++const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade, ++ struct bau_desc *bau_desc, ++ struct cpumask *flush_mask) + { + int completion_status = 0; + int right_shift; +@@ -257,66 +257,75 @@ int uv_flush_send_and_wait(int cpu, int + * the cpu's, all of which are still in the mask. + */ + __get_cpu_var(ptcstats).ptc_i++; +- return 0; ++ return flush_mask; + } + + /* + * Success, so clear the remote cpu's from the mask so we don't + * use the IPI method of shootdown on them. + */ +- for_each_cpu_mask(bit, *cpumaskp) { ++ for_each_cpu(bit, flush_mask) { + blade = uv_cpu_to_blade_id(bit); + if (blade == this_blade) + continue; +- cpu_clear(bit, *cpumaskp); ++ cpumask_clear_cpu(bit, flush_mask); + } +- if (!cpus_empty(*cpumaskp)) +- return 0; +- return 1; ++ if (!cpumask_empty(flush_mask)) ++ return flush_mask; ++ return NULL; + } + ++static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); ++ + /** + * uv_flush_tlb_others - globally purge translation cache of a virtual + * address or all TLB's +- * @cpumaskp: mask of all cpu's in which the address is to be removed ++ * @cpumask: mask of all cpu's in which the address is to be removed + * @mm: mm_struct containing virtual address range + * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) ++ * @cpu: the current cpu + * + * This is the entry point for initiating any UV global TLB shootdown. + * + * Purges the translation caches of all specified processors of the given + * virtual address, or purges all TLB's on specified processors. + * +- * The caller has derived the cpumaskp from the mm_struct and has subtracted +- * the local cpu from the mask. This function is called only if there +- * are bits set in the mask. (e.g. flush_tlb_page()) ++ * The caller has derived the cpumask from the mm_struct. This function ++ * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) + * +- * The cpumaskp is converted into a nodemask of the nodes containing ++ * The cpumask is converted into a nodemask of the nodes containing + * the cpus. + * +- * Returns 1 if all remote flushing was done. +- * Returns 0 if some remote flushing remains to be done. ++ * Note that this function should be called with preemption disabled. ++ * ++ * Returns NULL if all remote flushing was done. 
++ * Returns pointer to cpumask if some remote flushing remains to be ++ * done. The returned pointer is valid till preemption is re-enabled. + */ +-int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, +- unsigned long va) ++const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, ++ struct mm_struct *mm, ++ unsigned long va, unsigned int cpu) + { ++ struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); + int i; + int bit; + int blade; +- int cpu; ++ int uv_cpu; + int this_blade; + int locals = 0; + struct bau_desc *bau_desc; + +- cpu = uv_blade_processor_id(); ++ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); ++ ++ uv_cpu = uv_blade_processor_id(); + this_blade = uv_numa_blade_id(); + bau_desc = __get_cpu_var(bau_control).descriptor_base; +- bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; ++ bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; + + bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + + i = 0; +- for_each_cpu_mask(bit, *cpumaskp) { ++ for_each_cpu(bit, flush_mask) { + blade = uv_cpu_to_blade_id(bit); + BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); + if (blade == this_blade) { +@@ -331,17 +340,17 @@ int uv_flush_tlb_others(cpumask_t *cpuma + * no off_node flushing; return status for local node + */ + if (locals) +- return 0; ++ return flush_mask; + else +- return 1; ++ return NULL; + } + __get_cpu_var(ptcstats).requestor++; + __get_cpu_var(ptcstats).ntargeted += i; + + bau_desc->payload.address = va; +- bau_desc->payload.sending_cpu = smp_processor_id(); ++ bau_desc->payload.sending_cpu = cpu; + +- return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); ++ return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask); + } + + /* +@@ -747,6 +756,10 @@ static int __init uv_bau_init(void) + if (!is_uv_system()) + return 0; + ++ for_each_possible_cpu(cur_cpu) ++ alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), ++ GFP_KERNEL, cpu_to_node(cur_cpu)); ++ + uv_bau_retry_limit = 1; + uv_nshift = uv_hub_info->n_val; + uv_mmask = (1UL << uv_hub_info->n_val) - 1; +Index: linux-2.6-tip/arch/x86/kernel/topology.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/topology.c ++++ linux-2.6-tip/arch/x86/kernel/topology.c +@@ -25,10 +25,10 @@ + * + * Send feedback to + */ +-#include +-#include + #include + #include ++#include ++#include + #include + + static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); +@@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num) + */ + if (num) + per_cpu(cpu_devices, num).cpu.hotpluggable = 1; ++ + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); + } + EXPORT_SYMBOL(arch_register_cpu); +@@ -56,12 +57,13 @@ void arch_unregister_cpu(int num) + unregister_cpu(&per_cpu(cpu_devices, num).cpu); + } + EXPORT_SYMBOL(arch_unregister_cpu); +-#else ++#else /* CONFIG_HOTPLUG_CPU */ ++ + static int __init arch_register_cpu(int num) + { + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); + } +-#endif /*CONFIG_HOTPLUG_CPU*/ ++#endif /* CONFIG_HOTPLUG_CPU */ + + static int __init topology_init(void) + { +@@ -70,11 +72,11 @@ static int __init topology_init(void) + #ifdef CONFIG_NUMA + for_each_online_node(i) + register_one_node(i); +-#endif /* CONFIG_NUMA */ ++#endif + + for_each_present_cpu(i) + arch_register_cpu(i); ++ + return 0; + } +- + subsys_initcall(topology_init); +Index: linux-2.6-tip/arch/x86/kernel/trampoline_32.S +=================================================================== +--- 
linux-2.6-tip.orig/arch/x86/kernel/trampoline_32.S ++++ linux-2.6-tip/arch/x86/kernel/trampoline_32.S +@@ -29,7 +29,7 @@ + + #include + #include +-#include ++#include + + /* We can free up trampoline after bootup if cpu hotplug is not supported. */ + #ifndef CONFIG_HOTPLUG_CPU +Index: linux-2.6-tip/arch/x86/kernel/trampoline_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/trampoline_64.S ++++ linux-2.6-tip/arch/x86/kernel/trampoline_64.S +@@ -25,10 +25,11 @@ + */ + + #include +-#include +-#include ++#include ++#include + #include + #include ++#include + + .section .rodata, "a", @progbits + +@@ -37,7 +38,7 @@ + ENTRY(trampoline_data) + r_base = . + cli # We should be safe anyway +- wbinvd ++ wbinvd + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + mov %ax, %es +@@ -73,9 +74,8 @@ r_base = . + lidtl tidt - r_base # load idt with 0, 0 + lgdtl tgdt - r_base # load gdt with whatever is appropriate + +- xor %ax, %ax +- inc %ax # protected mode (PE) bit +- lmsw %ax # into protected mode ++ mov $X86_CR0_PE, %ax # protected mode (PE) bit ++ lmsw %ax # into protected mode + + # flush prefetch and jump to startup_32 + ljmpl *(startup_32_vector - r_base) +@@ -86,9 +86,8 @@ startup_32: + movl $__KERNEL_DS, %eax # Initialize the %ds segment register + movl %eax, %ds + +- xorl %eax, %eax +- btsl $5, %eax # Enable PAE mode +- movl %eax, %cr4 ++ movl $X86_CR4_PAE, %eax ++ movl %eax, %cr4 # Enable PAE mode + + # Setup trampoline 4 level pagetables + leal (trampoline_level4_pgt - r_base)(%esi), %eax +@@ -99,9 +98,9 @@ startup_32: + xorl %edx, %edx + wrmsr + +- xorl %eax, %eax +- btsl $31, %eax # Enable paging and in turn activate Long Mode +- btsl $0, %eax # Enable protected mode ++ # Enable paging and in turn activate Long Mode ++ # Enable protected mode ++ movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + /* +Index: linux-2.6-tip/arch/x86/kernel/traps.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/traps.c ++++ linux-2.6-tip/arch/x86/kernel/traps.c +@@ -46,6 +46,7 @@ + #endif + + #include ++#include + #include + #include + #include +@@ -54,15 +55,14 @@ + #include + #include + +-#include ++#include + + #ifdef CONFIG_X86_64 + #include + #include +-#include + #else + #include +-#include ++#include + #include + + #include "cpu/mcheck/mce.h" +@@ -92,9 +92,10 @@ static inline void conditional_sti(struc + local_irq_enable(); + } + +-static inline void preempt_conditional_sti(struct pt_regs *regs) ++static inline void preempt_conditional_sti(struct pt_regs *regs, int stack) + { +- inc_preempt_count(); ++ if (stack) ++ inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } +@@ -105,11 +106,12 @@ static inline void conditional_cli(struc + local_irq_disable(); + } + +-static inline void preempt_conditional_cli(struct pt_regs *regs) ++static inline void preempt_conditional_cli(struct pt_regs *regs, int stack) + { + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +- dec_preempt_count(); ++ if (stack) ++ dec_preempt_count(); + } + + #ifdef CONFIG_X86_32 +@@ -119,47 +121,6 @@ die_if_kernel(const char *str, struct pt + if (!user_mode_vm(regs)) + die(str, regs, err); + } +- +-/* +- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an +- * invalid offset set (the LAZY one) and the faulting thread has +- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, +- * we set the offset field correctly and return 1. 
+- */ +-static int lazy_iobitmap_copy(void) +-{ +- struct thread_struct *thread; +- struct tss_struct *tss; +- int cpu; +- +- cpu = get_cpu(); +- tss = &per_cpu(init_tss, cpu); +- thread = ¤t->thread; +- +- if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && +- thread->io_bitmap_ptr) { +- memcpy(tss->io_bitmap, thread->io_bitmap_ptr, +- thread->io_bitmap_max); +- /* +- * If the previously set map was extending to higher ports +- * than the current one, pad extra space with 0xff (no access). +- */ +- if (thread->io_bitmap_max < tss->io_bitmap_max) { +- memset((char *) tss->io_bitmap + +- thread->io_bitmap_max, 0xff, +- tss->io_bitmap_max - thread->io_bitmap_max); +- } +- tss->io_bitmap_max = thread->io_bitmap_max; +- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; +- tss->io_bitmap_owner = thread; +- put_cpu(); +- +- return 1; +- } +- put_cpu(); +- +- return 0; +-} + #endif + + static void __kprobes +@@ -277,9 +238,9 @@ dotraplinkage void do_stack_segment(stru + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; +- preempt_conditional_sti(regs); ++ preempt_conditional_sti(regs, STACKFAULT_STACK); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); +- preempt_conditional_cli(regs); ++ preempt_conditional_cli(regs, STACKFAULT_STACK); + } + + dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +@@ -310,11 +271,6 @@ do_general_protection(struct pt_regs *re + conditional_sti(regs); + + #ifdef CONFIG_X86_32 +- if (lazy_iobitmap_copy()) { +- /* restart the faulting instruction */ +- return; +- } +- + if (regs->flags & X86_VM_MASK) + goto gp_in_vm86; + #endif +@@ -517,9 +473,9 @@ dotraplinkage void __kprobes do_int3(str + return; + #endif + +- preempt_conditional_sti(regs); ++ preempt_conditional_sti(regs, DEBUG_STACK); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); +- preempt_conditional_cli(regs); ++ preempt_conditional_cli(regs, DEBUG_STACK); + } + + #ifdef CONFIG_X86_64 +@@ -581,6 +537,10 @@ dotraplinkage void __kprobes do_debug(st + + get_debugreg(condition, 6); + ++ /* Catch kmemcheck conditions first of all! */ ++ if (condition & DR_STEP && kmemcheck_trap(regs)) ++ return; ++ + /* + * The processor cleared BTF, so don't mark that we need it set. 
+ */ +@@ -592,7 +552,7 @@ dotraplinkage void __kprobes do_debug(st + return; + + /* It's safe to allow irq's after DR6 has been saved */ +- preempt_conditional_sti(regs); ++ preempt_conditional_sti(regs, DEBUG_STACK); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { +@@ -627,7 +587,7 @@ dotraplinkage void __kprobes do_debug(st + */ + clear_dr7: + set_debugreg(0, 7); +- preempt_conditional_cli(regs); ++ preempt_conditional_cli(regs, DEBUG_STACK); + return; + + #ifdef CONFIG_X86_32 +@@ -642,7 +602,7 @@ debug_vm86: + clear_TF_reenable: + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; +- preempt_conditional_cli(regs); ++ preempt_conditional_cli(regs, DEBUG_STACK); + return; + } + +@@ -914,19 +874,20 @@ void math_emulate(struct math_emu_info * + } + #endif /* CONFIG_MATH_EMULATION */ + +-dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs) ++dotraplinkage void __kprobes ++do_device_not_available(struct pt_regs *regs, long error_code) + { + #ifdef CONFIG_X86_32 + if (read_cr0() & X86_CR0_EM) { + struct math_emu_info info = { }; + +- conditional_sti(®s); ++ conditional_sti(regs); + +- info.regs = ®s; ++ info.regs = regs; + math_emulate(&info); + } else { + math_state_restore(); /* interrupts still off */ +- conditional_sti(®s); ++ conditional_sti(regs); + } + #else + math_state_restore(); +@@ -942,7 +903,7 @@ dotraplinkage void do_iret_error(struct + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; +- info.si_addr = 0; ++ info.si_addr = NULL; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; +@@ -991,8 +952,13 @@ void __init trap_init(void) + #endif + set_intr_gate(19, &simd_coprocessor_error); + ++ /* Reserve all the builtin and the syscall vector: */ ++ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) ++ set_bit(i, used_vectors); ++ + #ifdef CONFIG_IA32_EMULATION + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); ++ set_bit(IA32_SYSCALL_VECTOR, used_vectors); + #endif + + #ifdef CONFIG_X86_32 +@@ -1009,23 +975,15 @@ void __init trap_init(void) + } + + set_system_trap_gate(SYSCALL_VECTOR, &system_call); +-#endif +- +- /* Reserve all the builtin and the syscall vector: */ +- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) +- set_bit(i, used_vectors); +- +-#ifdef CONFIG_X86_64 +- set_bit(IA32_SYSCALL_VECTOR, used_vectors); +-#else + set_bit(SYSCALL_VECTOR, used_vectors); + #endif ++ + /* + * Should be a barrier for any external CPU state: + */ + cpu_init(); + + #ifdef CONFIG_X86_32 +- trap_init_hook(); ++ x86_quirk_trap_init(); + #endif + } +Index: linux-2.6-tip/arch/x86/kernel/tsc.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/tsc.c ++++ linux-2.6-tip/arch/x86/kernel/tsc.c +@@ -17,20 +17,21 @@ + #include + #include + +-unsigned int cpu_khz; /* TSC clocks / usec, not used here */ ++unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ + EXPORT_SYMBOL(cpu_khz); +-unsigned int tsc_khz; ++ ++unsigned int __read_mostly tsc_khz; + EXPORT_SYMBOL(tsc_khz); + + /* + * TSC can be unstable due to cpufreq or due to unsynced TSCs + */ +-static int tsc_unstable; ++static int __read_mostly tsc_unstable; + + /* native_sched_clock() is called before tsc_init(), so + we must start with the TSC soft disabled to prevent + erroneous rdtsc usage on !cpu_has_tsc processors */ +-static int tsc_disabled = -1; ++static int __read_mostly 
tsc_disabled = -1; + + static int tsc_clocksource_reliable; + /* +@@ -793,7 +794,7 @@ __cpuinit int unsynchronized_tsc(void) + if (!cpu_has_tsc || tsc_unstable) + return 1; + +-#ifdef CONFIG_X86_SMP ++#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; + #endif +Index: linux-2.6-tip/arch/x86/kernel/uv_time.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/kernel/uv_time.c +@@ -0,0 +1,393 @@ ++/* ++ * SGI RTC clock/timer routines. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ * ++ * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. ++ * Copyright (c) Dimitri Sivanich ++ */ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define RTC_NAME "sgi_rtc" ++ ++static cycle_t uv_read_rtc(void); ++static int uv_rtc_next_event(unsigned long, struct clock_event_device *); ++static void uv_rtc_timer_setup(enum clock_event_mode, ++ struct clock_event_device *); ++ ++static struct clocksource clocksource_uv = { ++ .name = RTC_NAME, ++ .rating = 400, ++ .read = uv_read_rtc, ++ .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, ++ .shift = 10, ++ .flags = CLOCK_SOURCE_IS_CONTINUOUS, ++}; ++ ++static struct clock_event_device clock_event_device_uv = { ++ .name = RTC_NAME, ++ .features = CLOCK_EVT_FEAT_ONESHOT, ++ .shift = 20, ++ .rating = 400, ++ .irq = -1, ++ .set_next_event = uv_rtc_next_event, ++ .set_mode = uv_rtc_timer_setup, ++ .event_handler = NULL, ++}; ++ ++static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); ++ ++/* There is one of these allocated per node */ ++struct uv_rtc_timer_head { ++ spinlock_t lock; ++ /* next cpu waiting for timer, local node relative: */ ++ int next_cpu; ++ /* number of cpus on this node: */ ++ int ncpus; ++ struct { ++ int lcpu; /* systemwide logical cpu number */ ++ u64 expires; /* next timer expiration for this cpu */ ++ } cpu[1]; ++}; ++ ++/* ++ * Access to uv_rtc_timer_head via blade id. ++ */ ++static struct uv_rtc_timer_head **blade_info __read_mostly; ++ ++static int uv_rtc_enable; ++ ++/* ++ * Hardware interface routines ++ */ ++ ++/* Send IPIs to another node */ ++static void uv_rtc_send_IPI(int cpu) ++{ ++ unsigned long apicid, val; ++ int pnode; ++ ++ apicid = cpu_physical_id(cpu); ++ pnode = uv_apicid_to_pnode(apicid); ++ val = (1UL << UVH_IPI_INT_SEND_SHFT) | ++ (apicid << UVH_IPI_INT_APIC_ID_SHFT) | ++ (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); ++ ++ uv_write_global_mmr64(pnode, UVH_IPI_INT, val); ++} ++ ++/* Check for an RTC interrupt pending */ ++static int uv_intr_pending(int pnode) ++{ ++ return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & ++ UVH_EVENT_OCCURRED0_RTC1_MASK; ++} ++ ++/* Setup interrupt and return non-zero if early expiration occurred. 
*/ ++static int uv_setup_intr(int cpu, u64 expires) ++{ ++ u64 val; ++ int pnode = uv_cpu_to_pnode(cpu); ++ ++ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, ++ UVH_RTC1_INT_CONFIG_M_MASK); ++ uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); ++ ++ uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, ++ UVH_EVENT_OCCURRED0_RTC1_MASK); ++ ++ val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ++ ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); ++ ++ /* Set configuration */ ++ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); ++ /* Initialize comparator value */ ++ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); ++ ++ return (expires < uv_read_rtc() && !uv_intr_pending(pnode)); ++} ++ ++/* ++ * Per-cpu timer tracking routines ++ */ ++ ++static __init void uv_rtc_deallocate_timers(void) ++{ ++ int bid; ++ ++ for_each_possible_blade(bid) { ++ kfree(blade_info[bid]); ++ } ++ kfree(blade_info); ++} ++ ++/* Allocate per-node list of cpu timer expiration times. */ ++static __init int uv_rtc_allocate_timers(void) ++{ ++ int cpu; ++ ++ blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); ++ if (!blade_info) ++ return -ENOMEM; ++ memset(blade_info, 0, uv_possible_blades * sizeof(void *)); ++ ++ for_each_present_cpu(cpu) { ++ int nid = cpu_to_node(cpu); ++ int bid = uv_cpu_to_blade_id(cpu); ++ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; ++ struct uv_rtc_timer_head *head = blade_info[bid]; ++ ++ if (!head) { ++ head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + ++ (uv_blade_nr_possible_cpus(bid) * ++ 2 * sizeof(u64)), ++ GFP_KERNEL, nid); ++ if (!head) { ++ uv_rtc_deallocate_timers(); ++ return -ENOMEM; ++ } ++ spin_lock_init(&head->lock); ++ head->ncpus = uv_blade_nr_possible_cpus(bid); ++ head->next_cpu = -1; ++ blade_info[bid] = head; ++ } ++ ++ head->cpu[bcpu].lcpu = cpu; ++ head->cpu[bcpu].expires = ULLONG_MAX; ++ } ++ ++ return 0; ++} ++ ++/* Find and set the next expiring timer. */ ++static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) ++{ ++ u64 lowest = ULLONG_MAX; ++ int c, bcpu = -1; ++ ++ head->next_cpu = -1; ++ for (c = 0; c < head->ncpus; c++) { ++ u64 exp = head->cpu[c].expires; ++ if (exp < lowest) { ++ bcpu = c; ++ lowest = exp; ++ } ++ } ++ if (bcpu >= 0) { ++ head->next_cpu = bcpu; ++ c = head->cpu[bcpu].lcpu; ++ if (uv_setup_intr(c, lowest)) ++ /* If we didn't set it up in time, trigger */ ++ uv_rtc_send_IPI(c); ++ } else { ++ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, ++ UVH_RTC1_INT_CONFIG_M_MASK); ++ } ++} ++ ++/* ++ * Set expiration time for current cpu. ++ * ++ * Returns 1 if we missed the expiration time. ++ */ ++static int uv_rtc_set_timer(int cpu, u64 expires) ++{ ++ int pnode = uv_cpu_to_pnode(cpu); ++ int bid = uv_cpu_to_blade_id(cpu); ++ struct uv_rtc_timer_head *head = blade_info[bid]; ++ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; ++ u64 *t = &head->cpu[bcpu].expires; ++ unsigned long flags; ++ int next_cpu; ++ ++ spin_lock_irqsave(&head->lock, flags); ++ ++ next_cpu = head->next_cpu; ++ *t = expires; ++ /* Will this one be next to go off? */ ++ if (next_cpu < 0 || bcpu == next_cpu || ++ expires < head->cpu[next_cpu].expires) { ++ head->next_cpu = bcpu; ++ if (uv_setup_intr(cpu, expires)) { ++ *t = ULLONG_MAX; ++ uv_rtc_find_next_timer(head, pnode); ++ spin_unlock_irqrestore(&head->lock, flags); ++ return 1; ++ } ++ } ++ ++ spin_unlock_irqrestore(&head->lock, flags); ++ return 0; ++} ++ ++/* ++ * Unset expiration time for current cpu. 
++ * ++ * Returns 1 if this timer was pending. ++ */ ++static int uv_rtc_unset_timer(int cpu) ++{ ++ int pnode = uv_cpu_to_pnode(cpu); ++ int bid = uv_cpu_to_blade_id(cpu); ++ struct uv_rtc_timer_head *head = blade_info[bid]; ++ int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; ++ u64 *t = &head->cpu[bcpu].expires; ++ unsigned long flags; ++ int rc = 0; ++ ++ spin_lock_irqsave(&head->lock, flags); ++ ++ if (head->next_cpu == bcpu && uv_read_rtc() >= *t) ++ rc = 1; ++ ++ *t = ULLONG_MAX; ++ ++ /* Was the hardware setup for this timer? */ ++ if (head->next_cpu == bcpu) ++ uv_rtc_find_next_timer(head, pnode); ++ ++ spin_unlock_irqrestore(&head->lock, flags); ++ ++ return rc; ++} ++ ++ ++/* ++ * Kernel interface routines. ++ */ ++ ++/* ++ * Read the RTC. ++ */ ++static cycle_t uv_read_rtc(void) ++{ ++ return (cycle_t)uv_read_local_mmr(UVH_RTC); ++} ++ ++/* ++ * Program the next event, relative to now ++ */ ++static int uv_rtc_next_event(unsigned long delta, ++ struct clock_event_device *ced) ++{ ++ int ced_cpu = cpumask_first(ced->cpumask); ++ ++ return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc()); ++} ++ ++/* ++ * Setup the RTC timer in oneshot mode ++ */ ++static void uv_rtc_timer_setup(enum clock_event_mode mode, ++ struct clock_event_device *evt) ++{ ++ int ced_cpu = cpumask_first(evt->cpumask); ++ ++ switch (mode) { ++ case CLOCK_EVT_MODE_PERIODIC: ++ case CLOCK_EVT_MODE_ONESHOT: ++ case CLOCK_EVT_MODE_RESUME: ++ /* Nothing to do here yet */ ++ break; ++ case CLOCK_EVT_MODE_UNUSED: ++ case CLOCK_EVT_MODE_SHUTDOWN: ++ uv_rtc_unset_timer(ced_cpu); ++ break; ++ } ++} ++ ++static void uv_rtc_interrupt(void) ++{ ++ struct clock_event_device *ced = &__get_cpu_var(cpu_ced); ++ int cpu = smp_processor_id(); ++ ++ if (!ced || !ced->event_handler) ++ return; ++ ++ if (uv_rtc_unset_timer(cpu) != 1) ++ return; ++ ++ ced->event_handler(ced); ++} ++ ++static int __init uv_enable_rtc(char *str) ++{ ++ uv_rtc_enable = 1; ++ ++ return 1; ++} ++__setup("uvrtc", uv_enable_rtc); ++ ++static __init void uv_rtc_register_clockevents(struct work_struct *dummy) ++{ ++ struct clock_event_device *ced = &__get_cpu_var(cpu_ced); ++ ++ *ced = clock_event_device_uv; ++ ced->cpumask = cpumask_of(smp_processor_id()); ++ clockevents_register_device(ced); ++} ++ ++static __init int uv_rtc_setup_clock(void) ++{ ++ int rc; ++ ++ if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) ++ return -ENODEV; ++ ++ generic_interrupt_extension = uv_rtc_interrupt; ++ ++ clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, ++ clocksource_uv.shift); ++ ++ rc = clocksource_register(&clocksource_uv); ++ if (rc) { ++ generic_interrupt_extension = NULL; ++ return rc; ++ } ++ ++ /* Setup and register clockevents */ ++ rc = uv_rtc_allocate_timers(); ++ if (rc) { ++ clocksource_unregister(&clocksource_uv); ++ generic_interrupt_extension = NULL; ++ return rc; ++ } ++ ++ clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, ++ NSEC_PER_SEC, clock_event_device_uv.shift); ++ ++ clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / ++ sn_rtc_cycles_per_second; ++ ++ clock_event_device_uv.max_delta_ns = clocksource_uv.mask * ++ (NSEC_PER_SEC / sn_rtc_cycles_per_second); ++ ++ rc = schedule_on_each_cpu(uv_rtc_register_clockevents); ++ if (rc) { ++ clocksource_unregister(&clocksource_uv); ++ generic_interrupt_extension = NULL; ++ uv_rtc_deallocate_timers(); ++ } ++ ++ return rc; ++} ++arch_initcall(uv_rtc_setup_clock); +Index: linux-2.6-tip/arch/x86/kernel/visws_quirks.c 
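The uv_time.c file added above registers the SGI RTC as a clocksource and converts its frequency with clocksource_hz2mult(sn_rtc_cycles_per_second, clocksource_uv.shift), i.e. it picks mult so that (cycles * mult) >> shift yields nanoseconds. A standalone sketch of that fixed-point conversion; the 5 MHz figure is only an assumed stand-in for sn_rtc_cycles_per_second, and hz2mult() below is a local imitation, not the kernel helper:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Roughly what clocksource_hz2mult() computes:
 * mult = (NSEC_PER_SEC << shift) / hz, rounded to nearest. */
static uint32_t hz2mult(uint32_t hz, uint32_t shift)
{
    uint64_t tmp = NSEC_PER_SEC << shift;

    tmp += hz / 2;
    return (uint32_t)(tmp / hz);
}

int main(void)
{
    uint32_t hz    = 5000000;   /* assumed RTC rate, stand-in for sn_rtc_cycles_per_second */
    uint32_t shift = 10;        /* matches clocksource_uv.shift in the patch */
    uint32_t mult  = hz2mult(hz, shift);
    uint64_t one_second_of_cycles = hz;

    printf("mult=%u, 1s of cycles -> %llu ns\n", mult,
           (unsigned long long)((one_second_of_cycles * mult) >> shift));
    return 0;
}

A smaller shift keeps mult small and so leaves more headroom before cycles * mult can overflow 64 bits, at the cost of a little conversion precision, which is presumably why a modest shift of 10 is used for a 64-bit-wide RTC.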
+=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/visws_quirks.c ++++ linux-2.6-tip/arch/x86/kernel/visws_quirks.c +@@ -24,18 +24,14 @@ + + #include + #include +-#include + #include + #include + #include + #include ++#include + #include + #include + +-#include +- +-#include "mach_apic.h" +- + #include + + #include +@@ -49,8 +45,6 @@ + + extern int no_broadcast; + +-#include +- + char visws_board_type = -1; + char visws_board_rev = -1; + +@@ -200,7 +194,7 @@ static void __init MP_processor_info(str + return; + } + +- apic_cpus = apicid_to_cpu_present(m->apicid); ++ apic_cpus = apic->apicid_to_cpu_present(m->apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); + /* + * Validate version +@@ -584,7 +578,7 @@ static struct irq_chip piix4_virtual_irq + static irqreturn_t piix4_master_intr(int irq, void *dev_id) + { + int realirq; +- irq_desc_t *desc; ++ struct irq_desc *desc; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); +@@ -649,11 +643,13 @@ out_unlock: + static struct irqaction master_action = { + .handler = piix4_master_intr, + .name = "PIIX4-8259", ++ .flags = IRQF_NODELAY, + }; + + static struct irqaction cascade_action = { + .handler = no_action, + .name = "cascade", ++ .flags = IRQF_NODELAY, + }; + + +Index: linux-2.6-tip/arch/x86/kernel/vm86_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/vm86_32.c ++++ linux-2.6-tip/arch/x86/kernel/vm86_32.c +@@ -137,6 +137,7 @@ struct pt_regs *save_v86_state(struct ke + local_irq_enable(); + + if (!current->thread.vm86_info) { ++ local_irq_disable(); + printk("no vm86_info: BAD\n"); + do_exit(SIGSEGV); + } +@@ -158,7 +159,7 @@ struct pt_regs *save_v86_state(struct ke + ret = KVM86->regs32; + + ret->fs = current->thread.saved_fs; +- loadsegment(gs, current->thread.saved_gs); ++ set_user_gs(ret, current->thread.saved_gs); + + return ret; + } +@@ -197,9 +198,9 @@ out: + static int do_vm86_irq_handling(int subfunction, int irqnumber); + static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); + +-asmlinkage int sys_vm86old(struct pt_regs regs) ++int sys_vm86old(struct pt_regs *regs) + { +- struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; ++ struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx; + struct kernel_vm86_struct info; /* declare this _on top_, + * this avoids wasting of stack space. + * This remains on the stack until we +@@ -218,7 +219,7 @@ asmlinkage int sys_vm86old(struct pt_reg + if (tmp) + goto out; + memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); +- info.regs32 = ®s; ++ info.regs32 = regs; + tsk->thread.vm86_info = v86; + do_sys_vm86(&info, tsk); + ret = 0; /* we never return here */ +@@ -227,7 +228,7 @@ out: + } + + +-asmlinkage int sys_vm86(struct pt_regs regs) ++int sys_vm86(struct pt_regs *regs) + { + struct kernel_vm86_struct info; /* declare this _on top_, + * this avoids wasting of stack space. 
+@@ -239,12 +240,12 @@ asmlinkage int sys_vm86(struct pt_regs r + struct vm86plus_struct __user *v86; + + tsk = current; +- switch (regs.bx) { ++ switch (regs->bx) { + case VM86_REQUEST_IRQ: + case VM86_FREE_IRQ: + case VM86_GET_IRQ_BITS: + case VM86_GET_AND_RESET_IRQ: +- ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); ++ ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); + goto out; + case VM86_PLUS_INSTALL_CHECK: + /* +@@ -261,14 +262,14 @@ asmlinkage int sys_vm86(struct pt_regs r + ret = -EPERM; + if (tsk->thread.saved_sp0) + goto out; +- v86 = (struct vm86plus_struct __user *)regs.cx; ++ v86 = (struct vm86plus_struct __user *)regs->cx; + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, regs32) - + sizeof(info.regs)); + ret = -EFAULT; + if (tmp) + goto out; +- info.regs32 = ®s; ++ info.regs32 = regs; + info.vm86plus.is_vm86pus = 1; + tsk->thread.vm86_info = (struct vm86_struct __user *)v86; + do_sys_vm86(&info, tsk); +@@ -323,7 +324,7 @@ static void do_sys_vm86(struct kernel_vm + info->regs32->ax = 0; + tsk->thread.saved_sp0 = tsk->thread.sp0; + tsk->thread.saved_fs = info->regs32->fs; +- savesegment(gs, tsk->thread.saved_gs); ++ tsk->thread.saved_gs = get_user_gs(info->regs32); + + tss = &per_cpu(init_tss, get_cpu()); + tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; +Index: linux-2.6-tip/arch/x86/kernel/vmi_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/vmi_32.c ++++ linux-2.6-tip/arch/x86/kernel/vmi_32.c +@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *pt + vmi_ops.update_pte(ptep, VMI_PAGE_PT); + } + +-static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +-{ +- vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); +-} +- + static void vmi_set_pud(pud_t *pudp, pud_t pudval) + { + /* Um, eww */ +@@ -680,10 +675,11 @@ static inline int __init activate_vmi(vo + para_fill(pv_mmu_ops.write_cr2, SetCR2); + para_fill(pv_mmu_ops.write_cr3, SetCR3); + para_fill(pv_cpu_ops.write_cr4, SetCR4); +- para_fill(pv_irq_ops.save_fl, GetInterruptMask); +- para_fill(pv_irq_ops.restore_fl, SetInterruptMask); +- para_fill(pv_irq_ops.irq_disable, DisableInterrupts); +- para_fill(pv_irq_ops.irq_enable, EnableInterrupts); ++ ++ para_fill(pv_irq_ops.save_fl.func, GetInterruptMask); ++ para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask); ++ para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts); ++ para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts); + + para_fill(pv_cpu_ops.wbinvd, WBINVD); + para_fill(pv_cpu_ops.read_tsc, RDTSC); +@@ -749,7 +745,6 @@ static inline int __init activate_vmi(vo + pv_mmu_ops.set_pmd = vmi_set_pmd; + #ifdef CONFIG_X86_PAE + pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; +- pv_mmu_ops.set_pte_present = vmi_set_pte_present; + pv_mmu_ops.set_pud = vmi_set_pud; + pv_mmu_ops.pte_clear = vmi_pte_clear; + pv_mmu_ops.pmd_clear = vmi_pmd_clear; +@@ -797,8 +792,8 @@ static inline int __init activate_vmi(vo + #endif + + #ifdef CONFIG_X86_LOCAL_APIC +- para_fill(apic_ops->read, APICRead); +- para_fill(apic_ops->write, APICWrite); ++ para_fill(apic->read, APICRead); ++ para_fill(apic->write, APICWrite); + #endif + + /* +Index: linux-2.6-tip/arch/x86/kernel/vmiclock_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/vmiclock_32.c ++++ linux-2.6-tip/arch/x86/kernel/vmiclock_32.c +@@ -28,7 +28,6 @@ + + #include + #include 
+-#include + #include + #include + #include +@@ -256,7 +255,7 @@ void __devinit vmi_time_bsp_init(void) + */ + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + local_irq_disable(); +-#ifdef CONFIG_X86_SMP ++#ifdef CONFIG_SMP + /* + * XXX handle_percpu_irq only defined for SMP; we need to switch over + * to using it, since this is a local interrupt, which each CPU must +@@ -288,8 +287,7 @@ static struct clocksource clocksource_vm + static cycle_t read_real_cycles(void) + { + cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); +- return ret >= clocksource_vmi.cycle_last ? +- ret : clocksource_vmi.cycle_last; ++ return max(ret, clocksource_vmi.cycle_last); + } + + static struct clocksource clocksource_vmi = { +Index: linux-2.6-tip/arch/x86/kernel/vmlinux_32.lds.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/vmlinux_32.lds.S ++++ linux-2.6-tip/arch/x86/kernel/vmlinux_32.lds.S +@@ -12,7 +12,7 @@ + + #include + #include +-#include ++#include + #include + #include + +@@ -178,14 +178,7 @@ SECTIONS + __initramfs_end = .; + } + #endif +- . = ALIGN(PAGE_SIZE); +- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { +- __per_cpu_start = .; +- *(.data.percpu.page_aligned) +- *(.data.percpu) +- *(.data.percpu.shared_aligned) +- __per_cpu_end = .; +- } ++ PERCPU(PAGE_SIZE) + . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ + +@@ -196,15 +189,24 @@ SECTIONS + *(.bss) + . = ALIGN(4); + __bss_stop = .; +- _end = . ; +- /* This is where the kernel creates the early boot page tables */ ++ } ++ ++ .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); +- pg0 = . ; ++ __brk_base = . ; ++ . += 64 * 1024 ; /* 64k alignment slop space */ ++ *(.brk_reservation) /* areas brk users have reserved */ ++ __brk_limit = . ; ++ } ++ ++ .end : AT(ADDR(.end) - LOAD_OFFSET) { ++ _end = . ; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) ++ *(.discard) + } + + STABS_DEBUG +@@ -212,6 +214,12 @@ SECTIONS + DWARF_DEBUG + } + ++/* ++ * Build-time check on the image size: ++ */ ++ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), ++ "kernel image bigger than KERNEL_IMAGE_SIZE") ++ + #ifdef CONFIG_KEXEC + /* Link time checks */ + #include +Index: linux-2.6-tip/arch/x86/kernel/vmlinux_64.lds.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/vmlinux_64.lds.S ++++ linux-2.6-tip/arch/x86/kernel/vmlinux_64.lds.S +@@ -5,7 +5,8 @@ + #define LOAD_OFFSET __START_KERNEL_map + + #include +-#include ++#include ++#include + + #undef i386 /* in case the preprocessor is a 32bit one */ + +@@ -13,20 +14,23 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86 + OUTPUT_ARCH(i386:x86-64) + ENTRY(phys_startup_64) + jiffies_64 = jiffies; +-_proxy_pda = 1; + PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ + data.init PT_LOAD FLAGS(7); /* RWE */ ++#ifdef CONFIG_SMP ++ percpu PT_LOAD FLAGS(7); /* RWE */ ++#endif ++ data.init2 PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(0); /* ___ */ + } + SECTIONS + { + . 
= __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +- _text = .; /* Text and read-only data */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { ++ _text = .; /* Text and read-only data */ + /* First the code that has to be first for bootstrapping */ + *(.text.head) + _stext = .; +@@ -57,13 +61,13 @@ SECTIONS + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS ++ _edata = .; /* End of data section */ + } :data + +- _edata = .; /* End of data section */ + + . = ALIGN(PAGE_SIZE); +- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { ++ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + *(.data.cacheline_aligned) + } + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); +@@ -121,29 +125,29 @@ SECTIONS + #undef VVIRT_OFFSET + #undef VVIRT + +- . = ALIGN(THREAD_SIZE); /* init_task */ + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { ++ . = ALIGN(THREAD_SIZE); /* init_task */ + *(.data.init_task) + }:data.init + +- . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { ++ . = ALIGN(PAGE_SIZE); + *(.data.page_aligned) + } + +- /* might get freed after init */ +- . = ALIGN(PAGE_SIZE); +- __smp_alt_begin = .; +- __smp_locks = .; + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { ++ /* might get freed after init */ ++ . = ALIGN(PAGE_SIZE); ++ __smp_alt_begin = .; ++ __smp_locks = .; + *(.smp_locks) ++ __smp_locks_end = .; ++ . = ALIGN(PAGE_SIZE); ++ __smp_alt_end = .; + } +- __smp_locks_end = .; +- . = ALIGN(PAGE_SIZE); +- __smp_alt_end = .; + + . = ALIGN(PAGE_SIZE); /* Init code and data */ +- __init_begin = .; ++ __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT +@@ -155,40 +159,42 @@ SECTIONS + __initdata_end = .; + } + +- . = ALIGN(16); +- __setup_start = .; +- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } +- __setup_end = .; +- __initcall_start = .; ++ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { ++ . = ALIGN(16); ++ __setup_start = .; ++ *(.init.setup) ++ __setup_end = .; ++ } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { ++ __initcall_start = .; + INITCALLS ++ __initcall_end = .; + } +- __initcall_end = .; +- __con_initcall_start = .; + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { ++ __con_initcall_start = .; + *(.con_initcall.init) ++ __con_initcall_end = .; + } +- __con_initcall_end = .; +- __x86_cpu_dev_start = .; + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { ++ __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) ++ __x86_cpu_dev_end = .; + } +- __x86_cpu_dev_end = .; + SECURITY_INIT + + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { +- __parainstructions = .; ++ __parainstructions = .; + *(.parainstructions) +- __parainstructions_end = .; ++ __parainstructions_end = .; + } + +- . = ALIGN(8); +- __alt_instructions = .; + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { ++ . = ALIGN(8); ++ __alt_instructions = .; + *(.altinstructions) ++ __alt_instructions_end = .; + } +- __alt_instructions_end = .; + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } +@@ -203,28 +209,53 @@ SECTIONS + + #ifdef CONFIG_BLK_DEV_INITRD + . 
= ALIGN(PAGE_SIZE); +- __initramfs_start = .; +- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } +- __initramfs_end = .; ++ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { ++ __initramfs_start = .; ++ *(.init.ramfs) ++ __initramfs_end = .; ++ } + #endif + ++#ifdef CONFIG_SMP ++ /* ++ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the ++ * output PHDR, so the next output section - __data_nosave - should ++ * start another section data.init2. Also, pda should be at the head of ++ * percpu area. Preallocate it and define the percpu offset symbol ++ * so that it can be accessed as a percpu variable. ++ */ ++ . = ALIGN(PAGE_SIZE); ++ PERCPU_VADDR(0, :percpu) ++#else + PERCPU(PAGE_SIZE) ++#endif + + . = ALIGN(PAGE_SIZE); + __init_end = .; + +- . = ALIGN(PAGE_SIZE); +- __nosave_begin = .; +- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } +- . = ALIGN(PAGE_SIZE); +- __nosave_end = .; ++ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { ++ . = ALIGN(PAGE_SIZE); ++ __nosave_begin = .; ++ *(.data.nosave) ++ . = ALIGN(PAGE_SIZE); ++ __nosave_end = .; ++ } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ + +- __bss_start = .; /* BSS */ + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { ++ . = ALIGN(PAGE_SIZE); ++ __bss_start = .; /* BSS */ + *(.bss.page_aligned) + *(.bss) +- } +- __bss_stop = .; ++ __bss_stop = .; ++ } ++ ++ .brk : AT(ADDR(.brk) - LOAD_OFFSET) { ++ . = ALIGN(PAGE_SIZE); ++ __brk_base = . ; ++ . += 64 * 1024 ; /* 64k alignment slop space */ ++ *(.brk_reservation) /* areas brk users have reserved */ ++ __brk_limit = . ; ++ } + + _end = . ; + +@@ -232,6 +263,7 @@ SECTIONS + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) ++ *(.discard) + } + + STABS_DEBUG +@@ -239,8 +271,28 @@ SECTIONS + DWARF_DEBUG + } + ++ /* ++ * Per-cpu symbols which need to be offset from __per_cpu_load ++ * for the boot processor. ++ */ ++#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load ++INIT_PER_CPU(gdt_page); ++INIT_PER_CPU(irq_stack_union); ++ + /* + * Build-time check on the image size: + */ + ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") ++ ++#ifdef CONFIG_SMP ++ASSERT((per_cpu__irq_stack_union == 0), ++ "irq_stack_union is not at start of per-cpu area"); ++#endif ++ ++#ifdef CONFIG_KEXEC ++#include ++ ++ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, ++ "kexec control code size is too big") ++#endif +Index: linux-2.6-tip/arch/x86/kernel/vsmp_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/vsmp_64.c ++++ linux-2.6-tip/arch/x86/kernel/vsmp_64.c +@@ -22,7 +22,7 @@ + #include + #include + +-#if defined CONFIG_PCI && defined CONFIG_PARAVIRT ++#ifdef CONFIG_PARAVIRT + /* + * Interrupt control on vSMPowered systems: + * ~AC is a shadow of IF. 
If IF is 'on' AC should be 'off' +@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void) + flags &= ~X86_EFLAGS_IF; + return flags; + } ++PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl); + + static void vsmp_restore_fl(unsigned long flags) + { +@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned lon + flags |= X86_EFLAGS_AC; + native_restore_fl(flags); + } ++PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); + + static void vsmp_irq_disable(void) + { +@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void) + + native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); + } ++PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); + + static void vsmp_irq_enable(void) + { +@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void) + + native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); + } ++PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable); + + static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void) + cap, ctl); + if (cap & ctl & (1 << 4)) { + /* Setup irq ops and turn on vSMP IRQ fastpath handling */ +- pv_irq_ops.irq_disable = vsmp_irq_disable; +- pv_irq_ops.irq_enable = vsmp_irq_enable; +- pv_irq_ops.save_fl = vsmp_save_fl; +- pv_irq_ops.restore_fl = vsmp_restore_fl; ++ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); ++ pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable); ++ pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); ++ pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); + pv_init_ops.patch = vsmp_patch; + + ctl &= ~(1 << 4); +@@ -110,7 +114,6 @@ static void __init set_vsmp_pv_ops(void) + } + #endif + +-#ifdef CONFIG_PCI + static int is_vsmp = -1; + + static void __init detect_vsmp_box(void) +@@ -135,15 +138,6 @@ int is_vsmp_box(void) + return 0; + } + } +-#else +-static void __init detect_vsmp_box(void) +-{ +-} +-int is_vsmp_box(void) +-{ +- return 0; +-} +-#endif + + void __init vsmp_init(void) + { +Index: linux-2.6-tip/arch/x86/kernel/x8664_ksyms_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/x8664_ksyms_64.c ++++ linux-2.6-tip/arch/x86/kernel/x8664_ksyms_64.c +@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy); + EXPORT_SYMBOL(empty_zero_page); + EXPORT_SYMBOL(init_level4_pgt); + EXPORT_SYMBOL(load_gs_index); +- +-EXPORT_SYMBOL(_proxy_pda); +Index: linux-2.6-tip/arch/x86/kvm/Kconfig +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kvm/Kconfig ++++ linux-2.6-tip/arch/x86/kvm/Kconfig +@@ -59,7 +59,8 @@ config KVM_AMD + + config KVM_TRACE + bool "KVM trace support" +- depends on KVM && MARKERS && SYSFS ++ depends on KVM && SYSFS ++ select MARKERS + select RELAY + select DEBUG_FS + default n +Index: linux-2.6-tip/arch/x86/lguest/Kconfig +=================================================================== +--- linux-2.6-tip.orig/arch/x86/lguest/Kconfig ++++ linux-2.6-tip/arch/x86/lguest/Kconfig +@@ -3,7 +3,6 @@ config LGUEST_GUEST + select PARAVIRT + depends on X86_32 + depends on !X86_PAE +- depends on !X86_VOYAGER + select VIRTIO + select VIRTIO_RING + select VIRTIO_CONSOLE +Index: linux-2.6-tip/arch/x86/lguest/boot.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/lguest/boot.c ++++ linux-2.6-tip/arch/x86/lguest/boot.c +@@ -173,24 +173,29 @@ static unsigned long save_fl(void) + { + return lguest_data.irq_enabled; + } ++PV_CALLEE_SAVE_REGS_THUNK(save_fl); + + /* restore_flags() just sets the flags back 
to the value given. */ + static void restore_fl(unsigned long flags) + { + lguest_data.irq_enabled = flags; + } ++PV_CALLEE_SAVE_REGS_THUNK(restore_fl); + + /* Interrupts go off... */ + static void irq_disable(void) + { + lguest_data.irq_enabled = 0; + } ++PV_CALLEE_SAVE_REGS_THUNK(irq_disable); + + /* Interrupts go on... */ + static void irq_enable(void) + { + lguest_data.irq_enabled = X86_EFLAGS_IF; + } ++PV_CALLEE_SAVE_REGS_THUNK(irq_enable); ++ + /*:*/ + /*M:003 Note that we don't check for outstanding interrupts when we re-enable + * them (or when we unmask an interrupt). This seems to work for the moment, +@@ -278,7 +283,7 @@ static void lguest_load_tls(struct threa + /* There's one problem which normal hardware doesn't have: the Host + * can't handle us removing entries we're currently using. So we clear + * the GS register here: if it's needed it'll be reloaded anyway. */ +- loadsegment(gs, 0); ++ lazy_load_gs(0); + lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); + } + +@@ -836,13 +841,14 @@ static u32 lguest_apic_safe_wait_icr_idl + return 0; + } + +-static struct apic_ops lguest_basic_apic_ops = { +- .read = lguest_apic_read, +- .write = lguest_apic_write, +- .icr_read = lguest_apic_icr_read, +- .icr_write = lguest_apic_icr_write, +- .wait_icr_idle = lguest_apic_wait_icr_idle, +- .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle, ++static void set_lguest_basic_apic_ops(void) ++{ ++ apic->read = lguest_apic_read; ++ apic->write = lguest_apic_write; ++ apic->icr_read = lguest_apic_icr_read; ++ apic->icr_write = lguest_apic_icr_write; ++ apic->wait_icr_idle = lguest_apic_wait_icr_idle; ++ apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; + }; + #endif + +@@ -997,10 +1003,10 @@ __init void lguest_init(void) + + /* interrupt-related operations */ + pv_irq_ops.init_IRQ = lguest_init_IRQ; +- pv_irq_ops.save_fl = save_fl; +- pv_irq_ops.restore_fl = restore_fl; +- pv_irq_ops.irq_disable = irq_disable; +- pv_irq_ops.irq_enable = irq_enable; ++ pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); ++ pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); ++ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); ++ pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); + pv_irq_ops.safe_halt = lguest_safe_halt; + + /* init-time operations */ +@@ -1045,7 +1051,7 @@ __init void lguest_init(void) + + #ifdef CONFIG_X86_LOCAL_APIC + /* apic read/write intercepts */ +- apic_ops = &lguest_basic_apic_ops; ++ set_lguest_basic_apic_ops(); + #endif + + /* time operations */ +@@ -1060,14 +1066,6 @@ __init void lguest_init(void) + * lguest_init() where the rest of the fairly chaotic boot setup + * occurs. */ + +- /* The native boot code sets up initial page tables immediately after +- * the kernel itself, and sets init_pg_tables_end so they're not +- * clobbered. The Launcher places our initial pagetables somewhere at +- * the top of our physical memory, so we don't need extra space: set +- * init_pg_tables_end to the end of the kernel. */ +- init_pg_tables_start = __pa(pg0); +- init_pg_tables_end = __pa(pg0); +- + /* As described in head_32.S, we map the first 128M of memory. 
*/ + max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; + +Index: linux-2.6-tip/arch/x86/lib/getuser.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/lib/getuser.S ++++ linux-2.6-tip/arch/x86/lib/getuser.S +@@ -28,7 +28,7 @@ + + #include + #include +-#include ++#include + #include + #include + #include +Index: linux-2.6-tip/arch/x86/lib/memcpy_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/lib/memcpy_64.S ++++ linux-2.6-tip/arch/x86/lib/memcpy_64.S +@@ -1,30 +1,38 @@ + /* Copyright 2002 Andi Kleen */ + + #include +-#include ++ + #include ++#include + + /* + * memcpy - Copy a memory block. + * +- * Input: +- * rdi destination +- * rsi source +- * rdx count +- * ++ * Input: ++ * rdi destination ++ * rsi source ++ * rdx count ++ * + * Output: + * rax original destination +- */ ++ */ + ++/* ++ * memcpy_c() - fast string ops (REP MOVSQ) based variant. ++ * ++ * Calls to this get patched into the kernel image via the ++ * alternative instructions framework: ++ */ + ALIGN + memcpy_c: + CFI_STARTPROC +- movq %rdi,%rax +- movl %edx,%ecx +- shrl $3,%ecx +- andl $7,%edx ++ movq %rdi, %rax ++ ++ movl %edx, %ecx ++ shrl $3, %ecx ++ andl $7, %edx + rep movsq +- movl %edx,%ecx ++ movl %edx, %ecx + rep movsb + ret + CFI_ENDPROC +@@ -33,99 +41,110 @@ ENDPROC(memcpy_c) + ENTRY(__memcpy) + ENTRY(memcpy) + CFI_STARTPROC +- pushq %rbx +- CFI_ADJUST_CFA_OFFSET 8 +- CFI_REL_OFFSET rbx, 0 +- movq %rdi,%rax + +- movl %edx,%ecx +- shrl $6,%ecx ++ /* ++ * Put the number of full 64-byte blocks into %ecx. ++ * Tail portion is handled at the end: ++ */ ++ movq %rdi, %rax ++ movl %edx, %ecx ++ shrl $6, %ecx + jz .Lhandle_tail + + .p2align 4 + .Lloop_64: ++ /* ++ * We decrement the loop index here - and the zero-flag is ++ * checked at the end of the loop (instructions inbetween do ++ * not change the zero flag): ++ */ + decl %ecx + +- movq (%rsi),%r11 +- movq 8(%rsi),%r8 +- +- movq %r11,(%rdi) +- movq %r8,1*8(%rdi) +- +- movq 2*8(%rsi),%r9 +- movq 3*8(%rsi),%r10 ++ /* ++ * Move in blocks of 4x16 bytes: ++ */ ++ movq 0*8(%rsi), %r11 ++ movq 1*8(%rsi), %r8 ++ movq %r11, 0*8(%rdi) ++ movq %r8, 1*8(%rdi) ++ ++ movq 2*8(%rsi), %r9 ++ movq 3*8(%rsi), %r10 ++ movq %r9, 2*8(%rdi) ++ movq %r10, 3*8(%rdi) ++ ++ movq 4*8(%rsi), %r11 ++ movq 5*8(%rsi), %r8 ++ movq %r11, 4*8(%rdi) ++ movq %r8, 5*8(%rdi) ++ ++ movq 6*8(%rsi), %r9 ++ movq 7*8(%rsi), %r10 ++ movq %r9, 6*8(%rdi) ++ movq %r10, 7*8(%rdi) + +- movq %r9,2*8(%rdi) +- movq %r10,3*8(%rdi) ++ leaq 64(%rsi), %rsi ++ leaq 64(%rdi), %rdi + +- movq 4*8(%rsi),%r11 +- movq 5*8(%rsi),%r8 +- +- movq %r11,4*8(%rdi) +- movq %r8,5*8(%rdi) +- +- movq 6*8(%rsi),%r9 +- movq 7*8(%rsi),%r10 +- +- movq %r9,6*8(%rdi) +- movq %r10,7*8(%rdi) +- +- leaq 64(%rsi),%rsi +- leaq 64(%rdi),%rdi + jnz .Lloop_64 + + .Lhandle_tail: +- movl %edx,%ecx +- andl $63,%ecx +- shrl $3,%ecx ++ movl %edx, %ecx ++ andl $63, %ecx ++ shrl $3, %ecx + jz .Lhandle_7 ++ + .p2align 4 + .Lloop_8: + decl %ecx +- movq (%rsi),%r8 +- movq %r8,(%rdi) +- leaq 8(%rdi),%rdi +- leaq 8(%rsi),%rsi ++ movq (%rsi), %r8 ++ movq %r8, (%rdi) ++ leaq 8(%rdi), %rdi ++ leaq 8(%rsi), %rsi + jnz .Lloop_8 + + .Lhandle_7: +- movl %edx,%ecx +- andl $7,%ecx +- jz .Lende ++ movl %edx, %ecx ++ andl $7, %ecx ++ jz .Lend ++ + .p2align 4 + .Lloop_1: +- movb (%rsi),%r8b +- movb %r8b,(%rdi) ++ movb (%rsi), %r8b ++ movb %r8b, (%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +-.Lende: +- popq %rbx +- CFI_ADJUST_CFA_OFFSET -8 +- 
CFI_RESTORE rbx ++.Lend: + ret +-.Lfinal: + CFI_ENDPROC + ENDPROC(memcpy) + ENDPROC(__memcpy) + +- /* Some CPUs run faster using the string copy instructions. +- It is also a lot simpler. Use this when possible */ ++ /* ++ * Some CPUs run faster using the string copy instructions. ++ * It is also a lot simpler. Use this when possible: ++ */ + +- .section .altinstr_replacement,"ax" ++ .section .altinstr_replacement, "ax" + 1: .byte 0xeb /* jmp */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ + 2: + .previous +- .section .altinstructions,"a" ++ ++ .section .altinstructions, "a" + .align 8 + .quad memcpy + .quad 1b + .byte X86_FEATURE_REP_GOOD +- /* Replace only beginning, memcpy is used to apply alternatives, so it +- * is silly to overwrite itself with nops - reboot is only outcome... */ ++ ++ /* ++ * Replace only beginning, memcpy is used to apply alternatives, ++ * so it is silly to overwrite itself with nops - reboot is the ++ * only outcome... ++ */ + .byte 2b - 1b + .byte 2b - 1b + .previous +Index: linux-2.6-tip/arch/x86/mach-default/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-default/Makefile ++++ /dev/null +@@ -1,5 +0,0 @@ +-# +-# Makefile for the linux kernel. +-# +- +-obj-y := setup.o +Index: linux-2.6-tip/arch/x86/mach-default/setup.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-default/setup.c ++++ /dev/null +@@ -1,174 +0,0 @@ +-/* +- * Machine specific setup for generic +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +- +-#ifdef CONFIG_HOTPLUG_CPU +-#define DEFAULT_SEND_IPI (1) +-#else +-#define DEFAULT_SEND_IPI (0) +-#endif +- +-int no_broadcast = DEFAULT_SEND_IPI; +- +-/** +- * pre_intr_init_hook - initialisation prior to setting up interrupt vectors +- * +- * Description: +- * Perform any necessary interrupt initialisation prior to setting up +- * the "ordinary" interrupt call gates. For legacy reasons, the ISA +- * interrupts should be initialised here if the machine emulates a PC +- * in any way. +- **/ +-void __init pre_intr_init_hook(void) +-{ +- if (x86_quirks->arch_pre_intr_init) { +- if (x86_quirks->arch_pre_intr_init()) +- return; +- } +- init_ISA_irqs(); +-} +- +-/* +- * IRQ2 is cascade interrupt to second interrupt controller +- */ +-static struct irqaction irq2 = { +- .handler = no_action, +- .mask = CPU_MASK_NONE, +- .name = "cascade", +-}; +- +-/** +- * intr_init_hook - post gate setup interrupt initialisation +- * +- * Description: +- * Fill in any interrupts that may have been left out by the general +- * init_IRQ() routine. interrupts having to do with the machine rather +- * than the devices on the I/O bus (like APIC interrupts in intel MP +- * systems) are started here. +- **/ +-void __init intr_init_hook(void) +-{ +- if (x86_quirks->arch_intr_init) { +- if (x86_quirks->arch_intr_init()) +- return; +- } +- if (!acpi_ioapic) +- setup_irq(2, &irq2); +- +-} +- +-/** +- * pre_setup_arch_hook - hook called prior to any setup_arch() execution +- * +- * Description: +- * generally used to activate any machine specific identification +- * routines that may be needed before setup_arch() runs. On Voyager +- * this is used to get the board revision and type. +- **/ +-void __init pre_setup_arch_hook(void) +-{ +-} +- +-/** +- * trap_init_hook - initialise system specific traps +- * +- * Description: +- * Called as the final act of trap_init(). 
Used in VISWS to initialise +- * the various board specific APIC traps. +- **/ +-void __init trap_init_hook(void) +-{ +- if (x86_quirks->arch_trap_init) { +- if (x86_quirks->arch_trap_init()) +- return; +- } +-} +- +-static struct irqaction irq0 = { +- .handler = timer_interrupt, +- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, +- .mask = CPU_MASK_NONE, +- .name = "timer" +-}; +- +-/** +- * pre_time_init_hook - do any specific initialisations before. +- * +- **/ +-void __init pre_time_init_hook(void) +-{ +- if (x86_quirks->arch_pre_time_init) +- x86_quirks->arch_pre_time_init(); +-} +- +-/** +- * time_init_hook - do any specific initialisations for the system timer. +- * +- * Description: +- * Must plug the system timer interrupt source at HZ into the IRQ listed +- * in irq_vectors.h:TIMER_IRQ +- **/ +-void __init time_init_hook(void) +-{ +- if (x86_quirks->arch_time_init) { +- /* +- * A nonzero return code does not mean failure, it means +- * that the architecture quirk does not want any +- * generic (timer) setup to be performed after this: +- */ +- if (x86_quirks->arch_time_init()) +- return; +- } +- +- irq0.mask = cpumask_of_cpu(0); +- setup_irq(0, &irq0); +-} +- +-#ifdef CONFIG_MCA +-/** +- * mca_nmi_hook - hook into MCA specific NMI chain +- * +- * Description: +- * The MCA (Microchannel Architecture) has an NMI chain for NMI sources +- * along the MCA bus. Use this to hook into that chain if you will need +- * it. +- **/ +-void mca_nmi_hook(void) +-{ +- /* +- * If I recall correctly, there's a whole bunch of other things that +- * we can do to check for NMI problems, but that's all I know about +- * at the moment. +- */ +- pr_warning("NMI generated from unknown source!\n"); +-} +-#endif +- +-static __init int no_ipi_broadcast(char *str) +-{ +- get_option(&str, &no_broadcast); +- pr_info("Using %s mode\n", +- no_broadcast ? "No IPI Broadcast" : "IPI Broadcast"); +- return 1; +-} +-__setup("no_ipi_broadcast=", no_ipi_broadcast); +- +-static int __init print_ipi_mode(void) +-{ +- pr_info("Using IPI %s mode\n", +- no_broadcast ? "No-Shortcut" : "Shortcut"); +- return 0; +-} +- +-late_initcall(print_ipi_mode); +- +Index: linux-2.6-tip/arch/x86/mach-generic/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-generic/Makefile ++++ /dev/null +@@ -1,11 +0,0 @@ +-# +-# Makefile for the generic architecture +-# +- +-EXTRA_CFLAGS := -Iarch/x86/kernel +- +-obj-y := probe.o default.o +-obj-$(CONFIG_X86_NUMAQ) += numaq.o +-obj-$(CONFIG_X86_SUMMIT) += summit.o +-obj-$(CONFIG_X86_BIGSMP) += bigsmp.o +-obj-$(CONFIG_X86_ES7000) += es7000.o +Index: linux-2.6-tip/arch/x86/mach-generic/bigsmp.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-generic/bigsmp.c ++++ /dev/null +@@ -1,60 +0,0 @@ +-/* +- * APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs. +- * Drives the local APIC in "clustered mode". 
+- */ +-#define APIC_DEFINITION 1 +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-static int dmi_bigsmp; /* can be set by dmi scanners */ +- +-static int hp_ht_bigsmp(const struct dmi_system_id *d) +-{ +- printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); +- dmi_bigsmp = 1; +- return 0; +-} +- +- +-static const struct dmi_system_id bigsmp_dmi_table[] = { +- { hp_ht_bigsmp, "HP ProLiant DL760 G2", +- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), +- DMI_MATCH(DMI_BIOS_VERSION, "P44-"),} +- }, +- +- { hp_ht_bigsmp, "HP ProLiant DL740", +- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), +- DMI_MATCH(DMI_BIOS_VERSION, "P47-"),} +- }, +- { } +-}; +- +-static void vector_allocation_domain(int cpu, cpumask_t *retmask) +-{ +- cpus_clear(*retmask); +- cpu_set(cpu, *retmask); +-} +- +-static int probe_bigsmp(void) +-{ +- if (def_to_bigsmp) +- dmi_bigsmp = 1; +- else +- dmi_check_system(bigsmp_dmi_table); +- return dmi_bigsmp; +-} +- +-struct genapic apic_bigsmp = APIC_INIT("bigsmp", probe_bigsmp); +Index: linux-2.6-tip/arch/x86/mach-generic/default.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-generic/default.c ++++ /dev/null +@@ -1,27 +0,0 @@ +-/* +- * Default generic APIC driver. This handles up to 8 CPUs. +- */ +-#define APIC_DEFINITION 1 +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* should be called last. */ +-static int probe_default(void) +-{ +- return 1; +-} +- +-struct genapic apic_default = APIC_INIT("default", probe_default); +Index: linux-2.6-tip/arch/x86/mach-generic/es7000.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-generic/es7000.c ++++ /dev/null +@@ -1,103 +0,0 @@ +-/* +- * APIC driver for the Unisys ES7000 chipset. 
+- */ +-#define APIC_DEFINITION 1 +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-void __init es7000_update_genapic_to_cluster(void) +-{ +- genapic->target_cpus = target_cpus_cluster; +- genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER; +- genapic->int_dest_mode = INT_DEST_MODE_CLUSTER; +- genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER; +- +- genapic->init_apic_ldr = init_apic_ldr_cluster; +- +- genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster; +-} +- +-static int probe_es7000(void) +-{ +- /* probed later in mptable/ACPI hooks */ +- return 0; +-} +- +-extern void es7000_sw_apic(void); +-static void __init enable_apic_mode(void) +-{ +- es7000_sw_apic(); +- return; +-} +- +-static __init int +-mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) +-{ +- if (mpc->oemptr) { +- struct mpc_oemtable *oem_table = +- (struct mpc_oemtable *)mpc->oemptr; +- if (!strncmp(oem, "UNISYS", 6)) +- return parse_unisys_oem((char *)oem_table); +- } +- return 0; +-} +- +-#ifdef CONFIG_ACPI +-/* Hook from generic ACPI tables.c */ +-static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- unsigned long oem_addr = 0; +- int check_dsdt; +- int ret = 0; +- +- /* check dsdt at first to avoid clear fix_map for oem_addr */ +- check_dsdt = es7000_check_dsdt(); +- +- if (!find_unisys_acpi_oem_table(&oem_addr)) { +- if (check_dsdt) +- ret = parse_unisys_oem((char *)oem_addr); +- else { +- setup_unisys(); +- ret = 1; +- } +- /* +- * we need to unmap it +- */ +- unmap_unisys_acpi_oem_table(oem_addr); +- } +- return ret; +-} +-#else +-static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- return 0; +-} +-#endif +- +-static void vector_allocation_domain(int cpu, cpumask_t *retmask) +-{ +- /* Careful. Some cpus do not strictly honor the set of cpus +- * specified in the interrupt destination when using lowest +- * priority interrupt delivery mode. +- * +- * In particular there was a hyperthreading cpu observed to +- * deliver interrupts to the wrong hyperthread when only one +- * hyperthread was specified in the interrupt desitination. +- */ +- *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; +-} +- +-struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); +Index: linux-2.6-tip/arch/x86/mach-generic/numaq.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-generic/numaq.c ++++ /dev/null +@@ -1,53 +0,0 @@ +-/* +- * APIC driver for the IBM NUMAQ chipset. +- */ +-#define APIC_DEFINITION 1 +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-static int mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) +-{ +- numaq_mps_oem_check(mpc, oem, productid); +- return found_numaq; +-} +- +-static int probe_numaq(void) +-{ +- /* already know from get_memcfg_numaq() */ +- return found_numaq; +-} +- +-/* Hook from generic ACPI tables.c */ +-static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- return 0; +-} +- +-static void vector_allocation_domain(int cpu, cpumask_t *retmask) +-{ +- /* Careful. Some cpus do not strictly honor the set of cpus +- * specified in the interrupt destination when using lowest +- * priority interrupt delivery mode. 
+- * +- * In particular there was a hyperthreading cpu observed to +- * deliver interrupts to the wrong hyperthread when only one +- * hyperthread was specified in the interrupt desitination. +- */ +- *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; +-} +- +-struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); +Index: linux-2.6-tip/arch/x86/mach-generic/probe.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-generic/probe.c ++++ /dev/null +@@ -1,152 +0,0 @@ +-/* +- * Copyright 2003 Andi Kleen, SuSE Labs. +- * Subject to the GNU Public License, v.2 +- * +- * Generic x86 APIC driver probe layer. +- */ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-extern struct genapic apic_numaq; +-extern struct genapic apic_summit; +-extern struct genapic apic_bigsmp; +-extern struct genapic apic_es7000; +-extern struct genapic apic_default; +- +-struct genapic *genapic = &apic_default; +- +-static struct genapic *apic_probe[] __initdata = { +-#ifdef CONFIG_X86_NUMAQ +- &apic_numaq, +-#endif +-#ifdef CONFIG_X86_SUMMIT +- &apic_summit, +-#endif +-#ifdef CONFIG_X86_BIGSMP +- &apic_bigsmp, +-#endif +-#ifdef CONFIG_X86_ES7000 +- &apic_es7000, +-#endif +- &apic_default, /* must be last */ +- NULL, +-}; +- +-static int cmdline_apic __initdata; +-static int __init parse_apic(char *arg) +-{ +- int i; +- +- if (!arg) +- return -EINVAL; +- +- for (i = 0; apic_probe[i]; i++) { +- if (!strcmp(apic_probe[i]->name, arg)) { +- genapic = apic_probe[i]; +- cmdline_apic = 1; +- return 0; +- } +- } +- +- if (x86_quirks->update_genapic) +- x86_quirks->update_genapic(); +- +- /* Parsed again by __setup for debug/verbose */ +- return 0; +-} +-early_param("apic", parse_apic); +- +-void __init generic_bigsmp_probe(void) +-{ +-#ifdef CONFIG_X86_BIGSMP +- /* +- * This routine is used to switch to bigsmp mode when +- * - There is no apic= option specified by the user +- * - generic_apic_probe() has chosen apic_default as the sub_arch +- * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support +- */ +- +- if (!cmdline_apic && genapic == &apic_default) { +- if (apic_bigsmp.probe()) { +- genapic = &apic_bigsmp; +- if (x86_quirks->update_genapic) +- x86_quirks->update_genapic(); +- printk(KERN_INFO "Overriding APIC driver with %s\n", +- genapic->name); +- } +- } +-#endif +-} +- +-void __init generic_apic_probe(void) +-{ +- if (!cmdline_apic) { +- int i; +- for (i = 0; apic_probe[i]; i++) { +- if (apic_probe[i]->probe()) { +- genapic = apic_probe[i]; +- break; +- } +- } +- /* Not visible without early console */ +- if (!apic_probe[i]) +- panic("Didn't find an APIC driver"); +- +- if (x86_quirks->update_genapic) +- x86_quirks->update_genapic(); +- } +- printk(KERN_INFO "Using APIC driver %s\n", genapic->name); +-} +- +-/* These functions can switch the APIC even after the initial ->probe() */ +- +-int __init mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) +-{ +- int i; +- for (i = 0; apic_probe[i]; ++i) { +- if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) { +- if (!cmdline_apic) { +- genapic = apic_probe[i]; +- if (x86_quirks->update_genapic) +- x86_quirks->update_genapic(); +- printk(KERN_INFO "Switched to APIC driver `%s'.\n", +- genapic->name); +- } +- return 1; +- } +- } +- return 0; +-} +- +-int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +-{ +- int i; +- for (i = 0; apic_probe[i]; ++i) { +- if 
(apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { +- if (!cmdline_apic) { +- genapic = apic_probe[i]; +- if (x86_quirks->update_genapic) +- x86_quirks->update_genapic(); +- printk(KERN_INFO "Switched to APIC driver `%s'.\n", +- genapic->name); +- } +- return 1; +- } +- } +- return 0; +-} +- +-int hard_smp_processor_id(void) +-{ +- return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID)); +-} +Index: linux-2.6-tip/arch/x86/mach-generic/summit.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-generic/summit.c ++++ /dev/null +@@ -1,40 +0,0 @@ +-/* +- * APIC driver for the IBM "Summit" chipset. +- */ +-#define APIC_DEFINITION 1 +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-static int probe_summit(void) +-{ +- /* probed later in mptable/ACPI hooks */ +- return 0; +-} +- +-static void vector_allocation_domain(int cpu, cpumask_t *retmask) +-{ +- /* Careful. Some cpus do not strictly honor the set of cpus +- * specified in the interrupt destination when using lowest +- * priority interrupt delivery mode. +- * +- * In particular there was a hyperthreading cpu observed to +- * deliver interrupts to the wrong hyperthread when only one +- * hyperthread was specified in the interrupt desitination. +- */ +- *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; +-} +- +-struct genapic apic_summit = APIC_INIT("summit", probe_summit); +Index: linux-2.6-tip/arch/x86/mach-rdc321x/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-rdc321x/Makefile ++++ /dev/null +@@ -1,5 +0,0 @@ +-# +-# Makefile for the RDC321x specific parts of the kernel +-# +-obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o +- +Index: linux-2.6-tip/arch/x86/mach-rdc321x/gpio.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-rdc321x/gpio.c ++++ /dev/null +@@ -1,194 +0,0 @@ +-/* +- * GPIO support for RDC SoC R3210/R8610 +- * +- * Copyright (C) 2007, Florian Fainelli +- * Copyright (C) 2008, Volker Weiss +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License as published by +- * the Free Software Foundation; either version 2 of the License, or +- * (at your option) any later version. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program; if not, write to the Free Software +- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +- * +- */ +- +- +-#include +-#include +-#include +-#include +- +-#include +-#include +- +- +-/* spin lock to protect our private copy of GPIO data register plus +- the access to PCI conf registers. 
*/ +-static DEFINE_SPINLOCK(gpio_lock); +- +-/* copy of GPIO data registers */ +-static u32 gpio_data_reg1; +-static u32 gpio_data_reg2; +- +-static u32 gpio_request_data[2]; +- +- +-static inline void rdc321x_conf_write(unsigned addr, u32 value) +-{ +- outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); +- outl(value, RDC3210_CFGREG_DATA); +-} +- +-static inline void rdc321x_conf_or(unsigned addr, u32 value) +-{ +- outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); +- value |= inl(RDC3210_CFGREG_DATA); +- outl(value, RDC3210_CFGREG_DATA); +-} +- +-static inline u32 rdc321x_conf_read(unsigned addr) +-{ +- outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR); +- +- return inl(RDC3210_CFGREG_DATA); +-} +- +-/* configure pin as GPIO */ +-static void rdc321x_configure_gpio(unsigned gpio) +-{ +- unsigned long flags; +- +- spin_lock_irqsave(&gpio_lock, flags); +- rdc321x_conf_or(gpio < 32 +- ? RDC321X_GPIO_CTRL_REG1 : RDC321X_GPIO_CTRL_REG2, +- 1 << (gpio & 0x1f)); +- spin_unlock_irqrestore(&gpio_lock, flags); +-} +- +-/* initially setup the 2 copies of the gpio data registers. +- This function must be called by the platform setup code. */ +-void __init rdc321x_gpio_setup() +-{ +- /* this might not be, what others (BIOS, bootloader, etc.) +- wrote to these registers before, but it's a good guess. Still +- better than just using 0xffffffff. */ +- +- gpio_data_reg1 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG1); +- gpio_data_reg2 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG2); +-} +- +-/* determine, if gpio number is valid */ +-static inline int rdc321x_is_gpio(unsigned gpio) +-{ +- return gpio <= RDC321X_MAX_GPIO; +-} +- +-/* request GPIO */ +-int rdc_gpio_request(unsigned gpio, const char *label) +-{ +- unsigned long flags; +- +- if (!rdc321x_is_gpio(gpio)) +- return -EINVAL; +- +- spin_lock_irqsave(&gpio_lock, flags); +- if (gpio_request_data[(gpio & 0x20) ? 1 : 0] & (1 << (gpio & 0x1f))) +- goto inuse; +- gpio_request_data[(gpio & 0x20) ? 1 : 0] |= (1 << (gpio & 0x1f)); +- spin_unlock_irqrestore(&gpio_lock, flags); +- +- return 0; +-inuse: +- spin_unlock_irqrestore(&gpio_lock, flags); +- return -EINVAL; +-} +-EXPORT_SYMBOL(rdc_gpio_request); +- +-/* release previously-claimed GPIO */ +-void rdc_gpio_free(unsigned gpio) +-{ +- unsigned long flags; +- +- if (!rdc321x_is_gpio(gpio)) +- return; +- +- spin_lock_irqsave(&gpio_lock, flags); +- gpio_request_data[(gpio & 0x20) ? 1 : 0] &= ~(1 << (gpio & 0x1f)); +- spin_unlock_irqrestore(&gpio_lock, flags); +-} +-EXPORT_SYMBOL(rdc_gpio_free); +- +-/* read GPIO pin */ +-int rdc_gpio_get_value(unsigned gpio) +-{ +- u32 reg; +- unsigned long flags; +- +- spin_lock_irqsave(&gpio_lock, flags); +- reg = rdc321x_conf_read(gpio < 32 +- ? RDC321X_GPIO_DATA_REG1 : RDC321X_GPIO_DATA_REG2); +- spin_unlock_irqrestore(&gpio_lock, flags); +- +- return (1 << (gpio & 0x1f)) & reg ? 
1 : 0; +-} +-EXPORT_SYMBOL(rdc_gpio_get_value); +- +-/* set GPIO pin to value */ +-void rdc_gpio_set_value(unsigned gpio, int value) +-{ +- unsigned long flags; +- u32 reg; +- +- reg = 1 << (gpio & 0x1f); +- if (gpio < 32) { +- spin_lock_irqsave(&gpio_lock, flags); +- if (value) +- gpio_data_reg1 |= reg; +- else +- gpio_data_reg1 &= ~reg; +- rdc321x_conf_write(RDC321X_GPIO_DATA_REG1, gpio_data_reg1); +- spin_unlock_irqrestore(&gpio_lock, flags); +- } else { +- spin_lock_irqsave(&gpio_lock, flags); +- if (value) +- gpio_data_reg2 |= reg; +- else +- gpio_data_reg2 &= ~reg; +- rdc321x_conf_write(RDC321X_GPIO_DATA_REG2, gpio_data_reg2); +- spin_unlock_irqrestore(&gpio_lock, flags); +- } +-} +-EXPORT_SYMBOL(rdc_gpio_set_value); +- +-/* configure GPIO pin as input */ +-int rdc_gpio_direction_input(unsigned gpio) +-{ +- if (!rdc321x_is_gpio(gpio)) +- return -EINVAL; +- +- rdc321x_configure_gpio(gpio); +- +- return 0; +-} +-EXPORT_SYMBOL(rdc_gpio_direction_input); +- +-/* configure GPIO pin as output and set value */ +-int rdc_gpio_direction_output(unsigned gpio, int value) +-{ +- if (!rdc321x_is_gpio(gpio)) +- return -EINVAL; +- +- gpio_set_value(gpio, value); +- rdc321x_configure_gpio(gpio); +- +- return 0; +-} +-EXPORT_SYMBOL(rdc_gpio_direction_output); +Index: linux-2.6-tip/arch/x86/mach-rdc321x/platform.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-rdc321x/platform.c ++++ /dev/null +@@ -1,69 +0,0 @@ +-/* +- * Generic RDC321x platform devices +- * +- * Copyright (C) 2007 Florian Fainelli +- * +- * This program is free software; you can redistribute it and/or +- * modify it under the terms of the GNU General Public License +- * as published by the Free Software Foundation; either version 2 +- * of the License, or (at your option) any later version. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program; if not, write to the +- * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +- * Boston, MA 02110-1301, USA. +- * +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +- +-/* LEDS */ +-static struct gpio_led default_leds[] = { +- { .name = "rdc:dmz", .gpio = 1, }, +-}; +- +-static struct gpio_led_platform_data rdc321x_led_data = { +- .num_leds = ARRAY_SIZE(default_leds), +- .leds = default_leds, +-}; +- +-static struct platform_device rdc321x_leds = { +- .name = "leds-gpio", +- .id = -1, +- .dev = { +- .platform_data = &rdc321x_led_data, +- } +-}; +- +-/* Watchdog */ +-static struct platform_device rdc321x_wdt = { +- .name = "rdc321x-wdt", +- .id = -1, +- .num_resources = 0, +-}; +- +-static struct platform_device *rdc321x_devs[] = { +- &rdc321x_leds, +- &rdc321x_wdt +-}; +- +-static int __init rdc_board_setup(void) +-{ +- rdc321x_gpio_setup(); +- +- return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs)); +-} +- +-arch_initcall(rdc_board_setup); +Index: linux-2.6-tip/arch/x86/mach-voyager/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-voyager/Makefile ++++ /dev/null +@@ -1,8 +0,0 @@ +-# +-# Makefile for the linux kernel. 
+-# +- +-EXTRA_CFLAGS := -Iarch/x86/kernel +-obj-y := setup.o voyager_basic.o voyager_thread.o +- +-obj-$(CONFIG_SMP) += voyager_smp.o voyager_cat.o +Index: linux-2.6-tip/arch/x86/mach-voyager/setup.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-voyager/setup.c ++++ /dev/null +@@ -1,118 +0,0 @@ +-/* +- * Machine specific setup for generic +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-void __init pre_intr_init_hook(void) +-{ +- init_ISA_irqs(); +-} +- +-/* +- * IRQ2 is cascade interrupt to second interrupt controller +- */ +-static struct irqaction irq2 = { +- .handler = no_action, +- .mask = CPU_MASK_NONE, +- .name = "cascade", +-}; +- +-void __init intr_init_hook(void) +-{ +-#ifdef CONFIG_SMP +- voyager_smp_intr_init(); +-#endif +- +- setup_irq(2, &irq2); +-} +- +-static void voyager_disable_tsc(void) +-{ +- /* Voyagers run their CPUs from independent clocks, so disable +- * the TSC code because we can't sync them */ +- setup_clear_cpu_cap(X86_FEATURE_TSC); +-} +- +-void __init pre_setup_arch_hook(void) +-{ +- voyager_disable_tsc(); +-} +- +-void __init pre_time_init_hook(void) +-{ +- voyager_disable_tsc(); +-} +- +-void __init trap_init_hook(void) +-{ +-} +- +-static struct irqaction irq0 = { +- .handler = timer_interrupt, +- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, +- .mask = CPU_MASK_NONE, +- .name = "timer" +-}; +- +-void __init time_init_hook(void) +-{ +- irq0.mask = cpumask_of_cpu(safe_smp_processor_id()); +- setup_irq(0, &irq0); +-} +- +-/* Hook for machine specific memory setup. */ +- +-char *__init machine_specific_memory_setup(void) +-{ +- char *who; +- int new_nr; +- +- who = "NOT VOYAGER"; +- +- if (voyager_level == 5) { +- __u32 addr, length; +- int i; +- +- who = "Voyager-SUS"; +- +- e820.nr_map = 0; +- for (i = 0; voyager_memory_detect(i, &addr, &length); i++) { +- e820_add_region(addr, length, E820_RAM); +- } +- return who; +- } else if (voyager_level == 4) { +- __u32 tom; +- __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8; +- /* select the DINO config space */ +- outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT); +- /* Read DINO top of memory register */ +- tom = ((inb(catbase + 0x4) & 0xf0) << 16) +- + ((inb(catbase + 0x5) & 0x7f) << 24); +- +- if (inb(catbase) != VOYAGER_DINO) { +- printk(KERN_ERR +- "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n"); +- tom = (boot_params.screen_info.ext_mem_k) << 10; +- } +- who = "Voyager-TOM"; +- e820_add_region(0, 0x9f000, E820_RAM); +- /* map from 1M to top of memory */ +- e820_add_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024, +- E820_RAM); +- /* FIXME: Should check the ASICs to see if I need to +- * take out the 8M window. Just do it at the moment +- * */ +- e820_add_region(8 * 1024 * 1024, 8 * 1024 * 1024, +- E820_RESERVED); +- return who; +- } +- +- return default_machine_specific_memory_setup(); +-} +Index: linux-2.6-tip/arch/x86/mach-voyager/voyager_basic.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-voyager/voyager_basic.c ++++ /dev/null +@@ -1,317 +0,0 @@ +-/* Copyright (C) 1999,2001 +- * +- * Author: J.E.J.Bottomley@HansenPartnership.com +- * +- * This file contains all the voyager specific routines for getting +- * initialisation of the architecture to function. 
For additional +- * features see: +- * +- * voyager_cat.c - Voyager CAT bus interface +- * voyager_smp.c - Voyager SMP hal (emulates linux smp.c) +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * Power off function, if any +- */ +-void (*pm_power_off) (void); +-EXPORT_SYMBOL(pm_power_off); +- +-int voyager_level = 0; +- +-struct voyager_SUS *voyager_SUS = NULL; +- +-#ifdef CONFIG_SMP +-static void voyager_dump(int dummy1, struct tty_struct *dummy3) +-{ +- /* get here via a sysrq */ +- voyager_smp_dump(); +-} +- +-static struct sysrq_key_op sysrq_voyager_dump_op = { +- .handler = voyager_dump, +- .help_msg = "Voyager", +- .action_msg = "Dump Voyager Status", +-}; +-#endif +- +-void voyager_detect(struct voyager_bios_info *bios) +-{ +- if (bios->len != 0xff) { +- int class = (bios->class_1 << 8) +- | (bios->class_2 & 0xff); +- +- printk("Voyager System detected.\n" +- " Class %x, Revision %d.%d\n", +- class, bios->major, bios->minor); +- if (class == VOYAGER_LEVEL4) +- voyager_level = 4; +- else if (class < VOYAGER_LEVEL5_AND_ABOVE) +- voyager_level = 3; +- else +- voyager_level = 5; +- printk(" Architecture Level %d\n", voyager_level); +- if (voyager_level < 4) +- printk +- ("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n"); +- /* install the power off handler */ +- pm_power_off = voyager_power_off; +-#ifdef CONFIG_SMP +- register_sysrq_key('v', &sysrq_voyager_dump_op); +-#endif +- } else { +- printk("\n\n**WARNING**: No Voyager Subsystem Found\n"); +- } +-} +- +-void voyager_system_interrupt(int cpl, void *dev_id) +-{ +- printk("Voyager: detected system interrupt\n"); +-} +- +-/* Routine to read information from the extended CMOS area */ +-__u8 voyager_extended_cmos_read(__u16 addr) +-{ +- outb(addr & 0xff, 0x74); +- outb((addr >> 8) & 0xff, 0x75); +- return inb(0x76); +-} +- +-/* internal definitions for the SUS Click Map of memory */ +- +-#define CLICK_ENTRIES 16 +-#define CLICK_SIZE 4096 /* click to byte conversion for Length */ +- +-typedef struct ClickMap { +- struct Entry { +- __u32 Address; +- __u32 Length; +- } Entry[CLICK_ENTRIES]; +-} ClickMap_t; +- +-/* This routine is pretty much an awful hack to read the bios clickmap by +- * mapping it into page 0. There are usually three regions in the map: +- * Base Memory +- * Extended Memory +- * zero length marker for end of map +- * +- * Returns are 0 for failure and 1 for success on extracting region. 
+- */ +-int __init voyager_memory_detect(int region, __u32 * start, __u32 * length) +-{ +- int i; +- int retval = 0; +- __u8 cmos[4]; +- ClickMap_t *map; +- unsigned long map_addr; +- unsigned long old; +- +- if (region >= CLICK_ENTRIES) { +- printk("Voyager: Illegal ClickMap region %d\n", region); +- return 0; +- } +- +- for (i = 0; i < sizeof(cmos); i++) +- cmos[i] = +- voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i); +- +- map_addr = *(unsigned long *)cmos; +- +- /* steal page 0 for this */ +- old = pg0[0]; +- pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); +- local_flush_tlb(); +- /* now clear everything out but page 0 */ +- map = (ClickMap_t *) (map_addr & (~PAGE_MASK)); +- +- /* zero length is the end of the clickmap */ +- if (map->Entry[region].Length != 0) { +- *length = map->Entry[region].Length * CLICK_SIZE; +- *start = map->Entry[region].Address; +- retval = 1; +- } +- +- /* replace the mapping */ +- pg0[0] = old; +- local_flush_tlb(); +- return retval; +-} +- +-/* voyager specific handling code for timer interrupts. Used to hand +- * off the timer tick to the SMP code, since the VIC doesn't have an +- * internal timer (The QIC does, but that's another story). */ +-void voyager_timer_interrupt(void) +-{ +- if ((jiffies & 0x3ff) == 0) { +- +- /* There seems to be something flaky in either +- * hardware or software that is resetting the timer 0 +- * count to something much higher than it should be +- * This seems to occur in the boot sequence, just +- * before root is mounted. Therefore, every 10 +- * seconds or so, we sanity check the timer zero count +- * and kick it back to where it should be. +- * +- * FIXME: This is the most awful hack yet seen. I +- * should work out exactly what is interfering with +- * the timer count settings early in the boot sequence +- * and swiftly introduce it to something sharp and +- * pointy. */ +- __u16 val; +- +- spin_lock(&i8253_lock); +- +- outb_p(0x00, 0x43); +- val = inb_p(0x40); +- val |= inb(0x40) << 8; +- spin_unlock(&i8253_lock); +- +- if (val > LATCH) { +- printk +- ("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", +- val); +- spin_lock(&i8253_lock); +- outb(0x34, 0x43); +- outb_p(LATCH & 0xff, 0x40); /* LSB */ +- outb(LATCH >> 8, 0x40); /* MSB */ +- spin_unlock(&i8253_lock); +- } +- } +-#ifdef CONFIG_SMP +- smp_vic_timer_interrupt(); +-#endif +-} +- +-void voyager_power_off(void) +-{ +- printk("VOYAGER Power Off\n"); +- +- if (voyager_level == 5) { +- voyager_cat_power_off(); +- } else if (voyager_level == 4) { +- /* This doesn't apparently work on most L4 machines, +- * but the specs say to do this to get automatic power +- * off. 
Unfortunately, if it doesn't power off the +- * machine, it ends up doing a cold restart, which +- * isn't really intended, so comment out the code */ +-#if 0 +- int port; +- +- /* enable the voyager Configuration Space */ +- outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, VOYAGER_MC_SETUP); +- /* the port for the power off flag is an offset from the +- floating base */ +- port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21; +- /* set the power off flag */ +- outb(inb(port) | 0x1, port); +-#endif +- } +- /* and wait for it to happen */ +- local_irq_disable(); +- for (;;) +- halt(); +-} +- +-/* copied from process.c */ +-static inline void kb_wait(void) +-{ +- int i; +- +- for (i = 0; i < 0x10000; i++) +- if ((inb_p(0x64) & 0x02) == 0) +- break; +-} +- +-void machine_shutdown(void) +-{ +- /* Architecture specific shutdown needed before a kexec */ +-} +- +-void machine_restart(char *cmd) +-{ +- printk("Voyager Warm Restart\n"); +- kb_wait(); +- +- if (voyager_level == 5) { +- /* write magic values to the RTC to inform system that +- * shutdown is beginning */ +- outb(0x8f, 0x70); +- outb(0x5, 0x71); +- +- udelay(50); +- outb(0xfe, 0x64); /* pull reset low */ +- } else if (voyager_level == 4) { +- __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8; +- __u8 basebd = inb(VOYAGER_MC_SETUP); +- +- outb(basebd | 0x08, VOYAGER_MC_SETUP); +- outb(0x02, catbase + 0x21); +- } +- local_irq_disable(); +- for (;;) +- halt(); +-} +- +-void machine_emergency_restart(void) +-{ +- /*for now, just hook this to a warm restart */ +- machine_restart(NULL); +-} +- +-void mca_nmi_hook(void) +-{ +- __u8 dumpval __maybe_unused = inb(0xf823); +- __u8 swnmi __maybe_unused = inb(0xf813); +- +- /* FIXME: assume dump switch pressed */ +- /* check to see if the dump switch was pressed */ +- VDEBUG(("VOYAGER: dumpval = 0x%x, swnmi = 0x%x\n", dumpval, swnmi)); +- /* clear swnmi */ +- outb(0xff, 0xf813); +- /* tell SUS to ignore dump */ +- if (voyager_level == 5 && voyager_SUS != NULL) { +- if (voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) { +- voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND; +- voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS; +- udelay(1000); +- voyager_SUS->kernel_mbox = VOYAGER_IGNORE_DUMP; +- voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS; +- } +- } +- printk(KERN_ERR +- "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", +- smp_processor_id()); +- show_stack(NULL, NULL); +- show_state(); +-} +- +-void machine_halt(void) +-{ +- /* treat a halt like a power off */ +- machine_power_off(); +-} +- +-void machine_power_off(void) +-{ +- if (pm_power_off) +- pm_power_off(); +-} +Index: linux-2.6-tip/arch/x86/mach-voyager/voyager_cat.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-voyager/voyager_cat.c ++++ /dev/null +@@ -1,1197 +0,0 @@ +-/* -*- mode: c; c-basic-offset: 8 -*- */ +- +-/* Copyright (C) 1999,2001 +- * +- * Author: J.E.J.Bottomley@HansenPartnership.com +- * +- * This file contains all the logic for manipulating the CAT bus +- * in a level 5 machine. +- * +- * The CAT bus is a serial configuration and test bus. Its primary +- * uses are to probe the initial configuration of the system and to +- * diagnose error conditions when a system interrupt occurs. 
The low +- * level interface is fairly primitive, so most of this file consists +- * of bit shift manipulations to send and receive packets on the +- * serial bus */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef VOYAGER_CAT_DEBUG +-#define CDEBUG(x) printk x +-#else +-#define CDEBUG(x) +-#endif +- +-/* the CAT command port */ +-#define CAT_CMD (sspb + 0xe) +-/* the CAT data port */ +-#define CAT_DATA (sspb + 0xd) +- +-/* the internal cat functions */ +-static void cat_pack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits); +-static void cat_unpack(__u8 * msg, __u16 start_bit, __u8 * data, +- __u16 num_bits); +-static void cat_build_header(__u8 * header, const __u16 len, +- const __u16 smallest_reg_bits, +- const __u16 longest_reg_bits); +-static int cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, +- __u8 reg, __u8 op); +-static int cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, +- __u8 reg, __u8 * value); +-static int cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, +- __u8 pad_bits); +-static int cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, +- __u8 value); +-static int cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, +- __u8 * value); +-static int cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, +- __u16 offset, __u16 len, void *buf); +-static int cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp, +- __u8 reg, __u8 value); +-static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp); +-static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp); +- +-static inline const char *cat_module_name(int module_id) +-{ +- switch (module_id) { +- case 0x10: +- return "Processor Slot 0"; +- case 0x11: +- return "Processor Slot 1"; +- case 0x12: +- return "Processor Slot 2"; +- case 0x13: +- return "Processor Slot 4"; +- case 0x14: +- return "Memory Slot 0"; +- case 0x15: +- return "Memory Slot 1"; +- case 0x18: +- return "Primary Microchannel"; +- case 0x19: +- return "Secondary Microchannel"; +- case 0x1a: +- return "Power Supply Interface"; +- case 0x1c: +- return "Processor Slot 5"; +- case 0x1d: +- return "Processor Slot 6"; +- case 0x1e: +- return "Processor Slot 7"; +- case 0x1f: +- return "Processor Slot 8"; +- default: +- return "Unknown Module"; +- } +-} +- +-static int sspb = 0; /* stores the super port location */ +-int voyager_8slot = 0; /* set to true if a 51xx monster */ +- +-voyager_module_t *voyager_cat_list; +- +-/* the I/O port assignments for the VIC and QIC */ +-static struct resource vic_res = { +- .name = "Voyager Interrupt Controller", +- .start = 0xFC00, +- .end = 0xFC6F +-}; +-static struct resource qic_res = { +- .name = "Quad Interrupt Controller", +- .start = 0xFC70, +- .end = 0xFCFF +-}; +- +-/* This function is used to pack a data bit stream inside a message. +- * It writes num_bits of the data buffer in msg starting at start_bit. 
+- * Note: This function assumes that any unused bit in the data stream +- * is set to zero so that the ors will work correctly */ +-static void +-cat_pack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits) +-{ +- /* compute initial shift needed */ +- const __u16 offset = start_bit % BITS_PER_BYTE; +- __u16 len = num_bits / BITS_PER_BYTE; +- __u16 byte = start_bit / BITS_PER_BYTE; +- __u16 residue = (num_bits % BITS_PER_BYTE) + offset; +- int i; +- +- /* adjust if we have more than a byte of residue */ +- if (residue >= BITS_PER_BYTE) { +- residue -= BITS_PER_BYTE; +- len++; +- } +- +- /* clear out the bits. We assume here that if len==0 then +- * residue >= offset. This is always true for the catbus +- * operations */ +- msg[byte] &= 0xff << (BITS_PER_BYTE - offset); +- msg[byte++] |= data[0] >> offset; +- if (len == 0) +- return; +- for (i = 1; i < len; i++) +- msg[byte++] = (data[i - 1] << (BITS_PER_BYTE - offset)) +- | (data[i] >> offset); +- if (residue != 0) { +- __u8 mask = 0xff >> residue; +- __u8 last_byte = data[i - 1] << (BITS_PER_BYTE - offset) +- | (data[i] >> offset); +- +- last_byte &= ~mask; +- msg[byte] &= mask; +- msg[byte] |= last_byte; +- } +- return; +-} +- +-/* unpack the data again (same arguments as cat_pack()). data buffer +- * must be zero populated. +- * +- * Function: given a message string move to start_bit and copy num_bits into +- * data (starting at bit 0 in data). +- */ +-static void +-cat_unpack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits) +-{ +- /* compute initial shift needed */ +- const __u16 offset = start_bit % BITS_PER_BYTE; +- __u16 len = num_bits / BITS_PER_BYTE; +- const __u8 last_bits = num_bits % BITS_PER_BYTE; +- __u16 byte = start_bit / BITS_PER_BYTE; +- int i; +- +- if (last_bits != 0) +- len++; +- +- /* special case: want < 8 bits from msg and we can get it from +- * a single byte of the msg */ +- if (len == 0 && BITS_PER_BYTE - offset >= num_bits) { +- data[0] = msg[byte] << offset; +- data[0] &= 0xff >> (BITS_PER_BYTE - num_bits); +- return; +- } +- for (i = 0; i < len; i++) { +- /* this annoying if has to be done just in case a read of +- * msg one beyond the array causes a panic */ +- if (offset != 0) { +- data[i] = msg[byte++] << offset; +- data[i] |= msg[byte] >> (BITS_PER_BYTE - offset); +- } else { +- data[i] = msg[byte++]; +- } +- } +- /* do we need to truncate the final byte */ +- if (last_bits != 0) { +- data[i - 1] &= 0xff << (BITS_PER_BYTE - last_bits); +- } +- return; +-} +- +-static void +-cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits, +- const __u16 longest_reg_bits) +-{ +- int i; +- __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE; +- __u8 *last_byte = &header[len - 1]; +- +- if (start_bit == 0) +- start_bit = 1; /* must have at least one bit in the hdr */ +- +- for (i = 0; i < len; i++) +- header[i] = 0; +- +- for (i = start_bit; i > 0; i--) +- *last_byte = ((*last_byte) << 1) + 1; +- +-} +- +-static int +-cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op) +-{ +- __u8 parity, inst, inst_buf[4] = { 0 }; +- __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE]; +- __u16 ibytes, hbytes, padbits; +- int i; +- +- /* +- * Parity is the parity of the register number + 1 (READ_REGISTER +- * and WRITE_REGISTER always add '1' to the number of bits == 1) +- */ +- parity = (__u8) (1 + (reg & 0x01) + +- ((__u8) (reg & 0x02) >> 1) + +- ((__u8) (reg & 0x04) >> 2) + +- ((__u8) (reg & 0x08) >> 3)) % 2; +- +- inst = 
((parity << 7) | (reg << 2) | op); +- +- outb(VOYAGER_CAT_IRCYC, CAT_CMD); +- if (!modp->scan_path_connected) { +- if (asicp->asic_id != VOYAGER_CAT_ID) { +- printk +- ("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n"); +- return 1; +- } +- outb(VOYAGER_CAT_HEADER, CAT_DATA); +- outb(inst, CAT_DATA); +- if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { +- CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n")); +- return 1; +- } +- return 0; +- } +- ibytes = modp->inst_bits / BITS_PER_BYTE; +- if ((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) { +- padbits = BITS_PER_BYTE - padbits; +- ibytes++; +- } +- hbytes = modp->largest_reg / BITS_PER_BYTE; +- if (modp->largest_reg % BITS_PER_BYTE) +- hbytes++; +- CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes)); +- /* initialise the instruction sequence to 0xff */ +- for (i = 0; i < ibytes + hbytes; i++) +- iseq[i] = 0xff; +- cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg); +- cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE); +- inst_buf[0] = inst; +- inst_buf[1] = 0xFF >> (modp->largest_reg % BITS_PER_BYTE); +- cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length); +-#ifdef VOYAGER_CAT_DEBUG +- printk("ins = 0x%x, iseq: ", inst); +- for (i = 0; i < ibytes + hbytes; i++) +- printk("0x%x ", iseq[i]); +- printk("\n"); +-#endif +- if (cat_shiftout(iseq, ibytes, hbytes, padbits)) { +- CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n")); +- return 1; +- } +- CDEBUG(("CAT SHIFTOUT DONE\n")); +- return 0; +-} +- +-static int +-cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, +- __u8 * value) +-{ +- if (!modp->scan_path_connected) { +- if (asicp->asic_id != VOYAGER_CAT_ID) { +- CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n")); +- return 1; +- } +- if (reg > VOYAGER_SUBADDRHI) +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- outb(VOYAGER_CAT_DRCYC, CAT_CMD); +- outb(VOYAGER_CAT_HEADER, CAT_DATA); +- *value = inb(CAT_DATA); +- outb(0xAA, CAT_DATA); +- if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { +- CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n")); +- return 1; +- } +- return 0; +- } else { +- __u16 sbits = modp->num_asics - 1 + asicp->ireg_length; +- __u16 sbytes = sbits / BITS_PER_BYTE; +- __u16 tbytes; +- __u8 string[VOYAGER_MAX_SCAN_PATH], +- trailer[VOYAGER_MAX_REG_SIZE]; +- __u8 padbits; +- int i; +- +- outb(VOYAGER_CAT_DRCYC, CAT_CMD); +- +- if ((padbits = sbits % BITS_PER_BYTE) != 0) { +- padbits = BITS_PER_BYTE - padbits; +- sbytes++; +- } +- tbytes = asicp->ireg_length / BITS_PER_BYTE; +- if (asicp->ireg_length % BITS_PER_BYTE) +- tbytes++; +- CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n", +- tbytes, sbytes, padbits)); +- cat_build_header(trailer, tbytes, 1, asicp->ireg_length); +- +- for (i = tbytes - 1; i >= 0; i--) { +- outb(trailer[i], CAT_DATA); +- string[sbytes + i] = inb(CAT_DATA); +- } +- +- for (i = sbytes - 1; i >= 0; i--) { +- outb(0xaa, CAT_DATA); +- string[i] = inb(CAT_DATA); +- } +- *value = 0; +- cat_unpack(string, +- padbits + (tbytes * BITS_PER_BYTE) + +- asicp->asic_location, value, asicp->ireg_length); +-#ifdef VOYAGER_CAT_DEBUG +- printk("value=0x%x, string: ", *value); +- for (i = 0; i < tbytes + sbytes; i++) +- printk("0x%x ", string[i]); +- printk("\n"); +-#endif +- +- /* sanity check the rest of the return */ +- for (i = 0; i < tbytes; i++) { +- __u8 input = 0; +- +- cat_unpack(string, padbits + (i * BITS_PER_BYTE), +- &input, BITS_PER_BYTE); +- if 
(trailer[i] != input) { +- CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i])); +- return 1; +- } +- } +- CDEBUG(("cat_getdata DONE\n")); +- return 0; +- } +-} +- +-static int +-cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) +-{ +- int i; +- +- for (i = data_bytes + header_bytes - 1; i >= header_bytes; i--) +- outb(data[i], CAT_DATA); +- +- for (i = header_bytes - 1; i >= 0; i--) { +- __u8 header = 0; +- __u8 input; +- +- outb(data[i], CAT_DATA); +- input = inb(CAT_DATA); +- CDEBUG(("cat_shiftout: returned 0x%x\n", input)); +- cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits, +- &header, BITS_PER_BYTE); +- if (input != header) { +- CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header)); +- return 1; +- } +- } +- return 0; +-} +- +-static int +-cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp, +- __u8 reg, __u8 value) +-{ +- outb(VOYAGER_CAT_DRCYC, CAT_CMD); +- if (!modp->scan_path_connected) { +- if (asicp->asic_id != VOYAGER_CAT_ID) { +- CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n")); +- return 1; +- } +- outb(VOYAGER_CAT_HEADER, CAT_DATA); +- outb(value, CAT_DATA); +- if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { +- CDEBUG(("cat_senddata: failed to get correct header response to sent data\n")); +- return 1; +- } +- if (reg > VOYAGER_SUBADDRHI) { +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- outb(VOYAGER_CAT_END, CAT_CMD); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- } +- +- return 0; +- } else { +- __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE; +- __u16 dbytes = +- (modp->num_asics - 1 + asicp->ireg_length) / BITS_PER_BYTE; +- __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], +- hseq[VOYAGER_MAX_REG_SIZE]; +- int i; +- +- if ((padbits = (modp->num_asics - 1 +- + asicp->ireg_length) % BITS_PER_BYTE) != 0) { +- padbits = BITS_PER_BYTE - padbits; +- dbytes++; +- } +- if (asicp->ireg_length % BITS_PER_BYTE) +- hbytes++; +- +- cat_build_header(hseq, hbytes, 1, asicp->ireg_length); +- +- for (i = 0; i < dbytes + hbytes; i++) +- dseq[i] = 0xff; +- CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n", +- dbytes, hbytes, padbits)); +- cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length, +- hseq, hbytes * BITS_PER_BYTE); +- cat_pack(dseq, asicp->asic_location, &value, +- asicp->ireg_length); +-#ifdef VOYAGER_CAT_DEBUG +- printk("dseq "); +- for (i = 0; i < hbytes + dbytes; i++) { +- printk("0x%x ", dseq[i]); +- } +- printk("\n"); +-#endif +- return cat_shiftout(dseq, dbytes, hbytes, padbits); +- } +-} +- +-static int +-cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value) +-{ +- if (cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG)) +- return 1; +- return cat_senddata(modp, asicp, reg, value); +-} +- +-static int +-cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, +- __u8 * value) +-{ +- if (cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG)) +- return 1; +- return cat_getdata(modp, asicp, reg, value); +-} +- +-static int +-cat_subaddrsetup(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, +- __u16 len) +-{ +- __u8 val; +- +- if (len > 1) { +- /* set auto increment */ +- __u8 newval; +- +- if (cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) { +- CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n")); +- return 1; +- } +- CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", +- val)); +- newval = val | VOYAGER_AUTO_INC; +- if (newval != val) { +- if 
(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) { +- CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n")); +- return 1; +- } +- } +- } +- if (cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8) (offset & 0xff))) { +- CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n")); +- return 1; +- } +- if (asicp->subaddr > VOYAGER_SUBADDR_LO) { +- if (cat_write +- (modp, asicp, VOYAGER_SUBADDRHI, (__u8) (offset >> 8))) { +- CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n")); +- return 1; +- } +- cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val); +- CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, +- val)); +- } +- cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val); +- CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val)); +- return 0; +-} +- +-static int +-cat_subwrite(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, +- __u16 len, void *buf) +-{ +- int i, retval; +- +- /* FIXME: need special actions for VOYAGER_CAT_ID here */ +- if (asicp->asic_id == VOYAGER_CAT_ID) { +- CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n")); +- /* FIXME -- This is supposed to be handled better +- * There is a problem writing to the cat asic in the +- * PSI. The 30us delay seems to work, though */ +- udelay(30); +- } +- +- if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { +- printk("cat_subwrite: cat_subaddrsetup FAILED\n"); +- return retval; +- } +- +- if (cat_sendinst +- (modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) { +- printk("cat_subwrite: cat_sendinst FAILED\n"); +- return 1; +- } +- for (i = 0; i < len; i++) { +- if (cat_senddata(modp, asicp, 0xFF, ((__u8 *) buf)[i])) { +- printk +- ("cat_subwrite: cat_sendata element at %d FAILED\n", +- i); +- return 1; +- } +- } +- return 0; +-} +-static int +-cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, +- __u16 len, void *buf) +-{ +- int i, retval; +- +- if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { +- CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n")); +- return retval; +- } +- +- if (cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) { +- CDEBUG(("cat_subread: cat_sendinst failed\n")); +- return 1; +- } +- for (i = 0; i < len; i++) { +- if (cat_getdata(modp, asicp, 0xFF, &((__u8 *) buf)[i])) { +- CDEBUG(("cat_subread: cat_getdata element %d failed\n", +- i)); +- return 1; +- } +- } +- return 0; +-} +- +-/* buffer for storing EPROM data read in during initialisation */ +-static __initdata __u8 eprom_buf[0xFFFF]; +-static voyager_module_t *voyager_initial_module; +- +-/* Initialise the cat bus components. We assume this is called by the +- * boot cpu *after* all memory initialisation has been done (so we can +- * use kmalloc) but before smp initialisation, so we can probe the SMP +- * configuration and pick up necessary information. 
*/ +-void __init voyager_cat_init(void) +-{ +- voyager_module_t **modpp = &voyager_initial_module; +- voyager_asic_t **asicpp; +- voyager_asic_t *qabc_asic = NULL; +- int i, j; +- unsigned long qic_addr = 0; +- __u8 qabc_data[0x20]; +- __u8 num_submodules, val; +- voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *) & eprom_buf[0]; +- +- __u8 cmos[4]; +- unsigned long addr; +- +- /* initiallise the SUS mailbox */ +- for (i = 0; i < sizeof(cmos); i++) +- cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i); +- addr = *(unsigned long *)cmos; +- if ((addr & 0xff000000) != 0xff000000) { +- printk(KERN_ERR +- "Voyager failed to get SUS mailbox (addr = 0x%lx\n", +- addr); +- } else { +- static struct resource res; +- +- res.name = "voyager SUS"; +- res.start = addr; +- res.end = addr + 0x3ff; +- +- request_resource(&iomem_resource, &res); +- voyager_SUS = (struct voyager_SUS *) +- ioremap(addr, 0x400); +- printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n", +- voyager_SUS->SUS_version); +- voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION; +- voyager_SUS->kernel_flags = VOYAGER_OS_HAS_SYSINT; +- } +- +- /* clear the processor counts */ +- voyager_extended_vic_processors = 0; +- voyager_quad_processors = 0; +- +- printk("VOYAGER: beginning CAT bus probe\n"); +- /* set up the SuperSet Port Block which tells us where the +- * CAT communication port is */ +- sspb = inb(VOYAGER_SSPB_RELOCATION_PORT) * 0x100; +- VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb)); +- +- /* now find out if were 8 slot or normal */ +- if ((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER) +- == EIGHT_SLOT_IDENTIFIER) { +- voyager_8slot = 1; +- printk(KERN_NOTICE +- "Voyager: Eight slot 51xx configuration detected\n"); +- } +- +- for (i = VOYAGER_MIN_MODULE; i <= VOYAGER_MAX_MODULE; i++) { +- __u8 input; +- int asic; +- __u16 eprom_size; +- __u16 sp_offset; +- +- outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT); +- outb(i, VOYAGER_CAT_CONFIG_PORT); +- +- /* check the presence of the module */ +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- outb(VOYAGER_CAT_IRCYC, CAT_CMD); +- outb(VOYAGER_CAT_HEADER, CAT_DATA); +- /* stream series of alternating 1's and 0's to stimulate +- * response */ +- outb(0xAA, CAT_DATA); +- input = inb(CAT_DATA); +- outb(VOYAGER_CAT_END, CAT_CMD); +- if (input != VOYAGER_CAT_HEADER) { +- continue; +- } +- CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i, +- cat_module_name(i))); +- *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++]; */ +- if (*modpp == NULL) { +- printk("**WARNING** kmalloc failure in cat_init\n"); +- continue; +- } +- memset(*modpp, 0, sizeof(voyager_module_t)); +- /* need temporary asic for cat_subread. 
It will be +- * filled in correctly later */ +- (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count]; */ +- if ((*modpp)->asic == NULL) { +- printk("**WARNING** kmalloc failure in cat_init\n"); +- continue; +- } +- memset((*modpp)->asic, 0, sizeof(voyager_asic_t)); +- (*modpp)->asic->asic_id = VOYAGER_CAT_ID; +- (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI; +- (*modpp)->module_addr = i; +- (*modpp)->scan_path_connected = 0; +- if (i == VOYAGER_PSI) { +- /* Exception leg for modules with no EEPROM */ +- printk("Module \"%s\"\n", cat_module_name(i)); +- continue; +- } +- +- CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_disconnect(*modpp, (*modpp)->asic); +- if (cat_subread(*modpp, (*modpp)->asic, +- VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), +- &eprom_size)) { +- printk +- ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", +- i); +- outb(VOYAGER_CAT_END, CAT_CMD); +- continue; +- } +- if (eprom_size > sizeof(eprom_buf)) { +- printk +- ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", +- i, eprom_size); +- outb(VOYAGER_CAT_END, CAT_CMD); +- continue; +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, +- eprom_size)); +- if (cat_subread +- (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) { +- outb(VOYAGER_CAT_END, CAT_CMD); +- continue; +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +- printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n", +- cat_module_name(i), eprom_hdr->version_id, +- *((__u32 *) eprom_hdr->tracer), eprom_hdr->num_asics); +- (*modpp)->ee_size = eprom_hdr->ee_size; +- (*modpp)->num_asics = eprom_hdr->num_asics; +- asicpp = &((*modpp)->asic); +- sp_offset = eprom_hdr->scan_path_offset; +- /* All we really care about are the Quad cards. 
We +- * identify them because they are in a processor slot +- * and have only four asics */ +- if ((i < 0x10 || (i >= 0x14 && i < 0x1c) || i > 0x1f)) { +- modpp = &((*modpp)->next); +- continue; +- } +- /* Now we know it's in a processor slot, does it have +- * a quad baseboard submodule */ +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODPRESENT, +- &num_submodules); +- /* lowest two bits, active low */ +- num_submodules = ~(0xfc | num_submodules); +- CDEBUG(("VOYAGER CAT: %d submodules present\n", +- num_submodules)); +- if (num_submodules == 0) { +- /* fill in the dyadic extended processors */ +- __u8 cpu = i & 0x07; +- +- printk("Module \"%s\": Dyadic Processor Card\n", +- cat_module_name(i)); +- voyager_extended_vic_processors |= (1 << cpu); +- cpu += 4; +- voyager_extended_vic_processors |= (1 << cpu); +- outb(VOYAGER_CAT_END, CAT_CMD); +- continue; +- } +- +- /* now we want to read the asics on the first submodule, +- * which should be the quad base board */ +- +- cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, &val); +- CDEBUG(("cat_init: SUBMODSELECT value = 0x%x\n", val)); +- val = (val & 0x7c) | VOYAGER_QUAD_BASEBOARD; +- cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val); +- +- outb(VOYAGER_CAT_END, CAT_CMD); +- +- CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_disconnect(*modpp, (*modpp)->asic); +- if (cat_subread(*modpp, (*modpp)->asic, +- VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), +- &eprom_size)) { +- printk +- ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", +- i); +- outb(VOYAGER_CAT_END, CAT_CMD); +- continue; +- } +- if (eprom_size > sizeof(eprom_buf)) { +- printk +- ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. 
Need %d\n", +- i, eprom_size); +- outb(VOYAGER_CAT_END, CAT_CMD); +- continue; +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, +- eprom_size)); +- if (cat_subread +- (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) { +- outb(VOYAGER_CAT_END, CAT_CMD); +- continue; +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +- /* Now do everything for the QBB submodule 1 */ +- (*modpp)->ee_size = eprom_hdr->ee_size; +- (*modpp)->num_asics = eprom_hdr->num_asics; +- asicpp = &((*modpp)->asic); +- sp_offset = eprom_hdr->scan_path_offset; +- /* get rid of the dummy CAT asic and read the real one */ +- kfree((*modpp)->asic); +- for (asic = 0; asic < (*modpp)->num_asics; asic++) { +- int j; +- voyager_asic_t *asicp = *asicpp = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++]; */ +- voyager_sp_table_t *sp_table; +- voyager_at_t *asic_table; +- voyager_jtt_t *jtag_table; +- +- if (asicp == NULL) { +- printk +- ("**WARNING** kmalloc failure in cat_init\n"); +- continue; +- } +- asicpp = &(asicp->next); +- asicp->asic_location = asic; +- sp_table = +- (voyager_sp_table_t *) (eprom_buf + sp_offset); +- asicp->asic_id = sp_table->asic_id; +- asic_table = +- (voyager_at_t *) (eprom_buf + +- sp_table->asic_data_offset); +- for (j = 0; j < 4; j++) +- asicp->jtag_id[j] = asic_table->jtag_id[j]; +- jtag_table = +- (voyager_jtt_t *) (eprom_buf + +- asic_table->jtag_offset); +- asicp->ireg_length = jtag_table->ireg_len; +- asicp->bit_location = (*modpp)->inst_bits; +- (*modpp)->inst_bits += asicp->ireg_length; +- if (asicp->ireg_length > (*modpp)->largest_reg) +- (*modpp)->largest_reg = asicp->ireg_length; +- if (asicp->ireg_length < (*modpp)->smallest_reg || +- (*modpp)->smallest_reg == 0) +- (*modpp)->smallest_reg = asicp->ireg_length; +- CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n", +- asicp->asic_id, asicp->ireg_length, +- asicp->bit_location)); +- if (asicp->asic_id == VOYAGER_QUAD_QABC) { +- CDEBUG(("VOYAGER CAT: QABC ASIC found\n")); +- qabc_asic = asicp; +- } +- sp_offset += sizeof(voyager_sp_table_t); +- } +- CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", (*modpp)->inst_bits, (*modpp)->largest_reg, (*modpp)->smallest_reg)); +- /* OK, now we have the QUAD ASICs set up, use them. +- * we need to: +- * +- * 1. Find the Memory area for the Quad CPIs. +- * 2. Find the Extended VIC processor +- * 3. Configure a second extended VIC processor (This +- * cannot be done for the 51xx. 
+- * */ +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_connect(*modpp, (*modpp)->asic); +- CDEBUG(("CAT CONNECTED!!\n")); +- cat_subread(*modpp, qabc_asic, 0, sizeof(qabc_data), qabc_data); +- qic_addr = qabc_data[5] << 8; +- qic_addr = (qic_addr | qabc_data[6]) << 8; +- qic_addr = (qic_addr | qabc_data[7]) << 8; +- printk +- ("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n", +- cat_module_name(i), qic_addr, qabc_data[8]); +-#if 0 /* plumbing fails---FIXME */ +- if ((qabc_data[8] & 0xf0) == 0) { +- /* FIXME: 32 way 8 CPU slot monster cannot be +- * plumbed this way---need to check for it */ +- +- printk("Plumbing second Extended Quad Processor\n"); +- /* second VIC line hardwired to Quad CPU 1 */ +- qabc_data[8] |= 0x20; +- cat_subwrite(*modpp, qabc_asic, 8, 1, &qabc_data[8]); +-#ifdef VOYAGER_CAT_DEBUG +- /* verify plumbing */ +- cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]); +- if ((qabc_data[8] & 0xf0) == 0) { +- CDEBUG(("PLUMBING FAILED: 0x%x\n", +- qabc_data[8])); +- } +-#endif +- } +-#endif +- +- { +- struct resource *res = +- kzalloc(sizeof(struct resource), GFP_KERNEL); +- res->name = kmalloc(128, GFP_KERNEL); +- sprintf((char *)res->name, "Voyager %s Quad CPI", +- cat_module_name(i)); +- res->start = qic_addr; +- res->end = qic_addr + 0x3ff; +- request_resource(&iomem_resource, res); +- } +- +- qic_addr = (unsigned long)ioremap_cache(qic_addr, 0x400); +- +- for (j = 0; j < 4; j++) { +- __u8 cpu; +- +- if (voyager_8slot) { +- /* 8 slot has a different mapping, +- * each slot has only one vic line, so +- * 1 cpu in each slot must be < 8 */ +- cpu = (i & 0x07) + j * 8; +- } else { +- cpu = (i & 0x03) + j * 4; +- } +- if ((qabc_data[8] & (1 << j))) { +- voyager_extended_vic_processors |= (1 << cpu); +- } +- if (qabc_data[8] & (1 << (j + 4))) { +- /* Second SET register plumbed: Quad +- * card has two VIC connected CPUs. 
+- * Secondary cannot be booted as a VIC +- * CPU */ +- voyager_extended_vic_processors |= (1 << cpu); +- voyager_allowed_boot_processors &= +- (~(1 << cpu)); +- } +- +- voyager_quad_processors |= (1 << cpu); +- voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *) +- (qic_addr + (j << 8)); +- CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu, +- (unsigned long)voyager_quad_cpi_addr[cpu])); +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +- +- *asicpp = NULL; +- modpp = &((*modpp)->next); +- } +- *modpp = NULL; +- printk +- ("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", +- voyager_extended_vic_processors, voyager_quad_processors, +- voyager_allowed_boot_processors); +- request_resource(&ioport_resource, &vic_res); +- if (voyager_quad_processors) +- request_resource(&ioport_resource, &qic_res); +- /* set up the front power switch */ +-} +- +-int voyager_cat_readb(__u8 module, __u8 asic, int reg) +-{ +- return 0; +-} +- +-static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp) +-{ +- __u8 val; +- int err = 0; +- +- if (!modp->scan_path_connected) +- return 0; +- if (asicp->asic_id != VOYAGER_CAT_ID) { +- CDEBUG(("cat_disconnect: ASIC is not CAT\n")); +- return 1; +- } +- err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); +- if (err) { +- CDEBUG(("cat_disconnect: failed to read SCANPATH\n")); +- return err; +- } +- val &= VOYAGER_DISCONNECT_ASIC; +- err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); +- if (err) { +- CDEBUG(("cat_disconnect: failed to write SCANPATH\n")); +- return err; +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- modp->scan_path_connected = 0; +- +- return 0; +-} +- +-static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp) +-{ +- __u8 val; +- int err = 0; +- +- if (modp->scan_path_connected) +- return 0; +- if (asicp->asic_id != VOYAGER_CAT_ID) { +- CDEBUG(("cat_connect: ASIC is not CAT\n")); +- return 1; +- } +- +- err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); +- if (err) { +- CDEBUG(("cat_connect: failed to read SCANPATH\n")); +- return err; +- } +- val |= VOYAGER_CONNECT_ASIC; +- err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); +- if (err) { +- CDEBUG(("cat_connect: failed to write SCANPATH\n")); +- return err; +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- modp->scan_path_connected = 1; +- +- return 0; +-} +- +-void voyager_cat_power_off(void) +-{ +- /* Power the machine off by writing to the PSI over the CAT +- * bus */ +- __u8 data; +- voyager_module_t psi = { 0 }; +- voyager_asic_t psi_asic = { 0 }; +- +- psi.asic = &psi_asic; +- psi.asic->asic_id = VOYAGER_CAT_ID; +- psi.asic->subaddr = VOYAGER_SUBADDR_HI; +- psi.module_addr = VOYAGER_PSI; +- psi.scan_path_connected = 0; +- +- outb(VOYAGER_CAT_END, CAT_CMD); +- /* Connect the PSI to the CAT Bus */ +- outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT); +- outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_disconnect(&psi, &psi_asic); +- /* Read the status */ +- cat_subread(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data); +- outb(VOYAGER_CAT_END, CAT_CMD); +- CDEBUG(("PSI STATUS 0x%x\n", data)); +- /* These two writes are power off prep and perform */ +- data = PSI_CLEAR; +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data); +- outb(VOYAGER_CAT_END, CAT_CMD); +- data = PSI_POWER_DOWN; +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, 
&data); +- outb(VOYAGER_CAT_END, CAT_CMD); +-} +- +-struct voyager_status voyager_status = { 0 }; +- +-void voyager_cat_psi(__u8 cmd, __u16 reg, __u8 * data) +-{ +- voyager_module_t psi = { 0 }; +- voyager_asic_t psi_asic = { 0 }; +- +- psi.asic = &psi_asic; +- psi.asic->asic_id = VOYAGER_CAT_ID; +- psi.asic->subaddr = VOYAGER_SUBADDR_HI; +- psi.module_addr = VOYAGER_PSI; +- psi.scan_path_connected = 0; +- +- outb(VOYAGER_CAT_END, CAT_CMD); +- /* Connect the PSI to the CAT Bus */ +- outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT); +- outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_disconnect(&psi, &psi_asic); +- switch (cmd) { +- case VOYAGER_PSI_READ: +- cat_read(&psi, &psi_asic, reg, data); +- break; +- case VOYAGER_PSI_WRITE: +- cat_write(&psi, &psi_asic, reg, *data); +- break; +- case VOYAGER_PSI_SUBREAD: +- cat_subread(&psi, &psi_asic, reg, 1, data); +- break; +- case VOYAGER_PSI_SUBWRITE: +- cat_subwrite(&psi, &psi_asic, reg, 1, data); +- break; +- default: +- printk(KERN_ERR "Voyager PSI, unrecognised command %d\n", cmd); +- break; +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +-} +- +-void voyager_cat_do_common_interrupt(void) +-{ +- /* This is caused either by a memory parity error or something +- * in the PSI */ +- __u8 data; +- voyager_module_t psi = { 0 }; +- voyager_asic_t psi_asic = { 0 }; +- struct voyager_psi psi_reg; +- int i; +- re_read: +- psi.asic = &psi_asic; +- psi.asic->asic_id = VOYAGER_CAT_ID; +- psi.asic->subaddr = VOYAGER_SUBADDR_HI; +- psi.module_addr = VOYAGER_PSI; +- psi.scan_path_connected = 0; +- +- outb(VOYAGER_CAT_END, CAT_CMD); +- /* Connect the PSI to the CAT Bus */ +- outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT); +- outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_disconnect(&psi, &psi_asic); +- /* Read the status. 
NOTE: Need to read *all* the PSI regs here +- * otherwise the cmn int will be reasserted */ +- for (i = 0; i < sizeof(psi_reg.regs); i++) { +- cat_read(&psi, &psi_asic, i, &((__u8 *) & psi_reg.regs)[i]); +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +- if ((psi_reg.regs.checkbit & 0x02) == 0) { +- psi_reg.regs.checkbit |= 0x02; +- cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit); +- printk("VOYAGER RE-READ PSI\n"); +- goto re_read; +- } +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- for (i = 0; i < sizeof(psi_reg.subregs); i++) { +- /* This looks strange, but the PSI doesn't do auto increment +- * correctly */ +- cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, +- 1, &((__u8 *) & psi_reg.subregs)[i]); +- } +- outb(VOYAGER_CAT_END, CAT_CMD); +-#ifdef VOYAGER_CAT_DEBUG +- printk("VOYAGER PSI: "); +- for (i = 0; i < sizeof(psi_reg.regs); i++) +- printk("%02x ", ((__u8 *) & psi_reg.regs)[i]); +- printk("\n "); +- for (i = 0; i < sizeof(psi_reg.subregs); i++) +- printk("%02x ", ((__u8 *) & psi_reg.subregs)[i]); +- printk("\n"); +-#endif +- if (psi_reg.regs.intstatus & PSI_MON) { +- /* switch off or power fail */ +- +- if (psi_reg.subregs.supply & PSI_SWITCH_OFF) { +- if (voyager_status.switch_off) { +- printk(KERN_ERR +- "Voyager front panel switch turned off again---Immediate power off!\n"); +- voyager_cat_power_off(); +- /* not reached */ +- } else { +- printk(KERN_ERR +- "Voyager front panel switch turned off\n"); +- voyager_status.switch_off = 1; +- voyager_status.request_from_kernel = 1; +- wake_up_process(voyager_thread); +- } +- /* Tell the hardware we're taking care of the +- * shutdown, otherwise it will power the box off +- * within 3 seconds of the switch being pressed and, +- * which is much more important to us, continue to +- * assert the common interrupt */ +- data = PSI_CLR_SWITCH_OFF; +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG, +- 1, &data); +- outb(VOYAGER_CAT_END, CAT_CMD); +- } else { +- +- VDEBUG(("Voyager ac fail reg 0x%x\n", +- psi_reg.subregs.ACfail)); +- if ((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) { +- /* No further update */ +- return; +- } +-#if 0 +- /* Don't bother trying to find out who failed. +- * FIXME: This probably makes the code incorrect on +- * anything other than a 345x */ +- for (i = 0; i < 5; i++) { +- if (psi_reg.subregs.ACfail & (1 << i)) { +- break; +- } +- } +- printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i); +-#endif +- /* DON'T do this: it shuts down the AC PSI +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- data = PSI_MASK_MASK | i; +- cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK, +- 1, &data); +- outb(VOYAGER_CAT_END, CAT_CMD); +- */ +- printk(KERN_ERR "Voyager AC power failure\n"); +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- data = PSI_COLD_START; +- cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, +- 1, &data); +- outb(VOYAGER_CAT_END, CAT_CMD); +- voyager_status.power_fail = 1; +- voyager_status.request_from_kernel = 1; +- wake_up_process(voyager_thread); +- } +- +- } else if (psi_reg.regs.intstatus & PSI_FAULT) { +- /* Major fault! 
*/ +- printk(KERN_ERR +- "Voyager PSI Detected major fault, immediate power off!\n"); +- voyager_cat_power_off(); +- /* not reached */ +- } else if (psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM +- | PSI_CURRENT | PSI_DVM +- | PSI_PSCFAULT | PSI_STAT_CHG)) { +- /* other psi fault */ +- +- printk(KERN_WARNING "Voyager PSI status 0x%x\n", data); +- /* clear the PSI fault */ +- outb(VOYAGER_CAT_RUN, CAT_CMD); +- cat_write(&psi, &psi_asic, VOYAGER_PSI_STATUS_REG, 0); +- outb(VOYAGER_CAT_END, CAT_CMD); +- } +-} +Index: linux-2.6-tip/arch/x86/mach-voyager/voyager_smp.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-voyager/voyager_smp.c ++++ /dev/null +@@ -1,1807 +0,0 @@ +-/* -*- mode: c; c-basic-offset: 8 -*- */ +- +-/* Copyright (C) 1999,2001 +- * +- * Author: J.E.J.Bottomley@HansenPartnership.com +- * +- * This file provides all the same external entries as smp.c but uses +- * the voyager hal to provide the functionality +- */ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* TLB state -- visible externally, indexed physically */ +-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 }; +- +-/* CPU IRQ affinity -- set to all ones initially */ +-static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = +- {[0 ... NR_CPUS-1] = ~0UL }; +- +-/* per CPU data structure (for /proc/cpuinfo et al), visible externally +- * indexed physically */ +-DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +-EXPORT_PER_CPU_SYMBOL(cpu_info); +- +-/* physical ID of the CPU used to boot the system */ +-unsigned char boot_cpu_id; +- +-/* The memory line addresses for the Quad CPIs */ +-struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS] __cacheline_aligned; +- +-/* The masks for the Extended VIC processors, filled in by cat_init */ +-__u32 voyager_extended_vic_processors = 0; +- +-/* Masks for the extended Quad processors which cannot be VIC booted */ +-__u32 voyager_allowed_boot_processors = 0; +- +-/* The mask for the Quad Processors (both extended and non-extended) */ +-__u32 voyager_quad_processors = 0; +- +-/* Total count of live CPUs, used in process.c to display +- * the CPU information and in irq.c for the per CPU irq +- * activity count. 
Finally exported by i386_ksyms.c */ +-static int voyager_extended_cpus = 1; +- +-/* Used for the invalidate map that's also checked in the spinlock */ +-static volatile unsigned long smp_invalidate_needed; +- +-/* Bitmask of CPUs present in the system - exported by i386_syms.c, used +- * by scheduler but indexed physically */ +-static cpumask_t voyager_phys_cpu_present_map = CPU_MASK_NONE; +- +-/* The internal functions */ +-static void send_CPI(__u32 cpuset, __u8 cpi); +-static void ack_CPI(__u8 cpi); +-static int ack_QIC_CPI(__u8 cpi); +-static void ack_special_QIC_CPI(__u8 cpi); +-static void ack_VIC_CPI(__u8 cpi); +-static void send_CPI_allbutself(__u8 cpi); +-static void mask_vic_irq(unsigned int irq); +-static void unmask_vic_irq(unsigned int irq); +-static unsigned int startup_vic_irq(unsigned int irq); +-static void enable_local_vic_irq(unsigned int irq); +-static void disable_local_vic_irq(unsigned int irq); +-static void before_handle_vic_irq(unsigned int irq); +-static void after_handle_vic_irq(unsigned int irq); +-static void set_vic_irq_affinity(unsigned int irq, const struct cpumask *mask); +-static void ack_vic_irq(unsigned int irq); +-static void vic_enable_cpi(void); +-static void do_boot_cpu(__u8 cpuid); +-static void do_quad_bootstrap(void); +-static void initialize_secondary(void); +- +-int hard_smp_processor_id(void); +-int safe_smp_processor_id(void); +- +-/* Inline functions */ +-static inline void send_one_QIC_CPI(__u8 cpu, __u8 cpi) +-{ +- voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi = +- (smp_processor_id() << 16) + cpi; +-} +- +-static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi) +-{ +- int cpu; +- +- for_each_online_cpu(cpu) { +- if (cpuset & (1 << cpu)) { +-#ifdef VOYAGER_DEBUG +- if (!cpu_online(cpu)) +- VDEBUG(("CPU%d sending cpi %d to CPU%d not in " +- "cpu_online_map\n", +- hard_smp_processor_id(), cpi, cpu)); +-#endif +- send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); +- } +- } +-} +- +-static inline void wrapper_smp_local_timer_interrupt(void) +-{ +- irq_enter(); +- smp_local_timer_interrupt(); +- irq_exit(); +-} +- +-static inline void send_one_CPI(__u8 cpu, __u8 cpi) +-{ +- if (voyager_quad_processors & (1 << cpu)) +- send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); +- else +- send_CPI(1 << cpu, cpi); +-} +- +-static inline void send_CPI_allbutself(__u8 cpi) +-{ +- __u8 cpu = smp_processor_id(); +- __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu); +- send_CPI(mask, cpi); +-} +- +-static inline int is_cpu_quad(void) +-{ +- __u8 cpumask = inb(VIC_PROC_WHO_AM_I); +- return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER); +-} +- +-static inline int is_cpu_extended(void) +-{ +- __u8 cpu = hard_smp_processor_id(); +- +- return (voyager_extended_vic_processors & (1 << cpu)); +-} +- +-static inline int is_cpu_vic_boot(void) +-{ +- __u8 cpu = hard_smp_processor_id(); +- +- return (voyager_extended_vic_processors +- & voyager_allowed_boot_processors & (1 << cpu)); +-} +- +-static inline void ack_CPI(__u8 cpi) +-{ +- switch (cpi) { +- case VIC_CPU_BOOT_CPI: +- if (is_cpu_quad() && !is_cpu_vic_boot()) +- ack_QIC_CPI(cpi); +- else +- ack_VIC_CPI(cpi); +- break; +- case VIC_SYS_INT: +- case VIC_CMN_INT: +- /* These are slightly strange. 
Even on the Quad card, +- * They are vectored as VIC CPIs */ +- if (is_cpu_quad()) +- ack_special_QIC_CPI(cpi); +- else +- ack_VIC_CPI(cpi); +- break; +- default: +- printk("VOYAGER ERROR: CPI%d is in common CPI code\n", cpi); +- break; +- } +-} +- +-/* local variables */ +- +-/* The VIC IRQ descriptors -- these look almost identical to the +- * 8259 IRQs except that masks and things must be kept per processor +- */ +-static struct irq_chip vic_chip = { +- .name = "VIC", +- .startup = startup_vic_irq, +- .mask = mask_vic_irq, +- .unmask = unmask_vic_irq, +- .set_affinity = set_vic_irq_affinity, +-}; +- +-/* used to count up as CPUs are brought on line (starts at 0) */ +-static int cpucount = 0; +- +-/* The per cpu profile stuff - used in smp_local_timer_interrupt */ +-static DEFINE_PER_CPU(int, prof_multiplier) = 1; +-static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; +-static DEFINE_PER_CPU(int, prof_counter) = 1; +- +-/* the map used to check if a CPU has booted */ +-static __u32 cpu_booted_map; +- +-/* the synchronize flag used to hold all secondary CPUs spinning in +- * a tight loop until the boot sequence is ready for them */ +-static cpumask_t smp_commenced_mask = CPU_MASK_NONE; +- +-/* This is for the new dynamic CPU boot code */ +- +-/* The per processor IRQ masks (these are usually kept in sync) */ +-static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; +- +-/* the list of IRQs to be enabled by the VIC_ENABLE_IRQ_CPI */ +-static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 }; +- +-/* Lock for enable/disable of VIC interrupts */ +-static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock); +- +-/* The boot processor is correctly set up in PC mode when it +- * comes up, but the secondaries need their master/slave 8259 +- * pairs initializing correctly */ +- +-/* Interrupt counters (per cpu) and total - used to try to +- * even up the interrupt handling routines */ +-static long vic_intr_total = 0; +-static long vic_intr_count[NR_CPUS] __cacheline_aligned = { 0 }; +-static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 }; +- +-/* Since we can only use CPI0, we fake all the other CPIs */ +-static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned; +- +-/* debugging routine to read the isr of the cpu's pic */ +-static inline __u16 vic_read_isr(void) +-{ +- __u16 isr; +- +- outb(0x0b, 0xa0); +- isr = inb(0xa0) << 8; +- outb(0x0b, 0x20); +- isr |= inb(0x20); +- +- return isr; +-} +- +-static __init void qic_setup(void) +-{ +- if (!is_cpu_quad()) { +- /* not a quad, no setup */ +- return; +- } +- outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); +- outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); +- +- if (is_cpu_extended()) { +- /* the QIC duplicate of the VIC base register */ +- outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER); +- outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER); +- +- /* FIXME: should set up the QIC timer and memory parity +- * error vectors here */ +- } +-} +- +-static __init void vic_setup_pic(void) +-{ +- outb(1, VIC_REDIRECT_REGISTER_1); +- /* clear the claim registers for dynamic routing */ +- outb(0, VIC_CLAIM_REGISTER_0); +- outb(0, VIC_CLAIM_REGISTER_1); +- +- outb(0, VIC_PRIORITY_REGISTER); +- /* Set the Primary and Secondary Microchannel vector +- * bases to be the same as the ordinary interrupts +- * +- * FIXME: This would be more efficient using separate +- * vectors. 
*/ +- outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE); +- outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE); +- /* Now initiallise the master PIC belonging to this CPU by +- * sending the four ICWs */ +- +- /* ICW1: level triggered, ICW4 needed */ +- outb(0x19, 0x20); +- +- /* ICW2: vector base */ +- outb(FIRST_EXTERNAL_VECTOR, 0x21); +- +- /* ICW3: slave at line 2 */ +- outb(0x04, 0x21); +- +- /* ICW4: 8086 mode */ +- outb(0x01, 0x21); +- +- /* now the same for the slave PIC */ +- +- /* ICW1: level trigger, ICW4 needed */ +- outb(0x19, 0xA0); +- +- /* ICW2: slave vector base */ +- outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1); +- +- /* ICW3: slave ID */ +- outb(0x02, 0xA1); +- +- /* ICW4: 8086 mode */ +- outb(0x01, 0xA1); +-} +- +-static void do_quad_bootstrap(void) +-{ +- if (is_cpu_quad() && is_cpu_vic_boot()) { +- int i; +- unsigned long flags; +- __u8 cpuid = hard_smp_processor_id(); +- +- local_irq_save(flags); +- +- for (i = 0; i < 4; i++) { +- /* FIXME: this would be >>3 &0x7 on the 32 way */ +- if (((cpuid >> 2) & 0x03) == i) +- /* don't lower our own mask! */ +- continue; +- +- /* masquerade as local Quad CPU */ +- outb(QIC_CPUID_ENABLE | i, QIC_PROCESSOR_ID); +- /* enable the startup CPI */ +- outb(QIC_BOOT_CPI_MASK, QIC_MASK_REGISTER1); +- /* restore cpu id */ +- outb(0, QIC_PROCESSOR_ID); +- } +- local_irq_restore(flags); +- } +-} +- +-void prefill_possible_map(void) +-{ +- /* This is empty on voyager because we need a much +- * earlier detection which is done in find_smp_config */ +-} +- +-/* Set up all the basic stuff: read the SMP config and make all the +- * SMP information reflect only the boot cpu. All others will be +- * brought on-line later. */ +-void __init find_smp_config(void) +-{ +- int i; +- +- boot_cpu_id = hard_smp_processor_id(); +- +- printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id); +- +- /* initialize the CPU structures (moved from smp_boot_cpus) */ +- for (i = 0; i < nr_cpu_ids; i++) +- cpu_irq_affinity[i] = ~0; +- cpu_online_map = cpumask_of_cpu(boot_cpu_id); +- +- /* The boot CPU must be extended */ +- voyager_extended_vic_processors = 1 << boot_cpu_id; +- /* initially, all of the first 8 CPUs can boot */ +- voyager_allowed_boot_processors = 0xff; +- /* set up everything for just this CPU, we can alter +- * this as we start the other CPUs later */ +- /* now get the CPU disposition from the extended CMOS */ +- cpus_addr(voyager_phys_cpu_present_map)[0] = +- voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK); +- cpus_addr(voyager_phys_cpu_present_map)[0] |= +- voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8; +- cpus_addr(voyager_phys_cpu_present_map)[0] |= +- voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + +- 2) << 16; +- cpus_addr(voyager_phys_cpu_present_map)[0] |= +- voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + +- 3) << 24; +- init_cpu_possible(&voyager_phys_cpu_present_map); +- printk("VOYAGER SMP: voyager_phys_cpu_present_map = 0x%lx\n", +- cpus_addr(voyager_phys_cpu_present_map)[0]); +- /* Here we set up the VIC to enable SMP */ +- /* enable the CPIs by writing the base vector to their register */ +- outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER); +- outb(1, VIC_REDIRECT_REGISTER_1); +- /* set the claim registers for static routing --- Boot CPU gets +- * all interrupts untill all other CPUs started */ +- outb(0xff, VIC_CLAIM_REGISTER_0); +- outb(0xff, VIC_CLAIM_REGISTER_1); +- /* Set the Primary and Secondary Microchannel vector +- * bases to be the same as the ordinary interrupts +- * +- * 
FIXME: This would be more efficient using separate +- * vectors. */ +- outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE); +- outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE); +- +- /* Finally tell the firmware that we're driving */ +- outb(inb(VOYAGER_SUS_IN_CONTROL_PORT) | VOYAGER_IN_CONTROL_FLAG, +- VOYAGER_SUS_IN_CONTROL_PORT); +- +- current_thread_info()->cpu = boot_cpu_id; +- x86_write_percpu(cpu_number, boot_cpu_id); +-} +- +-/* +- * The bootstrap kernel entry code has set these up. Save them +- * for a given CPU, id is physical */ +-void __init smp_store_cpu_info(int id) +-{ +- struct cpuinfo_x86 *c = &cpu_data(id); +- +- *c = boot_cpu_data; +- c->cpu_index = id; +- +- identify_secondary_cpu(c); +-} +- +-/* Routine initially called when a non-boot CPU is brought online */ +-static void __init start_secondary(void *unused) +-{ +- __u8 cpuid = hard_smp_processor_id(); +- +- cpu_init(); +- +- /* OK, we're in the routine */ +- ack_CPI(VIC_CPU_BOOT_CPI); +- +- /* setup the 8259 master slave pair belonging to this CPU --- +- * we won't actually receive any until the boot CPU +- * relinquishes it's static routing mask */ +- vic_setup_pic(); +- +- qic_setup(); +- +- if (is_cpu_quad() && !is_cpu_vic_boot()) { +- /* clear the boot CPI */ +- __u8 dummy; +- +- dummy = +- voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi; +- printk("read dummy %d\n", dummy); +- } +- +- /* lower the mask to receive CPIs */ +- vic_enable_cpi(); +- +- VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); +- +- notify_cpu_starting(cpuid); +- +- /* enable interrupts */ +- local_irq_enable(); +- +- /* get our bogomips */ +- calibrate_delay(); +- +- /* save our processor parameters */ +- smp_store_cpu_info(cpuid); +- +- /* if we're a quad, we may need to bootstrap other CPUs */ +- do_quad_bootstrap(); +- +- /* FIXME: this is rather a poor hack to prevent the CPU +- * activating softirqs while it's supposed to be waiting for +- * permission to proceed. Without this, the new per CPU stuff +- * in the softirqs will fail */ +- local_irq_disable(); +- cpu_set(cpuid, cpu_callin_map); +- +- /* signal that we're done */ +- cpu_booted_map = 1; +- +- while (!cpu_isset(cpuid, smp_commenced_mask)) +- rep_nop(); +- local_irq_enable(); +- +- local_flush_tlb(); +- +- cpu_set(cpuid, cpu_online_map); +- wmb(); +- cpu_idle(); +-} +- +-/* Routine to kick start the given CPU and wait for it to report ready +- * (or timeout in startup). When this routine returns, the requested +- * CPU is either fully running and configured or known to be dead. 
+- * +- * We call this routine sequentially 1 CPU at a time, so no need for +- * locking */ +- +-static void __init do_boot_cpu(__u8 cpu) +-{ +- struct task_struct *idle; +- int timeout; +- unsigned long flags; +- int quad_boot = (1 << cpu) & voyager_quad_processors +- & ~(voyager_extended_vic_processors +- & voyager_allowed_boot_processors); +- +- /* This is the format of the CPI IDT gate (in real mode) which +- * we're hijacking to boot the CPU */ +- union IDTFormat { +- struct seg { +- __u16 Offset; +- __u16 Segment; +- } idt; +- __u32 val; +- } hijack_source; +- +- __u32 *hijack_vector; +- __u32 start_phys_address = setup_trampoline(); +- +- /* There's a clever trick to this: The linux trampoline is +- * compiled to begin at absolute location zero, so make the +- * address zero but have the data segment selector compensate +- * for the actual address */ +- hijack_source.idt.Offset = start_phys_address & 0x000F; +- hijack_source.idt.Segment = (start_phys_address >> 4) & 0xFFFF; +- +- cpucount++; +- alternatives_smp_switch(1); +- +- idle = fork_idle(cpu); +- if (IS_ERR(idle)) +- panic("failed fork for CPU%d", cpu); +- idle->thread.ip = (unsigned long)start_secondary; +- /* init_tasks (in sched.c) is indexed logically */ +- stack_start.sp = (void *)idle->thread.sp; +- +- init_gdt(cpu); +- per_cpu(current_task, cpu) = idle; +- early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); +- irq_ctx_init(cpu); +- +- /* Note: Don't modify initial ss override */ +- VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, +- (unsigned long)hijack_source.val, hijack_source.idt.Segment, +- hijack_source.idt.Offset, stack_start.sp)); +- +- /* init lowmem identity mapping */ +- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, +- min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +- flush_tlb_all(); +- +- if (quad_boot) { +- printk("CPU %d: non extended Quad boot\n", cpu); +- hijack_vector = +- (__u32 *) +- phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE) * 4); +- *hijack_vector = hijack_source.val; +- } else { +- printk("CPU%d: extended VIC boot\n", cpu); +- hijack_vector = +- (__u32 *) +- phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE) * 4); +- *hijack_vector = hijack_source.val; +- /* VIC errata, may also receive interrupt at this address */ +- hijack_vector = +- (__u32 *) +- phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + +- VIC_DEFAULT_CPI_BASE) * 4); +- *hijack_vector = hijack_source.val; +- } +- /* All non-boot CPUs start with interrupts fully masked. Need +- * to lower the mask of the CPI we're about to send. 
We do +- * this in the VIC by masquerading as the processor we're +- * about to boot and lowering its interrupt mask */ +- local_irq_save(flags); +- if (quad_boot) { +- send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI); +- } else { +- outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID); +- /* here we're altering registers belonging to `cpu' */ +- +- outb(VIC_BOOT_INTERRUPT_MASK, 0x21); +- /* now go back to our original identity */ +- outb(boot_cpu_id, VIC_PROCESSOR_ID); +- +- /* and boot the CPU */ +- +- send_CPI((1 << cpu), VIC_CPU_BOOT_CPI); +- } +- cpu_booted_map = 0; +- local_irq_restore(flags); +- +- /* now wait for it to become ready (or timeout) */ +- for (timeout = 0; timeout < 50000; timeout++) { +- if (cpu_booted_map) +- break; +- udelay(100); +- } +- /* reset the page table */ +- zap_low_mappings(); +- +- if (cpu_booted_map) { +- VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n", +- cpu, smp_processor_id())); +- +- printk("CPU%d: ", cpu); +- print_cpu_info(&cpu_data(cpu)); +- wmb(); +- cpu_set(cpu, cpu_callout_map); +- cpu_set(cpu, cpu_present_map); +- } else { +- printk("CPU%d FAILED TO BOOT: ", cpu); +- if (* +- ((volatile unsigned char *)phys_to_virt(start_phys_address)) +- == 0xA5) +- printk("Stuck.\n"); +- else +- printk("Not responding.\n"); +- +- cpucount--; +- } +-} +- +-void __init smp_boot_cpus(void) +-{ +- int i; +- +- /* CAT BUS initialisation must be done after the memory */ +- /* FIXME: The L4 has a catbus too, it just needs to be +- * accessed in a totally different way */ +- if (voyager_level == 5) { +- voyager_cat_init(); +- +- /* now that the cat has probed the Voyager System Bus, sanity +- * check the cpu map */ +- if (((voyager_quad_processors | voyager_extended_vic_processors) +- & cpus_addr(voyager_phys_cpu_present_map)[0]) != +- cpus_addr(voyager_phys_cpu_present_map)[0]) { +- /* should panic */ +- printk("\n\n***WARNING*** " +- "Sanity check of CPU present map FAILED\n"); +- } +- } else if (voyager_level == 4) +- voyager_extended_vic_processors = +- cpus_addr(voyager_phys_cpu_present_map)[0]; +- +- /* this sets up the idle task to run on the current cpu */ +- voyager_extended_cpus = 1; +- /* Remove the global_irq_holder setting, it triggers a BUG() on +- * schedule at the moment */ +- //global_irq_holder = boot_cpu_id; +- +- /* FIXME: Need to do something about this but currently only works +- * on CPUs with a tsc which none of mine have. +- smp_tune_scheduling(); +- */ +- smp_store_cpu_info(boot_cpu_id); +- /* setup the jump vector */ +- initial_code = (unsigned long)initialize_secondary; +- printk("CPU%d: ", boot_cpu_id); +- print_cpu_info(&cpu_data(boot_cpu_id)); +- +- if (is_cpu_quad()) { +- /* booting on a Quad CPU */ +- printk("VOYAGER SMP: Boot CPU is Quad\n"); +- qic_setup(); +- do_quad_bootstrap(); +- } +- +- /* enable our own CPIs */ +- vic_enable_cpi(); +- +- cpu_set(boot_cpu_id, cpu_online_map); +- cpu_set(boot_cpu_id, cpu_callout_map); +- +- /* loop over all the extended VIC CPUs and boot them. 
The +- * Quad CPUs must be bootstrapped by their extended VIC cpu */ +- for (i = 0; i < nr_cpu_ids; i++) { +- if (i == boot_cpu_id || !cpu_isset(i, voyager_phys_cpu_present_map)) +- continue; +- do_boot_cpu(i); +- /* This udelay seems to be needed for the Quad boots +- * don't remove unless you know what you're doing */ +- udelay(1000); +- } +- /* we could compute the total bogomips here, but why bother?, +- * Code added from smpboot.c */ +- { +- unsigned long bogosum = 0; +- +- for_each_online_cpu(i) +- bogosum += cpu_data(i).loops_per_jiffy; +- printk(KERN_INFO "Total of %d processors activated " +- "(%lu.%02lu BogoMIPS).\n", +- cpucount + 1, bogosum / (500000 / HZ), +- (bogosum / (5000 / HZ)) % 100); +- } +- voyager_extended_cpus = hweight32(voyager_extended_vic_processors); +- printk("VOYAGER: Extended (interrupt handling CPUs): " +- "%d, non-extended: %d\n", voyager_extended_cpus, +- num_booting_cpus() - voyager_extended_cpus); +- /* that's it, switch to symmetric mode */ +- outb(0, VIC_PRIORITY_REGISTER); +- outb(0, VIC_CLAIM_REGISTER_0); +- outb(0, VIC_CLAIM_REGISTER_1); +- +- VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus())); +-} +- +-/* Reload the secondary CPUs task structure (this function does not +- * return ) */ +-static void __init initialize_secondary(void) +-{ +-#if 0 +- // AC kernels only +- set_current(hard_get_current()); +-#endif +- +- /* +- * We don't actually need to load the full TSS, +- * basically just the stack pointer and the eip. +- */ +- +- asm volatile ("movl %0,%%esp\n\t" +- "jmp *%1"::"r" (current->thread.sp), +- "r"(current->thread.ip)); +-} +- +-/* handle a Voyager SYS_INT -- If we don't, the base board will +- * panic the system. +- * +- * System interrupts occur because some problem was detected on the +- * various busses. To find out what you have to probe all the +- * hardware via the CAT bus. FIXME: At the moment we do nothing. */ +-void smp_vic_sys_interrupt(struct pt_regs *regs) +-{ +- ack_CPI(VIC_SYS_INT); +- printk("Voyager SYSTEM INTERRUPT\n"); +-} +- +-/* Handle a voyager CMN_INT; These interrupts occur either because of +- * a system status change or because a single bit memory error +- * occurred. FIXME: At the moment, ignore all this. */ +-void smp_vic_cmn_interrupt(struct pt_regs *regs) +-{ +- static __u8 in_cmn_int = 0; +- static DEFINE_SPINLOCK(cmn_int_lock); +- +- /* common ints are broadcast, so make sure we only do this once */ +- _raw_spin_lock(&cmn_int_lock); +- if (in_cmn_int) +- goto unlock_end; +- +- in_cmn_int++; +- _raw_spin_unlock(&cmn_int_lock); +- +- VDEBUG(("Voyager COMMON INTERRUPT\n")); +- +- if (voyager_level == 5) +- voyager_cat_do_common_interrupt(); +- +- _raw_spin_lock(&cmn_int_lock); +- in_cmn_int = 0; +- unlock_end: +- _raw_spin_unlock(&cmn_int_lock); +- ack_CPI(VIC_CMN_INT); +-} +- +-/* +- * Reschedule call back. Nothing to do, all the work is done +- * automatically when we return from the interrupt. */ +-static void smp_reschedule_interrupt(void) +-{ +- /* do nothing */ +-} +- +-static struct mm_struct *flush_mm; +-static unsigned long flush_va; +-static DEFINE_SPINLOCK(tlbstate_lock); +- +-/* +- * We cannot call mmdrop() because we are in interrupt context, +- * instead update mm->cpu_vm_mask. +- * +- * We need to reload %cr3 since the page tables may be going +- * away from under us.. 
+- */ +-static inline void voyager_leave_mm(unsigned long cpu) +-{ +- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) +- BUG(); +- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); +- load_cr3(swapper_pg_dir); +-} +- +-/* +- * Invalidate call-back +- */ +-static void smp_invalidate_interrupt(void) +-{ +- __u8 cpu = smp_processor_id(); +- +- if (!test_bit(cpu, &smp_invalidate_needed)) +- return; +- /* This will flood messages. Don't uncomment unless you see +- * Problems with cross cpu invalidation +- VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n", +- smp_processor_id())); +- */ +- +- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { +- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { +- if (flush_va == TLB_FLUSH_ALL) +- local_flush_tlb(); +- else +- __flush_tlb_one(flush_va); +- } else +- voyager_leave_mm(cpu); +- } +- smp_mb__before_clear_bit(); +- clear_bit(cpu, &smp_invalidate_needed); +- smp_mb__after_clear_bit(); +-} +- +-/* All the new flush operations for 2.4 */ +- +-/* This routine is called with a physical cpu mask */ +-static void +-voyager_flush_tlb_others(unsigned long cpumask, struct mm_struct *mm, +- unsigned long va) +-{ +- int stuck = 50000; +- +- if (!cpumask) +- BUG(); +- if ((cpumask & cpus_addr(cpu_online_map)[0]) != cpumask) +- BUG(); +- if (cpumask & (1 << smp_processor_id())) +- BUG(); +- if (!mm) +- BUG(); +- +- spin_lock(&tlbstate_lock); +- +- flush_mm = mm; +- flush_va = va; +- atomic_set_mask(cpumask, &smp_invalidate_needed); +- /* +- * We have to send the CPI only to +- * CPUs affected. +- */ +- send_CPI(cpumask, VIC_INVALIDATE_CPI); +- +- while (smp_invalidate_needed) { +- mb(); +- if (--stuck == 0) { +- printk("***WARNING*** Stuck doing invalidate CPI " +- "(CPU%d)\n", smp_processor_id()); +- break; +- } +- } +- +- /* Uncomment only to debug invalidation problems +- VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu)); +- */ +- +- flush_mm = NULL; +- flush_va = 0; +- spin_unlock(&tlbstate_lock); +-} +- +-void flush_tlb_current_task(void) +-{ +- struct mm_struct *mm = current->mm; +- unsigned long cpu_mask; +- +- preempt_disable(); +- +- cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); +- local_flush_tlb(); +- if (cpu_mask) +- voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); +- +- preempt_enable(); +-} +- +-void flush_tlb_mm(struct mm_struct *mm) +-{ +- unsigned long cpu_mask; +- +- preempt_disable(); +- +- cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); +- +- if (current->active_mm == mm) { +- if (current->mm) +- local_flush_tlb(); +- else +- voyager_leave_mm(smp_processor_id()); +- } +- if (cpu_mask) +- voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); +- +- preempt_enable(); +-} +- +-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +-{ +- struct mm_struct *mm = vma->vm_mm; +- unsigned long cpu_mask; +- +- preempt_disable(); +- +- cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); +- if (current->active_mm == mm) { +- if (current->mm) +- __flush_tlb_one(va); +- else +- voyager_leave_mm(smp_processor_id()); +- } +- +- if (cpu_mask) +- voyager_flush_tlb_others(cpu_mask, mm, va); +- +- preempt_enable(); +-} +- +-EXPORT_SYMBOL(flush_tlb_page); +- +-/* enable the requested IRQs */ +-static void smp_enable_irq_interrupt(void) +-{ +- __u8 irq; +- __u8 cpu = get_cpu(); +- +- VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu, +- vic_irq_enable_mask[cpu])); +- +- spin_lock(&vic_irq_lock); +- for (irq = 0; irq < 16; 
irq++) { +- if (vic_irq_enable_mask[cpu] & (1 << irq)) +- enable_local_vic_irq(irq); +- } +- vic_irq_enable_mask[cpu] = 0; +- spin_unlock(&vic_irq_lock); +- +- put_cpu_no_resched(); +-} +- +-/* +- * CPU halt call-back +- */ +-static void smp_stop_cpu_function(void *dummy) +-{ +- VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id())); +- cpu_clear(smp_processor_id(), cpu_online_map); +- local_irq_disable(); +- for (;;) +- halt(); +-} +- +-/* execute a thread on a new CPU. The function to be called must be +- * previously set up. This is used to schedule a function for +- * execution on all CPUs - set up the function then broadcast a +- * function_interrupt CPI to come here on each CPU */ +-static void smp_call_function_interrupt(void) +-{ +- irq_enter(); +- generic_smp_call_function_interrupt(); +- __get_cpu_var(irq_stat).irq_call_count++; +- irq_exit(); +-} +- +-static void smp_call_function_single_interrupt(void) +-{ +- irq_enter(); +- generic_smp_call_function_single_interrupt(); +- __get_cpu_var(irq_stat).irq_call_count++; +- irq_exit(); +-} +- +-/* Sorry about the name. In an APIC based system, the APICs +- * themselves are programmed to send a timer interrupt. This is used +- * by linux to reschedule the processor. Voyager doesn't have this, +- * so we use the system clock to interrupt one processor, which in +- * turn, broadcasts a timer CPI to all the others --- we receive that +- * CPI here. We don't use this actually for counting so losing +- * ticks doesn't matter +- * +- * FIXME: For those CPUs which actually have a local APIC, we could +- * try to use it to trigger this interrupt instead of having to +- * broadcast the timer tick. Unfortunately, all my pentium DYADs have +- * no local APIC, so I can't do this +- * +- * This function is currently a placeholder and is unused in the code */ +-void smp_apic_timer_interrupt(struct pt_regs *regs) +-{ +- struct pt_regs *old_regs = set_irq_regs(regs); +- wrapper_smp_local_timer_interrupt(); +- set_irq_regs(old_regs); +-} +- +-/* All of the QUAD interrupt GATES */ +-void smp_qic_timer_interrupt(struct pt_regs *regs) +-{ +- struct pt_regs *old_regs = set_irq_regs(regs); +- ack_QIC_CPI(QIC_TIMER_CPI); +- wrapper_smp_local_timer_interrupt(); +- set_irq_regs(old_regs); +-} +- +-void smp_qic_invalidate_interrupt(struct pt_regs *regs) +-{ +- ack_QIC_CPI(QIC_INVALIDATE_CPI); +- smp_invalidate_interrupt(); +-} +- +-void smp_qic_reschedule_interrupt(struct pt_regs *regs) +-{ +- ack_QIC_CPI(QIC_RESCHEDULE_CPI); +- smp_reschedule_interrupt(); +-} +- +-void smp_qic_enable_irq_interrupt(struct pt_regs *regs) +-{ +- ack_QIC_CPI(QIC_ENABLE_IRQ_CPI); +- smp_enable_irq_interrupt(); +-} +- +-void smp_qic_call_function_interrupt(struct pt_regs *regs) +-{ +- ack_QIC_CPI(QIC_CALL_FUNCTION_CPI); +- smp_call_function_interrupt(); +-} +- +-void smp_qic_call_function_single_interrupt(struct pt_regs *regs) +-{ +- ack_QIC_CPI(QIC_CALL_FUNCTION_SINGLE_CPI); +- smp_call_function_single_interrupt(); +-} +- +-void smp_vic_cpi_interrupt(struct pt_regs *regs) +-{ +- struct pt_regs *old_regs = set_irq_regs(regs); +- __u8 cpu = smp_processor_id(); +- +- if (is_cpu_quad()) +- ack_QIC_CPI(VIC_CPI_LEVEL0); +- else +- ack_VIC_CPI(VIC_CPI_LEVEL0); +- +- if (test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu])) +- wrapper_smp_local_timer_interrupt(); +- if (test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu])) +- smp_invalidate_interrupt(); +- if (test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu])) +- smp_reschedule_interrupt(); +- if 
(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu])) +- smp_enable_irq_interrupt(); +- if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) +- smp_call_function_interrupt(); +- if (test_and_clear_bit(VIC_CALL_FUNCTION_SINGLE_CPI, &vic_cpi_mailbox[cpu])) +- smp_call_function_single_interrupt(); +- set_irq_regs(old_regs); +-} +- +-static void do_flush_tlb_all(void *info) +-{ +- unsigned long cpu = smp_processor_id(); +- +- __flush_tlb_all(); +- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) +- voyager_leave_mm(cpu); +-} +- +-/* flush the TLB of every active CPU in the system */ +-void flush_tlb_all(void) +-{ +- on_each_cpu(do_flush_tlb_all, 0, 1); +-} +- +-/* send a reschedule CPI to one CPU by physical CPU number*/ +-static void voyager_smp_send_reschedule(int cpu) +-{ +- send_one_CPI(cpu, VIC_RESCHEDULE_CPI); +-} +- +-int hard_smp_processor_id(void) +-{ +- __u8 i; +- __u8 cpumask = inb(VIC_PROC_WHO_AM_I); +- if ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER) +- return cpumask & 0x1F; +- +- for (i = 0; i < 8; i++) { +- if (cpumask & (1 << i)) +- return i; +- } +- printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask); +- return 0; +-} +- +-int safe_smp_processor_id(void) +-{ +- return hard_smp_processor_id(); +-} +- +-/* broadcast a halt to all other CPUs */ +-static void voyager_smp_send_stop(void) +-{ +- smp_call_function(smp_stop_cpu_function, NULL, 1); +-} +- +-/* this function is triggered in time.c when a clock tick fires +- * we need to re-broadcast the tick to all CPUs */ +-void smp_vic_timer_interrupt(void) +-{ +- send_CPI_allbutself(VIC_TIMER_CPI); +- smp_local_timer_interrupt(); +-} +- +-/* local (per CPU) timer interrupt. It does both profiling and +- * process statistics/rescheduling. +- * +- * We do profiling in every local tick, statistics/rescheduling +- * happen only every 'profiling multiplier' ticks. The default +- * multiplier is 1 and it can be changed by writing the new multiplier +- * value into /proc/profile. +- */ +-void smp_local_timer_interrupt(void) +-{ +- int cpu = smp_processor_id(); +- long weight; +- +- profile_tick(CPU_PROFILING); +- if (--per_cpu(prof_counter, cpu) <= 0) { +- /* +- * The multiplier may have changed since the last time we got +- * to this point as a result of the user writing to +- * /proc/profile. In this case we need to adjust the APIC +- * timer accordingly. +- * +- * Interrupts are already masked off at this point. +- */ +- per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); +- if (per_cpu(prof_counter, cpu) != +- per_cpu(prof_old_multiplier, cpu)) { +- /* FIXME: need to update the vic timer tick here */ +- per_cpu(prof_old_multiplier, cpu) = +- per_cpu(prof_counter, cpu); +- } +- +- update_process_times(user_mode_vm(get_irq_regs())); +- } +- +- if (((1 << cpu) & voyager_extended_vic_processors) == 0) +- /* only extended VIC processors participate in +- * interrupt distribution */ +- return; +- +- /* +- * We take the 'long' return path, and there every subsystem +- * grabs the appropriate locks (kernel lock/ irq lock). +- * +- * we might want to decouple profiling from the 'long path', +- * and do the profiling totally in assembly. +- * +- * Currently this isn't too much of an issue (performance wise), +- * we can take more than 100K local irqs per second on a 100 MHz P5. +- */ +- +- if ((++vic_tick[cpu] & 0x7) != 0) +- return; +- /* get here every 16 ticks (about every 1/6 of a second) */ +- +- /* Change our priority to give someone else a chance at getting +- * the IRQ. 
The algorithm goes like this: +- * +- * In the VIC, the dynamically routed interrupt is always +- * handled by the lowest priority eligible (i.e. receiving +- * interrupts) CPU. If >1 eligible CPUs are equal lowest, the +- * lowest processor number gets it. +- * +- * The priority of a CPU is controlled by a special per-CPU +- * VIC priority register which is 3 bits wide 0 being lowest +- * and 7 highest priority.. +- * +- * Therefore we subtract the average number of interrupts from +- * the number we've fielded. If this number is negative, we +- * lower the activity count and if it is positive, we raise +- * it. +- * +- * I'm afraid this still leads to odd looking interrupt counts: +- * the totals are all roughly equal, but the individual ones +- * look rather skewed. +- * +- * FIXME: This algorithm is total crap when mixed with SMP +- * affinity code since we now try to even up the interrupt +- * counts when an affinity binding is keeping them on a +- * particular CPU*/ +- weight = (vic_intr_count[cpu] * voyager_extended_cpus +- - vic_intr_total) >> 4; +- weight += 4; +- if (weight > 7) +- weight = 7; +- if (weight < 0) +- weight = 0; +- +- outb((__u8) weight, VIC_PRIORITY_REGISTER); +- +-#ifdef VOYAGER_DEBUG +- if ((vic_tick[cpu] & 0xFFF) == 0) { +- /* print this message roughly every 25 secs */ +- printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n", +- cpu, vic_tick[cpu], weight); +- } +-#endif +-} +- +-/* setup the profiling timer */ +-int setup_profiling_timer(unsigned int multiplier) +-{ +- int i; +- +- if ((!multiplier)) +- return -EINVAL; +- +- /* +- * Set the new multiplier for each CPU. CPUs don't start using the +- * new values until the next timer interrupt in which they do process +- * accounting. +- */ +- for (i = 0; i < nr_cpu_ids; ++i) +- per_cpu(prof_multiplier, i) = multiplier; +- +- return 0; +-} +- +-/* This is a bit of a mess, but forced on us by the genirq changes +- * there's no genirq handler that really does what voyager wants +- * so hack it up with the simple IRQ handler */ +-static void handle_vic_irq(unsigned int irq, struct irq_desc *desc) +-{ +- before_handle_vic_irq(irq); +- handle_simple_irq(irq, desc); +- after_handle_vic_irq(irq); +-} +- +-/* The CPIs are handled in the per cpu 8259s, so they must be +- * enabled to be received: FIX: enabling the CPIs in the early +- * boot sequence interferes with bug checking; enable them later +- * on in smp_init */ +-#define VIC_SET_GATE(cpi, vector) \ +- set_intr_gate((cpi) + VIC_DEFAULT_CPI_BASE, (vector)) +-#define QIC_SET_GATE(cpi, vector) \ +- set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector)) +- +-void __init voyager_smp_intr_init(void) +-{ +- int i; +- +- /* initialize the per cpu irq mask to all disabled */ +- for (i = 0; i < nr_cpu_ids; i++) +- vic_irq_mask[i] = 0xFFFF; +- +- VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt); +- +- VIC_SET_GATE(VIC_SYS_INT, vic_sys_interrupt); +- VIC_SET_GATE(VIC_CMN_INT, vic_cmn_interrupt); +- +- QIC_SET_GATE(QIC_TIMER_CPI, qic_timer_interrupt); +- QIC_SET_GATE(QIC_INVALIDATE_CPI, qic_invalidate_interrupt); +- QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt); +- QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt); +- QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt); +- +- /* now put the VIC descriptor into the first 48 IRQs +- * +- * This is for later: first 16 correspond to PC IRQs; next 16 +- * are Primary MC IRQs and final 16 are Secondary MC IRQs */ +- for (i = 0; i < 48; i++) +- set_irq_chip_and_handler(i, &vic_chip, 
handle_vic_irq); +-} +- +-/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per +- * processor to receive CPI */ +-static void send_CPI(__u32 cpuset, __u8 cpi) +-{ +- int cpu; +- __u32 quad_cpuset = (cpuset & voyager_quad_processors); +- +- if (cpi < VIC_START_FAKE_CPI) { +- /* fake CPI are only used for booting, so send to the +- * extended quads as well---Quads must be VIC booted */ +- outb((__u8) (cpuset), VIC_CPI_Registers[cpi]); +- return; +- } +- if (quad_cpuset) +- send_QIC_CPI(quad_cpuset, cpi); +- cpuset &= ~quad_cpuset; +- cpuset &= 0xff; /* only first 8 CPUs vaild for VIC CPI */ +- if (cpuset == 0) +- return; +- for_each_online_cpu(cpu) { +- if (cpuset & (1 << cpu)) +- set_bit(cpi, &vic_cpi_mailbox[cpu]); +- } +- if (cpuset) +- outb((__u8) cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]); +-} +- +-/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and +- * set the cache line to shared by reading it. +- * +- * DON'T make this inline otherwise the cache line read will be +- * optimised away +- * */ +-static int ack_QIC_CPI(__u8 cpi) +-{ +- __u8 cpu = hard_smp_processor_id(); +- +- cpi &= 7; +- +- outb(1 << cpi, QIC_INTERRUPT_CLEAR1); +- return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi; +-} +- +-static void ack_special_QIC_CPI(__u8 cpi) +-{ +- switch (cpi) { +- case VIC_CMN_INT: +- outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0); +- break; +- case VIC_SYS_INT: +- outb(QIC_SYS_INT, QIC_INTERRUPT_CLEAR0); +- break; +- } +- /* also clear at the VIC, just in case (nop for non-extended proc) */ +- ack_VIC_CPI(cpi); +-} +- +-/* Acknowledge receipt of CPI in the VIC (essentially an EOI) */ +-static void ack_VIC_CPI(__u8 cpi) +-{ +-#ifdef VOYAGER_DEBUG +- unsigned long flags; +- __u16 isr; +- __u8 cpu = smp_processor_id(); +- +- local_irq_save(flags); +- isr = vic_read_isr(); +- if ((isr & (1 << (cpi & 7))) == 0) { +- printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi); +- } +-#endif +- /* send specific EOI; the two system interrupts have +- * bit 4 set for a separate vector but behave as the +- * corresponding 3 bit intr */ +- outb_p(0x60 | (cpi & 7), 0x20); +- +-#ifdef VOYAGER_DEBUG +- if ((vic_read_isr() & (1 << (cpi & 7))) != 0) { +- printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi); +- } +- local_irq_restore(flags); +-#endif +-} +- +-/* cribbed with thanks from irq.c */ +-#define __byte(x,y) (((unsigned char *)&(y))[x]) +-#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu])) +-#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu])) +- +-static unsigned int startup_vic_irq(unsigned int irq) +-{ +- unmask_vic_irq(irq); +- +- return 0; +-} +- +-/* The enable and disable routines. This is where we run into +- * conflicting architectural philosophy. Fundamentally, the voyager +- * architecture does not expect to have to disable interrupts globally +- * (the IRQ controllers belong to each CPU). The processor masquerade +- * which is used to start the system shouldn't be used in a running OS +- * since it will cause great confusion if two separate CPUs drive to +- * the same IRQ controller (I know, I've tried it). +- * +- * The solution is a variant on the NCR lazy SPL design: +- * +- * 1) To disable an interrupt, do nothing (other than set the +- * IRQ_DISABLED flag). This dares the interrupt actually to arrive. +- * +- * 2) If the interrupt dares to come in, raise the local mask against +- * it (this will result in all the CPU masks being raised +- * eventually). 
+- * +- * 3) To enable the interrupt, lower the mask on the local CPU and +- * broadcast an Interrupt enable CPI which causes all other CPUs to +- * adjust their masks accordingly. */ +- +-static void unmask_vic_irq(unsigned int irq) +-{ +- /* linux doesn't to processor-irq affinity, so enable on +- * all CPUs we know about */ +- int cpu = smp_processor_id(), real_cpu; +- __u16 mask = (1 << irq); +- __u32 processorList = 0; +- unsigned long flags; +- +- VDEBUG(("VOYAGER: unmask_vic_irq(%d) CPU%d affinity 0x%lx\n", +- irq, cpu, cpu_irq_affinity[cpu])); +- spin_lock_irqsave(&vic_irq_lock, flags); +- for_each_online_cpu(real_cpu) { +- if (!(voyager_extended_vic_processors & (1 << real_cpu))) +- continue; +- if (!(cpu_irq_affinity[real_cpu] & mask)) { +- /* irq has no affinity for this CPU, ignore */ +- continue; +- } +- if (real_cpu == cpu) { +- enable_local_vic_irq(irq); +- } else if (vic_irq_mask[real_cpu] & mask) { +- vic_irq_enable_mask[real_cpu] |= mask; +- processorList |= (1 << real_cpu); +- } +- } +- spin_unlock_irqrestore(&vic_irq_lock, flags); +- if (processorList) +- send_CPI(processorList, VIC_ENABLE_IRQ_CPI); +-} +- +-static void mask_vic_irq(unsigned int irq) +-{ +- /* lazy disable, do nothing */ +-} +- +-static void enable_local_vic_irq(unsigned int irq) +-{ +- __u8 cpu = smp_processor_id(); +- __u16 mask = ~(1 << irq); +- __u16 old_mask = vic_irq_mask[cpu]; +- +- vic_irq_mask[cpu] &= mask; +- if (vic_irq_mask[cpu] == old_mask) +- return; +- +- VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n", +- irq, cpu)); +- +- if (irq & 8) { +- outb_p(cached_A1(cpu), 0xA1); +- (void)inb_p(0xA1); +- } else { +- outb_p(cached_21(cpu), 0x21); +- (void)inb_p(0x21); +- } +-} +- +-static void disable_local_vic_irq(unsigned int irq) +-{ +- __u8 cpu = smp_processor_id(); +- __u16 mask = (1 << irq); +- __u16 old_mask = vic_irq_mask[cpu]; +- +- if (irq == 7) +- return; +- +- vic_irq_mask[cpu] |= mask; +- if (old_mask == vic_irq_mask[cpu]) +- return; +- +- VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n", +- irq, cpu)); +- +- if (irq & 8) { +- outb_p(cached_A1(cpu), 0xA1); +- (void)inb_p(0xA1); +- } else { +- outb_p(cached_21(cpu), 0x21); +- (void)inb_p(0x21); +- } +-} +- +-/* The VIC is level triggered, so the ack can only be issued after the +- * interrupt completes. However, we do Voyager lazy interrupt +- * handling here: It is an extremely expensive operation to mask an +- * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If +- * this interrupt actually comes in, then we mask and ack here to push +- * the interrupt off to another CPU */ +-static void before_handle_vic_irq(unsigned int irq) +-{ +- irq_desc_t *desc = irq_to_desc(irq); +- __u8 cpu = smp_processor_id(); +- +- _raw_spin_lock(&vic_irq_lock); +- vic_intr_total++; +- vic_intr_count[cpu]++; +- +- if (!(cpu_irq_affinity[cpu] & (1 << irq))) { +- /* The irq is not in our affinity mask, push it off +- * onto another CPU */ +- VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d " +- "on cpu %d\n", irq, cpu)); +- disable_local_vic_irq(irq); +- /* set IRQ_INPROGRESS to prevent the handler in irq.c from +- * actually calling the interrupt routine */ +- desc->status |= IRQ_REPLAY | IRQ_INPROGRESS; +- } else if (desc->status & IRQ_DISABLED) { +- /* Damn, the interrupt actually arrived, do the lazy +- * disable thing. 
The interrupt routine in irq.c will +- * not handle a IRQ_DISABLED interrupt, so nothing more +- * need be done here */ +- VDEBUG(("VOYAGER DEBUG: lazy disable of irq %d on CPU %d\n", +- irq, cpu)); +- disable_local_vic_irq(irq); +- desc->status |= IRQ_REPLAY; +- } else { +- desc->status &= ~IRQ_REPLAY; +- } +- +- _raw_spin_unlock(&vic_irq_lock); +-} +- +-/* Finish the VIC interrupt: basically mask */ +-static void after_handle_vic_irq(unsigned int irq) +-{ +- irq_desc_t *desc = irq_to_desc(irq); +- +- _raw_spin_lock(&vic_irq_lock); +- { +- unsigned int status = desc->status & ~IRQ_INPROGRESS; +-#ifdef VOYAGER_DEBUG +- __u16 isr; +-#endif +- +- desc->status = status; +- if ((status & IRQ_DISABLED)) +- disable_local_vic_irq(irq); +-#ifdef VOYAGER_DEBUG +- /* DEBUG: before we ack, check what's in progress */ +- isr = vic_read_isr(); +- if ((isr & (1 << irq) && !(status & IRQ_REPLAY)) == 0) { +- int i; +- __u8 cpu = smp_processor_id(); +- __u8 real_cpu; +- int mask; /* Um... initialize me??? --RR */ +- +- printk("VOYAGER SMP: CPU%d lost interrupt %d\n", +- cpu, irq); +- for_each_possible_cpu(real_cpu, mask) { +- +- outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu, +- VIC_PROCESSOR_ID); +- isr = vic_read_isr(); +- if (isr & (1 << irq)) { +- printk +- ("VOYAGER SMP: CPU%d ack irq %d\n", +- real_cpu, irq); +- ack_vic_irq(irq); +- } +- outb(cpu, VIC_PROCESSOR_ID); +- } +- } +-#endif /* VOYAGER_DEBUG */ +- /* as soon as we ack, the interrupt is eligible for +- * receipt by another CPU so everything must be in +- * order here */ +- ack_vic_irq(irq); +- if (status & IRQ_REPLAY) { +- /* replay is set if we disable the interrupt +- * in the before_handle_vic_irq() routine, so +- * clear the in progress bit here to allow the +- * next CPU to handle this correctly */ +- desc->status &= ~(IRQ_REPLAY | IRQ_INPROGRESS); +- } +-#ifdef VOYAGER_DEBUG +- isr = vic_read_isr(); +- if ((isr & (1 << irq)) != 0) +- printk("VOYAGER SMP: after_handle_vic_irq() after " +- "ack irq=%d, isr=0x%x\n", irq, isr); +-#endif /* VOYAGER_DEBUG */ +- } +- _raw_spin_unlock(&vic_irq_lock); +- +- /* All code after this point is out of the main path - the IRQ +- * may be intercepted by another CPU if reasserted */ +-} +- +-/* Linux processor - interrupt affinity manipulations. +- * +- * For each processor, we maintain a 32 bit irq affinity mask. +- * Initially it is set to all 1's so every processor accepts every +- * interrupt. In this call, we change the processor's affinity mask: +- * +- * Change from enable to disable: +- * +- * If the interrupt ever comes in to the processor, we will disable it +- * and ack it to push it off to another CPU, so just accept the mask here. +- * +- * Change from disable to enable: +- * +- * change the mask and then do an interrupt enable CPI to re-enable on +- * the selected processors */ +- +-void set_vic_irq_affinity(unsigned int irq, const struct cpumask *mask) +-{ +- /* Only extended processors handle interrupts */ +- unsigned long real_mask; +- unsigned long irq_mask = 1 << irq; +- int cpu; +- +- real_mask = cpus_addr(*mask)[0] & voyager_extended_vic_processors; +- +- if (cpus_addr(*mask)[0] == 0) +- /* can't have no CPUs to accept the interrupt -- extremely +- * bad things will happen */ +- return; +- +- if (irq == 0) +- /* can't change the affinity of the timer IRQ. This +- * is due to the constraint in the voyager +- * architecture that the CPI also comes in on and IRQ +- * line and we have chosen IRQ0 for this. 
If you +- * raise the mask on this interrupt, the processor +- * will no-longer be able to accept VIC CPIs */ +- return; +- +- if (irq >= 32) +- /* You can only have 32 interrupts in a voyager system +- * (and 32 only if you have a secondary microchannel +- * bus) */ +- return; +- +- for_each_online_cpu(cpu) { +- unsigned long cpu_mask = 1 << cpu; +- +- if (cpu_mask & real_mask) { +- /* enable the interrupt for this cpu */ +- cpu_irq_affinity[cpu] |= irq_mask; +- } else { +- /* disable the interrupt for this cpu */ +- cpu_irq_affinity[cpu] &= ~irq_mask; +- } +- } +- /* this is magic, we now have the correct affinity maps, so +- * enable the interrupt. This will send an enable CPI to +- * those CPUs who need to enable it in their local masks, +- * causing them to correct for the new affinity . If the +- * interrupt is currently globally disabled, it will simply be +- * disabled again as it comes in (voyager lazy disable). If +- * the affinity map is tightened to disable the interrupt on a +- * cpu, it will be pushed off when it comes in */ +- unmask_vic_irq(irq); +-} +- +-static void ack_vic_irq(unsigned int irq) +-{ +- if (irq & 8) { +- outb(0x62, 0x20); /* Specific EOI to cascade */ +- outb(0x60 | (irq & 7), 0xA0); +- } else { +- outb(0x60 | (irq & 7), 0x20); +- } +-} +- +-/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259 +- * but are not vectored by it. This means that the 8259 mask must be +- * lowered to receive them */ +-static __init void vic_enable_cpi(void) +-{ +- __u8 cpu = smp_processor_id(); +- +- /* just take a copy of the current mask (nop for boot cpu) */ +- vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id]; +- +- enable_local_vic_irq(VIC_CPI_LEVEL0); +- enable_local_vic_irq(VIC_CPI_LEVEL1); +- /* for sys int and cmn int */ +- enable_local_vic_irq(7); +- +- if (is_cpu_quad()) { +- outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); +- outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); +- VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n", +- cpu, QIC_CPI_ENABLE)); +- } +- +- VDEBUG(("VOYAGER SMP: ENABLE CPI: CPU%d: MASK 0x%x\n", +- cpu, vic_irq_mask[cpu])); +-} +- +-void voyager_smp_dump() +-{ +- int old_cpu = smp_processor_id(), cpu; +- +- /* dump the interrupt masks of each processor */ +- for_each_online_cpu(cpu) { +- __u16 imr, isr, irr; +- unsigned long flags; +- +- local_irq_save(flags); +- outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID); +- imr = (inb(0xa1) << 8) | inb(0x21); +- outb(0x0a, 0xa0); +- irr = inb(0xa0) << 8; +- outb(0x0a, 0x20); +- irr |= inb(0x20); +- outb(0x0b, 0xa0); +- isr = inb(0xa0) << 8; +- outb(0x0b, 0x20); +- isr |= inb(0x20); +- outb(old_cpu, VIC_PROCESSOR_ID); +- local_irq_restore(flags); +- printk("\tCPU%d: mask=0x%x, IMR=0x%x, IRR=0x%x, ISR=0x%x\n", +- cpu, vic_irq_mask[cpu], imr, irr, isr); +-#if 0 +- /* These lines are put in to try to unstick an un ack'd irq */ +- if (isr != 0) { +- int irq; +- for (irq = 0; irq < 16; irq++) { +- if (isr & (1 << irq)) { +- printk("\tCPU%d: ack irq %d\n", +- cpu, irq); +- local_irq_save(flags); +- outb(VIC_CPU_MASQUERADE_ENABLE | cpu, +- VIC_PROCESSOR_ID); +- ack_vic_irq(irq); +- outb(old_cpu, VIC_PROCESSOR_ID); +- local_irq_restore(flags); +- } +- } +- } +-#endif +- } +-} +- +-void smp_voyager_power_off(void *dummy) +-{ +- if (smp_processor_id() == boot_cpu_id) +- voyager_power_off(); +- else +- smp_stop_cpu_function(NULL); +-} +- +-static void __init voyager_smp_prepare_cpus(unsigned int max_cpus) +-{ +- /* FIXME: ignore max_cpus for now */ +- smp_boot_cpus(); +-} +- +-static void __cpuinit 
voyager_smp_prepare_boot_cpu(void) +-{ +- init_gdt(smp_processor_id()); +- switch_to_new_gdt(); +- +- cpu_online_map = cpumask_of_cpu(smp_processor_id()); +- cpu_callout_map = cpumask_of_cpu(smp_processor_id()); +- cpu_callin_map = CPU_MASK_NONE; +- cpu_present_map = cpumask_of_cpu(smp_processor_id()); +- +-} +- +-static int __cpuinit voyager_cpu_up(unsigned int cpu) +-{ +- /* This only works at boot for x86. See "rewrite" above. */ +- if (cpu_isset(cpu, smp_commenced_mask)) +- return -ENOSYS; +- +- /* In case one didn't come up */ +- if (!cpu_isset(cpu, cpu_callin_map)) +- return -EIO; +- /* Unleash the CPU! */ +- cpu_set(cpu, smp_commenced_mask); +- while (!cpu_online(cpu)) +- mb(); +- return 0; +-} +- +-static void __init voyager_smp_cpus_done(unsigned int max_cpus) +-{ +- zap_low_mappings(); +-} +- +-void __init smp_setup_processor_id(void) +-{ +- current_thread_info()->cpu = hard_smp_processor_id(); +- x86_write_percpu(cpu_number, hard_smp_processor_id()); +-} +- +-static void voyager_send_call_func(const struct cpumask *callmask) +-{ +- __u32 mask = cpus_addr(*callmask)[0] & ~(1 << smp_processor_id()); +- send_CPI(mask, VIC_CALL_FUNCTION_CPI); +-} +- +-static void voyager_send_call_func_single(int cpu) +-{ +- send_CPI(1 << cpu, VIC_CALL_FUNCTION_SINGLE_CPI); +-} +- +-struct smp_ops smp_ops = { +- .smp_prepare_boot_cpu = voyager_smp_prepare_boot_cpu, +- .smp_prepare_cpus = voyager_smp_prepare_cpus, +- .cpu_up = voyager_cpu_up, +- .smp_cpus_done = voyager_smp_cpus_done, +- +- .smp_send_stop = voyager_smp_send_stop, +- .smp_send_reschedule = voyager_smp_send_reschedule, +- +- .send_call_func_ipi = voyager_send_call_func, +- .send_call_func_single_ipi = voyager_send_call_func_single, +-}; +Index: linux-2.6-tip/arch/x86/mach-voyager/voyager_thread.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mach-voyager/voyager_thread.c ++++ /dev/null +@@ -1,128 +0,0 @@ +-/* -*- mode: c; c-basic-offset: 8 -*- */ +- +-/* Copyright (C) 2001 +- * +- * Author: J.E.J.Bottomley@HansenPartnership.com +- * +- * This module provides the machine status monitor thread for the +- * voyager architecture. This allows us to monitor the machine +- * environment (temp, voltage, fan function) and the front panel and +- * internal UPS. 
If a fault is detected, this thread takes corrective +- * action (usually just informing init) +- * */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-struct task_struct *voyager_thread; +-static __u8 set_timeout; +- +-static int execute(const char *string) +-{ +- int ret; +- +- char *envp[] = { +- "HOME=/", +- "TERM=linux", +- "PATH=/sbin:/usr/sbin:/bin:/usr/bin", +- NULL, +- }; +- char *argv[] = { +- "/bin/bash", +- "-c", +- (char *)string, +- NULL, +- }; +- +- if ((ret = +- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { +- printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, +- ret); +- } +- return ret; +-} +- +-static void check_from_kernel(void) +-{ +- if (voyager_status.switch_off) { +- +- /* FIXME: This should be configurable via proc */ +- execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1"); +- } else if (voyager_status.power_fail) { +- VDEBUG(("Voyager daemon detected AC power failure\n")); +- +- /* FIXME: This should be configureable via proc */ +- execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1"); +- set_timeout = 1; +- } +-} +- +-static void check_continuing_condition(void) +-{ +- if (voyager_status.power_fail) { +- __u8 data; +- voyager_cat_psi(VOYAGER_PSI_SUBREAD, +- VOYAGER_PSI_AC_FAIL_REG, &data); +- if ((data & 0x1f) == 0) { +- /* all power restored */ +- printk(KERN_NOTICE +- "VOYAGER AC power restored, cancelling shutdown\n"); +- /* FIXME: should be user configureable */ +- execute +- ("umask 600; echo O > /etc/powerstatus; kill -PWR 1"); +- set_timeout = 0; +- } +- } +-} +- +-static int thread(void *unused) +-{ +- printk(KERN_NOTICE "Voyager starting monitor thread\n"); +- +- for (;;) { +- set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT); +- +- VDEBUG(("Voyager Daemon awoken\n")); +- if (voyager_status.request_from_kernel == 0) { +- /* probably awoken from timeout */ +- check_continuing_condition(); +- } else { +- check_from_kernel(); +- voyager_status.request_from_kernel = 0; +- } +- } +-} +- +-static int __init voyager_thread_start(void) +-{ +- voyager_thread = kthread_run(thread, NULL, "kvoyagerd"); +- if (IS_ERR(voyager_thread)) { +- printk(KERN_ERR +- "Voyager: Failed to create system monitor thread.\n"); +- return PTR_ERR(voyager_thread); +- } +- return 0; +-} +- +-static void __exit voyager_thread_stop(void) +-{ +- kthread_stop(voyager_thread); +-} +- +-module_init(voyager_thread_start); +-module_exit(voyager_thread_stop); +Index: linux-2.6-tip/arch/x86/math-emu/get_address.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/math-emu/get_address.c ++++ linux-2.6-tip/arch/x86/math-emu/get_address.c +@@ -150,11 +150,9 @@ static long pm_address(u_char FPU_modrm, + #endif /* PARANOID */ + + switch (segment) { +- /* gs isn't used by the kernel, so it still has its +- user-space value. */ + case PREFIX_GS_ - 1: +- /* N.B. 
- movl %seg, mem is a 2 byte write regardless of prefix */ +- savesegment(gs, addr->selector); ++ /* user gs handling can be lazy, use special accessors */ ++ addr->selector = get_user_gs(FPU_info->regs); + break; + default: + addr->selector = PM_REG_(segment); +Index: linux-2.6-tip/arch/x86/mm/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/Makefile ++++ linux-2.6-tip/arch/x86/mm/Makefile +@@ -1,6 +1,8 @@ +-obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ ++obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ + pat.o pgtable.o gup.o + ++obj-$(CONFIG_SMP) += tlb.o ++ + obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o + + obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +@@ -8,11 +10,13 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetab + + obj-$(CONFIG_HIGHMEM) += highmem_32.o + ++obj-$(CONFIG_KMEMCHECK) += kmemcheck/ ++ + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o + mmiotrace-y := kmmio.o pf_in.o mmio-mod.o + obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o + +-obj-$(CONFIG_NUMA) += numa_$(BITS).o ++obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o + obj-$(CONFIG_K8_NUMA) += k8topology_64.o + obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o + +Index: linux-2.6-tip/arch/x86/mm/extable.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/extable.c ++++ linux-2.6-tip/arch/x86/mm/extable.c +@@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs + + fixup = search_exception_tables(regs->ip); + if (fixup) { ++ /* If fixup is less than 16, it means uaccess error */ ++ if (fixup->fixup < 16) { ++ current_thread_info()->uaccess_err = -EFAULT; ++ regs->ip += fixup->fixup; ++ return 1; ++ } + regs->ip = fixup->fixup; + return 1; + } +Index: linux-2.6-tip/arch/x86/mm/fault.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/fault.c ++++ linux-2.6-tip/arch/x86/mm/fault.c +@@ -1,73 +1,81 @@ + /* + * Copyright (C) 1995 Linus Torvalds +- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. ++ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 
++ * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar + */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include + #include +-#include +-#include +-#include /* For unblank_screen() */ ++#include ++#include + #include + #include +-#include /* for max_low_pfn */ +-#include +-#include + #include + #include ++#include ++#include ++#include ++#include ++#include ++#include ++#include + #include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include + +-#include +-#include +-#include +-#include +-#include ++#include ++ ++#include + #include ++#include ++#include ++#include + #include +-#include + #include ++#include + + /* +- * Page fault error code bits +- * bit 0 == 0 means no page found, 1 means protection fault +- * bit 1 == 0 means read, 1 means write +- * bit 2 == 0 means kernel, 1 means user-mode +- * bit 3 == 1 means use of reserved bit detected +- * bit 4 == 1 means fault was an instruction fetch +- */ +-#define PF_PROT (1<<0) +-#define PF_WRITE (1<<1) +-#define PF_USER (1<<2) +-#define PF_RSVD (1<<3) +-#define PF_INSTR (1<<4) ++ * Page fault error code bits: ++ * ++ * bit 0 == 0: no page found 1: protection fault ++ * bit 1 == 0: read access 1: write access ++ * bit 2 == 0: kernel-mode access 1: user-mode access ++ * bit 3 == 1: use of reserved bit detected ++ * bit 4 == 1: fault was an instruction fetch ++ */ ++enum x86_pf_error_code { ++ ++ PF_PROT = 1 << 0, ++ PF_WRITE = 1 << 1, ++ PF_USER = 1 << 2, ++ PF_RSVD = 1 << 3, ++ PF_INSTR = 1 << 4, ++}; + ++/* ++ * Returns 0 if mmiotrace is disabled, or if the fault is not ++ * handled by mmiotrace: ++ */ + static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) + { +-#ifdef CONFIG_MMIOTRACE + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; +-#endif + return 0; + } + + static inline int notify_page_fault(struct pt_regs *regs) + { +-#ifdef CONFIG_KPROBES + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ +- if (!user_mode_vm(regs)) { ++ if (kprobes_built_in() && !user_mode_vm(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; +@@ -75,29 +83,76 @@ static inline int notify_page_fault(stru + } + + return ret; +-#else +- return 0; +-#endif + } + + /* +- * X86_32 +- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. +- * Check that here and ignore it. +- * +- * X86_64 +- * Sometimes the CPU reports invalid exceptions on prefetch. +- * Check that here and ignore it. ++ * Prefetch quirks: ++ * ++ * 32-bit mode: ++ * ++ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. ++ * Check that here and ignore it. ++ * ++ * 64-bit mode: ++ * ++ * Sometimes the CPU reports invalid exceptions on prefetch. ++ * Check that here and ignore it. + * +- * Opcode checker based on code by Richard Brunner ++ * Opcode checker based on code by Richard Brunner. + */ +-static int is_prefetch(struct pt_regs *regs, unsigned long addr, +- unsigned long error_code) ++static inline int ++check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, ++ unsigned char opcode, int *prefetch) + { ++ unsigned char instr_hi = opcode & 0xf0; ++ unsigned char instr_lo = opcode & 0x0f; ++ ++ switch (instr_hi) { ++ case 0x20: ++ case 0x30: ++ /* ++ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. 
++ * In X86_64 long mode, the CPU will signal invalid ++ * opcode if some of these prefixes are present so ++ * X86_64 will never get here anyway ++ */ ++ return ((instr_lo & 7) == 0x6); ++#ifdef CONFIG_X86_64 ++ case 0x40: ++ /* ++ * In AMD64 long mode 0x40..0x4F are valid REX prefixes ++ * Need to figure out under what instruction mode the ++ * instruction was issued. Could check the LDT for lm, ++ * but for now it's good enough to assume that long ++ * mode only uses well known segments or kernel. ++ */ ++ return (!user_mode(regs)) || (regs->cs == __USER_CS); ++#endif ++ case 0x60: ++ /* 0x64 thru 0x67 are valid prefixes in all modes. */ ++ return (instr_lo & 0xC) == 0x4; ++ case 0xF0: ++ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ ++ return !instr_lo || (instr_lo>>1) == 1; ++ case 0x00: ++ /* Prefetch instruction is 0x0F0D or 0x0F18 */ ++ if (probe_kernel_address(instr, opcode)) ++ return 0; ++ ++ *prefetch = (instr_lo == 0xF) && ++ (opcode == 0x0D || opcode == 0x18); ++ return 0; ++ default: ++ return 0; ++ } ++} ++ ++static int ++is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) ++{ ++ unsigned char *max_instr; + unsigned char *instr; +- int scan_more = 1; + int prefetch = 0; +- unsigned char *max_instr; + + /* + * If it was a exec (instruction fetch) fault on NX page, then +@@ -106,106 +161,170 @@ static int is_prefetch(struct pt_regs *r + if (error_code & PF_INSTR) + return 0; + +- instr = (unsigned char *)convert_ip_to_linear(current, regs); ++ instr = (void *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + +- while (scan_more && instr < max_instr) { ++ while (instr < max_instr) { + unsigned char opcode; +- unsigned char instr_hi; +- unsigned char instr_lo; + + if (probe_kernel_address(instr, opcode)) + break; + +- instr_hi = opcode & 0xf0; +- instr_lo = opcode & 0x0f; + instr++; + +- switch (instr_hi) { +- case 0x20: +- case 0x30: +- /* +- * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. +- * In X86_64 long mode, the CPU will signal invalid +- * opcode if some of these prefixes are present so +- * X86_64 will never get here anyway +- */ +- scan_more = ((instr_lo & 7) == 0x6); +- break; +-#ifdef CONFIG_X86_64 +- case 0x40: +- /* +- * In AMD64 long mode 0x40..0x4F are valid REX prefixes +- * Need to figure out under what instruction mode the +- * instruction was issued. Could check the LDT for lm, +- * but for now it's good enough to assume that long +- * mode only uses well known segments or kernel. +- */ +- scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); ++ if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) + break; +-#endif +- case 0x60: +- /* 0x64 thru 0x67 are valid prefixes in all modes. */ +- scan_more = (instr_lo & 0xC) == 0x4; +- break; +- case 0xF0: +- /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. 
*/ +- scan_more = !instr_lo || (instr_lo>>1) == 1; +- break; +- case 0x00: +- /* Prefetch instruction is 0x0F0D or 0x0F18 */ +- scan_more = 0; +- +- if (probe_kernel_address(instr, opcode)) +- break; +- prefetch = (instr_lo == 0xF) && +- (opcode == 0x0D || opcode == 0x18); +- break; +- default: +- scan_more = 0; +- break; +- } + } + return prefetch; + } + +-static void force_sig_info_fault(int si_signo, int si_code, +- unsigned long address, struct task_struct *tsk) ++static void ++force_sig_info_fault(int si_signo, int si_code, unsigned long address, ++ struct task_struct *tsk) + { + siginfo_t info; + +- info.si_signo = si_signo; +- info.si_errno = 0; +- info.si_code = si_code; +- info.si_addr = (void __user *)address; ++ info.si_signo = si_signo; ++ info.si_errno = 0; ++ info.si_code = si_code; ++ info.si_addr = (void __user *)address; ++ + force_sig_info(si_signo, &info, tsk); + } + +-#ifdef CONFIG_X86_64 +-static int bad_address(void *p) ++DEFINE_SPINLOCK(pgd_lock); ++LIST_HEAD(pgd_list); ++ ++#ifdef CONFIG_X86_32 ++static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) + { +- unsigned long dummy; +- return probe_kernel_address((unsigned long *)p, dummy); ++ unsigned index = pgd_index(address); ++ pgd_t *pgd_k; ++ pud_t *pud, *pud_k; ++ pmd_t *pmd, *pmd_k; ++ ++ pgd += index; ++ pgd_k = init_mm.pgd + index; ++ ++ if (!pgd_present(*pgd_k)) ++ return NULL; ++ ++ /* ++ * set_pgd(pgd, *pgd_k); here would be useless on PAE ++ * and redundant with the set_pmd() on non-PAE. As would ++ * set_pud. ++ */ ++ pud = pud_offset(pgd, address); ++ pud_k = pud_offset(pgd_k, address); ++ if (!pud_present(*pud_k)) ++ return NULL; ++ ++ pmd = pmd_offset(pud, address); ++ pmd_k = pmd_offset(pud_k, address); ++ if (!pmd_present(*pmd_k)) ++ return NULL; ++ ++ if (!pmd_present(*pmd)) { ++ set_pmd(pmd, *pmd_k); ++ arch_flush_lazy_mmu_mode(); ++ } else { ++ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); ++ } ++ ++ return pmd_k; ++} ++ ++void vmalloc_sync_all(void) ++{ ++ unsigned long address; ++ ++ if (SHARED_KERNEL_PMD) ++ return; ++ ++ for (address = VMALLOC_START & PMD_MASK; ++ address >= TASK_SIZE && address < FIXADDR_TOP; ++ address += PMD_SIZE) { ++ ++ unsigned long flags; ++ struct page *page; ++ ++ spin_lock_irqsave(&pgd_lock, flags); ++ list_for_each_entry(page, &pgd_list, lru) { ++ if (!vmalloc_sync_one(page_address(page), address)) ++ break; ++ } ++ spin_unlock_irqrestore(&pgd_lock, flags); ++ } ++} ++ ++/* ++ * 32-bit: ++ * ++ * Handle a fault on the vmalloc or module mapping area ++ */ ++static noinline int vmalloc_fault(unsigned long address) ++{ ++ unsigned long pgd_paddr; ++ pmd_t *pmd_k; ++ pte_t *pte_k; ++ ++ /* Make sure we are in vmalloc area: */ ++ if (!(address >= VMALLOC_START && address < VMALLOC_END)) ++ return -1; ++ ++ /* ++ * Synchronize this task's top level page-table ++ * with the 'reference' page table. ++ * ++ * Do _not_ use "current" here. We might be inside ++ * an interrupt in the middle of a task switch.. ++ */ ++ pgd_paddr = read_cr3(); ++ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); ++ if (!pmd_k) ++ return -1; ++ ++ pte_k = pte_offset_kernel(pmd_k, address); ++ if (!pte_present(*pte_k)) ++ return -1; ++ ++ return 0; ++} ++ ++/* ++ * Did it hit the DOS screen memory VA from vm86 mode? 
++ */ ++static inline void ++check_v8086_mode(struct pt_regs *regs, unsigned long address, ++ struct task_struct *tsk) ++{ ++ unsigned long bit; ++ ++ if (!v8086_mode(regs)) ++ return; ++ ++ bit = (address - 0xA0000) >> PAGE_SHIFT; ++ if (bit < 32) ++ tsk->thread.screen_bitmap |= 1 << bit; + } +-#endif + + static void dump_pagetable(unsigned long address) + { +-#ifdef CONFIG_X86_32 + __typeof__(pte_val(__pte(0))) page; + + page = read_cr3(); + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; ++ + #ifdef CONFIG_X86_PAE + printk("*pdpt = %016Lx ", page); + if ((page >> PAGE_SHIFT) < max_low_pfn + && page & _PAGE_PRESENT) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) +- & (PTRS_PER_PMD - 1)]; ++ & (PTRS_PER_PMD - 1)]; + printk(KERN_CONT "*pde = %016Lx ", page); + page &= ~_PAGE_NX; + } +@@ -217,19 +336,145 @@ static void dump_pagetable(unsigned long + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case +- * it's allocated already. ++ * it's allocated already: + */ + if ((page >> PAGE_SHIFT) < max_low_pfn + && (page & _PAGE_PRESENT) + && !(page & _PAGE_PSE)) { ++ + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) +- & (PTRS_PER_PTE - 1)]; ++ & (PTRS_PER_PTE - 1)]; + printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); + } + + printk("\n"); +-#else /* CONFIG_X86_64 */ ++} ++ ++#else /* CONFIG_X86_64: */ ++ ++void vmalloc_sync_all(void) ++{ ++ unsigned long address; ++ ++ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; ++ address += PGDIR_SIZE) { ++ ++ const pgd_t *pgd_ref = pgd_offset_k(address); ++ unsigned long flags; ++ struct page *page; ++ ++ if (pgd_none(*pgd_ref)) ++ continue; ++ ++ spin_lock_irqsave(&pgd_lock, flags); ++ list_for_each_entry(page, &pgd_list, lru) { ++ pgd_t *pgd; ++ pgd = (pgd_t *)page_address(page) + pgd_index(address); ++ if (pgd_none(*pgd)) ++ set_pgd(pgd, *pgd_ref); ++ else ++ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); ++ } ++ spin_unlock_irqrestore(&pgd_lock, flags); ++ } ++} ++ ++/* ++ * 64-bit: ++ * ++ * Handle a fault on the vmalloc area ++ * ++ * This assumes no large pages in there. ++ */ ++static noinline int vmalloc_fault(unsigned long address) ++{ ++ pgd_t *pgd, *pgd_ref; ++ pud_t *pud, *pud_ref; ++ pmd_t *pmd, *pmd_ref; ++ pte_t *pte, *pte_ref; ++ ++ /* Make sure we are in vmalloc area: */ ++ if (!(address >= VMALLOC_START && address < VMALLOC_END)) ++ return -1; ++ ++ /* ++ * Copy kernel mappings over when needed. This can also ++ * happen within a race in page table update. 
In the later ++ * case just flush: ++ */ ++ pgd = pgd_offset(current->active_mm, address); ++ pgd_ref = pgd_offset_k(address); ++ if (pgd_none(*pgd_ref)) ++ return -1; ++ ++ if (pgd_none(*pgd)) ++ set_pgd(pgd, *pgd_ref); ++ else ++ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); ++ ++ /* ++ * Below here mismatches are bugs because these lower tables ++ * are shared: ++ */ ++ ++ pud = pud_offset(pgd, address); ++ pud_ref = pud_offset(pgd_ref, address); ++ if (pud_none(*pud_ref)) ++ return -1; ++ ++ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) ++ BUG(); ++ ++ pmd = pmd_offset(pud, address); ++ pmd_ref = pmd_offset(pud_ref, address); ++ if (pmd_none(*pmd_ref)) ++ return -1; ++ ++ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) ++ BUG(); ++ ++ pte_ref = pte_offset_kernel(pmd_ref, address); ++ if (!pte_present(*pte_ref)) ++ return -1; ++ ++ pte = pte_offset_kernel(pmd, address); ++ ++ /* ++ * Don't use pte_page here, because the mappings can point ++ * outside mem_map, and the NUMA hash lookup cannot handle ++ * that: ++ */ ++ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) ++ BUG(); ++ ++ return 0; ++} ++ ++static const char errata93_warning[] = ++KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" ++KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" ++KERN_ERR "******* Please consider a BIOS update.\n" ++KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; ++ ++/* ++ * No vm86 mode in 64-bit mode: ++ */ ++static inline void ++check_v8086_mode(struct pt_regs *regs, unsigned long address, ++ struct task_struct *tsk) ++{ ++} ++ ++static int bad_address(void *p) ++{ ++ unsigned long dummy; ++ ++ return probe_kernel_address((unsigned long *)p, dummy); ++} ++ ++static void dump_pagetable(unsigned long address) ++{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; +@@ -238,102 +483,77 @@ static void dump_pagetable(unsigned long + pgd = (pgd_t *)read_cr3(); + + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); ++ + pgd += pgd_index(address); +- if (bad_address(pgd)) goto bad; ++ if (bad_address(pgd)) ++ goto bad; ++ + printk("PGD %lx ", pgd_val(*pgd)); +- if (!pgd_present(*pgd)) goto ret; ++ ++ if (!pgd_present(*pgd)) ++ goto out; + + pud = pud_offset(pgd, address); +- if (bad_address(pud)) goto bad; ++ if (bad_address(pud)) ++ goto bad; ++ + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud) || pud_large(*pud)) +- goto ret; ++ goto out; + + pmd = pmd_offset(pud, address); +- if (bad_address(pmd)) goto bad; ++ if (bad_address(pmd)) ++ goto bad; ++ + printk("PMD %lx ", pmd_val(*pmd)); +- if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; ++ if (!pmd_present(*pmd) || pmd_large(*pmd)) ++ goto out; + + pte = pte_offset_kernel(pmd, address); +- if (bad_address(pte)) goto bad; ++ if (bad_address(pte)) ++ goto bad; ++ + printk("PTE %lx", pte_val(*pte)); +-ret: ++out: + printk("\n"); + return; + bad: + printk("BAD\n"); +-#endif + } + +-#ifdef CONFIG_X86_32 +-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +-{ +- unsigned index = pgd_index(address); +- pgd_t *pgd_k; +- pud_t *pud, *pud_k; +- pmd_t *pmd, *pmd_k; +- +- pgd += index; +- pgd_k = init_mm.pgd + index; ++#endif /* CONFIG_X86_64 */ + +- if (!pgd_present(*pgd_k)) +- return NULL; +- +- /* +- * set_pgd(pgd, *pgd_k); here would be useless on PAE +- * and redundant with the set_pmd() on non-PAE. As would +- * set_pud. 
+- */ +- +- pud = pud_offset(pgd, address); +- pud_k = pud_offset(pgd_k, address); +- if (!pud_present(*pud_k)) +- return NULL; +- +- pmd = pmd_offset(pud, address); +- pmd_k = pmd_offset(pud_k, address); +- if (!pmd_present(*pmd_k)) +- return NULL; +- if (!pmd_present(*pmd)) { +- set_pmd(pmd, *pmd_k); +- arch_flush_lazy_mmu_mode(); +- } else +- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); +- return pmd_k; +-} +-#endif +- +-#ifdef CONFIG_X86_64 +-static const char errata93_warning[] = +-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +-KERN_ERR "******* Please consider a BIOS update.\n" +-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +-#endif +- +-/* Workaround for K8 erratum #93 & buggy BIOS. +- BIOS SMM functions are required to use a specific workaround +- to avoid corruption of the 64bit RIP register on C stepping K8. +- A lot of BIOS that didn't get tested properly miss this. +- The OS sees this as a page fault with the upper 32bits of RIP cleared. +- Try to work around it here. +- Note we only handle faults in kernel here. +- Does nothing for X86_32 ++/* ++ * Workaround for K8 erratum #93 & buggy BIOS. ++ * ++ * BIOS SMM functions are required to use a specific workaround ++ * to avoid corruption of the 64bit RIP register on C stepping K8. ++ * ++ * A lot of BIOS that didn't get tested properly miss this. ++ * ++ * The OS sees this as a page fault with the upper 32bits of RIP cleared. ++ * Try to work around it here. ++ * ++ * Note we only handle faults in kernel here. ++ * Does nothing on 32-bit. + */ + static int is_errata93(struct pt_regs *regs, unsigned long address) + { + #ifdef CONFIG_X86_64 +- static int warned; ++ static int once; ++ + if (address != regs->ip) + return 0; ++ + if ((address >> 32) != 0) + return 0; ++ + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { +- if (!warned) { ++ if (!once) { + printk(errata93_warning); +- warned = 1; ++ once = 1; + } + regs->ip = address; + return 1; +@@ -343,16 +563,17 @@ static int is_errata93(struct pt_regs *r + } + + /* +- * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal +- * addresses >4GB. We catch this in the page fault handler because these +- * addresses are not reachable. Just detect this case and return. Any code ++ * Work around K8 erratum #100 K8 in compat mode occasionally jumps ++ * to illegal addresses >4GB. ++ * ++ * We catch this in the page fault handler because these addresses ++ * are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. + */ + static int is_errata100(struct pt_regs *regs, unsigned long address) + { + #ifdef CONFIG_X86_64 +- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && +- (address >> 32)) ++ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) + return 1; + #endif + return 0; +@@ -362,13 +583,15 @@ static int is_f00f_bug(struct pt_regs *r + { + #ifdef CONFIG_X86_F00F_BUG + unsigned long nr; ++ + /* +- * Pentium F0 0F C7 C8 bug workaround. 
++ * Pentium F0 0F C7 C8 bug workaround: + */ + if (boot_cpu_data.f00f_bug) { + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { ++ zap_rt_locks(); + do_invalid_op(regs, 0); + return 1; + } +@@ -377,62 +600,277 @@ static int is_f00f_bug(struct pt_regs *r + return 0; + } + +-static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, +- unsigned long address) ++static const char nx_warning[] = KERN_CRIT ++"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; ++ ++static void ++show_fault_oops(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address) + { +-#ifdef CONFIG_X86_32 + if (!oops_may_print()) + return; +-#endif + +-#ifdef CONFIG_X86_PAE + if (error_code & PF_INSTR) { + unsigned int level; ++ + pte_t *pte = lookup_address(address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) +- printk(KERN_CRIT "kernel tried to execute " +- "NX-protected page - exploit attempt? " +- "(uid: %d)\n", current_uid()); ++ printk(nx_warning, current_uid()); + } +-#endif + + printk(KERN_ALERT "BUG: unable to handle kernel "); + if (address < PAGE_SIZE) + printk(KERN_CONT "NULL pointer dereference"); + else + printk(KERN_CONT "paging request"); ++ + printk(KERN_CONT " at %p\n", (void *) address); + printk(KERN_ALERT "IP:"); + printk_address(regs->ip, 1); ++ + dump_pagetable(address); + } + +-#ifdef CONFIG_X86_64 +-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, +- unsigned long error_code) ++static noinline void ++pgtable_bad(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address) + { +- unsigned long flags = oops_begin(); +- int sig = SIGKILL; + struct task_struct *tsk; ++ unsigned long flags; ++ int sig; ++ ++ flags = oops_begin(); ++ tsk = current; ++ sig = SIGKILL; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", +- current->comm, address); ++ tsk->comm, address); + dump_pagetable(address); +- tsk = current; +- tsk->thread.cr2 = address; +- tsk->thread.trap_no = 14; +- tsk->thread.error_code = error_code; ++ ++ tsk->thread.cr2 = address; ++ tsk->thread.trap_no = 14; ++ tsk->thread.error_code = error_code; ++ + if (__die("Bad pagetable", regs, error_code)) + sig = 0; ++ ++ oops_end(flags, regs, sig); ++} ++ ++static noinline void ++no_context(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address) ++{ ++ struct task_struct *tsk = current; ++ unsigned long *stackend; ++ unsigned long flags; ++ int sig; ++ ++ /* Are we prepared to handle this kernel fault? */ ++ if (fixup_exception(regs)) ++ return; ++ ++ /* ++ * 32-bit: ++ * ++ * Valid to do another page fault here, because if this fault ++ * had been triggered by is_prefetch fixup_exception would have ++ * handled it. ++ * ++ * 64-bit: ++ * ++ * Hall of shame of CPU/BIOS bugs. ++ */ ++ if (is_prefetch(regs, error_code, address)) ++ return; ++ ++ if (is_errata93(regs, address)) ++ return; ++ ++ /* ++ * Oops. The kernel tried to access some bad page. 
We'll have to ++ * terminate things with extreme prejudice: ++ */ ++ flags = oops_begin(); ++ ++ show_fault_oops(regs, error_code, address); ++ ++ stackend = end_of_stack(tsk); ++ if (*stackend != STACK_END_MAGIC) ++ printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); ++ ++ tsk->thread.cr2 = address; ++ tsk->thread.trap_no = 14; ++ tsk->thread.error_code = error_code; ++ ++ sig = SIGKILL; ++ if (__die("Oops", regs, error_code)) ++ sig = 0; ++ ++ /* Executive summary in case the body of the oops scrolled away */ ++ printk(KERN_EMERG "CR2: %016lx\n", address); ++ + oops_end(flags, regs, sig); + } +-#endif ++ ++/* ++ * Print out info about fatal segfaults, if the show_unhandled_signals ++ * sysctl is set: ++ */ ++static inline void ++show_signal_msg(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address, struct task_struct *tsk) ++{ ++ if (!unhandled_signal(tsk, SIGSEGV)) ++ return; ++ ++ if (!printk_ratelimit()) ++ return; ++ ++ printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", ++ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, ++ tsk->comm, task_pid_nr(tsk), address, ++ (void *)regs->ip, (void *)regs->sp, error_code); ++ ++ print_vma_addr(KERN_CONT " in ", regs->ip); ++ ++ printk(KERN_CONT "\n"); ++} ++ ++static void ++__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address, int si_code) ++{ ++ struct task_struct *tsk = current; ++ ++ /* User mode accesses just cause a SIGSEGV */ ++ if (error_code & PF_USER) { ++ /* ++ * It's possible to have interrupts off here: ++ */ ++ local_irq_enable(); ++ ++ /* ++ * Valid to do another page fault here because this one came ++ * from user space: ++ */ ++ if (is_prefetch(regs, error_code, address)) ++ return; ++ ++ if (is_errata100(regs, address)) ++ return; ++ ++ if (unlikely(show_unhandled_signals)) ++ show_signal_msg(regs, error_code, address, tsk); ++ ++ /* Kernel addresses are always protection faults: */ ++ tsk->thread.cr2 = address; ++ tsk->thread.error_code = error_code | (address >= TASK_SIZE); ++ tsk->thread.trap_no = 14; ++ ++ force_sig_info_fault(SIGSEGV, si_code, address, tsk); ++ ++ return; ++ } ++ ++ if (is_f00f_bug(regs, address)) ++ return; ++ ++ no_context(regs, error_code, address); ++} ++ ++static noinline void ++bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address) ++{ ++ __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); ++} ++ ++static void ++__bad_area(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address, int si_code) ++{ ++ struct mm_struct *mm = current->mm; ++ ++ /* ++ * Something tried to access memory that isn't in our memory map.. ++ * Fix it, but check if it's kernel or user first.. 
++ */ ++ up_read(&mm->mmap_sem); ++ ++ __bad_area_nosemaphore(regs, error_code, address, si_code); ++} ++ ++static noinline void ++bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) ++{ ++ __bad_area(regs, error_code, address, SEGV_MAPERR); ++} ++ ++static noinline void ++bad_area_access_error(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address) ++{ ++ __bad_area(regs, error_code, address, SEGV_ACCERR); ++} ++ ++/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ ++static void ++out_of_memory(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address) ++{ ++ /* ++ * We ran out of memory, call the OOM killer, and return the userspace ++ * (which will retry the fault, or kill us if we got oom-killed): ++ */ ++ up_read(¤t->mm->mmap_sem); ++ ++ pagefault_out_of_memory(); ++} ++ ++static void ++do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) ++{ ++ struct task_struct *tsk = current; ++ struct mm_struct *mm = tsk->mm; ++ ++ up_read(&mm->mmap_sem); ++ ++ /* Kernel mode? Handle exceptions or die: */ ++ if (!(error_code & PF_USER)) ++ no_context(regs, error_code, address); ++ ++ /* User-space => ok to do another page fault: */ ++ if (is_prefetch(regs, error_code, address)) ++ return; ++ ++ tsk->thread.cr2 = address; ++ tsk->thread.error_code = error_code; ++ tsk->thread.trap_no = 14; ++ ++ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); ++} ++ ++static noinline void ++mm_fault_error(struct pt_regs *regs, unsigned long error_code, ++ unsigned long address, unsigned int fault) ++{ ++ if (fault & VM_FAULT_OOM) { ++ out_of_memory(regs, error_code, address); ++ } else { ++ if (fault & VM_FAULT_SIGBUS) ++ do_sigbus(regs, error_code, address); ++ else ++ BUG(); ++ } ++} + + static int spurious_fault_check(unsigned long error_code, pte_t *pte) + { + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; ++ + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + +@@ -440,21 +878,25 @@ static int spurious_fault_check(unsigned + } + + /* +- * Handle a spurious fault caused by a stale TLB entry. This allows +- * us to lazily refresh the TLB when increasing the permissions of a +- * kernel page (RO -> RW or NX -> X). Doing it eagerly is very +- * expensive since that implies doing a full cross-processor TLB +- * flush, even if no stale TLB entries exist on other processors. ++ * Handle a spurious fault caused by a stale TLB entry. ++ * ++ * This allows us to lazily refresh the TLB when increasing the ++ * permissions of a kernel page (RO -> RW or NX -> X). Doing it ++ * eagerly is very expensive since that implies doing a full ++ * cross-processor TLB flush, even if no stale TLB entries exist ++ * on other processors. ++ * + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + */ +-static int spurious_fault(unsigned long address, +- unsigned long error_code) ++static noinline int ++spurious_fault(unsigned long error_code, unsigned long address) + { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; ++ int ret; + + /* Reserved-bit violation or user access to kernel space? 
*/ + if (error_code & (PF_USER | PF_RSVD)) +@@ -482,126 +924,77 @@ static int spurious_fault(unsigned long + if (!pte_present(*pte)) + return 0; + +- return spurious_fault_check(error_code, pte); +-} +- +-/* +- * X86_32 +- * Handle a fault on the vmalloc or module mapping area +- * +- * X86_64 +- * Handle a fault on the vmalloc area +- * +- * This assumes no large pages in there. +- */ +-static int vmalloc_fault(unsigned long address) +-{ +-#ifdef CONFIG_X86_32 +- unsigned long pgd_paddr; +- pmd_t *pmd_k; +- pte_t *pte_k; +- +- /* Make sure we are in vmalloc area */ +- if (!(address >= VMALLOC_START && address < VMALLOC_END)) +- return -1; ++ ret = spurious_fault_check(error_code, pte); ++ if (!ret) ++ return 0; + + /* +- * Synchronize this task's top level page-table +- * with the 'reference' page table. +- * +- * Do _not_ use "current" here. We might be inside +- * an interrupt in the middle of a task switch.. ++ * Make sure we have permissions in PMD. ++ * If not, then there's a bug in the page tables: + */ +- pgd_paddr = read_cr3(); +- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); +- if (!pmd_k) +- return -1; +- pte_k = pte_offset_kernel(pmd_k, address); +- if (!pte_present(*pte_k)) +- return -1; +- return 0; +-#else +- pgd_t *pgd, *pgd_ref; +- pud_t *pud, *pud_ref; +- pmd_t *pmd, *pmd_ref; +- pte_t *pte, *pte_ref; ++ ret = spurious_fault_check(error_code, (pte_t *) pmd); ++ WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + +- /* Make sure we are in vmalloc area */ +- if (!(address >= VMALLOC_START && address < VMALLOC_END)) +- return -1; ++ return ret; ++} + +- /* Copy kernel mappings over when needed. This can also +- happen within a race in page table update. In the later +- case just flush. */ ++int show_unhandled_signals = 1; + +- pgd = pgd_offset(current->active_mm, address); +- pgd_ref = pgd_offset_k(address); +- if (pgd_none(*pgd_ref)) +- return -1; +- if (pgd_none(*pgd)) +- set_pgd(pgd, *pgd_ref); +- else +- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); ++static inline int ++access_error(unsigned long error_code, int write, struct vm_area_struct *vma) ++{ ++ if (write) { ++ /* write, present and write, not present: */ ++ if (unlikely(!(vma->vm_flags & VM_WRITE))) ++ return 1; ++ return 0; ++ } + +- /* Below here mismatches are bugs because these lower tables +- are shared */ ++ /* read, present: */ ++ if (unlikely(error_code & PF_PROT)) ++ return 1; ++ ++ /* read, not present: */ ++ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) ++ return 1; + +- pud = pud_offset(pgd, address); +- pud_ref = pud_offset(pgd_ref, address); +- if (pud_none(*pud_ref)) +- return -1; +- if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) +- BUG(); +- pmd = pmd_offset(pud, address); +- pmd_ref = pmd_offset(pud_ref, address); +- if (pmd_none(*pmd_ref)) +- return -1; +- if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) +- BUG(); +- pte_ref = pte_offset_kernel(pmd_ref, address); +- if (!pte_present(*pte_ref)) +- return -1; +- pte = pte_offset_kernel(pmd, address); +- /* Don't use pte_page here, because the mappings can point +- outside mem_map, and the NUMA hash lookup cannot handle +- that. */ +- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) +- BUG(); + return 0; +-#endif + } + +-int show_unhandled_signals = 1; ++static int fault_in_kernel_space(unsigned long address) ++{ ++ return address >= TASK_SIZE_MAX; ++} + + /* + * This routine handles page faults. 
It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +-#ifdef CONFIG_X86_64 +-asmlinkage +-#endif +-void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) ++dotraplinkage void __kprobes ++do_page_fault(struct pt_regs *regs, unsigned long error_code) + { +- struct task_struct *tsk; +- struct mm_struct *mm; + struct vm_area_struct *vma; ++ struct task_struct *tsk; + unsigned long address; +- int write, si_code; ++ struct mm_struct *mm; ++ int write; + int fault; +-#ifdef CONFIG_X86_64 +- unsigned long flags; +- int sig; +-#endif + + tsk = current; + mm = tsk->mm; ++ + prefetchw(&mm->mmap_sem); + +- /* get the address */ ++ /* Get the faulting address: */ + address = read_cr2(); + +- si_code = SEGV_MAPERR; ++ /* ++ * Detect and handle instructions that would cause a page fault for ++ * both a tracked kernel page and a userspace page. ++ */ ++ if (kmemcheck_active(regs)) ++ kmemcheck_hide(regs); + + if (unlikely(kmmio_fault(regs, address))) + return; +@@ -619,319 +1012,156 @@ void __kprobes do_page_fault(struct pt_r + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ +-#ifdef CONFIG_X86_32 +- if (unlikely(address >= TASK_SIZE)) { +-#else +- if (unlikely(address >= TASK_SIZE64)) { +-#endif +- if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && +- vmalloc_fault(address) >= 0) +- return; ++ if (unlikely(fault_in_kernel_space(address))) { ++ if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { ++ if (vmalloc_fault(address) >= 0) ++ return; ++ ++ if (kmemcheck_fault(regs, address, error_code)) ++ return; ++ } + +- /* Can handle a stale RO->RW TLB */ +- if (spurious_fault(address, error_code)) ++ /* Can handle a stale RO->RW TLB: */ ++ if (spurious_fault(error_code, address)) + return; + +- /* kprobes don't want to hook the spurious faults. */ ++ /* kprobes don't want to hook the spurious faults: */ + if (notify_page_fault(regs)) + return; + /* + * Don't take the mm semaphore here. If we fixup a prefetch +- * fault we could otherwise deadlock. ++ * fault we could otherwise deadlock: + */ +- goto bad_area_nosemaphore; +- } ++ bad_area_nosemaphore(regs, error_code, address); + +- /* kprobes don't want to hook the spurious faults. */ +- if (notify_page_fault(regs)) + return; ++ } + ++ /* kprobes don't want to hook the spurious faults: */ ++ if (unlikely(notify_page_fault(regs))) ++ return; + /* + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any +- * potential system fault or CPU buglet. ++ * potential system fault or CPU buglet: + */ + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; +- } else if (regs->flags & X86_EFLAGS_IF) +- local_irq_enable(); ++ } else { ++ if (regs->flags & X86_EFLAGS_IF) ++ local_irq_enable(); ++ } + +-#ifdef CONFIG_X86_64 + if (unlikely(error_code & PF_RSVD)) +- pgtable_bad(address, regs, error_code); +-#endif ++ pgtable_bad(regs, error_code, address); ++ ++ perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + + /* +- * If we're in an interrupt, have no user context or are running in an +- * atomic region then we must not take the fault. 
++ * If we're in an interrupt, have no user context or are running ++ * in an atomic region then we must not take the fault: + */ +- if (unlikely(in_atomic() || !mm)) +- goto bad_area_nosemaphore; ++ if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) { ++ bad_area_nosemaphore(regs, error_code, address); ++ return; ++ } + + /* + * When running in the kernel we expect faults to occur only to +- * addresses in user space. All other faults represent errors in the +- * kernel and should generate an OOPS. Unfortunately, in the case of an +- * erroneous fault occurring in a code path which already holds mmap_sem +- * we will deadlock attempting to validate the fault against the +- * address space. Luckily the kernel only validly references user +- * space from well defined areas of code, which are listed in the +- * exceptions table. ++ * addresses in user space. All other faults represent errors in ++ * the kernel and should generate an OOPS. Unfortunately, in the ++ * case of an erroneous fault occurring in a code path which already ++ * holds mmap_sem we will deadlock attempting to validate the fault ++ * against the address space. Luckily the kernel only validly ++ * references user space from well defined areas of code, which are ++ * listed in the exceptions table. + * + * As the vast majority of faults will be valid we will only perform +- * the source reference check when there is a possibility of a deadlock. +- * Attempt to lock the address space, if we cannot we then validate the +- * source. If this is invalid we can skip the address space check, +- * thus avoiding the deadlock. ++ * the source reference check when there is a possibility of a ++ * deadlock. Attempt to lock the address space, if we cannot we then ++ * validate the source. If this is invalid we can skip the address ++ * space check, thus avoiding the deadlock: + */ +- if (!down_read_trylock(&mm->mmap_sem)) { ++ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if ((error_code & PF_USER) == 0 && +- !search_exception_tables(regs->ip)) +- goto bad_area_nosemaphore; ++ !search_exception_tables(regs->ip)) { ++ bad_area_nosemaphore(regs, error_code, address); ++ return; ++ } + down_read(&mm->mmap_sem); ++ } else { ++ /* ++ * The above down_read_trylock() might have succeeded in ++ * which case we'll have missed the might_sleep() from ++ * down_read(): ++ */ ++ might_sleep(); + } + + vma = find_vma(mm, address); +- if (!vma) +- goto bad_area; +- if (vma->vm_start <= address) ++ if (unlikely(!vma)) { ++ bad_area(regs, error_code, address); ++ return; ++ } ++ if (likely(vma->vm_start <= address)) + goto good_area; +- if (!(vma->vm_flags & VM_GROWSDOWN)) +- goto bad_area; ++ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { ++ bad_area(regs, error_code, address); ++ return; ++ } + if (error_code & PF_USER) { + /* + * Accessing the stack below %sp is always a bug. + * The large cushion allows instructions like enter +- * and pusha to work. ("enter $65535,$31" pushes ++ * and pusha to work. ("enter $65535, $31" pushes + * 32 pointers and then decrements %sp by 65535.) + */ +- if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) +- goto bad_area; +- } +- if (expand_stack(vma, address)) +- goto bad_area; +-/* +- * Ok, we have a good vm_area for this memory access, so +- * we can handle it.. 
+- */ +-good_area: +- si_code = SEGV_ACCERR; +- write = 0; +- switch (error_code & (PF_PROT|PF_WRITE)) { +- default: /* 3: write, present */ +- /* fall through */ +- case PF_WRITE: /* write, not present */ +- if (!(vma->vm_flags & VM_WRITE)) +- goto bad_area; +- write++; +- break; +- case PF_PROT: /* read, present */ +- goto bad_area; +- case 0: /* read, not present */ +- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) +- goto bad_area; +- } +- +- /* +- * If for any reason at all we couldn't handle the fault, +- * make sure we exit gracefully rather than endlessly redo +- * the fault. +- */ +- fault = handle_mm_fault(mm, vma, address, write); +- if (unlikely(fault & VM_FAULT_ERROR)) { +- if (fault & VM_FAULT_OOM) +- goto out_of_memory; +- else if (fault & VM_FAULT_SIGBUS) +- goto do_sigbus; +- BUG(); +- } +- if (fault & VM_FAULT_MAJOR) +- tsk->maj_flt++; +- else +- tsk->min_flt++; +- +-#ifdef CONFIG_X86_32 +- /* +- * Did it hit the DOS screen memory VA from vm86 mode? +- */ +- if (v8086_mode(regs)) { +- unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; +- if (bit < 32) +- tsk->thread.screen_bitmap |= 1 << bit; +- } +-#endif +- up_read(&mm->mmap_sem); +- return; +- +-/* +- * Something tried to access memory that isn't in our memory map.. +- * Fix it, but check if it's kernel or user first.. +- */ +-bad_area: +- up_read(&mm->mmap_sem); +- +-bad_area_nosemaphore: +- /* User mode accesses just cause a SIGSEGV */ +- if (error_code & PF_USER) { +- /* +- * It's possible to have interrupts off here. +- */ +- local_irq_enable(); +- +- /* +- * Valid to do another page fault here because this one came +- * from user space. +- */ +- if (is_prefetch(regs, address, error_code)) +- return; +- +- if (is_errata100(regs, address)) ++ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { ++ bad_area(regs, error_code, address); + return; +- +- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && +- printk_ratelimit()) { +- printk( +- "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", +- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, +- tsk->comm, task_pid_nr(tsk), address, +- (void *) regs->ip, (void *) regs->sp, error_code); +- print_vma_addr(" in ", regs->ip); +- printk("\n"); + } +- +- tsk->thread.cr2 = address; +- /* Kernel addresses are always protection faults */ +- tsk->thread.error_code = error_code | (address >= TASK_SIZE); +- tsk->thread.trap_no = 14; +- force_sig_info_fault(SIGSEGV, si_code, address, tsk); +- return; + } +- +- if (is_f00f_bug(regs, address)) +- return; +- +-no_context: +- /* Are we prepared to handle this kernel fault? */ +- if (fixup_exception(regs)) ++ if (unlikely(expand_stack(vma, address))) { ++ bad_area(regs, error_code, address); + return; ++ } + + /* +- * X86_32 +- * Valid to do another page fault here, because if this fault +- * had been triggered by is_prefetch fixup_exception would have +- * handled it. +- * +- * X86_64 +- * Hall of shame of CPU/BIOS bugs. ++ * Ok, we have a good vm_area for this memory access, so ++ * we can handle it.. + */ +- if (is_prefetch(regs, address, error_code)) +- return; ++good_area: ++ write = error_code & PF_WRITE; + +- if (is_errata93(regs, address)) ++ if (unlikely(access_error(error_code, write, vma))) { ++ bad_area_access_error(regs, error_code, address); + return; ++ } + +-/* +- * Oops. The kernel tried to access some bad page. We'll have to +- * terminate things with extreme prejudice. 
+- */ +-#ifdef CONFIG_X86_32 +- bust_spinlocks(1); +-#else +- flags = oops_begin(); +-#endif +- +- show_fault_oops(regs, error_code, address); +- +- tsk->thread.cr2 = address; +- tsk->thread.trap_no = 14; +- tsk->thread.error_code = error_code; +- +-#ifdef CONFIG_X86_32 +- die("Oops", regs, error_code); +- bust_spinlocks(0); +- do_exit(SIGKILL); +-#else +- sig = SIGKILL; +- if (__die("Oops", regs, error_code)) +- sig = 0; +- /* Executive summary in case the body of the oops scrolled away */ +- printk(KERN_EMERG "CR2: %016lx\n", address); +- oops_end(flags, regs, sig); +-#endif +- +-out_of_memory: + /* +- * We ran out of memory, call the OOM killer, and return the userspace +- * (which will retry the fault, or kill us if we got oom-killed). ++ * If for any reason at all we couldn't handle the fault, ++ * make sure we exit gracefully rather than endlessly redo ++ * the fault: + */ +- up_read(&mm->mmap_sem); +- pagefault_out_of_memory(); +- return; +- +-do_sigbus: +- up_read(&mm->mmap_sem); +- +- /* Kernel mode? Handle exceptions or die */ +- if (!(error_code & PF_USER)) +- goto no_context; +-#ifdef CONFIG_X86_32 +- /* User space => ok to do another page fault */ +- if (is_prefetch(regs, address, error_code)) +- return; +-#endif +- tsk->thread.cr2 = address; +- tsk->thread.error_code = error_code; +- tsk->thread.trap_no = 14; +- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +-} +- +-DEFINE_SPINLOCK(pgd_lock); +-LIST_HEAD(pgd_list); +- +-void vmalloc_sync_all(void) +-{ +- unsigned long address; ++ fault = handle_mm_fault(mm, vma, address, write); + +-#ifdef CONFIG_X86_32 +- if (SHARED_KERNEL_PMD) ++ if (unlikely(fault & VM_FAULT_ERROR)) { ++ mm_fault_error(regs, error_code, address, fault); + return; +- +- for (address = VMALLOC_START & PMD_MASK; +- address >= TASK_SIZE && address < FIXADDR_TOP; +- address += PMD_SIZE) { +- unsigned long flags; +- struct page *page; +- +- spin_lock_irqsave(&pgd_lock, flags); +- list_for_each_entry(page, &pgd_list, lru) { +- if (!vmalloc_sync_one(page_address(page), +- address)) +- break; +- } +- spin_unlock_irqrestore(&pgd_lock, flags); + } +-#else /* CONFIG_X86_64 */ +- for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; +- address += PGDIR_SIZE) { +- const pgd_t *pgd_ref = pgd_offset_k(address); +- unsigned long flags; +- struct page *page; + +- if (pgd_none(*pgd_ref)) +- continue; +- spin_lock_irqsave(&pgd_lock, flags); +- list_for_each_entry(page, &pgd_list, lru) { +- pgd_t *pgd; +- pgd = (pgd_t *)page_address(page) + pgd_index(address); +- if (pgd_none(*pgd)) +- set_pgd(pgd, *pgd_ref); +- else +- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); +- } +- spin_unlock_irqrestore(&pgd_lock, flags); ++ if (fault & VM_FAULT_MAJOR) { ++ tsk->maj_flt++; ++ perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); ++ } else { ++ tsk->min_flt++; ++ perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } +-#endif ++ ++ check_v8086_mode(regs, address, tsk); ++ ++ up_read(&mm->mmap_sem); + } +Index: linux-2.6-tip/arch/x86/mm/highmem_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/highmem_32.c ++++ linux-2.6-tip/arch/x86/mm/highmem_32.c +@@ -1,11 +1,12 @@ + #include + #include ++#include /* for totalram_pages */ + + void *kmap(struct page *page) + { +- might_sleep(); + if (!PageHighMem(page)) + return page_address(page); ++ might_sleep(); + return kmap_high(page); + } + +@@ -18,6 +19,27 @@ void kunmap(struct page *page) + kunmap_high(page); + } + ++void 
kunmap_virt(void *ptr) ++{ ++ struct page *page; ++ ++ if ((unsigned long)ptr < PKMAP_ADDR(0)) ++ return; ++ page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); ++ kunmap(page); ++} ++ ++struct page *kmap_to_page(void *ptr) ++{ ++ struct page *page; ++ ++ if ((unsigned long)ptr < PKMAP_ADDR(0)) ++ return virt_to_page(ptr); ++ page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); ++ return page; ++} ++EXPORT_SYMBOL_GPL(kmap_to_page); /* PREEMPT_RT converts some modules to use this */ ++ + static void debug_kmap_atomic_prot(enum km_type type) + { + #ifdef CONFIG_DEBUG_HIGHMEM +@@ -69,12 +91,12 @@ static void debug_kmap_atomic_prot(enum + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + */ +-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) ++void *__kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) + { + enum fixed_addresses idx; + unsigned long vaddr; + +- /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ ++ preempt_disable(); + pagefault_disable(); + + if (!PageHighMem(page)) +@@ -84,19 +106,24 @@ void *kmap_atomic_prot(struct page *page + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- BUG_ON(!pte_none(*(kmap_pte-idx))); ++ WARN_ON_ONCE(!pte_none(*(kmap_pte-idx))); + set_pte(kmap_pte-idx, mk_pte(page, prot)); + arch_flush_lazy_mmu_mode(); + + return (void *)vaddr; + } + +-void *kmap_atomic(struct page *page, enum km_type type) ++void *__kmap_atomic_direct(struct page *page, enum km_type type) ++{ ++ return __kmap_atomic_prot(page, type, kmap_prot); ++} ++ ++void *__kmap_atomic(struct page *page, enum km_type type) + { + return kmap_atomic_prot(page, type, kmap_prot); + } + +-void kunmap_atomic(void *kvaddr, enum km_type type) ++void __kunmap_atomic(void *kvaddr, enum km_type type) + { + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); +@@ -118,28 +145,21 @@ void kunmap_atomic(void *kvaddr, enum km + + arch_flush_lazy_mmu_mode(); + pagefault_enable(); ++ preempt_enable(); + } + +-/* This is the same as kmap_atomic() but can map memory that doesn't ++/* ++ * This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. 
+ */ +-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) ++void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) + { +- enum fixed_addresses idx; +- unsigned long vaddr; +- +- pagefault_disable(); +- +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); +- arch_flush_lazy_mmu_mode(); +- +- return (void*) vaddr; ++ preempt_disable(); ++ return kmap_atomic_prot_pfn(pfn, type, kmap_prot); + } +-EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ ++EXPORT_SYMBOL_GPL(__kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ + +-struct page *kmap_atomic_to_page(void *ptr) ++struct page *__kmap_atomic_to_page(void *ptr) + { + unsigned long idx, vaddr = (unsigned long)ptr; + pte_t *pte; +@@ -154,5 +174,30 @@ struct page *kmap_atomic_to_page(void *p + + EXPORT_SYMBOL(kmap); + EXPORT_SYMBOL(kunmap); +-EXPORT_SYMBOL(kmap_atomic); +-EXPORT_SYMBOL(kunmap_atomic); ++EXPORT_SYMBOL(kunmap_virt); ++EXPORT_SYMBOL(__kmap_atomic); ++EXPORT_SYMBOL(__kunmap_atomic); ++ ++void __init set_highmem_pages_init(void) ++{ ++ struct zone *zone; ++ int nid; ++ ++ for_each_zone(zone) { ++ unsigned long zone_start_pfn, zone_end_pfn; ++ ++ if (!is_highmem(zone)) ++ continue; ++ ++ zone_start_pfn = zone->zone_start_pfn; ++ zone_end_pfn = zone_start_pfn + zone->spanned_pages; ++ ++ nid = zone_to_nid(zone); ++ printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", ++ zone->name, nid, zone_start_pfn, zone_end_pfn); ++ ++ add_highpages_with_active_regions(nid, zone_start_pfn, ++ zone_end_pfn); ++ } ++ totalram_pages += totalhigh_pages; ++} +Index: linux-2.6-tip/arch/x86/mm/init.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/init.c +@@ -0,0 +1,394 @@ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++unsigned long __initdata e820_table_start; ++unsigned long __meminitdata e820_table_end; ++unsigned long __meminitdata e820_table_top; ++ ++enum bootmem_state bootmem_state = BEFORE_BOOTMEM; ++ ++int direct_gbpages ++#ifdef CONFIG_DIRECT_GBPAGES ++ = 1 ++#endif ++; ++ ++static void __init find_early_table_space(unsigned long end, int use_pse, ++ int use_gbpages) ++{ ++ unsigned long puds, pmds, ptes, tables, start; ++ ++ puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; ++ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); ++ ++ if (use_gbpages) { ++ unsigned long extra; ++ ++ extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); ++ pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; ++ } else ++ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; ++ ++ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); ++ ++ if (use_pse) { ++ unsigned long extra; ++ ++ extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); ++#ifdef CONFIG_X86_32 ++ extra += PMD_SIZE; ++#endif ++ ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ } else ++ ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ ++ tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); ++ ++#ifdef CONFIG_X86_32 ++ /* for fixmap */ ++ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); ++#endif ++ ++ /* ++ * RED-PEN putting page tables only on node 0 could ++ * cause a hotspot and fill up ZONE_DMA. The page tables ++ * need roughly 0.5KB per GB. 
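The table-sizing logic in find_early_table_space() can be followed with plain arithmetic. The stand-alone sketch below mirrors the puds/pmds/ptes round-up for the 64-bit, no-gbpages path (8-byte table entries assumed, the 32-bit fixmap and PMD extras omitted) and prints how much memory the bootstrap page tables need for a hypothetical 4 GiB mapping.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)
#define PMD_SIZE   (1ULL << PMD_SHIFT)
#define PUD_SIZE   (1ULL << PUD_SHIFT)

/* round a byte count up to whole pages, like roundup(..., PAGE_SIZE) */
static uint64_t roundup_page(uint64_t bytes)
{
	return (bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	uint64_t end = 4ULL << 30;	/* map the first 4 GiB */
	int use_pse = 1;		/* use 2 MiB pages for the bulk */
	uint64_t puds, pmds, ptes, extra, tables;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = roundup_page(puds * 8) + roundup_page(pmds * 8);

	/* with PSE only the tail that is not 2 MiB aligned needs pte pages */
	extra = use_pse ? end - ((end >> PMD_SHIFT) << PMD_SHIFT) : end;
	ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	tables += roundup_page(ptes * 8);

	printf("bootstrap page tables for %llu MiB: %llu KiB\n",
	       (unsigned long long)(end >> 20),
	       (unsigned long long)(tables >> 10));
	return 0;
}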
++ */ ++#ifdef CONFIG_X86_32 ++ start = 0x7000; ++ e820_table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; ++ e820_table_end = e820_table_start; ++ e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); ++ ++ printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", ++ end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); ++} ++ ++struct map_range { ++ unsigned long start; ++ unsigned long end; ++ unsigned page_size_mask; ++}; ++ ++#ifdef CONFIG_X86_32 ++#define NR_RANGE_MR 3 ++#else /* CONFIG_X86_64 */ ++#define NR_RANGE_MR 5 ++#endif ++ ++static int __meminit save_mr(struct map_range *mr, int nr_range, ++ unsigned long start_pfn, unsigned long end_pfn, ++ unsigned long page_size_mask) ++{ ++ if (start_pfn < end_pfn) { ++ if (nr_range >= NR_RANGE_MR) ++ panic("run out of range for init_memory_mapping\n"); ++ mr[nr_range].start = start_pfn<> PAGE_SHIFT; ++ pos = start_pfn << PAGE_SHIFT; ++#ifdef CONFIG_X86_32 ++ /* ++ * Don't use a large page for the first 2/4MB of memory ++ * because there are often fixed size MTRRs in there ++ * and overlapping MTRRs into large pages can cause ++ * slowdowns. ++ */ ++ if (pos == 0) ++ end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); ++ else ++ end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) ++ << (PMD_SHIFT - PAGE_SHIFT); ++#else /* CONFIG_X86_64 */ ++ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) ++ << (PMD_SHIFT - PAGE_SHIFT); ++#endif ++ if (end_pfn > (end >> PAGE_SHIFT)) ++ end_pfn = end >> PAGE_SHIFT; ++ if (start_pfn < end_pfn) { ++ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); ++ pos = end_pfn << PAGE_SHIFT; ++ } ++ ++ /* big page (2M) range */ ++ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) ++ << (PMD_SHIFT - PAGE_SHIFT); ++#ifdef CONFIG_X86_32 ++ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); ++#else /* CONFIG_X86_64 */ ++ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) ++ << (PUD_SHIFT - PAGE_SHIFT); ++ if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) ++ end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); ++#endif ++ ++ if (start_pfn < end_pfn) { ++ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, ++ page_size_mask & (1<>PUD_SHIFT) ++ << (PUD_SHIFT - PAGE_SHIFT); ++ end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); ++ if (start_pfn < end_pfn) { ++ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, ++ page_size_mask & ++ ((1<>PMD_SHIFT) ++ << (PMD_SHIFT - PAGE_SHIFT); ++ end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); ++ if (start_pfn < end_pfn) { ++ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, ++ page_size_mask & (1<>PAGE_SHIFT; ++ end_pfn = end>>PAGE_SHIFT; ++ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); ++ ++ /* try to merge same page size and continuous */ ++ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { ++ unsigned long old_start; ++ if (mr[i].end != mr[i+1].start || ++ mr[i].page_size_mask != mr[i+1].page_size_mask) ++ continue; ++ /* move it */ ++ old_start = mr[i].start; ++ memmove(&mr[i], &mr[i+1], ++ (nr_range - 1 - i) * sizeof(struct map_range)); ++ mr[i--].start = old_start; ++ nr_range--; ++ } ++ ++ for (i = 0; i < nr_range; i++) ++ printk(KERN_DEBUG " %010lx - %010lx page %s\n", ++ mr[i].start, mr[i].end, ++ (mr[i].page_size_mask & (1< e820_table_start) ++ reserve_early(e820_table_start << PAGE_SHIFT, ++ e820_table_end << PAGE_SHIFT, "PGTABLE"); ++ ++ if (bootmem_state == BEFORE_BOOTMEM) ++ early_memtest(start, end); ++ ++ return ret >> PAGE_SHIFT; ++} ++ ++ ++/* ++ * devmem_is_allowed() checks to see if 
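The head/2M/tail splitting in init_memory_mapping() ends with a coalescing pass: neighbouring map_range entries that touch and use the same page size are folded together with memmove(). Below is a stand-alone model of that merge loop, using made-up pfn ranges and a mask value standing in for 1<<PG_LEVEL_2M.

#include <stdio.h>
#include <string.h>

struct map_range {
	unsigned long start;	/* pfn */
	unsigned long end;	/* pfn */
	unsigned page_size_mask;
};

/* coalesce neighbours that touch and use the same page size, mirroring
 * the memmove() loop in init_memory_mapping() */
static int merge_ranges(struct map_range *mr, int nr_range)
{
	int i;

	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
		unsigned long old_start;

		if (mr[i].end != mr[i + 1].start ||
		    mr[i].page_size_mask != mr[i + 1].page_size_mask)
			continue;
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i + 1],
			(nr_range - 1 - i) * sizeof(struct map_range));
		mr[i--].start = old_start;
		nr_range--;
	}
	return nr_range;
}

int main(void)
{
	/* the first two chunks both ended up as 4k ranges (mask 0),
	 * so they collapse into one */
	struct map_range mr[3] = {
		{ 0x000, 0x200, 0 },
		{ 0x200, 0x400, 0 },
		{ 0x400, 0x800, 1 << 1 },	/* 2M pages */
	};
	int i, n = merge_ranges(mr, 3);

	for (i = 0; i < n; i++)
		printf("%05lx - %05lx mask %x\n",
		       mr[i].start, mr[i].end, mr[i].page_size_mask);
	return 0;
}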
/dev/mem access to a certain address ++ * is valid. The argument is a physical page number. ++ * ++ * ++ * On x86, access has to be given to the first megabyte of ram because that area ++ * contains bios code and data regions used by X and dosemu and similar apps. ++ * Access has to be given to non-kernel-ram areas as well, these contain the PCI ++ * mmio resources as well as potential bios/acpi data regions. ++ */ ++int devmem_is_allowed(unsigned long pagenr) ++{ ++ if (pagenr <= 256) ++ return 1; ++ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) ++ return 0; ++ if (!page_is_ram(pagenr)) ++ return 1; ++ return 0; ++} ++ ++void free_init_pages(char *what, unsigned long begin, unsigned long end) ++{ ++ unsigned long addr = begin; ++ ++ if (addr >= end) ++ return; ++ ++ /* ++ * If debugging page accesses then do not free this memory but ++ * mark them not present - any buggy init-section access will ++ * create a kernel page fault: ++ */ ++#ifdef CONFIG_DEBUG_PAGEALLOC ++ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", ++ begin, PAGE_ALIGN(end)); ++ set_memory_np(begin, (end - begin) >> PAGE_SHIFT); ++#else ++ /* ++ * We just marked the kernel text read only above, now that ++ * we are going to free part of that, we need to make that ++ * writeable first. ++ */ ++ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); ++ ++ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); ++ ++ for (; addr < end; addr += PAGE_SIZE) { ++ ClearPageReserved(virt_to_page(addr)); ++ init_page_count(virt_to_page(addr)); ++ memset((void *)(addr & ~(PAGE_SIZE-1)), ++ POISON_FREE_INITMEM, PAGE_SIZE); ++ free_page(addr); ++ totalram_pages++; ++ } ++#endif ++} ++ ++void free_initmem(void) ++{ ++ free_init_pages("unused kernel memory", ++ (unsigned long)(&__init_begin), ++ (unsigned long)(&__init_end)); ++} ++ ++#ifdef CONFIG_BLK_DEV_INITRD ++void free_initrd_mem(unsigned long start, unsigned long end) ++{ ++ free_init_pages("initrd memory", start, end); ++} ++#endif +Index: linux-2.6-tip/arch/x86/mm/init_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/init_32.c ++++ linux-2.6-tip/arch/x86/mm/init_32.c +@@ -49,31 +49,23 @@ + #include + #include + #include +-#include +- +-unsigned int __VMALLOC_RESERVE = 128 << 20; ++#include + + unsigned long max_low_pfn_mapped; + unsigned long max_pfn_mapped; + +-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + unsigned long highstart_pfn, highend_pfn; + + static noinline int do_test_wp_bit(void); + +- +-static unsigned long __initdata table_start; +-static unsigned long __meminitdata table_end; +-static unsigned long __meminitdata table_top; +- +-static int __initdata after_init_bootmem; ++bool __read_mostly __vmalloc_start_set = false; + + static __init void *alloc_low_page(void) + { +- unsigned long pfn = table_end++; ++ unsigned long pfn = e820_table_end++; + void *adr; + +- if (pfn >= table_top) ++ if (pfn >= e820_table_top) + panic("alloc_low_page: ran out of memory"); + + adr = __va(pfn * PAGE_SIZE); +@@ -89,14 +81,20 @@ static __init void *alloc_low_page(void) + static pmd_t * __init one_md_table_init(pgd_t *pgd) + { + pud_t *pud; +- pmd_t *pmd_table; ++ pmd_t *pmd_table = NULL; + + #ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { +- if (after_init_bootmem) ++ switch (bootmem_state) { ++ case DURING_BOOTMEM: + pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); +- else ++ break; ++ case BEFORE_BOOTMEM: + pmd_table = (pmd_t *)alloc_low_page(); ++ break; ++ 
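devmem_is_allowed() is a pure policy function, so it is easy to model outside the kernel. In the sketch below iomem_is_exclusive() and page_is_ram() are stubs (nothing exclusive, RAM assumed to be the first 512 MiB); the decision order matches the helper added above: the legacy first megabyte is always allowed, exclusive regions never, non-RAM areas (PCI mmio, BIOS/ACPI) are allowed, and ordinary RAM above 1 MiB is refused.

#include <stdio.h>

/* stand-ins for the kernel predicates consulted by devmem_is_allowed() */
static int iomem_is_exclusive(unsigned long phys) { (void)phys; return 0; }
static int page_is_ram(unsigned long pagenr) { return pagenr < (512 << 8); }

static int devmem_is_allowed(unsigned long pagenr)
{
	if (pagenr <= 256)			/* first MiB: BIOS, X, dosemu */
		return 1;
	if (iomem_is_exclusive(pagenr << 12))
		return 0;
	if (!page_is_ram(pagenr))		/* mmio / ACPI regions */
		return 1;
	return 0;				/* normal RAM above 1 MiB */
}

int main(void)
{
	unsigned long pfns[] = { 0x10, 0x100, 0x200, 0x80000 };
	unsigned int i;

	for (i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
		printf("pfn %#lx: %s\n", pfns[i],
		       devmem_is_allowed(pfns[i]) ? "allowed" : "refused");
	return 0;
}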
default: ++ panic("after bootmem call one_md_table_init\n"); ++ } + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); +@@ -120,15 +118,21 @@ static pte_t * __init one_page_table_ini + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { + pte_t *page_table = NULL; + +- if (after_init_bootmem) { +-#ifdef CONFIG_DEBUG_PAGEALLOC ++ switch (bootmem_state) { ++ case DURING_BOOTMEM: ++#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); + #endif + if (!page_table) + page_table = + (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); +- } else ++ break; ++ case BEFORE_BOOTMEM: + page_table = (pte_t *)alloc_low_page(); ++ break; ++ default: ++ panic("after bootmem call one_page_table_init\n"); ++ } + + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); +@@ -138,6 +142,23 @@ static pte_t * __init one_page_table_ini + return pte_offset_kernel(pmd, 0); + } + ++pmd_t * __init populate_extra_pmd(unsigned long vaddr) ++{ ++ int pgd_idx = pgd_index(vaddr); ++ int pmd_idx = pmd_index(vaddr); ++ ++ return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; ++} ++ ++pte_t * __init populate_extra_pte(unsigned long vaddr) ++{ ++ int pte_idx = pte_index(vaddr); ++ pmd_t *pmd; ++ ++ pmd = populate_extra_pmd(vaddr); ++ return one_page_table_init(pmd) + pte_idx; ++} ++ + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, + unsigned long vaddr, pte_t *lastpte) + { +@@ -154,12 +175,12 @@ static pte_t *__init page_table_kmap_che + if (pmd_idx_kmap_begin != pmd_idx_kmap_end + && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end +- && ((__pa(pte) >> PAGE_SHIFT) < table_start +- || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { ++ && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start ++ || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { + pte_t *newpte; + int i; + +- BUG_ON(after_init_bootmem); ++ BUG_ON(bootmem_state != BEFORE_BOOTMEM); + newpte = alloc_low_page(); + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(newpte + i, pte[i]); +@@ -228,11 +249,14 @@ static inline int is_kernel_text(unsigne + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: + */ +-static void __init kernel_physical_mapping_init(pgd_t *pgd_base, +- unsigned long start_pfn, +- unsigned long end_pfn, +- int use_pse) ++unsigned long __init ++kernel_physical_mapping_init(unsigned long start, ++ unsigned long end, ++ unsigned long page_size_mask) + { ++ int use_pse = page_size_mask == (1<> PAGE_SHIFT; ++ end_pfn = end >> PAGE_SHIFT; ++ + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by +@@ -355,26 +382,6 @@ repeat: + mapping_iter = 2; + goto repeat; + } +-} +- +-/* +- * devmem_is_allowed() checks to see if /dev/mem access to a certain address +- * is valid. The argument is a physical page number. +- * +- * +- * On x86, access has to be given to the first megabyte of ram because that area +- * contains bios code and data regions used by X and dosemu and similar apps. +- * Access has to be given to non-kernel-ram areas as well, these contain the PCI +- * mmio resources as well as potential bios/acpi data regions. 
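The switch above is the pattern that replaces the old after_init_bootmem flag: each page-table helper dispatches on the current bootmem phase and picks the allocator that is legal there. A stand-alone sketch of that dispatch follows, with calloc() standing in for the three real allocators; note the exact legal phases differ per helper (one_page_table_init() panics after bootmem, spp_getpage() panics before it).

#include <stdio.h>
#include <stdlib.h>

/* the three phases that replace the old after_init_bootmem flag */
enum bootmem_state { BEFORE_BOOTMEM, DURING_BOOTMEM, AFTER_BOOTMEM };

static enum bootmem_state bootmem_state = BEFORE_BOOTMEM;

/* stand-ins for the three allocators a page-table helper may use */
static void *alloc_low_page(void)     { return calloc(1, 4096); }
static void *alloc_bootmem_page(void) { return calloc(1, 4096); }
static void *get_zeroed_page(void)    { return calloc(1, 4096); }

/* same shape as one_page_table_init()/spp_getpage(): dispatch on the
 * phase (the real helpers panic for phases they must not be called in) */
static void *alloc_pagetable_page(void)
{
	switch (bootmem_state) {
	case BEFORE_BOOTMEM:
		return alloc_low_page();
	case DURING_BOOTMEM:
		return alloc_bootmem_page();
	case AFTER_BOOTMEM:
		return get_zeroed_page();
	}
	return NULL;
}

int main(void)
{
	void *p;

	p = alloc_pagetable_page();	/* early: from the e820 window */
	printf("before bootmem: %p\n", p);

	bootmem_state = AFTER_BOOTMEM;
	p = alloc_pagetable_page();	/* late: from the page allocator */
	printf("after bootmem:  %p\n", p);
	return 0;
}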
+- */ +-int devmem_is_allowed(unsigned long pagenr) +-{ +- if (pagenr <= 256) +- return 1; +- if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) +- return 0; +- if (!page_is_ram(pagenr)) +- return 1; + return 0; + } + +@@ -470,22 +477,10 @@ void __init add_highpages_with_active_re + work_with_active_regions(nid, add_highpages_work_fn, &data); + } + +-#ifndef CONFIG_NUMA +-static void __init set_highmem_pages_init(void) +-{ +- add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); +- +- totalram_pages += totalhigh_pages; +-} +-#endif /* !CONFIG_NUMA */ +- + #else + static inline void permanent_kmaps_init(pgd_t *pgd_base) + { + } +-static inline void set_highmem_pages_init(void) +-{ +-} + #endif /* CONFIG_HIGHMEM */ + + void __init native_pagetable_setup_start(pgd_t *base) +@@ -543,8 +538,9 @@ void __init native_pagetable_setup_done( + * be partially populated, and so it avoids stomping on any existing + * mappings. + */ +-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) ++void __init early_ioremap_page_table_range_init(void) + { ++ pgd_t *pgd_base = swapper_pg_dir; + unsigned long vaddr, end; + + /* +@@ -639,7 +635,7 @@ static int __init noexec_setup(char *str + } + early_param("noexec", noexec_setup); + +-static void __init set_nx(void) ++void __init set_nx(void) + { + unsigned int v[4], l, h; + +@@ -675,75 +671,97 @@ static int __init parse_highmem(char *ar + } + early_param("highmem", parse_highmem); + ++#define MSG_HIGHMEM_TOO_BIG \ ++ "highmem size (%luMB) is bigger than pages available (%luMB)!\n" ++ ++#define MSG_LOWMEM_TOO_SMALL \ ++ "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" + /* +- * Determine low and high memory ranges: ++ * All of RAM fits into lowmem - but if user wants highmem ++ * artificially via the highmem=x boot parameter then create ++ * it: + */ +-void __init find_low_pfn_range(void) ++void __init lowmem_pfn_init(void) + { +- /* it could update max_pfn */ +- + /* max_low_pfn is 0, we already have early_res support */ +- + max_low_pfn = max_pfn; +- if (max_low_pfn > MAXMEM_PFN) { +- if (highmem_pages == -1) +- highmem_pages = max_pfn - MAXMEM_PFN; +- if (highmem_pages + MAXMEM_PFN < max_pfn) +- max_pfn = MAXMEM_PFN + highmem_pages; +- if (highmem_pages + MAXMEM_PFN > max_pfn) { +- printk(KERN_WARNING "only %luMB highmem pages " +- "available, ignoring highmem size of %uMB.\n", +- pages_to_mb(max_pfn - MAXMEM_PFN), ++ ++ if (highmem_pages == -1) ++ highmem_pages = 0; ++#ifdef CONFIG_HIGHMEM ++ if (highmem_pages >= max_pfn) { ++ printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, ++ pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); ++ highmem_pages = 0; ++ } ++ if (highmem_pages) { ++ if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { ++ printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } +- max_low_pfn = MAXMEM_PFN; ++ max_low_pfn -= highmem_pages; ++ } ++#else ++ if (highmem_pages) ++ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); ++#endif ++} ++ ++#define MSG_HIGHMEM_TOO_SMALL \ ++ "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" ++ ++#define MSG_HIGHMEM_TRIMMED \ ++ "Warning: only 4GB will be used. 
Use a HIGHMEM64G enabled kernel!\n" ++/* ++ * We have more RAM than fits into lowmem - we try to put it into ++ * highmem, also taking the highmem=x boot parameter into account: ++ */ ++void __init highmem_pfn_init(void) ++{ ++ max_low_pfn = MAXMEM_PFN; ++ ++ if (highmem_pages == -1) ++ highmem_pages = max_pfn - MAXMEM_PFN; ++ ++ if (highmem_pages + MAXMEM_PFN < max_pfn) ++ max_pfn = MAXMEM_PFN + highmem_pages; ++ ++ if (highmem_pages + MAXMEM_PFN > max_pfn) { ++ printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, ++ pages_to_mb(max_pfn - MAXMEM_PFN), ++ pages_to_mb(highmem_pages)); ++ highmem_pages = 0; ++ } + #ifndef CONFIG_HIGHMEM +- /* Maximum memory usable is what is directly addressable */ +- printk(KERN_WARNING "Warning only %ldMB will be used.\n", +- MAXMEM>>20); +- if (max_pfn > MAX_NONPAE_PFN) +- printk(KERN_WARNING +- "Use a HIGHMEM64G enabled kernel.\n"); +- else +- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); +- max_pfn = MAXMEM_PFN; ++ /* Maximum memory usable is what is directly addressable */ ++ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); ++ if (max_pfn > MAX_NONPAE_PFN) ++ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); ++ else ++ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); ++ max_pfn = MAXMEM_PFN; + #else /* !CONFIG_HIGHMEM */ + #ifndef CONFIG_HIGHMEM64G +- if (max_pfn > MAX_NONPAE_PFN) { +- max_pfn = MAX_NONPAE_PFN; +- printk(KERN_WARNING "Warning only 4GB will be used." +- "Use a HIGHMEM64G enabled kernel.\n"); +- } ++ if (max_pfn > MAX_NONPAE_PFN) { ++ max_pfn = MAX_NONPAE_PFN; ++ printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); ++ } + #endif /* !CONFIG_HIGHMEM64G */ + #endif /* !CONFIG_HIGHMEM */ +- } else { +- if (highmem_pages == -1) +- highmem_pages = 0; +-#ifdef CONFIG_HIGHMEM +- if (highmem_pages >= max_pfn) { +- printk(KERN_ERR "highmem size specified (%uMB) is " +- "bigger than pages available (%luMB)!.\n", +- pages_to_mb(highmem_pages), +- pages_to_mb(max_pfn)); +- highmem_pages = 0; +- } +- if (highmem_pages) { +- if (max_low_pfn - highmem_pages < +- 64*1024*1024/PAGE_SIZE){ +- printk(KERN_ERR "highmem size %uMB results in " +- "smaller than 64MB lowmem, ignoring it.\n" +- , pages_to_mb(highmem_pages)); +- highmem_pages = 0; +- } +- max_low_pfn -= highmem_pages; +- } +-#else +- if (highmem_pages) +- printk(KERN_ERR "ignoring highmem size on non-highmem" +- " kernel!\n"); +-#endif +- } ++} ++ ++/* ++ * Determine low and high memory ranges: ++ */ ++void __init find_low_pfn_range(void) ++{ ++ /* it could update max_pfn */ ++ ++ if (max_pfn <= MAXMEM_PFN) ++ lowmem_pfn_init(); ++ else ++ highmem_pfn_init(); + } + + #ifndef CONFIG_NEED_MULTIPLE_NODES +@@ -769,6 +787,8 @@ void __init initmem_init(unsigned long s + #ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; + #endif ++ __vmalloc_start_set = true; ++ + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + +@@ -790,176 +810,64 @@ static void __init zone_sizes_init(void) + free_area_init_nodes(max_zone_pfns); + } + ++static unsigned long __init setup_node_bootmem(int nodeid, ++ unsigned long start_pfn, ++ unsigned long end_pfn, ++ unsigned long bootmap) ++{ ++ unsigned long bootmap_size; ++ ++ /* don't touch min_low_pfn */ ++ bootmap_size = init_bootmem_node(NODE_DATA(nodeid), ++ bootmap >> PAGE_SHIFT, ++ start_pfn, end_pfn); ++ printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", ++ nodeid, start_pfn<> PAGE_SHIFT, +- min_low_pfn, max_low_pfn); + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped<> PUD_SHIFT; +- tables = 
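find_low_pfn_range() now just chooses between lowmem_pfn_init() and highmem_pfn_init(). The stand-alone sketch below reproduces that arithmetic for a machine with 2 GiB of RAM and no highmem= override, with CONFIG_HIGHMEM assumed; MAXMEM_PFN is taken to be roughly 896 MiB worth of pages, the real value depends on the vmalloc reserve.

#include <stdio.h>

#define PAGE_SHIFT 12
#define MAXMEM_PFN (896UL << (20 - PAGE_SHIFT))	/* assumed lowmem ceiling */

static unsigned long max_pfn, max_low_pfn;
static unsigned long highmem_pages = (unsigned long)-1;	/* no highmem= */

static unsigned long pages_to_mb(unsigned long p) { return p >> (20 - PAGE_SHIFT); }

/* all of RAM fits below the lowmem ceiling; honour highmem= if sane */
static void lowmem_pfn_init(void)
{
	max_low_pfn = max_pfn;
	if (highmem_pages == (unsigned long)-1)
		highmem_pages = 0;
	if (highmem_pages >= max_pfn)
		highmem_pages = 0;
	if (highmem_pages &&
	    max_low_pfn - highmem_pages < (64UL << 20) / 4096)
		highmem_pages = 0;
	max_low_pfn -= highmem_pages;
}

/* more RAM than lowmem can hold: everything above the ceiling is highmem */
static void highmem_pfn_init(void)
{
	max_low_pfn = MAXMEM_PFN;
	if (highmem_pages == (unsigned long)-1)
		highmem_pages = max_pfn - MAXMEM_PFN;
	if (highmem_pages + MAXMEM_PFN < max_pfn)
		max_pfn = MAXMEM_PFN + highmem_pages;
	if (highmem_pages + MAXMEM_PFN > max_pfn)
		highmem_pages = 0;
}

int main(void)
{
	max_pfn = 2048UL << (20 - PAGE_SHIFT);	/* 2 GiB of RAM */

	if (max_pfn <= MAXMEM_PFN)
		lowmem_pfn_init();
	else
		highmem_pfn_init();

	printf("lowmem %lu MiB, highmem %lu MiB\n",
	       pages_to_mb(max_low_pfn), pages_to_mb(max_pfn - max_low_pfn));
	return 0;
}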
PAGE_ALIGN(puds * sizeof(pud_t)); +- +- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; +- tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); +- +- if (use_pse) { +- unsigned long extra; ++ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<>PMD_SHIFT) << PMD_SHIFT); +- extra += PMD_SIZE; +- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; +- } else +- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ for_each_online_node(nodeid) { ++ unsigned long start_pfn, end_pfn; + +- tables += PAGE_ALIGN(ptes * sizeof(pte_t)); +- +- /* for fixmap */ +- tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); +- +- /* +- * RED-PEN putting page tables only on node 0 could +- * cause a hotspot and fill up ZONE_DMA. The page tables +- * need roughly 0.5KB per GB. +- */ +- start = 0x7000; +- table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; +- table_end = table_start; +- table_top = table_start + (tables>>PAGE_SHIFT); +- +- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", +- end, table_start << PAGE_SHIFT, +- (table_start << PAGE_SHIFT) + tables); +-} +- +-unsigned long __init_refok init_memory_mapping(unsigned long start, +- unsigned long end) +-{ +- pgd_t *pgd_base = swapper_pg_dir; +- unsigned long start_pfn, end_pfn; +- unsigned long big_page_start; +-#ifdef CONFIG_DEBUG_PAGEALLOC +- /* +- * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. +- * This will simplify cpa(), which otherwise needs to support splitting +- * large pages into small in interrupt context, etc. +- */ +- int use_pse = 0; ++#ifdef CONFIG_NEED_MULTIPLE_NODES ++ start_pfn = node_start_pfn[nodeid]; ++ end_pfn = node_end_pfn[nodeid]; ++ if (start_pfn > max_low_pfn) ++ continue; ++ if (end_pfn > max_low_pfn) ++ end_pfn = max_low_pfn; + #else +- int use_pse = cpu_has_pse; ++ start_pfn = 0; ++ end_pfn = max_low_pfn; + #endif +- +- /* +- * Find space for the kernel direct mapping tables. +- */ +- if (!after_init_bootmem) +- find_early_table_space(end, use_pse); +- +-#ifdef CONFIG_X86_PAE +- set_nx(); +- if (nx_enabled) +- printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +-#endif +- +- /* Enable PSE if available */ +- if (cpu_has_pse) +- set_in_cr4(X86_CR4_PSE); +- +- /* Enable PGE if available */ +- if (cpu_has_pge) { +- set_in_cr4(X86_CR4_PGE); +- __supported_pte_mask |= _PAGE_GLOBAL; ++ bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, ++ bootmap); + } +- +- /* +- * Don't use a large page for the first 2/4MB of memory +- * because there are often fixed size MTRRs in there +- * and overlapping MTRRs into large pages can cause +- * slowdowns. +- */ +- big_page_start = PMD_SIZE; +- +- if (start < big_page_start) { +- start_pfn = start >> PAGE_SHIFT; +- end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); +- } else { +- /* head is not big page alignment ? */ +- start_pfn = start >> PAGE_SHIFT; +- end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) +- << (PMD_SHIFT - PAGE_SHIFT); +- } +- if (start_pfn < end_pfn) +- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); +- +- /* big page range */ +- start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) +- << (PMD_SHIFT - PAGE_SHIFT); +- if (start_pfn < (big_page_start >> PAGE_SHIFT)) +- start_pfn = big_page_start >> PAGE_SHIFT; +- end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +- if (start_pfn < end_pfn) +- kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, +- use_pse); +- +- /* tail is not big page alignment ? 
*/ +- start_pfn = end_pfn; +- if (start_pfn > (big_page_start>>PAGE_SHIFT)) { +- end_pfn = end >> PAGE_SHIFT; +- if (start_pfn < end_pfn) +- kernel_physical_mapping_init(pgd_base, start_pfn, +- end_pfn, 0); +- } +- +- early_ioremap_page_table_range_init(pgd_base); +- +- load_cr3(swapper_pg_dir); +- +- __flush_tlb_all(); +- +- if (!after_init_bootmem) +- reserve_early(table_start << PAGE_SHIFT, +- table_end << PAGE_SHIFT, "PGTABLE"); +- +- if (!after_init_bootmem) +- early_memtest(start, end); +- +- return end >> PAGE_SHIFT; + } + +- + /* + * paging_init() sets up the page tables - note that the first 8MB are + * already mapped by head.S. +@@ -1024,6 +932,8 @@ void __init mem_init(void) + /* this will put all low memory onto the freelists */ + totalram_pages += free_all_bootmem(); + ++ bootmem_state = AFTER_BOOTMEM; ++ + reservedpages = 0; + for (tmp = 0; tmp < max_low_pfn; tmp++) + /* +@@ -1155,17 +1065,47 @@ static noinline int do_test_wp_bit(void) + const int rodata_test_data = 0xC3; + EXPORT_SYMBOL_GPL(rodata_test_data); + ++static int kernel_set_to_readonly; ++ ++void set_kernel_text_rw(void) ++{ ++ unsigned long start = PFN_ALIGN(_text); ++ unsigned long size = PFN_ALIGN(_etext) - start; ++ ++ if (!kernel_set_to_readonly) ++ return; ++ ++ pr_debug("Set kernel text: %lx - %lx for read write\n", ++ start, start+size); ++ ++ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); ++} ++ ++void set_kernel_text_ro(void) ++{ ++ unsigned long start = PFN_ALIGN(_text); ++ unsigned long size = PFN_ALIGN(_etext) - start; ++ ++ if (!kernel_set_to_readonly) ++ return; ++ ++ pr_debug("Set kernel text: %lx - %lx for read only\n", ++ start, start+size); ++ ++ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); ++} ++ + void mark_rodata_ro(void) + { + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + +-#ifndef CONFIG_DYNAMIC_FTRACE +- /* Dynamic tracing modifies the kernel text section */ + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + ++ kernel_set_to_readonly = 1; ++ + #ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); +@@ -1174,7 +1114,6 @@ void mark_rodata_ro(void) + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); + #endif +-#endif /* CONFIG_DYNAMIC_FTRACE */ + + start += size; + size = (unsigned long)__end_rodata - start; +@@ -1193,52 +1132,6 @@ void mark_rodata_ro(void) + } + #endif + +-void free_init_pages(char *what, unsigned long begin, unsigned long end) +-{ +-#ifdef CONFIG_DEBUG_PAGEALLOC +- /* +- * If debugging page accesses then do not free this memory but +- * mark them not present - any buggy init-section access will +- * create a kernel page fault: +- */ +- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", +- begin, PAGE_ALIGN(end)); +- set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +-#else +- unsigned long addr; +- +- /* +- * We just marked the kernel text read only above, now that +- * we are going to free part of that, we need to make that +- * writeable first. 
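kernel_set_to_readonly gates set_kernel_text_rw()/set_kernel_text_ro(): they only flip protections once mark_rodata_ro() has write-protected the text, which is what lets the CONFIG_DYNAMIC_FTRACE special case above go away. Below is a user-space analogy of that bracketing pattern using mmap()/mprotect() on an anonymous page; it is not kernel code, just the same guard-then-toggle shape a text patcher would use.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static unsigned char *text;
static size_t text_size;
static int kernel_set_to_readonly;

static void set_text_ro(void)
{
	if (!kernel_set_to_readonly)
		return;
	mprotect(text, text_size, PROT_READ);
}

static void set_text_rw(void)
{
	if (!kernel_set_to_readonly)
		return;
	mprotect(text, text_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
	text_size = (size_t)sysconf(_SC_PAGESIZE);
	text = mmap(NULL, text_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (text == MAP_FAILED)
		return 1;

	/* "mark_rodata_ro": write-protect and remember that we did */
	mprotect(text, text_size, PROT_READ);
	kernel_set_to_readonly = 1;

	/* a code patcher brackets its modification with rw/ro */
	set_text_rw();
	text[0] = 0x90;			/* poke a byte while writable */
	set_text_ro();

	printf("patched first byte to %#x\n", text[0]);
	return 0;
}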
+- */ +- set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); +- +- for (addr = begin; addr < end; addr += PAGE_SIZE) { +- ClearPageReserved(virt_to_page(addr)); +- init_page_count(virt_to_page(addr)); +- memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); +- free_page(addr); +- totalram_pages++; +- } +- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); +-#endif +-} +- +-void free_initmem(void) +-{ +- free_init_pages("unused kernel memory", +- (unsigned long)(&__init_begin), +- (unsigned long)(&__init_end)); +-} +- +-#ifdef CONFIG_BLK_DEV_INITRD +-void free_initrd_mem(unsigned long start, unsigned long end) +-{ +- free_init_pages("initrd memory", start, end); +-} +-#endif +- + int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) + { +Index: linux-2.6-tip/arch/x86/mm/init_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/init_64.c ++++ linux-2.6-tip/arch/x86/mm/init_64.c +@@ -48,6 +48,7 @@ + #include + #include + #include ++#include + + /* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. +@@ -59,14 +60,6 @@ unsigned long max_pfn_mapped; + + static unsigned long dma_reserve __initdata; + +-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +- +-int direct_gbpages +-#ifdef CONFIG_DIRECT_GBPAGES +- = 1 +-#endif +-; +- + static int __init parse_direct_gbpages_off(char *arg) + { + direct_gbpages = 0; +@@ -87,12 +80,10 @@ early_param("gbpages", parse_direct_gbpa + * around without checking the pgd every time. + */ + +-int after_bootmem; +- + pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + +-static int do_not_nx __cpuinitdata; ++static int disable_nx __cpuinitdata; + + /* + * noexec=on|off +@@ -107,9 +98,9 @@ static int __init nonx_setup(char *str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; +- do_not_nx = 0; ++ disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { +- do_not_nx = 1; ++ disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +@@ -121,7 +112,7 @@ void __cpuinit check_efer(void) + unsigned long efer; + + rdmsrl(MSR_EFER, efer); +- if (!(efer & EFER_NX) || do_not_nx) ++ if (!(efer & EFER_NX) || disable_nx) + __supported_pte_mask &= ~_PAGE_NX; + } + +@@ -147,20 +138,26 @@ __setup("noexec32=", nonx32_setup); + + /* + * NOTE: This function is marked __ref because it calls __init function +- * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. ++ * (alloc_bootmem_pages). It's safe to do it ONLY when DURING_BOOTMEM. + */ + static __ref void *spp_getpage(void) + { +- void *ptr; ++ void *ptr = NULL; + +- if (after_bootmem) +- ptr = (void *) get_zeroed_page(GFP_ATOMIC); +- else ++ switch (bootmem_state) { ++ case AFTER_BOOTMEM: ++ ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); ++ break; ++ case DURING_BOOTMEM: + ptr = alloc_bootmem_pages(PAGE_SIZE); ++ break; ++ default: ++ panic("calling spp_getpage before bootmem\n"); ++ } + + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { + panic("set_pte_phys: cannot allocate page data %s\n", +- after_bootmem ? "after bootmem" : ""); ++ bootmem_state == AFTER_BOOTMEM ? 
"after bootmem" : ""); + } + + pr_debug("spp_getpage %p\n", ptr); +@@ -168,34 +165,51 @@ static __ref void *spp_getpage(void) + return ptr; + } + +-void +-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) ++static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) + { +- pud_t *pud; +- pmd_t *pmd; +- pte_t *pte; ++ if (pgd_none(*pgd)) { ++ pud_t *pud = (pud_t *)spp_getpage(); ++ pgd_populate(&init_mm, pgd, pud); ++ if (pud != pud_offset(pgd, 0)) ++ printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", ++ pud, pud_offset(pgd, 0)); ++ } ++ return pud_offset(pgd, vaddr); ++} + +- pud = pud_page + pud_index(vaddr); ++static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) ++{ + if (pud_none(*pud)) { +- pmd = (pmd_t *) spp_getpage(); ++ pmd_t *pmd = (pmd_t *) spp_getpage(); + pud_populate(&init_mm, pud, pmd); +- if (pmd != pmd_offset(pud, 0)) { ++ if (pmd != pmd_offset(pud, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", +- pmd, pmd_offset(pud, 0)); +- return; +- } ++ pmd, pmd_offset(pud, 0)); + } +- pmd = pmd_offset(pud, vaddr); ++ return pmd_offset(pud, vaddr); ++} ++ ++static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) ++{ + if (pmd_none(*pmd)) { +- pte = (pte_t *) spp_getpage(); ++ pte_t *pte = (pte_t *) spp_getpage(); + pmd_populate_kernel(&init_mm, pmd, pte); +- if (pte != pte_offset_kernel(pmd, 0)) { ++ if (pte != pte_offset_kernel(pmd, 0)) + printk(KERN_ERR "PAGETABLE BUG #02!\n"); +- return; +- } + } ++ return pte_offset_kernel(pmd, vaddr); ++} ++ ++void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) ++{ ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ ++ pud = pud_page + pud_index(vaddr); ++ pmd = fill_pmd(pud, vaddr); ++ pte = fill_pte(pmd, vaddr); + +- pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, new_pte); + + /* +@@ -205,8 +219,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsig + __flush_tlb_one(vaddr); + } + +-void +-set_pte_vaddr(unsigned long vaddr, pte_t pteval) ++void set_pte_vaddr(unsigned long vaddr, pte_t pteval) + { + pgd_t *pgd; + pud_t *pud_page; +@@ -223,6 +236,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t + set_pte_vaddr_pud(pud_page, vaddr, pteval); + } + ++pmd_t * __init populate_extra_pmd(unsigned long vaddr) ++{ ++ pgd_t *pgd; ++ pud_t *pud; ++ ++ pgd = pgd_offset_k(vaddr); ++ pud = fill_pud(pgd, vaddr); ++ return fill_pmd(pud, vaddr); ++} ++ ++pte_t * __init populate_extra_pte(unsigned long vaddr) ++{ ++ pmd_t *pmd; ++ ++ pmd = populate_extra_pmd(vaddr); ++ return fill_pte(pmd, vaddr); ++} ++ + /* + * Create large page table mappings for a range of physical addresses. 
+ */ +@@ -291,23 +322,20 @@ void __init cleanup_highmap(void) + } + } + +-static unsigned long __initdata table_start; +-static unsigned long __meminitdata table_end; +-static unsigned long __meminitdata table_top; +- + static __ref void *alloc_low_page(unsigned long *phys) + { +- unsigned long pfn = table_end++; ++ unsigned long pfn; + void *adr; + +- if (after_bootmem) { +- adr = (void *)get_zeroed_page(GFP_ATOMIC); ++ if (bootmem_state == AFTER_BOOTMEM) { ++ adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + *phys = __pa(adr); + + return adr; + } + +- if (pfn >= table_top) ++ pfn = e820_table_end++; ++ if (pfn >= e820_table_top) + panic("alloc_low_page: ran out of memory"); + + adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); +@@ -318,7 +346,7 @@ static __ref void *alloc_low_page(unsign + + static __ref void unmap_low_page(void *adr) + { +- if (after_bootmem) ++ if (bootmem_state == AFTER_BOOTMEM) + return; + + early_iounmap(adr, PAGE_SIZE); +@@ -337,7 +365,7 @@ phys_pte_init(pte_t *pte_page, unsigned + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { + + if (addr >= end) { +- if (!after_bootmem) { ++ if (bootmem_state != AFTER_BOOTMEM) { + for(; i < PTRS_PER_PTE; i++, pte++) + set_pte(pte, __pte(0)); + } +@@ -393,7 +421,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned + pgprot_t new_prot = prot; + + if (address >= end) { +- if (!after_bootmem) { ++ if (bootmem_state != AFTER_BOOTMEM) { + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); + } +@@ -479,7 +507,7 @@ phys_pud_init(pud_t *pud_page, unsigned + if (addr >= end) + break; + +- if (!after_bootmem && ++ if (bootmem_state != AFTER_BOOTMEM && + !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { + set_pud(pud, __pud(0)); + continue; +@@ -547,58 +575,10 @@ phys_pud_update(pgd_t *pgd, unsigned lon + return phys_pud_init(pud, addr, end, page_size_mask); + } + +-static void __init find_early_table_space(unsigned long end, int use_pse, +- int use_gbpages) +-{ +- unsigned long puds, pmds, ptes, tables, start; +- +- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; +- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); +- if (use_gbpages) { +- unsigned long extra; +- extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); +- pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; +- } else +- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; +- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); +- +- if (use_pse) { +- unsigned long extra; +- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; +- } else +- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; +- tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); +- +- /* +- * RED-PEN putting page tables only on node 0 could +- * cause a hotspot and fill up ZONE_DMA. The page tables +- * need roughly 0.5KB per GB. 
+- */ +- start = 0x8000; +- table_start = find_e820_area(start, end, tables, PAGE_SIZE); +- if (table_start == -1UL) +- panic("Cannot find space for the kernel page tables"); +- +- table_start >>= PAGE_SHIFT; +- table_end = table_start; +- table_top = table_start + (tables >> PAGE_SHIFT); +- +- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", +- end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); +-} +- +-static void __init init_gbpages(void) +-{ +- if (direct_gbpages && cpu_has_gbpages) +- printk(KERN_INFO "Using GB pages for direct mapping\n"); +- else +- direct_gbpages = 0; +-} +- +-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, +- unsigned long end, +- unsigned long page_size_mask) ++unsigned long __init ++kernel_physical_mapping_init(unsigned long start, ++ unsigned long end, ++ unsigned long page_size_mask) + { + + unsigned long next, last_map_addr = end; +@@ -635,176 +615,6 @@ static unsigned long __meminit kernel_ph + return last_map_addr; + } + +-struct map_range { +- unsigned long start; +- unsigned long end; +- unsigned page_size_mask; +-}; +- +-#define NR_RANGE_MR 5 +- +-static int save_mr(struct map_range *mr, int nr_range, +- unsigned long start_pfn, unsigned long end_pfn, +- unsigned long page_size_mask) +-{ +- +- if (start_pfn < end_pfn) { +- if (nr_range >= NR_RANGE_MR) +- panic("run out of range for init_memory_mapping\n"); +- mr[nr_range].start = start_pfn<> PAGE_SHIFT; +- pos = start_pfn << PAGE_SHIFT; +- end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) +- << (PMD_SHIFT - PAGE_SHIFT); +- if (end_pfn > (end >> PAGE_SHIFT)) +- end_pfn = end >> PAGE_SHIFT; +- if (start_pfn < end_pfn) { +- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); +- pos = end_pfn << PAGE_SHIFT; +- } +- +- /* big page (2M) range*/ +- start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) +- << (PMD_SHIFT - PAGE_SHIFT); +- end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) +- << (PUD_SHIFT - PAGE_SHIFT); +- if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) +- end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +- if (start_pfn < end_pfn) { +- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, +- page_size_mask & (1<>PUD_SHIFT) +- << (PUD_SHIFT - PAGE_SHIFT); +- end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); +- if (start_pfn < end_pfn) { +- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, +- page_size_mask & +- ((1<>PMD_SHIFT) +- << (PMD_SHIFT - PAGE_SHIFT); +- end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +- if (start_pfn < end_pfn) { +- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, +- page_size_mask & (1<>PAGE_SHIFT; +- end_pfn = end>>PAGE_SHIFT; +- nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); +- +- /* try to merge same page size and continuous */ +- for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { +- unsigned long old_start; +- if (mr[i].end != mr[i+1].start || +- mr[i].page_size_mask != mr[i+1].page_size_mask) +- continue; +- /* move it */ +- old_start = mr[i].start; +- memmove(&mr[i], &mr[i+1], +- (nr_range - 1 - i) * sizeof (struct map_range)); +- mr[i--].start = old_start; +- nr_range--; +- } +- +- for (i = 0; i < nr_range; i++) +- printk(KERN_DEBUG " %010lx - %010lx page %s\n", +- mr[i].start, mr[i].end, +- (mr[i].page_size_mask & (1< table_start) +- reserve_early(table_start << PAGE_SHIFT, +- table_end << PAGE_SHIFT, "PGTABLE"); +- +- printk(KERN_INFO "last_map_addr: %lx end: %lx\n", +- last_map_addr, end); +- +- if (!after_bootmem) +- early_memtest(start, end); +- +- 
return last_map_addr >> PAGE_SHIFT; +-} +- + #ifndef CONFIG_NUMA + void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) + { +@@ -876,28 +686,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to + + #endif /* CONFIG_MEMORY_HOTPLUG */ + +-/* +- * devmem_is_allowed() checks to see if /dev/mem access to a certain address +- * is valid. The argument is a physical page number. +- * +- * +- * On x86, access has to be given to the first megabyte of ram because that area +- * contains bios code and data regions used by X and dosemu and similar apps. +- * Access has to be given to non-kernel-ram areas as well, these contain the PCI +- * mmio resources as well as potential bios/acpi data regions. +- */ +-int devmem_is_allowed(unsigned long pagenr) +-{ +- if (pagenr <= 256) +- return 1; +- if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) +- return 0; +- if (!page_is_ram(pagenr)) +- return 1; +- return 0; +-} +- +- + static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, + kcore_modules, kcore_vsyscall; + +@@ -910,8 +698,6 @@ void __init mem_init(void) + + /* clear_bss() already clear the empty_zero_page */ + +- reservedpages = 0; +- + /* this will put all low memory onto the freelists */ + #ifdef CONFIG_NUMA + totalram_pages = numa_free_all_bootmem(); +@@ -919,9 +705,9 @@ void __init mem_init(void) + totalram_pages = free_all_bootmem(); + #endif + ++ bootmem_state = AFTER_BOOTMEM; + absent_pages = absent_pages_in_range(0, max_pfn); + reservedpages = max_pfn - totalram_pages - absent_pages; +- after_bootmem = 1; + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; +@@ -947,46 +733,39 @@ void __init mem_init(void) + initsize >> 10); + } + +-void free_init_pages(char *what, unsigned long begin, unsigned long end) ++#ifdef CONFIG_DEBUG_RODATA ++const int rodata_test_data = 0xC3; ++EXPORT_SYMBOL_GPL(rodata_test_data); ++ ++static int kernel_set_to_readonly; ++ ++void set_kernel_text_rw(void) + { +- unsigned long addr = begin; ++ unsigned long start = PFN_ALIGN(_stext); ++ unsigned long end = PFN_ALIGN(__start_rodata); + +- if (addr >= end) ++ if (!kernel_set_to_readonly) + return; + +- /* +- * If debugging page accesses then do not free this memory but +- * mark them not present - any buggy init-section access will +- * create a kernel page fault: +- */ +-#ifdef CONFIG_DEBUG_PAGEALLOC +- printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", +- begin, PAGE_ALIGN(end)); +- set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +-#else +- printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); ++ pr_debug("Set kernel text: %lx - %lx for read write\n", ++ start, end); + +- for (; addr < end; addr += PAGE_SIZE) { +- ClearPageReserved(virt_to_page(addr)); +- init_page_count(virt_to_page(addr)); +- memset((void *)(addr & ~(PAGE_SIZE-1)), +- POISON_FREE_INITMEM, PAGE_SIZE); +- free_page(addr); +- totalram_pages++; +- } +-#endif ++ set_memory_rw(start, (end - start) >> PAGE_SHIFT); + } + +-void free_initmem(void) ++void set_kernel_text_ro(void) + { +- free_init_pages("unused kernel memory", +- (unsigned long)(&__init_begin), +- (unsigned long)(&__init_end)); +-} ++ unsigned long start = PFN_ALIGN(_stext); ++ unsigned long end = PFN_ALIGN(__start_rodata); + +-#ifdef CONFIG_DEBUG_RODATA +-const int rodata_test_data = 0xC3; +-EXPORT_SYMBOL_GPL(rodata_test_data); ++ if (!kernel_set_to_readonly) ++ return; ++ ++ pr_debug("Set kernel text: %lx - %lx for read only\n", ++ start, end); ++ ++ 
set_memory_ro(start, (end - start) >> PAGE_SHIFT); ++} + + void mark_rodata_ro(void) + { +@@ -994,15 +773,12 @@ void mark_rodata_ro(void) + unsigned long rodata_start = + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + +-#ifdef CONFIG_DYNAMIC_FTRACE +- /* Dynamic tracing modifies the kernel text section */ +- start = rodata_start; +-#endif +- + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + ++ kernel_set_to_readonly = 1; ++ + /* + * The rodata section (but not the kernel text!) should also be + * not-executable. +@@ -1022,13 +798,6 @@ void mark_rodata_ro(void) + + #endif + +-#ifdef CONFIG_BLK_DEV_INITRD +-void free_initrd_mem(unsigned long start, unsigned long end) +-{ +- free_init_pages("initrd memory", start, end); +-} +-#endif +- + int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) + { +Index: linux-2.6-tip/arch/x86/mm/iomap_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/iomap_32.c ++++ linux-2.6-tip/arch/x86/mm/iomap_32.c +@@ -31,16 +31,28 @@ int is_io_mapping_possible(resource_size + } + EXPORT_SYMBOL_GPL(is_io_mapping_possible); + +-/* Map 'pfn' using fixed map 'type' and protections 'prot' +- */ +-void * +-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) ++void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) + { + enum fixed_addresses idx; + unsigned long vaddr; + ++ preempt_disable(); + pagefault_disable(); + ++ idx = type + KM_TYPE_NR * smp_processor_id(); ++ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); ++ set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); ++ arch_flush_lazy_mmu_mode(); ++ ++ return (void *)vaddr; ++} ++ ++/* ++ * Map 'pfn' using fixed map 'type' and protections 'prot' ++ */ ++void * ++iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) ++{ + /* + * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 
+ * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the +@@ -50,12 +62,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, + if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) + prot = PAGE_KERNEL_UC_MINUS; + +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte(kmap_pte-idx, pfn_pte(pfn, prot)); +- arch_flush_lazy_mmu_mode(); +- +- return (void*) vaddr; ++ return kmap_atomic_prot_pfn(pfn, type, prot); + } + EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); + +@@ -76,5 +83,6 @@ iounmap_atomic(void *kvaddr, enum km_typ + + arch_flush_lazy_mmu_mode(); + pagefault_enable(); ++ preempt_enable(); + } + EXPORT_SYMBOL_GPL(iounmap_atomic); +Index: linux-2.6-tip/arch/x86/mm/ioremap.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/ioremap.c ++++ linux-2.6-tip/arch/x86/mm/ioremap.c +@@ -22,13 +22,17 @@ + #include + #include + +-#ifdef CONFIG_X86_64 +- +-static inline int phys_addr_valid(unsigned long addr) ++static inline int phys_addr_valid(resource_size_t addr) + { +- return addr < (1UL << boot_cpu_data.x86_phys_bits); ++#ifdef CONFIG_PHYS_ADDR_T_64BIT ++ return !(addr >> boot_cpu_data.x86_phys_bits); ++#else ++ return 1; ++#endif + } + ++#ifdef CONFIG_X86_64 ++ + unsigned long __phys_addr(unsigned long x) + { + if (x >= __START_KERNEL_map) { +@@ -38,8 +42,7 @@ unsigned long __phys_addr(unsigned long + } else { + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; +- VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : +- !phys_addr_valid(x)); ++ VIRTUAL_BUG_ON(!phys_addr_valid(x)); + } + return x; + } +@@ -56,10 +59,8 @@ bool __virt_addr_valid(unsigned long x) + if (x < PAGE_OFFSET) + return false; + x -= PAGE_OFFSET; +- if (system_state == SYSTEM_BOOTING ? +- x > MAXMEM : !phys_addr_valid(x)) { ++ if (!phys_addr_valid(x)) + return false; +- } + } + + return pfn_valid(x >> PAGE_SHIFT); +@@ -68,18 +69,12 @@ EXPORT_SYMBOL(__virt_addr_valid); + + #else + +-static inline int phys_addr_valid(unsigned long addr) +-{ +- return 1; +-} +- + #ifdef CONFIG_DEBUG_VIRTUAL + unsigned long __phys_addr(unsigned long x) + { +- /* VMALLOC_* aren't constants; not available at the boot time */ ++ /* VMALLOC_* aren't constants */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET); +- VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && +- is_vmalloc_addr((void *) x)); ++ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); + return x - PAGE_OFFSET; + } + EXPORT_SYMBOL(__phys_addr); +@@ -89,7 +84,9 @@ bool __virt_addr_valid(unsigned long x) + { + if (x < PAGE_OFFSET) + return false; +- if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) ++ if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) ++ return false; ++ if (x >= FIXADDR_START) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); + } +@@ -348,7 +345,7 @@ EXPORT_SYMBOL(ioremap_nocache); + * + * Must be freed with iounmap. 
+ */ +-void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) ++void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) + { + if (pat_enabled) + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, +@@ -508,13 +505,19 @@ static inline pte_t * __init early_iorem + return &bm_pte[pte_index(addr)]; + } + ++static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; ++ + void __init early_ioremap_init(void) + { + pmd_t *pmd; ++ int i; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_init()\n"); + ++ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) ++ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); ++ + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + memset(bm_pte, 0, sizeof(bm_pte)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); +@@ -544,7 +547,7 @@ void __init early_ioremap_reset(void) + } + + static void __init __early_set_fixmap(enum fixed_addresses idx, +- unsigned long phys, pgprot_t flags) ++ phys_addr_t phys, pgprot_t flags) + { + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; +@@ -563,7 +566,7 @@ static void __init __early_set_fixmap(en + } + + static inline void __init early_set_fixmap(enum fixed_addresses idx, +- unsigned long phys, pgprot_t prot) ++ phys_addr_t phys, pgprot_t prot) + { + if (after_paging_init) + __set_fixmap(idx, phys, prot); +@@ -581,6 +584,7 @@ static inline void __init early_clear_fi + + static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; + static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; ++ + static int __init check_early_ioremap_leak(void) + { + int count = 0; +@@ -602,9 +606,11 @@ static int __init check_early_ioremap_le + } + late_initcall(check_early_ioremap_leak); + +-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) ++static void __init __iomem * ++__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) + { +- unsigned long offset, last_addr; ++ unsigned long offset; ++ resource_size_t last_addr; + unsigned int nrpages; + enum fixed_addresses idx0, idx; + int i, slot; +@@ -620,15 +626,15 @@ static void __init __iomem *__early_iore + } + + if (slot < 0) { +- printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", +- phys_addr, size); ++ printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n", ++ (u64)phys_addr, size); + WARN_ON(1); + return NULL; + } + + if (early_ioremap_debug) { +- printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", +- phys_addr, size, slot); ++ printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ", ++ (u64)phys_addr, size, slot); + dump_stack(); + } + +@@ -668,20 +674,22 @@ static void __init __iomem *__early_iore + --nrpages; + } + if (early_ioremap_debug) +- printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); ++ printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); + +- prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); ++ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); + return prev_map[slot]; + } + + /* Remap an IO device */ +-void __init __iomem *early_ioremap(unsigned long phys_addr, unsigned long size) ++void __init __iomem * ++early_ioremap(resource_size_t phys_addr, unsigned long size) + { + return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); + } + + /* Remap memory */ +-void __init __iomem *early_memremap(unsigned long phys_addr, unsigned long size) ++void __init __iomem * ++early_memremap(resource_size_t phys_addr, unsigned long size) + { + return __early_ioremap(phys_addr, size, 
PAGE_KERNEL); + } +@@ -738,8 +746,3 @@ void __init early_iounmap(void __iomem * + } + prev_map[slot] = NULL; + } +- +-void __this_fixmap_does_not_exist(void) +-{ +- WARN_ON(1); +-} +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/Makefile +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/Makefile +@@ -0,0 +1 @@ ++obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/error.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/error.c +@@ -0,0 +1,228 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "error.h" ++#include "shadow.h" ++ ++enum kmemcheck_error_type { ++ KMEMCHECK_ERROR_INVALID_ACCESS, ++ KMEMCHECK_ERROR_BUG, ++}; ++ ++#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) ++ ++struct kmemcheck_error { ++ enum kmemcheck_error_type type; ++ ++ union { ++ /* KMEMCHECK_ERROR_INVALID_ACCESS */ ++ struct { ++ /* Kind of access that caused the error */ ++ enum kmemcheck_shadow state; ++ /* Address and size of the erroneous read */ ++ unsigned long address; ++ unsigned int size; ++ }; ++ }; ++ ++ struct pt_regs regs; ++ struct stack_trace trace; ++ unsigned long trace_entries[32]; ++ ++ /* We compress it to a char. */ ++ unsigned char shadow_copy[SHADOW_COPY_SIZE]; ++ unsigned char memory_copy[SHADOW_COPY_SIZE]; ++}; ++ ++/* ++ * Create a ring queue of errors to output. We can't call printk() directly ++ * from the kmemcheck traps, since this may call the console drivers and ++ * result in a recursive fault. ++ */ ++static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; ++static unsigned int error_count; ++static unsigned int error_rd; ++static unsigned int error_wr; ++static unsigned int error_missed_count; ++ ++static struct kmemcheck_error *error_next_wr(void) ++{ ++ struct kmemcheck_error *e; ++ ++ if (error_count == ARRAY_SIZE(error_fifo)) { ++ ++error_missed_count; ++ return NULL; ++ } ++ ++ e = &error_fifo[error_wr]; ++ if (++error_wr == ARRAY_SIZE(error_fifo)) ++ error_wr = 0; ++ ++error_count; ++ return e; ++} ++ ++static struct kmemcheck_error *error_next_rd(void) ++{ ++ struct kmemcheck_error *e; ++ ++ if (error_count == 0) ++ return NULL; ++ ++ e = &error_fifo[error_rd]; ++ if (++error_rd == ARRAY_SIZE(error_fifo)) ++ error_rd = 0; ++ --error_count; ++ return e; ++} ++ ++void kmemcheck_error_recall(void) ++{ ++ static const char *desc[] = { ++ [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", ++ [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", ++ [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", ++ [KMEMCHECK_SHADOW_FREED] = "freed", ++ }; ++ ++ static const char short_desc[] = { ++ [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', ++ [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', ++ [KMEMCHECK_SHADOW_INITIALIZED] = 'i', ++ [KMEMCHECK_SHADOW_FREED] = 'f', ++ }; ++ ++ struct kmemcheck_error *e; ++ unsigned int i; ++ ++ e = error_next_rd(); ++ if (!e) ++ return; ++ ++ switch (e->type) { ++ case KMEMCHECK_ERROR_INVALID_ACCESS: ++ printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " ++ "from %s memory (%p)\n", ++ 8 * e->size, e->state < ARRAY_SIZE(desc) ? 
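The kmemcheck error ring above is deliberately lossy: the trap side may not sleep or call printk(), so a full queue just drops the report and bumps a counter that the tasklet prints later. A stand-alone model of error_next_wr()/error_next_rd() with a 4-entry queue:

#include <stdio.h>

#define QUEUE_SIZE 4

struct report { int id; };

static struct report fifo[QUEUE_SIZE];
static unsigned int count, rd, wr, missed;

/* writer side: called from trap context, so it must never block --
 * when the queue is full the report is dropped and counted */
static struct report *next_wr(void)
{
	struct report *r;

	if (count == QUEUE_SIZE) {
		++missed;
		return NULL;
	}
	r = &fifo[wr];
	if (++wr == QUEUE_SIZE)
		wr = 0;
	++count;
	return r;
}

/* reader side: drained later from tasklet context */
static struct report *next_rd(void)
{
	struct report *r;

	if (count == 0)
		return NULL;
	r = &fifo[rd];
	if (++rd == QUEUE_SIZE)
		rd = 0;
	--count;
	return r;
}

int main(void)
{
	struct report *r;
	int i;

	for (i = 0; i < 6; i++) {	/* two more than the queue holds */
		r = next_wr();
		if (r)
			r->id = i;
	}
	while ((r = next_rd()))
		printf("report %d\n", r->id);
	printf("lost %u reports\n", missed);
	return 0;
}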
++ desc[e->state] : "(invalid shadow state)", ++ (void *) e->address); ++ ++ printk(KERN_INFO); ++ for (i = 0; i < SHADOW_COPY_SIZE; ++i) ++ printk("%02x", e->memory_copy[i]); ++ printk("\n"); ++ ++ printk(KERN_INFO); ++ for (i = 0; i < SHADOW_COPY_SIZE; ++i) { ++ if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) ++ printk(" %c", short_desc[e->shadow_copy[i]]); ++ else ++ printk(" ?"); ++ } ++ printk("\n"); ++ printk(KERN_INFO "%*c\n", 2 + 2 ++ * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); ++ break; ++ case KMEMCHECK_ERROR_BUG: ++ printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); ++ break; ++ } ++ ++ __show_regs(&e->regs, 1); ++ print_stack_trace(&e->trace, 0); ++} ++ ++static void do_wakeup(unsigned long data) ++{ ++ while (error_count > 0) ++ kmemcheck_error_recall(); ++ ++ if (error_missed_count > 0) { ++ printk(KERN_WARNING "kmemcheck: Lost %d error reports because " ++ "the queue was too small\n", error_missed_count); ++ error_missed_count = 0; ++ } ++} ++ ++static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); ++ ++/* ++ * Save the context of an error report. ++ */ ++void kmemcheck_error_save(enum kmemcheck_shadow state, ++ unsigned long address, unsigned int size, struct pt_regs *regs) ++{ ++ static unsigned long prev_ip; ++ ++ struct kmemcheck_error *e; ++ void *shadow_copy; ++ void *memory_copy; ++ ++ /* Don't report several adjacent errors from the same EIP. */ ++ if (regs->ip == prev_ip) ++ return; ++ prev_ip = regs->ip; ++ ++ e = error_next_wr(); ++ if (!e) ++ return; ++ ++ e->type = KMEMCHECK_ERROR_INVALID_ACCESS; ++ ++ e->state = state; ++ e->address = address; ++ e->size = size; ++ ++ /* Save regs */ ++ memcpy(&e->regs, regs, sizeof(*regs)); ++ ++ /* Save stack trace */ ++ e->trace.nr_entries = 0; ++ e->trace.entries = e->trace_entries; ++ e->trace.max_entries = ARRAY_SIZE(e->trace_entries); ++ e->trace.skip = 0; ++ save_stack_trace_bp(&e->trace, regs->bp); ++ ++ /* Round address down to nearest 16 bytes */ ++ shadow_copy = kmemcheck_shadow_lookup(address ++ & ~(SHADOW_COPY_SIZE - 1)); ++ BUG_ON(!shadow_copy); ++ ++ memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); ++ ++ kmemcheck_show_addr(address); ++ memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); ++ memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); ++ kmemcheck_hide_addr(address); ++ ++ tasklet_hi_schedule_first(&kmemcheck_tasklet); ++} ++ ++/* ++ * Save the context of a kmemcheck bug. 
++ */ ++void kmemcheck_error_save_bug(struct pt_regs *regs) ++{ ++ struct kmemcheck_error *e; ++ ++ e = error_next_wr(); ++ if (!e) ++ return; ++ ++ e->type = KMEMCHECK_ERROR_BUG; ++ ++ memcpy(&e->regs, regs, sizeof(*regs)); ++ ++ e->trace.nr_entries = 0; ++ e->trace.entries = e->trace_entries; ++ e->trace.max_entries = ARRAY_SIZE(e->trace_entries); ++ e->trace.skip = 1; ++ save_stack_trace(&e->trace); ++ ++ tasklet_hi_schedule_first(&kmemcheck_tasklet); ++} +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/error.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/error.h +@@ -0,0 +1,15 @@ ++#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H ++#define ARCH__X86__MM__KMEMCHECK__ERROR_H ++ ++#include ++ ++#include "shadow.h" ++ ++void kmemcheck_error_save(enum kmemcheck_shadow state, ++ unsigned long address, unsigned int size, struct pt_regs *regs); ++ ++void kmemcheck_error_save_bug(struct pt_regs *regs); ++ ++void kmemcheck_error_recall(void); ++ ++#endif +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/kmemcheck.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/kmemcheck.c +@@ -0,0 +1,637 @@ ++/** ++ * kmemcheck - a heavyweight memory checker for the linux kernel ++ * Copyright (C) 2007, 2008 Vegard Nossum ++ * (With a lot of help from Ingo Molnar and Pekka Enberg.) ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License (version 2) as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include "error.h" ++#include "opcode.h" ++#include "pte.h" ++#include "selftest.h" ++#include "shadow.h" ++ ++ ++#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT ++# define KMEMCHECK_ENABLED 0 ++#endif ++ ++#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT ++# define KMEMCHECK_ENABLED 1 ++#endif ++ ++#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT ++# define KMEMCHECK_ENABLED 2 ++#endif ++ ++int kmemcheck_enabled = KMEMCHECK_ENABLED; ++ ++void __init kmemcheck_init(void) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * Limit SMP to use a single CPU. We rely on the fact that this code ++ * runs before SMP is set up. ++ */ ++ if (setup_max_cpus > 1) { ++ printk(KERN_INFO ++ "kmemcheck: Limiting number of CPUs to 1.\n"); ++ setup_max_cpus = 1; ++ } ++#endif ++ ++ if (!kmemcheck_selftest()) { ++ printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); ++ kmemcheck_enabled = 0; ++ return; ++ } ++ ++ printk(KERN_INFO "kmemcheck: Initialized\n"); ++} ++ ++/* ++ * We need to parse the kmemcheck= option before any memory is allocated. 
++ */ ++static int __init param_kmemcheck(char *str) ++{ ++ if (!str) ++ return -EINVAL; ++ ++ sscanf(str, "%d", &kmemcheck_enabled); ++ return 0; ++} ++ ++early_param("kmemcheck", param_kmemcheck); ++ ++int kmemcheck_show_addr(unsigned long address) ++{ ++ pte_t *pte; ++ ++ pte = kmemcheck_pte_lookup(address); ++ if (!pte) ++ return 0; ++ ++ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); ++ __flush_tlb_one(address); ++ return 1; ++} ++ ++int kmemcheck_hide_addr(unsigned long address) ++{ ++ pte_t *pte; ++ ++ pte = kmemcheck_pte_lookup(address); ++ if (!pte) ++ return 0; ++ ++ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); ++ __flush_tlb_one(address); ++ return 1; ++} ++ ++struct kmemcheck_context { ++ bool busy; ++ int balance; ++ ++ /* ++ * There can be at most two memory operands to an instruction, but ++ * each address can cross a page boundary -- so we may need up to ++ * four addresses that must be hidden/revealed for each fault. ++ */ ++ unsigned long addr[4]; ++ unsigned long n_addrs; ++ unsigned long flags; ++ ++ /* Data size of the instruction that caused a fault. */ ++ unsigned int size; ++}; ++ ++static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); ++ ++bool kmemcheck_active(struct pt_regs *regs) ++{ ++ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); ++ ++ return data->balance > 0; ++} ++ ++/* Save an address that needs to be shown/hidden */ ++static void kmemcheck_save_addr(unsigned long addr) ++{ ++ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); ++ ++ BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); ++ data->addr[data->n_addrs++] = addr; ++} ++ ++static unsigned int kmemcheck_show_all(void) ++{ ++ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); ++ unsigned int i; ++ unsigned int n; ++ ++ n = 0; ++ for (i = 0; i < data->n_addrs; ++i) ++ n += kmemcheck_show_addr(data->addr[i]); ++ ++ return n; ++} ++ ++static unsigned int kmemcheck_hide_all(void) ++{ ++ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); ++ unsigned int i; ++ unsigned int n; ++ ++ n = 0; ++ for (i = 0; i < data->n_addrs; ++i) ++ n += kmemcheck_hide_addr(data->addr[i]); ++ ++ return n; ++} ++ ++/* ++ * Called from the #PF handler. ++ */ ++void kmemcheck_show(struct pt_regs *regs) ++{ ++ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); ++ ++ BUG_ON(!irqs_disabled()); ++ ++ if (unlikely(data->balance != 0)) { ++ kmemcheck_show_all(); ++ kmemcheck_error_save_bug(regs); ++ data->balance = 0; ++ return; ++ } ++ ++ /* ++ * None of the addresses actually belonged to kmemcheck. Note that ++ * this is not an error. ++ */ ++ if (kmemcheck_show_all() == 0) ++ return; ++ ++ ++data->balance; ++ ++ /* ++ * The IF needs to be cleared as well, so that the faulting ++ * instruction can run "uninterrupted". Otherwise, we might take ++ * an interrupt and start executing that before we've had a chance ++ * to hide the page again. ++ * ++ * NOTE: In the rare case of multiple faults, we must not override ++ * the original flags: ++ */ ++ if (!(regs->flags & X86_EFLAGS_TF)) ++ data->flags = regs->flags; ++ ++ regs->flags |= X86_EFLAGS_TF; ++ regs->flags &= ~X86_EFLAGS_IF; ++} ++ ++/* ++ * Called from the #DB handler. 
++ */ ++void kmemcheck_hide(struct pt_regs *regs) ++{ ++ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); ++ int n; ++ ++ BUG_ON(!irqs_disabled()); ++ ++ if (data->balance == 0) ++ return; ++ ++ if (unlikely(data->balance != 1)) { ++ kmemcheck_show_all(); ++ kmemcheck_error_save_bug(regs); ++ data->n_addrs = 0; ++ data->balance = 0; ++ ++ if (!(data->flags & X86_EFLAGS_TF)) ++ regs->flags &= ~X86_EFLAGS_TF; ++ if (data->flags & X86_EFLAGS_IF) ++ regs->flags |= X86_EFLAGS_IF; ++ return; ++ } ++ ++ if (kmemcheck_enabled) ++ n = kmemcheck_hide_all(); ++ else ++ n = kmemcheck_show_all(); ++ ++ if (n == 0) ++ return; ++ ++ --data->balance; ++ ++ data->n_addrs = 0; ++ ++ if (!(data->flags & X86_EFLAGS_TF)) ++ regs->flags &= ~X86_EFLAGS_TF; ++ if (data->flags & X86_EFLAGS_IF) ++ regs->flags |= X86_EFLAGS_IF; ++} ++ ++void kmemcheck_show_pages(struct page *p, unsigned int n) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < n; ++i) { ++ unsigned long address; ++ pte_t *pte; ++ unsigned int level; ++ ++ address = (unsigned long) page_address(&p[i]); ++ pte = lookup_address(address, &level); ++ BUG_ON(!pte); ++ BUG_ON(level != PG_LEVEL_4K); ++ ++ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); ++ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); ++ __flush_tlb_one(address); ++ } ++} ++ ++bool kmemcheck_page_is_tracked(struct page *p) ++{ ++ /* This will also check the "hidden" flag of the PTE. */ ++ return kmemcheck_pte_lookup((unsigned long) page_address(p)); ++} ++ ++void kmemcheck_hide_pages(struct page *p, unsigned int n) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < n; ++i) { ++ unsigned long address; ++ pte_t *pte; ++ unsigned int level; ++ ++ address = (unsigned long) page_address(&p[i]); ++ pte = lookup_address(address, &level); ++ BUG_ON(!pte); ++ BUG_ON(level != PG_LEVEL_4K); ++ ++ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); ++ set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); ++ __flush_tlb_one(address); ++ } ++} ++ ++/* Access may NOT cross page boundary */ ++static void kmemcheck_read_strict(struct pt_regs *regs, ++ unsigned long addr, unsigned int size) ++{ ++ void *shadow; ++ enum kmemcheck_shadow status; ++ ++ shadow = kmemcheck_shadow_lookup(addr); ++ if (!shadow) ++ return; ++ ++ kmemcheck_save_addr(addr); ++ status = kmemcheck_shadow_test(shadow, size); ++ if (status == KMEMCHECK_SHADOW_INITIALIZED) ++ return; ++ ++ if (kmemcheck_enabled) ++ kmemcheck_error_save(status, addr, size, regs); ++ ++ if (kmemcheck_enabled == 2) ++ kmemcheck_enabled = 0; ++ ++ /* Don't warn about it again. */ ++ kmemcheck_shadow_set(shadow, size); ++} ++ ++/* Access may cross page boundary */ ++static void kmemcheck_read(struct pt_regs *regs, ++ unsigned long addr, unsigned int size) ++{ ++ unsigned long page = addr & PAGE_MASK; ++ unsigned long next_addr = addr + size - 1; ++ unsigned long next_page = next_addr & PAGE_MASK; ++ ++ if (likely(page == next_page)) { ++ kmemcheck_read_strict(regs, addr, size); ++ return; ++ } ++ ++ /* ++ * What we do is basically to split the access across the ++ * two pages and handle each part separately. Yes, this means ++ * that we may now see reads that are 3 + 5 bytes, for ++ * example (and if both are uninitialized, there will be two ++ * reports), but it makes the code a lot simpler. 
++ */ ++ kmemcheck_read_strict(regs, addr, next_page - addr); ++ kmemcheck_read_strict(regs, next_page, next_addr - next_page); ++} ++ ++static void kmemcheck_write_strict(struct pt_regs *regs, ++ unsigned long addr, unsigned int size) ++{ ++ void *shadow; ++ ++ shadow = kmemcheck_shadow_lookup(addr); ++ if (!shadow) ++ return; ++ ++ kmemcheck_save_addr(addr); ++ kmemcheck_shadow_set(shadow, size); ++} ++ ++static void kmemcheck_write(struct pt_regs *regs, ++ unsigned long addr, unsigned int size) ++{ ++ unsigned long page = addr & PAGE_MASK; ++ unsigned long next_addr = addr + size - 1; ++ unsigned long next_page = next_addr & PAGE_MASK; ++ ++ if (likely(page == next_page)) { ++ kmemcheck_write_strict(regs, addr, size); ++ return; ++ } ++ ++ /* See comment in kmemcheck_read(). */ ++ kmemcheck_write_strict(regs, addr, next_page - addr); ++ kmemcheck_write_strict(regs, next_page, next_addr - next_page); ++} ++ ++/* ++ * Copying is hard. We have two addresses, each of which may be split across ++ * a page (and each page will have different shadow addresses). ++ */ ++static void kmemcheck_copy(struct pt_regs *regs, ++ unsigned long src_addr, unsigned long dst_addr, unsigned int size) ++{ ++ uint8_t shadow[8]; ++ enum kmemcheck_shadow status; ++ ++ unsigned long page; ++ unsigned long next_addr; ++ unsigned long next_page; ++ ++ uint8_t *x; ++ unsigned int i; ++ unsigned int n; ++ ++ BUG_ON(size > sizeof(shadow)); ++ ++ page = src_addr & PAGE_MASK; ++ next_addr = src_addr + size - 1; ++ next_page = next_addr & PAGE_MASK; ++ ++ if (likely(page == next_page)) { ++ /* Same page */ ++ x = kmemcheck_shadow_lookup(src_addr); ++ if (x) { ++ kmemcheck_save_addr(src_addr); ++ for (i = 0; i < size; ++i) ++ shadow[i] = x[i]; ++ } else { ++ for (i = 0; i < size; ++i) ++ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; ++ } ++ } else { ++ n = next_page - src_addr; ++ BUG_ON(n > sizeof(shadow)); ++ ++ /* First page */ ++ x = kmemcheck_shadow_lookup(src_addr); ++ if (x) { ++ kmemcheck_save_addr(src_addr); ++ for (i = 0; i < n; ++i) ++ shadow[i] = x[i]; ++ } else { ++ /* Not tracked */ ++ for (i = 0; i < n; ++i) ++ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; ++ } ++ ++ /* Second page */ ++ x = kmemcheck_shadow_lookup(next_page); ++ if (x) { ++ kmemcheck_save_addr(next_page); ++ for (i = n; i < size; ++i) ++ shadow[i] = x[i - n]; ++ } else { ++ /* Not tracked */ ++ for (i = n; i < size; ++i) ++ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; ++ } ++ } ++ ++ page = dst_addr & PAGE_MASK; ++ next_addr = dst_addr + size - 1; ++ next_page = next_addr & PAGE_MASK; ++ ++ if (likely(page == next_page)) { ++ /* Same page */ ++ x = kmemcheck_shadow_lookup(dst_addr); ++ if (x) { ++ kmemcheck_save_addr(dst_addr); ++ for (i = 0; i < size; ++i) { ++ x[i] = shadow[i]; ++ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; ++ } ++ } ++ } else { ++ n = next_page - dst_addr; ++ BUG_ON(n > sizeof(shadow)); ++ ++ /* First page */ ++ x = kmemcheck_shadow_lookup(dst_addr); ++ if (x) { ++ kmemcheck_save_addr(dst_addr); ++ for (i = 0; i < n; ++i) { ++ x[i] = shadow[i]; ++ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; ++ } ++ } ++ ++ /* Second page */ ++ x = kmemcheck_shadow_lookup(next_page); ++ if (x) { ++ kmemcheck_save_addr(next_page); ++ for (i = n; i < size; ++i) { ++ x[i - n] = shadow[i]; ++ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; ++ } ++ } ++ } ++ ++ status = kmemcheck_shadow_test(shadow, size); ++ if (status == KMEMCHECK_SHADOW_INITIALIZED) ++ return; ++ ++ if (kmemcheck_enabled) ++ kmemcheck_error_save(status, src_addr, size, regs); ++ ++ if 
(kmemcheck_enabled == 2) ++ kmemcheck_enabled = 0; ++} ++ ++enum kmemcheck_method { ++ KMEMCHECK_READ, ++ KMEMCHECK_WRITE, ++}; ++ ++static void kmemcheck_access(struct pt_regs *regs, ++ unsigned long fallback_address, enum kmemcheck_method fallback_method) ++{ ++ const uint8_t *insn; ++ const uint8_t *insn_primary; ++ unsigned int size; ++ ++ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); ++ ++ /* Recursive fault -- ouch. */ ++ if (data->busy) { ++ kmemcheck_show_addr(fallback_address); ++ kmemcheck_error_save_bug(regs); ++ return; ++ } ++ ++ data->busy = true; ++ ++ insn = (const uint8_t *) regs->ip; ++ insn_primary = kmemcheck_opcode_get_primary(insn); ++ ++ kmemcheck_opcode_decode(insn, &size); ++ ++ switch (insn_primary[0]) { ++#ifdef CONFIG_KMEMCHECK_BITOPS_OK ++ /* AND, OR, XOR */ ++ /* ++ * Unfortunately, these instructions have to be excluded from ++ * our regular checking since they access only some (and not ++ * all) bits. This clears out "bogus" bitfield-access warnings. ++ */ ++ case 0x80: ++ case 0x81: ++ case 0x82: ++ case 0x83: ++ switch ((insn_primary[1] >> 3) & 7) { ++ /* OR */ ++ case 1: ++ /* AND */ ++ case 4: ++ /* XOR */ ++ case 6: ++ kmemcheck_write(regs, fallback_address, size); ++ goto out; ++ ++ /* ADD */ ++ case 0: ++ /* ADC */ ++ case 2: ++ /* SBB */ ++ case 3: ++ /* SUB */ ++ case 5: ++ /* CMP */ ++ case 7: ++ break; ++ } ++ break; ++#endif ++ ++ /* MOVS, MOVSB, MOVSW, MOVSD */ ++ case 0xa4: ++ case 0xa5: ++ /* ++ * These instructions are special because they take two ++ * addresses, but we only get one page fault. ++ */ ++ kmemcheck_copy(regs, regs->si, regs->di, size); ++ goto out; ++ ++ /* CMPS, CMPSB, CMPSW, CMPSD */ ++ case 0xa6: ++ case 0xa7: ++ kmemcheck_read(regs, regs->si, size); ++ kmemcheck_read(regs, regs->di, size); ++ goto out; ++ } ++ ++ /* ++ * If the opcode isn't special in any way, we use the data from the ++ * page fault handler to determine the address and type of memory ++ * access. ++ */ ++ switch (fallback_method) { ++ case KMEMCHECK_READ: ++ kmemcheck_read(regs, fallback_address, size); ++ goto out; ++ case KMEMCHECK_WRITE: ++ kmemcheck_write(regs, fallback_address, size); ++ goto out; ++ } ++ ++out: ++ data->busy = false; ++} ++ ++bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, ++ unsigned long error_code) ++{ ++ pte_t *pte; ++ ++ /* ++ * XXX: Is it safe to assume that memory accesses from virtual 86 ++ * mode or non-kernel code segments will _never_ access kernel ++ * memory (e.g. tracked pages)? For now, we need this to avoid ++ * invoking kmemcheck for PnP BIOS calls. ++ */ ++ if (regs->flags & X86_VM_MASK) ++ return false; ++ if (regs->cs != __KERNEL_CS) ++ return false; ++ ++ pte = kmemcheck_pte_lookup(address); ++ if (!pte) ++ return false; ++ ++ if (error_code & 2) ++ kmemcheck_access(regs, address, KMEMCHECK_WRITE); ++ else ++ kmemcheck_access(regs, address, KMEMCHECK_READ); ++ ++ kmemcheck_show(regs); ++ return true; ++} ++ ++bool kmemcheck_trap(struct pt_regs *regs) ++{ ++ if (!kmemcheck_active(regs)) ++ return false; ++ ++ /* We're done. 
*/ ++ kmemcheck_hide(regs); ++ return true; ++} +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/opcode.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/opcode.c +@@ -0,0 +1,106 @@ ++#include ++ ++#include "opcode.h" ++ ++static bool opcode_is_prefix(uint8_t b) ++{ ++ return ++ /* Group 1 */ ++ b == 0xf0 || b == 0xf2 || b == 0xf3 ++ /* Group 2 */ ++ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 ++ || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e ++ /* Group 3 */ ++ || b == 0x66 ++ /* Group 4 */ ++ || b == 0x67; ++} ++ ++#ifdef CONFIG_X86_64 ++static bool opcode_is_rex_prefix(uint8_t b) ++{ ++ return (b & 0xf0) == 0x40; ++} ++#else ++static bool opcode_is_rex_prefix(uint8_t b) ++{ ++ return false; ++} ++#endif ++ ++#define REX_W (1 << 3) ++ ++/* ++ * This is a VERY crude opcode decoder. We only need to find the size of the ++ * load/store that caused our #PF and this should work for all the opcodes ++ * that we care about. Moreover, the ones who invented this instruction set ++ * should be shot. ++ */ ++void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) ++{ ++ /* Default operand size */ ++ int operand_size_override = 4; ++ ++ /* prefixes */ ++ for (; opcode_is_prefix(*op); ++op) { ++ if (*op == 0x66) ++ operand_size_override = 2; ++ } ++ ++ /* REX prefix */ ++ if (opcode_is_rex_prefix(*op)) { ++ uint8_t rex = *op; ++ ++ ++op; ++ if (rex & REX_W) { ++ switch (*op) { ++ case 0x63: ++ *size = 4; ++ return; ++ case 0x0f: ++ ++op; ++ ++ switch (*op) { ++ case 0xb6: ++ case 0xbe: ++ *size = 1; ++ return; ++ case 0xb7: ++ case 0xbf: ++ *size = 2; ++ return; ++ } ++ ++ break; ++ } ++ ++ *size = 8; ++ return; ++ } ++ } ++ ++ /* escape opcode */ ++ if (*op == 0x0f) { ++ ++op; ++ ++ /* ++ * This is move with zero-extend and sign-extend, respectively; ++ * we don't have to think about 0xb6/0xbe, because this is ++ * already handled in the conditional below. ++ */ ++ if (*op == 0xb7 || *op == 0xbf) ++ operand_size_override = 2; ++ } ++ ++ *size = (*op & 1) ? 
operand_size_override : 1; ++} ++ ++const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) ++{ ++ /* skip prefixes */ ++ while (opcode_is_prefix(*op)) ++ ++op; ++ if (opcode_is_rex_prefix(*op)) ++ ++op; ++ return op; ++} +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/opcode.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/opcode.h +@@ -0,0 +1,9 @@ ++#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H ++#define ARCH__X86__MM__KMEMCHECK__OPCODE_H ++ ++#include ++ ++void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); ++const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); ++ ++#endif +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/pte.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/pte.c +@@ -0,0 +1,22 @@ ++#include ++ ++#include ++ ++#include "pte.h" ++ ++pte_t *kmemcheck_pte_lookup(unsigned long address) ++{ ++ pte_t *pte; ++ unsigned int level; ++ ++ pte = lookup_address(address, &level); ++ if (!pte) ++ return NULL; ++ if (level != PG_LEVEL_4K) ++ return NULL; ++ if (!pte_hidden(*pte)) ++ return NULL; ++ ++ return pte; ++} ++ +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/pte.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/pte.h +@@ -0,0 +1,10 @@ ++#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H ++#define ARCH__X86__MM__KMEMCHECK__PTE_H ++ ++#include ++ ++#include ++ ++pte_t *kmemcheck_pte_lookup(unsigned long address); ++ ++#endif +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/selftest.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/selftest.c +@@ -0,0 +1,69 @@ ++#include ++ ++#include "opcode.h" ++#include "selftest.h" ++ ++struct selftest_opcode { ++ unsigned int expected_size; ++ const uint8_t *insn; ++ const char *desc; ++}; ++ ++static const struct selftest_opcode selftest_opcodes[] = { ++ /* REP MOVS */ ++ {1, "\xf3\xa4", "rep movsb , "}, ++ {4, "\xf3\xa5", "rep movsl , "}, ++ ++ /* MOVZX / MOVZXD */ ++ {1, "\x66\x0f\xb6\x51\xf8", "movzwq , "}, ++ {1, "\x0f\xb6\x51\xf8", "movzwq , "}, ++ ++ /* MOVSX / MOVSXD */ ++ {1, "\x66\x0f\xbe\x51\xf8", "movswq , "}, ++ {1, "\x0f\xbe\x51\xf8", "movswq , "}, ++ ++#ifdef CONFIG_X86_64 ++ /* MOVZX / MOVZXD */ ++ {1, "\x49\x0f\xb6\x51\xf8", "movzbq , "}, ++ {2, "\x49\x0f\xb7\x51\xf8", "movzbq , "}, ++ ++ /* MOVSX / MOVSXD */ ++ {1, "\x49\x0f\xbe\x51\xf8", "movsbq , "}, ++ {2, "\x49\x0f\xbf\x51\xf8", "movsbq , "}, ++ {4, "\x49\x63\x51\xf8", "movslq , "}, ++#endif ++}; ++ ++static bool selftest_opcode_one(const struct selftest_opcode *op) ++{ ++ unsigned size; ++ ++ kmemcheck_opcode_decode(op->insn, &size); ++ ++ if (size == op->expected_size) ++ return true; ++ ++ printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", ++ op->desc, op->expected_size, size); ++ return false; ++} ++ ++static bool selftest_opcodes_all(void) ++{ ++ bool pass = true; ++ unsigned int i; ++ ++ for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) ++ pass = pass && selftest_opcode_one(&selftest_opcodes[i]); ++ ++ return pass; ++} ++ ++bool kmemcheck_selftest(void) ++{ ++ bool pass = true; ++ ++ pass = pass && selftest_opcodes_all(); ++ ++ return pass; ++} +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/selftest.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/selftest.h +@@ 
-0,0 +1,6 @@ ++#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H ++#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H ++ ++bool kmemcheck_selftest(void); ++ ++#endif +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/shadow.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/shadow.c +@@ -0,0 +1,162 @@ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "pte.h" ++#include "shadow.h" ++ ++/* ++ * Return the shadow address for the given address. Returns NULL if the ++ * address is not tracked. ++ * ++ * We need to be extremely careful not to follow any invalid pointers, ++ * because this function can be called for *any* possible address. ++ */ ++void *kmemcheck_shadow_lookup(unsigned long address) ++{ ++ pte_t *pte; ++ struct page *page; ++ ++ if (!virt_addr_valid(address)) ++ return NULL; ++ ++ pte = kmemcheck_pte_lookup(address); ++ if (!pte) ++ return NULL; ++ ++ page = virt_to_page(address); ++ if (!page->shadow) ++ return NULL; ++ return page->shadow + (address & (PAGE_SIZE - 1)); ++} ++ ++static void mark_shadow(void *address, unsigned int n, ++ enum kmemcheck_shadow status) ++{ ++ unsigned long addr = (unsigned long) address; ++ unsigned long last_addr = addr + n - 1; ++ unsigned long page = addr & PAGE_MASK; ++ unsigned long last_page = last_addr & PAGE_MASK; ++ unsigned int first_n; ++ void *shadow; ++ ++ /* If the memory range crosses a page boundary, stop there. */ ++ if (page == last_page) ++ first_n = n; ++ else ++ first_n = page + PAGE_SIZE - addr; ++ ++ shadow = kmemcheck_shadow_lookup(addr); ++ if (shadow) ++ memset(shadow, status, first_n); ++ ++ addr += first_n; ++ n -= first_n; ++ ++ /* Do full-page memset()s. */ ++ while (n >= PAGE_SIZE) { ++ shadow = kmemcheck_shadow_lookup(addr); ++ if (shadow) ++ memset(shadow, status, PAGE_SIZE); ++ ++ addr += PAGE_SIZE; ++ n -= PAGE_SIZE; ++ } ++ ++ /* Do the remaining page, if any. */ ++ if (n > 0) { ++ shadow = kmemcheck_shadow_lookup(addr); ++ if (shadow) ++ memset(shadow, status, n); ++ } ++} ++ ++void kmemcheck_mark_unallocated(void *address, unsigned int n) ++{ ++ mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); ++} ++ ++void kmemcheck_mark_uninitialized(void *address, unsigned int n) ++{ ++ mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); ++} ++ ++/* ++ * Fill the shadow memory of the given address such that the memory at that ++ * address is marked as being initialized. 
++ */ ++void kmemcheck_mark_initialized(void *address, unsigned int n) ++{ ++ mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); ++} ++EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); ++ ++void kmemcheck_mark_freed(void *address, unsigned int n) ++{ ++ mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); ++} ++ ++void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < n; ++i) ++ kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); ++} ++ ++void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < n; ++i) ++ kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); ++} ++ ++void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < n; ++i) ++ kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); ++} ++ ++enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) ++{ ++ uint8_t *x; ++ unsigned int i; ++ ++ x = shadow; ++ ++#ifdef CONFIG_KMEMCHECK_PARTIAL_OK ++ /* ++ * Make sure _some_ bytes are initialized. Gcc frequently generates ++ * code to access neighboring bytes. ++ */ ++ for (i = 0; i < size; ++i) { ++ if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) ++ return x[i]; ++ } ++#else ++ /* All bytes must be initialized. */ ++ for (i = 0; i < size; ++i) { ++ if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) ++ return x[i]; ++ } ++#endif ++ ++ return x[0]; ++} ++ ++void kmemcheck_shadow_set(void *shadow, unsigned int size) ++{ ++ uint8_t *x; ++ unsigned int i; ++ ++ x = shadow; ++ for (i = 0; i < size; ++i) ++ x[i] = KMEMCHECK_SHADOW_INITIALIZED; ++} +Index: linux-2.6-tip/arch/x86/mm/kmemcheck/shadow.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/kmemcheck/shadow.h +@@ -0,0 +1,16 @@ ++#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H ++#define ARCH__X86__MM__KMEMCHECK__SHADOW_H ++ ++enum kmemcheck_shadow { ++ KMEMCHECK_SHADOW_UNALLOCATED, ++ KMEMCHECK_SHADOW_UNINITIALIZED, ++ KMEMCHECK_SHADOW_INITIALIZED, ++ KMEMCHECK_SHADOW_FREED, ++}; ++ ++void *kmemcheck_shadow_lookup(unsigned long address); ++ ++enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); ++void kmemcheck_shadow_set(void *shadow, unsigned int size); ++ ++#endif +Index: linux-2.6-tip/arch/x86/mm/kmmio.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/kmmio.c ++++ linux-2.6-tip/arch/x86/mm/kmmio.c +@@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned l + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + + if (!ctx->active) { +- pr_warning("kmmio: spurious debug trap on CPU %d.\n", ++ pr_debug("kmmio: spurious debug trap on CPU %d.\n", + smp_processor_id()); + goto out; + } +Index: linux-2.6-tip/arch/x86/mm/memtest.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/memtest.c ++++ linux-2.6-tip/arch/x86/mm/memtest.c +@@ -9,44 +9,44 @@ + + #include + +-static void __init memtest(unsigned long start_phys, unsigned long size, +- unsigned pattern) ++static u64 patterns[] __initdata = { ++ 0, ++ 0xffffffffffffffffULL, ++ 0x5555555555555555ULL, ++ 0xaaaaaaaaaaaaaaaaULL, ++ 0x1111111111111111ULL, ++ 0x2222222222222222ULL, ++ 0x4444444444444444ULL, ++ 0x8888888888888888ULL, ++ 0x3333333333333333ULL, ++ 0x6666666666666666ULL, ++ 0x9999999999999999ULL, ++ 0xccccccccccccccccULL, ++ 0x7777777777777777ULL, ++ 0xbbbbbbbbbbbbbbbbULL, ++ 
0xddddddddddddddddULL, ++ 0xeeeeeeeeeeeeeeeeULL, ++ 0x7a6c7258554e494cULL, /* yeah ;-) */ ++}; ++ ++static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) + { +- unsigned long i; +- unsigned long *start; +- unsigned long start_bad; +- unsigned long last_bad; +- unsigned long val; +- unsigned long start_phys_aligned; +- unsigned long count; +- unsigned long incr; +- +- switch (pattern) { +- case 0: +- val = 0UL; +- break; +- case 1: +- val = -1UL; +- break; +- case 2: +-#ifdef CONFIG_X86_64 +- val = 0x5555555555555555UL; +-#else +- val = 0x55555555UL; +-#endif +- break; +- case 3: +-#ifdef CONFIG_X86_64 +- val = 0xaaaaaaaaaaaaaaaaUL; +-#else +- val = 0xaaaaaaaaUL; +-#endif +- break; +- default: +- return; +- } ++ printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", ++ (unsigned long long) pattern, ++ (unsigned long long) start_bad, ++ (unsigned long long) end_bad); ++ reserve_early(start_bad, end_bad, "BAD RAM"); ++} + +- incr = sizeof(unsigned long); ++static void __init memtest(u64 pattern, u64 start_phys, u64 size) ++{ ++ u64 i, count; ++ u64 *start; ++ u64 start_bad, last_bad; ++ u64 start_phys_aligned; ++ size_t incr; ++ ++ incr = sizeof(pattern); + start_phys_aligned = ALIGN(start_phys, incr); + count = (size - (start_phys_aligned - start_phys))/incr; + start = __va(start_phys_aligned); +@@ -54,25 +54,42 @@ static void __init memtest(unsigned long + last_bad = 0; + + for (i = 0; i < count; i++) +- start[i] = val; ++ start[i] = pattern; + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { +- if (*start != val) { +- if (start_phys_aligned == last_bad + incr) { +- last_bad += incr; +- } else { +- if (start_bad) { +- printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", +- val, start_bad, last_bad + incr); +- reserve_early(start_bad, last_bad + incr, "BAD RAM"); +- } +- start_bad = last_bad = start_phys_aligned; +- } ++ if (*start == pattern) ++ continue; ++ if (start_phys_aligned == last_bad + incr) { ++ last_bad += incr; ++ continue; + } ++ if (start_bad) ++ reserve_bad_mem(pattern, start_bad, last_bad + incr); ++ start_bad = last_bad = start_phys_aligned; + } +- if (start_bad) { +- printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", +- val, start_bad, last_bad + incr); +- reserve_early(start_bad, last_bad + incr, "BAD RAM"); ++ if (start_bad) ++ reserve_bad_mem(pattern, start_bad, last_bad + incr); ++} ++ ++static void __init do_one_pass(u64 pattern, u64 start, u64 end) ++{ ++ u64 size = 0; ++ ++ while (start < end) { ++ start = find_e820_area_size(start, &size, 1); ++ ++ /* done ? 
*/ ++ if (start >= end) ++ break; ++ if (start + size > end) ++ size = end - start; ++ ++ printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", ++ (unsigned long long) start, ++ (unsigned long long) start + size, ++ (unsigned long long) cpu_to_be64(pattern)); ++ memtest(pattern, start, size); ++ ++ start += size; + } + } + +@@ -83,6 +100,9 @@ static int __init parse_memtest(char *ar + { + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); ++ else ++ memtest_pattern = ARRAY_SIZE(patterns); ++ + return 0; + } + +@@ -90,33 +110,22 @@ early_param("memtest", parse_memtest); + + void __init early_memtest(unsigned long start, unsigned long end) + { +- u64 t_start, t_size; +- unsigned pattern; ++ unsigned int i; ++ unsigned int idx = 0; + + if (!memtest_pattern) + return; + +- printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); +- for (pattern = 0; pattern < memtest_pattern; pattern++) { +- t_start = start; +- t_size = 0; +- while (t_start < end) { +- t_start = find_e820_area_size(t_start, &t_size, 1); +- +- /* done ? */ +- if (t_start >= end) +- break; +- if (t_start + t_size > end) +- t_size = end - t_start; +- +- printk(KERN_CONT "\n %010llx - %010llx pattern %d", +- (unsigned long long)t_start, +- (unsigned long long)t_start + t_size, pattern); +- +- memtest(t_start, t_size, pattern); ++ printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); ++ for (i = 0; i < memtest_pattern; i++) { ++ idx = i % ARRAY_SIZE(patterns); ++ do_one_pass(patterns[idx], start, end); ++ } + +- t_start += t_size; +- } ++ if (idx > 0) { ++ printk(KERN_INFO "early_memtest: wipe out " ++ "test pattern from memory\n"); ++ /* additional test with pattern 0 will do this */ ++ do_one_pass(0, start, end); + } +- printk(KERN_CONT "\n"); + } +Index: linux-2.6-tip/arch/x86/mm/mmap.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/mmap.c ++++ linux-2.6-tip/arch/x86/mm/mmap.c +@@ -4,7 +4,7 @@ + * Based on code by Ingo Molnar and Andi Kleen, copyrighted + * as follows: + * +- * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. ++ * Copyright 2003-2009 Red Hat Inc. + * All Rights Reserved. + * Copyright 2005 Andi Kleen, SUSE Labs. + * Copyright 2007 Jiri Kosina, SUSE Labs. 
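A minimal userspace sketch of the memtest pass rewritten above, assuming nothing beyond the diff itself: it fills a buffer with one 64-bit pattern, reads it back, and coalesces consecutive mismatching words into ranges, the same way memtest() merges adjacent bad addresses before handing them to reserve_bad_mem(). The buffer, the injected fault, and the word-index reporting are illustrative only and are not part of the kernel patch.

/*
 * Sketch only: approximates the fill/verify/coalesce loop of memtest().
 * The corrupted word at index 5 stands in for bad RAM.
 */
#include <stdio.h>
#include <stdint.h>

static void check_pattern(uint64_t *buf, size_t count, uint64_t pattern)
{
	size_t i, start_bad = 0, last_bad = 0;
	int in_bad = 0;

	for (i = 0; i < count; i++)
		buf[i] = pattern;

	buf[5] ^= 1;	/* pretend one word was corrupted */

	for (i = 0; i < count; i++) {
		if (buf[i] == pattern)
			continue;
		if (in_bad && i == last_bad + 1) {
			last_bad = i;	/* extend the current bad range */
			continue;
		}
		if (in_bad)
			printf("bad words %zu-%zu (pattern %016llx)\n",
			       start_bad, last_bad,
			       (unsigned long long)pattern);
		start_bad = last_bad = i;
		in_bad = 1;
	}
	if (in_bad)
		printf("bad words %zu-%zu (pattern %016llx)\n",
		       start_bad, last_bad, (unsigned long long)pattern);
}

int main(void)
{
	uint64_t buf[64];

	check_pattern(buf, 64, 0x5555555555555555ULL);
	return 0;
}

The kernel version does the same coalescing but over physical addresses stepped by sizeof(pattern), and reserves each merged range via reserve_early() instead of printing it.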
+Index: linux-2.6-tip/arch/x86/mm/mmio-mod.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/mmio-mod.c ++++ linux-2.6-tip/arch/x86/mm/mmio-mod.c +@@ -378,27 +378,34 @@ static void clear_trace_list(void) + } + + #ifdef CONFIG_HOTPLUG_CPU +-static cpumask_t downed_cpus; ++static cpumask_var_t downed_cpus; + + static void enter_uniprocessor(void) + { + int cpu; + int err; + ++ if (downed_cpus == NULL && ++ !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { ++ pr_notice(NAME "Failed to allocate mask\n"); ++ goto out; ++ } ++ + get_online_cpus(); +- downed_cpus = cpu_online_map; +- cpu_clear(first_cpu(cpu_online_map), downed_cpus); ++ cpumask_copy(downed_cpus, cpu_online_mask); ++ cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); + if (num_online_cpus() > 1) + pr_notice(NAME "Disabling non-boot CPUs...\n"); + put_online_cpus(); + +- for_each_cpu_mask(cpu, downed_cpus) { ++ for_each_cpu(cpu, downed_cpus) { + err = cpu_down(cpu); + if (!err) + pr_info(NAME "CPU%d is down.\n", cpu); + else + pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); + } ++out: + if (num_online_cpus() > 1) + pr_warning(NAME "multiple CPUs still online, " + "may miss events.\n"); +@@ -411,10 +418,10 @@ static void __ref leave_uniprocessor(voi + int cpu; + int err; + +- if (cpus_weight(downed_cpus) == 0) ++ if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) + return; + pr_notice(NAME "Re-enabling CPUs...\n"); +- for_each_cpu_mask(cpu, downed_cpus) { ++ for_each_cpu(cpu, downed_cpus) { + err = cpu_up(cpu); + if (!err) + pr_info(NAME "enabled CPU%d.\n", cpu); +Index: linux-2.6-tip/arch/x86/mm/numa.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/numa.c +@@ -0,0 +1,67 @@ ++/* Common code for 32 and 64-bit NUMA */ ++#include ++#include ++#include ++ ++#ifdef CONFIG_DEBUG_PER_CPU_MAPS ++# define DBG(x...) printk(KERN_DEBUG x) ++#else ++# define DBG(x...) ++#endif ++ ++/* ++ * Which logical CPUs are on which nodes ++ */ ++cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; ++EXPORT_SYMBOL(node_to_cpumask_map); ++ ++/* ++ * Allocate node_to_cpumask_map based on number of available nodes ++ * Requires node_possible_map to be valid. ++ * ++ * Note: node_to_cpumask() is not valid until after this is done. ++ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) ++ */ ++void __init setup_node_to_cpumask_map(void) ++{ ++ unsigned int node, num = 0; ++ ++ /* setup nr_node_ids if not done yet */ ++ if (nr_node_ids == MAX_NUMNODES) { ++ for_each_node_mask(node, node_possible_map) ++ num = node; ++ nr_node_ids = num + 1; ++ } ++ ++ /* allocate the map */ ++ for (node = 0; node < nr_node_ids; node++) ++ alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); ++ ++ /* cpumask_of_node() will now work */ ++ pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); ++} ++ ++#ifdef CONFIG_DEBUG_PER_CPU_MAPS ++/* ++ * Returns a pointer to the bitmask of CPUs on Node 'node'. 
++ */ ++const struct cpumask *cpumask_of_node(int node) ++{ ++ if (node >= nr_node_ids) { ++ printk(KERN_WARNING ++ "cpumask_of_node(%d): node > nr_node_ids(%d)\n", ++ node, nr_node_ids); ++ dump_stack(); ++ return cpu_none_mask; ++ } ++ if (node_to_cpumask_map[node] == NULL) { ++ printk(KERN_WARNING ++ "cpumask_of_node(%d): no node_to_cpumask_map!\n", ++ node); ++ dump_stack(); ++ return cpu_online_mask; ++ } ++ return node_to_cpumask_map[node]; ++} ++EXPORT_SYMBOL(cpumask_of_node); ++#endif +Index: linux-2.6-tip/arch/x86/mm/numa_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/numa_32.c ++++ linux-2.6-tip/arch/x86/mm/numa_32.c +@@ -194,7 +194,7 @@ void *alloc_remap(int nid, unsigned long + size = ALIGN(size, L1_CACHE_BYTES); + + if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) +- return 0; ++ return NULL; + + node_remap_alloc_vaddr[nid] += size; + memset(allocation, 0, size); +@@ -416,39 +416,14 @@ void __init initmem_init(unsigned long s + for_each_online_node(nid) + propagate_e820_map_node(nid); + +- for_each_online_node(nid) ++ for_each_online_node(nid) { + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); ++ NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; ++ } + +- NODE_DATA(0)->bdata = &bootmem_node_data[0]; + setup_bootmem_allocator(); + } + +-void __init set_highmem_pages_init(void) +-{ +-#ifdef CONFIG_HIGHMEM +- struct zone *zone; +- int nid; +- +- for_each_zone(zone) { +- unsigned long zone_start_pfn, zone_end_pfn; +- +- if (!is_highmem(zone)) +- continue; +- +- zone_start_pfn = zone->zone_start_pfn; +- zone_end_pfn = zone_start_pfn + zone->spanned_pages; +- +- nid = zone_to_nid(zone); +- printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", +- zone->name, nid, zone_start_pfn, zone_end_pfn); +- +- add_highpages_with_active_regions(nid, zone_start_pfn, +- zone_end_pfn); +- } +- totalram_pages += totalhigh_pages; +-#endif +-} +- + #ifdef CONFIG_MEMORY_HOTPLUG + static int paddr_to_nid(u64 addr) + { +Index: linux-2.6-tip/arch/x86/mm/numa_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/numa_64.c ++++ linux-2.6-tip/arch/x86/mm/numa_64.c +@@ -33,6 +33,15 @@ int numa_off __initdata; + static unsigned long __initdata nodemap_addr; + static unsigned long __initdata nodemap_size; + ++DEFINE_PER_CPU(int, node_number) = 0; ++EXPORT_PER_CPU_SYMBOL(node_number); ++ ++/* ++ * Map cpu index to node index ++ */ ++DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); ++EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); ++ + /* + * Given a shift value, try to populate memnodemap[] + * Returns : +@@ -640,3 +649,116 @@ void __init init_cpu_to_node(void) + #endif + + ++void __cpuinit numa_set_node(int cpu, int node) ++{ ++ int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); ++ ++ /* early setting, no percpu area yet */ ++ if (cpu_to_node_map) { ++ cpu_to_node_map[cpu] = node; ++ return; ++ } ++ ++#ifdef CONFIG_DEBUG_PER_CPU_MAPS ++ if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { ++ printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); ++ dump_stack(); ++ return; ++ } ++#endif ++ per_cpu(x86_cpu_to_node_map, cpu) = node; ++ ++ if (node != NUMA_NO_NODE) ++ per_cpu(node_number, cpu) = node; ++} ++ ++void __cpuinit numa_clear_node(int cpu) ++{ ++ numa_set_node(cpu, NUMA_NO_NODE); ++} ++ ++#ifndef CONFIG_DEBUG_PER_CPU_MAPS ++ ++void __cpuinit numa_add_cpu(int cpu) ++{ ++ cpumask_set_cpu(cpu, 
node_to_cpumask_map[early_cpu_to_node(cpu)]); ++} ++ ++void __cpuinit numa_remove_cpu(int cpu) ++{ ++ cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); ++} ++ ++#else /* CONFIG_DEBUG_PER_CPU_MAPS */ ++ ++/* ++ * --------- debug versions of the numa functions --------- ++ */ ++static void __cpuinit numa_set_cpumask(int cpu, int enable) ++{ ++ int node = early_cpu_to_node(cpu); ++ struct cpumask *mask; ++ char buf[64]; ++ ++ mask = node_to_cpumask_map[node]; ++ if (mask == NULL) { ++ printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); ++ dump_stack(); ++ return; ++ } ++ ++ if (enable) ++ cpumask_set_cpu(cpu, mask); ++ else ++ cpumask_clear_cpu(cpu, mask); ++ ++ cpulist_scnprintf(buf, sizeof(buf), mask); ++ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", ++ enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); ++} ++ ++void __cpuinit numa_add_cpu(int cpu) ++{ ++ numa_set_cpumask(cpu, 1); ++} ++ ++void __cpuinit numa_remove_cpu(int cpu) ++{ ++ numa_set_cpumask(cpu, 0); ++} ++ ++int cpu_to_node(int cpu) ++{ ++ if (early_per_cpu_ptr(x86_cpu_to_node_map)) { ++ printk(KERN_WARNING ++ "cpu_to_node(%d): usage too early!\n", cpu); ++ dump_stack(); ++ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; ++ } ++ return per_cpu(x86_cpu_to_node_map, cpu); ++} ++EXPORT_SYMBOL(cpu_to_node); ++ ++/* ++ * Same function as cpu_to_node() but used if called before the ++ * per_cpu areas are setup. ++ */ ++int early_cpu_to_node(int cpu) ++{ ++ if (early_per_cpu_ptr(x86_cpu_to_node_map)) ++ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; ++ ++ if (!cpu_possible(cpu)) { ++ printk(KERN_WARNING ++ "early_cpu_to_node(%d): no per_cpu area!\n", cpu); ++ dump_stack(); ++ return NUMA_NO_NODE; ++ } ++ return per_cpu(x86_cpu_to_node_map, cpu); ++} ++ ++/* ++ * --------- end of debug versions of the numa functions --------- ++ */ ++ ++#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ +Index: linux-2.6-tip/arch/x86/mm/pageattr.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/pageattr.c ++++ linux-2.6-tip/arch/x86/mm/pageattr.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -33,6 +34,7 @@ struct cpa_data { + unsigned long pfn; + unsigned force_split : 1; + int curpage; ++ struct page **pages; + }; + + /* +@@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock); + + #define CPA_FLUSHTLB 1 + #define CPA_ARRAY 2 ++#define CPA_PAGES_ARRAY 4 + + #ifdef CONFIG_PROC_FS + static unsigned long direct_pages_count[PG_LEVEL_NUM]; +@@ -95,7 +98,7 @@ static inline unsigned long highmap_star + + static inline unsigned long highmap_end_pfn(void) + { +- return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; ++ return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; + } + + #endif +@@ -201,10 +204,10 @@ static void cpa_flush_range(unsigned lon + } + } + +-static void cpa_flush_array(unsigned long *start, int numpages, int cache) ++static void cpa_flush_array(unsigned long *start, int numpages, int cache, ++ int in_flags, struct page **pages) + { + unsigned int i, level; +- unsigned long *addr; + + BUG_ON(irqs_disabled()); + +@@ -225,14 +228,22 @@ static void cpa_flush_array(unsigned lon + * will cause all other CPUs to flush the same + * cachelines: + */ +- for (i = 0, addr = start; i < numpages; i++, addr++) { +- pte_t *pte = lookup_address(*addr, &level); ++ for (i = 0; i < numpages; i++) { ++ unsigned long addr; ++ pte_t *pte; ++ ++ if (in_flags & CPA_PAGES_ARRAY) ++ addr = (unsigned 
long)page_address(pages[i]); ++ else ++ addr = start[i]; ++ ++ pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) +- clflush_cache_range((void *) *addr, PAGE_SIZE); ++ clflush_cache_range((void *)addr, PAGE_SIZE); + } + } + +@@ -464,7 +475,7 @@ static int split_large_page(pte_t *kpte, + + if (!debug_pagealloc) + spin_unlock(&cpa_lock); +- base = alloc_pages(GFP_KERNEL, 0); ++ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) +@@ -482,6 +493,13 @@ static int split_large_page(pte_t *kpte, + pbase = (pte_t *)page_address(base); + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); + ref_prot = pte_pgprot(pte_clrhuge(*kpte)); ++ /* ++ * If we ever want to utilize the PAT bit, we need to ++ * update this function to make sure it's converted from ++ * bit 12 to bit 7 when we cross from the 2MB level to ++ * the 4K level: ++ */ ++ WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); + + #ifdef CONFIG_X86_64 + if (level == PG_LEVEL_1G) { +@@ -577,7 +595,9 @@ static int __change_page_attr(struct cpa + unsigned int level; + pte_t *kpte, old_pte; + +- if (cpa->flags & CPA_ARRAY) ++ if (cpa->flags & CPA_PAGES_ARRAY) ++ address = (unsigned long)page_address(cpa->pages[cpa->curpage]); ++ else if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; +@@ -680,7 +700,9 @@ static int cpa_process_alias(struct cpa_ + * No need to redo, when the primary call touched the direct + * mapping already: + */ +- if (cpa->flags & CPA_ARRAY) ++ if (cpa->flags & CPA_PAGES_ARRAY) ++ vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); ++ else if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; +@@ -691,7 +713,7 @@ static int cpa_process_alias(struct cpa_ + alias_cpa = *cpa; + temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + alias_cpa.vaddr = &temp_cpa_vaddr; +- alias_cpa.flags &= ~CPA_ARRAY; ++ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + + + ret = __change_page_attr_set_clr(&alias_cpa, 0); +@@ -704,7 +726,7 @@ static int cpa_process_alias(struct cpa_ + * No need to redo, when the primary call touched the high + * mapping already: + */ +- if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) ++ if (within(vaddr, (unsigned long) _text, _brk_end)) + return 0; + + /* +@@ -717,7 +739,7 @@ static int cpa_process_alias(struct cpa_ + alias_cpa = *cpa; + temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + alias_cpa.vaddr = &temp_cpa_vaddr; +- alias_cpa.flags &= ~CPA_ARRAY; ++ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + + /* + * The high mapping range is imprecise, so ignore the return value. 
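A small sketch of the address-selection logic the CPA_PAGES_ARRAY changes above introduce, under the assumption that only the dispatch matters: depending on the flags, the current address comes from a page array, an address array, or a single contiguous range. The struct and macro names below (cpa_walk, FLAG_ARRAY, FLAG_PAGES_ARRAY) are invented for illustration and do not appear in the patch.

/*
 * Sketch only: mirrors how __change_page_attr() and cpa_process_alias()
 * pick the address for cpa->curpage after this patch.
 */
#include <stdio.h>

#define FLAG_ARRAY		0x2
#define FLAG_PAGES_ARRAY	0x4

struct cpa_walk {
	unsigned long *vaddr;	/* single address or array of addresses */
	unsigned long *pages;	/* stand-in for page_address(pages[i]) */
	int curpage;
	int flags;
};

static unsigned long current_address(struct cpa_walk *w)
{
	if (w->flags & FLAG_PAGES_ARRAY)
		return w->pages[w->curpage];
	if (w->flags & FLAG_ARRAY)
		return w->vaddr[w->curpage];
	return *w->vaddr;	/* contiguous range: *vaddr advances by numpages */
}

int main(void)
{
	unsigned long addrs[2] = { 0x1000, 0x3000 };
	struct cpa_walk w = { .vaddr = addrs, .curpage = 1, .flags = FLAG_ARRAY };

	printf("current address: %#lx\n", current_address(&w));
	return 0;
}

In the real code the page-array case also forces 4K granularity (cpa->numpages = 1), since large pages cannot be assumed for scattered struct page entries.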
+@@ -738,7 +760,7 @@ static int __change_page_attr_set_clr(st + */ + cpa->numpages = numpages; + /* for array changes, we can't use large page */ +- if (cpa->flags & CPA_ARRAY) ++ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa->numpages = 1; + + if (!debug_pagealloc) +@@ -762,7 +784,7 @@ static int __change_page_attr_set_clr(st + */ + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; +- if (cpa->flags & CPA_ARRAY) ++ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; +@@ -779,7 +801,8 @@ static inline int cache_attr(pgprot_t at + + static int change_page_attr_set_clr(unsigned long *addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr, +- int force_split, int array) ++ int force_split, int in_flag, ++ struct page **pages) + { + struct cpa_data cpa; + int ret, cache, checkalias; +@@ -794,15 +817,7 @@ static int change_page_attr_set_clr(unsi + return 0; + + /* Ensure we are PAGE_SIZE aligned */ +- if (!array) { +- if (*addr & ~PAGE_MASK) { +- *addr &= PAGE_MASK; +- /* +- * People should not be passing in unaligned addresses: +- */ +- WARN_ON_ONCE(1); +- } +- } else { ++ if (in_flag & CPA_ARRAY) { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { +@@ -810,10 +825,24 @@ static int change_page_attr_set_clr(unsi + WARN_ON_ONCE(1); + } + } ++ } else if (!(in_flag & CPA_PAGES_ARRAY)) { ++ /* ++ * in_flag of CPA_PAGES_ARRAY implies it is aligned. ++ * No need to cehck in that case ++ */ ++ if (*addr & ~PAGE_MASK) { ++ *addr &= PAGE_MASK; ++ /* ++ * People should not be passing in unaligned addresses: ++ */ ++ WARN_ON_ONCE(1); ++ } + } + ++#if 0 + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); ++#endif + + vm_unmap_aliases(); + +@@ -825,6 +854,7 @@ static int change_page_attr_set_clr(unsi + arch_flush_lazy_mmu_mode(); + + cpa.vaddr = addr; ++ cpa.pages = pages; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; +@@ -832,8 +862,8 @@ static int change_page_attr_set_clr(unsi + cpa.curpage = 0; + cpa.force_split = force_split; + +- if (array) +- cpa.flags |= CPA_ARRAY; ++ if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) ++ cpa.flags |= in_flag; + + /* No alias checking for _NX bit modifications */ + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; +@@ -859,9 +889,10 @@ static int change_page_attr_set_clr(unsi + * wbindv): + */ + if (!ret && cpu_has_clflush) { +- if (cpa.flags & CPA_ARRAY) +- cpa_flush_array(addr, numpages, cache); +- else ++ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { ++ cpa_flush_array(addr, numpages, cache, ++ cpa.flags, pages); ++ } else + cpa_flush_range(*addr, numpages, cache); + } else + cpa_flush_all(cache); +@@ -881,14 +912,28 @@ static inline int change_page_attr_set(u + pgprot_t mask, int array) + { + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, +- array); ++ (array ? CPA_ARRAY : 0), NULL); + } + + static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) + { + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, +- array); ++ (array ? 
CPA_ARRAY : 0), NULL); ++} ++ ++static inline int cpa_set_pages_array(struct page **pages, int numpages, ++ pgprot_t mask) ++{ ++ return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, ++ CPA_PAGES_ARRAY, pages); ++} ++ ++static inline int cpa_clear_pages_array(struct page **pages, int numpages, ++ pgprot_t mask) ++{ ++ return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, ++ CPA_PAGES_ARRAY, pages); + } + + int _set_memory_uc(unsigned long addr, int numpages) +@@ -1036,7 +1081,7 @@ int set_memory_np(unsigned long addr, in + int set_memory_4k(unsigned long addr, int numpages) + { + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), +- __pgprot(0), 1, 0); ++ __pgprot(0), 1, 0, NULL); + } + + int set_pages_uc(struct page *page, int numpages) +@@ -1047,6 +1092,35 @@ int set_pages_uc(struct page *page, int + } + EXPORT_SYMBOL(set_pages_uc); + ++int set_pages_array_uc(struct page **pages, int addrinarray) ++{ ++ unsigned long start; ++ unsigned long end; ++ int i; ++ int free_idx; ++ ++ for (i = 0; i < addrinarray; i++) { ++ start = (unsigned long)page_address(pages[i]); ++ end = start + PAGE_SIZE; ++ if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) ++ goto err_out; ++ } ++ ++ if (cpa_set_pages_array(pages, addrinarray, ++ __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) { ++ return 0; /* Success */ ++ } ++err_out: ++ free_idx = i; ++ for (i = 0; i < free_idx; i++) { ++ start = (unsigned long)page_address(pages[i]); ++ end = start + PAGE_SIZE; ++ free_memtype(start, end); ++ } ++ return -EINVAL; ++} ++EXPORT_SYMBOL(set_pages_array_uc); ++ + int set_pages_wb(struct page *page, int numpages) + { + unsigned long addr = (unsigned long)page_address(page); +@@ -1055,6 +1129,26 @@ int set_pages_wb(struct page *page, int + } + EXPORT_SYMBOL(set_pages_wb); + ++int set_pages_array_wb(struct page **pages, int addrinarray) ++{ ++ int retval; ++ unsigned long start; ++ unsigned long end; ++ int i; ++ ++ retval = cpa_clear_pages_array(pages, addrinarray, ++ __pgprot(_PAGE_CACHE_MASK)); ++ ++ for (i = 0; i < addrinarray; i++) { ++ start = (unsigned long)page_address(pages[i]); ++ end = start + PAGE_SIZE; ++ free_memtype(start, end); ++ } ++ ++ return retval; ++} ++EXPORT_SYMBOL(set_pages_array_wb); ++ + int set_pages_x(struct page *page, int numpages) + { + unsigned long addr = (unsigned long)page_address(page); +Index: linux-2.6-tip/arch/x86/mm/pat.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/pat.c ++++ linux-2.6-tip/arch/x86/mm/pat.c +@@ -31,7 +31,7 @@ + #ifdef CONFIG_X86_PAT + int __read_mostly pat_enabled = 1; + +-void __cpuinit pat_disable(char *reason) ++void __cpuinit pat_disable(const char *reason) + { + pat_enabled = 0; + printk(KERN_INFO "%s\n", reason); +@@ -43,6 +43,11 @@ static int __init nopat(char *str) + return 0; + } + early_param("nopat", nopat); ++#else ++static inline void pat_disable(const char *reason) ++{ ++ (void)reason; ++} + #endif + + +@@ -79,16 +84,20 @@ void pat_init(void) + if (!pat_enabled) + return; + +- /* Paranoia check. */ +- if (!cpu_has_pat && boot_pat_state) { +- /* +- * If this happens we are on a secondary CPU, but +- * switched to PAT on the boot CPU. We have no way to +- * undo PAT. 
+- */ +- printk(KERN_ERR "PAT enabled, " +- "but not supported by secondary CPU\n"); +- BUG(); ++ if (!cpu_has_pat) { ++ if (!boot_pat_state) { ++ pat_disable("PAT not supported by CPU."); ++ return; ++ } else { ++ /* ++ * If this happens we are on a secondary CPU, but ++ * switched to PAT on the boot CPU. We have no way to ++ * undo PAT. ++ */ ++ printk(KERN_ERR "PAT enabled, " ++ "but not supported by secondary CPU\n"); ++ BUG(); ++ } + } + + /* Set PWT to Write-Combining. All other bits stay the same */ +@@ -626,6 +635,33 @@ void unmap_devmem(unsigned long pfn, uns + } + + /* ++ * Change the memory type for the physial address range in kernel identity ++ * mapping space if that range is a part of identity map. ++ */ ++int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) ++{ ++ unsigned long id_sz; ++ ++ if (!pat_enabled || base >= __pa(high_memory)) ++ return 0; ++ ++ id_sz = (__pa(high_memory) < base + size) ? ++ __pa(high_memory) - base : ++ size; ++ ++ if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { ++ printk(KERN_INFO ++ "%s:%d ioremap_change_attr failed %s " ++ "for %Lx-%Lx\n", ++ current->comm, current->pid, ++ cattr_name(flags), ++ base, (unsigned long long)(base + size)); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. +@@ -634,7 +670,7 @@ static int reserve_pfn_range(u64 paddr, + int strict_prot) + { + int is_ram = 0; +- int id_sz, ret; ++ int ret; + unsigned long flags; + unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + +@@ -672,23 +708,8 @@ static int reserve_pfn_range(u64 paddr, + flags); + } + +- /* Need to keep identity mapping in sync */ +- if (paddr >= __pa(high_memory)) +- return 0; +- +- id_sz = (__pa(high_memory) < paddr + size) ? +- __pa(high_memory) - paddr : +- size; +- +- if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { ++ if (kernel_map_sync_memtype(paddr, size, flags) < 0) { + free_memtype(paddr, paddr + size); +- printk(KERN_ERR +- "%s:%d reserve_pfn_range ioremap_change_attr failed %s " +- "for %Lx-%Lx\n", +- current->comm, current->pid, +- cattr_name(flags), +- (unsigned long long)paddr, +- (unsigned long long)(paddr + size)); + return -EINVAL; + } + return 0; +Index: linux-2.6-tip/arch/x86/mm/pgtable.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/pgtable.c ++++ linux-2.6-tip/arch/x86/mm/pgtable.c +@@ -4,9 +4,11 @@ + #include + #include + ++#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO ++ + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) + { +- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); ++ return (pte_t *)__get_free_page(PGALLOC_GFP); + } + + pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +@@ -14,9 +16,9 @@ pgtable_t pte_alloc_one(struct mm_struct + struct page *pte; + + #ifdef CONFIG_HIGHPTE +- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); ++ pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); + #else +- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); ++ pte = alloc_pages(PGALLOC_GFP, 0); + #endif + if (pte) + pgtable_page_ctor(pte); +@@ -130,6 +132,7 @@ void pud_populate(struct mm_struct *mm, + reserved at the pmd (PDPT) level. 
*/ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + ++ preempt_disable(); + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, +@@ -138,6 +141,7 @@ void pud_populate(struct mm_struct *mm, + */ + if (mm == current->active_mm) + write_cr3(read_cr3()); ++ preempt_enable(); + } + #else /* !CONFIG_X86_PAE */ + +@@ -161,7 +165,7 @@ static int preallocate_pmds(pmd_t *pmds[ + bool failed = false; + + for(i = 0; i < PREALLOCATED_PMDS; i++) { +- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); + if (pmd == NULL) + failed = true; + pmds[i] = pmd; +@@ -228,7 +232,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + pmd_t *pmds[PREALLOCATED_PMDS]; + unsigned long flags; + +- pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); ++ pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); + + if (pgd == NULL) + goto out; +@@ -313,6 +317,24 @@ int ptep_clear_flush_young(struct vm_are + return young; + } + ++/** ++ * reserve_top_address - reserves a hole in the top of kernel address space ++ * @reserve - size of hole to reserve ++ * ++ * Can be used to relocate the fixmap area and poke a hole in the top ++ * of kernel address space to make room for a hypervisor. ++ */ ++void __init reserve_top_address(unsigned long reserve) ++{ ++#ifdef CONFIG_X86_32 ++ BUG_ON(fixmaps_set > 0); ++ printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", ++ (int)-reserve); ++ __FIXADDR_TOP = -reserve - PAGE_SIZE; ++ __VMALLOC_RESERVE += reserve; ++#endif ++} ++ + int fixmaps_set; + + void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) +@@ -327,7 +349,8 @@ void __native_set_fixmap(enum fixed_addr + fixmaps_set++; + } + +-void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) ++void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, ++ pgprot_t flags) + { + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); + } +Index: linux-2.6-tip/arch/x86/mm/pgtable_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/pgtable_32.c ++++ linux-2.6-tip/arch/x86/mm/pgtable_32.c +@@ -20,6 +20,8 @@ + #include + #include + ++unsigned int __VMALLOC_RESERVE = 128 << 20; ++ + /* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. +@@ -48,7 +50,7 @@ void set_pte_vaddr(unsigned long vaddr, + } + pte = pte_offset_kernel(pmd, vaddr); + if (pte_val(pteval)) +- set_pte_present(&init_mm, vaddr, pte, pteval); ++ set_pte_at(&init_mm, vaddr, pte, pteval); + else + pte_clear(&init_mm, vaddr, pte); + +@@ -97,22 +99,6 @@ void set_pmd_pfn(unsigned long vaddr, un + unsigned long __FIXADDR_TOP = 0xfffff000; + EXPORT_SYMBOL(__FIXADDR_TOP); + +-/** +- * reserve_top_address - reserves a hole in the top of kernel address space +- * @reserve - size of hole to reserve +- * +- * Can be used to relocate the fixmap area and poke a hole in the top +- * of kernel address space to make room for a hypervisor. +- */ +-void __init reserve_top_address(unsigned long reserve) +-{ +- BUG_ON(fixmaps_set > 0); +- printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", +- (int)-reserve); +- __FIXADDR_TOP = -reserve - PAGE_SIZE; +- __VMALLOC_RESERVE += reserve; +-} +- + /* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. 
This can be used to increase (or decrease) the +Index: linux-2.6-tip/arch/x86/mm/srat_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/mm/srat_64.c ++++ linux-2.6-tip/arch/x86/mm/srat_64.c +@@ -20,7 +20,8 @@ + #include + #include + #include +-#include ++#include ++#include + + int acpi_numa __initdata; + +Index: linux-2.6-tip/arch/x86/mm/tlb.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/mm/tlb.c +@@ -0,0 +1,290 @@ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) ++ = { &init_mm, 0, }; ++ ++/* ++ * Smarter SMP flushing macros. ++ * c/o Linus Torvalds. ++ * ++ * These mean you can really definitely utterly forget about ++ * writing to user space from interrupts. (Its not allowed anyway). ++ * ++ * Optimizations Manfred Spraul ++ * ++ * More scalable flush, from Andi Kleen ++ * ++ * To avoid global state use 8 different call vectors. ++ * Each CPU uses a specific vector to trigger flushes on other ++ * CPUs. Depending on the received vector the target CPUs look into ++ * the right array slot for the flush data. ++ * ++ * With more than 8 CPUs they are hashed to the 8 available ++ * vectors. The limited global vector space forces us to this right now. ++ * In future when interrupts are split into per CPU domains this could be ++ * fixed, at the cost of triggering multiple IPIs in some cases. ++ */ ++ ++union smp_flush_state { ++ struct { ++ struct mm_struct *flush_mm; ++ unsigned long flush_va; ++ DECLARE_BITMAP(flush_cpumask, NR_CPUS); ++ raw_spinlock_t tlbstate_lock; ++ }; ++ char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; ++} ____cacheline_internodealigned_in_smp; ++ ++/* State is put into the per CPU data section, but padded ++ to a full cache line because other CPUs can access it and we don't ++ want false sharing in the per cpu data segment. */ ++static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; ++ ++/* ++ * We cannot call mmdrop() because we are in interrupt context, ++ * instead update mm->cpu_vm_mask. ++ */ ++void leave_mm(int cpu) ++{ ++ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) ++ BUG(); ++ cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); ++ load_cr3(swapper_pg_dir); ++} ++EXPORT_SYMBOL_GPL(leave_mm); ++ ++/* ++ * ++ * The flush IPI assumes that a thread switch happens in this order: ++ * [cpu0: the cpu that switches] ++ * 1) switch_mm() either 1a) or 1b) ++ * 1a) thread switch to a different mm ++ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); ++ * Stop ipi delivery for the old mm. This is not synchronized with ++ * the other cpus, but smp_invalidate_interrupt ignore flush ipis ++ * for the wrong mm, and in the worst case we perform a superfluous ++ * tlb flush. ++ * 1a2) set cpu mmu_state to TLBSTATE_OK ++ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 ++ * was in lazy tlb mode. ++ * 1a3) update cpu active_mm ++ * Now cpu0 accepts tlb flushes for the new mm. ++ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); ++ * Now the other cpus will send tlb flush ipis. ++ * 1a4) change cr3. ++ * 1b) thread switch without mm change ++ * cpu active_mm is correct, cpu0 already handles ++ * flush ipis. ++ * 1b1) set cpu mmu_state to TLBSTATE_OK ++ * 1b2) test_and_set the cpu bit in cpu_vm_mask. ++ * Atomically set the bit [other cpus will start sending flush ipis], ++ * and test the bit. 
++ * 1b3) if the bit was 0: leave_mm was called, flush the tlb. ++ * 2) switch %%esp, ie current ++ * ++ * The interrupt must handle 2 special cases: ++ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. ++ * - the cpu performs speculative tlb reads, i.e. even if the cpu only ++ * runs in kernel space, the cpu could load tlb entries for user space ++ * pages. ++ * ++ * The good news is that cpu mmu_state is local to each cpu, no ++ * write/read ordering problems. ++ */ ++ ++/* ++ * TLB flush IPI: ++ * ++ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. ++ * 2) Leave the mm if we are in the lazy tlb mode. ++ * ++ * Interrupts are disabled. ++ */ ++ ++/* ++ * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop ++ * but still used for documentation purpose but the usage is slightly ++ * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt ++ * entry calls in with the first parameter in %eax. Maybe define ++ * intrlinkage? ++ */ ++#ifdef CONFIG_X86_64 ++asmlinkage ++#endif ++void smp_invalidate_interrupt(struct pt_regs *regs) ++{ ++ unsigned int cpu; ++ unsigned int sender; ++ union smp_flush_state *f; ++ ++ cpu = smp_processor_id(); ++ /* ++ * orig_rax contains the negated interrupt vector. ++ * Use that to determine where the sender put the data. ++ */ ++ sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; ++ f = &flush_state[sender]; ++ ++ if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) ++ goto out; ++ /* ++ * This was a BUG() but until someone can quote me the ++ * line from the intel manual that guarantees an IPI to ++ * multiple CPUs is retried _only_ on the erroring CPUs ++ * its staying as a return ++ * ++ * BUG(); ++ */ ++ ++ if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { ++ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { ++ if (f->flush_va == TLB_FLUSH_ALL) ++ local_flush_tlb(); ++ else ++ __flush_tlb_one(f->flush_va); ++ } else ++ leave_mm(cpu); ++ } ++out: ++ ack_APIC_irq(); ++ smp_mb__before_clear_bit(); ++ cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); ++ smp_mb__after_clear_bit(); ++ inc_irq_stat(irq_tlb_count); ++} ++ ++static void flush_tlb_others_ipi(const struct cpumask *cpumask, ++ struct mm_struct *mm, unsigned long va) ++{ ++ unsigned int sender; ++ union smp_flush_state *f; ++ ++ /* Caller has disabled preemption */ ++ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; ++ f = &flush_state[sender]; ++ ++ /* ++ * Could avoid this lock when ++ * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is ++ * probably not worth checking this for a cache-hot lock. ++ */ ++ spin_lock(&f->tlbstate_lock); ++ ++ f->flush_mm = mm; ++ f->flush_va = va; ++ cpumask_andnot(to_cpumask(f->flush_cpumask), ++ cpumask, cpumask_of(smp_processor_id())); ++ ++ /* ++ * We have to send the IPI only to ++ * CPUs affected. 
++ */ ++ apic->send_IPI_mask(to_cpumask(f->flush_cpumask), ++ INVALIDATE_TLB_VECTOR_START + sender); ++ ++ while (!cpumask_empty(to_cpumask(f->flush_cpumask))) ++ cpu_relax(); ++ ++ f->flush_mm = NULL; ++ f->flush_va = 0; ++ spin_unlock(&f->tlbstate_lock); ++} ++ ++void native_flush_tlb_others(const struct cpumask *cpumask, ++ struct mm_struct *mm, unsigned long va) ++{ ++ if (is_uv_system()) { ++ unsigned int cpu; ++ ++ cpu = get_cpu(); ++ cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); ++ if (cpumask) ++ flush_tlb_others_ipi(cpumask, mm, va); ++ put_cpu(); ++ return; ++ } ++ flush_tlb_others_ipi(cpumask, mm, va); ++} ++ ++static int __cpuinit init_smp_flush(void) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(flush_state); i++) ++ spin_lock_init(&flush_state[i].tlbstate_lock); ++ ++ return 0; ++} ++core_initcall(init_smp_flush); ++ ++void flush_tlb_current_task(void) ++{ ++ struct mm_struct *mm = current->mm; ++ ++ preempt_disable(); ++ ++ local_flush_tlb(); ++ if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) ++ flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); ++ preempt_enable(); ++} ++ ++void flush_tlb_mm(struct mm_struct *mm) ++{ ++ preempt_disable(); ++ ++ if (current->active_mm == mm) { ++ if (current->mm) ++ local_flush_tlb(); ++ else ++ leave_mm(smp_processor_id()); ++ } ++ if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) ++ flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); ++ ++ preempt_enable(); ++} ++ ++void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ ++ preempt_disable(); ++ ++ if (current->active_mm == mm) { ++ if (current->mm) ++ __flush_tlb_one(va); ++ else ++ leave_mm(smp_processor_id()); ++ } ++ ++ if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) ++ flush_tlb_others(&mm->cpu_vm_mask, mm, va); ++ ++ preempt_enable(); ++} ++ ++static void do_flush_tlb_all(void *info) ++{ ++ unsigned long cpu = smp_processor_id(); ++ ++ __flush_tlb_all(); ++ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) ++ leave_mm(cpu); ++} ++ ++void flush_tlb_all(void) ++{ ++ on_each_cpu(do_flush_tlb_all, NULL, 1); ++} +Index: linux-2.6-tip/arch/x86/oprofile/nmi_int.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/oprofile/nmi_int.c ++++ linux-2.6-tip/arch/x86/oprofile/nmi_int.c +@@ -40,8 +40,9 @@ static int profile_exceptions_notify(str + + switch (val) { + case DIE_NMI: +- if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) +- ret = NOTIFY_STOP; ++ case DIE_NMI_IPI: ++ model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); ++ ret = NOTIFY_STOP; + break; + default: + break; +@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy) + static struct notifier_block profile_exceptions_nb = { + .notifier_call = profile_exceptions_notify, + .next = NULL, +- .priority = 0 ++ .priority = 2 + }; + + static int nmi_setup(void) +Index: linux-2.6-tip/arch/x86/oprofile/op_model_p4.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/oprofile/op_model_p4.c ++++ linux-2.6-tip/arch/x86/oprofile/op_model_p4.c +@@ -380,7 +380,7 @@ static unsigned int get_stagger(void) + { + #ifdef CONFIG_SMP + int cpu = smp_processor_id(); +- return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); ++ return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); + #endif + return 0; + } +Index: linux-2.6-tip/arch/x86/oprofile/op_model_ppro.c 
+=================================================================== +--- linux-2.6-tip.orig/arch/x86/oprofile/op_model_ppro.c ++++ linux-2.6-tip/arch/x86/oprofile/op_model_ppro.c +@@ -18,7 +18,7 @@ + #include + #include + #include +-#include ++#include + + #include "op_x86_model.h" + #include "op_counter.h" +@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_reg + u64 val; + int i; + ++ /* ++ * This can happen if perf counters are in use when ++ * we steal the die notifier NMI. ++ */ ++ if (unlikely(!reset_value)) ++ goto out; ++ + for (i = 0 ; i < num_counters; ++i) { + if (!reset_value[i]) + continue; +@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_reg + } + } + ++out: + /* Only P6 based Pentium M need to re-unmask the apic vector but it + * doesn't hurt other P6 variant */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); +Index: linux-2.6-tip/arch/x86/pci/amd_bus.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/pci/amd_bus.c ++++ linux-2.6-tip/arch/x86/pci/amd_bus.c +@@ -277,8 +277,8 @@ static int __init early_fill_mp_bus_info + { + int i; + int j; +- unsigned bus; +- unsigned slot; ++ unsigned uninitialized_var(bus); ++ unsigned uninitialized_var(slot); + int found; + int node; + int link; +Index: linux-2.6-tip/arch/x86/pci/common.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/pci/common.c ++++ linux-2.6-tip/arch/x86/pci/common.c +@@ -81,7 +81,7 @@ int pcibios_scanned; + * This interrupt-safe spinlock protects all accesses to PCI + * configuration space. + */ +-DEFINE_SPINLOCK(pci_config_lock); ++DEFINE_RAW_SPINLOCK(pci_config_lock); + + static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) + { +@@ -90,7 +90,7 @@ static int __devinit can_skip_ioresource + return 0; + } + +-static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = { ++static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = { + /* + * Systems where PCI IO resource ISA alignment can be skipped + * when the ISA enable bit in the bridge control is not set +@@ -183,7 +183,7 @@ static int __devinit assign_all_busses(c + } + #endif + +-static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { ++static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { + #ifdef __i386__ + /* + * Laptops which need pci=assign-busses to see Cardbus cards +Index: linux-2.6-tip/arch/x86/pci/fixup.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/pci/fixup.c ++++ linux-2.6-tip/arch/x86/pci/fixup.c +@@ -356,7 +356,7 @@ static void __devinit pci_fixup_video(st + DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); + + +-static struct dmi_system_id __devinitdata msi_k8t_dmi_table[] = { ++static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = { + { + .ident = "MSI-K8T-Neo2Fir", + .matches = { +@@ -413,7 +413,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_V + */ + static u16 toshiba_line_size; + +-static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = { ++static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = { + { + .ident = "Toshiba PS5 based laptop", + .matches = { +Index: linux-2.6-tip/arch/x86/pci/numaq_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/pci/numaq_32.c ++++ linux-2.6-tip/arch/x86/pci/numaq_32.c +@@ -5,7 +5,7 @@ + #include + 
#include + #include +-#include ++#include + #include + #include + +@@ -18,10 +18,6 @@ + + #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) + +-/* Where the IO area was mapped on multiquad, always 0 otherwise */ +-void *xquad_portio; +-EXPORT_SYMBOL(xquad_portio); +- + #define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) + + #define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ +Index: linux-2.6-tip/arch/x86/pci/pcbios.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/pci/pcbios.c ++++ linux-2.6-tip/arch/x86/pci/pcbios.c +@@ -7,7 +7,7 @@ + #include + #include + #include +-#include ++#include + + /* BIOS32 signature: "_32_" */ + #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) +Index: linux-2.6-tip/arch/x86/power/hibernate_asm_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/power/hibernate_asm_32.S ++++ linux-2.6-tip/arch/x86/power/hibernate_asm_32.S +@@ -8,7 +8,7 @@ + + #include + #include +-#include ++#include + #include + #include + +Index: linux-2.6-tip/arch/x86/power/hibernate_asm_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/power/hibernate_asm_64.S ++++ linux-2.6-tip/arch/x86/power/hibernate_asm_64.S +@@ -18,7 +18,7 @@ + .text + #include + #include +-#include ++#include + #include + #include + +Index: linux-2.6-tip/arch/x86/vdso/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/vdso/Makefile ++++ linux-2.6-tip/arch/x86/vdso/Makefile +@@ -38,7 +38,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + + CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ +- $(filter -g%,$(KBUILD_CFLAGS)) ++ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) + + $(vobjs): KBUILD_CFLAGS += $(CFL) + +Index: linux-2.6-tip/arch/x86/vdso/vma.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/vdso/vma.c ++++ linux-2.6-tip/arch/x86/vdso/vma.c +@@ -85,8 +85,8 @@ static unsigned long vdso_addr(unsigned + unsigned long addr, end; + unsigned offset; + end = (start + PMD_SIZE - 1) & PMD_MASK; +- if (end >= TASK_SIZE64) +- end = TASK_SIZE64; ++ if (end >= TASK_SIZE_MAX) ++ end = TASK_SIZE_MAX; + end -= len; + /* This loses some more bits than a modulo, but is cheaper */ + offset = get_random_int() & (PTRS_PER_PTE - 1); +Index: linux-2.6-tip/arch/x86/xen/Kconfig +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/Kconfig ++++ linux-2.6-tip/arch/x86/xen/Kconfig +@@ -6,7 +6,7 @@ config XEN + bool "Xen guest support" + select PARAVIRT + select PARAVIRT_CLOCK +- depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER)) ++ depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS) + depends on X86_CMPXCHG && X86_TSC + help + This is the Linux Xen port. 
Enabling this will allow the +Index: linux-2.6-tip/arch/x86/xen/Makefile +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/Makefile ++++ linux-2.6-tip/arch/x86/xen/Makefile +@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg + endif + + obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ +- time.o xen-asm_$(BITS).o grant-table.o suspend.o ++ time.o xen-asm.o xen-asm_$(BITS).o \ ++ grant-table.o suspend.o + + obj-$(CONFIG_SMP) += smp.o spinlock.o + obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o +\ No newline at end of file +Index: linux-2.6-tip/arch/x86/xen/enlighten.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/enlighten.c ++++ linux-2.6-tip/arch/x86/xen/enlighten.c +@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcp + enum xen_domain_type xen_domain_type = XEN_NATIVE; + EXPORT_SYMBOL_GPL(xen_domain_type); + +-/* +- * Identity map, in addition to plain kernel map. This needs to be +- * large enough to allocate page table pages to allocate the rest. +- * Each page can map 2MB. +- */ +-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; +- +-#ifdef CONFIG_X86_64 +-/* l3 pud for userspace vsyscall mapping */ +-static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +-#endif /* CONFIG_X86_64 */ +- +-/* +- * Note about cr3 (pagetable base) values: +- * +- * xen_cr3 contains the current logical cr3 value; it contains the +- * last set cr3. This may not be the current effective cr3, because +- * its update may be being lazily deferred. However, a vcpu looking +- * at its own cr3 can use this value knowing that it everything will +- * be self-consistent. +- * +- * xen_current_cr3 contains the actual vcpu cr3; it is set once the +- * hypercall to set the vcpu cr3 is complete (so it may be a little +- * out of date, but it will never be set early). If one vcpu is +- * looking at another vcpu's cr3 value, it should use this variable. +- */ +-DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +-DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +- + struct start_info *xen_start_info; + EXPORT_SYMBOL_GPL(xen_start_info); + + struct shared_info xen_dummy_shared_info; + ++void *xen_initial_gdt; ++ + /* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. 
+@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_in + * + * 0: not available, 1: available + */ +-static int have_vcpu_info_placement = +-#ifdef CONFIG_X86_32 +- 1 +-#else +- 0 +-#endif +- ; +- ++static int have_vcpu_info_placement = 1; + + static void xen_vcpu_setup(int cpu) + { +@@ -137,7 +103,7 @@ static void xen_vcpu_setup(int cpu) + + vcpup = &per_cpu(xen_vcpu_info, cpu); + +- info.mfn = virt_to_mfn(vcpup); ++ info.mfn = arbitrary_virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", +@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(in + return HYPERVISOR_get_debugreg(reg); + } + +-static void xen_leave_lazy(void) ++void xen_leave_lazy(void) + { + paravirt_leave_lazy(paravirt_get_lazy_mode()); + xen_mc_flush(); +@@ -335,8 +301,10 @@ static void xen_load_gdt(const struct de + frames = mcs.args; + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { +- frames[f] = virt_to_mfn(va); ++ frames[f] = arbitrary_virt_to_mfn((void *)va); ++ + make_lowmem_page_readonly((void *)va); ++ make_lowmem_page_readonly(mfn_to_virt(frames[f])); + } + + MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); +@@ -348,7 +316,7 @@ static void load_TLS_descriptor(struct t + unsigned int cpu, unsigned int i) + { + struct desc_struct *gdt = get_cpu_gdt_table(cpu); +- xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); ++ xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + struct multicall_space mc = __xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +@@ -357,13 +325,14 @@ static void load_TLS_descriptor(struct t + static void xen_load_tls(struct thread_struct *t, unsigned int cpu) + { + /* +- * XXX sleazy hack: If we're being called in a lazy-cpu zone, +- * it means we're in a context switch, and %gs has just been +- * saved. This means we can zero it out to prevent faults on +- * exit from the hypervisor if the next process has no %gs. +- * Either way, it has been saved, and the new value will get +- * loaded properly. This will go away as soon as Xen has been +- * modified to not save/restore %gs for normal hypercalls. ++ * XXX sleazy hack: If we're being called in a lazy-cpu zone ++ * and lazy gs handling is enabled, it means we're in a ++ * context switch, and %gs has just been saved. This means we ++ * can zero it out to prevent faults on exit from the ++ * hypervisor if the next process has no %gs. Either way, it ++ * has been saved, and the new value will get loaded properly. ++ * This will go away as soon as Xen has been modified to not ++ * save/restore %gs for normal hypercalls. 
+ * + * On x86_64, this hack is not used for %gs, because gs points + * to KERNEL_GS_BASE (and uses it for PDA references), so we +@@ -375,7 +344,7 @@ static void xen_load_tls(struct thread_s + */ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { + #ifdef CONFIG_X86_32 +- loadsegment(gs, 0); ++ lazy_load_gs(0); + #else + loadsegment(fs, 0); + #endif +@@ -521,7 +490,7 @@ static void xen_write_gdt_entry(struct d + break; + + default: { +- xmaddr_t maddr = virt_to_machine(&dt[entry]); ++ xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) +@@ -587,94 +556,18 @@ static u32 xen_safe_apic_wait_icr_idle(v + return 0; + } + +-static struct apic_ops xen_basic_apic_ops = { +- .read = xen_apic_read, +- .write = xen_apic_write, +- .icr_read = xen_apic_icr_read, +- .icr_write = xen_apic_icr_write, +- .wait_icr_idle = xen_apic_wait_icr_idle, +- .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, +-}; +- +-#endif +- +-static void xen_flush_tlb(void) ++static void set_xen_basic_apic_ops(void) + { +- struct mmuext_op *op; +- struct multicall_space mcs; +- +- preempt_disable(); +- +- mcs = xen_mc_entry(sizeof(*op)); +- +- op = mcs.args; +- op->cmd = MMUEXT_TLB_FLUSH_LOCAL; +- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); +- +- xen_mc_issue(PARAVIRT_LAZY_MMU); +- +- preempt_enable(); +-} +- +-static void xen_flush_tlb_single(unsigned long addr) +-{ +- struct mmuext_op *op; +- struct multicall_space mcs; +- +- preempt_disable(); +- +- mcs = xen_mc_entry(sizeof(*op)); +- op = mcs.args; +- op->cmd = MMUEXT_INVLPG_LOCAL; +- op->arg1.linear_addr = addr & PAGE_MASK; +- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); +- +- xen_mc_issue(PARAVIRT_LAZY_MMU); +- +- preempt_enable(); ++ apic->read = xen_apic_read; ++ apic->write = xen_apic_write; ++ apic->icr_read = xen_apic_icr_read; ++ apic->icr_write = xen_apic_icr_write; ++ apic->wait_icr_idle = xen_apic_wait_icr_idle; ++ apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; + } + +-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, +- unsigned long va) +-{ +- struct { +- struct mmuext_op op; +- cpumask_t mask; +- } *args; +- cpumask_t cpumask = *cpus; +- struct multicall_space mcs; +- +- /* +- * A couple of (to be removed) sanity checks: +- * +- * - current CPU must not be in mask +- * - mask must exist :) +- */ +- BUG_ON(cpus_empty(cpumask)); +- BUG_ON(cpu_isset(smp_processor_id(), cpumask)); +- BUG_ON(!mm); +- +- /* If a CPU which we ran on has gone down, OK. 
*/ +- cpus_and(cpumask, cpumask, cpu_online_map); +- if (cpus_empty(cpumask)) +- return; +- +- mcs = xen_mc_entry(sizeof(*args)); +- args = mcs.args; +- args->mask = cpumask; +- args->op.arg2.vcpumask = &args->mask; +- +- if (va == TLB_FLUSH_ALL) { +- args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; +- } else { +- args->op.cmd = MMUEXT_INVLPG_MULTI; +- args->op.arg1.linear_addr = va; +- } +- +- MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); ++#endif + +- xen_mc_issue(PARAVIRT_LAZY_MMU); +-} + + static void xen_clts(void) + { +@@ -700,21 +593,6 @@ static void xen_write_cr0(unsigned long + xen_mc_issue(PARAVIRT_LAZY_CPU); + } + +-static void xen_write_cr2(unsigned long cr2) +-{ +- x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; +-} +- +-static unsigned long xen_read_cr2(void) +-{ +- return x86_read_percpu(xen_vcpu)->arch.cr2; +-} +- +-static unsigned long xen_read_cr2_direct(void) +-{ +- return x86_read_percpu(xen_vcpu_info.arch.cr2); +-} +- + static void xen_write_cr4(unsigned long cr4) + { + cr4 &= ~X86_CR4_PGE; +@@ -723,71 +601,6 @@ static void xen_write_cr4(unsigned long + native_write_cr4(cr4); + } + +-static unsigned long xen_read_cr3(void) +-{ +- return x86_read_percpu(xen_cr3); +-} +- +-static void set_current_cr3(void *v) +-{ +- x86_write_percpu(xen_current_cr3, (unsigned long)v); +-} +- +-static void __xen_write_cr3(bool kernel, unsigned long cr3) +-{ +- struct mmuext_op *op; +- struct multicall_space mcs; +- unsigned long mfn; +- +- if (cr3) +- mfn = pfn_to_mfn(PFN_DOWN(cr3)); +- else +- mfn = 0; +- +- WARN_ON(mfn == 0 && kernel); +- +- mcs = __xen_mc_entry(sizeof(*op)); +- +- op = mcs.args; +- op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; +- op->arg1.mfn = mfn; +- +- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); +- +- if (kernel) { +- x86_write_percpu(xen_cr3, cr3); +- +- /* Update xen_current_cr3 once the batch has actually +- been submitted. */ +- xen_mc_callback(set_current_cr3, (void *)cr3); +- } +-} +- +-static void xen_write_cr3(unsigned long cr3) +-{ +- BUG_ON(preemptible()); +- +- xen_mc_batch(); /* disables interrupts */ +- +- /* Update while interrupts are disabled, so its atomic with +- respect to ipis */ +- x86_write_percpu(xen_cr3, cr3); +- +- __xen_write_cr3(true, cr3); +- +-#ifdef CONFIG_X86_64 +- { +- pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); +- if (user_pgd) +- __xen_write_cr3(false, __pa(user_pgd)); +- else +- __xen_write_cr3(false, 0); +- } +-#endif +- +- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +-} +- + static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) + { + int ret; +@@ -829,185 +642,6 @@ static int xen_write_msr_safe(unsigned i + return ret; + } + +-/* Early in boot, while setting up the initial pagetable, assume +- everything is pinned. */ +-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +-{ +-#ifdef CONFIG_FLATMEM +- BUG_ON(mem_map); /* should only be used early */ +-#endif +- make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +-} +- +-/* Early release_pte assumes that all pts are pinned, since there's +- only init_mm and anything attached to that is pinned. 
*/ +-static void xen_release_pte_init(unsigned long pfn) +-{ +- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +-} +- +-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +-{ +- struct mmuext_op op; +- op.cmd = cmd; +- op.arg1.mfn = pfn_to_mfn(pfn); +- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) +- BUG(); +-} +- +-/* This needs to make sure the new pte page is pinned iff its being +- attached to a pinned pagetable. */ +-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) +-{ +- struct page *page = pfn_to_page(pfn); +- +- if (PagePinned(virt_to_page(mm->pgd))) { +- SetPagePinned(page); +- +- vm_unmap_aliases(); +- if (!PageHighMem(page)) { +- make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); +- if (level == PT_PTE && USE_SPLIT_PTLOCKS) +- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); +- } else { +- /* make sure there are no stray mappings of +- this page */ +- kmap_flush_unused(); +- } +- } +-} +- +-static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) +-{ +- xen_alloc_ptpage(mm, pfn, PT_PTE); +-} +- +-static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) +-{ +- xen_alloc_ptpage(mm, pfn, PT_PMD); +-} +- +-static int xen_pgd_alloc(struct mm_struct *mm) +-{ +- pgd_t *pgd = mm->pgd; +- int ret = 0; +- +- BUG_ON(PagePinned(virt_to_page(pgd))); +- +-#ifdef CONFIG_X86_64 +- { +- struct page *page = virt_to_page(pgd); +- pgd_t *user_pgd; +- +- BUG_ON(page->private != 0); +- +- ret = -ENOMEM; +- +- user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); +- page->private = (unsigned long)user_pgd; +- +- if (user_pgd != NULL) { +- user_pgd[pgd_index(VSYSCALL_START)] = +- __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); +- ret = 0; +- } +- +- BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); +- } +-#endif +- +- return ret; +-} +- +-static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +-{ +-#ifdef CONFIG_X86_64 +- pgd_t *user_pgd = xen_get_user_pgd(pgd); +- +- if (user_pgd) +- free_page((unsigned long)user_pgd); +-#endif +-} +- +-/* This should never happen until we're OK to use struct page */ +-static void xen_release_ptpage(unsigned long pfn, unsigned level) +-{ +- struct page *page = pfn_to_page(pfn); +- +- if (PagePinned(page)) { +- if (!PageHighMem(page)) { +- if (level == PT_PTE && USE_SPLIT_PTLOCKS) +- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); +- make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +- } +- ClearPagePinned(page); +- } +-} +- +-static void xen_release_pte(unsigned long pfn) +-{ +- xen_release_ptpage(pfn, PT_PTE); +-} +- +-static void xen_release_pmd(unsigned long pfn) +-{ +- xen_release_ptpage(pfn, PT_PMD); +-} +- +-#if PAGETABLE_LEVELS == 4 +-static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) +-{ +- xen_alloc_ptpage(mm, pfn, PT_PUD); +-} +- +-static void xen_release_pud(unsigned long pfn) +-{ +- xen_release_ptpage(pfn, PT_PUD); +-} +-#endif +- +-#ifdef CONFIG_HIGHPTE +-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) +-{ +- pgprot_t prot = PAGE_KERNEL; +- +- if (PagePinned(page)) +- prot = PAGE_KERNEL_RO; +- +- if (0 && PageHighMem(page)) +- printk("mapping highpte %lx type %d prot %s\n", +- page_to_pfn(page), type, +- (unsigned long)pgprot_val(prot) & _PAGE_RW ? 
"WRITE" : "READ"); +- +- return kmap_atomic_prot(page, type, prot); +-} +-#endif +- +-#ifdef CONFIG_X86_32 +-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +-{ +- /* If there's an existing pte, then don't allow _PAGE_RW to be set */ +- if (pte_val_ma(*ptep) & _PAGE_PRESENT) +- pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & +- pte_val_ma(pte)); +- +- return pte; +-} +- +-/* Init-time set_pte while constructing initial pagetables, which +- doesn't allow RO pagetable pages to be remapped RW */ +-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +-{ +- pte = mask_rw_pte(ptep, pte); +- +- xen_set_pte(ptep, pte); +-} +-#endif +- +-static __init void xen_pagetable_setup_start(pgd_t *base) +-{ +-} +- + void xen_setup_shared_info(void) + { + if (!xen_feature(XENFEAT_auto_translated_physmap)) { +@@ -1028,37 +662,6 @@ void xen_setup_shared_info(void) + xen_setup_mfn_list_list(); + } + +-static __init void xen_pagetable_setup_done(pgd_t *base) +-{ +- xen_setup_shared_info(); +-} +- +-static __init void xen_post_allocator_init(void) +-{ +- pv_mmu_ops.set_pte = xen_set_pte; +- pv_mmu_ops.set_pmd = xen_set_pmd; +- pv_mmu_ops.set_pud = xen_set_pud; +-#if PAGETABLE_LEVELS == 4 +- pv_mmu_ops.set_pgd = xen_set_pgd; +-#endif +- +- /* This will work as long as patching hasn't happened yet +- (which it hasn't) */ +- pv_mmu_ops.alloc_pte = xen_alloc_pte; +- pv_mmu_ops.alloc_pmd = xen_alloc_pmd; +- pv_mmu_ops.release_pte = xen_release_pte; +- pv_mmu_ops.release_pmd = xen_release_pmd; +-#if PAGETABLE_LEVELS == 4 +- pv_mmu_ops.alloc_pud = xen_alloc_pud; +- pv_mmu_ops.release_pud = xen_release_pud; +-#endif +- +-#ifdef CONFIG_X86_64 +- SetPagePinned(virt_to_page(level3_user_vsyscall)); +-#endif +- xen_mark_init_mm_pinned(); +-} +- + /* This is called once we have the cpu_possible_map */ + void xen_setup_vcpu_info_placement(void) + { +@@ -1072,10 +675,10 @@ void xen_setup_vcpu_info_placement(void) + if (have_vcpu_info_placement) { + printk(KERN_INFO "Xen: using vcpu_info placement\n"); + +- pv_irq_ops.save_fl = xen_save_fl_direct; +- pv_irq_ops.restore_fl = xen_restore_fl_direct; +- pv_irq_ops.irq_disable = xen_irq_disable_direct; +- pv_irq_ops.irq_enable = xen_irq_enable_direct; ++ pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); ++ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); ++ pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); ++ pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); + pv_mmu_ops.read_cr2 = xen_read_cr2_direct; + } + } +@@ -1133,49 +736,6 @@ static unsigned xen_patch(u8 type, u16 c + return ret; + } + +-static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) +-{ +- pte_t pte; +- +- phys >>= PAGE_SHIFT; +- +- switch (idx) { +- case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: +-#ifdef CONFIG_X86_F00F_BUG +- case FIX_F00F_IDT: +-#endif +-#ifdef CONFIG_X86_32 +- case FIX_WP_TEST: +- case FIX_VDSO: +-# ifdef CONFIG_HIGHMEM +- case FIX_KMAP_BEGIN ... FIX_KMAP_END: +-# endif +-#else +- case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: +-#endif +-#ifdef CONFIG_X86_LOCAL_APIC +- case FIX_APIC_BASE: /* maps dummy local APIC */ +-#endif +- pte = pfn_pte(phys, prot); +- break; +- +- default: +- pte = mfn_pte(phys, prot); +- break; +- } +- +- __native_set_fixmap(idx, pte); +- +-#ifdef CONFIG_X86_64 +- /* Replicate changes to map the vsyscall page into the user +- pagetable vsyscall mapping. 
*/ +- if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { +- unsigned long vaddr = __fix_to_virt(idx); +- set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); +- } +-#endif +-} +- + static const struct pv_info xen_info __initdata = { + .paravirt_enabled = 1, + .shared_kernel_pmd = 0, +@@ -1271,87 +831,6 @@ static const struct pv_apic_ops xen_apic + #endif + }; + +-static const struct pv_mmu_ops xen_mmu_ops __initdata = { +- .pagetable_setup_start = xen_pagetable_setup_start, +- .pagetable_setup_done = xen_pagetable_setup_done, +- +- .read_cr2 = xen_read_cr2, +- .write_cr2 = xen_write_cr2, +- +- .read_cr3 = xen_read_cr3, +- .write_cr3 = xen_write_cr3, +- +- .flush_tlb_user = xen_flush_tlb, +- .flush_tlb_kernel = xen_flush_tlb, +- .flush_tlb_single = xen_flush_tlb_single, +- .flush_tlb_others = xen_flush_tlb_others, +- +- .pte_update = paravirt_nop, +- .pte_update_defer = paravirt_nop, +- +- .pgd_alloc = xen_pgd_alloc, +- .pgd_free = xen_pgd_free, +- +- .alloc_pte = xen_alloc_pte_init, +- .release_pte = xen_release_pte_init, +- .alloc_pmd = xen_alloc_pte_init, +- .alloc_pmd_clone = paravirt_nop, +- .release_pmd = xen_release_pte_init, +- +-#ifdef CONFIG_HIGHPTE +- .kmap_atomic_pte = xen_kmap_atomic_pte, +-#endif +- +-#ifdef CONFIG_X86_64 +- .set_pte = xen_set_pte, +-#else +- .set_pte = xen_set_pte_init, +-#endif +- .set_pte_at = xen_set_pte_at, +- .set_pmd = xen_set_pmd_hyper, +- +- .ptep_modify_prot_start = __ptep_modify_prot_start, +- .ptep_modify_prot_commit = __ptep_modify_prot_commit, +- +- .pte_val = xen_pte_val, +- .pte_flags = native_pte_flags, +- .pgd_val = xen_pgd_val, +- +- .make_pte = xen_make_pte, +- .make_pgd = xen_make_pgd, +- +-#ifdef CONFIG_X86_PAE +- .set_pte_atomic = xen_set_pte_atomic, +- .set_pte_present = xen_set_pte_at, +- .pte_clear = xen_pte_clear, +- .pmd_clear = xen_pmd_clear, +-#endif /* CONFIG_X86_PAE */ +- .set_pud = xen_set_pud_hyper, +- +- .make_pmd = xen_make_pmd, +- .pmd_val = xen_pmd_val, +- +-#if PAGETABLE_LEVELS == 4 +- .pud_val = xen_pud_val, +- .make_pud = xen_make_pud, +- .set_pgd = xen_set_pgd_hyper, +- +- .alloc_pud = xen_alloc_pte_init, +- .release_pud = xen_release_pte_init, +-#endif /* PAGETABLE_LEVELS == 4 */ +- +- .activate_mm = xen_activate_mm, +- .dup_mmap = xen_dup_mmap, +- .exit_mmap = xen_exit_mmap, +- +- .lazy_mode = { +- .enter = paravirt_enter_lazy_mmu, +- .leave = xen_leave_lazy, +- }, +- +- .set_fixmap = xen_set_fixmap, +-}; +- + static void xen_reboot(int reason) + { + struct sched_shutdown r = { .reason = reason }; +@@ -1394,223 +873,6 @@ static const struct machine_ops __initda + }; + + +-static void __init xen_reserve_top(void) +-{ +-#ifdef CONFIG_X86_32 +- unsigned long top = HYPERVISOR_VIRT_START; +- struct xen_platform_parameters pp; +- +- if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) +- top = pp.virt_start; +- +- reserve_top_address(-top); +-#endif /* CONFIG_X86_32 */ +-} +- +-/* +- * Like __va(), but returns address in the kernel mapping (which is +- * all we have until the physical memory mapping has been set up. 
+- */ +-static void *__ka(phys_addr_t paddr) +-{ +-#ifdef CONFIG_X86_64 +- return (void *)(paddr + __START_KERNEL_map); +-#else +- return __va(paddr); +-#endif +-} +- +-/* Convert a machine address to physical address */ +-static unsigned long m2p(phys_addr_t maddr) +-{ +- phys_addr_t paddr; +- +- maddr &= PTE_PFN_MASK; +- paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; +- +- return paddr; +-} +- +-/* Convert a machine address to kernel virtual */ +-static void *m2v(phys_addr_t maddr) +-{ +- return __ka(m2p(maddr)); +-} +- +-static void set_page_prot(void *addr, pgprot_t prot) +-{ +- unsigned long pfn = __pa(addr) >> PAGE_SHIFT; +- pte_t pte = pfn_pte(pfn, prot); +- +- if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) +- BUG(); +-} +- +-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +-{ +- unsigned pmdidx, pteidx; +- unsigned ident_pte; +- unsigned long pfn; +- +- ident_pte = 0; +- pfn = 0; +- for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { +- pte_t *pte_page; +- +- /* Reuse or allocate a page of ptes */ +- if (pmd_present(pmd[pmdidx])) +- pte_page = m2v(pmd[pmdidx].pmd); +- else { +- /* Check for free pte pages */ +- if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) +- break; +- +- pte_page = &level1_ident_pgt[ident_pte]; +- ident_pte += PTRS_PER_PTE; +- +- pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); +- } +- +- /* Install mappings */ +- for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { +- pte_t pte; +- +- if (pfn > max_pfn_mapped) +- max_pfn_mapped = pfn; +- +- if (!pte_none(pte_page[pteidx])) +- continue; +- +- pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); +- pte_page[pteidx] = pte; +- } +- } +- +- for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) +- set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); +- +- set_page_prot(pmd, PAGE_KERNEL_RO); +-} +- +-#ifdef CONFIG_X86_64 +-static void convert_pfn_mfn(void *v) +-{ +- pte_t *pte = v; +- int i; +- +- /* All levels are converted the same way, so just treat them +- as ptes. */ +- for (i = 0; i < PTRS_PER_PTE; i++) +- pte[i] = xen_make_pte(pte[i].pte); +-} +- +-/* +- * Set up the inital kernel pagetable. +- * +- * We can construct this by grafting the Xen provided pagetable into +- * head_64.S's preconstructed pagetables. We copy the Xen L2's into +- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This +- * means that only the kernel has a physical mapping to start with - +- * but that's enough to get __va working. We need to fill in the rest +- * of the physical mapping once some sort of allocator has been set +- * up. 
+- */ +-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, +- unsigned long max_pfn) +-{ +- pud_t *l3; +- pmd_t *l2; +- +- /* Zap identity mapping */ +- init_level4_pgt[0] = __pgd(0); +- +- /* Pre-constructed entries are in pfn, so convert to mfn */ +- convert_pfn_mfn(init_level4_pgt); +- convert_pfn_mfn(level3_ident_pgt); +- convert_pfn_mfn(level3_kernel_pgt); +- +- l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); +- l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); +- +- memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); +- memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); +- +- l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); +- l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); +- memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); +- +- /* Set up identity map */ +- xen_map_identity_early(level2_ident_pgt, max_pfn); +- +- /* Make pagetable pieces RO */ +- set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); +- set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); +- set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); +- set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); +- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); +- set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); +- +- /* Pin down new L4 */ +- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, +- PFN_DOWN(__pa_symbol(init_level4_pgt))); +- +- /* Unpin Xen-provided one */ +- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +- +- /* Switch over */ +- pgd = init_level4_pgt; +- +- /* +- * At this stage there can be no user pgd, and no page +- * structure to attach it to, so make sure we just set kernel +- * pgd. +- */ +- xen_mc_batch(); +- __xen_write_cr3(true, __pa(pgd)); +- xen_mc_issue(PARAVIRT_LAZY_CPU); +- +- reserve_early(__pa(xen_start_info->pt_base), +- __pa(xen_start_info->pt_base + +- xen_start_info->nr_pt_frames * PAGE_SIZE), +- "XEN PAGETABLES"); +- +- return pgd; +-} +-#else /* !CONFIG_X86_64 */ +-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; +- +-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, +- unsigned long max_pfn) +-{ +- pmd_t *kernel_pmd; +- +- init_pg_tables_start = __pa(pgd); +- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; +- max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); +- +- kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); +- memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); +- +- xen_map_identity_early(level2_kernel_pgt, max_pfn); +- +- memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); +- set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], +- __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); +- +- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); +- set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); +- set_page_prot(empty_zero_page, PAGE_KERNEL_RO); +- +- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +- +- xen_write_cr3(__pa(swapper_pg_dir)); +- +- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); +- +- return swapper_pg_dir; +-} +-#endif /* CONFIG_X86_64 */ +- + /* First C function to be called on Xen boot */ + asmlinkage void __init xen_start_kernel(void) + { +@@ -1639,7 +901,7 @@ asmlinkage void __init xen_start_kernel( + /* + * set up the basic apic ops. 
+ */ +- apic_ops = &xen_basic_apic_ops; ++ set_xen_basic_apic_ops(); + #endif + + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { +@@ -1650,10 +912,18 @@ asmlinkage void __init xen_start_kernel( + machine_ops = xen_machine_ops; + + #ifdef CONFIG_X86_64 +- /* Disable until direct per-cpu data access. */ +- have_vcpu_info_placement = 0; +- x86_64_init_pda(); ++ /* ++ * Setup percpu state. We only need to do this for 64-bit ++ * because 32-bit already has %fs set properly. ++ */ ++ load_percpu_segment(0); + #endif ++ /* ++ * The only reliable way to retain the initial address of the ++ * percpu gdt_page is to remember it here, so we can go and ++ * mark it RW later, when the initial percpu area is freed. ++ */ ++ xen_initial_gdt = &per_cpu(gdt_page, 0); + + xen_smp_init(); + +Index: linux-2.6-tip/arch/x86/xen/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/irq.c ++++ linux-2.6-tip/arch/x86/xen/irq.c +@@ -19,27 +19,12 @@ void xen_force_evtchn_callback(void) + (void)HYPERVISOR_xen_version(0, NULL); + } + +-static void __init __xen_init_IRQ(void) +-{ +- int i; +- +- /* Create identity vector->irq map */ +- for(i = 0; i < NR_VECTORS; i++) { +- int cpu; +- +- for_each_possible_cpu(cpu) +- per_cpu(vector_irq, cpu)[i] = i; +- } +- +- xen_init_IRQ(); +-} +- + static unsigned long xen_save_fl(void) + { + struct vcpu_info *vcpu; + unsigned long flags; + +- vcpu = x86_read_percpu(xen_vcpu); ++ vcpu = percpu_read(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; +@@ -50,6 +35,7 @@ static unsigned long xen_save_fl(void) + */ + return (-flags) & X86_EFLAGS_IF; + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); + + static void xen_restore_fl(unsigned long flags) + { +@@ -62,7 +48,7 @@ static void xen_restore_fl(unsigned long + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); +- vcpu = x86_read_percpu(xen_vcpu); ++ vcpu = percpu_read(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + +@@ -76,6 +62,7 @@ static void xen_restore_fl(unsigned long + xen_force_evtchn_callback(); + } + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); + + static void xen_irq_disable(void) + { +@@ -83,9 +70,10 @@ static void xen_irq_disable(void) + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); +- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; ++ percpu_read(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); + + static void xen_irq_enable(void) + { +@@ -96,7 +84,7 @@ static void xen_irq_enable(void) + the caller is confused and is trying to re-enable interrupts + on an indeterminate processor. 
*/ + +- vcpu = x86_read_percpu(xen_vcpu); ++ vcpu = percpu_read(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + + /* Doesn't matter if we get preempted here, because any +@@ -106,6 +94,7 @@ static void xen_irq_enable(void) + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); + + static void xen_safe_halt(void) + { +@@ -123,11 +112,13 @@ static void xen_halt(void) + } + + static const struct pv_irq_ops xen_irq_ops __initdata = { +- .init_IRQ = __xen_init_IRQ, +- .save_fl = xen_save_fl, +- .restore_fl = xen_restore_fl, +- .irq_disable = xen_irq_disable, +- .irq_enable = xen_irq_enable, ++ .init_IRQ = xen_init_IRQ, ++ ++ .save_fl = PV_CALLEE_SAVE(xen_save_fl), ++ .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), ++ .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), ++ .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), ++ + .safe_halt = xen_safe_halt, + .halt = xen_halt, + #ifdef CONFIG_X86_64 +Index: linux-2.6-tip/arch/x86/xen/mmu.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/mmu.c ++++ linux-2.6-tip/arch/x86/xen/mmu.c +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -55,6 +56,8 @@ + + #include + #include ++#include ++#include + + #include "multicalls.h" + #include "mmu.h" +@@ -114,6 +117,37 @@ static inline void check_zero(void) + + #endif /* CONFIG_XEN_DEBUG_FS */ + ++ ++/* ++ * Identity map, in addition to plain kernel map. This needs to be ++ * large enough to allocate page table pages to allocate the rest. ++ * Each page can map 2MB. ++ */ ++static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; ++ ++#ifdef CONFIG_X86_64 ++/* l3 pud for userspace vsyscall mapping */ ++static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; ++#endif /* CONFIG_X86_64 */ ++ ++/* ++ * Note about cr3 (pagetable base) values: ++ * ++ * xen_cr3 contains the current logical cr3 value; it contains the ++ * last set cr3. This may not be the current effective cr3, because ++ * its update may be being lazily deferred. However, a vcpu looking ++ * at its own cr3 can use this value knowing that it everything will ++ * be self-consistent. ++ * ++ * xen_current_cr3 contains the actual vcpu cr3; it is set once the ++ * hypercall to set the vcpu cr3 is complete (so it may be a little ++ * out of date, but it will never be set early). If one vcpu is ++ * looking at another vcpu's cr3 value, it should use this variable. ++ */ ++DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ ++DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ ++ ++ + /* + * Just beyond the highest usermode address. STACK_TOP_MAX has a + * redzone above it, so round it up to a PGD boundary. 
+@@ -242,6 +276,13 @@ void set_phys_to_machine(unsigned long p + p2m_top[topidx][idx] = mfn; + } + ++unsigned long arbitrary_virt_to_mfn(void *vaddr) ++{ ++ xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); ++ ++ return PFN_DOWN(maddr.maddr); ++} ++ + xmaddr_t arbitrary_virt_to_machine(void *vaddr) + { + unsigned long address = (unsigned long)vaddr; +@@ -458,28 +499,33 @@ pteval_t xen_pte_val(pte_t pte) + { + return pte_mfn_to_pfn(pte.pte); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); + + pgdval_t xen_pgd_val(pgd_t pgd) + { + return pte_mfn_to_pfn(pgd.pgd); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); + + pte_t xen_make_pte(pteval_t pte) + { + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); + + pgd_t xen_make_pgd(pgdval_t pgd) + { + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); + + pmdval_t xen_pmd_val(pmd_t pmd) + { + return pte_mfn_to_pfn(pmd.pmd); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); + + void xen_set_pud_hyper(pud_t *ptr, pud_t val) + { +@@ -556,12 +602,14 @@ pmd_t xen_make_pmd(pmdval_t pmd) + pmd = pte_pfn_to_mfn(pmd); + return native_make_pmd(pmd); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); + + #if PAGETABLE_LEVELS == 4 + pudval_t xen_pud_val(pud_t pud) + { + return pte_mfn_to_pfn(pud.pud); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); + + pud_t xen_make_pud(pudval_t pud) + { +@@ -569,6 +617,7 @@ pud_t xen_make_pud(pudval_t pud) + + return native_make_pud(pud); + } ++PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); + + pgd_t *xen_get_user_pgd(pgd_t *pgd) + { +@@ -1063,18 +1112,14 @@ static void drop_other_mm_ref(void *info + struct mm_struct *mm = info; + struct mm_struct *active_mm; + +-#ifdef CONFIG_X86_64 +- active_mm = read_pda(active_mm); +-#else +- active_mm = __get_cpu_var(cpu_tlbstate).active_mm; +-#endif ++ active_mm = percpu_read(cpu_tlbstate.active_mm); + + if (active_mm == mm) + leave_mm(smp_processor_id()); + + /* If this cpu still has a stale cr3 reference, then make sure + it has been flushed. 
*/ +- if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { ++ if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { + load_cr3(swapper_pg_dir); + arch_flush_lazy_cpu_mode(); + } +@@ -1156,6 +1201,705 @@ void xen_exit_mmap(struct mm_struct *mm) + spin_unlock(&mm->page_table_lock); + } + ++static __init void xen_pagetable_setup_start(pgd_t *base) ++{ ++} ++ ++static __init void xen_pagetable_setup_done(pgd_t *base) ++{ ++ xen_setup_shared_info(); ++} ++ ++static void xen_write_cr2(unsigned long cr2) ++{ ++ percpu_read(xen_vcpu)->arch.cr2 = cr2; ++} ++ ++static unsigned long xen_read_cr2(void) ++{ ++ return percpu_read(xen_vcpu)->arch.cr2; ++} ++ ++unsigned long xen_read_cr2_direct(void) ++{ ++ return percpu_read(xen_vcpu_info.arch.cr2); ++} ++ ++static void xen_flush_tlb(void) ++{ ++ struct mmuext_op *op; ++ struct multicall_space mcs; ++ ++ preempt_disable(); ++ ++ mcs = xen_mc_entry(sizeof(*op)); ++ ++ op = mcs.args; ++ op->cmd = MMUEXT_TLB_FLUSH_LOCAL; ++ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); ++ ++ xen_mc_issue(PARAVIRT_LAZY_MMU); ++ ++ preempt_enable(); ++} ++ ++static void xen_flush_tlb_single(unsigned long addr) ++{ ++ struct mmuext_op *op; ++ struct multicall_space mcs; ++ ++ preempt_disable(); ++ ++ mcs = xen_mc_entry(sizeof(*op)); ++ op = mcs.args; ++ op->cmd = MMUEXT_INVLPG_LOCAL; ++ op->arg1.linear_addr = addr & PAGE_MASK; ++ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); ++ ++ xen_mc_issue(PARAVIRT_LAZY_MMU); ++ ++ preempt_enable(); ++} ++ ++static void xen_flush_tlb_others(const struct cpumask *cpus, ++ struct mm_struct *mm, unsigned long va) ++{ ++ struct { ++ struct mmuext_op op; ++ DECLARE_BITMAP(mask, NR_CPUS); ++ } *args; ++ struct multicall_space mcs; ++ ++ BUG_ON(cpumask_empty(cpus)); ++ BUG_ON(!mm); ++ ++ mcs = xen_mc_entry(sizeof(*args)); ++ args = mcs.args; ++ args->op.arg2.vcpumask = to_cpumask(args->mask); ++ ++ /* Remove us, and any offline CPUS. */ ++ cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); ++ cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); ++ ++ if (va == TLB_FLUSH_ALL) { ++ args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; ++ } else { ++ args->op.cmd = MMUEXT_INVLPG_MULTI; ++ args->op.arg1.linear_addr = va; ++ } ++ ++ MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); ++ ++ xen_mc_issue(PARAVIRT_LAZY_MMU); ++} ++ ++static unsigned long xen_read_cr3(void) ++{ ++ return percpu_read(xen_cr3); ++} ++ ++static void set_current_cr3(void *v) ++{ ++ percpu_write(xen_current_cr3, (unsigned long)v); ++} ++ ++static void __xen_write_cr3(bool kernel, unsigned long cr3) ++{ ++ struct mmuext_op *op; ++ struct multicall_space mcs; ++ unsigned long mfn; ++ ++ if (cr3) ++ mfn = pfn_to_mfn(PFN_DOWN(cr3)); ++ else ++ mfn = 0; ++ ++ WARN_ON(mfn == 0 && kernel); ++ ++ mcs = __xen_mc_entry(sizeof(*op)); ++ ++ op = mcs.args; ++ op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; ++ op->arg1.mfn = mfn; ++ ++ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); ++ ++ if (kernel) { ++ percpu_write(xen_cr3, cr3); ++ ++ /* Update xen_current_cr3 once the batch has actually ++ been submitted. 
*/ ++ xen_mc_callback(set_current_cr3, (void *)cr3); ++ } ++} ++ ++static void xen_write_cr3(unsigned long cr3) ++{ ++ BUG_ON(preemptible()); ++ ++ xen_mc_batch(); /* disables interrupts */ ++ ++ /* Update while interrupts are disabled, so its atomic with ++ respect to ipis */ ++ percpu_write(xen_cr3, cr3); ++ ++ __xen_write_cr3(true, cr3); ++ ++#ifdef CONFIG_X86_64 ++ { ++ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); ++ if (user_pgd) ++ __xen_write_cr3(false, __pa(user_pgd)); ++ else ++ __xen_write_cr3(false, 0); ++ } ++#endif ++ ++ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ ++} ++ ++static int xen_pgd_alloc(struct mm_struct *mm) ++{ ++ pgd_t *pgd = mm->pgd; ++ int ret = 0; ++ ++ BUG_ON(PagePinned(virt_to_page(pgd))); ++ ++#ifdef CONFIG_X86_64 ++ { ++ struct page *page = virt_to_page(pgd); ++ pgd_t *user_pgd; ++ ++ BUG_ON(page->private != 0); ++ ++ ret = -ENOMEM; ++ ++ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); ++ page->private = (unsigned long)user_pgd; ++ ++ if (user_pgd != NULL) { ++ user_pgd[pgd_index(VSYSCALL_START)] = ++ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); ++ ret = 0; ++ } ++ ++ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); ++ } ++#endif ++ ++ return ret; ++} ++ ++static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) ++{ ++#ifdef CONFIG_X86_64 ++ pgd_t *user_pgd = xen_get_user_pgd(pgd); ++ ++ if (user_pgd) ++ free_page((unsigned long)user_pgd); ++#endif ++} ++ ++#ifdef CONFIG_HIGHPTE ++static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) ++{ ++ pgprot_t prot = PAGE_KERNEL; ++ ++ if (PagePinned(page)) ++ prot = PAGE_KERNEL_RO; ++ ++ if (0 && PageHighMem(page)) ++ printk("mapping highpte %lx type %d prot %s\n", ++ page_to_pfn(page), type, ++ (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); ++ ++ return kmap_atomic_prot(page, type, prot); ++} ++#endif ++ ++#ifdef CONFIG_X86_32 ++static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) ++{ ++ /* If there's an existing pte, then don't allow _PAGE_RW to be set */ ++ if (pte_val_ma(*ptep) & _PAGE_PRESENT) ++ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & ++ pte_val_ma(pte)); ++ ++ return pte; ++} ++ ++/* Init-time set_pte while constructing initial pagetables, which ++ doesn't allow RO pagetable pages to be remapped RW */ ++static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) ++{ ++ pte = mask_rw_pte(ptep, pte); ++ ++ xen_set_pte(ptep, pte); ++} ++#endif ++ ++/* Early in boot, while setting up the initial pagetable, assume ++ everything is pinned. */ ++static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) ++{ ++#ifdef CONFIG_FLATMEM ++ BUG_ON(mem_map); /* should only be used early */ ++#endif ++ make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); ++} ++ ++/* Early release_pte assumes that all pts are pinned, since there's ++ only init_mm and anything attached to that is pinned. */ ++static void xen_release_pte_init(unsigned long pfn) ++{ ++ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); ++} ++ ++static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) ++{ ++ struct mmuext_op op; ++ op.cmd = cmd; ++ op.arg1.mfn = pfn_to_mfn(pfn); ++ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) ++ BUG(); ++} ++ ++/* This needs to make sure the new pte page is pinned iff its being ++ attached to a pinned pagetable. 
*/ ++static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) ++{ ++ struct page *page = pfn_to_page(pfn); ++ ++ if (PagePinned(virt_to_page(mm->pgd))) { ++ SetPagePinned(page); ++ ++ vm_unmap_aliases(); ++ if (!PageHighMem(page)) { ++ make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); ++ if (level == PT_PTE && USE_SPLIT_PTLOCKS) ++ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); ++ } else { ++ /* make sure there are no stray mappings of ++ this page */ ++ kmap_flush_unused(); ++ } ++ } ++} ++ ++static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) ++{ ++ xen_alloc_ptpage(mm, pfn, PT_PTE); ++} ++ ++static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) ++{ ++ xen_alloc_ptpage(mm, pfn, PT_PMD); ++} ++ ++/* This should never happen until we're OK to use struct page */ ++static void xen_release_ptpage(unsigned long pfn, unsigned level) ++{ ++ struct page *page = pfn_to_page(pfn); ++ ++ if (PagePinned(page)) { ++ if (!PageHighMem(page)) { ++ if (level == PT_PTE && USE_SPLIT_PTLOCKS) ++ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); ++ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); ++ } ++ ClearPagePinned(page); ++ } ++} ++ ++static void xen_release_pte(unsigned long pfn) ++{ ++ xen_release_ptpage(pfn, PT_PTE); ++} ++ ++static void xen_release_pmd(unsigned long pfn) ++{ ++ xen_release_ptpage(pfn, PT_PMD); ++} ++ ++#if PAGETABLE_LEVELS == 4 ++static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) ++{ ++ xen_alloc_ptpage(mm, pfn, PT_PUD); ++} ++ ++static void xen_release_pud(unsigned long pfn) ++{ ++ xen_release_ptpage(pfn, PT_PUD); ++} ++#endif ++ ++void __init xen_reserve_top(void) ++{ ++#ifdef CONFIG_X86_32 ++ unsigned long top = HYPERVISOR_VIRT_START; ++ struct xen_platform_parameters pp; ++ ++ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) ++ top = pp.virt_start; ++ ++ reserve_top_address(-top); ++#endif /* CONFIG_X86_32 */ ++} ++ ++/* ++ * Like __va(), but returns address in the kernel mapping (which is ++ * all we have until the physical memory mapping has been set up. 
++ */ ++static void *__ka(phys_addr_t paddr) ++{ ++#ifdef CONFIG_X86_64 ++ return (void *)(paddr + __START_KERNEL_map); ++#else ++ return __va(paddr); ++#endif ++} ++ ++/* Convert a machine address to physical address */ ++static unsigned long m2p(phys_addr_t maddr) ++{ ++ phys_addr_t paddr; ++ ++ maddr &= PTE_PFN_MASK; ++ paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; ++ ++ return paddr; ++} ++ ++/* Convert a machine address to kernel virtual */ ++static void *m2v(phys_addr_t maddr) ++{ ++ return __ka(m2p(maddr)); ++} ++ ++static void set_page_prot(void *addr, pgprot_t prot) ++{ ++ unsigned long pfn = __pa(addr) >> PAGE_SHIFT; ++ pte_t pte = pfn_pte(pfn, prot); ++ ++ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) ++ BUG(); ++} ++ ++static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) ++{ ++ unsigned pmdidx, pteidx; ++ unsigned ident_pte; ++ unsigned long pfn; ++ ++ ident_pte = 0; ++ pfn = 0; ++ for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { ++ pte_t *pte_page; ++ ++ /* Reuse or allocate a page of ptes */ ++ if (pmd_present(pmd[pmdidx])) ++ pte_page = m2v(pmd[pmdidx].pmd); ++ else { ++ /* Check for free pte pages */ ++ if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) ++ break; ++ ++ pte_page = &level1_ident_pgt[ident_pte]; ++ ident_pte += PTRS_PER_PTE; ++ ++ pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); ++ } ++ ++ /* Install mappings */ ++ for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { ++ pte_t pte; ++ ++ if (pfn > max_pfn_mapped) ++ max_pfn_mapped = pfn; ++ ++ if (!pte_none(pte_page[pteidx])) ++ continue; ++ ++ pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); ++ pte_page[pteidx] = pte; ++ } ++ } ++ ++ for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) ++ set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); ++ ++ set_page_prot(pmd, PAGE_KERNEL_RO); ++} ++ ++#ifdef CONFIG_X86_64 ++static void convert_pfn_mfn(void *v) ++{ ++ pte_t *pte = v; ++ int i; ++ ++ /* All levels are converted the same way, so just treat them ++ as ptes. */ ++ for (i = 0; i < PTRS_PER_PTE; i++) ++ pte[i] = xen_make_pte(pte[i].pte); ++} ++ ++/* ++ * Set up the inital kernel pagetable. ++ * ++ * We can construct this by grafting the Xen provided pagetable into ++ * head_64.S's preconstructed pagetables. We copy the Xen L2's into ++ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This ++ * means that only the kernel has a physical mapping to start with - ++ * but that's enough to get __va working. We need to fill in the rest ++ * of the physical mapping once some sort of allocator has been set ++ * up. 
++ */ ++__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, ++ unsigned long max_pfn) ++{ ++ pud_t *l3; ++ pmd_t *l2; ++ ++ /* Zap identity mapping */ ++ init_level4_pgt[0] = __pgd(0); ++ ++ /* Pre-constructed entries are in pfn, so convert to mfn */ ++ convert_pfn_mfn(init_level4_pgt); ++ convert_pfn_mfn(level3_ident_pgt); ++ convert_pfn_mfn(level3_kernel_pgt); ++ ++ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); ++ l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); ++ ++ memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); ++ memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); ++ ++ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); ++ l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); ++ memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); ++ ++ /* Set up identity map */ ++ xen_map_identity_early(level2_ident_pgt, max_pfn); ++ ++ /* Make pagetable pieces RO */ ++ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); ++ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); ++ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); ++ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); ++ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); ++ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); ++ ++ /* Pin down new L4 */ ++ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, ++ PFN_DOWN(__pa_symbol(init_level4_pgt))); ++ ++ /* Unpin Xen-provided one */ ++ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); ++ ++ /* Switch over */ ++ pgd = init_level4_pgt; ++ ++ /* ++ * At this stage there can be no user pgd, and no page ++ * structure to attach it to, so make sure we just set kernel ++ * pgd. ++ */ ++ xen_mc_batch(); ++ __xen_write_cr3(true, __pa(pgd)); ++ xen_mc_issue(PARAVIRT_LAZY_CPU); ++ ++ reserve_early(__pa(xen_start_info->pt_base), ++ __pa(xen_start_info->pt_base + ++ xen_start_info->nr_pt_frames * PAGE_SIZE), ++ "XEN PAGETABLES"); ++ ++ return pgd; ++} ++#else /* !CONFIG_X86_64 */ ++static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; ++ ++__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, ++ unsigned long max_pfn) ++{ ++ pmd_t *kernel_pmd; ++ ++ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + ++ xen_start_info->nr_pt_frames * PAGE_SIZE + ++ 512*1024); ++ ++ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); ++ memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); ++ ++ xen_map_identity_early(level2_kernel_pgt, max_pfn); ++ ++ memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); ++ set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], ++ __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); ++ ++ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); ++ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); ++ set_page_prot(empty_zero_page, PAGE_KERNEL_RO); ++ ++ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); ++ ++ xen_write_cr3(__pa(swapper_pg_dir)); ++ ++ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); ++ ++ return swapper_pg_dir; ++} ++#endif /* CONFIG_X86_64 */ ++ ++static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) ++{ ++ pte_t pte; ++ ++ phys >>= PAGE_SHIFT; ++ ++ switch (idx) { ++ case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: ++#ifdef CONFIG_X86_F00F_BUG ++ case FIX_F00F_IDT: ++#endif ++#ifdef CONFIG_X86_32 ++ case FIX_WP_TEST: ++ case FIX_VDSO: ++# ifdef CONFIG_HIGHMEM ++ case FIX_KMAP_BEGIN ... FIX_KMAP_END: ++# endif ++#else ++ case VSYSCALL_LAST_PAGE ... 
VSYSCALL_FIRST_PAGE: ++#endif ++#ifdef CONFIG_X86_LOCAL_APIC ++ case FIX_APIC_BASE: /* maps dummy local APIC */ ++#endif ++ pte = pfn_pte(phys, prot); ++ break; ++ ++ default: ++ pte = mfn_pte(phys, prot); ++ break; ++ } ++ ++ __native_set_fixmap(idx, pte); ++ ++#ifdef CONFIG_X86_64 ++ /* Replicate changes to map the vsyscall page into the user ++ pagetable vsyscall mapping. */ ++ if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { ++ unsigned long vaddr = __fix_to_virt(idx); ++ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); ++ } ++#endif ++} ++ ++__init void xen_post_allocator_init(void) ++{ ++ pv_mmu_ops.set_pte = xen_set_pte; ++ pv_mmu_ops.set_pmd = xen_set_pmd; ++ pv_mmu_ops.set_pud = xen_set_pud; ++#if PAGETABLE_LEVELS == 4 ++ pv_mmu_ops.set_pgd = xen_set_pgd; ++#endif ++ ++ /* This will work as long as patching hasn't happened yet ++ (which it hasn't) */ ++ pv_mmu_ops.alloc_pte = xen_alloc_pte; ++ pv_mmu_ops.alloc_pmd = xen_alloc_pmd; ++ pv_mmu_ops.release_pte = xen_release_pte; ++ pv_mmu_ops.release_pmd = xen_release_pmd; ++#if PAGETABLE_LEVELS == 4 ++ pv_mmu_ops.alloc_pud = xen_alloc_pud; ++ pv_mmu_ops.release_pud = xen_release_pud; ++#endif ++ ++#ifdef CONFIG_X86_64 ++ SetPagePinned(virt_to_page(level3_user_vsyscall)); ++#endif ++ xen_mark_init_mm_pinned(); ++} ++ ++ ++const struct pv_mmu_ops xen_mmu_ops __initdata = { ++ .pagetable_setup_start = xen_pagetable_setup_start, ++ .pagetable_setup_done = xen_pagetable_setup_done, ++ ++ .read_cr2 = xen_read_cr2, ++ .write_cr2 = xen_write_cr2, ++ ++ .read_cr3 = xen_read_cr3, ++ .write_cr3 = xen_write_cr3, ++ ++ .flush_tlb_user = xen_flush_tlb, ++ .flush_tlb_kernel = xen_flush_tlb, ++ .flush_tlb_single = xen_flush_tlb_single, ++ .flush_tlb_others = xen_flush_tlb_others, ++ ++ .pte_update = paravirt_nop, ++ .pte_update_defer = paravirt_nop, ++ ++ .pgd_alloc = xen_pgd_alloc, ++ .pgd_free = xen_pgd_free, ++ ++ .alloc_pte = xen_alloc_pte_init, ++ .release_pte = xen_release_pte_init, ++ .alloc_pmd = xen_alloc_pte_init, ++ .alloc_pmd_clone = paravirt_nop, ++ .release_pmd = xen_release_pte_init, ++ ++#ifdef CONFIG_HIGHPTE ++ .kmap_atomic_pte = xen_kmap_atomic_pte, ++#endif ++ ++#ifdef CONFIG_X86_64 ++ .set_pte = xen_set_pte, ++#else ++ .set_pte = xen_set_pte_init, ++#endif ++ .set_pte_at = xen_set_pte_at, ++ .set_pmd = xen_set_pmd_hyper, ++ ++ .ptep_modify_prot_start = __ptep_modify_prot_start, ++ .ptep_modify_prot_commit = __ptep_modify_prot_commit, ++ ++ .pte_val = PV_CALLEE_SAVE(xen_pte_val), ++ .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), ++ ++ .make_pte = PV_CALLEE_SAVE(xen_make_pte), ++ .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), ++ ++#ifdef CONFIG_X86_PAE ++ .set_pte_atomic = xen_set_pte_atomic, ++ .pte_clear = xen_pte_clear, ++ .pmd_clear = xen_pmd_clear, ++#endif /* CONFIG_X86_PAE */ ++ .set_pud = xen_set_pud_hyper, ++ ++ .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), ++ .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), ++ ++#if PAGETABLE_LEVELS == 4 ++ .pud_val = PV_CALLEE_SAVE(xen_pud_val), ++ .make_pud = PV_CALLEE_SAVE(xen_make_pud), ++ .set_pgd = xen_set_pgd_hyper, ++ ++ .alloc_pud = xen_alloc_pte_init, ++ .release_pud = xen_release_pte_init, ++#endif /* PAGETABLE_LEVELS == 4 */ ++ ++ .activate_mm = xen_activate_mm, ++ .dup_mmap = xen_dup_mmap, ++ .exit_mmap = xen_exit_mmap, ++ ++ .lazy_mode = { ++ .enter = paravirt_enter_lazy_mmu, ++ .leave = xen_leave_lazy, ++ }, ++ ++ .set_fixmap = xen_set_fixmap, ++}; ++ ++ + #ifdef CONFIG_XEN_DEBUG_FS + + static struct dentry *d_mmu_debug; +Index: linux-2.6-tip/arch/x86/xen/mmu.h 
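The Xen mmu.c changes above all funnel page-table updates through the multicall batch: operations are queued with xen_mc_entry(), submitted in a single hypercall by xen_mc_issue(), and bookkeeping such as the xen_current_cr3 update is deferred via xen_mc_callback() so that it only runs once the batch has actually been handed to the hypervisor. The standalone C sketch below models that queue/flush/callback ordering with invented names (mc_queue, mc_callback, mc_flush); it is an illustration of the pattern only, not the kernel's multicall implementation.

#include <stdio.h>

#define MC_BATCH 8

struct mc_entry { int op; unsigned long arg; };          /* queued "hypercall" */
struct mc_cb    { void (*fn)(void *data); void *data; }; /* deferred side effect */

static struct mc_entry entries[MC_BATCH];
static struct mc_cb callbacks[MC_BATCH];
static int mcidx, cbidx;

static void mc_flush(void);

/* Queue one operation; flush first if the batch is already full. */
static void mc_queue(int op, unsigned long arg)
{
	if (mcidx == MC_BATCH)
		mc_flush();
	entries[mcidx].op = op;
	entries[mcidx].arg = arg;
	mcidx++;
}

/* Register work to run only after the current batch has been submitted. */
static void mc_callback(void (*fn)(void *), void *data)
{
	if (cbidx == MC_BATCH)
		mc_flush();
	callbacks[cbidx].fn = fn;
	callbacks[cbidx].data = data;
	cbidx++;
}

/* Submit the whole batch, then run the callbacks strictly afterwards,
 * mirroring how set_current_cr3() is only called once __xen_write_cr3()'s
 * MMUEXT_NEW_BASEPTR operation has really been issued. */
static void mc_flush(void)
{
	int i;

	for (i = 0; i < mcidx; i++)
		printf("submit op=%d arg=%lx\n", entries[i].op, entries[i].arg);
	mcidx = 0;

	for (i = 0; i < cbidx; i++)
		callbacks[i].fn(callbacks[i].data);
	cbidx = 0;
}

static void update_current_cr3(void *v)
{
	printf("deferred cr3 bookkeeping: %lx\n", (unsigned long)v);
}

int main(void)
{
	mc_queue(1, 0x1000);                             /* e.g. a new base pointer */
	mc_callback(update_current_cr3, (void *)0x1000); /* runs only after submission */
	mc_flush();                                      /* one submission, then callback */
	return 0;
}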
+=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/mmu.h ++++ linux-2.6-tip/arch/x86/xen/mmu.h +@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct + void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + ++unsigned long xen_read_cr2_direct(void); ++ ++extern const struct pv_mmu_ops xen_mmu_ops; + #endif /* _XEN_MMU_H */ +Index: linux-2.6-tip/arch/x86/xen/multicalls.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/multicalls.c ++++ linux-2.6-tip/arch/x86/xen/multicalls.c +@@ -39,6 +39,7 @@ struct mc_buffer { + struct multicall_entry entries[MC_BATCH]; + #if MC_DEBUG + struct multicall_entry debug[MC_BATCH]; ++ void *caller[MC_BATCH]; + #endif + unsigned char args[MC_ARGS]; + struct callback { +@@ -154,11 +155,12 @@ void xen_mc_flush(void) + ret, smp_processor_id()); + dump_stack(); + for (i = 0; i < b->mcidx; i++) { +- printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n", ++ printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n", + i+1, b->mcidx, + b->debug[i].op, + b->debug[i].args[0], +- b->entries[i].result); ++ b->entries[i].result, ++ b->caller[i]); + } + } + #endif +@@ -168,8 +170,6 @@ void xen_mc_flush(void) + } else + BUG_ON(b->argidx != 0); + +- local_irq_restore(flags); +- + for (i = 0; i < b->cbidx; i++) { + struct callback *cb = &b->callbacks[i]; + +@@ -177,7 +177,9 @@ void xen_mc_flush(void) + } + b->cbidx = 0; + +- BUG_ON(ret); ++ local_irq_restore(flags); ++ ++ WARN_ON(ret); + } + + struct multicall_space __xen_mc_entry(size_t args) +@@ -197,6 +199,9 @@ struct multicall_space __xen_mc_entry(si + } + + ret.mc = &b->entries[b->mcidx]; ++#ifdef MC_DEBUG ++ b->caller[b->mcidx] = __builtin_return_address(0); ++#endif + b->mcidx++; + ret.args = &b->args[argidx]; + b->argidx = argidx + args; +Index: linux-2.6-tip/arch/x86/xen/multicalls.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/multicalls.h ++++ linux-2.6-tip/arch/x86/xen/multicalls.h +@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned + xen_mc_flush(); + + /* restore flags saved in xen_mc_batch */ +- local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); ++ local_irq_restore(percpu_read(xen_mc_irq_flags)); + } + + /* Set up a callback to be called when the current batch is flushed */ +Index: linux-2.6-tip/arch/x86/xen/smp.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/smp.c ++++ linux-2.6-tip/arch/x86/xen/smp.c +@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_sin + */ + static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) + { +-#ifdef CONFIG_X86_32 +- __get_cpu_var(irq_stat).irq_resched_count++; +-#else +- add_pda(irq_resched_count, 1); +-#endif ++ inc_irq_stat(irq_resched_count); + + return IRQ_HANDLED; + } +@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void) + xen_setup_cpu_clockevents(); + + cpu_set(cpu, cpu_online_map); +- x86_write_percpu(cpu_state, CPU_ONLINE); ++ percpu_write(cpu_state, CPU_ONLINE); + wmb(); + + /* We can take interrupts now: we're officially "up". 
*/ +@@ -162,7 +158,7 @@ static void __init xen_fill_possible_map + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; +- cpu_set(i, cpu_possible_map); ++ set_cpu_possible(i, true); + } + } + } +@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_ + + /* We've switched to the "real" per-cpu gdt, so make sure the + old memory can be recycled */ +- make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); ++ make_lowmem_page_readwrite(xen_initial_gdt); + + xen_setup_vcpu_info_placement(); + } +@@ -201,7 +197,7 @@ static void __init xen_smp_prepare_cpus( + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { + for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) + continue; +- cpu_clear(cpu, cpu_possible_map); ++ set_cpu_possible(cpu, false); + } + + for_each_possible_cpu (cpu) { +@@ -214,7 +210,7 @@ static void __init xen_smp_prepare_cpus( + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + +- cpu_set(cpu, cpu_present_map); ++ set_cpu_present(cpu, true); + } + } + +@@ -223,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, + { + struct vcpu_guest_context *ctxt; + struct desc_struct *gdt; ++ unsigned long gdt_mfn; + + if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) + return 0; +@@ -239,6 +236,8 @@ cpu_initialize_context(unsigned int cpu, + ctxt->user_regs.ss = __KERNEL_DS; + #ifdef CONFIG_X86_32 + ctxt->user_regs.fs = __KERNEL_PERCPU; ++#else ++ ctxt->gs_base_kernel = per_cpu_offset(cpu); + #endif + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ +@@ -250,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, + ctxt->ldt_ents = 0; + + BUG_ON((unsigned long)gdt & ~PAGE_MASK); ++ ++ gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); ++ make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + +- ctxt->gdt_frames[0] = virt_to_mfn(gdt); ++ ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; + + ctxt->user_regs.cs = __KERNEL_CS; +@@ -283,23 +285,14 @@ static int __cpuinit xen_cpu_up(unsigned + struct task_struct *idle = idle_task(cpu); + int rc; + +-#ifdef CONFIG_X86_64 +- /* Allocate node local memory for AP pdas */ +- WARN_ON(cpu == 0); +- if (cpu > 0) { +- rc = get_local_pda(cpu); +- if (rc) +- return rc; +- } +-#endif +- +-#ifdef CONFIG_X86_32 +- init_gdt(cpu); + per_cpu(current_task, cpu) = idle; ++#ifdef CONFIG_X86_32 + irq_ctx_init(cpu); + #else +- cpu_pda(cpu)->pcurrent = idle; + clear_tsk_thread_flag(idle, TIF_FORK); ++ per_cpu(kernel_stack, cpu) = ++ (unsigned long)task_stack_page(idle) - ++ KERNEL_STACK_OFFSET + THREAD_SIZE; + #endif + xen_setup_timer(cpu); + xen_init_lock_cpu(cpu); +@@ -445,11 +438,7 @@ static irqreturn_t xen_call_function_int + { + irq_enter(); + generic_smp_call_function_interrupt(); +-#ifdef CONFIG_X86_32 +- __get_cpu_var(irq_stat).irq_call_count++; +-#else +- add_pda(irq_call_count, 1); +-#endif ++ inc_irq_stat(irq_call_count); + irq_exit(); + + return IRQ_HANDLED; +@@ -459,11 +448,7 @@ static irqreturn_t xen_call_function_sin + { + irq_enter(); + generic_smp_call_function_single_interrupt(); +-#ifdef CONFIG_X86_32 +- __get_cpu_var(irq_stat).irq_call_count++; +-#else +- add_pda(irq_call_count, 1); +-#endif ++ inc_irq_stat(irq_call_count); + irq_exit(); + + return IRQ_HANDLED; +Index: linux-2.6-tip/arch/x86/xen/suspend.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/suspend.c ++++ linux-2.6-tip/arch/x86/xen/suspend.c +@@ -6,6 +6,7 @@ + + 
#include + #include ++#include + + #include "xen-ops.h" + #include "mmu.h" +Index: linux-2.6-tip/arch/x86/xen/xen-asm.S +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/xen/xen-asm.S +@@ -0,0 +1,142 @@ ++/* ++ * Asm versions of Xen pv-ops, suitable for either direct use or ++ * inlining. The inline versions are the same as the direct-use ++ * versions, with the pre- and post-amble chopped off. ++ * ++ * This code is encoded for size rather than absolute efficiency, with ++ * a view to being able to inline as much as possible. ++ * ++ * We only bother with direct forms (ie, vcpu in percpu data) of the ++ * operations here; the indirect forms are better handled in C, since ++ * they're generally too large to inline anyway. ++ */ ++ ++#include ++#include ++#include ++ ++#include "xen-asm.h" ++ ++/* ++ * Enable events. This clears the event mask and tests the pending ++ * event status with one and operation. If there are pending events, ++ * then enter the hypervisor to get them handled. ++ */ ++ENTRY(xen_irq_enable_direct) ++ /* Unmask events */ ++ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask ++ ++ /* ++ * Preempt here doesn't matter because that will deal with any ++ * pending interrupts. The pending check may end up being run ++ * on the wrong CPU, but that doesn't hurt. ++ */ ++ ++ /* Test for pending */ ++ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending ++ jz 1f ++ ++2: call check_events ++1: ++ENDPATCH(xen_irq_enable_direct) ++ ret ++ ENDPROC(xen_irq_enable_direct) ++ RELOC(xen_irq_enable_direct, 2b+1) ++ ++ ++/* ++ * Disabling events is simply a matter of making the event mask ++ * non-zero. ++ */ ++ENTRY(xen_irq_disable_direct) ++ movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask ++ENDPATCH(xen_irq_disable_direct) ++ ret ++ ENDPROC(xen_irq_disable_direct) ++ RELOC(xen_irq_disable_direct, 0) ++ ++/* ++ * (xen_)save_fl is used to get the current interrupt enable status. ++ * Callers expect the status to be in X86_EFLAGS_IF, and other bits ++ * may be set in the return value. We take advantage of this by ++ * making sure that X86_EFLAGS_IF has the right value (and other bits ++ * in that byte are 0), but other bits in the return value are ++ * undefined. We need to toggle the state of the bit, because Xen and ++ * x86 use opposite senses (mask vs enable). ++ */ ++ENTRY(xen_save_fl_direct) ++ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask ++ setz %ah ++ addb %ah, %ah ++ENDPATCH(xen_save_fl_direct) ++ ret ++ ENDPROC(xen_save_fl_direct) ++ RELOC(xen_save_fl_direct, 0) ++ ++ ++/* ++ * In principle the caller should be passing us a value return from ++ * xen_save_fl_direct, but for robustness sake we test only the ++ * X86_EFLAGS_IF flag rather than the whole byte. After setting the ++ * interrupt mask state, it checks for unmasked pending events and ++ * enters the hypervisor to get them delivered if so. ++ */ ++ENTRY(xen_restore_fl_direct) ++#ifdef CONFIG_X86_64 ++ testw $X86_EFLAGS_IF, %di ++#else ++ testb $X86_EFLAGS_IF>>8, %ah ++#endif ++ setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask ++ /* ++ * Preempt here doesn't matter because that will deal with any ++ * pending interrupts. The pending check may end up being run ++ * on the wrong CPU, but that doesn't hurt. 
++ */ ++ ++ /* check for unmasked and pending */ ++ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending ++ jz 1f ++2: call check_events ++1: ++ENDPATCH(xen_restore_fl_direct) ++ ret ++ ENDPROC(xen_restore_fl_direct) ++ RELOC(xen_restore_fl_direct, 2b+1) ++ ++ ++/* ++ * Force an event check by making a hypercall, but preserve regs ++ * before making the call. ++ */ ++check_events: ++#ifdef CONFIG_X86_32 ++ push %eax ++ push %ecx ++ push %edx ++ call xen_force_evtchn_callback ++ pop %edx ++ pop %ecx ++ pop %eax ++#else ++ push %rax ++ push %rcx ++ push %rdx ++ push %rsi ++ push %rdi ++ push %r8 ++ push %r9 ++ push %r10 ++ push %r11 ++ call xen_force_evtchn_callback ++ pop %r11 ++ pop %r10 ++ pop %r9 ++ pop %r8 ++ pop %rdi ++ pop %rsi ++ pop %rdx ++ pop %rcx ++ pop %rax ++#endif ++ ret +Index: linux-2.6-tip/arch/x86/xen/xen-asm.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/x86/xen/xen-asm.h +@@ -0,0 +1,12 @@ ++#ifndef _XEN_XEN_ASM_H ++#define _XEN_XEN_ASM_H ++ ++#include ++ ++#define RELOC(x, v) .globl x##_reloc; x##_reloc=v ++#define ENDPATCH(x) .globl x##_end; x##_end=. ++ ++/* Pseudo-flag used for virtual NMI, which we don't implement yet */ ++#define XEN_EFLAGS_NMI 0x80000000 ++ ++#endif +Index: linux-2.6-tip/arch/x86/xen/xen-asm_32.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/xen-asm_32.S ++++ linux-2.6-tip/arch/x86/xen/xen-asm_32.S +@@ -1,117 +1,43 @@ + /* +- Asm versions of Xen pv-ops, suitable for either direct use or inlining. +- The inline versions are the same as the direct-use versions, with the +- pre- and post-amble chopped off. +- +- This code is encoded for size rather than absolute efficiency, +- with a view to being able to inline as much as possible. +- +- We only bother with direct forms (ie, vcpu in pda) of the operations +- here; the indirect forms are better handled in C, since they're +- generally too large to inline anyway. ++ * Asm versions of Xen pv-ops, suitable for either direct use or ++ * inlining. The inline versions are the same as the direct-use ++ * versions, with the pre- and post-amble chopped off. ++ * ++ * This code is encoded for size rather than absolute efficiency, with ++ * a view to being able to inline as much as possible. ++ * ++ * We only bother with direct forms (ie, vcpu in pda) of the ++ * operations here; the indirect forms are better handled in C, since ++ * they're generally too large to inline anyway. + */ + +-#include +- +-#include + #include +-#include + #include + #include + + #include + +-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +-#define ENDPATCH(x) .globl x##_end; x##_end=. +- +-/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +-#define XEN_EFLAGS_NMI 0x80000000 +- +-/* +- Enable events. This clears the event mask and tests the pending +- event status with one and operation. If there are pending +- events, then enter the hypervisor to get them handled. +- */ +-ENTRY(xen_irq_enable_direct) +- /* Unmask events */ +- movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +- +- /* Preempt here doesn't matter because that will deal with +- any pending interrupts. The pending check may end up being +- run on the wrong CPU, but that doesn't hurt. 
*/ +- +- /* Test for pending */ +- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending +- jz 1f +- +-2: call check_events +-1: +-ENDPATCH(xen_irq_enable_direct) +- ret +- ENDPROC(xen_irq_enable_direct) +- RELOC(xen_irq_enable_direct, 2b+1) +- ++#include "xen-asm.h" + + /* +- Disabling events is simply a matter of making the event mask +- non-zero. ++ * Force an event check by making a hypercall, but preserve regs ++ * before making the call. + */ +-ENTRY(xen_irq_disable_direct) +- movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +-ENDPATCH(xen_irq_disable_direct) +- ret +- ENDPROC(xen_irq_disable_direct) +- RELOC(xen_irq_disable_direct, 0) +- +-/* +- (xen_)save_fl is used to get the current interrupt enable status. +- Callers expect the status to be in X86_EFLAGS_IF, and other bits +- may be set in the return value. We take advantage of this by +- making sure that X86_EFLAGS_IF has the right value (and other bits +- in that byte are 0), but other bits in the return value are +- undefined. We need to toggle the state of the bit, because +- Xen and x86 use opposite senses (mask vs enable). +- */ +-ENTRY(xen_save_fl_direct) +- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +- setz %ah +- addb %ah,%ah +-ENDPATCH(xen_save_fl_direct) +- ret +- ENDPROC(xen_save_fl_direct) +- RELOC(xen_save_fl_direct, 0) +- +- +-/* +- In principle the caller should be passing us a value return +- from xen_save_fl_direct, but for robustness sake we test only +- the X86_EFLAGS_IF flag rather than the whole byte. After +- setting the interrupt mask state, it checks for unmasked +- pending events and enters the hypervisor to get them delivered +- if so. +- */ +-ENTRY(xen_restore_fl_direct) +- testb $X86_EFLAGS_IF>>8, %ah +- setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +- /* Preempt here doesn't matter because that will deal with +- any pending interrupts. The pending check may end up being +- run on the wrong CPU, but that doesn't hurt. */ +- +- /* check for unmasked and pending */ +- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending +- jz 1f +-2: call check_events +-1: +-ENDPATCH(xen_restore_fl_direct) ++check_events: ++ push %eax ++ push %ecx ++ push %edx ++ call xen_force_evtchn_callback ++ pop %edx ++ pop %ecx ++ pop %eax + ret +- ENDPROC(xen_restore_fl_direct) +- RELOC(xen_restore_fl_direct, 2b+1) + + /* +- We can't use sysexit directly, because we're not running in ring0. +- But we can easily fake it up using iret. Assuming xen_sysexit +- is jumped to with a standard stack frame, we can just strip it +- back to a standard iret frame and use iret. ++ * We can't use sysexit directly, because we're not running in ring0. ++ * But we can easily fake it up using iret. Assuming xen_sysexit is ++ * jumped to with a standard stack frame, we can just strip it back to ++ * a standard iret frame and use iret. + */ + ENTRY(xen_sysexit) + movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ +@@ -122,33 +48,31 @@ ENTRY(xen_sysexit) + ENDPROC(xen_sysexit) + + /* +- This is run where a normal iret would be run, with the same stack setup: +- 8: eflags +- 4: cs +- esp-> 0: eip +- +- This attempts to make sure that any pending events are dealt +- with on return to usermode, but there is a small window in +- which an event can happen just before entering usermode. If +- the nested interrupt ends up setting one of the TIF_WORK_MASK +- pending work flags, they will not be tested again before +- returning to usermode. 
This means that a process can end up +- with pending work, which will be unprocessed until the process +- enters and leaves the kernel again, which could be an +- unbounded amount of time. This means that a pending signal or +- reschedule event could be indefinitely delayed. +- +- The fix is to notice a nested interrupt in the critical +- window, and if one occurs, then fold the nested interrupt into +- the current interrupt stack frame, and re-process it +- iteratively rather than recursively. This means that it will +- exit via the normal path, and all pending work will be dealt +- with appropriately. +- +- Because the nested interrupt handler needs to deal with the +- current stack state in whatever form its in, we keep things +- simple by only using a single register which is pushed/popped +- on the stack. ++ * This is run where a normal iret would be run, with the same stack setup: ++ * 8: eflags ++ * 4: cs ++ * esp-> 0: eip ++ * ++ * This attempts to make sure that any pending events are dealt with ++ * on return to usermode, but there is a small window in which an ++ * event can happen just before entering usermode. If the nested ++ * interrupt ends up setting one of the TIF_WORK_MASK pending work ++ * flags, they will not be tested again before returning to ++ * usermode. This means that a process can end up with pending work, ++ * which will be unprocessed until the process enters and leaves the ++ * kernel again, which could be an unbounded amount of time. This ++ * means that a pending signal or reschedule event could be ++ * indefinitely delayed. ++ * ++ * The fix is to notice a nested interrupt in the critical window, and ++ * if one occurs, then fold the nested interrupt into the current ++ * interrupt stack frame, and re-process it iteratively rather than ++ * recursively. This means that it will exit via the normal path, and ++ * all pending work will be dealt with appropriately. ++ * ++ * Because the nested interrupt handler needs to deal with the current ++ * stack state in whatever form its in, we keep things simple by only ++ * using a single register which is pushed/popped on the stack. + */ + ENTRY(xen_iret) + /* test eflags for special cases */ +@@ -158,13 +82,15 @@ ENTRY(xen_iret) + push %eax + ESP_OFFSET=4 # bytes pushed onto stack + +- /* Store vcpu_info pointer for easy access. Do it this +- way to avoid having to reload %fs */ ++ /* ++ * Store vcpu_info pointer for easy access. Do it this way to ++ * avoid having to reload %fs ++ */ + #ifdef CONFIG_SMP + GET_THREAD_INFO(%eax) +- movl TI_cpu(%eax),%eax +- movl __per_cpu_offset(,%eax,4),%eax +- mov per_cpu__xen_vcpu(%eax),%eax ++ movl TI_cpu(%eax), %eax ++ movl __per_cpu_offset(,%eax,4), %eax ++ mov per_cpu__xen_vcpu(%eax), %eax + #else + movl per_cpu__xen_vcpu, %eax + #endif +@@ -172,37 +98,46 @@ ENTRY(xen_iret) + /* check IF state we're restoring */ + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) + +- /* Maybe enable events. Once this happens we could get a +- recursive event, so the critical region starts immediately +- afterwards. However, if that happens we don't end up +- resuming the code, so we don't have to be worried about +- being preempted to another CPU. */ ++ /* ++ * Maybe enable events. Once this happens we could get a ++ * recursive event, so the critical region starts immediately ++ * afterwards. However, if that happens we don't end up ++ * resuming the code, so we don't have to be worried about ++ * being preempted to another CPU. 
++ */ + setz XEN_vcpu_info_mask(%eax) + xen_iret_start_crit: + + /* check for unmasked and pending */ + cmpw $0x0001, XEN_vcpu_info_pending(%eax) + +- /* If there's something pending, mask events again so we +- can jump back into xen_hypervisor_callback */ ++ /* ++ * If there's something pending, mask events again so we can ++ * jump back into xen_hypervisor_callback ++ */ + sete XEN_vcpu_info_mask(%eax) + + popl %eax + +- /* From this point on the registers are restored and the stack +- updated, so we don't need to worry about it if we're preempted */ ++ /* ++ * From this point on the registers are restored and the stack ++ * updated, so we don't need to worry about it if we're ++ * preempted ++ */ + iret_restore_end: + +- /* Jump to hypervisor_callback after fixing up the stack. +- Events are masked, so jumping out of the critical +- region is OK. */ ++ /* ++ * Jump to hypervisor_callback after fixing up the stack. ++ * Events are masked, so jumping out of the critical region is ++ * OK. ++ */ + je xen_hypervisor_callback + + 1: iret + xen_iret_end_crit: +-.section __ex_table,"a" ++.section __ex_table, "a" + .align 4 +- .long 1b,iret_exc ++ .long 1b, iret_exc + .previous + + hyper_iret: +@@ -212,55 +147,55 @@ hyper_iret: + .globl xen_iret_start_crit, xen_iret_end_crit + + /* +- This is called by xen_hypervisor_callback in entry.S when it sees +- that the EIP at the time of interrupt was between xen_iret_start_crit +- and xen_iret_end_crit. We're passed the EIP in %eax so we can do +- a more refined determination of what to do. +- +- The stack format at this point is: +- ---------------- +- ss : (ss/esp may be present if we came from usermode) +- esp : +- eflags } outer exception info +- cs } +- eip } +- ---------------- <- edi (copy dest) +- eax : outer eax if it hasn't been restored +- ---------------- +- eflags } nested exception info +- cs } (no ss/esp because we're nested +- eip } from the same ring) +- orig_eax }<- esi (copy src) +- - - - - - - - - +- fs } +- es } +- ds } SAVE_ALL state +- eax } +- : : +- ebx }<- esp +- ---------------- +- +- In order to deliver the nested exception properly, we need to shift +- everything from the return addr up to the error code so it +- sits just under the outer exception info. This means that when we +- handle the exception, we do it in the context of the outer exception +- rather than starting a new one. +- +- The only caveat is that if the outer eax hasn't been +- restored yet (ie, it's still on stack), we need to insert +- its value into the SAVE_ALL state before going on, since +- it's usermode state which we eventually need to restore. ++ * This is called by xen_hypervisor_callback in entry.S when it sees ++ * that the EIP at the time of interrupt was between ++ * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in ++ * %eax so we can do a more refined determination of what to do. 
++ * ++ * The stack format at this point is: ++ * ---------------- ++ * ss : (ss/esp may be present if we came from usermode) ++ * esp : ++ * eflags } outer exception info ++ * cs } ++ * eip } ++ * ---------------- <- edi (copy dest) ++ * eax : outer eax if it hasn't been restored ++ * ---------------- ++ * eflags } nested exception info ++ * cs } (no ss/esp because we're nested ++ * eip } from the same ring) ++ * orig_eax }<- esi (copy src) ++ * - - - - - - - - ++ * fs } ++ * es } ++ * ds } SAVE_ALL state ++ * eax } ++ * : : ++ * ebx }<- esp ++ * ---------------- ++ * ++ * In order to deliver the nested exception properly, we need to shift ++ * everything from the return addr up to the error code so it sits ++ * just under the outer exception info. This means that when we ++ * handle the exception, we do it in the context of the outer ++ * exception rather than starting a new one. ++ * ++ * The only caveat is that if the outer eax hasn't been restored yet ++ * (ie, it's still on stack), we need to insert its value into the ++ * SAVE_ALL state before going on, since it's usermode state which we ++ * eventually need to restore. + */ + ENTRY(xen_iret_crit_fixup) + /* +- Paranoia: Make sure we're really coming from kernel space. +- One could imagine a case where userspace jumps into the +- critical range address, but just before the CPU delivers a GP, +- it decides to deliver an interrupt instead. Unlikely? +- Definitely. Easy to avoid? Yes. The Intel documents +- explicitly say that the reported EIP for a bad jump is the +- jump instruction itself, not the destination, but some virtual +- environments get this wrong. ++ * Paranoia: Make sure we're really coming from kernel space. ++ * One could imagine a case where userspace jumps into the ++ * critical range address, but just before the CPU delivers a ++ * GP, it decides to deliver an interrupt instead. Unlikely? ++ * Definitely. Easy to avoid? Yes. The Intel documents ++ * explicitly say that the reported EIP for a bad jump is the ++ * jump instruction itself, not the destination, but some ++ * virtual environments get this wrong. + */ + movl PT_CS(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx +@@ -270,15 +205,17 @@ ENTRY(xen_iret_crit_fixup) + lea PT_ORIG_EAX(%esp), %esi + lea PT_EFLAGS(%esp), %edi + +- /* If eip is before iret_restore_end then stack +- hasn't been restored yet. */ ++ /* ++ * If eip is before iret_restore_end then stack ++ * hasn't been restored yet. ++ */ + cmp $iret_restore_end, %eax + jae 1f + +- movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ ++ movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */ + movl %eax, PT_EAX(%esp) + +- lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ ++ lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */ + + /* set up the copy */ + 1: std +@@ -286,20 +223,6 @@ ENTRY(xen_iret_crit_fixup) + rep movsl + cld + +- lea 4(%edi),%esp /* point esp to new frame */ ++ lea 4(%edi), %esp /* point esp to new frame */ + 2: jmp xen_do_upcall + +- +-/* +- Force an event check by making a hypercall, +- but preserve regs before making the call. 
+- */ +-check_events: +- push %eax +- push %ecx +- push %edx +- call xen_force_evtchn_callback +- pop %edx +- pop %ecx +- pop %eax +- ret +Index: linux-2.6-tip/arch/x86/xen/xen-asm_64.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/xen-asm_64.S ++++ linux-2.6-tip/arch/x86/xen/xen-asm_64.S +@@ -1,174 +1,45 @@ + /* +- Asm versions of Xen pv-ops, suitable for either direct use or inlining. +- The inline versions are the same as the direct-use versions, with the +- pre- and post-amble chopped off. +- +- This code is encoded for size rather than absolute efficiency, +- with a view to being able to inline as much as possible. +- +- We only bother with direct forms (ie, vcpu in pda) of the operations +- here; the indirect forms are better handled in C, since they're +- generally too large to inline anyway. ++ * Asm versions of Xen pv-ops, suitable for either direct use or ++ * inlining. The inline versions are the same as the direct-use ++ * versions, with the pre- and post-amble chopped off. ++ * ++ * This code is encoded for size rather than absolute efficiency, with ++ * a view to being able to inline as much as possible. ++ * ++ * We only bother with direct forms (ie, vcpu in pda) of the ++ * operations here; the indirect forms are better handled in C, since ++ * they're generally too large to inline anyway. + */ + +-#include +- +-#include +-#include + #include ++#include ++#include + #include + + #include + +-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +-#define ENDPATCH(x) .globl x##_end; x##_end=. +- +-/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +-#define XEN_EFLAGS_NMI 0x80000000 +- +-#if 1 +-/* +- x86-64 does not yet support direct access to percpu variables +- via a segment override, so we just need to make sure this code +- never gets used +- */ +-#define BUG ud2a +-#define PER_CPU_VAR(var, off) 0xdeadbeef +-#endif +- +-/* +- Enable events. This clears the event mask and tests the pending +- event status with one and operation. If there are pending +- events, then enter the hypervisor to get them handled. +- */ +-ENTRY(xen_irq_enable_direct) +- BUG +- +- /* Unmask events */ +- movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) +- +- /* Preempt here doesn't matter because that will deal with +- any pending interrupts. The pending check may end up being +- run on the wrong CPU, but that doesn't hurt. */ +- +- /* Test for pending */ +- testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) +- jz 1f +- +-2: call check_events +-1: +-ENDPATCH(xen_irq_enable_direct) +- ret +- ENDPROC(xen_irq_enable_direct) +- RELOC(xen_irq_enable_direct, 2b+1) +- +-/* +- Disabling events is simply a matter of making the event mask +- non-zero. +- */ +-ENTRY(xen_irq_disable_direct) +- BUG +- +- movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) +-ENDPATCH(xen_irq_disable_direct) +- ret +- ENDPROC(xen_irq_disable_direct) +- RELOC(xen_irq_disable_direct, 0) +- +-/* +- (xen_)save_fl is used to get the current interrupt enable status. +- Callers expect the status to be in X86_EFLAGS_IF, and other bits +- may be set in the return value. We take advantage of this by +- making sure that X86_EFLAGS_IF has the right value (and other bits +- in that byte are 0), but other bits in the return value are +- undefined. We need to toggle the state of the bit, because +- Xen and x86 use opposite senses (mask vs enable). 
+- */ +-ENTRY(xen_save_fl_direct) +- BUG +- +- testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) +- setz %ah +- addb %ah,%ah +-ENDPATCH(xen_save_fl_direct) +- ret +- ENDPROC(xen_save_fl_direct) +- RELOC(xen_save_fl_direct, 0) +- +-/* +- In principle the caller should be passing us a value return +- from xen_save_fl_direct, but for robustness sake we test only +- the X86_EFLAGS_IF flag rather than the whole byte. After +- setting the interrupt mask state, it checks for unmasked +- pending events and enters the hypervisor to get them delivered +- if so. +- */ +-ENTRY(xen_restore_fl_direct) +- BUG +- +- testb $X86_EFLAGS_IF>>8, %ah +- setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) +- /* Preempt here doesn't matter because that will deal with +- any pending interrupts. The pending check may end up being +- run on the wrong CPU, but that doesn't hurt. */ +- +- /* check for unmasked and pending */ +- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) +- jz 1f +-2: call check_events +-1: +-ENDPATCH(xen_restore_fl_direct) +- ret +- ENDPROC(xen_restore_fl_direct) +- RELOC(xen_restore_fl_direct, 2b+1) +- +- +-/* +- Force an event check by making a hypercall, +- but preserve regs before making the call. +- */ +-check_events: +- push %rax +- push %rcx +- push %rdx +- push %rsi +- push %rdi +- push %r8 +- push %r9 +- push %r10 +- push %r11 +- call xen_force_evtchn_callback +- pop %r11 +- pop %r10 +- pop %r9 +- pop %r8 +- pop %rdi +- pop %rsi +- pop %rdx +- pop %rcx +- pop %rax +- ret ++#include "xen-asm.h" + + ENTRY(xen_adjust_exception_frame) +- mov 8+0(%rsp),%rcx +- mov 8+8(%rsp),%r11 ++ mov 8+0(%rsp), %rcx ++ mov 8+8(%rsp), %r11 + ret $16 + + hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 + /* +- Xen64 iret frame: +- +- ss +- rsp +- rflags +- cs +- rip <-- standard iret frame +- +- flags +- +- rcx } +- r11 }<-- pushed by hypercall page +-rsp -> rax } ++ * Xen64 iret frame: ++ * ++ * ss ++ * rsp ++ * rflags ++ * cs ++ * rip <-- standard iret frame ++ * ++ * flags ++ * ++ * rcx } ++ * r11 }<-- pushed by hypercall page ++ * rsp->rax } + */ + ENTRY(xen_iret) + pushq $0 +@@ -177,8 +48,8 @@ ENDPATCH(xen_iret) + RELOC(xen_iret, 1b+1) + + /* +- sysexit is not used for 64-bit processes, so it's +- only ever used to return to 32-bit compat userspace. ++ * sysexit is not used for 64-bit processes, so it's only ever used to ++ * return to 32-bit compat userspace. 
+ */ + ENTRY(xen_sysexit) + pushq $__USER32_DS +@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit) + RELOC(xen_sysexit, 1b+1) + + ENTRY(xen_sysret64) +- /* We're already on the usermode stack at this point, but still +- with the kernel gs, so we can easily switch back */ +- movq %rsp, %gs:pda_oldrsp +- movq %gs:pda_kernelstack,%rsp ++ /* ++ * We're already on the usermode stack at this point, but ++ * still with the kernel gs, so we can easily switch back ++ */ ++ movq %rsp, PER_CPU_VAR(old_rsp) ++ movq PER_CPU_VAR(kernel_stack), %rsp + + pushq $__USER_DS +- pushq %gs:pda_oldrsp ++ pushq PER_CPU_VAR(old_rsp) + pushq %r11 + pushq $__USER_CS + pushq %rcx +@@ -210,13 +83,15 @@ ENDPATCH(xen_sysret64) + RELOC(xen_sysret64, 1b+1) + + ENTRY(xen_sysret32) +- /* We're already on the usermode stack at this point, but still +- with the kernel gs, so we can easily switch back */ +- movq %rsp, %gs:pda_oldrsp +- movq %gs:pda_kernelstack, %rsp ++ /* ++ * We're already on the usermode stack at this point, but ++ * still with the kernel gs, so we can easily switch back ++ */ ++ movq %rsp, PER_CPU_VAR(old_rsp) ++ movq PER_CPU_VAR(kernel_stack), %rsp + + pushq $__USER32_DS +- pushq %gs:pda_oldrsp ++ pushq PER_CPU_VAR(old_rsp) + pushq %r11 + pushq $__USER32_CS + pushq %rcx +@@ -227,28 +102,27 @@ ENDPATCH(xen_sysret32) + RELOC(xen_sysret32, 1b+1) + + /* +- Xen handles syscall callbacks much like ordinary exceptions, +- which means we have: +- - kernel gs +- - kernel rsp +- - an iret-like stack frame on the stack (including rcx and r11): +- ss +- rsp +- rflags +- cs +- rip +- r11 +- rsp-> rcx +- +- In all the entrypoints, we undo all that to make it look +- like a CPU-generated syscall/sysenter and jump to the normal +- entrypoint. ++ * Xen handles syscall callbacks much like ordinary exceptions, which ++ * means we have: ++ * - kernel gs ++ * - kernel rsp ++ * - an iret-like stack frame on the stack (including rcx and r11): ++ * ss ++ * rsp ++ * rflags ++ * cs ++ * rip ++ * r11 ++ * rsp->rcx ++ * ++ * In all the entrypoints, we undo all that to make it look like a ++ * CPU-generated syscall/sysenter and jump to the normal entrypoint. 
+ */ + + .macro undo_xen_syscall +- mov 0*8(%rsp),%rcx +- mov 1*8(%rsp),%r11 +- mov 5*8(%rsp),%rsp ++ mov 0*8(%rsp), %rcx ++ mov 1*8(%rsp), %r11 ++ mov 5*8(%rsp), %rsp + .endm + + /* Normal 64-bit system call target */ +@@ -275,7 +149,7 @@ ENDPROC(xen_sysenter_target) + + ENTRY(xen_syscall32_target) + ENTRY(xen_sysenter_target) +- lea 16(%rsp), %rsp /* strip %rcx,%r11 */ ++ lea 16(%rsp), %rsp /* strip %rcx, %r11 */ + mov $-ENOSYS, %rax + pushq $VGCF_in_syscall + jmp hypercall_iret +Index: linux-2.6-tip/arch/x86/xen/xen-head.S +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/xen-head.S ++++ linux-2.6-tip/arch/x86/xen/xen-head.S +@@ -8,7 +8,7 @@ + + #include + #include +-#include ++#include + + #include + #include +Index: linux-2.6-tip/arch/x86/xen/xen-ops.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/xen/xen-ops.h ++++ linux-2.6-tip/arch/x86/xen/xen-ops.h +@@ -10,9 +10,12 @@ + extern const char xen_hypervisor_callback[]; + extern const char xen_failsafe_callback[]; + ++extern void *xen_initial_gdt; ++ + struct trap_info; + void xen_copy_trap_info(struct trap_info *traps); + ++DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); + DECLARE_PER_CPU(unsigned long, xen_cr3); + DECLARE_PER_CPU(unsigned long, xen_current_cr3); + +@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_sh + + void xen_setup_mfn_list_list(void); + void xen_setup_shared_info(void); ++void xen_setup_machphys_mapping(void); ++pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); ++void xen_ident_map_ISA(void); ++void xen_reserve_top(void); ++ ++void xen_leave_lazy(void); ++void xen_post_allocator_init(void); + + char * __init xen_memory_setup(void); + void __init xen_arch_setup(void); +Index: linux-2.6-tip/arch/xtensa/include/asm/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/arch/xtensa/include/asm/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/arch/xtensa/include/asm/swab.h +=================================================================== +--- linux-2.6-tip.orig/arch/xtensa/include/asm/swab.h ++++ linux-2.6-tip/arch/xtensa/include/asm/swab.h +@@ -11,7 +11,7 @@ + #ifndef _XTENSA_SWAB_H + #define _XTENSA_SWAB_H + +-#include ++#include + #include + + #define __SWAB_64_THRU_32__ +Index: linux-2.6-tip/arch/xtensa/kernel/irq.c +=================================================================== +--- linux-2.6-tip.orig/arch/xtensa/kernel/irq.c ++++ linux-2.6-tip/arch/xtensa/kernel/irq.c +@@ -99,7 +99,7 @@ int show_interrupts(struct seq_file *p, + seq_printf(p, "%10u ", kstat_irqs(i)); + #else + for_each_online_cpu(j) +- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + #endif + seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %s", action->name); +Index: linux-2.6-tip/block/Kconfig +=================================================================== +--- linux-2.6-tip.orig/block/Kconfig ++++ linux-2.6-tip/block/Kconfig +@@ -44,22 +44,6 @@ config LBD + + If unsure, say N. + +-config BLK_DEV_IO_TRACE +- bool "Support for tracing block io actions" +- depends on SYSFS +- select RELAY +- select DEBUG_FS +- select TRACEPOINTS +- help +- Say Y here if you want to be able to trace the block layer actions +- on a given queue. Tracing allows you to see any traffic happening +- on a block device queue. 
For more information (and the userspace +- support tools needed), fetch the blktrace tools from: +- +- git://git.kernel.dk/blktrace.git +- +- If unsure, say N. +- + config BLK_DEV_BSG + bool "Block layer SG support v4 (EXPERIMENTAL)" + depends on EXPERIMENTAL +Index: linux-2.6-tip/block/Makefile +=================================================================== +--- linux-2.6-tip.orig/block/Makefile ++++ linux-2.6-tip/block/Makefile +@@ -13,6 +13,5 @@ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o + +-obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +Index: linux-2.6-tip/block/blk-softirq.c +=================================================================== +--- linux-2.6-tip.orig/block/blk-softirq.c ++++ linux-2.6-tip/block/blk-softirq.c +@@ -64,7 +64,7 @@ static int raise_blk_irq(int cpu, struct + data->info = rq; + data->flags = 0; + +- __smp_call_function_single(cpu, data); ++ __smp_call_function_single(cpu, data, 0); + return 0; + } + +Index: linux-2.6-tip/block/blk.h +=================================================================== +--- linux-2.6-tip.orig/block/blk.h ++++ linux-2.6-tip/block/blk.h +@@ -102,7 +102,7 @@ static inline int blk_cpu_to_group(int c + const struct cpumask *mask = cpu_coregroup_mask(cpu); + return cpumask_first(mask); + #elif defined(CONFIG_SCHED_SMT) +- return first_cpu(per_cpu(cpu_sibling_map, cpu)); ++ return cpumask_first(topology_thread_cpumask(cpu)); + #else + return cpu; + #endif +Index: linux-2.6-tip/block/blktrace.c +=================================================================== +--- linux-2.6-tip.orig/block/blktrace.c ++++ /dev/null +@@ -1,860 +0,0 @@ +-/* +- * Copyright (C) 2006 Jens Axboe +- * +- * This program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License version 2 as +- * published by the Free Software Foundation. +- * +- * This program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with this program; if not, write to the Free Software +- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +- * +- */ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-static unsigned int blktrace_seq __read_mostly = 1; +- +-/* Global reference count of probes */ +-static DEFINE_MUTEX(blk_probe_mutex); +-static atomic_t blk_probes_ref = ATOMIC_INIT(0); +- +-static int blk_register_tracepoints(void); +-static void blk_unregister_tracepoints(void); +- +-/* +- * Send out a notify message. 
+- */ +-static void trace_note(struct blk_trace *bt, pid_t pid, int action, +- const void *data, size_t len) +-{ +- struct blk_io_trace *t; +- +- t = relay_reserve(bt->rchan, sizeof(*t) + len); +- if (t) { +- const int cpu = smp_processor_id(); +- +- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; +- t->time = ktime_to_ns(ktime_get()); +- t->device = bt->dev; +- t->action = action; +- t->pid = pid; +- t->cpu = cpu; +- t->pdu_len = len; +- memcpy((void *) t + sizeof(*t), data, len); +- } +-} +- +-/* +- * Send out a notify for this process, if we haven't done so since a trace +- * started +- */ +-static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +-{ +- tsk->btrace_seq = blktrace_seq; +- trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); +-} +- +-static void trace_note_time(struct blk_trace *bt) +-{ +- struct timespec now; +- unsigned long flags; +- u32 words[2]; +- +- getnstimeofday(&now); +- words[0] = now.tv_sec; +- words[1] = now.tv_nsec; +- +- local_irq_save(flags); +- trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); +- local_irq_restore(flags); +-} +- +-void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +-{ +- int n; +- va_list args; +- unsigned long flags; +- char *buf; +- +- local_irq_save(flags); +- buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); +- va_start(args, fmt); +- n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); +- va_end(args); +- +- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); +- local_irq_restore(flags); +-} +-EXPORT_SYMBOL_GPL(__trace_note_message); +- +-static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, +- pid_t pid) +-{ +- if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) +- return 1; +- if (sector < bt->start_lba || sector > bt->end_lba) +- return 1; +- if (bt->pid && pid != bt->pid) +- return 1; +- +- return 0; +-} +- +-/* +- * Data direction bit lookup +- */ +-static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; +- +-/* The ilog2() calls fall out because they're constant */ +-#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \ +- (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) ) +- +-/* +- * The worker for the various blk_add_trace*() types. Fills out a +- * blk_io_trace structure and places it in a per-cpu subbuffer. +- */ +-static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, +- int rw, u32 what, int error, int pdu_len, void *pdu_data) +-{ +- struct task_struct *tsk = current; +- struct blk_io_trace *t; +- unsigned long flags; +- unsigned long *sequence; +- pid_t pid; +- int cpu; +- +- if (unlikely(bt->trace_state != Blktrace_running)) +- return; +- +- what |= ddir_act[rw & WRITE]; +- what |= MASK_TC_BIT(rw, BARRIER); +- what |= MASK_TC_BIT(rw, SYNCIO); +- what |= MASK_TC_BIT(rw, AHEAD); +- what |= MASK_TC_BIT(rw, META); +- what |= MASK_TC_BIT(rw, DISCARD); +- +- pid = tsk->pid; +- if (unlikely(act_log_check(bt, what, sector, pid))) +- return; +- +- /* +- * A word about the locking here - we disable interrupts to reserve +- * some space in the relay per-cpu buffer, to prevent an irq +- * from coming in and stepping on our toes. 
+- */ +- local_irq_save(flags); +- +- if (unlikely(tsk->btrace_seq != blktrace_seq)) +- trace_note_tsk(bt, tsk); +- +- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); +- if (t) { +- cpu = smp_processor_id(); +- sequence = per_cpu_ptr(bt->sequence, cpu); +- +- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; +- t->sequence = ++(*sequence); +- t->time = ktime_to_ns(ktime_get()); +- t->sector = sector; +- t->bytes = bytes; +- t->action = what; +- t->pid = pid; +- t->device = bt->dev; +- t->cpu = cpu; +- t->error = error; +- t->pdu_len = pdu_len; +- +- if (pdu_len) +- memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); +- } +- +- local_irq_restore(flags); +-} +- +-static struct dentry *blk_tree_root; +-static DEFINE_MUTEX(blk_tree_mutex); +- +-static void blk_trace_cleanup(struct blk_trace *bt) +-{ +- debugfs_remove(bt->msg_file); +- debugfs_remove(bt->dropped_file); +- relay_close(bt->rchan); +- free_percpu(bt->sequence); +- free_percpu(bt->msg_data); +- kfree(bt); +- mutex_lock(&blk_probe_mutex); +- if (atomic_dec_and_test(&blk_probes_ref)) +- blk_unregister_tracepoints(); +- mutex_unlock(&blk_probe_mutex); +-} +- +-int blk_trace_remove(struct request_queue *q) +-{ +- struct blk_trace *bt; +- +- bt = xchg(&q->blk_trace, NULL); +- if (!bt) +- return -EINVAL; +- +- if (bt->trace_state == Blktrace_setup || +- bt->trace_state == Blktrace_stopped) +- blk_trace_cleanup(bt); +- +- return 0; +-} +-EXPORT_SYMBOL_GPL(blk_trace_remove); +- +-static int blk_dropped_open(struct inode *inode, struct file *filp) +-{ +- filp->private_data = inode->i_private; +- +- return 0; +-} +- +-static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, +- size_t count, loff_t *ppos) +-{ +- struct blk_trace *bt = filp->private_data; +- char buf[16]; +- +- snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); +- +- return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +-} +- +-static const struct file_operations blk_dropped_fops = { +- .owner = THIS_MODULE, +- .open = blk_dropped_open, +- .read = blk_dropped_read, +-}; +- +-static int blk_msg_open(struct inode *inode, struct file *filp) +-{ +- filp->private_data = inode->i_private; +- +- return 0; +-} +- +-static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, +- size_t count, loff_t *ppos) +-{ +- char *msg; +- struct blk_trace *bt; +- +- if (count > BLK_TN_MAX_MSG) +- return -EINVAL; +- +- msg = kmalloc(count, GFP_KERNEL); +- if (msg == NULL) +- return -ENOMEM; +- +- if (copy_from_user(msg, buffer, count)) { +- kfree(msg); +- return -EFAULT; +- } +- +- bt = filp->private_data; +- __trace_note_message(bt, "%s", msg); +- kfree(msg); +- +- return count; +-} +- +-static const struct file_operations blk_msg_fops = { +- .owner = THIS_MODULE, +- .open = blk_msg_open, +- .write = blk_msg_write, +-}; +- +-/* +- * Keep track of how many times we encountered a full subbuffer, to aid +- * the user space app in telling how many lost events there were. +- */ +-static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, +- void *prev_subbuf, size_t prev_padding) +-{ +- struct blk_trace *bt; +- +- if (!relay_buf_full(buf)) +- return 1; +- +- bt = buf->chan->private_data; +- atomic_inc(&bt->dropped); +- return 0; +-} +- +-static int blk_remove_buf_file_callback(struct dentry *dentry) +-{ +- struct dentry *parent = dentry->d_parent; +- debugfs_remove(dentry); +- +- /* +- * this will fail for all but the last file, but that is ok. 
what we +- * care about is the top level buts->name directory going away, when +- * the last trace file is gone. Then we don't have to rmdir() that +- * manually on trace stop, so it nicely solves the issue with +- * force killing of running traces. +- */ +- +- debugfs_remove(parent); +- return 0; +-} +- +-static struct dentry *blk_create_buf_file_callback(const char *filename, +- struct dentry *parent, +- int mode, +- struct rchan_buf *buf, +- int *is_global) +-{ +- return debugfs_create_file(filename, mode, parent, buf, +- &relay_file_operations); +-} +- +-static struct rchan_callbacks blk_relay_callbacks = { +- .subbuf_start = blk_subbuf_start_callback, +- .create_buf_file = blk_create_buf_file_callback, +- .remove_buf_file = blk_remove_buf_file_callback, +-}; +- +-/* +- * Setup everything required to start tracing +- */ +-int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, +- struct blk_user_trace_setup *buts) +-{ +- struct blk_trace *old_bt, *bt = NULL; +- struct dentry *dir = NULL; +- int ret, i; +- +- if (!buts->buf_size || !buts->buf_nr) +- return -EINVAL; +- +- strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); +- buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; +- +- /* +- * some device names have larger paths - convert the slashes +- * to underscores for this to work as expected +- */ +- for (i = 0; i < strlen(buts->name); i++) +- if (buts->name[i] == '/') +- buts->name[i] = '_'; +- +- ret = -ENOMEM; +- bt = kzalloc(sizeof(*bt), GFP_KERNEL); +- if (!bt) +- goto err; +- +- bt->sequence = alloc_percpu(unsigned long); +- if (!bt->sequence) +- goto err; +- +- bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); +- if (!bt->msg_data) +- goto err; +- +- ret = -ENOENT; +- +- if (!blk_tree_root) { +- blk_tree_root = debugfs_create_dir("block", NULL); +- if (!blk_tree_root) +- return -ENOMEM; +- } +- +- dir = debugfs_create_dir(buts->name, blk_tree_root); +- +- if (!dir) +- goto err; +- +- bt->dir = dir; +- bt->dev = dev; +- atomic_set(&bt->dropped, 0); +- +- ret = -EIO; +- bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); +- if (!bt->dropped_file) +- goto err; +- +- bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); +- if (!bt->msg_file) +- goto err; +- +- bt->rchan = relay_open("trace", dir, buts->buf_size, +- buts->buf_nr, &blk_relay_callbacks, bt); +- if (!bt->rchan) +- goto err; +- +- bt->act_mask = buts->act_mask; +- if (!bt->act_mask) +- bt->act_mask = (u16) -1; +- +- bt->start_lba = buts->start_lba; +- bt->end_lba = buts->end_lba; +- if (!bt->end_lba) +- bt->end_lba = -1ULL; +- +- bt->pid = buts->pid; +- bt->trace_state = Blktrace_setup; +- +- mutex_lock(&blk_probe_mutex); +- if (atomic_add_return(1, &blk_probes_ref) == 1) { +- ret = blk_register_tracepoints(); +- if (ret) +- goto probe_err; +- } +- mutex_unlock(&blk_probe_mutex); +- +- ret = -EBUSY; +- old_bt = xchg(&q->blk_trace, bt); +- if (old_bt) { +- (void) xchg(&q->blk_trace, old_bt); +- goto err; +- } +- +- return 0; +-probe_err: +- atomic_dec(&blk_probes_ref); +- mutex_unlock(&blk_probe_mutex); +-err: +- if (bt) { +- if (bt->msg_file) +- debugfs_remove(bt->msg_file); +- if (bt->dropped_file) +- debugfs_remove(bt->dropped_file); +- free_percpu(bt->sequence); +- free_percpu(bt->msg_data); +- if (bt->rchan) +- relay_close(bt->rchan); +- kfree(bt); +- } +- return ret; +-} +- +-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, +- char __user *arg) +-{ +- struct blk_user_trace_setup buts; +- int ret; +- +- ret = copy_from_user(&buts, arg, 
sizeof(buts)); +- if (ret) +- return -EFAULT; +- +- ret = do_blk_trace_setup(q, name, dev, &buts); +- if (ret) +- return ret; +- +- if (copy_to_user(arg, &buts, sizeof(buts))) +- return -EFAULT; +- +- return 0; +-} +-EXPORT_SYMBOL_GPL(blk_trace_setup); +- +-int blk_trace_startstop(struct request_queue *q, int start) +-{ +- struct blk_trace *bt; +- int ret; +- +- if ((bt = q->blk_trace) == NULL) +- return -EINVAL; +- +- /* +- * For starting a trace, we can transition from a setup or stopped +- * trace. For stopping a trace, the state must be running +- */ +- ret = -EINVAL; +- if (start) { +- if (bt->trace_state == Blktrace_setup || +- bt->trace_state == Blktrace_stopped) { +- blktrace_seq++; +- smp_mb(); +- bt->trace_state = Blktrace_running; +- +- trace_note_time(bt); +- ret = 0; +- } +- } else { +- if (bt->trace_state == Blktrace_running) { +- bt->trace_state = Blktrace_stopped; +- relay_flush(bt->rchan); +- ret = 0; +- } +- } +- +- return ret; +-} +-EXPORT_SYMBOL_GPL(blk_trace_startstop); +- +-/** +- * blk_trace_ioctl: - handle the ioctls associated with tracing +- * @bdev: the block device +- * @cmd: the ioctl cmd +- * @arg: the argument data, if any +- * +- **/ +-int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) +-{ +- struct request_queue *q; +- int ret, start = 0; +- char b[BDEVNAME_SIZE]; +- +- q = bdev_get_queue(bdev); +- if (!q) +- return -ENXIO; +- +- mutex_lock(&bdev->bd_mutex); +- +- switch (cmd) { +- case BLKTRACESETUP: +- bdevname(bdev, b); +- ret = blk_trace_setup(q, b, bdev->bd_dev, arg); +- break; +- case BLKTRACESTART: +- start = 1; +- case BLKTRACESTOP: +- ret = blk_trace_startstop(q, start); +- break; +- case BLKTRACETEARDOWN: +- ret = blk_trace_remove(q); +- break; +- default: +- ret = -ENOTTY; +- break; +- } +- +- mutex_unlock(&bdev->bd_mutex); +- return ret; +-} +- +-/** +- * blk_trace_shutdown: - stop and cleanup trace structures +- * @q: the request queue associated with the device +- * +- **/ +-void blk_trace_shutdown(struct request_queue *q) +-{ +- if (q->blk_trace) { +- blk_trace_startstop(q, 0); +- blk_trace_remove(q); +- } +-} +- +-/* +- * blktrace probes +- */ +- +-/** +- * blk_add_trace_rq - Add a trace for a request oriented action +- * @q: queue the io is for +- * @rq: the source request +- * @what: the action +- * +- * Description: +- * Records an action against a request. Will log the bio offset + size. 
+- * +- **/ +-static void blk_add_trace_rq(struct request_queue *q, struct request *rq, +- u32 what) +-{ +- struct blk_trace *bt = q->blk_trace; +- int rw = rq->cmd_flags & 0x03; +- +- if (likely(!bt)) +- return; +- +- if (blk_discard_rq(rq)) +- rw |= (1 << BIO_RW_DISCARD); +- +- if (blk_pc_request(rq)) { +- what |= BLK_TC_ACT(BLK_TC_PC); +- __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, +- sizeof(rq->cmd), rq->cmd); +- } else { +- what |= BLK_TC_ACT(BLK_TC_FS); +- __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, +- rw, what, rq->errors, 0, NULL); +- } +-} +- +-static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +-{ +- blk_add_trace_rq(q, rq, BLK_TA_ABORT); +-} +- +-static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +-{ +- blk_add_trace_rq(q, rq, BLK_TA_INSERT); +-} +- +-static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +-{ +- blk_add_trace_rq(q, rq, BLK_TA_ISSUE); +-} +- +-static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq) +-{ +- blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); +-} +- +-static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq) +-{ +- blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); +-} +- +-/** +- * blk_add_trace_bio - Add a trace for a bio oriented action +- * @q: queue the io is for +- * @bio: the source bio +- * @what: the action +- * +- * Description: +- * Records an action against a bio. Will log the bio offset + size. +- * +- **/ +-static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, +- u32 what) +-{ +- struct blk_trace *bt = q->blk_trace; +- +- if (likely(!bt)) +- return; +- +- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, +- !bio_flagged(bio, BIO_UPTODATE), 0, NULL); +-} +- +-static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +-{ +- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); +-} +- +-static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +-{ +- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); +-} +- +-static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio) +-{ +- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); +-} +- +-static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio) +-{ +- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); +-} +- +-static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +-{ +- blk_add_trace_bio(q, bio, BLK_TA_QUEUE); +-} +- +-static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw) +-{ +- if (bio) +- blk_add_trace_bio(q, bio, BLK_TA_GETRQ); +- else { +- struct blk_trace *bt = q->blk_trace; +- +- if (bt) +- __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); +- } +-} +- +- +-static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw) +-{ +- if (bio) +- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); +- else { +- struct blk_trace *bt = q->blk_trace; +- +- if (bt) +- __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL); +- } +-} +- +-static void blk_add_trace_plug(struct request_queue *q) +-{ +- struct blk_trace *bt = q->blk_trace; +- +- if (bt) +- __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +-} +- +-static void blk_add_trace_unplug_io(struct request_queue *q) +-{ +- struct blk_trace *bt = q->blk_trace; +- +- if (bt) { +- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; +- __be64 rpdu = cpu_to_be64(pdu); +- +- __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, +- 
sizeof(rpdu), &rpdu); +- } +-} +- +-static void blk_add_trace_unplug_timer(struct request_queue *q) +-{ +- struct blk_trace *bt = q->blk_trace; +- +- if (bt) { +- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; +- __be64 rpdu = cpu_to_be64(pdu); +- +- __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, +- sizeof(rpdu), &rpdu); +- } +-} +- +-static void blk_add_trace_split(struct request_queue *q, struct bio *bio, +- unsigned int pdu) +-{ +- struct blk_trace *bt = q->blk_trace; +- +- if (bt) { +- __be64 rpdu = cpu_to_be64(pdu); +- +- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, +- BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), +- sizeof(rpdu), &rpdu); +- } +-} +- +-/** +- * blk_add_trace_remap - Add a trace for a remap operation +- * @q: queue the io is for +- * @bio: the source bio +- * @dev: target device +- * @from: source sector +- * @to: target sector +- * +- * Description: +- * Device mapper or raid target sometimes need to split a bio because +- * it spans a stripe (or similar). Add a trace for that action. +- * +- **/ +-static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, +- dev_t dev, sector_t from, sector_t to) +-{ +- struct blk_trace *bt = q->blk_trace; +- struct blk_io_trace_remap r; +- +- if (likely(!bt)) +- return; +- +- r.device = cpu_to_be32(dev); +- r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); +- r.sector = cpu_to_be64(to); +- +- __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, +- !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +-} +- +-/** +- * blk_add_driver_data - Add binary message with driver-specific data +- * @q: queue the io is for +- * @rq: io request +- * @data: driver-specific data +- * @len: length of driver-specific data +- * +- * Description: +- * Some drivers might want to write driver-specific data per request. 
+- * +- **/ +-void blk_add_driver_data(struct request_queue *q, +- struct request *rq, +- void *data, size_t len) +-{ +- struct blk_trace *bt = q->blk_trace; +- +- if (likely(!bt)) +- return; +- +- if (blk_pc_request(rq)) +- __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, +- rq->errors, len, data); +- else +- __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, +- 0, BLK_TA_DRV_DATA, rq->errors, len, data); +-} +-EXPORT_SYMBOL_GPL(blk_add_driver_data); +- +-static int blk_register_tracepoints(void) +-{ +- int ret; +- +- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); +- WARN_ON(ret); +- ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); +- WARN_ON(ret); +- ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); +- WARN_ON(ret); +- ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); +- WARN_ON(ret); +- ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); +- WARN_ON(ret); +- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); +- WARN_ON(ret); +- ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); +- WARN_ON(ret); +- ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); +- WARN_ON(ret); +- ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); +- WARN_ON(ret); +- ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); +- WARN_ON(ret); +- ret = register_trace_block_getrq(blk_add_trace_getrq); +- WARN_ON(ret); +- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); +- WARN_ON(ret); +- ret = register_trace_block_plug(blk_add_trace_plug); +- WARN_ON(ret); +- ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); +- WARN_ON(ret); +- ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); +- WARN_ON(ret); +- ret = register_trace_block_split(blk_add_trace_split); +- WARN_ON(ret); +- ret = register_trace_block_remap(blk_add_trace_remap); +- WARN_ON(ret); +- return 0; +-} +- +-static void blk_unregister_tracepoints(void) +-{ +- unregister_trace_block_remap(blk_add_trace_remap); +- unregister_trace_block_split(blk_add_trace_split); +- unregister_trace_block_unplug_io(blk_add_trace_unplug_io); +- unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); +- unregister_trace_block_plug(blk_add_trace_plug); +- unregister_trace_block_sleeprq(blk_add_trace_sleeprq); +- unregister_trace_block_getrq(blk_add_trace_getrq); +- unregister_trace_block_bio_queue(blk_add_trace_bio_queue); +- unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); +- unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); +- unregister_trace_block_bio_complete(blk_add_trace_bio_complete); +- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); +- unregister_trace_block_rq_complete(blk_add_trace_rq_complete); +- unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); +- unregister_trace_block_rq_issue(blk_add_trace_rq_issue); +- unregister_trace_block_rq_insert(blk_add_trace_rq_insert); +- unregister_trace_block_rq_abort(blk_add_trace_rq_abort); +- +- tracepoint_synchronize_unregister(); +-} +Index: linux-2.6-tip/block/bsg.c +=================================================================== +--- linux-2.6-tip.orig/block/bsg.c ++++ linux-2.6-tip/block/bsg.c +@@ -249,7 +249,7 @@ bsg_map_hdr(struct bsg_device *bd, struc + { + struct request_queue *q = bd->queue; + struct request *rq, *next_rq = NULL; +- int ret, rw; ++ int ret, uninitialized_var(rw); + unsigned int dxfer_len; + void *dxferp = NULL; + +Index: 
linux-2.6-tip/block/cfq-iosched.c +=================================================================== +--- linux-2.6-tip.orig/block/cfq-iosched.c ++++ linux-2.6-tip/block/cfq-iosched.c +@@ -1539,6 +1539,7 @@ cfq_async_queue_prio(struct cfq_data *cf + return &cfqd->async_idle_cfqq; + default: + BUG(); ++ return NULL; + } + } + +Index: linux-2.6-tip/crypto/xor.c +=================================================================== +--- linux-2.6-tip.orig/crypto/xor.c ++++ linux-2.6-tip/crypto/xor.c +@@ -101,7 +101,12 @@ calibrate_xor_blocks(void) + void *b1, *b2; + struct xor_block_template *f, *fastest; + +- b1 = (void *) __get_free_pages(GFP_KERNEL, 2); ++ /* ++ * Note: Since the memory is not actually used for _anything_ but to ++ * test the XOR speed, we don't really want kmemcheck to warn about ++ * reading uninitialized bytes here. ++ */ ++ b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2); + if (!b1) { + printk(KERN_WARNING "xor: Yikes! No memory available.\n"); + return -ENOMEM; +Index: linux-2.6-tip/drivers/acpi/acpica/exprep.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/acpica/exprep.c ++++ linux-2.6-tip/drivers/acpi/acpica/exprep.c +@@ -320,7 +320,7 @@ acpi_ex_prep_common_field_object(union a + u32 field_bit_position, u32 field_bit_length) + { + u32 access_bit_width; +- u32 byte_alignment; ++ u32 uninitialized_var(byte_alignment); + u32 nearest_byte_address; + + ACPI_FUNCTION_TRACE(ex_prep_common_field_object); +Index: linux-2.6-tip/drivers/acpi/acpica/nsxfeval.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/acpica/nsxfeval.c ++++ linux-2.6-tip/drivers/acpi/acpica/nsxfeval.c +@@ -469,6 +469,9 @@ acpi_walk_namespace(acpi_object_type typ + + ACPI_FUNCTION_TRACE(acpi_walk_namespace); + ++ if (acpi_disabled) ++ return_ACPI_STATUS(AE_NO_NAMESPACE); ++ + /* Parameter validation */ + + if ((type > ACPI_TYPE_LOCAL_MAX) || (!max_depth) || (!user_function)) { +Index: linux-2.6-tip/drivers/acpi/acpica/tbxface.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/acpica/tbxface.c ++++ linux-2.6-tip/drivers/acpi/acpica/tbxface.c +@@ -365,7 +365,7 @@ ACPI_EXPORT_SYMBOL(acpi_unload_table_id) + + /******************************************************************************* + * +- * FUNCTION: acpi_get_table ++ * FUNCTION: acpi_get_table_with_size + * + * PARAMETERS: Signature - ACPI signature of needed table + * Instance - Which instance (for SSDTs) +@@ -377,8 +377,9 @@ ACPI_EXPORT_SYMBOL(acpi_unload_table_id) + * + *****************************************************************************/ + acpi_status +-acpi_get_table(char *signature, +- u32 instance, struct acpi_table_header **out_table) ++acpi_get_table_with_size(char *signature, ++ u32 instance, struct acpi_table_header **out_table, ++ acpi_size *tbl_size) + { + u32 i; + u32 j; +@@ -408,6 +409,7 @@ acpi_get_table(char *signature, + acpi_tb_verify_table(&acpi_gbl_root_table_list.tables[i]); + if (ACPI_SUCCESS(status)) { + *out_table = acpi_gbl_root_table_list.tables[i].pointer; ++ *tbl_size = acpi_gbl_root_table_list.tables[i].length; + } + + if (!acpi_gbl_permanent_mmap) { +@@ -420,6 +422,15 @@ acpi_get_table(char *signature, + return (AE_NOT_FOUND); + } + ++acpi_status ++acpi_get_table(char *signature, ++ u32 instance, struct acpi_table_header **out_table) ++{ ++ acpi_size tbl_size; ++ ++ return acpi_get_table_with_size(signature, ++ instance, 
out_table, &tbl_size); ++} + ACPI_EXPORT_SYMBOL(acpi_get_table) + + /******************************************************************************* +Index: linux-2.6-tip/drivers/acpi/osl.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/osl.c ++++ linux-2.6-tip/drivers/acpi/osl.c +@@ -272,14 +272,21 @@ acpi_os_map_memory(acpi_physical_address + } + EXPORT_SYMBOL_GPL(acpi_os_map_memory); + +-void acpi_os_unmap_memory(void __iomem * virt, acpi_size size) ++void __ref acpi_os_unmap_memory(void __iomem *virt, acpi_size size) + { +- if (acpi_gbl_permanent_mmap) { ++ if (acpi_gbl_permanent_mmap) + iounmap(virt); +- } ++ else ++ __acpi_unmap_table(virt, size); + } + EXPORT_SYMBOL_GPL(acpi_os_unmap_memory); + ++void __init early_acpi_os_unmap_memory(void __iomem *virt, acpi_size size) ++{ ++ if (!acpi_gbl_permanent_mmap) ++ __acpi_unmap_table(virt, size); ++} ++ + #ifdef ACPI_FUTURE_USAGE + acpi_status + acpi_os_get_physical_address(void *virt, acpi_physical_address * phys) +@@ -792,12 +799,12 @@ void acpi_os_delete_lock(acpi_spinlock h + acpi_status + acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) + { +- struct semaphore *sem = NULL; ++ struct compat_semaphore *sem = NULL; + +- sem = acpi_os_allocate(sizeof(struct semaphore)); ++ sem = acpi_os_allocate(sizeof(struct compat_semaphore)); + if (!sem) + return AE_NO_MEMORY; +- memset(sem, 0, sizeof(struct semaphore)); ++ memset(sem, 0, sizeof(struct compat_semaphore)); + + sema_init(sem, initial_units); + +@@ -818,7 +825,7 @@ acpi_os_create_semaphore(u32 max_units, + + acpi_status acpi_os_delete_semaphore(acpi_handle handle) + { +- struct semaphore *sem = (struct semaphore *)handle; ++ struct compat_semaphore *sem = (struct compat_semaphore *)handle; + + if (!sem) + return AE_BAD_PARAMETER; +@@ -838,7 +845,7 @@ acpi_status acpi_os_delete_semaphore(acp + acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) + { + acpi_status status = AE_OK; +- struct semaphore *sem = (struct semaphore *)handle; ++ struct compat_semaphore *sem = (struct compat_semaphore *)handle; + long jiffies; + int ret = 0; + +@@ -879,7 +886,7 @@ acpi_status acpi_os_wait_semaphore(acpi_ + */ + acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) + { +- struct semaphore *sem = (struct semaphore *)handle; ++ struct compat_semaphore *sem = (struct compat_semaphore *)handle; + + if (!sem || (units < 1)) + return AE_BAD_PARAMETER; +Index: linux-2.6-tip/drivers/acpi/processor_idle.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/processor_idle.c ++++ linux-2.6-tip/drivers/acpi/processor_idle.c +@@ -828,8 +828,11 @@ static int acpi_idle_bm_check(void) + */ + static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) + { ++ u64 perf_flags; ++ + /* Don't trace irqs off for idle */ + stop_critical_timings(); ++ perf_flags = hw_perf_save_disable(); + if (cx->entry_method == ACPI_CSTATE_FFH) { + /* Call into architectural FFH based C-state */ + acpi_processor_ffh_cstate_enter(cx); +@@ -844,6 +847,7 @@ static inline void acpi_idle_do_entry(st + gets asserted in time to freeze execution properly. 
*/ + unused = inl(acpi_gbl_FADT.xpm_timer_block.address); + } ++ hw_perf_restore(perf_flags); + start_critical_timings(); + } + +@@ -958,7 +962,7 @@ static int acpi_idle_enter_simple(struct + } + + static int c3_cpu_count; +-static DEFINE_SPINLOCK(c3_lock); ++static DEFINE_RAW_SPINLOCK(c3_lock); + + /** + * acpi_idle_enter_bm - enters C3 with proper BM handling +Index: linux-2.6-tip/drivers/acpi/processor_perflib.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/processor_perflib.c ++++ linux-2.6-tip/drivers/acpi/processor_perflib.c +@@ -516,12 +516,12 @@ int acpi_processor_preregister_performan + continue; + } + +- if (!performance || !percpu_ptr(performance, i)) { ++ if (!performance || !per_cpu_ptr(performance, i)) { + retval = -EINVAL; + continue; + } + +- pr->performance = percpu_ptr(performance, i); ++ pr->performance = per_cpu_ptr(performance, i); + cpumask_set_cpu(i, pr->performance->shared_cpu_map); + if (acpi_processor_get_psd(pr)) { + retval = -EINVAL; +Index: linux-2.6-tip/drivers/acpi/sbs.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/sbs.c ++++ linux-2.6-tip/drivers/acpi/sbs.c +@@ -389,6 +389,8 @@ static int acpi_battery_get_state(struct + return result; + } + ++#if defined(CONFIG_ACPI_SYSFS_POWER) || defined(CONFIG_ACPI_PROCFS_POWER) ++ + static int acpi_battery_get_alarm(struct acpi_battery *battery) + { + return acpi_smbus_read(battery->sbs->hc, SMBUS_READ_WORD, +@@ -425,6 +427,8 @@ static int acpi_battery_set_alarm(struct + return ret; + } + ++#endif ++ + static int acpi_ac_get_present(struct acpi_sbs *sbs) + { + int result; +@@ -816,7 +820,10 @@ static int acpi_battery_add(struct acpi_ + + static void acpi_battery_remove(struct acpi_sbs *sbs, int id) + { ++#if defined(CONFIG_ACPI_SYSFS_POWER) || defined(CONFIG_ACPI_PROCFS_POWER) + struct acpi_battery *battery = &sbs->battery[id]; ++#endif ++ + #ifdef CONFIG_ACPI_SYSFS_POWER + if (battery->bat.dev) { + if (battery->have_sysfs_alarm) +Index: linux-2.6-tip/drivers/acpi/sleep.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/sleep.c ++++ linux-2.6-tip/drivers/acpi/sleep.c +@@ -24,6 +24,7 @@ + #include "sleep.h" + + u8 sleep_states[ACPI_S_STATE_COUNT]; ++static u32 acpi_target_sleep_state = ACPI_STATE_S0; + + static void acpi_sleep_tts_switch(u32 acpi_state) + { +@@ -77,7 +78,6 @@ static int acpi_sleep_prepare(u32 acpi_s + } + + #ifdef CONFIG_ACPI_SLEEP +-static u32 acpi_target_sleep_state = ACPI_STATE_S0; + /* + * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the + * user to request that behavior by using the 'acpi_old_suspend_ordering' +Index: linux-2.6-tip/drivers/acpi/tables.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/tables.c ++++ linux-2.6-tip/drivers/acpi/tables.c +@@ -181,14 +181,15 @@ acpi_table_parse_entries(char *id, + struct acpi_subtable_header *entry; + unsigned int count = 0; + unsigned long table_end; ++ acpi_size tbl_size; + + if (!handler) + return -EINVAL; + + if (strncmp(id, ACPI_SIG_MADT, 4) == 0) +- acpi_get_table(id, acpi_apic_instance, &table_header); ++ acpi_get_table_with_size(id, acpi_apic_instance, &table_header, &tbl_size); + else +- acpi_get_table(id, 0, &table_header); ++ acpi_get_table_with_size(id, 0, &table_header, &tbl_size); + + if (!table_header) { + printk(KERN_WARNING PREFIX "%4.4s not present\n", id); +@@ -206,8 +207,10 @@ 
acpi_table_parse_entries(char *id, + table_end) { + if (entry->type == entry_id + && (!max_entries || count++ < max_entries)) +- if (handler(entry, table_end)) ++ if (handler(entry, table_end)) { ++ early_acpi_os_unmap_memory((char *)table_header, tbl_size); + return -EINVAL; ++ } + + entry = (struct acpi_subtable_header *) + ((unsigned long)entry + entry->length); +@@ -217,6 +220,7 @@ acpi_table_parse_entries(char *id, + "%i found\n", id, entry_id, count - max_entries, count); + } + ++ early_acpi_os_unmap_memory((char *)table_header, tbl_size); + return count; + } + +@@ -241,17 +245,19 @@ acpi_table_parse_madt(enum acpi_madt_typ + int __init acpi_table_parse(char *id, acpi_table_handler handler) + { + struct acpi_table_header *table = NULL; ++ acpi_size tbl_size; + + if (!handler) + return -EINVAL; + + if (strncmp(id, ACPI_SIG_MADT, 4) == 0) +- acpi_get_table(id, acpi_apic_instance, &table); ++ acpi_get_table_with_size(id, acpi_apic_instance, &table, &tbl_size); + else +- acpi_get_table(id, 0, &table); ++ acpi_get_table_with_size(id, 0, &table, &tbl_size); + + if (table) { + handler(table); ++ early_acpi_os_unmap_memory(table, tbl_size); + return 0; + } else + return 1; +@@ -265,8 +271,9 @@ int __init acpi_table_parse(char *id, ac + static void __init check_multiple_madt(void) + { + struct acpi_table_header *table = NULL; ++ acpi_size tbl_size; + +- acpi_get_table(ACPI_SIG_MADT, 2, &table); ++ acpi_get_table_with_size(ACPI_SIG_MADT, 2, &table, &tbl_size); + if (table) { + printk(KERN_WARNING PREFIX + "BIOS bug: multiple APIC/MADT found," +@@ -275,6 +282,7 @@ static void __init check_multiple_madt(v + "If \"acpi_apic_instance=%d\" works better, " + "notify linux-acpi@vger.kernel.org\n", + acpi_apic_instance ? 0 : 2); ++ early_acpi_os_unmap_memory(table, tbl_size); + + } else + acpi_apic_instance = 0; +Index: linux-2.6-tip/drivers/ata/libata-core.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ata/libata-core.c ++++ linux-2.6-tip/drivers/ata/libata-core.c +@@ -1484,7 +1484,7 @@ static int ata_hpa_resize(struct ata_dev + struct ata_eh_context *ehc = &dev->link->eh_context; + int print_info = ehc->i.flags & ATA_EHI_PRINTINFO; + u64 sectors = ata_id_n_sectors(dev->id); +- u64 native_sectors; ++ u64 uninitialized_var(native_sectors); + int rc; + + /* do we need to do it? 
*/ +Index: linux-2.6-tip/drivers/ata/libata-scsi.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ata/libata-scsi.c ++++ linux-2.6-tip/drivers/ata/libata-scsi.c +@@ -3247,7 +3247,7 @@ void ata_scsi_scan_host(struct ata_port + int tries = 5; + struct ata_device *last_failed_dev = NULL; + struct ata_link *link; +- struct ata_device *dev; ++ struct ata_device *uninitialized_var(dev); + + if (ap->flags & ATA_FLAG_DISABLED) + return; +Index: linux-2.6-tip/drivers/ata/pata_atiixp.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ata/pata_atiixp.c ++++ linux-2.6-tip/drivers/ata/pata_atiixp.c +@@ -140,7 +140,7 @@ static void atiixp_set_dmamode(struct at + wanted_pio = 3; + else if (adev->dma_mode == XFER_MW_DMA_0) + wanted_pio = 0; +- else BUG(); ++ else panic("atiixp_set_dmamode: unknown DMA mode!"); + + if (adev->pio_mode != wanted_pio) + atiixp_set_pio_timing(ap, adev, wanted_pio); +Index: linux-2.6-tip/drivers/ata/sata_via.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ata/sata_via.c ++++ linux-2.6-tip/drivers/ata/sata_via.c +@@ -566,7 +566,7 @@ static int svia_init_one(struct pci_dev + static int printed_version; + unsigned int i; + int rc; +- struct ata_host *host; ++ struct ata_host *uninitialized_var(host); + int board_id = (int) ent->driver_data; + const unsigned *bar_sizes; + +Index: linux-2.6-tip/drivers/atm/ambassador.c +=================================================================== +--- linux-2.6-tip.orig/drivers/atm/ambassador.c ++++ linux-2.6-tip/drivers/atm/ambassador.c +@@ -2097,7 +2097,7 @@ static int __devinit amb_init (amb_dev * + { + loader_block lb; + +- u32 version; ++ u32 version = -1; + + if (amb_reset (dev, 1)) { + PRINTK (KERN_ERR, "card reset failed!"); +Index: linux-2.6-tip/drivers/atm/horizon.c +=================================================================== +--- linux-2.6-tip.orig/drivers/atm/horizon.c ++++ linux-2.6-tip/drivers/atm/horizon.c +@@ -2131,7 +2131,7 @@ static int atm_pcr_check (struct atm_tra + static int hrz_open (struct atm_vcc *atm_vcc) + { + int error; +- u16 channel; ++ u16 uninitialized_var(channel); + + struct atm_qos * qos; + struct atm_trafprm * txtp; +Index: linux-2.6-tip/drivers/base/cpu.c +=================================================================== +--- linux-2.6-tip.orig/drivers/base/cpu.c ++++ linux-2.6-tip/drivers/base/cpu.c +@@ -107,7 +107,7 @@ static SYSDEV_ATTR(crash_notes, 0400, sh + /* + * Print cpu online, possible, present, and system maps + */ +-static ssize_t print_cpus_map(char *buf, cpumask_t *map) ++static ssize_t print_cpus_map(char *buf, const struct cpumask *map) + { + int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map); + +Index: linux-2.6-tip/drivers/base/iommu.c +=================================================================== +--- linux-2.6-tip.orig/drivers/base/iommu.c ++++ linux-2.6-tip/drivers/base/iommu.c +@@ -31,7 +31,7 @@ void register_iommu(struct iommu_ops *op + iommu_ops = ops; + } + +-bool iommu_found() ++bool iommu_found(void) + { + return iommu_ops != NULL; + } +Index: linux-2.6-tip/drivers/base/node.c +=================================================================== +--- linux-2.6-tip.orig/drivers/base/node.c ++++ linux-2.6-tip/drivers/base/node.c +@@ -24,7 +24,7 @@ static struct sysdev_class node_class = + static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf) + { + struct node *node_dev = to_node(dev); +- 
node_to_cpumask_ptr(mask, node_dev->sysdev.id); ++ const struct cpumask *mask = cpumask_of_node(node_dev->sysdev.id); + int len; + + /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */ +Index: linux-2.6-tip/drivers/base/platform.c +=================================================================== +--- linux-2.6-tip.orig/drivers/base/platform.c ++++ linux-2.6-tip/drivers/base/platform.c +@@ -611,7 +611,8 @@ static int platform_match(struct device + + #ifdef CONFIG_PM_SLEEP + +-static int platform_legacy_suspend(struct device *dev, pm_message_t mesg) ++static inline int ++platform_legacy_suspend(struct device *dev, pm_message_t mesg) + { + int ret = 0; + +@@ -621,7 +622,8 @@ static int platform_legacy_suspend(struc + return ret; + } + +-static int platform_legacy_suspend_late(struct device *dev, pm_message_t mesg) ++static inline int ++platform_legacy_suspend_late(struct device *dev, pm_message_t mesg) + { + struct platform_driver *drv = to_platform_driver(dev->driver); + struct platform_device *pdev; +@@ -634,7 +636,7 @@ static int platform_legacy_suspend_late( + return ret; + } + +-static int platform_legacy_resume_early(struct device *dev) ++static inline int platform_legacy_resume_early(struct device *dev) + { + struct platform_driver *drv = to_platform_driver(dev->driver); + struct platform_device *pdev; +@@ -647,7 +649,7 @@ static int platform_legacy_resume_early( + return ret; + } + +-static int platform_legacy_resume(struct device *dev) ++static inline int platform_legacy_resume(struct device *dev) + { + int ret = 0; + +Index: linux-2.6-tip/drivers/base/topology.c +=================================================================== +--- linux-2.6-tip.orig/drivers/base/topology.c ++++ linux-2.6-tip/drivers/base/topology.c +@@ -31,7 +31,10 @@ + #include + #include + +-#define define_one_ro(_name) \ ++#define define_one_ro_named(_name, _func) \ ++static SYSDEV_ATTR(_name, 0444, _func, NULL) ++ ++#define define_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, show_##_name, NULL) + + #define define_id_show_func(name) \ +@@ -42,8 +45,8 @@ static ssize_t show_##name(struct sys_de + return sprintf(buf, "%d\n", topology_##name(cpu)); \ + } + +-#if defined(topology_thread_siblings) || defined(topology_core_siblings) +-static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) ++#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) ++static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf) + { + ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; + int n = 0; +@@ -65,7 +68,7 @@ static ssize_t show_##name(struct sys_de + struct sysdev_attribute *attr, char *buf) \ + { \ + unsigned int cpu = dev->id; \ +- return show_cpumap(0, &(topology_##name(cpu)), buf); \ ++ return show_cpumap(0, topology_##name(cpu), buf); \ + } + + #define define_siblings_show_list(name) \ +@@ -74,7 +77,7 @@ static ssize_t show_##name##_list(struct + char *buf) \ + { \ + unsigned int cpu = dev->id; \ +- return show_cpumap(1, &(topology_##name(cpu)), buf); \ ++ return show_cpumap(1, topology_##name(cpu), buf); \ + } + + #else +@@ -82,9 +85,7 @@ static ssize_t show_##name##_list(struct + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, char *buf) \ + { \ +- unsigned int cpu = dev->id; \ +- cpumask_t mask = topology_##name(cpu); \ +- return show_cpumap(0, &mask, buf); \ ++ return show_cpumap(0, topology_##name(dev->id), buf); \ + } + + #define define_siblings_show_list(name) \ +@@ -92,9 +93,7 @@ static ssize_t 
show_##name##_list(struct + struct sysdev_attribute *attr, \ + char *buf) \ + { \ +- unsigned int cpu = dev->id; \ +- cpumask_t mask = topology_##name(cpu); \ +- return show_cpumap(1, &mask, buf); \ ++ return show_cpumap(1, topology_##name(dev->id), buf); \ + } + #endif + +@@ -107,13 +106,13 @@ define_one_ro(physical_package_id); + define_id_show_func(core_id); + define_one_ro(core_id); + +-define_siblings_show_func(thread_siblings); +-define_one_ro(thread_siblings); +-define_one_ro(thread_siblings_list); +- +-define_siblings_show_func(core_siblings); +-define_one_ro(core_siblings); +-define_one_ro(core_siblings_list); ++define_siblings_show_func(thread_cpumask); ++define_one_ro_named(thread_siblings, show_thread_cpumask); ++define_one_ro_named(thread_siblings_list, show_thread_cpumask_list); ++ ++define_siblings_show_func(core_cpumask); ++define_one_ro_named(core_siblings, show_core_cpumask); ++define_one_ro_named(core_siblings_list, show_core_cpumask_list); + + static struct attribute *default_attrs[] = { + &attr_physical_package_id.attr, +Index: linux-2.6-tip/drivers/block/DAC960.c +=================================================================== +--- linux-2.6-tip.orig/drivers/block/DAC960.c ++++ linux-2.6-tip/drivers/block/DAC960.c +@@ -6646,7 +6646,8 @@ static long DAC960_gam_ioctl(struct file + (DAC960_ControllerInfo_T __user *) Argument; + DAC960_ControllerInfo_T ControllerInfo; + DAC960_Controller_T *Controller; +- int ControllerNumber; ++ int uninitialized_var(ControllerNumber); ++ + if (UserSpaceControllerInfo == NULL) + ErrorCode = -EINVAL; + else ErrorCode = get_user(ControllerNumber, +Index: linux-2.6-tip/drivers/char/ip2/ip2main.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/ip2/ip2main.c ++++ linux-2.6-tip/drivers/char/ip2/ip2main.c +@@ -3202,4 +3202,4 @@ static struct pci_device_id ip2main_pci_ + { } + }; + +-MODULE_DEVICE_TABLE(pci, ip2main_pci_tbl); ++MODULE_STATIC_DEVICE_TABLE(pci, ip2main_pci_tbl); +Index: linux-2.6-tip/drivers/char/ipmi/ipmi_msghandler.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/ipmi/ipmi_msghandler.c ++++ linux-2.6-tip/drivers/char/ipmi/ipmi_msghandler.c +@@ -1796,7 +1796,8 @@ int ipmi_request_settime(ipmi_user_t + int retries, + unsigned int retry_time_ms) + { +- unsigned char saddr, lun; ++ unsigned char uninitialized_var(saddr), ++ uninitialized_var(lun); + int rv; + + if (!user) +@@ -1828,7 +1829,8 @@ int ipmi_request_supply_msgs(ipmi_user_t + struct ipmi_recv_msg *supplied_recv, + int priority) + { +- unsigned char saddr, lun; ++ unsigned char uninitialized_var(saddr), ++ uninitialized_var(lun); + int rv; + + if (!user) +Index: linux-2.6-tip/drivers/char/isicom.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/isicom.c ++++ linux-2.6-tip/drivers/char/isicom.c +@@ -1585,7 +1585,7 @@ static unsigned int card_count; + static int __devinit isicom_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) + { +- unsigned int signature, index; ++ unsigned int uninitialized_var(signature), index; + int retval = -EPERM; + struct isi_board *board = NULL; + +Index: linux-2.6-tip/drivers/char/random.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/random.c ++++ linux-2.6-tip/drivers/char/random.c +@@ -241,6 +241,10 @@ + #include + #include + ++#ifdef CONFIG_GENERIC_HARDIRQS ++# include ++#endif ++ + #include + 
#include + #include +@@ -558,7 +562,7 @@ struct timer_rand_state { + unsigned dont_count_entropy:1; + }; + +-#ifndef CONFIG_SPARSE_IRQ ++#ifndef CONFIG_GENERIC_HARDIRQS + + static struct timer_rand_state *irq_timer_state[NR_IRQS]; + +@@ -619,8 +623,11 @@ static void add_timer_randomness(struct + preempt_disable(); + /* if over the trickle threshold, use only 1 in 4096 samples */ + if (input_pool.entropy_count > trickle_thresh && +- (__get_cpu_var(trickle_count)++ & 0xfff)) +- goto out; ++ (__get_cpu_var(trickle_count)++ & 0xfff)) { ++ preempt_enable(); ++ return; ++ } ++ preempt_enable(); + + sample.jiffies = jiffies; + sample.cycles = get_cycles(); +@@ -662,8 +669,6 @@ static void add_timer_randomness(struct + credit_entropy_bits(&input_pool, + min_t(int, fls(delta>>1), 11)); + } +-out: +- preempt_enable(); + } + + void add_input_randomness(unsigned int type, unsigned int code, +Index: linux-2.6-tip/drivers/char/rocket.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/rocket.c ++++ linux-2.6-tip/drivers/char/rocket.c +@@ -150,12 +150,14 @@ static Word_t aiop_intr_bits[AIOP_CTL_SI + AIOP_INTR_BIT_3 + }; + ++#ifdef CONFIG_PCI + static Word_t upci_aiop_intr_bits[AIOP_CTL_SIZE] = { + UPCI_AIOP_INTR_BIT_0, + UPCI_AIOP_INTR_BIT_1, + UPCI_AIOP_INTR_BIT_2, + UPCI_AIOP_INTR_BIT_3 + }; ++#endif + + static Byte_t RData[RDATASIZE] = { + 0x00, 0x09, 0xf6, 0x82, +@@ -227,7 +229,6 @@ static unsigned long nextLineNumber; + static int __init init_ISA(int i); + static void rp_wait_until_sent(struct tty_struct *tty, int timeout); + static void rp_flush_buffer(struct tty_struct *tty); +-static void rmSpeakerReset(CONTROLLER_T * CtlP, unsigned long model); + static unsigned char GetLineNumber(int ctrl, int aiop, int ch); + static unsigned char SetLineNumber(int ctrl, int aiop, int ch); + static void rp_start(struct tty_struct *tty); +@@ -241,11 +242,14 @@ static void sDisInterrupts(CHANNEL_T * C + static void sModemReset(CONTROLLER_T * CtlP, int chan, int on); + static void sPCIModemReset(CONTROLLER_T * CtlP, int chan, int on); + static int sWriteTxPrioByte(CHANNEL_T * ChP, Byte_t Data); ++#ifdef CONFIG_PCI ++static void rmSpeakerReset(CONTROLLER_T * CtlP, unsigned long model); + static int sPCIInitController(CONTROLLER_T * CtlP, int CtlNum, + ByteIO_t * AiopIOList, int AiopIOListSize, + WordIO_t ConfigIO, int IRQNum, Byte_t Frequency, + int PeriodicOnly, int altChanRingIndicator, + int UPCIRingInd); ++#endif + static int sInitController(CONTROLLER_T * CtlP, int CtlNum, ByteIO_t MudbacIO, + ByteIO_t * AiopIOList, int AiopIOListSize, + int IRQNum, Byte_t Frequency, int PeriodicOnly); +@@ -1751,7 +1755,7 @@ static struct pci_device_id __devinitdat + { PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_ANY_ID) }, + { } + }; +-MODULE_DEVICE_TABLE(pci, rocket_pci_ids); ++MODULE_STATIC_DEVICE_TABLE(pci, rocket_pci_ids); + + /* + * Called when a PCI card is found. 
Retrieves and stores model information, +@@ -2533,6 +2537,7 @@ static int sInitController(CONTROLLER_T + return (CtlP->NumAiop); + } + ++#ifdef CONFIG_PCI + /*************************************************************************** + Function: sPCIInitController + Purpose: Initialization of controller global registers and controller +@@ -2652,6 +2657,7 @@ static int sPCIInitController(CONTROLLER + else + return (CtlP->NumAiop); + } ++#endif /* CONFIG_PCI */ + + /*************************************************************************** + Function: sReadAiopID +@@ -3142,6 +3148,7 @@ static void sPCIModemReset(CONTROLLER_T + sOutB(addr + chan, 0); /* apply or remove reset */ + } + ++#ifdef CONFIG_PCI + /* Resets the speaker controller on RocketModem II and III devices */ + static void rmSpeakerReset(CONTROLLER_T * CtlP, unsigned long model) + { +@@ -3160,6 +3167,7 @@ static void rmSpeakerReset(CONTROLLER_T + sOutB(addr, 0); + } + } ++#endif /* CONFIG_PCI */ + + /* Returns the line number given the controller (board), aiop and channel number */ + static unsigned char GetLineNumber(int ctrl, int aiop, int ch) +Index: linux-2.6-tip/drivers/char/rtc.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/rtc.c ++++ linux-2.6-tip/drivers/char/rtc.c +@@ -188,7 +188,9 @@ static int rtc_proc_open(struct inode *i + * timer (but you would need to have an awful timing before you'd trip on it) + */ + static unsigned long rtc_status; /* bitmapped status byte. */ ++#if defined(RTC_IRQ) || defined(CONFIG_PROC_FS) + static unsigned long rtc_freq; /* Current periodic IRQ rate */ ++#endif + static unsigned long rtc_irq_data; /* our output to the world */ + static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */ + +@@ -1074,7 +1076,9 @@ no_irq: + #endif + + #if defined(__alpha__) || defined(__mips__) ++#ifdef CONFIG_PROC_FS + rtc_freq = HZ; ++#endif + + /* Each operating system on an Alpha uses its own epoch. + Let's try to guess which one we are using now. 
*/ +@@ -1197,10 +1201,12 @@ static void rtc_dropped_irq(unsigned lon + + spin_unlock_irq(&rtc_lock); + ++#ifndef CONFIG_PREEMPT_RT + if (printk_ratelimit()) { + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + freq); + } ++#endif + + /* Now we have new data */ + wake_up_interruptible(&rtc_wait); +Index: linux-2.6-tip/drivers/char/specialix.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/specialix.c ++++ linux-2.6-tip/drivers/char/specialix.c +@@ -2359,7 +2359,7 @@ static struct pci_device_id specialx_pci + { PCI_DEVICE(PCI_VENDOR_ID_SPECIALIX, PCI_DEVICE_ID_SPECIALIX_IO8) }, + { } + }; +-MODULE_DEVICE_TABLE(pci, specialx_pci_tbl); ++MODULE_STATIC_DEVICE_TABLE(pci, specialx_pci_tbl); + + module_init(specialix_init_module); + module_exit(specialix_exit_module); +Index: linux-2.6-tip/drivers/char/sysrq.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/sysrq.c ++++ linux-2.6-tip/drivers/char/sysrq.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -35,7 +36,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + +@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int ke + struct pt_regs *regs = get_irq_regs(); + if (regs) + show_regs(regs); ++ perf_counter_print_debug(); + } + static struct sysrq_key_op sysrq_showregs_op = { + .handler = sysrq_handle_showregs, +@@ -283,7 +285,7 @@ static void sysrq_ftrace_dump(int key, s + } + static struct sysrq_key_op sysrq_ftrace_dump_op = { + .handler = sysrq_ftrace_dump, +- .help_msg = "dumpZ-ftrace-buffer", ++ .help_msg = "dump-ftrace-buffer(Z)", + .action_msg = "Dump ftrace buffer", + .enable_mask = SYSRQ_ENABLE_DUMP, + }; +Index: linux-2.6-tip/drivers/clocksource/acpi_pm.c +=================================================================== +--- linux-2.6-tip.orig/drivers/clocksource/acpi_pm.c ++++ linux-2.6-tip/drivers/clocksource/acpi_pm.c +@@ -143,7 +143,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SE + #endif + + #ifndef CONFIG_X86_64 +-#include "mach_timer.h" ++#include + #define PMTMR_EXPECTED_RATE \ + ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) + /* +Index: linux-2.6-tip/drivers/clocksource/cyclone.c +=================================================================== +--- linux-2.6-tip.orig/drivers/clocksource/cyclone.c ++++ linux-2.6-tip/drivers/clocksource/cyclone.c +@@ -7,7 +7,7 @@ + #include + #include + +-#include "mach_timer.h" ++#include + + #define CYCLONE_CBAR_ADDR 0xFEB00CD0 /* base address ptr */ + #define CYCLONE_PMCC_OFFSET 0x51A0 /* offset to control register */ +Index: linux-2.6-tip/drivers/eisa/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/eisa/Kconfig ++++ linux-2.6-tip/drivers/eisa/Kconfig +@@ -3,7 +3,7 @@ + # + config EISA_VLB_PRIMING + bool "Vesa Local Bus priming" +- depends on X86_PC && EISA ++ depends on X86 && EISA + default n + ---help--- + Activate this option if your system contains a Vesa Local +@@ -24,11 +24,11 @@ config EISA_PCI_EISA + When in doubt, say Y. + + # Using EISA_VIRTUAL_ROOT on something other than an Alpha or +-# an X86_PC may lead to crashes... ++# an X86 may lead to crashes... 
+ + config EISA_VIRTUAL_ROOT + bool "EISA virtual root device" +- depends on EISA && (ALPHA || X86_PC) ++ depends on EISA && (ALPHA || X86) + default y + ---help--- + Activate this option if your system only have EISA bus +Index: linux-2.6-tip/drivers/firmware/dcdbas.c +=================================================================== +--- linux-2.6-tip.orig/drivers/firmware/dcdbas.c ++++ linux-2.6-tip/drivers/firmware/dcdbas.c +@@ -244,7 +244,7 @@ static ssize_t host_control_on_shutdown_ + */ + int dcdbas_smi_request(struct smi_cmd *smi_cmd) + { +- cpumask_t old_mask; ++ cpumask_var_t old_mask; + int ret = 0; + + if (smi_cmd->magic != SMI_CMD_MAGIC) { +@@ -254,8 +254,11 @@ int dcdbas_smi_request(struct smi_cmd *s + } + + /* SMI requires CPU 0 */ +- old_mask = current->cpus_allowed; +- set_cpus_allowed_ptr(current, &cpumask_of_cpu(0)); ++ if (!alloc_cpumask_var(&old_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ cpumask_copy(old_mask, ¤t->cpus_allowed); ++ set_cpus_allowed_ptr(current, cpumask_of(0)); + if (smp_processor_id() != 0) { + dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", + __func__); +@@ -275,7 +278,8 @@ int dcdbas_smi_request(struct smi_cmd *s + ); + + out: +- set_cpus_allowed_ptr(current, &old_mask); ++ set_cpus_allowed_ptr(current, old_mask); ++ free_cpumask_var(old_mask); + return ret; + } + +Index: linux-2.6-tip/drivers/firmware/iscsi_ibft.c +=================================================================== +--- linux-2.6-tip.orig/drivers/firmware/iscsi_ibft.c ++++ linux-2.6-tip/drivers/firmware/iscsi_ibft.c +@@ -938,8 +938,8 @@ static int __init ibft_init(void) + return -ENOMEM; + + if (ibft_addr) { +- printk(KERN_INFO "iBFT detected at 0x%lx.\n", +- virt_to_phys((void *)ibft_addr)); ++ printk(KERN_INFO "iBFT detected at 0x%llx.\n", ++ (u64)virt_to_phys((void *)ibft_addr)); + + rc = ibft_check_device(); + if (rc) +Index: linux-2.6-tip/drivers/gpu/drm/drm_proc.c +=================================================================== +--- linux-2.6-tip.orig/drivers/gpu/drm/drm_proc.c ++++ linux-2.6-tip/drivers/gpu/drm/drm_proc.c +@@ -678,9 +678,9 @@ static int drm__vma_info(char *buf, char + *start = &buf[offset]; + *eof = 0; + +- DRM_PROC_PRINT("vma use count: %d, high_memory = %p, 0x%08lx\n", ++ DRM_PROC_PRINT("vma use count: %d, high_memory = %p, 0x%llx\n", + atomic_read(&dev->vma_count), +- high_memory, virt_to_phys(high_memory)); ++ high_memory, (u64)virt_to_phys(high_memory)); + list_for_each_entry(pt, &dev->vmalist, head) { + if (!(vma = pt->vma)) + continue; +Index: linux-2.6-tip/drivers/hwmon/adt7473.c +=================================================================== +--- linux-2.6-tip.orig/drivers/hwmon/adt7473.c ++++ linux-2.6-tip/drivers/hwmon/adt7473.c +@@ -848,6 +848,8 @@ static ssize_t show_pwm_auto_temp(struct + } + /* shouldn't ever get here */ + BUG(); ++ ++ return 0; + } + + static ssize_t set_pwm_auto_temp(struct device *dev, +Index: linux-2.6-tip/drivers/hwmon/i5k_amb.c +=================================================================== +--- linux-2.6-tip.orig/drivers/hwmon/i5k_amb.c ++++ linux-2.6-tip/drivers/hwmon/i5k_amb.c +@@ -480,7 +480,7 @@ static unsigned long i5k_channel_pci_id( + case PCI_DEVICE_ID_INTEL_5400_ERR: + return PCI_DEVICE_ID_INTEL_5400_FBD0 + channel; + default: +- BUG(); ++ panic("i5k_channel_pci_id: unknown chipset!"); + } + } + +Index: linux-2.6-tip/drivers/i2c/busses/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/i2c/busses/Kconfig ++++ 
linux-2.6-tip/drivers/i2c/busses/Kconfig +@@ -56,6 +56,9 @@ config I2C_AMD756 + config I2C_AMD756_S4882 + tristate "SMBus multiplexing on the Tyan S4882" + depends on I2C_AMD756 && X86 && EXPERIMENTAL ++ # broke an Athlon 64 X2 Asus A8N-E with: ++ # http://redhat.com/~mingo/misc/config-Thu_Jul_17_11_34_08_CEST_2008.bad ++ depends on 0 + help + Enabling this option will add specific SMBus support for the Tyan + S4882 motherboard. On this 4-CPU board, the SMBus is multiplexed +@@ -150,6 +153,9 @@ config I2C_NFORCE2 + config I2C_NFORCE2_S4985 + tristate "SMBus multiplexing on the Tyan S4985" + depends on I2C_NFORCE2 && X86 && EXPERIMENTAL ++ # broke a T60 Core2Duo with: ++ # http://redhat.com/~mingo/misc/config-Thu_Jul_17_10_47_42_CEST_2008.bad ++ depends on 0 + help + Enabling this option will add specific SMBus support for the Tyan + S4985 motherboard. On this 4-CPU board, the SMBus is multiplexed +Index: linux-2.6-tip/drivers/ieee1394/csr1212.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ieee1394/csr1212.c ++++ linux-2.6-tip/drivers/ieee1394/csr1212.c +@@ -35,6 +35,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -387,6 +388,7 @@ csr1212_new_descriptor_leaf(u8 dtype, u3 + if (!kv) + return NULL; + ++ kmemcheck_annotate_bitfield(kv->value.leaf.data[0]); + CSR1212_DESCRIPTOR_LEAF_SET_TYPE(kv, dtype); + CSR1212_DESCRIPTOR_LEAF_SET_SPECIFIER_ID(kv, specifier_id); + +Index: linux-2.6-tip/drivers/ieee1394/nodemgr.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ieee1394/nodemgr.c ++++ linux-2.6-tip/drivers/ieee1394/nodemgr.c +@@ -10,6 +10,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -39,7 +40,10 @@ struct nodemgr_csr_info { + struct hpsb_host *host; + nodeid_t nodeid; + unsigned int generation; +- unsigned int speed_unverified:1; ++ ++ kmemcheck_define_bitfield(flags, { ++ unsigned int speed_unverified:1; ++ }); + }; + + +@@ -1295,6 +1299,7 @@ static void nodemgr_node_scan_one(struct + ci = kmalloc(sizeof(*ci), GFP_KERNEL); + if (!ci) + return; ++ kmemcheck_annotate_bitfield(ci->flags); + + ci->host = host; + ci->nodeid = nodeid; +Index: linux-2.6-tip/drivers/infiniband/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/infiniband/Kconfig ++++ linux-2.6-tip/drivers/infiniband/Kconfig +@@ -2,6 +2,7 @@ menuconfig INFINIBAND + tristate "InfiniBand support" + depends on PCI || BROKEN + depends on HAS_IOMEM ++ depends on 0 + ---help--- + Core support for InfiniBand (IB). 
Make sure to also select + any protocols you wish to use as well as drivers for your +Index: linux-2.6-tip/drivers/infiniband/hw/amso1100/c2_vq.c +=================================================================== +--- linux-2.6-tip.orig/drivers/infiniband/hw/amso1100/c2_vq.c ++++ linux-2.6-tip/drivers/infiniband/hw/amso1100/c2_vq.c +@@ -107,7 +107,7 @@ struct c2_vq_req *vq_req_alloc(struct c2 + r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL); + if (r) { + init_waitqueue_head(&r->wait_object); +- r->reply_msg = (u64) NULL; ++ r->reply_msg = (u64) (long) NULL; + r->event = 0; + r->cm_id = NULL; + r->qp = NULL; +@@ -123,7 +123,7 @@ struct c2_vq_req *vq_req_alloc(struct c2 + */ + void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r) + { +- r->reply_msg = (u64) NULL; ++ r->reply_msg = (u64) (long) NULL; + if (atomic_dec_and_test(&r->refcnt)) { + kfree(r); + } +@@ -151,7 +151,7 @@ void vq_req_get(struct c2_dev *c2dev, st + void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r) + { + if (atomic_dec_and_test(&r->refcnt)) { +- if (r->reply_msg != (u64) NULL) ++ if (r->reply_msg != (u64) (long) NULL) + vq_repbuf_free(c2dev, + (void *) (unsigned long) r->reply_msg); + kfree(r); +@@ -258,3 +258,4 @@ void vq_repbuf_free(struct c2_dev *c2dev + { + kmem_cache_free(c2dev->host_msg_cache, reply); + } ++ +Index: linux-2.6-tip/drivers/infiniband/hw/ipath/ipath_driver.c +=================================================================== +--- linux-2.6-tip.orig/drivers/infiniband/hw/ipath/ipath_driver.c ++++ linux-2.6-tip/drivers/infiniband/hw/ipath/ipath_driver.c +@@ -2715,7 +2715,7 @@ static void ipath_hol_signal_up(struct i + * to prevent HoL blocking, then start the HoL timer that + * periodically continues, then stop procs, so they can detect + * link down if they want, and do something about it. +- * Timer may already be running, so use __mod_timer, not add_timer. ++ * Timer may already be running, so use mod_timer, not add_timer. + */ + void ipath_hol_down(struct ipath_devdata *dd) + { +@@ -2724,7 +2724,7 @@ void ipath_hol_down(struct ipath_devdata + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); +- __mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); ++ mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); + } + + /* +@@ -2763,7 +2763,7 @@ void ipath_hol_event(unsigned long opaqu + else { + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); +- __mod_timer(&dd->ipath_hol_timer, ++ mod_timer(&dd->ipath_hol_timer, + dd->ipath_hol_timer.expires); + } + } +Index: linux-2.6-tip/drivers/input/keyboard/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/input/keyboard/Kconfig ++++ linux-2.6-tip/drivers/input/keyboard/Kconfig +@@ -13,11 +13,11 @@ menuconfig INPUT_KEYBOARD + if INPUT_KEYBOARD + + config KEYBOARD_ATKBD +- tristate "AT keyboard" if EMBEDDED || !X86_PC ++ tristate "AT keyboard" if EMBEDDED || !X86 + default y + select SERIO + select SERIO_LIBPS2 +- select SERIO_I8042 if X86_PC ++ select SERIO_I8042 if X86 + select SERIO_GSCPS2 if GSC + help + Say Y here if you want to use a standard AT or PS/2 keyboard. 
Usually +Index: linux-2.6-tip/drivers/input/mouse/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/input/mouse/Kconfig ++++ linux-2.6-tip/drivers/input/mouse/Kconfig +@@ -17,7 +17,7 @@ config MOUSE_PS2 + default y + select SERIO + select SERIO_LIBPS2 +- select SERIO_I8042 if X86_PC ++ select SERIO_I8042 if X86 + select SERIO_GSCPS2 if GSC + help + Say Y here if you have a PS/2 mouse connected to your system. This +Index: linux-2.6-tip/drivers/input/touchscreen/htcpen.c +=================================================================== +--- linux-2.6-tip.orig/drivers/input/touchscreen/htcpen.c ++++ linux-2.6-tip/drivers/input/touchscreen/htcpen.c +@@ -47,12 +47,6 @@ static int invert_y; + module_param(invert_y, bool, 0644); + MODULE_PARM_DESC(invert_y, "If set, Y axis is inverted"); + +-static struct pnp_device_id pnp_ids[] = { +- { .id = "PNP0cc0" }, +- { .id = "" } +-}; +-MODULE_DEVICE_TABLE(pnp, pnp_ids); +- + static irqreturn_t htcpen_interrupt(int irq, void *handle) + { + struct input_dev *htcpen_dev = handle; +@@ -253,3 +247,4 @@ static void __exit htcpen_isa_exit(void) + + module_init(htcpen_isa_init); + module_exit(htcpen_isa_exit); ++ +Index: linux-2.6-tip/drivers/isdn/capi/capidrv.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/capi/capidrv.c ++++ linux-2.6-tip/drivers/isdn/capi/capidrv.c +@@ -1551,8 +1551,8 @@ static int decodeFVteln(char *teln, unsi + + static int FVteln2capi20(char *teln, u8 AdditionalInfo[1+2+2+31]) + { +- unsigned long bmask; +- int active; ++ unsigned long uninitialized_var(bmask); ++ int uninitialized_var(active); + int rc, i; + + rc = decodeFVteln(teln, &bmask, &active); +Index: linux-2.6-tip/drivers/isdn/hardware/eicon/maintidi.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/hardware/eicon/maintidi.c ++++ linux-2.6-tip/drivers/isdn/hardware/eicon/maintidi.c +@@ -959,7 +959,7 @@ static int process_idi_event (diva_strac + } + if (!strncmp("State\\Layer2 No1", path, pVar->path_length)) { + char* tmp = &pLib->lines[0].pInterface->Layer2[0]; +- dword l2_state; ++ dword uninitialized_var(l2_state); + diva_strace_read_uint (pVar, &l2_state); + + switch (l2_state) { +Index: linux-2.6-tip/drivers/isdn/hardware/eicon/message.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/hardware/eicon/message.c ++++ linux-2.6-tip/drivers/isdn/hardware/eicon/message.c +@@ -2682,7 +2682,7 @@ byte connect_b3_req(dword Id, word Numbe + if (!(fax_control_bits & T30_CONTROL_BIT_MORE_DOCUMENTS) + || (fax_feature_bits & T30_FEATURE_BIT_MORE_DOCUMENTS)) + { +- len = (byte)(&(((T30_INFO *) 0)->universal_6)); ++ len = (byte)(offsetof(T30_INFO, universal_6)); + fax_info_change = false; + if (ncpi->length >= 4) + { +@@ -2744,7 +2744,7 @@ byte connect_b3_req(dword Id, word Numbe + for (i = 0; i < w; i++) + ((T30_INFO *)(plci->fax_connect_info_buffer))->station_id[i] = fax_parms[4].info[1+i]; + ((T30_INFO *)(plci->fax_connect_info_buffer))->head_line_len = 0; +- len = (byte)(((T30_INFO *) 0)->station_id + 20); ++ len = (byte)(offsetof(T30_INFO, station_id) + 20); + w = fax_parms[5].length; + if (w > 20) + w = 20; +@@ -2778,7 +2778,7 @@ byte connect_b3_req(dword Id, word Numbe + } + else + { +- len = (byte)(&(((T30_INFO *) 0)->universal_6)); ++ len = (byte)(offsetof(T30_INFO, universal_6)); + } + fax_info_change = true; + +@@ -2881,7 +2881,7 @@ byte 
connect_b3_res(dword Id, word Numbe + && (plci->nsf_control_bits & T30_NSF_CONTROL_BIT_ENABLE_NSF) + && (plci->nsf_control_bits & T30_NSF_CONTROL_BIT_NEGOTIATE_RESP)) + { +- len = ((byte)(((T30_INFO *) 0)->station_id + 20)); ++ len = (byte)(offsetof(T30_INFO, station_id) + 20); + if (plci->fax_connect_info_length < len) + { + ((T30_INFO *)(plci->fax_connect_info_buffer))->station_id_len = 0; +@@ -3782,7 +3782,7 @@ static byte manufacturer_res(dword Id, w + break; + } + ncpi = &m_parms[1]; +- len = ((byte)(((T30_INFO *) 0)->station_id + 20)); ++ len = (byte)(offsetof(T30_INFO, station_id) + 20); + if (plci->fax_connect_info_length < len) + { + ((T30_INFO *)(plci->fax_connect_info_buffer))->station_id_len = 0; +@@ -6485,7 +6485,7 @@ static void nl_ind(PLCI *plci) + word info = 0; + word fax_feature_bits; + byte fax_send_edata_ack; +- static byte v120_header_buffer[2 + 3]; ++ static byte v120_header_buffer[2 + 3] __attribute__ ((aligned(8))); + static word fax_info[] = { + 0, /* T30_SUCCESS */ + _FAX_NO_CONNECTION, /* T30_ERR_NO_DIS_RECEIVED */ +@@ -6824,7 +6824,7 @@ static void nl_ind(PLCI *plci) + if ((plci->requested_options_conn | plci->requested_options | a->requested_options_table[plci->appl->Id-1]) + & ((1L << PRIVATE_FAX_SUB_SEP_PWD) | (1L << PRIVATE_FAX_NONSTANDARD))) + { +- i = ((word)(((T30_INFO *) 0)->station_id + 20)) + ((T30_INFO *)plci->NL.RBuffer->P)->head_line_len; ++ i = ((word)(offsetof(T30_INFO, station_id) + 20)) + ((T30_INFO *)plci->NL.RBuffer->P)->head_line_len; + while (i < plci->NL.RBuffer->length) + plci->ncpi_buffer[++len] = plci->NL.RBuffer->P[i++]; + } +@@ -7216,7 +7216,7 @@ static void nl_ind(PLCI *plci) + { + plci->RData[1].P = plci->RData[0].P; + plci->RData[1].PLength = plci->RData[0].PLength; +- plci->RData[0].P = v120_header_buffer + (-((int) v120_header_buffer) & 3); ++ plci->RData[0].P = v120_header_buffer; + if ((plci->NL.RBuffer->P[0] & V120_HEADER_EXTEND_BIT) || (plci->NL.RLength == 1)) + plci->RData[0].PLength = 1; + else +@@ -8395,6 +8395,7 @@ static word add_b23(PLCI *plci, API_PARS + /* copy head line to NLC */ + if(b3_config_parms[3].length) + { ++ byte *head_line = (void *) ((T30_INFO *)&nlc[1] + 1); + + pos = (byte)(fax_head_line_time (&(((T30_INFO *)&nlc[1])->station_id[20]))); + if (pos != 0) +@@ -8403,17 +8404,17 @@ static word add_b23(PLCI *plci, API_PARS + pos = 0; + else + { +- ((T30_INFO *)&nlc[1])->station_id[20 + pos++] = ' '; +- ((T30_INFO *)&nlc[1])->station_id[20 + pos++] = ' '; ++ head_line[pos++] = ' '; ++ head_line[pos++] = ' '; + len = (byte)b3_config_parms[2].length; + if (len > 20) + len = 20; + if (CAPI_MAX_DATE_TIME_LENGTH + 2 + len + 2 + b3_config_parms[3].length <= CAPI_MAX_HEAD_LINE_SPACE) + { + for (i = 0; i < len; i++) +- ((T30_INFO *)&nlc[1])->station_id[20 + pos++] = ((byte *)b3_config_parms[2].info)[1+i]; +- ((T30_INFO *)&nlc[1])->station_id[20 + pos++] = ' '; +- ((T30_INFO *)&nlc[1])->station_id[20 + pos++] = ' '; ++ head_line[pos++] = ((byte *)b3_config_parms[2].info)[1+i]; ++ head_line[pos++] = ' '; ++ head_line[pos++] = ' '; + } + } + } +@@ -8424,7 +8425,7 @@ static word add_b23(PLCI *plci, API_PARS + ((T30_INFO *)&nlc[1])->head_line_len = (byte)(pos + len); + nlc[0] += (byte)(pos + len); + for (i = 0; i < len; i++) +- ((T30_INFO *)&nlc[1])->station_id[20 + pos++] = ((byte *)b3_config_parms[3].info)[1+i]; ++ head_line[pos++] = ((byte *)b3_config_parms[3].info)[1+i]; + } + else + ((T30_INFO *)&nlc[1])->head_line_len = 0; +@@ -8453,7 +8454,7 @@ static word add_b23(PLCI *plci, API_PARS + fax_control_bits |= 
T30_CONTROL_BIT_ACCEPT_SEL_POLLING; + } + len = nlc[0]; +- pos = ((byte)(((T30_INFO *) 0)->station_id + 20)); ++ pos = (byte)(offsetof(T30_INFO, station_id) + 20); + if (pos < plci->fax_connect_info_length) + { + for (i = 1 + plci->fax_connect_info_buffer[pos]; i != 0; i--) +@@ -8505,7 +8506,7 @@ static word add_b23(PLCI *plci, API_PARS + } + + PUT_WORD(&(((T30_INFO *)&nlc[1])->control_bits_low), fax_control_bits); +- len = ((byte)(((T30_INFO *) 0)->station_id + 20)); ++ len = (byte)(offsetof(T30_INFO, station_id) + 20); + for (i = 0; i < len; i++) + plci->fax_connect_info_buffer[i] = nlc[1+i]; + ((T30_INFO *) plci->fax_connect_info_buffer)->head_line_len = 0; +@@ -15049,3 +15050,4 @@ static void diva_free_dma_descriptor (PL + } + + /*------------------------------------------------------------------*/ ++ +Index: linux-2.6-tip/drivers/isdn/hisax/config.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/hisax/config.c ++++ linux-2.6-tip/drivers/isdn/hisax/config.c +@@ -1980,7 +1980,7 @@ static struct pci_device_id hisax_pci_tb + { } /* Terminating entry */ + }; + +-MODULE_DEVICE_TABLE(pci, hisax_pci_tbl); ++MODULE_STATIC_DEVICE_TABLE(pci, hisax_pci_tbl); + #endif /* CONFIG_PCI */ + + module_init(HiSax_init); +Index: linux-2.6-tip/drivers/isdn/i4l/isdn_common.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/i4l/isdn_common.c ++++ linux-2.6-tip/drivers/isdn/i4l/isdn_common.c +@@ -1280,7 +1280,9 @@ isdn_ioctl(struct inode *inode, struct f + int ret; + int i; + char __user *p; ++#ifdef CONFIG_NETDEVICES + char *s; ++#endif + union iocpar { + char name[10]; + char bname[22]; +Index: linux-2.6-tip/drivers/isdn/i4l/isdn_ppp.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/i4l/isdn_ppp.c ++++ linux-2.6-tip/drivers/isdn/i4l/isdn_ppp.c +@@ -466,7 +466,7 @@ static int get_filter(void __user *arg, + *p = code; + return uprog.len; + } +-#endif /* CONFIG_IPPP_FILTER */ ++#endif + + /* + * ippp device ioctl +Index: linux-2.6-tip/drivers/isdn/icn/icn.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/icn/icn.c ++++ linux-2.6-tip/drivers/isdn/icn/icn.c +@@ -717,7 +717,7 @@ icn_sendbuf(int channel, int ack, struct + return 0; + if (card->sndcount[channel] > ICN_MAX_SQUEUE) + return 0; +- #warning TODO test headroom or use skb->nb to flag ACK ++ /* TODO test headroom or use skb->nb to flag ACK: */ + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + /* Push ACK flag as one +Index: linux-2.6-tip/drivers/isdn/mISDN/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/mISDN/Kconfig ++++ linux-2.6-tip/drivers/isdn/mISDN/Kconfig +@@ -4,6 +4,9 @@ + + menuconfig MISDN + tristate "Modular ISDN driver" ++ # broken with: ++ # http://redhat.com/~mingo/misc/config-Sun_Jul_27_08_30_16_CEST_2008.bad ++ depends on 0 + help + Enable support for the modular ISDN driver. + +Index: linux-2.6-tip/drivers/isdn/sc/card.h +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/sc/card.h ++++ linux-2.6-tip/drivers/isdn/sc/card.h +@@ -82,7 +82,7 @@ typedef struct { + int ioport[MAX_IO_REGS]; /* Index to I/O ports */ + int shmem_pgport; /* port for the exp mem page reg. 
*/ + int shmem_magic; /* adapter magic number */ +- unsigned int rambase; /* Shared RAM base address */ ++ u8 __iomem *rambase; /* Shared RAM base address */ + unsigned int ramsize; /* Size of shared memory */ + RspMessage async_msg; /* Async response message */ + int want_async_messages; /* Snoop the Q ? */ +Index: linux-2.6-tip/drivers/isdn/sc/init.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/sc/init.c ++++ linux-2.6-tip/drivers/isdn/sc/init.c +@@ -27,7 +27,7 @@ static const char *boardname[] = { "Data + /* insmod set parameters */ + static unsigned int io[] = {0,0,0,0}; + static unsigned char irq[] = {0,0,0,0}; +-static unsigned long ram[] = {0,0,0,0}; ++static u8 __iomem * ram[] = {0,0,0,0}; + static int do_reset = 0; + + module_param_array(io, int, NULL, 0); +@@ -35,7 +35,7 @@ module_param_array(irq, int, NULL, 0); + module_param_array(ram, int, NULL, 0); + module_param(do_reset, bool, 0); + +-static int identify_board(unsigned long, unsigned int); ++static int identify_board(u8 __iomem *rambase, unsigned int iobase); + + static int __init sc_init(void) + { +@@ -153,7 +153,7 @@ static int __init sc_init(void) + outb(0xFF, io[b] + RESET_OFFSET); + msleep_interruptible(10000); + } +- pr_debug("RAM Base for board %d is 0x%lx, %s probe\n", b, ++ pr_debug("RAM Base for board %d is %p, %s probe\n", b, + ram[b], ram[b] == 0 ? "will" : "won't"); + + if(ram[b]) { +@@ -162,10 +162,10 @@ static int __init sc_init(void) + * Just look for a signature and ID the + * board model + */ +- if(request_region(ram[b], SRAM_PAGESIZE, "sc test")) { +- pr_debug("request_region for RAM base 0x%lx succeeded\n", ram[b]); ++ if (request_region((unsigned long)ram[b], SRAM_PAGESIZE, "sc test")) { ++ pr_debug("request_region for RAM base %p succeeded\n", ram[b]); + model = identify_board(ram[b], io[b]); +- release_region(ram[b], SRAM_PAGESIZE); ++ release_region((unsigned long)ram[b], SRAM_PAGESIZE); + } + } + else { +@@ -177,12 +177,12 @@ static int __init sc_init(void) + pr_debug("Checking RAM address 0x%x...\n", i); + if(request_region(i, SRAM_PAGESIZE, "sc test")) { + pr_debug(" request_region succeeded\n"); +- model = identify_board(i, io[b]); ++ model = identify_board((u8 __iomem *)i, io[b]); + release_region(i, SRAM_PAGESIZE); + if (model >= 0) { + pr_debug(" Identified a %s\n", + boardname[model]); +- ram[b] = i; ++ ram[b] = (u8 __iomem *)i; + break; + } + pr_debug(" Unidentifed or inaccessible\n"); +@@ -199,7 +199,7 @@ static int __init sc_init(void) + * Nope, there was no place in RAM for the + * board, or it couldn't be identified + */ +- pr_debug("Failed to find an adapter at 0x%lx\n", ram[b]); ++ pr_debug("Failed to find an adapter at %p\n", ram[b]); + continue; + } + +@@ -222,7 +222,7 @@ static int __init sc_init(void) + features = BRI_FEATURES; + break; + } +- switch(ram[b] >> 12 & 0x0F) { ++ switch((unsigned long)ram[b] >> 12 & 0x0F) { + case 0x0: + pr_debug("RAM Page register set to EXP_PAGE0\n"); + pgport = EXP_PAGE0; +@@ -358,10 +358,10 @@ static int __init sc_init(void) + pr_debug("Requesting I/O Port %#x\n", + sc_adapter[cinst]->ioport[IRQ_SELECT]); + sc_adapter[cinst]->rambase = ram[b]; +- request_region(sc_adapter[cinst]->rambase, SRAM_PAGESIZE, +- interface->id); ++ request_region((unsigned long)sc_adapter[cinst]->rambase, ++ SRAM_PAGESIZE, interface->id); + +- pr_info(" %s (%d) - %s %d channels IRQ %d, I/O Base 0x%x, RAM Base 0x%lx\n", ++ pr_info(" %s (%d) - %s %d channels IRQ %d, I/O Base 0x%x, RAM Base %p\n", + 
sc_adapter[cinst]->devicename, + sc_adapter[cinst]->driverId, + boardname[model], channels, irq[b], io[b], ram[b]); +@@ -400,7 +400,7 @@ static void __exit sc_exit(void) + /* + * Release shared RAM + */ +- release_region(sc_adapter[i]->rambase, SRAM_PAGESIZE); ++ release_region((unsigned long)sc_adapter[i]->rambase, SRAM_PAGESIZE); + + /* + * Release the IRQ +@@ -434,7 +434,7 @@ static void __exit sc_exit(void) + pr_info("SpellCaster ISA ISDN Adapter Driver Unloaded.\n"); + } + +-static int identify_board(unsigned long rambase, unsigned int iobase) ++static int identify_board(u8 __iomem *rambase, unsigned int iobase) + { + unsigned int pgport; + unsigned long sig; +@@ -444,15 +444,15 @@ static int identify_board(unsigned long + HWConfig_pl hwci; + int x; + +- pr_debug("Attempting to identify adapter @ 0x%lx io 0x%x\n", ++ pr_debug("Attempting to identify adapter @ %p io 0x%x\n", + rambase, iobase); + + /* + * Enable the base pointer + */ +- outb(rambase >> 12, iobase + 0x2c00); ++ outb((unsigned long)rambase >> 12, iobase + 0x2c00); + +- switch(rambase >> 12 & 0x0F) { ++ switch((unsigned long)rambase >> 12 & 0x0F) { + case 0x0: + pgport = iobase + PG0_OFFSET; + pr_debug("Page Register offset is 0x%x\n", PG0_OFFSET); +@@ -473,7 +473,7 @@ static int identify_board(unsigned long + pr_debug("Page Register offset is 0x%x\n", PG3_OFFSET); + break; + default: +- pr_debug("Invalid rambase 0x%lx\n", rambase); ++ pr_debug("Invalid rambase %p\n", rambase); + return -1; + } + +Index: linux-2.6-tip/drivers/isdn/sc/scioc.h +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/sc/scioc.h ++++ linux-2.6-tip/drivers/isdn/sc/scioc.h +@@ -86,7 +86,7 @@ typedef struct { + char load_ver[11]; + char proc_ver[11]; + int iobase; +- long rambase; ++ u8 __iomem *rambase; + char irq; + long ramsize; + char interface; +Index: linux-2.6-tip/drivers/isdn/sc/timer.c +=================================================================== +--- linux-2.6-tip.orig/drivers/isdn/sc/timer.c ++++ linux-2.6-tip/drivers/isdn/sc/timer.c +@@ -27,7 +27,7 @@ + static void setup_ports(int card) + { + +- outb((sc_adapter[card]->rambase >> 12), sc_adapter[card]->ioport[EXP_BASE]); ++ outb(((long)sc_adapter[card]->rambase >> 12), sc_adapter[card]->ioport[EXP_BASE]); + + /* And the IRQ */ + outb((sc_adapter[card]->interrupt | 0x80), +Index: linux-2.6-tip/drivers/lguest/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/lguest/Kconfig ++++ linux-2.6-tip/drivers/lguest/Kconfig +@@ -1,6 +1,6 @@ + config LGUEST + tristate "Linux hypervisor example code" +- depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX && !X86_VOYAGER ++ depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX + select HVC_DRIVER + ---help--- + This is a very simple module which allows you to run +Index: linux-2.6-tip/drivers/md/dm-raid1.c +=================================================================== +--- linux-2.6-tip.orig/drivers/md/dm-raid1.c ++++ linux-2.6-tip/drivers/md/dm-raid1.c +@@ -923,7 +923,7 @@ static int parse_features(struct mirror_ + static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) + { + int r; +- unsigned int nr_mirrors, m, args_used; ++ unsigned int nr_mirrors, m, uninitialized_var(args_used); + struct mirror_set *ms; + struct dm_dirty_log *dl; + +Index: linux-2.6-tip/drivers/media/dvb/dvb-usb/Kconfig +=================================================================== +--- 
linux-2.6-tip.orig/drivers/media/dvb/dvb-usb/Kconfig ++++ linux-2.6-tip/drivers/media/dvb/dvb-usb/Kconfig +@@ -235,6 +235,7 @@ config DVB_USB_OPERA1 + config DVB_USB_AF9005 + tristate "Afatech AF9005 DVB-T USB1.1 support" + depends on DVB_USB && EXPERIMENTAL ++ depends on 0 + select MEDIA_TUNER_MT2060 if !MEDIA_TUNER_CUSTOMIZE + select MEDIA_TUNER_QT1010 if !MEDIA_TUNER_CUSTOMIZE + help +Index: linux-2.6-tip/drivers/media/video/cx88/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/media/video/cx88/Kconfig ++++ linux-2.6-tip/drivers/media/video/cx88/Kconfig +@@ -1,6 +1,8 @@ + config VIDEO_CX88 + tristate "Conexant 2388x (bt878 successor) support" + depends on VIDEO_DEV && PCI && I2C && INPUT ++ # build failure, see config-Mon_Oct_20_13_45_14_CEST_2008.bad ++ depends on BROKEN + select I2C_ALGOBIT + select VIDEO_BTCX + select VIDEOBUF_DMA_SG +Index: linux-2.6-tip/drivers/memstick/core/mspro_block.c +=================================================================== +--- linux-2.6-tip.orig/drivers/memstick/core/mspro_block.c ++++ linux-2.6-tip/drivers/memstick/core/mspro_block.c +@@ -651,6 +651,7 @@ has_int_reg: + + default: + BUG(); ++ return -EINVAL; + } + } + +Index: linux-2.6-tip/drivers/message/fusion/mptbase.c +=================================================================== +--- linux-2.6-tip.orig/drivers/message/fusion/mptbase.c ++++ linux-2.6-tip/drivers/message/fusion/mptbase.c +@@ -126,7 +126,9 @@ static int mfcounter = 0; + * Public data... + */ + ++#ifdef CONFIG_PROC_FS + static struct proc_dir_entry *mpt_proc_root_dir; ++#endif + + #define WHOINIT_UNKNOWN 0xAA + +Index: linux-2.6-tip/drivers/message/i2o/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/message/i2o/Kconfig ++++ linux-2.6-tip/drivers/message/i2o/Kconfig +@@ -54,7 +54,7 @@ config I2O_EXT_ADAPTEC_DMA64 + + config I2O_CONFIG + tristate "I2O Configuration support" +- depends on VIRT_TO_BUS ++ depends on VIRT_TO_BUS && (BROKEN || !64BIT) + ---help--- + Say Y for support of the configuration interface for the I2O adapters. + If you have a RAID controller from Adaptec and you want to use the +@@ -66,6 +66,8 @@ config I2O_CONFIG + Note: If you want to use the new API you have to download the + i2o_config patch from http://i2o.shadowconnect.com/ + ++ Note: This is broken on 64-bit architectures. 
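
Note (illustrative aside, not patch content): the mspro_block.c hunk above adds an explicit "return -EINVAL;" after BUG() so that every path out of a non-void function returns a value; on kernel configurations where BUG() expands to very little, the compiler otherwise warns that control reaches the end of the function. A minimal C sketch of the same pattern, with hypothetical names and a user-space stand-in for BUG():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the kernel's BUG(); on some kernel configs BUG() expands
 * to almost nothing, so the compiler cannot assume it never returns. */
#define BUG() do { fprintf(stderr, "BUG at %s:%d\n", __FILE__, __LINE__); abort(); } while (0)

/* Hypothetical decoder: the explicit return after BUG() guarantees that
 * every path out of this non-void function returns a value. */
static int decode_command(int cmd)
{
	switch (cmd) {
	case 0: return 10;
	case 1: return 20;
	default:
		BUG();
		return -EINVAL;	/* keeps -Wreturn-type quiet */
	}
}

int main(void)
{
	printf("%d\n", decode_command(1));
	return 0;
}
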
++ + config I2O_CONFIG_OLD_IOCTL + bool "Enable ioctls (OBSOLETE)" + depends on I2O_CONFIG +Index: linux-2.6-tip/drivers/mfd/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/mfd/Kconfig ++++ linux-2.6-tip/drivers/mfd/Kconfig +@@ -210,6 +210,8 @@ config MFD_WM8350_I2C + tristate "Support Wolfson Microelectronics WM8350 with I2C" + select MFD_WM8350 + depends on I2C ++ # build failure ++ depends on 0 + help + The WM8350 is an integrated audio and power management + subsystem with watchdog and RTC functionality for embedded +Index: linux-2.6-tip/drivers/mfd/da903x.c +=================================================================== +--- linux-2.6-tip.orig/drivers/mfd/da903x.c ++++ linux-2.6-tip/drivers/mfd/da903x.c +@@ -75,6 +75,7 @@ static inline int __da903x_read(struct i + { + int ret; + ++ *val = 0; + ret = i2c_smbus_read_byte_data(client, reg); + if (ret < 0) { + dev_err(&client->dev, "failed reading at 0x%02x\n", reg); +Index: linux-2.6-tip/drivers/misc/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/misc/Kconfig ++++ linux-2.6-tip/drivers/misc/Kconfig +@@ -76,6 +76,34 @@ config IBM_ASM + information on the specific driver level and support statement + for your IBM server. + ++config HWLAT_DETECTOR ++ tristate "Testing module to detect hardware-induced latencies" ++ depends on DEBUG_FS ++ default m ++ ---help--- ++ A simple hardware latency detector. Use this module to detect ++ large latencies introduced by the behavior of the underlying ++ system firmware external to Linux. We do this using periodic ++ use of stop_machine to grab all available CPUs and measure ++ for unexplainable gaps in the CPU timestamp counter(s). By ++ default, the module is not enabled until the "enable" file ++ within the "hwlat_detector" debugfs directory is toggled. ++ ++ This module is often used to detect SMI (System Management ++ Interrupts) on x86 systems, though is not x86 specific. To ++ this end, we default to using a sample window of 1 second, ++ during which we will sample for 0.5 seconds. If an SMI or ++ similar event occurs during that time, it is recorded ++ into an 8K samples global ring buffer until retreived. ++ ++ WARNING: This software should never be enabled (it can be built ++ but should not be turned on after it is loaded) in a production ++ environment where high latencies are a concern since the ++ sampling mechanism actually introduces latencies for ++ regular tasks while the CPU(s) are being held. 
++ ++ If unsure, say N ++ + config PHANTOM + tristate "Sensable PHANToM (PCI)" + depends on PCI +@@ -162,7 +190,7 @@ config ENCLOSURE_SERVICES + config SGI_XP + tristate "Support communication between SGI SSIs" + depends on NET +- depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP ++ depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_UV) && SMP + select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2 + select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2 + select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP +@@ -189,7 +217,7 @@ config HP_ILO + + config SGI_GRU + tristate "SGI GRU driver" +- depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP ++ depends on (X86_UV || IA64_SGI_UV || IA64_GENERIC) && SMP + default n + select MMU_NOTIFIER + ---help--- +@@ -218,6 +246,8 @@ config DELL_LAPTOP + depends on BACKLIGHT_CLASS_DEVICE + depends on RFKILL + depends on POWER_SUPPLY ++ # broken build with: config-Thu_Jan_15_01_30_52_CET_2009.bad ++ depends on 0 + default n + ---help--- + This driver adds support for rfkill and backlight control to Dell +Index: linux-2.6-tip/drivers/misc/c2port/core.c +=================================================================== +--- linux-2.6-tip.orig/drivers/misc/c2port/core.c ++++ linux-2.6-tip/drivers/misc/c2port/core.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -893,6 +894,7 @@ struct c2port_device *c2port_device_regi + c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); + if (unlikely(!c2dev)) + return ERR_PTR(-ENOMEM); ++ kmemcheck_annotate_bitfield(c2dev->flags); + + ret = idr_pre_get(&c2port_idr, GFP_KERNEL); + if (!ret) { +Index: linux-2.6-tip/drivers/misc/ics932s401.c +=================================================================== +--- linux-2.6-tip.orig/drivers/misc/ics932s401.c ++++ linux-2.6-tip/drivers/misc/ics932s401.c +@@ -374,7 +374,7 @@ static ssize_t show_value(struct device + struct device_attribute *devattr, + char *buf) + { +- int x; ++ int x = 0; + + if (devattr == &dev_attr_usb_clock) + x = 48000; +@@ -392,7 +392,7 @@ static ssize_t show_spread(struct device + { + struct ics932s401_data *data = ics932s401_update_device(dev); + int reg; +- unsigned long val; ++ unsigned long val = 0; + + if (!(data->regs[ICS932S401_REG_CFG2] & ICS932S401_CFG1_SPREAD)) + return sprintf(buf, "0%%\n"); +Index: linux-2.6-tip/drivers/misc/sgi-gru/grufault.c +=================================================================== +--- linux-2.6-tip.orig/drivers/misc/sgi-gru/grufault.c ++++ linux-2.6-tip/drivers/misc/sgi-gru/grufault.c +@@ -282,8 +282,8 @@ static int gru_try_dropin(struct gru_thr + { + struct mm_struct *mm = gts->ts_mm; + struct vm_area_struct *vma; +- int pageshift, asid, write, ret; +- unsigned long paddr, gpa, vaddr; ++ int uninitialized_var(pageshift), asid, write, ret; ++ unsigned long uninitialized_var(paddr), gpa, vaddr; + + /* + * NOTE: The GRU contains magic hardware that eliminates races between +Index: linux-2.6-tip/drivers/misc/sgi-gru/grufile.c +=================================================================== +--- linux-2.6-tip.orig/drivers/misc/sgi-gru/grufile.c ++++ linux-2.6-tip/drivers/misc/sgi-gru/grufile.c +@@ -36,23 +36,11 @@ + #include + #include + #include ++#include + #include "gru.h" + #include "grulib.h" + #include "grutables.h" + +-#if defined CONFIG_X86_64 +-#include +-#include +-#define IS_UV() is_uv_system() +-#elif defined CONFIG_IA64 +-#include +-#include +-/* temp support for running on 
hardware simulator */ +-#define IS_UV() IS_MEDUSA() || ia64_platform_is("uv") +-#else +-#define IS_UV() 0 +-#endif +- + #include + #include + +@@ -381,7 +369,7 @@ static int __init gru_init(void) + char id[10]; + void *gru_start_vaddr; + +- if (!IS_UV()) ++ if (!is_uv_system()) + return 0; + + #if defined CONFIG_IA64 +@@ -451,7 +439,7 @@ static void __exit gru_exit(void) + int order = get_order(sizeof(struct gru_state) * + GRU_CHIPLETS_PER_BLADE); + +- if (!IS_UV()) ++ if (!is_uv_system()) + return; + + for (i = 0; i < GRU_CHIPLETS_PER_BLADE; i++) +Index: linux-2.6-tip/drivers/misc/sgi-xp/xp.h +=================================================================== +--- linux-2.6-tip.orig/drivers/misc/sgi-xp/xp.h ++++ linux-2.6-tip/drivers/misc/sgi-xp/xp.h +@@ -15,19 +15,19 @@ + + #include + +-#ifdef CONFIG_IA64 ++#if defined CONFIG_X86_UV || defined CONFIG_IA64_SGI_UV ++#include ++#define is_uv() is_uv_system() ++#endif ++ ++#ifndef is_uv ++#define is_uv() 0 ++#endif ++ ++#if defined CONFIG_IA64 + #include + #include /* defines is_shub1() and is_shub2() */ + #define is_shub() ia64_platform_is("sn2") +-#ifdef CONFIG_IA64_SGI_UV +-#define is_uv() ia64_platform_is("uv") +-#else +-#define is_uv() 0 +-#endif +-#endif +-#ifdef CONFIG_X86_64 +-#include +-#define is_uv() is_uv_system() + #endif + + #ifndef is_shub1 +@@ -42,10 +42,6 @@ + #define is_shub() 0 + #endif + +-#ifndef is_uv +-#define is_uv() 0 +-#endif +- + #ifdef USE_DBUG_ON + #define DBUG_ON(condition) BUG_ON(condition) + #else +Index: linux-2.6-tip/drivers/misc/sgi-xp/xpc_main.c +=================================================================== +--- linux-2.6-tip.orig/drivers/misc/sgi-xp/xpc_main.c ++++ linux-2.6-tip/drivers/misc/sgi-xp/xpc_main.c +@@ -318,7 +318,7 @@ xpc_hb_checker(void *ignore) + + /* this thread was marked active by xpc_hb_init() */ + +- set_cpus_allowed_ptr(current, &cpumask_of_cpu(XPC_HB_CHECK_CPU)); ++ set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU)); + + /* set our heartbeating to other partitions into motion */ + xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ); +Index: linux-2.6-tip/drivers/mtd/devices/mtd_dataflash.c +=================================================================== +--- linux-2.6-tip.orig/drivers/mtd/devices/mtd_dataflash.c ++++ linux-2.6-tip/drivers/mtd/devices/mtd_dataflash.c +@@ -679,7 +679,7 @@ add_dataflash_otp(struct spi_device *spi + dev_set_drvdata(&spi->dev, priv); + + if (mtd_has_partitions()) { +- struct mtd_partition *parts; ++ struct mtd_partition *uninitialized_var(parts); + int nr_parts = 0; + + #ifdef CONFIG_MTD_CMDLINE_PARTS +Index: linux-2.6-tip/drivers/mtd/devices/phram.c +=================================================================== +--- linux-2.6-tip.orig/drivers/mtd/devices/phram.c ++++ linux-2.6-tip/drivers/mtd/devices/phram.c +@@ -235,7 +235,7 @@ static int phram_setup(const char *val, + { + char buf[64+12+12], *str = buf; + char *token[3]; +- char *name; ++ char *uninitialized_var(name); + uint32_t start; + uint32_t len; + int i, ret; +Index: linux-2.6-tip/drivers/mtd/nand/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/mtd/nand/Kconfig ++++ linux-2.6-tip/drivers/mtd/nand/Kconfig +@@ -273,7 +273,7 @@ config MTD_NAND_CAFE + + config MTD_NAND_CS553X + tristate "NAND support for CS5535/CS5536 (AMD Geode companion chip)" +- depends on X86_32 && (X86_PC || X86_GENERICARCH) ++ depends on X86_32 + help + The CS553x companion chips for the AMD Geode processor + include NAND flash 
controllers with built-in hardware ECC +Index: linux-2.6-tip/drivers/net/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/net/Kconfig ++++ linux-2.6-tip/drivers/net/Kconfig +@@ -776,6 +776,8 @@ config NET_VENDOR_SMC + config WD80x3 + tristate "WD80*3 support" + depends on NET_VENDOR_SMC && ISA ++ # broken build ++ depends on 0 + select CRC32 + help + If you have a network (Ethernet) card of this type, say Y and read +@@ -1162,6 +1164,8 @@ config EEXPRESS_PRO + config HPLAN_PLUS + tristate "HP PCLAN+ (27247B and 27252A) support" + depends on NET_ISA ++ # broken build with config-Mon_Jul_21_20_21_08_CEST_2008.bad ++ depends on 0 + select CRC32 + help + If you have a network (Ethernet) card of this type, say Y and read +@@ -2559,6 +2563,8 @@ config MYRI10GE_DCA + + config NETXEN_NIC + tristate "NetXen Multi port (1/10) Gigabit Ethernet NIC" ++ # build breakage ++ depends on 0 + depends on PCI + help + This enables the support for NetXen's Gigabit Ethernet card. +Index: linux-2.6-tip/drivers/net/Makefile +=================================================================== +--- linux-2.6-tip.orig/drivers/net/Makefile ++++ linux-2.6-tip/drivers/net/Makefile +@@ -111,7 +111,7 @@ ifeq ($(CONFIG_FEC_MPC52xx_MDIO),y) + obj-$(CONFIG_FEC_MPC52xx) += fec_mpc52xx_phy.o + endif + obj-$(CONFIG_68360_ENET) += 68360enet.o +-obj-$(CONFIG_WD80x3) += wd.o 8390.o ++obj-$(CONFIG_WD80x3) += wd.o 8390p.o + obj-$(CONFIG_EL2) += 3c503.o 8390p.o + obj-$(CONFIG_NE2000) += ne.o 8390p.o + obj-$(CONFIG_NE2_MCA) += ne2.o 8390p.o +Index: linux-2.6-tip/drivers/net/e1000/e1000_main.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/e1000/e1000_main.c ++++ linux-2.6-tip/drivers/net/e1000/e1000_main.c +@@ -2056,14 +2056,10 @@ void e1000_free_all_tx_resources(struct + static void e1000_unmap_and_free_tx_resource(struct e1000_adapter *adapter, + struct e1000_buffer *buffer_info) + { +- if (buffer_info->dma) { +- pci_unmap_page(adapter->pdev, +- buffer_info->dma, +- buffer_info->length, +- PCI_DMA_TODEVICE); +- buffer_info->dma = 0; +- } ++ buffer_info->dma = 0; + if (buffer_info->skb) { ++ skb_dma_unmap(&adapter->pdev->dev, buffer_info->skb, ++ DMA_TO_DEVICE); + dev_kfree_skb_any(buffer_info->skb); + buffer_info->skb = NULL; + } +@@ -2914,16 +2910,24 @@ static int e1000_tx_map(struct e1000_ada + unsigned int mss) + { + struct e1000_hw *hw = &adapter->hw; +- struct e1000_buffer *buffer_info; +- unsigned int len = skb->len; +- unsigned int offset = 0, size, count = 0, i; ++ unsigned int len = skb_headlen(skb); ++ unsigned int offset, size, count = 0, i; + unsigned int f; +- len -= skb->data_len; ++ dma_addr_t map; + + i = tx_ring->next_to_use; + ++ if (skb_dma_map(&adapter->pdev->dev, skb, DMA_TO_DEVICE)) { ++ dev_err(&adapter->pdev->dev, "TX DMA map failed\n"); ++ dev_kfree_skb(skb); ++ return -2; ++ } ++ ++ map = skb_shinfo(skb)->dma_maps[0]; ++ offset = 0; ++ + while (len) { +- buffer_info = &tx_ring->buffer_info[i]; ++ struct e1000_buffer *buffer_info = &tx_ring->buffer_info[i]; + size = min(len, max_per_txd); + /* Workaround for Controller erratum -- + * descriptor for non-tso packet in a linear SKB that follows a +@@ -2956,11 +2960,7 @@ static int e1000_tx_map(struct e1000_ada + size -= 4; + + buffer_info->length = size; +- buffer_info->dma = +- pci_map_single(adapter->pdev, +- skb->data + offset, +- size, +- PCI_DMA_TODEVICE); ++ buffer_info->dma = map + offset; + buffer_info->time_stamp = jiffies; + 
buffer_info->next_to_watch = i; + +@@ -2975,9 +2975,11 @@ static int e1000_tx_map(struct e1000_ada + + frag = &skb_shinfo(skb)->frags[f]; + len = frag->size; +- offset = frag->page_offset; ++ map = skb_shinfo(skb)->dma_maps[f + 1]; ++ offset = 0; + + while (len) { ++ struct e1000_buffer *buffer_info; + buffer_info = &tx_ring->buffer_info[i]; + size = min(len, max_per_txd); + /* Workaround for premature desc write-backs +@@ -2993,12 +2995,7 @@ static int e1000_tx_map(struct e1000_ada + size -= 4; + + buffer_info->length = size; +- buffer_info->dma = +- pci_map_page(adapter->pdev, +- frag->page, +- offset, +- size, +- PCI_DMA_TODEVICE); ++ buffer_info->dma = map + offset; + buffer_info->time_stamp = jiffies; + buffer_info->next_to_watch = i; + +@@ -3012,6 +3009,7 @@ static int e1000_tx_map(struct e1000_ada + i = (i == 0) ? tx_ring->count - 1 : i - 1; + tx_ring->buffer_info[i].skb = skb; + tx_ring->buffer_info[first].next_to_watch = i; ++ smp_wmb(); + + return count; + } +@@ -3290,9 +3288,7 @@ static int e1000_xmit_frame(struct sk_bu + (hw->mac_type == e1000_82573)) + e1000_transfer_dhcp_info(adapter, skb); + +- if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) +- /* Collision - tell upper layer to requeue */ +- return NETDEV_TX_LOCKED; ++ spin_lock_irqsave(&tx_ring->tx_lock, flags); + + /* need: count + 2 desc gap to keep tail from touching + * head, otherwise try next time */ +@@ -3869,6 +3865,11 @@ static bool e1000_clean_tx_irq(struct e1 + /* Detect a transmit hang in hardware, this serializes the + * check with the clearing of time_stamp and movement of i */ + adapter->detect_tx_hung = false; ++ /* ++ * read barrier to make sure that the ->dma member and time ++ * stamp are updated fully ++ */ ++ smp_rmb(); + if (tx_ring->buffer_info[eop].dma && + time_after(jiffies, tx_ring->buffer_info[eop].time_stamp + + (adapter->tx_timeout_factor * HZ)) +Index: linux-2.6-tip/drivers/net/e1000e/netdev.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/e1000e/netdev.c ++++ linux-2.6-tip/drivers/net/e1000e/netdev.c +@@ -565,12 +565,10 @@ next_desc: + static void e1000_put_txbuf(struct e1000_adapter *adapter, + struct e1000_buffer *buffer_info) + { +- if (buffer_info->dma) { +- pci_unmap_page(adapter->pdev, buffer_info->dma, +- buffer_info->length, PCI_DMA_TODEVICE); +- buffer_info->dma = 0; +- } ++ buffer_info->dma = 0; + if (buffer_info->skb) { ++ skb_dma_unmap(&adapter->pdev->dev, buffer_info->skb, ++ DMA_TO_DEVICE); + dev_kfree_skb_any(buffer_info->skb); + buffer_info->skb = NULL; + } +@@ -683,6 +681,11 @@ static bool e1000_clean_tx_irq(struct e1 + * check with the clearing of time_stamp and movement of i + */ + adapter->detect_tx_hung = 0; ++ /* ++ * read barrier to make sure that the ->dma member and time ++ * stamp are updated fully ++ */ ++ smp_rmb(); + if (tx_ring->buffer_info[eop].dma && + time_after(jiffies, tx_ring->buffer_info[eop].time_stamp + + (adapter->tx_timeout_factor * HZ)) +@@ -3831,15 +3834,25 @@ static int e1000_tx_map(struct e1000_ada + unsigned int mss) + { + struct e1000_ring *tx_ring = adapter->tx_ring; +- struct e1000_buffer *buffer_info; +- unsigned int len = skb->len - skb->data_len; +- unsigned int offset = 0, size, count = 0, i; ++ unsigned int len = skb_headlen(skb); ++ unsigned int offset, size, count = 0, i; + unsigned int f; ++ dma_addr_t map; + + i = tx_ring->next_to_use; + ++ if (skb_dma_map(&adapter->pdev->dev, skb, DMA_TO_DEVICE)) { ++ dev_err(&adapter->pdev->dev, "TX DMA map failed\n"); ++ 
adapter->tx_dma_failed++; ++ dev_kfree_skb(skb); ++ return -2; ++ } ++ ++ map = skb_shinfo(skb)->dma_maps[0]; ++ offset = 0; ++ + while (len) { +- buffer_info = &tx_ring->buffer_info[i]; ++ struct e1000_buffer *buffer_info = &tx_ring->buffer_info[i]; + size = min(len, max_per_txd); + + /* Workaround for premature desc write-backs +@@ -3850,16 +3863,7 @@ static int e1000_tx_map(struct e1000_ada + buffer_info->length = size; + /* set time_stamp *before* dma to help avoid a possible race */ + buffer_info->time_stamp = jiffies; +- buffer_info->dma = +- pci_map_single(adapter->pdev, +- skb->data + offset, +- size, +- PCI_DMA_TODEVICE); +- if (pci_dma_mapping_error(adapter->pdev, buffer_info->dma)) { +- dev_err(&adapter->pdev->dev, "TX DMA map failed\n"); +- adapter->tx_dma_failed++; +- return -1; +- } ++ buffer_info->dma = map + offset; + buffer_info->next_to_watch = i; + + len -= size; +@@ -3875,9 +3879,11 @@ static int e1000_tx_map(struct e1000_ada + + frag = &skb_shinfo(skb)->frags[f]; + len = frag->size; +- offset = frag->page_offset; ++ map = skb_shinfo(skb)->dma_maps[f + 1]; ++ offset = 0; + + while (len) { ++ struct e1000_buffer *buffer_info; + buffer_info = &tx_ring->buffer_info[i]; + size = min(len, max_per_txd); + /* Workaround for premature desc write-backs +@@ -3887,20 +3893,7 @@ static int e1000_tx_map(struct e1000_ada + + buffer_info->length = size; + buffer_info->time_stamp = jiffies; +- buffer_info->dma = +- pci_map_page(adapter->pdev, +- frag->page, +- offset, +- size, +- PCI_DMA_TODEVICE); +- if (pci_dma_mapping_error(adapter->pdev, +- buffer_info->dma)) { +- dev_err(&adapter->pdev->dev, +- "TX DMA page map failed\n"); +- adapter->tx_dma_failed++; +- return -1; +- } +- ++ buffer_info->dma = map + offset; + buffer_info->next_to_watch = i; + + len -= size; +@@ -3920,6 +3913,7 @@ static int e1000_tx_map(struct e1000_ada + + tx_ring->buffer_info[i].skb = skb; + tx_ring->buffer_info[first].next_to_watch = i; ++ smp_wmb(); + + return count; + } +@@ -4138,9 +4132,7 @@ static int e1000_xmit_frame(struct sk_bu + if (adapter->hw.mac.tx_pkt_filtering) + e1000_transfer_dhcp_info(adapter, skb); + +- if (!spin_trylock_irqsave(&adapter->tx_queue_lock, irq_flags)) +- /* Collision - tell upper layer to requeue */ +- return NETDEV_TX_LOCKED; ++ spin_lock_irqsave(&adapter->tx_queue_lock, irq_flags); + + /* + * need: count + 2 desc gap to keep tail from touching +Index: linux-2.6-tip/drivers/net/ne3210.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/ne3210.c ++++ linux-2.6-tip/drivers/net/ne3210.c +@@ -150,7 +150,8 @@ static int __init ne3210_eisa_probe (str + if (phys_mem < virt_to_phys(high_memory)) { + printk(KERN_CRIT "ne3210.c: Card RAM overlaps with normal memory!!!\n"); + printk(KERN_CRIT "ne3210.c: Use EISA SCU to set card memory below 1MB,\n"); +- printk(KERN_CRIT "ne3210.c: or to an address above 0x%lx.\n", virt_to_phys(high_memory)); ++ printk(KERN_CRIT "ne3210.c: or to an address above 0x%llx.\n", ++ (u64)virt_to_phys(high_memory)); + printk(KERN_CRIT "ne3210.c: Driver NOT installed.\n"); + retval = -EINVAL; + goto out3; +Index: linux-2.6-tip/drivers/net/sfc/efx.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/sfc/efx.c ++++ linux-2.6-tip/drivers/net/sfc/efx.c +@@ -850,20 +850,27 @@ static void efx_fini_io(struct efx_nic * + * interrupts across them. 
*/ + static int efx_wanted_rx_queues(void) + { +- cpumask_t core_mask; ++ cpumask_var_t core_mask; + int count; + int cpu; + +- cpus_clear(core_mask); ++ if (!alloc_cpumask_var(&core_mask, GFP_KERNEL)) { ++ printk(KERN_WARNING ++ "efx.c: allocation failure, irq balancing hobbled\n"); ++ return 1; ++ } ++ ++ cpumask_clear(core_mask); + count = 0; + for_each_online_cpu(cpu) { +- if (!cpu_isset(cpu, core_mask)) { ++ if (!cpumask_test_cpu(cpu, core_mask)) { + ++count; +- cpus_or(core_mask, core_mask, +- topology_core_siblings(cpu)); ++ cpumask_or(core_mask, core_mask, ++ topology_core_cpumask(cpu)); + } + } + ++ free_cpumask_var(core_mask); + return count; + } + +Index: linux-2.6-tip/drivers/net/sfc/falcon.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/sfc/falcon.c ++++ linux-2.6-tip/drivers/net/sfc/falcon.c +@@ -338,10 +338,10 @@ static int falcon_alloc_special_buffer(s + nic_data->next_buffer_table += buffer->entries; + + EFX_LOG(efx, "allocating special buffers %d-%d at %llx+%x " +- "(virt %p phys %lx)\n", buffer->index, ++ "(virt %p phys %llx)\n", buffer->index, + buffer->index + buffer->entries - 1, +- (unsigned long long)buffer->dma_addr, len, +- buffer->addr, virt_to_phys(buffer->addr)); ++ (u64)buffer->dma_addr, len, ++ buffer->addr, (u64)virt_to_phys(buffer->addr)); + + return 0; + } +@@ -353,10 +353,10 @@ static void falcon_free_special_buffer(s + return; + + EFX_LOG(efx, "deallocating special buffers %d-%d at %llx+%x " +- "(virt %p phys %lx)\n", buffer->index, ++ "(virt %p phys %llx)\n", buffer->index, + buffer->index + buffer->entries - 1, +- (unsigned long long)buffer->dma_addr, buffer->len, +- buffer->addr, virt_to_phys(buffer->addr)); ++ (u64)buffer->dma_addr, buffer->len, ++ buffer->addr, (u64)virt_to_phys(buffer->addr)); + + pci_free_consistent(efx->pci_dev, buffer->len, buffer->addr, + buffer->dma_addr); +@@ -2343,10 +2343,10 @@ int falcon_probe_port(struct efx_nic *ef + FALCON_MAC_STATS_SIZE); + if (rc) + return rc; +- EFX_LOG(efx, "stats buffer at %llx (virt %p phys %lx)\n", +- (unsigned long long)efx->stats_buffer.dma_addr, ++ EFX_LOG(efx, "stats buffer at %llx (virt %p phys %llx)\n", ++ (u64)efx->stats_buffer.dma_addr, + efx->stats_buffer.addr, +- virt_to_phys(efx->stats_buffer.addr)); ++ (u64)virt_to_phys(efx->stats_buffer.addr)); + + return 0; + } +@@ -2921,9 +2921,9 @@ int falcon_probe_nic(struct efx_nic *efx + goto fail4; + BUG_ON(efx->irq_status.dma_addr & 0x0f); + +- EFX_LOG(efx, "INT_KER at %llx (virt %p phys %lx)\n", +- (unsigned long long)efx->irq_status.dma_addr, +- efx->irq_status.addr, virt_to_phys(efx->irq_status.addr)); ++ EFX_LOG(efx, "INT_KER at %llx (virt %p phys %llx)\n", ++ (u64)efx->irq_status.dma_addr, ++ efx->irq_status.addr, (u64)virt_to_phys(efx->irq_status.addr)); + + falcon_probe_spi_devices(efx); + +Index: linux-2.6-tip/drivers/net/sky2.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/sky2.c ++++ linux-2.6-tip/drivers/net/sky2.c +@@ -2748,7 +2748,7 @@ static u32 sky2_mhz(const struct sky2_hw + return 156; + + default: +- BUG(); ++ panic("sky2_mhz: unknown chip id!"); + } + } + +Index: linux-2.6-tip/drivers/net/wimax/i2400m/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/net/wimax/i2400m/Kconfig ++++ linux-2.6-tip/drivers/net/wimax/i2400m/Kconfig +@@ -13,6 +13,8 @@ comment "Enable MMC support to see WiMAX + config WIMAX_I2400M_USB + tristate "Intel Wireless WiMAX 
Connection 2400 over USB (including 5x50)" + depends on WIMAX && USB ++ # build failure: config-Thu_Jan__8_10_51_13_CET_2009.bad ++ depends on 0 + select WIMAX_I2400M + help + Select if you have a device based on the Intel WiMAX +Index: linux-2.6-tip/drivers/net/wireless/arlan-main.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/wireless/arlan-main.c ++++ linux-2.6-tip/drivers/net/wireless/arlan-main.c +@@ -1082,8 +1082,8 @@ static int __init arlan_probe_here(struc + if (arlan_check_fingerprint(memaddr)) + return -ENODEV; + +- printk(KERN_NOTICE "%s: Arlan found at %x, \n ", dev->name, +- (int) virt_to_phys((void*)memaddr)); ++ printk(KERN_NOTICE "%s: Arlan found at %llx, \n ", dev->name, ++ (u64) virt_to_phys((void*)memaddr)); + + ap->card = (void *) memaddr; + dev->mem_start = memaddr; +Index: linux-2.6-tip/drivers/net/wireless/b43/b43.h +=================================================================== +--- linux-2.6-tip.orig/drivers/net/wireless/b43/b43.h ++++ linux-2.6-tip/drivers/net/wireless/b43/b43.h +@@ -852,7 +852,8 @@ void b43warn(struct b43_wl *wl, const ch + void b43dbg(struct b43_wl *wl, const char *fmt, ...) + __attribute__ ((format(printf, 2, 3))); + #else /* DEBUG */ +-# define b43dbg(wl, fmt...) do { /* nothing */ } while (0) ++static inline void __attribute__ ((format(printf, 2, 3))) ++b43dbg(struct b43_wl *wl, const char *fmt, ...) { } + #endif /* DEBUG */ + + /* A WARN_ON variant that vanishes when b43 debugging is disabled. +Index: linux-2.6-tip/drivers/net/wireless/ray_cs.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/wireless/ray_cs.c ++++ linux-2.6-tip/drivers/net/wireless/ray_cs.c +@@ -294,7 +294,9 @@ static char hop_pattern_length[] = { 1, + JAPAN_TEST_HOP_MOD + }; + ++#ifdef CONFIG_PROC_FS + static char rcsid[] = "Raylink/WebGear wireless LAN - Corey "; ++#endif + + /*============================================================================= + ray_attach() creates an "instance" of the driver, allocating +Index: linux-2.6-tip/drivers/net/wireless/zd1201.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/wireless/zd1201.c ++++ linux-2.6-tip/drivers/net/wireless/zd1201.c +@@ -593,6 +593,9 @@ static inline int zd1201_getconfig16(str + int err; + __le16 zdval; + ++ /* initialize */ ++ *val = 0; ++ + err = zd1201_getconfig(zd, rid, &zdval, sizeof(__le16)); + if (err) + return err; +Index: linux-2.6-tip/drivers/oprofile/buffer_sync.c +=================================================================== +--- linux-2.6-tip.orig/drivers/oprofile/buffer_sync.c ++++ linux-2.6-tip/drivers/oprofile/buffer_sync.c +@@ -38,7 +38,7 @@ + + static LIST_HEAD(dying_tasks); + static LIST_HEAD(dead_tasks); +-static cpumask_t marked_cpus = CPU_MASK_NONE; ++static cpumask_var_t marked_cpus; + static DEFINE_SPINLOCK(task_mortuary); + static void process_task_mortuary(void); + +@@ -154,6 +154,10 @@ int sync_start(void) + { + int err; + ++ if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL)) ++ return -ENOMEM; ++ cpumask_clear(marked_cpus); ++ + start_cpu_work(); + + err = task_handoff_register(&task_free_nb); +@@ -179,6 +183,7 @@ out2: + task_handoff_unregister(&task_free_nb); + out1: + end_sync(); ++ free_cpumask_var(marked_cpus); + goto out; + } + +@@ -190,6 +195,7 @@ void sync_stop(void) + profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb); + task_handoff_unregister(&task_free_nb); + end_sync(); ++ 
free_cpumask_var(marked_cpus); + } + + +@@ -456,10 +462,10 @@ static void mark_done(int cpu) + { + int i; + +- cpu_set(cpu, marked_cpus); ++ cpumask_set_cpu(cpu, marked_cpus); + + for_each_online_cpu(i) { +- if (!cpu_isset(i, marked_cpus)) ++ if (!cpumask_test_cpu(i, marked_cpus)) + return; + } + +@@ -468,7 +474,7 @@ static void mark_done(int cpu) + */ + process_task_mortuary(); + +- cpus_clear(marked_cpus); ++ cpumask_clear(marked_cpus); + } + + +Index: linux-2.6-tip/drivers/oprofile/cpu_buffer.c +=================================================================== +--- linux-2.6-tip.orig/drivers/oprofile/cpu_buffer.c ++++ linux-2.6-tip/drivers/oprofile/cpu_buffer.c +@@ -161,7 +161,7 @@ struct op_sample + { + entry->event = ring_buffer_lock_reserve + (op_ring_buffer_write, sizeof(struct op_sample) + +- size * sizeof(entry->sample->data[0]), &entry->irq_flags); ++ size * sizeof(entry->sample->data[0])); + if (entry->event) + entry->sample = ring_buffer_event_data(entry->event); + else +@@ -178,8 +178,7 @@ struct op_sample + + int op_cpu_buffer_write_commit(struct op_entry *entry) + { +- return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, +- entry->irq_flags); ++ return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event); + } + + struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) +Index: linux-2.6-tip/drivers/pci/dmar.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/dmar.c ++++ linux-2.6-tip/drivers/pci/dmar.c +@@ -31,6 +31,8 @@ + #include + #include + #include ++#include ++#include + + #undef PREFIX + #define PREFIX "DMAR:" +@@ -42,6 +44,7 @@ + LIST_HEAD(dmar_drhd_units); + + static struct acpi_table_header * __initdata dmar_tbl; ++static acpi_size dmar_tbl_size; + + static void __init dmar_register_drhd_unit(struct dmar_drhd_unit *drhd) + { +@@ -297,8 +300,9 @@ static int __init dmar_table_detect(void + acpi_status status = AE_OK; + + /* if we could find DMAR table, then there are DMAR devices */ +- status = acpi_get_table(ACPI_SIG_DMAR, 0, +- (struct acpi_table_header **)&dmar_tbl); ++ status = acpi_get_table_with_size(ACPI_SIG_DMAR, 0, ++ (struct acpi_table_header **)&dmar_tbl, ++ &dmar_tbl_size); + + if (ACPI_SUCCESS(status) && !dmar_tbl) { + printk (KERN_WARNING PREFIX "Unable to map DMAR\n"); +@@ -498,6 +502,7 @@ void __init detect_intel_iommu(void) + iommu_detected = 1; + #endif + } ++ early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size); + dmar_tbl = NULL; + } + +@@ -515,6 +520,7 @@ int alloc_iommu(struct dmar_drhd_unit *d + return -ENOMEM; + + iommu->seq_id = iommu_allocated++; ++ sprintf (iommu->name, "dmar%d", iommu->seq_id); + + iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE); + if (!iommu->reg) { +@@ -757,6 +763,42 @@ int qi_flush_iotlb(struct intel_iommu *i + } + + /* ++ * Disable Queued Invalidation interface. ++ */ ++void dmar_disable_qi(struct intel_iommu *iommu) ++{ ++ unsigned long flags; ++ u32 sts; ++ cycles_t start_time = get_cycles(); ++ ++ if (!ecap_qis(iommu->ecap)) ++ return; ++ ++ spin_lock_irqsave(&iommu->register_lock, flags); ++ ++ sts = dmar_readq(iommu->reg + DMAR_GSTS_REG); ++ if (!(sts & DMA_GSTS_QIES)) ++ goto end; ++ ++ /* ++ * Give a chance to HW to complete the pending invalidation requests. 
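
Note (illustrative aside, not patch content): the oprofile and sfc hunks above convert fixed-size cpumask_t objects (static or on-stack) to heap-allocated cpumask_var_t, paired with alloc_cpumask_var()/free_cpumask_var() and the cpumask_* accessors, so the mask no longer grows with NR_CPUS in static or stack storage. A minimal kernel-style sketch of that allocate/use/free pattern, with a hypothetical helper name:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/* Hypothetical helper mirroring the conversion above: track visited CPUs
 * in a heap-allocated mask instead of a cpumask_t in static storage. */
static int count_online_cpus_once(void)
{
	cpumask_var_t seen;
	int cpu, count = 0;

	if (!alloc_cpumask_var(&seen, GFP_KERNEL))
		return -ENOMEM;
	cpumask_clear(seen);

	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, seen)) {
			cpumask_set_cpu(cpu, seen);
			count++;
		}
	}

	free_cpumask_var(seen);
	return count;
}
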
++ */ ++ while ((readl(iommu->reg + DMAR_IQT_REG) != ++ readl(iommu->reg + DMAR_IQH_REG)) && ++ (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time))) ++ cpu_relax(); ++ ++ iommu->gcmd &= ~DMA_GCMD_QIE; ++ ++ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ ++ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, ++ !(sts & DMA_GSTS_QIES), sts); ++end: ++ spin_unlock_irqrestore(&iommu->register_lock, flags); ++} ++ ++/* + * Enable Queued Invalidation interface. This is a must to support + * interrupt-remapping. Also used by DMA-remapping, which replaces + * register based IOTLB invalidation. +@@ -776,20 +818,20 @@ int dmar_enable_qi(struct intel_iommu *i + if (iommu->qi) + return 0; + +- iommu->qi = kmalloc(sizeof(*qi), GFP_KERNEL); ++ iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC); + if (!iommu->qi) + return -ENOMEM; + + qi = iommu->qi; + +- qi->desc = (void *)(get_zeroed_page(GFP_KERNEL)); ++ qi->desc = (void *)(get_zeroed_page(GFP_ATOMIC)); + if (!qi->desc) { + kfree(qi); + iommu->qi = 0; + return -ENOMEM; + } + +- qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_KERNEL); ++ qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_ATOMIC); + if (!qi->desc_status) { + free_page((unsigned long) qi->desc); + kfree(qi); +@@ -818,3 +860,254 @@ int dmar_enable_qi(struct intel_iommu *i + + return 0; + } ++ ++/* iommu interrupt handling. Most stuff are MSI-like. */ ++ ++enum faulttype { ++ DMA_REMAP, ++ INTR_REMAP, ++ UNKNOWN, ++}; ++ ++static const char *dma_remap_fault_reasons[] = ++{ ++ "Software", ++ "Present bit in root entry is clear", ++ "Present bit in context entry is clear", ++ "Invalid context entry", ++ "Access beyond MGAW", ++ "PTE Write access is not set", ++ "PTE Read access is not set", ++ "Next page table ptr is invalid", ++ "Root table address invalid", ++ "Context table ptr is invalid", ++ "non-zero reserved fields in RTP", ++ "non-zero reserved fields in CTP", ++ "non-zero reserved fields in PTE", ++}; ++ ++static const char *intr_remap_fault_reasons[] = ++{ ++ "Detected reserved fields in the decoded interrupt-remapped request", ++ "Interrupt index exceeded the interrupt-remapping table size", ++ "Present field in the IRTE entry is clear", ++ "Error accessing interrupt-remapping table pointed by IRTA_REG", ++ "Detected reserved fields in the IRTE entry", ++ "Blocked a compatibility format interrupt request", ++ "Blocked an interrupt request due to source-id verification failure", ++}; ++ ++#define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1) ++ ++const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) ++{ ++ if (fault_reason >= 0x20 && (fault_reason <= 0x20 + ++ ARRAY_SIZE(intr_remap_fault_reasons))) { ++ *fault_type = INTR_REMAP; ++ return intr_remap_fault_reasons[fault_reason - 0x20]; ++ } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) { ++ *fault_type = DMA_REMAP; ++ return dma_remap_fault_reasons[fault_reason]; ++ } else { ++ *fault_type = UNKNOWN; ++ return "Unknown"; ++ } ++} ++ ++void dmar_msi_unmask(unsigned int irq) ++{ ++ struct intel_iommu *iommu = get_irq_data(irq); ++ unsigned long flag; ++ ++ /* unmask it */ ++ spin_lock_irqsave(&iommu->register_lock, flag); ++ writel(0, iommu->reg + DMAR_FECTL_REG); ++ /* Read a reg to force flush the post write */ ++ readl(iommu->reg + DMAR_FECTL_REG); ++ spin_unlock_irqrestore(&iommu->register_lock, flag); ++} ++ ++void dmar_msi_mask(unsigned int irq) ++{ ++ unsigned long flag; ++ struct intel_iommu *iommu = get_irq_data(irq); ++ ++ /* mask it */ ++ 
spin_lock_irqsave(&iommu->register_lock, flag); ++ writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG); ++ /* Read a reg to force flush the post write */ ++ readl(iommu->reg + DMAR_FECTL_REG); ++ spin_unlock_irqrestore(&iommu->register_lock, flag); ++} ++ ++void dmar_msi_write(int irq, struct msi_msg *msg) ++{ ++ struct intel_iommu *iommu = get_irq_data(irq); ++ unsigned long flag; ++ ++ spin_lock_irqsave(&iommu->register_lock, flag); ++ writel(msg->data, iommu->reg + DMAR_FEDATA_REG); ++ writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG); ++ writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG); ++ spin_unlock_irqrestore(&iommu->register_lock, flag); ++} ++ ++void dmar_msi_read(int irq, struct msi_msg *msg) ++{ ++ struct intel_iommu *iommu = get_irq_data(irq); ++ unsigned long flag; ++ ++ spin_lock_irqsave(&iommu->register_lock, flag); ++ msg->data = readl(iommu->reg + DMAR_FEDATA_REG); ++ msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG); ++ msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG); ++ spin_unlock_irqrestore(&iommu->register_lock, flag); ++} ++ ++static int dmar_fault_do_one(struct intel_iommu *iommu, int type, ++ u8 fault_reason, u16 source_id, unsigned long long addr) ++{ ++ const char *reason; ++ int fault_type; ++ ++ reason = dmar_get_fault_reason(fault_reason, &fault_type); ++ ++ if (fault_type == INTR_REMAP) ++ printk(KERN_ERR "INTR-REMAP: Request device [[%02x:%02x.%d] " ++ "fault index %llx\n" ++ "INTR-REMAP:[fault reason %02d] %s\n", ++ (source_id >> 8), PCI_SLOT(source_id & 0xFF), ++ PCI_FUNC(source_id & 0xFF), addr >> 48, ++ fault_reason, reason); ++ else ++ printk(KERN_ERR ++ "DMAR:[%s] Request device [%02x:%02x.%d] " ++ "fault addr %llx \n" ++ "DMAR:[fault reason %02d] %s\n", ++ (type ? "DMA Read" : "DMA Write"), ++ (source_id >> 8), PCI_SLOT(source_id & 0xFF), ++ PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); ++ return 0; ++} ++ ++#define PRIMARY_FAULT_REG_LEN (16) ++irqreturn_t dmar_fault(int irq, void *dev_id) ++{ ++ struct intel_iommu *iommu = dev_id; ++ int reg, fault_index; ++ u32 fault_status; ++ unsigned long flag; ++ ++ spin_lock_irqsave(&iommu->register_lock, flag); ++ fault_status = readl(iommu->reg + DMAR_FSTS_REG); ++ if (fault_status) ++ printk(KERN_ERR "DRHD: handling fault status reg %x\n", ++ fault_status); ++ ++ /* TBD: ignore advanced fault log currently */ ++ if (!(fault_status & DMA_FSTS_PPF)) ++ goto clear_rest; ++ ++ fault_index = dma_fsts_fault_record_index(fault_status); ++ reg = cap_fault_reg_offset(iommu->cap); ++ while (1) { ++ u8 fault_reason; ++ u16 source_id; ++ u64 guest_addr; ++ int type; ++ u32 data; ++ ++ /* highest 32 bits */ ++ data = readl(iommu->reg + reg + ++ fault_index * PRIMARY_FAULT_REG_LEN + 12); ++ if (!(data & DMA_FRCD_F)) ++ break; ++ ++ fault_reason = dma_frcd_fault_reason(data); ++ type = dma_frcd_type(data); ++ ++ data = readl(iommu->reg + reg + ++ fault_index * PRIMARY_FAULT_REG_LEN + 8); ++ source_id = dma_frcd_source_id(data); ++ ++ guest_addr = dmar_readq(iommu->reg + reg + ++ fault_index * PRIMARY_FAULT_REG_LEN); ++ guest_addr = dma_frcd_page_addr(guest_addr); ++ /* clear the fault */ ++ writel(DMA_FRCD_F, iommu->reg + reg + ++ fault_index * PRIMARY_FAULT_REG_LEN + 12); ++ ++ spin_unlock_irqrestore(&iommu->register_lock, flag); ++ ++ dmar_fault_do_one(iommu, type, fault_reason, ++ source_id, guest_addr); ++ ++ fault_index++; ++ if (fault_index > cap_num_fault_regs(iommu->cap)) ++ fault_index = 0; ++ spin_lock_irqsave(&iommu->register_lock, flag); ++ } ++clear_rest: ++ /* clear all the other 
faults */ ++ fault_status = readl(iommu->reg + DMAR_FSTS_REG); ++ writel(fault_status, iommu->reg + DMAR_FSTS_REG); ++ ++ spin_unlock_irqrestore(&iommu->register_lock, flag); ++ return IRQ_HANDLED; ++} ++ ++int dmar_set_interrupt(struct intel_iommu *iommu) ++{ ++ int irq, ret; ++ ++ /* ++ * Check if the fault interrupt is already initialized. ++ */ ++ if (iommu->irq) ++ return 0; ++ ++ irq = create_irq(); ++ if (!irq) { ++ printk(KERN_ERR "IOMMU: no free vectors\n"); ++ return -EINVAL; ++ } ++ ++ set_irq_data(irq, iommu); ++ iommu->irq = irq; ++ ++ ret = arch_setup_dmar_msi(irq); ++ if (ret) { ++ set_irq_data(irq, NULL); ++ iommu->irq = 0; ++ destroy_irq(irq); ++ return 0; ++ } ++ ++ ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu); ++ if (ret) ++ printk(KERN_ERR "IOMMU: can't request irq\n"); ++ return ret; ++} ++ ++int __init enable_drhd_fault_handling(void) ++{ ++ struct dmar_drhd_unit *drhd; ++ ++ /* ++ * Enable fault control interrupt. ++ */ ++ for_each_drhd_unit(drhd) { ++ int ret; ++ struct intel_iommu *iommu = drhd->iommu; ++ ret = dmar_set_interrupt(iommu); ++ ++ if (ret) { ++ printk(KERN_ERR "DRHD %Lx: failed to enable fault, " ++ " interrupt, ret %d\n", ++ (unsigned long long)drhd->reg_base_addr, ret); ++ return -1; ++ } ++ } ++ ++ return 0; ++} +Index: linux-2.6-tip/drivers/pci/hotplug/cpqphp.h +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/hotplug/cpqphp.h ++++ linux-2.6-tip/drivers/pci/hotplug/cpqphp.h +@@ -449,7 +449,7 @@ extern u8 cpqhp_disk_irq; + + /* inline functions */ + +-static inline char *slot_name(struct slot *slot) ++static inline const char *slot_name(struct slot *slot) + { + return hotplug_slot_name(slot->hotplug_slot); + } +Index: linux-2.6-tip/drivers/pci/hotplug/ibmphp_core.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/hotplug/ibmphp_core.c ++++ linux-2.6-tip/drivers/pci/hotplug/ibmphp_core.c +@@ -1419,3 +1419,4 @@ static void __exit ibmphp_exit(void) + } + + module_init(ibmphp_init); ++module_exit(ibmphp_exit); +Index: linux-2.6-tip/drivers/pci/intel-iommu.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/intel-iommu.c ++++ linux-2.6-tip/drivers/pci/intel-iommu.c +@@ -1010,194 +1010,6 @@ static int iommu_disable_translation(str + return 0; + } + +-/* iommu interrupt handling. Most stuff are MSI-like. 
*/ +- +-static const char *fault_reason_strings[] = +-{ +- "Software", +- "Present bit in root entry is clear", +- "Present bit in context entry is clear", +- "Invalid context entry", +- "Access beyond MGAW", +- "PTE Write access is not set", +- "PTE Read access is not set", +- "Next page table ptr is invalid", +- "Root table address invalid", +- "Context table ptr is invalid", +- "non-zero reserved fields in RTP", +- "non-zero reserved fields in CTP", +- "non-zero reserved fields in PTE", +-}; +-#define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1) +- +-const char *dmar_get_fault_reason(u8 fault_reason) +-{ +- if (fault_reason > MAX_FAULT_REASON_IDX) +- return "Unknown"; +- else +- return fault_reason_strings[fault_reason]; +-} +- +-void dmar_msi_unmask(unsigned int irq) +-{ +- struct intel_iommu *iommu = get_irq_data(irq); +- unsigned long flag; +- +- /* unmask it */ +- spin_lock_irqsave(&iommu->register_lock, flag); +- writel(0, iommu->reg + DMAR_FECTL_REG); +- /* Read a reg to force flush the post write */ +- readl(iommu->reg + DMAR_FECTL_REG); +- spin_unlock_irqrestore(&iommu->register_lock, flag); +-} +- +-void dmar_msi_mask(unsigned int irq) +-{ +- unsigned long flag; +- struct intel_iommu *iommu = get_irq_data(irq); +- +- /* mask it */ +- spin_lock_irqsave(&iommu->register_lock, flag); +- writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG); +- /* Read a reg to force flush the post write */ +- readl(iommu->reg + DMAR_FECTL_REG); +- spin_unlock_irqrestore(&iommu->register_lock, flag); +-} +- +-void dmar_msi_write(int irq, struct msi_msg *msg) +-{ +- struct intel_iommu *iommu = get_irq_data(irq); +- unsigned long flag; +- +- spin_lock_irqsave(&iommu->register_lock, flag); +- writel(msg->data, iommu->reg + DMAR_FEDATA_REG); +- writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG); +- writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG); +- spin_unlock_irqrestore(&iommu->register_lock, flag); +-} +- +-void dmar_msi_read(int irq, struct msi_msg *msg) +-{ +- struct intel_iommu *iommu = get_irq_data(irq); +- unsigned long flag; +- +- spin_lock_irqsave(&iommu->register_lock, flag); +- msg->data = readl(iommu->reg + DMAR_FEDATA_REG); +- msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG); +- msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG); +- spin_unlock_irqrestore(&iommu->register_lock, flag); +-} +- +-static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type, +- u8 fault_reason, u16 source_id, unsigned long long addr) +-{ +- const char *reason; +- +- reason = dmar_get_fault_reason(fault_reason); +- +- printk(KERN_ERR +- "DMAR:[%s] Request device [%02x:%02x.%d] " +- "fault addr %llx \n" +- "DMAR:[fault reason %02d] %s\n", +- (type ? 
"DMA Read" : "DMA Write"), +- (source_id >> 8), PCI_SLOT(source_id & 0xFF), +- PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); +- return 0; +-} +- +-#define PRIMARY_FAULT_REG_LEN (16) +-static irqreturn_t iommu_page_fault(int irq, void *dev_id) +-{ +- struct intel_iommu *iommu = dev_id; +- int reg, fault_index; +- u32 fault_status; +- unsigned long flag; +- +- spin_lock_irqsave(&iommu->register_lock, flag); +- fault_status = readl(iommu->reg + DMAR_FSTS_REG); +- +- /* TBD: ignore advanced fault log currently */ +- if (!(fault_status & DMA_FSTS_PPF)) +- goto clear_overflow; +- +- fault_index = dma_fsts_fault_record_index(fault_status); +- reg = cap_fault_reg_offset(iommu->cap); +- while (1) { +- u8 fault_reason; +- u16 source_id; +- u64 guest_addr; +- int type; +- u32 data; +- +- /* highest 32 bits */ +- data = readl(iommu->reg + reg + +- fault_index * PRIMARY_FAULT_REG_LEN + 12); +- if (!(data & DMA_FRCD_F)) +- break; +- +- fault_reason = dma_frcd_fault_reason(data); +- type = dma_frcd_type(data); +- +- data = readl(iommu->reg + reg + +- fault_index * PRIMARY_FAULT_REG_LEN + 8); +- source_id = dma_frcd_source_id(data); +- +- guest_addr = dmar_readq(iommu->reg + reg + +- fault_index * PRIMARY_FAULT_REG_LEN); +- guest_addr = dma_frcd_page_addr(guest_addr); +- /* clear the fault */ +- writel(DMA_FRCD_F, iommu->reg + reg + +- fault_index * PRIMARY_FAULT_REG_LEN + 12); +- +- spin_unlock_irqrestore(&iommu->register_lock, flag); +- +- iommu_page_fault_do_one(iommu, type, fault_reason, +- source_id, guest_addr); +- +- fault_index++; +- if (fault_index > cap_num_fault_regs(iommu->cap)) +- fault_index = 0; +- spin_lock_irqsave(&iommu->register_lock, flag); +- } +-clear_overflow: +- /* clear primary fault overflow */ +- fault_status = readl(iommu->reg + DMAR_FSTS_REG); +- if (fault_status & DMA_FSTS_PFO) +- writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG); +- +- spin_unlock_irqrestore(&iommu->register_lock, flag); +- return IRQ_HANDLED; +-} +- +-int dmar_set_interrupt(struct intel_iommu *iommu) +-{ +- int irq, ret; +- +- irq = create_irq(); +- if (!irq) { +- printk(KERN_ERR "IOMMU: no free vectors\n"); +- return -EINVAL; +- } +- +- set_irq_data(irq, iommu); +- iommu->irq = irq; +- +- ret = arch_setup_dmar_msi(irq); +- if (ret) { +- set_irq_data(irq, NULL); +- iommu->irq = 0; +- destroy_irq(irq); +- return 0; +- } +- +- /* Force fault register is cleared */ +- iommu_page_fault(irq, iommu); +- +- ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu); +- if (ret) +- printk(KERN_ERR "IOMMU: can't request irq\n"); +- return ret; +-} + + static int iommu_init_domains(struct intel_iommu *iommu) + { +@@ -1993,7 +1805,7 @@ static int __init init_dmars(void) + struct dmar_rmrr_unit *rmrr; + struct pci_dev *pdev; + struct intel_iommu *iommu; +- int i, ret, unit = 0; ++ int i, ret; + + /* + * for each drhd +@@ -2049,11 +1861,40 @@ static int __init init_dmars(void) + } + } + ++ /* ++ * Start from the sane iommu hardware state. ++ */ ++ for_each_drhd_unit(drhd) { ++ if (drhd->ignored) ++ continue; ++ ++ iommu = drhd->iommu; ++ ++ /* ++ * If the queued invalidation is already initialized by us ++ * (for example, while enabling interrupt-remapping) then ++ * we got the things already rolling from a sane state. ++ */ ++ if (iommu->qi) ++ continue; ++ ++ /* ++ * Clear any previous faults. ++ */ ++ dmar_fault(-1, iommu); ++ /* ++ * Disable queued invalidation if supported and already enabled ++ * before OS handover. 
++ */ ++ dmar_disable_qi(iommu); ++ } ++ + for_each_drhd_unit(drhd) { + if (drhd->ignored) + continue; + + iommu = drhd->iommu; ++ + if (dmar_enable_qi(iommu)) { + /* + * Queued Invalidate not enabled, use Register Based +@@ -2115,7 +1956,6 @@ static int __init init_dmars(void) + if (drhd->ignored) + continue; + iommu = drhd->iommu; +- sprintf (iommu->name, "dmar%d", unit++); + + iommu_flush_write_buffer(iommu); + +@@ -2290,11 +2130,13 @@ error: + return 0; + } + +-dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr, +- size_t size, int dir) ++static dma_addr_t intel_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { +- return __intel_map_single(hwdev, paddr, size, dir, +- to_pci_dev(hwdev)->dma_mask); ++ return __intel_map_single(dev, page_to_phys(page) + offset, size, ++ dir, to_pci_dev(dev)->dma_mask); + } + + static void flush_unmaps(void) +@@ -2358,8 +2200,9 @@ static void add_unmap(struct dmar_domain + spin_unlock_irqrestore(&async_umap_flush_lock, flags); + } + +-void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, +- int dir) ++static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct pci_dev *pdev = to_pci_dev(dev); + struct dmar_domain *domain; +@@ -2403,8 +2246,14 @@ void intel_unmap_single(struct device *d + } + } + +-void *intel_alloc_coherent(struct device *hwdev, size_t size, +- dma_addr_t *dma_handle, gfp_t flags) ++static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, ++ int dir) ++{ ++ intel_unmap_page(dev, dev_addr, size, dir, NULL); ++} ++ ++static void *intel_alloc_coherent(struct device *hwdev, size_t size, ++ dma_addr_t *dma_handle, gfp_t flags) + { + void *vaddr; + int order; +@@ -2427,8 +2276,8 @@ void *intel_alloc_coherent(struct device + return NULL; + } + +-void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, +- dma_addr_t dma_handle) ++static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, ++ dma_addr_t dma_handle) + { + int order; + +@@ -2441,8 +2290,9 @@ void intel_free_coherent(struct device * + + #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg))) + +-void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, +- int nelems, int dir) ++static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + int i; + struct pci_dev *pdev = to_pci_dev(hwdev); +@@ -2499,8 +2349,8 @@ static int intel_nontranslate_map_sg(str + return nelems; + } + +-int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, +- int dir) ++static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, ++ enum dma_data_direction dir, struct dma_attrs *attrs) + { + void *addr; + int i; +@@ -2580,13 +2430,19 @@ int intel_map_sg(struct device *hwdev, s + return nelems; + } + +-static struct dma_mapping_ops intel_dma_ops = { ++static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr) ++{ ++ return !dma_addr; ++} ++ ++struct dma_map_ops intel_dma_ops = { + .alloc_coherent = intel_alloc_coherent, + .free_coherent = intel_free_coherent, +- .map_single = intel_map_single, +- .unmap_single = intel_unmap_single, + .map_sg = intel_map_sg, + .unmap_sg = intel_unmap_sg, ++ .map_page = intel_map_page, ++ .unmap_page = intel_unmap_page, ++ 
.mapping_error = intel_mapping_error, + }; + + static inline int iommu_domain_cache_init(void) +Index: linux-2.6-tip/drivers/pci/intr_remapping.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/intr_remapping.c ++++ linux-2.6-tip/drivers/pci/intr_remapping.c +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + #include + #include "intr_remapping.h" + +@@ -20,7 +21,7 @@ struct irq_2_iommu { + u8 irte_mask; + }; + +-#ifdef CONFIG_SPARSE_IRQ ++#ifdef CONFIG_GENERIC_HARDIRQS + static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu) + { + struct irq_2_iommu *iommu; +@@ -116,21 +117,22 @@ int get_irte(int irq, struct irte *entry + { + int index; + struct irq_2_iommu *irq_iommu; ++ unsigned long flags; + + if (!entry) + return -1; + +- spin_lock(&irq_2_ir_lock); ++ spin_lock_irqsave(&irq_2_ir_lock, flags); + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return -1; + } + + index = irq_iommu->irte_index + irq_iommu->sub_handle; + *entry = *(irq_iommu->iommu->ir_table->base + index); + +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return 0; + } + +@@ -140,6 +142,7 @@ int alloc_irte(struct intel_iommu *iommu + struct irq_2_iommu *irq_iommu; + u16 index, start_index; + unsigned int mask = 0; ++ unsigned long flags; + int i; + + if (!count) +@@ -169,7 +172,7 @@ int alloc_irte(struct intel_iommu *iommu + return -1; + } + +- spin_lock(&irq_2_ir_lock); ++ spin_lock_irqsave(&irq_2_ir_lock, flags); + do { + for (i = index; i < index + count; i++) + if (table->base[i].present) +@@ -181,7 +184,7 @@ int alloc_irte(struct intel_iommu *iommu + index = (index + count) % INTR_REMAP_TABLE_ENTRIES; + + if (index == start_index) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + printk(KERN_ERR "can't allocate an IRTE\n"); + return -1; + } +@@ -192,7 +195,7 @@ int alloc_irte(struct intel_iommu *iommu + + irq_iommu = irq_2_iommu_alloc(irq); + if (!irq_iommu) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + printk(KERN_ERR "can't allocate irq_2_iommu\n"); + return -1; + } +@@ -202,7 +205,7 @@ int alloc_irte(struct intel_iommu *iommu + irq_iommu->sub_handle = 0; + irq_iommu->irte_mask = mask; + +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return index; + } +@@ -222,30 +225,32 @@ int map_irq_to_irte_handle(int irq, u16 + { + int index; + struct irq_2_iommu *irq_iommu; ++ unsigned long flags; + +- spin_lock(&irq_2_ir_lock); ++ spin_lock_irqsave(&irq_2_ir_lock, flags); + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return -1; + } + + *sub_handle = irq_iommu->sub_handle; + index = irq_iommu->irte_index; +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return index; + } + + int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) + { + struct irq_2_iommu *irq_iommu; ++ unsigned long flags; + +- spin_lock(&irq_2_ir_lock); ++ spin_lock_irqsave(&irq_2_ir_lock, flags); + + irq_iommu = irq_2_iommu_alloc(irq); + + if (!irq_iommu) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + printk(KERN_ERR "can't allocate irq_2_iommu\n"); + return -1; + } +@@ -255,7 +260,7 @@ int set_irte_irq(int irq, struct intel_i + irq_iommu->sub_handle = 
subhandle; + irq_iommu->irte_mask = 0; + +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return 0; + } +@@ -263,11 +268,12 @@ int set_irte_irq(int irq, struct intel_i + int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index) + { + struct irq_2_iommu *irq_iommu; ++ unsigned long flags; + +- spin_lock(&irq_2_ir_lock); ++ spin_lock_irqsave(&irq_2_ir_lock, flags); + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return -1; + } + +@@ -276,7 +282,7 @@ int clear_irte_irq(int irq, struct intel + irq_iommu->sub_handle = 0; + irq_2_iommu(irq)->irte_mask = 0; + +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return 0; + } +@@ -288,11 +294,12 @@ int modify_irte(int irq, struct irte *ir + struct irte *irte; + struct intel_iommu *iommu; + struct irq_2_iommu *irq_iommu; ++ unsigned long flags; + +- spin_lock(&irq_2_ir_lock); ++ spin_lock_irqsave(&irq_2_ir_lock, flags); + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return -1; + } + +@@ -301,11 +308,11 @@ int modify_irte(int irq, struct irte *ir + index = irq_iommu->irte_index + irq_iommu->sub_handle; + irte = &iommu->ir_table->base[index]; + +- set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1)); ++ set_64bit((unsigned long *)irte, irte_modified->low); + __iommu_flush_cache(iommu, irte, sizeof(*irte)); + + rc = qi_flush_iec(iommu, index, 0); +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return rc; + } +@@ -316,11 +323,12 @@ int flush_irte(int irq) + int index; + struct intel_iommu *iommu; + struct irq_2_iommu *irq_iommu; ++ unsigned long flags; + +- spin_lock(&irq_2_ir_lock); ++ spin_lock_irqsave(&irq_2_ir_lock, flags); + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return -1; + } + +@@ -329,7 +337,7 @@ int flush_irte(int irq) + index = irq_iommu->irte_index + irq_iommu->sub_handle; + + rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask); +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return rc; + } +@@ -362,11 +370,12 @@ int free_irte(int irq) + struct irte *irte; + struct intel_iommu *iommu; + struct irq_2_iommu *irq_iommu; ++ unsigned long flags; + +- spin_lock(&irq_2_ir_lock); ++ spin_lock_irqsave(&irq_2_ir_lock, flags); + irq_iommu = valid_irq_2_iommu(irq); + if (!irq_iommu) { +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return -1; + } + +@@ -377,7 +386,7 @@ int free_irte(int irq) + + if (!irq_iommu->sub_handle) { + for (i = 0; i < (1 << irq_iommu->irte_mask); i++) +- set_64bit((unsigned long *)irte, 0); ++ set_64bit((unsigned long *)(irte + i), 0); + rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask); + } + +@@ -386,7 +395,7 @@ int free_irte(int irq) + irq_iommu->sub_handle = 0; + irq_iommu->irte_mask = 0; + +- spin_unlock(&irq_2_ir_lock); ++ spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return rc; + } +@@ -438,12 +447,12 @@ static int setup_intr_remapping(struct i + struct page *pages; + + ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table), +- GFP_KERNEL); ++ GFP_ATOMIC); + + if (!iommu->ir_table) + return -ENOMEM; + +- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, INTR_REMAP_PAGE_ORDER); ++ pages = alloc_pages(GFP_ATOMIC | __GFP_ZERO, 
INTR_REMAP_PAGE_ORDER); + + if (!pages) { + printk(KERN_ERR "failed to allocate pages of order %d\n", +@@ -458,11 +467,55 @@ static int setup_intr_remapping(struct i + return 0; + } + ++/* ++ * Disable Interrupt Remapping. ++ */ ++static void disable_intr_remapping(struct intel_iommu *iommu) ++{ ++ unsigned long flags; ++ u32 sts; ++ ++ if (!ecap_ir_support(iommu->ecap)) ++ return; ++ ++ spin_lock_irqsave(&iommu->register_lock, flags); ++ ++ sts = dmar_readq(iommu->reg + DMAR_GSTS_REG); ++ if (!(sts & DMA_GSTS_IRES)) ++ goto end; ++ ++ iommu->gcmd &= ~DMA_GCMD_IRE; ++ writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ++ ++ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, ++ readl, !(sts & DMA_GSTS_IRES), sts); ++ ++end: ++ spin_unlock_irqrestore(&iommu->register_lock, flags); ++} ++ + int __init enable_intr_remapping(int eim) + { + struct dmar_drhd_unit *drhd; + int setup = 0; + ++ for_each_drhd_unit(drhd) { ++ struct intel_iommu *iommu = drhd->iommu; ++ ++ /* ++ * Clear previous faults. ++ */ ++ dmar_fault(-1, iommu); ++ ++ /* ++ * Disable intr remapping and queued invalidation, if already ++ * enabled prior to OS handover. ++ */ ++ disable_intr_remapping(iommu); ++ ++ dmar_disable_qi(iommu); ++ } ++ + /* + * check for the Interrupt-remapping support + */ +Index: linux-2.6-tip/drivers/pci/pci-driver.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/pci-driver.c ++++ linux-2.6-tip/drivers/pci/pci-driver.c +@@ -212,10 +212,9 @@ static int pci_call_probe(struct pci_dri + node = dev_to_node(&dev->dev); + if (node >= 0) { + int cpu; +- node_to_cpumask_ptr(nodecpumask, node); + + get_online_cpus(); +- cpu = cpumask_any_and(nodecpumask, cpu_online_mask); ++ cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); + if (cpu < nr_cpu_ids) + error = work_on_cpu(cpu, local_pci_probe, &ddi); + else +Index: linux-2.6-tip/drivers/pci/search.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/search.c ++++ linux-2.6-tip/drivers/pci/search.c +@@ -277,8 +277,12 @@ static struct pci_dev *pci_get_dev_by_id + match_pci_dev_by_id); + if (dev) + pdev = to_pci_dev(dev); ++ ++ /* ++ * FIXME: take the cast off, when pci_dev_put() is made const: ++ */ + if (from) +- pci_dev_put(from); ++ pci_dev_put((struct pci_dev *)from); + return pdev; + } + +Index: linux-2.6-tip/drivers/platform/x86/fujitsu-laptop.c +=================================================================== +--- linux-2.6-tip.orig/drivers/platform/x86/fujitsu-laptop.c ++++ linux-2.6-tip/drivers/platform/x86/fujitsu-laptop.c +@@ -1301,4 +1301,4 @@ static struct pnp_device_id pnp_ids[] = + {.id = ""} + }; + +-MODULE_DEVICE_TABLE(pnp, pnp_ids); ++MODULE_STATIC_DEVICE_TABLE(pnp, pnp_ids); +Index: linux-2.6-tip/drivers/platform/x86/toshiba_acpi.c +=================================================================== +--- linux-2.6-tip.orig/drivers/platform/x86/toshiba_acpi.c ++++ linux-2.6-tip/drivers/platform/x86/toshiba_acpi.c +@@ -729,8 +729,8 @@ static int __init toshiba_acpi_init(void + { + acpi_status status = AE_OK; + u32 hci_result; +- bool bt_present; +- bool bt_on; ++ bool uninitialized_var(bt_present); ++ bool uninitialized_var(bt_on); + bool radio_on; + int ret = 0; + +Index: linux-2.6-tip/drivers/pnp/pnpbios/core.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pnp/pnpbios/core.c ++++ linux-2.6-tip/drivers/pnp/pnpbios/core.c +@@ -573,6 +573,8 @@ static int __init pnpbios_init(void) + + 
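/*
 * [Editorial note, not part of the patch hunks around it] The intr_remapping.c
 * changes above convert bare spin_lock()/spin_unlock() on irq_2_ir_lock into
 * the irqsave/irqrestore variants. A minimal sketch of why that matters when a
 * lock can also be taken from interrupt context; my_lock/my_table/my_* are
 * illustrative names, not taken from the patch:
 */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);
static int my_table[16];

/* May run in hard-irq context. */
void my_update_from_irq(int idx, int val)
{
	unsigned long flags;

	spin_lock_irqsave(&my_lock, flags);
	my_table[idx] = val;
	spin_unlock_irqrestore(&my_lock, flags);
}

/* Runs in process context. */
int my_read(int idx)
{
	unsigned long flags;
	int val;

	/*
	 * A plain spin_lock() here could deadlock: if the interrupt fires on
	 * this CPU while the lock is held, the handler spins on a lock its
	 * own CPU owns. Disabling local interrupts for the critical section
	 * closes that window, which is what the irqsave conversion buys.
	 */
	spin_lock_irqsave(&my_lock, flags);
	val = my_table[idx];
	spin_unlock_irqrestore(&my_lock, flags);

	return val;
}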
fs_initcall(pnpbios_init); + ++#ifdef CONFIG_HOTPLUG ++ + static int __init pnpbios_thread_init(void) + { + struct task_struct *task; +@@ -583,16 +585,18 @@ static int __init pnpbios_thread_init(vo + #endif + if (pnpbios_disabled) + return 0; +-#ifdef CONFIG_HOTPLUG ++ + init_completion(&unload_sem); + task = kthread_run(pnp_dock_thread, NULL, "kpnpbiosd"); + if (!IS_ERR(task)) + unloading = 0; +-#endif ++ + return 0; + } + + /* Start the kernel thread later: */ + module_init(pnpbios_thread_init); + ++#endif ++ + EXPORT_SYMBOL(pnpbios_protocol); +Index: linux-2.6-tip/drivers/scsi/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/Kconfig ++++ linux-2.6-tip/drivers/scsi/Kconfig +@@ -608,6 +608,7 @@ config SCSI_FLASHPOINT + config LIBFC + tristate "LibFC module" + select SCSI_FC_ATTRS ++ select CRC32 + ---help--- + Fibre Channel library module + +Index: linux-2.6-tip/drivers/scsi/advansys.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/advansys.c ++++ linux-2.6-tip/drivers/scsi/advansys.c +@@ -68,7 +68,9 @@ + * 7. advansys_info is not safe against multiple simultaneous callers + * 8. Add module_param to override ISA/VLB ioport array + */ +-#warning this driver is still not properly converted to the DMA API ++#ifdef CONFIG_ALLOW_WARNINGS ++# warning this driver is still not properly converted to the DMA API ++#endif + + /* Enable driver /proc statistics. */ + #define ADVANSYS_STATS +@@ -10516,7 +10518,7 @@ AscSendScsiQueue(ASC_DVC_VAR *asc_dvc, A + { + PortAddr iop_base; + uchar free_q_head; +- uchar next_qp; ++ uchar uninitialized_var(next_qp); + uchar tid_no; + uchar target_ix; + int sta; +@@ -10945,7 +10947,7 @@ static int asc_execute_scsi_cmnd(struct + err_code = asc_dvc->err_code; + } else { + ADV_DVC_VAR *adv_dvc = &boardp->dvc_var.adv_dvc_var; +- ADV_SCSI_REQ_Q *adv_scsiqp; ++ ADV_SCSI_REQ_Q *uninitialized_var(adv_scsiqp); + + switch (adv_build_req(boardp, scp, &adv_scsiqp)) { + case ASC_NOERROR: +@@ -13877,7 +13879,9 @@ static int __devinit advansys_board_foun + #endif + err_free_proc: + kfree(boardp->prtbuf); ++#ifdef CONFIG_PROC_FS + err_unmap: ++#endif + if (boardp->ioremap_addr) + iounmap(boardp->ioremap_addr); + err_shost: +Index: linux-2.6-tip/drivers/scsi/dpt_i2o.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/dpt_i2o.c ++++ linux-2.6-tip/drivers/scsi/dpt_i2o.c +@@ -183,7 +183,7 @@ static struct pci_device_id dptids[] = { + { PCI_DPT_VENDOR_ID, PCI_DPT_RAPTOR_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,}, + { 0, } + }; +-MODULE_DEVICE_TABLE(pci,dptids); ++MODULE_STATIC_DEVICE_TABLE(pci,dptids); + + static int adpt_detect(struct scsi_host_template* sht) + { +Index: linux-2.6-tip/drivers/scsi/dtc.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/dtc.c ++++ linux-2.6-tip/drivers/scsi/dtc.c +@@ -165,36 +165,6 @@ static const struct signature { + + #define NO_SIGNATURES ARRAY_SIZE(signatures) + +-#ifndef MODULE +-/* +- * Function : dtc_setup(char *str, int *ints) +- * +- * Purpose : LILO command line initialization of the overrides array, +- * +- * Inputs : str - unused, ints - array of integer parameters with ints[0] +- * equal to the number of ints. 
+- * +- */ +- +-static void __init dtc_setup(char *str, int *ints) +-{ +- static int commandline_current = 0; +- int i; +- if (ints[0] != 2) +- printk("dtc_setup: usage dtc=address,irq\n"); +- else if (commandline_current < NO_OVERRIDES) { +- overrides[commandline_current].address = ints[1]; +- overrides[commandline_current].irq = ints[2]; +- for (i = 0; i < NO_BASES; ++i) +- if (bases[i].address == ints[1]) { +- bases[i].noauto = 1; +- break; +- } +- ++commandline_current; +- } +-} +-#endif +- + /* + * Function : int dtc_detect(struct scsi_host_template * tpnt) + * +Index: linux-2.6-tip/drivers/scsi/fdomain.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/fdomain.c ++++ linux-2.6-tip/drivers/scsi/fdomain.c +@@ -1774,7 +1774,7 @@ static struct pci_device_id fdomain_pci_ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, + { } + }; +-MODULE_DEVICE_TABLE(pci, fdomain_pci_tbl); ++MODULE_STATIC_DEVICE_TABLE(pci, fdomain_pci_tbl); + #endif + #define driver_template fdomain_driver_template + #include "scsi_module.c" +Index: linux-2.6-tip/drivers/scsi/g_NCR5380.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/g_NCR5380.c ++++ linux-2.6-tip/drivers/scsi/g_NCR5380.c +@@ -938,18 +938,6 @@ module_param(ncr_53c400a, int, 0); + module_param(dtc_3181e, int, 0); + MODULE_LICENSE("GPL"); + +- +-static struct isapnp_device_id id_table[] __devinitdata = { +- { +- ISAPNP_ANY_ID, ISAPNP_ANY_ID, +- ISAPNP_VENDOR('D', 'T', 'C'), ISAPNP_FUNCTION(0x436e), +- 0}, +- {0} +-}; +- +-MODULE_DEVICE_TABLE(isapnp, id_table); +- +- + __setup("ncr5380=", do_NCR5380_setup); + __setup("ncr53c400=", do_NCR53C400_setup); + __setup("ncr53c400a=", do_NCR53C400A_setup); +Index: linux-2.6-tip/drivers/scsi/initio.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/initio.c ++++ linux-2.6-tip/drivers/scsi/initio.c +@@ -136,7 +136,7 @@ static struct pci_device_id i91u_pci_dev + { PCI_VENDOR_ID_DOMEX, I920_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + { } + }; +-MODULE_DEVICE_TABLE(pci, i91u_pci_devices); ++MODULE_STATIC_DEVICE_TABLE(pci, i91u_pci_devices); + + #define DEBUG_INTERRUPT 0 + #define DEBUG_QUEUE 0 +Index: linux-2.6-tip/drivers/scsi/lpfc/lpfc_els.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/lpfc/lpfc_els.c ++++ linux-2.6-tip/drivers/scsi/lpfc/lpfc_els.c +@@ -3968,7 +3968,8 @@ lpfc_els_rcv_rscn(struct lpfc_vport *vpo + struct lpfc_dmabuf *pcmd; + uint32_t *lp, *datap; + IOCB_t *icmd; +- uint32_t payload_len, length, nportid, *cmd; ++ uint32_t payload_len, uninitialized_var(length), nportid, ++ *uninitialized_var(cmd); + int rscn_cnt; + int rscn_id = 0, hba_id = 0; + int i; +Index: linux-2.6-tip/drivers/scsi/megaraid/megaraid_mm.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/megaraid/megaraid_mm.c ++++ linux-2.6-tip/drivers/scsi/megaraid/megaraid_mm.c +@@ -117,7 +117,7 @@ mraid_mm_ioctl(struct inode *inode, stru + int rval; + mraid_mmadp_t *adp; + uint8_t old_ioctl; +- int drvrcmd_rval; ++ int uninitialized_var(drvrcmd_rval); + void __user *argp = (void __user *)arg; + + /* +Index: linux-2.6-tip/drivers/scsi/ncr53c8xx.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/ncr53c8xx.c ++++ linux-2.6-tip/drivers/scsi/ncr53c8xx.c +@@ -8295,7 +8295,7 @@ __setup("ncr53c8xx=", ncr53c8xx_setup); + 
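/*
 * [Editorial note, not part of the patch hunks around it] A large share of the
 * driver hunks in this series (advansys, lpfc, megaraid_mm, ncr53c8xx, the USB
 * serial drivers, and others) wrap locals in uninitialized_var(). In this
 * kernel generation the macro is defined in include/linux/compiler-gcc.h
 * roughly as "#define uninitialized_var(x) x = x": it silences a false
 * "may be used uninitialized" warning without emitting a real initializer.
 * A minimal sketch of the usual false-positive shape, with hypothetical names:
 */
#include <linux/compiler.h>
#include <linux/errno.h>

int my_find(const int *table, int n, int key)
{
	int uninitialized_var(val);
	int i, found = 0;

	for (i = 0; i < n; i++) {
		if (table[i] == key) {
			val = i;
			found = 1;
			break;
		}
	}
	if (!found)
		return -ENOENT;

	/*
	 * val is assigned whenever found is set, but gcc cannot always prove
	 * that, so without the annotation it may warn here.
	 */
	return val;
}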
struct Scsi_Host * __init ncr_attach(struct scsi_host_template *tpnt, + int unit, struct ncr_device *device) + { +- struct host_data *host_data; ++ struct host_data *uninitialized_var(host_data); + struct ncb *np = NULL; + struct Scsi_Host *instance = NULL; + u_long flags = 0; +Index: linux-2.6-tip/drivers/scsi/qla4xxx/ql4_mbx.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/qla4xxx/ql4_mbx.c ++++ linux-2.6-tip/drivers/scsi/qla4xxx/ql4_mbx.c +@@ -867,7 +867,7 @@ int qla4xxx_send_tgts(struct scsi_qla_ho + { + struct dev_db_entry *fw_ddb_entry; + dma_addr_t fw_ddb_entry_dma; +- uint32_t ddb_index; ++ uint32_t uninitialized_var(ddb_index); + int ret_val = QLA_SUCCESS; + + +Index: linux-2.6-tip/drivers/scsi/scsi_lib.c +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/scsi_lib.c ++++ linux-2.6-tip/drivers/scsi/scsi_lib.c +@@ -703,71 +703,6 @@ void scsi_run_host_queues(struct Scsi_Ho + + static void __scsi_release_buffers(struct scsi_cmnd *, int); + +-/* +- * Function: scsi_end_request() +- * +- * Purpose: Post-processing of completed commands (usually invoked at end +- * of upper level post-processing and scsi_io_completion). +- * +- * Arguments: cmd - command that is complete. +- * error - 0 if I/O indicates success, < 0 for I/O error. +- * bytes - number of bytes of completed I/O +- * requeue - indicates whether we should requeue leftovers. +- * +- * Lock status: Assumed that lock is not held upon entry. +- * +- * Returns: cmd if requeue required, NULL otherwise. +- * +- * Notes: This is called for block device requests in order to +- * mark some number of sectors as complete. +- * +- * We are guaranteeing that the request queue will be goosed +- * at some point during this call. +- * Notes: If cmd was requeued, upon return it will be a stale pointer. +- */ +-static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, +- int bytes, int requeue) +-{ +- struct request_queue *q = cmd->device->request_queue; +- struct request *req = cmd->request; +- +- /* +- * If there are blocks left over at the end, set up the command +- * to queue the remainder of them. +- */ +- if (blk_end_request(req, error, bytes)) { +- int leftover = (req->hard_nr_sectors << 9); +- +- if (blk_pc_request(req)) +- leftover = req->data_len; +- +- /* kill remainder if no retrys */ +- if (error && scsi_noretry_cmd(cmd)) +- blk_end_request(req, error, leftover); +- else { +- if (requeue) { +- /* +- * Bleah. Leftovers again. Stick the +- * leftovers in the front of the +- * queue, and goose the queue again. +- */ +- scsi_release_buffers(cmd); +- scsi_requeue_command(q, cmd); +- cmd = NULL; +- } +- return cmd; +- } +- } +- +- /* +- * This will goose the queue request function at the end, so we don't +- * need to worry about launching another command. 
+- */ +- __scsi_release_buffers(cmd, 0); +- scsi_next_command(cmd); +- return NULL; +-} +- + static inline unsigned int scsi_sgtable_index(unsigned short nents) + { + unsigned int index; +@@ -929,7 +864,6 @@ static void scsi_end_bidi_request(struct + void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) + { + int result = cmd->result; +- int this_count; + struct request_queue *q = cmd->device->request_queue; + struct request *req = cmd->request; + int error = 0; +@@ -980,24 +914,37 @@ void scsi_io_completion(struct scsi_cmnd + SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " + "%d bytes done.\n", + req->nr_sectors, good_bytes)); +- +- /* A number of bytes were successfully read. If there +- * are leftovers and there is some kind of error +- * (result != 0), retry the rest. +- */ +- if (scsi_end_request(cmd, error, good_bytes, result == 0) == NULL) ++ if (blk_end_request(req, error, good_bytes) == 0) { ++ /* This request is completely finished; start the next one */ ++ __scsi_release_buffers(cmd, 0); ++ scsi_next_command(cmd); + return; +- this_count = blk_rq_bytes(req); +- +- error = -EIO; ++ } + +- if (host_byte(result) == DID_RESET) { ++ /* The request isn't finished yet. Figure out what to do next. */ ++ if (result == 0) { ++ /* No error, so carry out the remainder of the request. ++ * Failure to make forward progress counts against the ++ * the number of retries. ++ */ ++ if (good_bytes > 0 || --req->retries >= 0) ++ action = ACTION_REPREP; ++ else { ++ description = "Retries exhausted"; ++ action = ACTION_FAIL; ++ error = -EIO; ++ } ++ } else if (error && scsi_noretry_cmd(cmd)) { ++ /* Retrys are disallowed, so kill the remainder. */ ++ action = ACTION_FAIL; ++ } else if (host_byte(result) == DID_RESET) { + /* Third party bus reset or reset for error recovery + * reasons. Just retry the command and see what + * happens. + */ + action = ACTION_RETRY; + } else if (sense_valid && !sense_deferred) { ++ error = -EIO; + switch (sshdr.sense_key) { + case UNIT_ATTENTION: + if (cmd->device->removable) { +@@ -1097,7 +1044,7 @@ void scsi_io_completion(struct scsi_cmnd + if (driver_byte(result) & DRIVER_SENSE) + scsi_print_sense("", cmd); + } +- blk_end_request(req, -EIO, blk_rq_bytes(req)); ++ blk_end_request(req, error, blk_rq_bytes(req)); + scsi_next_command(cmd); + break; + case ACTION_REPREP: +Index: linux-2.6-tip/drivers/telephony/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/telephony/Kconfig ++++ linux-2.6-tip/drivers/telephony/Kconfig +@@ -20,6 +20,8 @@ if PHONE + config PHONE_IXJ + tristate "QuickNet Internet LineJack/PhoneJack support" + depends on ISA || PCI ++ # build breakage, config-Sat_Jul_19_00_58_16_CEST_2008.bad ++ depends on 0 + ---help--- + Say M if you have a telephony card manufactured by Quicknet + Technologies, Inc. 
These include the Internet PhoneJACK and +Index: linux-2.6-tip/drivers/telephony/ixj.c +=================================================================== +--- linux-2.6-tip.orig/drivers/telephony/ixj.c ++++ linux-2.6-tip/drivers/telephony/ixj.c +@@ -288,7 +288,7 @@ static struct pci_device_id ixj_pci_tbl[ + { } + }; + +-MODULE_DEVICE_TABLE(pci, ixj_pci_tbl); ++MODULE_STATIC_DEVICE_TABLE(pci, ixj_pci_tbl); + + /************************************************************************ + * +Index: linux-2.6-tip/drivers/usb/atm/ueagle-atm.c +=================================================================== +--- linux-2.6-tip.orig/drivers/usb/atm/ueagle-atm.c ++++ linux-2.6-tip/drivers/usb/atm/ueagle-atm.c +@@ -1427,7 +1427,7 @@ static int uea_stat_e1(struct uea_softc + static int uea_stat_e4(struct uea_softc *sc) + { + u32 data; +- u32 tmp_arr[2]; ++ u32 tmp_arr[2] = { 0, }; + int ret; + + uea_enters(INS_TO_USBDEV(sc)); +Index: linux-2.6-tip/drivers/usb/gadget/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/usb/gadget/Kconfig ++++ linux-2.6-tip/drivers/usb/gadget/Kconfig +@@ -15,6 +15,9 @@ + + menuconfig USB_GADGET + tristate "USB Gadget Support" ++ # crashes on titan with: ++ # http://redhat.com/~mingo/misc/config-Tue_Jul_22_13_44_45_CEST_2008.bad ++ depends on 0 + help + USB is a master/slave protocol, organized with one master + host (such as a PC) controlling up to 127 peripheral devices. +Index: linux-2.6-tip/drivers/usb/host/Kconfig +=================================================================== +--- linux-2.6-tip.orig/drivers/usb/host/Kconfig ++++ linux-2.6-tip/drivers/usb/host/Kconfig +@@ -329,6 +329,8 @@ config USB_WHCI_HCD + tristate "Wireless USB Host Controller Interface (WHCI) driver (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on PCI && USB ++ depends on 0 ++ + select USB_WUSB + select UWB_WHCI + help +Index: linux-2.6-tip/drivers/usb/serial/io_edgeport.c +=================================================================== +--- linux-2.6-tip.orig/drivers/usb/serial/io_edgeport.c ++++ linux-2.6-tip/drivers/usb/serial/io_edgeport.c +@@ -293,7 +293,7 @@ static void update_edgeport_E2PROM(struc + __u16 BootBuildNumber; + __u32 Bootaddr; + const struct ihex_binrec *rec; +- const struct firmware *fw; ++ const struct firmware *uninitialized_var(fw); + const char *fw_name; + int response; + +@@ -2457,7 +2457,7 @@ static int send_cmd_write_baud_rate(stru + unsigned char *cmdBuffer; + unsigned char *currCmd; + int cmdLen = 0; +- int divisor; ++ int uninitialized_var(divisor); + int status; + unsigned char number = + edge_port->port->number - edge_port->port->serial->minor; +Index: linux-2.6-tip/drivers/usb/serial/keyspan.c +=================================================================== +--- linux-2.6-tip.orig/drivers/usb/serial/keyspan.c ++++ linux-2.6-tip/drivers/usb/serial/keyspan.c +@@ -1345,7 +1345,7 @@ static int keyspan_fake_startup(struct u + int response; + const struct ihex_binrec *record; + char *fw_name; +- const struct firmware *fw; ++ const struct firmware *uninitialized_var(fw); + + dbg("Keyspan startup version %04x product %04x", + le16_to_cpu(serial->dev->descriptor.bcdDevice), +Index: linux-2.6-tip/drivers/usb/serial/keyspan_pda.c +=================================================================== +--- linux-2.6-tip.orig/drivers/usb/serial/keyspan_pda.c ++++ linux-2.6-tip/drivers/usb/serial/keyspan_pda.c +@@ -456,7 +456,7 @@ static int keyspan_pda_tiocmget(struct t + struct usb_serial_port 
*port = tty->driver_data; + struct usb_serial *serial = port->serial; + int rc; +- unsigned char status; ++ unsigned char uninitialized_var(status); + int value; + + rc = keyspan_pda_get_modem_info(serial, &status); +@@ -478,7 +478,7 @@ static int keyspan_pda_tiocmset(struct t + struct usb_serial_port *port = tty->driver_data; + struct usb_serial *serial = port->serial; + int rc; +- unsigned char status; ++ unsigned char uninitialized_var(status); + + rc = keyspan_pda_get_modem_info(serial, &status); + if (rc < 0) +@@ -726,7 +726,7 @@ static int keyspan_pda_fake_startup(stru + int response; + const char *fw_name; + const struct ihex_binrec *record; +- const struct firmware *fw; ++ const struct firmware *uninitialized_var(fw); + + /* download the firmware here ... */ + response = ezusb_set_reset(serial, 1); +Index: linux-2.6-tip/drivers/usb/serial/mos7720.c +=================================================================== +--- linux-2.6-tip.orig/drivers/usb/serial/mos7720.c ++++ linux-2.6-tip/drivers/usb/serial/mos7720.c +@@ -959,7 +959,7 @@ static int send_cmd_write_baud_rate(stru + { + struct usb_serial_port *port; + struct usb_serial *serial; +- int divisor; ++ int uninitialized_var(divisor); + int status; + unsigned char data; + unsigned char number; +Index: linux-2.6-tip/drivers/uwb/i1480/i1480-est.c +=================================================================== +--- linux-2.6-tip.orig/drivers/uwb/i1480/i1480-est.c ++++ linux-2.6-tip/drivers/uwb/i1480/i1480-est.c +@@ -96,4 +96,4 @@ static struct usb_device_id i1480_est_id + { USB_DEVICE(0x8086, 0x0c3b), }, + { }, + }; +-MODULE_DEVICE_TABLE(usb, i1480_est_id_table); ++MODULE_STATIC_DEVICE_TABLE(usb, i1480_est_id_table); +Index: linux-2.6-tip/drivers/uwb/whc-rc.c +=================================================================== +--- linux-2.6-tip.orig/drivers/uwb/whc-rc.c ++++ linux-2.6-tip/drivers/uwb/whc-rc.c +@@ -452,7 +452,7 @@ static struct pci_device_id whcrc_id_tab + { PCI_DEVICE_CLASS(PCI_CLASS_WIRELESS_WHCI, ~0) }, + { /* empty last entry */ } + }; +-MODULE_DEVICE_TABLE(pci, whcrc_id_table); ++MODULE_STATIC_DEVICE_TABLE(pci, whcrc_id_table); + + static struct umc_driver whcrc_driver = { + .name = "whc-rc", +Index: linux-2.6-tip/drivers/uwb/wlp/messages.c +=================================================================== +--- linux-2.6-tip.orig/drivers/uwb/wlp/messages.c ++++ linux-2.6-tip/drivers/uwb/wlp/messages.c +@@ -903,7 +903,7 @@ int wlp_parse_f0(struct wlp *wlp, struct + size_t len = skb->len; + size_t used; + ssize_t result; +- struct wlp_nonce enonce, rnonce; ++ struct wlp_nonce uninitialized_var(enonce), uninitialized_var(rnonce); + enum wlp_assc_error assc_err; + char enonce_buf[WLP_WSS_NONCE_STRSIZE]; + char rnonce_buf[WLP_WSS_NONCE_STRSIZE]; +Index: linux-2.6-tip/drivers/video/aty/atyfb_base.c +=================================================================== +--- linux-2.6-tip.orig/drivers/video/aty/atyfb_base.c ++++ linux-2.6-tip/drivers/video/aty/atyfb_base.c +@@ -430,7 +430,7 @@ static int __devinit correct_chipset(str + u16 type; + u32 chip_id; + const char *name; +- int i; ++ long i; + + for (i = ARRAY_SIZE(aty_chips) - 1; i >= 0; i--) + if (par->pci_id == aty_chips[i].pci_id) +@@ -529,8 +529,10 @@ static int __devinit correct_chipset(str + return 0; + } + ++#if defined(CONFIG_FB_ATY_GX) || defined(CONFIG_FB_ATY_CT) + static char ram_dram[] __devinitdata = "DRAM"; + static char ram_resv[] __devinitdata = "RESV"; ++#endif + #ifdef CONFIG_FB_ATY_GX + static char ram_vram[] __devinitdata = "VRAM"; 
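/*
 * [Editorial note, not part of the patch hunks around it] The atyfb_base.c,
 * mb862xxfb.c and sis_main.c hunks nearby wrap data and helpers that are only
 * referenced from optional sub-drivers in a combined preprocessor guard, so a
 * configuration with every sub-option disabled does not trip "defined but not
 * used" warnings. The shape of the pattern, with hypothetical CONFIG symbols:
 */
#if defined(CONFIG_MY_FEATURE_A) || defined(CONFIG_MY_FEATURE_B)
/* Used only by the A and B paths below. */
static const char my_shared_label[] = "shared";

static int my_shared_helper(void)
{
	return 0;
}
#endif /* CONFIG_MY_FEATURE_A || CONFIG_MY_FEATURE_B */

#ifdef CONFIG_MY_FEATURE_A
static int my_a_init(void)
{
	return my_shared_helper();
}
#endif

#ifdef CONFIG_MY_FEATURE_B
static const char *my_b_name(void)
{
	return my_shared_label;
}
#endif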
+ #endif /* CONFIG_FB_ATY_GX */ +@@ -3860,3 +3862,4 @@ MODULE_PARM_DESC(mode, "Specify resoluti + module_param(nomtrr, bool, 0); + MODULE_PARM_DESC(nomtrr, "bool: disable use of MTRR registers"); + #endif ++ +Index: linux-2.6-tip/drivers/video/matrox/matroxfb_crtc2.c +=================================================================== +--- linux-2.6-tip.orig/drivers/video/matrox/matroxfb_crtc2.c ++++ linux-2.6-tip/drivers/video/matrox/matroxfb_crtc2.c +@@ -262,7 +262,7 @@ static int matroxfb_dh_open(struct fb_in + #define m2info (container_of(info, struct matroxfb_dh_fb_info, fbcon)) + MINFO_FROM(m2info->primary_dev); + +- if (MINFO) { ++ if (MINFO != NULL) { + int err; + + if (ACCESS_FBINFO(dead)) { +@@ -282,7 +282,7 @@ static int matroxfb_dh_release(struct fb + int err = 0; + MINFO_FROM(m2info->primary_dev); + +- if (MINFO) { ++ if (MINFO != NULL) { + err = ACCESS_FBINFO(fbops).fb_release(&ACCESS_FBINFO(fbcon), user); + } + return err; +Index: linux-2.6-tip/drivers/video/mb862xx/mb862xxfb.c +=================================================================== +--- linux-2.6-tip.orig/drivers/video/mb862xx/mb862xxfb.c ++++ linux-2.6-tip/drivers/video/mb862xx/mb862xxfb.c +@@ -85,6 +85,8 @@ static inline unsigned int chan_to_field + return chan << bf->offset; + } + ++#if defined(CONFIG_FB_MB862XX_PCI_GDC) || defined(CONFIG_FB_MB862XX_LIME) ++ + static int mb862xxfb_setcolreg(unsigned regno, + unsigned red, unsigned green, unsigned blue, + unsigned transp, struct fb_info *info) +@@ -458,6 +460,8 @@ static ssize_t mb862xxfb_show_dispregs(s + + static DEVICE_ATTR(dispregs, 0444, mb862xxfb_show_dispregs, NULL); + ++#endif ++ + irqreturn_t mb862xx_intr(int irq, void *dev_id) + { + struct mb862xxfb_par *par = (struct mb862xxfb_par *) dev_id; +Index: linux-2.6-tip/drivers/video/sis/init301.c +=================================================================== +--- linux-2.6-tip.orig/drivers/video/sis/init301.c ++++ linux-2.6-tip/drivers/video/sis/init301.c +@@ -6691,7 +6691,7 @@ SiS_SetGroup2(struct SiS_Private *SiS_Pr + bool newtvphase; + const unsigned char *TimingPoint; + #ifdef SIS315H +- unsigned short resindex, CRT2Index; ++ unsigned short uninitialized_var(resindex), uninitialized_var(CRT2Index); + const struct SiS_Part2PortTbl *CRT2Part2Ptr = NULL; + + if(SiS_Pr->SiS_VBInfo & SetCRT2ToLCDA) return; +Index: linux-2.6-tip/drivers/video/sis/sis_main.c +=================================================================== +--- linux-2.6-tip.orig/drivers/video/sis/sis_main.c ++++ linux-2.6-tip/drivers/video/sis/sis_main.c +@@ -4175,6 +4175,7 @@ sisfb_find_rom(struct pci_dev *pdev) + return myrombase; + } + ++#if defined(CONFIG_FB_SIS_300) || defined(CONFIG_FB_SIS_315) + static void __devinit + sisfb_post_map_vram(struct sis_video_info *ivideo, unsigned int *mapsize, + unsigned int min) +@@ -4197,6 +4198,7 @@ sisfb_post_map_vram(struct sis_video_inf + } + } + } ++#endif + + #ifdef CONFIG_FB_SIS_300 + static int __devinit +Index: linux-2.6-tip/drivers/watchdog/alim1535_wdt.c +=================================================================== +--- linux-2.6-tip.orig/drivers/watchdog/alim1535_wdt.c ++++ linux-2.6-tip/drivers/watchdog/alim1535_wdt.c +@@ -306,7 +306,7 @@ static struct pci_device_id ali_pci_tbl[ + { PCI_VENDOR_ID_AL, 0x1535, PCI_ANY_ID, PCI_ANY_ID,}, + { 0, }, + }; +-MODULE_DEVICE_TABLE(pci, ali_pci_tbl); ++MODULE_STATIC_DEVICE_TABLE(pci, ali_pci_tbl); + + /* + * ali_find_watchdog - find a 1535 and 7101 +Index: linux-2.6-tip/drivers/watchdog/alim7101_wdt.c 
+=================================================================== +--- linux-2.6-tip.orig/drivers/watchdog/alim7101_wdt.c ++++ linux-2.6-tip/drivers/watchdog/alim7101_wdt.c +@@ -427,7 +427,7 @@ static struct pci_device_id alim7101_pci + { } + }; + +-MODULE_DEVICE_TABLE(pci, alim7101_pci_tbl); ++MODULE_STATIC_DEVICE_TABLE(pci, alim7101_pci_tbl); + + MODULE_AUTHOR("Steve Hill"); + MODULE_DESCRIPTION("ALi M7101 PMU Computer Watchdog Timer driver"); +Index: linux-2.6-tip/drivers/watchdog/i6300esb.c +=================================================================== +--- linux-2.6-tip.orig/drivers/watchdog/i6300esb.c ++++ linux-2.6-tip/drivers/watchdog/i6300esb.c +@@ -355,20 +355,6 @@ static struct notifier_block esb_notifie + }; + + /* +- * Data for PCI driver interface +- * +- * This data only exists for exporting the supported +- * PCI ids via MODULE_DEVICE_TABLE. We do not actually +- * register a pci_driver, because someone else might one day +- * want to register another driver on the same PCI id. +- */ +-static struct pci_device_id esb_pci_tbl[] = { +- { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_9), }, +- { 0, }, /* End of list */ +-}; +-MODULE_DEVICE_TABLE(pci, esb_pci_tbl); +- +-/* + * Init & exit routines + */ + +Index: linux-2.6-tip/drivers/watchdog/rdc321x_wdt.c +=================================================================== +--- linux-2.6-tip.orig/drivers/watchdog/rdc321x_wdt.c ++++ linux-2.6-tip/drivers/watchdog/rdc321x_wdt.c +@@ -37,7 +37,7 @@ + #include + #include + +-#include ++#include + + #define RDC_WDT_MASK 0x80000000 /* Mask */ + #define RDC_WDT_EN 0x00800000 /* Enable bit */ +Index: linux-2.6-tip/drivers/watchdog/w83697ug_wdt.c +=================================================================== +--- linux-2.6-tip.orig/drivers/watchdog/w83697ug_wdt.c ++++ linux-2.6-tip/drivers/watchdog/w83697ug_wdt.c +@@ -79,7 +79,7 @@ MODULE_PARM_DESC(nowayout, + (same as EFER) */ + #define WDT_EFDR (WDT_EFIR+1) /* Extended Function Data Register */ + +-static void w83697ug_select_wd_register(void) ++static int w83697ug_select_wd_register(void) + { + unsigned char c; + unsigned char version; +@@ -102,7 +102,7 @@ static void w83697ug_select_wd_register( + + } else { + printk(KERN_ERR PFX "No W83697UG/UF could be found\n"); +- return; ++ return -EIO; + } + + outb_p(0x07, WDT_EFER); /* point to logical device number reg */ +@@ -110,6 +110,8 @@ static void w83697ug_select_wd_register( + outb_p(0x30, WDT_EFER); /* select CR30 */ + c = inb_p(WDT_EFDR); + outb_p(c || 0x01, WDT_EFDR); /* set bit 0 to activate GPIO2 */ ++ ++ return 0; + } + + static void w83697ug_unselect_wd_register(void) +@@ -117,11 +119,12 @@ static void w83697ug_unselect_wd_registe + outb_p(0xAA, WDT_EFER); /* Leave extended function mode */ + } + +-static void w83697ug_init(void) ++static int w83697ug_init(void) + { + unsigned char t; + +- w83697ug_select_wd_register(); ++ if (w83697ug_select_wd_register()) ++ return -EIO; + + outb_p(0xF6, WDT_EFER); /* Select CRF6 */ + t = inb_p(WDT_EFDR); /* read CRF6 */ +@@ -137,6 +140,8 @@ static void w83697ug_init(void) + outb_p(t, WDT_EFDR); /* Write back to CRF5 */ + + w83697ug_unselect_wd_register(); ++ ++ return 0; + } + + static void wdt_ctrl(int timeout) +@@ -347,7 +352,11 @@ static int __init wdt_init(void) + goto out; + } + +- w83697ug_init(); ++ ret = w83697ug_init(); ++ if (ret) { ++ printk(KERN_ERR PFX "init failed\n"); ++ goto unreg_regions; ++ } + + ret = register_reboot_notifier(&wdt_notifier); + if (ret != 0) { +Index: 
linux-2.6-tip/drivers/xen/cpu_hotplug.c +=================================================================== +--- linux-2.6-tip.orig/drivers/xen/cpu_hotplug.c ++++ linux-2.6-tip/drivers/xen/cpu_hotplug.c +@@ -10,7 +10,7 @@ static void enable_hotplug_cpu(int cpu) + if (!cpu_present(cpu)) + arch_register_cpu(cpu); + +- cpu_set(cpu, cpu_present_map); ++ set_cpu_present(cpu, true); + } + + static void disable_hotplug_cpu(int cpu) +@@ -18,7 +18,7 @@ static void disable_hotplug_cpu(int cpu) + if (cpu_present(cpu)) + arch_unregister_cpu(cpu); + +- cpu_clear(cpu, cpu_present_map); ++ set_cpu_present(cpu, false); + } + + static void vcpu_hotplug(unsigned int cpu) +Index: linux-2.6-tip/drivers/xen/events.c +=================================================================== +--- linux-2.6-tip.orig/drivers/xen/events.c ++++ linux-2.6-tip/drivers/xen/events.c +@@ -26,9 +26,11 @@ + #include + #include + #include ++#include + + #include + #include ++#include + #include + #include + #include +@@ -50,36 +52,55 @@ static DEFINE_PER_CPU(int, virq_to_irq[N + /* IRQ <-> IPI mapping */ + static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; + +-/* Packed IRQ information: binding type, sub-type index, and event channel. */ +-struct packed_irq +-{ +- unsigned short evtchn; +- unsigned char index; +- unsigned char type; +-}; +- +-static struct packed_irq irq_info[NR_IRQS]; +- +-/* Binding types. */ +-enum { +- IRQT_UNBOUND, ++/* Interrupt types. */ ++enum xen_irq_type { ++ IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN + }; + +-/* Convenient shorthand for packed representation of an unbound IRQ. */ +-#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) ++/* ++ * Packed IRQ information: ++ * type - enum xen_irq_type ++ * event channel - irq->event channel mapping ++ * cpu - cpu this event channel is bound to ++ * index - type-specific information: ++ * PIRQ - vector, with MSB being "needs EIO" ++ * VIRQ - virq number ++ * IPI - IPI vector ++ * EVTCHN - ++ */ ++struct irq_info ++{ ++ enum xen_irq_type type; /* type */ ++ unsigned short evtchn; /* event channel */ ++ unsigned short cpu; /* cpu bound */ ++ ++ union { ++ unsigned short virq; ++ enum ipi_vector ipi; ++ struct { ++ unsigned short gsi; ++ unsigned short vector; ++ } pirq; ++ } u; ++}; ++ ++static struct irq_info irq_info[NR_IRQS]; + + static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 + }; +-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +-static u8 cpu_evtchn[NR_EVENT_CHANNELS]; +- +-/* Reference counts for bindings to IRQs. */ +-static int irq_bindcount[NR_IRQS]; ++struct cpu_evtchn_s { ++ unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG]; ++}; ++static struct cpu_evtchn_s *cpu_evtchn_mask_p; ++static inline unsigned long *cpu_evtchn_mask(int cpu) ++{ ++ return cpu_evtchn_mask_p[cpu].bits; ++} + + /* Xen will never allocate port zero for any purpose. */ + #define VALID_EVTCHN(chn) ((chn) != 0) +@@ -87,27 +108,108 @@ static int irq_bindcount[NR_IRQS]; + static struct irq_chip xen_dynamic_chip; + + /* Constructor for packed IRQ information. 
*/ +-static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) ++static struct irq_info mk_unbound_info(void) ++{ ++ return (struct irq_info) { .type = IRQT_UNBOUND }; ++} ++ ++static struct irq_info mk_evtchn_info(unsigned short evtchn) ++{ ++ return (struct irq_info) { .type = IRQT_EVTCHN, .evtchn = evtchn, ++ .cpu = 0 }; ++} ++ ++static struct irq_info mk_ipi_info(unsigned short evtchn, enum ipi_vector ipi) + { +- return (struct packed_irq) { evtchn, index, type }; ++ return (struct irq_info) { .type = IRQT_IPI, .evtchn = evtchn, ++ .cpu = 0, .u.ipi = ipi }; ++} ++ ++static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq) ++{ ++ return (struct irq_info) { .type = IRQT_VIRQ, .evtchn = evtchn, ++ .cpu = 0, .u.virq = virq }; ++} ++ ++static struct irq_info mk_pirq_info(unsigned short evtchn, ++ unsigned short gsi, unsigned short vector) ++{ ++ return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn, ++ .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } }; + } + + /* + * Accessors for packed IRQ information. + */ +-static inline unsigned int evtchn_from_irq(int irq) ++static struct irq_info *info_for_irq(unsigned irq) ++{ ++ return &irq_info[irq]; ++} ++ ++static unsigned int evtchn_from_irq(unsigned irq) ++{ ++ return info_for_irq(irq)->evtchn; ++} ++ ++static enum ipi_vector ipi_from_irq(unsigned irq) ++{ ++ struct irq_info *info = info_for_irq(irq); ++ ++ BUG_ON(info == NULL); ++ BUG_ON(info->type != IRQT_IPI); ++ ++ return info->u.ipi; ++} ++ ++static unsigned virq_from_irq(unsigned irq) + { +- return irq_info[irq].evtchn; ++ struct irq_info *info = info_for_irq(irq); ++ ++ BUG_ON(info == NULL); ++ BUG_ON(info->type != IRQT_VIRQ); ++ ++ return info->u.virq; ++} ++ ++static unsigned gsi_from_irq(unsigned irq) ++{ ++ struct irq_info *info = info_for_irq(irq); ++ ++ BUG_ON(info == NULL); ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ return info->u.pirq.gsi; ++} ++ ++static unsigned vector_from_irq(unsigned irq) ++{ ++ struct irq_info *info = info_for_irq(irq); ++ ++ BUG_ON(info == NULL); ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ return info->u.pirq.vector; + } + +-static inline unsigned int index_from_irq(int irq) ++static enum xen_irq_type type_from_irq(unsigned irq) + { +- return irq_info[irq].index; ++ return info_for_irq(irq)->type; + } + +-static inline unsigned int type_from_irq(int irq) ++static unsigned cpu_from_irq(unsigned irq) + { +- return irq_info[irq].type; ++ return info_for_irq(irq)->cpu; ++} ++ ++static unsigned int cpu_from_evtchn(unsigned int evtchn) ++{ ++ int irq = evtchn_to_irq[evtchn]; ++ unsigned ret = 0; ++ ++ if (irq != -1) ++ ret = cpu_from_irq(irq); ++ ++ return ret; + } + + static inline unsigned long active_evtchns(unsigned int cpu, +@@ -115,7 +217,7 @@ static inline unsigned long active_evtch + unsigned int idx) + { + return (sh->evtchn_pending[idx] & +- cpu_evtchn_mask[cpu][idx] & ++ cpu_evtchn_mask(cpu)[idx] & + ~sh->evtchn_mask[idx]); + } + +@@ -125,13 +227,13 @@ static void bind_evtchn_to_cpu(unsigned + + BUG_ON(irq == -1); + #ifdef CONFIG_SMP +- irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); ++ cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); + #endif + +- __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); +- __set_bit(chn, cpu_evtchn_mask[cpu]); ++ __clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); ++ __set_bit(chn, cpu_evtchn_mask(cpu)); + +- cpu_evtchn[chn] = cpu; ++ irq_info[irq].cpu = cpu; + } + + static void init_evtchn_cpu_bindings(void) +@@ -142,17 +244,11 @@ static void 
init_evtchn_cpu_bindings(voi + + /* By default all event channels notify CPU#0. */ + for_each_irq_desc(i, desc) { +- desc->affinity = cpumask_of_cpu(0); ++ cpumask_copy(desc->affinity, cpumask_of(0)); + } + #endif + +- memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); +- memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +-} +- +-static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +-{ +- return cpu_evtchn[evtchn]; ++ memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0))); + } + + static inline void clear_evtchn(int port) +@@ -232,9 +328,8 @@ static int find_unbound_irq(void) + int irq; + struct irq_desc *desc; + +- /* Only allocate from dynirq range */ + for (irq = 0; irq < nr_irqs; irq++) +- if (irq_bindcount[irq] == 0) ++ if (irq_info[irq].type == IRQT_UNBOUND) + break; + + if (irq == nr_irqs) +@@ -244,6 +339,8 @@ static int find_unbound_irq(void) + if (WARN_ON(desc == NULL)) + return -1; + ++ dynamic_irq_init(irq); ++ + return irq; + } + +@@ -258,16 +355,13 @@ int bind_evtchn_to_irq(unsigned int evtc + if (irq == -1) { + irq = find_unbound_irq(); + +- dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "event"); + + evtchn_to_irq[evtchn] = irq; +- irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); ++ irq_info[irq] = mk_evtchn_info(evtchn); + } + +- irq_bindcount[irq]++; +- + spin_unlock(&irq_mapping_update_lock); + + return irq; +@@ -282,12 +376,12 @@ static int bind_ipi_to_irq(unsigned int + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(ipi_to_irq, cpu)[ipi]; ++ + if (irq == -1) { + irq = find_unbound_irq(); + if (irq < 0) + goto out; + +- dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "ipi"); + +@@ -298,15 +392,12 @@ static int bind_ipi_to_irq(unsigned int + evtchn = bind_ipi.port; + + evtchn_to_irq[evtchn] = irq; +- irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); +- ++ irq_info[irq] = mk_ipi_info(evtchn, ipi); + per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + +- irq_bindcount[irq]++; +- + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +@@ -332,20 +423,17 @@ static int bind_virq_to_irq(unsigned int + + irq = find_unbound_irq(); + +- dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "virq"); + + evtchn_to_irq[evtchn] = irq; +- irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); ++ irq_info[irq] = mk_virq_info(evtchn, virq); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + +- irq_bindcount[irq]++; +- + spin_unlock(&irq_mapping_update_lock); + + return irq; +@@ -358,7 +446,7 @@ static void unbind_from_irq(unsigned int + + spin_lock(&irq_mapping_update_lock); + +- if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { ++ if (VALID_EVTCHN(evtchn)) { + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); +@@ -366,11 +454,11 @@ static void unbind_from_irq(unsigned int + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) +- [index_from_irq(irq)] = -1; ++ [virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) +- [index_from_irq(irq)] = -1; ++ [ipi_from_irq(irq)] = -1; + break; + default: + break; +@@ -380,7 +468,7 @@ static void unbind_from_irq(unsigned int + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; +- irq_info[irq] = IRQ_UNBOUND; ++ irq_info[irq] = mk_unbound_info(); + + 
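/*
 * [Editorial note, not part of the patch hunks around it] The xen/events.c
 * changes replace the old packed_irq plus parallel cpu_evtchn[] and
 * irq_bindcount[] arrays with one typed irq_info record per IRQ: a type tag,
 * the bound event channel and CPU, and a union for the type-specific index,
 * read through accessors that BUG() on a type mismatch. A generic sketch of
 * that data-structure shape (my_* names are illustrative, not the patch's):
 */
#include <linux/bug.h>

enum my_rec_type { MY_UNBOUND = 0, MY_VIRQ, MY_IPI };

struct my_rec {
	enum my_rec_type type;		/* what this slot is bound to */
	unsigned short evtchn;		/* bound event channel */
	unsigned short cpu;		/* CPU the channel is bound to */
	union {
		unsigned short virq;
		unsigned short ipi;
	} u;
};

static struct my_rec my_recs[16];

static struct my_rec my_mk_virq(unsigned short evtchn, unsigned short virq)
{
	return (struct my_rec){ .type = MY_VIRQ, .evtchn = evtchn,
				.u.virq = virq };
}

static unsigned my_virq_from(unsigned idx)
{
	struct my_rec *rec = &my_recs[idx];

	/* Catch callers asking a non-VIRQ slot for its virq number. */
	BUG_ON(rec->type != MY_VIRQ);
	return rec->u.virq;
}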
dynamic_irq_cleanup(irq); + } +@@ -498,8 +586,8 @@ irqreturn_t xen_debug_interrupt(int irq, + for(i = 0; i < NR_EVENT_CHANNELS; i++) { + if (sync_test_bit(i, sh->evtchn_pending)) { + printk(" %d: event %d -> irq %d\n", +- cpu_evtchn[i], i, +- evtchn_to_irq[i]); ++ cpu_from_evtchn(i), i, ++ evtchn_to_irq[i]); + } + } + +@@ -508,7 +596,6 @@ irqreturn_t xen_debug_interrupt(int irq, + return IRQ_HANDLED; + } + +- + /* + * Search the CPUs pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for +@@ -521,11 +608,15 @@ irqreturn_t xen_debug_interrupt(int irq, + void xen_evtchn_do_upcall(struct pt_regs *regs) + { + int cpu = get_cpu(); ++ struct pt_regs *old_regs = set_irq_regs(regs); + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + static DEFINE_PER_CPU(unsigned, nesting_count); + unsigned count; + ++ exit_idle(); ++ irq_enter(); ++ + do { + unsigned long pending_words; + +@@ -550,7 +641,7 @@ void xen_evtchn_do_upcall(struct pt_regs + int irq = evtchn_to_irq[port]; + + if (irq != -1) +- xen_do_IRQ(irq, regs); ++ handle_irq(irq, regs); + } + } + +@@ -561,12 +652,17 @@ void xen_evtchn_do_upcall(struct pt_regs + } while(count != 1); + + out: ++ irq_exit(); ++ set_irq_regs(old_regs); ++ + put_cpu(); + } + + /* Rebind a new event channel to an existing irq. */ + void rebind_evtchn_irq(int evtchn, int irq) + { ++ struct irq_info *info = info_for_irq(irq); ++ + /* Make sure the irq is masked, since the new event channel + will also be masked. */ + disable_irq(irq); +@@ -576,11 +672,11 @@ void rebind_evtchn_irq(int evtchn, int i + /* After resume the irq<->evtchn mappings are all cleared out */ + BUG_ON(evtchn_to_irq[evtchn] != -1); + /* Expect irq to have been bound before, +- so the bindcount should be non-0 */ +- BUG_ON(irq_bindcount[irq] == 0); ++ so there should be a proper type */ ++ BUG_ON(info->type == IRQT_UNBOUND); + + evtchn_to_irq[evtchn] = irq; +- irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); ++ irq_info[irq] = mk_evtchn_info(evtchn); + + spin_unlock(&irq_mapping_update_lock); + +@@ -690,8 +786,7 @@ static void restore_cpu_virqs(unsigned i + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) + continue; + +- BUG_ON(irq_info[irq].type != IRQT_VIRQ); +- BUG_ON(irq_info[irq].index != virq); ++ BUG_ON(virq_from_irq(irq) != virq); + + /* Get a new binding from Xen. */ + bind_virq.virq = virq; +@@ -703,7 +798,7 @@ static void restore_cpu_virqs(unsigned i + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; +- irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); ++ irq_info[irq] = mk_virq_info(evtchn, virq); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ +@@ -720,8 +815,7 @@ static void restore_cpu_ipis(unsigned in + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) + continue; + +- BUG_ON(irq_info[irq].type != IRQT_IPI); +- BUG_ON(irq_info[irq].index != ipi); ++ BUG_ON(ipi_from_irq(irq) != ipi); + + /* Get a new binding from Xen. */ + bind_ipi.vcpu = cpu; +@@ -732,7 +826,7 @@ static void restore_cpu_ipis(unsigned in + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; +- irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); ++ irq_info[irq] = mk_ipi_info(evtchn, ipi); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. 
*/ +@@ -812,8 +906,11 @@ void xen_irq_resume(void) + + static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", ++ ++ .disable = disable_dynirq, + .mask = disable_dynirq, + .unmask = enable_dynirq, ++ + .ack = ack_dynirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, +@@ -822,6 +919,10 @@ static struct irq_chip xen_dynamic_chip + void __init xen_init_IRQ(void) + { + int i; ++ size_t size = nr_cpu_ids * sizeof(struct cpu_evtchn_s); ++ ++ cpu_evtchn_mask_p = alloc_bootmem(size); ++ BUG_ON(cpu_evtchn_mask_p == NULL); + + init_evtchn_cpu_bindings(); + +@@ -829,9 +930,5 @@ void __init xen_init_IRQ(void) + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + +- /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ +- for (i = 0; i < nr_irqs; i++) +- irq_bindcount[i] = 0; +- + irq_ctx_init(smp_processor_id()); + } +Index: linux-2.6-tip/drivers/xen/manage.c +=================================================================== +--- linux-2.6-tip.orig/drivers/xen/manage.c ++++ linux-2.6-tip/drivers/xen/manage.c +@@ -108,7 +108,7 @@ static void do_suspend(void) + /* XXX use normal device tree? */ + xenbus_suspend(); + +- err = stop_machine(xen_suspend, &cancelled, &cpumask_of_cpu(0)); ++ err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + if (err) { + printk(KERN_ERR "failed to start xen_suspend: %d\n", err); + goto out; +Index: linux-2.6-tip/fs/Kconfig +=================================================================== +--- linux-2.6-tip.orig/fs/Kconfig ++++ linux-2.6-tip/fs/Kconfig +@@ -40,7 +40,7 @@ config FS_POSIX_ACL + default n + + config FILE_LOCKING +- bool "Enable POSIX file locking API" if EMBEDDED ++ bool "Enable POSIX file locking API" if BROKEN + default y + help + This option enables standard file locking support, required +Index: linux-2.6-tip/fs/afs/dir.c +=================================================================== +--- linux-2.6-tip.orig/fs/afs/dir.c ++++ linux-2.6-tip/fs/afs/dir.c +@@ -564,7 +564,7 @@ static struct dentry *afs_lookup(struct + static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) + { + struct afs_vnode *vnode, *dir; +- struct afs_fid fid; ++ struct afs_fid fid = { 0, }; + struct dentry *parent; + struct key *key; + void *dir_version; +Index: linux-2.6-tip/fs/befs/debug.c +=================================================================== +--- linux-2.6-tip.orig/fs/befs/debug.c ++++ linux-2.6-tip/fs/befs/debug.c +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + + #endif /* __KERNEL__ */ + +Index: linux-2.6-tip/fs/befs/linuxvfs.c +=================================================================== +--- linux-2.6-tip.orig/fs/befs/linuxvfs.c ++++ linux-2.6-tip/fs/befs/linuxvfs.c +@@ -168,7 +168,7 @@ befs_lookup(struct inode *dir, struct de + befs_off_t offset; + int ret; + int utfnamelen; +- char *utfname; ++ char *uninitialized_var(utfname); + const char *name = dentry->d_name.name; + + befs_debug(sb, "---> befs_lookup() " +@@ -221,8 +221,8 @@ befs_readdir(struct file *filp, void *di + size_t keysize; + unsigned char d_type; + char keybuf[BEFS_NAME_LEN + 1]; +- char *nlsname; +- int nlsnamelen; ++ char *uninitialized_var(nlsname); ++ int uninitialized_var(nlsnamelen); + const char *dirname = filp->f_path.dentry->d_name.name; + + befs_debug(sb, "---> befs_readdir() " +Index: linux-2.6-tip/fs/cifs/cifssmb.c +=================================================================== +--- linux-2.6-tip.orig/fs/cifs/cifssmb.c ++++ linux-2.6-tip/fs/cifs/cifssmb.c +@@ 
-3118,7 +3118,7 @@ CIFSSMBGetCIFSACL(const int xid, struct + __u32 parm_len; + __u32 acl_len; + struct smb_com_ntransact_rsp *pSMBr; +- char *pdata; ++ char *uninitialized_var(pdata); + + /* validate_nttransact */ + rc = validate_ntransact(iov[0].iov_base, (char **)&parm, +Index: linux-2.6-tip/fs/cifs/readdir.c +=================================================================== +--- linux-2.6-tip.orig/fs/cifs/readdir.c ++++ linux-2.6-tip/fs/cifs/readdir.c +@@ -906,7 +906,7 @@ static int cifs_filldir(char *pfindEntry + __u64 inum; + struct cifs_sb_info *cifs_sb; + struct inode *tmp_inode; +- struct dentry *tmp_dentry; ++ struct dentry *uninitialized_var(tmp_dentry); + + /* get filename and len into qstring */ + /* get dentry */ +@@ -990,7 +990,7 @@ int cifs_readdir(struct file *file, void + struct cifs_sb_info *cifs_sb; + struct cifsTconInfo *pTcon; + struct cifsFileInfo *cifsFile = NULL; +- char *current_entry; ++ char *uninitialized_var(current_entry); + int num_to_fill = 0; + char *tmp_buf = NULL; + char *end_of_smb; +Index: linux-2.6-tip/fs/coda/Makefile +=================================================================== +--- linux-2.6-tip.orig/fs/coda/Makefile ++++ linux-2.6-tip/fs/coda/Makefile +@@ -5,7 +5,9 @@ + obj-$(CONFIG_CODA_FS) += coda.o + + coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \ +- coda_linux.o symlink.o pioctl.o sysctl.o ++ coda_linux.o symlink.o pioctl.o ++ ++coda-$(CONFIG_SYSCTL) += sysctl.o + + # If you want debugging output, please uncomment the following line. + +Index: linux-2.6-tip/fs/coda/coda_int.h +=================================================================== +--- linux-2.6-tip.orig/fs/coda/coda_int.h ++++ linux-2.6-tip/fs/coda/coda_int.h +@@ -12,8 +12,13 @@ void coda_destroy_inodecache(void); + int coda_init_inodecache(void); + int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, + int datasync); ++#ifdef CONFIG_SYSCTL + void coda_sysctl_init(void); + void coda_sysctl_clean(void); ++#else ++static inline void coda_sysctl_init(void) { } ++static inline void coda_sysctl_clean(void) { } ++#endif + + #endif /* _CODA_INT_ */ + +Index: linux-2.6-tip/fs/coda/sysctl.c +=================================================================== +--- linux-2.6-tip.orig/fs/coda/sysctl.c ++++ linux-2.6-tip/fs/coda/sysctl.c +@@ -57,18 +57,14 @@ static ctl_table fs_table[] = { + + void coda_sysctl_init(void) + { +-#ifdef CONFIG_SYSCTL +- if ( !fs_table_header ) ++ if (!fs_table_header) + fs_table_header = register_sysctl_table(fs_table); +-#endif + } + + void coda_sysctl_clean(void) + { +-#ifdef CONFIG_SYSCTL +- if ( fs_table_header ) { ++ if (fs_table_header) { + unregister_sysctl_table(fs_table_header); + fs_table_header = NULL; + } +-#endif + } +Index: linux-2.6-tip/fs/compat_binfmt_elf.c +=================================================================== +--- linux-2.6-tip.orig/fs/compat_binfmt_elf.c ++++ linux-2.6-tip/fs/compat_binfmt_elf.c +@@ -42,6 +42,7 @@ + #define elf_prstatus compat_elf_prstatus + #define elf_prpsinfo compat_elf_prpsinfo + ++#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) + /* + * Compat version of cputime_to_compat_timeval, perhaps this + * should be an inline in . 
+@@ -55,8 +56,9 @@ static void cputime_to_compat_timeval(co + value->tv_usec = tv.tv_usec; + } + +-#undef cputime_to_timeval +-#define cputime_to_timeval cputime_to_compat_timeval ++# undef cputime_to_timeval ++# define cputime_to_timeval cputime_to_compat_timeval ++#endif + + + /* +Index: linux-2.6-tip/fs/configfs/symlink.c +=================================================================== +--- linux-2.6-tip.orig/fs/configfs/symlink.c ++++ linux-2.6-tip/fs/configfs/symlink.c +@@ -135,7 +135,7 @@ int configfs_symlink(struct inode *dir, + struct path path; + struct configfs_dirent *sd; + struct config_item *parent_item; +- struct config_item *target_item; ++ struct config_item *uninitialized_var(target_item); + struct config_item_type *type; + + ret = -EPERM; /* What lack-of-symlink returns */ +Index: linux-2.6-tip/fs/dcache.c +=================================================================== +--- linux-2.6-tip.orig/fs/dcache.c ++++ linux-2.6-tip/fs/dcache.c +@@ -726,8 +726,9 @@ void shrink_dcache_for_umount(struct sup + { + struct dentry *dentry; + +- if (down_read_trylock(&sb->s_umount)) +- BUG(); ++// -rt: this might succeed there ... ++// if (down_read_trylock(&sb->s_umount)) ++// BUG(); + + dentry = sb->s_root; + sb->s_root = NULL; +@@ -1877,6 +1878,8 @@ out_nolock: + shouldnt_be_hashed: + spin_unlock(&dcache_lock); + BUG(); ++ ++ return NULL; + } + + static int prepend(char **buffer, int *buflen, const char *str, int namelen) +Index: linux-2.6-tip/fs/debugfs/inode.c +=================================================================== +--- linux-2.6-tip.orig/fs/debugfs/inode.c ++++ linux-2.6-tip/fs/debugfs/inode.c +@@ -30,6 +30,7 @@ + + static struct vfsmount *debugfs_mount; + static int debugfs_mount_count; ++static bool debugfs_registered; + + static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) + { +@@ -496,6 +497,16 @@ exit: + } + EXPORT_SYMBOL_GPL(debugfs_rename); + ++/** ++ * debugfs_initialized - Tells whether debugfs has been registered ++ */ ++bool debugfs_initialized(void) ++{ ++ return debugfs_registered; ++} ++EXPORT_SYMBOL_GPL(debugfs_initialized); ++ ++ + static struct kobject *debug_kobj; + + static int __init debugfs_init(void) +@@ -509,11 +520,16 @@ static int __init debugfs_init(void) + retval = register_filesystem(&debug_fs_type); + if (retval) + kobject_put(debug_kobj); ++ else ++ debugfs_registered = true; ++ + return retval; + } + + static void __exit debugfs_exit(void) + { ++ debugfs_registered = false; ++ + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + unregister_filesystem(&debug_fs_type); + kobject_put(debug_kobj); +Index: linux-2.6-tip/fs/ecryptfs/keystore.c +=================================================================== +--- linux-2.6-tip.orig/fs/ecryptfs/keystore.c ++++ linux-2.6-tip/fs/ecryptfs/keystore.c +@@ -1013,7 +1013,7 @@ decrypt_pki_encrypted_session_key(struct + struct ecryptfs_message *msg = NULL; + char *auth_tok_sig; + char *payload; +- size_t payload_len; ++ size_t uninitialized_var(payload_len); + int rc; + + rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); +@@ -1845,7 +1845,7 @@ pki_encrypt_session_key(struct ecryptfs_ + { + struct ecryptfs_msg_ctx *msg_ctx = NULL; + char *payload = NULL; +- size_t payload_len; ++ size_t uninitialized_var(payload_len); + struct ecryptfs_message *msg; + int rc; + +Index: linux-2.6-tip/fs/eventpoll.c +=================================================================== +--- linux-2.6-tip.orig/fs/eventpoll.c ++++ linux-2.6-tip/fs/eventpoll.c +@@ -1098,7 
+1098,7 @@ retry: + SYSCALL_DEFINE1(epoll_create1, int, flags) + { + int error, fd = -1; +- struct eventpoll *ep; ++ struct eventpoll *uninitialized_var(ep); + + /* Check the EPOLL_* constant for consistency. */ + BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); +Index: linux-2.6-tip/fs/exec.c +=================================================================== +--- linux-2.6-tip.orig/fs/exec.c ++++ linux-2.6-tip/fs/exec.c +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -46,6 +47,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -509,7 +511,7 @@ static int shift_arg_pages(struct vm_are + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; +- struct mmu_gather *tlb; ++ struct mmu_gather tlb; + + BUG_ON(new_start > new_end); + +@@ -534,12 +536,12 @@ static int shift_arg_pages(struct vm_are + return -ENOMEM; + + lru_add_drain(); +- tlb = tlb_gather_mmu(mm, 0); ++ tlb_gather_mmu(&tlb, mm, 0); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ +- free_pgd_range(tlb, new_end, old_end, new_end, ++ free_pgd_range(&tlb, new_end, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : 0); + } else { + /* +@@ -548,10 +550,10 @@ static int shift_arg_pages(struct vm_are + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ +- free_pgd_range(tlb, old_start, old_end, new_end, ++ free_pgd_range(&tlb, old_start, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : 0); + } +- tlb_finish_mmu(tlb, new_end, old_end); ++ tlb_finish_mmu(&tlb, new_end, old_end); + + /* + * shrink the vma to just the new range. +@@ -738,10 +740,12 @@ static int exec_mmap(struct mm_struct *m + } + } + task_lock(tsk); ++ local_irq_disable(); + active_mm = tsk->active_mm; ++ activate_mm(active_mm, mm); + tsk->mm = mm; + tsk->active_mm = mm; +- activate_mm(active_mm, mm); ++ local_irq_enable(); + task_unlock(tsk); + arch_pick_mmap_layout(mm); + if (old_mm) { +@@ -1010,6 +1014,13 @@ int flush_old_exec(struct linux_binprm * + + current->personality &= ~bprm->per_clear; + ++ /* ++ * Flush performance counters when crossing a ++ * security domain: ++ */ ++ if (!get_dumpable(current->mm)) ++ perf_counter_exit_task(current); ++ + /* An exec changes our domain. 
We are no longer part of the thread + group */ + +Index: linux-2.6-tip/fs/ext4/extents.c +=================================================================== +--- linux-2.6-tip.orig/fs/ext4/extents.c ++++ linux-2.6-tip/fs/ext4/extents.c +@@ -1159,6 +1159,7 @@ ext4_ext_search_right(struct inode *inod + return 0; + } + ++ ix = NULL; /* avoid gcc false positive warning */ + /* go up and search for index to the right */ + while (--depth >= 0) { + ix = path[depth].p_idx; +Index: linux-2.6-tip/fs/fat/namei_vfat.c +=================================================================== +--- linux-2.6-tip.orig/fs/fat/namei_vfat.c ++++ linux-2.6-tip/fs/fat/namei_vfat.c +@@ -595,12 +595,12 @@ static int vfat_build_slots(struct inode + struct fat_mount_options *opts = &sbi->options; + struct msdos_dir_slot *ps; + struct msdos_dir_entry *de; +- unsigned char cksum, lcase; ++ unsigned char cksum, uninitialized_var(lcase); + unsigned char msdos_name[MSDOS_NAME]; + wchar_t *uname; + __le16 time, date; + u8 time_cs; +- int err, ulen, usize, i; ++ int err, uninitialized_var(ulen), usize, i; + loff_t offset; + + *nr_slots = 0; +Index: linux-2.6-tip/fs/jfs/jfs_dmap.c +=================================================================== +--- linux-2.6-tip.orig/fs/jfs/jfs_dmap.c ++++ linux-2.6-tip/fs/jfs/jfs_dmap.c +@@ -1618,7 +1618,7 @@ static int dbAllocAny(struct bmap * bmp, + */ + static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) + { +- int rc, leafidx, lev; ++ int rc, uninitialized_var(leafidx), lev; + s64 b, lblkno; + struct dmapctl *dcp; + int budmin; +Index: linux-2.6-tip/fs/libfs.c +=================================================================== +--- linux-2.6-tip.orig/fs/libfs.c ++++ linux-2.6-tip/fs/libfs.c +@@ -574,6 +574,21 @@ ssize_t memory_read_from_buffer(void *to + * possibly a read which collects the result - which is stored in a + * file-local buffer. + */ ++ ++void simple_transaction_set(struct file *file, size_t n) ++{ ++ struct simple_transaction_argresp *ar = file->private_data; ++ ++ BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); ++ ++ /* ++ * The barrier ensures that ar->size will really remain zero until ++ * ar->data is ready for reading. 
++ */ ++ smp_mb(); ++ ar->size = n; ++} ++ + char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) + { + struct simple_transaction_argresp *ar; +@@ -819,6 +834,7 @@ EXPORT_SYMBOL(simple_sync_file); + EXPORT_SYMBOL(simple_unlink); + EXPORT_SYMBOL(simple_read_from_buffer); + EXPORT_SYMBOL(memory_read_from_buffer); ++EXPORT_SYMBOL(simple_transaction_set); + EXPORT_SYMBOL(simple_transaction_get); + EXPORT_SYMBOL(simple_transaction_read); + EXPORT_SYMBOL(simple_transaction_release); +Index: linux-2.6-tip/fs/locks.c +=================================================================== +--- linux-2.6-tip.orig/fs/locks.c ++++ linux-2.6-tip/fs/locks.c +@@ -1567,7 +1567,7 @@ EXPORT_SYMBOL(flock_lock_file_wait); + SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) + { + struct file *filp; +- struct file_lock *lock; ++ struct file_lock *uninitialized_var(lock); + int can_sleep, unlock; + int error; + +Index: linux-2.6-tip/fs/ocfs2/aops.c +=================================================================== +--- linux-2.6-tip.orig/fs/ocfs2/aops.c ++++ linux-2.6-tip/fs/ocfs2/aops.c +@@ -1646,7 +1646,7 @@ int ocfs2_write_begin_nolock(struct addr + { + int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int clusters_to_alloc, extents_to_split; +- struct ocfs2_write_ctxt *wc; ++ struct ocfs2_write_ctxt *uninitialized_var(wc); + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; +Index: linux-2.6-tip/fs/ocfs2/cluster/heartbeat.c +=================================================================== +--- linux-2.6-tip.orig/fs/ocfs2/cluster/heartbeat.c ++++ linux-2.6-tip/fs/ocfs2/cluster/heartbeat.c +@@ -1026,8 +1026,8 @@ static ssize_t o2hb_region_block_bytes_w + size_t count) + { + int status; +- unsigned long block_bytes; +- unsigned int block_bits; ++ unsigned long uninitialized_var(block_bytes); ++ unsigned int uninitialized_var(block_bits); + + if (reg->hr_bdev) + return -EINVAL; +Index: linux-2.6-tip/fs/ocfs2/ioctl.c +=================================================================== +--- linux-2.6-tip.orig/fs/ocfs2/ioctl.c ++++ linux-2.6-tip/fs/ocfs2/ioctl.c +@@ -111,7 +111,7 @@ bail: + long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + { + struct inode *inode = filp->f_path.dentry->d_inode; +- unsigned int flags; ++ unsigned int uninitialized_var(flags); + int new_clusters; + int status; + struct ocfs2_space_resv sr; +Index: linux-2.6-tip/fs/ocfs2/slot_map.c +=================================================================== +--- linux-2.6-tip.orig/fs/ocfs2/slot_map.c ++++ linux-2.6-tip/fs/ocfs2/slot_map.c +@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct + { + int status = 0; + u64 blkno; +- unsigned long long blocks, bytes; ++ unsigned long long blocks, uninitialized_var(bytes); + unsigned int i; + struct buffer_head *bh; + +Index: linux-2.6-tip/fs/ocfs2/stack_user.c +=================================================================== +--- linux-2.6-tip.orig/fs/ocfs2/stack_user.c ++++ linux-2.6-tip/fs/ocfs2/stack_user.c +@@ -807,7 +807,7 @@ static int fs_protocol_compare(struct oc + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) + { + dlm_lockspace_t *fsdlm; +- struct ocfs2_live_connection *control; ++ struct ocfs2_live_connection *uninitialized_var(control); + int rc = 0; + + BUG_ON(conn == NULL); +Index: linux-2.6-tip/fs/omfs/file.c +=================================================================== +--- 
linux-2.6-tip.orig/fs/omfs/file.c ++++ linux-2.6-tip/fs/omfs/file.c +@@ -237,14 +237,14 @@ static int omfs_get_block(struct inode * + struct buffer_head *bh; + sector_t next, offset; + int ret; +- u64 new_block; ++ u64 uninitialized_var(new_block); + u32 max_extents; + int extent_count; + struct omfs_extent *oe; + struct omfs_extent_entry *entry; + struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); + int max_blocks = bh_result->b_size >> inode->i_blkbits; +- int remain; ++ int uninitialized_var(remain); + + ret = -EIO; + bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino)); +Index: linux-2.6-tip/fs/partitions/check.c +=================================================================== +--- linux-2.6-tip.orig/fs/partitions/check.c ++++ linux-2.6-tip/fs/partitions/check.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "check.h" + +@@ -294,6 +295,9 @@ static struct attribute_group part_attr_ + + static struct attribute_group *part_attr_groups[] = { + &part_attr_group, ++#ifdef CONFIG_BLK_DEV_IO_TRACE ++ &blk_trace_attr_group, ++#endif + NULL + }; + +Index: linux-2.6-tip/fs/proc/loadavg.c +=================================================================== +--- linux-2.6-tip.orig/fs/proc/loadavg.c ++++ linux-2.6-tip/fs/proc/loadavg.c +@@ -12,20 +12,14 @@ + + static int loadavg_proc_show(struct seq_file *m, void *v) + { +- int a, b, c; +- unsigned long seq; ++ unsigned long avnrun[3]; + +- do { +- seq = read_seqbegin(&xtime_lock); +- a = avenrun[0] + (FIXED_1/200); +- b = avenrun[1] + (FIXED_1/200); +- c = avenrun[2] + (FIXED_1/200); +- } while (read_seqretry(&xtime_lock, seq)); +- +- seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", +- LOAD_INT(a), LOAD_FRAC(a), +- LOAD_INT(b), LOAD_FRAC(b), +- LOAD_INT(c), LOAD_FRAC(c), ++ get_avenrun(avnrun, FIXED_1/200, 0); ++ ++ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", ++ LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), ++ LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), ++ LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), + nr_running(), nr_threads, + task_active_pid_ns(current)->last_pid); + return 0; +Index: linux-2.6-tip/fs/reiserfs/do_balan.c +=================================================================== +--- linux-2.6-tip.orig/fs/reiserfs/do_balan.c ++++ linux-2.6-tip/fs/reiserfs/do_balan.c +@@ -1295,9 +1295,8 @@ static int balance_leaf(struct tree_bala + + RFALSE(ih, "PAP-12210: ih must be 0"); + +- if (is_direntry_le_ih +- (aux_ih = +- B_N_PITEM_HEAD(tbS0, item_pos))) { ++ aux_ih = B_N_PITEM_HEAD(tbS0, item_pos); ++ if (is_direntry_le_ih(aux_ih)) { + /* we append to directory item */ + + int entry_count; +Index: linux-2.6-tip/fs/reiserfs/lbalance.c +=================================================================== +--- linux-2.6-tip.orig/fs/reiserfs/lbalance.c ++++ linux-2.6-tip/fs/reiserfs/lbalance.c +@@ -389,7 +389,8 @@ static void leaf_item_bottle(struct buff + + if (last_first == FIRST_TO_LAST) { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ +- if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) ++ ih = B_N_PITEM_HEAD(src, item_num); ++ if (is_direntry_le_ih(ih)) + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, + item_num, 0, cpy_bytes); + else { +@@ -417,7 +418,8 @@ static void leaf_item_bottle(struct buff + } + } else { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ +- if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) ++ ih = B_N_PITEM_HEAD(src, item_num); ++ if (is_direntry_le_ih(ih)) + 
leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + item_num, + I_ENTRY_COUNT(ih) - cpy_bytes, +@@ -774,8 +776,8 @@ void leaf_delete_items(struct buffer_inf + leaf_delete_items_entirely(cur_bi, first + 1, + del_num - 1); + +- if (is_direntry_le_ih +- (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1))) ++ ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1); ++ if (is_direntry_le_ih(ih)) + /* the last item is directory */ + /* len = numbers of directory entries in this item */ + len = ih_entry_count(ih); +Index: linux-2.6-tip/fs/squashfs/export.c +=================================================================== +--- linux-2.6-tip.orig/fs/squashfs/export.c ++++ linux-2.6-tip/fs/squashfs/export.c +@@ -40,6 +40,7 @@ + #include + #include + #include ++#include + + #include "squashfs_fs.h" + #include "squashfs_fs_sb.h" +Index: linux-2.6-tip/fs/udf/truncate.c +=================================================================== +--- linux-2.6-tip.orig/fs/udf/truncate.c ++++ linux-2.6-tip/fs/udf/truncate.c +@@ -87,7 +87,7 @@ void udf_truncate_tail_extent(struct ino + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(long_ad); + else +- BUG(); ++ panic("udf_truncate_tail_extent: unknown alloc type!"); + + /* Find the last extent in the file */ + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { +@@ -214,7 +214,7 @@ void udf_truncate_extents(struct inode * + else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(long_ad); + else +- BUG(); ++ panic("udf_truncate_extents: unknown alloc type!"); + + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); + byte_offset = (offset << sb->s_blocksize_bits) + +Index: linux-2.6-tip/fs/xfs/linux-2.6/xfs_xattr.c +=================================================================== +--- linux-2.6-tip.orig/fs/xfs/linux-2.6/xfs_xattr.c ++++ linux-2.6-tip/fs/xfs/linux-2.6/xfs_xattr.c +@@ -30,20 +30,6 @@ + + + /* +- * ACL handling. Should eventually be moved into xfs_acl.c +- */ +- +-static int +-xfs_decode_acl(const char *name) +-{ +- if (strcmp(name, "posix_acl_access") == 0) +- return _ACL_TYPE_ACCESS; +- else if (strcmp(name, "posix_acl_default") == 0) +- return _ACL_TYPE_DEFAULT; +- return -EINVAL; +-} +- +-/* + * Get system extended attributes which at the moment only + * includes Posix ACLs. + */ +Index: linux-2.6-tip/fs/xfs/xfs_acl.c +=================================================================== +--- linux-2.6-tip.orig/fs/xfs/xfs_acl.c ++++ linux-2.6-tip/fs/xfs/xfs_acl.c +@@ -51,6 +51,19 @@ kmem_zone_t *xfs_acl_zone; + + + /* ++ * ACL handling. ++ */ ++int ++xfs_decode_acl(const char *name) ++{ ++ if (strcmp(name, "posix_acl_access") == 0) ++ return _ACL_TYPE_ACCESS; ++ else if (strcmp(name, "posix_acl_default") == 0) ++ return _ACL_TYPE_DEFAULT; ++ return -EINVAL; ++} ++ ++/* + * Test for existence of access ACL attribute as efficiently as possible. 
+ */ + int +Index: linux-2.6-tip/fs/xfs/xfs_acl.h +=================================================================== +--- linux-2.6-tip.orig/fs/xfs/xfs_acl.h ++++ linux-2.6-tip/fs/xfs/xfs_acl.h +@@ -58,6 +58,7 @@ extern struct kmem_zone *xfs_acl_zone; + (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name)) + #define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone) + ++extern int xfs_decode_acl(const char *); + extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *); + extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); + extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *); +@@ -79,6 +80,7 @@ extern int xfs_acl_vremove(struct inode + #define _ACL_FREE(a) ((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0) + + #else ++#define xfs_decode_acl(name) (-EINVAL) + #define xfs_acl_zone_init(zone,name) + #define xfs_acl_zone_destroy(zone) + #define xfs_acl_vset(v,p,sz,t) (-EOPNOTSUPP) +Index: linux-2.6-tip/fs/xfs/xfs_mount.c +=================================================================== +--- linux-2.6-tip.orig/fs/xfs/xfs_mount.c ++++ linux-2.6-tip/fs/xfs/xfs_mount.c +@@ -1424,6 +1424,8 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fi + /* find modified range */ + + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); ++ if ((long)f < 0) /* work around gcc warning */ ++ return; + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + first = xfs_sb_info[f].offset; + +Index: linux-2.6-tip/include/acpi/acpiosxf.h +=================================================================== +--- linux-2.6-tip.orig/include/acpi/acpiosxf.h ++++ linux-2.6-tip/include/acpi/acpiosxf.h +@@ -61,7 +61,7 @@ typedef enum { + OSL_EC_BURST_HANDLER + } acpi_execute_type; + +-#define ACPI_NO_UNIT_LIMIT ((u32) -1) ++#define ACPI_NO_UNIT_LIMIT (INT_MAX/2) + #define ACPI_MUTEX_SEM 1 + + /* Functions for acpi_os_signal */ +@@ -144,6 +144,7 @@ void __iomem *acpi_os_map_memory(acpi_ph + acpi_size length); + + void acpi_os_unmap_memory(void __iomem * logical_address, acpi_size size); ++void early_acpi_os_unmap_memory(void __iomem * virt, acpi_size size); + + #ifdef ACPI_FUTURE_USAGE + acpi_status +Index: linux-2.6-tip/include/acpi/acpixf.h +=================================================================== +--- linux-2.6-tip.orig/include/acpi/acpixf.h ++++ linux-2.6-tip/include/acpi/acpixf.h +@@ -130,6 +130,10 @@ acpi_get_table_header(acpi_string signat + struct acpi_table_header *out_table_header); + + acpi_status ++acpi_get_table_with_size(acpi_string signature, ++ u32 instance, struct acpi_table_header **out_table, ++ acpi_size *tbl_size); ++acpi_status + acpi_get_table(acpi_string signature, + u32 instance, struct acpi_table_header **out_table); + +Index: linux-2.6-tip/include/asm-frv/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/asm-frv/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/include/asm-frv/swab.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-frv/swab.h ++++ linux-2.6-tip/include/asm-frv/swab.h +@@ -1,7 +1,7 @@ + #ifndef _ASM_SWAB_H + #define _ASM_SWAB_H + +-#include ++#include + + #if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__) + # define __SWAB_64_THRU_32__ +Index: linux-2.6-tip/include/asm-generic/bug.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/bug.h ++++ linux-2.6-tip/include/asm-generic/bug.h +@@ -3,6 +3,10 @@ + + #include + ++#ifndef __ASSEMBLY__ 
++extern void __WARN_ON(const char *func, const char *file, const int line); ++#endif /* __ASSEMBLY__ */ ++ + #ifdef CONFIG_BUG + + #ifdef CONFIG_GENERIC_BUG +@@ -103,10 +107,9 @@ extern void warn_slowpath(const char *fi + #endif + + #ifndef WARN +-#define WARN(condition, format...) ({ \ +- int __ret_warn_on = !!(condition); \ +- unlikely(__ret_warn_on); \ +-}) ++static inline int __attribute__ ((format(printf, 2, 3))) ++__WARN(int condition, const char *fmt, ...) { return condition; } ++#define WARN(condition, format...) __WARN(!!(condition), format) + #endif + + #endif +@@ -140,4 +143,18 @@ extern void warn_slowpath(const char *fi + # define WARN_ON_SMP(x) do { } while (0) + #endif + ++#ifdef CONFIG_PREEMPT_RT ++# define BUG_ON_RT(c) BUG_ON(c) ++# define BUG_ON_NONRT(c) do { } while (0) ++# define WARN_ON_RT(condition) WARN_ON(condition) ++# define WARN_ON_NONRT(condition) do { } while (0) ++# define WARN_ON_ONCE_NONRT(condition) do { } while (0) ++#else ++# define BUG_ON_RT(c) do { } while (0) ++# define BUG_ON_NONRT(c) BUG_ON(c) ++# define WARN_ON_RT(condition) do { } while (0) ++# define WARN_ON_NONRT(condition) WARN_ON(condition) ++# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition) ++#endif ++ + #endif +Index: linux-2.6-tip/include/asm-generic/fcntl.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/fcntl.h ++++ linux-2.6-tip/include/asm-generic/fcntl.h +@@ -117,9 +117,9 @@ + struct flock { + short l_type; + short l_whence; +- off_t l_start; +- off_t l_len; +- pid_t l_pid; ++ __kernel_off_t l_start; ++ __kernel_off_t l_len; ++ __kernel_pid_t l_pid; + __ARCH_FLOCK_PAD + }; + #endif +@@ -140,9 +140,9 @@ struct flock { + struct flock64 { + short l_type; + short l_whence; +- loff_t l_start; +- loff_t l_len; +- pid_t l_pid; ++ __kernel_loff_t l_start; ++ __kernel_loff_t l_len; ++ __kernel_pid_t l_pid; + __ARCH_FLOCK64_PAD + }; + #endif +Index: linux-2.6-tip/include/asm-generic/percpu.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/percpu.h ++++ linux-2.6-tip/include/asm-generic/percpu.h +@@ -9,6 +9,9 @@ + */ + #define per_cpu_var(var) per_cpu__##var + ++#define __per_cpu_var_lock(var) per_cpu__lock_##var##_locked ++#define __per_cpu_var_lock_var(var) per_cpu__##var##_locked ++ + #ifdef CONFIG_SMP + + /* +@@ -60,6 +63,14 @@ extern unsigned long __per_cpu_offset[NR + #define __raw_get_cpu_var(var) \ + (*SHIFT_PERCPU_PTR(&per_cpu_var(var), __my_cpu_offset)) + ++#define per_cpu_lock(var, cpu) \ ++ (*SHIFT_PERCPU_PTR(&__per_cpu_var_lock(var), per_cpu_offset(cpu))) ++#define per_cpu_var_locked(var, cpu) \ ++ (*SHIFT_PERCPU_PTR(&__per_cpu_var_lock_var(var), per_cpu_offset(cpu))) ++#define __get_cpu_lock(var, cpu) \ ++ per_cpu_lock(var, cpu) ++#define __get_cpu_var_locked(var, cpu) \ ++ per_cpu_var_locked(var, cpu) + + #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA + extern void setup_per_cpu_areas(void); +@@ -68,9 +79,11 @@ extern void setup_per_cpu_areas(void); + #else /* ! 
SMP */ + + #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) ++#define per_cpu_var_locked(var, cpu) (*((void)(cpu), &__per_cpu_var_lock_var(var))) + #define __get_cpu_var(var) per_cpu_var(var) + #define __raw_get_cpu_var(var) per_cpu_var(var) +- ++#define __get_cpu_lock(var, cpu) __per_cpu_var_lock(var) ++#define __get_cpu_var_locked(var, cpu) __per_cpu_var_lock_var(var) + #endif /* SMP */ + + #ifndef PER_CPU_ATTRIBUTES +@@ -79,5 +92,60 @@ extern void setup_per_cpu_areas(void); + + #define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \ + __typeof__(type) per_cpu_var(name) ++#define DECLARE_PER_CPU_LOCKED(type, name) \ ++ extern PER_CPU_ATTRIBUTES spinlock_t __per_cpu_var_lock(name); \ ++ extern PER_CPU_ATTRIBUTES __typeof__(type) __per_cpu_var_lock_var(name) ++ ++/* ++ * Optional methods for optimized non-lvalue per-cpu variable access. ++ * ++ * @var can be a percpu variable or a field of it and its size should ++ * equal char, int or long. percpu_read() evaluates to a lvalue and ++ * all others to void. ++ * ++ * These operations are guaranteed to be atomic w.r.t. preemption. ++ * The generic versions use plain get/put_cpu_var(). Archs are ++ * encouraged to implement single-instruction alternatives which don't ++ * require preemption protection. ++ */ ++#ifndef percpu_read ++# define percpu_read(var) \ ++ ({ \ ++ typeof(per_cpu_var(var)) __tmp_var__; \ ++ __tmp_var__ = get_cpu_var(var); \ ++ put_cpu_var(var); \ ++ __tmp_var__; \ ++ }) ++#endif ++ ++#define __percpu_generic_to_op(var, val, op) \ ++do { \ ++ get_cpu_var(var) op val; \ ++ put_cpu_var(var); \ ++} while (0) ++ ++#ifndef percpu_write ++# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =) ++#endif ++ ++#ifndef percpu_add ++# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=) ++#endif ++ ++#ifndef percpu_sub ++# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=) ++#endif ++ ++#ifndef percpu_and ++# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=) ++#endif ++ ++#ifndef percpu_or ++# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=) ++#endif ++ ++#ifndef percpu_xor ++# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=) ++#endif + + #endif /* _ASM_GENERIC_PERCPU_H_ */ +Index: linux-2.6-tip/include/asm-generic/sections.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/sections.h ++++ linux-2.6-tip/include/asm-generic/sections.h +@@ -9,7 +9,7 @@ extern char __bss_start[], __bss_stop[]; + extern char __init_begin[], __init_end[]; + extern char _sinittext[], _einittext[]; + extern char _end[]; +-extern char __per_cpu_start[], __per_cpu_end[]; ++extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; + extern char __kprobes_text_start[], __kprobes_text_end[]; + extern char __initdata_begin[], __initdata_end[]; + extern char __start_rodata[], __end_rodata[]; +Index: linux-2.6-tip/include/asm-generic/siginfo.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/siginfo.h ++++ linux-2.6-tip/include/asm-generic/siginfo.h +@@ -23,7 +23,7 @@ typedef union sigval { + #endif + + #ifndef __ARCH_SI_UID_T +-#define __ARCH_SI_UID_T uid_t ++#define __ARCH_SI_UID_T __kernel_uid32_t + #endif + + /* +@@ -47,13 +47,13 @@ typedef struct siginfo { + + /* kill() */ + struct { +- pid_t _pid; /* sender's pid */ ++ __kernel_pid_t _pid; /* sender's pid */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + } _kill; 
+ + /* POSIX.1b timers */ + struct { +- timer_t _tid; /* timer id */ ++ __kernel_timer_t _tid; /* timer id */ + int _overrun; /* overrun count */ + char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; + sigval_t _sigval; /* same as below */ +@@ -62,18 +62,18 @@ typedef struct siginfo { + + /* POSIX.1b signals */ + struct { +- pid_t _pid; /* sender's pid */ ++ __kernel_pid_t _pid; /* sender's pid */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + sigval_t _sigval; + } _rt; + + /* SIGCHLD */ + struct { +- pid_t _pid; /* which child */ ++ __kernel_pid_t _pid; /* which child */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + int _status; /* exit code */ +- clock_t _utime; +- clock_t _stime; ++ __kernel_clock_t _utime; ++ __kernel_clock_t _stime; + } _sigchld; + + /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ +Index: linux-2.6-tip/include/asm-generic/statfs.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/statfs.h ++++ linux-2.6-tip/include/asm-generic/statfs.h +@@ -1,8 +1,9 @@ + #ifndef _GENERIC_STATFS_H + #define _GENERIC_STATFS_H + +-#ifndef __KERNEL_STRICT_NAMES +-# include ++#include ++ ++#ifdef __KERNEL__ + typedef __kernel_fsid_t fsid_t; + #endif + +Index: linux-2.6-tip/include/asm-generic/vmlinux.lds.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/vmlinux.lds.h ++++ linux-2.6-tip/include/asm-generic/vmlinux.lds.h +@@ -61,6 +61,30 @@ + #define BRANCH_PROFILE() + #endif + ++#ifdef CONFIG_EVENT_TRACER ++#define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \ ++ *(_ftrace_events) \ ++ VMLINUX_SYMBOL(__stop_ftrace_events) = .; ++#else ++#define FTRACE_EVENTS() ++#endif ++ ++#ifdef CONFIG_TRACING ++#define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .; \ ++ *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \ ++ VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .; ++#else ++#define TRACE_PRINTKS() ++#endif ++ ++#ifdef CONFIG_FTRACE_SYSCALLS ++#define TRACE_SYSCALLS() VMLINUX_SYMBOL(__start_syscalls_metadata) = .; \ ++ *(__syscalls_metadata) \ ++ VMLINUX_SYMBOL(__stop_syscalls_metadata) = .; ++#else ++#define TRACE_SYSCALLS() ++#endif ++ + /* .data section */ + #define DATA_DATA \ + *(.data) \ +@@ -81,7 +105,10 @@ + *(__tracepoints) \ + VMLINUX_SYMBOL(__stop___tracepoints) = .; \ + LIKELY_PROFILE() \ +- BRANCH_PROFILE() ++ BRANCH_PROFILE() \ ++ TRACE_PRINTKS() \ ++ FTRACE_EVENTS() \ ++ TRACE_SYSCALLS() + + #define RO_DATA(align) \ + . = ALIGN((align)); \ +@@ -430,12 +457,59 @@ + *(.initcall7.init) \ + *(.initcall7s.init) + ++/** ++ * PERCPU_VADDR - define output section for percpu area ++ * @vaddr: explicit base address (optional) ++ * @phdr: destination PHDR (optional) ++ * ++ * Macro which expands to output section for percpu area. If @vaddr ++ * is not blank, it specifies explicit base address and all percpu ++ * symbols will be offset from the given address. If blank, @vaddr ++ * always equals @laddr + LOAD_OFFSET. ++ * ++ * @phdr defines the output PHDR to use if not blank. Be warned that ++ * output PHDR is sticky. If @phdr is specified, the next output ++ * section in the linker script will go there too. @phdr should have ++ * a leading colon. ++ * ++ * Note that this macros defines __per_cpu_load as an absolute symbol. ++ * If there is no need to put the percpu section at a predetermined ++ * address, use PERCPU(). 
++ */ ++#define PERCPU_VADDR(vaddr, phdr) \ ++ VMLINUX_SYMBOL(__per_cpu_load) = .; \ ++ .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \ ++ - LOAD_OFFSET) { \ ++ VMLINUX_SYMBOL(__per_cpu_start) = .; \ ++ *(.data.percpu.first) \ ++ *(.data.percpu.page_aligned) \ ++ *(.data.percpu) \ ++ *(.data.percpu.shared_aligned) \ ++ VMLINUX_SYMBOL(__per_cpu_end) = .; \ ++ } phdr \ ++ . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu); ++ ++/** ++ * PERCPU - define output section for percpu area, simple version ++ * @align: required alignment ++ * ++ * Align to @align and outputs output section for percpu area. This ++ * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and ++ * __per_cpu_start will be identical. ++ * ++ * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except ++ * that __per_cpu_load is defined as a relative symbol against ++ * .data.percpu which is required for relocatable x86_32 ++ * configuration. ++ */ + #define PERCPU(align) \ + . = ALIGN(align); \ +- VMLINUX_SYMBOL(__per_cpu_start) = .; \ +- .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ ++ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ ++ VMLINUX_SYMBOL(__per_cpu_load) = .; \ ++ VMLINUX_SYMBOL(__per_cpu_start) = .; \ ++ *(.data.percpu.first) \ + *(.data.percpu.page_aligned) \ + *(.data.percpu) \ + *(.data.percpu.shared_aligned) \ +- } \ +- VMLINUX_SYMBOL(__per_cpu_end) = .; ++ VMLINUX_SYMBOL(__per_cpu_end) = .; \ ++ } +Index: linux-2.6-tip/include/asm-m32r/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/asm-m32r/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/include/asm-m32r/swab.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-m32r/swab.h ++++ linux-2.6-tip/include/asm-m32r/swab.h +@@ -1,7 +1,7 @@ + #ifndef _ASM_M32R_SWAB_H + #define _ASM_M32R_SWAB_H + +-#include ++#include + + #if !defined(__STRICT_ANSI__) || defined(__KERNEL__) + # define __SWAB_64_THRU_32__ +Index: linux-2.6-tip/include/asm-mn10300/ftrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/asm-mn10300/ftrace.h +@@ -0,0 +1 @@ ++/* empty */ +Index: linux-2.6-tip/include/asm-mn10300/swab.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-mn10300/swab.h ++++ linux-2.6-tip/include/asm-mn10300/swab.h +@@ -11,7 +11,7 @@ + #ifndef _ASM_SWAB_H + #define _ASM_SWAB_H + +-#include ++#include + + #ifdef __GNUC__ + +Index: linux-2.6-tip/include/drm/drm.h +=================================================================== +--- linux-2.6-tip.orig/include/drm/drm.h ++++ linux-2.6-tip/include/drm/drm.h +@@ -36,8 +36,7 @@ + #ifndef _DRM_H_ + #define _DRM_H_ + +-#if defined(__KERNEL__) +-#endif ++#include + #include /* For _IO* macros */ + #define DRM_IOCTL_NR(n) _IOC_NR(n) + #define DRM_IOC_VOID _IOC_NONE +@@ -497,8 +496,8 @@ union drm_wait_vblank { + * \sa drmModesetCtl(). + */ + struct drm_modeset_ctl { +- uint32_t crtc; +- uint32_t cmd; ++ __u32 crtc; ++ __u32 cmd; + }; + + /** +@@ -574,29 +573,29 @@ struct drm_set_version { + /** DRM_IOCTL_GEM_CLOSE ioctl argument type */ + struct drm_gem_close { + /** Handle of the object to be closed. 
*/ +- uint32_t handle; +- uint32_t pad; ++ __u32 handle; ++ __u32 pad; + }; + + /** DRM_IOCTL_GEM_FLINK ioctl argument type */ + struct drm_gem_flink { + /** Handle for the object being named */ +- uint32_t handle; ++ __u32 handle; + + /** Returned global name */ +- uint32_t name; ++ __u32 name; + }; + + /** DRM_IOCTL_GEM_OPEN ioctl argument type */ + struct drm_gem_open { + /** Name of object being opened */ +- uint32_t name; ++ __u32 name; + + /** Returned handle for the object */ +- uint32_t handle; ++ __u32 handle; + + /** Returned size of the object */ +- uint64_t size; ++ __u64 size; + }; + + #include "drm_mode.h" +Index: linux-2.6-tip/include/drm/drm_mode.h +=================================================================== +--- linux-2.6-tip.orig/include/drm/drm_mode.h ++++ linux-2.6-tip/include/drm/drm_mode.h +@@ -27,11 +27,8 @@ + #ifndef _DRM_MODE_H + #define _DRM_MODE_H + +-#if !defined(__KERNEL__) && !defined(_KERNEL) +-#include +-#else + #include +-#endif ++#include + + #define DRM_DISPLAY_INFO_LEN 32 + #define DRM_CONNECTOR_NAME_LEN 32 +@@ -81,41 +78,41 @@ + #define DRM_MODE_DITHERING_ON 1 + + struct drm_mode_modeinfo { +- uint32_t clock; +- uint16_t hdisplay, hsync_start, hsync_end, htotal, hskew; +- uint16_t vdisplay, vsync_start, vsync_end, vtotal, vscan; ++ __u32 clock; ++ __u16 hdisplay, hsync_start, hsync_end, htotal, hskew; ++ __u16 vdisplay, vsync_start, vsync_end, vtotal, vscan; + +- uint32_t vrefresh; /* vertical refresh * 1000 */ ++ __u32 vrefresh; /* vertical refresh * 1000 */ + +- uint32_t flags; +- uint32_t type; ++ __u32 flags; ++ __u32 type; + char name[DRM_DISPLAY_MODE_LEN]; + }; + + struct drm_mode_card_res { +- uint64_t fb_id_ptr; +- uint64_t crtc_id_ptr; +- uint64_t connector_id_ptr; +- uint64_t encoder_id_ptr; +- uint32_t count_fbs; +- uint32_t count_crtcs; +- uint32_t count_connectors; +- uint32_t count_encoders; +- uint32_t min_width, max_width; +- uint32_t min_height, max_height; ++ __u64 fb_id_ptr; ++ __u64 crtc_id_ptr; ++ __u64 connector_id_ptr; ++ __u64 encoder_id_ptr; ++ __u32 count_fbs; ++ __u32 count_crtcs; ++ __u32 count_connectors; ++ __u32 count_encoders; ++ __u32 min_width, max_width; ++ __u32 min_height, max_height; + }; + + struct drm_mode_crtc { +- uint64_t set_connectors_ptr; +- uint32_t count_connectors; ++ __u64 set_connectors_ptr; ++ __u32 count_connectors; + +- uint32_t crtc_id; /**< Id */ +- uint32_t fb_id; /**< Id of framebuffer */ ++ __u32 crtc_id; /**< Id */ ++ __u32 fb_id; /**< Id of framebuffer */ + +- uint32_t x, y; /**< Position on the frameuffer */ ++ __u32 x, y; /**< Position on the frameuffer */ + +- uint32_t gamma_size; +- uint32_t mode_valid; ++ __u32 gamma_size; ++ __u32 mode_valid; + struct drm_mode_modeinfo mode; + }; + +@@ -126,13 +123,13 @@ struct drm_mode_crtc { + #define DRM_MODE_ENCODER_TVDAC 4 + + struct drm_mode_get_encoder { +- uint32_t encoder_id; +- uint32_t encoder_type; ++ __u32 encoder_id; ++ __u32 encoder_type; + +- uint32_t crtc_id; /**< Id of crtc */ ++ __u32 crtc_id; /**< Id of crtc */ + +- uint32_t possible_crtcs; +- uint32_t possible_clones; ++ __u32 possible_crtcs; ++ __u32 possible_clones; + }; + + /* This is for connectors with multiple signal types. 
*/ +@@ -161,23 +158,23 @@ struct drm_mode_get_encoder { + + struct drm_mode_get_connector { + +- uint64_t encoders_ptr; +- uint64_t modes_ptr; +- uint64_t props_ptr; +- uint64_t prop_values_ptr; +- +- uint32_t count_modes; +- uint32_t count_props; +- uint32_t count_encoders; +- +- uint32_t encoder_id; /**< Current Encoder */ +- uint32_t connector_id; /**< Id */ +- uint32_t connector_type; +- uint32_t connector_type_id; +- +- uint32_t connection; +- uint32_t mm_width, mm_height; /**< HxW in millimeters */ +- uint32_t subpixel; ++ __u64 encoders_ptr; ++ __u64 modes_ptr; ++ __u64 props_ptr; ++ __u64 prop_values_ptr; ++ ++ __u32 count_modes; ++ __u32 count_props; ++ __u32 count_encoders; ++ ++ __u32 encoder_id; /**< Current Encoder */ ++ __u32 connector_id; /**< Id */ ++ __u32 connector_type; ++ __u32 connector_type_id; ++ ++ __u32 connection; ++ __u32 mm_width, mm_height; /**< HxW in millimeters */ ++ __u32 subpixel; + }; + + #define DRM_MODE_PROP_PENDING (1<<0) +@@ -187,46 +184,46 @@ struct drm_mode_get_connector { + #define DRM_MODE_PROP_BLOB (1<<4) + + struct drm_mode_property_enum { +- uint64_t value; ++ __u64 value; + char name[DRM_PROP_NAME_LEN]; + }; + + struct drm_mode_get_property { +- uint64_t values_ptr; /* values and blob lengths */ +- uint64_t enum_blob_ptr; /* enum and blob id ptrs */ ++ __u64 values_ptr; /* values and blob lengths */ ++ __u64 enum_blob_ptr; /* enum and blob id ptrs */ + +- uint32_t prop_id; +- uint32_t flags; ++ __u32 prop_id; ++ __u32 flags; + char name[DRM_PROP_NAME_LEN]; + +- uint32_t count_values; +- uint32_t count_enum_blobs; ++ __u32 count_values; ++ __u32 count_enum_blobs; + }; + + struct drm_mode_connector_set_property { +- uint64_t value; +- uint32_t prop_id; +- uint32_t connector_id; ++ __u64 value; ++ __u32 prop_id; ++ __u32 connector_id; + }; + + struct drm_mode_get_blob { +- uint32_t blob_id; +- uint32_t length; +- uint64_t data; ++ __u32 blob_id; ++ __u32 length; ++ __u64 data; + }; + + struct drm_mode_fb_cmd { +- uint32_t fb_id; +- uint32_t width, height; +- uint32_t pitch; +- uint32_t bpp; +- uint32_t depth; ++ __u32 fb_id; ++ __u32 width, height; ++ __u32 pitch; ++ __u32 bpp; ++ __u32 depth; + /* driver specific handle */ +- uint32_t handle; ++ __u32 handle; + }; + + struct drm_mode_mode_cmd { +- uint32_t connector_id; ++ __u32 connector_id; + struct drm_mode_modeinfo mode; + }; + +@@ -248,24 +245,24 @@ struct drm_mode_mode_cmd { + * y + */ + struct drm_mode_cursor { +- uint32_t flags; +- uint32_t crtc_id; +- int32_t x; +- int32_t y; +- uint32_t width; +- uint32_t height; ++ __u32 flags; ++ __u32 crtc_id; ++ __s32 x; ++ __s32 y; ++ __u32 width; ++ __u32 height; + /* driver specific handle */ +- uint32_t handle; ++ __u32 handle; + }; + + struct drm_mode_crtc_lut { +- uint32_t crtc_id; +- uint32_t gamma_size; ++ __u32 crtc_id; ++ __u32 gamma_size; + + /* pointers to arrays */ +- uint64_t red; +- uint64_t green; +- uint64_t blue; ++ __u64 red; ++ __u64 green; ++ __u64 blue; + }; + + #endif +Index: linux-2.6-tip/include/drm/i915_drm.h +=================================================================== +--- linux-2.6-tip.orig/include/drm/i915_drm.h ++++ linux-2.6-tip/include/drm/i915_drm.h +@@ -30,7 +30,7 @@ + /* Please note that modifications to all structs defined here are + * subject to backwards-compatibility constraints. + */ +- ++#include + #include "drm.h" + + /* Each region is a minimum of 16k, and there are at most 255 of them. 
+@@ -116,15 +116,15 @@ typedef struct _drm_i915_sarea { + + /* fill out some space for old userspace triple buffer */ + drm_handle_t unused_handle; +- uint32_t unused1, unused2, unused3; ++ __u32 unused1, unused2, unused3; + + /* buffer object handles for static buffers. May change + * over the lifetime of the client. + */ +- uint32_t front_bo_handle; +- uint32_t back_bo_handle; +- uint32_t unused_bo_handle; +- uint32_t depth_bo_handle; ++ __u32 front_bo_handle; ++ __u32 back_bo_handle; ++ __u32 unused_bo_handle; ++ __u32 depth_bo_handle; + + } drm_i915_sarea_t; + +@@ -327,7 +327,7 @@ typedef struct drm_i915_vblank_swap { + } drm_i915_vblank_swap_t; + + typedef struct drm_i915_hws_addr { +- uint64_t addr; ++ __u64 addr; + } drm_i915_hws_addr_t; + + struct drm_i915_gem_init { +@@ -335,12 +335,12 @@ struct drm_i915_gem_init { + * Beginning offset in the GTT to be managed by the DRM memory + * manager. + */ +- uint64_t gtt_start; ++ __u64 gtt_start; + /** + * Ending offset in the GTT to be managed by the DRM memory + * manager. + */ +- uint64_t gtt_end; ++ __u64 gtt_end; + }; + + struct drm_i915_gem_create { +@@ -349,94 +349,94 @@ struct drm_i915_gem_create { + * + * The (page-aligned) allocated size for the object will be returned. + */ +- uint64_t size; ++ __u64 size; + /** + * Returned handle for the object. + * + * Object handles are nonzero. + */ +- uint32_t handle; +- uint32_t pad; ++ __u32 handle; ++ __u32 pad; + }; + + struct drm_i915_gem_pread { + /** Handle for the object being read. */ +- uint32_t handle; +- uint32_t pad; ++ __u32 handle; ++ __u32 pad; + /** Offset into the object to read from */ +- uint64_t offset; ++ __u64 offset; + /** Length of data to read */ +- uint64_t size; ++ __u64 size; + /** + * Pointer to write the data into. + * + * This is a fixed-size type for 32/64 compatibility. + */ +- uint64_t data_ptr; ++ __u64 data_ptr; + }; + + struct drm_i915_gem_pwrite { + /** Handle for the object being written to. */ +- uint32_t handle; +- uint32_t pad; ++ __u32 handle; ++ __u32 pad; + /** Offset into the object to write to */ +- uint64_t offset; ++ __u64 offset; + /** Length of data to write */ +- uint64_t size; ++ __u64 size; + /** + * Pointer to read the data from. + * + * This is a fixed-size type for 32/64 compatibility. + */ +- uint64_t data_ptr; ++ __u64 data_ptr; + }; + + struct drm_i915_gem_mmap { + /** Handle for the object being mapped. */ +- uint32_t handle; +- uint32_t pad; ++ __u32 handle; ++ __u32 pad; + /** Offset in the object to map. */ +- uint64_t offset; ++ __u64 offset; + /** + * Length of data to map. + * + * The value will be page-aligned. + */ +- uint64_t size; ++ __u64 size; + /** + * Returned pointer the data was mapped at. + * + * This is a fixed-size type for 32/64 compatibility. + */ +- uint64_t addr_ptr; ++ __u64 addr_ptr; + }; + + struct drm_i915_gem_mmap_gtt { + /** Handle for the object being mapped. */ +- uint32_t handle; +- uint32_t pad; ++ __u32 handle; ++ __u32 pad; + /** + * Fake offset to use for subsequent mmap call + * + * This is a fixed-size type for 32/64 compatibility. 
+ */ +- uint64_t offset; ++ __u64 offset; + }; + + struct drm_i915_gem_set_domain { + /** Handle for the object */ +- uint32_t handle; ++ __u32 handle; + + /** New read domains */ +- uint32_t read_domains; ++ __u32 read_domains; + + /** New write domain */ +- uint32_t write_domain; ++ __u32 write_domain; + }; + + struct drm_i915_gem_sw_finish { + /** Handle for the object */ +- uint32_t handle; ++ __u32 handle; + }; + + struct drm_i915_gem_relocation_entry { +@@ -448,16 +448,16 @@ struct drm_i915_gem_relocation_entry { + * a relocation list for state buffers and not re-write it per + * exec using the buffer. + */ +- uint32_t target_handle; ++ __u32 target_handle; + + /** + * Value to be added to the offset of the target buffer to make up + * the relocation entry. + */ +- uint32_t delta; ++ __u32 delta; + + /** Offset in the buffer the relocation entry will be written into */ +- uint64_t offset; ++ __u64 offset; + + /** + * Offset value of the target buffer that the relocation entry was last +@@ -467,12 +467,12 @@ struct drm_i915_gem_relocation_entry { + * and writing the relocation. This value is written back out by + * the execbuffer ioctl when the relocation is written. + */ +- uint64_t presumed_offset; ++ __u64 presumed_offset; + + /** + * Target memory domains read by this operation. + */ +- uint32_t read_domains; ++ __u32 read_domains; + + /** + * Target memory domains written by this operation. +@@ -481,7 +481,7 @@ struct drm_i915_gem_relocation_entry { + * execbuffer operation, so that where there are conflicts, + * the application will get -EINVAL back. + */ +- uint32_t write_domain; ++ __u32 write_domain; + }; + + /** @{ +@@ -512,24 +512,24 @@ struct drm_i915_gem_exec_object { + * User's handle for a buffer to be bound into the GTT for this + * operation. + */ +- uint32_t handle; ++ __u32 handle; + + /** Number of relocations to be performed on this buffer */ +- uint32_t relocation_count; ++ __u32 relocation_count; + /** + * Pointer to array of struct drm_i915_gem_relocation_entry containing + * the relocations to be performed in this buffer. + */ +- uint64_t relocs_ptr; ++ __u64 relocs_ptr; + + /** Required alignment in graphics aperture */ +- uint64_t alignment; ++ __u64 alignment; + + /** + * Returned value of the updated offset of the object, for future + * presumed_offset writes. + */ +- uint64_t offset; ++ __u64 offset; + }; + + struct drm_i915_gem_execbuffer { +@@ -543,44 +543,44 @@ struct drm_i915_gem_execbuffer { + * a buffer is performing refer to buffers that have already appeared + * in the validate list. + */ +- uint64_t buffers_ptr; +- uint32_t buffer_count; ++ __u64 buffers_ptr; ++ __u32 buffer_count; + + /** Offset in the batchbuffer to start execution from. */ +- uint32_t batch_start_offset; ++ __u32 batch_start_offset; + /** Bytes used in batchbuffer from batch_start_offset */ +- uint32_t batch_len; +- uint32_t DR1; +- uint32_t DR4; +- uint32_t num_cliprects; ++ __u32 batch_len; ++ __u32 DR1; ++ __u32 DR4; ++ __u32 num_cliprects; + /** This is a struct drm_clip_rect *cliprects */ +- uint64_t cliprects_ptr; ++ __u64 cliprects_ptr; + }; + + struct drm_i915_gem_pin { + /** Handle of the buffer to be pinned. */ +- uint32_t handle; +- uint32_t pad; ++ __u32 handle; ++ __u32 pad; + + /** alignment required within the aperture */ +- uint64_t alignment; ++ __u64 alignment; + + /** Returned GTT offset of the buffer. */ +- uint64_t offset; ++ __u64 offset; + }; + + struct drm_i915_gem_unpin { + /** Handle of the buffer to be unpinned. 
*/ +- uint32_t handle; +- uint32_t pad; ++ __u32 handle; ++ __u32 pad; + }; + + struct drm_i915_gem_busy { + /** Handle of the buffer to check for busy */ +- uint32_t handle; ++ __u32 handle; + + /** Return busy status (1 if busy, 0 if idle) */ +- uint32_t busy; ++ __u32 busy; + }; + + #define I915_TILING_NONE 0 +@@ -597,7 +597,7 @@ struct drm_i915_gem_busy { + + struct drm_i915_gem_set_tiling { + /** Handle of the buffer to have its tiling state updated */ +- uint32_t handle; ++ __u32 handle; + + /** + * Tiling mode for the object (I915_TILING_NONE, I915_TILING_X, +@@ -611,47 +611,47 @@ struct drm_i915_gem_set_tiling { + * + * Buffer contents become undefined when changing tiling_mode. + */ +- uint32_t tiling_mode; ++ __u32 tiling_mode; + + /** + * Stride in bytes for the object when in I915_TILING_X or + * I915_TILING_Y. + */ +- uint32_t stride; ++ __u32 stride; + + /** + * Returned address bit 6 swizzling required for CPU access through + * mmap mapping. + */ +- uint32_t swizzle_mode; ++ __u32 swizzle_mode; + }; + + struct drm_i915_gem_get_tiling { + /** Handle of the buffer to get tiling state for. */ +- uint32_t handle; ++ __u32 handle; + + /** + * Current tiling mode for the object (I915_TILING_NONE, I915_TILING_X, + * I915_TILING_Y). + */ +- uint32_t tiling_mode; ++ __u32 tiling_mode; + + /** + * Returned address bit 6 swizzling required for CPU access through + * mmap mapping. + */ +- uint32_t swizzle_mode; ++ __u32 swizzle_mode; + }; + + struct drm_i915_gem_get_aperture { + /** Total size of the aperture used by i915_gem_execbuffer, in bytes */ +- uint64_t aper_size; ++ __u64 aper_size; + + /** + * Available space in the aperture used by i915_gem_execbuffer, in + * bytes + */ +- uint64_t aper_available_size; ++ __u64 aper_available_size; + }; + + #endif /* _I915_DRM_H_ */ +Index: linux-2.6-tip/include/drm/mga_drm.h +=================================================================== +--- linux-2.6-tip.orig/include/drm/mga_drm.h ++++ linux-2.6-tip/include/drm/mga_drm.h +@@ -35,6 +35,8 @@ + #ifndef __MGA_DRM_H__ + #define __MGA_DRM_H__ + ++#include ++ + /* WARNING: If you change any of these defines, make sure to change the + * defines in the Xserver file (mga_sarea.h) + */ +@@ -255,8 +257,8 @@ typedef struct _drm_mga_sarea { + #define DRM_IOCTL_MGA_ILOAD DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_ILOAD, drm_mga_iload_t) + #define DRM_IOCTL_MGA_BLIT DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_BLIT, drm_mga_blit_t) + #define DRM_IOCTL_MGA_GETPARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_MGA_GETPARAM, drm_mga_getparam_t) +-#define DRM_IOCTL_MGA_SET_FENCE DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_SET_FENCE, uint32_t) +-#define DRM_IOCTL_MGA_WAIT_FENCE DRM_IOWR(DRM_COMMAND_BASE + DRM_MGA_WAIT_FENCE, uint32_t) ++#define DRM_IOCTL_MGA_SET_FENCE DRM_IOW( DRM_COMMAND_BASE + DRM_MGA_SET_FENCE, __u32) ++#define DRM_IOCTL_MGA_WAIT_FENCE DRM_IOWR(DRM_COMMAND_BASE + DRM_MGA_WAIT_FENCE, __u32) + #define DRM_IOCTL_MGA_DMA_BOOTSTRAP DRM_IOWR(DRM_COMMAND_BASE + DRM_MGA_DMA_BOOTSTRAP, drm_mga_dma_bootstrap_t) + + typedef struct _drm_mga_warp_index { +@@ -310,7 +312,7 @@ typedef struct drm_mga_dma_bootstrap { + */ + /*@{ */ + unsigned long texture_handle; /**< Handle used to map AGP textures. */ +- uint32_t texture_size; /**< Size of the AGP texture region. */ ++ __u32 texture_size; /**< Size of the AGP texture region. */ + /*@} */ + + /** +@@ -319,7 +321,7 @@ typedef struct drm_mga_dma_bootstrap { + * On return from the DRM_MGA_DMA_BOOTSTRAP ioctl, this field will be + * filled in with the actual AGP mode. 
If AGP was not available + */ +- uint32_t primary_size; ++ __u32 primary_size; + + /** + * Requested number of secondary DMA buffers. +@@ -329,7 +331,7 @@ typedef struct drm_mga_dma_bootstrap { + * allocated. Particularly when PCI DMA is used, this may be + * (subtantially) less than the number requested. + */ +- uint32_t secondary_bin_count; ++ __u32 secondary_bin_count; + + /** + * Requested size of each secondary DMA buffer. +@@ -338,7 +340,7 @@ typedef struct drm_mga_dma_bootstrap { + * dma_mga_dma_bootstrap::secondary_bin_count, it is \b not allowed + * to reduce dma_mga_dma_bootstrap::secondary_bin_size. + */ +- uint32_t secondary_bin_size; ++ __u32 secondary_bin_size; + + /** + * Bit-wise mask of AGPSTAT2_* values. Currently only \c AGPSTAT2_1X, +@@ -350,12 +352,12 @@ typedef struct drm_mga_dma_bootstrap { + * filled in with the actual AGP mode. If AGP was not available + * (i.e., PCI DMA was used), this value will be zero. + */ +- uint32_t agp_mode; ++ __u32 agp_mode; + + /** + * Desired AGP GART size, measured in megabytes. + */ +- uint8_t agp_size; ++ __u8 agp_size; + } drm_mga_dma_bootstrap_t; + + typedef struct drm_mga_clear { +Index: linux-2.6-tip/include/drm/radeon_drm.h +=================================================================== +--- linux-2.6-tip.orig/include/drm/radeon_drm.h ++++ linux-2.6-tip/include/drm/radeon_drm.h +@@ -33,6 +33,8 @@ + #ifndef __RADEON_DRM_H__ + #define __RADEON_DRM_H__ + ++#include ++ + /* WARNING: If you change any of these defines, make sure to change the + * defines in the X server file (radeon_sarea.h) + */ +@@ -722,7 +724,7 @@ typedef struct drm_radeon_irq_wait { + + typedef struct drm_radeon_setparam { + unsigned int param; +- int64_t value; ++ __s64 value; + } drm_radeon_setparam_t; + + #define RADEON_SETPARAM_FB_LOCATION 1 /* determined framebuffer location */ +Index: linux-2.6-tip/include/drm/via_drm.h +=================================================================== +--- linux-2.6-tip.orig/include/drm/via_drm.h ++++ linux-2.6-tip/include/drm/via_drm.h +@@ -24,6 +24,8 @@ + #ifndef _VIA_DRM_H_ + #define _VIA_DRM_H_ + ++#include ++ + /* WARNING: These defines must be the same as what the Xserver uses. + * if you change them, you must change the defines in the Xserver. 
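The drm_mga_dma_bootstrap_t comments above describe a request/acknowledge pattern: userspace fills in the sizes it would like and the driver writes back what it actually granted (agp_mode comes back zero when it had to fall back to PCI DMA). A rough sketch of that round trip, assuming DRM master rights on a hypothetical /dev/dri/card0 and using the DRM_IOCTL_MGA_DMA_BOOTSTRAP macro quoted earlier in this header; the requested sizes are arbitrary example values:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <drm/drm.h>
#include <drm/mga_drm.h>

int main(void)
{
	drm_mga_dma_bootstrap_t bs = { 0 };
	int fd = open("/dev/dri/card0", O_RDWR);

	if (fd < 0) {
		perror("open /dev/dri/card0");
		return 1;
	}
	bs.primary_size       = 1024 * 1024; /* requested primary DMA region */
	bs.secondary_bin_count = 16;         /* requested secondary buffers */
	bs.secondary_bin_size  = 64 * 1024;
	bs.agp_size            = 32;         /* desired AGP GART size in MB */

	if (ioctl(fd, DRM_IOCTL_MGA_DMA_BOOTSTRAP, &bs) == 0)
		/* On return the same fields hold what was really allocated. */
		printf("granted %u bins of %u bytes, agp_mode=%u\n",
		       bs.secondary_bin_count, bs.secondary_bin_size,
		       bs.agp_mode);
	else
		perror("DRM_IOCTL_MGA_DMA_BOOTSTRAP");
	close(fd);
	return 0;
}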
+ */ +@@ -114,19 +116,19 @@ + #define VIA_MEM_UNKNOWN 4 + + typedef struct { +- uint32_t offset; +- uint32_t size; ++ __u32 offset; ++ __u32 size; + } drm_via_agp_t; + + typedef struct { +- uint32_t offset; +- uint32_t size; ++ __u32 offset; ++ __u32 size; + } drm_via_fb_t; + + typedef struct { +- uint32_t context; +- uint32_t type; +- uint32_t size; ++ __u32 context; ++ __u32 type; ++ __u32 size; + unsigned long index; + unsigned long offset; + } drm_via_mem_t; +@@ -148,9 +150,9 @@ typedef struct _drm_via_futex { + VIA_FUTEX_WAIT = 0x00, + VIA_FUTEX_WAKE = 0X01 + } func; +- uint32_t ms; +- uint32_t lock; +- uint32_t val; ++ __u32 ms; ++ __u32 lock; ++ __u32 val; + } drm_via_futex_t; + + typedef struct _drm_via_dma_init { +@@ -211,7 +213,7 @@ typedef struct _drm_via_cmdbuf_size { + VIA_CMDBUF_LAG = 0x02 + } func; + int wait; +- uint32_t size; ++ __u32 size; + } drm_via_cmdbuf_size_t; + + typedef enum { +@@ -236,8 +238,8 @@ enum drm_via_irqs { + struct drm_via_wait_irq_request { + unsigned irq; + via_irq_seq_type_t type; +- uint32_t sequence; +- uint32_t signal; ++ __u32 sequence; ++ __u32 signal; + }; + + typedef union drm_via_irqwait { +@@ -246,7 +248,7 @@ typedef union drm_via_irqwait { + } drm_via_irqwait_t; + + typedef struct drm_via_blitsync { +- uint32_t sync_handle; ++ __u32 sync_handle; + unsigned engine; + } drm_via_blitsync_t; + +@@ -257,16 +259,16 @@ typedef struct drm_via_blitsync { + */ + + typedef struct drm_via_dmablit { +- uint32_t num_lines; +- uint32_t line_length; ++ __u32 num_lines; ++ __u32 line_length; + +- uint32_t fb_addr; +- uint32_t fb_stride; ++ __u32 fb_addr; ++ __u32 fb_stride; + + unsigned char *mem_addr; +- uint32_t mem_stride; ++ __u32 mem_stride; + +- uint32_t flags; ++ __u32 flags; + int to_fb; + + drm_via_blitsync_t sync; +Index: linux-2.6-tip/include/linux/acpi.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/acpi.h ++++ linux-2.6-tip/include/linux/acpi.h +@@ -79,6 +79,7 @@ typedef int (*acpi_table_handler) (struc + typedef int (*acpi_table_entry_handler) (struct acpi_subtable_header *header, const unsigned long end); + + char * __acpi_map_table (unsigned long phys_addr, unsigned long size); ++void __acpi_unmap_table(char *map, unsigned long size); + int early_acpi_boot_init(void); + int acpi_boot_init (void); + int acpi_boot_table_init (void); +Index: linux-2.6-tip/include/linux/agpgart.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/agpgart.h ++++ linux-2.6-tip/include/linux/agpgart.h +@@ -77,20 +77,20 @@ typedef struct _agp_setup { + * The "prot" down below needs still a "sleep" flag somehow ... 
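The new __acpi_unmap_table() declaration pairs with the existing __acpi_map_table(): early boot code maps a table by physical address, reads it, and now has a symmetric way to hand the mapping back. A kernel-context sketch of that pairing (the helper name and its caller are hypothetical, not code from this patch):

#include <linux/acpi.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical early-boot helper: peek at a table's 4-byte signature. */
static int __init show_table_signature(unsigned long phys_addr)
{
	char *hdr = __acpi_map_table(phys_addr,
				     sizeof(struct acpi_table_header));

	if (!hdr)
		return -ENOMEM;
	/* The signature is the first four bytes of every ACPI table header. */
	pr_info("ACPI table at %#lx: %.4s\n", phys_addr, hdr);
	__acpi_unmap_table(hdr, sizeof(struct acpi_table_header));
	return 0;
}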
+ */ + typedef struct _agp_segment { +- off_t pg_start; /* starting page to populate */ +- size_t pg_count; /* number of pages */ +- int prot; /* prot flags for mmap */ ++ __kernel_off_t pg_start; /* starting page to populate */ ++ __kernel_size_t pg_count; /* number of pages */ ++ int prot; /* prot flags for mmap */ + } agp_segment; + + typedef struct _agp_region { +- pid_t pid; /* pid of process */ +- size_t seg_count; /* number of segments */ ++ __kernel_pid_t pid; /* pid of process */ ++ __kernel_size_t seg_count; /* number of segments */ + struct _agp_segment *seg_list; + } agp_region; + + typedef struct _agp_allocate { + int key; /* tag of allocation */ +- size_t pg_count; /* number of pages */ ++ __kernel_size_t pg_count;/* number of pages */ + __u32 type; /* 0 == normal, other devspec */ + __u32 physical; /* device specific (some devices + * need a phys address of the +@@ -100,7 +100,7 @@ typedef struct _agp_allocate { + + typedef struct _agp_bind { + int key; /* tag of allocation */ +- off_t pg_start; /* starting page to populate */ ++ __kernel_off_t pg_start;/* starting page to populate */ + } agp_bind; + + typedef struct _agp_unbind { +Index: linux-2.6-tip/include/linux/atmlec.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/atmlec.h ++++ linux-2.6-tip/include/linux/atmlec.h +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + /* ATM lec daemon control socket */ + #define ATMLEC_CTRL _IO('a', ATMIOC_LANE) +@@ -78,8 +79,8 @@ struct atmlec_msg { + } normal; + struct atmlec_config_msg config; + struct { +- uint16_t lec_id; /* requestor lec_id */ +- uint32_t tran_id; /* transaction id */ ++ __u16 lec_id; /* requestor lec_id */ ++ __u32 tran_id; /* transaction id */ + unsigned char mac_addr[ETH_ALEN]; /* dst mac addr */ + unsigned char atm_addr[ATM_ESA_LEN]; /* reqestor ATM addr */ + } proxy; /* +Index: linux-2.6-tip/include/linux/atmmpc.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/atmmpc.h ++++ linux-2.6-tip/include/linux/atmmpc.h +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + + #define ATMMPC_CTRL _IO('a', ATMIOC_MPOA) + #define ATMMPC_DATA _IO('a', ATMIOC_MPOA+1) +@@ -18,39 +19,39 @@ struct atmmpc_ioc { + }; + + typedef struct in_ctrl_info { +- uint8_t Last_NHRP_CIE_code; +- uint8_t Last_Q2931_cause_value; +- uint8_t eg_MPC_ATM_addr[ATM_ESA_LEN]; ++ __u8 Last_NHRP_CIE_code; ++ __u8 Last_Q2931_cause_value; ++ __u8 eg_MPC_ATM_addr[ATM_ESA_LEN]; + __be32 tag; + __be32 in_dst_ip; /* IP address this ingress MPC sends packets to */ +- uint16_t holding_time; +- uint32_t request_id; ++ __u16 holding_time; ++ __u32 request_id; + } in_ctrl_info; + + typedef struct eg_ctrl_info { +- uint8_t DLL_header[256]; +- uint8_t DH_length; ++ __u8 DLL_header[256]; ++ __u8 DH_length; + __be32 cache_id; + __be32 tag; + __be32 mps_ip; + __be32 eg_dst_ip; /* IP address to which ingress MPC sends packets */ +- uint8_t in_MPC_data_ATM_addr[ATM_ESA_LEN]; +- uint16_t holding_time; ++ __u8 in_MPC_data_ATM_addr[ATM_ESA_LEN]; ++ __u16 holding_time; + } eg_ctrl_info; + + struct mpc_parameters { +- uint16_t mpc_p1; /* Shortcut-Setup Frame Count */ +- uint16_t mpc_p2; /* Shortcut-Setup Frame Time */ +- uint8_t mpc_p3[8]; /* Flow-detection Protocols */ +- uint16_t mpc_p4; /* MPC Initial Retry Time */ +- uint16_t mpc_p5; /* MPC Retry Time Maximum */ +- uint16_t mpc_p6; /* Hold Down Time */ ++ __u16 mpc_p1; /* Shortcut-Setup Frame Count */ ++ __u16 mpc_p2; /* 
Shortcut-Setup Frame Time */ ++ __u8 mpc_p3[8]; /* Flow-detection Protocols */ ++ __u16 mpc_p4; /* MPC Initial Retry Time */ ++ __u16 mpc_p5; /* MPC Retry Time Maximum */ ++ __u16 mpc_p6; /* Hold Down Time */ + } ; + + struct k_message { +- uint16_t type; ++ __u16 type; + __be32 ip_mask; +- uint8_t MPS_ctrl[ATM_ESA_LEN]; ++ __u8 MPS_ctrl[ATM_ESA_LEN]; + union { + in_ctrl_info in_info; + eg_ctrl_info eg_info; +@@ -61,11 +62,11 @@ struct k_message { + + struct llc_snap_hdr { + /* RFC 1483 LLC/SNAP encapsulation for routed IP PDUs */ +- uint8_t dsap; /* Destination Service Access Point (0xAA) */ +- uint8_t ssap; /* Source Service Access Point (0xAA) */ +- uint8_t ui; /* Unnumbered Information (0x03) */ +- uint8_t org[3]; /* Organizational identification (0x000000) */ +- uint8_t type[2]; /* Ether type (for IP) (0x0800) */ ++ __u8 dsap; /* Destination Service Access Point (0xAA) */ ++ __u8 ssap; /* Source Service Access Point (0xAA) */ ++ __u8 ui; /* Unnumbered Information (0x03) */ ++ __u8 org[3]; /* Organizational identification (0x000000) */ ++ __u8 type[2]; /* Ether type (for IP) (0x0800) */ + }; + + /* TLVs this MPC recognizes */ +Index: linux-2.6-tip/include/linux/audit.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/audit.h ++++ linux-2.6-tip/include/linux/audit.h +@@ -606,7 +606,8 @@ extern int audit_enabled; + #define audit_log(c,g,t,f,...) do { ; } while (0) + #define audit_log_start(c,g,t) ({ NULL; }) + #define audit_log_vformat(b,f,a) do { ; } while (0) +-#define audit_log_format(b,f,...) do { ; } while (0) ++static inline void __attribute__ ((format(printf, 2, 3))) ++audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { } + #define audit_log_end(b) do { ; } while (0) + #define audit_log_n_hex(a,b,l) do { ; } while (0) + #define audit_log_n_string(a,c,l) do { ; } while (0) +Index: linux-2.6-tip/include/linux/blktrace_api.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/blktrace_api.h ++++ linux-2.6-tip/include/linux/blktrace_api.h +@@ -144,6 +144,9 @@ struct blk_user_trace_setup { + + #ifdef __KERNEL__ + #if defined(CONFIG_BLK_DEV_IO_TRACE) ++ ++#include ++ + struct blk_trace { + int trace_state; + struct rchan *rchan; +@@ -194,6 +197,8 @@ extern int blk_trace_setup(struct reques + extern int blk_trace_startstop(struct request_queue *q, int start); + extern int blk_trace_remove(struct request_queue *q); + ++extern struct attribute_group blk_trace_attr_group; ++ + #else /* !CONFIG_BLK_DEV_IO_TRACE */ + #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) + #define blk_trace_shutdown(q) do { } while (0) +Index: linux-2.6-tip/include/linux/bootmem.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/bootmem.h ++++ linux-2.6-tip/include/linux/bootmem.h +@@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long a + #define BOOTMEM_DEFAULT 0 + #define BOOTMEM_EXCLUSIVE (1<<0) + ++extern int reserve_bootmem(unsigned long addr, ++ unsigned long size, ++ int flags); + extern int reserve_bootmem_node(pg_data_t *pgdat, +- unsigned long physaddr, +- unsigned long size, +- int flags); +-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +-extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); +-#endif ++ unsigned long physaddr, ++ unsigned long size, ++ int flags); + +-extern void *__alloc_bootmem_nopanic(unsigned long size, ++extern void *__alloc_bootmem(unsigned long size, + unsigned long align, + 
unsigned long goal); +-extern void *__alloc_bootmem(unsigned long size, ++extern void *__alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); +-extern void *__alloc_bootmem_low(unsigned long size, +- unsigned long align, +- unsigned long goal); + extern void *__alloc_bootmem_node(pg_data_t *pgdat, + unsigned long size, + unsigned long align, +@@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopani + unsigned long size, + unsigned long align, + unsigned long goal); ++extern void *__alloc_bootmem_low(unsigned long size, ++ unsigned long align, ++ unsigned long goal); + extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal); +-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE ++ + #define alloc_bootmem(x) \ + __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) + #define alloc_bootmem_nopanic(x) \ + __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +-#define alloc_bootmem_low(x) \ +- __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) + #define alloc_bootmem_pages(x) \ + __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + #define alloc_bootmem_pages_nopanic(x) \ + __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +-#define alloc_bootmem_low_pages(x) \ +- __alloc_bootmem_low(x, PAGE_SIZE, 0) + #define alloc_bootmem_node(pgdat, x) \ + __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) + #define alloc_bootmem_pages_node(pgdat, x) \ + __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) ++#define alloc_bootmem_pages_node_nopanic(pgdat, x) \ ++ __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) ++ ++#define alloc_bootmem_low(x) \ ++ __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) ++#define alloc_bootmem_low_pages(x) \ ++ __alloc_bootmem_low(x, PAGE_SIZE, 0) + #define alloc_bootmem_low_pages_node(pgdat, x) \ + __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) +-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ + + extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, + int flags); +Index: linux-2.6-tip/include/linux/c2port.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/c2port.h ++++ linux-2.6-tip/include/linux/c2port.h +@@ -10,6 +10,7 @@ + */ + + #include ++#include + + #define C2PORT_NAME_LEN 32 + +@@ -20,8 +21,10 @@ + /* Main struct */ + struct c2port_ops; + struct c2port_device { +- unsigned int access:1; +- unsigned int flash_access:1; ++ kmemcheck_define_bitfield(flags, { ++ unsigned int access:1; ++ unsigned int flash_access:1; ++ }); + + int id; + char name[C2PORT_NAME_LEN]; +Index: linux-2.6-tip/include/linux/cm4000_cs.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/cm4000_cs.h ++++ linux-2.6-tip/include/linux/cm4000_cs.h +@@ -1,6 +1,8 @@ + #ifndef _CM4000_H_ + #define _CM4000_H_ + ++#include ++ + #define MAX_ATR 33 + + #define CM4000_MAX_DEV 4 +@@ -10,9 +12,9 @@ + * not to break compilation of userspace apps. -HW */ + + typedef struct atreq { +- int32_t atr_len; ++ __s32 atr_len; + unsigned char atr[64]; +- int32_t power_act; ++ __s32 power_act; + unsigned char bIFSD; + unsigned char bIFSC; + } atreq_t; +@@ -22,13 +24,13 @@ typedef struct atreq { + * member sizes. This leads to CONFIG_COMPAT breakage, since 32bit userspace + * will lay out the structure members differently than the 64bit kernel. + * +- * I've changed "ptsreq.protocol" from "unsigned long" to "u_int32_t". 
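The cm4000 comment above states the rationale behind this whole series of type changes: a bare unsigned long (or size_t, off_t, pid_t) inside an ioctl structure is 4 bytes for 32-bit userspace but 8 bytes for a 64-bit kernel, so the two sides disagree about member offsets under CONFIG_COMPAT, while fixed-width types keep the layout identical everywhere. A small userspace demonstration of the effect (illustrative only; the two structs simply mirror the ptsreq members discussed above):

#include <stdio.h>
#include <stddef.h>
#include <linux/types.h>

/* Same structure written with a native long and with a fixed-width type. */
struct pts_long  { unsigned long protocol; unsigned char flags; };
struct pts_fixed { __u32         protocol; unsigned char flags; };

int main(void)
{
	/* On i386 both offsets are 4; on x86_64 the 'long' variant jumps to 8,
	 * which is exactly the 32-bit-app-on-64-bit-kernel breakage the
	 * comment describes.  The __u32 variant is the same on both. */
	printf("offsetof(flags): long=%zu fixed=%zu\n",
	       offsetof(struct pts_long, flags),
	       offsetof(struct pts_fixed, flags));
	printf("sizeof: long=%zu fixed=%zu\n",
	       sizeof(struct pts_long), sizeof(struct pts_fixed));
	return 0;
}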
++ * I've changed "ptsreq.protocol" from "unsigned long" to "__u32". + * On 32bit this will make no difference. With 64bit kernels, it will make + * 32bit apps work, too. + */ + + typedef struct ptsreq { +- u_int32_t protocol; /*T=0: 2^0, T=1: 2^1*/ ++ __u32 protocol; /*T=0: 2^0, T=1: 2^1*/ + unsigned char flags; + unsigned char pts1; + unsigned char pts2; +Index: linux-2.6-tip/include/linux/cn_proc.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/cn_proc.h ++++ linux-2.6-tip/include/linux/cn_proc.h +@@ -65,20 +65,20 @@ struct proc_event { + } ack; + + struct fork_proc_event { +- pid_t parent_pid; +- pid_t parent_tgid; +- pid_t child_pid; +- pid_t child_tgid; ++ __kernel_pid_t parent_pid; ++ __kernel_pid_t parent_tgid; ++ __kernel_pid_t child_pid; ++ __kernel_pid_t child_tgid; + } fork; + + struct exec_proc_event { +- pid_t process_pid; +- pid_t process_tgid; ++ __kernel_pid_t process_pid; ++ __kernel_pid_t process_tgid; + } exec; + + struct id_proc_event { +- pid_t process_pid; +- pid_t process_tgid; ++ __kernel_pid_t process_pid; ++ __kernel_pid_t process_tgid; + union { + __u32 ruid; /* task uid */ + __u32 rgid; /* task gid */ +@@ -90,8 +90,8 @@ struct proc_event { + } id; + + struct exit_proc_event { +- pid_t process_pid; +- pid_t process_tgid; ++ __kernel_pid_t process_pid; ++ __kernel_pid_t process_tgid; + __u32 exit_code, exit_signal; + } exit; + } event_data; +Index: linux-2.6-tip/include/linux/coda_linux.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/coda_linux.h ++++ linux-2.6-tip/include/linux/coda_linux.h +@@ -51,10 +51,6 @@ void coda_vattr_to_iattr(struct inode *, + void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); + unsigned short coda_flags_to_cflags(unsigned short); + +-/* sysctl.h */ +-void coda_sysctl_init(void); +-void coda_sysctl_clean(void); +- + #define CODA_ALLOC(ptr, cast, size) do { \ + if (size < PAGE_SIZE) \ + ptr = kmalloc((unsigned long) size, GFP_KERNEL); \ +Index: linux-2.6-tip/include/linux/coda_psdev.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/coda_psdev.h ++++ linux-2.6-tip/include/linux/coda_psdev.h +@@ -6,6 +6,7 @@ + #define CODA_PSDEV_MAJOR 67 + #define MAX_CODADEVS 5 /* how many do we allow */ + ++#ifdef __KERNEL__ + struct kstatfs; + + /* communication pending/processing queues */ +@@ -24,7 +25,6 @@ static inline struct venus_comm *coda_vc + return (struct venus_comm *)((sb)->s_fs_info); + } + +- + /* upcalls */ + int venus_rootfid(struct super_block *sb, struct CodaFid *fidp); + int venus_getattr(struct super_block *sb, struct CodaFid *fid, +@@ -64,6 +64,12 @@ int coda_downcall(int opcode, union outp + int venus_fsync(struct super_block *sb, struct CodaFid *fid); + int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); + ++/* ++ * Statistics ++ */ ++ ++extern struct venus_comm coda_comms[]; ++#endif /* __KERNEL__ */ + + /* messages between coda filesystem in kernel and Venus */ + struct upc_req { +@@ -82,11 +88,4 @@ struct upc_req { + #define REQ_WRITE 0x4 + #define REQ_ABORT 0x8 + +- +-/* +- * Statistics +- */ +- +-extern struct venus_comm coda_comms[]; +- + #endif +Index: linux-2.6-tip/include/linux/compat.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/compat.h ++++ linux-2.6-tip/include/linux/compat.h +@@ -208,6 +208,8 @@ int copy_siginfo_from_user32(siginfo_t * + int 
copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); + int get_compat_sigevent(struct sigevent *event, + const struct compat_sigevent __user *u_event); ++long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, ++ struct compat_siginfo __user *uinfo); + + static inline int compat_timeval_compare(struct compat_timeval *lhs, + struct compat_timeval *rhs) +Index: linux-2.6-tip/include/linux/compiler-gcc4.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/compiler-gcc4.h ++++ linux-2.6-tip/include/linux/compiler-gcc4.h +@@ -3,8 +3,10 @@ + #endif + + /* GCC 4.1.[01] miscompiles __weak */ +-#if __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ <= 1 +-# error Your version of gcc miscompiles the __weak directive ++#ifdef __KERNEL__ ++# if __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ <= 1 ++# error Your version of gcc miscompiles the __weak directive ++# endif + #endif + + #define __used __attribute__((__used__)) +Index: linux-2.6-tip/include/linux/compiler.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/compiler.h ++++ linux-2.6-tip/include/linux/compiler.h +@@ -68,6 +68,7 @@ struct ftrace_branch_data { + unsigned long miss; + unsigned long hit; + }; ++ unsigned long miss_hit[2]; + }; + }; + +@@ -126,10 +127,7 @@ void ftrace_likely_update(struct ftrace_ + .line = __LINE__, \ + }; \ + ______r = !!(cond); \ +- if (______r) \ +- ______f.hit++; \ +- else \ +- ______f.miss++; \ ++ ______f.miss_hit[______r]++; \ + ______r; \ + })) + #endif /* CONFIG_PROFILE_ALL_BRANCHES */ +Index: linux-2.6-tip/include/linux/cyclades.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/cyclades.h ++++ linux-2.6-tip/include/linux/cyclades.h +@@ -82,9 +82,9 @@ struct cyclades_monitor { + * open) + */ + struct cyclades_idle_stats { +- time_t in_use; /* Time device has been in use (secs) */ +- time_t recv_idle; /* Time since last char received (secs) */ +- time_t xmit_idle; /* Time since last char transmitted (secs) */ ++ __kernel_time_t in_use; /* Time device has been in use (secs) */ ++ __kernel_time_t recv_idle; /* Time since last char received (secs) */ ++ __kernel_time_t xmit_idle; /* Time since last char transmitted (secs) */ + unsigned long recv_bytes; /* Bytes received */ + unsigned long xmit_bytes; /* Bytes transmitted */ + unsigned long overruns; /* Input overruns */ +Index: linux-2.6-tip/include/linux/debugfs.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/debugfs.h ++++ linux-2.6-tip/include/linux/debugfs.h +@@ -71,6 +71,9 @@ struct dentry *debugfs_create_bool(const + struct dentry *debugfs_create_blob(const char *name, mode_t mode, + struct dentry *parent, + struct debugfs_blob_wrapper *blob); ++ ++bool debugfs_initialized(void); ++ + #else + + #include +@@ -183,6 +186,11 @@ static inline struct dentry *debugfs_cre + return ERR_PTR(-ENODEV); + } + ++static inline bool debugfs_initialized(void) ++{ ++ return false; ++} ++ + #endif + + #endif +Index: linux-2.6-tip/include/linux/decompress/bunzip2.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/decompress/bunzip2.h +@@ -0,0 +1,10 @@ ++#ifndef DECOMPRESS_BUNZIP2_H ++#define DECOMPRESS_BUNZIP2_H ++ ++int bunzip2(unsigned char *inbuf, int len, ++ int(*fill)(void*, unsigned int), ++ int(*flush)(void*, unsigned int), ++ unsigned 
char *output, ++ int *pos, ++ void(*error)(char *x)); ++#endif +Index: linux-2.6-tip/include/linux/decompress/generic.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/decompress/generic.h +@@ -0,0 +1,33 @@ ++#ifndef DECOMPRESS_GENERIC_H ++#define DECOMPRESS_GENERIC_H ++ ++/* Minimal chunksize to be read. ++ *Bzip2 prefers at least 4096 ++ *Lzma prefers 0x10000 */ ++#define COMPR_IOBUF_SIZE 4096 ++ ++typedef int (*decompress_fn) (unsigned char *inbuf, int len, ++ int(*fill)(void*, unsigned int), ++ int(*writebb)(void*, unsigned int), ++ unsigned char *output, ++ int *posp, ++ void(*error)(char *x)); ++ ++/* inbuf - input buffer ++ *len - len of pre-read data in inbuf ++ *fill - function to fill inbuf if empty ++ *writebb - function to write out outbug ++ *posp - if non-null, input position (number of bytes read) will be ++ * returned here ++ * ++ *If len != 0, the inbuf is initialized (with as much data), and fill ++ *should not be called ++ *If len = 0, the inbuf is allocated, but empty. Its size is IOBUF_SIZE ++ *fill should be called (repeatedly...) to read data, at most IOBUF_SIZE ++ */ ++ ++/* Utility routine to detect the decompression method */ ++decompress_fn decompress_method(const unsigned char *inbuf, int len, ++ const char **name); ++ ++#endif +Index: linux-2.6-tip/include/linux/decompress/inflate.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/decompress/inflate.h +@@ -0,0 +1,13 @@ ++#ifndef INFLATE_H ++#define INFLATE_H ++ ++/* Other housekeeping constants */ ++#define INBUFSIZ 4096 ++ ++int gunzip(unsigned char *inbuf, int len, ++ int(*fill)(void*, unsigned int), ++ int(*flush)(void*, unsigned int), ++ unsigned char *output, ++ int *pos, ++ void(*error_fn)(char *x)); ++#endif +Index: linux-2.6-tip/include/linux/decompress/mm.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/decompress/mm.h +@@ -0,0 +1,87 @@ ++/* ++ * linux/compr_mm.h ++ * ++ * Memory management for pre-boot and ramdisk uncompressors ++ * ++ * Authors: Alain Knaff ++ * ++ */ ++ ++#ifndef DECOMPR_MM_H ++#define DECOMPR_MM_H ++ ++#ifdef STATIC ++ ++/* Code active when included from pre-boot environment: */ ++ ++/* A trivial malloc implementation, adapted from ++ * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 ++ */ ++static unsigned long malloc_ptr; ++static int malloc_count; ++ ++static void *malloc(int size) ++{ ++ void *p; ++ ++ if (size < 0) ++ error("Malloc error"); ++ if (!malloc_ptr) ++ malloc_ptr = free_mem_ptr; ++ ++ malloc_ptr = (malloc_ptr + 3) & ~3; /* Align */ ++ ++ p = (void *)malloc_ptr; ++ malloc_ptr += size; ++ ++ if (free_mem_end_ptr && malloc_ptr >= free_mem_end_ptr) ++ error("Out of memory"); ++ ++ malloc_count++; ++ return p; ++} ++ ++static void free(void *where) ++{ ++ malloc_count--; ++ if (!malloc_count) ++ malloc_ptr = free_mem_ptr; ++} ++ ++#define large_malloc(a) malloc(a) ++#define large_free(a) free(a) ++ ++#define set_error_fn(x) ++ ++#define INIT ++ ++#else /* STATIC */ ++ ++/* Code active when compiled standalone for use when loading ramdisk: */ ++ ++#include ++#include ++#include ++#include ++ ++/* Use defines rather than static inline in order to avoid spurious ++ * warnings when not needed (indeed large_malloc / large_free are not ++ * needed by inflate */ ++ ++#define malloc(a) kmalloc(a, GFP_KERNEL) ++#define free(a) kfree(a) ++ ++#define large_malloc(a) 
vmalloc(a) ++#define large_free(a) vfree(a) ++ ++static void(*error)(char *m); ++#define set_error_fn(x) error = x; ++ ++#define INIT __init ++#define STATIC ++ ++#include ++ ++#endif /* STATIC */ ++ ++#endif /* DECOMPR_MM_H */ +Index: linux-2.6-tip/include/linux/decompress/unlzma.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/decompress/unlzma.h +@@ -0,0 +1,12 @@ ++#ifndef DECOMPRESS_UNLZMA_H ++#define DECOMPRESS_UNLZMA_H ++ ++int unlzma(unsigned char *, int, ++ int(*fill)(void*, unsigned int), ++ int(*flush)(void*, unsigned int), ++ unsigned char *output, ++ int *posp, ++ void(*error)(char *x) ++ ); ++ ++#endif +Index: linux-2.6-tip/include/linux/dlm_netlink.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/dlm_netlink.h ++++ linux-2.6-tip/include/linux/dlm_netlink.h +@@ -9,6 +9,8 @@ + #ifndef _DLM_NETLINK_H + #define _DLM_NETLINK_H + ++#include ++ + enum { + DLM_STATUS_WAITING = 1, + DLM_STATUS_GRANTED = 2, +@@ -18,16 +20,16 @@ enum { + #define DLM_LOCK_DATA_VERSION 1 + + struct dlm_lock_data { +- uint16_t version; +- uint32_t lockspace_id; ++ __u16 version; ++ __u32 lockspace_id; + int nodeid; + int ownpid; +- uint32_t id; +- uint32_t remid; +- uint64_t xid; +- int8_t status; +- int8_t grmode; +- int8_t rqmode; ++ __u32 id; ++ __u32 remid; ++ __u64 xid; ++ __s8 status; ++ __s8 grmode; ++ __s8 rqmode; + unsigned long timestamp; + int resource_namelen; + char resource_name[DLM_RESNAME_MAXLEN]; +Index: linux-2.6-tip/include/linux/dm-ioctl.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/dm-ioctl.h ++++ linux-2.6-tip/include/linux/dm-ioctl.h +@@ -113,20 +113,20 @@ struct dm_ioctl { + * return -ENOTTY) fill out this field, even if the + * command failed. + */ +- uint32_t version[3]; /* in/out */ +- uint32_t data_size; /* total size of data passed in ++ __u32 version[3]; /* in/out */ ++ __u32 data_size; /* total size of data passed in + * including this struct */ + +- uint32_t data_start; /* offset to start of data ++ __u32 data_start; /* offset to start of data + * relative to start of this struct */ + +- uint32_t target_count; /* in/out */ +- int32_t open_count; /* out */ +- uint32_t flags; /* in/out */ +- uint32_t event_nr; /* in/out */ +- uint32_t padding; ++ __u32 target_count; /* in/out */ ++ __s32 open_count; /* out */ ++ __u32 flags; /* in/out */ ++ __u32 event_nr; /* in/out */ ++ __u32 padding; + +- uint64_t dev; /* in/out */ ++ __u64 dev; /* in/out */ + + char name[DM_NAME_LEN]; /* device name */ + char uuid[DM_UUID_LEN]; /* unique identifier for +@@ -139,9 +139,9 @@ struct dm_ioctl { + * dm_ioctl. + */ + struct dm_target_spec { +- uint64_t sector_start; +- uint64_t length; +- int32_t status; /* used when reading from kernel only */ ++ __u64 sector_start; ++ __u64 length; ++ __s32 status; /* used when reading from kernel only */ + + /* + * Location of the next dm_target_spec. +@@ -153,7 +153,7 @@ struct dm_target_spec { + * (that follows the dm_ioctl struct) to the start of the "next" + * dm_target_spec. + */ +- uint32_t next; ++ __u32 next; + + char target_type[DM_MAX_TYPE_NAME]; + +@@ -168,17 +168,17 @@ struct dm_target_spec { + * Used to retrieve the target dependencies. 
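The decompress/*.h headers added above all share the calling convention spelled out in generic.h: pick a decompressor with decompress_method() and feed it either a fully pre-read buffer or fill/flush callbacks. A kernel-context sketch of a caller, using only the prototypes introduced by this patch (the unpack_image() wrapper, the write_out() sink and the decomp_error() handler are made up for illustration):

#include <linux/decompress/generic.h>

/* Hypothetical sink: consume 'len' decompressed bytes from 'buf'. */
static int write_out(void *buf, unsigned int len)
{
	/* ... copy the data wherever it needs to go ... */
	return len;
}

static void decomp_error(char *msg)
{
	/* report and bail out; the decompressors call this on failure */
}

/* 'image' holds a complete compressed blob of 'image_len' bytes. */
static int unpack_image(unsigned char *image, int image_len,
			unsigned char *out)
{
	const char *name;
	int pos = 0;
	decompress_fn decomp = decompress_method(image, image_len, &name);

	if (!decomp)
		return -1;	/* unknown or unsupported format */

	/* len != 0, so no fill callback is needed (see generic.h). */
	return decomp(image, image_len, NULL, write_out, out, &pos,
		      decomp_error);
}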
+ */ + struct dm_target_deps { +- uint32_t count; /* Array size */ +- uint32_t padding; /* unused */ +- uint64_t dev[0]; /* out */ ++ __u32 count; /* Array size */ ++ __u32 padding; /* unused */ ++ __u64 dev[0]; /* out */ + }; + + /* + * Used to get a list of all dm devices. + */ + struct dm_name_list { +- uint64_t dev; +- uint32_t next; /* offset to the next record from ++ __u64 dev; ++ __u32 next; /* offset to the next record from + the _start_ of this */ + char name[0]; + }; +@@ -187,8 +187,8 @@ struct dm_name_list { + * Used to retrieve the target versions + */ + struct dm_target_versions { +- uint32_t next; +- uint32_t version[3]; ++ __u32 next; ++ __u32 version[3]; + + char name[0]; + }; +@@ -197,7 +197,7 @@ struct dm_target_versions { + * Used to pass message to a target + */ + struct dm_target_msg { +- uint64_t sector; /* Device sector */ ++ __u64 sector; /* Device sector */ + + char message[0]; + }; +Index: linux-2.6-tip/include/linux/dma-debug.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/dma-debug.h +@@ -0,0 +1,174 @@ ++/* ++ * Copyright (C) 2008 Advanced Micro Devices, Inc. ++ * ++ * Author: Joerg Roedel ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published ++ * by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef __DMA_DEBUG_H ++#define __DMA_DEBUG_H ++ ++#include ++ ++struct device; ++struct scatterlist; ++struct bus_type; ++ ++#ifdef CONFIG_DMA_API_DEBUG ++ ++extern void dma_debug_add_bus(struct bus_type *bus); ++ ++extern void dma_debug_init(u32 num_entries); ++ ++extern void debug_dma_map_page(struct device *dev, struct page *page, ++ size_t offset, size_t size, ++ int direction, dma_addr_t dma_addr, ++ bool map_single); ++ ++extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, ++ size_t size, int direction, bool map_single); ++ ++extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, ++ int nents, int mapped_ents, int direction); ++ ++extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, ++ int nelems, int dir); ++ ++extern void debug_dma_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t dma_addr, void *virt); ++ ++extern void debug_dma_free_coherent(struct device *dev, size_t size, ++ void *virt, dma_addr_t addr); ++ ++extern void debug_dma_sync_single_for_cpu(struct device *dev, ++ dma_addr_t dma_handle, size_t size, ++ int direction); ++ ++extern void debug_dma_sync_single_for_device(struct device *dev, ++ dma_addr_t dma_handle, ++ size_t size, int direction); ++ ++extern void debug_dma_sync_single_range_for_cpu(struct device *dev, ++ dma_addr_t dma_handle, ++ unsigned long offset, ++ size_t size, ++ int direction); ++ ++extern void debug_dma_sync_single_range_for_device(struct device *dev, ++ dma_addr_t dma_handle, ++ unsigned long offset, ++ size_t size, int direction); ++ ++extern void debug_dma_sync_sg_for_cpu(struct device *dev, ++ struct scatterlist 
*sg, ++ int nelems, int direction); ++ ++extern void debug_dma_sync_sg_for_device(struct device *dev, ++ struct scatterlist *sg, ++ int nelems, int direction); ++ ++extern void debug_dma_dump_mappings(struct device *dev); ++ ++#else /* CONFIG_DMA_API_DEBUG */ ++ ++static inline void dma_debug_add_bus(struct bus_type *bus) ++{ ++} ++ ++static inline void dma_debug_init(u32 num_entries) ++{ ++} ++ ++static inline void debug_dma_map_page(struct device *dev, struct page *page, ++ size_t offset, size_t size, ++ int direction, dma_addr_t dma_addr, ++ bool map_single) ++{ ++} ++ ++static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, ++ size_t size, int direction, ++ bool map_single) ++{ ++} ++ ++static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, ++ int nents, int mapped_ents, int direction) ++{ ++} ++ ++static inline void debug_dma_unmap_sg(struct device *dev, ++ struct scatterlist *sglist, ++ int nelems, int dir) ++{ ++} ++ ++static inline void debug_dma_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t dma_addr, void *virt) ++{ ++} ++ ++static inline void debug_dma_free_coherent(struct device *dev, size_t size, ++ void *virt, dma_addr_t addr) ++{ ++} ++ ++static inline void debug_dma_sync_single_for_cpu(struct device *dev, ++ dma_addr_t dma_handle, ++ size_t size, int direction) ++{ ++} ++ ++static inline void debug_dma_sync_single_for_device(struct device *dev, ++ dma_addr_t dma_handle, ++ size_t size, int direction) ++{ ++} ++ ++static inline void debug_dma_sync_single_range_for_cpu(struct device *dev, ++ dma_addr_t dma_handle, ++ unsigned long offset, ++ size_t size, ++ int direction) ++{ ++} ++ ++static inline void debug_dma_sync_single_range_for_device(struct device *dev, ++ dma_addr_t dma_handle, ++ unsigned long offset, ++ size_t size, ++ int direction) ++{ ++} ++ ++static inline void debug_dma_sync_sg_for_cpu(struct device *dev, ++ struct scatterlist *sg, ++ int nelems, int direction) ++{ ++} ++ ++static inline void debug_dma_sync_sg_for_device(struct device *dev, ++ struct scatterlist *sg, ++ int nelems, int direction) ++{ ++} ++ ++static inline void debug_dma_dump_mappings(struct device *dev) ++{ ++} ++ ++#endif /* CONFIG_DMA_API_DEBUG */ ++ ++#endif /* __DMA_DEBUG_H */ +Index: linux-2.6-tip/include/linux/dma-mapping.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/dma-mapping.h ++++ linux-2.6-tip/include/linux/dma-mapping.h +@@ -3,6 +3,8 @@ + + #include + #include ++#include ++#include + + /* These definitions mirror those in pci.h, so they can be used + * interchangeably with their PCI_ counterparts */ +@@ -13,6 +15,52 @@ enum dma_data_direction { + DMA_NONE = 3, + }; + ++struct dma_map_ops { ++ void* (*alloc_coherent)(struct device *dev, size_t size, ++ dma_addr_t *dma_handle, gfp_t gfp); ++ void (*free_coherent)(struct device *dev, size_t size, ++ void *vaddr, dma_addr_t dma_handle); ++ dma_addr_t (*map_page)(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ int (*map_sg)(struct device *dev, struct scatterlist *sg, ++ int nents, enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ void (*unmap_sg)(struct device *dev, ++ struct scatterlist *sg, int nents, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ void 
(*sync_single_for_cpu)(struct device *dev, ++ dma_addr_t dma_handle, size_t size, ++ enum dma_data_direction dir); ++ void (*sync_single_for_device)(struct device *dev, ++ dma_addr_t dma_handle, size_t size, ++ enum dma_data_direction dir); ++ void (*sync_single_range_for_cpu)(struct device *dev, ++ dma_addr_t dma_handle, ++ unsigned long offset, ++ size_t size, ++ enum dma_data_direction dir); ++ void (*sync_single_range_for_device)(struct device *dev, ++ dma_addr_t dma_handle, ++ unsigned long offset, ++ size_t size, ++ enum dma_data_direction dir); ++ void (*sync_sg_for_cpu)(struct device *dev, ++ struct scatterlist *sg, int nents, ++ enum dma_data_direction dir); ++ void (*sync_sg_for_device)(struct device *dev, ++ struct scatterlist *sg, int nents, ++ enum dma_data_direction dir); ++ int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); ++ int (*dma_supported)(struct device *dev, u64 mask); ++ int is_phys; ++}; ++ + #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) + + /* +Index: linux-2.6-tip/include/linux/dmar.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/dmar.h ++++ linux-2.6-tip/include/linux/dmar.h +@@ -24,10 +24,10 @@ + #include + #include + #include ++#include + +-#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP) + struct intel_iommu; +- ++#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP) + struct dmar_drhd_unit { + struct list_head list; /* list of drhd units */ + struct acpi_dmar_header *hdr; /* ACPI header */ +@@ -49,7 +49,7 @@ extern int dmar_dev_scope_init(void); + + /* Intel IOMMU detection */ + extern void detect_intel_iommu(void); +- ++extern int enable_drhd_fault_handling(void); + + extern int parse_ioapics_under_ir(void); + extern int alloc_iommu(struct dmar_drhd_unit *); +@@ -63,12 +63,12 @@ static inline int dmar_table_init(void) + { + return -ENODEV; + } ++static inline int enable_drhd_fault_handling(void) ++{ ++ return -1; ++} + #endif /* !CONFIG_DMAR && !CONFIG_INTR_REMAP */ + +-#ifdef CONFIG_INTR_REMAP +-extern int intr_remapping_enabled; +-extern int enable_intr_remapping(int); +- + struct irte { + union { + struct { +@@ -97,6 +97,10 @@ struct irte { + __u64 high; + }; + }; ++#ifdef CONFIG_INTR_REMAP ++extern int intr_remapping_enabled; ++extern int enable_intr_remapping(int); ++ + extern int get_irte(int irq, struct irte *entry); + extern int modify_irte(int irq, struct irte *irte_modified); + extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count); +@@ -111,14 +115,40 @@ extern int irq_remapped(int irq); + extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); + extern struct intel_iommu *map_ioapic_to_ir(int apic); + #else ++static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) ++{ ++ return -1; ++} ++static inline int modify_irte(int irq, struct irte *irte_modified) ++{ ++ return -1; ++} ++static inline int free_irte(int irq) ++{ ++ return -1; ++} ++static inline int map_irq_to_irte_handle(int irq, u16 *sub_handle) ++{ ++ return -1; ++} ++static inline int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, ++ u16 sub_handle) ++{ ++ return -1; ++} ++static inline struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) ++{ ++ return NULL; ++} ++static inline struct intel_iommu *map_ioapic_to_ir(int apic) ++{ ++ return NULL; ++} + #define irq_remapped(irq) (0) + #define enable_intr_remapping(mode) (-1) + #define intr_remapping_enabled (0) + #endif + +-#ifdef CONFIG_DMAR +-extern const char *dmar_get_fault_reason(u8 
fault_reason); +- + /* Can't use the common MSI interrupt functions + * since DMAR is not a pci device + */ +@@ -127,8 +157,10 @@ extern void dmar_msi_mask(unsigned int i + extern void dmar_msi_read(int irq, struct msi_msg *msg); + extern void dmar_msi_write(int irq, struct msi_msg *msg); + extern int dmar_set_interrupt(struct intel_iommu *iommu); ++extern irqreturn_t dmar_fault(int irq, void *dev_id); + extern int arch_setup_dmar_msi(unsigned int irq); + ++#ifdef CONFIG_DMAR + extern int iommu_detected, no_iommu; + extern struct list_head dmar_rmrr_units; + struct dmar_rmrr_unit { +Index: linux-2.6-tip/include/linux/dvb/audio.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/dvb/audio.h ++++ linux-2.6-tip/include/linux/dvb/audio.h +@@ -76,7 +76,7 @@ struct audio_karaoke{ /* if Vocal1 or V + } audio_karaoke_t; /* into left and right */ + + +-typedef uint16_t audio_attributes_t; ++typedef __u16 audio_attributes_t; + /* bits: descr. */ + /* 15-13 audio coding mode (0=ac3, 2=mpeg1, 3=mpeg2ext, 4=LPCM, 6=DTS, */ + /* 12 multichannel extension */ +Index: linux-2.6-tip/include/linux/dvb/video.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/dvb/video.h ++++ linux-2.6-tip/include/linux/dvb/video.h +@@ -132,12 +132,12 @@ struct video_command { + #define VIDEO_VSYNC_FIELD_PROGRESSIVE (3) + + struct video_event { +- int32_t type; ++ __s32 type; + #define VIDEO_EVENT_SIZE_CHANGED 1 + #define VIDEO_EVENT_FRAME_RATE_CHANGED 2 + #define VIDEO_EVENT_DECODER_STOPPED 3 + #define VIDEO_EVENT_VSYNC 4 +- time_t timestamp; ++ __kernel_time_t timestamp; + union { + video_size_t size; + unsigned int frame_rate; /* in frames per 1000sec */ +@@ -157,25 +157,25 @@ struct video_status { + + struct video_still_picture { + char __user *iFrame; /* pointer to a single iframe in memory */ +- int32_t size; ++ __s32 size; + }; + + + typedef + struct video_highlight { + int active; /* 1=show highlight, 0=hide highlight */ +- uint8_t contrast1; /* 7- 4 Pattern pixel contrast */ ++ __u8 contrast1; /* 7- 4 Pattern pixel contrast */ + /* 3- 0 Background pixel contrast */ +- uint8_t contrast2; /* 7- 4 Emphasis pixel-2 contrast */ ++ __u8 contrast2; /* 7- 4 Emphasis pixel-2 contrast */ + /* 3- 0 Emphasis pixel-1 contrast */ +- uint8_t color1; /* 7- 4 Pattern pixel color */ ++ __u8 color1; /* 7- 4 Pattern pixel color */ + /* 3- 0 Background pixel color */ +- uint8_t color2; /* 7- 4 Emphasis pixel-2 color */ ++ __u8 color2; /* 7- 4 Emphasis pixel-2 color */ + /* 3- 0 Emphasis pixel-1 color */ +- uint32_t ypos; /* 23-22 auto action mode */ ++ __u32 ypos; /* 23-22 auto action mode */ + /* 21-12 start y */ + /* 9- 0 end y */ +- uint32_t xpos; /* 23-22 button color number */ ++ __u32 xpos; /* 23-22 button color number */ + /* 21-12 start x */ + /* 9- 0 end x */ + } video_highlight_t; +@@ -189,17 +189,17 @@ typedef struct video_spu { + + typedef struct video_spu_palette { /* SPU Palette information */ + int length; +- uint8_t __user *palette; ++ __u8 __user *palette; + } video_spu_palette_t; + + + typedef struct video_navi_pack { + int length; /* 0 ... 1024 */ +- uint8_t data[1024]; ++ __u8 data[1024]; + } video_navi_pack_t; + + +-typedef uint16_t video_attributes_t; ++typedef __u16 video_attributes_t; + /* bits: descr. 
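struct video_event above now uses __s32 and __kernel_time_t but is consumed exactly as before. A hedged userspace sketch (the /dev/dvb/adapter0/video0 node is the conventional DVB device path and is an assumption here, as is having a decoder driver that actually raises these events):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dvb/video.h>

int main(void)
{
	struct video_event ev;
	int fd = open("/dev/dvb/adapter0/video0", O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open video0");
		return 1;
	}
	/* Fetch one pending decoder event, if any. */
	if (ioctl(fd, VIDEO_GET_EVENT, &ev) == 0) {
		if (ev.type == VIDEO_EVENT_SIZE_CHANGED)
			printf("new size: %dx%d\n",
			       (int)ev.u.size.w, (int)ev.u.size.h);
		else
			printf("event type %d\n", (int)ev.type);
	} else {
		perror("VIDEO_GET_EVENT");
	}
	close(fd);
	return 0;
}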
*/ + /* 15-14 Video compression mode (0=MPEG-1, 1=MPEG-2) */ + /* 13-12 TV system (0=525/60, 1=625/50) */ +Index: linux-2.6-tip/include/linux/elfcore.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/elfcore.h ++++ linux-2.6-tip/include/linux/elfcore.h +@@ -111,6 +111,15 @@ static inline void elf_core_copy_regs(el + #endif + } + ++static inline void elf_core_copy_kernel_regs(elf_gregset_t *elfregs, struct pt_regs *regs) ++{ ++#ifdef ELF_CORE_COPY_KERNEL_REGS ++ ELF_CORE_COPY_KERNEL_REGS((*elfregs), regs); ++#else ++ elf_core_copy_regs(elfregs, regs); ++#endif ++} ++ + static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) + { + #ifdef ELF_CORE_COPY_TASK_REGS +Index: linux-2.6-tip/include/linux/fdtable.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/fdtable.h ++++ linux-2.6-tip/include/linux/fdtable.h +@@ -5,12 +5,14 @@ + #ifndef __LINUX_FDTABLE_H + #define __LINUX_FDTABLE_H + +-#include + #include + #include + #include + #include + #include ++#include ++ ++#include + + /* + * The default fd array needs to be at least BITS_PER_LONG, +Index: linux-2.6-tip/include/linux/fs.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/fs.h ++++ linux-2.6-tip/include/linux/fs.h +@@ -671,7 +671,7 @@ struct inode { + umode_t i_mode; + spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ + struct mutex i_mutex; +- struct rw_semaphore i_alloc_sem; ++ struct compat_rw_semaphore i_alloc_sem; + const struct inode_operations *i_op; + const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ + struct super_block *i_sb; +@@ -1081,13 +1081,25 @@ extern int lock_may_write(struct inode * + #define posix_lock_file_wait(a, b) ({ -ENOLCK; }) + #define posix_unblock_lock(a, b) (-ENOENT) + #define vfs_test_lock(a, b) ({ 0; }) +-#define vfs_lock_file(a, b, c, d) (-ENOLCK) ++static inline int ++vfs_lock_file(struct file *filp, unsigned int cmd, ++ struct file_lock *fl, struct file_lock *conf) ++{ ++ return -ENOLCK; ++} + #define vfs_cancel_lock(a, b) ({ 0; }) + #define flock_lock_file_wait(a, b) ({ -ENOLCK; }) + #define __break_lease(a, b) ({ 0; }) +-#define lease_get_mtime(a, b) ({ }) ++static inline void lease_get_mtime(struct inode *inode, struct timespec *time) ++{ ++ *time = (struct timespec) { 0, }; ++} + #define generic_setlease(a, b, c) ({ -EINVAL; }) +-#define vfs_setlease(a, b, c) ({ -EINVAL; }) ++static inline int ++vfs_setlease(struct file *filp, long arg, struct file_lock **lease) ++{ ++ return -EINVAL; ++} + #define lease_modify(a, b) ({ -EINVAL; }) + #define lock_may_read(a, b, c) ({ 1; }) + #define lock_may_write(a, b, c) ({ 1; }) +@@ -1611,9 +1623,9 @@ int __put_super_and_need_restart(struct + + /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ + #define fops_get(fops) \ +- (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) ++ (((fops != NULL) && try_module_get((fops)->owner) ? 
(fops) : NULL)) + #define fops_put(fops) \ +- do { if (fops) module_put((fops)->owner); } while(0) ++ do { if (fops != NULL) module_put((fops)->owner); } while(0) + + extern int register_filesystem(struct file_system_type *); + extern int unregister_filesystem(struct file_system_type *); +@@ -1689,7 +1701,7 @@ static inline int break_lease(struct ino + #else /* !CONFIG_FILE_LOCKING */ + #define locks_mandatory_locked(a) ({ 0; }) + #define locks_mandatory_area(a, b, c, d, e) ({ 0; }) +-#define __mandatory_lock(a) ({ 0; }) ++static inline int __mandatory_lock(struct inode *ino) { return 0; } + #define mandatory_lock(a) ({ 0; }) + #define locks_verify_locked(a) ({ 0; }) + #define locks_verify_truncate(a, b, c) ({ 0; }) +@@ -2171,19 +2183,7 @@ ssize_t simple_transaction_read(struct f + size_t size, loff_t *pos); + int simple_transaction_release(struct inode *inode, struct file *file); + +-static inline void simple_transaction_set(struct file *file, size_t n) +-{ +- struct simple_transaction_argresp *ar = file->private_data; +- +- BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); +- +- /* +- * The barrier ensures that ar->size will really remain zero until +- * ar->data is ready for reading. +- */ +- smp_mb(); +- ar->size = n; +-} ++void simple_transaction_set(struct file *file, size_t n); + + /* + * simple attribute files +@@ -2230,27 +2230,6 @@ ssize_t simple_attr_read(struct file *fi + ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); + +- +-#ifdef CONFIG_SECURITY +-static inline char *alloc_secdata(void) +-{ +- return (char *)get_zeroed_page(GFP_KERNEL); +-} +- +-static inline void free_secdata(void *secdata) +-{ +- free_page((unsigned long)secdata); +-} +-#else +-static inline char *alloc_secdata(void) +-{ +- return (char *)1; +-} +- +-static inline void free_secdata(void *secdata) +-{ } +-#endif /* CONFIG_SECURITY */ +- + struct ctl_table; + int proc_nr_files(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +Index: linux-2.6-tip/include/linux/ftrace.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/ftrace.h ++++ linux-2.6-tip/include/linux/ftrace.h +@@ -1,15 +1,18 @@ + #ifndef _LINUX_FTRACE_H + #define _LINUX_FTRACE_H + +-#include +-#include +-#include +-#include +-#include +-#include ++#include + #include ++#include + #include ++#include ++#include + #include ++#include ++#include ++#include ++ ++#include + + #ifdef CONFIG_FUNCTION_TRACER + +@@ -95,9 +98,41 @@ stack_trace_sysctl(struct ctl_table *tab + loff_t *ppos); + #endif + ++struct ftrace_func_command { ++ struct list_head list; ++ char *name; ++ int (*func)(char *func, char *cmd, ++ char *params, int enable); ++}; ++ + #ifdef CONFIG_DYNAMIC_FTRACE +-/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ +-#include ++ ++int ftrace_arch_code_modify_prepare(void); ++int ftrace_arch_code_modify_post_process(void); ++ ++struct seq_file; ++ ++struct ftrace_probe_ops { ++ void (*func)(unsigned long ip, ++ unsigned long parent_ip, ++ void **data); ++ int (*callback)(unsigned long ip, void **data); ++ void (*free)(void **data); ++ int (*print)(struct seq_file *m, ++ unsigned long ip, ++ struct ftrace_probe_ops *ops, ++ void *data); ++}; ++ ++extern int ++register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ++ void *data); ++extern void ++unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ++ void *data); ++extern void 
++unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); ++extern void unregister_ftrace_function_probe_all(char *glob); + + enum { + FTRACE_FL_FREE = (1 << 0), +@@ -110,15 +145,23 @@ enum { + }; + + struct dyn_ftrace { +- struct list_head list; +- unsigned long ip; /* address of mcount call-site */ +- unsigned long flags; +- struct dyn_arch_ftrace arch; ++ union { ++ unsigned long ip; /* address of mcount call-site */ ++ struct dyn_ftrace *freelist; ++ }; ++ union { ++ unsigned long flags; ++ struct dyn_ftrace *newlist; ++ }; ++ struct dyn_arch_ftrace arch; + }; + + int ftrace_force_update(void); + void ftrace_set_filter(unsigned char *buf, int len, int reset); + ++int register_ftrace_command(struct ftrace_func_command *cmd); ++int unregister_ftrace_command(struct ftrace_func_command *cmd); ++ + /* defined in arch */ + extern int ftrace_ip_converted(unsigned long ip); + extern int ftrace_dyn_arch_init(void *data); +@@ -126,6 +169,10 @@ extern int ftrace_update_ftrace_func(ftr + extern void ftrace_caller(void); + extern void ftrace_call(void); + extern void mcount_call(void); ++ ++#ifndef FTRACE_ADDR ++#define FTRACE_ADDR ((unsigned long)ftrace_caller) ++#endif + #ifdef CONFIG_FUNCTION_GRAPH_TRACER + extern void ftrace_graph_caller(void); + extern int ftrace_enable_ftrace_graph_caller(void); +@@ -136,7 +183,7 @@ static inline int ftrace_disable_ftrace_ + #endif + + /** +- * ftrace_make_nop - convert code into top ++ * ftrace_make_nop - convert code into nop + * @mod: module structure if called by module load initialization + * @rec: the mcount call site record + * @addr: the address that the call site should be calling +@@ -181,7 +228,6 @@ extern int ftrace_make_nop(struct module + */ + extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); + +- + /* May be defined in arch */ + extern int ftrace_arch_read_dyn_info(char *buf, int size); + +@@ -198,6 +244,14 @@ extern void ftrace_enable_daemon(void); + # define ftrace_disable_daemon() do { } while (0) + # define ftrace_enable_daemon() do { } while (0) + static inline void ftrace_release(void *start, unsigned long size) { } ++static inline int register_ftrace_command(struct ftrace_func_command *cmd) ++{ ++ return -EINVAL; ++} ++static inline int unregister_ftrace_command(char *cmd_name) ++{ ++ return -EINVAL; ++} + #endif /* CONFIG_DYNAMIC_FTRACE */ + + /* totally disable ftrace - can not re-enable after this */ +@@ -233,24 +287,25 @@ static inline void __ftrace_enabled_rest + #endif + } + +-#ifdef CONFIG_FRAME_POINTER +-/* TODO: need to fix this for ARM */ +-# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +-# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +-# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +-# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +-# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +-# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +-# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) +-#else +-# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +-# define CALLER_ADDR1 0UL +-# define CALLER_ADDR2 0UL +-# define CALLER_ADDR3 0UL +-# define CALLER_ADDR4 0UL +-# define CALLER_ADDR5 0UL +-# define CALLER_ADDR6 0UL +-#endif ++#ifndef HAVE_ARCH_CALLER_ADDR ++# ifdef CONFIG_FRAME_POINTER ++# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) ++# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) ++# define 
CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) ++# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) ++# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) ++# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) ++# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) ++# else ++# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) ++# define CALLER_ADDR1 0UL ++# define CALLER_ADDR2 0UL ++# define CALLER_ADDR3 0UL ++# define CALLER_ADDR4 0UL ++# define CALLER_ADDR5 0UL ++# define CALLER_ADDR6 0UL ++# endif ++#endif /* ifndef HAVE_ARCH_CALLER_ADDR */ + + #ifdef CONFIG_IRQSOFF_TRACER + extern void time_hardirqs_on(unsigned long a0, unsigned long a1); +@@ -268,54 +323,6 @@ static inline void __ftrace_enabled_rest + # define trace_preempt_off(a0, a1) do { } while (0) + #endif + +-#ifdef CONFIG_TRACING +-extern int ftrace_dump_on_oops; +- +-extern void tracing_start(void); +-extern void tracing_stop(void); +-extern void ftrace_off_permanent(void); +- +-extern void +-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +- +-/** +- * ftrace_printk - printf formatting in the ftrace buffer +- * @fmt: the printf format for printing +- * +- * Note: __ftrace_printk is an internal function for ftrace_printk and +- * the @ip is passed in via the ftrace_printk macro. +- * +- * This function allows a kernel developer to debug fast path sections +- * that printk is not appropriate for. By scattering in various +- * printk like tracing in the code, a developer can quickly see +- * where problems are occurring. +- * +- * This is intended as a debugging tool for the developer only. +- * Please refrain from leaving ftrace_printks scattered around in +- * your code. +- */ +-# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt) +-extern int +-__ftrace_printk(unsigned long ip, const char *fmt, ...) +- __attribute__ ((format (printf, 2, 3))); +-extern void ftrace_dump(void); +-#else +-static inline void +-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +-static inline int +-ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +- +-static inline void tracing_start(void) { } +-static inline void tracing_stop(void) { } +-static inline void ftrace_off_permanent(void) { } +-static inline int +-ftrace_printk(const char *fmt, ...) 
+-{ +- return 0; +-} +-static inline void ftrace_dump(void) { } +-#endif +- + #ifdef CONFIG_FTRACE_MCOUNT_RECORD + extern void ftrace_init(void); + extern void ftrace_init_module(struct module *mod, +@@ -327,36 +334,6 @@ ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { } + #endif + +-enum { +- POWER_NONE = 0, +- POWER_CSTATE = 1, +- POWER_PSTATE = 2, +-}; +- +-struct power_trace { +-#ifdef CONFIG_POWER_TRACER +- ktime_t stamp; +- ktime_t end; +- int type; +- int state; +-#endif +-}; +- +-#ifdef CONFIG_POWER_TRACER +-extern void trace_power_start(struct power_trace *it, unsigned int type, +- unsigned int state); +-extern void trace_power_mark(struct power_trace *it, unsigned int type, +- unsigned int state); +-extern void trace_power_end(struct power_trace *it); +-#else +-static inline void trace_power_start(struct power_trace *it, unsigned int type, +- unsigned int state) { } +-static inline void trace_power_mark(struct power_trace *it, unsigned int type, +- unsigned int state) { } +-static inline void trace_power_end(struct power_trace *it) { } +-#endif +- +- + /* + * Structure that defines an entry function trace. + */ +@@ -380,6 +357,28 @@ struct ftrace_graph_ret { + #ifdef CONFIG_FUNCTION_GRAPH_TRACER + + /* ++ * Stack of return addresses for functions ++ * of a thread. ++ * Used in struct thread_info ++ */ ++struct ftrace_ret_stack { ++ unsigned long ret; ++ unsigned long func; ++ unsigned long long calltime; ++ unsigned long long subtime; ++}; ++ ++/* ++ * Primary handler of a function return. ++ * It relays on ftrace_return_to_handler. ++ * Defined in entry_32/64.S ++ */ ++extern void return_to_handler(void); ++ ++extern int ++ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); ++ ++/* + * Sometimes we don't want to trace a function with the function + * graph tracer but we want them to keep traced by the usual function + * tracer if the function graph tracer is not configured. +@@ -490,6 +489,50 @@ static inline int test_tsk_trace_graph(s + return tsk->trace & TSK_TRACE_FL_GRAPH; + } + ++extern int ftrace_dump_on_oops; ++ + #endif /* CONFIG_TRACING */ + ++ ++#ifdef CONFIG_HW_BRANCH_TRACER ++ ++void trace_hw_branch(u64 from, u64 to); ++void trace_hw_branch_oops(void); ++ ++#else /* CONFIG_HW_BRANCH_TRACER */ ++ ++static inline void trace_hw_branch(u64 from, u64 to) {} ++static inline void trace_hw_branch_oops(void) {} ++ ++#endif /* CONFIG_HW_BRANCH_TRACER */ ++ ++/* ++ * A syscall entry in the ftrace syscalls array. 
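struct syscall_metadata introduced below is just a static description of one syscall's signature, with args[i] naming the parameter whose type is types[i]. Illustratively, a record for sys_close(unsigned int fd) could look like the following (a hypothetical hand-written entry; in practice the ftrace syscalls support builds these tables per architecture rather than by hand):

#include <linux/ftrace.h>

/* What a metadata record for sys_close(unsigned int fd) could look like. */
static const char *close_types[] = { "unsigned int" };
static const char *close_args[]  = { "fd" };

static struct syscall_metadata close_meta = {
	.name    = "sys_close",
	.nb_args = 1,
	.types   = close_types,
	.args    = close_args,
};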
++ * ++ * @name: name of the syscall ++ * @nb_args: number of parameters it takes ++ * @types: list of types as strings ++ * @args: list of args as strings (args[i] matches types[i]) ++ */ ++struct syscall_metadata { ++ const char *name; ++ int nb_args; ++ const char **types; ++ const char **args; ++}; ++ ++#ifdef CONFIG_FTRACE_SYSCALLS ++extern void arch_init_ftrace_syscalls(void); ++extern struct syscall_metadata *syscall_nr_to_meta(int nr); ++extern void start_ftrace_syscalls(void); ++extern void stop_ftrace_syscalls(void); ++extern void ftrace_syscall_enter(struct pt_regs *regs); ++extern void ftrace_syscall_exit(struct pt_regs *regs); ++#else ++static inline void start_ftrace_syscalls(void) { } ++static inline void stop_ftrace_syscalls(void) { } ++static inline void ftrace_syscall_enter(struct pt_regs *regs) { } ++static inline void ftrace_syscall_exit(struct pt_regs *regs) { } ++#endif ++ + #endif /* _LINUX_FTRACE_H */ +Index: linux-2.6-tip/include/linux/ftrace_irq.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/ftrace_irq.h ++++ linux-2.6-tip/include/linux/ftrace_irq.h +@@ -2,7 +2,7 @@ + #define _LINUX_FTRACE_IRQ_H + + +-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) ++#ifdef CONFIG_FTRACE_NMI_ENTER + extern void ftrace_nmi_enter(void); + extern void ftrace_nmi_exit(void); + #else +Index: linux-2.6-tip/include/linux/futex.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/futex.h ++++ linux-2.6-tip/include/linux/futex.h +@@ -23,6 +23,8 @@ union ktime; + #define FUTEX_TRYLOCK_PI 8 + #define FUTEX_WAIT_BITSET 9 + #define FUTEX_WAKE_BITSET 10 ++#define FUTEX_WAIT_REQUEUE_PI 11 ++#define FUTEX_CMP_REQUEUE_PI 12 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -38,6 +40,10 @@ union ktime; + #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) + #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) + #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) ++#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ ++ FUTEX_PRIVATE_FLAG) ++#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ ++ FUTEX_PRIVATE_FLAG) + + /* + * Support for robust futexes: the kernel cleans up held futexes at +Index: linux-2.6-tip/include/linux/gfp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/gfp.h ++++ linux-2.6-tip/include/linux/gfp.h +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + + struct vm_area_struct; + +@@ -51,7 +52,13 @@ struct vm_area_struct; + #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ + #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ + +-#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ ++#ifdef CONFIG_KMEMCHECK ++#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */ ++#else ++#define __GFP_NOTRACK ((__force gfp_t)0) ++#endif ++ ++#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ + #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) + + /* This equals 0, but use constants in case they ever change */ +Index: linux-2.6-tip/include/linux/hardirq.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/hardirq.h ++++ linux-2.6-tip/include/linux/hardirq.h +@@ -15,71 +15,74 @@ + * - bits 0-7 are the preemption 
count (max preemption depth: 256) + * - bits 8-15 are the softirq count (max # of softirqs: 256) + * +- * The hardirq count can be overridden per architecture, the default is: ++ * The hardirq count can in theory reach the same as NR_IRQS. ++ * In reality, the number of nested IRQS is limited to the stack ++ * size as well. For archs with over 1000 IRQS it is not practical ++ * to expect that they will all nest. We give a max of 10 bits for ++ * hardirq nesting. An arch may choose to give less than 10 bits. ++ * m68k expects it to be 8. + * +- * - bits 16-27 are the hardirq count (max # of hardirqs: 4096) +- * - ( bit 28 is the PREEMPT_ACTIVE flag. ) ++ * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) ++ * - bit 26 is the NMI_MASK ++ * - bit 28 is the PREEMPT_ACTIVE flag + * + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 +- * HARDIRQ_MASK: 0x0fff0000 ++ * HARDIRQ_MASK: 0x03ff0000 ++ * NMI_MASK: 0x04000000 + */ + #define PREEMPT_BITS 8 + #define SOFTIRQ_BITS 8 ++#define NMI_BITS 1 + +-#ifndef HARDIRQ_BITS +-#define HARDIRQ_BITS 12 ++#define MAX_HARDIRQ_BITS 10 + +-#ifndef MAX_HARDIRQS_PER_CPU +-#define MAX_HARDIRQS_PER_CPU NR_IRQS ++#ifndef HARDIRQ_BITS ++# define HARDIRQ_BITS MAX_HARDIRQ_BITS + #endif + +-/* +- * The hardirq mask has to be large enough to have space for potentially +- * all IRQ sources in the system nesting on a single CPU. +- */ +-#if (1 << HARDIRQ_BITS) < MAX_HARDIRQS_PER_CPU +-# error HARDIRQ_BITS is too low! +-#endif ++#if HARDIRQ_BITS > MAX_HARDIRQ_BITS ++#error HARDIRQ_BITS too high! + #endif + + #define PREEMPT_SHIFT 0 + #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) + #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) ++#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + + #define __IRQ_MASK(x) ((1UL << (x))-1) + + #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) + #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) + #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) ++#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) + + #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) + #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) + #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) ++#define NMI_OFFSET (1UL << NMI_SHIFT) + +-#if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS)) ++#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) + #error PREEMPT_ACTIVE is too low! + #endif + + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) + #define softirq_count() (preempt_count() & SOFTIRQ_MASK) +-#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) ++#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ ++ | NMI_MASK)) + + /* + * Are we doing bottom half or hardware interrupt processing? + * Are we in a softirq context? Interrupt context? + */ +-#define in_irq() (hardirq_count()) +-#define in_softirq() (softirq_count()) +-#define in_interrupt() (irq_count()) +- +-#if defined(CONFIG_PREEMPT) +-# define PREEMPT_INATOMIC_BASE kernel_locked() +-# define PREEMPT_CHECK_OFFSET 1 +-#else +-# define PREEMPT_INATOMIC_BASE 0 +-# define PREEMPT_CHECK_OFFSET 0 +-#endif ++#define in_irq() (hardirq_count() || (current->flags & PF_HARDIRQ)) ++#define in_softirq() (softirq_count() || (current->flags & PF_SOFTIRQ)) ++#define in_interrupt() (irq_count()) ++ ++/* ++ * Are we in NMI context? ++ */ ++#define in_nmi() (preempt_count() & NMI_MASK) + + /* + * Are we running in atomic context? 
WARNING: this macro cannot +@@ -88,14 +91,7 @@ + * used in the general case to determine whether sleeping is possible. + * Do not use in_atomic() in driver code. + */ +-#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_INATOMIC_BASE) +- +-/* +- * Check whether we were atomic before we did preempt_disable(): +- * (used by the scheduler, *after* releasing the kernel lock) +- */ +-#define in_atomic_preempt_off() \ +- ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) ++#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) + + #ifdef CONFIG_PREEMPT + # define preemptible() (preempt_count() == 0 && !irqs_disabled()) +@@ -164,20 +160,24 @@ extern void irq_enter(void); + */ + extern void irq_exit(void); + +-#define nmi_enter() \ +- do { \ +- ftrace_nmi_enter(); \ +- lockdep_off(); \ +- rcu_nmi_enter(); \ +- __irq_enter(); \ ++#define nmi_enter() \ ++ do { \ ++ ftrace_nmi_enter(); \ ++ BUG_ON(in_nmi()); \ ++ add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ ++ lockdep_off(); \ ++ rcu_nmi_enter(); \ ++ trace_hardirq_enter(); \ + } while (0) + +-#define nmi_exit() \ +- do { \ +- __irq_exit(); \ +- rcu_nmi_exit(); \ +- lockdep_on(); \ +- ftrace_nmi_exit(); \ ++#define nmi_exit() \ ++ do { \ ++ trace_hardirq_exit(); \ ++ rcu_nmi_exit(); \ ++ lockdep_on(); \ ++ BUG_ON(!in_nmi()); \ ++ sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ ++ ftrace_nmi_exit(); \ + } while (0) + + #endif /* LINUX_HARDIRQ_H */ +Index: linux-2.6-tip/include/linux/if_arcnet.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/if_arcnet.h ++++ linux-2.6-tip/include/linux/if_arcnet.h +@@ -16,6 +16,7 @@ + #ifndef _LINUX_IF_ARCNET_H + #define _LINUX_IF_ARCNET_H + ++#include + #include + + +@@ -57,10 +58,10 @@ + */ + struct arc_rfc1201 + { +- uint8_t proto; /* protocol ID field - varies */ +- uint8_t split_flag; /* for use with split packets */ ++ __u8 proto; /* protocol ID field - varies */ ++ __u8 split_flag; /* for use with split packets */ + __be16 sequence; /* sequence number */ +- uint8_t payload[0]; /* space remaining in packet (504 bytes)*/ ++ __u8 payload[0]; /* space remaining in packet (504 bytes)*/ + }; + #define RFC1201_HDR_SIZE 4 + +@@ -70,8 +71,8 @@ struct arc_rfc1201 + */ + struct arc_rfc1051 + { +- uint8_t proto; /* ARC_P_RFC1051_ARP/RFC1051_IP */ +- uint8_t payload[0]; /* 507 bytes */ ++ __u8 proto; /* ARC_P_RFC1051_ARP/RFC1051_IP */ ++ __u8 payload[0]; /* 507 bytes */ + }; + #define RFC1051_HDR_SIZE 1 + +@@ -82,20 +83,20 @@ struct arc_rfc1051 + */ + struct arc_eth_encap + { +- uint8_t proto; /* Always ARC_P_ETHER */ ++ __u8 proto; /* Always ARC_P_ETHER */ + struct ethhdr eth; /* standard ethernet header (yuck!) 
*/ +- uint8_t payload[0]; /* 493 bytes */ ++ __u8 payload[0]; /* 493 bytes */ + }; + #define ETH_ENCAP_HDR_SIZE 14 + + + struct arc_cap + { +- uint8_t proto; +- uint8_t cookie[sizeof(int)]; /* Actually NOT sent over the network */ ++ __u8 proto; ++ __u8 cookie[sizeof(int)]; /* Actually NOT sent over the network */ + union { +- uint8_t ack; +- uint8_t raw[0]; /* 507 bytes */ ++ __u8 ack; ++ __u8 raw[0]; /* 507 bytes */ + } mes; + }; + +@@ -109,7 +110,7 @@ struct arc_cap + */ + struct arc_hardware + { +- uint8_t source, /* source ARCnet - filled in automagically */ ++ __u8 source, /* source ARCnet - filled in automagically */ + dest, /* destination ARCnet - 0 for broadcast */ + offset[2]; /* offset bytes (some weird semantics) */ + }; +@@ -130,7 +131,7 @@ struct archdr + struct arc_rfc1051 rfc1051; + struct arc_eth_encap eth_encap; + struct arc_cap cap; +- uint8_t raw[0]; /* 508 bytes */ ++ __u8 raw[0]; /* 508 bytes */ + } soft; + }; + +Index: linux-2.6-tip/include/linux/if_pppol2tp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/if_pppol2tp.h ++++ linux-2.6-tip/include/linux/if_pppol2tp.h +@@ -26,7 +26,7 @@ + */ + struct pppol2tp_addr + { +- pid_t pid; /* pid that owns the fd. ++ __kernel_pid_t pid; /* pid that owns the fd. + * 0 => current */ + int fd; /* FD of UDP socket to use */ + +Index: linux-2.6-tip/include/linux/in6.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/in6.h ++++ linux-2.6-tip/include/linux/in6.h +@@ -44,11 +44,11 @@ struct in6_addr + * NOTE: Be aware the IN6ADDR_* constants and in6addr_* externals are defined + * in network byte order, not in host byte order as are the IPv4 equivalents + */ ++#ifdef __KERNEL__ + extern const struct in6_addr in6addr_any; + #define IN6ADDR_ANY_INIT { { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } } } + extern const struct in6_addr in6addr_loopback; + #define IN6ADDR_LOOPBACK_INIT { { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } } +-#ifdef __KERNEL__ + extern const struct in6_addr in6addr_linklocal_allnodes; + #define IN6ADDR_LINKLOCAL_ALLNODES_INIT \ + { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } } +Index: linux-2.6-tip/include/linux/init.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/init.h ++++ linux-2.6-tip/include/linux/init.h +@@ -313,16 +313,20 @@ void __init parse_early_param(void); + #define __initdata_or_module __initdata + #endif /*CONFIG_MODULES*/ + +-/* Functions marked as __devexit may be discarded at kernel link time, depending +- on config options. Newer versions of binutils detect references from +- retained sections to discarded sections and flag an error. Pointers to +- __devexit functions must use __devexit_p(function_name), the wrapper will +- insert either the function_name or NULL, depending on the config options. ++/* ++ * Functions marked as __devexit may be discarded at kernel link time, ++ * depending on config options. Newer versions of binutils detect ++ * references from retained sections to discarded sections and flag an ++ * error. ++ * ++ * Pointers to __devexit functions must use __devexit_p(function_name), ++ * the wrapper will insert either the function_name or NULL, depending on ++ * the config options. 
+ */ + #if defined(MODULE) || defined(CONFIG_HOTPLUG) +-#define __devexit_p(x) x ++# define __devexit_p(x) x + #else +-#define __devexit_p(x) NULL ++# define __devexit_p(x) ((void *)((long)(x) & 0) /* NULL */) + #endif + + #ifdef MODULE +Index: linux-2.6-tip/include/linux/init_task.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/init_task.h ++++ linux-2.6-tip/include/linux/init_task.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + + extern struct files_struct init_files; + extern struct fs_struct init_fs; +@@ -51,7 +52,7 @@ extern struct fs_struct init_fs; + .cputimer = { \ + .cputime = INIT_CPUTIME, \ + .running = 0, \ +- .lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ ++ .lock = RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + }, \ + } + +@@ -120,6 +121,18 @@ extern struct group_info init_groups; + + extern struct cred init_cred; + ++#ifdef CONFIG_PERF_COUNTERS ++# define INIT_PERF_COUNTERS(tsk) \ ++ .perf_counter_ctx.counter_list = \ ++ LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ ++ .perf_counter_ctx.event_list = \ ++ LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ ++ .perf_counter_ctx.lock = \ ++ RAW_SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), ++#else ++# define INIT_PERF_COUNTERS(tsk) ++#endif ++ + /* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. Base=0, limit=0x1fffff (=2MB) +@@ -147,6 +160,7 @@ extern struct cred init_cred; + .nr_cpus_allowed = NR_CPUS, \ + }, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ ++ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ + .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ + .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ + .real_parent = &tsk, \ +@@ -173,8 +187,9 @@ extern struct cred init_cred; + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .fs_excl = ATOMIC_INIT(0), \ +- .pi_lock = __SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ + .timer_slack_ns = 50000, /* 50 usec default slack */ \ ++ .posix_timer_list = NULL, \ ++ .pi_lock = RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ + .pids = { \ + [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ + [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ +@@ -184,6 +199,7 @@ extern struct cred init_cred; + INIT_IDS \ + INIT_TRACE_IRQFLAGS \ + INIT_LOCKDEP \ ++ INIT_PERF_COUNTERS(tsk) \ + } + + +Index: linux-2.6-tip/include/linux/intel-iommu.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/intel-iommu.h ++++ linux-2.6-tip/include/linux/intel-iommu.h +@@ -292,6 +292,8 @@ struct intel_iommu { + spinlock_t register_lock; /* protect register handling */ + int seq_id; /* sequence id of the iommu */ + int agaw; /* agaw of this iommu */ ++ unsigned int irq; ++ unsigned char name[13]; /* Device Name */ + + #ifdef CONFIG_DMAR + unsigned long *domain_ids; /* bitmap of domains */ +@@ -299,8 +301,6 @@ struct intel_iommu { + spinlock_t lock; /* protect context, domain ids */ + struct root_entry *root_entry; /* virtual address */ + +- unsigned int irq; +- unsigned char name[7]; /* Device Name */ + struct iommu_flush flush; + #endif + struct q_inval *qi; /* Queued invalidation info */ +@@ -321,6 +321,7 @@ extern struct dmar_drhd_unit * dmar_find + extern int alloc_iommu(struct dmar_drhd_unit *drhd); + extern void free_iommu(struct intel_iommu *iommu); + extern int dmar_enable_qi(struct intel_iommu *iommu); ++extern void dmar_disable_qi(struct intel_iommu *iommu); + extern void qi_global_iec(struct intel_iommu *iommu); + 
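[Editor's note, not part of the patch: to make the rewrapped linux/init.h comment above concrete, here is a minimal hedged sketch of the intended __devexit_p() usage in a driver. The example_* identifiers and the vendor/device IDs are invented for illustration.]

#include <linux/init.h>
#include <linux/pci.h>

static const struct pci_device_id example_ids[] = {
	{ PCI_DEVICE(0x1234, 0x5678) },	/* made-up vendor/device IDs */
	{ }
};

static int __devinit example_probe(struct pci_dev *pdev,
				   const struct pci_device_id *id)
{
	/* ... enable the device, map resources ... */
	return 0;
}

static void __devexit example_remove(struct pci_dev *pdev)
{
	/* ... undo whatever the probe routine set up ... */
}

static struct pci_driver example_driver = {
	.name     = "example",
	.id_table = example_ids,
	.probe    = example_probe,
	/*
	 * Per the comment above: becomes NULL when __devexit code is
	 * discarded, instead of a dangling pointer into a dropped section.
	 */
	.remove   = __devexit_p(example_remove),
};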
+ extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, +@@ -331,11 +332,4 @@ extern int qi_flush_iotlb(struct intel_i + + extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); + +-extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t); +-extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t); +-extern dma_addr_t intel_map_single(struct device *, phys_addr_t, size_t, int); +-extern void intel_unmap_single(struct device *, dma_addr_t, size_t, int); +-extern int intel_map_sg(struct device *, struct scatterlist *, int, int); +-extern void intel_unmap_sg(struct device *, struct scatterlist *, int, int); +- + #endif +Index: linux-2.6-tip/include/linux/interrupt.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/interrupt.h ++++ linux-2.6-tip/include/linux/interrupt.h +@@ -54,13 +54,26 @@ + #define IRQF_SAMPLE_RANDOM 0x00000040 + #define IRQF_SHARED 0x00000080 + #define IRQF_PROBE_SHARED 0x00000100 +-#define IRQF_TIMER 0x00000200 ++#define __IRQF_TIMER 0x00000200 + #define IRQF_PERCPU 0x00000400 + #define IRQF_NOBALANCING 0x00000800 + #define IRQF_IRQPOLL 0x00001000 ++#define IRQF_NODELAY 0x00002000 ++#define IRQF_TIMER (__IRQF_TIMER | IRQF_NODELAY) + + typedef irqreturn_t (*irq_handler_t)(int, void *); + ++/** ++ * struct irqaction - per interrupt action descriptor ++ * @handler: interrupt handler function ++ * @flags: flags (see IRQF_* above) ++ * @mask: no comment as it is useless and about to be removed ++ * @name: name of the device ++ * @dev_id: cookie to identify the device ++ * @next: pointer to the next irqaction for shared interrupts ++ * @irq: interrupt number ++ * @dir: pointer to the proc/irq/NN/name entry ++ */ + struct irqaction { + irq_handler_t handler; + unsigned long flags; +@@ -69,19 +82,23 @@ struct irqaction { + void *dev_id; + struct irqaction *next; + int irq; +- struct proc_dir_entry *dir; ++ struct proc_dir_entry *dir, *threaded; + }; + + extern irqreturn_t no_action(int cpl, void *dev_id); +-extern int __must_check request_irq(unsigned int, irq_handler_t handler, +- unsigned long, const char *, void *); ++ ++extern int __must_check ++request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, ++ const char *name, void *dev); ++ + extern void free_irq(unsigned int, void *); + + struct device; + +-extern int __must_check devm_request_irq(struct device *dev, unsigned int irq, +- irq_handler_t handler, unsigned long irqflags, +- const char *devname, void *dev_id); ++extern int __must_check ++devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, ++ unsigned long irqflags, const char *devname, void *dev_id); ++ + extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); + + /* +@@ -99,7 +116,7 @@ extern void devm_free_irq(struct device + #ifdef CONFIG_LOCKDEP + # define local_irq_enable_in_hardirq() do { } while (0) + #else +-# define local_irq_enable_in_hardirq() local_irq_enable() ++# define local_irq_enable_in_hardirq() local_irq_enable_nort() + #endif + + extern void disable_irq_nosync(unsigned int irq); +@@ -224,6 +241,7 @@ static inline int disable_irq_wake(unsig + + #ifndef __ARCH_SET_SOFTIRQ_PENDING + #define set_softirq_pending(x) (local_softirq_pending() = (x)) ++// FIXME: PREEMPT_RT: set_bit()? 
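[Editor's note, not part of the patch: as a reference point for the reflowed request_irq()/free_irq() prototypes and the struct irqaction kerneldoc in the linux/interrupt.h hunk above, a minimal hedged sketch of the usual calling pattern. struct example_device and the example_* names are invented for illustration.]

#include <linux/interrupt.h>

struct example_device {
	int irq;
};

static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct example_device *dev = dev_id;

	/*
	 * On a shared line, check the device's own status register and
	 * return IRQ_NONE when it did not raise the interrupt, so the core
	 * can try the other handlers registered on this line.
	 */
	(void)dev;

	/* ... acknowledge and handle the interrupt ... */
	return IRQ_HANDLED;
}

static int example_setup_irq(struct example_device *dev)
{
	/*
	 * dev_id must be unique (and non-NULL for IRQF_SHARED): it is passed
	 * back to the handler and used as the key for free_irq().
	 */
	return request_irq(dev->irq, example_isr, IRQF_SHARED,
			   "example", dev);
}

static void example_release_irq(struct example_device *dev)
{
	free_irq(dev->irq, dev);
}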
+ #define or_softirq_pending(x) (local_softirq_pending() |= (x)) + #endif + +@@ -254,10 +272,17 @@ enum + SCHED_SOFTIRQ, + HRTIMER_SOFTIRQ, + RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ ++ /* Entries after this are ignored in split softirq mode */ ++ MAX_SOFTIRQ, + + NR_SOFTIRQS + }; + ++/* map softirq index to softirq name. update 'softirq_to_name' in ++ * kernel/softirq.c when adding a new softirq. ++ */ ++extern char *softirq_to_name[NR_SOFTIRQS]; ++ + /* softirq mask and active fields moved to irq_cpustat_t in + * asm/hardirq.h to get better cache usage. KAO + */ +@@ -267,14 +292,21 @@ struct softirq_action + void (*action)(struct softirq_action *); + }; + ++#ifdef CONFIG_PREEMPT_HARDIRQS ++# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr) ++# define __do_raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) ++#else ++# define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) ++# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr) ++#endif ++ + asmlinkage void do_softirq(void); + asmlinkage void __do_softirq(void); + extern void open_softirq(int nr, void (*action)(struct softirq_action *)); + extern void softirq_init(void); +-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) + extern void raise_softirq_irqoff(unsigned int nr); + extern void raise_softirq(unsigned int nr); +-extern void wakeup_softirqd(void); ++extern void softirq_check_pending_idle(void); + + /* This is the worklist that queues up per-cpu softirq work. + * +@@ -284,6 +316,11 @@ extern void wakeup_softirqd(void); + * only be accessed by the local cpu that they are for. + */ + DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++extern void wait_for_softirq(int softirq); ++#else ++# define wait_for_softirq(x) do {} while(0) ++#endif + + /* Try to send a softirq to a remote cpu. If this cannot be done, the + * work will be queued to the local cpu. +@@ -309,8 +346,9 @@ extern void __send_remote_softirq(struct + to be executed on some cpu at least once after this. + * If the tasklet is already scheduled, but its excecution is still not + started, it will be executed only once. +- * If this tasklet is already running on another CPU (or schedule is called +- from tasklet itself), it is rescheduled for later. ++ * If this tasklet is already running on another CPU, it is rescheduled ++ for later. ++ * Schedule must not be called from the tasklet itself (a lockup occurs) + * Tasklet is strictly serialized wrt itself, but not + wrt another tasklets. If client needs some intertask synchronization, + he makes it with spinlocks. 
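[Editor's note, not part of the patch: for readers unfamiliar with the tasklet API whose serialization rules the comment above spells out, a minimal usage sketch follows. It is illustrative only; the example_* identifiers are invented.]

#include <linux/interrupt.h>

/* Deferred work: runs in softirq context, so it must not sleep. */
static void example_tasklet_fn(unsigned long data)
{
	/* ... process whatever the interrupt handler queued up ... */
}

static DECLARE_TASKLET(example_tasklet, example_tasklet_fn, 0);

/* Top half: defer the heavier work and return quickly. */
static irqreturn_t example_isr(int irq, void *dev_id)
{
	tasklet_schedule(&example_tasklet);
	return IRQ_HANDLED;
}

/*
 * Teardown: per the rules above, never reschedule from inside the tasklet
 * itself, and kill it before freeing any data it touches.
 */
static void example_teardown(void)
{
	tasklet_kill(&example_tasklet);
}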
+@@ -335,27 +373,36 @@ struct tasklet_struct name = { NULL, 0, + enum + { + TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ +- TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ ++ TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ ++ TASKLET_STATE_PENDING /* Tasklet is pending */ + }; + +-#ifdef CONFIG_SMP ++#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) ++#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) ++#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) ++ ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) + static inline int tasklet_trylock(struct tasklet_struct *t) + { + return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); + } + ++static inline int tasklet_tryunlock(struct tasklet_struct *t) ++{ ++ return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; ++} ++ + static inline void tasklet_unlock(struct tasklet_struct *t) + { + smp_mb__before_clear_bit(); + clear_bit(TASKLET_STATE_RUN, &(t)->state); + } + +-static inline void tasklet_unlock_wait(struct tasklet_struct *t) +-{ +- while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } +-} ++extern void tasklet_unlock_wait(struct tasklet_struct *t); ++ + #else + #define tasklet_trylock(t) 1 ++#define tasklet_tryunlock(t) 1 + #define tasklet_unlock_wait(t) do { } while (0) + #define tasklet_unlock(t) do { } while (0) + #endif +@@ -376,6 +423,20 @@ static inline void tasklet_hi_schedule(s + __tasklet_hi_schedule(t); + } + ++extern void __tasklet_hi_schedule_first(struct tasklet_struct *t); ++ ++/* ++ * This version avoids touching any other tasklets. Needed for kmemcheck ++ * in order not to take any page faults while enqueueing this tasklet; ++ * consider VERY carefully whether you really need this or ++ * tasklet_hi_schedule()... 
++ */ ++static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) ++{ ++ if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) ++ __tasklet_hi_schedule_first(t); ++} ++ + + static inline void tasklet_disable_nosync(struct tasklet_struct *t) + { +@@ -390,22 +451,14 @@ static inline void tasklet_disable(struc + smp_mb(); + } + +-static inline void tasklet_enable(struct tasklet_struct *t) +-{ +- smp_mb__before_atomic_dec(); +- atomic_dec(&t->count); +-} +- +-static inline void tasklet_hi_enable(struct tasklet_struct *t) +-{ +- smp_mb__before_atomic_dec(); +- atomic_dec(&t->count); +-} ++extern void tasklet_enable(struct tasklet_struct *t); ++extern void tasklet_hi_enable(struct tasklet_struct *t); + + extern void tasklet_kill(struct tasklet_struct *t); + extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); + extern void tasklet_init(struct tasklet_struct *t, + void (*func)(unsigned long), unsigned long data); ++void takeover_tasklets(unsigned int cpu); + + /* + * Autoprobing for irqs: +@@ -463,12 +516,52 @@ static inline void init_irq_proc(void) + } + #endif + ++#if defined(CONFIG_GENERIC_HARDIRQS) && defined(CONFIG_DEBUG_SHIRQ) ++extern void debug_poll_all_shared_irqs(void); ++#else ++static inline void debug_poll_all_shared_irqs(void) { } ++#endif ++ + int show_interrupts(struct seq_file *p, void *v); + + struct irq_desc; + + extern int early_irq_init(void); ++extern int arch_probe_nr_irqs(void); + extern int arch_early_irq_init(void); + extern int arch_init_chip_data(struct irq_desc *desc, int cpu); + ++#ifdef CONFIG_PREEMPT_RT ++# define local_irq_disable_nort() do { } while (0) ++# define local_irq_enable_nort() do { } while (0) ++# define local_irq_enable_rt() local_irq_enable() ++# define local_irq_save_nort(flags) do { local_save_flags(flags); } while (0) ++# define local_irq_restore_nort(flags) do { (void)(flags); } while (0) ++# define spin_lock_nort(lock) do { } while (0) ++# define spin_unlock_nort(lock) do { } while (0) ++# define spin_lock_bh_nort(lock) do { } while (0) ++# define spin_unlock_bh_nort(lock) do { } while (0) ++# define spin_lock_rt(lock) spin_lock(lock) ++# define spin_unlock_rt(lock) spin_unlock(lock) ++# define smp_processor_id_rt(cpu) (cpu) ++# define in_atomic_rt() (!oops_in_progress && \ ++ (in_atomic() || irqs_disabled())) ++# define read_trylock_rt(lock) ({read_lock(lock); 1; }) ++#else ++# define local_irq_disable_nort() local_irq_disable() ++# define local_irq_enable_nort() local_irq_enable() ++# define local_irq_enable_rt() do { } while (0) ++# define local_irq_save_nort(flags) local_irq_save(flags) ++# define local_irq_restore_nort(flags) local_irq_restore(flags) ++# define spin_lock_rt(lock) do { } while (0) ++# define spin_unlock_rt(lock) do { } while (0) ++# define spin_lock_nort(lock) spin_lock(lock) ++# define spin_unlock_nort(lock) spin_unlock(lock) ++# define spin_lock_bh_nort(lock) spin_lock_bh(lock) ++# define spin_unlock_bh_nort(lock) spin_unlock_bh(lock) ++# define smp_processor_id_rt(cpu) smp_processor_id() ++# define in_atomic_rt() 0 ++# define read_trylock_rt(lock) read_trylock(lock) ++#endif ++ + #endif +Index: linux-2.6-tip/include/linux/ip_vs.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/ip_vs.h ++++ linux-2.6-tip/include/linux/ip_vs.h +@@ -96,10 +96,10 @@ + */ + struct ip_vs_service_user { + /* virtual service addresses */ +- u_int16_t protocol; ++ __u16 protocol; + __be32 addr; /* virtual ip address */ + __be16 port; +- u_int32_t 
fwmark; /* firwall mark of service */ ++ __u32 fwmark; /* firwall mark of service */ + + /* virtual service options */ + char sched_name[IP_VS_SCHEDNAME_MAXLEN]; +@@ -119,8 +119,8 @@ struct ip_vs_dest_user { + int weight; /* destination weight */ + + /* thresholds for active connections */ +- u_int32_t u_threshold; /* upper threshold */ +- u_int32_t l_threshold; /* lower threshold */ ++ __u32 u_threshold; /* upper threshold */ ++ __u32 l_threshold; /* lower threshold */ + }; + + +@@ -159,10 +159,10 @@ struct ip_vs_getinfo { + /* The argument to IP_VS_SO_GET_SERVICE */ + struct ip_vs_service_entry { + /* which service: user fills in these */ +- u_int16_t protocol; ++ __u16 protocol; + __be32 addr; /* virtual address */ + __be16 port; +- u_int32_t fwmark; /* firwall mark of service */ ++ __u32 fwmark; /* firwall mark of service */ + + /* service options */ + char sched_name[IP_VS_SCHEDNAME_MAXLEN]; +@@ -184,12 +184,12 @@ struct ip_vs_dest_entry { + unsigned conn_flags; /* connection flags */ + int weight; /* destination weight */ + +- u_int32_t u_threshold; /* upper threshold */ +- u_int32_t l_threshold; /* lower threshold */ ++ __u32 u_threshold; /* upper threshold */ ++ __u32 l_threshold; /* lower threshold */ + +- u_int32_t activeconns; /* active connections */ +- u_int32_t inactconns; /* inactive connections */ +- u_int32_t persistconns; /* persistent connections */ ++ __u32 activeconns; /* active connections */ ++ __u32 inactconns; /* inactive connections */ ++ __u32 persistconns; /* persistent connections */ + + /* statistics */ + struct ip_vs_stats_user stats; +@@ -199,10 +199,10 @@ struct ip_vs_dest_entry { + /* The argument to IP_VS_SO_GET_DESTS */ + struct ip_vs_get_dests { + /* which service: user fills in these */ +- u_int16_t protocol; ++ __u16 protocol; + __be32 addr; /* virtual address */ + __be16 port; +- u_int32_t fwmark; /* firwall mark of service */ ++ __u32 fwmark; /* firwall mark of service */ + + /* number of real servers */ + unsigned int num_dests; +Index: linux-2.6-tip/include/linux/irq.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/irq.h ++++ linux-2.6-tip/include/linux/irq.h +@@ -20,10 +20,12 @@ + #include + #include + #include ++#include + + #include + #include + #include ++#include + + struct irq_desc; + typedef void (*irq_flow_handler_t)(unsigned int irq, +@@ -65,6 +67,7 @@ typedef void (*irq_flow_handler_t)(unsig + #define IRQ_SPURIOUS_DISABLED 0x00800000 /* IRQ was disabled by the spurious trap */ + #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ + #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ ++#define IRQ_NODELAY 0x40000000 /* IRQ must run immediately */ + + #ifdef CONFIG_IRQ_PER_CPU + # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) +@@ -151,6 +154,8 @@ struct irq_2_iommu; + * @irq_count: stats field to detect stalled irqs + * @last_unhandled: aging timer for unhandled count + * @irqs_unhandled: stats field for spurious unhandled interrupts ++ * @thread: Thread pointer for threaded preemptible irq handling ++ * @wait_for_handler: Waitqueue to wait for a running preemptible handler + * @lock: locking for SMP + * @affinity: IRQ affinity on SMP + * @cpu: cpu index useful for balancing +@@ -160,12 +165,10 @@ struct irq_2_iommu; + */ + struct irq_desc { + unsigned int irq; +-#ifdef CONFIG_SPARSE_IRQ + struct timer_rand_state *timer_rand_state; + unsigned int *kstat_irqs; +-# ifdef CONFIG_INTR_REMAP ++#ifdef CONFIG_INTR_REMAP + struct 
irq_2_iommu *irq_2_iommu; +-# endif + #endif + irq_flow_handler_t handle_irq; + struct irq_chip *chip; +@@ -180,13 +183,16 @@ struct irq_desc { + unsigned int irq_count; /* For detecting broken IRQs */ + unsigned long last_unhandled; /* Aging timer for unhandled count */ + unsigned int irqs_unhandled; +- spinlock_t lock; ++ struct task_struct *thread; ++ wait_queue_head_t wait_for_handler; ++ cycles_t timestamp; ++ raw_spinlock_t lock; + #ifdef CONFIG_SMP +- cpumask_t affinity; ++ cpumask_var_t affinity; + unsigned int cpu; +-#endif + #ifdef CONFIG_GENERIC_PENDING_IRQ +- cpumask_t pending_mask; ++ cpumask_var_t pending_mask; ++#endif + #endif + #ifdef CONFIG_PROC_FS + struct proc_dir_entry *dir; +@@ -202,12 +208,6 @@ extern void arch_free_chip_data(struct i + extern struct irq_desc irq_desc[NR_IRQS]; + #else /* CONFIG_SPARSE_IRQ */ + extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); +- +-#define kstat_irqs_this_cpu(DESC) \ +- ((DESC)->kstat_irqs[smp_processor_id()]) +-#define kstat_incr_irqs_this_cpu(irqno, DESC) \ +- ((DESC)->kstat_irqs[smp_processor_id()]++) +- + #endif /* CONFIG_SPARSE_IRQ */ + + extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +@@ -226,7 +226,6 @@ irq_remap_to_desc(unsigned int irq, stru + * Migration helpers for obsolete names, they will go away: + */ + #define hw_interrupt_type irq_chip +-typedef struct irq_chip hw_irq_controller; + #define no_irq_type no_irq_chip + typedef struct irq_desc irq_desc_t; + +@@ -236,6 +235,7 @@ typedef struct irq_desc irq_desc_t; + #include + + extern int setup_irq(unsigned int irq, struct irqaction *new); ++extern void remove_irq(unsigned int irq, struct irqaction *act); + + #ifdef CONFIG_GENERIC_HARDIRQS + +@@ -280,7 +280,7 @@ static inline int irq_balancing_disabled + } + + /* Handle irq action chains: */ +-extern int handle_IRQ_event(unsigned int irq, struct irqaction *action); ++extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action); + + /* + * Built-in IRQ handlers for various IRQ types, +@@ -325,7 +325,7 @@ static inline void generic_handle_irq(un + + /* Handling of unhandled and spurious interrupts: */ + extern void note_interrupt(unsigned int irq, struct irq_desc *desc, +- int action_ret); ++ irqreturn_t action_ret); + + /* Resending of interrupts :*/ + void check_irq_resend(struct irq_desc *desc, unsigned int irq); +@@ -418,8 +418,102 @@ extern int set_irq_msi(unsigned int irq, + #define get_irq_desc_data(desc) ((desc)->handler_data) + #define get_irq_desc_msi(desc) ((desc)->msi_desc) + +-#endif /* CONFIG_GENERIC_HARDIRQS */ ++/* Early initialization of irqs */ ++extern void early_init_hardirqs(void); ++ ++#if defined(CONFIG_PREEMPT_HARDIRQS) ++extern void init_hardirqs(void); ++#else ++static inline void init_hardirqs(void) { } ++#endif ++ ++#else /* end GENERIC HARDIRQS */ ++ ++static inline void early_init_hardirqs(void) { } ++static inline void init_hardirqs(void) { } ++ ++#endif /* !CONFIG_GENERIC_HARDIRQS */ + + #endif /* !CONFIG_S390 */ + ++#ifdef CONFIG_SMP ++/** ++ * init_alloc_desc_masks - allocate cpumasks for irq_desc ++ * @desc: pointer to irq_desc struct ++ * @cpu: cpu which will be handling the cpumasks ++ * @boot: true if need bootmem ++ * ++ * Allocates affinity and pending_mask cpumask if required. ++ * Returns true if successful (or not required). ++ * Side effect: affinity has all bits set, pending_mask has all bits clear. 
++ */ ++static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, ++ bool boot) ++{ ++ int node; ++ ++ if (boot) { ++ alloc_bootmem_cpumask_var(&desc->affinity); ++ cpumask_setall(desc->affinity); ++ ++#ifdef CONFIG_GENERIC_PENDING_IRQ ++ alloc_bootmem_cpumask_var(&desc->pending_mask); ++ cpumask_clear(desc->pending_mask); ++#endif ++ return true; ++ } ++ ++ node = cpu_to_node(cpu); ++ ++ if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) ++ return false; ++ cpumask_setall(desc->affinity); ++ ++#ifdef CONFIG_GENERIC_PENDING_IRQ ++ if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { ++ free_cpumask_var(desc->affinity); ++ return false; ++ } ++ cpumask_clear(desc->pending_mask); ++#endif ++ return true; ++} ++ ++/** ++ * init_copy_desc_masks - copy cpumasks for irq_desc ++ * @old_desc: pointer to old irq_desc struct ++ * @new_desc: pointer to new irq_desc struct ++ * ++ * Insures affinity and pending_masks are copied to new irq_desc. ++ * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the ++ * irq_desc struct so the copy is redundant. ++ */ ++ ++static inline void init_copy_desc_masks(struct irq_desc *old_desc, ++ struct irq_desc *new_desc) ++{ ++#ifdef CONFIG_CPUMASKS_OFFSTACK ++ cpumask_copy(new_desc->affinity, old_desc->affinity); ++ ++#ifdef CONFIG_GENERIC_PENDING_IRQ ++ cpumask_copy(new_desc->pending_mask, old_desc->pending_mask); ++#endif ++#endif ++} ++ ++#else /* !CONFIG_SMP */ ++ ++static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, ++ bool boot) ++{ ++ return true; ++} ++ ++static inline void init_copy_desc_masks(struct irq_desc *old_desc, ++ struct irq_desc *new_desc) ++{ ++} ++ ++#endif /* CONFIG_SMP */ ++ + #endif /* _LINUX_IRQ_H */ +Index: linux-2.6-tip/include/linux/irqflags.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/irqflags.h ++++ linux-2.6-tip/include/linux/irqflags.h +@@ -13,6 +13,9 @@ + + #include + ++/* dummy wrapper for now: */ ++#define BUILD_CHECK_IRQ_FLAGS(flags) ++ + #ifdef CONFIG_TRACE_IRQFLAGS + extern void trace_softirqs_on(unsigned long ip); + extern void trace_softirqs_off(unsigned long ip); +@@ -24,8 +27,8 @@ + # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) + # define trace_hardirq_enter() do { current->hardirq_context++; } while (0) + # define trace_hardirq_exit() do { current->hardirq_context--; } while (0) +-# define trace_softirq_enter() do { current->softirq_context++; } while (0) +-# define trace_softirq_exit() do { current->softirq_context--; } while (0) ++# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) ++# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) + # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, + #else + # define trace_hardirqs_on() do { } while (0) +@@ -38,8 +41,8 @@ + # define trace_softirqs_enabled(p) 0 + # define trace_hardirq_enter() do { } while (0) + # define trace_hardirq_exit() do { } while (0) +-# define trace_softirq_enter() do { } while (0) +-# define trace_softirq_exit() do { } while (0) ++# define lockdep_softirq_enter() do { } while (0) ++# define lockdep_softirq_exit() do { } while (0) + # define INIT_TRACE_IRQFLAGS + #endif + +Index: linux-2.6-tip/include/linux/irqnr.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/irqnr.h ++++ linux-2.6-tip/include/linux/irqnr.h +@@ -20,6 +20,7 @@ + + # define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 
1; irq >= 0; irq--) ++ + #else /* CONFIG_GENERIC_HARDIRQS */ + + extern int nr_irqs; +@@ -28,13 +29,17 @@ extern struct irq_desc *irq_to_desc(unsi + # define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \ + irq++, desc = irq_to_desc(irq)) \ +- if (desc) ++ if (!desc) \ ++ ; \ ++ else + + + # define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; \ + irq--, desc = irq_to_desc(irq)) \ +- if (desc) ++ if (!desc) \ ++ ; \ ++ else + + #endif /* CONFIG_GENERIC_HARDIRQS */ + +Index: linux-2.6-tip/include/linux/irqreturn.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/irqreturn.h ++++ linux-2.6-tip/include/linux/irqreturn.h +@@ -1,25 +1,17 @@ +-/* irqreturn.h */ + #ifndef _LINUX_IRQRETURN_H + #define _LINUX_IRQRETURN_H + +-/* +- * For 2.4.x compatibility, 2.4.x can use +- * +- * typedef void irqreturn_t; +- * #define IRQ_NONE +- * #define IRQ_HANDLED +- * #define IRQ_RETVAL(x) +- * +- * To mix old-style and new-style irq handler returns. +- * +- * IRQ_NONE means we didn't handle it. +- * IRQ_HANDLED means that we did have a valid interrupt and handled it. +- * IRQ_RETVAL(x) selects on the two depending on x being non-zero (for handled) ++/** ++ * enum irqreturn ++ * @IRQ_NONE interrupt was not from this device ++ * @IRQ_HANDLED interrupt was handled by this device + */ +-typedef int irqreturn_t; ++enum irqreturn { ++ IRQ_NONE, ++ IRQ_HANDLED, ++}; + +-#define IRQ_NONE (0) +-#define IRQ_HANDLED (1) +-#define IRQ_RETVAL(x) ((x) != 0) ++typedef enum irqreturn irqreturn_t; ++#define IRQ_RETVAL(x) ((x) != IRQ_NONE) + + #endif +Index: linux-2.6-tip/include/linux/ivtvfb.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/ivtvfb.h ++++ linux-2.6-tip/include/linux/ivtvfb.h +@@ -33,6 +33,6 @@ struct ivtvfb_dma_frame { + }; + + #define IVTVFB_IOC_DMA_FRAME _IOW('V', BASE_VIDIOC_PRIVATE+0, struct ivtvfb_dma_frame) +-#define FBIO_WAITFORVSYNC _IOW('F', 0x20, u_int32_t) ++#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32) + + #endif +Index: linux-2.6-tip/include/linux/jffs2.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/jffs2.h ++++ linux-2.6-tip/include/linux/jffs2.h +@@ -12,6 +12,7 @@ + #ifndef __LINUX_JFFS2_H__ + #define __LINUX_JFFS2_H__ + ++#include + #include + + /* You must include something which defines the C99 uintXX_t types. +@@ -91,15 +92,15 @@ + byteswapping */ + + typedef struct { +- uint32_t v32; ++ __u32 v32; + } __attribute__((packed)) jint32_t; + + typedef struct { +- uint32_t m; ++ __u32 m; + } __attribute__((packed)) jmode_t; + + typedef struct { +- uint16_t v16; ++ __u16 v16; + } __attribute__((packed)) jint16_t; + + struct jffs2_unknown_node +@@ -121,12 +122,12 @@ struct jffs2_raw_dirent + jint32_t version; + jint32_t ino; /* == zero for unlink */ + jint32_t mctime; +- uint8_t nsize; +- uint8_t type; +- uint8_t unused[2]; ++ __u8 nsize; ++ __u8 type; ++ __u8 unused[2]; + jint32_t node_crc; + jint32_t name_crc; +- uint8_t name[0]; ++ __u8 name[0]; + }; + + /* The JFFS2 raw inode structure: Used for storage on physical media. */ +@@ -153,12 +154,12 @@ struct jffs2_raw_inode + jint32_t offset; /* Where to begin to write. */ + jint32_t csize; /* (Compressed) data size */ + jint32_t dsize; /* Size of the node's data. 
(after decompression) */ +- uint8_t compr; /* Compression algorithm used */ +- uint8_t usercompr; /* Compression algorithm requested by the user */ ++ __u8 compr; /* Compression algorithm used */ ++ __u8 usercompr; /* Compression algorithm requested by the user */ + jint16_t flags; /* See JFFS2_INO_FLAG_* */ + jint32_t data_crc; /* CRC for the (compressed) data. */ + jint32_t node_crc; /* CRC for the raw inode (excluding data) */ +- uint8_t data[0]; ++ __u8 data[0]; + }; + + struct jffs2_raw_xattr { +@@ -168,12 +169,12 @@ struct jffs2_raw_xattr { + jint32_t hdr_crc; + jint32_t xid; /* XATTR identifier number */ + jint32_t version; +- uint8_t xprefix; +- uint8_t name_len; ++ __u8 xprefix; ++ __u8 name_len; + jint16_t value_len; + jint32_t data_crc; + jint32_t node_crc; +- uint8_t data[0]; ++ __u8 data[0]; + } __attribute__((packed)); + + struct jffs2_raw_xref +Index: linux-2.6-tip/include/linux/kernel.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/kernel.h ++++ linux-2.6-tip/include/linux/kernel.h +@@ -122,7 +122,7 @@ extern int _cond_resched(void); + # define might_resched() do { } while (0) + #endif + +-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP ++#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) + void __might_sleep(char *file, int line); + /** + * might_sleep - annotation for functions that can sleep +@@ -242,6 +242,19 @@ extern struct ratelimit_state printk_rat + extern int printk_ratelimit(void); + extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msec); ++ ++/* ++ * Print a one-time message (analogous to WARN_ONCE() et al): ++ */ ++#define printk_once(x...) ({ \ ++ static int __print_once = 1; \ ++ \ ++ if (__print_once) { \ ++ __print_once = 0; \ ++ printk(x); \ ++ } \ ++}) ++ + #else + static inline int vprintk(const char *s, va_list args) + __attribute__ ((format (printf, 1, 0))); +@@ -253,6 +266,10 @@ static inline int printk_ratelimit(void) + static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ + unsigned int interval_msec) \ + { return false; } ++ ++/* No effect, but we still get type checking even in the !PRINTK case: */ ++#define printk_once(x...) printk(x) ++ + #endif + + extern int printk_needs_cpu(int cpu); +@@ -261,6 +278,12 @@ extern void printk_tick(void); + extern void asmlinkage __attribute__((format(printf, 1, 2))) + early_printk(const char *fmt, ...); + ++#ifdef CONFIG_PREEMPT_RT ++extern void zap_rt_locks(void); ++#else ++# define zap_rt_locks() do { } while (0) ++#endif ++ + unsigned long int_sqrt(unsigned long); + + static inline void console_silent(void) +@@ -289,6 +312,7 @@ extern int root_mountflags; + /* Values used for system_state */ + extern enum system_states { + SYSTEM_BOOTING, ++ SYSTEM_BOOTING_SCHEDULER_OK, + SYSTEM_RUNNING, + SYSTEM_HALT, + SYSTEM_POWER_OFF, +@@ -368,6 +392,139 @@ static inline char *pack_hex_byte(char * + #endif + + /* ++ * General tracing related utility functions - trace_printk(), ++ * tracing_on/tracing_off and tracing_start()/tracing_stop ++ * ++ * Use tracing_on/tracing_off when you want to quickly turn on or off ++ * tracing. It simply enables or disables the recording of the trace events. ++ * This also corresponds to the user space debugfs/tracing/tracing_on ++ * file, which gives a means for the kernel and userspace to interact. ++ * Place a tracing_off() in the kernel where you want tracing to end. 
++ * From user space, examine the trace, and then echo 1 > tracing_on ++ * to continue tracing. ++ * ++ * tracing_stop/tracing_start has slightly more overhead. It is used ++ * by things like suspend to ram where disabling the recording of the ++ * trace is not enough, but tracing must actually stop because things ++ * like calling smp_processor_id() may crash the system. ++ * ++ * Most likely, you want to use tracing_on/tracing_off. ++ */ ++#ifdef CONFIG_RING_BUFFER ++void tracing_on(void); ++void tracing_off(void); ++/* trace_off_permanent stops recording with no way to bring it back */ ++void tracing_off_permanent(void); ++int tracing_is_on(void); ++#else ++static inline void tracing_on(void) { } ++static inline void tracing_off(void) { } ++static inline void tracing_off_permanent(void) { } ++static inline int tracing_is_on(void) { return 0; } ++#endif ++#ifdef CONFIG_TRACING ++extern void tracing_start(void); ++extern void tracing_stop(void); ++extern void ftrace_off_permanent(void); ++ ++extern void ++ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); ++ ++static inline void __attribute__ ((format (printf, 1, 2))) ++____trace_printk_check_format(const char *fmt, ...) ++{ ++} ++#define __trace_printk_check_format(fmt, args...) \ ++do { \ ++ if (0) \ ++ ____trace_printk_check_format(fmt, ##args); \ ++} while (0) ++ ++/** ++ * trace_printk - printf formatting in the ftrace buffer ++ * @fmt: the printf format for printing ++ * ++ * Note: __trace_printk is an internal function for trace_printk and ++ * the @ip is passed in via the trace_printk macro. ++ * ++ * This function allows a kernel developer to debug fast path sections ++ * that printk is not appropriate for. By scattering in various ++ * printk like tracing in the code, a developer can quickly see ++ * where problems are occurring. ++ * ++ * This is intended as a debugging tool for the developer only. ++ * Please refrain from leaving trace_printks scattered around in ++ * your code. ++ */ ++ ++#define trace_printk(fmt, args...) \ ++do { \ ++ __trace_printk_check_format(fmt, ##args); \ ++ if (__builtin_constant_p(fmt)) { \ ++ static const char *trace_printk_fmt \ ++ __attribute__((section("__trace_printk_fmt"))) = \ ++ __builtin_constant_p(fmt) ? fmt : NULL; \ ++ \ ++ __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ ++ } else \ ++ __trace_printk(_THIS_IP_, fmt, ##args); \ ++} while (0) ++ ++extern int ++__trace_bprintk(unsigned long ip, const char *fmt, ...) ++ __attribute__ ((format (printf, 2, 3))); ++ ++extern int ++__trace_printk(unsigned long ip, const char *fmt, ...) ++ __attribute__ ((format (printf, 2, 3))); ++ ++/* ++ * The double __builtin_constant_p is because gcc will give us an error ++ * if we try to allocate the static variable to fmt if it is not a ++ * constant. Even with the outer if statement. ++ */ ++#define ftrace_vprintk(fmt, vargs) \ ++do { \ ++ if (__builtin_constant_p(fmt)) { \ ++ static const char *trace_printk_fmt \ ++ __attribute__((section("__trace_printk_fmt"))) = \ ++ __builtin_constant_p(fmt) ? 
fmt : NULL; \ ++ \ ++ __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ ++ } else \ ++ __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ ++} while (0) ++ ++extern int ++__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); ++ ++extern int ++__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); ++ ++extern void ftrace_dump(void); ++#else ++static inline void ++ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } ++static inline int ++trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); ++ ++static inline void tracing_start(void) { } ++static inline void tracing_stop(void) { } ++static inline void ftrace_off_permanent(void) { } ++static inline int ++trace_printk(const char *fmt, ...) ++{ ++ return 0; ++} ++static inline int ++ftrace_vprintk(const char *fmt, va_list ap) ++{ ++ return 0; ++} ++static inline void ftrace_dump(void) { } ++#endif /* CONFIG_TRACING */ ++ ++/* + * Display an IP address in readable format. + */ + +Index: linux-2.6-tip/include/linux/kernel_stat.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/kernel_stat.h ++++ linux-2.6-tip/include/linux/kernel_stat.h +@@ -23,12 +23,14 @@ struct cpu_usage_stat { + cputime64_t idle; + cputime64_t iowait; + cputime64_t steal; ++ cputime64_t user_rt; ++ cputime64_t system_rt; + cputime64_t guest; + }; + + struct kernel_stat { + struct cpu_usage_stat cpustat; +-#ifndef CONFIG_SPARSE_IRQ ++#ifndef CONFIG_GENERIC_HARDIRQS + unsigned int irqs[NR_IRQS]; + #endif + }; +@@ -41,7 +43,7 @@ DECLARE_PER_CPU(struct kernel_stat, ksta + + extern unsigned long long nr_context_switches(void); + +-#ifndef CONFIG_SPARSE_IRQ ++#ifndef CONFIG_GENERIC_HARDIRQS + #define kstat_irqs_this_cpu(irq) \ + (kstat_this_cpu.irqs[irq]) + +@@ -52,16 +54,19 @@ static inline void kstat_incr_irqs_this_ + { + kstat_this_cpu.irqs[irq]++; + } +-#endif +- + +-#ifndef CONFIG_SPARSE_IRQ + static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) + { + return kstat_cpu(cpu).irqs[irq]; + } + #else ++#include + extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); ++#define kstat_irqs_this_cpu(DESC) \ ++ ((DESC)->kstat_irqs[smp_processor_id()]) ++#define kstat_incr_irqs_this_cpu(irqno, DESC) \ ++ ((DESC)->kstat_irqs[smp_processor_id()]++) ++ + #endif + + /* +@@ -78,7 +83,15 @@ static inline unsigned int kstat_irqs(un + return sum; + } + ++ ++/* ++ * Lock/unlock the current runqueue - to extract task statistics: ++ */ ++extern void curr_rq_lock_irq_save(unsigned long *flags); ++extern void curr_rq_unlock_irq_restore(unsigned long *flags); ++extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); + extern unsigned long long task_delta_exec(struct task_struct *); ++ + extern void account_user_time(struct task_struct *, cputime_t, cputime_t); + extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); + extern void account_steal_time(cputime_t); +Index: linux-2.6-tip/include/linux/key.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/key.h ++++ linux-2.6-tip/include/linux/key.h +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include + + #ifdef __KERNEL__ +Index: linux-2.6-tip/include/linux/kmemcheck.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/kmemcheck.h +@@ -0,0 +1,154 @@ ++#ifndef LINUX_KMEMCHECK_H ++#define LINUX_KMEMCHECK_H ++ 
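[Editor's note, not part of the patch: stepping back to the linux/kernel.h hunk above, a short hedged sketch of how the newly added printk_once() and trace_printk() helpers are meant to be used in a hot path. example_hot_path() is an invented name.]

#include <linux/kernel.h>

static void example_hot_path(int budget)
{
	/* Emitted to the console only the first time this path runs. */
	printk_once(KERN_INFO "example: fast path entered\n");

	/*
	 * Lightweight debugging: this goes into the ftrace ring buffer
	 * rather than the console. With a constant format string the macro
	 * above picks the cheaper __trace_bprintk() variant automatically.
	 */
	trace_printk("budget=%d\n", budget);
}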
++#include ++#include ++ ++/* ++ * How to use: If you have a struct using bitfields, for example ++ * ++ * struct a { ++ * int x:8, y:8; ++ * }; ++ * ++ * then this should be rewritten as ++ * ++ * struct a { ++ * kmemcheck_define_bitfield(flags, { ++ * int x:8, y:8; ++ * }); ++ * }; ++ * ++ * Now the "flags" member may be used to refer to the bitfield (and things ++ * like &x.flags is allowed). As soon as the struct is allocated, the bit- ++ * fields should be annotated: ++ * ++ * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); ++ * if (a) ++ * kmemcheck_annotate_bitfield(a->flags); ++ * ++ * Note: We provide the same definitions for both kmemcheck and non- ++ * kmemcheck kernels. This makes it harder to introduce accidental errors. ++ */ ++#define kmemcheck_define_bitfield(name, fields...) \ ++ union { \ ++ struct fields name; \ ++ struct fields; \ ++ }; \ ++ \ ++ /* \ ++ * Erk. Due to gcc bug, we'll get a "error: \ ++ * flexible array member in otherwise empty \ ++ * struct without this. \ ++ */ \ ++ int kmemcheck_dummy_##name##_[0]; ++ ++#ifdef CONFIG_KMEMCHECK ++extern int kmemcheck_enabled; ++ ++void kmemcheck_init(void); ++ ++/* The slab-related functions. */ ++void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node); ++void kmemcheck_free_shadow(struct page *page, int order); ++void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, ++ size_t size); ++void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); ++ ++void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order, ++ gfp_t gfpflags); ++ ++void kmemcheck_show_pages(struct page *p, unsigned int n); ++void kmemcheck_hide_pages(struct page *p, unsigned int n); ++ ++bool kmemcheck_page_is_tracked(struct page *p); ++ ++void kmemcheck_mark_unallocated(void *address, unsigned int n); ++void kmemcheck_mark_uninitialized(void *address, unsigned int n); ++void kmemcheck_mark_initialized(void *address, unsigned int n); ++void kmemcheck_mark_freed(void *address, unsigned int n); ++ ++void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); ++void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); ++void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); ++ ++int kmemcheck_show_addr(unsigned long address); ++int kmemcheck_hide_addr(unsigned long address); ++ ++#define kmemcheck_annotate_bitfield(field) \ ++ do { \ ++ kmemcheck_mark_initialized(&(field), sizeof(field)); \ ++ } while (0) ++#else ++#define kmemcheck_enabled 0 ++ ++static inline void kmemcheck_init(void) ++{ ++} ++ ++static inline void ++kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) ++{ ++} ++ ++static inline void ++kmemcheck_free_shadow(struct page *page, int order) ++{ ++} ++ ++static inline void ++kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, ++ size_t size) ++{ ++} ++ ++static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, ++ size_t size) ++{ ++} ++ ++static inline void kmemcheck_pagealloc_alloc(struct page *p, ++ unsigned int order, gfp_t gfpflags) ++{ ++} ++ ++static inline bool kmemcheck_page_is_tracked(struct page *p) ++{ ++ return false; ++} ++ ++static inline void kmemcheck_mark_unallocated(void *address, unsigned int n) ++{ ++} ++ ++static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n) ++{ ++} ++ ++static inline void kmemcheck_mark_initialized(void *address, unsigned int n) ++{ ++} ++ ++static inline void kmemcheck_mark_freed(void *address, 
unsigned int n) ++{ ++} ++ ++static inline void kmemcheck_mark_unallocated_pages(struct page *p, ++ unsigned int n) ++{ ++} ++ ++static inline void kmemcheck_mark_uninitialized_pages(struct page *p, ++ unsigned int n) ++{ ++} ++ ++static inline void kmemcheck_mark_initialized_pages(struct page *p, ++ unsigned int n) ++{ ++} ++ ++#define kmemcheck_annotate_bitfield(field) do { } while (0) ++#endif /* CONFIG_KMEMCHECK */ ++ ++#endif /* LINUX_KMEMCHECK_H */ +Index: linux-2.6-tip/include/linux/kprobes.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/kprobes.h ++++ linux-2.6-tip/include/linux/kprobes.h +@@ -156,7 +156,7 @@ struct kretprobe { + int nmissed; + size_t data_size; + struct hlist_head free_instances; +- spinlock_t lock; ++ raw_spinlock_t lock; + }; + + struct kretprobe_instance { +@@ -182,6 +182,14 @@ struct kprobe_blackpoint { + DECLARE_PER_CPU(struct kprobe *, current_kprobe); + DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + ++/* ++ * For #ifdef avoidance: ++ */ ++static inline int kprobes_built_in(void) ++{ ++ return 1; ++} ++ + #ifdef CONFIG_KRETPROBES + extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs); +@@ -271,8 +279,16 @@ void unregister_kretprobes(struct kretpr + void kprobe_flush_task(struct task_struct *tk); + void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); + +-#else /* CONFIG_KPROBES */ ++#else /* !CONFIG_KPROBES: */ + ++static inline int kprobes_built_in(void) ++{ ++ return 0; ++} ++static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) ++{ ++ return 0; ++} + static inline struct kprobe *get_kprobe(void *addr) + { + return NULL; +@@ -329,5 +345,5 @@ static inline void unregister_kretprobes + static inline void kprobe_flush_task(struct task_struct *tk) + { + } +-#endif /* CONFIG_KPROBES */ +-#endif /* _LINUX_KPROBES_H */ ++#endif /* CONFIG_KPROBES */ ++#endif /* _LINUX_KPROBES_H */ +Index: linux-2.6-tip/include/linux/latencytop.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/latencytop.h ++++ linux-2.6-tip/include/linux/latencytop.h +@@ -9,6 +9,7 @@ + #ifndef _INCLUDE_GUARD_LATENCYTOP_H_ + #define _INCLUDE_GUARD_LATENCYTOP_H_ + ++#include + #ifdef CONFIG_LATENCYTOP + + #define LT_SAVECOUNT 32 +@@ -24,7 +25,14 @@ struct latency_record { + + struct task_struct; + +-void account_scheduler_latency(struct task_struct *task, int usecs, int inter); ++extern int latencytop_enabled; ++void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); ++static inline void ++account_scheduler_latency(struct task_struct *task, int usecs, int inter) ++{ ++ if (unlikely(latencytop_enabled)) ++ __account_scheduler_latency(task, usecs, inter); ++} + + void clear_all_latency_tracing(struct task_struct *p); + +Index: linux-2.6-tip/include/linux/lockdep.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/lockdep.h ++++ linux-2.6-tip/include/linux/lockdep.h +@@ -20,43 +20,10 @@ struct lockdep_map; + #include + + /* +- * Lock-class usage-state bits: ++ * We'd rather not expose kernel/lockdep_states.h this wide, but we do need ++ * the total number of states... 
:-( + */ +-enum lock_usage_bit +-{ +- LOCK_USED = 0, +- LOCK_USED_IN_HARDIRQ, +- LOCK_USED_IN_SOFTIRQ, +- LOCK_ENABLED_SOFTIRQS, +- LOCK_ENABLED_HARDIRQS, +- LOCK_USED_IN_HARDIRQ_READ, +- LOCK_USED_IN_SOFTIRQ_READ, +- LOCK_ENABLED_SOFTIRQS_READ, +- LOCK_ENABLED_HARDIRQS_READ, +- LOCK_USAGE_STATES +-}; +- +-/* +- * Usage-state bitmasks: +- */ +-#define LOCKF_USED (1 << LOCK_USED) +-#define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) +-#define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) +-#define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS) +-#define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS) +- +-#define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS) +-#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +- +-#define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) +-#define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) +-#define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ) +-#define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ) +- +-#define LOCKF_ENABLED_IRQS_READ \ +- (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ) +-#define LOCKF_USED_IN_IRQ_READ \ +- (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) ++#define XXX_LOCK_USAGE_STATES (1+3*4) + + #define MAX_LOCKDEP_SUBCLASSES 8UL + +@@ -97,7 +64,7 @@ struct lock_class { + * IRQ/softirq usage tracking bits: + */ + unsigned long usage_mask; +- struct stack_trace usage_traces[LOCK_USAGE_STATES]; ++ struct stack_trace usage_traces[XXX_LOCK_USAGE_STATES]; + + /* + * These fields represent a directed graph of lock dependencies, +@@ -324,7 +291,11 @@ static inline void lock_set_subclass(str + lock_set_class(lock, lock->name, lock->key, subclass, ip); + } + +-# define INIT_LOCKDEP .lockdep_recursion = 0, ++extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); ++extern void lockdep_clear_current_reclaim_state(void); ++extern void lockdep_trace_alloc(gfp_t mask); ++ ++# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, + + #define lockdep_depth(tsk) (debug_locks ? 
(tsk)->lockdep_depth : 0) + +@@ -342,6 +313,9 @@ static inline void lockdep_on(void) + # define lock_release(l, n, i) do { } while (0) + # define lock_set_class(l, n, k, s, i) do { } while (0) + # define lock_set_subclass(l, s, i) do { } while (0) ++# define lockdep_set_current_reclaim_state(g) do { } while (0) ++# define lockdep_clear_current_reclaim_state() do { } while (0) ++# define lockdep_trace_alloc(g) do { } while (0) + # define lockdep_init() do { } while (0) + # define lockdep_info() do { } while (0) + # define lockdep_init_map(lock, name, key, sub) \ +Index: linux-2.6-tip/include/linux/magic.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/magic.h ++++ linux-2.6-tip/include/linux/magic.h +@@ -49,4 +49,5 @@ + #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA + #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA + ++#define STACK_END_MAGIC 0x57AC6E9D + #endif /* __LINUX_MAGIC_H__ */ +Index: linux-2.6-tip/include/linux/matroxfb.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/matroxfb.h ++++ linux-2.6-tip/include/linux/matroxfb.h +@@ -37,7 +37,7 @@ enum matroxfb_ctrl_id { + MATROXFB_CID_LAST + }; + +-#define FBIO_WAITFORVSYNC _IOW('F', 0x20, u_int32_t) ++#define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32) + + #endif + +Index: linux-2.6-tip/include/linux/mca-legacy.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/mca-legacy.h ++++ linux-2.6-tip/include/linux/mca-legacy.h +@@ -9,7 +9,7 @@ + + #include + +-#warning "MCA legacy - please move your driver to the new sysfs api" ++/* #warning "MCA legacy - please move your driver to the new sysfs api" */ + + /* MCA_NOTFOUND is an error condition. The other two indicate + * motherboard POS registers contain the adapter. They might be +Index: linux-2.6-tip/include/linux/memory.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/memory.h ++++ linux-2.6-tip/include/linux/memory.h +@@ -99,4 +99,10 @@ enum mem_add_context { BOOT, HOTPLUG }; + #define hotplug_memory_notifier(fn, pri) do { } while (0) + #endif + ++/* ++ * Kernel text modification mutex, used for code patching. Users of this lock ++ * can sleep. ++ */ ++extern struct mutex text_mutex; ++ + #endif /* _LINUX_MEMORY_H_ */ +Index: linux-2.6-tip/include/linux/mm.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/mm.h ++++ linux-2.6-tip/include/linux/mm.h +@@ -98,12 +98,13 @@ extern unsigned int kobjsize(const void + #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ + #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ + #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +-#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it. 
Refer note in VM_PFNMAP_AT_MMAP below */ ++#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ + #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ + + #define VM_CAN_NONLINEAR 0x08000000 /* Has ->fault & does nonlinear pages */ + #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ + #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ ++#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ + + #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ + #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +@@ -127,17 +128,6 @@ extern unsigned int kobjsize(const void + #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) + + /* +- * pfnmap vmas that are fully mapped at mmap time (not mapped on fault). +- * Used by x86 PAT to identify such PFNMAP mappings and optimize their handling. +- * Note VM_INSERTPAGE flag is overloaded here. i.e, +- * VM_INSERTPAGE && !VM_PFNMAP implies +- * The vma has had "vm_insert_page()" done on it +- * VM_INSERTPAGE && VM_PFNMAP implies +- * The vma is PFNMAP with full mapping at mmap time +- */ +-#define VM_PFNMAP_AT_MMAP (VM_INSERTPAGE | VM_PFNMAP) +- +-/* + * mapping from the currently active vm_flags protection bits (the + * low four bits) to a page protection mask.. + */ +@@ -157,7 +147,7 @@ extern pgprot_t protection_map[16]; + */ + static inline int is_linear_pfn_mapping(struct vm_area_struct *vma) + { +- return ((vma->vm_flags & VM_PFNMAP_AT_MMAP) == VM_PFNMAP_AT_MMAP); ++ return (vma->vm_flags & VM_PFN_AT_MMAP); + } + + static inline int is_pfn_mapping(struct vm_area_struct *vma) +@@ -614,23 +604,39 @@ static __always_inline void *lowmem_page + #endif + + #if defined(WANT_PAGE_VIRTUAL) +-#define page_address(page) ((page)->virtual) +-#define set_page_address(page, address) \ +- do { \ +- (page)->virtual = (address); \ +- } while(0) +-#define page_address_init() do { } while(0) ++/* ++ * wrap page->virtual so it is safe to set/read locklessly ++ */ ++#define page_address(page) \ ++ ({ typeof((page)->virtual) v = (page)->virtual; \ ++ smp_read_barrier_depends(); \ ++ v; }) ++ ++static inline int set_page_address(struct page *page, void *address) ++{ ++ if (address) ++ return cmpxchg(&page->virtual, NULL, address) == NULL; ++ else { ++ /* ++ * cmpxchg is a bit abused because it is not guaranteed ++ * safe wrt direct assignment on all platforms. 
++ */ ++ void *virt = page->virtual; ++ return cmpxchg(&page->virtual, virt, NULL) == virt; ++ } ++} ++void page_address_init(void); + #endif + + #if defined(HASHED_PAGE_VIRTUAL) + void *page_address(struct page *page); +-void set_page_address(struct page *page, void *virtual); ++int set_page_address(struct page *page, void *virtual); + void page_address_init(void); + #endif + + #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) + #define page_address(page) lowmem_page_address(page) +-#define set_page_address(page, address) do { } while(0) ++#define set_page_address(page, address) (0) + #define page_address_init() do { } while(0) + #endif + +@@ -771,7 +777,7 @@ int zap_vma_ptes(struct vm_area_struct * + unsigned long size); + unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *); +-unsigned long unmap_vmas(struct mmu_gather **tlb, ++unsigned long unmap_vmas(struct mmu_gather *tlb, + struct vm_area_struct *start_vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted, + struct zap_details *); +@@ -951,27 +957,85 @@ static inline pmd_t *pmd_alloc(struct mm + * overflow into the next struct page (as it might with DEBUG_SPINLOCK). + * When freeing, reset page->mapping so free_pages_check won't complain. + */ ++#ifndef CONFIG_PREEMPT_RT ++ + #define __pte_lockptr(page) &((page)->ptl) +-#define pte_lock_init(_page) do { \ +- spin_lock_init(__pte_lockptr(_page)); \ +-} while (0) ++ ++static inline struct page *pte_lock_init(struct page *page) ++{ ++ spin_lock_init(__pte_lockptr(page)); ++ return page; ++} ++ + #define pte_lock_deinit(page) ((page)->mapping = NULL) ++ ++#else /* PREEMPT_RT */ ++ ++/* ++ * On PREEMPT_RT the spinlock_t's are too large to embed in the ++ * page frame, hence it only has a pointer and we need to dynamically ++ * allocate the lock when we allocate PTE-pages. ++ * ++ * This is an overall win, since only a small fraction of the pages ++ * will be PTE pages under normal circumstances. ++ */ ++ ++#define __pte_lockptr(page) ((page)->ptl) ++ ++/* ++ * Heinous hack, relies on the caller doing something like: ++ * ++ * pte = alloc_pages(PGALLOC_GFP, 0); ++ * if (pte) ++ * pgtable_page_ctor(pte); ++ * return pte; ++ * ++ * This ensures we release the page and return NULL when the ++ * lock allocation fails. ++ */ ++static inline struct page *pte_lock_init(struct page *page) ++{ ++ page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); ++ if (page->ptl) { ++ spin_lock_init(__pte_lockptr(page)); ++ } else { ++ __free_page(page); ++ page = NULL; ++ } ++ return page; ++} ++ ++static inline void pte_lock_deinit(struct page *page) ++{ ++ kfree(page->ptl); ++ page->mapping = NULL; ++} ++ ++#endif /* PREEMPT_RT */ ++ + #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) + #else /* !USE_SPLIT_PTLOCKS */ + /* + * We use mm->page_table_lock to guard all pagetable pages of the mm.
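Since the PREEMPT_RT pte_lock_init() above may free the page and return NULL, pgtable_page_ctor() is turned into a macro (in the hunk that follows) which can NULL out the caller's variable. A sketch of the arch-side caller shape that the "Heinous hack" comment assumes; pgtable_t and PGALLOC_GFP are arch-specific, and this is illustrative rather than taken from the patch:

	/* Sketch: arch pte_alloc_one() under the ctor-may-fail contract. */
	pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
	{
		struct page *pte;

		pte = alloc_pages(PGALLOC_GFP, 0);
		if (pte)
			pgtable_page_ctor(pte);	/* on -rt this may free pte and leave it NULL */
		return pte;
	}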
+ */ +-#define pte_lock_init(page) do {} while (0) ++static inline struct page *pte_lock_init(struct page *page) { return page; } + #define pte_lock_deinit(page) do {} while (0) + #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) + #endif /* USE_SPLIT_PTLOCKS */ + +-static inline void pgtable_page_ctor(struct page *page) ++static inline struct page *__pgtable_page_ctor(struct page *page) + { +- pte_lock_init(page); +- inc_zone_page_state(page, NR_PAGETABLE); ++ page = pte_lock_init(page); ++ if (page) ++ inc_zone_page_state(page, NR_PAGETABLE); ++ return page; + } + ++#define pgtable_page_ctor(page) \ ++do { \ ++ page = __pgtable_page_ctor(page); \ ++} while (0) ++ + static inline void pgtable_page_dtor(struct page *page) + { + pte_lock_deinit(page); +Index: linux-2.6-tip/include/linux/mm_types.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/mm_types.h ++++ linux-2.6-tip/include/linux/mm_types.h +@@ -68,7 +68,11 @@ struct page { + */ + }; + #if USE_SPLIT_PTLOCKS ++#ifndef CONFIG_PREEMPT_RT + spinlock_t ptl; ++#else ++ spinlock_t *ptl; ++#endif + #endif + struct kmem_cache *slab; /* SLUB: Pointer to slab */ + struct page *first_page; /* Compound tail pages */ +@@ -94,6 +98,14 @@ struct page { + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. highmem) */ + #endif /* WANT_PAGE_VIRTUAL */ ++ ++#ifdef CONFIG_KMEMCHECK ++ /* ++ * kmemcheck wants to track the status of each byte in a page; this ++ * is a pointer to such a status block. NULL if not tracked. ++ */ ++ void *shadow; ++#endif + }; + + /* +@@ -233,6 +245,9 @@ struct mm_struct { + /* Architecture-specific MM context */ + mm_context_t context; + ++ /* realtime bits */ ++ struct list_head delayed_drop; ++ + /* Swap token stuff */ + /* + * Last value of global fault stamp as seen by this process. +Index: linux-2.6-tip/include/linux/mmiotrace.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/mmiotrace.h ++++ linux-2.6-tip/include/linux/mmiotrace.h +@@ -1,5 +1,5 @@ +-#ifndef MMIOTRACE_H +-#define MMIOTRACE_H ++#ifndef _LINUX_MMIOTRACE_H ++#define _LINUX_MMIOTRACE_H + + #include + #include +@@ -13,28 +13,34 @@ typedef void (*kmmio_post_handler_t)(str + unsigned long condition, struct pt_regs *); + + struct kmmio_probe { +- struct list_head list; /* kmmio internal list */ +- unsigned long addr; /* start location of the probe point */ +- unsigned long len; /* length of the probe region */ +- kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ +- kmmio_post_handler_t post_handler; /* Called after addr is executed */ +- void *private; ++ /* kmmio internal list: */ ++ struct list_head list; ++ /* start location of the probe point: */ ++ unsigned long addr; ++ /* length of the probe region: */ ++ unsigned long len; ++ /* Called before addr is executed: */ ++ kmmio_pre_handler_t pre_handler; ++ /* Called after addr is executed: */ ++ kmmio_post_handler_t post_handler; ++ void *private; + }; + ++extern unsigned int kmmio_count; ++ ++extern int register_kmmio_probe(struct kmmio_probe *p); ++extern void unregister_kmmio_probe(struct kmmio_probe *p); ++ ++#ifdef CONFIG_MMIOTRACE + /* kmmio is active by some kmmio_probes? 
*/ + static inline int is_kmmio_active(void) + { +- extern unsigned int kmmio_count; + return kmmio_count; + } + +-extern int register_kmmio_probe(struct kmmio_probe *p); +-extern void unregister_kmmio_probe(struct kmmio_probe *p); +- + /* Called from page fault handler. */ + extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); + +-#ifdef CONFIG_MMIOTRACE + /* Called from ioremap.c */ + extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr); +@@ -43,7 +49,17 @@ extern void mmiotrace_iounmap(volatile v + /* For anyone to insert markers. Remember trailing newline. */ + extern int mmiotrace_printk(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); +-#else ++#else /* !CONFIG_MMIOTRACE: */ ++static inline int is_kmmio_active(void) ++{ ++ return 0; ++} ++ ++static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr) ++{ ++ return 0; ++} ++ + static inline void mmiotrace_ioremap(resource_size_t offset, + unsigned long size, void __iomem *addr) + { +@@ -63,28 +79,28 @@ static inline int mmiotrace_printk(const + #endif /* CONFIG_MMIOTRACE */ + + enum mm_io_opcode { +- MMIO_READ = 0x1, /* struct mmiotrace_rw */ +- MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ +- MMIO_PROBE = 0x3, /* struct mmiotrace_map */ +- MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ +- MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ ++ MMIO_READ = 0x1, /* struct mmiotrace_rw */ ++ MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ ++ MMIO_PROBE = 0x3, /* struct mmiotrace_map */ ++ MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ ++ MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ + }; + + struct mmiotrace_rw { +- resource_size_t phys; /* PCI address of register */ +- unsigned long value; +- unsigned long pc; /* optional program counter */ +- int map_id; +- unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ +- unsigned char width; /* size of register access in bytes */ ++ resource_size_t phys; /* PCI address of register */ ++ unsigned long value; ++ unsigned long pc; /* optional program counter */ ++ int map_id; ++ unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ ++ unsigned char width; /* size of register access in bytes */ + }; + + struct mmiotrace_map { +- resource_size_t phys; /* base address in PCI space */ +- unsigned long virt; /* base virtual address */ +- unsigned long len; /* mapping size */ +- int map_id; +- unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ ++ resource_size_t phys; /* base address in PCI space */ ++ unsigned long virt; /* base virtual address */ ++ unsigned long len; /* mapping size */ ++ int map_id; ++ unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ + }; + + /* in kernel/trace/trace_mmiotrace.c */ +@@ -94,4 +110,4 @@ extern void mmio_trace_rw(struct mmiotra + extern void mmio_trace_mapping(struct mmiotrace_map *map); + extern int mmio_trace_printk(const char *fmt, va_list args); + +-#endif /* MMIOTRACE_H */ ++#endif /* _LINUX_MMIOTRACE_H */ +Index: linux-2.6-tip/include/linux/mmzone.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/mmzone.h ++++ linux-2.6-tip/include/linux/mmzone.h +@@ -764,12 +764,6 @@ extern int numa_zonelist_order_handler(s + extern char numa_zonelist_order[]; + #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ + +-#include +-/* Returns the number of the current Node. 
*/ +-#ifndef numa_node_id +-#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) +-#endif +- + #ifndef CONFIG_NEED_MULTIPLE_NODES + + extern struct pglist_data contig_page_data; +Index: linux-2.6-tip/include/linux/module.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/module.h ++++ linux-2.6-tip/include/linux/module.h +@@ -78,18 +78,34 @@ void sort_extable(struct exception_table + struct exception_table_entry *finish); + void sort_main_extable(void); + ++/* ++ * Return a pointer to the current module, but only if within a module ++ */ + #ifdef MODULE +-#define MODULE_GENERIC_TABLE(gtype,name) \ +-extern const struct gtype##_id __mod_##gtype##_table \ +- __attribute__ ((unused, alias(__stringify(name)))) +- + extern struct module __this_module; + #define THIS_MODULE (&__this_module) + #else /* !MODULE */ +-#define MODULE_GENERIC_TABLE(gtype,name) + #define THIS_MODULE ((struct module *)0) + #endif + ++/* ++ * Declare a module table ++ * - this suppresses "'name' defined but not used" warnings from the compiler ++ * as the table may not actually be used by the code within the module ++ */ ++#ifdef MODULE ++#define MODULE_GENERIC_TABLE(gtype,name) \ ++extern const struct gtype##_id __mod_##gtype##_table \ ++ __attribute__ ((unused, alias(__stringify(name)))) ++#define MODULE_STATIC_GENERIC_TABLE(gtype,name) \ ++extern const struct gtype##_id __mod_##gtype##_table \ ++ __attribute__ ((unused, alias(__stringify(name)))) ++#else ++#define MODULE_GENERIC_TABLE(gtype,name) ++#define MODULE_STATIC_GENERIC_TABLE(gtype,name) \ ++static __typeof__((name)) name __attribute__((unused)); ++#endif ++ + /* Generic info of form tag = "info" */ + #define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info) + +@@ -139,6 +155,8 @@ extern struct module __this_module; + + #define MODULE_DEVICE_TABLE(type,name) \ + MODULE_GENERIC_TABLE(type##_device,name) ++#define MODULE_STATIC_DEVICE_TABLE(type,name) \ ++ MODULE_STATIC_GENERIC_TABLE(type##_device,name) + + /* Version of form [:][-]. + Or for CVS/RCS ID version, everything but the number is stripped. +@@ -329,6 +347,11 @@ struct module + unsigned int num_tracepoints; + #endif + ++#ifdef CONFIG_TRACING ++ const char **trace_bprintk_fmt_start; ++ unsigned int num_trace_bprintk_fmt; ++#endif ++ + #ifdef CONFIG_MODULE_UNLOAD + /* What modules depend on me? 
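The MODULE_STATIC_GENERIC_TABLE()/MODULE_STATIC_DEVICE_TABLE() variants added above are meant for ID tables that nothing else references directly, so a built-in (non-modular) build does not warn about an unused static table. A sketch of the intended use; the table name and PCI IDs are made up:

	/* Sketch: hypothetical ID table referenced only through the macro. */
	static const struct pci_device_id example_pci_ids[] = {
		{ PCI_DEVICE(0x1234, 0x5678) },	/* made-up vendor/device */
		{ }
	};
	MODULE_STATIC_DEVICE_TABLE(pci, example_pci_ids);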
*/ + struct list_head modules_which_use_me; +Index: linux-2.6-tip/include/linux/mroute6.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/mroute6.h ++++ linux-2.6-tip/include/linux/mroute6.h +@@ -65,7 +65,7 @@ struct mif6ctl { + mifi_t mif6c_mifi; /* Index of MIF */ + unsigned char mif6c_flags; /* MIFF_ flags */ + unsigned char vifc_threshold; /* ttl limit */ +- u_short mif6c_pifi; /* the index of the physical IF */ ++ __u16 mif6c_pifi; /* the index of the physical IF */ + unsigned int vifc_rate_limit; /* Rate limiter values (NI) */ + }; + +Index: linux-2.6-tip/include/linux/mutex.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/mutex.h ++++ linux-2.6-tip/include/linux/mutex.h +@@ -12,11 +12,83 @@ + + #include + #include ++#include + #include + #include + + #include + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ ++ , .dep_map = { .name = #lockname } ++#else ++# define __DEP_MAP_MUTEX_INITIALIZER(lockname) ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT ++ ++#include ++ ++struct mutex { ++ struct rt_mutex lock; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++ ++#define __MUTEX_INITIALIZER(mutexname) \ ++ { \ ++ .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ ++ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ ++ } ++ ++#define DEFINE_MUTEX(mutexname) \ ++ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) ++ ++extern void ++__mutex_init(struct mutex *lock, char *name, struct lock_class_key *key); ++ ++extern void __lockfunc _mutex_lock(struct mutex *lock); ++extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); ++extern int __lockfunc _mutex_lock_killable(struct mutex *lock); ++extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_trylock(struct mutex *lock); ++extern void __lockfunc _mutex_unlock(struct mutex *lock); ++ ++#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) ++#define mutex_lock(l) _mutex_lock(l) ++#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) ++#define mutex_lock_killable(l) _mutex_lock_killable(l) ++#define mutex_trylock(l) _mutex_trylock(l) ++#define mutex_unlock(l) _mutex_unlock(l) ++#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) ++# define mutex_lock_interruptible_nested(l, s) \ ++ _mutex_lock_interruptible_nested(l, s) ++# define mutex_lock_killable_nested(l, s) \ ++ _mutex_lock_killable_nested(l, s) ++#else ++# define mutex_lock_nested(l, s) _mutex_lock(l) ++# define mutex_lock_interruptible_nested(l, s) \ ++ _mutex_lock_interruptible(l) ++# define mutex_lock_killable_nested(l, s) \ ++ _mutex_lock_killable(l) ++#endif ++ ++# define mutex_init(mutex) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __mutex_init((mutex), #mutex, &__key); \ ++} while (0) ++ ++#else /* PREEMPT_RT */ ++ + /* + * Simple, straightforward mutexes with strict semantics: + * +@@ -50,8 +122,10 @@ struct mutex { + atomic_t count; + spinlock_t wait_lock; + struct list_head wait_list; +-#ifdef CONFIG_DEBUG_MUTEXES ++#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) + struct thread_info *owner; ++#endif ++#ifdef CONFIG_DEBUG_MUTEXES + 
const char *name; + void *magic; + #endif +@@ -68,7 +142,6 @@ struct mutex_waiter { + struct list_head list; + struct task_struct *task; + #ifdef CONFIG_DEBUG_MUTEXES +- struct mutex *lock; + void *magic; + #endif + }; +@@ -86,13 +159,6 @@ do { \ + # define mutex_destroy(mutex) do { } while (0) + #endif + +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ +- , .dep_map = { .name = #lockname } +-#else +-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +-#endif +- + #define __MUTEX_INITIALIZER(lockname) \ + { .count = ATOMIC_INIT(1) \ + , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ +@@ -150,4 +216,29 @@ extern int __must_check mutex_lock_killa + extern int mutex_trylock(struct mutex *lock); + extern void mutex_unlock(struct mutex *lock); + ++#endif /* !PREEMPT_RT */ ++ ++/** ++ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 ++ * @cnt: the atomic which we are to dec ++ * @lock: the mutex to return holding if we dec to 0 ++ * ++ * return true and hold lock if we dec to 0, return false otherwise ++ */ ++static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) ++{ ++ /* dec if we can't possibly hit 0 */ ++ if (atomic_add_unless(cnt, -1, 1)) ++ return 0; ++ /* we might hit 0, so take the lock */ ++ mutex_lock(lock); ++ if (!atomic_dec_and_test(cnt)) { ++ /* when we actually did the dec, we didn't hit 0 */ ++ mutex_unlock(lock); ++ return 0; ++ } ++ /* we hit 0, and we hold the lock */ ++ return 1; ++} ++ + #endif +Index: linux-2.6-tip/include/linux/netfilter/nf_conntrack_tcp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/nf_conntrack_tcp.h ++++ linux-2.6-tip/include/linux/netfilter/nf_conntrack_tcp.h +@@ -2,6 +2,8 @@ + #define _NF_CONNTRACK_TCP_H + /* TCP tracking. */ + ++#include ++ + /* This is exposed to userspace (ctnetlink) */ + enum tcp_conntrack { + TCP_CONNTRACK_NONE, +@@ -34,8 +36,8 @@ enum tcp_conntrack { + #define IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED 0x10 + + struct nf_ct_tcp_flags { +- u_int8_t flags; +- u_int8_t mask; ++ __u8 flags; ++ __u8 mask; + }; + + #ifdef __KERNEL__ +Index: linux-2.6-tip/include/linux/netfilter/nfnetlink.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/nfnetlink.h ++++ linux-2.6-tip/include/linux/netfilter/nfnetlink.h +@@ -25,8 +25,8 @@ enum nfnetlink_groups { + /* General form of address family dependent message. 
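The atomic_dec_and_mutex_lock() helper added to mutex.h above only takes the mutex when the count may actually reach zero, and returns with the lock held in that case. A short usage sketch; struct widget, widget_list_lock and the field names are made up:

	/* Sketch: tear an object down only on the final reference drop. */
	static void put_widget(struct widget *w)
	{
		if (!atomic_dec_and_mutex_lock(&w->refcnt, &widget_list_lock))
			return;		/* not the last reference */

		/* the count hit zero and widget_list_lock is held */
		list_del(&w->node);
		mutex_unlock(&widget_list_lock);
		kfree(w);
	}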
+ */ + struct nfgenmsg { +- u_int8_t nfgen_family; /* AF_xxx */ +- u_int8_t version; /* nfnetlink version */ ++ __u8 nfgen_family; /* AF_xxx */ ++ __u8 version; /* nfnetlink version */ + __be16 res_id; /* resource id */ + }; + +Index: linux-2.6-tip/include/linux/netfilter/nfnetlink_compat.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/nfnetlink_compat.h ++++ linux-2.6-tip/include/linux/netfilter/nfnetlink_compat.h +@@ -1,5 +1,8 @@ + #ifndef _NFNETLINK_COMPAT_H + #define _NFNETLINK_COMPAT_H ++ ++#include ++ + #ifndef __KERNEL__ + /* Old nfnetlink macros for userspace */ + +@@ -20,8 +23,8 @@ + + struct nfattr + { +- u_int16_t nfa_len; +- u_int16_t nfa_type; /* we use 15 bits for the type, and the highest ++ __u16 nfa_len; ++ __u16 nfa_type; /* we use 15 bits for the type, and the highest + * bit to indicate whether the payload is nested */ + }; + +Index: linux-2.6-tip/include/linux/netfilter/nfnetlink_log.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/nfnetlink_log.h ++++ linux-2.6-tip/include/linux/netfilter/nfnetlink_log.h +@@ -17,14 +17,14 @@ enum nfulnl_msg_types { + + struct nfulnl_msg_packet_hdr { + __be16 hw_protocol; /* hw protocol (network order) */ +- u_int8_t hook; /* netfilter hook */ +- u_int8_t _pad; ++ __u8 hook; /* netfilter hook */ ++ __u8 _pad; + }; + + struct nfulnl_msg_packet_hw { + __be16 hw_addrlen; +- u_int16_t _pad; +- u_int8_t hw_addr[8]; ++ __u16 _pad; ++ __u8 hw_addr[8]; + }; + + struct nfulnl_msg_packet_timestamp { +@@ -35,12 +35,12 @@ struct nfulnl_msg_packet_timestamp { + enum nfulnl_attr_type { + NFULA_UNSPEC, + NFULA_PACKET_HDR, +- NFULA_MARK, /* u_int32_t nfmark */ ++ NFULA_MARK, /* __u32 nfmark */ + NFULA_TIMESTAMP, /* nfulnl_msg_packet_timestamp */ +- NFULA_IFINDEX_INDEV, /* u_int32_t ifindex */ +- NFULA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ +- NFULA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ +- NFULA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ ++ NFULA_IFINDEX_INDEV, /* __u32 ifindex */ ++ NFULA_IFINDEX_OUTDEV, /* __u32 ifindex */ ++ NFULA_IFINDEX_PHYSINDEV, /* __u32 ifindex */ ++ NFULA_IFINDEX_PHYSOUTDEV, /* __u32 ifindex */ + NFULA_HWADDR, /* nfulnl_msg_packet_hw */ + NFULA_PAYLOAD, /* opaque data payload */ + NFULA_PREFIX, /* string prefix */ +@@ -65,23 +65,23 @@ enum nfulnl_msg_config_cmds { + }; + + struct nfulnl_msg_config_cmd { +- u_int8_t command; /* nfulnl_msg_config_cmds */ ++ __u8 command; /* nfulnl_msg_config_cmds */ + } __attribute__ ((packed)); + + struct nfulnl_msg_config_mode { + __be32 copy_range; +- u_int8_t copy_mode; +- u_int8_t _pad; ++ __u8 copy_mode; ++ __u8 _pad; + } __attribute__ ((packed)); + + enum nfulnl_attr_config { + NFULA_CFG_UNSPEC, + NFULA_CFG_CMD, /* nfulnl_msg_config_cmd */ + NFULA_CFG_MODE, /* nfulnl_msg_config_mode */ +- NFULA_CFG_NLBUFSIZ, /* u_int32_t buffer size */ +- NFULA_CFG_TIMEOUT, /* u_int32_t in 1/100 s */ +- NFULA_CFG_QTHRESH, /* u_int32_t */ +- NFULA_CFG_FLAGS, /* u_int16_t */ ++ NFULA_CFG_NLBUFSIZ, /* __u32 buffer size */ ++ NFULA_CFG_TIMEOUT, /* __u32 in 1/100 s */ ++ NFULA_CFG_QTHRESH, /* __u32 */ ++ NFULA_CFG_FLAGS, /* __u16 */ + __NFULA_CFG_MAX + }; + #define NFULA_CFG_MAX (__NFULA_CFG_MAX -1) +Index: linux-2.6-tip/include/linux/netfilter/nfnetlink_queue.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/nfnetlink_queue.h ++++ linux-2.6-tip/include/linux/netfilter/nfnetlink_queue.h +@@ -15,13 
+15,13 @@ enum nfqnl_msg_types { + struct nfqnl_msg_packet_hdr { + __be32 packet_id; /* unique ID of packet in queue */ + __be16 hw_protocol; /* hw protocol (network order) */ +- u_int8_t hook; /* netfilter hook */ ++ __u8 hook; /* netfilter hook */ + } __attribute__ ((packed)); + + struct nfqnl_msg_packet_hw { + __be16 hw_addrlen; +- u_int16_t _pad; +- u_int8_t hw_addr[8]; ++ __u16 _pad; ++ __u8 hw_addr[8]; + }; + + struct nfqnl_msg_packet_timestamp { +@@ -33,12 +33,12 @@ enum nfqnl_attr_type { + NFQA_UNSPEC, + NFQA_PACKET_HDR, + NFQA_VERDICT_HDR, /* nfqnl_msg_verdict_hrd */ +- NFQA_MARK, /* u_int32_t nfmark */ ++ NFQA_MARK, /* __u32 nfmark */ + NFQA_TIMESTAMP, /* nfqnl_msg_packet_timestamp */ +- NFQA_IFINDEX_INDEV, /* u_int32_t ifindex */ +- NFQA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ +- NFQA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ +- NFQA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ ++ NFQA_IFINDEX_INDEV, /* __u32 ifindex */ ++ NFQA_IFINDEX_OUTDEV, /* __u32 ifindex */ ++ NFQA_IFINDEX_PHYSINDEV, /* __u32 ifindex */ ++ NFQA_IFINDEX_PHYSOUTDEV, /* __u32 ifindex */ + NFQA_HWADDR, /* nfqnl_msg_packet_hw */ + NFQA_PAYLOAD, /* opaque data payload */ + +@@ -61,8 +61,8 @@ enum nfqnl_msg_config_cmds { + }; + + struct nfqnl_msg_config_cmd { +- u_int8_t command; /* nfqnl_msg_config_cmds */ +- u_int8_t _pad; ++ __u8 command; /* nfqnl_msg_config_cmds */ ++ __u8 _pad; + __be16 pf; /* AF_xxx for PF_[UN]BIND */ + }; + +@@ -74,7 +74,7 @@ enum nfqnl_config_mode { + + struct nfqnl_msg_config_params { + __be32 copy_range; +- u_int8_t copy_mode; /* enum nfqnl_config_mode */ ++ __u8 copy_mode; /* enum nfqnl_config_mode */ + } __attribute__ ((packed)); + + +@@ -82,7 +82,7 @@ enum nfqnl_attr_config { + NFQA_CFG_UNSPEC, + NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */ + NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ +- NFQA_CFG_QUEUE_MAXLEN, /* u_int32_t */ ++ NFQA_CFG_QUEUE_MAXLEN, /* __u32 */ + __NFQA_CFG_MAX + }; + #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) +Index: linux-2.6-tip/include/linux/netfilter/x_tables.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/x_tables.h ++++ linux-2.6-tip/include/linux/netfilter/x_tables.h +@@ -1,6 +1,8 @@ + #ifndef _X_TABLES_H + #define _X_TABLES_H + ++#include ++ + #define XT_FUNCTION_MAXNAMELEN 30 + #define XT_TABLE_MAXNAMELEN 32 + +@@ -8,22 +10,22 @@ struct xt_entry_match + { + union { + struct { +- u_int16_t match_size; ++ __u16 match_size; + + /* Used by userspace */ + char name[XT_FUNCTION_MAXNAMELEN-1]; + +- u_int8_t revision; ++ __u8 revision; + } user; + struct { +- u_int16_t match_size; ++ __u16 match_size; + + /* Used inside the kernel */ + struct xt_match *match; + } kernel; + + /* Total length */ +- u_int16_t match_size; ++ __u16 match_size; + } u; + + unsigned char data[0]; +@@ -33,22 +35,22 @@ struct xt_entry_target + { + union { + struct { +- u_int16_t target_size; ++ __u16 target_size; + + /* Used by userspace */ + char name[XT_FUNCTION_MAXNAMELEN-1]; + +- u_int8_t revision; ++ __u8 revision; + } user; + struct { +- u_int16_t target_size; ++ __u16 target_size; + + /* Used inside the kernel */ + struct xt_target *target; + } kernel; + + /* Total length */ +- u_int16_t target_size; ++ __u16 target_size; + } u; + + unsigned char data[0]; +@@ -74,7 +76,7 @@ struct xt_get_revision + { + char name[XT_FUNCTION_MAXNAMELEN-1]; + +- u_int8_t revision; ++ __u8 revision; + }; + + /* CONTINUE verdict for targets */ +@@ -90,10 +92,10 @@ struct xt_get_revision + */ + struct _xt_align + { +- u_int8_t u8; +- 
u_int16_t u16; +- u_int32_t u32; +- u_int64_t u64; ++ __u8 u8; ++ __u16 u16; ++ __u32 u32; ++ __u64 u64; + }; + + #define XT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) \ +@@ -109,7 +111,7 @@ struct _xt_align + + struct xt_counters + { +- u_int64_t pcnt, bcnt; /* Packet and byte counters */ ++ __u64 pcnt, bcnt; /* Packet and byte counters */ + }; + + /* The argument to IPT_SO_ADD_COUNTERS. */ +Index: linux-2.6-tip/include/linux/netfilter/xt_CLASSIFY.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_CLASSIFY.h ++++ linux-2.6-tip/include/linux/netfilter/xt_CLASSIFY.h +@@ -1,8 +1,10 @@ + #ifndef _XT_CLASSIFY_H + #define _XT_CLASSIFY_H + ++#include ++ + struct xt_classify_target_info { +- u_int32_t priority; ++ __u32 priority; + }; + + #endif /*_XT_CLASSIFY_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_CONNMARK.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_CONNMARK.h ++++ linux-2.6-tip/include/linux/netfilter/xt_CONNMARK.h +@@ -1,6 +1,8 @@ + #ifndef _XT_CONNMARK_H_target + #define _XT_CONNMARK_H_target + ++#include ++ + /* Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * +@@ -19,12 +21,12 @@ enum { + struct xt_connmark_target_info { + unsigned long mark; + unsigned long mask; +- u_int8_t mode; ++ __u8 mode; + }; + + struct xt_connmark_tginfo1 { +- u_int32_t ctmark, ctmask, nfmask; +- u_int8_t mode; ++ __u32 ctmark, ctmask, nfmask; ++ __u8 mode; + }; + + #endif /*_XT_CONNMARK_H_target*/ +Index: linux-2.6-tip/include/linux/netfilter/xt_CONNSECMARK.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_CONNSECMARK.h ++++ linux-2.6-tip/include/linux/netfilter/xt_CONNSECMARK.h +@@ -1,13 +1,15 @@ + #ifndef _XT_CONNSECMARK_H_target + #define _XT_CONNSECMARK_H_target + ++#include ++ + enum { + CONNSECMARK_SAVE = 1, + CONNSECMARK_RESTORE, + }; + + struct xt_connsecmark_target_info { +- u_int8_t mode; ++ __u8 mode; + }; + + #endif /*_XT_CONNSECMARK_H_target */ +Index: linux-2.6-tip/include/linux/netfilter/xt_DSCP.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_DSCP.h ++++ linux-2.6-tip/include/linux/netfilter/xt_DSCP.h +@@ -11,15 +11,16 @@ + #ifndef _XT_DSCP_TARGET_H + #define _XT_DSCP_TARGET_H + #include ++#include + + /* target info */ + struct xt_DSCP_info { +- u_int8_t dscp; ++ __u8 dscp; + }; + + struct xt_tos_target_info { +- u_int8_t tos_value; +- u_int8_t tos_mask; ++ __u8 tos_value; ++ __u8 tos_mask; + }; + + #endif /* _XT_DSCP_TARGET_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_MARK.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_MARK.h ++++ linux-2.6-tip/include/linux/netfilter/xt_MARK.h +@@ -1,6 +1,8 @@ + #ifndef _XT_MARK_H_target + #define _XT_MARK_H_target + ++#include ++ + /* Version 0 */ + struct xt_mark_target_info { + unsigned long mark; +@@ -15,11 +17,11 @@ enum { + + struct xt_mark_target_info_v1 { + unsigned long mark; +- u_int8_t mode; ++ __u8 mode; + }; + + struct xt_mark_tginfo2 { +- u_int32_t mark, mask; ++ __u32 mark, mask; + }; + + #endif /*_XT_MARK_H_target */ +Index: linux-2.6-tip/include/linux/netfilter/xt_NFLOG.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_NFLOG.h ++++ 
linux-2.6-tip/include/linux/netfilter/xt_NFLOG.h +@@ -1,17 +1,19 @@ + #ifndef _XT_NFLOG_TARGET + #define _XT_NFLOG_TARGET + ++#include ++ + #define XT_NFLOG_DEFAULT_GROUP 0x1 + #define XT_NFLOG_DEFAULT_THRESHOLD 0 + + #define XT_NFLOG_MASK 0x0 + + struct xt_nflog_info { +- u_int32_t len; +- u_int16_t group; +- u_int16_t threshold; +- u_int16_t flags; +- u_int16_t pad; ++ __u32 len; ++ __u16 group; ++ __u16 threshold; ++ __u16 flags; ++ __u16 pad; + char prefix[64]; + }; + +Index: linux-2.6-tip/include/linux/netfilter/xt_NFQUEUE.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_NFQUEUE.h ++++ linux-2.6-tip/include/linux/netfilter/xt_NFQUEUE.h +@@ -8,9 +8,11 @@ + #ifndef _XT_NFQ_TARGET_H + #define _XT_NFQ_TARGET_H + ++#include ++ + /* target info */ + struct xt_NFQ_info { +- u_int16_t queuenum; ++ __u16 queuenum; + }; + + #endif /* _XT_NFQ_TARGET_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_RATEEST.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_RATEEST.h ++++ linux-2.6-tip/include/linux/netfilter/xt_RATEEST.h +@@ -1,10 +1,12 @@ + #ifndef _XT_RATEEST_TARGET_H + #define _XT_RATEEST_TARGET_H + ++#include ++ + struct xt_rateest_target_info { + char name[IFNAMSIZ]; +- int8_t interval; +- u_int8_t ewma_log; ++ __s8 interval; ++ __u8 ewma_log; + + /* Used internally by the kernel */ + struct xt_rateest *est __attribute__((aligned(8))); +Index: linux-2.6-tip/include/linux/netfilter/xt_SECMARK.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_SECMARK.h ++++ linux-2.6-tip/include/linux/netfilter/xt_SECMARK.h +@@ -1,6 +1,8 @@ + #ifndef _XT_SECMARK_H_target + #define _XT_SECMARK_H_target + ++#include ++ + /* + * This is intended for use by various security subsystems (but not + * at the same time). 
+@@ -12,12 +14,12 @@ + #define SECMARK_SELCTX_MAX 256 + + struct xt_secmark_target_selinux_info { +- u_int32_t selsid; ++ __u32 selsid; + char selctx[SECMARK_SELCTX_MAX]; + }; + + struct xt_secmark_target_info { +- u_int8_t mode; ++ __u8 mode; + union { + struct xt_secmark_target_selinux_info sel; + } u; +Index: linux-2.6-tip/include/linux/netfilter/xt_TCPMSS.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_TCPMSS.h ++++ linux-2.6-tip/include/linux/netfilter/xt_TCPMSS.h +@@ -1,8 +1,10 @@ + #ifndef _XT_TCPMSS_H + #define _XT_TCPMSS_H + ++#include ++ + struct xt_tcpmss_info { +- u_int16_t mss; ++ __u16 mss; + }; + + #define XT_TCPMSS_CLAMP_PMTU 0xffff +Index: linux-2.6-tip/include/linux/netfilter/xt_connbytes.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_connbytes.h ++++ linux-2.6-tip/include/linux/netfilter/xt_connbytes.h +@@ -1,6 +1,8 @@ + #ifndef _XT_CONNBYTES_H + #define _XT_CONNBYTES_H + ++#include ++ + enum xt_connbytes_what { + XT_CONNBYTES_PKTS, + XT_CONNBYTES_BYTES, +@@ -19,7 +21,7 @@ struct xt_connbytes_info + aligned_u64 from; /* count to be matched */ + aligned_u64 to; /* count to be matched */ + } count; +- u_int8_t what; /* ipt_connbytes_what */ +- u_int8_t direction; /* ipt_connbytes_direction */ ++ __u8 what; /* ipt_connbytes_what */ ++ __u8 direction; /* ipt_connbytes_direction */ + }; + #endif +Index: linux-2.6-tip/include/linux/netfilter/xt_connmark.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_connmark.h ++++ linux-2.6-tip/include/linux/netfilter/xt_connmark.h +@@ -1,6 +1,8 @@ + #ifndef _XT_CONNMARK_H + #define _XT_CONNMARK_H + ++#include ++ + /* Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * +@@ -12,12 +14,12 @@ + + struct xt_connmark_info { + unsigned long mark, mask; +- u_int8_t invert; ++ __u8 invert; + }; + + struct xt_connmark_mtinfo1 { +- u_int32_t mark, mask; +- u_int8_t invert; ++ __u32 mark, mask; ++ __u8 invert; + }; + + #endif /*_XT_CONNMARK_H*/ +Index: linux-2.6-tip/include/linux/netfilter/xt_conntrack.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_conntrack.h ++++ linux-2.6-tip/include/linux/netfilter/xt_conntrack.h +@@ -63,9 +63,9 @@ struct xt_conntrack_info + unsigned long expires_min, expires_max; + + /* Flags word */ +- u_int8_t flags; ++ __u8 flags; + /* Inverse flags */ +- u_int8_t invflags; ++ __u8 invflags; + }; + + struct xt_conntrack_mtinfo1 { +@@ -73,12 +73,12 @@ struct xt_conntrack_mtinfo1 { + union nf_inet_addr origdst_addr, origdst_mask; + union nf_inet_addr replsrc_addr, replsrc_mask; + union nf_inet_addr repldst_addr, repldst_mask; +- u_int32_t expires_min, expires_max; +- u_int16_t l4proto; ++ __u32 expires_min, expires_max; ++ __u16 l4proto; + __be16 origsrc_port, origdst_port; + __be16 replsrc_port, repldst_port; +- u_int16_t match_flags, invert_flags; +- u_int8_t state_mask, status_mask; ++ __u16 match_flags, invert_flags; ++ __u8 state_mask, status_mask; + }; + + #endif /*_XT_CONNTRACK_H*/ +Index: linux-2.6-tip/include/linux/netfilter/xt_dccp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_dccp.h ++++ linux-2.6-tip/include/linux/netfilter/xt_dccp.h +@@ -1,6 +1,8 @@ + #ifndef _XT_DCCP_H_ + #define _XT_DCCP_H_ + ++#include ++ + #define 
XT_DCCP_SRC_PORTS 0x01 + #define XT_DCCP_DEST_PORTS 0x02 + #define XT_DCCP_TYPE 0x04 +@@ -9,14 +11,14 @@ + #define XT_DCCP_VALID_FLAGS 0x0f + + struct xt_dccp_info { +- u_int16_t dpts[2]; /* Min, Max */ +- u_int16_t spts[2]; /* Min, Max */ ++ __u16 dpts[2]; /* Min, Max */ ++ __u16 spts[2]; /* Min, Max */ + +- u_int16_t flags; +- u_int16_t invflags; ++ __u16 flags; ++ __u16 invflags; + +- u_int16_t typemask; +- u_int8_t option; ++ __u16 typemask; ++ __u8 option; + }; + + #endif /* _XT_DCCP_H_ */ +Index: linux-2.6-tip/include/linux/netfilter/xt_dscp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_dscp.h ++++ linux-2.6-tip/include/linux/netfilter/xt_dscp.h +@@ -10,20 +10,22 @@ + #ifndef _XT_DSCP_H + #define _XT_DSCP_H + ++#include ++ + #define XT_DSCP_MASK 0xfc /* 11111100 */ + #define XT_DSCP_SHIFT 2 + #define XT_DSCP_MAX 0x3f /* 00111111 */ + + /* match info */ + struct xt_dscp_info { +- u_int8_t dscp; +- u_int8_t invert; ++ __u8 dscp; ++ __u8 invert; + }; + + struct xt_tos_match_info { +- u_int8_t tos_mask; +- u_int8_t tos_value; +- u_int8_t invert; ++ __u8 tos_mask; ++ __u8 tos_value; ++ __u8 invert; + }; + + #endif /* _XT_DSCP_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_esp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_esp.h ++++ linux-2.6-tip/include/linux/netfilter/xt_esp.h +@@ -1,10 +1,12 @@ + #ifndef _XT_ESP_H + #define _XT_ESP_H + ++#include ++ + struct xt_esp + { +- u_int32_t spis[2]; /* Security Parameter Index */ +- u_int8_t invflags; /* Inverse flags */ ++ __u32 spis[2]; /* Security Parameter Index */ ++ __u8 invflags; /* Inverse flags */ + }; + + /* Values for "invflags" field in struct xt_esp. */ +Index: linux-2.6-tip/include/linux/netfilter/xt_hashlimit.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_hashlimit.h ++++ linux-2.6-tip/include/linux/netfilter/xt_hashlimit.h +@@ -1,6 +1,8 @@ + #ifndef _XT_HASHLIMIT_H + #define _XT_HASHLIMIT_H + ++#include ++ + /* timings are in milliseconds. */ + #define XT_HASHLIMIT_SCALE 10000 + /* 1/10,000 sec period => max of 10,000/sec. Min rate is then 429490 +@@ -18,15 +20,15 @@ enum { + }; + + struct hashlimit_cfg { +- u_int32_t mode; /* bitmask of XT_HASHLIMIT_HASH_* */ +- u_int32_t avg; /* Average secs between packets * scale */ +- u_int32_t burst; /* Period multiplier for upper limit. */ ++ __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ ++ __u32 avg; /* Average secs between packets * scale */ ++ __u32 burst; /* Period multiplier for upper limit. */ + + /* user specified */ +- u_int32_t size; /* how many buckets */ +- u_int32_t max; /* max number of entries */ +- u_int32_t gc_interval; /* gc interval */ +- u_int32_t expire; /* when do entries expire? */ ++ __u32 size; /* how many buckets */ ++ __u32 max; /* max number of entries */ ++ __u32 gc_interval; /* gc interval */ ++ __u32 expire; /* when do entries expire? */ + }; + + struct xt_hashlimit_info { +@@ -42,17 +44,17 @@ struct xt_hashlimit_info { + }; + + struct hashlimit_cfg1 { +- u_int32_t mode; /* bitmask of XT_HASHLIMIT_HASH_* */ +- u_int32_t avg; /* Average secs between packets * scale */ +- u_int32_t burst; /* Period multiplier for upper limit. */ ++ __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ ++ __u32 avg; /* Average secs between packets * scale */ ++ __u32 burst; /* Period multiplier for upper limit. 
*/ + + /* user specified */ +- u_int32_t size; /* how many buckets */ +- u_int32_t max; /* max number of entries */ +- u_int32_t gc_interval; /* gc interval */ +- u_int32_t expire; /* when do entries expire? */ ++ __u32 size; /* how many buckets */ ++ __u32 max; /* max number of entries */ ++ __u32 gc_interval; /* gc interval */ ++ __u32 expire; /* when do entries expire? */ + +- u_int8_t srcmask, dstmask; ++ __u8 srcmask, dstmask; + }; + + struct xt_hashlimit_mtinfo1 { +Index: linux-2.6-tip/include/linux/netfilter/xt_iprange.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_iprange.h ++++ linux-2.6-tip/include/linux/netfilter/xt_iprange.h +@@ -1,6 +1,8 @@ + #ifndef _LINUX_NETFILTER_XT_IPRANGE_H + #define _LINUX_NETFILTER_XT_IPRANGE_H 1 + ++#include ++ + enum { + IPRANGE_SRC = 1 << 0, /* match source IP address */ + IPRANGE_DST = 1 << 1, /* match destination IP address */ +@@ -11,7 +13,7 @@ enum { + struct xt_iprange_mtinfo { + union nf_inet_addr src_min, src_max; + union nf_inet_addr dst_min, dst_max; +- u_int8_t flags; ++ __u8 flags; + }; + + #endif /* _LINUX_NETFILTER_XT_IPRANGE_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_length.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_length.h ++++ linux-2.6-tip/include/linux/netfilter/xt_length.h +@@ -1,9 +1,11 @@ + #ifndef _XT_LENGTH_H + #define _XT_LENGTH_H + ++#include ++ + struct xt_length_info { +- u_int16_t min, max; +- u_int8_t invert; ++ __u16 min, max; ++ __u8 invert; + }; + + #endif /*_XT_LENGTH_H*/ +Index: linux-2.6-tip/include/linux/netfilter/xt_limit.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_limit.h ++++ linux-2.6-tip/include/linux/netfilter/xt_limit.h +@@ -1,19 +1,21 @@ + #ifndef _XT_RATE_H + #define _XT_RATE_H + ++#include ++ + /* timings are in milliseconds. */ + #define XT_LIMIT_SCALE 10000 + + /* 1/10,000 sec period => max of 10,000/sec. Min rate is then 429490 + seconds, or one every 59 hours. */ + struct xt_rateinfo { +- u_int32_t avg; /* Average secs between packets * scale */ +- u_int32_t burst; /* Period multiplier for upper limit. */ ++ __u32 avg; /* Average secs between packets * scale */ ++ __u32 burst; /* Period multiplier for upper limit. */ + + /* Used internally by the kernel */ + unsigned long prev; +- u_int32_t credit; +- u_int32_t credit_cap, cost; ++ __u32 credit; ++ __u32 credit_cap, cost; + + /* Ugly, ugly fucker. 
*/ + struct xt_rateinfo *master; +Index: linux-2.6-tip/include/linux/netfilter/xt_mark.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_mark.h ++++ linux-2.6-tip/include/linux/netfilter/xt_mark.h +@@ -1,14 +1,16 @@ + #ifndef _XT_MARK_H + #define _XT_MARK_H + ++#include ++ + struct xt_mark_info { + unsigned long mark, mask; +- u_int8_t invert; ++ __u8 invert; + }; + + struct xt_mark_mtinfo1 { +- u_int32_t mark, mask; +- u_int8_t invert; ++ __u32 mark, mask; ++ __u8 invert; + }; + + #endif /*_XT_MARK_H*/ +Index: linux-2.6-tip/include/linux/netfilter/xt_multiport.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_multiport.h ++++ linux-2.6-tip/include/linux/netfilter/xt_multiport.h +@@ -1,6 +1,8 @@ + #ifndef _XT_MULTIPORT_H + #define _XT_MULTIPORT_H + ++#include ++ + enum xt_multiport_flags + { + XT_MULTIPORT_SOURCE, +@@ -13,18 +15,18 @@ enum xt_multiport_flags + /* Must fit inside union xt_matchinfo: 16 bytes */ + struct xt_multiport + { +- u_int8_t flags; /* Type of comparison */ +- u_int8_t count; /* Number of ports */ +- u_int16_t ports[XT_MULTI_PORTS]; /* Ports */ ++ __u8 flags; /* Type of comparison */ ++ __u8 count; /* Number of ports */ ++ __u16 ports[XT_MULTI_PORTS]; /* Ports */ + }; + + struct xt_multiport_v1 + { +- u_int8_t flags; /* Type of comparison */ +- u_int8_t count; /* Number of ports */ +- u_int16_t ports[XT_MULTI_PORTS]; /* Ports */ +- u_int8_t pflags[XT_MULTI_PORTS]; /* Port flags */ +- u_int8_t invert; /* Invert flag */ ++ __u8 flags; /* Type of comparison */ ++ __u8 count; /* Number of ports */ ++ __u16 ports[XT_MULTI_PORTS]; /* Ports */ ++ __u8 pflags[XT_MULTI_PORTS]; /* Port flags */ ++ __u8 invert; /* Invert flag */ + }; + + #endif /*_XT_MULTIPORT_H*/ +Index: linux-2.6-tip/include/linux/netfilter/xt_owner.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_owner.h ++++ linux-2.6-tip/include/linux/netfilter/xt_owner.h +@@ -1,6 +1,8 @@ + #ifndef _XT_OWNER_MATCH_H + #define _XT_OWNER_MATCH_H + ++#include ++ + enum { + XT_OWNER_UID = 1 << 0, + XT_OWNER_GID = 1 << 1, +@@ -8,9 +10,9 @@ enum { + }; + + struct xt_owner_match_info { +- u_int32_t uid_min, uid_max; +- u_int32_t gid_min, gid_max; +- u_int8_t match, invert; ++ __u32 uid_min, uid_max; ++ __u32 gid_min, gid_max; ++ __u8 match, invert; + }; + + #endif /* _XT_OWNER_MATCH_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_physdev.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_physdev.h ++++ linux-2.6-tip/include/linux/netfilter/xt_physdev.h +@@ -1,6 +1,8 @@ + #ifndef _XT_PHYSDEV_H + #define _XT_PHYSDEV_H + ++#include ++ + #ifdef __KERNEL__ + #include + #endif +@@ -17,8 +19,8 @@ struct xt_physdev_info { + char in_mask[IFNAMSIZ]; + char physoutdev[IFNAMSIZ]; + char out_mask[IFNAMSIZ]; +- u_int8_t invert; +- u_int8_t bitmask; ++ __u8 invert; ++ __u8 bitmask; + }; + + #endif /*_XT_PHYSDEV_H*/ +Index: linux-2.6-tip/include/linux/netfilter/xt_policy.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_policy.h ++++ linux-2.6-tip/include/linux/netfilter/xt_policy.h +@@ -1,6 +1,8 @@ + #ifndef _XT_POLICY_H + #define _XT_POLICY_H + ++#include ++ + #define XT_POLICY_MAX_ELEM 4 + + enum xt_policy_flags +@@ -19,7 +21,7 @@ enum xt_policy_modes + + struct xt_policy_spec + { +- 
u_int8_t saddr:1, ++ __u8 saddr:1, + daddr:1, + proto:1, + mode:1, +@@ -55,9 +57,9 @@ struct xt_policy_elem + #endif + }; + __be32 spi; +- u_int32_t reqid; +- u_int8_t proto; +- u_int8_t mode; ++ __u32 reqid; ++ __u8 proto; ++ __u8 mode; + + struct xt_policy_spec match; + struct xt_policy_spec invert; +@@ -66,8 +68,8 @@ struct xt_policy_elem + struct xt_policy_info + { + struct xt_policy_elem pol[XT_POLICY_MAX_ELEM]; +- u_int16_t flags; +- u_int16_t len; ++ __u16 flags; ++ __u16 len; + }; + + #endif /* _XT_POLICY_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_rateest.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_rateest.h ++++ linux-2.6-tip/include/linux/netfilter/xt_rateest.h +@@ -1,6 +1,8 @@ + #ifndef _XT_RATEEST_MATCH_H + #define _XT_RATEEST_MATCH_H + ++#include ++ + enum xt_rateest_match_flags { + XT_RATEEST_MATCH_INVERT = 1<<0, + XT_RATEEST_MATCH_ABS = 1<<1, +@@ -20,12 +22,12 @@ enum xt_rateest_match_mode { + struct xt_rateest_match_info { + char name1[IFNAMSIZ]; + char name2[IFNAMSIZ]; +- u_int16_t flags; +- u_int16_t mode; +- u_int32_t bps1; +- u_int32_t pps1; +- u_int32_t bps2; +- u_int32_t pps2; ++ __u16 flags; ++ __u16 mode; ++ __u32 bps1; ++ __u32 pps1; ++ __u32 bps2; ++ __u32 pps2; + + /* Used internally by the kernel */ + struct xt_rateest *est1 __attribute__((aligned(8))); +Index: linux-2.6-tip/include/linux/netfilter/xt_realm.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_realm.h ++++ linux-2.6-tip/include/linux/netfilter/xt_realm.h +@@ -1,10 +1,12 @@ + #ifndef _XT_REALM_H + #define _XT_REALM_H + ++#include ++ + struct xt_realm_info { +- u_int32_t id; +- u_int32_t mask; +- u_int8_t invert; ++ __u32 id; ++ __u32 mask; ++ __u8 invert; + }; + + #endif /* _XT_REALM_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_recent.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_recent.h ++++ linux-2.6-tip/include/linux/netfilter/xt_recent.h +@@ -1,6 +1,8 @@ + #ifndef _LINUX_NETFILTER_XT_RECENT_H + #define _LINUX_NETFILTER_XT_RECENT_H 1 + ++#include ++ + enum { + XT_RECENT_CHECK = 1 << 0, + XT_RECENT_SET = 1 << 1, +@@ -15,12 +17,12 @@ enum { + }; + + struct xt_recent_mtinfo { +- u_int32_t seconds; +- u_int32_t hit_count; +- u_int8_t check_set; +- u_int8_t invert; ++ __u32 seconds; ++ __u32 hit_count; ++ __u8 check_set; ++ __u8 invert; + char name[XT_RECENT_NAME_LEN]; +- u_int8_t side; ++ __u8 side; + }; + + #endif /* _LINUX_NETFILTER_XT_RECENT_H */ +Index: linux-2.6-tip/include/linux/netfilter/xt_sctp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_sctp.h ++++ linux-2.6-tip/include/linux/netfilter/xt_sctp.h +@@ -1,6 +1,8 @@ + #ifndef _XT_SCTP_H_ + #define _XT_SCTP_H_ + ++#include ++ + #define XT_SCTP_SRC_PORTS 0x01 + #define XT_SCTP_DEST_PORTS 0x02 + #define XT_SCTP_CHUNK_TYPES 0x04 +@@ -8,49 +10,49 @@ + #define XT_SCTP_VALID_FLAGS 0x07 + + struct xt_sctp_flag_info { +- u_int8_t chunktype; +- u_int8_t flag; +- u_int8_t flag_mask; ++ __u8 chunktype; ++ __u8 flag; ++ __u8 flag_mask; + }; + + #define XT_NUM_SCTP_FLAGS 4 + + struct xt_sctp_info { +- u_int16_t dpts[2]; /* Min, Max */ +- u_int16_t spts[2]; /* Min, Max */ ++ __u16 dpts[2]; /* Min, Max */ ++ __u16 spts[2]; /* Min, Max */ + +- u_int32_t chunkmap[256 / sizeof (u_int32_t)]; /* Bit mask of chunks to be matched according to RFC 
2960 */ ++ __u32 chunkmap[256 / sizeof (__u32)]; /* Bit mask of chunks to be matched according to RFC 2960 */ + + #define SCTP_CHUNK_MATCH_ANY 0x01 /* Match if any of the chunk types are present */ + #define SCTP_CHUNK_MATCH_ALL 0x02 /* Match if all of the chunk types are present */ + #define SCTP_CHUNK_MATCH_ONLY 0x04 /* Match if these are the only chunk types present */ + +- u_int32_t chunk_match_type; ++ __u32 chunk_match_type; + struct xt_sctp_flag_info flag_info[XT_NUM_SCTP_FLAGS]; + int flag_count; + +- u_int32_t flags; +- u_int32_t invflags; ++ __u32 flags; ++ __u32 invflags; + }; + + #define bytes(type) (sizeof(type) * 8) + + #define SCTP_CHUNKMAP_SET(chunkmap, type) \ + do { \ +- (chunkmap)[type / bytes(u_int32_t)] |= \ +- 1 << (type % bytes(u_int32_t)); \ ++ (chunkmap)[type / bytes(__u32)] |= \ ++ 1 << (type % bytes(__u32)); \ + } while (0) + + #define SCTP_CHUNKMAP_CLEAR(chunkmap, type) \ + do { \ +- (chunkmap)[type / bytes(u_int32_t)] &= \ +- ~(1 << (type % bytes(u_int32_t))); \ ++ (chunkmap)[type / bytes(__u32)] &= \ ++ ~(1 << (type % bytes(__u32))); \ + } while (0) + + #define SCTP_CHUNKMAP_IS_SET(chunkmap, type) \ + ({ \ +- ((chunkmap)[type / bytes (u_int32_t)] & \ +- (1 << (type % bytes (u_int32_t)))) ? 1: 0; \ ++ ((chunkmap)[type / bytes (__u32)] & \ ++ (1 << (type % bytes (__u32)))) ? 1: 0; \ + }) + + #define SCTP_CHUNKMAP_RESET(chunkmap) \ +@@ -65,7 +67,7 @@ struct xt_sctp_info { + #define SCTP_CHUNKMAP_IS_CLEAR(chunkmap) \ + __sctp_chunkmap_is_clear((chunkmap), ARRAY_SIZE(chunkmap)) + static inline bool +-__sctp_chunkmap_is_clear(const u_int32_t *chunkmap, unsigned int n) ++__sctp_chunkmap_is_clear(const __u32 *chunkmap, unsigned int n) + { + unsigned int i; + for (i = 0; i < n; ++i) +@@ -77,7 +79,7 @@ __sctp_chunkmap_is_clear(const u_int32_t + #define SCTP_CHUNKMAP_IS_ALL_SET(chunkmap) \ + __sctp_chunkmap_is_all_set((chunkmap), ARRAY_SIZE(chunkmap)) + static inline bool +-__sctp_chunkmap_is_all_set(const u_int32_t *chunkmap, unsigned int n) ++__sctp_chunkmap_is_all_set(const __u32 *chunkmap, unsigned int n) + { + unsigned int i; + for (i = 0; i < n; ++i) +Index: linux-2.6-tip/include/linux/netfilter/xt_statistic.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_statistic.h ++++ linux-2.6-tip/include/linux/netfilter/xt_statistic.h +@@ -1,6 +1,8 @@ + #ifndef _XT_STATISTIC_H + #define _XT_STATISTIC_H + ++#include ++ + enum xt_statistic_mode { + XT_STATISTIC_MODE_RANDOM, + XT_STATISTIC_MODE_NTH, +@@ -14,17 +16,17 @@ enum xt_statistic_flags { + #define XT_STATISTIC_MASK 0x1 + + struct xt_statistic_info { +- u_int16_t mode; +- u_int16_t flags; ++ __u16 mode; ++ __u16 flags; + union { + struct { +- u_int32_t probability; ++ __u32 probability; + } random; + struct { +- u_int32_t every; +- u_int32_t packet; ++ __u32 every; ++ __u32 packet; + /* Used internally by the kernel */ +- u_int32_t count; ++ __u32 count; + } nth; + } u; + struct xt_statistic_info *master __attribute__((aligned(8))); +Index: linux-2.6-tip/include/linux/netfilter/xt_string.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_string.h ++++ linux-2.6-tip/include/linux/netfilter/xt_string.h +@@ -1,6 +1,8 @@ + #ifndef _XT_STRING_H + #define _XT_STRING_H + ++#include ++ + #define XT_STRING_MAX_PATTERN_SIZE 128 + #define XT_STRING_MAX_ALGO_NAME_SIZE 16 + +@@ -11,18 +13,18 @@ enum { + + struct xt_string_info + { +- u_int16_t from_offset; +- u_int16_t to_offset; ++ __u16 
from_offset; ++ __u16 to_offset; + char algo[XT_STRING_MAX_ALGO_NAME_SIZE]; + char pattern[XT_STRING_MAX_PATTERN_SIZE]; +- u_int8_t patlen; ++ __u8 patlen; + union { + struct { +- u_int8_t invert; ++ __u8 invert; + } v0; + + struct { +- u_int8_t flags; ++ __u8 flags; + } v1; + } u; + +Index: linux-2.6-tip/include/linux/netfilter/xt_tcpmss.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_tcpmss.h ++++ linux-2.6-tip/include/linux/netfilter/xt_tcpmss.h +@@ -1,9 +1,11 @@ + #ifndef _XT_TCPMSS_MATCH_H + #define _XT_TCPMSS_MATCH_H + ++#include ++ + struct xt_tcpmss_match_info { +- u_int16_t mss_min, mss_max; +- u_int8_t invert; ++ __u16 mss_min, mss_max; ++ __u8 invert; + }; + + #endif /*_XT_TCPMSS_MATCH_H*/ +Index: linux-2.6-tip/include/linux/netfilter/xt_tcpudp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter/xt_tcpudp.h ++++ linux-2.6-tip/include/linux/netfilter/xt_tcpudp.h +@@ -1,15 +1,17 @@ + #ifndef _XT_TCPUDP_H + #define _XT_TCPUDP_H + ++#include ++ + /* TCP matching stuff */ + struct xt_tcp + { +- u_int16_t spts[2]; /* Source port range. */ +- u_int16_t dpts[2]; /* Destination port range. */ +- u_int8_t option; /* TCP Option iff non-zero*/ +- u_int8_t flg_mask; /* TCP flags mask byte */ +- u_int8_t flg_cmp; /* TCP flags compare byte */ +- u_int8_t invflags; /* Inverse flags */ ++ __u16 spts[2]; /* Source port range. */ ++ __u16 dpts[2]; /* Destination port range. */ ++ __u8 option; /* TCP Option iff non-zero*/ ++ __u8 flg_mask; /* TCP flags mask byte */ ++ __u8 flg_cmp; /* TCP flags compare byte */ ++ __u8 invflags; /* Inverse flags */ + }; + + /* Values for "inv" field in struct ipt_tcp. */ +@@ -22,9 +24,9 @@ struct xt_tcp + /* UDP matching stuff */ + struct xt_udp + { +- u_int16_t spts[2]; /* Source port range. */ +- u_int16_t dpts[2]; /* Destination port range. */ +- u_int8_t invflags; /* Inverse flags */ ++ __u16 spts[2]; /* Source port range. */ ++ __u16 dpts[2]; /* Destination port range. */ ++ __u8 invflags; /* Inverse flags */ + }; + + /* Values for "invflags" field in struct ipt_udp. 
*/ +Index: linux-2.6-tip/include/linux/netfilter_ipv4/ipt_owner.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter_ipv4/ipt_owner.h ++++ linux-2.6-tip/include/linux/netfilter_ipv4/ipt_owner.h +@@ -9,10 +9,10 @@ + #define IPT_OWNER_COMM 0x10 + + struct ipt_owner_info { +- uid_t uid; +- gid_t gid; +- pid_t pid; +- pid_t sid; ++ __kernel_uid32_t uid; ++ __kernel_gid32_t gid; ++ __kernel_pid_t pid; ++ __kernel_pid_t sid; + char comm[16]; + u_int8_t match, invert; /* flags */ + }; +Index: linux-2.6-tip/include/linux/netfilter_ipv6/ip6t_owner.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netfilter_ipv6/ip6t_owner.h ++++ linux-2.6-tip/include/linux/netfilter_ipv6/ip6t_owner.h +@@ -8,10 +8,10 @@ + #define IP6T_OWNER_SID 0x08 + + struct ip6t_owner_info { +- uid_t uid; +- gid_t gid; +- pid_t pid; +- pid_t sid; ++ __kernel_uid32_t uid; ++ __kernel_gid32_t gid; ++ __kernel_pid_t pid; ++ __kernel_pid_t sid; + u_int8_t match, invert; /* flags */ + }; + +Index: linux-2.6-tip/include/linux/nubus.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/nubus.h ++++ linux-2.6-tip/include/linux/nubus.h +@@ -237,6 +237,7 @@ struct nubus_dirent + int mask; + }; + ++#ifdef __KERNEL__ + struct nubus_board { + struct nubus_board* next; + struct nubus_dev* first_dev; +@@ -351,6 +352,7 @@ void nubus_get_rsrc_mem(void* dest, + void nubus_get_rsrc_str(void* dest, + const struct nubus_dirent *dirent, + int maxlen); ++#endif /* __KERNEL__ */ + + /* We'd like to get rid of this eventually. Only daynaport.c uses it now. */ + static inline void *nubus_slot_addr(int slot) +Index: linux-2.6-tip/include/linux/pci.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/pci.h ++++ linux-2.6-tip/include/linux/pci.h +@@ -923,7 +923,10 @@ static inline struct pci_dev *pci_get_cl + return NULL; + } + +-#define pci_dev_present(ids) (0) ++static inline int pci_dev_present(const struct pci_device_id *ids) ++{ ++ return 0; ++} + #define no_pci_devices() (1) + #define pci_dev_put(dev) do { } while (0) + +Index: linux-2.6-tip/include/linux/percpu.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/percpu.h ++++ linux-2.6-tip/include/linux/percpu.h +@@ -5,53 +5,76 @@ + #include /* For kmalloc() */ + #include + #include ++#include + + #include + ++#ifndef PER_CPU_BASE_SECTION ++#ifdef CONFIG_SMP ++#define PER_CPU_BASE_SECTION ".data.percpu" ++#else ++#define PER_CPU_BASE_SECTION ".data" ++#endif ++#endif ++ + #ifdef CONFIG_SMP +-#define DEFINE_PER_CPU(type, name) \ +- __attribute__((__section__(".data.percpu"))) \ +- PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name + + #ifdef MODULE +-#define SHARED_ALIGNED_SECTION ".data.percpu" ++#define PER_CPU_SHARED_ALIGNED_SECTION "" + #else +-#define SHARED_ALIGNED_SECTION ".data.percpu.shared_aligned" ++#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned" + #endif ++#define PER_CPU_FIRST_SECTION ".first" + +-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ +- __attribute__((__section__(SHARED_ALIGNED_SECTION))) \ +- PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name \ +- ____cacheline_aligned_in_smp ++#else ++ ++#define PER_CPU_SHARED_ALIGNED_SECTION "" ++#define PER_CPU_FIRST_SECTION "" ++ ++#endif + +-#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ +- 
__attribute__((__section__(".data.percpu.page_aligned"))) \ ++#define DEFINE_PER_CPU_SECTION(type, name, section) \ ++ __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ + PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name +-#else ++ ++#define DEFINE_PER_CPU_SPINLOCK(name, section) \ ++ __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ ++ PER_CPU_ATTRIBUTES __DEFINE_SPINLOCK(per_cpu__lock_##name##_locked); ++ + #define DEFINE_PER_CPU(type, name) \ +- PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name ++ DEFINE_PER_CPU_SECTION(type, name, "") + +-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ +- DEFINE_PER_CPU(type, name) ++#define DEFINE_PER_CPU_LOCKED(type, name) \ ++ DEFINE_PER_CPU_SPINLOCK(name, "") \ ++ DEFINE_PER_CPU_SECTION(type, name##_locked, "") + +-#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ +- DEFINE_PER_CPU(type, name) +-#endif ++#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ ++ ____cacheline_aligned_in_smp ++ ++#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, ".page_aligned") ++ ++#define DEFINE_PER_CPU_FIRST(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) + + #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) ++#define EXPORT_PER_CPU_LOCKED_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var##_locked) + #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) ++#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var##_locked) + +-/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */ +-#ifndef PERCPU_ENOUGH_ROOM ++/* enough to cover all DEFINE_PER_CPUs in modules */ + #ifdef CONFIG_MODULES +-#define PERCPU_MODULE_RESERVE 8192 ++#define PERCPU_MODULE_RESERVE (8 << 10) + #else +-#define PERCPU_MODULE_RESERVE 0 ++#define PERCPU_MODULE_RESERVE 0 + #endif + ++#ifndef PERCPU_ENOUGH_ROOM + #define PERCPU_ENOUGH_ROOM \ +- (__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE) +-#endif /* PERCPU_ENOUGH_ROOM */ ++ (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ ++ PERCPU_MODULE_RESERVE) ++#endif + + /* + * Must be an lvalue. Since @var must be a simple identifier, +@@ -63,54 +86,141 @@ + &__get_cpu_var(var); })) + #define put_cpu_var(var) preempt_enable() + ++/* ++ * Per-CPU data structures with an additional lock - useful for ++ * PREEMPT_RT code that wants to reschedule but also wants ++ * per-CPU data structures. ++ * ++ * 'cpu' gets updated with the CPU the task is currently executing on. ++ * ++ * NOTE: on normal !PREEMPT_RT kernels these per-CPU variables ++ * are the same as the normal per-CPU variables, so there no ++ * runtime overhead. 
++ */ ++#ifdef CONFIG_PREEMPT_RT ++#define get_cpu_var_locked(var, cpuptr) \ ++(*({ \ ++ spinlock_t *__lock; \ ++ int __cpu; \ ++ \ ++again: \ ++ __cpu = raw_smp_processor_id(); \ ++ __lock = &__get_cpu_lock(var, __cpu); \ ++ spin_lock(__lock); \ ++ if (!cpu_online(__cpu)) { \ ++ spin_unlock(__lock); \ ++ goto again; \ ++ } \ ++ *(cpuptr) = __cpu; \ ++ &__get_cpu_var_locked(var, __cpu); \ ++})) ++#else ++#define get_cpu_var_locked(var, cpuptr) \ ++(*({ \ ++ int __cpu; \ ++ \ ++ preempt_disable(); \ ++ __cpu = smp_processor_id(); \ ++ spin_lock(&__get_cpu_lock(var, __cpu)); \ ++ preempt_enable(); \ ++ *(cpuptr) = __cpu; \ ++ &__get_cpu_var_locked(var, __cpu); \ ++})) ++#endif ++ ++#define put_cpu_var_locked(var, cpu) \ ++ do { (void)cpu; spin_unlock(&__get_cpu_lock(var, cpu)); } while (0) ++ + #ifdef CONFIG_SMP + ++#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA ++ ++/* minimum unit size, also is the maximum supported allocation size */ ++#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) ++ ++/* ++ * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy ++ * back on the first chunk for dynamic percpu allocation if arch is ++ * manually allocating and mapping it for faster access (as a part of ++ * large page mapping for example). ++ * ++ * The following values give between one and two pages of free space ++ * after typical minimal boot (2-way SMP, single disk and NIC) with ++ * both defconfig and a distro config on x86_64 and 32. More ++ * intelligent way to determine this would be nice. ++ */ ++#if BITS_PER_LONG > 32 ++#define PERCPU_DYNAMIC_RESERVE (20 << 10) ++#else ++#define PERCPU_DYNAMIC_RESERVE (12 << 10) ++#endif ++ ++extern void *pcpu_base_addr; ++ ++typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); ++typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); ++ ++extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, ++ size_t static_size, size_t reserved_size, ++ ssize_t dyn_size, ssize_t unit_size, ++ void *base_addr, ++ pcpu_populate_pte_fn_t populate_pte_fn); ++ ++extern ssize_t __init pcpu_embed_first_chunk( ++ size_t static_size, size_t reserved_size, ++ ssize_t dyn_size, ssize_t unit_size); ++ ++/* ++ * Use this to get to a cpu's version of the per-cpu object ++ * dynamically allocated. Non-atomic access to the current CPU's ++ * version should probably be combined with get_cpu()/put_cpu(). ++ */ ++#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) ++ ++extern void *__alloc_reserved_percpu(size_t size, size_t align); ++ ++#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ ++ + struct percpu_data { + void *ptrs[1]; + }; + + #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) +-/* +- * Use this to get to a cpu's version of the per-cpu object dynamically +- * allocated. Non-atomic access to the current CPU's version should +- * probably be combined with get_cpu()/put_cpu(). 
+- */ +-#define percpu_ptr(ptr, cpu) \ +-({ \ +- struct percpu_data *__p = __percpu_disguise(ptr); \ +- (__typeof__(ptr))__p->ptrs[(cpu)]; \ ++ ++#define per_cpu_ptr(ptr, cpu) \ ++({ \ ++ struct percpu_data *__p = __percpu_disguise(ptr); \ ++ (__typeof__(ptr))__p->ptrs[(cpu)]; \ + }) + +-extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); +-extern void percpu_free(void *__pdata); ++#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ ++ ++extern void *__alloc_percpu(size_t size, size_t align); ++extern void free_percpu(void *__pdata); + + #else /* CONFIG_SMP */ + +-#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) ++#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) + +-static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) ++static inline void *__alloc_percpu(size_t size, size_t align) + { +- return kzalloc(size, gfp); ++ /* ++ * Can't easily make larger alignment work with kmalloc. WARN ++ * on it. Larger alignment should only be used for module ++ * percpu sections on SMP for which this path isn't used. ++ */ ++ WARN_ON_ONCE(align > SMP_CACHE_BYTES); ++ return kzalloc(size, GFP_KERNEL); + } + +-static inline void percpu_free(void *__pdata) ++static inline void free_percpu(void *p) + { +- kfree(__pdata); ++ kfree(p); + } + + #endif /* CONFIG_SMP */ + +-#define percpu_alloc_mask(size, gfp, mask) \ +- __percpu_alloc_mask((size), (gfp), &(mask)) +- +-#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map) +- +-/* (legacy) interface for use without CPU hotplug handling */ +- +-#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \ +- cpu_possible_map) +-#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) +-#define free_percpu(ptr) percpu_free((ptr)) +-#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) ++#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ ++ __alignof__(type)) + + #endif /* __LINUX_PERCPU_H */ +Index: linux-2.6-tip/include/linux/perf_counter.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/perf_counter.h +@@ -0,0 +1,477 @@ ++/* ++ * Performance counters: ++ * ++ * Copyright(C) 2008, Thomas Gleixner ++ * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar ++ * ++ * Data type definitions, declarations, prototypes. ++ * ++ * Started by: Thomas Gleixner and Ingo Molnar ++ * ++ * For licencing details see kernel-base/COPYING ++ */ ++#ifndef _LINUX_PERF_COUNTER_H ++#define _LINUX_PERF_COUNTER_H ++ ++#include ++#include ++#include ++ ++/* ++ * User-space ABI bits: ++ */ ++ ++/* ++ * hw_event.type ++ */ ++enum perf_event_types { ++ PERF_TYPE_HARDWARE = 0, ++ PERF_TYPE_SOFTWARE = 1, ++ PERF_TYPE_TRACEPOINT = 2, ++ ++ /* ++ * available TYPE space, raw is the max value. ++ */ ++ ++ PERF_TYPE_RAW = 128, ++}; ++ ++/* ++ * Generalized performance counter event types, used by the hw_event.event_id ++ * parameter of the sys_perf_counter_open() syscall: ++ */ ++enum hw_event_ids { ++ /* ++ * Common hardware events, generalized by the kernel: ++ */ ++ PERF_COUNT_CPU_CYCLES = 0, ++ PERF_COUNT_INSTRUCTIONS = 1, ++ PERF_COUNT_CACHE_REFERENCES = 2, ++ PERF_COUNT_CACHE_MISSES = 3, ++ PERF_COUNT_BRANCH_INSTRUCTIONS = 4, ++ PERF_COUNT_BRANCH_MISSES = 5, ++ PERF_COUNT_BUS_CYCLES = 6, ++ ++ PERF_HW_EVENTS_MAX = 7, ++}; ++ ++/* ++ * Special "software" counters provided by the kernel, even if the hardware ++ * does not support performance counters. 
These counters measure various ++ * physical and sw events of the kernel (and allow the profiling of them as ++ * well): ++ */ ++enum sw_event_ids { ++ PERF_COUNT_CPU_CLOCK = 0, ++ PERF_COUNT_TASK_CLOCK = 1, ++ PERF_COUNT_PAGE_FAULTS = 2, ++ PERF_COUNT_CONTEXT_SWITCHES = 3, ++ PERF_COUNT_CPU_MIGRATIONS = 4, ++ PERF_COUNT_PAGE_FAULTS_MIN = 5, ++ PERF_COUNT_PAGE_FAULTS_MAJ = 6, ++ ++ PERF_SW_EVENTS_MAX = 7, ++}; ++ ++/* ++ * IRQ-notification data record type: ++ */ ++enum perf_counter_record_type { ++ PERF_RECORD_SIMPLE = 0, ++ PERF_RECORD_IRQ = 1, ++ PERF_RECORD_GROUP = 2, ++}; ++ ++#define __PERF_COUNTER_MASK(name) \ ++ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ ++ PERF_COUNTER_##name##_SHIFT) ++ ++#define PERF_COUNTER_RAW_BITS 1 ++#define PERF_COUNTER_RAW_SHIFT 63 ++#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) ++ ++#define PERF_COUNTER_CONFIG_BITS 63 ++#define PERF_COUNTER_CONFIG_SHIFT 0 ++#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) ++ ++#define PERF_COUNTER_TYPE_BITS 7 ++#define PERF_COUNTER_TYPE_SHIFT 56 ++#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) ++ ++#define PERF_COUNTER_EVENT_BITS 56 ++#define PERF_COUNTER_EVENT_SHIFT 0 ++#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) ++ ++/* ++ * Bits that can be set in hw_event.read_format to request that ++ * reads on the counter should return the indicated quantities, ++ * in increasing order of bit value, after the counter value. ++ */ ++enum perf_counter_read_format { ++ PERF_FORMAT_TOTAL_TIME_ENABLED = 1, ++ PERF_FORMAT_TOTAL_TIME_RUNNING = 2, ++}; ++ ++/* ++ * Hardware event to monitor via a performance monitoring counter: ++ */ ++struct perf_counter_hw_event { ++ /* ++ * The MSB of the config word signifies if the rest contains cpu ++ * specific (raw) counter configuration data, if unset, the next ++ * 7 bits are an event type and the rest of the bits are the event ++ * identifier. 
++ */ ++ __u64 config; ++ ++ __u64 irq_period; ++ __u64 record_type; ++ __u64 read_format; ++ ++ __u64 disabled : 1, /* off by default */ ++ nmi : 1, /* NMI sampling */ ++ inherit : 1, /* children inherit it */ ++ pinned : 1, /* must always be on PMU */ ++ exclusive : 1, /* only group on PMU */ ++ exclude_user : 1, /* don't count user */ ++ exclude_kernel : 1, /* ditto kernel */ ++ exclude_hv : 1, /* ditto hypervisor */ ++ exclude_idle : 1, /* don't count when idle */ ++ include_tid : 1, /* include the tid */ ++ ++ __reserved_1 : 54; ++ ++ __u32 extra_config_len; ++ __u32 __reserved_4; ++ ++ __u64 __reserved_2; ++ __u64 __reserved_3; ++}; ++ ++/* ++ * Ioctls that can be done on a perf counter fd: ++ */ ++#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) ++#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) ++ ++/* ++ * Structure of the page that can be mapped via mmap ++ */ ++struct perf_counter_mmap_page { ++ __u32 version; /* version number of this structure */ ++ __u32 compat_version; /* lowest version this is compat with */ ++ __u32 lock; /* seqlock for synchronization */ ++ __u32 index; /* hardware counter identifier */ ++ __s64 offset; /* add to hardware counter value */ ++ ++ __u32 data_head; /* head in the data section */ ++}; ++ ++struct perf_event_header { ++ __u32 type; ++ __u32 size; ++}; ++ ++enum perf_event_type { ++ PERF_EVENT_IP = 0, ++ PERF_EVENT_GROUP = 1, ++ ++ __PERF_EVENT_TID = 0x100, ++}; ++ ++#ifdef __KERNEL__ ++/* ++ * Kernel-internal data types and definitions: ++ */ ++ ++#ifdef CONFIG_PERF_COUNTERS ++# include ++#endif ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct task_struct; ++ ++static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) ++{ ++ return hw_event->config & PERF_COUNTER_RAW_MASK; ++} ++ ++static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) ++{ ++ return hw_event->config & PERF_COUNTER_CONFIG_MASK; ++} ++ ++static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) ++{ ++ return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> ++ PERF_COUNTER_TYPE_SHIFT; ++} ++ ++static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) ++{ ++ return hw_event->config & PERF_COUNTER_EVENT_MASK; ++} ++ ++/** ++ * struct hw_perf_counter - performance counter hardware details: ++ */ ++struct hw_perf_counter { ++#ifdef CONFIG_PERF_COUNTERS ++ union { ++ struct { /* hardware */ ++ u64 config; ++ unsigned long config_base; ++ unsigned long counter_base; ++ int nmi; ++ unsigned int idx; ++ }; ++ union { /* software */ ++ atomic64_t count; ++ struct hrtimer hrtimer; ++ }; ++ }; ++ atomic64_t prev_count; ++ u64 irq_period; ++ atomic64_t period_left; ++#endif ++}; ++ ++struct perf_counter; ++ ++/** ++ * struct hw_perf_counter_ops - performance counter hw ops ++ */ ++struct hw_perf_counter_ops { ++ int (*enable) (struct perf_counter *counter); ++ void (*disable) (struct perf_counter *counter); ++ void (*read) (struct perf_counter *counter); ++}; ++ ++/** ++ * enum perf_counter_active_state - the states of a counter ++ */ ++enum perf_counter_active_state { ++ PERF_COUNTER_STATE_ERROR = -2, ++ PERF_COUNTER_STATE_OFF = -1, ++ PERF_COUNTER_STATE_INACTIVE = 0, ++ PERF_COUNTER_STATE_ACTIVE = 1, ++}; ++ ++struct file; ++ ++struct perf_mmap_data { ++ struct rcu_head rcu_head; ++ int nr_pages; ++ atomic_t wakeup; ++ atomic_t head; ++ struct perf_counter_mmap_page *user_page; ++ void *data_pages[0]; ++}; ++ ++/** ++ * struct perf_counter - performance counter kernel representation: ++ */ 
++struct perf_counter { ++#ifdef CONFIG_PERF_COUNTERS ++ struct list_head list_entry; ++ struct list_head event_entry; ++ struct list_head sibling_list; ++ int nr_siblings; ++ struct perf_counter *group_leader; ++ const struct hw_perf_counter_ops *hw_ops; ++ ++ enum perf_counter_active_state state; ++ enum perf_counter_active_state prev_state; ++ atomic64_t count; ++ ++ /* ++ * These are the total time in nanoseconds that the counter ++ * has been enabled (i.e. eligible to run, and the task has ++ * been scheduled in, if this is a per-task counter) ++ * and running (scheduled onto the CPU), respectively. ++ * ++ * They are computed from tstamp_enabled, tstamp_running and ++ * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. ++ */ ++ u64 total_time_enabled; ++ u64 total_time_running; ++ ++ /* ++ * These are timestamps used for computing total_time_enabled ++ * and total_time_running when the counter is in INACTIVE or ++ * ACTIVE state, measured in nanoseconds from an arbitrary point ++ * in time. ++ * tstamp_enabled: the notional time when the counter was enabled ++ * tstamp_running: the notional time when the counter was scheduled on ++ * tstamp_stopped: in INACTIVE state, the notional time when the ++ * counter was scheduled off. ++ */ ++ u64 tstamp_enabled; ++ u64 tstamp_running; ++ u64 tstamp_stopped; ++ ++ struct perf_counter_hw_event hw_event; ++ struct hw_perf_counter hw; ++ ++ struct perf_counter_context *ctx; ++ struct task_struct *task; ++ struct file *filp; ++ ++ struct perf_counter *parent; ++ struct list_head child_list; ++ ++ /* ++ * These accumulate total time (in nanoseconds) that children ++ * counters have been enabled and running, respectively. ++ */ ++ atomic64_t child_total_time_enabled; ++ atomic64_t child_total_time_running; ++ ++ /* ++ * Protect attach/detach and child_list: ++ */ ++ struct mutex mutex; ++ ++ int oncpu; ++ int cpu; ++ ++ /* mmap bits */ ++ struct mutex mmap_mutex; ++ atomic_t mmap_count; ++ struct perf_mmap_data *data; ++ ++ /* poll related */ ++ wait_queue_head_t waitq; ++ /* optional: for NMIs */ ++ int wakeup_pending; ++ ++ void (*destroy)(struct perf_counter *); ++ struct rcu_head rcu_head; ++#endif ++}; ++ ++/** ++ * struct perf_counter_context - counter context structure ++ * ++ * Used as a container for task counters and CPU counters as well: ++ */ ++struct perf_counter_context { ++#ifdef CONFIG_PERF_COUNTERS ++ /* ++ * Protect the states of the counters in the list, ++ * nr_active, and the list: ++ */ ++ raw_spinlock_t lock; ++ /* ++ * Protect the list of counters. Locking either mutex or lock ++ * is sufficient to ensure the list doesn't change; to change ++ * the list you need to lock both the mutex and the spinlock. ++ */ ++ struct mutex mutex; ++ ++ struct list_head counter_list; ++ struct list_head event_list; ++ int nr_counters; ++ int nr_active; ++ int is_active; ++ struct task_struct *task; ++ ++ /* ++ * time_now is the current time in nanoseconds since an arbitrary ++ * point in the past. For per-task counters, this is based on the ++ * task clock, and for per-cpu counters it is based on the cpu clock. ++ * time_lost is an offset from the task/cpu clock, used to make it ++ * appear that time only passes while the context is scheduled in. 
++ */ ++ u64 time_now; ++ u64 time_lost; ++#endif ++}; ++ ++/** ++ * struct perf_counter_cpu_context - per cpu counter context structure ++ */ ++struct perf_cpu_context { ++ struct perf_counter_context ctx; ++ struct perf_counter_context *task_ctx; ++ int active_oncpu; ++ int max_pertask; ++ int exclusive; ++ ++ /* ++ * Recursion avoidance: ++ * ++ * task, softirq, irq, nmi context ++ */ ++ int recursion[4]; ++}; ++ ++/* ++ * Set by architecture code: ++ */ ++extern int perf_max_counters; ++ ++#ifdef CONFIG_PERF_COUNTERS ++extern const struct hw_perf_counter_ops * ++hw_perf_counter_init(struct perf_counter *counter); ++ ++extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); ++extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); ++extern void perf_counter_task_tick(struct task_struct *task, int cpu); ++extern void perf_counter_init_task(struct task_struct *child); ++extern void perf_counter_exit_task(struct task_struct *child); ++extern void perf_counter_notify(struct pt_regs *regs); ++extern void perf_counter_print_debug(void); ++extern void perf_counter_unthrottle(void); ++extern u64 hw_perf_save_disable(void); ++extern void hw_perf_restore(u64 ctrl); ++extern int perf_counter_task_disable(void); ++extern int perf_counter_task_enable(void); ++extern int hw_perf_group_sched_in(struct perf_counter *group_leader, ++ struct perf_cpu_context *cpuctx, ++ struct perf_counter_context *ctx, int cpu); ++extern void perf_counter_update_userpage(struct perf_counter *counter); ++ ++extern void perf_counter_output(struct perf_counter *counter, ++ int nmi, struct pt_regs *regs); ++/* ++ * Return 1 for a software counter, 0 for a hardware counter ++ */ ++static inline int is_software_counter(struct perf_counter *counter) ++{ ++ return !perf_event_raw(&counter->hw_event) && ++ perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; ++} ++ ++extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); ++ ++#else ++static inline void ++perf_counter_task_sched_in(struct task_struct *task, int cpu) { } ++static inline void ++perf_counter_task_sched_out(struct task_struct *task, int cpu) { } ++static inline void ++perf_counter_task_tick(struct task_struct *task, int cpu) { } ++static inline void perf_counter_init_task(struct task_struct *child) { } ++static inline void perf_counter_exit_task(struct task_struct *child) { } ++static inline void perf_counter_notify(struct pt_regs *regs) { } ++static inline void perf_counter_print_debug(void) { } ++static inline void perf_counter_unthrottle(void) { } ++static inline void hw_perf_restore(u64 ctrl) { } ++static inline u64 hw_perf_save_disable(void) { return 0; } ++static inline int perf_counter_task_disable(void) { return -EINVAL; } ++static inline int perf_counter_task_enable(void) { return -EINVAL; } ++ ++static inline void perf_swcounter_event(u32 event, u64 nr, ++ int nmi, struct pt_regs *regs) { } ++#endif ++ ++#endif /* __KERNEL__ */ ++#endif /* _LINUX_PERF_COUNTER_H */ +Index: linux-2.6-tip/include/linux/pfkeyv2.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/pfkeyv2.h ++++ linux-2.6-tip/include/linux/pfkeyv2.h +@@ -12,187 +12,187 @@ + #define PFKEYV2_REVISION 199806L + + struct sadb_msg { +- uint8_t sadb_msg_version; +- uint8_t sadb_msg_type; +- uint8_t sadb_msg_errno; +- uint8_t sadb_msg_satype; +- uint16_t sadb_msg_len; +- uint16_t sadb_msg_reserved; +- uint32_t sadb_msg_seq; +- uint32_t sadb_msg_pid; ++ __u8 sadb_msg_version; ++ __u8 
sadb_msg_type; ++ __u8 sadb_msg_errno; ++ __u8 sadb_msg_satype; ++ __u16 sadb_msg_len; ++ __u16 sadb_msg_reserved; ++ __u32 sadb_msg_seq; ++ __u32 sadb_msg_pid; + } __attribute__((packed)); + /* sizeof(struct sadb_msg) == 16 */ + + struct sadb_ext { +- uint16_t sadb_ext_len; +- uint16_t sadb_ext_type; ++ __u16 sadb_ext_len; ++ __u16 sadb_ext_type; + } __attribute__((packed)); + /* sizeof(struct sadb_ext) == 4 */ + + struct sadb_sa { +- uint16_t sadb_sa_len; +- uint16_t sadb_sa_exttype; ++ __u16 sadb_sa_len; ++ __u16 sadb_sa_exttype; + __be32 sadb_sa_spi; +- uint8_t sadb_sa_replay; +- uint8_t sadb_sa_state; +- uint8_t sadb_sa_auth; +- uint8_t sadb_sa_encrypt; +- uint32_t sadb_sa_flags; ++ __u8 sadb_sa_replay; ++ __u8 sadb_sa_state; ++ __u8 sadb_sa_auth; ++ __u8 sadb_sa_encrypt; ++ __u32 sadb_sa_flags; + } __attribute__((packed)); + /* sizeof(struct sadb_sa) == 16 */ + + struct sadb_lifetime { +- uint16_t sadb_lifetime_len; +- uint16_t sadb_lifetime_exttype; +- uint32_t sadb_lifetime_allocations; +- uint64_t sadb_lifetime_bytes; +- uint64_t sadb_lifetime_addtime; +- uint64_t sadb_lifetime_usetime; ++ __u16 sadb_lifetime_len; ++ __u16 sadb_lifetime_exttype; ++ __u32 sadb_lifetime_allocations; ++ __u64 sadb_lifetime_bytes; ++ __u64 sadb_lifetime_addtime; ++ __u64 sadb_lifetime_usetime; + } __attribute__((packed)); + /* sizeof(struct sadb_lifetime) == 32 */ + + struct sadb_address { +- uint16_t sadb_address_len; +- uint16_t sadb_address_exttype; +- uint8_t sadb_address_proto; +- uint8_t sadb_address_prefixlen; +- uint16_t sadb_address_reserved; ++ __u16 sadb_address_len; ++ __u16 sadb_address_exttype; ++ __u8 sadb_address_proto; ++ __u8 sadb_address_prefixlen; ++ __u16 sadb_address_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_address) == 8 */ + + struct sadb_key { +- uint16_t sadb_key_len; +- uint16_t sadb_key_exttype; +- uint16_t sadb_key_bits; +- uint16_t sadb_key_reserved; ++ __u16 sadb_key_len; ++ __u16 sadb_key_exttype; ++ __u16 sadb_key_bits; ++ __u16 sadb_key_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_key) == 8 */ + + struct sadb_ident { +- uint16_t sadb_ident_len; +- uint16_t sadb_ident_exttype; +- uint16_t sadb_ident_type; +- uint16_t sadb_ident_reserved; +- uint64_t sadb_ident_id; ++ __u16 sadb_ident_len; ++ __u16 sadb_ident_exttype; ++ __u16 sadb_ident_type; ++ __u16 sadb_ident_reserved; ++ __u64 sadb_ident_id; + } __attribute__((packed)); + /* sizeof(struct sadb_ident) == 16 */ + + struct sadb_sens { +- uint16_t sadb_sens_len; +- uint16_t sadb_sens_exttype; +- uint32_t sadb_sens_dpd; +- uint8_t sadb_sens_sens_level; +- uint8_t sadb_sens_sens_len; +- uint8_t sadb_sens_integ_level; +- uint8_t sadb_sens_integ_len; +- uint32_t sadb_sens_reserved; ++ __u16 sadb_sens_len; ++ __u16 sadb_sens_exttype; ++ __u32 sadb_sens_dpd; ++ __u8 sadb_sens_sens_level; ++ __u8 sadb_sens_sens_len; ++ __u8 sadb_sens_integ_level; ++ __u8 sadb_sens_integ_len; ++ __u32 sadb_sens_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_sens) == 16 */ + + /* followed by: +- uint64_t sadb_sens_bitmap[sens_len]; +- uint64_t sadb_integ_bitmap[integ_len]; */ ++ __u64 sadb_sens_bitmap[sens_len]; ++ __u64 sadb_integ_bitmap[integ_len]; */ + + struct sadb_prop { +- uint16_t sadb_prop_len; +- uint16_t sadb_prop_exttype; +- uint8_t sadb_prop_replay; +- uint8_t sadb_prop_reserved[3]; ++ __u16 sadb_prop_len; ++ __u16 sadb_prop_exttype; ++ __u8 sadb_prop_replay; ++ __u8 sadb_prop_reserved[3]; + } __attribute__((packed)); + /* sizeof(struct sadb_prop) == 8 */ + + /* followed by: + 
struct sadb_comb sadb_combs[(sadb_prop_len + +- sizeof(uint64_t) - sizeof(struct sadb_prop)) / ++ sizeof(__u64) - sizeof(struct sadb_prop)) / + sizeof(struct sadb_comb)]; */ + + struct sadb_comb { +- uint8_t sadb_comb_auth; +- uint8_t sadb_comb_encrypt; +- uint16_t sadb_comb_flags; +- uint16_t sadb_comb_auth_minbits; +- uint16_t sadb_comb_auth_maxbits; +- uint16_t sadb_comb_encrypt_minbits; +- uint16_t sadb_comb_encrypt_maxbits; +- uint32_t sadb_comb_reserved; +- uint32_t sadb_comb_soft_allocations; +- uint32_t sadb_comb_hard_allocations; +- uint64_t sadb_comb_soft_bytes; +- uint64_t sadb_comb_hard_bytes; +- uint64_t sadb_comb_soft_addtime; +- uint64_t sadb_comb_hard_addtime; +- uint64_t sadb_comb_soft_usetime; +- uint64_t sadb_comb_hard_usetime; ++ __u8 sadb_comb_auth; ++ __u8 sadb_comb_encrypt; ++ __u16 sadb_comb_flags; ++ __u16 sadb_comb_auth_minbits; ++ __u16 sadb_comb_auth_maxbits; ++ __u16 sadb_comb_encrypt_minbits; ++ __u16 sadb_comb_encrypt_maxbits; ++ __u32 sadb_comb_reserved; ++ __u32 sadb_comb_soft_allocations; ++ __u32 sadb_comb_hard_allocations; ++ __u64 sadb_comb_soft_bytes; ++ __u64 sadb_comb_hard_bytes; ++ __u64 sadb_comb_soft_addtime; ++ __u64 sadb_comb_hard_addtime; ++ __u64 sadb_comb_soft_usetime; ++ __u64 sadb_comb_hard_usetime; + } __attribute__((packed)); + /* sizeof(struct sadb_comb) == 72 */ + + struct sadb_supported { +- uint16_t sadb_supported_len; +- uint16_t sadb_supported_exttype; +- uint32_t sadb_supported_reserved; ++ __u16 sadb_supported_len; ++ __u16 sadb_supported_exttype; ++ __u32 sadb_supported_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_supported) == 8 */ + + /* followed by: + struct sadb_alg sadb_algs[(sadb_supported_len + +- sizeof(uint64_t) - sizeof(struct sadb_supported)) / ++ sizeof(__u64) - sizeof(struct sadb_supported)) / + sizeof(struct sadb_alg)]; */ + + struct sadb_alg { +- uint8_t sadb_alg_id; +- uint8_t sadb_alg_ivlen; +- uint16_t sadb_alg_minbits; +- uint16_t sadb_alg_maxbits; +- uint16_t sadb_alg_reserved; ++ __u8 sadb_alg_id; ++ __u8 sadb_alg_ivlen; ++ __u16 sadb_alg_minbits; ++ __u16 sadb_alg_maxbits; ++ __u16 sadb_alg_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_alg) == 8 */ + + struct sadb_spirange { +- uint16_t sadb_spirange_len; +- uint16_t sadb_spirange_exttype; +- uint32_t sadb_spirange_min; +- uint32_t sadb_spirange_max; +- uint32_t sadb_spirange_reserved; ++ __u16 sadb_spirange_len; ++ __u16 sadb_spirange_exttype; ++ __u32 sadb_spirange_min; ++ __u32 sadb_spirange_max; ++ __u32 sadb_spirange_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_spirange) == 16 */ + + struct sadb_x_kmprivate { +- uint16_t sadb_x_kmprivate_len; +- uint16_t sadb_x_kmprivate_exttype; +- uint32_t sadb_x_kmprivate_reserved; ++ __u16 sadb_x_kmprivate_len; ++ __u16 sadb_x_kmprivate_exttype; ++ __u32 sadb_x_kmprivate_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_x_kmprivate) == 8 */ + + struct sadb_x_sa2 { +- uint16_t sadb_x_sa2_len; +- uint16_t sadb_x_sa2_exttype; +- uint8_t sadb_x_sa2_mode; +- uint8_t sadb_x_sa2_reserved1; +- uint16_t sadb_x_sa2_reserved2; +- uint32_t sadb_x_sa2_sequence; +- uint32_t sadb_x_sa2_reqid; ++ __u16 sadb_x_sa2_len; ++ __u16 sadb_x_sa2_exttype; ++ __u8 sadb_x_sa2_mode; ++ __u8 sadb_x_sa2_reserved1; ++ __u16 sadb_x_sa2_reserved2; ++ __u32 sadb_x_sa2_sequence; ++ __u32 sadb_x_sa2_reqid; + } __attribute__((packed)); + /* sizeof(struct sadb_x_sa2) == 16 */ + + struct sadb_x_policy { +- uint16_t sadb_x_policy_len; +- uint16_t sadb_x_policy_exttype; +- uint16_t 
sadb_x_policy_type; +- uint8_t sadb_x_policy_dir; +- uint8_t sadb_x_policy_reserved; +- uint32_t sadb_x_policy_id; +- uint32_t sadb_x_policy_priority; ++ __u16 sadb_x_policy_len; ++ __u16 sadb_x_policy_exttype; ++ __u16 sadb_x_policy_type; ++ __u8 sadb_x_policy_dir; ++ __u8 sadb_x_policy_reserved; ++ __u32 sadb_x_policy_id; ++ __u32 sadb_x_policy_priority; + } __attribute__((packed)); + /* sizeof(struct sadb_x_policy) == 16 */ + + struct sadb_x_ipsecrequest { +- uint16_t sadb_x_ipsecrequest_len; +- uint16_t sadb_x_ipsecrequest_proto; +- uint8_t sadb_x_ipsecrequest_mode; +- uint8_t sadb_x_ipsecrequest_level; +- uint16_t sadb_x_ipsecrequest_reserved1; +- uint32_t sadb_x_ipsecrequest_reqid; +- uint32_t sadb_x_ipsecrequest_reserved2; ++ __u16 sadb_x_ipsecrequest_len; ++ __u16 sadb_x_ipsecrequest_proto; ++ __u8 sadb_x_ipsecrequest_mode; ++ __u8 sadb_x_ipsecrequest_level; ++ __u16 sadb_x_ipsecrequest_reserved1; ++ __u32 sadb_x_ipsecrequest_reqid; ++ __u32 sadb_x_ipsecrequest_reserved2; + } __attribute__((packed)); + /* sizeof(struct sadb_x_ipsecrequest) == 16 */ + +@@ -200,38 +200,38 @@ struct sadb_x_ipsecrequest { + * type of NAT-T is supported, draft-ietf-ipsec-udp-encaps-06 + */ + struct sadb_x_nat_t_type { +- uint16_t sadb_x_nat_t_type_len; +- uint16_t sadb_x_nat_t_type_exttype; +- uint8_t sadb_x_nat_t_type_type; +- uint8_t sadb_x_nat_t_type_reserved[3]; ++ __u16 sadb_x_nat_t_type_len; ++ __u16 sadb_x_nat_t_type_exttype; ++ __u8 sadb_x_nat_t_type_type; ++ __u8 sadb_x_nat_t_type_reserved[3]; + } __attribute__((packed)); + /* sizeof(struct sadb_x_nat_t_type) == 8 */ + + /* Pass a NAT Traversal port (Source or Dest port) */ + struct sadb_x_nat_t_port { +- uint16_t sadb_x_nat_t_port_len; +- uint16_t sadb_x_nat_t_port_exttype; ++ __u16 sadb_x_nat_t_port_len; ++ __u16 sadb_x_nat_t_port_exttype; + __be16 sadb_x_nat_t_port_port; +- uint16_t sadb_x_nat_t_port_reserved; ++ __u16 sadb_x_nat_t_port_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_x_nat_t_port) == 8 */ + + /* Generic LSM security context */ + struct sadb_x_sec_ctx { +- uint16_t sadb_x_sec_len; +- uint16_t sadb_x_sec_exttype; +- uint8_t sadb_x_ctx_alg; /* LSMs: e.g., selinux == 1 */ +- uint8_t sadb_x_ctx_doi; +- uint16_t sadb_x_ctx_len; ++ __u16 sadb_x_sec_len; ++ __u16 sadb_x_sec_exttype; ++ __u8 sadb_x_ctx_alg; /* LSMs: e.g., selinux == 1 */ ++ __u8 sadb_x_ctx_doi; ++ __u16 sadb_x_ctx_len; + } __attribute__((packed)); + /* sizeof(struct sadb_sec_ctx) = 8 */ + + /* Used by MIGRATE to pass addresses IKE will use to perform + * negotiation with the peer */ + struct sadb_x_kmaddress { +- uint16_t sadb_x_kmaddress_len; +- uint16_t sadb_x_kmaddress_exttype; +- uint32_t sadb_x_kmaddress_reserved; ++ __u16 sadb_x_kmaddress_len; ++ __u16 sadb_x_kmaddress_exttype; ++ __u32 sadb_x_kmaddress_reserved; + } __attribute__((packed)); + /* sizeof(struct sadb_x_kmaddress) == 8 */ + +Index: linux-2.6-tip/include/linux/pipe_fs_i.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/pipe_fs_i.h ++++ linux-2.6-tip/include/linux/pipe_fs_i.h +@@ -1,9 +1,9 @@ + #ifndef _LINUX_PIPE_FS_I_H + #define _LINUX_PIPE_FS_I_H + +-#define PIPEFS_MAGIC 0x50495045 ++#define PIPEFS_MAGIC 0x50495045 + +-#define PIPE_BUFFERS (16) ++#define PIPE_BUFFERS 64 + + #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ + #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ +Index: linux-2.6-tip/include/linux/pkt_sched.h +=================================================================== +--- 
linux-2.6-tip.orig/include/linux/pkt_sched.h ++++ linux-2.6-tip/include/linux/pkt_sched.h +@@ -515,7 +515,7 @@ enum + + struct tc_drr_stats + { +- __u32 deficit; ++ u32 deficit; + }; + + #endif +Index: linux-2.6-tip/include/linux/plist.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/plist.h ++++ linux-2.6-tip/include/linux/plist.h +@@ -81,7 +81,7 @@ struct plist_head { + struct list_head prio_list; + struct list_head node_list; + #ifdef CONFIG_DEBUG_PI_LIST +- spinlock_t *lock; ++ raw_spinlock_t *lock; + #endif + }; + +@@ -96,16 +96,19 @@ struct plist_node { + # define PLIST_HEAD_LOCK_INIT(_lock) + #endif + ++#define _PLIST_HEAD_INIT(head) \ ++ .prio_list = LIST_HEAD_INIT((head).prio_list), \ ++ .node_list = LIST_HEAD_INIT((head).node_list) ++ + /** + * PLIST_HEAD_INIT - static struct plist_head initializer + * @head: struct plist_head variable name +- * @_lock: lock to initialize for this list ++ * @_lock: lock * to initialize for this list + */ + #define PLIST_HEAD_INIT(head, _lock) \ + { \ +- .prio_list = LIST_HEAD_INIT((head).prio_list), \ +- .node_list = LIST_HEAD_INIT((head).node_list), \ +- PLIST_HEAD_LOCK_INIT(&(_lock)) \ ++ _PLIST_HEAD_INIT(head), \ ++ PLIST_HEAD_LOCK_INIT(_lock) \ + } + + /** +@@ -116,7 +119,7 @@ struct plist_node { + #define PLIST_NODE_INIT(node, __prio) \ + { \ + .prio = (__prio), \ +- .plist = PLIST_HEAD_INIT((node).plist, NULL), \ ++ .plist = { _PLIST_HEAD_INIT((node).plist) }, \ + } + + /** +@@ -125,7 +128,7 @@ struct plist_node { + * @lock: list spinlock, remembered for debugging + */ + static inline void +-plist_head_init(struct plist_head *head, spinlock_t *lock) ++plist_head_init(struct plist_head *head, raw_spinlock_t *lock) + { + INIT_LIST_HEAD(&head->prio_list); + INIT_LIST_HEAD(&head->node_list); +Index: linux-2.6-tip/include/linux/poison.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/poison.h ++++ linux-2.6-tip/include/linux/poison.h +@@ -2,13 +2,25 @@ + #define _LINUX_POISON_H + + /********** include/linux/list.h **********/ ++ ++/* ++ * Architectures might want to move the poison pointer offset ++ * into some well-recognized area such as 0xdead000000000000, ++ * that is also not mappable by user-space exploits: ++ */ ++#ifdef CONFIG_ILLEGAL_POINTER_VALUE ++# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL) ++#else ++# define POISON_POINTER_DELTA 0 ++#endif ++ + /* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +-#define LIST_POISON1 ((void *) 0x00100100) +-#define LIST_POISON2 ((void *) 0x00200200) ++#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) ++#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) + + /********** include/linux/timer.h **********/ + /* +Index: linux-2.6-tip/include/linux/ppp_defs.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/ppp_defs.h ++++ linux-2.6-tip/include/linux/ppp_defs.h +@@ -177,8 +177,8 @@ struct ppp_comp_stats { + * the last NP packet was sent or received. 
+ */ + struct ppp_idle { +- time_t xmit_idle; /* time since last NP packet sent */ +- time_t recv_idle; /* time since last NP packet received */ ++ __kernel_time_t xmit_idle; /* time since last NP packet sent */ ++ __kernel_time_t recv_idle; /* time since last NP packet received */ + }; + + #endif /* _PPP_DEFS_H_ */ +Index: linux-2.6-tip/include/linux/prctl.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/prctl.h ++++ linux-2.6-tip/include/linux/prctl.h +@@ -85,4 +85,7 @@ + #define PR_SET_TIMERSLACK 29 + #define PR_GET_TIMERSLACK 30 + ++#define PR_TASK_PERF_COUNTERS_DISABLE 31 ++#define PR_TASK_PERF_COUNTERS_ENABLE 32 ++ + #endif /* _LINUX_PRCTL_H */ +Index: linux-2.6-tip/include/linux/rcuclassic.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/rcuclassic.h ++++ linux-2.6-tip/include/linux/rcuclassic.h +@@ -36,7 +36,6 @@ + #include + #include + #include +-#include + #include + #include + +@@ -108,25 +107,14 @@ struct rcu_data { + struct rcu_head barrier; + }; + +-DECLARE_PER_CPU(struct rcu_data, rcu_data); +-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); +- + /* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +-static inline void rcu_qsctr_inc(int cpu) +-{ +- struct rcu_data *rdp = &per_cpu(rcu_data, cpu); +- rdp->passed_quiesc = 1; +-} +-static inline void rcu_bh_qsctr_inc(int cpu) +-{ +- struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); +- rdp->passed_quiesc = 1; +-} ++extern void rcu_qsctr_inc(int cpu); ++extern void rcu_bh_qsctr_inc(int cpu); + + extern int rcu_pending(int cpu); + extern int rcu_needs_cpu(int cpu); +Index: linux-2.6-tip/include/linux/rcupdate.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/rcupdate.h ++++ linux-2.6-tip/include/linux/rcupdate.h +@@ -36,7 +36,6 @@ + #include + #include + #include +-#include + #include + #include + #include +Index: linux-2.6-tip/include/linux/rcupreempt.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/rcupreempt.h ++++ linux-2.6-tip/include/linux/rcupreempt.h +@@ -36,34 +36,19 @@ + #include + #include + #include +-#include ++#include + #include + #include + +-struct rcu_dyntick_sched { +- int dynticks; +- int dynticks_snap; +- int sched_qs; +- int sched_qs_snap; +- int sched_dynticks_snap; +-}; +- +-DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched); +- +-static inline void rcu_qsctr_inc(int cpu) +-{ +- struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); +- +- rdssp->sched_qs++; +-} +-#define rcu_bh_qsctr_inc(cpu) ++extern void rcu_qsctr_inc(int cpu); ++static inline void rcu_bh_qsctr_inc(int cpu) { } + + /* + * Someone might want to pass call_rcu_bh as a function pointer. + * So this needs to just be a rename and not a macro function. + * (no parentheses) + */ +-#define call_rcu_bh call_rcu ++#define call_rcu_bh call_rcu + + /** + * call_rcu_sched - Queue RCU callback for invocation after sched grace period. 
+@@ -117,30 +102,12 @@ extern struct rcupreempt_trace *rcupreem + struct softirq_action; + + #ifdef CONFIG_NO_HZ +- +-static inline void rcu_enter_nohz(void) +-{ +- static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); +- +- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ +- __get_cpu_var(rcu_dyntick_sched).dynticks++; +- WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); +-} +- +-static inline void rcu_exit_nohz(void) +-{ +- static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); +- +- __get_cpu_var(rcu_dyntick_sched).dynticks++; +- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ +- WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), +- &rs); +-} +- +-#else /* CONFIG_NO_HZ */ +-#define rcu_enter_nohz() do { } while (0) +-#define rcu_exit_nohz() do { } while (0) +-#endif /* CONFIG_NO_HZ */ ++extern void rcu_enter_nohz(void); ++extern void rcu_exit_nohz(void); ++#else ++# define rcu_enter_nohz() do { } while (0) ++# define rcu_exit_nohz() do { } while (0) ++#endif + + /* + * A context switch is a grace period for rcupreempt synchronize_rcu() +Index: linux-2.6-tip/include/linux/rcutree.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/rcutree.h ++++ linux-2.6-tip/include/linux/rcutree.h +@@ -33,7 +33,6 @@ + #include + #include + #include +-#include + #include + #include + +@@ -236,30 +235,8 @@ struct rcu_state { + #endif /* #ifdef CONFIG_NO_HZ */ + }; + +-extern struct rcu_state rcu_state; +-DECLARE_PER_CPU(struct rcu_data, rcu_data); +- +-extern struct rcu_state rcu_bh_state; +-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); +- +-/* +- * Increment the quiescent state counter. +- * The counter is a bit degenerated: We do not need to know +- * how many quiescent states passed, just if there was at least +- * one since the start of the grace period. Thus just a flag. +- */ +-static inline void rcu_qsctr_inc(int cpu) +-{ +- struct rcu_data *rdp = &per_cpu(rcu_data, cpu); +- rdp->passed_quiesc = 1; +- rdp->passed_quiesc_completed = rdp->completed; +-} +-static inline void rcu_bh_qsctr_inc(int cpu) +-{ +- struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); +- rdp->passed_quiesc = 1; +- rdp->passed_quiesc_completed = rdp->completed; +-} ++extern void rcu_qsctr_inc(int cpu); ++extern void rcu_bh_qsctr_inc(int cpu); + + extern int rcu_pending(int cpu); + extern int rcu_needs_cpu(int cpu); +Index: linux-2.6-tip/include/linux/reiserfs_fs.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/reiserfs_fs.h ++++ linux-2.6-tip/include/linux/reiserfs_fs.h +@@ -28,8 +28,6 @@ + #include + #endif + +-struct fid; +- + /* + * include/linux/reiser_fs.h + * +@@ -37,6 +35,33 @@ struct fid; + * + */ + ++/* ioctl's command */ ++#define REISERFS_IOC_UNPACK _IOW(0xCD,1,long) ++/* define following flags to be the same as in ext2, so that chattr(1), ++ lsattr(1) will work with us. 
*/ ++#define REISERFS_IOC_GETFLAGS FS_IOC_GETFLAGS ++#define REISERFS_IOC_SETFLAGS FS_IOC_SETFLAGS ++#define REISERFS_IOC_GETVERSION FS_IOC_GETVERSION ++#define REISERFS_IOC_SETVERSION FS_IOC_SETVERSION ++ ++#ifdef __KERNEL__ ++/* the 32 bit compat definitions with int argument */ ++#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int) ++#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS ++#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS ++#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION ++#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION ++ ++/* Locking primitives */ ++/* Right now we are still falling back to (un)lock_kernel, but eventually that ++ would evolve into real per-fs locks */ ++#define reiserfs_write_lock( sb ) lock_kernel() ++#define reiserfs_write_unlock( sb ) unlock_kernel() ++ ++/* xattr stuff */ ++#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem) ++struct fid; ++ + /* in reading the #defines, it may help to understand that they employ + the following abbreviations: + +@@ -698,6 +723,7 @@ static inline void cpu_key_k_offset_dec( + /* object identifier for root dir */ + #define REISERFS_ROOT_OBJECTID 2 + #define REISERFS_ROOT_PARENT_OBJECTID 1 ++ + extern struct reiserfs_key root_key; + + /* +@@ -1540,7 +1566,6 @@ struct reiserfs_iget_args { + /* FUNCTION DECLARATIONS */ + /***************************************************************************/ + +-/*#ifdef __KERNEL__*/ + #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) + + #define journal_trans_half(blocksize) \ +@@ -2178,29 +2203,6 @@ long reiserfs_compat_ioctl(struct file * + unsigned int cmd, unsigned long arg); + int reiserfs_unpack(struct inode *inode, struct file *filp); + +-/* ioctl's command */ +-#define REISERFS_IOC_UNPACK _IOW(0xCD,1,long) +-/* define following flags to be the same as in ext2, so that chattr(1), +- lsattr(1) will work with us. */ +-#define REISERFS_IOC_GETFLAGS FS_IOC_GETFLAGS +-#define REISERFS_IOC_SETFLAGS FS_IOC_SETFLAGS +-#define REISERFS_IOC_GETVERSION FS_IOC_GETVERSION +-#define REISERFS_IOC_SETVERSION FS_IOC_SETVERSION +- +-/* the 32 bit compat definitions with int argument */ +-#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int) +-#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +-#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +-#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION +-#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION +- +-/* Locking primitives */ +-/* Right now we are still falling back to (un)lock_kernel, but eventually that +- would evolve into real per-fs locks */ +-#define reiserfs_write_lock( sb ) lock_kernel() +-#define reiserfs_write_unlock( sb ) unlock_kernel() +- +-/* xattr stuff */ +-#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem) + ++#endif /* __KERNEL__ */ + #endif /* _LINUX_REISER_FS_H */ +Index: linux-2.6-tip/include/linux/ring_buffer.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/ring_buffer.h ++++ linux-2.6-tip/include/linux/ring_buffer.h +@@ -1,6 +1,7 @@ + #ifndef _LINUX_RING_BUFFER_H + #define _LINUX_RING_BUFFER_H + ++#include + #include + #include + +@@ -8,20 +9,26 @@ struct ring_buffer; + struct ring_buffer_iter; + + /* +- * Don't reference this struct directly, use functions below. ++ * Don't refer to this struct directly, use functions below. 
+ */ + struct ring_buffer_event { +- u32 type:2, len:3, time_delta:27; ++ kmemcheck_define_bitfield(bitfield, { ++ u32 type:2, len:3, time_delta:27; ++ }); ++ + u32 array[]; + }; + + /** + * enum ring_buffer_type - internal ring buffer types + * +- * @RINGBUF_TYPE_PADDING: Left over page padding +- * array is ignored +- * size is variable depending on how much ++ * @RINGBUF_TYPE_PADDING: Left over page padding or discarded event ++ * If time_delta is 0: ++ * array is ignored ++ * size is variable depending on how much + * padding is needed ++ * If time_delta is non zero: ++ * everything else same as RINGBUF_TYPE_DATA + * + * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta + * array[0] = time delta (28 .. 59) +@@ -65,6 +72,8 @@ ring_buffer_event_time_delta(struct ring + return event->time_delta; + } + ++void ring_buffer_event_discard(struct ring_buffer_event *event); ++ + /* + * size is in bytes for each per CPU buffer. + */ +@@ -74,13 +83,10 @@ void ring_buffer_free(struct ring_buffer + + int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); + +-struct ring_buffer_event * +-ring_buffer_lock_reserve(struct ring_buffer *buffer, +- unsigned long length, +- unsigned long *flags); ++struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer, ++ unsigned long length); + int ring_buffer_unlock_commit(struct ring_buffer *buffer, +- struct ring_buffer_event *event, +- unsigned long flags); ++ struct ring_buffer_event *event); + int ring_buffer_write(struct ring_buffer *buffer, + unsigned long length, void *data); + +@@ -121,17 +127,19 @@ unsigned long ring_buffer_overruns(struc + unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); + unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); + +-u64 ring_buffer_time_stamp(int cpu); +-void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); ++u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); ++void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, ++ int cpu, u64 *ts); ++void ring_buffer_set_clock(struct ring_buffer *buffer, ++ u64 (*clock)(void)); ++ ++size_t ring_buffer_page_len(void *page); + +-void tracing_on(void); +-void tracing_off(void); +-void tracing_off_permanent(void); + + void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); + void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +-int ring_buffer_read_page(struct ring_buffer *buffer, +- void **data_page, int cpu, int full); ++int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, ++ size_t len, int cpu, int full); + + enum ring_buffer_flags { + RB_FL_OVERWRITE = 1 << 0, +Index: linux-2.6-tip/include/linux/sched.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/sched.h ++++ linux-2.6-tip/include/linux/sched.h +@@ -71,6 +71,7 @@ struct sched_param { + #include + #include + #include ++#include + #include + #include + #include +@@ -91,6 +92,28 @@ struct sched_param { + + #include + ++#ifdef CONFIG_PREEMPT ++extern int kernel_preemption; ++#else ++# define kernel_preemption 0 ++#endif ++#ifdef CONFIG_PREEMPT_VOLUNTARY ++extern int voluntary_preemption; ++#else ++# define voluntary_preemption 0 ++#endif ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++extern int softirq_preemption; ++#else ++# define softirq_preemption 0 ++#endif ++ ++#ifdef CONFIG_PREEMPT_HARDIRQS ++extern int hardirq_preemption; ++#else ++# define hardirq_preemption 0 ++#endif ++ + struct mem_cgroup; + struct exec_domain; + struct 
futex_pi_state; +@@ -115,6 +138,7 @@ struct bts_tracer; + * 11 bit fractions. + */ + extern unsigned long avenrun[]; /* Load averages */ ++extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); + + #define FSHIFT 11 /* nr of bits of precision */ + #define FIXED_1 (1<exit_state */ +-#define EXIT_ZOMBIE 16 +-#define EXIT_DEAD 32 ++#define EXIT_ZOMBIE 32 ++#define EXIT_DEAD 64 + /* in tsk->state again */ +-#define TASK_DEAD 64 +-#define TASK_WAKEKILL 128 ++#define TASK_DEAD 128 ++#define TASK_WAKEKILL 256 + + /* Convenience macros for the sake of set_task_state */ + #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) +@@ -193,7 +222,8 @@ extern unsigned long long time_sync_thre + #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) + + /* get_task_state() */ +-#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ ++#define TASK_REPORT (TASK_RUNNING | TASK_RUNNING_MUTEX | \ ++ TASK_INTERRUPTIBLE | \ + TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ + __TASK_TRACED) + +@@ -210,6 +240,28 @@ extern unsigned long long time_sync_thre + #define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) + ++// #define PREEMPT_DIRECT ++ ++#ifdef CONFIG_X86_LOCAL_APIC ++extern void nmi_show_all_regs(void); ++#else ++# define nmi_show_all_regs() do { } while (0) ++#endif ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct exec_domain; ++ + /* + * set_current_state() includes a barrier so that the write of current->state + * is correctly serialised wrt the caller's subsequent test of whether to +@@ -290,6 +342,12 @@ extern void scheduler_tick(void); + + extern void sched_show_task(struct task_struct *p); + ++#ifdef CONFIG_GENERIC_HARDIRQS ++extern int debug_direct_keyboard; ++#else ++# define debug_direct_keyboard 0 ++#endif ++ + #ifdef CONFIG_DETECT_SOFTLOCKUP + extern void softlockup_tick(void); + extern void touch_softlockup_watchdog(void); +@@ -298,17 +356,11 @@ extern int proc_dosoftlockup_thresh(stru + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos); + extern unsigned int softlockup_panic; +-extern unsigned long sysctl_hung_task_check_count; +-extern unsigned long sysctl_hung_task_timeout_secs; +-extern unsigned long sysctl_hung_task_warnings; + extern int softlockup_thresh; + #else + static inline void softlockup_tick(void) + { + } +-static inline void spawn_softlockup_task(void) +-{ +-} + static inline void touch_softlockup_watchdog(void) + { + } +@@ -317,6 +369,15 @@ static inline void touch_all_softlockup_ + } + #endif + ++#ifdef CONFIG_DETECT_HUNG_TASK ++extern unsigned int sysctl_hung_task_panic; ++extern unsigned long sysctl_hung_task_check_count; ++extern unsigned long sysctl_hung_task_timeout_secs; ++extern unsigned long sysctl_hung_task_warnings; ++extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, ++ struct file *filp, void __user *buffer, ++ size_t *lenp, loff_t *ppos); ++#endif + + /* Attach to any functions which should be ignored in wchan output. 
*/ + #define __sched __attribute__((__section__(".sched.text"))) +@@ -332,7 +393,14 @@ extern signed long schedule_timeout(sign + extern signed long schedule_timeout_interruptible(signed long timeout); + extern signed long schedule_timeout_killable(signed long timeout); + extern signed long schedule_timeout_uninterruptible(signed long timeout); ++asmlinkage void __schedule(void); + asmlinkage void schedule(void); ++extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); ++/* ++ * This one can be called with interrupts disabled, only ++ * to be used by lowlevel arch code! ++ */ ++asmlinkage void __sched __schedule(void); + + struct nsproxy; + struct user_namespace; +@@ -480,7 +548,7 @@ struct task_cputime { + struct thread_group_cputimer { + struct task_cputime cputime; + int running; +- spinlock_t lock; ++ raw_spinlock_t lock; + }; + + /* +@@ -999,6 +1067,7 @@ struct sched_class { + struct rq *busiest, struct sched_domain *sd, + enum cpu_idle_type idle); + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); ++ int (*needs_post_schedule) (struct rq *this_rq); + void (*post_schedule) (struct rq *this_rq); + void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); + +@@ -1053,6 +1122,11 @@ struct sched_entity { + u64 last_wakeup; + u64 avg_overlap; + ++ u64 nr_migrations; ++ ++ u64 start_runtime; ++ u64 avg_wakeup; ++ + #ifdef CONFIG_SCHEDSTATS + u64 wait_start; + u64 wait_max; +@@ -1068,7 +1142,6 @@ struct sched_entity { + u64 exec_max; + u64 slice_max; + +- u64 nr_migrations; + u64 nr_migrations_cold; + u64 nr_failed_migrations_affine; + u64 nr_failed_migrations_running; +@@ -1122,10 +1195,8 @@ struct task_struct { + int lock_depth; /* BKL lock depth */ + + #ifdef CONFIG_SMP +-#ifdef __ARCH_WANT_UNLOCKED_CTXSW + int oncpu; + #endif +-#endif + + int prio, static_prio, normal_prio; + unsigned int rt_priority; +@@ -1165,6 +1236,7 @@ struct task_struct { + #endif + + struct list_head tasks; ++ struct plist_node pushable_tasks; + + struct mm_struct *mm, *active_mm; + +@@ -1179,10 +1251,9 @@ struct task_struct { + pid_t pid; + pid_t tgid; + +-#ifdef CONFIG_CC_STACKPROTECTOR + /* Canary value for the -fstack-protector gcc feature */ + unsigned long stack_canary; +-#endif ++ + /* + * pointers to (original) parent process, youngest child, younger sibling, + * older sibling, respectively. 
(p->father can be replaced with +@@ -1238,6 +1309,8 @@ struct task_struct { + struct task_cputime cputime_expires; + struct list_head cpu_timers[3]; + ++ struct task_struct* posix_timer_list; ++ + /* process credentials */ + const struct cred *real_cred; /* objective and real subjective task + * credentials (COW) */ +@@ -1255,9 +1328,8 @@ struct task_struct { + /* ipc stuff */ + struct sysv_sem sysvsem; + #endif +-#ifdef CONFIG_DETECT_SOFTLOCKUP ++#ifdef CONFIG_DETECT_HUNG_TASK + /* hung task detection */ +- unsigned long last_switch_timestamp; + unsigned long last_switch_count; + #endif + /* CPU-specific state of this task */ +@@ -1271,6 +1343,7 @@ struct task_struct { + /* signal handlers */ + struct signal_struct *signal; + struct sighand_struct *sighand; ++ struct sigqueue *sigqueue_cache; + + sigset_t blocked, real_blocked; + sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ +@@ -1295,7 +1368,7 @@ struct task_struct { + spinlock_t alloc_lock; + + /* Protection of the PI data structures: */ +- spinlock_t pi_lock; ++ raw_spinlock_t pi_lock; + + #ifdef CONFIG_RT_MUTEXES + /* PI waiters blocked on a rt_mutex held by this task */ +@@ -1308,6 +1381,7 @@ struct task_struct { + /* mutex deadlock detection */ + struct mutex_waiter *blocked_on; + #endif ++ int pagefault_disabled; + #ifdef CONFIG_TRACE_IRQFLAGS + unsigned int irq_events; + int hardirqs_enabled; +@@ -1329,6 +1403,27 @@ struct task_struct { + int lockdep_depth; + unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; ++ gfp_t lockdep_reclaim_gfp; ++#endif ++ ++/* realtime bits */ ++ ++#define MAX_PREEMPT_TRACE 25 ++#define MAX_LOCK_STACK MAX_PREEMPT_TRACE ++#ifdef CONFIG_DEBUG_PREEMPT ++ atomic_t lock_count; ++# ifdef CONFIG_PREEMPT_RT ++ struct rt_mutex *owned_lock[MAX_LOCK_STACK]; ++# endif ++#endif ++#ifdef CONFIG_DETECT_SOFTLOCKUP ++ unsigned long softlockup_count; /* Count to keep track how long the ++ * thread is in the kernel without ++ * sleeping. ++ */ ++#endif ++#ifdef CONFIG_DEBUG_RT_MUTEXES ++ void *last_kernel_lock; + #endif + + /* journalling filesystem info */ +@@ -1370,7 +1465,9 @@ struct task_struct { + #endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; ++ struct task_struct *futex_wakeup; + #endif ++ struct perf_counter_context perf_counter_ctx; + #ifdef CONFIG_NUMA + struct mempolicy *mempolicy; + short il_next; +@@ -1406,6 +1503,8 @@ struct task_struct { + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack *ret_stack; ++ /* time stamp for last schedule */ ++ unsigned long long ftrace_timestamp; + /* + * Number of functions that haven't been traced + * because of depth overrun. +@@ -1418,11 +1517,24 @@ struct task_struct { + /* state flags for use by tracers */ + unsigned long trace; + #endif ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * Temporary hack, until we find a solution to ++ * handle printk in atomic operations. ++ */ ++ int in_printk; ++#endif + }; + + /* Future-safe accessor for struct task_struct's cpus_allowed. 
*/ + #define tsk_cpumask(tsk) (&(tsk)->cpus_allowed) + ++#ifdef CONFIG_PREEMPT_RT ++# define set_printk_might_sleep(x) do { current->in_printk = x; } while(0) ++#else ++# define set_printk_might_sleep(x) do { } while(0) ++#endif ++ + /* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH +@@ -1587,6 +1699,15 @@ extern struct pid *cad_pid; + extern void free_task(struct task_struct *tsk); + #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) + ++#ifdef CONFIG_PREEMPT_RT ++extern void __put_task_struct_cb(struct rcu_head *rhp); ++ ++static inline void put_task_struct(struct task_struct *t) ++{ ++ if (atomic_dec_and_test(&t->usage)) ++ call_rcu(&t->rcu, __put_task_struct_cb); ++} ++#else + extern void __put_task_struct(struct task_struct *t); + + static inline void put_task_struct(struct task_struct *t) +@@ -1594,6 +1715,7 @@ static inline void put_task_struct(struc + if (atomic_dec_and_test(&t->usage)) + __put_task_struct(t); + } ++#endif + + extern cputime_t task_utime(struct task_struct *p); + extern cputime_t task_stime(struct task_struct *p); +@@ -1608,13 +1730,16 @@ extern cputime_t task_gtime(struct task_ + #define PF_EXITING 0x00000004 /* getting shut down */ + #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ + #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ ++#define PF_NOSCHED 0x00000020 /* Userspace does not expect scheduling */ + #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ ++#define PF_HARDIRQ 0x00000080 /* hardirq context */ + #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ + #define PF_DUMPCORE 0x00000200 /* dumped core */ + #define PF_SIGNALED 0x00000400 /* killed by a signal */ + #define PF_MEMALLOC 0x00000800 /* Allocating memory */ + #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ + #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ ++#define PF_KMAP 0x00004000 /* this context has a kmap */ + #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ + #define PF_FROZEN 0x00010000 /* frozen for system suspend */ + #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ +@@ -1627,6 +1752,7 @@ extern cputime_t task_gtime(struct task_ + #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ + #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ + #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ ++#define PF_SOFTIRQ 0x08000000 /* softirq context */ + #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ + #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ + #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ +@@ -1674,6 +1800,16 @@ static inline int set_cpus_allowed(struc + return set_cpus_allowed_ptr(p, &new_mask); + } + ++/* ++ * Architectures can set this to 1 if they have specified ++ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, ++ * but then during bootup it turns out that sched_clock() ++ * is reliable after all: ++ */ ++#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK ++extern int sched_clock_stable; ++#endif ++ + extern unsigned long long sched_clock(void); + + extern void sched_clock_init(void); +@@ -1755,9 +1891,14 @@ int sched_rt_handler(struct ctl_table *t + + extern unsigned int sysctl_sched_compat_yield; + ++extern void task_setprio(struct task_struct *p, int prio); ++ + #ifdef CONFIG_RT_MUTEXES + extern int 
rt_mutex_getprio(struct task_struct *p); +-extern void rt_mutex_setprio(struct task_struct *p, int prio); ++static inline void rt_mutex_setprio(struct task_struct *p, int prio) ++{ ++ task_setprio(p, prio); ++} + extern void rt_mutex_adjust_pi(struct task_struct *p); + #else + static inline int rt_mutex_getprio(struct task_struct *p) +@@ -1781,6 +1922,7 @@ extern struct task_struct *curr_task(int + extern void set_curr_task(int cpu, struct task_struct *p); + + void yield(void); ++void __yield(void); + + /* + * The default (Linux) execution domain. +@@ -1848,6 +1990,9 @@ extern void do_timer(unsigned long ticks + + extern int wake_up_state(struct task_struct *tsk, unsigned int state); + extern int wake_up_process(struct task_struct *tsk); ++extern int wake_up_process_mutex(struct task_struct * tsk); ++extern int wake_up_process_sync(struct task_struct * tsk); ++extern int wake_up_process_mutex_sync(struct task_struct * tsk); + extern void wake_up_new_task(struct task_struct *tsk, + unsigned long clone_flags); + #ifdef CONFIG_SMP +@@ -1935,12 +2080,20 @@ extern struct mm_struct * mm_alloc(void) + + /* mmdrop drops the mm and the page tables */ + extern void __mmdrop(struct mm_struct *); ++extern void __mmdrop_delayed(struct mm_struct *); ++ + static inline void mmdrop(struct mm_struct * mm) + { + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); + } + ++static inline void mmdrop_delayed(struct mm_struct * mm) ++{ ++ if (atomic_dec_and_test(&mm->mm_count)) ++ __mmdrop_delayed(mm); ++} ++ + /* mmput gets rid of the mappings and all user-space */ + extern void mmput(struct mm_struct *); + /* Grab a reference to a task's mm, if it is not already going away */ +@@ -2091,6 +2244,19 @@ static inline int object_is_on_stack(voi + + extern void thread_info_cache_init(void); + ++#ifdef CONFIG_DEBUG_STACK_USAGE ++static inline unsigned long stack_not_used(struct task_struct *p) ++{ ++ unsigned long *n = end_of_stack(p); ++ ++ do { /* Skip over canary */ ++ n++; ++ } while (!*n); ++ ++ return (unsigned long)n - (unsigned long)end_of_stack(p); ++} ++#endif ++ + /* set thread flags in other task's structures + * - see asm/thread_info.h for TIF_xxxx flags available + */ +@@ -2180,19 +2346,27 @@ static inline int cond_resched(void) + return _cond_resched(); + } + #endif +-extern int cond_resched_lock(spinlock_t * lock); ++extern int __cond_resched_raw_spinlock(raw_spinlock_t *lock); ++extern int __cond_resched_spinlock(spinlock_t *spinlock); ++ ++#define cond_resched_lock(lock) \ ++ PICK_SPIN_OP_RET(__cond_resched_raw_spinlock, __cond_resched_spinlock,\ ++ lock) ++ + extern int cond_resched_softirq(void); + static inline int cond_resched_bkl(void) + { + return _cond_resched(); + } ++extern int cond_resched_softirq_context(void); ++extern int cond_resched_hardirq_context(void); + + /* + * Does a critical section need to be broken due to another + * task waiting?: (technically does not depend on CONFIG_PREEMPT, + * but a general need for low latency) + */ +-static inline int spin_needbreak(spinlock_t *lock) ++static inline int __raw_spin_needbreak(raw_spinlock_t *lock) + { + #ifdef CONFIG_PREEMPT + return spin_is_contended(lock); +@@ -2218,6 +2392,40 @@ static inline void thread_group_cputime_ + { + } + ++#ifdef CONFIG_PREEMPT_RT ++static inline int __spin_needbreak(spinlock_t *lock) ++{ ++ struct task_struct *tsk = current; ++ ++ /* break if we are priority boosted */ ++ return tsk->prio < tsk->normal_prio; ++} ++#else ++static inline int __spin_needbreak(spinlock_t *lock) ++{ ++ /* should 
never be call outside of RT */ ++ BUG(); ++ return 0; ++} ++#endif ++ ++#define spin_needbreak(lock) \ ++ PICK_SPIN_OP_RET(__raw_spin_needbreak, __spin_needbreak, lock) ++ ++static inline int softirq_need_resched(void) ++{ ++ if (softirq_preemption && (current->flags & PF_SOFTIRQ)) ++ return need_resched(); ++ return 0; ++} ++ ++static inline int hardirq_need_resched(void) ++{ ++ if (hardirq_preemption && (current->flags & PF_HARDIRQ)) ++ return need_resched(); ++ return 0; ++} ++ + /* + * Reevaluate whether the task has signals pending delivery. + * Wake the task if so. +@@ -2344,6 +2552,13 @@ static inline void inc_syscw(struct task + #define TASK_SIZE_OF(tsk) TASK_SIZE + #endif + ++/* ++ * Call the function if the target task is executing on a CPU right now: ++ */ ++extern void task_oncpu_function_call(struct task_struct *p, ++ void (*func) (void *info), void *info); ++ ++ + #ifdef CONFIG_MM_OWNER + extern void mm_update_next_owner(struct mm_struct *mm); + extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); +@@ -2357,7 +2572,14 @@ static inline void mm_init_owner(struct + } + #endif /* CONFIG_MM_OWNER */ + +-#define TASK_STATE_TO_CHAR_STR "RSDTtZX" ++#define TASK_STATE_TO_CHAR_STR "RMSDTtZX" ++ ++#ifdef CONFIG_SMP ++static inline int task_is_current(struct task_struct *task) ++{ ++ return task->oncpu; ++} ++#endif + + #endif /* __KERNEL__ */ + +Index: linux-2.6-tip/include/linux/security.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/security.h ++++ linux-2.6-tip/include/linux/security.h +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include + + /* Maximum number of letters for an LSM name string */ +@@ -2966,5 +2967,28 @@ static inline void securityfs_remove(str + + #endif + ++#ifdef CONFIG_SECURITY ++ ++static inline char *alloc_secdata(void) ++{ ++ return (char *)get_zeroed_page(GFP_KERNEL); ++} ++ ++static inline void free_secdata(void *secdata) ++{ ++ free_page((unsigned long)secdata); ++} ++ ++#else ++ ++static inline char *alloc_secdata(void) ++{ ++ return (char *)1; ++} ++ ++static inline void free_secdata(void *secdata) ++{ } ++#endif /* CONFIG_SECURITY */ ++ + #endif /* ! __LINUX_SECURITY_H */ + +Index: linux-2.6-tip/include/linux/selinux_netlink.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/selinux_netlink.h ++++ linux-2.6-tip/include/linux/selinux_netlink.h +@@ -12,6 +12,8 @@ + #ifndef _LINUX_SELINUX_NETLINK_H + #define _LINUX_SELINUX_NETLINK_H + ++#include ++ + /* Message types. */ + #define SELNL_MSG_BASE 0x10 + enum { +@@ -38,11 +40,11 @@ enum selinux_nlgroups { + + /* Message structures */ + struct selnl_msg_setenforce { +- int32_t val; ++ __s32 val; + }; + + struct selnl_msg_policyload { +- u_int32_t seqno; ++ __u32 seqno; + }; + + #endif /* _LINUX_SELINUX_NETLINK_H */ +Index: linux-2.6-tip/include/linux/signal.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/signal.h ++++ linux-2.6-tip/include/linux/signal.h +@@ -225,6 +225,7 @@ static inline void init_sigpending(struc + } + + extern void flush_sigqueue(struct sigpending *queue); ++extern void flush_task_sigqueue(struct task_struct *tsk); + + /* Test if 'sig' is valid signal. 
Use this instead of testing _NSIG directly */ + static inline int valid_signal(unsigned long sig) +@@ -235,6 +236,8 @@ static inline int valid_signal(unsigned + extern int next_signal(struct sigpending *pending, sigset_t *mask); + extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); + extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); ++extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, ++ siginfo_t *info); + extern long do_sigpending(void __user *, unsigned long); + extern int sigprocmask(int, sigset_t *, sigset_t *); + extern int show_unhandled_signals; +Index: linux-2.6-tip/include/linux/skbuff.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/skbuff.h ++++ linux-2.6-tip/include/linux/skbuff.h +@@ -15,6 +15,7 @@ + #define _LINUX_SKBUFF_H + + #include ++#include + #include + #include + #include +@@ -100,6 +101,9 @@ struct pipe_inode_info; + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + struct nf_conntrack { + atomic_t use; ++#ifdef CONFIG_PREEMPT_RT ++ struct rcu_head rcu; ++#endif + }; + #endif + +@@ -295,16 +299,18 @@ struct sk_buff { + }; + }; + __u32 priority; +- __u8 local_df:1, +- cloned:1, +- ip_summed:2, +- nohdr:1, +- nfctinfo:3; +- __u8 pkt_type:3, +- fclone:2, +- ipvs_property:1, +- peeked:1, +- nf_trace:1; ++ kmemcheck_define_bitfield(flags1, { ++ __u8 local_df:1, ++ cloned:1, ++ ip_summed:2, ++ nohdr:1, ++ nfctinfo:3; ++ __u8 pkt_type:3, ++ fclone:2, ++ ipvs_property:1, ++ peeked:1, ++ nf_trace:1; ++ }); + __be16 protocol; + + void (*destructor)(struct sk_buff *skb); +@@ -324,13 +330,17 @@ struct sk_buff { + __u16 tc_verd; /* traffic control verdict */ + #endif + #endif ++ ++ kmemcheck_define_bitfield(flags2, { + #ifdef CONFIG_IPV6_NDISC_NODETYPE +- __u8 ndisc_nodetype:2; ++ __u8 ndisc_nodetype:2; + #endif + #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) +- __u8 do_not_encrypt:1; +- __u8 requeue:1; ++ __u8 do_not_encrypt:1; ++ __u8 requeue:1; + #endif ++ }); ++ + /* 0/13/14 bit hole */ + + #ifdef CONFIG_NET_DMA +Index: linux-2.6-tip/include/linux/slab.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/slab.h ++++ linux-2.6-tip/include/linux/slab.h +@@ -62,6 +62,13 @@ + # define SLAB_DEBUG_OBJECTS 0x00000000UL + #endif + ++/* Don't track use of uninitialized memory */ ++#ifdef CONFIG_KMEMCHECK ++# define SLAB_NOTRACK 0x00800000UL ++#else ++# define SLAB_NOTRACK 0x00000000UL ++#endif ++ + /* The following flags affect the page allocator grouping pages by mobility */ + #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ + #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ +Index: linux-2.6-tip/include/linux/slab_def.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/slab_def.h ++++ linux-2.6-tip/include/linux/slab_def.h +@@ -14,6 +14,88 @@ + #include /* kmalloc_sizes.h needs PAGE_SIZE */ + #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ + #include ++#include ++ ++/* ++ * struct kmem_cache ++ * ++ * manages a cache. ++ */ ++ ++struct kmem_cache { ++/* 1) per-cpu data, touched during every alloc/free */ ++ struct array_cache *array[NR_CPUS]; ++/* 2) Cache tunables. 
Protected by cache_chain_mutex */ ++ unsigned int batchcount; ++ unsigned int limit; ++ unsigned int shared; ++ ++ unsigned int buffer_size; ++ u32 reciprocal_buffer_size; ++/* 3) touched by every alloc & free from the backend */ ++ ++ unsigned int flags; /* constant flags */ ++ unsigned int num; /* # of objs per slab */ ++ ++/* 4) cache_grow/shrink */ ++ /* order of pgs per slab (2^n) */ ++ unsigned int gfporder; ++ ++ /* force GFP flags, e.g. GFP_DMA */ ++ gfp_t gfpflags; ++ ++ size_t colour; /* cache colouring range */ ++ unsigned int colour_off; /* colour offset */ ++ struct kmem_cache *slabp_cache; ++ unsigned int slab_size; ++ unsigned int dflags; /* dynamic flags */ ++ ++ /* constructor func */ ++ void (*ctor)(void *obj); ++ ++/* 5) cache creation/removal */ ++ const char *name; ++ struct list_head next; ++ ++/* 6) statistics */ ++#ifdef CONFIG_DEBUG_SLAB ++ unsigned long num_active; ++ unsigned long num_allocations; ++ unsigned long high_mark; ++ unsigned long grown; ++ unsigned long reaped; ++ unsigned long errors; ++ unsigned long max_freeable; ++ unsigned long node_allocs; ++ unsigned long node_frees; ++ unsigned long node_overflow; ++ atomic_t allochit; ++ atomic_t allocmiss; ++ atomic_t freehit; ++ atomic_t freemiss; ++ ++ /* ++ * If debugging is enabled, then the allocator can add additional ++ * fields and/or padding to every object. buffer_size contains the total ++ * object size including these internal fields, the following two ++ * variables contain the offset to the user object and its size. ++ */ ++ int obj_offset; ++ int obj_size; ++#endif /* CONFIG_DEBUG_SLAB */ ++ ++ /* ++ * We put nodelists[] at the end of kmem_cache, because we want to size ++ * this array to nr_node_ids slots instead of MAX_NUMNODES ++ * (see kmem_cache_init()) ++ * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache ++ * is statically defined, so we reserve the max number of nodes. ++ */ ++ struct kmem_list3 *nodelists[MAX_NUMNODES]; ++ /* ++ * Do not add fields after nodelists[] ++ */ ++}; + + /* Size description struct for general caches. 
*/ + struct cache_sizes { +@@ -28,8 +110,26 @@ extern struct cache_sizes malloc_sizes[] + void *kmem_cache_alloc(struct kmem_cache *, gfp_t); + void *__kmalloc(size_t size, gfp_t flags); + +-static inline void *kmalloc(size_t size, gfp_t flags) ++#ifdef CONFIG_KMEMTRACE ++extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags); ++extern size_t slab_buffer_size(struct kmem_cache *cachep); ++#else ++static __always_inline void * ++kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) ++{ ++ return kmem_cache_alloc(cachep, flags); ++} ++static inline size_t slab_buffer_size(struct kmem_cache *cachep) ++{ ++ return 0; ++} ++#endif ++ ++static __always_inline void *kmalloc(size_t size, gfp_t flags) + { ++ struct kmem_cache *cachep; ++ void *ret; ++ + if (__builtin_constant_p(size)) { + int i = 0; + +@@ -47,10 +147,17 @@ static inline void *kmalloc(size_t size, + found: + #ifdef CONFIG_ZONE_DMA + if (flags & GFP_DMA) +- return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, +- flags); ++ cachep = malloc_sizes[i].cs_dmacachep; ++ else + #endif +- return kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags); ++ cachep = malloc_sizes[i].cs_cachep; ++ ++ ret = kmem_cache_alloc_notrace(cachep, flags); ++ ++ trace_kmalloc(_THIS_IP_, ret, ++ size, slab_buffer_size(cachep), flags); ++ ++ return ret; + } + return __kmalloc(size, flags); + } +@@ -59,8 +166,25 @@ found: + extern void *__kmalloc_node(size_t size, gfp_t flags, int node); + extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); + +-static inline void *kmalloc_node(size_t size, gfp_t flags, int node) ++#ifdef CONFIG_KMEMTRACE ++extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, ++ gfp_t flags, ++ int nodeid); ++#else ++static __always_inline void * ++kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, ++ gfp_t flags, ++ int nodeid) ++{ ++ return kmem_cache_alloc_node(cachep, flags, nodeid); ++} ++#endif ++ ++static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) + { ++ struct kmem_cache *cachep; ++ void *ret; ++ + if (__builtin_constant_p(size)) { + int i = 0; + +@@ -78,11 +202,18 @@ static inline void *kmalloc_node(size_t + found: + #ifdef CONFIG_ZONE_DMA + if (flags & GFP_DMA) +- return kmem_cache_alloc_node(malloc_sizes[i].cs_dmacachep, +- flags, node); ++ cachep = malloc_sizes[i].cs_dmacachep; ++ else + #endif +- return kmem_cache_alloc_node(malloc_sizes[i].cs_cachep, +- flags, node); ++ cachep = malloc_sizes[i].cs_cachep; ++ ++ ret = kmem_cache_alloc_node_notrace(cachep, flags, node); ++ ++ trace_kmalloc_node(_THIS_IP_, ret, ++ size, slab_buffer_size(cachep), ++ flags, node); ++ ++ return ret; + } + return __kmalloc_node(size, flags, node); + } +Index: linux-2.6-tip/include/linux/slob_def.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/slob_def.h ++++ linux-2.6-tip/include/linux/slob_def.h +@@ -3,14 +3,15 @@ + + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); + +-static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) ++static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep, ++ gfp_t flags) + { + return kmem_cache_alloc_node(cachep, flags, -1); + } + + void *__kmalloc_node(size_t size, gfp_t flags, int node); + +-static inline void *kmalloc_node(size_t size, gfp_t flags, int node) ++static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) + { + return __kmalloc_node(size, flags, node); + } +@@ 
-23,12 +24,12 @@ static inline void *kmalloc_node(size_t + * kmalloc is the normal method of allocating memory + * in the kernel. + */ +-static inline void *kmalloc(size_t size, gfp_t flags) ++static __always_inline void *kmalloc(size_t size, gfp_t flags) + { + return __kmalloc_node(size, flags, -1); + } + +-static inline void *__kmalloc(size_t size, gfp_t flags) ++static __always_inline void *__kmalloc(size_t size, gfp_t flags) + { + return kmalloc(size, flags); + } +Index: linux-2.6-tip/include/linux/slub_def.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/slub_def.h ++++ linux-2.6-tip/include/linux/slub_def.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + + enum stat_item { + ALLOC_FASTPATH, /* Allocation from cpu slab */ +@@ -121,10 +122,23 @@ struct kmem_cache { + #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) + + /* ++ * Maximum kmalloc object size handled by SLUB. Larger object allocations ++ * are passed through to the page allocator. The page allocator "fastpath" ++ * is relatively slow so we need this value sufficiently high so that ++ * performance critical objects are allocated through the SLUB fastpath. ++ * ++ * This should be dropped to PAGE_SIZE / 2 once the page allocator ++ * "fastpath" becomes competitive with the slab allocator fastpaths. ++ */ ++#define SLUB_MAX_SIZE (PAGE_SIZE) ++ ++#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1) ++ ++/* + * We keep the general caches in an array of slab caches that are used for + * 2^x bytes of allocations. + */ +-extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; ++extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; + + /* + * Sorry that the following has to be that ugly but some versions of GCC +@@ -204,15 +218,32 @@ static __always_inline struct kmem_cache + void *kmem_cache_alloc(struct kmem_cache *, gfp_t); + void *__kmalloc(size_t size, gfp_t flags); + ++#ifdef CONFIG_KMEMTRACE ++extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags); ++#else ++static __always_inline void * ++kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) ++{ ++ return kmem_cache_alloc(s, gfpflags); ++} ++#endif ++ + static __always_inline void *kmalloc_large(size_t size, gfp_t flags) + { +- return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size)); ++ unsigned int order = get_order(size); ++ void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); ++ ++ trace_kmalloc(_THIS_IP_, ret, size, PAGE_SIZE << order, flags); ++ ++ return ret; + } + + static __always_inline void *kmalloc(size_t size, gfp_t flags) + { ++ void *ret; ++ + if (__builtin_constant_p(size)) { +- if (size > PAGE_SIZE) ++ if (size > SLUB_MAX_SIZE) + return kmalloc_large(size, flags); + + if (!(flags & SLUB_DMA)) { +@@ -221,7 +252,11 @@ static __always_inline void *kmalloc(siz + if (!s) + return ZERO_SIZE_PTR; + +- return kmem_cache_alloc(s, flags); ++ ret = kmem_cache_alloc_notrace(s, flags); ++ ++ trace_kmalloc(_THIS_IP_, ret, size, s->size, flags); ++ ++ return ret; + } + } + return __kmalloc(size, flags); +@@ -231,16 +266,37 @@ static __always_inline void *kmalloc(siz + void *__kmalloc_node(size_t size, gfp_t flags, int node); + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); + ++#ifdef CONFIG_KMEMTRACE ++extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, ++ gfp_t gfpflags, ++ int node); ++#else ++static __always_inline void * ++kmem_cache_alloc_node_notrace(struct kmem_cache *s, ++ gfp_t gfpflags, ++ int node) 
++{ ++ return kmem_cache_alloc_node(s, gfpflags, node); ++} ++#endif ++ + static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) + { ++ void *ret; ++ + if (__builtin_constant_p(size) && +- size <= PAGE_SIZE && !(flags & SLUB_DMA)) { ++ size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { + struct kmem_cache *s = kmalloc_slab(size); + + if (!s) + return ZERO_SIZE_PTR; + +- return kmem_cache_alloc_node(s, flags, node); ++ ret = kmem_cache_alloc_node_notrace(s, flags, node); ++ ++ trace_kmalloc_node(_THIS_IP_, ret, ++ size, s->size, flags, node); ++ ++ return ret; + } + return __kmalloc_node(size, flags, node); + } +Index: linux-2.6-tip/include/linux/smp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/smp.h ++++ linux-2.6-tip/include/linux/smp.h +@@ -38,7 +38,7 @@ int smp_call_function_single(int cpuid, + /* + * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. + * (defined in asm header): +- */ ++ */ + + /* + * stops all CPUs but the current one: +@@ -50,6 +50,16 @@ extern void smp_send_stop(void); + */ + extern void smp_send_reschedule(int cpu); + ++/* ++ * trigger a reschedule on all other CPUs: ++ */ ++extern void smp_send_reschedule_allbutself(void); ++ ++/* ++ * trigger a reschedule on all other CPUs: ++ */ ++extern void smp_send_reschedule_allbutself(void); ++ + + /* + * Prepare machine for booting other CPUs. +@@ -82,7 +92,8 @@ smp_call_function_mask(cpumask_t mask, v + return 0; + } + +-void __smp_call_function_single(int cpuid, struct call_single_data *data); ++void __smp_call_function_single(int cpuid, struct call_single_data *data, ++ int wait); + + /* + * Generic and arch helpers +@@ -121,6 +132,8 @@ extern unsigned int setup_max_cpus; + + #else /* !SMP */ + ++static inline void smp_send_stop(void) { } ++ + /* + * These macros fold the SMP functionality into a single CPU system + */ +@@ -139,6 +152,7 @@ static inline int up_smp_call_function(v + 0; \ + }) + static inline void smp_send_reschedule(int cpu) { } ++static inline void smp_send_reschedule_allbutself(void) { } + #define num_booting_cpus() 1 + #define smp_prepare_boot_cpu() do {} while (0) + #define smp_call_function_mask(mask, func, info, wait) \ +@@ -174,7 +188,13 @@ static inline void init_call_single_data + + #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) + #define put_cpu() preempt_enable() +-#define put_cpu_no_resched() preempt_enable_no_resched() ++#define put_cpu_no_resched() __preempt_enable_no_resched() ++ ++/* ++ * Callback to arch code if there's nosmp or maxcpus=0 on the ++ * boot command line: ++ */ ++extern void arch_disable_smp_support(void); + + void smp_setup_processor_id(void); + +Index: linux-2.6-tip/include/linux/socket.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/socket.h ++++ linux-2.6-tip/include/linux/socket.h +@@ -24,10 +24,12 @@ struct __kernel_sockaddr_storage { + #include /* pid_t */ + #include /* __user */ + +-#ifdef CONFIG_PROC_FS ++#ifdef __KERNEL__ ++# ifdef CONFIG_PROC_FS + struct seq_file; + extern void socket_seq_show(struct seq_file *seq); +-#endif ++# endif ++#endif /* __KERNEL__ */ + + typedef unsigned short sa_family_t; + +Index: linux-2.6-tip/include/linux/stackprotector.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/stackprotector.h +@@ -0,0 +1,16 @@ ++#ifndef _LINUX_STACKPROTECTOR_H ++#define _LINUX_STACKPROTECTOR_H 1 ++ ++#include 
++#include ++#include ++ ++#ifdef CONFIG_CC_STACKPROTECTOR ++# include ++#else ++static inline void boot_init_stack_canary(void) ++{ ++} ++#endif ++ ++#endif +Index: linux-2.6-tip/include/linux/stacktrace.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/stacktrace.h ++++ linux-2.6-tip/include/linux/stacktrace.h +@@ -4,6 +4,8 @@ + struct task_struct; + + #ifdef CONFIG_STACKTRACE ++struct task_struct; ++ + struct stack_trace { + unsigned int nr_entries, max_entries; + unsigned long *entries; +@@ -11,6 +13,7 @@ struct stack_trace { + }; + + extern void save_stack_trace(struct stack_trace *trace); ++extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp); + extern void save_stack_trace_tsk(struct task_struct *tsk, + struct stack_trace *trace); + +Index: linux-2.6-tip/include/linux/string.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/string.h ++++ linux-2.6-tip/include/linux/string.h +@@ -10,6 +10,7 @@ + #include /* for inline */ + #include /* for size_t */ + #include /* for NULL */ ++#include + + extern char *strndup_user(const char __user *, long); + +@@ -111,6 +112,12 @@ extern void argv_free(char **argv); + + extern bool sysfs_streq(const char *s1, const char *s2); + ++#ifdef CONFIG_BINARY_PRINTF ++int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); ++int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); ++int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); ++#endif ++ + extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available); + +Index: linux-2.6-tip/include/linux/suspend_ioctls.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/suspend_ioctls.h ++++ linux-2.6-tip/include/linux/suspend_ioctls.h +@@ -1,14 +1,15 @@ + #ifndef _LINUX_SUSPEND_IOCTLS_H + #define _LINUX_SUSPEND_IOCTLS_H + ++#include + /* + * This structure is used to pass the values needed for the identification + * of the resume swap area from a user space to the kernel via the + * SNAPSHOT_SET_SWAP_AREA ioctl + */ + struct resume_swap_area { +- loff_t offset; +- u_int32_t dev; ++ __kernel_loff_t offset; ++ __u32 dev; + } __attribute__((packed)); + + #define SNAPSHOT_IOC_MAGIC '3' +@@ -20,13 +21,13 @@ struct resume_swap_area { + #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) + #define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \ + struct resume_swap_area) +-#define SNAPSHOT_GET_IMAGE_SIZE _IOR(SNAPSHOT_IOC_MAGIC, 14, loff_t) ++#define SNAPSHOT_GET_IMAGE_SIZE _IOR(SNAPSHOT_IOC_MAGIC, 14, __kernel_loff_t) + #define SNAPSHOT_PLATFORM_SUPPORT _IO(SNAPSHOT_IOC_MAGIC, 15) + #define SNAPSHOT_POWER_OFF _IO(SNAPSHOT_IOC_MAGIC, 16) + #define SNAPSHOT_CREATE_IMAGE _IOW(SNAPSHOT_IOC_MAGIC, 17, int) + #define SNAPSHOT_PREF_IMAGE_SIZE _IO(SNAPSHOT_IOC_MAGIC, 18) +-#define SNAPSHOT_AVAIL_SWAP_SIZE _IOR(SNAPSHOT_IOC_MAGIC, 19, loff_t) +-#define SNAPSHOT_ALLOC_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 20, loff_t) ++#define SNAPSHOT_AVAIL_SWAP_SIZE _IOR(SNAPSHOT_IOC_MAGIC, 19, __kernel_loff_t) ++#define SNAPSHOT_ALLOC_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 20, __kernel_loff_t) + #define SNAPSHOT_IOC_MAXNR 20 + + #endif /* _LINUX_SUSPEND_IOCTLS_H */ +Index: linux-2.6-tip/include/linux/swiotlb.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/swiotlb.h ++++ 
linux-2.6-tip/include/linux/swiotlb.h +@@ -31,7 +31,7 @@ extern dma_addr_t swiotlb_phys_to_bus(st + phys_addr_t address); + extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); + +-extern int swiotlb_arch_range_needs_mapping(void *ptr, size_t size); ++extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size); + + extern void + *swiotlb_alloc_coherent(struct device *hwdev, size_t size, +@@ -41,20 +41,13 @@ extern void + swiotlb_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle); + +-extern dma_addr_t +-swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir); +- +-extern void +-swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, +- size_t size, int dir); +- +-extern dma_addr_t +-swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, +- int dir, struct dma_attrs *attrs); +- +-extern void +-swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, +- size_t size, int dir, struct dma_attrs *attrs); ++extern dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs); + + extern int + swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, +@@ -66,36 +59,38 @@ swiotlb_unmap_sg(struct device *hwdev, s + + extern int + swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, +- int dir, struct dma_attrs *attrs); ++ enum dma_data_direction dir, struct dma_attrs *attrs); + + extern void + swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, +- int nelems, int dir, struct dma_attrs *attrs); ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs); + + extern void + swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, +- size_t size, int dir); ++ size_t size, enum dma_data_direction dir); + + extern void + swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, +- int nelems, int dir); ++ int nelems, enum dma_data_direction dir); + + extern void + swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, +- size_t size, int dir); ++ size_t size, enum dma_data_direction dir); + + extern void + swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, +- int nelems, int dir); ++ int nelems, enum dma_data_direction dir); + + extern void + swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, +- unsigned long offset, size_t size, int dir); ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir); + + extern void + swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, + unsigned long offset, size_t size, +- int dir); ++ enum dma_data_direction dir); + + extern int + swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); +Index: linux-2.6-tip/include/linux/syscalls.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/syscalls.h ++++ linux-2.6-tip/include/linux/syscalls.h +@@ -55,6 +55,7 @@ struct compat_timeval; + struct robust_list_head; + struct getcpu_cache; + struct old_linux_dirent; ++struct perf_counter_hw_event; + + #include + #include +@@ -65,6 +66,7 @@ struct old_linux_dirent; + #include + #include + #include ++#include + + #define __SC_DECL1(t1, a1) t1 a1 + #define 
__SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) +@@ -95,7 +97,46 @@ struct old_linux_dirent; + #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) + #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) + ++#ifdef CONFIG_FTRACE_SYSCALLS ++#define __SC_STR_ADECL1(t, a) #a ++#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) ++#define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__) ++#define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__) ++#define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__) ++#define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__) ++ ++#define __SC_STR_TDECL1(t, a) #t ++#define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__) ++#define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__) ++#define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__) ++#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) ++#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) ++ ++#define SYSCALL_METADATA(sname, nb) \ ++ static const struct syscall_metadata __used \ ++ __attribute__((__aligned__(4))) \ ++ __attribute__((section("__syscalls_metadata"))) \ ++ __syscall_meta_##sname = { \ ++ .name = "sys"#sname, \ ++ .nb_args = nb, \ ++ .types = types_##sname, \ ++ .args = args_##sname, \ ++ } ++ ++#define SYSCALL_DEFINE0(sname) \ ++ static const struct syscall_metadata __used \ ++ __attribute__((__aligned__(4))) \ ++ __attribute__((section("__syscalls_metadata"))) \ ++ __syscall_meta_##sname = { \ ++ .name = "sys_"#sname, \ ++ .nb_args = 0, \ ++ }; \ ++ asmlinkage long sys_##sname(void) ++ ++#else + #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) ++#endif ++ + #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) + #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) + #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) +@@ -117,10 +158,26 @@ struct old_linux_dirent; + #endif + #endif + ++#ifdef CONFIG_FTRACE_SYSCALLS ++#define SYSCALL_DEFINEx(x, sname, ...) \ ++ static const char *types_##sname[] = { \ ++ __SC_STR_TDECL##x(__VA_ARGS__) \ ++ }; \ ++ static const char *args_##sname[] = { \ ++ __SC_STR_ADECL##x(__VA_ARGS__) \ ++ }; \ ++ SYSCALL_METADATA(sname, x); \ ++ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) ++#else ++#define SYSCALL_DEFINEx(x, sname, ...) \ ++ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) ++#endif ++ + #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS + + #define SYSCALL_DEFINE(name) static inline long SYSC_##name +-#define SYSCALL_DEFINEx(x, name, ...) \ ++ ++#define __SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ + static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ + asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ +@@ -134,7 +191,7 @@ struct old_linux_dirent; + #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ + + #define SYSCALL_DEFINE(name) asmlinkage long sys_##name +-#define SYSCALL_DEFINEx(x, name, ...) \ ++#define __SYSCALL_DEFINEx(x, name, ...) 
\ + asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) + + #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ +@@ -694,4 +751,8 @@ asmlinkage long sys_pipe(int __user *); + + int kernel_execve(const char *filename, char *const argv[], char *const envp[]); + ++ ++asmlinkage long sys_perf_counter_open( ++ const struct perf_counter_hw_event __user *hw_event_uptr, ++ pid_t pid, int cpu, int group_fd, unsigned long flags); + #endif +Index: linux-2.6-tip/include/linux/thread_info.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/thread_info.h ++++ linux-2.6-tip/include/linux/thread_info.h +@@ -21,13 +21,14 @@ struct restart_block { + struct { + unsigned long arg0, arg1, arg2, arg3; + }; +- /* For futex_wait */ ++ /* For futex_wait and futex_wait_requeue_pi */ + struct { + u32 *uaddr; + u32 val; + u32 flags; + u32 bitset; + u64 time; ++ u32 *uaddr2; + } futex; + /* For nanosleep */ + struct { +Index: linux-2.6-tip/include/linux/time.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/time.h ++++ linux-2.6-tip/include/linux/time.h +@@ -12,14 +12,14 @@ + #ifndef _STRUCT_TIMESPEC + #define _STRUCT_TIMESPEC + struct timespec { +- time_t tv_sec; /* seconds */ +- long tv_nsec; /* nanoseconds */ ++ __kernel_time_t tv_sec; /* seconds */ ++ long tv_nsec; /* nanoseconds */ + }; + #endif + + struct timeval { +- time_t tv_sec; /* seconds */ +- suseconds_t tv_usec; /* microseconds */ ++ __kernel_time_t tv_sec; /* seconds */ ++ __kernel_suseconds_t tv_usec; /* microseconds */ + }; + + struct timezone { +@@ -99,7 +99,7 @@ static inline struct timespec timespec_s + + extern struct timespec xtime; + extern struct timespec wall_to_monotonic; +-extern seqlock_t xtime_lock; ++extern raw_seqlock_t xtime_lock; + + extern unsigned long read_persistent_clock(void); + extern int update_persistent_clock(struct timespec now); +Index: linux-2.6-tip/include/linux/timer.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/timer.h ++++ linux-2.6-tip/include/linux/timer.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + struct tvec_base; + +@@ -21,52 +22,126 @@ struct timer_list { + char start_comm[16]; + int start_pid; + #endif ++#ifdef CONFIG_LOCKDEP ++ struct lockdep_map lockdep_map; ++#endif + }; + + extern struct tvec_base boot_tvec_bases; + ++#ifdef CONFIG_LOCKDEP ++/* ++ * NB: because we have to copy the lockdep_map, setting the lockdep_map key ++ * (second argument) here is required, otherwise it could be initialised to ++ * the copy of the lockdep_map later! We use the pointer to and the string ++ * ":" as the key resp. the name of the lockdep_map. 
++ */ ++#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) \ ++ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn), ++#else ++#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) ++#endif ++ + #define TIMER_INITIALIZER(_function, _expires, _data) { \ + .entry = { .prev = TIMER_ENTRY_STATIC }, \ + .function = (_function), \ + .expires = (_expires), \ + .data = (_data), \ + .base = &boot_tvec_bases, \ ++ __TIMER_LOCKDEP_MAP_INITIALIZER( \ ++ __FILE__ ":" __stringify(__LINE__)) \ + } + + #define DEFINE_TIMER(_name, _function, _expires, _data) \ + struct timer_list _name = \ + TIMER_INITIALIZER(_function, _expires, _data) + +-void init_timer(struct timer_list *timer); +-void init_timer_deferrable(struct timer_list *timer); ++void init_timer_key(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key); ++void init_timer_deferrable_key(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key); ++ ++#ifdef CONFIG_LOCKDEP ++#define init_timer(timer) \ ++ do { \ ++ static struct lock_class_key __key; \ ++ init_timer_key((timer), #timer, &__key); \ ++ } while (0) ++ ++#define init_timer_deferrable(timer) \ ++ do { \ ++ static struct lock_class_key __key; \ ++ init_timer_deferrable_key((timer), #timer, &__key); \ ++ } while (0) ++ ++#define init_timer_on_stack(timer) \ ++ do { \ ++ static struct lock_class_key __key; \ ++ init_timer_on_stack_key((timer), #timer, &__key); \ ++ } while (0) ++ ++#define setup_timer(timer, fn, data) \ ++ do { \ ++ static struct lock_class_key __key; \ ++ setup_timer_key((timer), #timer, &__key, (fn), (data));\ ++ } while (0) ++ ++#define setup_timer_on_stack(timer, fn, data) \ ++ do { \ ++ static struct lock_class_key __key; \ ++ setup_timer_on_stack_key((timer), #timer, &__key, \ ++ (fn), (data)); \ ++ } while (0) ++#else ++#define init_timer(timer)\ ++ init_timer_key((timer), NULL, NULL) ++#define init_timer_deferrable(timer)\ ++ init_timer_deferrable_key((timer), NULL, NULL) ++#define init_timer_on_stack(timer)\ ++ init_timer_on_stack_key((timer), NULL, NULL) ++#define setup_timer(timer, fn, data)\ ++ setup_timer_key((timer), NULL, NULL, (fn), (data)) ++#define setup_timer_on_stack(timer, fn, data)\ ++ setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data)) ++#endif + + #ifdef CONFIG_DEBUG_OBJECTS_TIMERS +-extern void init_timer_on_stack(struct timer_list *timer); ++extern void init_timer_on_stack_key(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key); + extern void destroy_timer_on_stack(struct timer_list *timer); + #else + static inline void destroy_timer_on_stack(struct timer_list *timer) { } +-static inline void init_timer_on_stack(struct timer_list *timer) ++static inline void init_timer_on_stack_key(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key) + { +- init_timer(timer); ++ init_timer_key(timer, name, key); + } + #endif + +-static inline void setup_timer(struct timer_list * timer, ++static inline void setup_timer_key(struct timer_list * timer, ++ const char *name, ++ struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) + { + timer->function = function; + timer->data = data; +- init_timer(timer); ++ init_timer_key(timer, name, key); + } + +-static inline void setup_timer_on_stack(struct timer_list *timer, ++static inline void setup_timer_on_stack_key(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) + { + timer->function = function; + timer->data = data; +- 
init_timer_on_stack(timer); ++ init_timer_on_stack_key(timer, name, key); + } + + /** +@@ -86,8 +161,8 @@ static inline int timer_pending(const st + + extern void add_timer_on(struct timer_list *timer, int cpu); + extern int del_timer(struct timer_list * timer); +-extern int __mod_timer(struct timer_list *timer, unsigned long expires); + extern int mod_timer(struct timer_list *timer, unsigned long expires); ++extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); + + /* + * The jiffies value which is added to now, when there is no timer +@@ -146,30 +221,14 @@ static inline void timer_stats_timer_cle + } + #endif + +-/** +- * add_timer - start a timer +- * @timer: the timer to be added +- * +- * The kernel will do a ->function(->data) callback from the +- * timer interrupt at the ->expires point in the future. The +- * current time is 'jiffies'. +- * +- * The timer's ->expires, ->function (and if the handler uses it, ->data) +- * fields must be set prior calling this function. +- * +- * Timers with an ->expires field in the past will be executed in the next +- * timer tick. +- */ +-static inline void add_timer(struct timer_list *timer) +-{ +- BUG_ON(timer_pending(timer)); +- __mod_timer(timer, timer->expires); +-} ++extern void add_timer(struct timer_list *timer); + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) ++ extern int timer_pending_sync(struct timer_list *timer); + extern int try_to_del_timer_sync(struct timer_list *timer); + extern int del_timer_sync(struct timer_list *timer); + #else ++# define timer_pending_sync(t) timer_pending(t) + # define try_to_del_timer_sync(t) del_timer(t) + # define del_timer_sync(t) del_timer(t) + #endif +Index: linux-2.6-tip/include/linux/times.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/times.h ++++ linux-2.6-tip/include/linux/times.h +@@ -4,10 +4,10 @@ + #include + + struct tms { +- clock_t tms_utime; +- clock_t tms_stime; +- clock_t tms_cutime; +- clock_t tms_cstime; ++ __kernel_clock_t tms_utime; ++ __kernel_clock_t tms_stime; ++ __kernel_clock_t tms_cutime; ++ __kernel_clock_t tms_cstime; + }; + + #endif +Index: linux-2.6-tip/include/linux/timex.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/timex.h ++++ linux-2.6-tip/include/linux/timex.h +@@ -190,7 +190,7 @@ struct timex { + * offset and maximum frequency tolerance. 
+ */ + #define SHIFT_USEC 16 /* frequency offset scale (shift) */ +-#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) ++#define PPM_SCALE ((s64)NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) + #define PPM_SCALE_INV_SHIFT 19 + #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ + PPM_SCALE + 1) +Index: linux-2.6-tip/include/linux/topology.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/topology.h ++++ linux-2.6-tip/include/linux/topology.h +@@ -38,11 +38,7 @@ + #endif + + #ifndef nr_cpus_node +-#define nr_cpus_node(node) \ +- ({ \ +- node_to_cpumask_ptr(__tmp__, node); \ +- cpus_weight(*__tmp__); \ +- }) ++#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) + #endif + + #define for_each_node_with_cpus(node) \ +@@ -193,5 +189,16 @@ int arch_update_cpu_topology(void); + #ifndef topology_core_siblings + #define topology_core_siblings(cpu) cpumask_of_cpu(cpu) + #endif ++#ifndef topology_thread_cpumask ++#define topology_thread_cpumask(cpu) cpumask_of(cpu) ++#endif ++#ifndef topology_core_cpumask ++#define topology_core_cpumask(cpu) cpumask_of(cpu) ++#endif ++ ++/* Returns the number of the current Node. */ ++#ifndef numa_node_id ++#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) ++#endif + + #endif /* _LINUX_TOPOLOGY_H */ +Index: linux-2.6-tip/include/linux/trace_clock.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/trace_clock.h +@@ -0,0 +1,19 @@ ++#ifndef _LINUX_TRACE_CLOCK_H ++#define _LINUX_TRACE_CLOCK_H ++ ++/* ++ * 3 trace clock variants, with differing scalability/precision ++ * tradeoffs: ++ * ++ * - local: CPU-local trace clock ++ * - medium: scalable global clock with some jitter ++ * - global: globally monotonic, serialized clock ++ */ ++#include ++#include ++ ++extern u64 notrace trace_clock_local(void); ++extern u64 notrace trace_clock(void); ++extern u64 notrace trace_clock_global(void); ++ ++#endif /* _LINUX_TRACE_CLOCK_H */ +Index: linux-2.6-tip/include/linux/tracepoint.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/tracepoint.h ++++ linux-2.6-tip/include/linux/tracepoint.h +@@ -31,8 +31,8 @@ struct tracepoint { + * Keep in sync with vmlinux.lds.h. + */ + +-#define TPPROTO(args...) args +-#define TPARGS(args...) args ++#define TP_PROTO(args...) args ++#define TP_ARGS(args...) args + + #ifdef CONFIG_TRACEPOINTS + +@@ -65,7 +65,7 @@ struct tracepoint { + { \ + if (unlikely(__tracepoint_##name.state)) \ + __DO_TRACE(&__tracepoint_##name, \ +- TPPROTO(proto), TPARGS(args)); \ ++ TP_PROTO(proto), TP_ARGS(args)); \ + } \ + static inline int register_trace_##name(void (*probe)(proto)) \ + { \ +@@ -153,4 +153,114 @@ static inline void tracepoint_synchroniz + synchronize_sched(); + } + ++#define PARAMS(args...) args ++#define TRACE_FORMAT(name, proto, args, fmt) \ ++ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ++ ++ ++/* ++ * For use with the TRACE_EVENT macro: ++ * ++ * We define a tracepoint, its arguments, its printk format ++ * and its 'fast binay record' layout. ++ * ++ * Firstly, name your tracepoint via TRACE_EVENT(name : the ++ * 'subsystem_event' notation is fine. ++ * ++ * Think about this whole construct as the ++ * 'trace_sched_switch() function' from now on. 
++ * ++ * ++ * TRACE_EVENT(sched_switch, ++ * ++ * * ++ * * A function has a regular function arguments ++ * * prototype, declare it via TP_PROTO(): ++ * * ++ * ++ * TP_PROTO(struct rq *rq, struct task_struct *prev, ++ * struct task_struct *next), ++ * ++ * * ++ * * Define the call signature of the 'function'. ++ * * (Design sidenote: we use this instead of a ++ * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) ++ * * ++ * ++ * TP_ARGS(rq, prev, next), ++ * ++ * * ++ * * Fast binary tracing: define the trace record via ++ * * TP_STRUCT__entry(). You can think about it like a ++ * * regular C structure local variable definition. ++ * * ++ * * This is how the trace record is structured and will ++ * * be saved into the ring buffer. These are the fields ++ * * that will be exposed to user-space in ++ * * /debug/tracing/events/<*>/format. ++ * * ++ * * The declared 'local variable' is called '__entry' ++ * * ++ * * __field(pid_t, prev_prid) is equivalent to a standard declariton: ++ * * ++ * * pid_t prev_pid; ++ * * ++ * * __array(char, prev_comm, TASK_COMM_LEN) is equivalent to: ++ * * ++ * * char prev_comm[TASK_COMM_LEN]; ++ * * ++ * ++ * TP_STRUCT__entry( ++ * __array( char, prev_comm, TASK_COMM_LEN ) ++ * __field( pid_t, prev_pid ) ++ * __field( int, prev_prio ) ++ * __array( char, next_comm, TASK_COMM_LEN ) ++ * __field( pid_t, next_pid ) ++ * __field( int, next_prio ) ++ * ), ++ * ++ * * ++ * * Assign the entry into the trace record, by embedding ++ * * a full C statement block into TP_fast_assign(). You ++ * * can refer to the trace record as '__entry' - ++ * * otherwise you can put arbitrary C code in here. ++ * * ++ * * Note: this C code will execute every time a trace event ++ * * happens, on an active tracepoint. ++ * * ++ * ++ * TP_fast_assign( ++ * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); ++ * __entry->prev_pid = prev->pid; ++ * __entry->prev_prio = prev->prio; ++ * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); ++ * __entry->next_pid = next->pid; ++ * __entry->next_prio = next->prio; ++ * ) ++ * ++ * * ++ * * Formatted output of a trace record via TP_printk(). ++ * * This is how the tracepoint will appear under ftrace ++ * * plugins that make use of this tracepoint. ++ * * ++ * * (raw-binary tracing wont actually perform this step.) ++ * * ++ * ++ * TP_printk("task %s:%d [%d] ==> %s:%d [%d]", ++ * __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, ++ * __entry->next_comm, __entry->next_pid, __entry->next_prio), ++ * ++ * ); ++ * ++ * This macro construct is thus used for the regular printk format ++ * tracing setup, it is used to construct a function pointer based ++ * tracepoint callback (this is used by programmatic plugins and ++ * can also by used by generic instrumentation like SystemTap), and ++ * it is also used to expose a structured trace record in ++ * /debug/tracing/events/. 
++ */ ++ ++#define TRACE_EVENT(name, proto, args, struct, assign, print) \ ++ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) ++ + #endif +Index: linux-2.6-tip/include/linux/types.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/types.h ++++ linux-2.6-tip/include/linux/types.h +@@ -1,6 +1,9 @@ + #ifndef _LINUX_TYPES_H + #define _LINUX_TYPES_H + ++#include ++ ++#ifndef __ASSEMBLY__ + #ifdef __KERNEL__ + + #define DECLARE_BITMAP(name,bits) \ +@@ -9,9 +12,8 @@ + #endif + + #include +-#include + +-#ifndef __KERNEL_STRICT_NAMES ++#ifdef __KERNEL__ + + typedef __u32 __kernel_dev_t; + +@@ -29,7 +31,6 @@ typedef __kernel_timer_t timer_t; + typedef __kernel_clockid_t clockid_t; + typedef __kernel_mqd_t mqd_t; + +-#ifdef __KERNEL__ + typedef _Bool bool; + + typedef __kernel_uid32_t uid_t; +@@ -45,14 +46,6 @@ typedef __kernel_old_uid_t old_uid_t; + typedef __kernel_old_gid_t old_gid_t; + #endif /* CONFIG_UID16 */ + +-/* libc5 includes this file to define uid_t, thus uid_t can never change +- * when it is included by non-kernel code +- */ +-#else +-typedef __kernel_uid_t uid_t; +-typedef __kernel_gid_t gid_t; +-#endif /* __KERNEL__ */ +- + #if defined(__GNUC__) + typedef __kernel_loff_t loff_t; + #endif +@@ -154,7 +147,7 @@ typedef unsigned long blkcnt_t; + #define pgoff_t unsigned long + #endif + +-#endif /* __KERNEL_STRICT_NAMES */ ++#endif /* __KERNEL__ */ + + /* + * Below are truly Linux-specific types that should never collide with +@@ -212,5 +205,5 @@ struct ustat { + }; + + #endif /* __KERNEL__ */ +- ++#endif /* __ASSEMBLY__ */ + #endif /* _LINUX_TYPES_H */ +Index: linux-2.6-tip/include/linux/ucb1400.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/ucb1400.h ++++ linux-2.6-tip/include/linux/ucb1400.h +@@ -134,8 +134,8 @@ static inline void ucb1400_adc_enable(st + ucb1400_reg_write(ac97, UCB_ADC_CR, UCB_ADC_ENA); + } + +-static unsigned int ucb1400_adc_read(struct snd_ac97 *ac97, u16 adc_channel, +- int adcsync) ++static inline unsigned int ++ucb1400_adc_read(struct snd_ac97 *ac97, u16 adc_channel, int adcsync) + { + unsigned int val; + +Index: linux-2.6-tip/include/linux/utime.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/utime.h ++++ linux-2.6-tip/include/linux/utime.h +@@ -4,8 +4,8 @@ + #include + + struct utimbuf { +- time_t actime; +- time_t modtime; ++ __kernel_time_t actime; ++ __kernel_time_t modtime; + }; + + #endif +Index: linux-2.6-tip/include/linux/vmalloc.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/vmalloc.h ++++ linux-2.6-tip/include/linux/vmalloc.h +@@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area( + + extern int map_vm_area(struct vm_struct *area, pgprot_t prot, + struct page ***pages); ++extern int map_kernel_range_noflush(unsigned long start, unsigned long size, ++ pgprot_t prot, struct page **pages); ++extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); + extern void unmap_kernel_range(unsigned long addr, unsigned long size); + + /* Allocate/destroy a 'vmalloc' VM area. 
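
A minimal consumer sketch, not part of the patch: DECLARE_TRACE()/TRACE_EVENT() as shown above expand into a trace_<name>() inline for the instrumentation site plus register_trace_<name>()/unregister_trace_<name>() helpers for probes. Hooking the sched_switch tracepoint that the comment uses as its running example would look roughly like this (module, probe name and message are invented):

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/sched.h>

struct rq;	/* scheduler-private; only passed through to the probe */

/* The probe signature must match the tracepoint's TP_PROTO(). */
static void probe_sched_switch(struct rq *rq, struct task_struct *prev,
			       struct task_struct *next)
{
	printk(KERN_DEBUG "switch %s:%d ==> %s:%d\n",
	       prev->comm, prev->pid, next->comm, next->pid);
}

static int __init example_init(void)
{
	/* Arms the tracepoint: trace_sched_switch() in the scheduler
	 * now calls this probe on every context switch. */
	return register_trace_sched_switch(probe_sched_switch);
}

static void __exit example_exit(void)
{
	unregister_trace_sched_switch(probe_sched_switch);
	tracepoint_synchronize_unregister();
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
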
*/ +@@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr + */ + extern rwlock_t vmlist_lock; + extern struct vm_struct *vmlist; ++extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); + + #endif /* _LINUX_VMALLOC_H */ +Index: linux-2.6-tip/include/linux/xfrm.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/xfrm.h ++++ linux-2.6-tip/include/linux/xfrm.h +@@ -58,7 +58,7 @@ struct xfrm_selector + __u8 prefixlen_s; + __u8 proto; + int ifindex; +- uid_t user; ++ __kernel_uid32_t user; + }; + + #define XFRM_INF (~(__u64)0) +Index: linux-2.6-tip/include/mtd/inftl-user.h +=================================================================== +--- linux-2.6-tip.orig/include/mtd/inftl-user.h ++++ linux-2.6-tip/include/mtd/inftl-user.h +@@ -16,33 +16,33 @@ + /* Block Control Information */ + + struct inftl_bci { +- uint8_t ECCsig[6]; +- uint8_t Status; +- uint8_t Status1; ++ __u8 ECCsig[6]; ++ __u8 Status; ++ __u8 Status1; + } __attribute__((packed)); + + struct inftl_unithead1 { +- uint16_t virtualUnitNo; +- uint16_t prevUnitNo; +- uint8_t ANAC; +- uint8_t NACs; +- uint8_t parityPerField; +- uint8_t discarded; ++ __u16 virtualUnitNo; ++ __u16 prevUnitNo; ++ __u8 ANAC; ++ __u8 NACs; ++ __u8 parityPerField; ++ __u8 discarded; + } __attribute__((packed)); + + struct inftl_unithead2 { +- uint8_t parityPerField; +- uint8_t ANAC; +- uint16_t prevUnitNo; +- uint16_t virtualUnitNo; +- uint8_t NACs; +- uint8_t discarded; ++ __u8 parityPerField; ++ __u8 ANAC; ++ __u16 prevUnitNo; ++ __u16 virtualUnitNo; ++ __u8 NACs; ++ __u8 discarded; + } __attribute__((packed)); + + struct inftl_unittail { +- uint8_t Reserved[4]; +- uint16_t EraseMark; +- uint16_t EraseMark1; ++ __u8 Reserved[4]; ++ __u16 EraseMark; ++ __u16 EraseMark1; + } __attribute__((packed)); + + union inftl_uci { +Index: linux-2.6-tip/include/mtd/jffs2-user.h +=================================================================== +--- linux-2.6-tip.orig/include/mtd/jffs2-user.h ++++ linux-2.6-tip/include/mtd/jffs2-user.h +@@ -7,6 +7,7 @@ + + /* This file is blessed for inclusion by userspace */ + #include ++#include + #include + #include + +@@ -19,8 +20,8 @@ + + extern int target_endian; + +-#define t16(x) ({ uint16_t __b = (x); (target_endian==__BYTE_ORDER)?__b:bswap_16(__b); }) +-#define t32(x) ({ uint32_t __b = (x); (target_endian==__BYTE_ORDER)?__b:bswap_32(__b); }) ++#define t16(x) ({ __u16 __b = (x); (target_endian==__BYTE_ORDER)?__b:bswap_16(__b); }) ++#define t32(x) ({ __u32 __b = (x); (target_endian==__BYTE_ORDER)?__b:bswap_32(__b); }) + + #define cpu_to_je16(x) ((jint16_t){t16(x)}) + #define cpu_to_je32(x) ((jint32_t){t32(x)}) +Index: linux-2.6-tip/include/mtd/mtd-abi.h +=================================================================== +--- linux-2.6-tip.orig/include/mtd/mtd-abi.h ++++ linux-2.6-tip/include/mtd/mtd-abi.h +@@ -5,14 +5,16 @@ + #ifndef __MTD_ABI_H__ + #define __MTD_ABI_H__ + ++#include ++ + struct erase_info_user { +- uint32_t start; +- uint32_t length; ++ __u32 start; ++ __u32 length; + }; + + struct mtd_oob_buf { +- uint32_t start; +- uint32_t length; ++ __u32 start; ++ __u32 length; + unsigned char __user *ptr; + }; + +@@ -48,30 +50,30 @@ struct mtd_oob_buf { + #define MTD_OTP_USER 2 + + struct mtd_info_user { +- uint8_t type; +- uint32_t flags; +- uint32_t size; // Total size of the MTD +- uint32_t erasesize; +- uint32_t writesize; +- uint32_t oobsize; // Amount of OOB data per block (e.g. 
16) ++ __u8 type; ++ __u32 flags; ++ __u32 size; // Total size of the MTD ++ __u32 erasesize; ++ __u32 writesize; ++ __u32 oobsize; // Amount of OOB data per block (e.g. 16) + /* The below two fields are obsolete and broken, do not use them + * (TODO: remove at some point) */ +- uint32_t ecctype; +- uint32_t eccsize; ++ __u32 ecctype; ++ __u32 eccsize; + }; + + struct region_info_user { +- uint32_t offset; /* At which this region starts, ++ __u32 offset; /* At which this region starts, + * from the beginning of the MTD */ +- uint32_t erasesize; /* For this region */ +- uint32_t numblocks; /* Number of blocks in this region */ +- uint32_t regionindex; ++ __u32 erasesize; /* For this region */ ++ __u32 numblocks; /* Number of blocks in this region */ ++ __u32 regionindex; + }; + + struct otp_info { +- uint32_t start; +- uint32_t length; +- uint32_t locked; ++ __u32 start; ++ __u32 length; ++ __u32 locked; + }; + + #define MEMGETINFO _IOR('M', 1, struct mtd_info_user) +@@ -84,8 +86,8 @@ struct otp_info { + #define MEMGETREGIONINFO _IOWR('M', 8, struct region_info_user) + #define MEMSETOOBSEL _IOW('M', 9, struct nand_oobinfo) + #define MEMGETOOBSEL _IOR('M', 10, struct nand_oobinfo) +-#define MEMGETBADBLOCK _IOW('M', 11, loff_t) +-#define MEMSETBADBLOCK _IOW('M', 12, loff_t) ++#define MEMGETBADBLOCK _IOW('M', 11, __kernel_loff_t) ++#define MEMSETBADBLOCK _IOW('M', 12, __kernel_loff_t) + #define OTPSELECT _IOR('M', 13, int) + #define OTPGETREGIONCOUNT _IOW('M', 14, int) + #define OTPGETREGIONINFO _IOW('M', 15, struct otp_info) +@@ -99,15 +101,15 @@ struct otp_info { + * interfaces + */ + struct nand_oobinfo { +- uint32_t useecc; +- uint32_t eccbytes; +- uint32_t oobfree[8][2]; +- uint32_t eccpos[32]; ++ __u32 useecc; ++ __u32 eccbytes; ++ __u32 oobfree[8][2]; ++ __u32 eccpos[32]; + }; + + struct nand_oobfree { +- uint32_t offset; +- uint32_t length; ++ __u32 offset; ++ __u32 length; + }; + + #define MTD_MAX_OOBFREE_ENTRIES 8 +@@ -116,9 +118,9 @@ struct nand_oobfree { + * diagnosis and to allow creation of raw images + */ + struct nand_ecclayout { +- uint32_t eccbytes; +- uint32_t eccpos[64]; +- uint32_t oobavail; ++ __u32 eccbytes; ++ __u32 eccpos[64]; ++ __u32 oobavail; + struct nand_oobfree oobfree[MTD_MAX_OOBFREE_ENTRIES]; + }; + +@@ -131,10 +133,10 @@ struct nand_ecclayout { + * @bbtblocks: number of blocks reserved for bad block tables + */ + struct mtd_ecc_stats { +- uint32_t corrected; +- uint32_t failed; +- uint32_t badblocks; +- uint32_t bbtblocks; ++ __u32 corrected; ++ __u32 failed; ++ __u32 badblocks; ++ __u32 bbtblocks; + }; + + /* +Index: linux-2.6-tip/include/mtd/nftl-user.h +=================================================================== +--- linux-2.6-tip.orig/include/mtd/nftl-user.h ++++ linux-2.6-tip/include/mtd/nftl-user.h +@@ -6,33 +6,35 @@ + #ifndef __MTD_NFTL_USER_H__ + #define __MTD_NFTL_USER_H__ + ++#include ++ + /* Block Control Information */ + + struct nftl_bci { + unsigned char ECCSig[6]; +- uint8_t Status; +- uint8_t Status1; ++ __u8 Status; ++ __u8 Status1; + }__attribute__((packed)); + + /* Unit Control Information */ + + struct nftl_uci0 { +- uint16_t VirtUnitNum; +- uint16_t ReplUnitNum; +- uint16_t SpareVirtUnitNum; +- uint16_t SpareReplUnitNum; ++ __u16 VirtUnitNum; ++ __u16 ReplUnitNum; ++ __u16 SpareVirtUnitNum; ++ __u16 SpareReplUnitNum; + } __attribute__((packed)); + + struct nftl_uci1 { +- uint32_t WearInfo; +- uint16_t EraseMark; +- uint16_t EraseMark1; ++ __u32 WearInfo; ++ __u16 EraseMark; ++ __u16 EraseMark1; + } __attribute__((packed)); + + 
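
The uint32_t to __u32 conversions above exist so these headers can be consumed from user space with nothing beyond <linux/types.h>. A user-space sketch of the MTD ABI defined here, not part of the patch (the device node is just an example):

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <mtd/mtd-abi.h>	/* struct mtd_info_user, MEMGETINFO */

int main(void)
{
	struct mtd_info_user info;
	int fd = open("/dev/mtd0", O_RDONLY);

	if (fd < 0 || ioctl(fd, MEMGETINFO, &info) < 0) {
		perror("MEMGETINFO");
		return 1;
	}
	/* All fields are fixed-width __u32/__u8 as defined above. */
	printf("size=%u erasesize=%u oobsize=%u\n",
	       info.size, info.erasesize, info.oobsize);
	return 0;
}
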
struct nftl_uci2 { +- uint16_t FoldMark; +- uint16_t FoldMark1; +- uint32_t unused; ++ __u16 FoldMark; ++ __u16 FoldMark1; ++ __u32 unused; + } __attribute__((packed)); + + union nftl_uci { +@@ -50,9 +52,9 @@ struct nftl_oob { + + struct NFTLMediaHeader { + char DataOrgID[6]; +- uint16_t NumEraseUnits; +- uint16_t FirstPhysicalEUN; +- uint32_t FormattedSize; ++ __u16 NumEraseUnits; ++ __u16 FirstPhysicalEUN; ++ __u32 FormattedSize; + unsigned char UnitSizeFactor; + } __attribute__((packed)); + +Index: linux-2.6-tip/include/mtd/ubi-user.h +=================================================================== +--- linux-2.6-tip.orig/include/mtd/ubi-user.h ++++ linux-2.6-tip/include/mtd/ubi-user.h +@@ -21,6 +21,8 @@ + #ifndef __UBI_USER_H__ + #define __UBI_USER_H__ + ++#include ++ + /* + * UBI device creation (the same as MTD device attachment) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +@@ -152,7 +154,7 @@ + /* Create an UBI volume */ + #define UBI_IOCMKVOL _IOW(UBI_IOC_MAGIC, 0, struct ubi_mkvol_req) + /* Remove an UBI volume */ +-#define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, int32_t) ++#define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, __s32) + /* Re-size an UBI volume */ + #define UBI_IOCRSVOL _IOW(UBI_IOC_MAGIC, 2, struct ubi_rsvol_req) + /* Re-name volumes */ +@@ -165,24 +167,24 @@ + /* Attach an MTD device */ + #define UBI_IOCATT _IOW(UBI_CTRL_IOC_MAGIC, 64, struct ubi_attach_req) + /* Detach an MTD device */ +-#define UBI_IOCDET _IOW(UBI_CTRL_IOC_MAGIC, 65, int32_t) ++#define UBI_IOCDET _IOW(UBI_CTRL_IOC_MAGIC, 65, __s32) + + /* ioctl commands of UBI volume character devices */ + + #define UBI_VOL_IOC_MAGIC 'O' + + /* Start UBI volume update */ +-#define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, int64_t) ++#define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, __s64) + /* LEB erasure command, used for debugging, disabled by default */ +-#define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, int32_t) ++#define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, __s32) + /* Atomic LEB change command */ +-#define UBI_IOCEBCH _IOW(UBI_VOL_IOC_MAGIC, 2, int32_t) ++#define UBI_IOCEBCH _IOW(UBI_VOL_IOC_MAGIC, 2, __s32) + /* Map LEB command */ + #define UBI_IOCEBMAP _IOW(UBI_VOL_IOC_MAGIC, 3, struct ubi_map_req) + /* Unmap LEB command */ +-#define UBI_IOCEBUNMAP _IOW(UBI_VOL_IOC_MAGIC, 4, int32_t) ++#define UBI_IOCEBUNMAP _IOW(UBI_VOL_IOC_MAGIC, 4, __s32) + /* Check if LEB is mapped command */ +-#define UBI_IOCEBISMAP _IOR(UBI_VOL_IOC_MAGIC, 5, int32_t) ++#define UBI_IOCEBISMAP _IOR(UBI_VOL_IOC_MAGIC, 5, __s32) + /* Set an UBI volume property */ + #define UBI_IOCSETPROP _IOW(UBI_VOL_IOC_MAGIC, 6, struct ubi_set_prop_req) + +@@ -260,10 +262,10 @@ enum { + * sub-page of the first page and add needed padding. + */ + struct ubi_attach_req { +- int32_t ubi_num; +- int32_t mtd_num; +- int32_t vid_hdr_offset; +- int8_t padding[12]; ++ __s32 ubi_num; ++ __s32 mtd_num; ++ __s32 vid_hdr_offset; ++ __s8 padding[12]; + }; + + /** +@@ -298,13 +300,13 @@ struct ubi_attach_req { + * BLOBs, without caring about how to properly align them. + */ + struct ubi_mkvol_req { +- int32_t vol_id; +- int32_t alignment; +- int64_t bytes; +- int8_t vol_type; +- int8_t padding1; +- int16_t name_len; +- int8_t padding2[4]; ++ __s32 vol_id; ++ __s32 alignment; ++ __s64 bytes; ++ __s8 vol_type; ++ __s8 padding1; ++ __s16 name_len; ++ __s8 padding2[4]; + char name[UBI_MAX_VOLUME_NAME + 1]; + } __attribute__ ((packed)); + +@@ -320,8 +322,8 @@ struct ubi_mkvol_req { + * zero number of bytes). 
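
Likewise for the UBI ABI just converted to __s32/__s64: a user-space sketch, not part of the patch, of creating a volume through UBI_IOCMKVOL. The UBI_DYNAMIC_VOLUME constant comes from the same header; device node, size and name are invented.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <mtd/ubi-user.h>	/* struct ubi_mkvol_req, UBI_IOCMKVOL */

int main(void)
{
	struct ubi_mkvol_req req;
	int fd = open("/dev/ubi0", O_RDWR);

	memset(&req, 0, sizeof(req));		/* padding fields must be zeroed */
	req.vol_id = 0;				/* __s32 */
	req.alignment = 1;
	req.bytes = 1024 * 1024;		/* __s64: 1 MiB volume */
	req.vol_type = UBI_DYNAMIC_VOLUME;	/* constant from the same header */
	strcpy(req.name, "example");
	req.name_len = strlen(req.name);	/* __s16 */

	if (fd < 0 || ioctl(fd, UBI_IOCMKVOL, &req) < 0) {
		perror("UBI_IOCMKVOL");
		return 1;
	}
	return 0;
}
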
+ */ + struct ubi_rsvol_req { +- int64_t bytes; +- int32_t vol_id; ++ __s64 bytes; ++ __s32 vol_id; + } __attribute__ ((packed)); + + /** +@@ -356,12 +358,12 @@ struct ubi_rsvol_req { + * re-name request. + */ + struct ubi_rnvol_req { +- int32_t count; +- int8_t padding1[12]; ++ __s32 count; ++ __s8 padding1[12]; + struct { +- int32_t vol_id; +- int16_t name_len; +- int8_t padding2[2]; ++ __s32 vol_id; ++ __s16 name_len; ++ __s8 padding2[2]; + char name[UBI_MAX_VOLUME_NAME + 1]; + } ents[UBI_MAX_RNVOL]; + } __attribute__ ((packed)); +@@ -375,10 +377,10 @@ struct ubi_rnvol_req { + * @padding: reserved for future, not used, has to be zeroed + */ + struct ubi_leb_change_req { +- int32_t lnum; +- int32_t bytes; +- int8_t dtype; +- int8_t padding[7]; ++ __s32 lnum; ++ __s32 bytes; ++ __s8 dtype; ++ __s8 padding[7]; + } __attribute__ ((packed)); + + /** +@@ -388,9 +390,9 @@ struct ubi_leb_change_req { + * @padding: reserved for future, not used, has to be zeroed + */ + struct ubi_map_req { +- int32_t lnum; +- int8_t dtype; +- int8_t padding[3]; ++ __s32 lnum; ++ __s8 dtype; ++ __s8 padding[3]; + } __attribute__ ((packed)); + + +@@ -402,9 +404,9 @@ struct ubi_map_req { + * @value: value to set + */ + struct ubi_set_prop_req { +- uint8_t property; +- uint8_t padding[7]; +- uint64_t value; ++ __u8 property; ++ __u8 padding[7]; ++ __u64 value; + } __attribute__ ((packed)); + + #endif /* __UBI_USER_H__ */ +Index: linux-2.6-tip/include/net/inet_sock.h +=================================================================== +--- linux-2.6-tip.orig/include/net/inet_sock.h ++++ linux-2.6-tip/include/net/inet_sock.h +@@ -17,6 +17,7 @@ + #define _INET_SOCK_H + + ++#include + #include + #include + #include +@@ -66,14 +67,16 @@ struct inet_request_sock { + __be32 loc_addr; + __be32 rmt_addr; + __be16 rmt_port; +- u16 snd_wscale : 4, +- rcv_wscale : 4, +- tstamp_ok : 1, +- sack_ok : 1, +- wscale_ok : 1, +- ecn_ok : 1, +- acked : 1, +- no_srccheck: 1; ++ kmemcheck_define_bitfield(flags, { ++ u16 snd_wscale : 4, ++ rcv_wscale : 4, ++ tstamp_ok : 1, ++ sack_ok : 1, ++ wscale_ok : 1, ++ ecn_ok : 1, ++ acked : 1, ++ no_srccheck: 1; ++ }); + struct ip_options *opt; + }; + +@@ -198,9 +201,12 @@ static inline int inet_sk_ehashfn(const + static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops) + { + struct request_sock *req = reqsk_alloc(ops); ++ struct inet_request_sock *ireq = inet_rsk(req); + +- if (req != NULL) +- inet_rsk(req)->opt = NULL; ++ if (req != NULL) { ++ kmemcheck_annotate_bitfield(ireq->flags); ++ ireq->opt = NULL; ++ } + + return req; + } +Index: linux-2.6-tip/include/net/inet_timewait_sock.h +=================================================================== +--- linux-2.6-tip.orig/include/net/inet_timewait_sock.h ++++ linux-2.6-tip/include/net/inet_timewait_sock.h +@@ -16,6 +16,7 @@ + #define _INET_TIMEWAIT_SOCK_ + + ++#include + #include + #include + #include +@@ -127,10 +128,12 @@ struct inet_timewait_sock { + __be32 tw_rcv_saddr; + __be16 tw_dport; + __u16 tw_num; +- /* And these are ours. */ +- __u8 tw_ipv6only:1, +- tw_transparent:1; +- /* 15 bits hole, try to pack */ ++ kmemcheck_define_bitfield(flags, { ++ /* And these are ours. 
*/ ++ __u8 tw_ipv6only:1, ++ tw_transparent:1; ++ /* 14 bits hole, try to pack */ ++ }); + __u16 tw_ipv6_offset; + unsigned long tw_ttd; + struct inet_bind_bucket *tw_tb; +Index: linux-2.6-tip/include/net/sock.h +=================================================================== +--- linux-2.6-tip.orig/include/net/sock.h ++++ linux-2.6-tip/include/net/sock.h +@@ -218,9 +218,11 @@ struct sock { + #define sk_hash __sk_common.skc_hash + #define sk_prot __sk_common.skc_prot + #define sk_net __sk_common.skc_net +- unsigned char sk_shutdown : 2, +- sk_no_check : 2, +- sk_userlocks : 4; ++ kmemcheck_define_bitfield(flags, { ++ unsigned char sk_shutdown : 2, ++ sk_no_check : 2, ++ sk_userlocks : 4; ++ }); + unsigned char sk_protocol; + unsigned short sk_type; + int sk_rcvbuf; +Index: linux-2.6-tip/include/sound/asound.h +=================================================================== +--- linux-2.6-tip.orig/include/sound/asound.h ++++ linux-2.6-tip/include/sound/asound.h +@@ -23,9 +23,10 @@ + #ifndef __SOUND_ASOUND_H + #define __SOUND_ASOUND_H + ++#include ++ + #ifdef __KERNEL__ + #include +-#include + #include + #include + +@@ -342,7 +343,7 @@ struct snd_interval { + #define SNDRV_MASK_MAX 256 + + struct snd_mask { +- u_int32_t bits[(SNDRV_MASK_MAX+31)/32]; ++ __u32 bits[(SNDRV_MASK_MAX+31)/32]; + }; + + struct snd_pcm_hw_params { +@@ -385,7 +386,7 @@ struct snd_pcm_sw_params { + + struct snd_pcm_channel_info { + unsigned int channel; +- off_t offset; /* mmap offset */ ++ __kernel_off_t offset; /* mmap offset */ + unsigned int first; /* offset to first sample in bits */ + unsigned int step; /* samples distance in bits */ + }; +@@ -789,7 +790,7 @@ struct snd_ctl_elem_info { + snd_ctl_elem_type_t type; /* R: value type - SNDRV_CTL_ELEM_TYPE_* */ + unsigned int access; /* R: value access (bitmask) - SNDRV_CTL_ELEM_ACCESS_* */ + unsigned int count; /* count of values */ +- pid_t owner; /* owner's PID of this control */ ++ __kernel_pid_t owner; /* owner's PID of this control */ + union { + struct { + long min; /* R: minimum value */ +Index: linux-2.6-tip/include/sound/emu10k1.h +=================================================================== +--- linux-2.6-tip.orig/include/sound/emu10k1.h ++++ linux-2.6-tip/include/sound/emu10k1.h +@@ -1,6 +1,8 @@ + #ifndef __SOUND_EMU10K1_H + #define __SOUND_EMU10K1_H + ++#include ++ + /* + * Copyright (c) by Jaroslav Kysela , + * Creative Labs, Inc. 
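
The kmemcheck_define_bitfield()/kmemcheck_annotate_bitfield() pairs above give a C bitfield group a name so that the allocation path can mark the whole word as initialized, and kmemcheck does not warn about the never-written hole bits. A sketch of the same pattern on an invented structure, not part of the patch:

#include <linux/kmemcheck.h>
#include <linux/slab.h>

struct example_flags {
	kmemcheck_define_bitfield(flags, {
		unsigned char	ex_shutdown  : 2,
				ex_no_check  : 2,
				ex_userlocks : 4;
	});
	int			ex_rcvbuf;
};

static struct example_flags *example_alloc(gfp_t gfp)
{
	struct example_flags *ef = kmalloc(sizeof(*ef), gfp);

	if (ef)
		/* Mark the bitfield word as defined, mirroring the
		 * inet_reqsk_alloc() change above. */
		kmemcheck_annotate_bitfield(ef->flags);
	return ef;
}
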
+@@ -34,6 +36,7 @@ + #include + #include + #include ++ + #include + + /* ------------------- DEFINES -------------------- */ +@@ -2171,7 +2174,7 @@ struct snd_emu10k1_fx8010_code { + char name[128]; + + DECLARE_BITMAP(gpr_valid, 0x200); /* bitmask of valid initializers */ +- u_int32_t __user *gpr_map; /* initializers */ ++ __u32 __user *gpr_map; /* initializers */ + + unsigned int gpr_add_control_count; /* count of GPR controls to add/replace */ + struct snd_emu10k1_fx8010_control_gpr __user *gpr_add_controls; /* GPR controls to add/replace */ +@@ -2184,11 +2187,11 @@ struct snd_emu10k1_fx8010_code { + struct snd_emu10k1_fx8010_control_gpr __user *gpr_list_controls; /* listed GPR controls */ + + DECLARE_BITMAP(tram_valid, 0x100); /* bitmask of valid initializers */ +- u_int32_t __user *tram_data_map; /* data initializers */ +- u_int32_t __user *tram_addr_map; /* map initializers */ ++ __u32 __user *tram_data_map; /* data initializers */ ++ __u32 __user *tram_addr_map; /* map initializers */ + + DECLARE_BITMAP(code_valid, 1024); /* bitmask of valid instructions */ +- u_int32_t __user *code; /* one instruction - 64 bits */ ++ __u32 __user *code; /* one instruction - 64 bits */ + }; + + struct snd_emu10k1_fx8010_tram { +Index: linux-2.6-tip/include/trace/block.h +=================================================================== +--- linux-2.6-tip.orig/include/trace/block.h ++++ linux-2.6-tip/include/trace/block.h +@@ -5,72 +5,72 @@ + #include + + DECLARE_TRACE(block_rq_abort, +- TPPROTO(struct request_queue *q, struct request *rq), +- TPARGS(q, rq)); ++ TP_PROTO(struct request_queue *q, struct request *rq), ++ TP_ARGS(q, rq)); + + DECLARE_TRACE(block_rq_insert, +- TPPROTO(struct request_queue *q, struct request *rq), +- TPARGS(q, rq)); ++ TP_PROTO(struct request_queue *q, struct request *rq), ++ TP_ARGS(q, rq)); + + DECLARE_TRACE(block_rq_issue, +- TPPROTO(struct request_queue *q, struct request *rq), +- TPARGS(q, rq)); ++ TP_PROTO(struct request_queue *q, struct request *rq), ++ TP_ARGS(q, rq)); + + DECLARE_TRACE(block_rq_requeue, +- TPPROTO(struct request_queue *q, struct request *rq), +- TPARGS(q, rq)); ++ TP_PROTO(struct request_queue *q, struct request *rq), ++ TP_ARGS(q, rq)); + + DECLARE_TRACE(block_rq_complete, +- TPPROTO(struct request_queue *q, struct request *rq), +- TPARGS(q, rq)); ++ TP_PROTO(struct request_queue *q, struct request *rq), ++ TP_ARGS(q, rq)); + + DECLARE_TRACE(block_bio_bounce, +- TPPROTO(struct request_queue *q, struct bio *bio), +- TPARGS(q, bio)); ++ TP_PROTO(struct request_queue *q, struct bio *bio), ++ TP_ARGS(q, bio)); + + DECLARE_TRACE(block_bio_complete, +- TPPROTO(struct request_queue *q, struct bio *bio), +- TPARGS(q, bio)); ++ TP_PROTO(struct request_queue *q, struct bio *bio), ++ TP_ARGS(q, bio)); + + DECLARE_TRACE(block_bio_backmerge, +- TPPROTO(struct request_queue *q, struct bio *bio), +- TPARGS(q, bio)); ++ TP_PROTO(struct request_queue *q, struct bio *bio), ++ TP_ARGS(q, bio)); + + DECLARE_TRACE(block_bio_frontmerge, +- TPPROTO(struct request_queue *q, struct bio *bio), +- TPARGS(q, bio)); ++ TP_PROTO(struct request_queue *q, struct bio *bio), ++ TP_ARGS(q, bio)); + + DECLARE_TRACE(block_bio_queue, +- TPPROTO(struct request_queue *q, struct bio *bio), +- TPARGS(q, bio)); ++ TP_PROTO(struct request_queue *q, struct bio *bio), ++ TP_ARGS(q, bio)); + + DECLARE_TRACE(block_getrq, +- TPPROTO(struct request_queue *q, struct bio *bio, int rw), +- TPARGS(q, bio, rw)); ++ TP_PROTO(struct request_queue *q, struct bio *bio, int rw), ++ TP_ARGS(q, bio, rw)); 
+ + DECLARE_TRACE(block_sleeprq, +- TPPROTO(struct request_queue *q, struct bio *bio, int rw), +- TPARGS(q, bio, rw)); ++ TP_PROTO(struct request_queue *q, struct bio *bio, int rw), ++ TP_ARGS(q, bio, rw)); + + DECLARE_TRACE(block_plug, +- TPPROTO(struct request_queue *q), +- TPARGS(q)); ++ TP_PROTO(struct request_queue *q), ++ TP_ARGS(q)); + + DECLARE_TRACE(block_unplug_timer, +- TPPROTO(struct request_queue *q), +- TPARGS(q)); ++ TP_PROTO(struct request_queue *q), ++ TP_ARGS(q)); + + DECLARE_TRACE(block_unplug_io, +- TPPROTO(struct request_queue *q), +- TPARGS(q)); ++ TP_PROTO(struct request_queue *q), ++ TP_ARGS(q)); + + DECLARE_TRACE(block_split, +- TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), +- TPARGS(q, bio, pdu)); ++ TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), ++ TP_ARGS(q, bio, pdu)); + + DECLARE_TRACE(block_remap, +- TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, +- sector_t from, sector_t to), +- TPARGS(q, bio, dev, from, to)); ++ TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, ++ sector_t from, sector_t to), ++ TP_ARGS(q, bio, dev, from, to)); + + #endif +Index: linux-2.6-tip/include/trace/irq.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/irq.h +@@ -0,0 +1,9 @@ ++#ifndef _TRACE_IRQ_H ++#define _TRACE_IRQ_H ++ ++#include ++#include ++ ++#include ++ ++#endif +Index: linux-2.6-tip/include/trace/irq_event_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/irq_event_types.h +@@ -0,0 +1,55 @@ ++ ++/* use instead */ ++#ifndef TRACE_FORMAT ++# error Do not include this file directly. ++# error Unless you know what you are doing. ++#endif ++ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM irq ++ ++/* ++ * Tracepoint for entry of interrupt handler: ++ */ ++TRACE_FORMAT(irq_handler_entry, ++ TP_PROTO(int irq, struct irqaction *action), ++ TP_ARGS(irq, action), ++ TP_FMT("irq=%d handler=%s", irq, action->name) ++ ); ++ ++/* ++ * Tracepoint for return of an interrupt handler: ++ */ ++TRACE_EVENT(irq_handler_exit, ++ ++ TP_PROTO(int irq, struct irqaction *action, int ret), ++ ++ TP_ARGS(irq, action, ret), ++ ++ TP_STRUCT__entry( ++ __field( int, irq ) ++ __field( int, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->irq = irq; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("irq=%d return=%s", ++ __entry->irq, __entry->ret ? "handled" : "unhandled") ++); ++ ++TRACE_FORMAT(softirq_entry, ++ TP_PROTO(struct softirq_action *h, struct softirq_action *vec), ++ TP_ARGS(h, vec), ++ TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) ++ ); ++ ++TRACE_FORMAT(softirq_exit, ++ TP_PROTO(struct softirq_action *h, struct softirq_action *vec), ++ TP_ARGS(h, vec), ++ TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) ++ ); ++ ++#undef TRACE_SYSTEM +Index: linux-2.6-tip/include/trace/kmemtrace.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/kmemtrace.h +@@ -0,0 +1,63 @@ ++/* ++ * Copyright (C) 2008 Eduard - Gabriel Munteanu ++ * ++ * This file is released under GPL version 2. 
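
The irq and softirq events above are declared with TRACE_FORMAT()/TRACE_EVENT(), which currently reduce to DECLARE_TRACE(); the interrupt path is expected to fire them around the handler invocation, roughly as sketched below. This is not part of the patch: the real call sites live in kernel/irq/ and kernel/softirq.c, and the function here is heavily simplified.

#include <linux/interrupt.h>
#include <trace/irq.h>

static irqreturn_t example_handle_irq_event(int irq, struct irqaction *action)
{
	irqreturn_t ret;

	trace_irq_handler_entry(irq, action);
	ret = action->handler(irq, action->dev_id);
	/* third argument is the handled/unhandled flag used by TP_printk() */
	trace_irq_handler_exit(irq, action, ret != IRQ_NONE);
	return ret;
}
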
++ */ ++ ++#ifndef _LINUX_KMEMTRACE_H ++#define _LINUX_KMEMTRACE_H ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++#ifdef CONFIG_KMEMTRACE ++extern void kmemtrace_init(void); ++#else ++static inline void kmemtrace_init(void) ++{ ++} ++#endif ++ ++DECLARE_TRACE(kmalloc, ++ TP_PROTO(unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags), ++ TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); ++DECLARE_TRACE(kmem_cache_alloc, ++ TP_PROTO(unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags), ++ TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); ++DECLARE_TRACE(kmalloc_node, ++ TP_PROTO(unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags, ++ int node), ++ TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); ++DECLARE_TRACE(kmem_cache_alloc_node, ++ TP_PROTO(unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags, ++ int node), ++ TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); ++DECLARE_TRACE(kfree, ++ TP_PROTO(unsigned long call_site, const void *ptr), ++ TP_ARGS(call_site, ptr)); ++DECLARE_TRACE(kmem_cache_free, ++ TP_PROTO(unsigned long call_site, const void *ptr), ++ TP_ARGS(call_site, ptr)); ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _LINUX_KMEMTRACE_H */ ++ +Index: linux-2.6-tip/include/trace/lockdep.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/lockdep.h +@@ -0,0 +1,9 @@ ++#ifndef _TRACE_LOCKDEP_H ++#define _TRACE_LOCKDEP_H ++ ++#include ++#include ++ ++#include ++ ++#endif +Index: linux-2.6-tip/include/trace/lockdep_event_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/lockdep_event_types.h +@@ -0,0 +1,44 @@ ++ ++#ifndef TRACE_FORMAT ++# error Do not include this file directly. ++# error Unless you know what you are doing. ++#endif ++ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM lock ++ ++#ifdef CONFIG_LOCKDEP ++ ++TRACE_FORMAT(lock_acquire, ++ TP_PROTO(struct lockdep_map *lock, unsigned int subclass, ++ int trylock, int read, int check, ++ struct lockdep_map *next_lock, unsigned long ip), ++ TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), ++ TP_FMT("%s%s%s", trylock ? "try " : "", ++ read ? 
"read " : "", lock->name) ++ ); ++ ++TRACE_FORMAT(lock_release, ++ TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), ++ TP_ARGS(lock, nested, ip), ++ TP_FMT("%s", lock->name) ++ ); ++ ++#ifdef CONFIG_LOCK_STAT ++ ++TRACE_FORMAT(lock_contended, ++ TP_PROTO(struct lockdep_map *lock, unsigned long ip), ++ TP_ARGS(lock, ip), ++ TP_FMT("%s", lock->name) ++ ); ++ ++TRACE_FORMAT(lock_acquired, ++ TP_PROTO(struct lockdep_map *lock, unsigned long ip), ++ TP_ARGS(lock, ip), ++ TP_FMT("%s", lock->name) ++ ); ++ ++#endif ++#endif ++ ++#undef TRACE_SYSTEM +Index: linux-2.6-tip/include/trace/power.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/power.h +@@ -0,0 +1,32 @@ ++#ifndef _TRACE_POWER_H ++#define _TRACE_POWER_H ++ ++#include ++#include ++ ++enum { ++ POWER_NONE = 0, ++ POWER_CSTATE = 1, ++ POWER_PSTATE = 2, ++}; ++ ++struct power_trace { ++ ktime_t stamp; ++ ktime_t end; ++ int type; ++ int state; ++}; ++ ++DECLARE_TRACE(power_start, ++ TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), ++ TP_ARGS(it, type, state)); ++ ++DECLARE_TRACE(power_mark, ++ TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), ++ TP_ARGS(it, type, state)); ++ ++DECLARE_TRACE(power_end, ++ TP_PROTO(struct power_trace *it), ++ TP_ARGS(it)); ++ ++#endif /* _TRACE_POWER_H */ +Index: linux-2.6-tip/include/trace/sched.h +=================================================================== +--- linux-2.6-tip.orig/include/trace/sched.h ++++ linux-2.6-tip/include/trace/sched.h +@@ -4,53 +4,6 @@ + #include + #include + +-DECLARE_TRACE(sched_kthread_stop, +- TPPROTO(struct task_struct *t), +- TPARGS(t)); +- +-DECLARE_TRACE(sched_kthread_stop_ret, +- TPPROTO(int ret), +- TPARGS(ret)); +- +-DECLARE_TRACE(sched_wait_task, +- TPPROTO(struct rq *rq, struct task_struct *p), +- TPARGS(rq, p)); +- +-DECLARE_TRACE(sched_wakeup, +- TPPROTO(struct rq *rq, struct task_struct *p, int success), +- TPARGS(rq, p, success)); +- +-DECLARE_TRACE(sched_wakeup_new, +- TPPROTO(struct rq *rq, struct task_struct *p, int success), +- TPARGS(rq, p, success)); +- +-DECLARE_TRACE(sched_switch, +- TPPROTO(struct rq *rq, struct task_struct *prev, +- struct task_struct *next), +- TPARGS(rq, prev, next)); +- +-DECLARE_TRACE(sched_migrate_task, +- TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), +- TPARGS(p, orig_cpu, dest_cpu)); +- +-DECLARE_TRACE(sched_process_free, +- TPPROTO(struct task_struct *p), +- TPARGS(p)); +- +-DECLARE_TRACE(sched_process_exit, +- TPPROTO(struct task_struct *p), +- TPARGS(p)); +- +-DECLARE_TRACE(sched_process_wait, +- TPPROTO(struct pid *pid), +- TPARGS(pid)); +- +-DECLARE_TRACE(sched_process_fork, +- TPPROTO(struct task_struct *parent, struct task_struct *child), +- TPARGS(parent, child)); +- +-DECLARE_TRACE(sched_signal_send, +- TPPROTO(int sig, struct task_struct *p), +- TPARGS(sig, p)); ++#include + + #endif +Index: linux-2.6-tip/include/trace/sched_event_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/sched_event_types.h +@@ -0,0 +1,368 @@ ++ ++/* use instead */ ++#ifndef TRACE_EVENT ++# error Do not include this file directly. ++# error Unless you know what you are doing. 
++#endif ++ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM sched ++ ++/* ++ * Tracepoint for calling kthread_stop, performed to end a kthread: ++ */ ++TRACE_EVENT(sched_kthread_stop, ++ ++ TP_PROTO(struct task_struct *t), ++ ++ TP_ARGS(t), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ ), ++ ++ TP_printk("task %s:%d", __entry->comm, __entry->pid) ++); ++ ++/* ++ * Tracepoint for the return value of the kthread stopping: ++ */ ++TRACE_EVENT(sched_kthread_stop_ret, ++ ++ TP_PROTO(int ret), ++ ++ TP_ARGS(ret), ++ ++ TP_STRUCT__entry( ++ __field( int, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("ret %d", __entry->ret) ++); ++ ++/* ++ * Tracepoint for waiting on task to unschedule: ++ * ++ * (NOTE: the 'rq' argument is not used by generic trace events, ++ * but used by the latency tracer plugin. ) ++ */ ++TRACE_EVENT(sched_wait_task, ++ ++ TP_PROTO(struct rq *rq, struct task_struct *p), ++ ++ TP_ARGS(rq, p), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, prio ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, p->comm, TASK_COMM_LEN); ++ __entry->pid = p->pid; ++ __entry->prio = p->prio; ++ ), ++ ++ TP_printk("task %s:%d [%d]", ++ __entry->comm, __entry->pid, __entry->prio) ++); ++ ++/* ++ * Tracepoint for waking up a task: ++ * ++ * (NOTE: the 'rq' argument is not used by generic trace events, ++ * but used by the latency tracer plugin. ) ++ */ ++TRACE_EVENT(sched_wakeup, ++ ++ TP_PROTO(struct rq *rq, struct task_struct *p, int success), ++ ++ TP_ARGS(rq, p, success), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, prio ) ++ __field( int, success ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, p->comm, TASK_COMM_LEN); ++ __entry->pid = p->pid; ++ __entry->prio = p->prio; ++ __entry->success = success; ++ ), ++ ++ TP_printk("task %s:%d [%d] success=%d", ++ __entry->comm, __entry->pid, __entry->prio, ++ __entry->success) ++); ++ ++/* ++ * Tracepoint for waking up a new task: ++ * ++ * (NOTE: the 'rq' argument is not used by generic trace events, ++ * but used by the latency tracer plugin. ) ++ */ ++TRACE_EVENT(sched_wakeup_new, ++ ++ TP_PROTO(struct rq *rq, struct task_struct *p, int success), ++ ++ TP_ARGS(rq, p, success), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, prio ) ++ __field( int, success ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, p->comm, TASK_COMM_LEN); ++ __entry->pid = p->pid; ++ __entry->prio = p->prio; ++ __entry->success = success; ++ ), ++ ++ TP_printk("task %s:%d [%d] success=%d", ++ __entry->comm, __entry->pid, __entry->prio, ++ __entry->success) ++); ++ ++/* ++ * Tracepoint for task switches, performed by the scheduler: ++ * ++ * (NOTE: the 'rq' argument is not used by generic trace events, ++ * but used by the latency tracer plugin. 
) ++ */ ++TRACE_EVENT(sched_switch, ++ ++ TP_PROTO(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next), ++ ++ TP_ARGS(rq, prev, next), ++ ++ TP_STRUCT__entry( ++ __array( char, prev_comm, TASK_COMM_LEN ) ++ __field( pid_t, prev_pid ) ++ __field( int, prev_prio ) ++ __array( char, next_comm, TASK_COMM_LEN ) ++ __field( pid_t, next_pid ) ++ __field( int, next_prio ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); ++ __entry->prev_pid = prev->pid; ++ __entry->prev_prio = prev->prio; ++ memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); ++ __entry->next_pid = next->pid; ++ __entry->next_prio = next->prio; ++ ), ++ ++ TP_printk("task %s:%d [%d] ==> %s:%d [%d]", ++ __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, ++ __entry->next_comm, __entry->next_pid, __entry->next_prio) ++); ++ ++/* ++ * Tracepoint for a task being migrated: ++ */ ++TRACE_EVENT(sched_migrate_task, ++ ++ TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), ++ ++ TP_ARGS(p, orig_cpu, dest_cpu), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, prio ) ++ __field( int, orig_cpu ) ++ __field( int, dest_cpu ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, p->comm, TASK_COMM_LEN); ++ __entry->pid = p->pid; ++ __entry->prio = p->prio; ++ __entry->orig_cpu = orig_cpu; ++ __entry->dest_cpu = dest_cpu; ++ ), ++ ++ TP_printk("task %s:%d [%d] from: %d to: %d", ++ __entry->comm, __entry->pid, __entry->prio, ++ __entry->orig_cpu, __entry->dest_cpu) ++); ++ ++/* ++ * Tracepoint for freeing a task: ++ */ ++TRACE_EVENT(sched_process_free, ++ ++ TP_PROTO(struct task_struct *p), ++ ++ TP_ARGS(p), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, prio ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, p->comm, TASK_COMM_LEN); ++ __entry->pid = p->pid; ++ __entry->prio = p->prio; ++ ), ++ ++ TP_printk("task %s:%d [%d]", ++ __entry->comm, __entry->pid, __entry->prio) ++); ++ ++/* ++ * Tracepoint for a task exiting: ++ */ ++TRACE_EVENT(sched_process_exit, ++ ++ TP_PROTO(struct task_struct *p), ++ ++ TP_ARGS(p), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, prio ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, p->comm, TASK_COMM_LEN); ++ __entry->pid = p->pid; ++ __entry->prio = p->prio; ++ ), ++ ++ TP_printk("task %s:%d [%d]", ++ __entry->comm, __entry->pid, __entry->prio) ++); ++ ++/* ++ * Tracepoint for priority boosting/deboosting of a task: ++ * ++ * (NOTE: the 'rq' argument is not used by generic trace events, ++ * but used by the latency tracer plugin. 
) ++ */ ++TRACE_EVENT(sched_task_setprio, ++ ++ TP_PROTO(struct rq *rq, struct task_struct *p, int oldprio), ++ ++ TP_ARGS(rq, p, oldprio), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, prio ) ++ __field( int, oldprio ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, p->comm, TASK_COMM_LEN); ++ __entry->pid = p->pid; ++ __entry->prio = p->prio; ++ __entry->oldprio = oldprio; ++ ), ++ ++ TP_printk("task %s:%d [%d] oldprio=%d", ++ __entry->comm, __entry->pid, __entry->prio, ++ __entry->oldprio) ++); ++ ++/* ++ * Tracepoint for a waiting task: ++ */ ++TRACE_EVENT(sched_process_wait, ++ ++ TP_PROTO(struct pid *pid), ++ ++ TP_ARGS(pid), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, prio ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ++ __entry->pid = pid_nr(pid); ++ __entry->prio = current->prio; ++ ), ++ ++ TP_printk("task %s:%d [%d]", ++ __entry->comm, __entry->pid, __entry->prio) ++); ++ ++/* ++ * Tracepoint for do_fork: ++ */ ++TRACE_EVENT(sched_process_fork, ++ ++ TP_PROTO(struct task_struct *parent, struct task_struct *child), ++ ++ TP_ARGS(parent, child), ++ ++ TP_STRUCT__entry( ++ __array( char, parent_comm, TASK_COMM_LEN ) ++ __field( pid_t, parent_pid ) ++ __array( char, child_comm, TASK_COMM_LEN ) ++ __field( pid_t, child_pid ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); ++ __entry->parent_pid = parent->pid; ++ memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); ++ __entry->child_pid = child->pid; ++ ), ++ ++ TP_printk("parent %s:%d child %s:%d", ++ __entry->parent_comm, __entry->parent_pid, ++ __entry->child_comm, __entry->child_pid) ++); ++ ++/* ++ * Tracepoint for sending a signal: ++ */ ++TRACE_EVENT(sched_signal_send, ++ ++ TP_PROTO(int sig, struct task_struct *p), ++ ++ TP_ARGS(sig, p), ++ ++ TP_STRUCT__entry( ++ __field( int, sig ) ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, p->comm, TASK_COMM_LEN); ++ __entry->pid = p->pid; ++ __entry->sig = sig; ++ ), ++ ++ TP_printk("sig: %d task %s:%d", ++ __entry->sig, __entry->comm, __entry->pid) ++); ++ ++#undef TRACE_SYSTEM +Index: linux-2.6-tip/include/trace/trace_event_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/trace_event_types.h +@@ -0,0 +1,5 @@ ++/* trace/_event_types.h here */ ++ ++#include ++#include ++#include +Index: linux-2.6-tip/include/trace/trace_events.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/trace_events.h +@@ -0,0 +1,5 @@ ++/* trace/.h here */ ++ ++#include ++#include ++#include +Index: linux-2.6-tip/include/trace/workqueue.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/trace/workqueue.h +@@ -0,0 +1,25 @@ ++#ifndef __TRACE_WORKQUEUE_H ++#define __TRACE_WORKQUEUE_H ++ ++#include ++#include ++#include ++ ++DECLARE_TRACE(workqueue_insertion, ++ TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), ++ TP_ARGS(wq_thread, work)); ++ ++DECLARE_TRACE(workqueue_execution, ++ TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), ++ TP_ARGS(wq_thread, work)); ++ ++/* Trace the creation of one workqueue thread on a cpu */ ++DECLARE_TRACE(workqueue_creation, ++ TP_PROTO(struct task_struct *wq_thread, 
int cpu), ++ TP_ARGS(wq_thread, cpu)); ++ ++DECLARE_TRACE(workqueue_destruction, ++ TP_PROTO(struct task_struct *wq_thread), ++ TP_ARGS(wq_thread)); ++ ++#endif /* __TRACE_WORKQUEUE_H */ +Index: linux-2.6-tip/init/Kconfig +=================================================================== +--- linux-2.6-tip.orig/init/Kconfig ++++ linux-2.6-tip/init/Kconfig +@@ -101,6 +101,66 @@ config LOCALVERSION_AUTO + + which is done within the script "scripts/setlocalversion".) + ++config HAVE_KERNEL_GZIP ++ bool ++ ++config HAVE_KERNEL_BZIP2 ++ bool ++ ++config HAVE_KERNEL_LZMA ++ bool ++ ++choice ++ prompt "Kernel compression mode" ++ default KERNEL_GZIP ++ depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA ++ help ++ The linux kernel is a kind of self-extracting executable. ++ Several compression algorithms are available, which differ ++ in efficiency, compression and decompression speed. ++ Compression speed is only relevant when building a kernel. ++ Decompression speed is relevant at each boot. ++ ++ If you have any problems with bzip2 or lzma compressed ++ kernels, mail me (Alain Knaff) . (An older ++ version of this functionality (bzip2 only), for 2.4, was ++ supplied by Christian Ludwig) ++ ++ High compression options are mostly useful for users, who ++ are low on disk space (embedded systems), but for whom ram ++ size matters less. ++ ++ If in doubt, select 'gzip' ++ ++config KERNEL_GZIP ++ bool "Gzip" ++ depends on HAVE_KERNEL_GZIP ++ help ++ The old and tried gzip compression. Its compression ratio is ++ the poorest among the 3 choices; however its speed (both ++ compression and decompression) is the fastest. ++ ++config KERNEL_BZIP2 ++ bool "Bzip2" ++ depends on HAVE_KERNEL_BZIP2 ++ help ++ Its compression ratio and speed is intermediate. ++ Decompression speed is slowest among the three. The kernel ++ size is about 10% smaller with bzip2, in comparison to gzip. ++ Bzip2 uses a large amount of memory. For modern kernels you ++ will need at least 8MB RAM or more for booting. ++ ++config KERNEL_LZMA ++ bool "LZMA" ++ depends on HAVE_KERNEL_LZMA ++ help ++ The most recent compression algorithm. ++ Its ratio is best, decompression speed is between the other ++ two. Compression is slowest. The kernel size is about 33% ++ smaller with LZMA in comparison to gzip. ++ ++endchoice ++ + config SWAP + bool "Support for paging of anonymous memory (swap)" + depends on MMU && BLOCK +@@ -246,6 +306,7 @@ choice + + config CLASSIC_RCU + bool "Classic RCU" ++ depends on !PREEMPT_RT + help + This option selects the classic RCU implementation that is + designed for best read-side performance on non-realtime +@@ -255,6 +316,7 @@ config CLASSIC_RCU + + config TREE_RCU + bool "Tree-based hierarchical RCU" ++ depends on !PREEMPT_RT + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or +@@ -857,6 +919,41 @@ config AIO + by some high performance threaded applications. Disabling + this option saves about 7k. + ++config HAVE_PERF_COUNTERS ++ bool ++ ++menu "Performance Counters" ++ ++config PERF_COUNTERS ++ bool "Kernel Performance Counters" ++ depends on HAVE_PERF_COUNTERS ++ default y ++ select ANON_INODES ++ help ++ Enable kernel support for performance counter hardware. ++ ++ Performance counters are special hardware registers available ++ on most modern CPUs. 
These registers count the number of certain ++ types of hw events: such as instructions executed, cachemisses ++ suffered, or branches mis-predicted - without slowing down the ++ kernel or applications. These registers can also trigger interrupts ++ when a threshold number of events have passed - and can thus be ++ used to profile the code that runs on that CPU. ++ ++ The Linux Performance Counter subsystem provides an abstraction of ++ these hardware capabilities, available via a system call. It ++ provides per task and per CPU counters, and it provides event ++ capabilities on top of those. ++ ++ Say Y if unsure. ++ ++config EVENT_PROFILE ++ bool "Tracepoint profile sources" ++ depends on PERF_COUNTERS && EVENT_TRACER ++ default y ++ ++endmenu ++ + config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EMBEDDED +@@ -912,6 +1009,7 @@ config SLAB + + config SLUB + bool "SLUB (Unqueued Allocator)" ++ depends on !PREEMPT_RT + help + SLUB is a slab allocator that minimizes cache line usage + instead of managing queues of cached objects (SLAB approach). +@@ -922,6 +1020,8 @@ config SLUB + + config SLOB + depends on EMBEDDED ++ # lockups observed: ++ depends on 0 + bool "SLOB (Simple Allocator)" + help + SLOB replaces the stock allocator with a drastically simpler +@@ -945,7 +1045,7 @@ config TRACEPOINTS + + config MARKERS + bool "Activate markers" +- depends on TRACEPOINTS ++ select TRACEPOINTS + help + Place an empty function call at each marker site. Can be + dynamically changed for a probe function. +@@ -966,7 +1066,6 @@ config SLABINFO + + config RT_MUTEXES + boolean +- select PLIST + + config BASE_SMALL + int +Index: linux-2.6-tip/init/do_mounts.c +=================================================================== +--- linux-2.6-tip.orig/init/do_mounts.c ++++ linux-2.6-tip/init/do_mounts.c +@@ -228,9 +228,13 @@ static int __init do_mount_root(char *na + return 0; + } + ++#if PAGE_SIZE < PATH_MAX ++# error increase the fs_names allocation size here ++#endif ++ + void __init mount_block_root(char *name, int flags) + { +- char *fs_names = __getname(); ++ char *fs_names = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + char *p; + #ifdef CONFIG_BLOCK + char b[BDEVNAME_SIZE]; +@@ -282,7 +286,7 @@ retry: + #endif + panic("VFS: Unable to mount root fs on %s", b); + out: +- putname(fs_names); ++ free_pages((unsigned long)fs_names, 1); + } + + #ifdef CONFIG_ROOT_NFS +Index: linux-2.6-tip/init/do_mounts_rd.c +=================================================================== +--- linux-2.6-tip.orig/init/do_mounts_rd.c ++++ linux-2.6-tip/init/do_mounts_rd.c +@@ -11,6 +11,9 @@ + #include "do_mounts.h" + #include "../fs/squashfs/squashfs_fs.h" + ++#include ++ ++ + int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */ + + static int __init prompt_ramdisk(char *str) +@@ -29,7 +32,7 @@ static int __init ramdisk_start_setup(ch + } + __setup("ramdisk_start=", ramdisk_start_setup); + +-static int __init crd_load(int in_fd, int out_fd); ++static int __init crd_load(int in_fd, int out_fd, decompress_fn deco); + + /* + * This routine tries to find a RAM disk image to load, and returns the +@@ -38,15 +41,15 @@ static int __init crd_load(int in_fd, in + * numbers could not be found. 
+ * + * We currently check for the following magic numbers: +- * minix +- * ext2 ++ * minix ++ * ext2 + * romfs + * cramfs + * squashfs +- * gzip ++ * gzip + */ +-static int __init +-identify_ramdisk_image(int fd, int start_block) ++static int __init ++identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) + { + const int size = 512; + struct minix_super_block *minixsb; +@@ -56,6 +59,7 @@ identify_ramdisk_image(int fd, int start + struct squashfs_super_block *squashfsb; + int nblocks = -1; + unsigned char *buf; ++ const char *compress_name; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) +@@ -69,18 +73,19 @@ identify_ramdisk_image(int fd, int start + memset(buf, 0xe5, size); + + /* +- * Read block 0 to test for gzipped kernel ++ * Read block 0 to test for compressed kernel + */ + sys_lseek(fd, start_block * BLOCK_SIZE, 0); + sys_read(fd, buf, size); + +- /* +- * If it matches the gzip magic numbers, return 0 +- */ +- if (buf[0] == 037 && ((buf[1] == 0213) || (buf[1] == 0236))) { +- printk(KERN_NOTICE +- "RAMDISK: Compressed image found at block %d\n", +- start_block); ++ *decompressor = decompress_method(buf, size, &compress_name); ++ if (compress_name) { ++ printk(KERN_NOTICE "RAMDISK: %s image found at block %d\n", ++ compress_name, start_block); ++ if (!*decompressor) ++ printk(KERN_EMERG ++ "RAMDISK: %s decompressor not configured!\n", ++ compress_name); + nblocks = 0; + goto done; + } +@@ -142,7 +147,7 @@ identify_ramdisk_image(int fd, int start + printk(KERN_NOTICE + "RAMDISK: Couldn't find valid RAM disk image starting at %d.\n", + start_block); +- ++ + done: + sys_lseek(fd, start_block * BLOCK_SIZE, 0); + kfree(buf); +@@ -157,6 +162,7 @@ int __init rd_load_image(char *from) + int nblocks, i, disk; + char *buf = NULL; + unsigned short rotate = 0; ++ decompress_fn decompressor = NULL; + #if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES) + char rotator[4] = { '|' , '/' , '-' , '\\' }; + #endif +@@ -169,12 +175,12 @@ int __init rd_load_image(char *from) + if (in_fd < 0) + goto noclose_input; + +- nblocks = identify_ramdisk_image(in_fd, rd_image_start); ++ nblocks = identify_ramdisk_image(in_fd, rd_image_start, &decompressor); + if (nblocks < 0) + goto done; + + if (nblocks == 0) { +- if (crd_load(in_fd, out_fd) == 0) ++ if (crd_load(in_fd, out_fd, decompressor) == 0) + goto successful_load; + goto done; + } +@@ -200,7 +206,7 @@ int __init rd_load_image(char *from) + nblocks, rd_blocks); + goto done; + } +- ++ + /* + * OK, time to copy in the data + */ +@@ -273,138 +279,48 @@ int __init rd_load_disk(int n) + return rd_load_image("/dev/root"); + } + +-/* +- * gzip declarations +- */ +- +-#define OF(args) args +- +-#ifndef memzero +-#define memzero(s, n) memset ((s), 0, (n)) +-#endif +- +-typedef unsigned char uch; +-typedef unsigned short ush; +-typedef unsigned long ulg; +- +-#define INBUFSIZ 4096 +-#define WSIZE 0x8000 /* window size--must be a power of two, and */ +- /* at least 32K for zip's deflate method */ +- +-static uch *inbuf; +-static uch *window; +- +-static unsigned insize; /* valid bytes in inbuf */ +-static unsigned inptr; /* index of next byte to be processed in inbuf */ +-static unsigned outcnt; /* bytes in output buffer */ + static int exit_code; +-static int unzip_error; +-static long bytes_out; ++static int decompress_error; + static int crd_infd, crd_outfd; + +-#define get_byte() (inptr < insize ? 
inbuf[inptr++] : fill_inbuf()) +- +-/* Diagnostic functions (stubbed out) */ +-#define Assert(cond,msg) +-#define Trace(x) +-#define Tracev(x) +-#define Tracevv(x) +-#define Tracec(c,x) +-#define Tracecv(c,x) +- +-#define STATIC static +-#define INIT __init +- +-static int __init fill_inbuf(void); +-static void __init flush_window(void); +-static void __init error(char *m); +- +-#define NO_INFLATE_MALLOC +- +-#include "../lib/inflate.c" +- +-/* =========================================================================== +- * Fill the input buffer. This is called only when the buffer is empty +- * and at least one byte is really needed. +- * Returning -1 does not guarantee that gunzip() will ever return. +- */ +-static int __init fill_inbuf(void) ++static int __init compr_fill(void *buf, unsigned int len) + { +- if (exit_code) return -1; +- +- insize = sys_read(crd_infd, inbuf, INBUFSIZ); +- if (insize == 0) { +- error("RAMDISK: ran out of compressed data"); +- return -1; +- } +- +- inptr = 1; +- +- return inbuf[0]; ++ int r = sys_read(crd_infd, buf, len); ++ if (r < 0) ++ printk(KERN_ERR "RAMDISK: error while reading compressed data"); ++ else if (r == 0) ++ printk(KERN_ERR "RAMDISK: EOF while reading compressed data"); ++ return r; + } + +-/* =========================================================================== +- * Write the output window window[0..outcnt-1] and update crc and bytes_out. +- * (Used for the decompressed data only.) +- */ +-static void __init flush_window(void) ++static int __init compr_flush(void *window, unsigned int outcnt) + { +- ulg c = crc; /* temporary variable */ +- unsigned n, written; +- uch *in, ch; +- +- written = sys_write(crd_outfd, window, outcnt); +- if (written != outcnt && unzip_error == 0) { +- printk(KERN_ERR "RAMDISK: incomplete write (%d != %d) %ld\n", +- written, outcnt, bytes_out); +- unzip_error = 1; +- } +- in = window; +- for (n = 0; n < outcnt; n++) { +- ch = *in++; +- c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); +- } +- crc = c; +- bytes_out += (ulg)outcnt; +- outcnt = 0; ++ int written = sys_write(crd_outfd, window, outcnt); ++ if (written != outcnt) { ++ if (decompress_error == 0) ++ printk(KERN_ERR ++ "RAMDISK: incomplete write (%d != %d)\n", ++ written, outcnt); ++ decompress_error = 1; ++ return -1; ++ } ++ return outcnt; + } + + static void __init error(char *x) + { + printk(KERN_ERR "%s\n", x); + exit_code = 1; +- unzip_error = 1; ++ decompress_error = 1; + } + +-static int __init crd_load(int in_fd, int out_fd) ++static int __init crd_load(int in_fd, int out_fd, decompress_fn deco) + { + int result; +- +- insize = 0; /* valid bytes in inbuf */ +- inptr = 0; /* index of next byte to be processed in inbuf */ +- outcnt = 0; /* bytes in output buffer */ +- exit_code = 0; +- bytes_out = 0; +- crc = (ulg)0xffffffffL; /* shift register contents */ +- + crd_infd = in_fd; + crd_outfd = out_fd; +- inbuf = kmalloc(INBUFSIZ, GFP_KERNEL); +- if (!inbuf) { +- printk(KERN_ERR "RAMDISK: Couldn't allocate gzip buffer\n"); +- return -1; +- } +- window = kmalloc(WSIZE, GFP_KERNEL); +- if (!window) { +- printk(KERN_ERR "RAMDISK: Couldn't allocate gzip window\n"); +- kfree(inbuf); +- return -1; +- } +- makecrc(); +- result = gunzip(); +- if (unzip_error) ++ result = deco(NULL, 0, compr_fill, compr_flush, NULL, NULL, error); ++ if (decompress_error) + result = 1; +- kfree(inbuf); +- kfree(window); + return result; + } +Index: linux-2.6-tip/init/initramfs.c +=================================================================== +--- 
linux-2.6-tip.orig/init/initramfs.c ++++ linux-2.6-tip/init/initramfs.c +@@ -390,11 +390,13 @@ static int __init write_buffer(char *buf + return len - count; + } + +-static void __init flush_buffer(char *buf, unsigned len) ++static int __init flush_buffer(void *bufv, unsigned len) + { ++ char *buf = (char *) bufv; + int written; ++ int origLen = len; + if (message) +- return; ++ return -1; + while ((written = write_buffer(buf, len)) < len && !message) { + char c = buf[written]; + if (c == '0') { +@@ -408,84 +410,28 @@ static void __init flush_buffer(char *bu + } else + error("junk in compressed archive"); + } ++ return origLen; + } + +-/* +- * gzip declarations +- */ ++static unsigned my_inptr; /* index of next byte to be processed in inbuf */ + +-#define OF(args) args +- +-#ifndef memzero +-#define memzero(s, n) memset ((s), 0, (n)) +-#endif +- +-typedef unsigned char uch; +-typedef unsigned short ush; +-typedef unsigned long ulg; +- +-#define WSIZE 0x8000 /* window size--must be a power of two, and */ +- /* at least 32K for zip's deflate method */ +- +-static uch *inbuf; +-static uch *window; +- +-static unsigned insize; /* valid bytes in inbuf */ +-static unsigned inptr; /* index of next byte to be processed in inbuf */ +-static unsigned outcnt; /* bytes in output buffer */ +-static long bytes_out; +- +-#define get_byte() (inptr < insize ? inbuf[inptr++] : -1) +- +-/* Diagnostic functions (stubbed out) */ +-#define Assert(cond,msg) +-#define Trace(x) +-#define Tracev(x) +-#define Tracevv(x) +-#define Tracec(c,x) +-#define Tracecv(c,x) +- +-#define STATIC static +-#define INIT __init +- +-static void __init flush_window(void); +-static void __init error(char *m); +- +-#define NO_INFLATE_MALLOC +- +-#include "../lib/inflate.c" +- +-/* =========================================================================== +- * Write the output window window[0..outcnt-1] and update crc and bytes_out. +- * (Used for the decompressed data only.) 
+- */ +-static void __init flush_window(void) +-{ +- ulg c = crc; /* temporary variable */ +- unsigned n; +- uch *in, ch; +- +- flush_buffer(window, outcnt); +- in = window; +- for (n = 0; n < outcnt; n++) { +- ch = *in++; +- c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); +- } +- crc = c; +- bytes_out += (ulg)outcnt; +- outcnt = 0; +-} ++#include + + static char * __init unpack_to_rootfs(char *buf, unsigned len, int check_only) + { + int written; ++ decompress_fn decompress; ++ const char *compress_name; ++ static __initdata char msg_buf[64]; ++ + dry_run = check_only; + header_buf = kmalloc(110, GFP_KERNEL); + symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); + name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); +- window = kmalloc(WSIZE, GFP_KERNEL); +- if (!window || !header_buf || !symlink_buf || !name_buf) ++ ++ if (!header_buf || !symlink_buf || !name_buf) + panic("can't allocate buffers"); ++ + state = Start; + this_header = 0; + message = NULL; +@@ -505,22 +451,25 @@ static char * __init unpack_to_rootfs(ch + continue; + } + this_header = 0; +- insize = len; +- inbuf = buf; +- inptr = 0; +- outcnt = 0; /* bytes in output buffer */ +- bytes_out = 0; +- crc = (ulg)0xffffffffL; /* shift register contents */ +- makecrc(); +- gunzip(); ++ decompress = decompress_method(buf, len, &compress_name); ++ if (decompress) ++ decompress(buf, len, NULL, flush_buffer, NULL, ++ &my_inptr, error); ++ else if (compress_name) { ++ if (!message) { ++ snprintf(msg_buf, sizeof msg_buf, ++ "compression method %s not configured", ++ compress_name); ++ message = msg_buf; ++ } ++ } + if (state != Reset) +- error("junk in gzipped archive"); +- this_header = saved_offset + inptr; +- buf += inptr; +- len -= inptr; ++ error("junk in compressed archive"); ++ this_header = saved_offset + my_inptr; ++ buf += my_inptr; ++ len -= my_inptr; + } + dir_utime(); +- kfree(window); + kfree(name_buf); + kfree(symlink_buf); + kfree(header_buf); +@@ -579,7 +528,7 @@ static int __init populate_rootfs(void) + char *err = unpack_to_rootfs(__initramfs_start, + __initramfs_end - __initramfs_start, 0); + if (err) +- panic(err); ++ panic(err); /* Failed to decompress INTERNAL initramfs */ + if (initrd_start) { + #ifdef CONFIG_BLK_DEV_RAM + int fd; +@@ -605,9 +554,12 @@ static int __init populate_rootfs(void) + printk(KERN_INFO "Unpacking initramfs..."); + err = unpack_to_rootfs((char *)initrd_start, + initrd_end - initrd_start, 0); +- if (err) +- panic(err); +- printk(" done\n"); ++ if (err) { ++ printk(" failed!\n"); ++ printk(KERN_EMERG "%s\n", err); ++ } else { ++ printk(" done\n"); ++ } + free_initrd(); + #endif + } +Index: linux-2.6-tip/init/main.c +=================================================================== +--- linux-2.6-tip.orig/init/main.c ++++ linux-2.6-tip/init/main.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -35,6 +36,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -48,6 +50,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -61,6 +64,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -70,6 +74,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_X86_LOCAL_APIC + #include +@@ -135,14 +140,14 @@ unsigned int __initdata setup_max_cpus = + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to . 
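The rewritten unpack_to_rootfs() above no longer assumes gzip: decompress_method() sniffs the leading bytes of the archive and returns the matching decompress_fn, or just a name so the "compression method %s not configured" message can be printed. As a rough, self-contained illustration of that kind of magic-byte sniffing only (function name invented here; the real table lives in the kernel's generic decompress code and also hands back the decompressor itself):

#include <stddef.h>
#include <string.h>

/* Illustrative only: map well-known header magic to a format name. */
const char *sniff_compression(const unsigned char *buf, size_t len)
{
	if (len >= 2 && buf[0] == 0x1f && buf[1] == 0x8b)
		return "gzip";
	if (len >= 3 && memcmp(buf, "BZh", 3) == 0)
		return "bzip2";
	if (len >= 2 && buf[0] == 0x5d && buf[1] == 0x00)
		return "lzma";
	return NULL;	/* unknown: caller treats the data as uncompressed cpio */
}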
+ */ +-#ifndef CONFIG_X86_IO_APIC +-static inline void disable_ioapic_setup(void) {}; +-#endif ++ ++void __weak arch_disable_smp_support(void) { } + + static int __init nosmp(char *str) + { + setup_max_cpus = 0; +- disable_ioapic_setup(); ++ arch_disable_smp_support(); ++ + return 0; + } + +@@ -152,14 +157,14 @@ static int __init maxcpus(char *str) + { + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) +- disable_ioapic_setup(); ++ arch_disable_smp_support(); + + return 0; + } + + early_param("maxcpus", maxcpus); + #else +-#define setup_max_cpus NR_CPUS ++const unsigned int setup_max_cpus = NR_CPUS; + #endif + + /* +@@ -452,6 +457,8 @@ static noinline void __init_refok rest_i + { + int pid; + ++ system_state = SYSTEM_BOOTING_SCHEDULER_OK; ++ + kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); + numa_default_policy(); + pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); +@@ -464,7 +471,7 @@ static noinline void __init_refok rest_i + */ + init_idle_bootup_task(current); + rcu_scheduler_starting(); +- preempt_enable_no_resched(); ++ __preempt_enable_no_resched(); + schedule(); + preempt_disable(); + +@@ -540,6 +547,12 @@ asmlinkage void __init start_kernel(void + */ + lockdep_init(); + debug_objects_early_init(); ++ ++ /* ++ * Set up the the initial canary ASAP: ++ */ ++ boot_init_stack_canary(); ++ + cgroup_init_early(); + + local_irq_disable(); +@@ -574,8 +587,10 @@ asmlinkage void __init start_kernel(void + * fragile until we cpu_idle() for the first time. + */ + preempt_disable(); ++ + build_all_zonelists(); + page_alloc_init(); ++ early_init_hardirqs(); + printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); + parse_early_param(); + parse_args("Booting kernel", static_command_line, __start___param, +@@ -642,6 +657,7 @@ asmlinkage void __init start_kernel(void + enable_debug_pagealloc(); + cpu_hotplug_init(); + kmem_cache_init(); ++ kmemtrace_init(); + debug_objects_mem_init(); + idr_init_cache(); + setup_per_cpu_pageset(); +@@ -683,6 +699,9 @@ asmlinkage void __init start_kernel(void + + ftrace_init(); + ++#ifdef CONFIG_PREEMPT_RT ++ WARN_ON(irqs_disabled()); ++#endif + /* Do the rest non-__init'ed, we're now alive */ + rest_init(); + } +@@ -763,6 +782,7 @@ static void __init do_basic_setup(void) + { + rcu_init_sched(); /* needed by module_init stage. */ + init_workqueues(); ++ cpuset_init_smp(); + usermodehelper_init(); + driver_init(); + init_irq_proc(); +@@ -772,9 +792,14 @@ static void __init do_basic_setup(void) + static void __init do_pre_smp_initcalls(void) + { + initcall_t *call; ++ extern int spawn_desched_task(void); ++ ++ /* kmemcheck must initialize before all early initcalls: */ ++ kmemcheck_init(); + + for (call = __initcall_start; call < __early_initcall_end; call++) + do_one_initcall(*call); ++ spawn_desched_task(); + } + + static void run_init_process(char *init_filename) +@@ -809,6 +834,9 @@ static noinline int init_post(void) + printk(KERN_WARNING "Failed to execute %s\n", + ramdisk_execute_command); + } ++#ifdef CONFIG_PREEMPT_RT ++ WARN_ON(irqs_disabled()); ++#endif + + /* + * We try each of these until one succeeds. 
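The nosmp/maxcpus hunks above drop the io_apic-specific stub in favour of void __weak arch_disable_smp_support(void), which an architecture can override with a strong definition of its own. The underlying GCC mechanism, sketched outside the kernel (file names invented for the sketch):

/* generic.c -- weak default: does nothing if no arch override is linked in */
void __attribute__((weak)) arch_disable_smp_support(void)
{
}

/* arch.c -- a strong definition here silently replaces the weak stub at
 * link time; leave this file out and the no-op default above is used. */
#include <stdio.h>

void arch_disable_smp_support(void)
{
	puts("arch-specific SMP shutdown runs instead of the stub");
}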
+@@ -850,14 +878,14 @@ static int __init kernel_init(void * unu + + smp_prepare_cpus(setup_max_cpus); + ++ init_hardirqs(); ++ + do_pre_smp_initcalls(); + start_boot_trace(); + + smp_init(); + sched_init_smp(); + +- cpuset_init_smp(); +- + do_basic_setup(); + + /* +@@ -872,7 +900,57 @@ static int __init kernel_init(void * unu + ramdisk_execute_command = NULL; + prepare_namespace(); + } ++#ifdef CONFIG_PREEMPT_RT ++ WARN_ON(irqs_disabled()); ++#endif + ++#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_MUTEXES) + defined(CONFIG_IRQSOFF_TRACER) + defined(CONFIG_PREEMPT_TRACER) + defined(CONFIG_STACK_TRACER) + defined(CONFIG_INTERRUPT_OFF_HIST) + defined(CONFIG_PREEMPT_OFF_HIST) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC) + defined(CONFIG_LOCKDEP) + (defined(CONFIG_FTRACE) - defined(CONFIG_FTRACE_MCOUNT_RECORD))) ++ ++#if DEBUG_COUNT > 0 ++ printk(KERN_ERR "*****************************************************************************\n"); ++ printk(KERN_ERR "* *\n"); ++#if DEBUG_COUNT == 1 ++ printk(KERN_ERR "* REMINDER, the following debugging option is turned on in your .config: *\n"); ++#else ++ printk(KERN_ERR "* REMINDER, the following debugging options are turned on in your .config: *\n"); ++#endif ++ printk(KERN_ERR "* *\n"); ++#ifdef CONFIG_FTRACE ++ printk(KERN_ERR "* CONFIG_FTRACE *\n"); ++#endif ++#ifdef CONFIG_DEBUG_RT_MUTEXES ++ printk(KERN_ERR "* CONFIG_DEBUG_RT_MUTEXES *\n"); ++#endif ++#ifdef CONFIG_IRQSOFF_TRACER ++ printk(KERN_ERR "* CONFIG_IRQSOFF_TRACER *\n"); ++#endif ++#ifdef CONFIG_PREEMPT_TRACER ++ printk(KERN_ERR "* CONFIG_PREEMPT_TRACER *\n"); ++#endif ++#ifdef CONFIG_INTERRUPT_OFF_HIST ++ printk(KERN_ERR "* CONFIG_INTERRUPT_OFF_HIST *\n"); ++#endif ++#ifdef CONFIG_PREEMPT_OFF_HIST ++ printk(KERN_ERR "* CONFIG_PREEMPT_OFF_HIST *\n"); ++#endif ++#ifdef CONFIG_DEBUG_SLAB ++ printk(KERN_ERR "* CONFIG_DEBUG_SLAB *\n"); ++#endif ++#ifdef CONFIG_DEBUG_PAGEALLOC ++ printk(KERN_ERR "* CONFIG_DEBUG_PAGEALLOC *\n"); ++#endif ++#ifdef CONFIG_LOCKDEP ++ printk(KERN_ERR "* CONFIG_LOCKDEP *\n"); ++#endif ++ printk(KERN_ERR "* *\n"); ++#if DEBUG_COUNT == 1 ++ printk(KERN_ERR "* it may increase runtime overhead and latencies. *\n"); ++#else ++ printk(KERN_ERR "* they may increase runtime overhead and latencies. *\n"); ++#endif ++ printk(KERN_ERR "* *\n"); ++ printk(KERN_ERR "*****************************************************************************\n"); ++#endif + /* + * Ok, we have completed the initial bootup, and + * we're essentially up and running. 
Get rid of the +@@ -880,5 +958,7 @@ static int __init kernel_init(void * unu + */ + + init_post(); ++ WARN_ON(debug_direct_keyboard); ++ + return 0; + } +Index: linux-2.6-tip/kernel/Makefile +=================================================================== +--- linux-2.6-tip.orig/kernel/Makefile ++++ linux-2.6-tip/kernel/Makefile +@@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o + sysctl.o capability.o ptrace.o timer.o user.o \ + signal.o sys.o kmod.o workqueue.o pid.o \ + rcupdate.o extable.o params.o posix-timers.o \ +- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ ++ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \ + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ + async.o +@@ -27,7 +27,10 @@ obj-$(CONFIG_PROFILING) += profile.o + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o + obj-$(CONFIG_STACKTRACE) += stacktrace.o + obj-y += time/ ++ifneq ($(CONFIG_PREEMPT_RT),y) ++obj-y += mutex.o + obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o ++endif + obj-$(CONFIG_LOCKDEP) += lockdep.o + ifeq ($(CONFIG_PROC_FS),y) + obj-$(CONFIG_LOCKDEP) += lockdep_proc.o +@@ -39,6 +42,7 @@ endif + obj-$(CONFIG_RT_MUTEXES) += rtmutex.o + obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o + obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o ++obj-$(CONFIG_PREEMPT_RT) += rt.o + obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o + obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o + ifneq ($(CONFIG_SMP),y) +@@ -74,6 +78,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o + obj-$(CONFIG_KPROBES) += kprobes.o + obj-$(CONFIG_KGDB) += kgdb.o + obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o ++obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o + obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ + obj-$(CONFIG_SECCOMP) += seccomp.o + obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +@@ -93,6 +98,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) + obj-$(CONFIG_FUNCTION_TRACER) += trace/ + obj-$(CONFIG_TRACING) += trace/ + obj-$(CONFIG_SMP) += sched_cpupri.o ++obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o + + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + # According to Alan Modra , the -fno-omit-frame-pointer is +Index: linux-2.6-tip/kernel/auditsc.c +=================================================================== +--- linux-2.6-tip.orig/kernel/auditsc.c ++++ linux-2.6-tip/kernel/auditsc.c +@@ -741,6 +741,7 @@ void audit_filter_inodes(struct task_str + rcu_read_unlock(); + } + ++#ifdef CONFIG_AUDIT_TREE + static void audit_set_auditable(struct audit_context *ctx) + { + if (!ctx->prio) { +@@ -748,6 +749,7 @@ static void audit_set_auditable(struct a + ctx->current_state = AUDIT_RECORD_CONTEXT; + } + } ++#endif + + static inline struct audit_context *audit_get_context(struct task_struct *tsk, + int return_valid, +Index: linux-2.6-tip/kernel/compat.c +=================================================================== +--- linux-2.6-tip.orig/kernel/compat.c ++++ linux-2.6-tip/kernel/compat.c +@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigse + + } + ++asmlinkage long ++compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, ++ struct compat_siginfo __user *uinfo) ++{ ++ siginfo_t info; ++ ++ if (copy_siginfo_from_user32(&info, uinfo)) ++ return -EFAULT; ++ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); ++} ++ + #ifdef __ARCH_WANT_COMPAT_SYS_TIME + + /* compat_time_t is a 32 bit "long" and needs to get converted. 
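The auditsc.c hunk above wraps audit_set_auditable() in #ifdef CONFIG_AUDIT_TREE, presumably because its only remaining callers live in the audit-tree code, so an unconditional static definition would warn as unused in !CONFIG_AUDIT_TREE builds. The pattern in miniature (helper name invented):

#include <stdio.h>

#ifdef CONFIG_AUDIT_TREE
/* Defined only when its sole caller is compiled in, so builds without
 * CONFIG_AUDIT_TREE do not trip -Wunused-function. */
static void audit_tree_only_helper(void)
{
	puts("used by the tree-audit paths only");
}
#endif

int main(void)
{
#ifdef CONFIG_AUDIT_TREE
	audit_tree_only_helper();
#endif
	return 0;
}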
*/ +Index: linux-2.6-tip/kernel/exit.c +=================================================================== +--- linux-2.6-tip.orig/kernel/exit.c ++++ linux-2.6-tip/kernel/exit.c +@@ -75,7 +75,9 @@ static void __unhash_process(struct task + detach_pid(p, PIDTYPE_SID); + + list_del_rcu(&p->tasks); ++ preempt_disable(); + __get_cpu_var(process_counts)--; ++ preempt_enable(); + } + list_del_rcu(&p->thread_group); + list_del_init(&p->sibling); +@@ -138,7 +140,7 @@ static void __exit_signal(struct task_st + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ +- flush_sigqueue(&tsk->pending); ++ flush_task_sigqueue(tsk); + + tsk->signal = NULL; + tsk->sighand = NULL; +@@ -162,6 +164,9 @@ static void delayed_put_task_struct(stru + { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + ++#ifdef CONFIG_PERF_COUNTERS ++ WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); ++#endif + trace_sched_process_free(tsk); + put_task_struct(tsk); + } +@@ -694,9 +699,11 @@ static void exit_mm(struct task_struct * + task_lock(tsk); + tsk->mm = NULL; + up_read(&mm->mmap_sem); ++ preempt_disable(); // FIXME + enter_lazy_tlb(mm, current); + /* We don't want this task to be frozen prematurely */ + clear_freeze_flag(tsk); ++ preempt_enable(); + task_unlock(tsk); + mm_update_next_owner(mm); + mmput(mm); +@@ -945,12 +952,9 @@ static void check_stack_usage(void) + { + static DEFINE_SPINLOCK(low_water_lock); + static int lowest_to_date = THREAD_SIZE; +- unsigned long *n = end_of_stack(current); + unsigned long free; + +- while (*n == 0) +- n++; +- free = (unsigned long)n - (unsigned long)end_of_stack(current); ++ free = stack_not_used(current); + + if (free >= lowest_to_date) + return; +@@ -1061,10 +1065,6 @@ NORET_TYPE void do_exit(long code) + tsk->mempolicy = NULL; + #endif + #ifdef CONFIG_FUTEX +- /* +- * This must happen late, after the PID is not +- * hashed anymore: +- */ + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + if (unlikely(current->pi_state_cache)) +@@ -1087,14 +1087,17 @@ NORET_TYPE void do_exit(long code) + if (tsk->splice_pipe) + __free_pipe_info(tsk->splice_pipe); + +- preempt_disable(); ++again: ++ local_irq_disable(); + /* causes final put_task_struct in finish_task_switch(). */ + tsk->state = TASK_DEAD; +- schedule(); +- BUG(); +- /* Avoid "noreturn function does return". */ +- for (;;) +- cpu_relax(); /* For when BUG is null */ ++ __schedule(); ++ printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n", ++ current->comm, current->pid); ++ printk(KERN_ERR ".... flags: %08x, count: %d, state: %08lx\n", ++ current->flags, atomic_read(¤t->usage), current->state); ++ printk(KERN_ERR ".... trying again ...\n"); ++ goto again; + } + + EXPORT_SYMBOL_GPL(do_exit); +@@ -1331,6 +1334,12 @@ static int wait_task_zombie(struct task_ + */ + read_unlock(&tasklist_lock); + ++ /* ++ * Flush inherited counters to the parent - before the parent ++ * gets woken up by child-exit notifications. ++ */ ++ perf_counter_exit_task(p); ++ + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? 
p->signal->group_exit_code : p->exit_code; +@@ -1537,6 +1546,7 @@ static int wait_consider_task(struct tas + int __user *stat_addr, struct rusage __user *ru) + { + int ret = eligible_child(type, pid, options, p); ++ BUG_ON(!atomic_read(&p->usage)); + if (!ret) + return ret; + +Index: linux-2.6-tip/kernel/extable.c +=================================================================== +--- linux-2.6-tip.orig/kernel/extable.c ++++ linux-2.6-tip/kernel/extable.c +@@ -15,11 +15,22 @@ + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ ++#include ++#include + #include ++#include + #include +-#include +-#include ++ + #include ++#include ++ ++/* ++ * mutex protecting text section modification (dynamic code patching). ++ * some users need to sleep (allocating memory...) while they hold this lock. ++ * ++ * NOT exported to modules - patching kernel text is a really delicate matter. ++ */ ++DEFINE_MUTEX(text_mutex); + + extern struct exception_table_entry __start___ex_table[]; + extern struct exception_table_entry __stop___ex_table[]; +@@ -41,24 +52,43 @@ const struct exception_table_entry *sear + return e; + } + +-__notrace_funcgraph int core_kernel_text(unsigned long addr) ++static inline int init_kernel_text(unsigned long addr) ++{ ++ if (addr >= (unsigned long)_sinittext && ++ addr <= (unsigned long)_einittext) ++ return 1; ++ return 0; ++} ++ ++int core_kernel_text(unsigned long addr) + { + if (addr >= (unsigned long)_stext && + addr <= (unsigned long)_etext) + return 1; + + if (system_state == SYSTEM_BOOTING && +- addr >= (unsigned long)_sinittext && +- addr <= (unsigned long)_einittext) ++ init_kernel_text(addr)) + return 1; + return 0; + } + +-__notrace_funcgraph int __kernel_text_address(unsigned long addr) ++int __kernel_text_address(unsigned long addr) + { + if (core_kernel_text(addr)) + return 1; +- return __module_text_address(addr) != NULL; ++ if (__module_text_address(addr)) ++ return 1; ++ /* ++ * There might be init symbols in saved stacktraces. ++ * Give those symbols a chance to be printed in ++ * backtraces (such as lockdep traces). ++ * ++ * Since we are after the module-symbols check, there's ++ * no danger of address overlap: ++ */ ++ if (init_kernel_text(addr)) ++ return 1; ++ return 0; + } + + int kernel_text_address(unsigned long addr) +Index: linux-2.6-tip/kernel/fork.c +=================================================================== +--- linux-2.6-tip.orig/kernel/fork.c ++++ linux-2.6-tip/kernel/fork.c +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -49,6 +50,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -61,6 +64,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -79,10 +83,23 @@ int max_threads; /* tunable limit on nr + + DEFINE_PER_CPU(unsigned long, process_counts) = 0; + ++#ifdef CONFIG_PREEMPT_RT ++DEFINE_RWLOCK(tasklist_lock); /* outer */ ++#else + __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ ++#endif + + DEFINE_TRACE(sched_process_fork); + ++/* ++ * Delayed mmdrop. In the PREEMPT_RT case we ++ * dont want to do this from the scheduling ++ * context. 
++ */ ++static DEFINE_PER_CPU(struct task_struct *, desched_task); ++ ++static DEFINE_PER_CPU(struct list_head, delayed_drop_list); ++ + int nr_processes(void) + { + int cpu; +@@ -159,6 +176,16 @@ void __put_task_struct(struct task_struc + free_task(tsk); + } + ++#ifdef CONFIG_PREEMPT_RT ++void __put_task_struct_cb(struct rcu_head *rhp) ++{ ++ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); ++ ++ __put_task_struct(tsk); ++ ++} ++#endif ++ + /* + * macro override instead of weak attribute alias, to workaround + * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. +@@ -169,6 +196,8 @@ void __put_task_struct(struct task_struc + + void __init fork_init(unsigned long mempages) + { ++ int i; ++ + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR + #ifndef ARCH_MIN_TASKALIGN + #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES +@@ -176,7 +205,7 @@ void __init fork_init(unsigned long memp + /* create a slab on which task_structs can be allocated */ + task_struct_cachep = + kmem_cache_create("task_struct", sizeof(struct task_struct), +- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); ++ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + #endif + + /* do the arch specific task caches init */ +@@ -199,6 +228,9 @@ void __init fork_init(unsigned long memp + init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + init_task.signal->rlim[RLIMIT_SIGPENDING] = + init_task.signal->rlim[RLIMIT_NPROC]; ++ ++ for (i = 0; i < NR_CPUS; i++) ++ INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); + } + + int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, +@@ -212,6 +244,8 @@ static struct task_struct *dup_task_stru + { + struct task_struct *tsk; + struct thread_info *ti; ++ unsigned long *stackend; ++ + int err; + + prepare_to_copy(orig); +@@ -237,6 +271,8 @@ static struct task_struct *dup_task_stru + goto out; + + setup_thread_stack(tsk, orig); ++ stackend = end_of_stack(tsk); ++ *stackend = STACK_END_MAGIC; /* for overflow detection */ + + #ifdef CONFIG_CC_STACKPROTECTOR + tsk->stack_canary = get_random_int(); +@@ -276,6 +312,7 @@ static int dup_mmap(struct mm_struct *mm + mm->locked_vm = 0; + mm->mmap = NULL; + mm->mmap_cache = NULL; ++ INIT_LIST_HEAD(&mm->delayed_drop); + mm->free_area_cache = oldmm->mmap_base; + mm->cached_hole_size = ~0UL; + mm->map_count = 0; +@@ -639,6 +676,9 @@ static int copy_mm(unsigned long clone_f + + tsk->min_flt = tsk->maj_flt = 0; + tsk->nvcsw = tsk->nivcsw = 0; ++#ifdef CONFIG_DETECT_HUNG_TASK ++ tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; ++#endif + + tsk->mm = NULL; + tsk->active_mm = NULL; +@@ -901,6 +941,9 @@ static void rt_mutex_init_task(struct ta + #ifdef CONFIG_RT_MUTEXES + plist_head_init(&p->pi_waiters, &p->pi_lock); + p->pi_blocked_on = NULL; ++# ifdef CONFIG_DEBUG_RT_MUTEXES ++ p->last_kernel_lock = NULL; ++# endif + #endif + } + +@@ -972,6 +1015,7 @@ static struct task_struct *copy_process( + goto fork_out; + + rt_mutex_init_task(p); ++ perf_counter_init_task(p); + + #ifdef CONFIG_PROVE_LOCKING + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); +@@ -1018,6 +1062,7 @@ static struct task_struct *copy_process( + + clear_tsk_thread_flag(p, TIF_SIGPENDING); + init_sigpending(&p->pending); ++ p->sigqueue_cache = NULL; + + p->utime = cputime_zero; + p->stime = cputime_zero; +@@ -1029,16 +1074,11 @@ static struct task_struct *copy_process( + + p->default_timer_slack_ns = current->timer_slack_ns; + +-#ifdef CONFIG_DETECT_SOFTLOCKUP +- p->last_switch_count = 0; +- p->last_switch_timestamp = 0; +-#endif +- + 
task_io_accounting_init(&p->ioac); + acct_clear_integrals(p); + + posix_cpu_timers_init(p); +- ++ p->posix_timer_list = NULL; + p->lock_depth = -1; /* -1 = no lock */ + do_posix_clock_monotonic_gettime(&p->start_time); + p->real_start_time = p->start_time; +@@ -1074,6 +1114,7 @@ static struct task_struct *copy_process( + p->hardirq_context = 0; + p->softirq_context = 0; + #endif ++ p->pagefault_disabled = 0; + #ifdef CONFIG_LOCKDEP + p->lockdep_depth = 0; /* no locks held yet */ + p->curr_chain_key = 0; +@@ -1111,6 +1152,9 @@ static struct task_struct *copy_process( + retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + if (retval) + goto bad_fork_cleanup_io; ++#ifdef CONFIG_DEBUG_PREEMPT ++ atomic_set(&p->lock_count, 0); ++#endif + + if (pid != &init_struct_pid) { + retval = -ENOMEM; +@@ -1150,6 +1194,7 @@ static struct task_struct *copy_process( + #endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; ++ p->futex_wakeup = NULL; + #endif + /* + * sigaltstack should be cleared when sharing the same VM +@@ -1197,11 +1242,13 @@ static struct task_struct *copy_process( + * to ensure it is on a valid CPU (and if not, just force it back to + * parent's CPU). This avoids alot of nasty races. + */ ++ preempt_disable(); + p->cpus_allowed = current->cpus_allowed; + p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; + if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || + !cpu_online(task_cpu(p)))) + set_task_cpu(p, smp_processor_id()); ++ preempt_enable(); + + /* CLONE_PARENT re-uses the old parent */ + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { +@@ -1251,7 +1298,9 @@ static struct task_struct *copy_process( + attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); + attach_pid(p, PIDTYPE_SID, task_session(current)); + list_add_tail_rcu(&p->tasks, &init_task.tasks); ++ preempt_disable(); + __get_cpu_var(process_counts)++; ++ preempt_enable(); + } + attach_pid(p, PIDTYPE_PID, pid); + nr_threads++; +@@ -1457,20 +1506,20 @@ void __init proc_caches_init(void) + { + sighand_cachep = kmem_cache_create("sighand_cache", + sizeof(struct sighand_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, +- sighand_ctor); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| ++ SLAB_NOTRACK, sighand_ctor); + signal_cachep = kmem_cache_create("signal_cache", + sizeof(struct signal_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + files_cachep = kmem_cache_create("files_cache", + sizeof(struct files_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + fs_cachep = kmem_cache_create("fs_cache", + sizeof(struct fs_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + mmap_init(); + } + +@@ -1726,3 +1775,138 @@ int unshare_files(struct files_struct ** + task_unlock(task); + return 0; + } ++ ++static int mmdrop_complete(void) ++{ ++ struct list_head *head; ++ int ret = 0; ++ ++ head = &get_cpu_var(delayed_drop_list); ++ while (!list_empty(head)) { ++ struct mm_struct *mm = list_entry(head->next, ++ struct mm_struct, delayed_drop); ++ list_del(&mm->delayed_drop); ++ put_cpu_var(delayed_drop_list); ++ ++ __mmdrop(mm); ++ ret = 1; ++ ++ head = &get_cpu_var(delayed_drop_list); ++ } ++ 
put_cpu_var(delayed_drop_list); ++ ++ return ret; ++} ++ ++/* ++ * We dont want to do complex work from the scheduler, thus ++ * we delay the work to a per-CPU worker thread: ++ */ ++void __mmdrop_delayed(struct mm_struct *mm) ++{ ++ struct task_struct *desched_task; ++ struct list_head *head; ++ ++ head = &get_cpu_var(delayed_drop_list); ++ list_add_tail(&mm->delayed_drop, head); ++ desched_task = __get_cpu_var(desched_task); ++ if (desched_task) ++ wake_up_process(desched_task); ++ put_cpu_var(delayed_drop_list); ++} ++ ++static void takeover_delayed_drop(int hotcpu) ++{ ++ struct list_head *head = &per_cpu(delayed_drop_list, hotcpu); ++ ++ while (!list_empty(head)) { ++ struct mm_struct *mm = list_entry(head->next, ++ struct mm_struct, delayed_drop); ++ ++ list_del(&mm->delayed_drop); ++ __mmdrop_delayed(mm); ++ } ++} ++ ++static int desched_thread(void * __bind_cpu) ++{ ++ set_user_nice(current, -10); ++ current->flags |= PF_NOFREEZE | PF_SOFTIRQ; ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!kthread_should_stop()) { ++ ++ if (mmdrop_complete()) ++ continue; ++ schedule(); ++ ++ /* ++ * This must be called from time to time on ia64, and is a ++ * no-op on other archs. Used to be in cpu_idle(), but with ++ * the new -rt semantics it can't stay there. ++ */ ++ check_pgt_cache(); ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ __set_current_state(TASK_RUNNING); ++ return 0; ++} ++ ++static int __devinit cpu_callback(struct notifier_block *nfb, ++ unsigned long action, ++ void *hcpu) ++{ ++ int hotcpu = (unsigned long)hcpu; ++ struct task_struct *p; ++ ++ switch (action) { ++ case CPU_UP_PREPARE: ++ ++ BUG_ON(per_cpu(desched_task, hotcpu)); ++ INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); ++ p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); ++ if (IS_ERR(p)) { ++ printk("desched_thread for %i failed\n", hotcpu); ++ return NOTIFY_BAD; ++ } ++ per_cpu(desched_task, hotcpu) = p; ++ kthread_bind(p, hotcpu); ++ break; ++ case CPU_ONLINE: ++ ++ wake_up_process(per_cpu(desched_task, hotcpu)); ++ break; ++#ifdef CONFIG_HOTPLUG_CPU ++ case CPU_UP_CANCELED: ++ ++ /* Unbind so it can run. Fall thru. */ ++ kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id()); ++ case CPU_DEAD: ++ ++ p = per_cpu(desched_task, hotcpu); ++ per_cpu(desched_task, hotcpu) = NULL; ++ kthread_stop(p); ++ takeover_delayed_drop(hotcpu); ++ takeover_tasklets(hotcpu); ++ break; ++#endif /* CONFIG_HOTPLUG_CPU */ ++ } ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block __devinitdata cpu_nfb = { ++ .notifier_call = cpu_callback ++}; ++ ++__init int spawn_desched_task(void) ++{ ++ void *cpu = (void *)(long)smp_processor_id(); ++ ++ cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); ++ cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); ++ register_cpu_notifier(&cpu_nfb); ++ return 0; ++} ++ +Index: linux-2.6-tip/kernel/futex.c +=================================================================== +--- linux-2.6-tip.orig/kernel/futex.c ++++ linux-2.6-tip/kernel/futex.c +@@ -19,6 +19,10 @@ + * PRIVATE futexes by Eric Dumazet + * Copyright (C) 2007 Eric Dumazet + * ++ * Requeue-PI support by Darren Hart ++ * Copyright (C) IBM Corporation, 2009 ++ * Thanks to Thomas Gleixner for conceptual design and careful reviews. ++ * + * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly + * enough at me, Linus for the original (flawed) idea, Matthew + * Kirkwood for proof-of-concept implementation. 
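__mmdrop_delayed() above never frees the mm from scheduling context; it links the mm onto a per-CPU list and wakes the desched/N kthread, which performs the real __mmdrop() later. A minimal userspace analogy of that hand-off, with a pthread standing in for the kthread and a plain mutex for the per-CPU list (all names invented for the sketch):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work { struct work *next; int payload; };

static struct work *delayed_list;	/* items queued for later freeing */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;
static int done;

/* producer side: cheaply queue the item and wake the worker */
static void drop_delayed(struct work *w)
{
	pthread_mutex_lock(&lock);
	w->next = delayed_list;
	delayed_list = w;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
}

/* the desched-thread analogue: drain the list, do the expensive free */
static void *desched_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done || delayed_list) {
		while (delayed_list) {
			struct work *w = delayed_list;
			delayed_list = w->next;
			pthread_mutex_unlock(&lock);
			printf("freeing payload %d\n", w->payload);
			free(w);	/* the __mmdrop() stand-in */
			pthread_mutex_lock(&lock);
		}
		if (!done)
			pthread_cond_wait(&kick, &lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	pthread_create(&tid, NULL, desched_thread, NULL);
	for (int i = 0; i < 3; i++) {
		struct work *w = malloc(sizeof(*w));
		w->payload = i;
		drop_delayed(w);
	}
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}

The shape matches the patch: the producer side only queues and signals, while everything expensive happens in the dedicated thread after the wakeup.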
+@@ -96,8 +100,8 @@ struct futex_pi_state { + */ + struct futex_q { + struct plist_node list; +- /* There can only be a single waiter */ +- wait_queue_head_t waiter; ++ /* Waiter reference */ ++ struct task_struct *task; + + /* Which hash list lock to use: */ + spinlock_t *lock_ptr; +@@ -107,14 +111,18 @@ struct futex_q { + + /* Optional priority inheritance state: */ + struct futex_pi_state *pi_state; +- struct task_struct *task; ++ ++ /* rt_waiter storage for requeue_pi: */ ++ struct rt_mutex_waiter *rt_waiter; + + /* Bitset for the optional bitmasked wakeup */ + u32 bitset; + }; + + /* +- * Split the global futex_lock into every hash list lock. ++ * Hash buckets are shared by all the futex_keys that hash to the same ++ * location. Each key may have multiple futex_q structures, one for each task ++ * waiting on a futex. + */ + struct futex_hash_bucket { + spinlock_t lock; +@@ -189,8 +197,7 @@ static void drop_futex_key_refs(union fu + /** + * get_futex_key - Get parameters which are the keys for a futex. + * @uaddr: virtual address of the futex +- * @shared: NULL for a PROCESS_PRIVATE futex, +- * ¤t->mm->mmap_sem for a PROCESS_SHARED futex ++ * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) + * +@@ -201,9 +208,7 @@ static void drop_futex_key_refs(union fu + * offset_within_page). For private mappings, it's (uaddr, current->mm). + * We can usually work out the index without swapping in the page. + * +- * fshared is NULL for PROCESS_PRIVATE futexes +- * For other futexes, it points to ¤t->mm->mmap_sem and +- * caller must have taken the reader lock. but NOT any spinlocks. ++ * lock_page() might sleep, the caller should not hold a spinlock. + */ + static int + get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) +@@ -279,6 +284,50 @@ void put_futex_key(int fshared, union fu + drop_futex_key_refs(key); + } + ++/* ++ * get_user_writeable - get user page and verify RW access ++ * @uaddr: pointer to faulting user space address ++ * ++ * We cannot write to the user space address and get_user just faults ++ * the page in, but does not tell us whether the mapping is writeable. ++ * ++ * We can not rely on access_ok() for private futexes as it is just a ++ * range check and we can neither rely on get_user_pages() as there ++ * might be a mprotect(PROT_READ) for that mapping after ++ * get_user_pages() and before the fault in the atomic write access. ++ */ ++static int get_user_writeable(u32 __user *uaddr) ++{ ++ unsigned long addr = (unsigned long)uaddr; ++ struct page *page; ++ int ret; ++ ++ ret = get_user_pages_fast(addr, 1, 1, &page); ++ if (ret > 0) ++ put_page(page); ++ ++ return ret; ++} ++ ++/** ++ * futex_top_waiter() - Return the highest priority waiter on a futex ++ * @hb: the hash bucket the futex_q's reside in ++ * @key: the futex key (to distinguish it from other futex futex_q's) ++ * ++ * Must be called with the hb lock held. ++ */ ++static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, ++ union futex_key *key) ++{ ++ struct futex_q *this; ++ ++ plist_for_each_entry(this, &hb->chain, list) { ++ if (match_futex(&this->key, key)) ++ return this; ++ } ++ return NULL; ++} ++ + static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) + { + u32 curval; +@@ -301,41 +350,6 @@ static int get_futex_value_locked(u32 *d + return ret ? -EFAULT : 0; + } + +-/* +- * Fault handling. 
+- */ +-static int futex_handle_fault(unsigned long address, int attempt) +-{ +- struct vm_area_struct * vma; +- struct mm_struct *mm = current->mm; +- int ret = -EFAULT; +- +- if (attempt > 2) +- return ret; +- +- down_read(&mm->mmap_sem); +- vma = find_vma(mm, address); +- if (vma && address >= vma->vm_start && +- (vma->vm_flags & VM_WRITE)) { +- int fault; +- fault = handle_mm_fault(mm, vma, address, 1); +- if (unlikely((fault & VM_FAULT_ERROR))) { +-#if 0 +- /* XXX: let's do this when we verify it is OK */ +- if (ret & VM_FAULT_OOM) +- ret = -ENOMEM; +-#endif +- } else { +- ret = 0; +- if (fault & VM_FAULT_MAJOR) +- current->maj_flt++; +- else +- current->min_flt++; +- } +- } +- up_read(&mm->mmap_sem); +- return ret; +-} + + /* + * PI code: +@@ -575,29 +589,203 @@ lookup_pi_state(u32 uval, struct futex_h + return 0; + } + ++/** ++ * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex ++ * @uaddr: the pi futex user address ++ * @hb: the pi futex hash bucket ++ * @key: the futex key associated with uaddr and hb ++ * @ps: the pi_state pointer where we store the result of the ++ * lookup ++ * @task: the task to perform the atomic lock work for. This will ++ * be "current" except in the case of requeue pi. ++ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) ++ * ++ * Returns: ++ * 0 - ready to wait ++ * 1 - acquired the lock ++ * <0 - error ++ * ++ * The hb->lock and futex_key refs shall be held by the caller. ++ */ ++static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, ++ union futex_key *key, ++ struct futex_pi_state **ps, ++ struct task_struct *task, int set_waiters) ++{ ++ int lock_taken, ret, ownerdied = 0; ++ u32 uval, newval, curval; ++ ++retry: ++ ret = lock_taken = 0; ++ ++ /* ++ * To avoid races, we attempt to take the lock here again ++ * (by doing a 0 -> TID atomic cmpxchg), while holding all ++ * the locks. It will most likely not succeed. ++ */ ++ newval = task_pid_vnr(task); ++ if (set_waiters) ++ newval |= FUTEX_WAITERS; ++ ++ curval = cmpxchg_futex_value_locked(uaddr, 0, newval); ++ ++ if (unlikely(curval == -EFAULT)) ++ return -EFAULT; ++ ++ /* ++ * Detect deadlocks. ++ */ ++ if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) ++ return -EDEADLK; ++ ++ /* ++ * Surprise - we got the lock. Just return to userspace: ++ */ ++ if (unlikely(!curval)) ++ return 1; ++ ++ uval = curval; ++ ++ /* ++ * Set the FUTEX_WAITERS flag, so the owner will know it has someone ++ * to wake at the next unlock. ++ */ ++ newval = curval | FUTEX_WAITERS; ++ ++ /* ++ * There are two cases, where a futex might have no owner (the ++ * owner TID is 0): OWNER_DIED. We take over the futex in this ++ * case. We also do an unconditional take over, when the owner ++ * of the futex died. ++ * ++ * This is safe as we are protected by the hash bucket lock ! ++ */ ++ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { ++ /* Keep the OWNER_DIED bit */ ++ newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); ++ ownerdied = 0; ++ lock_taken = 1; ++ } ++ ++ curval = cmpxchg_futex_value_locked(uaddr, uval, newval); ++ ++ if (unlikely(curval == -EFAULT)) ++ return -EFAULT; ++ if (unlikely(curval != uval)) ++ goto retry; ++ ++ /* ++ * We took the lock due to owner died take over. ++ */ ++ if (unlikely(lock_taken)) ++ return 1; ++ ++ /* ++ * We dont have the lock. 
Look up the PI state (or create it if ++ * we are the first waiter): ++ */ ++ ret = lookup_pi_state(uval, hb, key, ps); ++ ++ if (unlikely(ret)) { ++ switch (ret) { ++ case -ESRCH: ++ /* ++ * No owner found for this futex. Check if the ++ * OWNER_DIED bit is set to figure out whether ++ * this is a robust futex or not. ++ */ ++ if (get_futex_value_locked(&curval, uaddr)) ++ return -EFAULT; ++ ++ /* ++ * We simply start over in case of a robust ++ * futex. The code above will take the futex ++ * and return happy. ++ */ ++ if (curval & FUTEX_OWNER_DIED) { ++ ownerdied = 1; ++ goto retry; ++ } ++ default: ++ break; ++ } ++ } ++ ++ return ret; ++} ++ + /* + * The hash bucket lock must be held when this is called. + * Afterwards, the futex_q must not be accessed. + */ +-static void wake_futex(struct futex_q *q) ++static void wake_futex(struct task_struct **wake_list, struct futex_q *q) + { +- plist_del(&q->list, &q->list.plist); ++ struct task_struct *p = q->task; ++ + /* +- * The lock in wake_up_all() is a crucial memory barrier after the +- * plist_del() and also before assigning to q->lock_ptr. ++ * We set q->lock_ptr = NULL _before_ we wake up the task. If ++ * a non futex wake up happens on another CPU then the task ++ * might exit and p would dereference a non existing task ++ * struct. Prevent this by holding a reference on p across the ++ * wake up. + */ +- wake_up(&q->waiter); ++ get_task_struct(p); ++ ++ plist_del(&q->list, &q->list.plist); + /* +- * The waiting task can free the futex_q as soon as this is written, +- * without taking any locks. This must come last. +- * +- * A memory barrier is required here to prevent the following store +- * to lock_ptr from getting ahead of the wakeup. Clearing the lock +- * at the end of wake_up_all() does not prevent this store from +- * moving. ++ * The waiting task can free the futex_q as soon as ++ * q->lock_ptr = NULL is written, without taking any locks. A ++ * memory barrier is required here to prevent the following ++ * store to lock_ptr from getting ahead of the plist_del. + */ + smp_wmb(); + q->lock_ptr = NULL; ++ ++ /* ++ * Atomically grab the task, if ->futex_wakeup is !0 already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * This cmpxchg() implies a full barrier, which pairs with the write ++ * barrier implied by the wakeup in wake_futex_list(). ++ */ ++ if (cmpxchg(&p->futex_wakeup, 0, p) != 0) { ++ /* ++ * It was already queued, drop the extra ref and we're done. ++ */ ++ put_task_struct(p); ++ return; ++ } ++ ++ /* ++ * Put the task on our wakeup list by atomically switching it with ++ * the list head. (XXX its a local list, no possible concurrency, ++ * this could be written without cmpxchg). ++ */ ++ do { ++ p->futex_wakeup = *wake_list; ++ } while (cmpxchg(wake_list, p->futex_wakeup, p) != p->futex_wakeup); ++} ++ ++/* ++ * For each task on the list, deliver the pending wakeup and release the ++ * task reference obtained in wake_futex(). ++ */ ++static void wake_futex_list(struct task_struct *head) ++{ ++ while (head != &init_task) { ++ struct task_struct *next = head->futex_wakeup; ++ ++ head->futex_wakeup = NULL; ++ /* ++ * wake_up_state() implies a wmb() to pair with the queueing ++ * in wake_futex() so as to not miss wakeups. 
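wake_futex() above no longer wakes the task directly; it pushes the task onto a local wake_list by cmpxchg()ing it against the list head, and wake_futex_list() (just below) delivers the wake-ups once the hash-bucket lock has been dropped. The push is the usual lock-free swing-the-head loop; in C11 atomics it reads roughly as follows (standalone sketch, names invented; the kernel version terminates the list at &init_task and uses the per-task slot to filter out double queueing):

#include <stdatomic.h>
#include <stdio.h>

struct waiter {
	struct waiter *next;	/* plays the role of task->futex_wakeup */
	const char *name;
};

static _Atomic(struct waiter *) wake_list;	/* local list head */

/* Push w onto the list; safe against concurrent pushers. */
static void push_waiter(struct waiter *w)
{
	struct waiter *head = atomic_load(&wake_list);
	do {
		w->next = head;
	} while (!atomic_compare_exchange_weak(&wake_list, &head, w));
}

/* Detach the whole list at once, then walk it without further atomics,
 * which is what the deferred wake-up pass does after unlocking. */
static void drain_waiters(void)
{
	struct waiter *w = atomic_exchange(&wake_list, NULL);
	while (w) {
		struct waiter *next = w->next;
		printf("waking %s\n", w->name);
		w = next;
	}
}

int main(void)
{
	struct waiter a = { .name = "task-a" }, b = { .name = "task-b" };
	push_waiter(&a);
	push_waiter(&b);
	drain_waiters();
	return 0;
}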
++ */ ++ wake_up_state(head, TASK_NORMAL); ++ put_task_struct(head); ++ ++ head = next; ++ } + } + + static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +@@ -694,9 +882,16 @@ double_lock_hb(struct futex_hash_bucket + } + } + ++static inline void ++double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) ++{ ++ spin_unlock(&hb1->lock); ++ if (hb1 != hb2) ++ spin_unlock(&hb2->lock); ++} ++ + /* +- * Wake up all waiters hashed on the physical page that is mapped +- * to this virtual address: ++ * Wake up waiters matching bitset queued on this futex (uaddr). + */ + static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) + { +@@ -704,6 +899,7 @@ static int futex_wake(u32 __user *uaddr, + struct futex_q *this, *next; + struct plist_head *head; + union futex_key key = FUTEX_KEY_INIT; ++ struct task_struct *wake_list = &init_task; + int ret; + + if (!bitset) +@@ -719,7 +915,7 @@ static int futex_wake(u32 __user *uaddr, + + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key)) { +- if (this->pi_state) { ++ if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + break; + } +@@ -728,7 +924,7 @@ static int futex_wake(u32 __user *uaddr, + if (!(this->bitset & bitset)) + continue; + +- wake_futex(this); ++ wake_futex(&wake_list, this); + if (++ret >= nr_wake) + break; + } +@@ -736,6 +932,8 @@ static int futex_wake(u32 __user *uaddr, + + spin_unlock(&hb->lock); + put_futex_key(fshared, &key); ++ ++ wake_futex_list(wake_list); + out: + return ret; + } +@@ -752,9 +950,10 @@ futex_wake_op(u32 __user *uaddr1, int fs + struct futex_hash_bucket *hb1, *hb2; + struct plist_head *head; + struct futex_q *this, *next; +- int ret, op_ret, attempt = 0; ++ struct task_struct *wake_list = &init_task; ++ int ret, op_ret; + +-retryfull: ++retry: + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + if (unlikely(ret != 0)) + goto out; +@@ -765,16 +964,12 @@ retryfull: + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + +-retry: + double_lock_hb(hb1, hb2); +- ++retry_private: + op_ret = futex_atomic_op_inuser(op, uaddr2); + if (unlikely(op_ret < 0)) { +- u32 dummy; + +- spin_unlock(&hb1->lock); +- if (hb1 != hb2) +- spin_unlock(&hb2->lock); ++ double_unlock_hb(hb1, hb2); + + #ifndef CONFIG_MMU + /* +@@ -790,33 +985,23 @@ retry: + goto out_put_keys; + } + +- /* +- * futex_atomic_op_inuser needs to both read and write +- * *(int __user *)uaddr2, but we can't modify it +- * non-atomically. Therefore, if get_user below is not +- * enough, we need to handle the fault ourselves, while +- * still holding the mmap_sem. 
+- */ +- if (attempt++) { +- ret = futex_handle_fault((unsigned long)uaddr2, +- attempt); +- if (ret) +- goto out_put_keys; +- goto retry; +- } +- +- ret = get_user(dummy, uaddr2); ++ ret = get_user_writeable(uaddr2); + if (ret) +- return ret; ++ goto out_put_keys; ++ ++ if (!fshared) ++ goto retry_private; + +- goto retryfull; ++ put_futex_key(fshared, &key2); ++ put_futex_key(fshared, &key1); ++ goto retry; + } + + head = &hb1->chain; + + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key1)) { +- wake_futex(this); ++ wake_futex(&wake_list, this); + if (++ret >= nr_wake) + break; + } +@@ -828,7 +1013,7 @@ retry: + op_ret = 0; + plist_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key2)) { +- wake_futex(this); ++ wake_futex(&wake_list, this); + if (++op_ret >= nr_wake2) + break; + } +@@ -836,41 +1021,208 @@ retry: + ret += op_ret; + } + +- spin_unlock(&hb1->lock); +- if (hb1 != hb2) +- spin_unlock(&hb2->lock); ++ double_unlock_hb(hb1, hb2); + out_put_keys: + put_futex_key(fshared, &key2); + out_put_key1: + put_futex_key(fshared, &key1); ++ ++ wake_futex_list(wake_list); + out: + return ret; + } + +-/* +- * Requeue all waiters hashed on one physical page to another +- * physical page. ++/** ++ * requeue_futex() - Requeue a futex_q from one hb to another ++ * @q: the futex_q to requeue ++ * @hb1: the source hash_bucket ++ * @hb2: the target hash_bucket ++ * @key2: the new key for the requeued futex_q ++ */ ++static inline ++void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, ++ struct futex_hash_bucket *hb2, union futex_key *key2) ++{ ++ ++ /* ++ * If key1 and key2 hash to the same bucket, no need to ++ * requeue. ++ */ ++ if (likely(&hb1->chain != &hb2->chain)) { ++ plist_del(&q->list, &hb1->chain); ++ plist_add(&q->list, &hb2->chain); ++ q->lock_ptr = &hb2->lock; ++#ifdef CONFIG_DEBUG_PI_LIST ++# ifdef CONFIG_PREEMPT_RT ++ q->list.plist.lock = NULL; ++# else ++ q->list.plist.lock = &hb2->lock; ++# endif ++#endif ++ } ++ get_futex_key_refs(key2); ++ q->key = *key2; ++} ++ ++/** ++ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue ++ * q: the futex_q ++ * key: the key of the requeue target futex ++ * ++ * During futex_requeue, with requeue_pi=1, it is possible to acquire the ++ * target futex if it is uncontended or via a lock steal. Set the futex_q key ++ * to the requeue target futex so the waiter can detect the wakeup on the right ++ * futex, but remove it from the hb and NULL the rt_waiter so it can detect ++ * atomic lock acquisition. Must be called with the q->lock_ptr held. ++ */ ++static inline ++void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) ++{ ++ drop_futex_key_refs(&q->key); ++ get_futex_key_refs(key); ++ q->key = *key; ++ ++ WARN_ON(plist_node_empty(&q->list)); ++ plist_del(&q->list, &q->list.plist); ++ ++ WARN_ON(!q->rt_waiter); ++ q->rt_waiter = NULL; ++ ++ wake_up_state(q->task, TASK_NORMAL); ++} ++ ++/** ++ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter ++ * @pifutex: the user address of the to futex ++ * @hb1: the from futex hash bucket, must be locked by the caller ++ * @hb2: the to futex hash bucket, must be locked by the caller ++ * @key1: the from futex key ++ * @key2: the to futex key ++ * @ps: address to store the pi_state pointer ++ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) ++ * ++ * Try and get the lock on behalf of the top waiter if we can do it atomically. 
++ * Wake the top waiter if we succeed. If the caller specified set_waiters, ++ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. ++ * hb1 and hb2 must be held by the caller. ++ * ++ * Returns: ++ * 0 - failed to acquire the lock atomicly ++ * 1 - acquired the lock ++ * <0 - error ++ */ ++static int futex_proxy_trylock_atomic(u32 __user *pifutex, ++ struct futex_hash_bucket *hb1, ++ struct futex_hash_bucket *hb2, ++ union futex_key *key1, union futex_key *key2, ++ struct futex_pi_state **ps, int set_waiters) ++{ ++ struct futex_q *top_waiter = NULL; ++ u32 curval; ++ int ret; ++ ++ if (get_futex_value_locked(&curval, pifutex)) ++ return -EFAULT; ++ ++ /* ++ * Find the top_waiter and determine if there are additional waiters. ++ * If the caller intends to requeue more than 1 waiter to pifutex, ++ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, ++ * as we have means to handle the possible fault. If not, don't set ++ * the bit unecessarily as it will force the subsequent unlock to enter ++ * the kernel. ++ */ ++ top_waiter = futex_top_waiter(hb1, key1); ++ ++ /* There are no waiters, nothing for us to do. */ ++ if (!top_waiter) ++ return 0; ++ ++ /* ++ * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in ++ * the contended case or if set_waiters is 1. The pi_state is returned ++ * in ps in contended cases. ++ */ ++ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, ++ set_waiters); ++ if (ret == 1) ++ requeue_pi_wake_futex(top_waiter, key2); ++ ++ return ret; ++} ++ ++/** ++ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 ++ * uaddr1: source futex user address ++ * uaddr2: target futex user address ++ * nr_wake: number of waiters to wake (must be 1 for requeue_pi) ++ * nr_requeue: number of waiters to requeue (0-INT_MAX) ++ * requeue_pi: if we are attempting to requeue from a non-pi futex to a ++ * pi futex (pi to pi requeue is not supported) ++ * ++ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire ++ * uaddr2 atomically on behalf of the top waiter. ++ * ++ * Returns: ++ * >=0 - on success, the number of tasks requeued or woken ++ * <0 - on error + */ + static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, +- int nr_wake, int nr_requeue, u32 *cmpval) ++ int nr_wake, int nr_requeue, u32 *cmpval, ++ int requeue_pi) + { + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; ++ int drop_count = 0, task_count = 0, ret; ++ struct futex_pi_state *pi_state = NULL; + struct futex_hash_bucket *hb1, *hb2; + struct plist_head *head1; + struct futex_q *this, *next; +- int ret, drop_count = 0; ++ struct task_struct *wake_list = &init_task; ++ u32 curval2; ++ ++ if (requeue_pi) { ++ /* ++ * requeue_pi requires a pi_state, try to allocate it now ++ * without any locks in case it fails. ++ */ ++ if (refill_pi_state_cache()) ++ return -ENOMEM; ++ /* ++ * requeue_pi must wake as many tasks as it can, up to nr_wake ++ * + nr_requeue, since it acquires the rt_mutex prior to ++ * returning to userspace, so as to not leave the rt_mutex with ++ * waiters and no owner. However, second and third wake-ups ++ * cannot be predicted as they involve race conditions with the ++ * first wake and a fault while looking up the pi_state. Both ++ * pthread_cond_signal() and pthread_cond_broadcast() should ++ * use nr_wake=1. 
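All of this requeue_pi plumbing exists so that a condition-variable broadcast wakes a single waiter and requeues the rest straight onto the mutex futex instead of causing a thundering herd. Ignoring the PI variant, the plain FUTEX_CMP_REQUEUE call a glibc-style condvar implementation would issue looks roughly like this (wrapper name invented, error handling omitted; a sketch, not glibc source):

#include <linux/futex.h>
#include <limits.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* "Broadcast": wake one waiter on cond_val and move the rest onto
 * mutex_val, provided *cond_val still equals expected. */
static long cond_broadcast_requeue(uint32_t *cond_val, uint32_t *mutex_val,
				   uint32_t expected)
{
	return syscall(SYS_futex, cond_val, FUTEX_CMP_REQUEUE,
		       1 /* nr_wake */, INT_MAX /* nr_requeue */,
		       mutex_val, expected);
}

If *cond_val no longer equals expected the kernel refuses with EAGAIN, which is exactly the cmpval check futex_requeue() performs above.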
++ */ ++ if (nr_wake != 1) ++ return -EINVAL; ++ } + + retry: ++ if (pi_state != NULL) { ++ /* ++ * We will have to lookup the pi_state again, so free this one ++ * to keep the accounting correct. ++ */ ++ free_pi_state(pi_state); ++ pi_state = NULL; ++ } ++ + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); + if (unlikely(ret != 0)) + goto out; +- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); ++ ret = get_futex_key(uaddr2, fshared, &key2, ++ requeue_pi ? VERIFY_WRITE : VERIFY_READ); + if (unlikely(ret != 0)) + goto out_put_key1; + + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); + ++retry_private: + double_lock_hb(hb1, hb2); + + if (likely(cmpval != NULL)) { +@@ -879,16 +1231,18 @@ retry: + ret = get_futex_value_locked(&curval, uaddr1); + + if (unlikely(ret)) { +- spin_unlock(&hb1->lock); +- if (hb1 != hb2) +- spin_unlock(&hb2->lock); ++ double_unlock_hb(hb1, hb2); + + ret = get_user(curval, uaddr1); ++ if (ret) ++ goto out_put_keys; + +- if (!ret) +- goto retry; ++ if (!fshared) ++ goto retry_private; + +- goto out_put_keys; ++ put_futex_key(fshared, &key2); ++ put_futex_key(fshared, &key1); ++ goto retry; + } + if (curval != *cmpval) { + ret = -EAGAIN; +@@ -896,40 +1250,110 @@ retry: + } + } + +- head1 = &hb1->chain; +- plist_for_each_entry_safe(this, next, head1, list) { +- if (!match_futex (&this->key, &key1)) +- continue; +- if (++ret <= nr_wake) { +- wake_futex(this); +- } else { +- /* +- * If key1 and key2 hash to the same bucket, no need to +- * requeue. +- */ +- if (likely(head1 != &hb2->chain)) { +- plist_del(&this->list, &hb1->chain); +- plist_add(&this->list, &hb2->chain); +- this->lock_ptr = &hb2->lock; +-#ifdef CONFIG_DEBUG_PI_LIST +- this->list.plist.lock = &hb2->lock; +-#endif +- } +- this->key = key2; +- get_futex_key_refs(&key2); +- drop_count++; ++ if (requeue_pi && (task_count - nr_wake < nr_requeue)) { ++ /* ++ * Attempt to acquire uaddr2 and wake the top waiter. If we ++ * intend to requeue waiters, force setting the FUTEX_WAITERS ++ * bit. We force this here where we are able to easily handle ++ * faults rather in the requeue loop below. ++ */ ++ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, ++ &key2, &pi_state, nr_requeue); + +- if (ret - nr_wake >= nr_requeue) +- break; ++ /* ++ * At this point the top_waiter has either taken uaddr2 or is ++ * waiting on it. If the former, then the pi_state will not ++ * exist yet, look it up one more time to ensure we have a ++ * reference to it. ++ */ ++ if (ret == 1) { ++ WARN_ON(pi_state); ++ task_count++; ++ ret = get_futex_value_locked(&curval2, uaddr2); ++ if (!ret) ++ ret = lookup_pi_state(curval2, hb2, &key2, ++ &pi_state); ++ } ++ ++ switch (ret) { ++ case 0: ++ break; ++ case -EFAULT: ++ double_unlock_hb(hb1, hb2); ++ put_futex_key(fshared, &key2); ++ put_futex_key(fshared, &key1); ++ ret = get_user_writeable(uaddr2); ++ if (!ret) ++ goto retry; ++ goto out; ++ case -EAGAIN: ++ /* The owner was exiting, try again. */ ++ double_unlock_hb(hb1, hb2); ++ put_futex_key(fshared, &key2); ++ put_futex_key(fshared, &key1); ++ cond_resched(); ++ goto retry; ++ default: ++ goto out_unlock; + } + } + ++ head1 = &hb1->chain; ++ plist_for_each_entry_safe(this, next, head1, list) { ++ if (task_count - nr_wake >= nr_requeue) ++ break; ++ ++ if (!match_futex(&this->key, &key1)) ++ continue; ++ ++ WARN_ON(!requeue_pi && this->rt_waiter); ++ WARN_ON(requeue_pi && !this->rt_waiter); ++ ++ /* ++ * Wake nr_wake waiters. For requeue_pi, if we acquired the ++ * lock, we already woke the top_waiter. 
If not, it will be ++ * woken by futex_unlock_pi(). ++ */ ++ if (++task_count <= nr_wake && !requeue_pi) { ++ wake_futex(&wake_list, this); ++ continue; ++ } ++ ++ /* ++ * Requeue nr_requeue waiters and possibly one more in the case ++ * of requeue_pi if we couldn't acquire the lock atomically. ++ */ ++ if (requeue_pi) { ++ /* Prepare the waiter to take the rt_mutex. */ ++ atomic_inc(&pi_state->refcount); ++ this->pi_state = pi_state; ++ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, ++ this->rt_waiter, ++ this->task, 1); ++ if (ret == 1) { ++ /* We got the lock. */ ++ requeue_pi_wake_futex(this, &key2); ++ continue; ++ } else if (ret) { ++ /* -EDEADLK */ ++ this->pi_state = NULL; ++ free_pi_state(pi_state); ++ goto out_unlock; ++ } ++ } ++ requeue_futex(this, hb1, hb2, &key2); ++ drop_count++; ++ } ++ + out_unlock: +- spin_unlock(&hb1->lock); +- if (hb1 != hb2) +- spin_unlock(&hb2->lock); ++ double_unlock_hb(hb1, hb2); + +- /* drop_futex_key_refs() must be called outside the spinlocks. */ ++ /* ++ * drop_futex_key_refs() must be called outside the spinlocks. During ++ * the requeue we moved futex_q's from the hash bucket at key1 to the ++ * one at key2 and updated their key pointer. We no longer need to ++ * hold the references to key1. ++ */ + while (--drop_count >= 0) + drop_futex_key_refs(&key1); + +@@ -937,8 +1361,12 @@ out_put_keys: + put_futex_key(fshared, &key2); + out_put_key1: + put_futex_key(fshared, &key1); ++ ++ wake_futex_list(wake_list); + out: +- return ret; ++ if (pi_state != NULL) ++ free_pi_state(pi_state); ++ return ret ? ret : task_count; + } + + /* The key must be already stored in q->key. */ +@@ -946,8 +1374,6 @@ static inline struct futex_hash_bucket * + { + struct futex_hash_bucket *hb; + +- init_waitqueue_head(&q->waiter); +- + get_futex_key_refs(&q->key); + hb = hash_futex(&q->key); + q->lock_ptr = &hb->lock; +@@ -972,8 +1398,12 @@ static inline void queue_me(struct futex + + plist_node_init(&q->list, prio); + #ifdef CONFIG_DEBUG_PI_LIST ++#ifdef CONFIG_PREEMPT_RT ++ q->list.plist.lock = NULL; ++#else + q->list.plist.lock = &hb->lock; + #endif ++#endif + plist_add(&q->list, &hb->chain); + q->task = current; + spin_unlock(&hb->lock); +@@ -1065,7 +1495,7 @@ static int fixup_pi_state_owner(u32 __us + struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; + u32 uval, curval, newval; +- int ret, attempt = 0; ++ int ret; + + /* Owner died? */ + if (!pi_state->owner) +@@ -1078,11 +1508,9 @@ static int fixup_pi_state_owner(u32 __us + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * +- * Note: We write the user space value _before_ changing the +- * pi_state because we can fault here. Imagine swapped out +- * pages or a fork, which was running right before we acquired +- * mmap_sem, that marked all the anonymous memory readonly for +- * cow. ++ * Note: We write the user space value _before_ changing the pi_state ++ * because we can fault here. Imagine swapped out pages or a fork ++ * that marked all the anonymous memory readonly for cow. 
+ * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault +@@ -1138,7 +1566,7 @@ retry: + handle_fault: + spin_unlock(q->lock_ptr); + +- ret = futex_handle_fault((unsigned long)uaddr, attempt++); ++ ret = get_user_writeable(uaddr); + + spin_lock(q->lock_ptr); + +@@ -1160,37 +1588,158 @@ handle_fault: + */ + #define FLAGS_SHARED 0x01 + #define FLAGS_CLOCKRT 0x02 ++#define FLAGS_HAS_TIMEOUT 0x04 + + static long futex_wait_restart(struct restart_block *restart); + +-static int futex_wait(u32 __user *uaddr, int fshared, +- u32 val, ktime_t *abs_time, u32 bitset, int clockrt) ++/** ++ * fixup_owner() - Post lock pi_state and corner case management ++ * @uaddr: user address of the futex ++ * @fshared: whether the futex is shared (1) or not (0) ++ * @q: futex_q (contains pi_state and access to the rt_mutex) ++ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) ++ * ++ * After attempting to lock an rt_mutex, this function is called to cleanup ++ * the pi_state owner as well as handle race conditions that may allow us to ++ * acquire the lock. Must be called with the hb lock held. ++ * ++ * Returns: ++ * 1 - success, lock taken ++ * 0 - success, lock not taken ++ * <0 - on error (-EFAULT) ++ */ ++static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, ++ int locked) + { +- struct task_struct *curr = current; +- struct restart_block *restart; +- DECLARE_WAITQUEUE(wait, curr); +- struct futex_hash_bucket *hb; +- struct futex_q q; +- u32 uval; +- int ret; +- struct hrtimer_sleeper t; +- int rem = 0; ++ struct task_struct *owner; ++ int ret = 0; + +- if (!bitset) +- return -EINVAL; ++ if (locked) { ++ /* ++ * Got the lock. We might not be the anticipated owner if we ++ * did a lock-steal - fix up the PI-state in that case: ++ */ ++ if (q->pi_state->owner != current) ++ ret = fixup_pi_state_owner(uaddr, q, current, fshared); ++ goto out; ++ } + +- q.pi_state = NULL; +- q.bitset = bitset; +-retry: +- q.key = FUTEX_KEY_INIT; +- ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); +- if (unlikely(ret != 0)) ++ /* ++ * Catch the rare case, where the lock was released when we were on the ++ * way back before we locked the hash bucket. ++ */ ++ if (q->pi_state->owner == current) { ++ /* ++ * Try to get the rt_mutex now. This might fail as some other ++ * task acquired the rt_mutex after we removed ourself from the ++ * rt_mutex waiters list. ++ */ ++ if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { ++ locked = 1; ++ goto out; ++ } ++ ++ /* ++ * pi_state is incorrect, some other task did a lock steal and ++ * we returned due to timeout or signal without taking the ++ * rt_mutex. Too late. We can access the rt_mutex_owner without ++ * locking, as the other task is now blocked on the hash bucket ++ * lock. Fix the state up. ++ */ ++ owner = rt_mutex_owner(&q->pi_state->pi_mutex); ++ ret = fixup_pi_state_owner(uaddr, q, owner, fshared); + goto out; ++ } + +- hb = queue_lock(&q); ++ /* ++ * Paranoia check. If we did not take the lock, then we should not be ++ * the owner, nor the pending owner, of the rt_mutex. ++ */ ++ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) ++ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " ++ "pi-state %p\n", ret, ++ q->pi_state->pi_mutex.owner, ++ q->pi_state->owner); ++ ++out: ++ return ret ? 
ret : locked; ++} ++ ++/** ++ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal ++ * @hb: the futex hash bucket, must be locked by the caller ++ * @q: the futex_q to queue up on ++ * @timeout: the prepared hrtimer_sleeper, or null for no timeout ++ */ ++static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, ++ struct hrtimer_sleeper *timeout) ++{ ++ queue_me(q, hb); + + /* +- * Access the page AFTER the futex is queued. ++ * There might have been scheduling since the queue_me(), as we ++ * cannot hold a spinlock across the get_user() in case it ++ * faults, and we cannot just set TASK_INTERRUPTIBLE state when ++ * queueing ourselves into the futex hash. This code thus has to ++ * rely on the futex_wake() code removing us from hash when it ++ * wakes us up. ++ */ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ /* Arm the timer */ ++ if (timeout) { ++ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); ++ if (!hrtimer_active(&timeout->timer)) ++ timeout->task = NULL; ++ } ++ ++ /* ++ * !plist_node_empty() is safe here without any lock. ++ * q.lock_ptr != 0 is not safe, because of ordering against wakeup. ++ */ ++ if (likely(!plist_node_empty(&q->list))) { ++ unsigned long nosched_flag = current->flags & PF_NOSCHED; ++ ++ current->flags &= ~PF_NOSCHED; ++ ++ /* ++ * If the timer has already expired, current will already be ++ * flagged for rescheduling. Only call schedule if there ++ * is no timeout, or if it has yet to expire. ++ */ ++ if (!timeout || timeout->task) ++ schedule(); ++ ++ current->flags |= nosched_flag; ++ } ++ __set_current_state(TASK_RUNNING); ++} ++ ++/** ++ * futex_wait_setup() - Prepare to wait on a futex ++ * @uaddr: the futex userspace address ++ * @val: the expected value ++ * @fshared: whether the futex is shared (1) or not (0) ++ * @q: the associated futex_q ++ * @hb: storage for hash_bucket pointer to be returned to caller ++ * ++ * Setup the futex_q and locate the hash_bucket. Get the futex value and ++ * compare it with the expected value. Handle atomic faults internally. ++ * Return with the hb lock held and a q.key reference on success, and unlocked ++ * with no q.key reference on failure. ++ * ++ * Returns: ++ * 0 - uaddr contains val and hb has been locked ++ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked ++ */ ++static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, ++ struct futex_q *q, struct futex_hash_bucket **hb) ++{ ++ u32 uval; ++ int ret; ++ ++ /* ++ * Access the page AFTER the hash-bucket is locked. + * Order is important: + * + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); +@@ -1205,95 +1754,83 @@ retry: + * A consequence is that futex_wait() can return zero and absorb + * a wakeup when *uaddr != val on entry to the syscall. This is + * rare, but normal. +- * +- * for shared futexes, we hold the mmap semaphore, so the mapping +- * cannot have changed since we looked it up in get_futex_key. 
+ */ ++retry: ++ q->key = FUTEX_KEY_INIT; ++ ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++retry_private: ++ *hb = queue_lock(q); ++ + ret = get_futex_value_locked(&uval, uaddr); + +- if (unlikely(ret)) { +- queue_unlock(&q, hb); +- put_futex_key(fshared, &q.key); ++ if (ret) { ++ queue_unlock(q, *hb); + + ret = get_user(uval, uaddr); ++ if (ret) ++ goto out; + +- if (!ret) +- goto retry; +- goto out; +- } +- ret = -EWOULDBLOCK; +- if (unlikely(uval != val)) { +- queue_unlock(&q, hb); +- goto out_put_key; ++ if (!fshared) ++ goto retry_private; ++ ++ put_futex_key(fshared, &q->key); ++ goto retry; + } + +- /* Only actually queue if *uaddr contained val. */ +- queue_me(&q, hb); ++ if (uval != val) { ++ queue_unlock(q, *hb); ++ ret = -EWOULDBLOCK; ++ } + +- /* +- * There might have been scheduling since the queue_me(), as we +- * cannot hold a spinlock across the get_user() in case it +- * faults, and we cannot just set TASK_INTERRUPTIBLE state when +- * queueing ourselves into the futex hash. This code thus has to +- * rely on the futex_wake() code removing us from hash when it +- * wakes us up. +- */ ++out: ++ if (ret) ++ put_futex_key(fshared, &q->key); ++ return ret; ++} + +- /* add_wait_queue is the barrier after __set_current_state. */ +- __set_current_state(TASK_INTERRUPTIBLE); +- add_wait_queue(&q.waiter, &wait); +- /* +- * !plist_node_empty() is safe here without any lock. +- * q.lock_ptr != 0 is not safe, because of ordering against wakeup. +- */ +- if (likely(!plist_node_empty(&q.list))) { +- if (!abs_time) +- schedule(); +- else { +- unsigned long slack; +- slack = current->timer_slack_ns; +- if (rt_task(current)) +- slack = 0; +- hrtimer_init_on_stack(&t.timer, +- clockrt ? CLOCK_REALTIME : +- CLOCK_MONOTONIC, +- HRTIMER_MODE_ABS); +- hrtimer_init_sleeper(&t, current); +- hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); +- +- hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); +- if (!hrtimer_active(&t.timer)) +- t.task = NULL; ++static int futex_wait(u32 __user *uaddr, int fshared, ++ u32 val, ktime_t *abs_time, u32 bitset, int clockrt) ++{ ++ struct hrtimer_sleeper timeout, *to = NULL; ++ struct restart_block *restart; ++ struct futex_hash_bucket *hb; ++ struct futex_q q; ++ int ret; + +- /* +- * the timer could have already expired, in which +- * case current would be flagged for rescheduling. +- * Don't bother calling schedule. +- */ +- if (likely(t.task)) +- schedule(); ++ if (!bitset) ++ return -EINVAL; + +- hrtimer_cancel(&t.timer); ++ q.pi_state = NULL; ++ q.bitset = bitset; ++ q.rt_waiter = NULL; + +- /* Flag if a timeout occured */ +- rem = (t.task == NULL); ++ if (abs_time) { ++ to = &timeout; + +- destroy_hrtimer_on_stack(&t.timer); +- } ++ hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : ++ CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ++ hrtimer_init_sleeper(to, current); ++ hrtimer_set_expires_range_ns(&to->timer, *abs_time, ++ current->timer_slack_ns); + } +- __set_current_state(TASK_RUNNING); + +- /* +- * NOTE: we don't remove ourselves from the waitqueue because +- * we are the only user of it. +- */ ++ /* Prepare to wait on uaddr. */ ++ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); ++ if (ret) ++ goto out; ++ ++ /* queue_me and wait for wakeup, timeout, or a signal. */ ++ futex_wait_queue_me(hb, &q, to); + + /* If we were woken (and unqueued), we succeeded, whatever. 
*/ + ret = 0; + if (!unqueue_me(&q)) + goto out_put_key; + ret = -ETIMEDOUT; +- if (rem) ++ if (to && !to->task) + goto out_put_key; + + /* +@@ -1310,7 +1847,7 @@ retry: + restart->futex.val = val; + restart->futex.time = abs_time->tv64; + restart->futex.bitset = bitset; +- restart->futex.flags = 0; ++ restart->futex.flags = FLAGS_HAS_TIMEOUT; + + if (fshared) + restart->futex.flags |= FLAGS_SHARED; +@@ -1322,6 +1859,10 @@ retry: + out_put_key: + put_futex_key(fshared, &q.key); + out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } + return ret; + } + +@@ -1330,13 +1871,16 @@ static long futex_wait_restart(struct re + { + u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + int fshared = 0; +- ktime_t t; ++ ktime_t t, *tp = NULL; + +- t.tv64 = restart->futex.time; ++ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { ++ t.tv64 = restart->futex.time; ++ tp = &t; ++ } + restart->fn = do_no_restart_syscall; + if (restart->futex.flags & FLAGS_SHARED) + fshared = 1; +- return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, ++ return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, + restart->futex.bitset, + restart->futex.flags & FLAGS_CLOCKRT); + } +@@ -1352,11 +1896,9 @@ static int futex_lock_pi(u32 __user *uad + int detect, ktime_t *time, int trylock) + { + struct hrtimer_sleeper timeout, *to = NULL; +- struct task_struct *curr = current; + struct futex_hash_bucket *hb; +- u32 uval, newval, curval; + struct futex_q q; +- int ret, lock_taken, ownerdied = 0, attempt = 0; ++ int res, ret; + + if (refill_pi_state_cache()) + return -ENOMEM; +@@ -1370,117 +1912,34 @@ static int futex_lock_pi(u32 __user *uad + } + + q.pi_state = NULL; ++ q.rt_waiter = NULL; + retry: + q.key = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); + if (unlikely(ret != 0)) + goto out; + +-retry_unlocked: ++retry_private: + hb = queue_lock(&q); + +-retry_locked: +- ret = lock_taken = 0; +- +- /* +- * To avoid races, we attempt to take the lock here again +- * (by doing a 0 -> TID atomic cmpxchg), while holding all +- * the locks. It will most likely not succeed. +- */ +- newval = task_pid_vnr(current); +- +- curval = cmpxchg_futex_value_locked(uaddr, 0, newval); +- +- if (unlikely(curval == -EFAULT)) +- goto uaddr_faulted; +- +- /* +- * Detect deadlocks. In case of REQUEUE_PI this is a valid +- * situation and we return success to user space. +- */ +- if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { +- ret = -EDEADLK; +- goto out_unlock_put_key; +- } +- +- /* +- * Surprise - we got the lock. Just return to userspace: +- */ +- if (unlikely(!curval)) +- goto out_unlock_put_key; +- +- uval = curval; +- +- /* +- * Set the WAITERS flag, so the owner will know it has someone +- * to wake at next unlock +- */ +- newval = curval | FUTEX_WAITERS; +- +- /* +- * There are two cases, where a futex might have no owner (the +- * owner TID is 0): OWNER_DIED. We take over the futex in this +- * case. We also do an unconditional take over, when the owner +- * of the futex died. +- * +- * This is safe as we are protected by the hash bucket lock ! 
+- */ +- if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { +- /* Keep the OWNER_DIED bit */ +- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); +- ownerdied = 0; +- lock_taken = 1; +- } +- +- curval = cmpxchg_futex_value_locked(uaddr, uval, newval); +- +- if (unlikely(curval == -EFAULT)) +- goto uaddr_faulted; +- if (unlikely(curval != uval)) +- goto retry_locked; +- +- /* +- * We took the lock due to owner died take over. +- */ +- if (unlikely(lock_taken)) +- goto out_unlock_put_key; +- +- /* +- * We dont have the lock. Look up the PI state (or create it if +- * we are the first waiter): +- */ +- ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); +- ++ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); + if (unlikely(ret)) { + switch (ret) { +- ++ case 1: ++ /* We got the lock. */ ++ ret = 0; ++ goto out_unlock_put_key; ++ case -EFAULT: ++ goto uaddr_faulted; + case -EAGAIN: + /* + * Task is exiting and we just wait for the + * exit to complete. + */ + queue_unlock(&q, hb); ++ put_futex_key(fshared, &q.key); + cond_resched(); + goto retry; +- +- case -ESRCH: +- /* +- * No owner found for this futex. Check if the +- * OWNER_DIED bit is set to figure out whether +- * this is a robust futex or not. +- */ +- if (get_futex_value_locked(&curval, uaddr)) +- goto uaddr_faulted; +- +- /* +- * We simply start over in case of a robust +- * futex. The code above will take the futex +- * and return happy. +- */ +- if (curval & FUTEX_OWNER_DIED) { +- ownerdied = 1; +- goto retry_locked; +- } + default: + goto out_unlock_put_key; + } +@@ -1504,74 +1963,29 @@ retry_locked: + } + + spin_lock(q.lock_ptr); ++ /* ++ * Fixup the pi_state owner and possibly acquire the lock if we ++ * haven't already. ++ */ ++ res = fixup_owner(uaddr, fshared, &q, !ret); ++ /* ++ * If fixup_owner() returned an error, proprogate that. If it acquired ++ * the lock, clear our -ETIMEDOUT or -EINTR. ++ */ ++ if (res) ++ ret = (res < 0) ? res : 0; + +- if (!ret) { +- /* +- * Got the lock. We might not be the anticipated owner +- * if we did a lock-steal - fix up the PI-state in +- * that case: +- */ +- if (q.pi_state->owner != curr) +- ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); +- } else { +- /* +- * Catch the rare case, where the lock was released +- * when we were on the way back before we locked the +- * hash bucket. +- */ +- if (q.pi_state->owner == curr) { +- /* +- * Try to get the rt_mutex now. This might +- * fail as some other task acquired the +- * rt_mutex after we removed ourself from the +- * rt_mutex waiters list. +- */ +- if (rt_mutex_trylock(&q.pi_state->pi_mutex)) +- ret = 0; +- else { +- /* +- * pi_state is incorrect, some other +- * task did a lock steal and we +- * returned due to timeout or signal +- * without taking the rt_mutex. Too +- * late. We can access the +- * rt_mutex_owner without locking, as +- * the other task is now blocked on +- * the hash bucket lock. Fix the state +- * up. +- */ +- struct task_struct *owner; +- int res; +- +- owner = rt_mutex_owner(&q.pi_state->pi_mutex); +- res = fixup_pi_state_owner(uaddr, &q, owner, +- fshared); +- +- /* propagate -EFAULT, if the fixup failed */ +- if (res) +- ret = res; +- } +- } else { +- /* +- * Paranoia check. 
If we did not take the lock +- * in the trylock above, then we should not be +- * the owner of the rtmutex, neither the real +- * nor the pending one: +- */ +- if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) +- printk(KERN_ERR "futex_lock_pi: ret = %d " +- "pi-mutex: %p pi-state %p\n", ret, +- q.pi_state->pi_mutex.owner, +- q.pi_state->owner); +- } +- } ++ /* ++ * If fixup_owner() faulted and was unable to handle the fault, unlock ++ * it and return the fault to userspace. ++ */ ++ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) ++ rt_mutex_unlock(&q.pi_state->pi_mutex); + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q); + +- if (to) +- destroy_hrtimer_on_stack(&to->timer); +- return ret != -EINTR ? ret : -ERESTARTNOINTR; ++ goto out; + + out_unlock_put_key: + queue_unlock(&q, hb); +@@ -1581,32 +1995,20 @@ out_put_key: + out: + if (to) + destroy_hrtimer_on_stack(&to->timer); +- return ret; ++ return ret != -EINTR ? ret : -ERESTARTNOINTR; + + uaddr_faulted: +- /* +- * We have to r/w *(int __user *)uaddr, and we have to modify it +- * atomically. Therefore, if we continue to fault after get_user() +- * below, we need to handle the fault ourselves, while still holding +- * the mmap_sem. This can occur if the uaddr is under contention as +- * we have to drop the mmap_sem in order to call get_user(). +- */ + queue_unlock(&q, hb); + +- if (attempt++) { +- ret = futex_handle_fault((unsigned long)uaddr, attempt); +- if (ret) +- goto out_put_key; +- goto retry_unlocked; +- } ++ ret = get_user_writeable(uaddr); ++ if (ret) ++ goto out_put_key; + +- ret = get_user(uval, uaddr); +- if (!ret) +- goto retry; ++ if (!fshared) ++ goto retry_private; + +- if (to) +- destroy_hrtimer_on_stack(&to->timer); +- return ret; ++ put_futex_key(fshared, &q.key); ++ goto retry; + } + + /* +@@ -1621,7 +2023,7 @@ static int futex_unlock_pi(u32 __user *u + u32 uval; + struct plist_head *head; + union futex_key key = FUTEX_KEY_INIT; +- int ret, attempt = 0; ++ int ret; + + retry: + if (get_user(uval, uaddr)) +@@ -1637,7 +2039,6 @@ retry: + goto out; + + hb = hash_futex(&key); +-retry_unlocked: + spin_lock(&hb->lock); + + /* +@@ -1694,27 +2095,236 @@ out: + return ret; + + pi_faulted: ++ spin_unlock(&hb->lock); ++ put_futex_key(fshared, &key); ++ ++ ret = get_user_writeable(uaddr); ++ if (!ret) ++ goto retry; ++ ++ return ret; ++} ++ ++/** ++ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex ++ * @hb: the hash_bucket futex_q was original enqueued on ++ * @q: the futex_q woken while waiting to be requeued ++ * @key2: the futex_key of the requeue target futex ++ * @timeout: the timeout associated with the wait (NULL if none) ++ * ++ * Detect if the task was woken on the initial futex as opposed to the requeue ++ * target futex. If so, determine if it was a timeout or a signal that caused ++ * the wakeup and return the appropriate error code to the caller. Must be ++ * called with the hb lock held. ++ * ++ * Returns ++ * 0 - no early wakeup detected ++ * <0 - -ETIMEDOUT or -ERESTARTNOINTR ++ */ ++static inline ++int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, ++ struct futex_q *q, union futex_key *key2, ++ struct hrtimer_sleeper *timeout) ++{ ++ int ret = 0; ++ ++ /* ++ * With the hb lock held, we avoid races while we process the wakeup. ++ * We only need to hold hb (and not hb2) to ensure atomicity as the ++ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. 
++ * It can't be requeued from uaddr2 to something else since we don't ++ * support a PI aware source futex for requeue. ++ */ ++ if (!match_futex(&q->key, key2)) { ++ WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); ++ /* ++ * We were woken prior to requeue by a timeout or a signal. ++ * Unqueue the futex_q and determine which it was. ++ */ ++ plist_del(&q->list, &q->list.plist); ++ drop_futex_key_refs(&q->key); ++ ++ if (timeout && !timeout->task) ++ ret = -ETIMEDOUT; ++ else ++ ret = -ERESTARTNOINTR; ++ } ++ return ret; ++} ++ ++/** ++ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 ++ * @uaddr: the futex we initialyl wait on (non-pi) ++ * @fshared: whether the futexes are shared (1) or not (0). They must be ++ * the same type, no requeueing from private to shared, etc. ++ * @val: the expected value of uaddr ++ * @abs_time: absolute timeout ++ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. ++ * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) ++ * @uaddr2: the pi futex we will take prior to returning to user-space ++ * ++ * The caller will wait on uaddr and will be requeued by futex_requeue() to ++ * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and ++ * complete the acquisition of the rt_mutex prior to returning to userspace. ++ * This ensures the rt_mutex maintains an owner when it has waiters; without ++ * one, the pi logic wouldn't know which task to boost/deboost, if there was a ++ * need to. ++ * ++ * We call schedule in futex_wait_queue_me() when we enqueue and return there ++ * via the following: ++ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() ++ * 2) wakeup on uaddr2 after a requeue and subsequent unlock ++ * 3) signal (before or after requeue) ++ * 4) timeout (before or after requeue) ++ * ++ * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. ++ * ++ * If 2, we may then block on trying to take the rt_mutex and return via: ++ * 5) successful lock ++ * 6) signal ++ * 7) timeout ++ * 8) other lock acquisition failure ++ * ++ * If 6, we setup a restart_block with futex_lock_pi() as the function. ++ * ++ * If 4 or 7, we cleanup and return with -ETIMEDOUT. ++ * ++ * Returns: ++ * 0 - On success ++ * <0 - On error ++ */ ++static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, ++ u32 val, ktime_t *abs_time, u32 bitset, ++ int clockrt, u32 __user *uaddr2) ++{ ++ struct hrtimer_sleeper timeout, *to = NULL; ++ struct rt_mutex_waiter rt_waiter; ++ struct rt_mutex *pi_mutex = NULL; ++ struct futex_hash_bucket *hb; ++ union futex_key key2; ++ struct futex_q q; ++ int res, ret; ++ ++ if (!bitset) ++ return -EINVAL; ++ ++ if (abs_time) { ++ to = &timeout; ++ hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : ++ CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ++ hrtimer_init_sleeper(to, current); ++ hrtimer_set_expires_range_ns(&to->timer, *abs_time, ++ current->timer_slack_ns); ++ } ++ + /* +- * We have to r/w *(int __user *)uaddr, and we have to modify it +- * atomically. Therefore, if we continue to fault after get_user() +- * below, we need to handle the fault ourselves, while still holding +- * the mmap_sem. This can occur if the uaddr is under contention as +- * we have to drop the mmap_sem in order to call get_user(). ++ * The waiter is allocated on our stack, manipulated by the requeue ++ * code while we sleep on uaddr. 
+ */ ++ debug_rt_mutex_init_waiter(&rt_waiter); ++ rt_waiter.task = NULL; ++ ++ q.pi_state = NULL; ++ q.bitset = bitset; ++ q.rt_waiter = &rt_waiter; ++ ++ key2 = FUTEX_KEY_INIT; ++ ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); ++ if (unlikely(ret != 0)) ++ goto out; ++ ++ /* Prepare to wait on uaddr. */ ++ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); ++ if (ret) ++ goto out_key2; ++ ++ /* Queue the futex_q, drop the hb lock, wait for wakeup. */ ++ futex_wait_queue_me(hb, &q, to); ++ ++ spin_lock(&hb->lock); ++ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); ++ if (ret) ++ goto out_put_keys; + +- if (attempt++) { +- ret = futex_handle_fault((unsigned long)uaddr, attempt); +- if (ret) +- goto out; +- uval = 0; +- goto retry_unlocked; ++ /* ++ * In order for us to be here, we know our q.key == key2, and since ++ * we took the hb->lock above, we also know that futex_requeue() has ++ * completed and we no longer have to concern ourselves with a wakeup ++ * race with the atomic proxy lock acquition by the requeue code. ++ */ ++ ++ /* Check if the requeue code acquired the second futex for us. */ ++ if (!q.rt_waiter) { ++ /* ++ * Got the lock. We might not be the anticipated owner if we ++ * did a lock-steal - fix up the PI-state in that case. ++ */ ++ if (q.pi_state && (q.pi_state->owner != current)) { ++ spin_lock(q.lock_ptr); ++ ret = fixup_pi_state_owner(uaddr2, &q, current, ++ fshared); ++ spin_unlock(q.lock_ptr); ++ } ++ } else { ++ /* ++ * We have been woken up by futex_unlock_pi(), a timeout, or a ++ * signal. futex_unlock_pi() will not destroy the lock_ptr nor ++ * the pi_state. ++ */ ++ WARN_ON(!&q.pi_state); ++ pi_mutex = &q.pi_state->pi_mutex; ++ ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); ++ debug_rt_mutex_free_waiter(&rt_waiter); ++ ++ spin_lock(q.lock_ptr); ++ /* ++ * Fixup the pi_state owner and possibly acquire the lock if we ++ * haven't already. ++ */ ++ res = fixup_owner(uaddr2, fshared, &q, !ret); ++ /* ++ * If fixup_owner() returned an error, proprogate that. If it ++ * acquired the lock, clear our -ETIMEDOUT or -EINTR. ++ */ ++ if (res) ++ ret = (res < 0) ? res : 0; ++ ++ /* Unqueue and drop the lock. */ ++ unqueue_me_pi(&q); + } + +- ret = get_user(uval, uaddr); +- if (!ret) +- goto retry; ++ /* ++ * If fixup_pi_state_owner() faulted and was unable to handle the ++ * fault, unlock the rt_mutex and return the fault to userspace. ++ */ ++ if (ret == -EFAULT) { ++ if (rt_mutex_owner(pi_mutex) == current) ++ rt_mutex_unlock(pi_mutex); ++ } else if (ret == -EINTR) { ++ /* ++ * We've already been requeued, but we have no way to ++ * restart by calling futex_lock_pi() directly. We ++ * could restart the syscall, but that will look at ++ * the user space value and return right away. So we ++ * drop back with EWOULDBLOCK to tell user space that ++ * "val" has been changed. That's the same what the ++ * restart of the syscall would do in ++ * futex_wait_setup(). 
++ */ ++ ret = -EWOULDBLOCK; ++ } ++ ++out_put_keys: ++ put_futex_key(fshared, &q.key); ++out_key2: ++ put_futex_key(fshared, &key2); + ++out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } + return ret; + } + +@@ -1940,7 +2550,7 @@ long do_futex(u32 __user *uaddr, int op, + fshared = 1; + + clockrt = op & FUTEX_CLOCK_REALTIME; +- if (clockrt && cmd != FUTEX_WAIT_BITSET) ++ if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + return -ENOSYS; + + switch (cmd) { +@@ -1955,10 +2565,11 @@ long do_futex(u32 __user *uaddr, int op, + ret = futex_wake(uaddr, fshared, val, val3); + break; + case FUTEX_REQUEUE: +- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); ++ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); + break; + case FUTEX_CMP_REQUEUE: +- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); ++ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, ++ 0); + break; + case FUTEX_WAKE_OP: + ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); +@@ -1975,6 +2586,15 @@ long do_futex(u32 __user *uaddr, int op, + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + break; ++ case FUTEX_WAIT_REQUEUE_PI: ++ val3 = FUTEX_BITSET_MATCH_ANY; ++ ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, ++ clockrt, uaddr2); ++ break; ++ case FUTEX_CMP_REQUEUE_PI: ++ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, ++ 1); ++ break; + default: + ret = -ENOSYS; + } +@@ -1992,7 +2612,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uad + int cmd = op & FUTEX_CMD_MASK; + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || +- cmd == FUTEX_WAIT_BITSET)) { ++ cmd == FUTEX_WAIT_BITSET || ++ cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (copy_from_user(&ts, utime, sizeof(ts)) != 0) + return -EFAULT; + if (!timespec_valid(&ts)) +@@ -2004,11 +2625,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uad + tp = &t; + } + /* +- * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. ++ * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. + * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. + */ + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || +- cmd == FUTEX_WAKE_OP) ++ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (u32) (unsigned long) utime; + + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); +@@ -2034,7 +2655,11 @@ static int __init futex_init(void) + futex_cmpxchg_enabled = 1; + + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { ++#ifdef CONFIG_PREEMPT_RT ++ plist_head_init(&futex_queues[i].chain, NULL); ++#else + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); ++#endif + spin_lock_init(&futex_queues[i].lock); + } + +Index: linux-2.6-tip/kernel/hung_task.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/hung_task.c +@@ -0,0 +1,217 @@ ++/* ++ * Detect Hung Task ++ * ++ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * The number of tasks checked: ++ */ ++unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; ++ ++/* ++ * Limit number of tasks checked in a batch. ++ * ++ * This value controls the preemptibility of khungtaskd since preemption ++ * is disabled during the critical section. It also controls the size of ++ * the RCU grace period. 
So it needs to be upper-bound. ++ */ ++#define HUNG_TASK_BATCHING 1024 ++ ++/* ++ * Zero means infinite timeout - no checking done: ++ */ ++unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; ++ ++unsigned long __read_mostly sysctl_hung_task_warnings = 10; ++ ++static int __read_mostly did_panic; ++ ++static struct task_struct *watchdog_task; ++ ++/* ++ * Should we panic (and reboot, if panic_timeout= is set) when a ++ * hung task is detected: ++ */ ++unsigned int __read_mostly sysctl_hung_task_panic = ++ CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; ++ ++static int __init hung_task_panic_setup(char *str) ++{ ++ sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); ++ ++ return 1; ++} ++__setup("hung_task_panic=", hung_task_panic_setup); ++ ++static int ++hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) ++{ ++ did_panic = 1; ++ ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block panic_block = { ++ .notifier_call = hung_task_panic, ++}; ++ ++static void check_hung_task(struct task_struct *t, unsigned long timeout) ++{ ++ unsigned long switch_count = t->nvcsw + t->nivcsw; ++ ++ /* ++ * Ensure the task is not frozen. ++ * Also, when a freshly created task is scheduled once, changes ++ * its state to TASK_UNINTERRUPTIBLE without having ever been ++ * switched out once, it musn't be checked. ++ */ ++ if (unlikely(t->flags & PF_FROZEN || !switch_count)) ++ return; ++ ++ if (switch_count != t->last_switch_count) { ++ t->last_switch_count = switch_count; ++ return; ++ } ++ if (!sysctl_hung_task_warnings) ++ return; ++ sysctl_hung_task_warnings--; ++ ++ /* ++ * Ok, the task did not get scheduled for more than 2 minutes, ++ * complain: ++ */ ++ printk(KERN_ERR "INFO: task %s:%d blocked for more than " ++ "%ld seconds.\n", t->comm, t->pid, timeout); ++ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" ++ " disables this message.\n"); ++ sched_show_task(t); ++ __debug_show_held_locks(t); ++ ++ touch_nmi_watchdog(); ++ ++ if (sysctl_hung_task_panic) ++ panic("hung_task: blocked tasks"); ++} ++ ++/* ++ * To avoid extending the RCU grace period for an unbounded amount of time, ++ * periodically exit the critical section and enter a new one. ++ * ++ * For preemptible RCU it is sufficient to call rcu_read_unlock in order ++ * exit the grace period. For classic RCU, a reschedule is required. ++ */ ++static void rcu_lock_break(struct task_struct *g, struct task_struct *t) ++{ ++ get_task_struct(g); ++ get_task_struct(t); ++ rcu_read_unlock(); ++ cond_resched(); ++ rcu_read_lock(); ++ put_task_struct(t); ++ put_task_struct(g); ++} ++ ++/* ++ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for ++ * a really long time (120 seconds). If that happens, print out ++ * a warning. ++ */ ++static void check_hung_uninterruptible_tasks(unsigned long timeout) ++{ ++ int max_count = sysctl_hung_task_check_count; ++ int batch_count = HUNG_TASK_BATCHING; ++ struct task_struct *g, *t; ++ ++ /* ++ * If the system crashed already then all bets are off, ++ * do not report extra hung tasks: ++ */ ++ if (test_taint(TAINT_DIE) || did_panic) ++ return; ++ ++ rcu_read_lock(); ++ do_each_thread(g, t) { ++ if (!--max_count) ++ goto unlock; ++ if (!--batch_count) { ++ batch_count = HUNG_TASK_BATCHING; ++ rcu_lock_break(g, t); ++ /* Exit if t or g was unhashed during refresh. 
*/ ++ if (t->state == TASK_DEAD || g->state == TASK_DEAD) ++ goto unlock; ++ } ++ /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ ++ if (t->state == TASK_UNINTERRUPTIBLE) ++ check_hung_task(t, timeout); ++ } while_each_thread(g, t); ++ unlock: ++ rcu_read_unlock(); ++} ++ ++static unsigned long timeout_jiffies(unsigned long timeout) ++{ ++ /* timeout of 0 will disable the watchdog */ ++ return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; ++} ++ ++/* ++ * Process updating of timeout sysctl ++ */ ++int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, ++ struct file *filp, void __user *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ int ret; ++ ++ ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); ++ ++ if (ret || !write) ++ goto out; ++ ++ wake_up_process(watchdog_task); ++ ++ out: ++ return ret; ++} ++ ++/* ++ * kthread which checks for tasks stuck in D state ++ */ ++static int watchdog(void *dummy) ++{ ++ set_user_nice(current, 0); ++ ++ for ( ; ; ) { ++ unsigned long timeout = sysctl_hung_task_timeout_secs; ++ ++ while (schedule_timeout_interruptible(timeout_jiffies(timeout))) ++ timeout = sysctl_hung_task_timeout_secs; ++ ++ check_hung_uninterruptible_tasks(timeout); ++ } ++ ++ return 0; ++} ++ ++static int __init hung_task_init(void) ++{ ++ atomic_notifier_chain_register(&panic_notifier_list, &panic_block); ++ watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); ++ ++ return 0; ++} ++ ++module_init(hung_task_init); +Index: linux-2.6-tip/kernel/irq/chip.c +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/chip.c ++++ linux-2.6-tip/kernel/irq/chip.c +@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq) + desc->irq_count = 0; + desc->irqs_unhandled = 0; + #ifdef CONFIG_SMP +- cpumask_setall(&desc->affinity); ++ cpumask_setall(desc->affinity); ++#ifdef CONFIG_GENERIC_PENDING_IRQ ++ cpumask_clear(desc->pending_mask); ++#endif + #endif + spin_unlock_irqrestore(&desc->lock, flags); + } +@@ -78,6 +81,7 @@ void dynamic_irq_cleanup(unsigned int ir + desc->handle_irq = handle_bad_irq; + desc->chip = &no_irq_chip; + desc->name = NULL; ++ clear_kstat_irqs(desc); + spin_unlock_irqrestore(&desc->lock, flags); + } + +@@ -289,8 +293,10 @@ static inline void mask_ack_irq(struct i + if (desc->chip->mask_ack) + desc->chip->mask_ack(irq); + else { +- desc->chip->mask(irq); +- desc->chip->ack(irq); ++ if (desc->chip->mask) ++ desc->chip->mask(irq); ++ if (desc->chip->ack) ++ desc->chip->ack(irq); + } + } + +@@ -314,8 +320,10 @@ handle_simple_irq(unsigned int irq, stru + + spin_lock(&desc->lock); + +- if (unlikely(desc->status & IRQ_INPROGRESS)) ++ if (unlikely(desc->status & IRQ_INPROGRESS)) { ++ desc->status |= IRQ_PENDING; + goto out_unlock; ++ } + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + kstat_incr_irqs_this_cpu(irq, desc); + +@@ -324,6 +332,11 @@ handle_simple_irq(unsigned int irq, stru + goto out_unlock; + + desc->status |= IRQ_INPROGRESS; ++ /* ++ * hardirq redirection to the irqd process context: ++ */ ++ if (redirect_hardirq(desc)) ++ goto out_unlock; + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, action); +@@ -332,6 +345,8 @@ handle_simple_irq(unsigned int irq, stru + + spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; ++ if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) ++ desc->chip->unmask(irq); + out_unlock: + spin_unlock(&desc->lock); + } +@@ -370,6 +385,13 @@ handle_level_irq(unsigned int irq, struc + goto out_unlock; + + desc->status |= 
IRQ_INPROGRESS; ++ ++ /* ++ * hardirq redirection to the irqd process context: ++ */ ++ if (redirect_hardirq(desc)) ++ goto out_unlock; ++ + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, action); +@@ -403,18 +425,16 @@ handle_fasteoi_irq(unsigned int irq, str + + spin_lock(&desc->lock); + +- if (unlikely(desc->status & IRQ_INPROGRESS)) +- goto out; +- + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + kstat_incr_irqs_this_cpu(irq, desc); + + /* +- * If its disabled or no action available ++ * If it's running, disabled or no action available + * then mask it and get out of here: + */ + action = desc->action; +- if (unlikely(!action || (desc->status & IRQ_DISABLED))) { ++ if (unlikely(!action || (desc->status & (IRQ_INPROGRESS | ++ IRQ_DISABLED)))) { + desc->status |= IRQ_PENDING; + if (desc->chip->mask) + desc->chip->mask(irq); +@@ -422,6 +442,15 @@ handle_fasteoi_irq(unsigned int irq, str + } + + desc->status |= IRQ_INPROGRESS; ++ /* ++ * In the threaded case we fall back to a mask+eoi sequence: ++ */ ++ if (redirect_hardirq(desc)) { ++ if (desc->chip->mask) ++ desc->chip->mask(irq); ++ goto out; ++ } ++ + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + +@@ -431,10 +460,11 @@ handle_fasteoi_irq(unsigned int irq, str + + spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; ++ if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) ++ desc->chip->unmask(irq); + out: + desc->chip->eoi(irq); + desc = irq_remap_to_desc(irq, desc); +- + spin_unlock(&desc->lock); + } + +@@ -476,12 +506,19 @@ handle_edge_irq(unsigned int irq, struct + kstat_incr_irqs_this_cpu(irq, desc); + + /* Start handling the irq */ +- desc->chip->ack(irq); ++ if (desc->chip->ack) ++ desc->chip->ack(irq); + desc = irq_remap_to_desc(irq, desc); + + /* Mark the IRQ currently in progress.*/ + desc->status |= IRQ_INPROGRESS; + ++ /* ++ * hardirq redirection to the irqd process context: ++ */ ++ if (redirect_hardirq(desc)) ++ goto out_unlock; ++ + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; +Index: linux-2.6-tip/kernel/irq/handle.c +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/handle.c ++++ linux-2.6-tip/kernel/irq/handle.c +@@ -13,10 +13,13 @@ + #include + #include + #include ++#include + #include + #include + #include + #include ++#include ++#include + + #include "internals.h" + +@@ -69,33 +72,33 @@ int nr_irqs = NR_IRQS; + EXPORT_SYMBOL_GPL(nr_irqs); + + #ifdef CONFIG_SPARSE_IRQ ++ + static struct irq_desc irq_desc_init = { + .irq = -1, + .status = IRQ_DISABLED, + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, +- .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), +-#ifdef CONFIG_SMP +- .affinity = CPU_MASK_ALL +-#endif ++ .lock = RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + }; + + void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) + { +- unsigned long bytes; +- char *ptr; + int node; +- +- /* Compute how many bytes we need per irq and allocate them */ +- bytes = nr * sizeof(unsigned int); ++ void *ptr; + + node = cpu_to_node(cpu); +- ptr = kzalloc_node(bytes, GFP_ATOMIC, node); +- printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node); ++ ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); + +- if (ptr) +- desc->kstat_irqs = (unsigned int *)ptr; ++ /* ++ * don't overwite if can not get new one ++ * init_copy_kstat_irqs() could still use old one ++ */ ++ if (ptr) { ++ printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", ++ cpu, node); ++ 
desc->kstat_irqs = ptr; ++ } + } + + static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +@@ -103,6 +106,7 @@ static void init_one_irq_desc(int irq, s + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); + + spin_lock_init(&desc->lock); ++ init_waitqueue_head(&desc->wait_for_handler); + desc->irq = irq; + #ifdef CONFIG_SMP + desc->cpu = cpu; +@@ -113,6 +117,10 @@ static void init_one_irq_desc(int irq, s + printk(KERN_ERR "can not alloc kstat_irqs\n"); + BUG_ON(1); + } ++ if (!init_alloc_desc_masks(desc, cpu, false)) { ++ printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); ++ BUG_ON(1); ++ } + arch_init_chip_data(desc, cpu); + } + +@@ -121,7 +129,7 @@ static void init_one_irq_desc(int irq, s + */ + DEFINE_SPINLOCK(sparse_irq_lock); + +-struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; ++struct irq_desc **irq_desc_ptrs __read_mostly; + + static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { + [0 ... NR_IRQS_LEGACY-1] = { +@@ -130,15 +138,11 @@ static struct irq_desc irq_desc_legacy[N + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, +- .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), +-#ifdef CONFIG_SMP +- .affinity = CPU_MASK_ALL +-#endif ++ .lock = RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + } + }; + +-/* FIXME: use bootmem alloc ...*/ +-static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; ++static unsigned int *kstat_irqs_legacy; + + int __init early_irq_init(void) + { +@@ -148,18 +152,30 @@ int __init early_irq_init(void) + + init_irq_default_affinity(); + ++ /* initialize nr_irqs based on nr_cpu_ids */ ++ arch_probe_nr_irqs(); ++ printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); ++ + desc = irq_desc_legacy; + legacy_count = ARRAY_SIZE(irq_desc_legacy); + ++ /* allocate irq_desc_ptrs array based on nr_irqs */ ++ irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); ++ ++ /* allocate based on nr_cpu_ids */ ++ /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ ++ kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * ++ sizeof(int)); ++ + for (i = 0; i < legacy_count; i++) { + desc[i].irq = i; +- desc[i].kstat_irqs = kstat_irqs_legacy[i]; ++ desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); +- ++ init_alloc_desc_masks(&desc[i], 0, true); + irq_desc_ptrs[i] = desc + i; + } + +- for (i = legacy_count; i < NR_IRQS; i++) ++ for (i = legacy_count; i < nr_irqs; i++) + irq_desc_ptrs[i] = NULL; + + return arch_early_irq_init(); +@@ -167,7 +183,10 @@ int __init early_irq_init(void) + + struct irq_desc *irq_to_desc(unsigned int irq) + { +- return (irq < NR_IRQS) ? 
irq_desc_ptrs[irq] : NULL; ++ if (irq_desc_ptrs && irq < nr_irqs) ++ return irq_desc_ptrs[irq]; ++ ++ return NULL; + } + + struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +@@ -176,10 +195,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(u + unsigned long flags; + int node; + +- if (irq >= NR_IRQS) { +- printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", +- irq, NR_IRQS); +- WARN_ON(1); ++ if (irq >= nr_irqs) { ++ WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", ++ irq, nr_irqs); + return NULL; + } + +@@ -220,13 +238,11 @@ struct irq_desc irq_desc[NR_IRQS] __cach + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, +- .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), +-#ifdef CONFIG_SMP +- .affinity = CPU_MASK_ALL +-#endif ++ .lock = RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), + } + }; + ++static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; + int __init early_irq_init(void) + { + struct irq_desc *desc; +@@ -235,12 +251,16 @@ int __init early_irq_init(void) + + init_irq_default_affinity(); + ++ printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); ++ + desc = irq_desc; + count = ARRAY_SIZE(irq_desc); + +- for (i = 0; i < count; i++) ++ for (i = 0; i < count; i++) { + desc[i].irq = i; +- ++ init_alloc_desc_masks(&desc[i], 0, true); ++ desc[i].kstat_irqs = kstat_irqs_all[i]; ++ } + return arch_early_irq_init(); + } + +@@ -255,6 +275,11 @@ struct irq_desc *irq_to_desc_alloc_cpu(u + } + #endif /* !CONFIG_SPARSE_IRQ */ + ++void clear_kstat_irqs(struct irq_desc *desc) ++{ ++ memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); ++} ++ + /* + * What should we do if we get a hw irq event on an illegal vector? + * Each architecture has to answer this themself. +@@ -316,6 +341,9 @@ irqreturn_t no_action(int cpl, void *dev + return IRQ_NONE; + } + ++DEFINE_TRACE(irq_handler_entry); ++DEFINE_TRACE(irq_handler_exit); ++ + /** + * handle_IRQ_event - irq action chain handler + * @irq: the interrupt number +@@ -328,25 +356,98 @@ irqreturn_t handle_IRQ_event(unsigned in + irqreturn_t ret, retval = IRQ_NONE; + unsigned int status = 0; + +- if (!(action->flags & IRQF_DISABLED)) +- local_irq_enable_in_hardirq(); ++#ifdef __i386__ ++ if (debug_direct_keyboard && irq == 1) ++ lockdep_off(); ++#endif ++ ++ /* ++ * Unconditionally enable interrupts for threaded ++ * IRQ handlers: ++ */ ++ if (!hardirq_count() || !(action->flags & IRQF_DISABLED)) ++ local_irq_enable(); + + do { ++ unsigned int preempt_count = preempt_count(); ++ ++ trace_irq_handler_entry(irq, action); + ret = action->handler(irq, action->dev_id); ++ trace_irq_handler_exit(irq, action, ret); ++ ++ if (preempt_count() != preempt_count) { ++ print_symbol("BUG: unbalanced irq-handler preempt count" ++ " in %s!\n", ++ (unsigned long) action->handler); ++ printk("entered with %08x, exited with %08x.\n", ++ preempt_count, preempt_count()); ++ dump_stack(); ++ preempt_count() = preempt_count; ++ } ++ + if (ret == IRQ_HANDLED) + status |= action->flags; + retval |= ret; + action = action->next; + } while (action); + +- if (status & IRQF_SAMPLE_RANDOM) ++ if (status & IRQF_SAMPLE_RANDOM) { ++ local_irq_enable(); + add_interrupt_randomness(irq); ++ } + local_irq_disable(); + ++#ifdef __i386__ ++ if (debug_direct_keyboard && irq == 1) ++ lockdep_on(); ++#endif + return retval; + } + ++/* ++ * Hack - used for development only. 
++ */ ++int __read_mostly debug_direct_keyboard = 0; ++ ++int __init debug_direct_keyboard_setup(char *str) ++{ ++ debug_direct_keyboard = 1; ++ printk(KERN_INFO "Switching IRQ 1 (keyboard) to to direct!\n"); ++#ifdef CONFIG_PREEMPT_RT ++ printk(KERN_INFO "WARNING: kernel may easily crash this way!\n"); ++#endif ++ return 1; ++} ++ ++__setup("debug_direct_keyboard", debug_direct_keyboard_setup); ++ ++int redirect_hardirq(struct irq_desc *desc) ++{ ++ /* ++ * Direct execution: ++ */ ++ if (!hardirq_preemption || (desc->status & IRQ_NODELAY) || ++ !desc->thread) ++ return 0; ++ ++#ifdef __i386__ ++ if (debug_direct_keyboard && desc->irq == 1) ++ return 0; ++#endif ++ ++ BUG_ON(!irqs_disabled()); ++ if (desc->thread && desc->thread->state != TASK_RUNNING) ++ wake_up_process(desc->thread); ++ ++ return 1; ++} ++ + #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ ++ ++#ifdef CONFIG_ENABLE_WARN_DEPRECATED ++# warning __do_IRQ is deprecated. Please convert to proper flow handlers ++#endif ++ + /** + * __do_IRQ - original all in one highlevel IRQ handler + * @irq: the interrupt number +@@ -364,6 +465,11 @@ unsigned int __do_IRQ(unsigned int irq) + struct irqaction *action; + unsigned int status; + ++#ifdef CONFIG_PREEMPT_RT ++ printk(KERN_WARNING "__do_IRQ called for irq %d. " ++ "PREEMPT_RT will crash your system soon\n", irq); ++ printk(KERN_WARNING "I hope you have a fire-extinguisher handy!\n"); ++#endif + kstat_incr_irqs_this_cpu(irq, desc); + + if (CHECK_IRQ_PER_CPU(desc->status)) { +@@ -385,6 +491,13 @@ unsigned int __do_IRQ(unsigned int irq) + desc->chip->end(irq); + return 1; + } ++ /* ++ * If the task is currently running in user mode, don't ++ * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not ++ * configured, this should be optimized out. ++ */ ++ if (user_mode(get_irq_regs())) ++ touch_softlockup_watchdog(); + + spin_lock(&desc->lock); + if (desc->chip->ack) { +@@ -467,12 +580,10 @@ void early_init_irq_lock_class(void) + } + } + +-#ifdef CONFIG_SPARSE_IRQ + unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) + { + struct irq_desc *desc = irq_to_desc(irq); + return desc ? desc->kstat_irqs[cpu] : 0; + } +-#endif + EXPORT_SYMBOL(kstat_irqs_cpu); + +Index: linux-2.6-tip/kernel/irq/internals.h +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/internals.h ++++ linux-2.6-tip/kernel/irq/internals.h +@@ -15,8 +15,20 @@ extern int __irq_set_trigger(struct irq_ + + extern struct lock_class_key irq_desc_lock_class; + extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); ++extern void clear_kstat_irqs(struct irq_desc *desc); + extern spinlock_t sparse_irq_lock; ++ ++#ifdef CONFIG_SPARSE_IRQ ++/* irq_desc_ptrs allocated at boot time */ ++extern struct irq_desc **irq_desc_ptrs; ++#else ++/* irq_desc_ptrs is a fixed size array */ + extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; ++#endif ++ ++extern int redirect_hardirq(struct irq_desc *desc); ++ ++void recalculate_desc_flags(struct irq_desc *desc); + + #ifdef CONFIG_PROC_FS + extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); +Index: linux-2.6-tip/kernel/irq/manage.c +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/manage.c ++++ linux-2.6-tip/kernel/irq/manage.c +@@ -8,8 +8,10 @@ + */ + + #include +-#include + #include ++#include ++#include ++#include + #include + #include + +@@ -43,8 +45,12 @@ void synchronize_irq(unsigned int irq) + * Wait until we're out of the critical section. 
This might + * give the wrong answer due to the lack of memory barriers. + */ +- while (desc->status & IRQ_INPROGRESS) +- cpu_relax(); ++ if (hardirq_preemption && !(desc->status & IRQ_NODELAY)) ++ wait_event(desc->wait_for_handler, ++ !(desc->status & IRQ_INPROGRESS)); ++ else ++ while (desc->status & IRQ_INPROGRESS) ++ cpu_relax(); + + /* Ok, that indicated we're done: double-check carefully. */ + spin_lock_irqsave(&desc->lock, flags); +@@ -90,14 +96,14 @@ int irq_set_affinity(unsigned int irq, c + + #ifdef CONFIG_GENERIC_PENDING_IRQ + if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { +- cpumask_copy(&desc->affinity, cpumask); ++ cpumask_copy(desc->affinity, cpumask); + desc->chip->set_affinity(irq, cpumask); + } else { + desc->status |= IRQ_MOVE_PENDING; +- cpumask_copy(&desc->pending_mask, cpumask); ++ cpumask_copy(desc->pending_mask, cpumask); + } + #else +- cpumask_copy(&desc->affinity, cpumask); ++ cpumask_copy(desc->affinity, cpumask); + desc->chip->set_affinity(irq, cpumask); + #endif + desc->status |= IRQ_AFFINITY_SET; +@@ -109,7 +115,7 @@ int irq_set_affinity(unsigned int irq, c + /* + * Generic version of the affinity autoselector. + */ +-int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) ++static int setup_affinity(unsigned int irq, struct irq_desc *desc) + { + if (!irq_can_set_affinity(irq)) + return 0; +@@ -119,21 +125,21 @@ int do_irq_select_affinity(unsigned int + * one of the targets is online. + */ + if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { +- if (cpumask_any_and(&desc->affinity, cpu_online_mask) ++ if (cpumask_any_and(desc->affinity, cpu_online_mask) + < nr_cpu_ids) + goto set_affinity; + else + desc->status &= ~IRQ_AFFINITY_SET; + } + +- cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); ++ cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); + set_affinity: +- desc->chip->set_affinity(irq, &desc->affinity); ++ desc->chip->set_affinity(irq, desc->affinity); + + return 0; + } + #else +-static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) ++static inline int setup_affinity(unsigned int irq, struct irq_desc *d) + { + return irq_select_affinity(irq); + } +@@ -149,14 +155,14 @@ int irq_select_affinity_usr(unsigned int + int ret; + + spin_lock_irqsave(&desc->lock, flags); +- ret = do_irq_select_affinity(irq, desc); ++ ret = setup_affinity(irq, desc); + spin_unlock_irqrestore(&desc->lock, flags); + + return ret; + } + + #else +-static inline int do_irq_select_affinity(int irq, struct irq_desc *desc) ++static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) + { + return 0; + } +@@ -255,6 +261,14 @@ void enable_irq(unsigned int irq) + spin_lock_irqsave(&desc->lock, flags); + __enable_irq(desc, irq); + spin_unlock_irqrestore(&desc->lock, flags); ++#ifdef CONFIG_HARDIRQS_SW_RESEND ++ /* ++ * Do a bh disable/enable pair to trigger any pending ++ * irq resend logic: ++ */ ++ local_bh_disable(); ++ local_bh_enable(); ++#endif + } + EXPORT_SYMBOL(enable_irq); + +@@ -317,6 +331,21 @@ int set_irq_wake(unsigned int irq, unsig + EXPORT_SYMBOL(set_irq_wake); + + /* ++ * If any action has IRQF_NODELAY then turn IRQ_NODELAY on: ++ */ ++void recalculate_desc_flags(struct irq_desc *desc) ++{ ++ struct irqaction *action; ++ ++ desc->status &= ~IRQ_NODELAY; ++ for (action = desc->action ; action; action = action->next) ++ if (action->flags & IRQF_NODELAY) ++ desc->status |= IRQ_NODELAY; ++} ++ ++static int start_irq_thread(int irq, struct irq_desc *desc); ++ ++/* + 
* Internal function that tells the architecture code whether a + * particular irq has been exclusively allocated or is available + * for driver use. +@@ -389,9 +418,9 @@ int __irq_set_trigger(struct irq_desc *d + * allocate special interrupts that are part of the architecture. + */ + static int +-__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) ++__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) + { +- struct irqaction *old, **p; ++ struct irqaction *old, **old_ptr; + const char *old_name = NULL; + unsigned long flags; + int shared = 0; +@@ -419,12 +448,15 @@ __setup_irq(unsigned int irq, struct irq + rand_initialize_irq(irq); + } + ++ if (!(new->flags & IRQF_NODELAY)) ++ if (start_irq_thread(irq, desc)) ++ return -ENOMEM; + /* + * The following block of code has to be executed atomically + */ + spin_lock_irqsave(&desc->lock, flags); +- p = &desc->action; +- old = *p; ++ old_ptr = &desc->action; ++ old = *old_ptr; + if (old) { + /* + * Can't share interrupts unless both agree to and are +@@ -447,8 +479,8 @@ __setup_irq(unsigned int irq, struct irq + + /* add new interrupt at end of irq queue */ + do { +- p = &old->next; +- old = *p; ++ old_ptr = &old->next; ++ old = *old_ptr; + } while (old); + shared = 1; + } +@@ -488,7 +520,7 @@ __setup_irq(unsigned int irq, struct irq + desc->status |= IRQ_NO_BALANCING; + + /* Set default affinity mask once everything is setup */ +- do_irq_select_affinity(irq, desc); ++ setup_affinity(irq, desc); + + } else if ((new->flags & IRQF_TRIGGER_MASK) + && (new->flags & IRQF_TRIGGER_MASK) +@@ -499,11 +531,17 @@ __setup_irq(unsigned int irq, struct irq + (int)(new->flags & IRQF_TRIGGER_MASK)); + } + +- *p = new; ++ *old_ptr = new; ++ ++ /* ++ * Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY: ++ */ ++ recalculate_desc_flags(desc); + + /* Reset broken irq detection when installing new handler */ + desc->irq_count = 0; + desc->irqs_unhandled = 0; ++ init_waitqueue_head(&desc->wait_for_handler); + + /* + * Check whether we disabled the irq via the spurious handler +@@ -518,7 +556,7 @@ __setup_irq(unsigned int irq, struct irq + + new->irq = irq; + register_irq_proc(irq, desc); +- new->dir = NULL; ++ new->dir = new->threaded = NULL; + register_handler_proc(irq, new); + + return 0; +@@ -549,90 +587,118 @@ int setup_irq(unsigned int irq, struct i + + return __setup_irq(irq, desc, act); + } ++EXPORT_SYMBOL_GPL(setup_irq); + +-/** +- * free_irq - free an interrupt +- * @irq: Interrupt line to free +- * @dev_id: Device identity to free +- * +- * Remove an interrupt handler. The handler is removed and if the +- * interrupt line is no longer in use by any driver it is disabled. +- * On a shared IRQ the caller must ensure the interrupt is disabled +- * on the card it drives before calling this function. The function +- * does not return until any executing interrupts for this IRQ +- * have completed. +- * +- * This function must not be called from interrupt context. ++ /* ++ * Internal function to unregister an irqaction - used to free ++ * regular and special interrupts that are part of the architecture. 
+ */ +-void free_irq(unsigned int irq, void *dev_id) ++static struct irqaction *__free_irq(unsigned int irq, void *dev_id) + { + struct irq_desc *desc = irq_to_desc(irq); +- struct irqaction **p; ++ struct irqaction *action, **action_ptr; + unsigned long flags; + +- WARN_ON(in_interrupt()); ++ WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); + + if (!desc) +- return; ++ return NULL; + + spin_lock_irqsave(&desc->lock, flags); +- p = &desc->action; ++ ++ /* ++ * There can be multiple actions per IRQ descriptor, find the right ++ * one based on the dev_id: ++ */ ++ action_ptr = &desc->action; + for (;;) { +- struct irqaction *action = *p; ++ action = *action_ptr; + +- if (action) { +- struct irqaction **pp = p; ++ if (!action) { ++ WARN(1, "Trying to free already-free IRQ %d\n", irq); ++ spin_unlock_irqrestore(&desc->lock, flags); ++ ++ return NULL; ++ } + +- p = &action->next; +- if (action->dev_id != dev_id) +- continue; ++ if (action->dev_id == dev_id) ++ break; ++ action_ptr = &action->next; ++ } + +- /* Found it - now remove it from the list of entries */ +- *pp = action->next; ++ /* Found it - now remove it from the list of entries: */ ++ *action_ptr = action->next; + +- /* Currently used only by UML, might disappear one day.*/ ++ /* Currently used only by UML, might disappear one day: */ + #ifdef CONFIG_IRQ_RELEASE_METHOD +- if (desc->chip->release) +- desc->chip->release(irq, dev_id); ++ if (desc->chip->release) ++ desc->chip->release(irq, dev_id); + #endif + +- if (!desc->action) { +- desc->status |= IRQ_DISABLED; +- if (desc->chip->shutdown) +- desc->chip->shutdown(irq); +- else +- desc->chip->disable(irq); +- } +- spin_unlock_irqrestore(&desc->lock, flags); +- unregister_handler_proc(irq, action); ++ /* If this was the last handler, shut down the IRQ line: */ ++ if (!desc->action) { ++ desc->status |= IRQ_DISABLED; ++ if (desc->chip->shutdown) ++ desc->chip->shutdown(irq); ++ else ++ desc->chip->disable(irq); ++ } ++ recalculate_desc_flags(desc); ++ spin_unlock_irqrestore(&desc->lock, flags); ++ ++ unregister_handler_proc(irq, action); ++ ++ /* Make sure it's not being used on another CPU: */ ++ synchronize_irq(irq); + +- /* Make sure it's not being used on another CPU */ +- synchronize_irq(irq); +-#ifdef CONFIG_DEBUG_SHIRQ +- /* +- * It's a shared IRQ -- the driver ought to be +- * prepared for it to happen even now it's +- * being freed, so let's make sure.... We do +- * this after actually deregistering it, to +- * make sure that a 'real' IRQ doesn't run in +- * parallel with our fake +- */ +- if (action->flags & IRQF_SHARED) { +- local_irq_save(flags); +- action->handler(irq, dev_id); +- local_irq_restore(flags); +- } +-#endif +- kfree(action); +- return; +- } +- printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); + #ifdef CONFIG_DEBUG_SHIRQ +- dump_stack(); +-#endif +- spin_unlock_irqrestore(&desc->lock, flags); +- return; ++ /* ++ * It's a shared IRQ -- the driver ought to be prepared for an IRQ ++ * event to happen even now it's being freed, so let's make sure that ++ * is so by doing an extra call to the handler .... ++ * ++ * ( We do this after actually deregistering it, to make sure that a ++ * 'real' IRQ doesn't run in * parallel with our fake. 
) ++ */ ++ if (action->flags & IRQF_SHARED) { ++ local_irq_save_nort(flags); ++ action->handler(irq, dev_id); ++ local_irq_restore_nort(flags); + } ++#endif ++ return action; ++} ++ ++/** ++ * remove_irq - free an interrupt ++ * @irq: Interrupt line to free ++ * @act: irqaction for the interrupt ++ * ++ * Used to remove interrupts statically setup by the early boot process. ++ */ ++void remove_irq(unsigned int irq, struct irqaction *act) ++{ ++ __free_irq(irq, act->dev_id); ++} ++EXPORT_SYMBOL_GPL(remove_irq); ++ ++/** ++ * free_irq - free an interrupt allocated with request_irq ++ * @irq: Interrupt line to free ++ * @dev_id: Device identity to free ++ * ++ * Remove an interrupt handler. The handler is removed and if the ++ * interrupt line is no longer in use by any driver it is disabled. ++ * On a shared IRQ the caller must ensure the interrupt is disabled ++ * on the card it drives before calling this function. The function ++ * does not return until any executing interrupts for this IRQ ++ * have completed. ++ * ++ * This function must not be called from interrupt context. ++ */ ++void free_irq(unsigned int irq, void *dev_id) ++{ ++ kfree(__free_irq(irq, dev_id)); + } + EXPORT_SYMBOL(free_irq); + +@@ -679,11 +745,12 @@ int request_irq(unsigned int irq, irq_ha + * the behavior is classified as "will not fix" so we need to + * start nudging drivers away from using that idiom. + */ +- if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) +- == (IRQF_SHARED|IRQF_DISABLED)) +- pr_warning("IRQ %d/%s: IRQF_DISABLED is not " +- "guaranteed on shared IRQs\n", +- irq, devname); ++ if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == ++ (IRQF_SHARED|IRQF_DISABLED)) { ++ pr_warning( ++ "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", ++ irq, devname); ++ } + + #ifdef CONFIG_LOCKDEP + /* +@@ -709,15 +776,13 @@ int request_irq(unsigned int irq, irq_ha + if (!handler) + return -EINVAL; + +- action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); ++ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = irqflags; +- cpus_clear(action->mask); + action->name = devname; +- action->next = NULL; + action->dev_id = dev_id; + + retval = __setup_irq(irq, desc, action); +@@ -735,14 +800,289 @@ int request_irq(unsigned int irq, irq_ha + unsigned long flags; + + disable_irq(irq); +- local_irq_save(flags); ++ local_irq_save_nort(flags); + + handler(irq, dev_id); + +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + enable_irq(irq); + } + #endif + return retval; + } + EXPORT_SYMBOL(request_irq); ++ ++#ifdef CONFIG_PREEMPT_HARDIRQS ++ ++int hardirq_preemption = 1; ++ ++EXPORT_SYMBOL(hardirq_preemption); ++ ++/* ++ * Real-Time Preemption depends on hardirq threading: ++ */ ++#ifndef CONFIG_PREEMPT_RT ++ ++static int __init hardirq_preempt_setup (char *str) ++{ ++ if (!strncmp(str, "off", 3)) ++ hardirq_preemption = 0; ++ else ++ get_option(&str, &hardirq_preemption); ++ if (!hardirq_preemption) ++ printk("turning off hardirq preemption!\n"); ++ ++ return 1; ++} ++ ++__setup("hardirq-preempt=", hardirq_preempt_setup); ++ ++#endif ++ ++/* ++ * threaded simple handler ++ */ ++static void thread_simple_irq(irq_desc_t *desc) ++{ ++ struct irqaction *action = desc->action; ++ unsigned int irq = desc->irq; ++ irqreturn_t action_ret; ++ ++ do { ++ if (!action || desc->depth) ++ break; ++ desc->status &= ~IRQ_PENDING; ++ spin_unlock(&desc->lock); ++ action_ret = handle_IRQ_event(irq, action); ++ cond_resched_hardirq_context(); ++ 
spin_lock_irq(&desc->lock); ++ if (!noirqdebug) ++ note_interrupt(irq, desc, action_ret); ++ } while (desc->status & IRQ_PENDING); ++ desc->status &= ~IRQ_INPROGRESS; ++} ++ ++/* ++ * threaded level type irq handler ++ */ ++static void thread_level_irq(irq_desc_t *desc) ++{ ++ unsigned int irq = desc->irq; ++ ++ thread_simple_irq(desc); ++ if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) ++ desc->chip->unmask(irq); ++} ++ ++/* ++ * threaded fasteoi type irq handler ++ */ ++static void thread_fasteoi_irq(irq_desc_t *desc) ++{ ++ unsigned int irq = desc->irq; ++ ++ thread_simple_irq(desc); ++ if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) ++ desc->chip->unmask(irq); ++} ++ ++/* ++ * threaded edge type IRQ handler ++ */ ++static void thread_edge_irq(irq_desc_t *desc) ++{ ++ unsigned int irq = desc->irq; ++ ++ do { ++ struct irqaction *action = desc->action; ++ irqreturn_t action_ret; ++ ++ if (unlikely(!action)) { ++ desc->status &= ~IRQ_INPROGRESS; ++ desc->chip->mask(irq); ++ return; ++ } ++ ++ /* ++ * When another irq arrived while we were handling ++ * one, we could have masked the irq. ++ * Renable it, if it was not disabled in meantime. ++ */ ++ if (unlikely(((desc->status & (IRQ_PENDING | IRQ_MASKED)) == ++ (IRQ_PENDING | IRQ_MASKED)) && !desc->depth)) ++ desc->chip->unmask(irq); ++ ++ desc->status &= ~IRQ_PENDING; ++ spin_unlock(&desc->lock); ++ action_ret = handle_IRQ_event(irq, action); ++ cond_resched_hardirq_context(); ++ spin_lock_irq(&desc->lock); ++ if (!noirqdebug) ++ note_interrupt(irq, desc, action_ret); ++ } while ((desc->status & IRQ_PENDING) && !desc->depth); ++ ++ desc->status &= ~IRQ_INPROGRESS; ++} ++ ++/* ++ * threaded edge type IRQ handler ++ */ ++static void thread_do_irq(irq_desc_t *desc) ++{ ++ unsigned int irq = desc->irq; ++ ++ do { ++ struct irqaction *action = desc->action; ++ irqreturn_t action_ret; ++ ++ if (unlikely(!action)) { ++ desc->status &= ~IRQ_INPROGRESS; ++ desc->chip->disable(irq); ++ return; ++ } ++ ++ desc->status &= ~IRQ_PENDING; ++ spin_unlock(&desc->lock); ++ action_ret = handle_IRQ_event(irq, action); ++ cond_resched_hardirq_context(); ++ spin_lock_irq(&desc->lock); ++ if (!noirqdebug) ++ note_interrupt(irq, desc, action_ret); ++ } while ((desc->status & IRQ_PENDING) && !desc->depth); ++ ++ desc->status &= ~IRQ_INPROGRESS; ++ desc->chip->end(irq); ++} ++ ++static void do_hardirq(struct irq_desc *desc) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&desc->lock, flags); ++ ++ if (!(desc->status & IRQ_INPROGRESS)) ++ goto out; ++ ++ if (desc->handle_irq == handle_simple_irq) ++ thread_simple_irq(desc); ++ else if (desc->handle_irq == handle_level_irq) ++ thread_level_irq(desc); ++ else if (desc->handle_irq == handle_fasteoi_irq) ++ thread_fasteoi_irq(desc); ++ else if (desc->handle_irq == handle_edge_irq) ++ thread_edge_irq(desc); ++ else ++ thread_do_irq(desc); ++ out: ++ spin_unlock_irqrestore(&desc->lock, flags); ++ ++ if (waitqueue_active(&desc->wait_for_handler)) ++ wake_up(&desc->wait_for_handler); ++} ++ ++extern asmlinkage void __do_softirq(void); ++ ++static int do_irqd(void * __desc) ++{ ++ struct sched_param param = { 0, }; ++ struct irq_desc *desc = __desc; ++ ++#ifdef CONFIG_SMP ++ set_cpus_allowed_ptr(current, desc->affinity); ++#endif ++ current->flags |= PF_NOFREEZE | PF_HARDIRQ; ++ ++ /* ++ * Set irq thread priority to SCHED_FIFO/50: ++ */ ++ param.sched_priority = MAX_USER_RT_PRIO/2; ++ ++ sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); ++ ++ while (!kthread_should_stop()) { ++ 
local_irq_disable_nort(); ++ set_current_state(TASK_INTERRUPTIBLE); ++#ifndef CONFIG_PREEMPT_RT ++ irq_enter(); ++#endif ++ do_hardirq(desc); ++#ifndef CONFIG_PREEMPT_RT ++ irq_exit(); ++#endif ++ local_irq_enable_nort(); ++ cond_resched(); ++#ifdef CONFIG_SMP ++ /* ++ * Did IRQ affinities change? ++ */ ++ if (!cpumask_equal(&current->cpus_allowed, desc->affinity)) ++ set_cpus_allowed_ptr(current, desc->affinity); ++#endif ++ schedule(); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ return 0; ++} ++ ++static int ok_to_create_irq_threads; ++ ++static int start_irq_thread(int irq, struct irq_desc *desc) ++{ ++ if (desc->thread || !ok_to_create_irq_threads) ++ return 0; ++ ++ init_waitqueue_head(&desc->wait_for_handler); ++ ++ desc->thread = kthread_create(do_irqd, desc, "IRQ-%d", irq); ++ if (!desc->thread) { ++ printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq); ++ return -ENOMEM; ++ } ++ ++ /* ++ * An interrupt may have come in before the thread pointer was ++ * stored in desc->thread; make sure the thread gets woken up in ++ * such a case: ++ */ ++ smp_mb(); ++ wake_up_process(desc->thread); ++ ++ return 0; ++} ++ ++/* ++ * Start hardirq threads for all IRQs that are registered already. ++ * ++ * New ones will be started at the time of IRQ setup from now on. ++ */ ++void __init init_hardirqs(void) ++{ ++ struct irq_desc *desc; ++ int irq; ++ ++ ok_to_create_irq_threads = 1; ++ ++ for_each_irq_desc(irq, desc) { ++ if (desc->action && !(desc->status & IRQ_NODELAY)) ++ start_irq_thread(irq, desc); ++ } ++} ++ ++#else ++ ++static int start_irq_thread(int irq, struct irq_desc *desc) ++{ ++ return 0; ++} ++ ++#endif ++ ++void __init early_init_hardirqs(void) ++{ ++ struct irq_desc *desc; ++ int i; ++ ++ for_each_irq_desc(i, desc) ++ init_waitqueue_head(&desc->wait_for_handler); ++} +Index: linux-2.6-tip/kernel/irq/migration.c +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/migration.c ++++ linux-2.6-tip/kernel/irq/migration.c +@@ -18,7 +18,7 @@ void move_masked_irq(int irq) + + desc->status &= ~IRQ_MOVE_PENDING; + +- if (unlikely(cpumask_empty(&desc->pending_mask))) ++ if (unlikely(cpumask_empty(desc->pending_mask))) + return; + + if (!desc->chip->set_affinity) +@@ -38,18 +38,19 @@ void move_masked_irq(int irq) + * For correct operation this depends on the caller + * masking the irqs. + */ +- if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) ++ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) + < nr_cpu_ids)) { +- cpumask_and(&desc->affinity, +- &desc->pending_mask, cpu_online_mask); +- desc->chip->set_affinity(irq, &desc->affinity); ++ cpumask_and(desc->affinity, ++ desc->pending_mask, cpu_online_mask); ++ desc->chip->set_affinity(irq, desc->affinity); + } +- cpumask_clear(&desc->pending_mask); ++ cpumask_clear(desc->pending_mask); + } + + void move_native_irq(int irq) + { + struct irq_desc *desc = irq_to_desc(irq); ++ int mask = 1; + + if (likely(!(desc->status & IRQ_MOVE_PENDING))) + return; +@@ -57,8 +58,17 @@ void move_native_irq(int irq) + if (unlikely(desc->status & IRQ_DISABLED)) + return; + +- desc->chip->mask(irq); ++ /* ++ * If the irq is already in progress, it should be masked. ++ * If we unmask it, we might cause an interrupt storm on RT. 
++ */ ++ if (unlikely(desc->status & IRQ_INPROGRESS)) ++ mask = 0; ++ ++ if (mask) ++ desc->chip->mask(irq); + move_masked_irq(irq); +- desc->chip->unmask(irq); ++ if (mask) ++ desc->chip->unmask(irq); + } + +Index: linux-2.6-tip/kernel/irq/numa_migrate.c +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/numa_migrate.c ++++ linux-2.6-tip/kernel/irq/numa_migrate.c +@@ -17,16 +17,11 @@ static void init_copy_kstat_irqs(struct + struct irq_desc *desc, + int cpu, int nr) + { +- unsigned long bytes; +- + init_kstat_irqs(desc, cpu, nr); + +- if (desc->kstat_irqs != old_desc->kstat_irqs) { +- /* Compute how many bytes we need per irq and allocate them */ +- bytes = nr * sizeof(unsigned int); +- +- memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes); +- } ++ if (desc->kstat_irqs != old_desc->kstat_irqs) ++ memcpy(desc->kstat_irqs, old_desc->kstat_irqs, ++ nr * sizeof(*desc->kstat_irqs)); + } + + static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) +@@ -38,15 +33,23 @@ static void free_kstat_irqs(struct irq_d + old_desc->kstat_irqs = NULL; + } + +-static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, ++static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) + { + memcpy(desc, old_desc, sizeof(struct irq_desc)); ++ if (!init_alloc_desc_masks(desc, cpu, false)) { ++ printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " ++ "for migration.\n", irq); ++ return false; ++ } + spin_lock_init(&desc->lock); ++ init_waitqueue_head(&desc->wait_for_handler); + desc->cpu = cpu; + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); ++ init_copy_desc_masks(old_desc, desc); + arch_init_copy_chip_data(old_desc, desc, cpu); ++ return true; + } + + static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) +@@ -76,12 +79,18 @@ static struct irq_desc *__real_move_irq_ + node = cpu_to_node(cpu); + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + if (!desc) { +- printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); ++ printk(KERN_ERR "irq %d: can not get new irq_desc " ++ "for migration.\n", irq); ++ /* still use old one */ ++ desc = old_desc; ++ goto out_unlock; ++ } ++ if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { + /* still use old one */ ++ kfree(desc); + desc = old_desc; + goto out_unlock; + } +- init_copy_one_irq_desc(irq, old_desc, desc, cpu); + + irq_desc_ptrs[irq] = desc; + spin_unlock_irqrestore(&sparse_irq_lock, flags); +Index: linux-2.6-tip/kernel/irq/proc.c +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/proc.c ++++ linux-2.6-tip/kernel/irq/proc.c +@@ -7,6 +7,8 @@ + */ + + #include ++#include ++#include + #include + #include + #include +@@ -20,11 +22,11 @@ static struct proc_dir_entry *root_irq_d + static int irq_affinity_proc_show(struct seq_file *m, void *v) + { + struct irq_desc *desc = irq_to_desc((long)m->private); +- const struct cpumask *mask = &desc->affinity; ++ const struct cpumask *mask = desc->affinity; + + #ifdef CONFIG_GENERIC_PENDING_IRQ + if (desc->status & IRQ_MOVE_PENDING) +- mask = &desc->pending_mask; ++ mask = desc->pending_mask; + #endif + seq_cpumask(m, mask); + seq_putc(m, '\n'); +@@ -116,6 +118,9 @@ static ssize_t default_affinity_write(st + goto out; + } + ++ /* create /proc/irq/prof_cpu_mask */ ++ create_prof_cpu_mask(root_irq_dir); ++ + /* + * Do not allow 
disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least +@@ -160,45 +165,6 @@ static int irq_spurious_read(char *page, + jiffies_to_msecs(desc->last_unhandled)); + } + +-#define MAX_NAMELEN 128 +- +-static int name_unique(unsigned int irq, struct irqaction *new_action) +-{ +- struct irq_desc *desc = irq_to_desc(irq); +- struct irqaction *action; +- unsigned long flags; +- int ret = 1; +- +- spin_lock_irqsave(&desc->lock, flags); +- for (action = desc->action ; action; action = action->next) { +- if ((action != new_action) && action->name && +- !strcmp(new_action->name, action->name)) { +- ret = 0; +- break; +- } +- } +- spin_unlock_irqrestore(&desc->lock, flags); +- return ret; +-} +- +-void register_handler_proc(unsigned int irq, struct irqaction *action) +-{ +- char name [MAX_NAMELEN]; +- struct irq_desc *desc = irq_to_desc(irq); +- +- if (!desc->dir || action->dir || !action->name || +- !name_unique(irq, action)) +- return; +- +- memset(name, 0, MAX_NAMELEN); +- snprintf(name, MAX_NAMELEN, "%s", action->name); +- +- /* create /proc/irq/1234/handler/ */ +- action->dir = proc_mkdir(name, desc->dir); +-} +- +-#undef MAX_NAMELEN +- + #define MAX_NAMELEN 10 + + void register_irq_proc(unsigned int irq, struct irq_desc *desc) +@@ -232,6 +198,8 @@ void register_irq_proc(unsigned int irq, + + void unregister_handler_proc(unsigned int irq, struct irqaction *action) + { ++ if (action->threaded) ++ remove_proc_entry(action->threaded->name, action->dir); + if (action->dir) { + struct irq_desc *desc = irq_to_desc(irq); + +@@ -247,6 +215,91 @@ static void register_default_affinity_pr + #endif + } + ++#ifndef CONFIG_PREEMPT_RT ++ ++static int threaded_read_proc(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ return sprintf(page, "%c\n", ++ ((struct irqaction *)data)->flags & IRQF_NODELAY ? 
'0' : '1'); ++} ++ ++static int threaded_write_proc(struct file *file, const char __user *buffer, ++ unsigned long count, void *data) ++{ ++ int c; ++ struct irqaction *action = data; ++ irq_desc_t *desc = irq_to_desc(action->irq); ++ ++ if (get_user(c, buffer)) ++ return -EFAULT; ++ if (c != '0' && c != '1') ++ return -EINVAL; ++ ++ spin_lock_irq(&desc->lock); ++ ++ if (c == '0') ++ action->flags |= IRQF_NODELAY; ++ if (c == '1') ++ action->flags &= ~IRQF_NODELAY; ++ recalculate_desc_flags(desc); ++ ++ spin_unlock_irq(&desc->lock); ++ ++ return 1; ++} ++ ++#endif ++ ++#define MAX_NAMELEN 128 ++ ++static int name_unique(unsigned int irq, struct irqaction *new_action) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ struct irqaction *action; ++ ++ for (action = desc->action ; action; action = action->next) ++ if ((action != new_action) && action->name && ++ !strcmp(new_action->name, action->name)) ++ return 0; ++ return 1; ++} ++ ++void register_handler_proc(unsigned int irq, struct irqaction *action) ++{ ++ char name [MAX_NAMELEN]; ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ if (!desc->dir || action->dir || !action->name || ++ !name_unique(irq, action)) ++ return; ++ ++ memset(name, 0, MAX_NAMELEN); ++ snprintf(name, MAX_NAMELEN, "%s", action->name); ++ ++ /* create /proc/irq/1234/handler/ */ ++ action->dir = proc_mkdir(name, desc->dir); ++ ++ if (!action->dir) ++ return; ++#ifndef CONFIG_PREEMPT_RT ++ { ++ struct proc_dir_entry *entry; ++ /* create /proc/irq/1234/handler/threaded */ ++ entry = create_proc_entry("threaded", 0600, action->dir); ++ if (!entry) ++ return; ++ entry->nlink = 1; ++ entry->data = (void *)action; ++ entry->read_proc = threaded_read_proc; ++ entry->write_proc = threaded_write_proc; ++ action->threaded = entry; ++ } ++#endif ++} ++ ++#undef MAX_NAMELEN ++ + void init_irq_proc(void) + { + unsigned int irq; +Index: linux-2.6-tip/kernel/irq/spurious.c +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/spurious.c ++++ linux-2.6-tip/kernel/irq/spurious.c +@@ -14,6 +14,11 @@ + #include + #include + ++#ifdef CONFIG_X86_IO_APIC ++# include ++# include ++#endif ++ + static int irqfixup __read_mostly; + + #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) +@@ -54,9 +59,8 @@ static int try_one_irq(int irq, struct i + } + action = action->next; + } +- local_irq_disable(); + /* Now clean up the flags */ +- spin_lock(&desc->lock); ++ spin_lock_irq(&desc->lock); + action = desc->action; + + /* +@@ -104,7 +108,7 @@ static int misrouted_irq(int irq) + return ok; + } + +-static void poll_spurious_irqs(unsigned long dummy) ++static void poll_all_shared_irqs(void) + { + struct irq_desc *desc; + int i; +@@ -123,11 +127,23 @@ static void poll_spurious_irqs(unsigned + + try_one_irq(i, desc); + } ++} ++ ++static void poll_spurious_irqs(unsigned long dummy) ++{ ++ poll_all_shared_irqs(); + + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + } + ++#ifdef CONFIG_DEBUG_SHIRQ ++void debug_poll_all_shared_irqs(void) ++{ ++ poll_all_shared_irqs(); ++} ++#endif ++ + /* + * If 99,900 of the previous 100,000 interrupts have not been handled + * then assume that the IRQ is stuck in some manner. 
Drop a diagnostic +@@ -246,6 +262,12 @@ void note_interrupt(unsigned int irq, st + * The interrupt is stuck + */ + __report_bad_irq(irq, desc, action_ret); ++#ifdef CONFIG_X86_IO_APIC ++ if (!sis_apic_bug) { ++ sis_apic_bug = 1; ++ printk(KERN_ERR "turning off IO-APIC fast mode.\n"); ++ } ++#else + /* + * Now kill the IRQ + */ +@@ -256,6 +278,7 @@ void note_interrupt(unsigned int irq, st + + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); ++#endif + } + desc->irqs_unhandled = 0; + } +@@ -276,6 +299,11 @@ MODULE_PARM_DESC(noirqdebug, "Disable ir + + static int __init irqfixup_setup(char *str) + { ++#ifdef CONFIG_PREEMPT_RT ++ printk(KERN_WARNING "irqfixup boot option not supported " ++ "w/ CONFIG_PREEMPT_RT\n"); ++ return 1; ++#endif + irqfixup = 1; + printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); + printk(KERN_WARNING "This may impact system performance.\n"); +@@ -289,6 +317,11 @@ MODULE_PARM_DESC("irqfixup", "0: No fixu + + static int __init irqpoll_setup(char *str) + { ++#ifdef CONFIG_PREEMPT_RT ++ printk(KERN_WARNING "irqpoll boot option not supported " ++ "w/ CONFIG_PREEMPT_RT\n"); ++ return 1; ++#endif + irqfixup = 2; + printk(KERN_WARNING "Misrouted IRQ fixup and polling support " + "enabled\n"); +Index: linux-2.6-tip/kernel/kexec.c +=================================================================== +--- linux-2.6-tip.orig/kernel/kexec.c ++++ linux-2.6-tip/kernel/kexec.c +@@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; +- elf_core_copy_regs(&prstatus.pr_reg, regs); ++ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +Index: linux-2.6-tip/kernel/kprobes.c +=================================================================== +--- linux-2.6-tip.orig/kernel/kprobes.c ++++ linux-2.6-tip/kernel/kprobes.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -72,10 +73,10 @@ static bool kprobe_enabled; + static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ + static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; + static struct { +- spinlock_t lock ____cacheline_aligned_in_smp; ++ raw_spinlock_t lock ____cacheline_aligned_in_smp; + } kretprobe_table_locks[KPROBE_TABLE_SIZE]; + +-static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) ++static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) + { + return &(kretprobe_table_locks[hash].lock); + } +@@ -414,7 +415,7 @@ void __kprobes kretprobe_hash_lock(struc + struct hlist_head **head, unsigned long *flags) + { + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); +- spinlock_t *hlist_lock; ++ raw_spinlock_t *hlist_lock; + + *head = &kretprobe_inst_table[hash]; + hlist_lock = kretprobe_table_lock_ptr(hash); +@@ -424,7 +425,7 @@ void __kprobes kretprobe_hash_lock(struc + static void __kprobes kretprobe_table_lock(unsigned long hash, + unsigned long *flags) + { +- spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); ++ raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + spin_lock_irqsave(hlist_lock, *flags); + } + +@@ -432,7 +433,7 @@ void __kprobes kretprobe_hash_unlock(str + unsigned long *flags) + { + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); +- spinlock_t *hlist_lock; ++ raw_spinlock_t *hlist_lock; + + hlist_lock = kretprobe_table_lock_ptr(hash); + 
spin_unlock_irqrestore(hlist_lock, *flags); +@@ -440,7 +441,7 @@ void __kprobes kretprobe_hash_unlock(str + + void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) + { +- spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); ++ raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + spin_unlock_irqrestore(hlist_lock, *flags); + } + +@@ -699,9 +700,10 @@ int __kprobes register_kprobe(struct kpr + goto out; + } + ++ mutex_lock(&text_mutex); + ret = arch_prepare_kprobe(p); + if (ret) +- goto out; ++ goto out_unlock_text; + + INIT_HLIST_NODE(&p->hlist); + hlist_add_head_rcu(&p->hlist, +@@ -710,6 +712,8 @@ int __kprobes register_kprobe(struct kpr + if (kprobe_enabled) + arch_arm_kprobe(p); + ++out_unlock_text: ++ mutex_unlock(&text_mutex); + out: + mutex_unlock(&kprobe_mutex); + +@@ -746,8 +750,11 @@ valid_p: + * enabled and not gone - otherwise, the breakpoint would + * already have been removed. We save on flushing icache. + */ +- if (kprobe_enabled && !kprobe_gone(old_p)) ++ if (kprobe_enabled && !kprobe_gone(old_p)) { ++ mutex_lock(&text_mutex); + arch_disarm_kprobe(p); ++ mutex_unlock(&text_mutex); ++ } + hlist_del_rcu(&old_p->hlist); + } else { + if (p->break_handler && !kprobe_gone(p)) +@@ -1278,12 +1285,14 @@ static void __kprobes enable_all_kprobes + if (kprobe_enabled) + goto already_enabled; + ++ mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (!kprobe_gone(p)) + arch_arm_kprobe(p); + } ++ mutex_unlock(&text_mutex); + + kprobe_enabled = true; + printk(KERN_INFO "Kprobes globally enabled\n"); +@@ -1308,6 +1317,7 @@ static void __kprobes disable_all_kprobe + + kprobe_enabled = false; + printk(KERN_INFO "Kprobes globally disabled\n"); ++ mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { +@@ -1316,6 +1326,7 @@ static void __kprobes disable_all_kprobe + } + } + ++ mutex_unlock(&text_mutex); + mutex_unlock(&kprobe_mutex); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); +Index: linux-2.6-tip/kernel/latencytop.c +=================================================================== +--- linux-2.6-tip.orig/kernel/latencytop.c ++++ linux-2.6-tip/kernel/latencytop.c +@@ -9,6 +9,44 @@ + * as published by the Free Software Foundation; version 2 + * of the License. + */ ++ ++/* ++ * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is ++ * used by the "latencytop" userspace tool. The latency that is tracked is not ++ * the 'traditional' interrupt latency (which is primarily caused by something ++ * else consuming CPU), but instead, it is the latency an application encounters ++ * because the kernel sleeps on its behalf for various reasons. ++ * ++ * This code tracks 2 levels of statistics: ++ * 1) System level latency ++ * 2) Per process latency ++ * ++ * The latency is stored in fixed sized data structures in an accumulated form; ++ * if the "same" latency cause is hit twice, this will be tracked as one entry ++ * in the data structure. Both the count, total accumulated latency and maximum ++ * latency are tracked in this data structure. When the fixed size structure is ++ * full, no new causes are tracked until the buffer is flushed by writing to ++ * the /proc file; the userspace tool does this on a regular basis. 
++ * ++ * A latency cause is identified by a stringified backtrace at the point that ++ * the scheduler gets invoked. The userland tool will use this string to ++ * identify the cause of the latency in human readable form. ++ * ++ * The information is exported via /proc/latency_stats and /proc//latency. ++ * These files look like this: ++ * ++ * Latency Top version : v0.1 ++ * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl ++ * | | | | ++ * | | | +----> the stringified backtrace ++ * | | +---------> The maximum latency for this entry in microseconds ++ * | +--------------> The accumulated latency for this entry (microseconds) ++ * +-------------------> The number of times this entry is hit ++ * ++ * (note: the average latency is the accumulated latency divided by the number ++ * of times) ++ */ ++ + #include + #include + #include +@@ -72,7 +110,7 @@ account_global_scheduler_latency(struct + firstnonnull = i; + continue; + } +- for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { ++ for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long record = lat->backtrace[q]; + + if (latency_record[i].backtrace[q] != record) { +@@ -101,31 +139,52 @@ account_global_scheduler_latency(struct + memcpy(&latency_record[i], lat, sizeof(struct latency_record)); + } + +-static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) ++/* ++ * Iterator to store a backtrace into a latency record entry ++ */ ++static inline void store_stacktrace(struct task_struct *tsk, ++ struct latency_record *lat) + { + struct stack_trace trace; + + memset(&trace, 0, sizeof(trace)); + trace.max_entries = LT_BACKTRACEDEPTH; + trace.entries = &lat->backtrace[0]; +- trace.skip = 0; + save_stack_trace_tsk(tsk, &trace); + } + ++/** ++ * __account_scheduler_latency - record an occured latency ++ * @tsk - the task struct of the task hitting the latency ++ * @usecs - the duration of the latency in microseconds ++ * @inter - 1 if the sleep was interruptible, 0 if uninterruptible ++ * ++ * This function is the main entry point for recording latency entries ++ * as called by the scheduler. ++ * ++ * This function has a few special cases to deal with normal 'non-latency' ++ * sleeps: specifically, interruptible sleep longer than 5 msec is skipped ++ * since this usually is caused by waiting for events via select() and co. ++ * ++ * Negative latencies (caused by time going backwards) are also explicitly ++ * skipped. ++ */ + void __sched +-account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) ++__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) + { + unsigned long flags; + int i, q; + struct latency_record lat; + +- if (!latencytop_enabled) +- return; +- + /* Long interruptible waits are generally user requested... 
*/ + if (inter && usecs > 5000) + return; + ++ /* Negative sleeps are time going backwards */ ++ /* Zero-time sleeps are non-interesting */ ++ if (usecs <= 0) ++ return; ++ + memset(&lat, 0, sizeof(lat)); + lat.count = 1; + lat.time = usecs; +@@ -143,12 +202,12 @@ account_scheduler_latency(struct task_st + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + +- for (i = 0; i < LT_SAVECOUNT ; i++) { ++ for (i = 0; i < LT_SAVECOUNT; i++) { + struct latency_record *mylat; + int same = 1; + + mylat = &tsk->latency_record[i]; +- for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { ++ for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long record = lat.backtrace[q]; + + if (mylat->backtrace[q] != record) { +@@ -186,7 +245,7 @@ static int lstats_show(struct seq_file * + for (i = 0; i < MAXLR; i++) { + if (latency_record[i].backtrace[0]) { + int q; +- seq_printf(m, "%i %li %li ", ++ seq_printf(m, "%i %lu %lu ", + latency_record[i].count, + latency_record[i].time, + latency_record[i].max); +@@ -223,7 +282,7 @@ static int lstats_open(struct inode *ino + return single_open(filp, lstats_show, NULL); + } + +-static struct file_operations lstats_fops = { ++static const struct file_operations lstats_fops = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, +@@ -236,4 +295,4 @@ static int __init init_lstats_procfs(voi + proc_create("latency_stats", 0644, NULL, &lstats_fops); + return 0; + } +-__initcall(init_lstats_procfs); ++device_initcall(init_lstats_procfs); +Index: linux-2.6-tip/kernel/lockdep.c +=================================================================== +--- linux-2.6-tip.orig/kernel/lockdep.c ++++ linux-2.6-tip/kernel/lockdep.c +@@ -41,6 +41,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -68,7 +70,7 @@ module_param(lock_stat, int, 0644); + * to use a raw spinlock - we really dont want the spinlock + * code to recurse back into the lockdep code... 
+ */ +-static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; ++static __raw_spinlock_t lockdep_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + + static int graph_lock(void) + { +@@ -310,12 +312,14 @@ EXPORT_SYMBOL(lockdep_on); + #if VERBOSE + # define HARDIRQ_VERBOSE 1 + # define SOFTIRQ_VERBOSE 1 ++# define RECLAIM_VERBOSE 1 + #else + # define HARDIRQ_VERBOSE 0 + # define SOFTIRQ_VERBOSE 0 ++# define RECLAIM_VERBOSE 0 + #endif + +-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE ++#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE + /* + * Quick filtering for interesting events: + */ +@@ -430,30 +434,24 @@ atomic_t nr_find_usage_forwards_checks; + atomic_t nr_find_usage_forwards_recursions; + atomic_t nr_find_usage_backwards_checks; + atomic_t nr_find_usage_backwards_recursions; +-# define debug_atomic_inc(ptr) atomic_inc(ptr) +-# define debug_atomic_dec(ptr) atomic_dec(ptr) +-# define debug_atomic_read(ptr) atomic_read(ptr) +-#else +-# define debug_atomic_inc(ptr) do { } while (0) +-# define debug_atomic_dec(ptr) do { } while (0) +-# define debug_atomic_read(ptr) 0 + #endif + + /* + * Locking printouts: + */ + ++#define __USAGE(__STATE) \ ++ [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \ ++ [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \ ++ [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\ ++ [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R", ++ + static const char *usage_str[] = + { +- [LOCK_USED] = "initial-use ", +- [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", +- [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", +- [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", +- [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W", +- [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R", +- [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", +- [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", +- [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", ++#define LOCKDEP_STATE(__STATE) __USAGE(__STATE) ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE ++ [LOCK_USED] = "INITIAL USE", + }; + + const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +@@ -461,46 +459,45 @@ const char * __get_key_name(struct lockd + return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); + } + +-void +-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) ++static inline unsigned long lock_flag(enum lock_usage_bit bit) + { +- *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; +- +- if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) +- *c1 = '+'; +- else +- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) +- *c1 = '-'; ++ return 1UL << bit; ++} + +- if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) +- *c2 = '+'; +- else +- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) +- *c2 = '-'; ++static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) ++{ ++ char c = '.'; + +- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) +- *c3 = '-'; +- if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { +- *c3 = '+'; +- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) +- *c3 = '?'; +- } +- +- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) +- *c4 = '-'; +- if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { +- *c4 = '+'; +- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) +- *c4 = '?'; ++ if (class->usage_mask & lock_flag(bit + 2)) ++ c = '+'; ++ if (class->usage_mask & lock_flag(bit)) { ++ c = '-'; ++ if (class->usage_mask & lock_flag(bit + 2)) ++ c = '?'; + } ++ ++ return c; ++} ++ ++void get_usage_chars(struct 
lock_class *class, char usage[LOCK_USAGE_CHARS]) ++{ ++ int i = 0; ++ ++#define LOCKDEP_STATE(__STATE) \ ++ usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \ ++ usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE ++ ++ usage[i] = '\0'; + } + + static void print_lock_name(struct lock_class *class) + { +- char str[KSYM_NAME_LEN], c1, c2, c3, c4; ++ char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; + const char *name; + +- get_usage_chars(class, &c1, &c2, &c3, &c4); ++ get_usage_chars(class, usage); + + name = class->name; + if (!name) { +@@ -513,7 +510,7 @@ static void print_lock_name(struct lock_ + if (class->subclass) + printk("/%d", class->subclass); + } +- printk("){%c%c%c%c}", c1, c2, c3, c4); ++ printk("){%s}", usage); + } + + static void print_lockdep_cache(struct lockdep_map *lock) +@@ -760,7 +757,7 @@ register_lock_class(struct lockdep_map * + */ + if (!static_obj(lock->key)) { + debug_locks_off(); +- printk("INFO: trying to register non-static key.\n"); ++ printk("INFO: trying to register non-static key %p.\n", lock->key); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); +@@ -796,6 +793,7 @@ register_lock_class(struct lockdep_map * + + printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); + printk("turning off the locking correctness validator.\n"); ++ dump_stack(); + return NULL; + } + class = lock_classes + nr_lock_classes++; +@@ -846,6 +844,21 @@ out_unlock_set: + return class; + } + ++#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_TRACE_IRQFLAGS) ++ ++#define RECURSION_LIMIT 40 ++ ++static int noinline print_infinite_recursion_bug(void) ++{ ++ if (!debug_locks_off_graph_unlock()) ++ return 0; ++ ++ WARN_ON(1); ++ ++ return 0; ++} ++#endif /* CONFIG_PROVE_LOCKING || CONFIG_TRACE_IRQFLAGS */ ++ + #ifdef CONFIG_PROVE_LOCKING + /* + * Allocate a lockdep entry. 
(assumes the graph_lock held, returns +@@ -859,6 +872,7 @@ static struct lock_list *alloc_list_entr + + printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); ++ dump_stack(); + return NULL; + } + return list_entries + nr_list_entries++; +@@ -976,18 +990,6 @@ static noinline int print_circular_bug_t + return 0; + } + +-#define RECURSION_LIMIT 40 +- +-static int noinline print_infinite_recursion_bug(void) +-{ +- if (!debug_locks_off_graph_unlock()) +- return 0; +- +- WARN_ON(1); +- +- return 0; +-} +- + unsigned long __lockdep_count_forward_deps(struct lock_class *class, + unsigned int depth) + { +@@ -1180,6 +1182,7 @@ find_usage_backwards(struct lock_class * + return 1; + } + ++#ifdef CONFIG_PROVE_LOCKING + static int + print_bad_irq_dependency(struct task_struct *curr, + struct held_lock *prev, +@@ -1240,6 +1243,7 @@ print_bad_irq_dependency(struct task_str + + return 0; + } ++#endif /* CONFIG_PROVE_LOCKING */ + + static int + check_usage(struct task_struct *curr, struct held_lock *prev, +@@ -1263,9 +1267,49 @@ check_usage(struct task_struct *curr, st + bit_backwards, bit_forwards, irqclass); + } + +-static int +-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, +- struct held_lock *next) ++static const char *state_names[] = { ++#define LOCKDEP_STATE(__STATE) \ ++ __stringify(__STATE), ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE ++}; ++ ++static const char *state_rnames[] = { ++#define LOCKDEP_STATE(__STATE) \ ++ __stringify(__STATE)"-READ", ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE ++}; ++ ++static inline const char *state_name(enum lock_usage_bit bit) ++{ ++ return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; ++} ++ ++static int exclusive_bit(int new_bit) ++{ ++ /* ++ * USED_IN ++ * USED_IN_READ ++ * ENABLED ++ * ENABLED_READ ++ * ++ * bit 0 - write/read ++ * bit 1 - used_in/enabled ++ * bit 2+ state ++ */ ++ ++ int state = new_bit & ~3; ++ int dir = new_bit & 2; ++ ++ /* ++ * keep state, bit flip the direction and strip read. 
++ */ ++ return state | (dir ^ 2); ++} ++ ++static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, ++ struct held_lock *next, enum lock_usage_bit bit) + { + /* + * Prove that the new dependency does not connect a hardirq-safe +@@ -1273,38 +1317,34 @@ check_prev_add_irq(struct task_struct *c + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ +- if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, +- LOCK_ENABLED_HARDIRQS, "hard")) ++ if (!check_usage(curr, prev, next, bit, ++ exclusive_bit(bit), state_name(bit))) + return 0; + ++ bit++; /* _READ */ ++ + /* + * Prove that the new dependency does not connect a hardirq-safe-read + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ +- if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, +- LOCK_ENABLED_HARDIRQS, "hard-read")) ++ if (!check_usage(curr, prev, next, bit, ++ exclusive_bit(bit), state_name(bit))) + return 0; + +- /* +- * Prove that the new dependency does not connect a softirq-safe +- * lock with a softirq-unsafe lock - to achieve this we search +- * the backwards-subgraph starting at , and the +- * forwards-subgraph starting at : +- */ +- if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, +- LOCK_ENABLED_SOFTIRQS, "soft")) +- return 0; +- /* +- * Prove that the new dependency does not connect a softirq-safe-read +- * lock with a softirq-unsafe lock - to achieve this we search +- * the backwards-subgraph starting at , and the +- * forwards-subgraph starting at : +- */ +- if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, +- LOCK_ENABLED_SOFTIRQS, "soft")) ++ return 1; ++} ++ ++static int ++check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, ++ struct held_lock *next) ++{ ++#define LOCKDEP_STATE(__STATE) \ ++ if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ + return 0; ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE + + return 1; + } +@@ -1649,6 +1689,7 @@ cache_hit: + + printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); + printk("turning off the locking correctness validator.\n"); ++ dump_stack(); + return 0; + } + chain = lock_chains + nr_lock_chains++; +@@ -1861,9 +1902,9 @@ print_irq_inversion_bug(struct task_stru + curr->comm, task_pid_nr(curr)); + print_lock(this); + if (forwards) +- printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); ++ printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); + else +- printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); ++ printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); + print_lock_name(other); + printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + +@@ -1933,7 +1974,7 @@ void print_irqtrace_events(struct task_s + print_ip_sym(curr->softirq_disable_ip); + } + +-static int hardirq_verbose(struct lock_class *class) ++static int HARDIRQ_verbose(struct lock_class *class) + { + #if HARDIRQ_VERBOSE + return class_filter(class); +@@ -1941,7 +1982,7 @@ static int hardirq_verbose(struct lock_c + return 0; + } + +-static int softirq_verbose(struct lock_class *class) ++static int SOFTIRQ_verbose(struct lock_class *class) + { + #if SOFTIRQ_VERBOSE + return class_filter(class); +@@ -1949,185 +1990,95 @@ static int softirq_verbose(struct lock_c + return 0; + } + ++static int RECLAIM_FS_verbose(struct lock_class *class) ++{ ++#if 
RECLAIM_VERBOSE ++ return class_filter(class); ++#endif ++ return 0; ++} ++ + #define STRICT_READ_CHECKS 1 + +-static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, ++static int (*state_verbose_f[])(struct lock_class *class) = { ++#define LOCKDEP_STATE(__STATE) \ ++ __STATE##_verbose, ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE ++}; ++ ++static inline int state_verbose(enum lock_usage_bit bit, ++ struct lock_class *class) ++{ ++ return state_verbose_f[bit >> 2](class); ++} ++ ++typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, ++ enum lock_usage_bit bit, const char *name); ++ ++static int ++mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) + { +- int ret = 1; ++ int excl_bit = exclusive_bit(new_bit); ++ int read = new_bit & 1; ++ int dir = new_bit & 2; + +- switch(new_bit) { +- case LOCK_USED_IN_HARDIRQ: +- if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) +- return 0; +- if (!valid_state(curr, this, new_bit, +- LOCK_ENABLED_HARDIRQS_READ)) +- return 0; +- /* +- * just marked it hardirq-safe, check that this lock +- * took no hardirq-unsafe lock in the past: +- */ +- if (!check_usage_forwards(curr, this, +- LOCK_ENABLED_HARDIRQS, "hard")) +- return 0; +-#if STRICT_READ_CHECKS +- /* +- * just marked it hardirq-safe, check that this lock +- * took no hardirq-unsafe-read lock in the past: +- */ +- if (!check_usage_forwards(curr, this, +- LOCK_ENABLED_HARDIRQS_READ, "hard-read")) +- return 0; +-#endif +- if (hardirq_verbose(hlock_class(this))) +- ret = 2; +- break; +- case LOCK_USED_IN_SOFTIRQ: +- if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) +- return 0; +- if (!valid_state(curr, this, new_bit, +- LOCK_ENABLED_SOFTIRQS_READ)) +- return 0; +- /* +- * just marked it softirq-safe, check that this lock +- * took no softirq-unsafe lock in the past: +- */ +- if (!check_usage_forwards(curr, this, +- LOCK_ENABLED_SOFTIRQS, "soft")) +- return 0; +-#if STRICT_READ_CHECKS +- /* +- * just marked it softirq-safe, check that this lock +- * took no softirq-unsafe-read lock in the past: +- */ +- if (!check_usage_forwards(curr, this, +- LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) +- return 0; +-#endif +- if (softirq_verbose(hlock_class(this))) +- ret = 2; +- break; +- case LOCK_USED_IN_HARDIRQ_READ: +- if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) +- return 0; +- /* +- * just marked it hardirq-read-safe, check that this lock +- * took no hardirq-unsafe lock in the past: +- */ +- if (!check_usage_forwards(curr, this, +- LOCK_ENABLED_HARDIRQS, "hard")) +- return 0; +- if (hardirq_verbose(hlock_class(this))) +- ret = 2; +- break; +- case LOCK_USED_IN_SOFTIRQ_READ: +- if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) +- return 0; +- /* +- * just marked it softirq-read-safe, check that this lock +- * took no softirq-unsafe lock in the past: +- */ +- if (!check_usage_forwards(curr, this, +- LOCK_ENABLED_SOFTIRQS, "soft")) +- return 0; +- if (softirq_verbose(hlock_class(this))) +- ret = 2; +- break; +- case LOCK_ENABLED_HARDIRQS: +- if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) +- return 0; +- if (!valid_state(curr, this, new_bit, +- LOCK_USED_IN_HARDIRQ_READ)) +- return 0; +- /* +- * just marked it hardirq-unsafe, check that no hardirq-safe +- * lock in the system ever took it in the past: +- */ +- if (!check_usage_backwards(curr, this, +- LOCK_USED_IN_HARDIRQ, "hard")) +- return 0; +-#if STRICT_READ_CHECKS +- /* +- * just marked it hardirq-unsafe, 
check that no +- * hardirq-safe-read lock in the system ever took +- * it in the past: +- */ +- if (!check_usage_backwards(curr, this, +- LOCK_USED_IN_HARDIRQ_READ, "hard-read")) +- return 0; +-#endif +- if (hardirq_verbose(hlock_class(this))) +- ret = 2; +- break; +- case LOCK_ENABLED_SOFTIRQS: +- if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) +- return 0; +- if (!valid_state(curr, this, new_bit, +- LOCK_USED_IN_SOFTIRQ_READ)) +- return 0; +- /* +- * just marked it softirq-unsafe, check that no softirq-safe +- * lock in the system ever took it in the past: +- */ +- if (!check_usage_backwards(curr, this, +- LOCK_USED_IN_SOFTIRQ, "soft")) +- return 0; +-#if STRICT_READ_CHECKS +- /* +- * just marked it softirq-unsafe, check that no +- * softirq-safe-read lock in the system ever took +- * it in the past: +- */ +- if (!check_usage_backwards(curr, this, +- LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) +- return 0; +-#endif +- if (softirq_verbose(hlock_class(this))) +- ret = 2; +- break; +- case LOCK_ENABLED_HARDIRQS_READ: +- if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) +- return 0; +-#if STRICT_READ_CHECKS +- /* +- * just marked it hardirq-read-unsafe, check that no +- * hardirq-safe lock in the system ever took it in the past: +- */ +- if (!check_usage_backwards(curr, this, +- LOCK_USED_IN_HARDIRQ, "hard")) +- return 0; +-#endif +- if (hardirq_verbose(hlock_class(this))) +- ret = 2; +- break; +- case LOCK_ENABLED_SOFTIRQS_READ: +- if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) ++ /* ++ * mark USED_IN has to look forwards -- to ensure no dependency ++ * has ENABLED state, which would allow recursion deadlocks. ++ * ++ * mark ENABLED has to look backwards -- to ensure no dependee ++ * has USED_IN state, which, again, would allow recursion deadlocks. ++ */ ++ check_usage_f usage = dir ? ++ check_usage_backwards : check_usage_forwards; ++ ++ /* ++ * Validate that this particular lock does not have conflicting ++ * usage states. ++ */ ++ if (!valid_state(curr, this, new_bit, excl_bit)) ++ return 0; ++ ++ /* ++ * Validate that the lock dependencies don't have conflicting usage ++ * states. 
++ */ ++ if ((!read || !dir || STRICT_READ_CHECKS) && ++ !usage(curr, this, excl_bit, state_name(new_bit & ~1))) ++ return 0; ++ ++ /* ++ * Check for read in write conflicts ++ */ ++ if (!read) { ++ if (!valid_state(curr, this, new_bit, excl_bit + 1)) + return 0; +-#if STRICT_READ_CHECKS +- /* +- * just marked it softirq-read-unsafe, check that no +- * softirq-safe lock in the system ever took it in the past: +- */ +- if (!check_usage_backwards(curr, this, +- LOCK_USED_IN_SOFTIRQ, "soft")) ++ ++ if (STRICT_READ_CHECKS && ++ !usage(curr, this, excl_bit + 1, ++ state_name(new_bit + 1))) + return 0; +-#endif +- if (softirq_verbose(hlock_class(this))) +- ret = 2; +- break; +- default: +- WARN_ON(1); +- break; + } + +- return ret; ++ if (state_verbose(new_bit, hlock_class(this))) ++ return 2; ++ ++ return 1; + } + ++enum mark_type { ++#define LOCKDEP_STATE(__STATE) __STATE, ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE ++}; ++ + /* + * Mark all held locks with a usage bit: + */ + static int +-mark_held_locks(struct task_struct *curr, int hardirq) ++mark_held_locks(struct task_struct *curr, enum mark_type mark) + { + enum lock_usage_bit usage_bit; + struct held_lock *hlock; +@@ -2136,17 +2087,12 @@ mark_held_locks(struct task_struct *curr + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + +- if (hardirq) { +- if (hlock->read) +- usage_bit = LOCK_ENABLED_HARDIRQS_READ; +- else +- usage_bit = LOCK_ENABLED_HARDIRQS; +- } else { +- if (hlock->read) +- usage_bit = LOCK_ENABLED_SOFTIRQS_READ; +- else +- usage_bit = LOCK_ENABLED_SOFTIRQS; +- } ++ usage_bit = 2 + (mark << 2); /* ENABLED */ ++ if (hlock->read) ++ usage_bit += 1; /* READ */ ++ ++ BUG_ON(usage_bit >= LOCK_USAGE_STATES); ++ + if (!mark_lock(curr, hlock, usage_bit)) + return 0; + } +@@ -2200,7 +2146,7 @@ void trace_hardirqs_on_caller(unsigned l + * We are going to turn hardirqs on, so set the + * usage bit for all held locks: + */ +- if (!mark_held_locks(curr, 1)) ++ if (!mark_held_locks(curr, HARDIRQ)) + return; + /* + * If we have softirqs enabled, then set the usage +@@ -2208,7 +2154,7 @@ void trace_hardirqs_on_caller(unsigned l + * this bit from being set before) + */ + if (curr->softirqs_enabled) +- if (!mark_held_locks(curr, 0)) ++ if (!mark_held_locks(curr, SOFTIRQ)) + return; + + curr->hardirq_enable_ip = ip; +@@ -2288,7 +2234,7 @@ void trace_softirqs_on(unsigned long ip) + * enabled too: + */ + if (curr->hardirqs_enabled) +- mark_held_locks(curr, 0); ++ mark_held_locks(curr, SOFTIRQ); + } + + /* +@@ -2317,6 +2263,48 @@ void trace_softirqs_off(unsigned long ip + debug_atomic_inc(&redundant_softirqs_off); + } + ++static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) ++{ ++ struct task_struct *curr = current; ++ ++ if (unlikely(!debug_locks)) ++ return; ++ ++ /* no reclaim without waiting on it */ ++ if (!(gfp_mask & __GFP_WAIT)) ++ return; ++ ++ /* this guy won't enter reclaim */ ++ if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) ++ return; ++ ++ /* We're only interested __GFP_FS allocations for now */ ++ if (!(gfp_mask & __GFP_FS)) ++ return; ++ ++ if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) ++ return; ++ ++ mark_held_locks(curr, RECLAIM_FS); ++} ++ ++static void check_flags(unsigned long flags); ++ ++void lockdep_trace_alloc(gfp_t gfp_mask) ++{ ++ unsigned long flags; ++ ++ if (unlikely(current->lockdep_recursion)) ++ return; ++ ++ raw_local_irq_save(flags); ++ check_flags(flags); ++ current->lockdep_recursion = 1; ++ __lockdep_trace_alloc(gfp_mask, 
flags); ++ current->lockdep_recursion = 0; ++ raw_local_irq_restore(flags); ++} ++ + static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) + { + /* +@@ -2345,19 +2333,35 @@ static int mark_irqflags(struct task_str + if (!hlock->hardirqs_off) { + if (hlock->read) { + if (!mark_lock(curr, hlock, +- LOCK_ENABLED_HARDIRQS_READ)) ++ LOCK_ENABLED_HARDIRQ_READ)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, +- LOCK_ENABLED_SOFTIRQS_READ)) ++ LOCK_ENABLED_SOFTIRQ_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, +- LOCK_ENABLED_HARDIRQS)) ++ LOCK_ENABLED_HARDIRQ)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, +- LOCK_ENABLED_SOFTIRQS)) ++ LOCK_ENABLED_SOFTIRQ)) ++ return 0; ++ } ++ } ++ ++ /* ++ * We reuse the irq context infrastructure more broadly as a general ++ * context checking code. This tests GFP_FS recursion (a lock taken ++ * during reclaim for a GFP_FS allocation is held over a GFP_FS ++ * allocation). ++ */ ++ if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { ++ if (hlock->read) { ++ if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) ++ return 0; ++ } else { ++ if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) + return 0; + } + } +@@ -2412,6 +2416,10 @@ static inline int separate_irq_context(s + return 0; + } + ++void lockdep_trace_alloc(gfp_t gfp_mask) ++{ ++} ++ + #endif + + /* +@@ -2445,14 +2453,13 @@ static int mark_lock(struct task_struct + return 0; + + switch (new_bit) { +- case LOCK_USED_IN_HARDIRQ: +- case LOCK_USED_IN_SOFTIRQ: +- case LOCK_USED_IN_HARDIRQ_READ: +- case LOCK_USED_IN_SOFTIRQ_READ: +- case LOCK_ENABLED_HARDIRQS: +- case LOCK_ENABLED_SOFTIRQS: +- case LOCK_ENABLED_HARDIRQS_READ: +- case LOCK_ENABLED_SOFTIRQS_READ: ++#define LOCKDEP_STATE(__STATE) \ ++ case LOCK_USED_IN_##__STATE: \ ++ case LOCK_USED_IN_##__STATE##_READ: \ ++ case LOCK_ENABLED_##__STATE: \ ++ case LOCK_ENABLED_##__STATE##_READ: ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) + return 0; +@@ -2542,6 +2549,7 @@ static int __lock_acquire(struct lockdep + debug_locks_off(); + printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); + printk("turning off the locking correctness validator.\n"); ++ dump_stack(); + return 0; + } + +@@ -2638,6 +2646,7 @@ static int __lock_acquire(struct lockdep + debug_locks_off(); + printk("BUG: MAX_LOCK_DEPTH too low!\n"); + printk("turning off the locking correctness validator.\n"); ++ dump_stack(); + return 0; + } + +@@ -2925,6 +2934,8 @@ void lock_set_class(struct lockdep_map * + } + EXPORT_SYMBOL_GPL(lock_set_class); + ++DEFINE_TRACE(lock_acquire); ++ + /* + * We are not always called with irqs disabled - do that here, + * and also avoid lockdep recursion: +@@ -2935,6 +2946,8 @@ void lock_acquire(struct lockdep_map *lo + { + unsigned long flags; + ++ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); ++ + if (unlikely(current->lockdep_recursion)) + return; + +@@ -2949,11 +2962,15 @@ void lock_acquire(struct lockdep_map *lo + } + EXPORT_SYMBOL_GPL(lock_acquire); + ++DEFINE_TRACE(lock_release); ++ + void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip) + { + unsigned long flags; + ++ trace_lock_release(lock, nested, ip); ++ + if (unlikely(current->lockdep_recursion)) + return; + +@@ -2966,6 +2983,16 @@ void lock_release(struct lockdep_map *lo + } + EXPORT_SYMBOL_GPL(lock_release); + ++void lockdep_set_current_reclaim_state(gfp_t gfp_mask) ++{ ++ 
current->lockdep_reclaim_gfp = gfp_mask; ++} ++ ++void lockdep_clear_current_reclaim_state(void) ++{ ++ current->lockdep_reclaim_gfp = 0; ++} ++ + #ifdef CONFIG_LOCK_STAT + static int + print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, +@@ -3092,10 +3119,14 @@ found_it: + lock->ip = ip; + } + ++DEFINE_TRACE(lock_contended); ++ + void lock_contended(struct lockdep_map *lock, unsigned long ip) + { + unsigned long flags; + ++ trace_lock_contended(lock, ip); ++ + if (unlikely(!lock_stat)) + return; + +@@ -3111,10 +3142,14 @@ void lock_contended(struct lockdep_map * + } + EXPORT_SYMBOL_GPL(lock_contended); + ++DEFINE_TRACE(lock_acquired); ++ + void lock_acquired(struct lockdep_map *lock, unsigned long ip) + { + unsigned long flags; + ++ trace_lock_acquired(lock, ip); ++ + if (unlikely(!lock_stat)) + return; + +Index: linux-2.6-tip/kernel/lockdep_internals.h +=================================================================== +--- linux-2.6-tip.orig/kernel/lockdep_internals.h ++++ linux-2.6-tip/kernel/lockdep_internals.h +@@ -7,6 +7,45 @@ + */ + + /* ++ * Lock-class usage-state bits: ++ */ ++enum lock_usage_bit { ++#define LOCKDEP_STATE(__STATE) \ ++ LOCK_USED_IN_##__STATE, \ ++ LOCK_USED_IN_##__STATE##_READ, \ ++ LOCK_ENABLED_##__STATE, \ ++ LOCK_ENABLED_##__STATE##_READ, ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE ++ LOCK_USED, ++ LOCK_USAGE_STATES ++}; ++ ++/* ++ * Usage-state bitmasks: ++ */ ++#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE), ++ ++enum { ++#define LOCKDEP_STATE(__STATE) \ ++ __LOCKF(USED_IN_##__STATE) \ ++ __LOCKF(USED_IN_##__STATE##_READ) \ ++ __LOCKF(ENABLED_##__STATE) \ ++ __LOCKF(ENABLED_##__STATE##_READ) ++#include "lockdep_states.h" ++#undef LOCKDEP_STATE ++ __LOCKF(USED) ++}; ++ ++#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) ++#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) ++ ++#define LOCKF_ENABLED_IRQ_READ \ ++ (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) ++#define LOCKF_USED_IN_IRQ_READ \ ++ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) ++ ++/* + * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies + * we track. 
+ * +@@ -31,8 +70,10 @@ + extern struct list_head all_lock_classes; + extern struct lock_chain lock_chains[]; + +-extern void +-get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); ++#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) ++ ++extern void get_usage_chars(struct lock_class *class, ++ char usage[LOCK_USAGE_CHARS]); + + extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); + +Index: linux-2.6-tip/kernel/lockdep_proc.c +=================================================================== +--- linux-2.6-tip.orig/kernel/lockdep_proc.c ++++ linux-2.6-tip/kernel/lockdep_proc.c +@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, vo + { + struct lock_class *class = v; + struct lock_list *entry; +- char c1, c2, c3, c4; ++ char usage[LOCK_USAGE_CHARS]; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, "all lock classes:\n"); +@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, vo + seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); + #endif + +- get_usage_chars(class, &c1, &c2, &c3, &c4); +- seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); ++ get_usage_chars(class, usage); ++ seq_printf(m, " %s", usage); + + seq_printf(m, ": "); + print_name(m, class); +@@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq + nr_uncategorized++; + if (class->usage_mask & LOCKF_USED_IN_IRQ) + nr_irq_safe++; +- if (class->usage_mask & LOCKF_ENABLED_IRQS) ++ if (class->usage_mask & LOCKF_ENABLED_IRQ) + nr_irq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) + nr_softirq_safe++; +- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) ++ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) + nr_softirq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) + nr_hardirq_safe++; +- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) ++ if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) + nr_hardirq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) + nr_irq_read_safe++; +- if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) ++ if (class->usage_mask & LOCKF_ENABLED_IRQ_READ) + nr_irq_read_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) + nr_softirq_read_safe++; +- if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) ++ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) + nr_softirq_read_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) + nr_hardirq_read_safe++; +- if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) ++ if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) + nr_hardirq_read_unsafe++; + + #ifdef CONFIG_PROVE_LOCKING +@@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m + static void seq_header(struct seq_file *m) + { + seq_printf(m, "lock_stat version 0.3\n"); ++ ++ if (unlikely(!debug_locks)) ++ seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); ++ + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " + "%14s %14s\n", +Index: linux-2.6-tip/kernel/lockdep_states.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/lockdep_states.h +@@ -0,0 +1,9 @@ ++/* ++ * Lockdep states, ++ * ++ * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever ++ * you add one, or come up with a nice dynamic solution. 
++ */ ++LOCKDEP_STATE(HARDIRQ) ++LOCKDEP_STATE(SOFTIRQ) ++LOCKDEP_STATE(RECLAIM_FS) +Index: linux-2.6-tip/kernel/marker.c +=================================================================== +--- linux-2.6-tip.orig/kernel/marker.c ++++ linux-2.6-tip/kernel/marker.c +@@ -432,7 +432,7 @@ static int remove_marker(const char *nam + { + struct hlist_head *head; + struct hlist_node *node; +- struct marker_entry *e; ++ struct marker_entry *uninitialized_var(e); + int found = 0; + size_t len = strlen(name) + 1; + u32 hash = jhash(name, len-1, 0); +Index: linux-2.6-tip/kernel/module.c +=================================================================== +--- linux-2.6-tip.orig/kernel/module.c ++++ linux-2.6-tip/kernel/module.c +@@ -51,6 +51,7 @@ + #include + #include + #include ++#include + + #if 0 + #define DEBUGP printk +@@ -366,6 +367,34 @@ static struct module *find_module(const + } + + #ifdef CONFIG_SMP ++ ++#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA ++ ++static void *percpu_modalloc(unsigned long size, unsigned long align, ++ const char *name) ++{ ++ void *ptr; ++ ++ if (align > PAGE_SIZE) { ++ printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", ++ name, align, PAGE_SIZE); ++ align = PAGE_SIZE; ++ } ++ ++ ptr = __alloc_reserved_percpu(size, align); ++ if (!ptr) ++ printk(KERN_WARNING ++ "Could not allocate %lu bytes percpu data\n", size); ++ return ptr; ++} ++ ++static void percpu_modfree(void *freeme) ++{ ++ free_percpu(freeme); ++} ++ ++#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ ++ + /* Number of blocks used and allocated. */ + static unsigned int pcpu_num_used, pcpu_num_allocated; + /* Size of each block. -ve means used. */ +@@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme) + } + } + +-static unsigned int find_pcpusec(Elf_Ehdr *hdr, +- Elf_Shdr *sechdrs, +- const char *secstrings) +-{ +- return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +-} +- +-static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +-{ +- int cpu; +- +- for_each_possible_cpu(cpu) +- memcpy(pcpudest + per_cpu_offset(cpu), from, size); +-} +- + static int percpu_modinit(void) + { + pcpu_num_used = 2; +@@ -513,7 +527,26 @@ static int percpu_modinit(void) + return 0; + } + __initcall(percpu_modinit); ++ ++#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ ++ ++static unsigned int find_pcpusec(Elf_Ehdr *hdr, ++ Elf_Shdr *sechdrs, ++ const char *secstrings) ++{ ++ return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); ++} ++ ++static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ memcpy(pcpudest + per_cpu_offset(cpu), from, size); ++} ++ + #else /* ... !CONFIG_SMP */ ++ + static inline void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) + { +@@ -535,6 +568,7 @@ static inline void percpu_modcopy(void * + /* pcpusec should be 0, and size of that section should be 0. */ + BUG_ON(size != 0); + } ++ + #endif /* CONFIG_SMP */ + + #define MODINFO_ATTR(field) \ +@@ -2288,8 +2322,8 @@ static noinline struct module *load_modu + ftrace_release(mod->module_core, mod->core_size); + free_unload: + module_unload_free(mod); +- free_init: + #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) ++ free_init: + percpu_modfree(mod->refptr); + #endif + module_free(mod, mod->module_init); +@@ -2737,7 +2771,7 @@ int is_module_address(unsigned long addr + + + /* Is this a valid kernel address? 
*/ +-__notrace_funcgraph struct module *__module_text_address(unsigned long addr) ++struct module *__module_text_address(unsigned long addr) + { + struct module *mod; + +Index: linux-2.6-tip/kernel/mutex-debug.c +=================================================================== +--- linux-2.6-tip.orig/kernel/mutex-debug.c ++++ linux-2.6-tip/kernel/mutex-debug.c +@@ -26,11 +26,6 @@ + /* + * Must be called with lock->wait_lock held. + */ +-void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) +-{ +- lock->owner = new_owner; +-} +- + void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) + { + memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); +@@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex + + /* Mark the current thread as blocked on the lock: */ + ti->task->blocked_on = waiter; +- waiter->lock = lock; + } + + void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +@@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lo + DEBUG_LOCKS_WARN_ON(lock->magic != lock); + DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); +- DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); ++ mutex_clear_owner(lock); + } + + void debug_mutex_init(struct mutex *lock, const char *name, +@@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); + #endif +- lock->owner = NULL; + lock->magic = lock; + } + +Index: linux-2.6-tip/kernel/mutex-debug.h +=================================================================== +--- linux-2.6-tip.orig/kernel/mutex-debug.h ++++ linux-2.6-tip/kernel/mutex-debug.h +@@ -13,14 +13,6 @@ + /* + * This must be called with lock->wait_lock held. + */ +-extern void +-debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); +- +-static inline void debug_mutex_clear_owner(struct mutex *lock) +-{ +- lock->owner = NULL; +-} +- + extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); + extern void debug_mutex_wake_waiter(struct mutex *lock, +@@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mu + extern void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); + ++static inline void mutex_set_owner(struct mutex *lock) ++{ ++ lock->owner = current_thread_info(); ++} ++ ++static inline void mutex_clear_owner(struct mutex *lock) ++{ ++ lock->owner = NULL; ++} ++ + #define spin_lock_mutex(lock, flags) \ + do { \ + struct mutex *l = container_of(lock, struct mutex, wait_lock); \ +Index: linux-2.6-tip/kernel/mutex.c +=================================================================== +--- linux-2.6-tip.orig/kernel/mutex.c ++++ linux-2.6-tip/kernel/mutex.c +@@ -10,6 +10,11 @@ + * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and + * David Howells for suggestions and improvements. + * ++ * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline ++ * from the -rt tree, where it was originally implemented for rtmutexes ++ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale ++ * and Sven Dietrich. ++ * + * Also see Documentation/mutex-design.txt. 
+ */ + #include +@@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const c + atomic_set(&lock->count, 1); + spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); ++ mutex_clear_owner(lock); + + debug_mutex_init(lock, name, key); + } +@@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mu + * 'unlocked' into 'locked' state. + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); ++ mutex_set_owner(lock); + } + + EXPORT_SYMBOL(mutex_lock); +@@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex * + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ ++#ifndef CONFIG_DEBUG_MUTEXES ++ /* ++ * When debugging is enabled we must not clear the owner before time, ++ * the slow path will always be taken, and that clears the owner field ++ * after verifying that it was indeed current. ++ */ ++ mutex_clear_owner(lock); ++#endif + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); + } + +@@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, + { + struct task_struct *task = current; + struct mutex_waiter waiter; +- unsigned int old_val; + unsigned long flags; + ++ preempt_disable(); ++ mutex_acquire(&lock->dep_map, subclass, 0, ip); ++#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) ++ /* ++ * Optimistic spinning. ++ * ++ * We try to spin for acquisition when we find that there are no ++ * pending waiters and the lock owner is currently running on a ++ * (different) CPU. ++ * ++ * The rationale is that if the lock owner is running, it is likely to ++ * release the lock soon. ++ * ++ * Since this needs the lock owner, and this mutex implementation ++ * doesn't track the owner atomically in the lock field, we need to ++ * track it non-atomically. ++ * ++ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock ++ * to serialize everything. ++ */ ++ ++ for (;;) { ++ struct thread_info *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. ++ */ ++ owner = ACCESS_ONCE(lock->owner); ++ if (owner && !mutex_spin_on_owner(lock, owner)) ++ break; ++ ++ if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { ++ lock_acquired(&lock->dep_map, ip); ++ mutex_set_owner(lock); ++ preempt_enable(); ++ return 0; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. 
++ */ ++ cpu_relax(); ++ } ++#endif + spin_lock_mutex(&lock->wait_lock, flags); + + debug_mutex_lock_common(lock, &waiter); +- mutex_acquire(&lock->dep_map, subclass, 0, ip); + debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + waiter.task = task; + +- old_val = atomic_xchg(&lock->count, -1); +- if (old_val == 1) ++ if (atomic_xchg(&lock->count, -1) == 1) + goto done; + + lock_contended(&lock->dep_map, ip); +@@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, + * that when we release the lock, we properly wake up the + * other waiters: + */ +- old_val = atomic_xchg(&lock->count, -1); +- if (old_val == 1) ++ if (atomic_xchg(&lock->count, -1) == 1) + break; + + /* +@@ -173,21 +241,28 @@ __mutex_lock_common(struct mutex *lock, + spin_unlock_mutex(&lock->wait_lock, flags); + + debug_mutex_free_waiter(&waiter); ++ preempt_enable(); + return -EINTR; + } + __set_task_state(task, state); + + /* didnt get the lock, go to sleep: */ + spin_unlock_mutex(&lock->wait_lock, flags); +- schedule(); ++ ++ local_irq_disable(); ++ __preempt_enable_no_resched(); ++ __schedule(); ++ preempt_disable(); ++ local_irq_enable(); ++ + spin_lock_mutex(&lock->wait_lock, flags); + } + + done: + lock_acquired(&lock->dep_map, ip); + /* got the lock - rejoice! */ +- mutex_remove_waiter(lock, &waiter, task_thread_info(task)); +- debug_mutex_set_owner(lock, task_thread_info(task)); ++ mutex_remove_waiter(lock, &waiter, current_thread_info()); ++ mutex_set_owner(lock); + + /* set it to 0 if there are no waiters left: */ + if (likely(list_empty(&lock->wait_list))) +@@ -196,6 +271,7 @@ done: + spin_unlock_mutex(&lock->wait_lock, flags); + + debug_mutex_free_waiter(&waiter); ++ preempt_enable(); + + return 0; + } +@@ -222,7 +298,8 @@ int __sched + mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) + { + might_sleep(); +- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); ++ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, ++ subclass, _RET_IP_); + } + + EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); +@@ -260,8 +337,6 @@ __mutex_unlock_common_slowpath(atomic_t + wake_up_process(waiter->task); + } + +- debug_mutex_clear_owner(lock); +- + spin_unlock_mutex(&lock->wait_lock, flags); + } + +@@ -298,18 +373,30 @@ __mutex_lock_interruptible_slowpath(atom + */ + int __sched mutex_lock_interruptible(struct mutex *lock) + { ++ int ret; ++ + might_sleep(); +- return __mutex_fastpath_lock_retval ++ ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); ++ if (!ret) ++ mutex_set_owner(lock); ++ ++ return ret; + } + + EXPORT_SYMBOL(mutex_lock_interruptible); + + int __sched mutex_lock_killable(struct mutex *lock) + { ++ int ret; ++ + might_sleep(); +- return __mutex_fastpath_lock_retval ++ ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_killable_slowpath); ++ if (!ret) ++ mutex_set_owner(lock); ++ ++ return ret; + } + EXPORT_SYMBOL(mutex_lock_killable); + +@@ -352,9 +439,10 @@ static inline int __mutex_trylock_slowpa + + prev = atomic_xchg(&lock->count, -1); + if (likely(prev == 1)) { +- debug_mutex_set_owner(lock, current_thread_info()); ++ mutex_set_owner(lock); + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + } ++ + /* Set it back to 0 if there are no waiters: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); +@@ -380,8 +468,13 @@ static inline int __mutex_trylock_slowpa 
+ */ + int __sched mutex_trylock(struct mutex *lock) + { +- return __mutex_fastpath_trylock(&lock->count, +- __mutex_trylock_slowpath); ++ int ret; ++ ++ ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); ++ if (ret) ++ mutex_set_owner(lock); ++ ++ return ret; + } + + EXPORT_SYMBOL(mutex_trylock); +Index: linux-2.6-tip/kernel/mutex.h +=================================================================== +--- linux-2.6-tip.orig/kernel/mutex.h ++++ linux-2.6-tip/kernel/mutex.h +@@ -16,8 +16,26 @@ + #define mutex_remove_waiter(lock, waiter, ti) \ + __list_del((waiter)->list.prev, (waiter)->list.next) + +-#define debug_mutex_set_owner(lock, new_owner) do { } while (0) +-#define debug_mutex_clear_owner(lock) do { } while (0) ++#ifdef CONFIG_SMP ++static inline void mutex_set_owner(struct mutex *lock) ++{ ++ lock->owner = current_thread_info(); ++} ++ ++static inline void mutex_clear_owner(struct mutex *lock) ++{ ++ lock->owner = NULL; ++} ++#else ++static inline void mutex_set_owner(struct mutex *lock) ++{ ++} ++ ++static inline void mutex_clear_owner(struct mutex *lock) ++{ ++} ++#endif ++ + #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) + #define debug_mutex_free_waiter(waiter) do { } while (0) + #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +Index: linux-2.6-tip/kernel/panic.c +=================================================================== +--- linux-2.6-tip.orig/kernel/panic.c ++++ linux-2.6-tip/kernel/panic.c +@@ -8,19 +8,19 @@ + * This function is used through-out the kernel (including mm and fs) + * to indicate a major problem. + */ ++#include ++#include ++#include ++#include + #include +-#include +-#include ++#include + #include +-#include +-#include ++#include ++#include ++#include + #include +-#include ++#include + #include +-#include +-#include +-#include +-#include + #include + + int panic_on_oops; +@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink); + * + * This function never returns. + */ +- + NORET_TYPE void panic(const char * fmt, ...) + { +- long i; + static char buf[1024]; + va_list args; +-#if defined(CONFIG_S390) +- unsigned long caller = (unsigned long) __builtin_return_address(0); +-#endif ++ long i; + + /* +- * It's possible to come here directly from a panic-assertion and not +- * have preempt disabled. Some functions called from here want ++ * It's possible to come here directly from a panic-assertion and ++ * not have preempt disabled. Some functions called from here want + * preempt to be disabled. No point enabling it later though... + */ + preempt_disable(); +@@ -74,7 +70,9 @@ NORET_TYPE void panic(const char * fmt, + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); +- bust_spinlocks(0); ++#ifdef CONFIG_DEBUG_BUGVERBOSE ++ dump_stack(); ++#endif + + /* + * If we have crashed and we have a crash kernel loaded let it handle +@@ -83,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, + */ + crash_kexec(NULL); + +-#ifdef CONFIG_SMP + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a panic + * situation. + */ + smp_send_stop(); +-#endif + + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + +@@ -99,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, + + if (panic_timeout > 0) { + /* +- * Delay timeout seconds before rebooting the machine. +- * We can't use the "normal" timers since we just panicked.. 
+- */ +- printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); ++ * Delay timeout seconds before rebooting the machine. ++ * We can't use the "normal" timers since we just panicked. ++ */ ++ printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); ++ + for (i = 0; i < panic_timeout*1000; ) { + touch_nmi_watchdog(); + i += panic_blink(i); + mdelay(1); + i++; + } +- /* This will not be a clean reboot, with everything +- * shutting down. But if there is a chance of +- * rebooting the system it will be rebooted. ++ /* ++ * This will not be a clean reboot, with everything ++ * shutting down. But if there is a chance of ++ * rebooting the system it will be rebooted. + */ + emergency_restart(); + } +@@ -124,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, + } + #endif + #if defined(CONFIG_S390) +- disabled_wait(caller); ++ { ++ unsigned long caller; ++ ++ caller = (unsigned long)__builtin_return_address(0); ++ disabled_wait(caller); ++ } + #endif + local_irq_enable(); +- for (i = 0;;) { ++ for (i = 0; ; ) { + touch_softlockup_watchdog(); + i += panic_blink(i); + mdelay(1); + i++; + } ++ bust_spinlocks(0); + } + + EXPORT_SYMBOL(panic); + + + struct tnt { +- u8 bit; +- char true; +- char false; ++ u8 bit; ++ char true; ++ char false; + }; + + static const struct tnt tnts[] = { +- { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, +- { TAINT_FORCED_MODULE, 'F', ' ' }, +- { TAINT_UNSAFE_SMP, 'S', ' ' }, +- { TAINT_FORCED_RMMOD, 'R', ' ' }, +- { TAINT_MACHINE_CHECK, 'M', ' ' }, +- { TAINT_BAD_PAGE, 'B', ' ' }, +- { TAINT_USER, 'U', ' ' }, +- { TAINT_DIE, 'D', ' ' }, +- { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, +- { TAINT_WARN, 'W', ' ' }, +- { TAINT_CRAP, 'C', ' ' }, ++ { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, ++ { TAINT_FORCED_MODULE, 'F', ' ' }, ++ { TAINT_UNSAFE_SMP, 'S', ' ' }, ++ { TAINT_FORCED_RMMOD, 'R', ' ' }, ++ { TAINT_MACHINE_CHECK, 'M', ' ' }, ++ { TAINT_BAD_PAGE, 'B', ' ' }, ++ { TAINT_USER, 'U', ' ' }, ++ { TAINT_DIE, 'D', ' ' }, ++ { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, ++ { TAINT_WARN, 'W', ' ' }, ++ { TAINT_CRAP, 'C', ' ' }, + }; + + /** +@@ -192,7 +196,8 @@ const char *print_tainted(void) + *s = 0; + } else + snprintf(buf, sizeof(buf), "Not tainted"); +- return(buf); ++ ++ return buf; + } + + int test_taint(unsigned flag) +@@ -208,7 +213,8 @@ unsigned long get_taint(void) + + void add_taint(unsigned flag) + { +- debug_locks = 0; /* can't trust the integrity of the kernel anymore */ ++ /* can't trust the integrity of the kernel anymore: */ ++ debug_locks = 0; + set_bit(flag, &tainted_mask); + } + EXPORT_SYMBOL(add_taint); +@@ -263,8 +269,8 @@ static void do_oops_enter_exit(void) + } + + /* +- * Return true if the calling CPU is allowed to print oops-related info. This +- * is a bit racy.. ++ * Return true if the calling CPU is allowed to print oops-related info. ++ * This is a bit racy.. + */ + int oops_may_print(void) + { +@@ -273,20 +279,23 @@ int oops_may_print(void) + + /* + * Called when the architecture enters its oops handler, before it prints +- * anything. If this is the first CPU to oops, and it's oopsing the first time +- * then let it proceed. ++ * anything. If this is the first CPU to oops, and it's oopsing the first ++ * time then let it proceed. + * +- * This is all enabled by the pause_on_oops kernel boot option. We do all this +- * to ensure that oopses don't scroll off the screen. It has the side-effect +- * of preventing later-oopsing CPUs from mucking up the display, too. ++ * This is all enabled by the pause_on_oops kernel boot option. 
We do all ++ * this to ensure that oopses don't scroll off the screen. It has the ++ * side-effect of preventing later-oopsing CPUs from mucking up the display, ++ * too. + * +- * It turns out that the CPU which is allowed to print ends up pausing for the +- * right duration, whereas all the other CPUs pause for twice as long: once in +- * oops_enter(), once in oops_exit(). ++ * It turns out that the CPU which is allowed to print ends up pausing for ++ * the right duration, whereas all the other CPUs pause for twice as long: ++ * once in oops_enter(), once in oops_exit(). + */ + void oops_enter(void) + { +- debug_locks_off(); /* can't trust the integrity of the kernel anymore */ ++ tracing_off(); ++ /* can't trust the integrity of the kernel anymore: */ ++ debug_locks_off(); + do_oops_enter_exit(); + } + +@@ -355,15 +364,18 @@ EXPORT_SYMBOL(warn_slowpath); + #endif + + #ifdef CONFIG_CC_STACKPROTECTOR ++ + /* + * Called when gcc's -fstack-protector feature is used, and + * gcc detects corruption of the on-stack canary value + */ + void __stack_chk_fail(void) + { +- panic("stack-protector: Kernel stack is corrupted"); ++ panic("stack-protector: Kernel stack is corrupted in: %p\n", ++ __builtin_return_address(0)); + } + EXPORT_SYMBOL(__stack_chk_fail); ++ + #endif + + core_param(panic, panic_timeout, int, 0644); +Index: linux-2.6-tip/kernel/perf_counter.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/perf_counter.c +@@ -0,0 +1,2787 @@ ++/* ++ * Performance counter core code ++ * ++ * Copyright(C) 2008 Thomas Gleixner ++ * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar ++ * ++ * ++ * For licensing details see kernel-base/COPYING ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++/* ++ * Each CPU has a list of per CPU counters: ++ */ ++DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); ++ ++int perf_max_counters __read_mostly = 1; ++static int perf_reserved_percpu __read_mostly; ++static int perf_overcommit __read_mostly = 1; ++ ++/* ++ * Mutex for (sysadmin-configurable) counter reservations: ++ */ ++static DEFINE_MUTEX(perf_resource_mutex); ++ ++/* ++ * Architecture provided APIs - weak aliases: ++ */ ++extern __weak const struct hw_perf_counter_ops * ++hw_perf_counter_init(struct perf_counter *counter) ++{ ++ return NULL; ++} ++ ++u64 __weak hw_perf_save_disable(void) { return 0; } ++void __weak hw_perf_restore(u64 ctrl) { barrier(); } ++void __weak hw_perf_counter_setup(int cpu) { barrier(); } ++int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, ++ struct perf_cpu_context *cpuctx, ++ struct perf_counter_context *ctx, int cpu) ++{ ++ return 0; ++} ++ ++void __weak perf_counter_print_debug(void) { } ++ ++static void ++list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ++{ ++ struct perf_counter *group_leader = counter->group_leader; ++ ++ /* ++ * Depending on whether it is a standalone or sibling counter, ++ * add it straight to the context's counter list, or to the group ++ * leader's sibling list: ++ */ ++ if (counter->group_leader == counter) ++ list_add_tail(&counter->list_entry, &ctx->counter_list); ++ else { ++ list_add_tail(&counter->list_entry, &group_leader->sibling_list); ++ group_leader->nr_siblings++; ++ } ++ ++ list_add_rcu(&counter->event_entry, &ctx->event_list); ++} ++ ++static void 
++list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ++{ ++ struct perf_counter *sibling, *tmp; ++ ++ list_del_init(&counter->list_entry); ++ list_del_rcu(&counter->event_entry); ++ ++ if (counter->group_leader != counter) ++ counter->group_leader->nr_siblings--; ++ ++ /* ++ * If this was a group counter with sibling counters then ++ * upgrade the siblings to singleton counters by adding them ++ * to the context list directly: ++ */ ++ list_for_each_entry_safe(sibling, tmp, ++ &counter->sibling_list, list_entry) { ++ ++ list_move_tail(&sibling->list_entry, &ctx->counter_list); ++ sibling->group_leader = sibling; ++ } ++} ++ ++static void ++counter_sched_out(struct perf_counter *counter, ++ struct perf_cpu_context *cpuctx, ++ struct perf_counter_context *ctx) ++{ ++ if (counter->state != PERF_COUNTER_STATE_ACTIVE) ++ return; ++ ++ counter->state = PERF_COUNTER_STATE_INACTIVE; ++ counter->tstamp_stopped = ctx->time_now; ++ counter->hw_ops->disable(counter); ++ counter->oncpu = -1; ++ ++ if (!is_software_counter(counter)) ++ cpuctx->active_oncpu--; ++ ctx->nr_active--; ++ if (counter->hw_event.exclusive || !cpuctx->active_oncpu) ++ cpuctx->exclusive = 0; ++} ++ ++static void ++group_sched_out(struct perf_counter *group_counter, ++ struct perf_cpu_context *cpuctx, ++ struct perf_counter_context *ctx) ++{ ++ struct perf_counter *counter; ++ ++ if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) ++ return; ++ ++ counter_sched_out(group_counter, cpuctx, ctx); ++ ++ /* ++ * Schedule out siblings (if any): ++ */ ++ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) ++ counter_sched_out(counter, cpuctx, ctx); ++ ++ if (group_counter->hw_event.exclusive) ++ cpuctx->exclusive = 0; ++} ++ ++/* ++ * Cross CPU call to remove a performance counter ++ * ++ * We disable the counter on the hardware level first. After that we ++ * remove it from the context list. ++ */ ++static void __perf_counter_remove_from_context(void *info) ++{ ++ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); ++ struct perf_counter *counter = info; ++ struct perf_counter_context *ctx = counter->ctx; ++ unsigned long flags; ++ u64 perf_flags; ++ ++ /* ++ * If this is a task context, we need to check whether it is ++ * the current task context of this cpu. If not it has been ++ * scheduled out before the smp call arrived. ++ */ ++ if (ctx->task && cpuctx->task_ctx != ctx) ++ return; ++ ++ curr_rq_lock_irq_save(&flags); ++ spin_lock(&ctx->lock); ++ ++ counter_sched_out(counter, cpuctx, ctx); ++ ++ counter->task = NULL; ++ ctx->nr_counters--; ++ ++ /* ++ * Protect the list operation against NMI by disabling the ++ * counters on a global level. NOP for non NMI based counters. ++ */ ++ perf_flags = hw_perf_save_disable(); ++ list_del_counter(counter, ctx); ++ hw_perf_restore(perf_flags); ++ ++ if (!ctx->task) { ++ /* ++ * Allow more per task counters with respect to the ++ * reservation: ++ */ ++ cpuctx->max_pertask = ++ min(perf_max_counters - ctx->nr_counters, ++ perf_max_counters - perf_reserved_percpu); ++ } ++ ++ spin_unlock(&ctx->lock); ++ curr_rq_unlock_irq_restore(&flags); ++} ++ ++ ++/* ++ * Remove the counter from a task's (or a CPU's) list of counters. ++ * ++ * Must be called with counter->mutex and ctx->mutex held. ++ * ++ * CPU counters are removed with a smp call. For task counters we only ++ * call when the task is on a CPU. 
++ */ ++static void perf_counter_remove_from_context(struct perf_counter *counter) ++{ ++ struct perf_counter_context *ctx = counter->ctx; ++ struct task_struct *task = ctx->task; ++ ++ if (!task) { ++ /* ++ * Per cpu counters are removed via an smp call and ++ * the removal is always sucessful. ++ */ ++ smp_call_function_single(counter->cpu, ++ __perf_counter_remove_from_context, ++ counter, 1); ++ return; ++ } ++ ++retry: ++ task_oncpu_function_call(task, __perf_counter_remove_from_context, ++ counter); ++ ++ spin_lock_irq(&ctx->lock); ++ /* ++ * If the context is active we need to retry the smp call. ++ */ ++ if (ctx->nr_active && !list_empty(&counter->list_entry)) { ++ spin_unlock_irq(&ctx->lock); ++ goto retry; ++ } ++ ++ /* ++ * The lock prevents that this context is scheduled in so we ++ * can remove the counter safely, if the call above did not ++ * succeed. ++ */ ++ if (!list_empty(&counter->list_entry)) { ++ ctx->nr_counters--; ++ list_del_counter(counter, ctx); ++ counter->task = NULL; ++ } ++ spin_unlock_irq(&ctx->lock); ++} ++ ++/* ++ * Get the current time for this context. ++ * If this is a task context, we use the task's task clock, ++ * or for a per-cpu context, we use the cpu clock. ++ */ ++static u64 get_context_time(struct perf_counter_context *ctx, int update) ++{ ++ struct task_struct *curr = ctx->task; ++ ++ if (!curr) ++ return cpu_clock(smp_processor_id()); ++ ++ return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; ++} ++ ++/* ++ * Update the record of the current time in a context. ++ */ ++static void update_context_time(struct perf_counter_context *ctx, int update) ++{ ++ ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; ++} ++ ++/* ++ * Update the total_time_enabled and total_time_running fields for a counter. ++ */ ++static void update_counter_times(struct perf_counter *counter) ++{ ++ struct perf_counter_context *ctx = counter->ctx; ++ u64 run_end; ++ ++ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { ++ counter->total_time_enabled = ctx->time_now - ++ counter->tstamp_enabled; ++ if (counter->state == PERF_COUNTER_STATE_INACTIVE) ++ run_end = counter->tstamp_stopped; ++ else ++ run_end = ctx->time_now; ++ counter->total_time_running = run_end - counter->tstamp_running; ++ } ++} ++ ++/* ++ * Update total_time_enabled and total_time_running for all counters in a group. ++ */ ++static void update_group_times(struct perf_counter *leader) ++{ ++ struct perf_counter *counter; ++ ++ update_counter_times(leader); ++ list_for_each_entry(counter, &leader->sibling_list, list_entry) ++ update_counter_times(counter); ++} ++ ++/* ++ * Cross CPU call to disable a performance counter ++ */ ++static void __perf_counter_disable(void *info) ++{ ++ struct perf_counter *counter = info; ++ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); ++ struct perf_counter_context *ctx = counter->ctx; ++ unsigned long flags; ++ ++ /* ++ * If this is a per-task counter, need to check whether this ++ * counter's task is the current task on this cpu. ++ */ ++ if (ctx->task && cpuctx->task_ctx != ctx) ++ return; ++ ++ curr_rq_lock_irq_save(&flags); ++ spin_lock(&ctx->lock); ++ ++ /* ++ * If the counter is on, turn it off. ++ * If it is in error state, leave it in error state. 
++ */ ++ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { ++ update_context_time(ctx, 1); ++ update_counter_times(counter); ++ if (counter == counter->group_leader) ++ group_sched_out(counter, cpuctx, ctx); ++ else ++ counter_sched_out(counter, cpuctx, ctx); ++ counter->state = PERF_COUNTER_STATE_OFF; ++ } ++ ++ spin_unlock(&ctx->lock); ++ curr_rq_unlock_irq_restore(&flags); ++} ++ ++/* ++ * Disable a counter. ++ */ ++static void perf_counter_disable(struct perf_counter *counter) ++{ ++ struct perf_counter_context *ctx = counter->ctx; ++ struct task_struct *task = ctx->task; ++ ++ if (!task) { ++ /* ++ * Disable the counter on the cpu that it's on ++ */ ++ smp_call_function_single(counter->cpu, __perf_counter_disable, ++ counter, 1); ++ return; ++ } ++ ++ retry: ++ task_oncpu_function_call(task, __perf_counter_disable, counter); ++ ++ spin_lock_irq(&ctx->lock); ++ /* ++ * If the counter is still active, we need to retry the cross-call. ++ */ ++ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { ++ spin_unlock_irq(&ctx->lock); ++ goto retry; ++ } ++ ++ /* ++ * Since we have the lock this context can't be scheduled ++ * in, so we can change the state safely. ++ */ ++ if (counter->state == PERF_COUNTER_STATE_INACTIVE) { ++ update_counter_times(counter); ++ counter->state = PERF_COUNTER_STATE_OFF; ++ } ++ ++ spin_unlock_irq(&ctx->lock); ++} ++ ++/* ++ * Disable a counter and all its children. ++ */ ++static void perf_counter_disable_family(struct perf_counter *counter) ++{ ++ struct perf_counter *child; ++ ++ perf_counter_disable(counter); ++ ++ /* ++ * Lock the mutex to protect the list of children ++ */ ++ mutex_lock(&counter->mutex); ++ list_for_each_entry(child, &counter->child_list, child_list) ++ perf_counter_disable(child); ++ mutex_unlock(&counter->mutex); ++} ++ ++static int ++counter_sched_in(struct perf_counter *counter, ++ struct perf_cpu_context *cpuctx, ++ struct perf_counter_context *ctx, ++ int cpu) ++{ ++ if (counter->state <= PERF_COUNTER_STATE_OFF) ++ return 0; ++ ++ counter->state = PERF_COUNTER_STATE_ACTIVE; ++ counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ ++ /* ++ * The new state must be visible before we turn it on in the hardware: ++ */ ++ smp_wmb(); ++ ++ if (counter->hw_ops->enable(counter)) { ++ counter->state = PERF_COUNTER_STATE_INACTIVE; ++ counter->oncpu = -1; ++ return -EAGAIN; ++ } ++ ++ counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; ++ ++ if (!is_software_counter(counter)) ++ cpuctx->active_oncpu++; ++ ctx->nr_active++; ++ ++ if (counter->hw_event.exclusive) ++ cpuctx->exclusive = 1; ++ ++ return 0; ++} ++ ++/* ++ * Return 1 for a group consisting entirely of software counters, ++ * 0 if the group contains any hardware counters. ++ */ ++static int is_software_only_group(struct perf_counter *leader) ++{ ++ struct perf_counter *counter; ++ ++ if (!is_software_counter(leader)) ++ return 0; ++ ++ list_for_each_entry(counter, &leader->sibling_list, list_entry) ++ if (!is_software_counter(counter)) ++ return 0; ++ ++ return 1; ++} ++ ++/* ++ * Work out whether we can put this counter group on the CPU now. ++ */ ++static int group_can_go_on(struct perf_counter *counter, ++ struct perf_cpu_context *cpuctx, ++ int can_add_hw) ++{ ++ /* ++ * Groups consisting entirely of software counters can always go on. ++ */ ++ if (is_software_only_group(counter)) ++ return 1; ++ /* ++ * If an exclusive group is already on, no other hardware ++ * counters can go on. 
++ */ ++ if (cpuctx->exclusive) ++ return 0; ++ /* ++ * If this group is exclusive and there are already ++ * counters on the CPU, it can't go on. ++ */ ++ if (counter->hw_event.exclusive && cpuctx->active_oncpu) ++ return 0; ++ /* ++ * Otherwise, try to add it if all previous groups were able ++ * to go on. ++ */ ++ return can_add_hw; ++} ++ ++static void add_counter_to_ctx(struct perf_counter *counter, ++ struct perf_counter_context *ctx) ++{ ++ list_add_counter(counter, ctx); ++ ctx->nr_counters++; ++ counter->prev_state = PERF_COUNTER_STATE_OFF; ++ counter->tstamp_enabled = ctx->time_now; ++ counter->tstamp_running = ctx->time_now; ++ counter->tstamp_stopped = ctx->time_now; ++} ++ ++/* ++ * Cross CPU call to install and enable a performance counter ++ */ ++static void __perf_install_in_context(void *info) ++{ ++ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); ++ struct perf_counter *counter = info; ++ struct perf_counter_context *ctx = counter->ctx; ++ struct perf_counter *leader = counter->group_leader; ++ int cpu = smp_processor_id(); ++ unsigned long flags; ++ u64 perf_flags; ++ int err; ++ ++ /* ++ * If this is a task context, we need to check whether it is ++ * the current task context of this cpu. If not it has been ++ * scheduled out before the smp call arrived. ++ */ ++ if (ctx->task && cpuctx->task_ctx != ctx) ++ return; ++ ++ curr_rq_lock_irq_save(&flags); ++ spin_lock(&ctx->lock); ++ update_context_time(ctx, 1); ++ ++ /* ++ * Protect the list operation against NMI by disabling the ++ * counters on a global level. NOP for non NMI based counters. ++ */ ++ perf_flags = hw_perf_save_disable(); ++ ++ add_counter_to_ctx(counter, ctx); ++ ++ /* ++ * Don't put the counter on if it is disabled or if ++ * it is in a group and the group isn't on. ++ */ ++ if (counter->state != PERF_COUNTER_STATE_INACTIVE || ++ (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) ++ goto unlock; ++ ++ /* ++ * An exclusive counter can't go on if there are already active ++ * hardware counters, and no hardware counter can go on if there ++ * is already an exclusive counter on. ++ */ ++ if (!group_can_go_on(counter, cpuctx, 1)) ++ err = -EEXIST; ++ else ++ err = counter_sched_in(counter, cpuctx, ctx, cpu); ++ ++ if (err) { ++ /* ++ * This counter couldn't go on. If it is in a group ++ * then we have to pull the whole group off. ++ * If the counter group is pinned then put it in error state. ++ */ ++ if (leader != counter) ++ group_sched_out(leader, cpuctx, ctx); ++ if (leader->hw_event.pinned) { ++ update_group_times(leader); ++ leader->state = PERF_COUNTER_STATE_ERROR; ++ } ++ } ++ ++ if (!err && !ctx->task && cpuctx->max_pertask) ++ cpuctx->max_pertask--; ++ ++ unlock: ++ hw_perf_restore(perf_flags); ++ ++ spin_unlock(&ctx->lock); ++ curr_rq_unlock_irq_restore(&flags); ++} ++ ++/* ++ * Attach a performance counter to a context ++ * ++ * First we add the counter to the list with the hardware enable bit ++ * in counter->hw_config cleared. ++ * ++ * If the counter is attached to a task which is on a CPU we use a smp ++ * call to enable it in the task context. The task might have been ++ * scheduled away, but we check this in the smp call again. ++ * ++ * Must be called with ctx->mutex held. 
++ */ ++static void ++perf_install_in_context(struct perf_counter_context *ctx, ++ struct perf_counter *counter, ++ int cpu) ++{ ++ struct task_struct *task = ctx->task; ++ ++ if (!task) { ++ /* ++ * Per cpu counters are installed via an smp call and ++ * the install is always sucessful. ++ */ ++ smp_call_function_single(cpu, __perf_install_in_context, ++ counter, 1); ++ return; ++ } ++ ++ counter->task = task; ++retry: ++ task_oncpu_function_call(task, __perf_install_in_context, ++ counter); ++ ++ spin_lock_irq(&ctx->lock); ++ /* ++ * we need to retry the smp call. ++ */ ++ if (ctx->is_active && list_empty(&counter->list_entry)) { ++ spin_unlock_irq(&ctx->lock); ++ goto retry; ++ } ++ ++ /* ++ * The lock prevents that this context is scheduled in so we ++ * can add the counter safely, if it the call above did not ++ * succeed. ++ */ ++ if (list_empty(&counter->list_entry)) ++ add_counter_to_ctx(counter, ctx); ++ spin_unlock_irq(&ctx->lock); ++} ++ ++/* ++ * Cross CPU call to enable a performance counter ++ */ ++static void __perf_counter_enable(void *info) ++{ ++ struct perf_counter *counter = info; ++ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); ++ struct perf_counter_context *ctx = counter->ctx; ++ struct perf_counter *leader = counter->group_leader; ++ unsigned long flags; ++ int err; ++ ++ /* ++ * If this is a per-task counter, need to check whether this ++ * counter's task is the current task on this cpu. ++ */ ++ if (ctx->task && cpuctx->task_ctx != ctx) ++ return; ++ ++ curr_rq_lock_irq_save(&flags); ++ spin_lock(&ctx->lock); ++ update_context_time(ctx, 1); ++ ++ counter->prev_state = counter->state; ++ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) ++ goto unlock; ++ counter->state = PERF_COUNTER_STATE_INACTIVE; ++ counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; ++ ++ /* ++ * If the counter is in a group and isn't the group leader, ++ * then don't put it on unless the group is on. ++ */ ++ if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) ++ goto unlock; ++ ++ if (!group_can_go_on(counter, cpuctx, 1)) ++ err = -EEXIST; ++ else ++ err = counter_sched_in(counter, cpuctx, ctx, ++ smp_processor_id()); ++ ++ if (err) { ++ /* ++ * If this counter can't go on and it's part of a ++ * group, then the whole group has to come off. ++ */ ++ if (leader != counter) ++ group_sched_out(leader, cpuctx, ctx); ++ if (leader->hw_event.pinned) { ++ update_group_times(leader); ++ leader->state = PERF_COUNTER_STATE_ERROR; ++ } ++ } ++ ++ unlock: ++ spin_unlock(&ctx->lock); ++ curr_rq_unlock_irq_restore(&flags); ++} ++ ++/* ++ * Enable a counter. ++ */ ++static void perf_counter_enable(struct perf_counter *counter) ++{ ++ struct perf_counter_context *ctx = counter->ctx; ++ struct task_struct *task = ctx->task; ++ ++ if (!task) { ++ /* ++ * Enable the counter on the cpu that it's on ++ */ ++ smp_call_function_single(counter->cpu, __perf_counter_enable, ++ counter, 1); ++ return; ++ } ++ ++ spin_lock_irq(&ctx->lock); ++ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) ++ goto out; ++ ++ /* ++ * If the counter is in error state, clear that first. ++ * That way, if we see the counter in error state below, we ++ * know that it has gone back into error state, as distinct ++ * from the task having been scheduled away before the ++ * cross-call arrived. 
++ */ ++ if (counter->state == PERF_COUNTER_STATE_ERROR) ++ counter->state = PERF_COUNTER_STATE_OFF; ++ ++ retry: ++ spin_unlock_irq(&ctx->lock); ++ task_oncpu_function_call(task, __perf_counter_enable, counter); ++ ++ spin_lock_irq(&ctx->lock); ++ ++ /* ++ * If the context is active and the counter is still off, ++ * we need to retry the cross-call. ++ */ ++ if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) ++ goto retry; ++ ++ /* ++ * Since we have the lock this context can't be scheduled ++ * in, so we can change the state safely. ++ */ ++ if (counter->state == PERF_COUNTER_STATE_OFF) { ++ counter->state = PERF_COUNTER_STATE_INACTIVE; ++ counter->tstamp_enabled = ctx->time_now - ++ counter->total_time_enabled; ++ } ++ out: ++ spin_unlock_irq(&ctx->lock); ++} ++ ++/* ++ * Enable a counter and all its children. ++ */ ++static void perf_counter_enable_family(struct perf_counter *counter) ++{ ++ struct perf_counter *child; ++ ++ perf_counter_enable(counter); ++ ++ /* ++ * Lock the mutex to protect the list of children ++ */ ++ mutex_lock(&counter->mutex); ++ list_for_each_entry(child, &counter->child_list, child_list) ++ perf_counter_enable(child); ++ mutex_unlock(&counter->mutex); ++} ++ ++void __perf_counter_sched_out(struct perf_counter_context *ctx, ++ struct perf_cpu_context *cpuctx) ++{ ++ struct perf_counter *counter; ++ u64 flags; ++ ++ spin_lock(&ctx->lock); ++ ctx->is_active = 0; ++ if (likely(!ctx->nr_counters)) ++ goto out; ++ update_context_time(ctx, 0); ++ ++ flags = hw_perf_save_disable(); ++ if (ctx->nr_active) { ++ list_for_each_entry(counter, &ctx->counter_list, list_entry) ++ group_sched_out(counter, cpuctx, ctx); ++ } ++ hw_perf_restore(flags); ++ out: ++ spin_unlock(&ctx->lock); ++} ++ ++/* ++ * Called from scheduler to remove the counters of the current task, ++ * with interrupts disabled. ++ * ++ * We stop each counter and update the counter value in counter->count. ++ * ++ * This does not protect us against NMI, but disable() ++ * sets the disabled bit in the control field of counter _before_ ++ * accessing the counter control register. If a NMI hits, then it will ++ * not restart the counter. ++ */ ++void perf_counter_task_sched_out(struct task_struct *task, int cpu) ++{ ++ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); ++ struct perf_counter_context *ctx = &task->perf_counter_ctx; ++ struct pt_regs *regs; ++ ++ if (likely(!cpuctx->task_ctx)) ++ return; ++ ++ regs = task_pt_regs(task); ++ perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); ++ __perf_counter_sched_out(ctx, cpuctx); ++ ++ cpuctx->task_ctx = NULL; ++} ++ ++static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) ++{ ++ __perf_counter_sched_out(&cpuctx->ctx, cpuctx); ++} ++ ++static int ++group_sched_in(struct perf_counter *group_counter, ++ struct perf_cpu_context *cpuctx, ++ struct perf_counter_context *ctx, ++ int cpu) ++{ ++ struct perf_counter *counter, *partial_group; ++ int ret; ++ ++ if (group_counter->state == PERF_COUNTER_STATE_OFF) ++ return 0; ++ ++ ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); ++ if (ret) ++ return ret < 0 ? 
ret : 0; ++ ++ group_counter->prev_state = group_counter->state; ++ if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) ++ return -EAGAIN; ++ ++ /* ++ * Schedule in siblings as one group (if any): ++ */ ++ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { ++ counter->prev_state = counter->state; ++ if (counter_sched_in(counter, cpuctx, ctx, cpu)) { ++ partial_group = counter; ++ goto group_error; ++ } ++ } ++ ++ return 0; ++ ++group_error: ++ /* ++ * Groups can be scheduled in as one unit only, so undo any ++ * partial group before returning: ++ */ ++ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { ++ if (counter == partial_group) ++ break; ++ counter_sched_out(counter, cpuctx, ctx); ++ } ++ counter_sched_out(group_counter, cpuctx, ctx); ++ ++ return -EAGAIN; ++} ++ ++static void ++__perf_counter_sched_in(struct perf_counter_context *ctx, ++ struct perf_cpu_context *cpuctx, int cpu) ++{ ++ struct perf_counter *counter; ++ u64 flags; ++ int can_add_hw = 1; ++ ++ spin_lock(&ctx->lock); ++ ctx->is_active = 1; ++ if (likely(!ctx->nr_counters)) ++ goto out; ++ ++ /* ++ * Add any time since the last sched_out to the lost time ++ * so it doesn't get included in the total_time_enabled and ++ * total_time_running measures for counters in the context. ++ */ ++ ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; ++ ++ flags = hw_perf_save_disable(); ++ ++ /* ++ * First go through the list and put on any pinned groups ++ * in order to give them the best chance of going on. ++ */ ++ list_for_each_entry(counter, &ctx->counter_list, list_entry) { ++ if (counter->state <= PERF_COUNTER_STATE_OFF || ++ !counter->hw_event.pinned) ++ continue; ++ if (counter->cpu != -1 && counter->cpu != cpu) ++ continue; ++ ++ if (group_can_go_on(counter, cpuctx, 1)) ++ group_sched_in(counter, cpuctx, ctx, cpu); ++ ++ /* ++ * If this pinned group hasn't been scheduled, ++ * put it in error state. ++ */ ++ if (counter->state == PERF_COUNTER_STATE_INACTIVE) { ++ update_group_times(counter); ++ counter->state = PERF_COUNTER_STATE_ERROR; ++ } ++ } ++ ++ list_for_each_entry(counter, &ctx->counter_list, list_entry) { ++ /* ++ * Ignore counters in OFF or ERROR state, and ++ * ignore pinned counters since we did them already. ++ */ ++ if (counter->state <= PERF_COUNTER_STATE_OFF || ++ counter->hw_event.pinned) ++ continue; ++ ++ /* ++ * Listen to the 'cpu' scheduling filter constraint ++ * of counters: ++ */ ++ if (counter->cpu != -1 && counter->cpu != cpu) ++ continue; ++ ++ if (group_can_go_on(counter, cpuctx, can_add_hw)) { ++ if (group_sched_in(counter, cpuctx, ctx, cpu)) ++ can_add_hw = 0; ++ } ++ } ++ hw_perf_restore(flags); ++ out: ++ spin_unlock(&ctx->lock); ++} ++ ++/* ++ * Called from scheduler to add the counters of the current task ++ * with interrupts disabled. ++ * ++ * We restore the counter value and then enable it. ++ * ++ * This does not protect us against NMI, but enable() ++ * sets the enabled bit in the control field of counter _before_ ++ * accessing the counter control register. If a NMI hits, then it will ++ * keep the counter running. 
++ */ ++void perf_counter_task_sched_in(struct task_struct *task, int cpu) ++{ ++ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); ++ struct perf_counter_context *ctx = &task->perf_counter_ctx; ++ ++ __perf_counter_sched_in(ctx, cpuctx, cpu); ++ cpuctx->task_ctx = ctx; ++} ++ ++static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) ++{ ++ struct perf_counter_context *ctx = &cpuctx->ctx; ++ ++ __perf_counter_sched_in(ctx, cpuctx, cpu); ++} ++ ++int perf_counter_task_disable(void) ++{ ++ struct task_struct *curr = current; ++ struct perf_counter_context *ctx = &curr->perf_counter_ctx; ++ struct perf_counter *counter; ++ unsigned long flags; ++ u64 perf_flags; ++ int cpu; ++ ++ if (likely(!ctx->nr_counters)) ++ return 0; ++ ++ curr_rq_lock_irq_save(&flags); ++ cpu = smp_processor_id(); ++ ++ /* force the update of the task clock: */ ++ __task_delta_exec(curr, 1); ++ ++ perf_counter_task_sched_out(curr, cpu); ++ ++ spin_lock(&ctx->lock); ++ ++ /* ++ * Disable all the counters: ++ */ ++ perf_flags = hw_perf_save_disable(); ++ ++ list_for_each_entry(counter, &ctx->counter_list, list_entry) { ++ if (counter->state != PERF_COUNTER_STATE_ERROR) { ++ update_group_times(counter); ++ counter->state = PERF_COUNTER_STATE_OFF; ++ } ++ } ++ ++ hw_perf_restore(perf_flags); ++ ++ spin_unlock(&ctx->lock); ++ ++ curr_rq_unlock_irq_restore(&flags); ++ ++ return 0; ++} ++ ++int perf_counter_task_enable(void) ++{ ++ struct task_struct *curr = current; ++ struct perf_counter_context *ctx = &curr->perf_counter_ctx; ++ struct perf_counter *counter; ++ unsigned long flags; ++ u64 perf_flags; ++ int cpu; ++ ++ if (likely(!ctx->nr_counters)) ++ return 0; ++ ++ curr_rq_lock_irq_save(&flags); ++ cpu = smp_processor_id(); ++ ++ /* force the update of the task clock: */ ++ __task_delta_exec(curr, 1); ++ ++ perf_counter_task_sched_out(curr, cpu); ++ ++ spin_lock(&ctx->lock); ++ ++ /* ++ * Disable all the counters: ++ */ ++ perf_flags = hw_perf_save_disable(); ++ ++ list_for_each_entry(counter, &ctx->counter_list, list_entry) { ++ if (counter->state > PERF_COUNTER_STATE_OFF) ++ continue; ++ counter->state = PERF_COUNTER_STATE_INACTIVE; ++ counter->tstamp_enabled = ctx->time_now - ++ counter->total_time_enabled; ++ counter->hw_event.disabled = 0; ++ } ++ hw_perf_restore(perf_flags); ++ ++ spin_unlock(&ctx->lock); ++ ++ perf_counter_task_sched_in(curr, cpu); ++ ++ curr_rq_unlock_irq_restore(&flags); ++ ++ return 0; ++} ++ ++/* ++ * Round-robin a context's counters: ++ */ ++static void rotate_ctx(struct perf_counter_context *ctx) ++{ ++ struct perf_counter *counter; ++ u64 perf_flags; ++ ++ if (!ctx->nr_counters) ++ return; ++ ++ spin_lock(&ctx->lock); ++ /* ++ * Rotate the first entry last (works just fine for group counters too): ++ */ ++ perf_flags = hw_perf_save_disable(); ++ list_for_each_entry(counter, &ctx->counter_list, list_entry) { ++ list_move_tail(&counter->list_entry, &ctx->counter_list); ++ break; ++ } ++ hw_perf_restore(perf_flags); ++ ++ spin_unlock(&ctx->lock); ++} ++ ++void perf_counter_task_tick(struct task_struct *curr, int cpu) ++{ ++ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); ++ struct perf_counter_context *ctx = &curr->perf_counter_ctx; ++ const int rotate_percpu = 0; ++ ++ if (rotate_percpu) ++ perf_counter_cpu_sched_out(cpuctx); ++ perf_counter_task_sched_out(curr, cpu); ++ ++ if (rotate_percpu) ++ rotate_ctx(&cpuctx->ctx); ++ rotate_ctx(ctx); ++ ++ if (rotate_percpu) ++ perf_counter_cpu_sched_in(cpuctx, cpu); ++ 
perf_counter_task_sched_in(curr, cpu); ++} ++ ++/* ++ * Cross CPU call to read the hardware counter ++ */ ++static void __read(void *info) ++{ ++ struct perf_counter *counter = info; ++ struct perf_counter_context *ctx = counter->ctx; ++ unsigned long flags; ++ ++ curr_rq_lock_irq_save(&flags); ++ if (ctx->is_active) ++ update_context_time(ctx, 1); ++ counter->hw_ops->read(counter); ++ update_counter_times(counter); ++ curr_rq_unlock_irq_restore(&flags); ++} ++ ++static u64 perf_counter_read(struct perf_counter *counter) ++{ ++ /* ++ * If counter is enabled and currently active on a CPU, update the ++ * value in the counter structure: ++ */ ++ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { ++ smp_call_function_single(counter->oncpu, ++ __read, counter, 1); ++ } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { ++ update_counter_times(counter); ++ } ++ ++ return atomic64_read(&counter->count); ++} ++ ++static void put_context(struct perf_counter_context *ctx) ++{ ++ if (ctx->task) ++ put_task_struct(ctx->task); ++} ++ ++static struct perf_counter_context *find_get_context(pid_t pid, int cpu) ++{ ++ struct perf_cpu_context *cpuctx; ++ struct perf_counter_context *ctx; ++ struct task_struct *task; ++ ++ /* ++ * If cpu is not a wildcard then this is a percpu counter: ++ */ ++ if (cpu != -1) { ++ /* Must be root to operate on a CPU counter: */ ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EACCES); ++ ++ if (cpu < 0 || cpu > num_possible_cpus()) ++ return ERR_PTR(-EINVAL); ++ ++ /* ++ * We could be clever and allow to attach a counter to an ++ * offline CPU and activate it when the CPU comes up, but ++ * that's for later. ++ */ ++ if (!cpu_isset(cpu, cpu_online_map)) ++ return ERR_PTR(-ENODEV); ++ ++ cpuctx = &per_cpu(perf_cpu_context, cpu); ++ ctx = &cpuctx->ctx; ++ ++ return ctx; ++ } ++ ++ rcu_read_lock(); ++ if (!pid) ++ task = current; ++ else ++ task = find_task_by_vpid(pid); ++ if (task) ++ get_task_struct(task); ++ rcu_read_unlock(); ++ ++ if (!task) ++ return ERR_PTR(-ESRCH); ++ ++ ctx = &task->perf_counter_ctx; ++ ctx->task = task; ++ ++ /* Reuse ptrace permission checks for now. */ ++ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { ++ put_context(ctx); ++ return ERR_PTR(-EACCES); ++ } ++ ++ return ctx; ++} ++ ++static void free_counter_rcu(struct rcu_head *head) ++{ ++ struct perf_counter *counter; ++ ++ counter = container_of(head, struct perf_counter, rcu_head); ++ kfree(counter); ++} ++ ++static void free_counter(struct perf_counter *counter) ++{ ++ if (counter->destroy) ++ counter->destroy(counter); ++ ++ call_rcu(&counter->rcu_head, free_counter_rcu); ++} ++ ++/* ++ * Called when the last reference to the file is gone. ++ */ ++static int perf_release(struct inode *inode, struct file *file) ++{ ++ struct perf_counter *counter = file->private_data; ++ struct perf_counter_context *ctx = counter->ctx; ++ ++ file->private_data = NULL; ++ ++ mutex_lock(&ctx->mutex); ++ mutex_lock(&counter->mutex); ++ ++ perf_counter_remove_from_context(counter); ++ ++ mutex_unlock(&counter->mutex); ++ mutex_unlock(&ctx->mutex); ++ ++ free_counter(counter); ++ put_context(ctx); ++ ++ return 0; ++} ++ ++/* ++ * Read the performance counter - simple non blocking version for now ++ */ ++static ssize_t ++perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) ++{ ++ u64 values[3]; ++ int n; ++ ++ /* ++ * Return end-of-file for a read on a counter that is in ++ * error state (i.e. because it was pinned but it couldn't be ++ * scheduled on to the CPU at some point). 
++ */ ++ if (counter->state == PERF_COUNTER_STATE_ERROR) ++ return 0; ++ ++ mutex_lock(&counter->mutex); ++ values[0] = perf_counter_read(counter); ++ n = 1; ++ if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) ++ values[n++] = counter->total_time_enabled + ++ atomic64_read(&counter->child_total_time_enabled); ++ if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) ++ values[n++] = counter->total_time_running + ++ atomic64_read(&counter->child_total_time_running); ++ mutex_unlock(&counter->mutex); ++ ++ if (count < n * sizeof(u64)) ++ return -EINVAL; ++ count = n * sizeof(u64); ++ ++ if (copy_to_user(buf, values, count)) ++ return -EFAULT; ++ ++ return count; ++} ++ ++static ssize_t ++perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) ++{ ++ struct perf_counter *counter = file->private_data; ++ ++ return perf_read_hw(counter, buf, count); ++} ++ ++static unsigned int perf_poll(struct file *file, poll_table *wait) ++{ ++ struct perf_counter *counter = file->private_data; ++ struct perf_mmap_data *data; ++ unsigned int events; ++ ++ rcu_read_lock(); ++ data = rcu_dereference(counter->data); ++ if (data) ++ events = atomic_xchg(&data->wakeup, 0); ++ else ++ events = POLL_HUP; ++ rcu_read_unlock(); ++ ++ poll_wait(file, &counter->waitq, wait); ++ ++ return events; ++} ++ ++static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ struct perf_counter *counter = file->private_data; ++ int err = 0; ++ ++ switch (cmd) { ++ case PERF_COUNTER_IOC_ENABLE: ++ perf_counter_enable_family(counter); ++ break; ++ case PERF_COUNTER_IOC_DISABLE: ++ perf_counter_disable_family(counter); ++ break; ++ default: ++ err = -ENOTTY; ++ } ++ return err; ++} ++ ++static void __perf_counter_update_userpage(struct perf_counter *counter, ++ struct perf_mmap_data *data) ++{ ++ struct perf_counter_mmap_page *userpg = data->user_page; ++ ++ /* ++ * Disable preemption so as to not let the corresponding user-space ++ * spin too long if we get preempted. 
++ */ ++ preempt_disable(); ++ ++userpg->lock; ++ smp_wmb(); ++ userpg->index = counter->hw.idx; ++ userpg->offset = atomic64_read(&counter->count); ++ if (counter->state == PERF_COUNTER_STATE_ACTIVE) ++ userpg->offset -= atomic64_read(&counter->hw.prev_count); ++ ++ userpg->data_head = atomic_read(&data->head); ++ smp_wmb(); ++ ++userpg->lock; ++ preempt_enable(); ++} ++ ++void perf_counter_update_userpage(struct perf_counter *counter) ++{ ++ struct perf_mmap_data *data; ++ ++ rcu_read_lock(); ++ data = rcu_dereference(counter->data); ++ if (data) ++ __perf_counter_update_userpage(counter, data); ++ rcu_read_unlock(); ++} ++ ++static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ struct perf_counter *counter = vma->vm_file->private_data; ++ struct perf_mmap_data *data; ++ int ret = VM_FAULT_SIGBUS; ++ ++ rcu_read_lock(); ++ data = rcu_dereference(counter->data); ++ if (!data) ++ goto unlock; ++ ++ if (vmf->pgoff == 0) { ++ vmf->page = virt_to_page(data->user_page); ++ } else { ++ int nr = vmf->pgoff - 1; ++ ++ if ((unsigned)nr > data->nr_pages) ++ goto unlock; ++ ++ vmf->page = virt_to_page(data->data_pages[nr]); ++ } ++ get_page(vmf->page); ++ ret = 0; ++unlock: ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) ++{ ++ struct perf_mmap_data *data; ++ unsigned long size; ++ int i; ++ ++ WARN_ON(atomic_read(&counter->mmap_count)); ++ ++ size = sizeof(struct perf_mmap_data); ++ size += nr_pages * sizeof(void *); ++ ++ data = kzalloc(size, GFP_KERNEL); ++ if (!data) ++ goto fail; ++ ++ data->user_page = (void *)get_zeroed_page(GFP_KERNEL); ++ if (!data->user_page) ++ goto fail_user_page; ++ ++ for (i = 0; i < nr_pages; i++) { ++ data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); ++ if (!data->data_pages[i]) ++ goto fail_data_pages; ++ } ++ ++ data->nr_pages = nr_pages; ++ ++ rcu_assign_pointer(counter->data, data); ++ ++ return 0; ++ ++fail_data_pages: ++ for (i--; i >= 0; i--) ++ free_page((unsigned long)data->data_pages[i]); ++ ++ free_page((unsigned long)data->user_page); ++ ++fail_user_page: ++ kfree(data); ++ ++fail: ++ return -ENOMEM; ++} ++ ++static void __perf_mmap_data_free(struct rcu_head *rcu_head) ++{ ++ struct perf_mmap_data *data = container_of(rcu_head, ++ struct perf_mmap_data, rcu_head); ++ int i; ++ ++ free_page((unsigned long)data->user_page); ++ for (i = 0; i < data->nr_pages; i++) ++ free_page((unsigned long)data->data_pages[i]); ++ kfree(data); ++} ++ ++static void perf_mmap_data_free(struct perf_counter *counter) ++{ ++ struct perf_mmap_data *data = counter->data; ++ ++ WARN_ON(atomic_read(&counter->mmap_count)); ++ ++ rcu_assign_pointer(counter->data, NULL); ++ call_rcu(&data->rcu_head, __perf_mmap_data_free); ++} ++ ++static void perf_mmap_open(struct vm_area_struct *vma) ++{ ++ struct perf_counter *counter = vma->vm_file->private_data; ++ ++ atomic_inc(&counter->mmap_count); ++} ++ ++static void perf_mmap_close(struct vm_area_struct *vma) ++{ ++ struct perf_counter *counter = vma->vm_file->private_data; ++ ++ if (atomic_dec_and_mutex_lock(&counter->mmap_count, ++ &counter->mmap_mutex)) { ++ perf_mmap_data_free(counter); ++ mutex_unlock(&counter->mmap_mutex); ++ } ++} ++ ++static struct vm_operations_struct perf_mmap_vmops = { ++ .open = perf_mmap_open, ++ .close = perf_mmap_close, ++ .fault = perf_mmap_fault, ++}; ++ ++static int perf_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ struct perf_counter *counter = file->private_data; ++ unsigned 
long vma_size; ++ unsigned long nr_pages; ++ unsigned long locked, lock_limit; ++ int ret = 0; ++ ++ if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) ++ return -EINVAL; ++ ++ vma_size = vma->vm_end - vma->vm_start; ++ nr_pages = (vma_size / PAGE_SIZE) - 1; ++ ++ /* ++ * If we have data pages ensure they're a power-of-two number, so we ++ * can do bitmasks instead of modulo. ++ */ ++ if (nr_pages != 0 && !is_power_of_2(nr_pages)) ++ return -EINVAL; ++ ++ if (vma_size != PAGE_SIZE * (1 + nr_pages)) ++ return -EINVAL; ++ ++ if (vma->vm_pgoff != 0) ++ return -EINVAL; ++ ++ locked = vma_size >> PAGE_SHIFT; ++ locked += vma->vm_mm->locked_vm; ++ ++ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; ++ lock_limit >>= PAGE_SHIFT; ++ ++ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) ++ return -EPERM; ++ ++ mutex_lock(&counter->mmap_mutex); ++ if (atomic_inc_not_zero(&counter->mmap_count)) ++ goto out; ++ ++ WARN_ON(counter->data); ++ ret = perf_mmap_data_alloc(counter, nr_pages); ++ if (!ret) ++ atomic_set(&counter->mmap_count, 1); ++out: ++ mutex_unlock(&counter->mmap_mutex); ++ ++ vma->vm_flags &= ~VM_MAYWRITE; ++ vma->vm_flags |= VM_RESERVED; ++ vma->vm_ops = &perf_mmap_vmops; ++ ++ return ret; ++} ++ ++static const struct file_operations perf_fops = { ++ .release = perf_release, ++ .read = perf_read, ++ .poll = perf_poll, ++ .unlocked_ioctl = perf_ioctl, ++ .compat_ioctl = perf_ioctl, ++ .mmap = perf_mmap, ++}; ++ ++/* ++ * Output ++ */ ++ ++struct perf_output_handle { ++ struct perf_counter *counter; ++ struct perf_mmap_data *data; ++ unsigned int offset; ++ unsigned int head; ++ int wakeup; ++}; ++ ++static int perf_output_begin(struct perf_output_handle *handle, ++ struct perf_counter *counter, unsigned int size) ++{ ++ struct perf_mmap_data *data; ++ unsigned int offset, head; ++ ++ rcu_read_lock(); ++ data = rcu_dereference(counter->data); ++ if (!data) ++ goto out; ++ ++ if (!data->nr_pages) ++ goto out; ++ ++ do { ++ offset = head = atomic_read(&data->head); ++ head += size; ++ } while (atomic_cmpxchg(&data->head, offset, head) != offset); ++ ++ handle->counter = counter; ++ handle->data = data; ++ handle->offset = offset; ++ handle->head = head; ++ handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); ++ ++ return 0; ++ ++out: ++ rcu_read_unlock(); ++ ++ return -ENOSPC; ++} ++ ++static void perf_output_copy(struct perf_output_handle *handle, ++ void *buf, unsigned int len) ++{ ++ unsigned int pages_mask; ++ unsigned int offset; ++ unsigned int size; ++ void **pages; ++ ++ offset = handle->offset; ++ pages_mask = handle->data->nr_pages - 1; ++ pages = handle->data->data_pages; ++ ++ do { ++ unsigned int page_offset; ++ int nr; ++ ++ nr = (offset >> PAGE_SHIFT) & pages_mask; ++ page_offset = offset & (PAGE_SIZE - 1); ++ size = min_t(unsigned int, PAGE_SIZE - page_offset, len); ++ ++ memcpy(pages[nr] + page_offset, buf, size); ++ ++ len -= size; ++ buf += size; ++ offset += size; ++ } while (len); ++ ++ handle->offset = offset; ++ ++ WARN_ON_ONCE(handle->offset > handle->head); ++} ++ ++#define perf_output_put(handle, x) \ ++ perf_output_copy((handle), &(x), sizeof(x)) ++ ++static void perf_output_end(struct perf_output_handle *handle, int nmi) ++{ ++ if (handle->wakeup) { ++ (void)atomic_xchg(&handle->data->wakeup, POLL_IN); ++ __perf_counter_update_userpage(handle->counter, handle->data); ++ if (nmi) { ++ handle->counter->wakeup_pending = 1; ++ set_perf_counter_pending(); ++ } else ++ wake_up(&handle->counter->waitq); ++ } ++ rcu_read_unlock(); 
++} ++ ++static int perf_output_write(struct perf_counter *counter, int nmi, ++ void *buf, ssize_t size) ++{ ++ struct perf_output_handle handle; ++ int ret; ++ ++ ret = perf_output_begin(&handle, counter, size); ++ if (ret) ++ goto out; ++ ++ perf_output_copy(&handle, buf, size); ++ perf_output_end(&handle, nmi); ++ ++out: ++ return ret; ++} ++ ++static void perf_output_simple(struct perf_counter *counter, ++ int nmi, struct pt_regs *regs) ++{ ++ unsigned int size; ++ struct { ++ struct perf_event_header header; ++ u64 ip; ++ u32 pid, tid; ++ } event; ++ ++ event.header.type = PERF_EVENT_IP; ++ event.ip = instruction_pointer(regs); ++ ++ size = sizeof(event); ++ ++ if (counter->hw_event.include_tid) { ++ /* namespace issues */ ++ event.pid = current->group_leader->pid; ++ event.tid = current->pid; ++ ++ event.header.type |= __PERF_EVENT_TID; ++ } else ++ size -= sizeof(u64); ++ ++ event.header.size = size; ++ ++ perf_output_write(counter, nmi, &event, size); ++} ++ ++static void perf_output_group(struct perf_counter *counter, int nmi) ++{ ++ struct perf_output_handle handle; ++ struct perf_event_header header; ++ struct perf_counter *leader, *sub; ++ unsigned int size; ++ struct { ++ u64 event; ++ u64 counter; ++ } entry; ++ int ret; ++ ++ size = sizeof(header) + counter->nr_siblings * sizeof(entry); ++ ++ ret = perf_output_begin(&handle, counter, size); ++ if (ret) ++ return; ++ ++ header.type = PERF_EVENT_GROUP; ++ header.size = size; ++ ++ perf_output_put(&handle, header); ++ ++ leader = counter->group_leader; ++ list_for_each_entry(sub, &leader->sibling_list, list_entry) { ++ if (sub != counter) ++ sub->hw_ops->read(sub); ++ ++ entry.event = sub->hw_event.config; ++ entry.counter = atomic64_read(&sub->count); ++ ++ perf_output_put(&handle, entry); ++ } ++ ++ perf_output_end(&handle, nmi); ++} ++ ++void perf_counter_output(struct perf_counter *counter, ++ int nmi, struct pt_regs *regs) ++{ ++ switch (counter->hw_event.record_type) { ++ case PERF_RECORD_SIMPLE: ++ return; ++ ++ case PERF_RECORD_IRQ: ++ perf_output_simple(counter, nmi, regs); ++ break; ++ ++ case PERF_RECORD_GROUP: ++ perf_output_group(counter, nmi); ++ break; ++ } ++} ++ ++/* ++ * Generic software counter infrastructure ++ */ ++ ++static void perf_swcounter_update(struct perf_counter *counter) ++{ ++ struct hw_perf_counter *hwc = &counter->hw; ++ u64 prev, now; ++ s64 delta; ++ ++again: ++ prev = atomic64_read(&hwc->prev_count); ++ now = atomic64_read(&hwc->count); ++ if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) ++ goto again; ++ ++ delta = now - prev; ++ ++ atomic64_add(delta, &counter->count); ++ atomic64_sub(delta, &hwc->period_left); ++} ++ ++static void perf_swcounter_set_period(struct perf_counter *counter) ++{ ++ struct hw_perf_counter *hwc = &counter->hw; ++ s64 left = atomic64_read(&hwc->period_left); ++ s64 period = hwc->irq_period; ++ ++ if (unlikely(left <= -period)) { ++ left = period; ++ atomic64_set(&hwc->period_left, left); ++ } ++ ++ if (unlikely(left <= 0)) { ++ left += period; ++ atomic64_add(period, &hwc->period_left); ++ } ++ ++ atomic64_set(&hwc->prev_count, -left); ++ atomic64_set(&hwc->count, -left); ++} ++ ++static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ++{ ++ struct perf_counter *counter; ++ struct pt_regs *regs; ++ ++ counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); ++ counter->hw_ops->read(counter); ++ ++ regs = get_irq_regs(); ++ /* ++ * In case we exclude kernel IPs or are somehow not in interrupt ++ * context, provide the next 
best thing, the user IP. ++ */ ++ if ((counter->hw_event.exclude_kernel || !regs) && ++ !counter->hw_event.exclude_user) ++ regs = task_pt_regs(current); ++ ++ if (regs) ++ perf_counter_output(counter, 0, regs); ++ ++ hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); ++ ++ return HRTIMER_RESTART; ++} ++ ++static void perf_swcounter_overflow(struct perf_counter *counter, ++ int nmi, struct pt_regs *regs) ++{ ++ perf_swcounter_update(counter); ++ perf_swcounter_set_period(counter); ++ perf_counter_output(counter, nmi, regs); ++} ++ ++static int perf_swcounter_match(struct perf_counter *counter, ++ enum perf_event_types type, ++ u32 event, struct pt_regs *regs) ++{ ++ if (counter->state != PERF_COUNTER_STATE_ACTIVE) ++ return 0; ++ ++ if (perf_event_raw(&counter->hw_event)) ++ return 0; ++ ++ if (perf_event_type(&counter->hw_event) != type) ++ return 0; ++ ++ if (perf_event_id(&counter->hw_event) != event) ++ return 0; ++ ++ if (counter->hw_event.exclude_user && user_mode(regs)) ++ return 0; ++ ++ if (counter->hw_event.exclude_kernel && !user_mode(regs)) ++ return 0; ++ ++ return 1; ++} ++ ++static void perf_swcounter_add(struct perf_counter *counter, u64 nr, ++ int nmi, struct pt_regs *regs) ++{ ++ int neg = atomic64_add_negative(nr, &counter->hw.count); ++ if (counter->hw.irq_period && !neg) ++ perf_swcounter_overflow(counter, nmi, regs); ++} ++ ++static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, ++ enum perf_event_types type, u32 event, ++ u64 nr, int nmi, struct pt_regs *regs) ++{ ++ struct perf_counter *counter; ++ ++ if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) ++ return; ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { ++ if (perf_swcounter_match(counter, type, event, regs)) ++ perf_swcounter_add(counter, nr, nmi, regs); ++ } ++ rcu_read_unlock(); ++} ++ ++static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) ++{ ++ if (in_nmi()) ++ return &cpuctx->recursion[3]; ++ ++ if (in_irq()) ++ return &cpuctx->recursion[2]; ++ ++ if (in_softirq()) ++ return &cpuctx->recursion[1]; ++ ++ return &cpuctx->recursion[0]; ++} ++ ++static void __perf_swcounter_event(enum perf_event_types type, u32 event, ++ u64 nr, int nmi, struct pt_regs *regs) ++{ ++ struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); ++ int *recursion = perf_swcounter_recursion_context(cpuctx); ++ ++ if (*recursion) ++ goto out; ++ ++ (*recursion)++; ++ barrier(); ++ ++ perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); ++ if (cpuctx->task_ctx) { ++ perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, ++ nr, nmi, regs); ++ } ++ ++ barrier(); ++ (*recursion)--; ++ ++out: ++ put_cpu_var(perf_cpu_context); ++} ++ ++void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) ++{ ++ __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); ++} ++ ++static void perf_swcounter_read(struct perf_counter *counter) ++{ ++ perf_swcounter_update(counter); ++} ++ ++static int perf_swcounter_enable(struct perf_counter *counter) ++{ ++ perf_swcounter_set_period(counter); ++ return 0; ++} ++ ++static void perf_swcounter_disable(struct perf_counter *counter) ++{ ++ perf_swcounter_update(counter); ++} ++ ++static const struct hw_perf_counter_ops perf_ops_generic = { ++ .enable = perf_swcounter_enable, ++ .disable = perf_swcounter_disable, ++ .read = perf_swcounter_read, ++}; ++ ++/* ++ * Software counter: cpu wall time clock ++ */ ++ ++static void 
cpu_clock_perf_counter_update(struct perf_counter *counter) ++{ ++ int cpu = raw_smp_processor_id(); ++ s64 prev; ++ u64 now; ++ ++ now = cpu_clock(cpu); ++ prev = atomic64_read(&counter->hw.prev_count); ++ atomic64_set(&counter->hw.prev_count, now); ++ atomic64_add(now - prev, &counter->count); ++} ++ ++static int cpu_clock_perf_counter_enable(struct perf_counter *counter) ++{ ++ struct hw_perf_counter *hwc = &counter->hw; ++ int cpu = raw_smp_processor_id(); ++ ++ atomic64_set(&hwc->prev_count, cpu_clock(cpu)); ++ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hwc->hrtimer.function = perf_swcounter_hrtimer; ++ if (hwc->irq_period) { ++ __hrtimer_start_range_ns(&hwc->hrtimer, ++ ns_to_ktime(hwc->irq_period), 0, ++ HRTIMER_MODE_REL, 0); ++ } ++ ++ return 0; ++} ++ ++static void cpu_clock_perf_counter_disable(struct perf_counter *counter) ++{ ++ hrtimer_cancel(&counter->hw.hrtimer); ++ cpu_clock_perf_counter_update(counter); ++} ++ ++static void cpu_clock_perf_counter_read(struct perf_counter *counter) ++{ ++ cpu_clock_perf_counter_update(counter); ++} ++ ++static const struct hw_perf_counter_ops perf_ops_cpu_clock = { ++ .enable = cpu_clock_perf_counter_enable, ++ .disable = cpu_clock_perf_counter_disable, ++ .read = cpu_clock_perf_counter_read, ++}; ++ ++/* ++ * Software counter: task time clock ++ */ ++ ++/* ++ * Called from within the scheduler: ++ */ ++static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) ++{ ++ struct task_struct *curr = counter->task; ++ u64 delta; ++ ++ delta = __task_delta_exec(curr, update); ++ ++ return curr->se.sum_exec_runtime + delta; ++} ++ ++static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) ++{ ++ u64 prev; ++ s64 delta; ++ ++ prev = atomic64_read(&counter->hw.prev_count); ++ ++ atomic64_set(&counter->hw.prev_count, now); ++ ++ delta = now - prev; ++ ++ atomic64_add(delta, &counter->count); ++} ++ ++static int task_clock_perf_counter_enable(struct perf_counter *counter) ++{ ++ struct hw_perf_counter *hwc = &counter->hw; ++ ++ atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0)); ++ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hwc->hrtimer.function = perf_swcounter_hrtimer; ++ if (hwc->irq_period) { ++ __hrtimer_start_range_ns(&hwc->hrtimer, ++ ns_to_ktime(hwc->irq_period), 0, ++ HRTIMER_MODE_REL, 0); ++ } ++ ++ return 0; ++} ++ ++static void task_clock_perf_counter_disable(struct perf_counter *counter) ++{ ++ hrtimer_cancel(&counter->hw.hrtimer); ++ task_clock_perf_counter_update(counter, ++ task_clock_perf_counter_val(counter, 0)); ++} ++ ++static void task_clock_perf_counter_read(struct perf_counter *counter) ++{ ++ task_clock_perf_counter_update(counter, ++ task_clock_perf_counter_val(counter, 1)); ++} ++ ++static const struct hw_perf_counter_ops perf_ops_task_clock = { ++ .enable = task_clock_perf_counter_enable, ++ .disable = task_clock_perf_counter_disable, ++ .read = task_clock_perf_counter_read, ++}; ++ ++/* ++ * Software counter: cpu migrations ++ */ ++ ++static inline u64 get_cpu_migrations(struct perf_counter *counter) ++{ ++ struct task_struct *curr = counter->ctx->task; ++ ++ if (curr) ++ return curr->se.nr_migrations; ++ return cpu_nr_migrations(smp_processor_id()); ++} ++ ++static void cpu_migrations_perf_counter_update(struct perf_counter *counter) ++{ ++ u64 prev, now; ++ s64 delta; ++ ++ prev = atomic64_read(&counter->hw.prev_count); ++ now = get_cpu_migrations(counter); ++ ++ atomic64_set(&counter->hw.prev_count, now); ++ 
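The clock counters above all follow the same idiom: cache the last raw reading in hw.prev_count and, on every update, fold the difference between "now" and that cache into the 64-bit count. A stripped-down, single-threaded sketch of the idiom (clock_ns() and struct swcounter are invented for illustration; the kernel's atomics and hrtimer resampling are left out):

#include <stdint.h>
#include <time.h>

struct swcounter {
    uint64_t prev;     /* last raw reading of the clock source */
    uint64_t count;    /* accumulated delta, i.e. the counter value */
};

static uint64_t clock_ns(void)              /* stand-in for cpu_clock() */
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static void counter_enable(struct swcounter *c)
{
    c->prev = clock_ns();                   /* snapshot, like ->prev_count */
}

static void counter_update(struct swcounter *c)
{
    uint64_t now = clock_ns();

    c->count += now - c->prev;              /* fold in the delta */
    c->prev   = now;                        /* re-arm for the next read */
}

The hrtimer in the patch simply invokes the counter's read/update path every irq_period nanoseconds so the accumulated value stays fresh between explicit reads.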
++ delta = now - prev; ++ ++ atomic64_add(delta, &counter->count); ++} ++ ++static void cpu_migrations_perf_counter_read(struct perf_counter *counter) ++{ ++ cpu_migrations_perf_counter_update(counter); ++} ++ ++static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) ++{ ++ if (counter->prev_state <= PERF_COUNTER_STATE_OFF) ++ atomic64_set(&counter->hw.prev_count, ++ get_cpu_migrations(counter)); ++ return 0; ++} ++ ++static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) ++{ ++ cpu_migrations_perf_counter_update(counter); ++} ++ ++static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { ++ .enable = cpu_migrations_perf_counter_enable, ++ .disable = cpu_migrations_perf_counter_disable, ++ .read = cpu_migrations_perf_counter_read, ++}; ++ ++#ifdef CONFIG_EVENT_PROFILE ++void perf_tpcounter_event(int event_id) ++{ ++ struct pt_regs *regs = get_irq_regs(); ++ ++ if (!regs) ++ regs = task_pt_regs(current); ++ ++ __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); ++} ++ ++extern int ftrace_profile_enable(int); ++extern void ftrace_profile_disable(int); ++ ++static void tp_perf_counter_destroy(struct perf_counter *counter) ++{ ++ ftrace_profile_disable(perf_event_id(&counter->hw_event)); ++} ++ ++static const struct hw_perf_counter_ops * ++tp_perf_counter_init(struct perf_counter *counter) ++{ ++ int event_id = perf_event_id(&counter->hw_event); ++ int ret; ++ ++ ret = ftrace_profile_enable(event_id); ++ if (ret) ++ return NULL; ++ ++ counter->destroy = tp_perf_counter_destroy; ++ counter->hw.irq_period = counter->hw_event.irq_period; ++ ++ return &perf_ops_generic; ++} ++#else ++static const struct hw_perf_counter_ops * ++tp_perf_counter_init(struct perf_counter *counter) ++{ ++ return NULL; ++} ++#endif ++ ++static const struct hw_perf_counter_ops * ++sw_perf_counter_init(struct perf_counter *counter) ++{ ++ struct perf_counter_hw_event *hw_event = &counter->hw_event; ++ const struct hw_perf_counter_ops *hw_ops = NULL; ++ struct hw_perf_counter *hwc = &counter->hw; ++ ++ /* ++ * Software counters (currently) can't in general distinguish ++ * between user, kernel and hypervisor events. ++ * However, context switches and cpu migrations are considered ++ * to be kernel events, and page faults are never hypervisor ++ * events. ++ */ ++ switch (perf_event_id(&counter->hw_event)) { ++ case PERF_COUNT_CPU_CLOCK: ++ hw_ops = &perf_ops_cpu_clock; ++ ++ if (hw_event->irq_period && hw_event->irq_period < 10000) ++ hw_event->irq_period = 10000; ++ break; ++ case PERF_COUNT_TASK_CLOCK: ++ /* ++ * If the user instantiates this as a per-cpu counter, ++ * use the cpu_clock counter instead. 
++ */ ++ if (counter->ctx->task) ++ hw_ops = &perf_ops_task_clock; ++ else ++ hw_ops = &perf_ops_cpu_clock; ++ ++ if (hw_event->irq_period && hw_event->irq_period < 10000) ++ hw_event->irq_period = 10000; ++ break; ++ case PERF_COUNT_PAGE_FAULTS: ++ case PERF_COUNT_PAGE_FAULTS_MIN: ++ case PERF_COUNT_PAGE_FAULTS_MAJ: ++ case PERF_COUNT_CONTEXT_SWITCHES: ++ hw_ops = &perf_ops_generic; ++ break; ++ case PERF_COUNT_CPU_MIGRATIONS: ++ if (!counter->hw_event.exclude_kernel) ++ hw_ops = &perf_ops_cpu_migrations; ++ break; ++ } ++ ++ if (hw_ops) ++ hwc->irq_period = hw_event->irq_period; ++ ++ return hw_ops; ++} ++ ++/* ++ * Allocate and initialize a counter structure ++ */ ++static struct perf_counter * ++perf_counter_alloc(struct perf_counter_hw_event *hw_event, ++ int cpu, ++ struct perf_counter_context *ctx, ++ struct perf_counter *group_leader, ++ gfp_t gfpflags) ++{ ++ const struct hw_perf_counter_ops *hw_ops; ++ struct perf_counter *counter; ++ ++ counter = kzalloc(sizeof(*counter), gfpflags); ++ if (!counter) ++ return NULL; ++ ++ /* ++ * Single counters are their own group leaders, with an ++ * empty sibling list: ++ */ ++ if (!group_leader) ++ group_leader = counter; ++ ++ mutex_init(&counter->mutex); ++ INIT_LIST_HEAD(&counter->list_entry); ++ INIT_LIST_HEAD(&counter->event_entry); ++ INIT_LIST_HEAD(&counter->sibling_list); ++ init_waitqueue_head(&counter->waitq); ++ ++ mutex_init(&counter->mmap_mutex); ++ ++ INIT_LIST_HEAD(&counter->child_list); ++ ++ counter->cpu = cpu; ++ counter->hw_event = *hw_event; ++ counter->wakeup_pending = 0; ++ counter->group_leader = group_leader; ++ counter->hw_ops = NULL; ++ counter->ctx = ctx; ++ ++ counter->state = PERF_COUNTER_STATE_INACTIVE; ++ if (hw_event->disabled) ++ counter->state = PERF_COUNTER_STATE_OFF; ++ ++ hw_ops = NULL; ++ ++ if (perf_event_raw(hw_event)) { ++ hw_ops = hw_perf_counter_init(counter); ++ goto done; ++ } ++ ++ switch (perf_event_type(hw_event)) { ++ case PERF_TYPE_HARDWARE: ++ hw_ops = hw_perf_counter_init(counter); ++ break; ++ ++ case PERF_TYPE_SOFTWARE: ++ hw_ops = sw_perf_counter_init(counter); ++ break; ++ ++ case PERF_TYPE_TRACEPOINT: ++ hw_ops = tp_perf_counter_init(counter); ++ break; ++ } ++ ++ if (!hw_ops) { ++ kfree(counter); ++ return NULL; ++ } ++done: ++ counter->hw_ops = hw_ops; ++ ++ return counter; ++} ++ ++/** ++ * sys_perf_counter_open - open a performance counter, associate it to a task/cpu ++ * ++ * @hw_event_uptr: event type attributes for monitoring/sampling ++ * @pid: target pid ++ * @cpu: target cpu ++ * @group_fd: group leader counter fd ++ */ ++SYSCALL_DEFINE5(perf_counter_open, ++ const struct perf_counter_hw_event __user *, hw_event_uptr, ++ pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) ++{ ++ struct perf_counter *counter, *group_leader; ++ struct perf_counter_hw_event hw_event; ++ struct perf_counter_context *ctx; ++ struct file *counter_file = NULL; ++ struct file *group_file = NULL; ++ int fput_needed = 0; ++ int fput_needed2 = 0; ++ int ret; ++ ++ /* for future expandability... 
*/ ++ if (flags) ++ return -EINVAL; ++ ++ if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) ++ return -EFAULT; ++ ++ /* ++ * Get the target context (task or percpu): ++ */ ++ ctx = find_get_context(pid, cpu); ++ if (IS_ERR(ctx)) ++ return PTR_ERR(ctx); ++ ++ /* ++ * Look up the group leader (we will attach this counter to it): ++ */ ++ group_leader = NULL; ++ if (group_fd != -1) { ++ ret = -EINVAL; ++ group_file = fget_light(group_fd, &fput_needed); ++ if (!group_file) ++ goto err_put_context; ++ if (group_file->f_op != &perf_fops) ++ goto err_put_context; ++ ++ group_leader = group_file->private_data; ++ /* ++ * Do not allow a recursive hierarchy (this new sibling ++ * becoming part of another group-sibling): ++ */ ++ if (group_leader->group_leader != group_leader) ++ goto err_put_context; ++ /* ++ * Do not allow to attach to a group in a different ++ * task or CPU context: ++ */ ++ if (group_leader->ctx != ctx) ++ goto err_put_context; ++ /* ++ * Only a group leader can be exclusive or pinned ++ */ ++ if (hw_event.exclusive || hw_event.pinned) ++ goto err_put_context; ++ } ++ ++ ret = -EINVAL; ++ counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, ++ GFP_KERNEL); ++ if (!counter) ++ goto err_put_context; ++ ++ ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); ++ if (ret < 0) ++ goto err_free_put_context; ++ ++ counter_file = fget_light(ret, &fput_needed2); ++ if (!counter_file) ++ goto err_free_put_context; ++ ++ counter->filp = counter_file; ++ mutex_lock(&ctx->mutex); ++ perf_install_in_context(ctx, counter, cpu); ++ mutex_unlock(&ctx->mutex); ++ ++ fput_light(counter_file, fput_needed2); ++ ++out_fput: ++ fput_light(group_file, fput_needed); ++ ++ return ret; ++ ++err_free_put_context: ++ kfree(counter); ++ ++err_put_context: ++ put_context(ctx); ++ ++ goto out_fput; ++} ++ ++/* ++ * Initialize the perf_counter context in a task_struct: ++ */ ++static void ++__perf_counter_init_context(struct perf_counter_context *ctx, ++ struct task_struct *task) ++{ ++ memset(ctx, 0, sizeof(*ctx)); ++ spin_lock_init(&ctx->lock); ++ mutex_init(&ctx->mutex); ++ INIT_LIST_HEAD(&ctx->counter_list); ++ INIT_LIST_HEAD(&ctx->event_list); ++ ctx->task = task; ++} ++ ++/* ++ * inherit a counter from parent task to child task: ++ */ ++static struct perf_counter * ++inherit_counter(struct perf_counter *parent_counter, ++ struct task_struct *parent, ++ struct perf_counter_context *parent_ctx, ++ struct task_struct *child, ++ struct perf_counter *group_leader, ++ struct perf_counter_context *child_ctx) ++{ ++ struct perf_counter *child_counter; ++ ++ /* ++ * Instead of creating recursive hierarchies of counters, ++ * we link inherited counters back to the original parent, ++ * which has a filp for sure, which we use as the reference ++ * count: ++ */ ++ if (parent_counter->parent) ++ parent_counter = parent_counter->parent; ++ ++ child_counter = perf_counter_alloc(&parent_counter->hw_event, ++ parent_counter->cpu, child_ctx, ++ group_leader, GFP_KERNEL); ++ if (!child_counter) ++ return NULL; ++ ++ /* ++ * Link it up in the child's context: ++ */ ++ child_counter->task = child; ++ add_counter_to_ctx(child_counter, child_ctx); ++ ++ child_counter->parent = parent_counter; ++ /* ++ * inherit into child's child as well: ++ */ ++ child_counter->hw_event.inherit = 1; ++ ++ /* ++ * Get a reference to the parent filp - we will fput it ++ * when the child counter exits. 
This is safe to do because ++ * we are in the parent and we know that the filp still ++ * exists and has a nonzero count: ++ */ ++ atomic_long_inc(&parent_counter->filp->f_count); ++ ++ /* ++ * Link this into the parent counter's child list ++ */ ++ mutex_lock(&parent_counter->mutex); ++ list_add_tail(&child_counter->child_list, &parent_counter->child_list); ++ ++ /* ++ * Make the child state follow the state of the parent counter, ++ * not its hw_event.disabled bit. We hold the parent's mutex, ++ * so we won't race with perf_counter_{en,dis}able_family. ++ */ ++ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) ++ child_counter->state = PERF_COUNTER_STATE_INACTIVE; ++ else ++ child_counter->state = PERF_COUNTER_STATE_OFF; ++ ++ mutex_unlock(&parent_counter->mutex); ++ ++ return child_counter; ++} ++ ++static int inherit_group(struct perf_counter *parent_counter, ++ struct task_struct *parent, ++ struct perf_counter_context *parent_ctx, ++ struct task_struct *child, ++ struct perf_counter_context *child_ctx) ++{ ++ struct perf_counter *leader; ++ struct perf_counter *sub; ++ ++ leader = inherit_counter(parent_counter, parent, parent_ctx, ++ child, NULL, child_ctx); ++ if (!leader) ++ return -ENOMEM; ++ list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { ++ if (!inherit_counter(sub, parent, parent_ctx, ++ child, leader, child_ctx)) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++static void sync_child_counter(struct perf_counter *child_counter, ++ struct perf_counter *parent_counter) ++{ ++ u64 parent_val, child_val; ++ ++ parent_val = atomic64_read(&parent_counter->count); ++ child_val = atomic64_read(&child_counter->count); ++ ++ /* ++ * Add back the child's count to the parent's count: ++ */ ++ atomic64_add(child_val, &parent_counter->count); ++ atomic64_add(child_counter->total_time_enabled, ++ &parent_counter->child_total_time_enabled); ++ atomic64_add(child_counter->total_time_running, ++ &parent_counter->child_total_time_running); ++ ++ /* ++ * Remove this counter from the parent's list ++ */ ++ mutex_lock(&parent_counter->mutex); ++ list_del_init(&child_counter->child_list); ++ mutex_unlock(&parent_counter->mutex); ++ ++ /* ++ * Release the parent counter, if this was the last ++ * reference to it. ++ */ ++ fput(parent_counter->filp); ++} ++ ++static void ++__perf_counter_exit_task(struct task_struct *child, ++ struct perf_counter *child_counter, ++ struct perf_counter_context *child_ctx) ++{ ++ struct perf_counter *parent_counter; ++ struct perf_counter *sub, *tmp; ++ ++ /* ++ * If we do not self-reap then we have to wait for the ++ * child task to unschedule (it will happen for sure), ++ * so that its counter is at its final count. (This ++ * condition triggers rarely - child tasks usually get ++ * off their CPU before the parent has a chance to ++ * get this far into the reaping action) ++ */ ++ if (child != current) { ++ wait_task_inactive(child, 0); ++ list_del_init(&child_counter->list_entry); ++ update_counter_times(child_counter); ++ } else { ++ struct perf_cpu_context *cpuctx; ++ unsigned long flags; ++ u64 perf_flags; ++ ++ /* ++ * Disable and unlink this counter. 
++ * ++ * Be careful about zapping the list - IRQ/NMI context ++ * could still be processing it: ++ */ ++ curr_rq_lock_irq_save(&flags); ++ perf_flags = hw_perf_save_disable(); ++ ++ cpuctx = &__get_cpu_var(perf_cpu_context); ++ ++ group_sched_out(child_counter, cpuctx, child_ctx); ++ update_counter_times(child_counter); ++ ++ list_del_init(&child_counter->list_entry); ++ ++ child_ctx->nr_counters--; ++ ++ hw_perf_restore(perf_flags); ++ curr_rq_unlock_irq_restore(&flags); ++ } ++ ++ parent_counter = child_counter->parent; ++ /* ++ * It can happen that parent exits first, and has counters ++ * that are still around due to the child reference. These ++ * counters need to be zapped - but otherwise linger. ++ */ ++ if (parent_counter) { ++ sync_child_counter(child_counter, parent_counter); ++ list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, ++ list_entry) { ++ if (sub->parent) { ++ sync_child_counter(sub, sub->parent); ++ free_counter(sub); ++ } ++ } ++ free_counter(child_counter); ++ } ++} ++ ++/* ++ * When a child task exits, feed back counter values to parent counters. ++ * ++ * Note: we may be running in child context, but the PID is not hashed ++ * anymore so new counters will not be added. ++ */ ++void perf_counter_exit_task(struct task_struct *child) ++{ ++ struct perf_counter *child_counter, *tmp; ++ struct perf_counter_context *child_ctx; ++ ++ child_ctx = &child->perf_counter_ctx; ++ ++ if (likely(!child_ctx->nr_counters)) ++ return; ++ ++ list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, ++ list_entry) ++ __perf_counter_exit_task(child, child_counter, child_ctx); ++} ++ ++/* ++ * Initialize the perf_counter context in task_struct ++ */ ++void perf_counter_init_task(struct task_struct *child) ++{ ++ struct perf_counter_context *child_ctx, *parent_ctx; ++ struct perf_counter *counter; ++ struct task_struct *parent = current; ++ ++ child_ctx = &child->perf_counter_ctx; ++ parent_ctx = &parent->perf_counter_ctx; ++ ++ __perf_counter_init_context(child_ctx, child); ++ ++ /* ++ * This is executed from the parent task context, so inherit ++ * counters that have been marked for cloning: ++ */ ++ ++ if (likely(!parent_ctx->nr_counters)) ++ return; ++ ++ /* ++ * Lock the parent list. No need to lock the child - not PID ++ * hashed yet and not running, so nobody can access it. 
++ */ ++ mutex_lock(&parent_ctx->mutex); ++ ++ /* ++ * We dont have to disable NMIs - we are only looking at ++ * the list, not manipulating it: ++ */ ++ list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { ++ if (!counter->hw_event.inherit) ++ continue; ++ ++ if (inherit_group(counter, parent, ++ parent_ctx, child, child_ctx)) ++ break; ++ } ++ ++ mutex_unlock(&parent_ctx->mutex); ++} ++ ++static void __cpuinit perf_counter_init_cpu(int cpu) ++{ ++ struct perf_cpu_context *cpuctx; ++ ++ cpuctx = &per_cpu(perf_cpu_context, cpu); ++ __perf_counter_init_context(&cpuctx->ctx, NULL); ++ ++ mutex_lock(&perf_resource_mutex); ++ cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; ++ mutex_unlock(&perf_resource_mutex); ++ ++ hw_perf_counter_setup(cpu); ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void __perf_counter_exit_cpu(void *info) ++{ ++ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); ++ struct perf_counter_context *ctx = &cpuctx->ctx; ++ struct perf_counter *counter, *tmp; ++ ++ list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) ++ __perf_counter_remove_from_context(counter); ++} ++static void perf_counter_exit_cpu(int cpu) ++{ ++ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); ++ struct perf_counter_context *ctx = &cpuctx->ctx; ++ ++ mutex_lock(&ctx->mutex); ++ smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); ++ mutex_unlock(&ctx->mutex); ++} ++#else ++static inline void perf_counter_exit_cpu(int cpu) { } ++#endif ++ ++static int __cpuinit ++perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) ++{ ++ unsigned int cpu = (long)hcpu; ++ ++ switch (action) { ++ ++ case CPU_UP_PREPARE: ++ case CPU_UP_PREPARE_FROZEN: ++ perf_counter_init_cpu(cpu); ++ break; ++ ++ case CPU_DOWN_PREPARE: ++ case CPU_DOWN_PREPARE_FROZEN: ++ perf_counter_exit_cpu(cpu); ++ break; ++ ++ default: ++ break; ++ } ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block __cpuinitdata perf_cpu_nb = { ++ .notifier_call = perf_cpu_notify, ++}; ++ ++static int __init perf_counter_init(void) ++{ ++ perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, ++ (void *)(long)smp_processor_id()); ++ register_cpu_notifier(&perf_cpu_nb); ++ ++ return 0; ++} ++early_initcall(perf_counter_init); ++ ++static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) ++{ ++ return sprintf(buf, "%d\n", perf_reserved_percpu); ++} ++ ++static ssize_t ++perf_set_reserve_percpu(struct sysdev_class *class, ++ const char *buf, ++ size_t count) ++{ ++ struct perf_cpu_context *cpuctx; ++ unsigned long val; ++ int err, cpu, mpt; ++ ++ err = strict_strtoul(buf, 10, &val); ++ if (err) ++ return err; ++ if (val > perf_max_counters) ++ return -EINVAL; ++ ++ mutex_lock(&perf_resource_mutex); ++ perf_reserved_percpu = val; ++ for_each_online_cpu(cpu) { ++ cpuctx = &per_cpu(perf_cpu_context, cpu); ++ spin_lock_irq(&cpuctx->ctx.lock); ++ mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, ++ perf_max_counters - perf_reserved_percpu); ++ cpuctx->max_pertask = mpt; ++ spin_unlock_irq(&cpuctx->ctx.lock); ++ } ++ mutex_unlock(&perf_resource_mutex); ++ ++ return count; ++} ++ ++static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) ++{ ++ return sprintf(buf, "%d\n", perf_overcommit); ++} ++ ++static ssize_t ++perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) ++{ ++ unsigned long val; ++ int err; ++ ++ err = strict_strtoul(buf, 10, &val); ++ if (err) ++ return 
err; ++ if (val > 1) ++ return -EINVAL; ++ ++ mutex_lock(&perf_resource_mutex); ++ perf_overcommit = val; ++ mutex_unlock(&perf_resource_mutex); ++ ++ return count; ++} ++ ++static SYSDEV_CLASS_ATTR( ++ reserve_percpu, ++ 0644, ++ perf_show_reserve_percpu, ++ perf_set_reserve_percpu ++ ); ++ ++static SYSDEV_CLASS_ATTR( ++ overcommit, ++ 0644, ++ perf_show_overcommit, ++ perf_set_overcommit ++ ); ++ ++static struct attribute *perfclass_attrs[] = { ++ &attr_reserve_percpu.attr, ++ &attr_overcommit.attr, ++ NULL ++}; ++ ++static struct attribute_group perfclass_attr_group = { ++ .attrs = perfclass_attrs, ++ .name = "perf_counters", ++}; ++ ++static int __init perf_counter_sysfs_init(void) ++{ ++ return sysfs_create_group(&cpu_sysdev_class.kset.kobj, ++ &perfclass_attr_group); ++} ++device_initcall(perf_counter_sysfs_init); +Index: linux-2.6-tip/kernel/posix-cpu-timers.c +=================================================================== +--- linux-2.6-tip.orig/kernel/posix-cpu-timers.c ++++ linux-2.6-tip/kernel/posix-cpu-timers.c +@@ -558,7 +558,7 @@ static void arm_timer(struct k_itimer *t + p->cpu_timers : p->signal->cpu_timers); + head += CPUCLOCK_WHICH(timer->it_clock); + +- BUG_ON(!irqs_disabled()); ++ BUG_ON_NONRT(!irqs_disabled()); + spin_lock(&p->sighand->siglock); + + listpos = head; +@@ -746,7 +746,7 @@ int posix_cpu_timer_set(struct k_itimer + /* + * Disarm any old timer after extracting its expiry time. + */ +- BUG_ON(!irqs_disabled()); ++ BUG_ON_NONRT(!irqs_disabled()); + + ret = 0; + spin_lock(&p->sighand->siglock); +@@ -1371,7 +1371,8 @@ static inline int fastpath_timer_check(s + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } +- return 0; ++ ++ return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; + } + + /* +@@ -1379,12 +1380,11 @@ static inline int fastpath_timer_check(s + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. 
+ */ +-void run_posix_cpu_timers(struct task_struct *tsk) ++void __run_posix_cpu_timers(struct task_struct *tsk) + { + LIST_HEAD(firing); + struct k_itimer *timer, *next; + +- BUG_ON(!irqs_disabled()); + + /* + * The fast path checks that there are no expired thread or thread +@@ -1436,6 +1436,177 @@ void run_posix_cpu_timers(struct task_st + } + } + ++#include ++#include ++DEFINE_PER_CPU(struct task_struct *, posix_timer_task); ++DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); ++ ++static int posix_cpu_timers_thread(void *data) ++{ ++ int cpu = (long)data; ++ ++ BUG_ON(per_cpu(posix_timer_task,cpu) != current); ++ ++ while (!kthread_should_stop()) { ++ struct task_struct *tsk = NULL; ++ struct task_struct *next = NULL; ++ ++ if (cpu_is_offline(cpu)) ++ goto wait_to_die; ++ ++ /* grab task list */ ++ raw_local_irq_disable(); ++ tsk = per_cpu(posix_timer_tasklist, cpu); ++ per_cpu(posix_timer_tasklist, cpu) = NULL; ++ raw_local_irq_enable(); ++ ++ /* its possible the list is empty, just return */ ++ if (!tsk) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ continue; ++ } ++ ++ /* Process task list */ ++ while (1) { ++ /* save next */ ++ next = tsk->posix_timer_list; ++ ++ /* run the task timers, clear its ptr and ++ * unreference it ++ */ ++ __run_posix_cpu_timers(tsk); ++ tsk->posix_timer_list = NULL; ++ put_task_struct(tsk); ++ ++ /* check if this is the last on the list */ ++ if (next == tsk) ++ break; ++ tsk = next; ++ } ++ } ++ return 0; ++ ++wait_to_die: ++ /* Wait for kthread_stop */ ++ set_current_state(TASK_INTERRUPTIBLE); ++ while (!kthread_should_stop()) { ++ schedule(); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ __set_current_state(TASK_RUNNING); ++ return 0; ++} ++ ++static inline int __fastpath_timer_check(struct task_struct *tsk) ++{ ++ /* tsk == current, ensure it is safe to use ->signal/sighand */ ++ if (unlikely(tsk->exit_state)) ++ return 0; ++ ++ if (!task_cputime_zero(&tsk->cputime_expires)) ++ return 1; ++ ++ if (!task_cputime_zero(&tsk->signal->cputime_expires)) ++ return 1; ++ ++ return 0; ++} ++ ++void run_posix_cpu_timers(struct task_struct *tsk) ++{ ++ unsigned long cpu = smp_processor_id(); ++ struct task_struct *tasklist; ++ ++ BUG_ON(!irqs_disabled()); ++ if(!per_cpu(posix_timer_task, cpu)) ++ return; ++ /* get per-cpu references */ ++ tasklist = per_cpu(posix_timer_tasklist, cpu); ++ ++ /* check to see if we're already queued */ ++ if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { ++ get_task_struct(tsk); ++ if (tasklist) { ++ tsk->posix_timer_list = tasklist; ++ } else { ++ /* ++ * The list is terminated by a self-pointing ++ * task_struct ++ */ ++ tsk->posix_timer_list = tsk; ++ } ++ per_cpu(posix_timer_tasklist, cpu) = tsk; ++ ++ wake_up_process(per_cpu(posix_timer_task, cpu)); ++ } ++} ++ ++/* ++ * posix_cpu_thread_call - callback that gets triggered when a CPU is added. ++ * Here we can start up the necessary migration thread for the new CPU. 
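run_posix_cpu_timers() above packs two states into one pointer: a NULL posix_timer_list means "not queued", and the tail of the per-CPU list is marked by a node that points to itself. A generic, single-threaded sketch of that self-terminated list (struct work_item, work_push and work_drain are invented names; the kernel additionally disables interrupts around the enqueue and the list swap):

#include <stddef.h>

struct work_item {
    struct work_item *next;   /* NULL = not queued, self = tail of list */
    int payload;
};

/* Push onto a LIFO list; the first element ever pushed points to itself. */
static void work_push(struct work_item **head, struct work_item *item)
{
    if (item->next)           /* already queued */
        return;
    item->next = *head ? *head : item;
    *head = item;
}

/* Detach and process the whole list, as the per-CPU kthread does. */
static void work_drain(struct work_item **head,
                       void (*fn)(struct work_item *))
{
    struct work_item *item = *head;

    *head = NULL;
    while (item) {
        struct work_item *next = item->next;

        item->next = NULL;    /* mark "not queued" again */
        fn(item);
        if (next == item)     /* self-pointer: that was the tail */
            break;
        item = next;
    }
}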
++ */ ++static int posix_cpu_thread_call(struct notifier_block *nfb, ++ unsigned long action, void *hcpu) ++{ ++ int cpu = (long)hcpu; ++ struct task_struct *p; ++ struct sched_param param; ++ ++ switch (action) { ++ case CPU_UP_PREPARE: ++ p = kthread_create(posix_cpu_timers_thread, hcpu, ++ "posixcputmr/%d",cpu); ++ if (IS_ERR(p)) ++ return NOTIFY_BAD; ++ p->flags |= PF_NOFREEZE; ++ kthread_bind(p, cpu); ++ /* Must be high prio to avoid getting starved */ ++ param.sched_priority = MAX_RT_PRIO-1; ++ sched_setscheduler(p, SCHED_FIFO, ¶m); ++ per_cpu(posix_timer_task,cpu) = p; ++ break; ++ case CPU_ONLINE: ++ /* Strictly unneccessary, as first user will wake it. */ ++ wake_up_process(per_cpu(posix_timer_task,cpu)); ++ break; ++#ifdef CONFIG_HOTPLUG_CPU ++ case CPU_UP_CANCELED: ++ /* Unbind it from offline cpu so it can run. Fall thru. */ ++ kthread_bind(per_cpu(posix_timer_task,cpu), ++ any_online_cpu(cpu_online_map)); ++ kthread_stop(per_cpu(posix_timer_task,cpu)); ++ per_cpu(posix_timer_task,cpu) = NULL; ++ break; ++ case CPU_DEAD: ++ kthread_stop(per_cpu(posix_timer_task,cpu)); ++ per_cpu(posix_timer_task,cpu) = NULL; ++ break; ++#endif ++ } ++ return NOTIFY_OK; ++} ++ ++/* Register at highest priority so that task migration (migrate_all_tasks) ++ * happens before everything else. ++ */ ++static struct notifier_block __devinitdata posix_cpu_thread_notifier = { ++ .notifier_call = posix_cpu_thread_call, ++ .priority = 10 ++}; ++ ++static int __init posix_cpu_thread_init(void) ++{ ++ void *cpu = (void *)(long)smp_processor_id(); ++ /* Start one for boot CPU. */ ++ posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu); ++ posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu); ++ register_cpu_notifier(&posix_cpu_thread_notifier); ++ return 0; ++} ++early_initcall(posix_cpu_thread_init); ++ + /* + * Set one of the process-wide special case CPU timers. + * The tsk->sighand->siglock must be held by the caller. +@@ -1701,6 +1872,12 @@ static __init int init_posix_cpu_timers( + .nsleep = thread_cpu_nsleep, + .nsleep_restart = thread_cpu_nsleep_restart, + }; ++ unsigned long cpu; ++ ++ /* init the per-cpu posix_timer_tasklets */ ++ for_each_cpu_mask(cpu, cpu_possible_map) { ++ per_cpu(posix_timer_tasklist, cpu) = NULL; ++ } + + register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); +Index: linux-2.6-tip/kernel/power/snapshot.c +=================================================================== +--- linux-2.6-tip.orig/kernel/power/snapshot.c ++++ linux-2.6-tip/kernel/power/snapshot.c +@@ -486,8 +486,8 @@ static int memory_bm_find_bit(struct mem + + static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) + { +- void *addr; +- unsigned int bit; ++ unsigned int bit = 0; ++ void *addr = NULL; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); +@@ -520,8 +520,8 @@ static void memory_bm_clear_bit(struct m + + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) + { +- void *addr; +- unsigned int bit; ++ unsigned int bit = 0; ++ void *addr = NULL; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); +Index: linux-2.6-tip/kernel/printk.c +=================================================================== +--- linux-2.6-tip.orig/kernel/printk.c ++++ linux-2.6-tip/kernel/printk.c +@@ -91,7 +91,7 @@ static int console_locked, console_suspe + * It is also used in interesting ways to provide interlocking in + * release_console_sem(). 
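The posixcputmr/%d kthreads created just above are bound to their CPU and promoted to SCHED_FIFO at MAX_RT_PRIO-1 so the timer work cannot be starved by the very tasks whose timers it runs. A plain-userspace analogue of handing a worker thread a real-time FIFO priority might look like this (spawn_fifo_worker and timer_worker are hypothetical; a real caller needs CAP_SYS_NICE or a suitable RLIMIT_RTPRIO):

#include <pthread.h>
#include <sched.h>

static void *timer_worker(void *arg)
{
    (void)arg;
    /* ... process expired timers, then sleep until woken ... */
    return NULL;
}

static int spawn_fifo_worker(pthread_t *tid, int prio)
{
    pthread_attr_t attr;
    struct sched_param sp = { .sched_priority = prio };
    int err;

    pthread_attr_init(&attr);
    /* Use an explicit policy instead of inheriting the creator's. */
    pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
    pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
    pthread_attr_setschedparam(&attr, &sp);

    err = pthread_create(tid, &attr, timer_worker, NULL);
    pthread_attr_destroy(&attr);
    return err;   /* typically EPERM without the right rtprio limits */
}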
+ */ +-static DEFINE_SPINLOCK(logbuf_lock); ++static DEFINE_RAW_SPINLOCK(logbuf_lock); + + #define LOG_BUF_MASK (log_buf_len-1) + #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) +@@ -395,9 +395,13 @@ static void __call_console_drivers(unsig + + for (con = console_drivers; con; con = con->next) { + if ((con->flags & CON_ENABLED) && con->write && +- (cpu_online(smp_processor_id()) || +- (con->flags & CON_ANYTIME))) ++ console_atomic_safe(con) && ++ (cpu_online(raw_smp_processor_id()) || ++ (con->flags & CON_ANYTIME))) { ++ set_printk_might_sleep(1); + con->write(con, &LOG_BUF(start), end - start); ++ set_printk_might_sleep(0); ++ } + } + } + +@@ -511,6 +515,7 @@ static void zap_locks(void) + spin_lock_init(&logbuf_lock); + /* And make sure that we print immediately */ + init_MUTEX(&console_sem); ++ zap_rt_locks(); + } + + #if defined(CONFIG_PRINTK_TIME) +@@ -592,7 +597,8 @@ static inline int can_use_console(unsign + * interrupts disabled. It should return with 'lockbuf_lock' + * released but interrupts still disabled. + */ +-static int acquire_console_semaphore_for_printk(unsigned int cpu) ++static int acquire_console_semaphore_for_printk(unsigned int cpu, ++ unsigned long flags) + { + int retval = 0; + +@@ -613,6 +619,8 @@ static int acquire_console_semaphore_for + } + printk_cpu = UINT_MAX; + spin_unlock(&logbuf_lock); ++ lockdep_on(); ++ local_irq_restore(flags); + return retval; + } + static const char recursion_bug_msg [] = +@@ -634,7 +642,7 @@ asmlinkage int vprintk(const char *fmt, + preempt_disable(); + /* This stops the holder of console_sem just where we want him */ + raw_local_irq_save(flags); +- this_cpu = smp_processor_id(); ++ this_cpu = raw_smp_processor_id(); + + /* + * Ouch, printk recursed into itself! +@@ -649,7 +657,8 @@ asmlinkage int vprintk(const char *fmt, + */ + if (!oops_in_progress) { + recursion_bug = 1; +- goto out_restore_irqs; ++ raw_local_irq_restore(flags); ++ goto out; + } + zap_locks(); + } +@@ -657,6 +666,7 @@ asmlinkage int vprintk(const char *fmt, + lockdep_off(); + spin_lock(&logbuf_lock); + printk_cpu = this_cpu; ++ preempt_enable(); + + if (recursion_bug) { + recursion_bug = 0; +@@ -726,14 +736,10 @@ asmlinkage int vprintk(const char *fmt, + * will release 'logbuf_lock' regardless of whether it + * actually gets the semaphore or not. + */ +- if (acquire_console_semaphore_for_printk(this_cpu)) ++ if (acquire_console_semaphore_for_printk(this_cpu, flags)) + release_console_sem(); + +- lockdep_on(); +-out_restore_irqs: +- raw_local_irq_restore(flags); +- +- preempt_enable(); ++out: + return printed_len; + } + EXPORT_SYMBOL(printk); +@@ -996,15 +1002,35 @@ void release_console_sem(void) + _con_start = con_start; + _log_end = log_end; + con_start = log_end; /* Flush */ ++ /* ++ * on PREEMPT_RT, call console drivers with ++ * interrupts enabled (if printk was called ++ * with interrupts disabled): ++ */ ++#ifdef CONFIG_PREEMPT_RT ++ spin_unlock_irqrestore(&logbuf_lock, flags); ++#else + spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ ++#endif + call_console_drivers(_con_start, _log_end); + start_critical_timings(); ++#ifndef CONFIG_PREEMPT_RT + local_irq_restore(flags); ++#endif + } + console_locked = 0; +- up(&console_sem); + spin_unlock_irqrestore(&logbuf_lock, flags); ++ up(&console_sem); ++ /* ++ * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd ++ * up only if we are in a preemptible section. We normally dont ++ * printk from non-preemptible sections so this is for the emergency ++ * case only. 
++ */ ++#ifdef CONFIG_PREEMPT_RT ++ if (!in_atomic() && !irqs_disabled()) ++#endif + if (wake_klogd) + wake_up_klogd(); + } +@@ -1280,6 +1306,23 @@ int printk_ratelimit(void) + } + EXPORT_SYMBOL(printk_ratelimit); + ++static DEFINE_RAW_SPINLOCK(warn_lock); ++ ++void __WARN_ON(const char *func, const char *file, const int line) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&warn_lock, flags); ++ printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n", ++ current->comm, current->pid, raw_smp_processor_id(), ++ func, file, line); ++ dump_stack(); ++ spin_unlock_irqrestore(&warn_lock, flags); ++} ++ ++EXPORT_SYMBOL(__WARN_ON); ++ ++ + /** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state +@@ -1292,8 +1335,11 @@ EXPORT_SYMBOL(printk_ratelimit); + bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msecs) + { +- if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { +- *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); ++ if (*caller_jiffies == 0 ++ || !time_in_range(jiffies, *caller_jiffies, ++ *caller_jiffies ++ + msecs_to_jiffies(interval_msecs))) { ++ *caller_jiffies = jiffies; + return true; + } + return false; +Index: linux-2.6-tip/kernel/profile.c +=================================================================== +--- linux-2.6-tip.orig/kernel/profile.c ++++ linux-2.6-tip/kernel/profile.c +@@ -263,6 +263,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook) + * + * -- wli + */ ++#ifdef CONFIG_PROC_FS + static void __profile_flip_buffers(void *unused) + { + int cpu = smp_processor_id(); +@@ -308,57 +309,6 @@ static void profile_discard_flip_buffers + mutex_unlock(&profile_flip_mutex); + } + +-void profile_hits(int type, void *__pc, unsigned int nr_hits) +-{ +- unsigned long primary, secondary, flags, pc = (unsigned long)__pc; +- int i, j, cpu; +- struct profile_hit *hits; +- +- if (prof_on != type || !prof_buffer) +- return; +- pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); +- i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; +- secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; +- cpu = get_cpu(); +- hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; +- if (!hits) { +- put_cpu(); +- return; +- } +- /* +- * We buffer the global profiler buffer into a per-CPU +- * queue and thus reduce the number of global (and possibly +- * NUMA-alien) accesses. 
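The profile_hits() implementation that this hunk relocates keeps a small per-CPU write queue and only touches the shared histogram when the queue cannot absorb another hit, which is what the "self-coalescing" comment refers to. A much-simplified, single-CPU sketch of the same idea, using linear probing instead of the kernel's grouped primary/secondary probe sequence (all names are invented):

#define GLOBAL_SLOTS 1024u
#define QUEUE_SLOTS  16u       /* per-CPU write queue, small on purpose */

struct hit { unsigned long pc; unsigned int hits; };

static unsigned int global_buf[GLOBAL_SLOTS];   /* shared histogram */
static struct hit queue[QUEUE_SLOTS];           /* "per-CPU" queue  */

static void flush_queue(void)
{
    for (unsigned int i = 0; i < QUEUE_SLOTS; i++) {
        if (queue[i].hits)
            global_buf[queue[i].pc % GLOBAL_SLOTS] += queue[i].hits;
        queue[i].pc = 0;
        queue[i].hits = 0;
    }
}

/* Coalesce repeated hits on the same pc locally; touch the shared
 * histogram only when no free or matching local slot remains. */
static void record_hit(unsigned long pc, unsigned int nr_hits)
{
    for (unsigned int i = 0; i < QUEUE_SLOTS; i++) {
        unsigned int slot = (pc + i) % QUEUE_SLOTS;   /* linear probe */

        if (queue[slot].hits && queue[slot].pc == pc) {
            queue[slot].hits += nr_hits;              /* coalesce */
            return;
        }
        if (!queue[slot].hits) {
            queue[slot].pc = pc;                      /* claim empty slot */
            queue[slot].hits = nr_hits;
            return;
        }
    }
    flush_queue();                                    /* no room: drain */
    global_buf[pc % GLOBAL_SLOTS] += nr_hits;
}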
The write-queue is self-coalescing: +- */ +- local_irq_save(flags); +- do { +- for (j = 0; j < PROFILE_GRPSZ; ++j) { +- if (hits[i + j].pc == pc) { +- hits[i + j].hits += nr_hits; +- goto out; +- } else if (!hits[i + j].hits) { +- hits[i + j].pc = pc; +- hits[i + j].hits = nr_hits; +- goto out; +- } +- } +- i = (i + secondary) & (NR_PROFILE_HIT - 1); +- } while (i != primary); +- +- /* +- * Add the current hit(s) and flush the write-queue out +- * to the global buffer: +- */ +- atomic_add(nr_hits, &prof_buffer[pc]); +- for (i = 0; i < NR_PROFILE_HIT; ++i) { +- atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); +- hits[i].pc = hits[i].hits = 0; +- } +-out: +- local_irq_restore(flags); +- put_cpu(); +-} +- + static int __cpuinit profile_cpu_callback(struct notifier_block *info, + unsigned long action, void *__cpu) + { +@@ -417,6 +367,60 @@ out_free: + } + return NOTIFY_OK; + } ++#endif /* CONFIG_PROC_FS */ ++ ++void profile_hits(int type, void *__pc, unsigned int nr_hits) ++{ ++ unsigned long primary, secondary, flags, pc = (unsigned long)__pc; ++ int i, j, cpu; ++ struct profile_hit *hits; ++ ++ if (prof_on != type || !prof_buffer) ++ return; ++ pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); ++ i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; ++ secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; ++ cpu = get_cpu(); ++ hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; ++ if (!hits) { ++ put_cpu(); ++ return; ++ } ++ /* ++ * We buffer the global profiler buffer into a per-CPU ++ * queue and thus reduce the number of global (and possibly ++ * NUMA-alien) accesses. The write-queue is self-coalescing: ++ */ ++ local_irq_save(flags); ++ do { ++ for (j = 0; j < PROFILE_GRPSZ; ++j) { ++ if (hits[i + j].pc == pc) { ++ hits[i + j].hits += nr_hits; ++ goto out; ++ } else if (!hits[i + j].hits) { ++ hits[i + j].pc = pc; ++ hits[i + j].hits = nr_hits; ++ goto out; ++ } ++ } ++ i = (i + secondary) & (NR_PROFILE_HIT - 1); ++ } while (i != primary); ++ ++ /* ++ * Add the current hit(s) and flush the write-queue out ++ * to the global buffer: ++ */ ++ atomic_add(nr_hits, &prof_buffer[pc]); ++ for (i = 0; i < NR_PROFILE_HIT; ++i) { ++ atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); ++ hits[i].pc = hits[i].hits = 0; ++ } ++out: ++ local_irq_restore(flags); ++ put_cpu(); ++} ++ ++ + #else /* !CONFIG_SMP */ + #define profile_flip_buffers() do { } while (0) + #define profile_discard_flip_buffers() do { } while (0) +@@ -610,7 +614,7 @@ out_cleanup: + #define create_hash_tables() ({ 0; }) + #endif + +-int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ ++int create_proc_profile(void) + { + struct proc_dir_entry *entry; + +Index: linux-2.6-tip/kernel/ptrace.c +=================================================================== +--- linux-2.6-tip.orig/kernel/ptrace.c ++++ linux-2.6-tip/kernel/ptrace.c +@@ -613,8 +613,6 @@ SYSCALL_DEFINE4(ptrace, long, request, l + goto out_put_task_struct; + + ret = arch_ptrace(child, request, addr, data); +- if (ret < 0) +- goto out_put_task_struct; + + out_put_task_struct: + put_task_struct(child); +Index: linux-2.6-tip/kernel/rcuclassic.c +=================================================================== +--- linux-2.6-tip.orig/kernel/rcuclassic.c ++++ linux-2.6-tip/kernel/rcuclassic.c +@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = + .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), + .cpumask = CPU_BITS_NONE, + }; ++ + static struct rcu_ctrlblk rcu_bh_ctrlblk = { 
+ .cur = -300, + .completed = -300, +@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk + .cpumask = CPU_BITS_NONE, + }; + +-DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; +-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; ++static DEFINE_PER_CPU(struct rcu_data, rcu_data); ++static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); ++ ++/* ++ * Increment the quiescent state counter. ++ * The counter is a bit degenerated: We do not need to know ++ * how many quiescent states passed, just if there was at least ++ * one since the start of the grace period. Thus just a flag. ++ */ ++void rcu_qsctr_inc(int cpu) ++{ ++ struct rcu_data *rdp = &per_cpu(rcu_data, cpu); ++ rdp->passed_quiesc = 1; ++} ++ ++void rcu_bh_qsctr_inc(int cpu) ++{ ++ struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); ++ rdp->passed_quiesc = 1; ++} + + static int blimit = 10; + static int qhimark = 10000; +Index: linux-2.6-tip/kernel/rcupdate.c +=================================================================== +--- linux-2.6-tip.orig/kernel/rcupdate.c ++++ linux-2.6-tip/kernel/rcupdate.c +@@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type) + } + } + ++static inline void wait_migrated_callbacks(void); ++ + /* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. +@@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrie + complete(&rcu_barrier_completion); + wait_for_completion(&rcu_barrier_completion); + mutex_unlock(&rcu_barrier_mutex); ++ wait_migrated_callbacks(); + } + + /** +@@ -176,9 +179,50 @@ void rcu_barrier_sched(void) + } + EXPORT_SYMBOL_GPL(rcu_barrier_sched); + ++static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); ++static struct rcu_head rcu_migrate_head[3]; ++static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); ++ ++static void rcu_migrate_callback(struct rcu_head *notused) ++{ ++ if (atomic_dec_and_test(&rcu_migrate_type_count)) ++ wake_up(&rcu_migrate_wq); ++} ++ ++static inline void wait_migrated_callbacks(void) ++{ ++ wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); ++} ++ ++static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, ++ unsigned long action, void *hcpu) ++{ ++ if (action == CPU_DYING) { ++ /* ++ * preempt_disable() in on_each_cpu() prevents stop_machine(), ++ * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" ++ * returns, all online cpus have queued rcu_barrier_func(), ++ * and the dead cpu(if it exist) queues rcu_migrate_callback()s. ++ * ++ * These callbacks ensure _rcu_barrier() waits for all ++ * RCU callbacks of the specified type to complete. ++ */ ++ atomic_set(&rcu_migrate_type_count, 3); ++ call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); ++ call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); ++ call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); ++ } else if (action == CPU_POST_DEAD) { ++ /* rcu_migrate_head is protected by cpu_add_remove_lock */ ++ wait_migrated_callbacks(); ++ } ++ ++ return NOTIFY_OK; ++} ++ + void __init rcu_init(void) + { + __rcu_init(); ++ hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); + } + + void rcu_scheduler_starting(void) +Index: linux-2.6-tip/kernel/rcupreempt.c +=================================================================== +--- linux-2.6-tip.orig/kernel/rcupreempt.c ++++ linux-2.6-tip/kernel/rcupreempt.c +@@ -71,7 +71,7 @@ + */ + #define GP_STAGES 2 + struct rcu_data { +- spinlock_t lock; /* Protect rcu_data fields. */ ++ raw_spinlock_t lock; /* Protect rcu_data fields. 
*/ + long completed; /* Number of last completed batch. */ + int waitlistcount; + struct rcu_head *nextlist; +@@ -138,7 +138,7 @@ enum rcu_sched_sleep_states { + }; + + struct rcu_ctrlblk { +- spinlock_t fliplock; /* Protect state-machine transitions. */ ++ raw_spinlock_t fliplock; /* Protect state-machine transitions. */ + long completed; /* Number of last completed batch. */ + enum rcu_try_flip_states rcu_try_flip_state; /* The current state of + the rcu state machine */ +@@ -147,9 +147,53 @@ struct rcu_ctrlblk { + wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ + }; + ++struct rcu_dyntick_sched { ++ int dynticks; ++ int dynticks_snap; ++ int sched_qs; ++ int sched_qs_snap; ++ int sched_dynticks_snap; ++}; ++ ++static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { ++ .dynticks = 1, ++}; ++ ++void rcu_qsctr_inc(int cpu) ++{ ++ struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); ++ ++ rdssp->sched_qs++; ++} ++ ++#ifdef CONFIG_NO_HZ ++ ++void rcu_enter_nohz(void) ++{ ++ static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); ++ ++ smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ ++ __get_cpu_var(rcu_dyntick_sched).dynticks++; ++ WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); ++} ++ ++void rcu_exit_nohz(void) ++{ ++ static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); ++ ++ __get_cpu_var(rcu_dyntick_sched).dynticks++; ++ smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ ++ WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), ++ &rs); ++} ++ ++#endif /* CONFIG_NO_HZ */ ++ ++ + static DEFINE_PER_CPU(struct rcu_data, rcu_data); ++ + static struct rcu_ctrlblk rcu_ctrlblk = { +- .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), ++ .fliplock = RAW_SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), + .completed = 0, + .rcu_try_flip_state = rcu_try_flip_idle_state, + .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), +@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(stru + } + } + +-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { +- .dynticks = 1, +-}; +- + #ifdef CONFIG_NO_HZ + static DEFINE_PER_CPU(int, rcu_update_flag); + +Index: linux-2.6-tip/kernel/rcutree.c +=================================================================== +--- linux-2.6-tip.orig/kernel/rcutree.c ++++ linux-2.6-tip/kernel/rcutree.c +@@ -78,6 +78,26 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data + struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); + DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + ++/* ++ * Increment the quiescent state counter. ++ * The counter is a bit degenerated: We do not need to know ++ * how many quiescent states passed, just if there was at least ++ * one since the start of the grace period. Thus just a flag. 
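Two related tricks appear above: rcu_qsctr_inc() records a quiescent state as a bare flag, since only "at least one since the grace period started" matters, and the rcupreempt dynticks counter encodes the CPU's state in its parity (odd while the CPU is active, even while it sits in dynticks-idle), so a remote CPU can tell from a snapshot whether a quiescent state has passed. A simplified single-counter model of the parity scheme, using C11 atomics in place of the per-CPU variables and explicit smp_mb() calls:

#include <stdatomic.h>
#include <stdbool.h>

/* Parity encodes the state: odd = CPU active, even = CPU idle.  The value
 * only ever increases, so a remote observer can compare against an old
 * snapshot to see whether the CPU was, or has been, idle since then. */
static _Atomic unsigned long dynticks = 1;   /* boot CPU starts active */

static void enter_idle(void)
{
    atomic_thread_fence(memory_order_seq_cst); /* order prior accesses */
    atomic_fetch_add(&dynticks, 1);            /* odd -> even */
}

static void exit_idle(void)
{
    atomic_fetch_add(&dynticks, 1);            /* even -> odd */
    atomic_thread_fence(memory_order_seq_cst); /* order later accesses */
}

/* Remote check: has this CPU been (or gone) idle since 'snap' was taken? */
static bool in_or_through_idle(unsigned long snap)
{
    unsigned long cur = atomic_load(&dynticks);

    return (cur & 1UL) == 0 || cur != snap;
}

If the snapshot comparison succeeds, the CPU cannot still be inside a read-side critical section that began before the snapshot; that is roughly the reasoning the rcupreempt grace-period machinery builds on.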
++ */ ++void rcu_qsctr_inc(int cpu) ++{ ++ struct rcu_data *rdp = &per_cpu(rcu_data, cpu); ++ rdp->passed_quiesc = 1; ++ rdp->passed_quiesc_completed = rdp->completed; ++} ++ ++void rcu_bh_qsctr_inc(int cpu) ++{ ++ struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); ++ rdp->passed_quiesc = 1; ++ rdp->passed_quiesc_completed = rdp->completed; ++} ++ + #ifdef CONFIG_NO_HZ + DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = 1, +Index: linux-2.6-tip/kernel/rcutree.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/rcutree.h +@@ -0,0 +1,10 @@ ++ ++/* ++ * RCU implementation internal declarations: ++ */ ++extern struct rcu_state rcu_state; ++DECLARE_PER_CPU(struct rcu_data, rcu_data); ++ ++extern struct rcu_state rcu_bh_state; ++DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); ++ +Index: linux-2.6-tip/kernel/rcutree_trace.c +=================================================================== +--- linux-2.6-tip.orig/kernel/rcutree_trace.c ++++ linux-2.6-tip/kernel/rcutree_trace.c +@@ -43,6 +43,8 @@ + #include + #include + ++#include "rcutree.h" ++ + static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) + { + if (!rdp->beenonline) +Index: linux-2.6-tip/kernel/relay.c +=================================================================== +--- linux-2.6-tip.orig/kernel/relay.c ++++ linux-2.6-tip/kernel/relay.c +@@ -343,6 +343,10 @@ static void wakeup_readers(unsigned long + { + struct rchan_buf *buf = (struct rchan_buf *)data; + wake_up_interruptible(&buf->read_wait); ++ /* ++ * Stupid polling for now: ++ */ ++ mod_timer(&buf->timer, jiffies + 1); + } + + /** +@@ -360,6 +364,7 @@ static void __relay_reset(struct rchan_b + init_waitqueue_head(&buf->read_wait); + kref_init(&buf->kref); + setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); ++ mod_timer(&buf->timer, jiffies + 1); + } else + del_timer_sync(&buf->timer); + +@@ -677,9 +682,7 @@ int relay_late_setup_files(struct rchan + */ + for_each_online_cpu(i) { + if (unlikely(!chan->buf[i])) { +- printk(KERN_ERR "relay_late_setup_files: CPU %u " +- "has no buffer, it must have!\n", i); +- BUG(); ++ WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); + err = -EINVAL; + break; + } +@@ -742,15 +745,6 @@ size_t relay_switch_subbuf(struct rchan_ + else + buf->early_bytes += buf->chan->subbuf_size - + buf->padding[old_subbuf]; +- smp_mb(); +- if (waitqueue_active(&buf->read_wait)) +- /* +- * Calling wake_up_interruptible() from here +- * will deadlock if we happen to be logging +- * from the scheduler (trying to re-grab +- * rq->lock), so defer it. +- */ +- __mod_timer(&buf->timer, jiffies + 1); + } + + old = buf->data; +Index: linux-2.6-tip/kernel/rtmutex.c +=================================================================== +--- linux-2.6-tip.orig/kernel/rtmutex.c ++++ linux-2.6-tip/kernel/rtmutex.c +@@ -8,12 +8,20 @@ + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen + * ++ * Adaptive Spinlocks: ++ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, ++ * and Peter Morreale, ++ * Adaptive Spinlocks simplification: ++ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt ++ * + * See Documentation/rt-mutex-design.txt for details. + */ + #include + #include + #include + #include ++#include ++#include + + #include "rtmutex_common.h" + +@@ -97,6 +105,22 @@ static inline void mark_rt_mutex_waiters + } + #endif + ++int pi_initialized; ++ ++/* ++ * we initialize the wait_list runtime. 
(Could be done build-time and/or ++ * boot-time.) ++ */ ++static inline void init_lists(struct rt_mutex *lock) ++{ ++ if (unlikely(!lock->wait_list.prio_list.prev)) { ++ plist_head_init(&lock->wait_list, &lock->wait_lock); ++#ifdef CONFIG_DEBUG_RT_MUTEXES ++ pi_initialized++; ++#endif ++ } ++} ++ + /* + * Calculate task priority from the waiter list priority + * +@@ -253,13 +277,13 @@ static int rt_mutex_adjust_prio_chain(st + plist_add(&waiter->list_entry, &lock->wait_list); + + /* Release the task */ +- spin_unlock_irqrestore(&task->pi_lock, flags); ++ spin_unlock(&task->pi_lock); + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + get_task_struct(task); +- spin_lock_irqsave(&task->pi_lock, flags); ++ spin_lock(&task->pi_lock); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ +@@ -277,10 +301,10 @@ static int rt_mutex_adjust_prio_chain(st + __rt_mutex_adjust_prio(task); + } + +- spin_unlock_irqrestore(&task->pi_lock, flags); ++ spin_unlock(&task->pi_lock); + + top_waiter = rt_mutex_top_waiter(lock); +- spin_unlock(&lock->wait_lock); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; +@@ -300,21 +324,21 @@ static int rt_mutex_adjust_prio_chain(st + * assigned pending owner [which might not have taken the + * lock yet]: + */ +-static inline int try_to_steal_lock(struct rt_mutex *lock) ++static inline int try_to_steal_lock(struct rt_mutex *lock, ++ struct task_struct *task, int mode) + { + struct task_struct *pendowner = rt_mutex_owner(lock); + struct rt_mutex_waiter *next; +- unsigned long flags; + + if (!rt_mutex_owner_pending(lock)) + return 0; + +- if (pendowner == current) ++ if (pendowner == task) + return 1; + +- spin_lock_irqsave(&pendowner->pi_lock, flags); +- if (current->prio >= pendowner->prio) { +- spin_unlock_irqrestore(&pendowner->pi_lock, flags); ++ spin_lock(&pendowner->pi_lock); ++ if (!lock_is_stealable(task, pendowner, mode)) { ++ spin_unlock(&pendowner->pi_lock); + return 0; + } + +@@ -324,7 +348,7 @@ static inline int try_to_steal_lock(stru + * priority. + */ + if (likely(!rt_mutex_has_waiters(lock))) { +- spin_unlock_irqrestore(&pendowner->pi_lock, flags); ++ spin_unlock(&pendowner->pi_lock); + return 1; + } + +@@ -332,27 +356,27 @@ static inline int try_to_steal_lock(stru + next = rt_mutex_top_waiter(lock); + plist_del(&next->pi_list_entry, &pendowner->pi_waiters); + __rt_mutex_adjust_prio(pendowner); +- spin_unlock_irqrestore(&pendowner->pi_lock, flags); ++ spin_unlock(&pendowner->pi_lock); + + /* + * We are going to steal the lock and a waiter was + * enqueued on the pending owners pi_waiters queue. So + * we have to enqueue this waiter into +- * current->pi_waiters list. This covers the case, +- * where current is boosted because it holds another ++ * task->pi_waiters list. This covers the case, ++ * where task is boosted because it holds another + * lock and gets unboosted because the booster is + * interrupted, so we would delay a waiter with higher +- * priority as current->normal_prio. ++ * priority as task->normal_prio. 
+ * + * Note: in the rare case of a SCHED_OTHER task changing + * its priority and thus stealing the lock, next->task +- * might be current: ++ * might be task: + */ +- if (likely(next->task != current)) { +- spin_lock_irqsave(¤t->pi_lock, flags); +- plist_add(&next->pi_list_entry, ¤t->pi_waiters); +- __rt_mutex_adjust_prio(current); +- spin_unlock_irqrestore(¤t->pi_lock, flags); ++ if (likely(next->task != task)) { ++ spin_lock(&task->pi_lock); ++ plist_add(&next->pi_list_entry, &task->pi_waiters); ++ __rt_mutex_adjust_prio(task); ++ spin_unlock(&task->pi_lock); + } + return 1; + } +@@ -366,7 +390,7 @@ static inline int try_to_steal_lock(stru + * + * Must be called with lock->wait_lock held. + */ +-static int try_to_take_rt_mutex(struct rt_mutex *lock) ++static int do_try_to_take_rt_mutex(struct rt_mutex *lock, int mode) + { + /* + * We have to be careful here if the atomic speedups are +@@ -389,7 +413,7 @@ static int try_to_take_rt_mutex(struct r + */ + mark_rt_mutex_waiters(lock); + +- if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) ++ if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current, mode)) + return 0; + + /* We got the lock. */ +@@ -402,6 +426,11 @@ static int try_to_take_rt_mutex(struct r + return 1; + } + ++static inline int try_to_take_rt_mutex(struct rt_mutex *lock) ++{ ++ return do_try_to_take_rt_mutex(lock, STEAL_NORMAL); ++} ++ + /* + * Task blocks on lock. + * +@@ -411,38 +440,38 @@ static int try_to_take_rt_mutex(struct r + */ + static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, +- int detect_deadlock) ++ struct task_struct *task, ++ int detect_deadlock, unsigned long flags) + { + struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex_waiter *top_waiter = waiter; +- unsigned long flags; + int chain_walk = 0, res; + +- spin_lock_irqsave(¤t->pi_lock, flags); +- __rt_mutex_adjust_prio(current); +- waiter->task = current; ++ spin_lock(&task->pi_lock); ++ __rt_mutex_adjust_prio(task); ++ waiter->task = task; + waiter->lock = lock; +- plist_node_init(&waiter->list_entry, current->prio); +- plist_node_init(&waiter->pi_list_entry, current->prio); ++ plist_node_init(&waiter->list_entry, task->prio); ++ plist_node_init(&waiter->pi_list_entry, task->prio); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + +- current->pi_blocked_on = waiter; ++ task->pi_blocked_on = waiter; + +- spin_unlock_irqrestore(¤t->pi_lock, flags); ++ spin_unlock(&task->pi_lock); + + if (waiter == rt_mutex_top_waiter(lock)) { +- spin_lock_irqsave(&owner->pi_lock, flags); ++ spin_lock(&owner->pi_lock); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) + chain_walk = 1; +- spin_unlock_irqrestore(&owner->pi_lock, flags); ++ spin_unlock(&owner->pi_lock); + } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + chain_walk = 1; +@@ -457,12 +486,12 @@ static int task_blocks_on_rt_mutex(struc + */ + get_task_struct(owner); + +- spin_unlock(&lock->wait_lock); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); + + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, +- current); ++ task); + +- spin_lock(&lock->wait_lock); ++ spin_lock_irq(&lock->wait_lock); + + return res; + } +@@ -475,13 +504,13 @@ static int task_blocks_on_rt_mutex(struc + * + * Called with 
lock->wait_lock held. + */ +-static void wakeup_next_waiter(struct rt_mutex *lock) ++static void wakeup_next_waiter(struct rt_mutex *lock, int savestate) + { + struct rt_mutex_waiter *waiter; + struct task_struct *pendowner; +- unsigned long flags; ++ struct rt_mutex_waiter *next; + +- spin_lock_irqsave(¤t->pi_lock, flags); ++ spin_lock(¤t->pi_lock); + + waiter = rt_mutex_top_waiter(lock); + plist_del(&waiter->list_entry, &lock->wait_list); +@@ -496,9 +525,44 @@ static void wakeup_next_waiter(struct rt + pendowner = waiter->task; + waiter->task = NULL; + ++ /* ++ * Do the wakeup before the ownership change to give any spinning ++ * waiter grantees a headstart over the other threads that will ++ * trigger once owner changes. ++ */ ++ if (!savestate) ++ wake_up_process(pendowner); ++ else { ++ /* ++ * We can skip the actual (expensive) wakeup if the ++ * waiter is already running, but we have to be careful ++ * of race conditions because they may be about to sleep. ++ * ++ * The waiter-side protocol has the following pattern: ++ * 1: Set state != RUNNING ++ * 2: Conditionally sleep if waiter->task != NULL; ++ * ++ * And the owner-side has the following: ++ * A: Set waiter->task = NULL ++ * B: Conditionally wake if the state != RUNNING ++ * ++ * As long as we ensure 1->2 order, and A->B order, we ++ * will never miss a wakeup. ++ * ++ * Therefore, this barrier ensures that waiter->task = NULL ++ * is visible before we test the pendowner->state. The ++ * corresponding barrier is in the sleep logic. ++ */ ++ smp_mb(); ++ ++ /* If !RUNNING && !RUNNING_MUTEX */ ++ if (pendowner->state & ~TASK_RUNNING_MUTEX) ++ wake_up_process_mutex(pendowner); ++ } ++ + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + +- spin_unlock_irqrestore(¤t->pi_lock, flags); ++ spin_unlock(¤t->pi_lock); + + /* + * Clear the pi_blocked_on variable and enqueue a possible +@@ -507,7 +571,13 @@ static void wakeup_next_waiter(struct rt + * waiter with higher priority than pending-owner->normal_prio + * is blocked on the unboosted (pending) owner. 
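The comment block above spells out the owner/waiter handshake: the waiter sets its state to non-running (1) and only then sleeps if waiter->task is still set (2); the owner clears waiter->task (A), issues a full barrier, and only then wakes if the waiter's state is not RUNNING (B). A hedged user-space sketch of that ordering with C11 atomics; the names are illustrative and the seq_cst fence stands in for the patch's smp_mb():

    #include <stdatomic.h>
    #include <stdio.h>
    #include <pthread.h>

    enum { RUNNING, SLEEPING };

    static atomic_int waiter_task  = ATOMIC_VAR_INIT(1);  /* non-zero: still waiting */
    static atomic_int waiter_state = ATOMIC_VAR_INIT(RUNNING);
    static atomic_int wakeups      = ATOMIC_VAR_INIT(0);

    /* Waiter side: 1) set state != RUNNING, 2) sleep only if still waiting. */
    static void *waiter(void *arg)
    {
            atomic_store(&waiter_state, SLEEPING);          /* step 1 */
            atomic_thread_fence(memory_order_seq_cst);      /* pairs with the owner's barrier */
            if (atomic_load(&waiter_task)) {                /* step 2: still waiting? */
                    /* would call schedule() here; spin until woken for the demo */
                    while (!atomic_load(&wakeups))
                            ;
            }
            atomic_store(&waiter_state, RUNNING);           /* back to running */
            return NULL;
    }

    /* Owner side: A) clear waiter->task, B) wake only if waiter is not RUNNING. */
    static void *owner(void *arg)
    {
            atomic_store(&waiter_task, 0);                  /* step A */
            atomic_thread_fence(memory_order_seq_cst);      /* the smp_mb() in the patch */
            if (atomic_load(&waiter_state) != RUNNING)      /* step B */
                    atomic_fetch_add(&wakeups, 1);          /* wake_up_process_mutex() analogue */
            return NULL;
    }

    int main(void)
    {
            pthread_t w, o;

            pthread_create(&w, NULL, waiter, NULL);
            pthread_create(&o, NULL, owner, NULL);
            pthread_join(w, NULL);
            pthread_join(o, NULL);
            /* prints 0 or 1 depending on timing; the waiter can never hang */
            printf("wakeups delivered: %d\n", atomic_load(&wakeups));
            return 0;
    }

With both sides ordered this way, the lost-wakeup case (owner sees RUNNING while the waiter sees the task pointer still set and goes to sleep) is exactly the store-buffering pattern the barriers rule out.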
+ */ +- spin_lock_irqsave(&pendowner->pi_lock, flags); ++ ++ if (rt_mutex_has_waiters(lock)) ++ next = rt_mutex_top_waiter(lock); ++ else ++ next = NULL; ++ ++ spin_lock(&pendowner->pi_lock); + + WARN_ON(!pendowner->pi_blocked_on); + WARN_ON(pendowner->pi_blocked_on != waiter); +@@ -515,15 +585,10 @@ static void wakeup_next_waiter(struct rt + + pendowner->pi_blocked_on = NULL; + +- if (rt_mutex_has_waiters(lock)) { +- struct rt_mutex_waiter *next; +- +- next = rt_mutex_top_waiter(lock); ++ if (next) + plist_add(&next->pi_list_entry, &pendowner->pi_waiters); +- } +- spin_unlock_irqrestore(&pendowner->pi_lock, flags); + +- wake_up_process(pendowner); ++ spin_unlock(&pendowner->pi_lock); + } + + /* +@@ -532,22 +597,22 @@ static void wakeup_next_waiter(struct rt + * Must be called with lock->wait_lock held + */ + static void remove_waiter(struct rt_mutex *lock, +- struct rt_mutex_waiter *waiter) ++ struct rt_mutex_waiter *waiter, ++ unsigned long flags) + { + int first = (waiter == rt_mutex_top_waiter(lock)); + struct task_struct *owner = rt_mutex_owner(lock); +- unsigned long flags; + int chain_walk = 0; + +- spin_lock_irqsave(¤t->pi_lock, flags); ++ spin_lock(¤t->pi_lock); + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->task = NULL; + current->pi_blocked_on = NULL; +- spin_unlock_irqrestore(¤t->pi_lock, flags); ++ spin_unlock(¤t->pi_lock); + + if (first && owner != current) { + +- spin_lock_irqsave(&owner->pi_lock, flags); ++ spin_lock(&owner->pi_lock); + + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + +@@ -562,7 +627,7 @@ static void remove_waiter(struct rt_mute + if (owner->pi_blocked_on) + chain_walk = 1; + +- spin_unlock_irqrestore(&owner->pi_lock, flags); ++ spin_unlock(&owner->pi_lock); + } + + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); +@@ -573,11 +638,11 @@ static void remove_waiter(struct rt_mute + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + +- spin_unlock(&lock->wait_lock); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); + + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + +- spin_lock(&lock->wait_lock); ++ spin_lock_irq(&lock->wait_lock); + } + + /* +@@ -598,45 +663,413 @@ void rt_mutex_adjust_pi(struct task_stru + return; + } + +- spin_unlock_irqrestore(&task->pi_lock, flags); +- + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); ++ spin_unlock_irqrestore(&task->pi_lock, flags); ++ + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); + } + + /* +- * Slow path lock function: ++ * preemptible spin_lock functions: + */ +-static int __sched +-rt_mutex_slowlock(struct rt_mutex *lock, int state, +- struct hrtimer_sleeper *timeout, +- int detect_deadlock) ++ ++#ifdef CONFIG_PREEMPT_RT ++ ++static inline void ++rt_spin_lock_fastlock(struct rt_mutex *lock, ++ void (*slowfn)(struct rt_mutex *lock)) ++{ ++ /* Temporary HACK! */ ++ if (likely(!current->in_printk)) ++ might_sleep(); ++ else if (in_atomic() || irqs_disabled()) ++ /* don't grab locks for printk in atomic */ ++ return; ++ ++ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) ++ rt_mutex_deadlock_account_lock(lock, current); ++ else ++ slowfn(lock); ++} ++ ++static inline void ++rt_spin_lock_fastunlock(struct rt_mutex *lock, ++ void (*slowfn)(struct rt_mutex *lock)) ++{ ++ /* Temporary HACK! 
*/ ++ if (unlikely(rt_mutex_owner(lock) != current) && current->in_printk) ++ /* don't grab locks for printk in atomic */ ++ return; ++ ++ if (likely(rt_mutex_cmpxchg(lock, current, NULL))) ++ rt_mutex_deadlock_account_unlock(current); ++ else ++ slowfn(lock); ++} ++ ++ ++#ifdef CONFIG_SMP ++static int adaptive_wait(struct rt_mutex_waiter *waiter, ++ struct task_struct *orig_owner) ++{ ++ for (;;) { ++ ++ /* we are the owner? */ ++ if (!waiter->task) ++ return 0; ++ ++ /* Owner changed? Then lets update the original */ ++ if (orig_owner != rt_mutex_owner(waiter->lock)) ++ return 0; ++ ++ /* Owner went to bed, so should we */ ++ if (!task_is_current(orig_owner)) ++ return 1; ++ ++ cpu_relax(); ++ } ++} ++#else ++static int adaptive_wait(struct rt_mutex_waiter *waiter, ++ struct task_struct *orig_owner) ++{ ++ return 1; ++} ++#endif ++ ++/* ++ * The state setting needs to preserve the original state and needs to ++ * take care of non rtmutex wakeups. ++ * ++ * Called with rtmutex->wait_lock held to serialize against rtmutex ++ * wakeups(). ++ */ ++static inline unsigned long ++rt_set_current_blocked_state(unsigned long saved_state) ++{ ++ unsigned long state, block_state; ++ ++ /* ++ * If state is TASK_INTERRUPTIBLE, then we set the state for ++ * blocking to TASK_INTERRUPTIBLE as well, otherwise we would ++ * miss real wakeups via wake_up_interruptible(). If such a ++ * wakeup happens we see the running state and preserve it in ++ * saved_state. Now we can ignore further wakeups as we will ++ * return in state running from our "spin" sleep. ++ */ ++ if (saved_state == TASK_INTERRUPTIBLE) ++ block_state = TASK_INTERRUPTIBLE; ++ else ++ block_state = TASK_UNINTERRUPTIBLE; ++ ++ state = xchg(¤t->state, block_state); ++ /* ++ * Take care of non rtmutex wakeups. rtmutex wakeups ++ * or TASK_RUNNING_MUTEX to (UN)INTERRUPTIBLE. ++ */ ++ if (state == TASK_RUNNING) ++ saved_state = TASK_RUNNING; ++ ++ return saved_state; ++} ++ ++static inline void rt_restore_current_state(unsigned long saved_state) ++{ ++ unsigned long state = xchg(¤t->state, saved_state); ++ ++ if (state == TASK_RUNNING) ++ current->state = TASK_RUNNING; ++} ++ ++/* ++ * Slow path lock function spin_lock style: this variant is very ++ * careful not to miss any non-lock wakeups. ++ * ++ * The wakeup side uses wake_up_process_mutex, which, combined with ++ * the xchg code of this function is a transparent sleep/wakeup ++ * mechanism nested within any existing sleep/wakeup mechanism. This ++ * enables the seemless use of arbitrary (blocking) spinlocks within ++ * sleep/wakeup event loops. ++ */ ++static void noinline __sched ++rt_spin_lock_slowlock(struct rt_mutex *lock) + { + struct rt_mutex_waiter waiter; +- int ret = 0; ++ unsigned long saved_state, flags; ++ struct task_struct *orig_owner; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + +- spin_lock(&lock->wait_lock); ++ spin_lock_irqsave(&lock->wait_lock, flags); ++ init_lists(lock); + +- /* Try to acquire the lock again: */ +- if (try_to_take_rt_mutex(lock)) { +- spin_unlock(&lock->wait_lock); +- return 0; ++ BUG_ON(rt_mutex_owner(lock) == current); ++ ++ /* ++ * Here we save whatever state the task was in originally, ++ * we'll restore it at the end of the function and we'll take ++ * any intermediate wakeup into account as well, independently ++ * of the lock sleep/wakeup mechanism. When we get a real ++ * wakeup the task->state is TASK_RUNNING and we change ++ * saved_state accordingly. 
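adaptive_wait() above implements the adaptive-spinlock heuristic: keep spinning as long as the lock owner is itself running on a CPU (the lock should be released soon) and the ownership has not changed, and fall back to sleeping as soon as the owner is scheduled out. A rough user-space sketch of that decision loop; owner_is_running() and the waiter/owner fields are stand-ins for the kernel's task_is_current() and rt_mutex state:

    #include <stdbool.h>
    #include <stdio.h>

    struct task { int id; bool on_cpu; };

    struct waiter {
            struct task *task;       /* cleared once we were granted the lock */
            struct task *lock_owner; /* current owner of the contended lock */
    };

    /* Stand-in for task_is_current(): is the owner executing on some CPU? */
    static bool owner_is_running(const struct task *owner)
    {
            return owner && owner->on_cpu;
    }

    /*
     * Returns false if we should retry taking the lock (it was handed to us,
     * or the owner changed), true if we should stop spinning and sleep.
     */
    static bool adaptive_wait(const struct waiter *w, const struct task *orig_owner)
    {
            for (;;) {
                    if (!w->task)                           /* lock was handed to us */
                            return false;
                    if (w->lock_owner != orig_owner)        /* owner changed, re-evaluate */
                            return false;
                    if (!owner_is_running(orig_owner))      /* owner sleeps, so should we */
                            return true;
                    /* cpu_relax() would go here */
            }
    }

    int main(void)
    {
            struct task owner = { 1, false };
            struct task self  = { 2, true };
            struct waiter w   = { &self, &owner };

            /* Owner is off-CPU: stop spinning and sleep. */
            printf("sleep? %d\n", adaptive_wait(&w, &owner));   /* 1 */

            /* Lock was granted (waiter.task cleared): retry the fast path. */
            w.task = NULL;
            printf("sleep? %d\n", adaptive_wait(&w, &owner));   /* 0 */
            return 0;
    }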
If we did not get a real wakeup ++ * then we return with the saved state. We need to be careful ++ * about original state TASK_INTERRUPTIBLE as well, as we ++ * could miss a wakeup_interruptible() ++ */ ++ saved_state = rt_set_current_blocked_state(current->state); ++ ++ for (;;) { ++ unsigned long saved_flags; ++ int saved_lock_depth = current->lock_depth; ++ ++ /* Try to acquire the lock */ ++ if (do_try_to_take_rt_mutex(lock, STEAL_LATERAL)) ++ break; ++ ++ /* ++ * waiter.task is NULL the first time we come here and ++ * when we have been woken up by the previous owner ++ * but the lock got stolen by an higher prio task. ++ */ ++ if (!waiter.task) { ++ task_blocks_on_rt_mutex(lock, &waiter, current, 0, ++ flags); ++ /* Wakeup during boost ? */ ++ if (unlikely(!waiter.task)) ++ continue; ++ } ++ ++ /* ++ * Prevent schedule() to drop BKL, while waiting for ++ * the lock ! We restore lock_depth when we come back. ++ */ ++ saved_flags = current->flags & PF_NOSCHED; ++ current->lock_depth = -1; ++ current->flags &= ~PF_NOSCHED; ++ orig_owner = rt_mutex_owner(lock); ++ get_task_struct(orig_owner); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ debug_rt_mutex_print_deadlock(&waiter); ++ ++ if (adaptive_wait(&waiter, orig_owner)) { ++ put_task_struct(orig_owner); ++ ++ if (waiter.task) ++ schedule_rt_mutex(lock); ++ } else ++ put_task_struct(orig_owner); ++ ++ spin_lock_irqsave(&lock->wait_lock, flags); ++ current->flags |= saved_flags; ++ current->lock_depth = saved_lock_depth; ++ saved_state = rt_set_current_blocked_state(saved_state); + } + +- set_current_state(state); ++ rt_restore_current_state(saved_state); + +- /* Setup the timer, when timeout != NULL */ +- if (unlikely(timeout)) { +- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); +- if (!hrtimer_active(&timeout->timer)) +- timeout->task = NULL; ++ /* ++ * Extremely rare case, if we got woken up by a non-mutex wakeup, ++ * and we managed to steal the lock despite us not being the ++ * highest-prio waiter (due to SCHED_OTHER changing prio), then we ++ * can end up with a non-NULL waiter.task: ++ */ ++ if (unlikely(waiter.task)) ++ remove_waiter(lock, &waiter, flags); ++ /* ++ * try_to_take_rt_mutex() sets the waiter bit ++ * unconditionally. 
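rt_spin_lock_slowlock() keeps the task's original state across the "spinning sleep" by exchanging it for a blocking state and folding any real wakeup (state became TASK_RUNNING) back into saved_state, as rt_set_current_blocked_state()/rt_restore_current_state() above do. A simplified single-task model of that bookkeeping; the state constants and helper names are illustrative:

    #include <stdatomic.h>
    #include <stdio.h>

    enum { TASK_RUNNING, TASK_INTERRUPTIBLE, TASK_UNINTERRUPTIBLE };

    static atomic_int cur_state = ATOMIC_VAR_INIT(TASK_INTERRUPTIBLE);

    /* Swap in a blocking state, but remember if a real wakeup already hit us. */
    static int set_blocked_state(int saved_state)
    {
            int block = (saved_state == TASK_INTERRUPTIBLE) ?
                            TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
            int old = atomic_exchange(&cur_state, block);

            if (old == TASK_RUNNING)        /* a non-mutex wakeup happened meanwhile */
                    saved_state = TASK_RUNNING;
            return saved_state;
    }

    /* Put the original state back, unless we were made runnable in between. */
    static void restore_state(int saved_state)
    {
            int old = atomic_exchange(&cur_state, saved_state);

            if (old == TASK_RUNNING)
                    atomic_store(&cur_state, TASK_RUNNING);
    }

    int main(void)
    {
            int saved = set_blocked_state(atomic_load(&cur_state));

            /* ... lock loop would run here; pretend a real wakeup arrives ... */
            atomic_store(&cur_state, TASK_RUNNING);

            saved = set_blocked_state(saved);       /* re-block for another loop pass */
            restore_state(saved);
            printf("final state: %d (TASK_RUNNING=%d)\n",
                   atomic_load(&cur_state), TASK_RUNNING);      /* 0 (0) */
            return 0;
    }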
We might have to fix that up: ++ */ ++ fixup_rt_mutex_waiters(lock); ++ ++ spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ debug_rt_mutex_free_waiter(&waiter); ++} ++ ++/* ++ * Slow path to release a rt_mutex spin_lock style ++ */ ++static void noinline __sched ++rt_spin_lock_slowunlock(struct rt_mutex *lock) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&lock->wait_lock, flags); ++ ++ debug_rt_mutex_unlock(lock); ++ ++ rt_mutex_deadlock_account_unlock(current); ++ ++ if (!rt_mutex_has_waiters(lock)) { ++ lock->owner = NULL; ++ spin_unlock_irqrestore(&lock->wait_lock, flags); ++ return; + } + ++ wakeup_next_waiter(lock, 1); ++ ++ spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ /* Undo pi boosting.when necessary */ ++ rt_mutex_adjust_prio(current); ++} ++ ++void __lockfunc rt_spin_lock(spinlock_t *lock) ++{ ++ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++} ++EXPORT_SYMBOL(rt_spin_lock); ++ ++void __lockfunc __rt_spin_lock(struct rt_mutex *lock) ++{ ++ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); ++} ++EXPORT_SYMBOL(__rt_spin_lock); ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ ++void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) ++{ ++ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++} ++EXPORT_SYMBOL(rt_spin_lock_nested); ++ ++#endif ++ ++void __lockfunc rt_spin_unlock(spinlock_t *lock) ++{ ++ /* NOTE: we always pass in '1' for nested, for simplicity */ ++ spin_release(&lock->dep_map, 1, _RET_IP_); ++ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); ++} ++EXPORT_SYMBOL(rt_spin_unlock); ++ ++void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) ++{ ++ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); ++} ++EXPORT_SYMBOL(__rt_spin_unlock); ++ ++/* ++ * Wait for the lock to get unlocked: instead of polling for an unlock ++ * (like raw spinlocks do), we lock and unlock, to force the kernel to ++ * schedule if there's contention: ++ */ ++void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) ++{ ++ spin_lock(lock); ++ spin_unlock(lock); ++} ++EXPORT_SYMBOL(rt_spin_unlock_wait); ++ ++int __lockfunc rt_spin_trylock(spinlock_t *lock) ++{ ++ int ret = rt_mutex_trylock(&lock->lock); ++ ++ if (ret) ++ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ ++ return ret; ++} ++EXPORT_SYMBOL(rt_spin_trylock); ++ ++int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) ++{ ++ int ret; ++ ++ *flags = 0; ++ ret = rt_mutex_trylock(&lock->lock); ++ if (ret) ++ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ ++ return ret; ++} ++EXPORT_SYMBOL(rt_spin_trylock_irqsave); ++ ++int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic) ++{ ++ /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ ++ if (atomic_add_unless(atomic, -1, 1)) ++ return 0; ++ rt_spin_lock(lock); ++ if (atomic_dec_and_test(atomic)) ++ return 1; ++ rt_spin_unlock(lock); ++ return 0; ++} ++EXPORT_SYMBOL(_atomic_dec_and_spin_lock); ++ ++void ++__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held lock: ++ */ ++ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++ __rt_mutex_init(&lock->lock, name); ++} ++EXPORT_SYMBOL(__rt_spin_lock_init); ++ ++#endif ++ ++static inline int rt_release_bkl(struct rt_mutex *lock, unsigned long flags) ++{ ++ int saved_lock_depth = current->lock_depth; ++ ++#ifdef CONFIG_LOCK_KERNEL ++ current->lock_depth = -1; ++ /* ++ * try_to_take_lock set the waiters, make sure it's ++ * still correct. ++ */ ++ fixup_rt_mutex_waiters(lock); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ up(&kernel_sem); ++ ++ spin_lock_irq(&lock->wait_lock); ++#endif ++ return saved_lock_depth; ++} ++ ++static inline void rt_reacquire_bkl(int saved_lock_depth) ++{ ++#ifdef CONFIG_LOCK_KERNEL ++ down(&kernel_sem); ++ current->lock_depth = saved_lock_depth; ++#endif ++} ++ ++/** ++ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop ++ * @lock: the rt_mutex to take ++ * @state: the state the task should block in (TASK_INTERRUPTIBLE ++ * or TASK_UNINTERRUPTIBLE) ++ * @timeout: the pre-initialized and started timer, or NULL for none ++ * @waiter: the pre-initialized rt_mutex_waiter ++ * @detect_deadlock: passed to task_blocks_on_rt_mutex ++ * ++ * lock->wait_lock must be held by the caller. ++ */ ++static int __sched ++__rt_mutex_slowlock(struct rt_mutex *lock, int state, ++ struct hrtimer_sleeper *timeout, ++ struct rt_mutex_waiter *waiter, ++ int detect_deadlock, unsigned long flags) ++{ ++ int ret = 0; ++ + for (;;) { ++ unsigned long saved_flags; ++ + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock)) + break; +@@ -656,19 +1089,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, + } + + /* +- * waiter.task is NULL the first time we come here and ++ * waiter->task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ +- if (!waiter.task) { +- ret = task_blocks_on_rt_mutex(lock, &waiter, +- detect_deadlock); ++ if (!waiter->task) { ++ ret = task_blocks_on_rt_mutex(lock, waiter, current, ++ detect_deadlock, flags); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ +- if (unlikely(!waiter.task)) { ++ if (unlikely(!waiter->task)) { + /* + * Reset the return value. 
We might + * have returned with -EDEADLK and the +@@ -682,21 +1115,72 @@ rt_mutex_slowlock(struct rt_mutex *lock, + break; + } + +- spin_unlock(&lock->wait_lock); ++ saved_flags = current->flags & PF_NOSCHED; ++ current->flags &= ~PF_NOSCHED; + +- debug_rt_mutex_print_deadlock(&waiter); ++ spin_unlock_irq(&lock->wait_lock); ++ ++ debug_rt_mutex_print_deadlock(waiter); + +- if (waiter.task) ++ if (waiter->task) + schedule_rt_mutex(lock); + +- spin_lock(&lock->wait_lock); ++ spin_lock_irq(&lock->wait_lock); ++ ++ current->flags |= saved_flags; + set_current_state(state); + } + ++ return ret; ++} ++ ++/* ++ * Slow path lock function: ++ */ ++static int __sched ++rt_mutex_slowlock(struct rt_mutex *lock, int state, ++ struct hrtimer_sleeper *timeout, ++ int detect_deadlock) ++{ ++ int ret = 0, saved_lock_depth = -1; ++ struct rt_mutex_waiter waiter; ++ unsigned long flags; ++ ++ debug_rt_mutex_init_waiter(&waiter); ++ waiter.task = NULL; ++ ++ spin_lock_irqsave(&lock->wait_lock, flags); ++ init_lists(lock); ++ ++ /* Try to acquire the lock again: */ ++ if (try_to_take_rt_mutex(lock)) { ++ spin_unlock_irqrestore(&lock->wait_lock, flags); ++ return 0; ++ } ++ ++ /* ++ * We drop the BKL here before we go into the wait loop to avoid a ++ * possible deadlock in the scheduler. ++ */ ++ if (unlikely(current->lock_depth >= 0)) ++ saved_lock_depth = rt_release_bkl(lock, flags); ++ ++ set_current_state(state); ++ ++ /* Setup the timer, when timeout != NULL */ ++ if (unlikely(timeout)) { ++ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); ++ if (!hrtimer_active(&timeout->timer)) ++ timeout->task = NULL; ++ } ++ ++ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, ++ detect_deadlock, flags); ++ + set_current_state(TASK_RUNNING); + + if (unlikely(waiter.task)) +- remove_waiter(lock, &waiter); ++ remove_waiter(lock, &waiter, flags); + + /* + * try_to_take_rt_mutex() sets the waiter bit +@@ -704,7 +1188,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, + */ + fixup_rt_mutex_waiters(lock); + +- spin_unlock(&lock->wait_lock); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); + + /* Remove pending timer: */ + if (unlikely(timeout)) +@@ -718,6 +1202,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + ++ /* Must we reaquire the BKL? 
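_atomic_dec_and_spin_lock(), shown just above in this rtmutex.c hunk, is the usual atomic_dec_and_lock() pattern: drop a reference with a lock-free decrement as long as that cannot hit zero, and only take the lock for the final 1 -> 0 transition. A user-space sketch with C11 atomics and a pthread mutex standing in for the sleeping spinlock (names are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_int refcount = ATOMIC_VAR_INIT(2);

    /* Add 'a' to '*v' unless it currently equals 'u'; true if it was changed. */
    static bool atomic_add_unless(atomic_int *v, int a, int u)
    {
            int c = atomic_load(v);

            while (c != u) {
                    if (atomic_compare_exchange_weak(v, &c, c + a))
                            return true;
            }
            return false;
    }

    /*
     * Returns 1 with obj_lock held if this call dropped the count to zero,
     * 0 (lock not held) otherwise -- the caller would then free the object.
     */
    static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
    {
            /* Fast path: subtract 1 unless that drops it to 0 (i.e. it was 1). */
            if (atomic_add_unless(cnt, -1, 1))
                    return 0;

            pthread_mutex_lock(lock);
            if (atomic_fetch_sub(cnt, 1) == 1)      /* we did the final decrement */
                    return 1;
            pthread_mutex_unlock(lock);
            return 0;
    }

    int main(void)
    {
            printf("first  put -> last? %d\n", dec_and_lock(&refcount, &obj_lock)); /* 0 */
            printf("second put -> last? %d\n", dec_and_lock(&refcount, &obj_lock)); /* 1 */
            pthread_mutex_unlock(&obj_lock);        /* released after freeing the object */
            return 0;
    }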
*/ ++ if (unlikely(saved_lock_depth >= 0)) ++ rt_reacquire_bkl(saved_lock_depth); ++ + debug_rt_mutex_free_waiter(&waiter); + + return ret; +@@ -729,12 +1217,15 @@ rt_mutex_slowlock(struct rt_mutex *lock, + static inline int + rt_mutex_slowtrylock(struct rt_mutex *lock) + { ++ unsigned long flags; + int ret = 0; + +- spin_lock(&lock->wait_lock); ++ spin_lock_irqsave(&lock->wait_lock, flags); + + if (likely(rt_mutex_owner(lock) != current)) { + ++ init_lists(lock); ++ + ret = try_to_take_rt_mutex(lock); + /* + * try_to_take_rt_mutex() sets the lock waiters +@@ -743,7 +1234,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lo + fixup_rt_mutex_waiters(lock); + } + +- spin_unlock(&lock->wait_lock); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); + + return ret; + } +@@ -754,7 +1245,9 @@ rt_mutex_slowtrylock(struct rt_mutex *lo + static void __sched + rt_mutex_slowunlock(struct rt_mutex *lock) + { +- spin_lock(&lock->wait_lock); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&lock->wait_lock, flags); + + debug_rt_mutex_unlock(lock); + +@@ -762,13 +1255,13 @@ rt_mutex_slowunlock(struct rt_mutex *loc + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; +- spin_unlock(&lock->wait_lock); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); + return; + } + +- wakeup_next_waiter(lock); ++ wakeup_next_waiter(lock, 0); + +- spin_unlock(&lock->wait_lock); ++ spin_unlock_irqrestore(&lock->wait_lock, flags); + + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); +@@ -830,6 +1323,27 @@ rt_mutex_fastunlock(struct rt_mutex *loc + } + + /** ++ * rt_mutex_lock_killable - lock a rt_mutex killable ++ * ++ * @lock: the rt_mutex to be locked ++ * @detect_deadlock: deadlock detection on/off ++ * ++ * Returns: ++ * 0 on success ++ * -EINTR when interrupted by a signal ++ * -EDEADLK when the lock would deadlock (when deadlock detection is on) ++ */ ++int __sched rt_mutex_lock_killable(struct rt_mutex *lock, ++ int detect_deadlock) ++{ ++ might_sleep(); ++ ++ return rt_mutex_fastlock(lock, TASK_KILLABLE, ++ detect_deadlock, rt_mutex_slowlock); ++} ++EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); ++ ++/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked +@@ -986,6 +1500,62 @@ void rt_mutex_proxy_unlock(struct rt_mut + } + + /** ++ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task ++ * @lock: the rt_mutex to take ++ * @waiter: the pre-initialized rt_mutex_waiter ++ * @task: the task to prepare ++ * @detect_deadlock: perform deadlock detection (1) or not (0) ++ * ++ * Returns: ++ * 0 - task blocked on lock ++ * 1 - acquired the lock for task, caller should wake it up ++ * <0 - error ++ * ++ * Special API call for FUTEX_REQUEUE_PI support. ++ */ ++int rt_mutex_start_proxy_lock(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ struct task_struct *task, int detect_deadlock) ++{ ++ unsigned long flags; ++ int ret; ++ ++ spin_lock_irqsave(&lock->wait_lock, flags); ++ ++ mark_rt_mutex_waiters(lock); ++ ++ if (!rt_mutex_owner(lock) || ++ try_to_steal_lock(lock, task, STEAL_NORMAL)) { ++ /* We got the lock for task. */ ++ debug_rt_mutex_lock(lock); ++ ++ rt_mutex_set_owner(lock, task, 0); ++ ++ rt_mutex_deadlock_account_lock(lock, task); ++ return 1; ++ } ++ ++ ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock, ++ flags); ++ ++ ++ if (ret && !waiter->task) { ++ /* ++ * Reset the return value. We might have ++ * returned with -EDEADLK and the owner ++ * released the lock while we were walking the ++ * pi chain. 
Let the waiter sort it out. ++ */ ++ ret = 0; ++ } ++ spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ debug_rt_mutex_print_deadlock(waiter); ++ ++ return ret; ++} ++ ++/** + * rt_mutex_next_owner - return the next owner of the lock + * + * @lock: the rt lock query +@@ -1004,3 +1574,58 @@ struct task_struct *rt_mutex_next_owner( + + return rt_mutex_top_waiter(lock)->task; + } ++ ++/** ++ * rt_mutex_finish_proxy_lock() - Complete lock acquisition ++ * @lock: the rt_mutex we were woken on ++ * @to: the timeout, null if none. hrtimer should already have ++ * been started. ++ * @waiter: the pre-initialized rt_mutex_waiter ++ * @detect_deadlock: perform deadlock detection (1) or not (0) ++ * ++ * Complete the lock acquisition started our behalf by another thread. ++ * ++ * Returns: ++ * 0 - success ++ * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK ++ * ++ * Special API call for PI-futex requeue support ++ */ ++int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, ++ struct hrtimer_sleeper *to, ++ struct rt_mutex_waiter *waiter, ++ int detect_deadlock) ++{ ++ unsigned long flags; ++ int ret; ++ ++ spin_lock_irqsave(&lock->wait_lock, flags); ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, ++ detect_deadlock, flags); ++ ++ set_current_state(TASK_RUNNING); ++ ++ if (unlikely(waiter->task)) ++ remove_waiter(lock, waiter, flags); ++ ++ /* ++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might ++ * have to fix that up. ++ */ ++ fixup_rt_mutex_waiters(lock); ++ ++ spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ /* ++ * Readjust priority, when we did not get the lock. We might have been ++ * the pending owner and boosted. Since we did not take the lock, the ++ * PI boost has to go. 
++ */ ++ if (unlikely(ret)) ++ rt_mutex_adjust_prio(current); ++ ++ return ret; ++} +Index: linux-2.6-tip/kernel/rtmutex_common.h +=================================================================== +--- linux-2.6-tip.orig/kernel/rtmutex_common.h ++++ linux-2.6-tip/kernel/rtmutex_common.h +@@ -120,6 +120,34 @@ extern void rt_mutex_init_proxy_locked(s + struct task_struct *proxy_owner); + extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); ++extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ struct task_struct *task, ++ int detect_deadlock); ++extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, ++ struct hrtimer_sleeper *to, ++ struct rt_mutex_waiter *waiter, ++ int detect_deadlock); ++ ++ ++#define STEAL_LATERAL 1 ++#define STEAL_NORMAL 0 ++ ++/* ++ * Note that RT tasks are excluded from lateral-steals to prevent the ++ * introduction of an unbounded latency ++ */ ++static inline int lock_is_stealable(struct task_struct *task, ++ struct task_struct *pendowner, int mode) ++{ ++ if (mode == STEAL_NORMAL || rt_task(task)) { ++ if (task->prio >= pendowner->prio) ++ return 0; ++ } else if (task->prio > pendowner->prio) ++ return 0; ++ ++ return 1; ++} + + #ifdef CONFIG_DEBUG_RT_MUTEXES + # include "rtmutex-debug.h" +Index: linux-2.6-tip/kernel/sched.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sched.c ++++ linux-2.6-tip/kernel/sched.c +@@ -4,6 +4,7 @@ + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds ++ * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe +@@ -16,6 +17,7 @@ + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2004-10-13 Real-Time Preemption support by Ingo Molnar + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. 
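The rtmutex_common.h hunk above adds the STEAL_NORMAL/STEAL_LATERAL modes: a normal steal needs strictly higher priority, while a lateral steal also succeeds on equal priority, except that RT tasks never steal laterally so they cannot introduce unbounded latency. A plain C restatement of that policy (lower ->prio numbers mean higher priority); is_rt_task() and the priority values are illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    #define STEAL_LATERAL   1
    #define STEAL_NORMAL    0
    #define MAX_RT_PRIO     100     /* prios 0..99 are real-time, as in the kernel */

    struct task { int prio; };

    static bool is_rt_task(const struct task *t)
    {
            return t->prio < MAX_RT_PRIO;
    }

    /* Mirrors lock_is_stealable(): may 'task' take the lock from 'pendowner'? */
    static int lock_is_stealable(const struct task *task,
                                 const struct task *pendowner, int mode)
    {
            if (mode == STEAL_NORMAL || is_rt_task(task)) {
                    if (task->prio >= pendowner->prio)      /* need strictly higher prio */
                            return 0;
            } else if (task->prio > pendowner->prio) {      /* lateral: equal prio is ok */
                    return 0;
            }
            return 1;
    }

    int main(void)
    {
            struct task fair_a = { 120 }, fair_b = { 120 }, rt = { 50 };

            printf("equal prio, normal : %d\n",
                   lock_is_stealable(&fair_a, &fair_b, STEAL_NORMAL));   /* 0 */
            printf("equal prio, lateral: %d\n",
                   lock_is_stealable(&fair_a, &fair_b, STEAL_LATERAL));  /* 1 */
            printf("rt vs rt,  lateral : %d\n",
                   lock_is_stealable(&rt, &rt, STEAL_LATERAL));          /* 0 */
            return 0;
    }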
+ * 2007-05-05 Load balancing (smp-nice) and other improvements +@@ -60,6 +62,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -105,6 +108,20 @@ + #define NICE_0_LOAD SCHED_LOAD_SCALE + #define NICE_0_SHIFT SCHED_LOAD_SHIFT + ++#if (BITS_PER_LONG < 64) ++#define JIFFIES_TO_NS64(TIME) \ ++ ((unsigned long long)(TIME) * ((unsigned long) (1000000000 / HZ))) ++ ++#define NS64_TO_JIFFIES(TIME) \ ++ ((((unsigned long long)((TIME)) >> BITS_PER_LONG) * \ ++ (1 + NS_TO_JIFFIES(~0UL))) + NS_TO_JIFFIES((unsigned long)(TIME))) ++#else /* BITS_PER_LONG < 64 */ ++ ++#define NS64_TO_JIFFIES(TIME) NS_TO_JIFFIES(TIME) ++#define JIFFIES_TO_NS64(TIME) JIFFIES_TO_NS(TIME) ++ ++#endif /* BITS_PER_LONG < 64 */ ++ + /* + * These are the 'tuning knobs' of the scheduler: + * +@@ -123,6 +140,7 @@ DEFINE_TRACE(sched_wakeup); + DEFINE_TRACE(sched_wakeup_new); + DEFINE_TRACE(sched_switch); + DEFINE_TRACE(sched_migrate_task); ++DEFINE_TRACE(sched_task_setprio); + + #ifdef CONFIG_SMP + +@@ -148,6 +166,32 @@ static inline void sg_inc_cpu_power(stru + } + #endif + ++#define TASK_PREEMPTS_CURR(p, rq) \ ++ ((p)->prio < (rq)->curr->prio) ++ ++/* ++ * Tweaks for current ++ */ ++ ++#ifdef CURRENT_PTR ++struct task_struct * const ___current = &init_task; ++struct task_struct ** const current_ptr = (struct task_struct ** const)&___current; ++struct thread_info * const current_ti = &init_thread_union.thread_info; ++struct thread_info ** const current_ti_ptr = (struct thread_info ** const)¤t_ti; ++ ++EXPORT_SYMBOL(___current); ++EXPORT_SYMBOL(current_ti); ++ ++/* ++ * The scheduler itself doesnt want 'current' to be cached ++ * during context-switches: ++ */ ++# undef current ++# define current __current() ++# undef current_thread_info ++# define current_thread_info() __current_thread_info() ++#endif ++ + static inline int rt_policy(int policy) + { + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) +@@ -170,7 +214,7 @@ struct rt_prio_array { + + struct rt_bandwidth { + /* nests inside the rq lock: */ +- spinlock_t rt_runtime_lock; ++ raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +@@ -211,6 +255,7 @@ void init_rt_bandwidth(struct rt_bandwid + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rt_b->rt_period_timer.irqsafe = 1; + rt_b->rt_period_timer.function = sched_rt_period_timer; + } + +@@ -338,6 +383,13 @@ static DEFINE_PER_CPU(struct rt_rq, init + */ + static DEFINE_SPINLOCK(task_group_lock); + ++#ifdef CONFIG_SMP ++static int root_task_group_empty(void) ++{ ++ return list_empty(&root_task_group.children); ++} ++#endif ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_USER_SCHED + # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) +@@ -398,6 +450,13 @@ static inline void set_task_rq(struct ta + + #else + ++#ifdef CONFIG_SMP ++static int root_task_group_empty(void) ++{ ++ return 1; ++} ++#endif ++ + static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } + static inline struct task_group *task_group(struct task_struct *p) + { +@@ -474,17 +533,24 @@ struct rt_rq { + struct rt_prio_array active; + unsigned long rt_nr_running; + #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +- int highest_prio; /* highest queued rt task prio */ ++ struct { ++ int curr; /* highest queued rt task prio */ ++#ifdef CONFIG_SMP ++ int next; /* next highest */ ++#endif ++ } highest_prio; + #endif + #ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + int overloaded; ++ struct plist_head 
pushable_tasks; + #endif ++ unsigned long rt_nr_uninterruptible; + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ +- spinlock_t rt_runtime_lock; ++ raw_spinlock_t rt_runtime_lock; + + #ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; +@@ -547,7 +613,7 @@ static struct root_domain def_root_domai + */ + struct rq { + /* runqueue lock: */ +- spinlock_t lock; ++ raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because +@@ -556,7 +622,6 @@ struct rq { + unsigned long nr_running; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; +- unsigned char idle_at_tick; + #ifdef CONFIG_NO_HZ + unsigned long last_tick_seen; + unsigned char in_nohz_recently; +@@ -565,6 +630,7 @@ struct rq { + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; ++ u64 nr_migrations_in; + + struct cfs_rq cfs; + struct rt_rq rt; +@@ -585,6 +651,8 @@ struct rq { + */ + unsigned long nr_uninterruptible; + ++ unsigned long switch_timestamp; ++ unsigned long slice_avg; + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; +@@ -597,6 +665,7 @@ struct rq { + struct root_domain *rd; + struct sched_domain *sd; + ++ unsigned char idle_at_tick; + /* For active balancing */ + int active_balance; + int push_cpu; +@@ -610,6 +679,10 @@ struct rq { + struct list_head migration_queue; + #endif + ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ + #ifdef CONFIG_SCHED_HRTICK + #ifdef CONFIG_SMP + int hrtick_csd_pending; +@@ -625,9 +698,6 @@ struct rq { + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + + /* sys_sched_yield() stats */ +- unsigned int yld_exp_empty; +- unsigned int yld_act_empty; +- unsigned int yld_both_empty; + unsigned int yld_count; + + /* schedule() stats */ +@@ -641,6 +711,13 @@ struct rq { + + /* BKL stats */ + unsigned int bkl_count; ++ ++ /* RT-overload stats: */ ++ unsigned long rto_schedule; ++ unsigned long rto_schedule_tail; ++ unsigned long rto_wakeup; ++ unsigned long rto_pulled; ++ unsigned long rto_pushed; + #endif + }; + +@@ -675,11 +752,18 @@ static inline int cpu_of(struct rq *rq) + #define task_rq(p) cpu_rq(task_cpu(p)) + #define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +-static inline void update_rq_clock(struct rq *rq) ++inline void update_rq_clock(struct rq *rq) + { + rq->clock = sched_clock_cpu(cpu_of(rq)); + } + ++#ifndef CONFIG_SMP ++int task_is_current(struct task_struct *task) ++{ ++ return task_rq(task)->curr == task; ++} ++#endif ++ + /* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +@@ -868,11 +952,23 @@ static inline u64 global_rt_runtime(void + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; + } + ++/* ++ * We really dont want to do anything complex within switch_to() ++ * on PREEMPT_RT - this check enforces this. 
++ */ ++#ifdef prepare_arch_switch ++# ifdef CONFIG_PREEMPT_RT ++# error FIXME ++# else ++# define _finish_arch_switch finish_arch_switch ++# endif ++#endif ++ + #ifndef prepare_arch_switch + # define prepare_arch_switch(next) do { } while (0) + #endif + #ifndef finish_arch_switch +-# define finish_arch_switch(prev) do { } while (0) ++# define _finish_arch_switch(prev) do { } while (0) + #endif + + static inline int task_current(struct rq *rq, struct task_struct *p) +@@ -880,18 +976,39 @@ static inline int task_current(struct rq + return rq->curr == p; + } + +-#ifndef __ARCH_WANT_UNLOCKED_CTXSW + static inline int task_running(struct rq *rq, struct task_struct *p) + { ++#ifdef CONFIG_SMP ++ return p->oncpu; ++#else + return task_current(rq, p); ++#endif + } + ++#ifndef __ARCH_WANT_UNLOCKED_CTXSW + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) + { ++#ifdef CONFIG_SMP ++ /* ++ * We can optimise this out completely for !SMP, because the ++ * SMP rebalancing from interrupt is the only thing that cares ++ * here. ++ */ ++ next->oncpu = 1; ++#endif + } + + static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) + { ++#ifdef CONFIG_SMP ++ /* ++ * After ->oncpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ */ ++ smp_wmb(); ++ prev->oncpu = 0; ++#endif + #ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +@@ -903,18 +1020,10 @@ static inline void finish_lock_switch(st + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + +- spin_unlock_irq(&rq->lock); ++ spin_unlock(&rq->lock); + } + + #else /* __ARCH_WANT_UNLOCKED_CTXSW */ +-static inline int task_running(struct rq *rq, struct task_struct *p) +-{ +-#ifdef CONFIG_SMP +- return p->oncpu; +-#else +- return task_current(rq, p); +-#endif +-} + + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) + { +@@ -944,8 +1053,8 @@ static inline void finish_lock_switch(st + smp_wmb(); + prev->oncpu = 0; + #endif +-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW +- local_irq_enable(); ++#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW ++ local_irq_disable(); + #endif + } + #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ +@@ -986,6 +1095,26 @@ static struct rq *task_rq_lock(struct ta + } + } + ++void curr_rq_lock_irq_save(unsigned long *flags) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_save(*flags); ++ rq = cpu_rq(smp_processor_id()); ++ spin_lock(&rq->lock); ++} ++ ++void curr_rq_unlock_irq_restore(unsigned long *flags) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ rq = cpu_rq(smp_processor_id()); ++ spin_unlock(&rq->lock); ++ local_irq_restore(*flags); ++} ++ + void task_rq_unlock_wait(struct task_struct *p) + { + struct rq *rq = task_rq(p); +@@ -1100,7 +1229,7 @@ static void hrtick_start(struct rq *rq, + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { +- __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); ++ __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + rq->hrtick_csd_pending = 1; + } + } +@@ -1157,6 +1286,7 @@ static void init_rq_hrtick(struct rq *rq + + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; ++ rq->hrtick_timer.irqsafe = 1; + } + #else /* CONFIG_SCHED_HRTICK */ + static inline void hrtick_clear(struct rq *rq) +@@ -1191,10 +1321,10 @@ static void resched_task(struct task_str + 
+ assert_spin_locked(&task_rq(p)->lock); + +- if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) ++ if (test_tsk_need_resched(p)) + return; + +- set_tsk_thread_flag(p, TIF_NEED_RESCHED); ++ set_tsk_need_resched(p); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) +@@ -1232,7 +1362,7 @@ void wake_up_idle_cpu(int cpu) + { + struct rq *rq = cpu_rq(cpu); + +- if (cpu == smp_processor_id()) ++ if (cpu == raw_smp_processor_id()) + return; + + /* +@@ -1250,7 +1380,7 @@ void wake_up_idle_cpu(int cpu) + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ +- set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); ++ set_tsk_need_resched(rq->idle); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); +@@ -1618,21 +1748,42 @@ static inline void update_shares_locked( + + #endif + ++#ifdef CONFIG_PREEMPT ++ + /* +- * double_lock_balance - lock the busiest runqueue, this_rq is locked already. ++ * fair double_lock_balance: Safely acquires both rq->locks in a fair ++ * way at the expense of forcing extra atomic operations in all ++ * invocations. This assures that the double_lock is acquired using the ++ * same underlying policy as the spinlock_t on this architecture, which ++ * reduces latency compared to the unfair variant below. However, it ++ * also adds more overhead and therefore may reduce throughput. + */ +-static int double_lock_balance(struct rq *this_rq, struct rq *busiest) ++static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) ++ __releases(this_rq->lock) ++ __acquires(busiest->lock) ++ __acquires(this_rq->lock) ++{ ++ spin_unlock(&this_rq->lock); ++ double_rq_lock(this_rq, busiest); ++ ++ return 1; ++} ++ ++#else ++/* ++ * Unfair double_lock_balance: Optimizes throughput at the expense of ++ * latency by eliminating extra atomic operations when the locks are ++ * already in proper order on entry. This favors lower cpu-ids and will ++ * grant the double lock to lower cpus over higher ids under contention, ++ * regardless of entry order into the function. ++ */ ++static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) + { + int ret = 0; + +- if (unlikely(!irqs_disabled())) { +- /* printk() doesn't work good under rq->lock */ +- spin_unlock(&this_rq->lock); +- BUG_ON(1); +- } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); +@@ -1645,6 +1796,22 @@ static int double_lock_balance(struct rq + return ret; + } + ++#endif /* CONFIG_PREEMPT */ ++ ++/* ++ * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
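The non-preemptible _double_lock_balance() above avoids ABBA deadlock by trying the second runqueue lock opportunistically and, if that fails while the locks are in the wrong order, dropping the held lock and re-taking both lowest-address first; the CONFIG_PREEMPT variant instead always drops and re-locks both to stay fair. A user-space sketch of the same address-ordering idea with pthread mutexes (function name and locks are illustrative, and part of the fallback path is an assumption since the hunk above is abbreviated):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t rq_a = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t rq_b = PTHREAD_MUTEX_INITIALIZER;

    /*
     * Caller already holds 'this_rq'; acquire 'busiest' too without risking
     * an ABBA deadlock. Returns 1 if 'this_rq' was dropped and re-taken, so
     * the caller must re-validate any state it derived under the lock.
     */
    static int double_lock_balance(pthread_mutex_t *this_rq, pthread_mutex_t *busiest)
    {
            int ret = 0;

            if (pthread_mutex_trylock(busiest) != 0) {
                    if ((uintptr_t)busiest < (uintptr_t)this_rq) {
                            /* Wrong order: drop ours, take both lowest address first. */
                            pthread_mutex_unlock(this_rq);
                            pthread_mutex_lock(busiest);
                            pthread_mutex_lock(this_rq);
                            ret = 1;
                    } else {
                            pthread_mutex_lock(busiest);    /* already in safe order */
                    }
            }
            return ret;
    }

    int main(void)
    {
            pthread_mutex_lock(&rq_a);
            /* trylock succeeds here since nothing else holds rq_b */
            int dropped = double_lock_balance(&rq_a, &rq_b);
            printf("dropped and re-took first lock: %d\n", dropped);
            pthread_mutex_unlock(&rq_b);
            pthread_mutex_unlock(&rq_a);
            return 0;
    }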
++ */ ++static int double_lock_balance(struct rq *this_rq, struct rq *busiest) ++{ ++ if (unlikely(!irqs_disabled())) { ++ /* printk() doesn't work good under rq->lock */ ++ spin_unlock(&this_rq->lock); ++ BUG_ON(1); ++ } ++ ++ return _double_lock_balance(this_rq, busiest); ++} ++ + static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) + { +@@ -1662,6 +1829,8 @@ static void cfs_rq_set_shares(struct cfs + } + #endif + ++static void calc_load_account_active(struct rq *this_rq); ++ + #include "sched_stats.h" + #include "sched_idletask.c" + #include "sched_fair.c" +@@ -1713,6 +1882,9 @@ static void update_avg(u64 *avg, u64 sam + + static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) + { ++ if (wakeup) ++ p->se.start_runtime = p->se.sum_exec_runtime; ++ + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, wakeup); + p->se.on_rq = 1; +@@ -1720,10 +1892,15 @@ static void enqueue_task(struct rq *rq, + + static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) + { +- if (sleep && p->se.last_wakeup) { +- update_avg(&p->se.avg_overlap, +- p->se.sum_exec_runtime - p->se.last_wakeup); +- p->se.last_wakeup = 0; ++ if (sleep) { ++ if (p->se.last_wakeup) { ++ update_avg(&p->se.avg_overlap, ++ p->se.sum_exec_runtime - p->se.last_wakeup); ++ p->se.last_wakeup = 0; ++ } else { ++ update_avg(&p->se.avg_wakeup, ++ sysctl_sched_wakeup_granularity); ++ } + } + + sched_info_dequeued(p); +@@ -1754,6 +1931,8 @@ static inline int normal_prio(struct tas + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); ++ ++// trace_special_pid(p->pid, PRIO(p), __PRIO(prio)); + return prio; + } + +@@ -1893,12 +2072,15 @@ void set_task_cpu(struct task_struct *p, + p->se.sleep_start -= clock_offset; + if (p->se.block_start) + p->se.block_start -= clock_offset; ++#endif + if (old_cpu != new_cpu) { +- schedstat_inc(p, se.nr_migrations); ++ p->se.nr_migrations++; ++ new_rq->nr_migrations_in++; ++#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, old_rq->clock, NULL)) + schedstat_inc(p, se.nr_forced2_migrations); +- } + #endif ++ } + p->se.vruntime -= old_cfsrq->min_vruntime - + new_cfsrq->min_vruntime; + +@@ -2025,7 +2207,7 @@ unsigned long wait_task_inactive(struct + * it must be off the runqueue _entirely_, and not + * preempted! + * +- * So if it wa still runnable (but just not actively ++ * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ +@@ -2250,6 +2432,47 @@ static int sched_balance_self(int cpu, i + + #endif /* CONFIG_SMP */ + ++#ifdef CONFIG_DEBUG_PREEMPT ++void notrace preempt_enable_no_resched(void) ++{ ++ static int once = 1; ++ ++ barrier(); ++ dec_preempt_count(); ++ ++ if (once && !preempt_count()) { ++ once = 0; ++ printk(KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n", ++ current->comm, current->pid); ++ dump_stack(); ++ } ++} ++ ++EXPORT_SYMBOL(preempt_enable_no_resched); ++#endif ++ ++ ++/** ++ * task_oncpu_function_call - call a function on the cpu on which a task runs ++ * @p: the task to evaluate ++ * @func: the function to be called ++ * @info: the function call argument ++ * ++ * Calls the function @func when the task is currently running. 
This might ++ * be on the current CPU, which just calls the function directly ++ */ ++void task_oncpu_function_call(struct task_struct *p, ++ void (*func) (void *info), void *info) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if (task_curr(p)) ++ smp_call_function_single(cpu, func, info, 1); ++ preempt_enable(); ++} ++ + /*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread +@@ -2264,7 +2487,8 @@ static int sched_balance_self(int cpu, i + * + * returns failure only if the task is already active. + */ +-static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int sync, int mutex) + { + int cpu, orig_cpu, this_cpu, success = 0; + unsigned long flags; +@@ -2275,7 +2499,7 @@ static int try_to_wake_up(struct task_st + sync = 0; + + #ifdef CONFIG_SMP +- if (sched_feat(LB_WAKEUP_UPDATE)) { ++ if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { + struct sched_domain *sd; + + this_cpu = raw_smp_processor_id(); +@@ -2290,6 +2514,13 @@ static int try_to_wake_up(struct task_st + } + #endif + ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * sync wakeups can increase wakeup latencies: ++ */ ++ if (rt_task(p)) ++ sync = 0; ++#endif + smp_wmb(); + rq = task_rq_lock(p, &flags); + update_rq_clock(rq); +@@ -2353,18 +2584,43 @@ out_activate: + activate_task(rq, p, 1); + success = 1; + ++ /* ++ * Only attribute actual wakeups done by this task. ++ */ ++ if (!in_interrupt()) { ++ struct sched_entity *se = ¤t->se; ++ u64 sample = se->sum_exec_runtime; ++ ++ if (se->last_wakeup) ++ sample -= se->last_wakeup; ++ else ++ sample -= se->start_runtime; ++ update_avg(&se->avg_wakeup, sample); ++ ++ se->last_wakeup = se->sum_exec_runtime; ++ } ++ + out_running: + trace_sched_wakeup(rq, p, success); + check_preempt_curr(rq, p, sync); + +- p->state = TASK_RUNNING; ++ /* ++ * For a mutex wakeup we or TASK_RUNNING_MUTEX to the task ++ * state to preserve the original state, so a real wakeup ++ * still can see the (UN)INTERRUPTIBLE bits in the state check ++ * above. We dont have to worry about the | TASK_RUNNING_MUTEX ++ * here. The waiter is serialized by the mutex lock and nobody ++ * else can fiddle with p->state as we hold rq lock. 
++ */ ++ if (mutex) ++ p->state |= TASK_RUNNING_MUTEX; ++ else ++ p->state = TASK_RUNNING; + #ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); + #endif + out: +- current->se.last_wakeup = current->se.sum_exec_runtime; +- + task_rq_unlock(rq, &flags); + + return success; +@@ -2372,13 +2628,31 @@ out: + + int wake_up_process(struct task_struct *p) + { +- return try_to_wake_up(p, TASK_ALL, 0); ++ return try_to_wake_up(p, TASK_ALL, 0, 0); + } + EXPORT_SYMBOL(wake_up_process); + ++int wake_up_process_sync(struct task_struct * p) ++{ ++ return try_to_wake_up(p, TASK_ALL, 1, 0); ++} ++EXPORT_SYMBOL(wake_up_process_sync); ++ ++int wake_up_process_mutex(struct task_struct * p) ++{ ++ return try_to_wake_up(p, TASK_ALL, 0, 1); ++} ++EXPORT_SYMBOL(wake_up_process_mutex); ++ ++int wake_up_process_mutex_sync(struct task_struct * p) ++{ ++ return try_to_wake_up(p, TASK_ALL, 1, 1); ++} ++EXPORT_SYMBOL(wake_up_process_mutex_sync); ++ + int wake_up_state(struct task_struct *p, unsigned int state) + { +- return try_to_wake_up(p, state, 0); ++ return try_to_wake_up(p, state, 0, 0); + } + + /* +@@ -2392,8 +2666,11 @@ static void __sched_fork(struct task_str + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; ++ p->se.nr_migrations = 0; + p->se.last_wakeup = 0; + p->se.avg_overlap = 0; ++ p->se.start_runtime = 0; ++ p->se.avg_wakeup = sysctl_sched_wakeup_granularity; + + #ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; +@@ -2449,13 +2726,15 @@ void sched_fork(struct task_struct *p, i + if (likely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); + #endif +-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) ++#if defined(CONFIG_SMP) + p->oncpu = 0; + #endif + #ifdef CONFIG_PREEMPT + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; + #endif ++ plist_node_init(&p->pushable_tasks, MAX_PRIO); ++ + put_cpu(); + } + +@@ -2499,7 +2778,7 @@ void wake_up_new_task(struct task_struct + #ifdef CONFIG_PREEMPT_NOTIFIERS + + /** +- * preempt_notifier_register - tell me when current is being being preempted & rescheduled ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register + */ + void preempt_notifier_register(struct preempt_notifier *notifier) +@@ -2525,8 +2804,17 @@ static void fire_sched_in_preempt_notifi + struct preempt_notifier *notifier; + struct hlist_node *node; + ++ if (hlist_empty(&curr->preempt_notifiers)) ++ return; ++ ++ /* ++ * The KVM sched in notifier expects to be called with ++ * interrupts enabled. 
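For a mutex wakeup, try_to_wake_up() above ORs TASK_RUNNING_MUTEX into p->state instead of overwriting it, so a later real wakeup can still see the original (UN)INTERRUPTIBLE bits. A small bit-flag demo of why OR-ing preserves that information; the flag values here are illustrative, not the kernel's:

    #include <stdio.h>

    #define TASK_RUNNING            0x0000
    #define TASK_INTERRUPTIBLE      0x0001
    #define TASK_UNINTERRUPTIBLE    0x0002
    #define TASK_RUNNING_MUTEX      0x0100  /* illustrative value */

    int main(void)
    {
            unsigned int state = TASK_INTERRUPTIBLE;  /* task slept interruptibly */

            /* Mutex wakeup: mark runnable for the rtmutex code, keep the old bits. */
            state |= TASK_RUNNING_MUTEX;
            printf("interruptible bit still visible: %d\n",
                   !!(state & TASK_INTERRUPTIBLE));            /* 1 */

            /* A real wakeup simply overwrites the state. */
            state = TASK_RUNNING;
            printf("state after real wakeup: %#x\n", state);   /* 0 */
            return 0;
    }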
++ */ ++ local_irq_enable(); + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++ local_irq_disable(); + } + + static void +@@ -2596,6 +2884,12 @@ static void finish_task_switch(struct rq + { + struct mm_struct *mm = rq->prev_mm; + long prev_state; ++#ifdef CONFIG_SMP ++ int post_schedule = 0; ++ ++ if (current->sched_class->needs_post_schedule) ++ post_schedule = current->sched_class->needs_post_schedule(rq); ++#endif + + rq->prev_mm = NULL; + +@@ -2611,16 +2905,21 @@ static void finish_task_switch(struct rq + * Manfred Spraul + */ + prev_state = prev->state; +- finish_arch_switch(prev); ++ _finish_arch_switch(prev); ++ perf_counter_task_sched_in(current, cpu_of(rq)); + finish_lock_switch(rq, prev); + #ifdef CONFIG_SMP +- if (current->sched_class->post_schedule) ++ if (post_schedule) + current->sched_class->post_schedule(rq); + #endif + + fire_sched_in_preempt_notifiers(current); ++ /* ++ * Delay the final freeing of the mm or task, so that we dont have ++ * to do complex work from within the scheduler: ++ */ + if (mm) +- mmdrop(mm); ++ mmdrop_delayed(mm); + if (unlikely(prev_state == TASK_DEAD)) { + /* + * Remove function-return probe instances associated with this +@@ -2638,12 +2937,15 @@ static void finish_task_switch(struct rq + asmlinkage void schedule_tail(struct task_struct *prev) + __releases(rq->lock) + { +- struct rq *rq = this_rq(); +- +- finish_task_switch(rq, prev); ++ preempt_disable(); ++ finish_task_switch(this_rq(), prev); ++ __preempt_enable_no_resched(); ++ local_irq_enable(); + #ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); ++#else ++ preempt_check_resched(); + #endif + if (current->set_child_tid) + put_user(task_pid_vnr(current), current->set_child_tid); +@@ -2691,6 +2993,11 @@ context_switch(struct rq *rq, struct tas + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + #endif + ++#ifdef CURRENT_PTR ++ barrier(); ++ *current_ptr = next; ++ *current_ti_ptr = next->thread_info; ++#endif + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + +@@ -2737,6 +3044,11 @@ unsigned long nr_uninterruptible(void) + return sum; + } + ++unsigned long nr_uninterruptible_cpu(int cpu) ++{ ++ return cpu_rq(cpu)->nr_uninterruptible; ++} ++ + unsigned long long nr_context_switches(void) + { + int i; +@@ -2755,22 +3067,91 @@ unsigned long nr_iowait(void) + for_each_possible_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + ++ /* ++ * Since we read the counters lockless, it might be slightly ++ * inaccurate. Do not allow it to go below zero though: ++ */ ++ if (unlikely((long)sum < 0)) ++ sum = 0; ++ + return sum; + } + +-unsigned long nr_active(void) ++/* Variables and functions for calc_load */ ++static atomic_long_t calc_load_tasks; ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. 
++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++static unsigned long ++calc_load(unsigned long load, unsigned long exp, unsigned long active) + { +- unsigned long i, running = 0, uninterruptible = 0; ++ load *= exp; ++ load += active * (FIXED_1 - exp); ++ return load >> FSHIFT; ++} + +- for_each_online_cpu(i) { +- running += cpu_rq(i)->nr_running; +- uninterruptible += cpu_rq(i)->nr_uninterruptible; +- } ++/* ++ * calc_load - update the avenrun load estimates 10 ticks after the ++ * CPUs have updated calc_load_tasks. ++ */ ++void calc_global_load(void) ++{ ++ unsigned long upd = calc_load_update + 10; ++ long active; ++ ++ if (time_before(jiffies, upd)) ++ return; ++ ++ active = atomic_long_read(&calc_load_tasks); ++ active = active > 0 ? active * FIXED_1 : 0; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update += LOAD_FREQ; ++} ++ ++/* ++ * Either called from update_cpu_load() or from a cpu going idle ++ */ ++static void calc_load_account_active(struct rq *this_rq) ++{ ++ long nr_active, delta; ++ ++ nr_active = this_rq->nr_running; ++ nr_active += (long) this_rq->nr_uninterruptible; + +- if (unlikely((long)uninterruptible < 0)) +- uninterruptible = 0; ++ if (nr_active != this_rq->calc_load_active) { ++ delta = nr_active - this_rq->calc_load_active; ++ this_rq->calc_load_active = nr_active; ++ atomic_long_add(delta, &calc_load_tasks); ++ } ++} + +- return running + uninterruptible; ++/* ++ * Externally visible per-cpu scheduler statistics: ++ * cpu_nr_migrations(cpu) - number of migrations into that cpu ++ */ ++u64 cpu_nr_migrations(int cpu) ++{ ++ return cpu_rq(cpu)->nr_migrations_in; + } + + /* +@@ -2801,6 +3182,11 @@ static void update_cpu_load(struct rq *t + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + } ++ ++ if (time_after_eq(jiffies, this_rq->calc_load_update)) { ++ this_rq->calc_load_update += LOAD_FREQ; ++ calc_load_account_active(this_rq); ++ } + } + + #ifdef CONFIG_SMP +@@ -2921,6 +3307,7 @@ int can_migrate_task(struct task_struct + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) + { ++ int tsk_cache_hot = 0; + /* + * We do not migrate tasks that are: + * 1) running (obviously), or +@@ -2944,10 +3331,11 @@ int can_migrate_task(struct task_struct + * 2) too many balance attempts have failed. + */ + +- if (!task_hot(p, rq->clock, sd) || +- sd->nr_balance_failed > sd->cache_nice_tries) { ++ tsk_cache_hot = task_hot(p, rq->clock, sd); ++ if (!tsk_cache_hot || ++ sd->nr_balance_failed > sd->cache_nice_tries) { + #ifdef CONFIG_SCHEDSTATS +- if (task_hot(p, rq->clock, sd)) { ++ if (tsk_cache_hot) { + schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(p, se.nr_forced_migrations); + } +@@ -2955,7 +3343,7 @@ int can_migrate_task(struct task_struct + return 1; + } + +- if (task_hot(p, rq->clock, sd)) { ++ if (tsk_cache_hot) { + schedstat_inc(p, se.nr_failed_migrations_hot); + return 0; + } +@@ -2995,6 +3383,16 @@ next: + pulled++; + rem_load_move -= p->se.load.weight; + ++#ifdef CONFIG_PREEMPT ++ /* ++ * NEWIDLE balancing is a source of latency, so preemptible kernels ++ * will stop after the first task is pulled to minimize the critical ++ * section. 
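calc_global_load() above updates the 1/5/15-minute load averages with the fixed-point recurrence load = (load*exp + active*(FIXED_1 - exp)) >> FSHIFT, fed every LOAD_FREQ from the per-runqueue calc_load_tasks deltas. A standalone sketch of that arithmetic; calc_load() is taken from the hunk itself, while the EXP_* constants and the LOAD_INT/LOAD_FRAC presentation macros are the usual kernel values (FSHIFT = 11, 5-second LOAD_FREQ) assumed here rather than shown in this hunk:

    #include <stdio.h>

    #define FSHIFT  11                      /* bits of fractional precision */
    #define FIXED_1 (1 << FSHIFT)           /* 1.0 in fixed point */
    #define EXP_1   1884                    /* 1/exp(5s/1min) in fixed point */
    #define EXP_5   2014                    /* 1/exp(5s/5min) */
    #define EXP_15  2037                    /* 1/exp(5s/15min) */

    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avenrun[3] = { 0, 0, 0 };
            long nr_active = 3;             /* running + uninterruptible tasks */
            unsigned long active = nr_active * FIXED_1;

            /* Simulate one minute of 5-second LOAD_FREQ ticks at constant load. */
            for (int i = 0; i < 12; i++) {
                    avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                    avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                    avenrun[2] = calc_load(avenrun[2], EXP_15, active);
            }

            printf("loadavg: %lu.%02lu %lu.%02lu %lu.%02lu\n",
                   LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
                   LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
                   LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
            return 0;
    }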
++ */ ++ if (idle == CPU_NEWLY_IDLE) ++ goto out; ++#endif ++ + /* + * We only want to steal up to the prescribed amount of weighted load. + */ +@@ -3041,9 +3439,15 @@ static int move_tasks(struct rq *this_rq + sd, idle, all_pinned, &this_best_prio); + class = class->next; + ++#ifdef CONFIG_PREEMPT ++ /* ++ * NEWIDLE balancing is a source of latency, so preemptible ++ * kernels will stop after the first task is pulled to minimize ++ * the critical section. ++ */ + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; +- ++#endif + } while (class && max_load_move > total_load_moved); + + return total_load_moved > 0; +@@ -3093,246 +3497,479 @@ static int move_one_task(struct rq *this + + return 0; + } +- +-/* +- * find_busiest_group finds and returns the busiest CPU group within the +- * domain. It calculates and returns the amount of weighted load which +- * should be moved to restore balance via the imbalance parameter. ++/********** Helpers for find_busiest_group ************************/ ++/** ++ * sd_lb_stats - Structure to store the statistics of a sched_domain ++ * during load balancing. + */ +-static struct sched_group * +-find_busiest_group(struct sched_domain *sd, int this_cpu, +- unsigned long *imbalance, enum cpu_idle_type idle, +- int *sd_idle, const struct cpumask *cpus, int *balance) +-{ +- struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; +- unsigned long max_load, avg_load, total_load, this_load, total_pwr; +- unsigned long max_pull; +- unsigned long busiest_load_per_task, busiest_nr_running; +- unsigned long this_load_per_task, this_nr_running; +- int load_idx, group_imb = 0; ++struct sd_lb_stats { ++ struct sched_group *busiest; /* Busiest group in this sd */ ++ struct sched_group *this; /* Local group in this sd */ ++ unsigned long total_load; /* Total load of all groups in sd */ ++ unsigned long total_pwr; /* Total power of all groups in sd */ ++ unsigned long avg_load; /* Average load across all groups in sd */ ++ ++ /** Statistics of this group */ ++ unsigned long this_load; ++ unsigned long this_load_per_task; ++ unsigned long this_nr_running; ++ ++ /* Statistics of the busiest group */ ++ unsigned long max_load; ++ unsigned long busiest_load_per_task; ++ unsigned long busiest_nr_running; ++ ++ int group_imb; /* Is there imbalance in this sd */ + #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +- int power_savings_balance = 1; +- unsigned long leader_nr_running = 0, min_load_per_task = 0; +- unsigned long min_nr_running = ULONG_MAX; +- struct sched_group *group_min = NULL, *group_leader = NULL; ++ int power_savings_balance; /* Is powersave balance needed for this sd */ ++ struct sched_group *group_min; /* Least loaded group in sd */ ++ struct sched_group *group_leader; /* Group which relieves group_min */ ++ unsigned long min_load_per_task; /* load_per_task in group_min */ ++ unsigned long leader_nr_running; /* Nr running of group_leader */ ++ unsigned long min_nr_running; /* Nr running of group_min */ + #endif ++}; + +- max_load = this_load = total_load = total_pwr = 0; +- busiest_load_per_task = busiest_nr_running = 0; +- this_load_per_task = this_nr_running = 0; ++/** ++ * sg_lb_stats - stats of a sched_group required for load_balancing ++ */ ++struct sg_lb_stats { ++ unsigned long avg_load; /*Avg load across the CPUs of the group */ ++ unsigned long group_load; /* Total load over the CPUs of the group */ ++ unsigned long sum_nr_running; /* Nr tasks running in the group */ ++ unsigned long sum_weighted_load; /* Weighted load of 
group's tasks */ ++ unsigned long group_capacity; ++ int group_imb; /* Is there an imbalance in the group ? */ ++}; ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. ++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_cpus(group)); ++} ++ ++/** ++ * get_sd_load_idx - Obtain the load index for a given sched domain. ++ * @sd: The sched_domain whose load_idx is to be obtained. ++ * @idle: The Idle status of the CPU for whose sd load_icx is obtained. ++ */ ++static inline int get_sd_load_idx(struct sched_domain *sd, ++ enum cpu_idle_type idle) ++{ ++ int load_idx; + +- if (idle == CPU_NOT_IDLE) ++ switch (idle) { ++ case CPU_NOT_IDLE: + load_idx = sd->busy_idx; +- else if (idle == CPU_NEWLY_IDLE) ++ break; ++ ++ case CPU_NEWLY_IDLE: + load_idx = sd->newidle_idx; +- else ++ break; ++ default: + load_idx = sd->idle_idx; ++ break; ++ } + +- do { +- unsigned long load, group_capacity, max_cpu_load, min_cpu_load; +- int local_group; +- int i; +- int __group_imb = 0; +- unsigned int balance_cpu = -1, first_idle_cpu = 0; +- unsigned long sum_nr_running, sum_weighted_load; +- unsigned long sum_avg_load_per_task; +- unsigned long avg_load_per_task; ++ return load_idx; ++} + +- local_group = cpumask_test_cpu(this_cpu, +- sched_group_cpus(group)); + +- if (local_group) +- balance_cpu = cpumask_first(sched_group_cpus(group)); ++#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) ++/** ++ * init_sd_power_savings_stats - Initialize power savings statistics for ++ * the given sched_domain, during load balancing. ++ * ++ * @sd: Sched domain whose power-savings statistics are to be initialized. ++ * @sds: Variable containing the statistics for sd. ++ * @idle: Idle status of the CPU at which we're performing load-balancing. ++ */ ++static inline void init_sd_power_savings_stats(struct sched_domain *sd, ++ struct sd_lb_stats *sds, enum cpu_idle_type idle) ++{ ++ /* ++ * Busy processors will not participate in power savings ++ * balance. ++ */ ++ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) ++ sds->power_savings_balance = 0; ++ else { ++ sds->power_savings_balance = 1; ++ sds->min_nr_running = ULONG_MAX; ++ sds->leader_nr_running = 0; ++ } ++} + +- /* Tally up the load of all CPUs in the group */ +- sum_weighted_load = sum_nr_running = avg_load = 0; +- sum_avg_load_per_task = avg_load_per_task = 0; ++/** ++ * update_sd_power_savings_stats - Update the power saving stats for a ++ * sched_domain while performing load balancing. ++ * ++ * @group: sched_group belonging to the sched_domain under consideration. ++ * @sds: Variable containing the statistics of the sched_domain ++ * @local_group: Does group contain the CPU for which we're performing ++ * load balancing ? ++ * @sgs: Variable containing the statistics of the group. 
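++ *
++ * Tracks two candidates in @sds: group_min, the least loaded non-idle
++ * group (whose tasks could be migrated away so its CPUs may idle),
++ * and group_leader, the nearly full group best placed to take that
++ * load.  Does nothing once sds->power_savings_balance is cleared.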
++ */ ++static inline void update_sd_power_savings_stats(struct sched_group *group, ++ struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) ++{ + +- max_cpu_load = 0; +- min_cpu_load = ~0UL; ++ if (!sds->power_savings_balance) ++ return; + +- for_each_cpu_and(i, sched_group_cpus(group), cpus) { +- struct rq *rq = cpu_rq(i); ++ /* ++ * If the local group is idle or completely loaded ++ * no need to do power savings balance at this domain ++ */ ++ if (local_group && (sds->this_nr_running >= sgs->group_capacity || ++ !sds->this_nr_running)) ++ sds->power_savings_balance = 0; + +- if (*sd_idle && rq->nr_running) +- *sd_idle = 0; ++ /* ++ * If a group is already running at full capacity or idle, ++ * don't include that group in power savings calculations ++ */ ++ if (!sds->power_savings_balance || ++ sgs->sum_nr_running >= sgs->group_capacity || ++ !sgs->sum_nr_running) ++ return; + +- /* Bias balancing toward cpus of our domain */ +- if (local_group) { +- if (idle_cpu(i) && !first_idle_cpu) { +- first_idle_cpu = 1; +- balance_cpu = i; +- } ++ /* ++ * Calculate the group which has the least non-idle load. ++ * This is the group from where we need to pick up the load ++ * for saving power ++ */ ++ if ((sgs->sum_nr_running < sds->min_nr_running) || ++ (sgs->sum_nr_running == sds->min_nr_running && ++ group_first_cpu(group) > group_first_cpu(sds->group_min))) { ++ sds->group_min = group; ++ sds->min_nr_running = sgs->sum_nr_running; ++ sds->min_load_per_task = sgs->sum_weighted_load / ++ sgs->sum_nr_running; ++ } + +- load = target_load(i, load_idx); +- } else { +- load = source_load(i, load_idx); +- if (load > max_cpu_load) +- max_cpu_load = load; +- if (min_cpu_load > load) +- min_cpu_load = load; +- } ++ /* ++ * Calculate the group which is almost near its ++ * capacity but still has some space to pick up some load ++ * from other group and save more power ++ */ ++ if (sgs->sum_nr_running > sgs->group_capacity - 1) ++ return; + +- avg_load += load; +- sum_nr_running += rq->nr_running; +- sum_weighted_load += weighted_cpuload(i); ++ if (sgs->sum_nr_running > sds->leader_nr_running || ++ (sgs->sum_nr_running == sds->leader_nr_running && ++ group_first_cpu(group) < group_first_cpu(sds->group_leader))) { ++ sds->group_leader = group; ++ sds->leader_nr_running = sgs->sum_nr_running; ++ } ++} + +- sum_avg_load_per_task += cpu_avg_load_per_task(i); +- } ++/** ++ * check_power_save_busiest_group - Check if we have potential to perform ++ * some power-savings balance. If yes, set the busiest group to be ++ * the least loaded group in the sched_domain, so that it's CPUs can ++ * be put to idle. ++ * ++ * @sds: Variable containing the statistics of the sched_domain ++ * under consideration. ++ * @this_cpu: Cpu at which we're currently performing load-balancing. ++ * @imbalance: Variable to store the imbalance. ++ * ++ * Returns 1 if there is potential to perform power-savings balance. ++ * Else returns 0. ++ */ ++static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, ++ int this_cpu, unsigned long *imbalance) ++{ ++ if (!sds->power_savings_balance) ++ return 0; + +- /* +- * First idle cpu or the first cpu(busiest) in this sched group +- * is eligible for doing load balancing at this and above +- * domains. In the newly idle case, we will allow all the cpu's +- * to do the newly idle load balance. 
+- */ +- if (idle != CPU_NEWLY_IDLE && local_group && +- balance_cpu != this_cpu && balance) { +- *balance = 0; +- goto ret; +- } ++ if (sds->this != sds->group_leader || ++ sds->group_leader == sds->group_min) ++ return 0; + +- total_load += avg_load; +- total_pwr += group->__cpu_power; ++ *imbalance = sds->min_load_per_task; ++ sds->busiest = sds->group_min; + +- /* Adjust by relative CPU power of the group */ +- avg_load = sg_div_cpu_power(group, +- avg_load * SCHED_LOAD_SCALE); ++ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { ++ cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = ++ group_first_cpu(sds->group_leader); ++ } + ++ return 1; + +- /* +- * Consider the group unbalanced when the imbalance is larger +- * than the average weight of two tasks. +- * +- * APZ: with cgroup the avg task weight can vary wildly and +- * might not be a suitable number - should we keep a +- * normalized nr_running number somewhere that negates +- * the hierarchy? +- */ +- avg_load_per_task = sg_div_cpu_power(group, +- sum_avg_load_per_task * SCHED_LOAD_SCALE); ++} ++#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ ++static inline void init_sd_power_savings_stats(struct sched_domain *sd, ++ struct sd_lb_stats *sds, enum cpu_idle_type idle) ++{ ++ return; ++} ++ ++static inline void update_sd_power_savings_stats(struct sched_group *group, ++ struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) ++{ ++ return; ++} ++ ++static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, ++ int this_cpu, unsigned long *imbalance) ++{ ++ return 0; ++} ++#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ ++ ++ ++/** ++ * update_sg_lb_stats - Update sched_group's statistics for load balancing. ++ * @group: sched_group whose statistics are to be updated. ++ * @this_cpu: Cpu for which load balance is currently performed. ++ * @idle: Idle status of this_cpu ++ * @load_idx: Load index of sched_domain of this_cpu for load calc. ++ * @sd_idle: Idle status of the sched_domain containing group. ++ * @local_group: Does group contain this_cpu. ++ * @cpus: Set of cpus considered for load balancing. ++ * @balance: Should we balance. ++ * @sgs: variable to hold the statistics for this group. 
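++ *
++ * In effect (sg_div_cpu_power() is a reciprocal-based divide by
++ * group->__cpu_power), the derived figures are:
++ *
++ *    sgs->avg_load       = sgs->group_load * SCHED_LOAD_SCALE
++ *                              / group->__cpu_power;
++ *    sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
++ *    sgs->group_imb      = (max_cpu_load - min_cpu_load)
++ *                              > 2 * avg_load_per_task;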
++ */ ++static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, ++ enum cpu_idle_type idle, int load_idx, int *sd_idle, ++ int local_group, const struct cpumask *cpus, ++ int *balance, struct sg_lb_stats *sgs) ++{ ++ unsigned long load, max_cpu_load, min_cpu_load; ++ int i; ++ unsigned int balance_cpu = -1, first_idle_cpu = 0; ++ unsigned long sum_avg_load_per_task; ++ unsigned long avg_load_per_task; ++ ++ if (local_group) ++ balance_cpu = group_first_cpu(group); + +- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) +- __group_imb = 1; ++ /* Tally up the load of all CPUs in the group */ ++ sum_avg_load_per_task = avg_load_per_task = 0; ++ max_cpu_load = 0; ++ min_cpu_load = ~0UL; + +- group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; ++ for_each_cpu_and(i, sched_group_cpus(group), cpus) { ++ struct rq *rq = cpu_rq(i); + ++ if (*sd_idle && rq->nr_running) ++ *sd_idle = 0; ++ ++ /* Bias balancing toward cpus of our domain */ + if (local_group) { +- this_load = avg_load; +- this = group; +- this_nr_running = sum_nr_running; +- this_load_per_task = sum_weighted_load; +- } else if (avg_load > max_load && +- (sum_nr_running > group_capacity || __group_imb)) { +- max_load = avg_load; +- busiest = group; +- busiest_nr_running = sum_nr_running; +- busiest_load_per_task = sum_weighted_load; +- group_imb = __group_imb; ++ if (idle_cpu(i) && !first_idle_cpu) { ++ first_idle_cpu = 1; ++ balance_cpu = i; ++ } ++ ++ load = target_load(i, load_idx); ++ } else { ++ load = source_load(i, load_idx); ++ if (load > max_cpu_load) ++ max_cpu_load = load; ++ if (min_cpu_load > load) ++ min_cpu_load = load; + } + +-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +- /* +- * Busy processors will not participate in power savings +- * balance. +- */ +- if (idle == CPU_NOT_IDLE || +- !(sd->flags & SD_POWERSAVINGS_BALANCE)) +- goto group_next; ++ sgs->group_load += load; ++ sgs->sum_nr_running += rq->nr_running; ++ sgs->sum_weighted_load += weighted_cpuload(i); + +- /* +- * If the local group is idle or completely loaded +- * no need to do power savings balance at this domain +- */ +- if (local_group && (this_nr_running >= group_capacity || +- !this_nr_running)) +- power_savings_balance = 0; ++ sum_avg_load_per_task += cpu_avg_load_per_task(i); ++ } + +- /* +- * If a group is already running at full capacity or idle, +- * don't include that group in power savings calculations +- */ +- if (!power_savings_balance || sum_nr_running >= group_capacity +- || !sum_nr_running) +- goto group_next; ++ /* ++ * First idle cpu or the first cpu(busiest) in this sched group ++ * is eligible for doing load balancing at this and above ++ * domains. In the newly idle case, we will allow all the cpu's ++ * to do the newly idle load balance. ++ */ ++ if (idle != CPU_NEWLY_IDLE && local_group && ++ balance_cpu != this_cpu && balance) { ++ *balance = 0; ++ return; ++ } + +- /* +- * Calculate the group which has the least non-idle load. 
+- * This is the group from where we need to pick up the load +- * for saving power +- */ +- if ((sum_nr_running < min_nr_running) || +- (sum_nr_running == min_nr_running && +- cpumask_first(sched_group_cpus(group)) > +- cpumask_first(sched_group_cpus(group_min)))) { +- group_min = group; +- min_nr_running = sum_nr_running; +- min_load_per_task = sum_weighted_load / +- sum_nr_running; +- } ++ /* Adjust by relative CPU power of the group */ ++ sgs->avg_load = sg_div_cpu_power(group, ++ sgs->group_load * SCHED_LOAD_SCALE); + +- /* +- * Calculate the group which is almost near its +- * capacity but still has some space to pick up some load +- * from other group and save more power +- */ +- if (sum_nr_running <= group_capacity - 1) { +- if (sum_nr_running > leader_nr_running || +- (sum_nr_running == leader_nr_running && +- cpumask_first(sched_group_cpus(group)) < +- cpumask_first(sched_group_cpus(group_leader)))) { +- group_leader = group; +- leader_nr_running = sum_nr_running; +- } ++ ++ /* ++ * Consider the group unbalanced when the imbalance is larger ++ * than the average weight of two tasks. ++ * ++ * APZ: with cgroup the avg task weight can vary wildly and ++ * might not be a suitable number - should we keep a ++ * normalized nr_running number somewhere that negates ++ * the hierarchy? ++ */ ++ avg_load_per_task = sg_div_cpu_power(group, ++ sum_avg_load_per_task * SCHED_LOAD_SCALE); ++ ++ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) ++ sgs->group_imb = 1; ++ ++ sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; ++ ++} ++ ++/** ++ * update_sd_lb_stats - Update sched_group's statistics for load balancing. ++ * @sd: sched_domain whose statistics are to be updated. ++ * @this_cpu: Cpu for which load balance is currently performed. ++ * @idle: Idle status of this_cpu ++ * @sd_idle: Idle status of the sched_domain containing group. ++ * @cpus: Set of cpus considered for load balancing. ++ * @balance: Should we balance. ++ * @sds: variable to hold the statistics for this sched_domain. 
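++ *
++ * Walks the circular list starting at sd->groups exactly once,
++ * computing a fresh sg_lb_stats for each group and folding it into
++ * @sds: the local group ends up in sds->this*, while the most loaded
++ * overloaded (or internally imbalanced) remote group ends up in
++ * sds->busiest*.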
++ */ ++static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, ++ enum cpu_idle_type idle, int *sd_idle, ++ const struct cpumask *cpus, int *balance, ++ struct sd_lb_stats *sds) ++{ ++ struct sched_group *group = sd->groups; ++ struct sg_lb_stats sgs; ++ int load_idx; ++ ++ init_sd_power_savings_stats(sd, sds, idle); ++ load_idx = get_sd_load_idx(sd, idle); ++ ++ do { ++ int local_group; ++ ++ local_group = cpumask_test_cpu(this_cpu, ++ sched_group_cpus(group)); ++ memset(&sgs, 0, sizeof(sgs)); ++ update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, ++ local_group, cpus, balance, &sgs); ++ ++ if (local_group && balance && !(*balance)) ++ return; ++ ++ sds->total_load += sgs.group_load; ++ sds->total_pwr += group->__cpu_power; ++ ++ if (local_group) { ++ sds->this_load = sgs.avg_load; ++ sds->this = group; ++ sds->this_nr_running = sgs.sum_nr_running; ++ sds->this_load_per_task = sgs.sum_weighted_load; ++ } else if (sgs.avg_load > sds->max_load && ++ (sgs.sum_nr_running > sgs.group_capacity || ++ sgs.group_imb)) { ++ sds->max_load = sgs.avg_load; ++ sds->busiest = group; ++ sds->busiest_nr_running = sgs.sum_nr_running; ++ sds->busiest_load_per_task = sgs.sum_weighted_load; ++ sds->group_imb = sgs.group_imb; + } +-group_next: +-#endif ++ ++ update_sd_power_savings_stats(group, sds, local_group, &sgs); + group = group->next; + } while (group != sd->groups); + +- if (!busiest || this_load >= max_load || busiest_nr_running == 0) +- goto out_balanced; +- +- avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; ++} + +- if (this_load >= avg_load || +- 100*max_load <= sd->imbalance_pct*this_load) +- goto out_balanced; ++/** ++ * fix_small_imbalance - Calculate the minor imbalance that exists ++ * amongst the groups of a sched_domain, during ++ * load balancing. ++ * @sds: Statistics of the sched_domain whose imbalance is to be calculated. ++ * @this_cpu: The cpu at whose sched_domain we're performing load-balance. ++ * @imbalance: Variable to store the imbalance. ++ */ ++static inline void fix_small_imbalance(struct sd_lb_stats *sds, ++ int this_cpu, unsigned long *imbalance) ++{ ++ unsigned long tmp, pwr_now = 0, pwr_move = 0; ++ unsigned int imbn = 2; ++ ++ if (sds->this_nr_running) { ++ sds->this_load_per_task /= sds->this_nr_running; ++ if (sds->busiest_load_per_task > ++ sds->this_load_per_task) ++ imbn = 1; ++ } else ++ sds->this_load_per_task = ++ cpu_avg_load_per_task(this_cpu); + +- busiest_load_per_task /= busiest_nr_running; +- if (group_imb) +- busiest_load_per_task = min(busiest_load_per_task, avg_load); ++ if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= ++ sds->busiest_load_per_task * imbn) { ++ *imbalance = sds->busiest_load_per_task; ++ return; ++ } + + /* +- * We're trying to get all the cpus to the average_load, so we don't +- * want to push ourselves above the average load, nor do we wish to +- * reduce the max loaded cpu below the average load, as either of these +- * actions would just result in more rebalancing later, and ping-pong +- * tasks around. Thus we look for the minimum possible imbalance. +- * Negative imbalances (*we* are more loaded than anyone else) will +- * be counted as no imbalance for these purposes -- we can't fix that +- * by pulling tasks to us. Be careful of negative numbers as they'll +- * appear as very large values with unsigned longs. 
+- */ +- if (max_load <= busiest_load_per_task) +- goto out_balanced; ++ * OK, we don't have enough imbalance to justify moving tasks, ++ * however we may be able to increase total CPU power used by ++ * moving them. ++ */ ++ ++ pwr_now += sds->busiest->__cpu_power * ++ min(sds->busiest_load_per_task, sds->max_load); ++ pwr_now += sds->this->__cpu_power * ++ min(sds->this_load_per_task, sds->this_load); ++ pwr_now /= SCHED_LOAD_SCALE; ++ ++ /* Amount of load we'd subtract */ ++ tmp = sg_div_cpu_power(sds->busiest, ++ sds->busiest_load_per_task * SCHED_LOAD_SCALE); ++ if (sds->max_load > tmp) ++ pwr_move += sds->busiest->__cpu_power * ++ min(sds->busiest_load_per_task, sds->max_load - tmp); ++ ++ /* Amount of load we'd add */ ++ if (sds->max_load * sds->busiest->__cpu_power < ++ sds->busiest_load_per_task * SCHED_LOAD_SCALE) ++ tmp = sg_div_cpu_power(sds->this, ++ sds->max_load * sds->busiest->__cpu_power); ++ else ++ tmp = sg_div_cpu_power(sds->this, ++ sds->busiest_load_per_task * SCHED_LOAD_SCALE); ++ pwr_move += sds->this->__cpu_power * ++ min(sds->this_load_per_task, sds->this_load + tmp); ++ pwr_move /= SCHED_LOAD_SCALE; ++ ++ /* Move if we gain throughput */ ++ if (pwr_move > pwr_now) ++ *imbalance = sds->busiest_load_per_task; ++} + ++/** ++ * calculate_imbalance - Calculate the amount of imbalance present within the ++ * groups of a given sched_domain during load balance. ++ * @sds: statistics of the sched_domain whose imbalance is to be calculated. ++ * @this_cpu: Cpu for which currently load balance is being performed. ++ * @imbalance: The variable to store the imbalance. ++ */ ++static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, ++ unsigned long *imbalance) ++{ ++ unsigned long max_pull; + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) 
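++ *
++ * In that case *imbalance is reset to zero and fix_small_imbalance()
++ * decides whether moving a single busiest_load_per_task worth of
++ * load would still be a net throughput win.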
+ */ +- if (max_load < avg_load) { ++ if (sds->max_load < sds->avg_load) { + *imbalance = 0; +- goto small_imbalance; ++ return fix_small_imbalance(sds, this_cpu, imbalance); + } + + /* Don't want to pull so many tasks that a group would go idle */ +- max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); ++ max_pull = min(sds->max_load - sds->avg_load, ++ sds->max_load - sds->busiest_load_per_task); + + /* How much load to actually move to equalise the imbalance */ +- *imbalance = min(max_pull * busiest->__cpu_power, +- (avg_load - this_load) * this->__cpu_power) ++ *imbalance = min(max_pull * sds->busiest->__cpu_power, ++ (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + / SCHED_LOAD_SCALE; + + /* +@@ -3341,78 +3978,110 @@ group_next: + * a think about bumping its value to force at least one task to be + * moved + */ +- if (*imbalance < busiest_load_per_task) { +- unsigned long tmp, pwr_now, pwr_move; +- unsigned int imbn; +- +-small_imbalance: +- pwr_move = pwr_now = 0; +- imbn = 2; +- if (this_nr_running) { +- this_load_per_task /= this_nr_running; +- if (busiest_load_per_task > this_load_per_task) +- imbn = 1; +- } else +- this_load_per_task = cpu_avg_load_per_task(this_cpu); ++ if (*imbalance < sds->busiest_load_per_task) ++ return fix_small_imbalance(sds, this_cpu, imbalance); + +- if (max_load - this_load + busiest_load_per_task >= +- busiest_load_per_task * imbn) { +- *imbalance = busiest_load_per_task; +- return busiest; +- } ++} ++/******* find_busiest_group() helpers end here *********************/ + +- /* +- * OK, we don't have enough imbalance to justify moving tasks, +- * however we may be able to increase total CPU power used by +- * moving them. +- */ +- +- pwr_now += busiest->__cpu_power * +- min(busiest_load_per_task, max_load); +- pwr_now += this->__cpu_power * +- min(this_load_per_task, this_load); +- pwr_now /= SCHED_LOAD_SCALE; +- +- /* Amount of load we'd subtract */ +- tmp = sg_div_cpu_power(busiest, +- busiest_load_per_task * SCHED_LOAD_SCALE); +- if (max_load > tmp) +- pwr_move += busiest->__cpu_power * +- min(busiest_load_per_task, max_load - tmp); +- +- /* Amount of load we'd add */ +- if (max_load * busiest->__cpu_power < +- busiest_load_per_task * SCHED_LOAD_SCALE) +- tmp = sg_div_cpu_power(this, +- max_load * busiest->__cpu_power); +- else +- tmp = sg_div_cpu_power(this, +- busiest_load_per_task * SCHED_LOAD_SCALE); +- pwr_move += this->__cpu_power * +- min(this_load_per_task, this_load + tmp); +- pwr_move /= SCHED_LOAD_SCALE; +- +- /* Move if we gain throughput */ +- if (pwr_move > pwr_now) +- *imbalance = busiest_load_per_task; +- } ++/** ++ * find_busiest_group - Returns the busiest group within the sched_domain ++ * if there is an imbalance. If there isn't an imbalance, and ++ * the user has opted for power-savings, it returns a group whose ++ * CPUs can be put to idle by rebalancing those tasks elsewhere, if ++ * such a group exists. ++ * ++ * Also calculates the amount of weighted load which should be moved ++ * to restore balance. ++ * ++ * @sd: The sched_domain whose busiest group is to be returned. ++ * @this_cpu: The cpu for which load balancing is currently being performed. ++ * @imbalance: Variable which stores amount of weighted load which should ++ * be moved to restore balance/put a group to idle. ++ * @idle: The idle status of this_cpu. ++ * @sd_idle: The idleness of sd ++ * @cpus: The set of CPUs under consideration for load-balancing. 
++ * @balance: Pointer to a variable indicating if this_cpu ++ * is the appropriate cpu to perform load balancing at this_level. ++ * ++ * Returns: - the busiest group if imbalance exists. ++ * - If no imbalance and user has opted for power-savings balance, ++ * return the least loaded group whose CPUs can be ++ * put to idle by rebalancing its tasks onto our group. ++ */ ++static struct sched_group * ++find_busiest_group(struct sched_domain *sd, int this_cpu, ++ unsigned long *imbalance, enum cpu_idle_type idle, ++ int *sd_idle, const struct cpumask *cpus, int *balance) ++{ ++ struct sd_lb_stats sds; + +- return busiest; ++ memset(&sds, 0, sizeof(sds)); + +-out_balanced: +-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) ++ /* ++ * Compute the various statistics relavent for load balancing at ++ * this level. ++ */ ++ update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, ++ balance, &sds); ++ ++ /* Cases where imbalance does not exist from POV of this_cpu */ ++ /* 1) this_cpu is not the appropriate cpu to perform load balancing ++ * at this level. ++ * 2) There is no busy sibling group to pull from. ++ * 3) This group is the busiest group. ++ * 4) This group is more busy than the avg busieness at this ++ * sched_domain. ++ * 5) The imbalance is within the specified limit. ++ * 6) Any rebalance would lead to ping-pong ++ */ ++ if (balance && !(*balance)) + goto ret; + +- if (this == group_leader && group_leader != group_min) { +- *imbalance = min_load_per_task; +- if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { +- cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = +- cpumask_first(sched_group_cpus(group_leader)); +- } +- return group_min; +- } +-#endif ++ if (!sds.busiest || sds.busiest_nr_running == 0) ++ goto out_balanced; ++ ++ if (sds.this_load >= sds.max_load) ++ goto out_balanced; ++ ++ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; ++ ++ if (sds.this_load >= sds.avg_load) ++ goto out_balanced; ++ ++ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) ++ goto out_balanced; ++ ++ sds.busiest_load_per_task /= sds.busiest_nr_running; ++ if (sds.group_imb) ++ sds.busiest_load_per_task = ++ min(sds.busiest_load_per_task, sds.avg_load); ++ ++ /* ++ * We're trying to get all the cpus to the average_load, so we don't ++ * want to push ourselves above the average load, nor do we wish to ++ * reduce the max loaded cpu below the average load, as either of these ++ * actions would just result in more rebalancing later, and ping-pong ++ * tasks around. Thus we look for the minimum possible imbalance. ++ * Negative imbalances (*we* are more loaded than anyone else) will ++ * be counted as no imbalance for these purposes -- we can't fix that ++ * by pulling tasks to us. Be careful of negative numbers as they'll ++ * appear as very large values with unsigned longs. ++ */ ++ if (sds.max_load <= sds.busiest_load_per_task) ++ goto out_balanced; ++ ++ /* Looks like there is an imbalance. Compute it */ ++ calculate_imbalance(&sds, this_cpu, imbalance); ++ return sds.busiest; ++ ++out_balanced: ++ /* ++ * There is no obvious imbalance. But check if we can do some balancing ++ * to save power. 
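++ *
++ * check_power_save_busiest_group() only reports a group when the
++ * local group is the designated group_leader and that leader is not
++ * itself group_min; when it does, the group handed back is group_min,
++ * whose CPUs can then be idled.  Otherwise *imbalance stays zero and
++ * NULL is returned below.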
++ */ ++ if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) ++ return sds.busiest; + ret: + *imbalance = 0; + return NULL; +@@ -3456,19 +4125,23 @@ find_busiest_queue(struct sched_group *g + */ + #define MAX_PINNED_INTERVAL 512 + ++/* Working cpumask for load_balance and load_balance_newidle. */ ++static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); ++ + /* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + */ + static int load_balance(int this_cpu, struct rq *this_rq, + struct sched_domain *sd, enum cpu_idle_type idle, +- int *balance, struct cpumask *cpus) ++ int *balance) + { + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + struct sched_group *group; + unsigned long imbalance; + struct rq *busiest; + unsigned long flags; ++ struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + + cpumask_setall(cpus); + +@@ -3623,8 +4296,7 @@ out: + * this_rq is locked. + */ + static int +-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, +- struct cpumask *cpus) ++load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) + { + struct sched_group *group; + struct rq *busiest = NULL; +@@ -3632,6 +4304,7 @@ load_balance_newidle(int this_cpu, struc + int ld_moved = 0; + int sd_idle = 0; + int all_pinned = 0; ++ struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + + cpumask_setall(cpus); + +@@ -3772,10 +4445,6 @@ static void idle_balance(int this_cpu, s + struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + HZ; +- cpumask_var_t tmpmask; +- +- if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) +- return; + + for_each_domain(this_cpu, sd) { + unsigned long interval; +@@ -3786,7 +4455,7 @@ static void idle_balance(int this_cpu, s + if (sd->flags & SD_BALANCE_NEWIDLE) + /* If we've pulled tasks over stop searching: */ + pulled_task = load_balance_newidle(this_cpu, this_rq, +- sd, tmpmask); ++ sd); + + interval = msecs_to_jiffies(sd->balance_interval); + if (time_after(next_balance, sd->last_balance + interval)) +@@ -3801,7 +4470,6 @@ static void idle_balance(int this_cpu, s + */ + this_rq->next_balance = next_balance; + } +- free_cpumask_var(tmpmask); + } + + /* +@@ -3951,11 +4619,6 @@ static void rebalance_domains(int cpu, e + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; + int need_serialize; +- cpumask_var_t tmp; +- +- /* Fails alloc? Rebalancing probably not a priority right now. */ +- if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) +- return; + + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) +@@ -3980,7 +4643,7 @@ static void rebalance_domains(int cpu, e + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { +- if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { ++ if (load_balance(cpu, rq, sd, idle, &balance)) { + /* + * We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is +@@ -4014,8 +4677,6 @@ out: + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; +- +- free_cpumask_var(tmp); + } + + /* +@@ -4025,7 +4686,7 @@ out: + */ + static void run_rebalance_domains(struct softirq_action *h) + { +- int this_cpu = smp_processor_id(); ++ int this_cpu = raw_smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum cpu_idle_type idle = this_rq->idle_at_tick ? 
+ CPU_IDLE : CPU_NOT_IDLE; +@@ -4065,6 +4726,11 @@ static void run_rebalance_domains(struct + #endif + } + ++static inline int on_null_domain(int cpu) ++{ ++ return !rcu_dereference(cpu_rq(cpu)->sd); ++} ++ + /* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + * +@@ -4122,7 +4788,9 @@ static inline void trigger_load_balance( + cpumask_test_cpu(cpu, nohz.cpu_mask)) + return; + #endif +- if (time_after_eq(jiffies, rq->next_balance)) ++ /* Don't need to rebalance while attached to NULL domain */ ++ if (time_after_eq(jiffies, rq->next_balance) && ++ likely(!on_null_domain(cpu))) + raise_softirq(SCHED_SOFTIRQ); + } + +@@ -4161,6 +4829,29 @@ static u64 do_task_delta_exec(struct tas + return ns; + } + ++unsigned long long __task_delta_exec(struct task_struct *p, int update) ++{ ++ s64 delta_exec; ++ struct rq *rq; ++ ++ rq = task_rq(p); ++ WARN_ON_ONCE(!runqueue_is_locked()); ++ WARN_ON_ONCE(!task_current(rq, p)); ++ ++ if (update) ++ update_rq_clock(rq); ++ ++ delta_exec = rq->clock - p->se.exec_start; ++ ++ WARN_ON_ONCE(delta_exec < 0); ++ ++ return delta_exec; ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been banked in ++ * @p in case that task is currently running. ++ */ + unsigned long long task_delta_exec(struct task_struct *p) + { + unsigned long flags; +@@ -4235,7 +4926,9 @@ void account_user_time(struct task_struc + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (rt_task(p)) ++ cpustat->user_rt = cputime64_add(cpustat->user_rt, tmp); ++ else if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +@@ -4293,10 +4986,12 @@ void account_system_time(struct task_str + + /* Add system time to cpustat. 
*/ + tmp = cputime_to_cputime64(cputime); +- if (hardirq_count() - hardirq_offset) ++ if (hardirq_count() - hardirq_offset || (p->flags & PF_HARDIRQ)) + cpustat->irq = cputime64_add(cpustat->irq, tmp); +- else if (softirq_count()) ++ else if (softirq_count() || (p->flags & PF_SOFTIRQ)) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); ++ else if (rt_task(p)) ++ cpustat->system_rt = cputime64_add(cpustat->system_rt, tmp); + else + cpustat->system = cputime64_add(cpustat->system, tmp); + +@@ -4449,10 +5144,14 @@ void scheduler_tick(void) + + sched_clock_tick(); + ++ BUG_ON(!irqs_disabled()); ++ + spin_lock(&rq->lock); + update_rq_clock(rq); + update_cpu_load(rq); +- curr->sched_class->task_tick(rq, curr, 0); ++ if (curr != rq->idle && curr->se.on_rq) ++ curr->sched_class->task_tick(rq, curr, 0); ++ perf_counter_task_tick(curr, cpu); + spin_unlock(&rq->lock); + + #ifdef CONFIG_SMP +@@ -4461,10 +5160,7 @@ void scheduler_tick(void) + #endif + } + +-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +- defined(CONFIG_PREEMPT_TRACER)) +- +-static inline unsigned long get_parent_ip(unsigned long addr) ++unsigned long notrace get_parent_ip(unsigned long addr) + { + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; +@@ -4474,6 +5170,9 @@ static inline unsigned long get_parent_i + return addr; + } + ++#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++ + void __kprobes add_preempt_count(int val) + { + #ifdef CONFIG_DEBUG_PREEMPT +@@ -4527,8 +5226,8 @@ static noinline void __schedule_bug(stru + { + struct pt_regs *regs = get_irq_regs(); + +- printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", +- prev->comm, prev->pid, preempt_count()); ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d, CPU#%d\n", ++ prev->comm, preempt_count(), prev->pid, smp_processor_id()); + + debug_show_held_locks(prev); + print_modules(); +@@ -4546,12 +5245,14 @@ static noinline void __schedule_bug(stru + */ + static inline void schedule_debug(struct task_struct *prev) + { ++// WARN_ON(system_state == SYSTEM_BOOTING); ++ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ +- if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) ++ if (unlikely(in_atomic() && !prev->exit_state)) + __schedule_bug(prev); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); +@@ -4565,11 +5266,33 @@ static inline void schedule_debug(struct + #endif + } + ++static void put_prev_task(struct rq *rq, struct task_struct *prev) ++{ ++ if (prev->state == TASK_RUNNING) { ++ u64 runtime = prev->se.sum_exec_runtime; ++ ++ runtime -= prev->se.prev_sum_exec_runtime; ++ runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); ++ ++ /* ++ * In order to avoid avg_overlap growing stale when we are ++ * indeed overlapping and hence not getting put to sleep, grow ++ * the avg_overlap on preemption. ++ * ++ * We use the average preemption runtime because that ++ * correlates to the amount of cache footprint a task can ++ * build up. 
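++ *
++ * In effect (a sketch, assuming update_avg() keeps the 1/8-weight
++ * running average defined elsewhere in this file):
++ *
++ *    prev->se.avg_overlap +=
++ *            ((s64)(runtime - prev->se.avg_overlap)) >> 3;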
++ */ ++ update_avg(&prev->se.avg_overlap, runtime); ++ } ++ prev->sched_class->put_prev_task(rq, prev); ++} ++ + /* + * Pick up the highest-prio task: + */ + static inline struct task_struct * +-pick_next_task(struct rq *rq, struct task_struct *prev) ++pick_next_task(struct rq *rq) + { + const struct sched_class *class; + struct task_struct *p; +@@ -4600,15 +5323,13 @@ pick_next_task(struct rq *rq, struct tas + /* + * schedule() is the main scheduler function. + */ +-asmlinkage void __sched schedule(void) ++asmlinkage void __sched __schedule(void) + { + struct task_struct *prev, *next; + unsigned long *switch_count; + struct rq *rq; + int cpu; + +-need_resched: +- preempt_disable(); + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_qsctr_inc(cpu); +@@ -4616,10 +5337,11 @@ need_resched: + switch_count = &prev->nivcsw; + + release_kernel_lock(prev); +-need_resched_nonpreemptible: + + schedule_debug(prev); + ++ preempt_disable(); ++ + if (sched_feat(HRTICK)) + hrtick_clear(rq); + +@@ -4627,52 +5349,158 @@ need_resched_nonpreemptible: + update_rq_clock(rq); + clear_tsk_need_resched(prev); + +- if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { ++ if (!(prev->state & TASK_RUNNING_MUTEX) && prev->state && ++ !(preempt_count() & PREEMPT_ACTIVE)) { + if (unlikely(signal_pending_state(prev->state, prev))) + prev->state = TASK_RUNNING; +- else ++ else { ++ touch_softlockup_watchdog(); + deactivate_task(rq, prev, 1); ++ } + switch_count = &prev->nvcsw; + } + ++ if (preempt_count() & PREEMPT_ACTIVE) ++ sub_preempt_count(PREEMPT_ACTIVE); ++ + #ifdef CONFIG_SMP + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); + #endif + +- if (unlikely(!rq->nr_running)) +- idle_balance(cpu, rq); +- +- prev->sched_class->put_prev_task(rq, prev); +- next = pick_next_task(rq, prev); ++ if (unlikely(!rq->nr_running)) ++ idle_balance(cpu, rq); ++ ++ put_prev_task(rq, prev); ++ next = pick_next_task(rq); ++ ++ if (likely(prev != next)) { ++ sched_info_switch(prev, next); ++ perf_counter_task_sched_out(prev, cpu); ++ ++ rq->nr_switches++; ++ rq->curr = next; ++ ++*switch_count; ++ ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ /* ++ * the context switch might have flipped the stack from under ++ * us, hence refresh the local variables. ++ */ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ __preempt_enable_no_resched(); ++ } else { ++ __preempt_enable_no_resched(); ++ spin_unlock(&rq->lock); ++ } ++ ++ reacquire_kernel_lock(current); ++} ++ ++asmlinkage void __sched schedule(void) ++{ ++need_resched: ++ local_irq_disable(); ++ __schedule(); ++ local_irq_enable(); ++ ++ if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) ++ goto need_resched; ++} ++EXPORT_SYMBOL(schedule); ++ ++#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT) ++/* ++ * Look out! "owner" is an entirely speculative pointer ++ * access and not reliable. ++ */ ++int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) ++{ ++ unsigned int cpu; ++ struct rq *rq; ++ ++ if (!sched_feat(OWNER_SPIN)) ++ return 0; ++ ++#ifdef CONFIG_DEBUG_PAGEALLOC ++ /* ++ * Need to access the cpu field knowing that ++ * DEBUG_PAGEALLOC could have unmapped it if ++ * the mutex owner just released it and exited. ++ */ ++ if (probe_kernel_address(&owner->cpu, cpu)) ++ goto out; ++#else ++ cpu = owner->cpu; ++#endif ++ ++ /* ++ * Even if the access succeeded (likely case), ++ * the cpu field may no longer be valid. 
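++ *
++ * A stale value is tolerable: the range and cpu_online() checks below
++ * only have to keep us off a bogus runqueue, and the spin loop
++ * re-checks lock->owner on every iteration anyway.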
++ */ ++ if (cpu >= nr_cpumask_bits) ++ goto out; + +- if (likely(prev != next)) { +- sched_info_switch(prev, next); ++ /* ++ * We need to validate that we can do a ++ * get_cpu() and that we have the percpu area. ++ */ ++ if (!cpu_online(cpu)) ++ goto out; + +- rq->nr_switches++; +- rq->curr = next; +- ++*switch_count; ++ rq = cpu_rq(cpu); + +- context_switch(rq, prev, next); /* unlocks the rq */ ++ for (;;) { + /* +- * the context switch might have flipped the stack from under +- * us, hence refresh the local variables. ++ * Owner changed, break to re-assess state. + */ +- cpu = smp_processor_id(); +- rq = cpu_rq(cpu); +- } else +- spin_unlock_irq(&rq->lock); ++ if (lock->owner != owner) ++ break; + +- if (unlikely(reacquire_kernel_lock(current) < 0)) +- goto need_resched_nonpreemptible; ++ /* ++ * Is that owner really running on that cpu? ++ */ ++ if (task_thread_info(rq->curr) != owner || need_resched()) ++ return 0; + +- preempt_enable_no_resched(); +- if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) +- goto need_resched; ++ cpu_relax(); ++ } ++out: ++ return 1; + } +-EXPORT_SYMBOL(schedule); ++#endif + + #ifdef CONFIG_PREEMPT ++ ++/* ++ * Global flag to turn preemption off on a CONFIG_PREEMPT kernel: ++ */ ++int kernel_preemption = 1; ++ ++static int __init preempt_setup (char *str) ++{ ++ if (!strncmp(str, "off", 3)) { ++ if (kernel_preemption) { ++ printk(KERN_INFO "turning off kernel preemption!\n"); ++ kernel_preemption = 0; ++ } ++ return 1; ++ } ++ if (!strncmp(str, "on", 2)) { ++ if (!kernel_preemption) { ++ printk(KERN_INFO "turning on kernel preemption!\n"); ++ kernel_preemption = 1; ++ } ++ return 1; ++ } ++ get_option(&str, &kernel_preemption); ++ ++ return 1; ++} ++ ++__setup("preempt=", preempt_setup); ++ + /* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt +@@ -4681,7 +5509,11 @@ EXPORT_SYMBOL(schedule); + asmlinkage void __sched preempt_schedule(void) + { + struct thread_info *ti = current_thread_info(); ++ struct task_struct *task = current; ++ int saved_lock_depth; + ++ if (!kernel_preemption) ++ return; + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. +@@ -4690,45 +5522,71 @@ asmlinkage void __sched preempt_schedule + return; + + do { ++ local_irq_disable(); + add_preempt_count(PREEMPT_ACTIVE); +- schedule(); +- sub_preempt_count(PREEMPT_ACTIVE); ++ ++ /* ++ * We keep the big kernel semaphore locked, but we ++ * clear ->lock_depth so that schedule() doesnt ++ * auto-release the semaphore: ++ */ ++ saved_lock_depth = task->lock_depth; ++ task->lock_depth = -1; ++ __schedule(); ++ task->lock_depth = saved_lock_depth; ++ local_irq_enable(); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); +- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); ++ } while (need_resched()); + } + EXPORT_SYMBOL(preempt_schedule); + + /* +- * this is the entry point to schedule() from kernel preemption +- * off of irq context. +- * Note, that this is called and return with irqs disabled. This will +- * protect us against recursive calling from irq. ++ * this is is the entry point for the IRQ return path. Called with ++ * interrupts disabled. To avoid infinite irq-entry recursion problems ++ * with fast-paced IRQ sources we do all of this carefully to never ++ * enable interrupts again. 
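++ *
++ * Both the entry to and the exit from the loop below therefore run
++ * with interrupts disabled; __schedule() is entered with them off and
++ * they are switched off again immediately after it returns.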
+ */ + asmlinkage void __sched preempt_schedule_irq(void) + { + struct thread_info *ti = current_thread_info(); ++ struct task_struct *task = current; ++ int saved_lock_depth; + +- /* Catch callers which need to be fixed */ +- BUG_ON(ti->preempt_count || !irqs_disabled()); ++ if (!kernel_preemption) ++ return; ++ /* ++ * If there is a non-zero preempt_count then just return. ++ * (interrupts are disabled) ++ */ ++ if (unlikely(ti->preempt_count)) ++ return; + + do { ++ local_irq_disable(); + add_preempt_count(PREEMPT_ACTIVE); +- local_irq_enable(); +- schedule(); ++ ++ /* ++ * We keep the big kernel semaphore locked, but we ++ * clear ->lock_depth so that schedule() doesnt ++ * auto-release the semaphore: ++ */ ++ saved_lock_depth = task->lock_depth; ++ task->lock_depth = -1; ++ __schedule(); + local_irq_disable(); +- sub_preempt_count(PREEMPT_ACTIVE); ++ task->lock_depth = saved_lock_depth; + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); +- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); ++ } while (need_resched()); + } + + #endif /* CONFIG_PREEMPT */ +@@ -4736,7 +5594,7 @@ asmlinkage void __sched preempt_schedule + int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) + { +- return try_to_wake_up(curr->private, mode, sync); ++ return try_to_wake_up(curr->private, mode, sync, 0); + } + EXPORT_SYMBOL(default_wake_function); + +@@ -4776,7 +5634,7 @@ void __wake_up(wait_queue_head_t *q, uns + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); +- __wake_up_common(q, mode, nr_exclusive, 0, key); ++ __wake_up_common(q, mode, nr_exclusive, 1, key); + spin_unlock_irqrestore(&q->lock, flags); + } + EXPORT_SYMBOL(__wake_up); +@@ -4835,7 +5693,7 @@ void complete(struct completion *x) + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; +- __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); ++ __wake_up_common(&x->wait, TASK_NORMAL, 1, 1, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); + } + EXPORT_SYMBOL(complete); +@@ -4852,7 +5710,7 @@ void complete_all(struct completion *x) + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; +- __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); ++ __wake_up_common(&x->wait, TASK_NORMAL, 0, 1, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); + } + EXPORT_SYMBOL(complete_all); +@@ -5066,19 +5924,19 @@ long __sched sleep_on_timeout(wait_queue + } + EXPORT_SYMBOL(sleep_on_timeout); + +-#ifdef CONFIG_RT_MUTEXES +- + /* +- * rt_mutex_setprio - set the current priority of a task ++ * task_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * +- * Used by the rt_mutex code to implement priority inheritance logic. ++ * Used by the rt_mutex code to implement priority inheritance logic ++ * and by rcupreempt-boost to boost priorities of tasks sleeping ++ * with rcu locks. + */ +-void rt_mutex_setprio(struct task_struct *p, int prio) ++void task_setprio(struct task_struct *p, int prio) + { + unsigned long flags; + int oldprio, on_rq, running; +@@ -5088,6 +5946,25 @@ void rt_mutex_setprio(struct task_struct + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); ++ ++ /* ++ * Idle task boosting is a nono in general. 
There is one ++ * exception, when NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ + update_rq_clock(rq); + + oldprio = p->prio; +@@ -5105,6 +5982,8 @@ void rt_mutex_setprio(struct task_struct + + p->prio = prio; + ++ trace_sched_task_setprio(rq, p, oldprio); ++ + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) { +@@ -5112,11 +5991,11 @@ void rt_mutex_setprio(struct task_struct + + check_class_changed(rq, p, prev_class, oldprio, running); + } ++ ++out_unlock: + task_rq_unlock(rq, &flags); + } + +-#endif +- + void set_user_nice(struct task_struct *p, long nice) + { + int old_prio, delta, on_rq; +@@ -5202,7 +6081,7 @@ SYSCALL_DEFINE1(nice, int, increment) + if (increment > 40) + increment = 40; + +- nice = PRIO_TO_NICE(current->static_prio) + increment; ++ nice = TASK_NICE(current) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) +@@ -5751,19 +6630,53 @@ SYSCALL_DEFINE0(sched_yield) + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ +- __release(rq->lock); +- spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +- _raw_spin_unlock(&rq->lock); +- preempt_enable_no_resched(); ++ spin_unlock_no_resched(&rq->lock); + +- schedule(); ++ __schedule(); ++ ++ local_irq_enable(); ++ preempt_check_resched(); + + return 0; + } + ++#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) ++void __might_sleep(char *file, int line) ++{ ++#ifdef in_atomic ++ static unsigned long prev_jiffy; /* ratelimiting */ ++ ++ if ((!in_atomic() && !irqs_disabled()) || ++ system_state != SYSTEM_RUNNING || oops_in_progress) ++ return; ++ ++ if (debug_direct_keyboard && hardirq_count()) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ dump_stack(); ++#endif ++} ++EXPORT_SYMBOL(__might_sleep); ++#endif ++ + static void __cond_resched(void) + { +-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP ++#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) + __might_sleep(__FILE__, __LINE__); + #endif + /* +@@ -5772,10 +6685,11 @@ static void __cond_resched(void) + * cond_resched() call. + */ + do { ++ local_irq_disable(); + add_preempt_count(PREEMPT_ACTIVE); +- schedule(); +- sub_preempt_count(PREEMPT_ACTIVE); ++ __schedule(); + } while (need_resched()); ++ local_irq_enable(); + } + + int __sched _cond_resched(void) +@@ -5797,13 +6711,13 @@ EXPORT_SYMBOL(_cond_resched); + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). 
+ */ +-int cond_resched_lock(spinlock_t *lock) ++int __cond_resched_raw_spinlock(raw_spinlock_t *lock) + { + int resched = need_resched() && system_state == SYSTEM_RUNNING; + int ret = 0; + + if (spin_needbreak(lock) || resched) { +- spin_unlock(lock); ++ spin_unlock_no_resched(lock); + if (resched && need_resched()) + __cond_resched(); + else +@@ -5813,12 +6727,36 @@ int cond_resched_lock(spinlock_t *lock) + } + return ret; + } +-EXPORT_SYMBOL(cond_resched_lock); ++EXPORT_SYMBOL(__cond_resched_raw_spinlock); + +-int __sched cond_resched_softirq(void) ++#ifdef CONFIG_PREEMPT_RT ++ ++int __cond_resched_spinlock(spinlock_t *lock) + { +- BUG_ON(!in_softirq()); ++ int resched = need_resched() && system_state == SYSTEM_RUNNING; + ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock_no_resched(lock); ++ __cond_resched(); ++ spin_lock(lock); ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(__cond_resched_spinlock); ++ ++#endif ++ ++/* ++ * Voluntarily preempt a process context that has softirqs disabled: ++ */ ++int __sched cond_resched_softirq(void) ++{ ++#ifndef CONFIG_PREEMPT_SOFTIRQS ++ WARN_ON_ONCE(!in_softirq()); ++ if (!in_softirq()) ++ return 0; ++#endif + if (need_resched() && system_state == SYSTEM_RUNNING) { + local_bh_enable(); + __cond_resched(); +@@ -5829,17 +6767,102 @@ int __sched cond_resched_softirq(void) + } + EXPORT_SYMBOL(cond_resched_softirq); + ++/* ++ * Voluntarily preempt a softirq context (possible with softirq threading): ++ */ ++int __sched cond_resched_softirq_context(void) ++{ ++ WARN_ON_ONCE(!in_softirq()); ++ ++ if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { ++ raw_local_irq_disable(); ++ _local_bh_enable(); ++ raw_local_irq_enable(); ++ __cond_resched(); ++ local_bh_disable(); ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(cond_resched_softirq_context); ++ ++/* ++ * Preempt a hardirq context if necessary (possible with hardirq threading): ++ */ ++int cond_resched_hardirq_context(void) ++{ ++ WARN_ON_ONCE(!in_irq()); ++ WARN_ON_ONCE(!irqs_disabled()); ++ ++ if (hardirq_need_resched()) { ++#ifndef CONFIG_PREEMPT_RT ++ irq_exit(); ++#endif ++ local_irq_enable(); ++ __cond_resched(); ++#ifndef CONFIG_PREEMPT_RT ++ local_irq_disable(); ++ __irq_enter(); ++#endif ++ ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(cond_resched_hardirq_context); ++ ++#ifdef CONFIG_PREEMPT_VOLUNTARY ++ ++int voluntary_preemption = 1; ++ ++EXPORT_SYMBOL(voluntary_preemption); ++ ++static int __init voluntary_preempt_setup (char *str) ++{ ++ if (!strncmp(str, "off", 3)) ++ voluntary_preemption = 0; ++ else ++ get_option(&str, &voluntary_preemption); ++ if (!voluntary_preemption) ++ printk("turning off voluntary preemption!\n"); ++ ++ return 1; ++} ++ ++__setup("voluntary-preempt=", voluntary_preempt_setup); ++ ++#endif ++ + /** + * yield - yield the current processor to other threads. + * + * This is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +-void __sched yield(void) ++void __sched __yield(void) + { + set_current_state(TASK_RUNNING); + sys_sched_yield(); + } ++ ++void __sched yield(void) ++{ ++ static int once = 1; ++ ++ /* ++ * it's a bug to rely on yield() with RT priorities. We print ++ * the first occurance after bootup ... 
this will still give ++ * us an idea about the scope of the problem, without spamming ++ * the syslog: ++ */ ++ if (once && rt_task(current)) { ++ once = 0; ++ printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n", ++ current->comm, current->pid); ++ dump_stack(); ++ } ++ __yield(); ++} + EXPORT_SYMBOL(yield); + + /* +@@ -5987,26 +7010,26 @@ void sched_show_task(struct task_struct + unsigned state; + + state = p->state ? __ffs(p->state) + 1 : 0; +- printk(KERN_INFO "%-13.13s %c", p->comm, +- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); ++ printk("%-13.13s %c (%03lx) [%p]", p->comm, ++ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?', ++ (unsigned long) p->state, p); + #if BITS_PER_LONG == 32 +- if (state == TASK_RUNNING) ++ if (0 && (state == TASK_RUNNING)) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); + #else +- if (state == TASK_RUNNING) ++ if (0 && (state == TASK_RUNNING)) + printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); + #endif ++ if (task_curr(p)) ++ printk("[curr] "); ++ else if (p->se.on_rq) ++ printk("[on rq #%d] ", task_cpu(p)); + #ifdef CONFIG_DEBUG_STACK_USAGE +- { +- unsigned long *n = end_of_stack(p); +- while (!*n) +- n++; +- free = (unsigned long)n - (unsigned long)end_of_stack(p); +- } ++ free = stack_not_used(p); + #endif + printk(KERN_CONT "%5lu %5d %6d\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent)); +@@ -6017,6 +7040,7 @@ void sched_show_task(struct task_struct + void show_state_filter(unsigned long state_filter) + { + struct task_struct *g, *p; ++ int do_unlock = 1; + + #if BITS_PER_LONG == 32 + printk(KERN_INFO +@@ -6025,7 +7049,16 @@ void show_state_filter(unsigned long sta + printk(KERN_INFO + " task PC stack pid father\n"); + #endif ++#ifdef CONFIG_PREEMPT_RT ++ if (!read_trylock(&tasklist_lock)) { ++ printk("hm, tasklist_lock write-locked.\n"); ++ printk("ignoring ...\n"); ++ do_unlock = 0; ++ } ++#else + read_lock(&tasklist_lock); ++#endif ++ + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow +@@ -6041,7 +7074,8 @@ void show_state_filter(unsigned long sta + #ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); + #endif +- read_unlock(&tasklist_lock); ++ if (do_unlock) ++ read_unlock(&tasklist_lock); + /* + * Only show locks if all tasks are dumped: + */ +@@ -6077,17 +7111,14 @@ void __cpuinit init_idle(struct task_str + __set_task_cpu(idle, cpu); + + rq->curr = rq->idle = idle; +-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) ++#if defined(CONFIG_SMP) + idle->oncpu = 1; + #endif + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! 
*/ +-#if defined(CONFIG_PREEMPT) +- task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); +-#else + task_thread_info(idle)->preempt_count = 0; +-#endif ++ + /* + * The idle tasks have their own, simple scheduling class: + */ +@@ -6216,11 +7247,18 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) + { + struct rq *rq_dest, *rq_src; ++ unsigned long flags; + int ret = 0, on_rq; + + if (unlikely(!cpu_active(dest_cpu))) + return ret; + ++ /* ++ * PREEMPT_RT: this relies on write_lock_irq(&tasklist_lock) ++ * disabling interrupts - which on PREEMPT_RT does not do: ++ */ ++ local_irq_save(flags); ++ + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); + +@@ -6245,6 +7283,8 @@ done: + ret = 1; + fail: + double_rq_unlock(rq_src, rq_dest); ++ local_irq_restore(flags); ++ + return ret; + } + +@@ -6442,7 +7482,11 @@ void idle_task_exit(void) + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); ++#ifdef CONFIG_PREEMPT_RT ++ mmdrop_delayed(mm); ++#else + mmdrop(mm); ++#endif + } + + /* called under rq->lock with disabled interrupts */ +@@ -6480,7 +7524,7 @@ static void migrate_dead_tasks(unsigned + if (!rq->nr_running) + break; + update_rq_clock(rq); +- next = pick_next_task(rq, rq->curr); ++ next = pick_next_task(rq); + if (!next) + break; + next->sched_class->put_prev_task(rq, next); +@@ -6488,6 +7532,14 @@ static void migrate_dead_tasks(unsigned + + } + } ++ ++/* ++ * remove the tasks which were accounted by rq from calc_load_tasks. ++ */ ++static void calc_global_load_remove(struct rq *rq) ++{ ++ atomic_long_sub(rq->calc_load_active, &calc_load_tasks); ++} + #endif /* CONFIG_HOTPLUG_CPU */ + + #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +@@ -6722,6 +7774,8 @@ migration_call(struct notifier_block *nf + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); ++ rq->calc_load_update = calc_load_update; ++ rq->calc_load_active = 0; + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + +@@ -6761,7 +7815,7 @@ migration_call(struct notifier_block *nf + cpuset_unlock(); + migrate_nr_uninterruptible(rq); + BUG_ON(rq->nr_running != 0); +- ++ calc_global_load_remove(rq); + /* + * No need to migrate the tasks: it was best-effort if + * they didn't take sched_hotcpu_mutex. 
Just wake up +@@ -7311,7 +8365,7 @@ cpu_to_core_group(int cpu, const struct + { + int group; + +- cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); ++ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); + group = cpumask_first(mask); + if (sg) + *sg = &per_cpu(sched_group_core, group).sg; +@@ -7340,7 +8394,7 @@ cpu_to_phys_group(int cpu, const struct + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); + #elif defined(CONFIG_SCHED_SMT) +- cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); ++ cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); + group = cpumask_first(mask); + #else + group = cpu; +@@ -7683,7 +8737,7 @@ static int __build_sched_domains(const s + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), +- &per_cpu(cpu_sibling_map, i), cpu_map); ++ topology_thread_cpumask(i), cpu_map); + sd->parent = p; + p->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); +@@ -7694,7 +8748,7 @@ static int __build_sched_domains(const s + /* Set up CPU (sibling) groups */ + for_each_cpu(i, cpu_map) { + cpumask_and(this_sibling_map, +- &per_cpu(cpu_sibling_map, i), cpu_map); ++ topology_thread_cpumask(i), cpu_map); + if (i != cpumask_first(this_sibling_map)) + continue; + +@@ -8275,11 +9329,15 @@ static void init_rt_rq(struct rt_rq *rt_ + __set_bit(MAX_RT_PRIO, array->bitmap); + + #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +- rt_rq->highest_prio = MAX_RT_PRIO; ++ rt_rq->highest_prio.curr = MAX_RT_PRIO; ++#ifdef CONFIG_SMP ++ rt_rq->highest_prio.next = MAX_RT_PRIO; ++#endif + #endif + #ifdef CONFIG_SMP + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; ++ plist_head_init(&rq->rt.pushable_tasks, &rq->lock); + #endif + + rt_rq->rt_time = 0; +@@ -8366,6 +9424,9 @@ void __init sched_init(void) + #ifdef CONFIG_USER_SCHED + alloc_size *= 2; + #endif ++#ifdef CONFIG_CPUMASK_OFFSTACK ++ alloc_size += num_possible_cpus() * cpumask_size(); ++#endif + /* + * As sched_init() is called before page_alloc is setup, + * we use alloc_bootmem(). +@@ -8403,6 +9464,12 @@ void __init sched_init(void) + ptr += nr_cpu_ids * sizeof(void **); + #endif /* CONFIG_USER_SCHED */ + #endif /* CONFIG_RT_GROUP_SCHED */ ++#ifdef CONFIG_CPUMASK_OFFSTACK ++ for_each_possible_cpu(i) { ++ per_cpu(load_balance_tmpmask, i) = (void *)ptr; ++ ptr += cpumask_size(); ++ } ++#endif /* CONFIG_CPUMASK_OFFSTACK */ + } + + #ifdef CONFIG_SMP +@@ -8438,6 +9505,8 @@ void __init sched_init(void) + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + rq->nr_running = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; + init_cfs_rq(&rq->cfs, rq); + init_rt_rq(&rq->rt, rq); + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -8538,6 +9607,9 @@ void __init sched_init(void) + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + ++#ifdef CONFIG_PREEMPT_RT ++ printk("Real-Time Preemption Support (C) 2004-2007 Ingo Molnar\n"); ++#endif + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, +@@ -8545,6 +9617,9 @@ void __init sched_init(void) + * when this runqueue becomes "idle". 
+ */ + init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ + /* + * During early bootup we pretend to be a normal task: + */ +@@ -8562,36 +9637,6 @@ void __init sched_init(void) + scheduler_running = 1; + } + +-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +-void __might_sleep(char *file, int line) +-{ +-#ifdef in_atomic +- static unsigned long prev_jiffy; /* ratelimiting */ +- +- if ((!in_atomic() && !irqs_disabled()) || +- system_state != SYSTEM_RUNNING || oops_in_progress) +- return; +- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) +- return; +- prev_jiffy = jiffies; +- +- printk(KERN_ERR +- "BUG: sleeping function called from invalid context at %s:%d\n", +- file, line); +- printk(KERN_ERR +- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", +- in_atomic(), irqs_disabled(), +- current->pid, current->comm); +- +- debug_show_held_locks(current); +- if (irqs_disabled()) +- print_irqtrace_events(current); +- dump_stack(); +-#endif +-} +-EXPORT_SYMBOL(__might_sleep); +-#endif +- + #ifdef CONFIG_MAGIC_SYSRQ + static void normalize_task(struct rq *rq, struct task_struct *p) + { +@@ -9547,7 +10592,7 @@ cpuacct_destroy(struct cgroup_subsys *ss + + static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) + { +- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); ++ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + + #ifndef CONFIG_64BIT +@@ -9566,7 +10611,7 @@ static u64 cpuacct_cpuusage_read(struct + + static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) + { +- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); ++ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + + #ifndef CONFIG_64BIT + /* +@@ -9655,16 +10700,20 @@ static void cpuacct_charge(struct task_s + struct cpuacct *ca; + int cpu; + +- if (!cpuacct_subsys.active) ++ if (unlikely(!cpuacct_subsys.active)) + return; + + cpu = task_cpu(tsk); ++ ++ rcu_read_lock(); + ca = task_ca(tsk); + +- for (; ca; ca = ca->parent) { +- u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); ++ do { ++ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; +- } ++ ca = ca->parent; ++ } while (ca); ++ rcu_read_unlock(); + } + + struct cgroup_subsys cpuacct_subsys = { +Index: linux-2.6-tip/kernel/sched_clock.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sched_clock.c ++++ linux-2.6-tip/kernel/sched_clock.c +@@ -24,11 +24,12 @@ + * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat + * consistent between cpus (never more than 2 jiffies difference). + */ +-#include +-#include + #include +-#include ++#include + #include ++#include ++#include ++#include + + /* + * Scheduler clock - returns current time in nanosec units. +@@ -43,6 +44,7 @@ unsigned long long __attribute__((weak)) + static __read_mostly int sched_clock_running; + + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK ++__read_mostly int sched_clock_stable; + + struct sched_clock_data { + /* +@@ -50,7 +52,7 @@ struct sched_clock_data { + * from within instrumentation code so we dont want to do any + * instrumentation ourselves. 
+ */ +- raw_spinlock_t lock; ++ __raw_spinlock_t lock; + + u64 tick_raw; + u64 tick_gtod; +@@ -77,7 +79,7 @@ void sched_clock_init(void) + for_each_possible_cpu(cpu) { + struct sched_clock_data *scd = cpu_sdc(cpu); + +- scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; ++ scd->lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + scd->tick_raw = 0; + scd->tick_gtod = ktime_now; + scd->clock = ktime_now; +@@ -87,7 +89,7 @@ void sched_clock_init(void) + } + + /* +- * min,max except they take wrapping into account ++ * min, max except they take wrapping into account + */ + + static inline u64 wrap_min(u64 x, u64 y) +@@ -111,15 +113,13 @@ static u64 __update_sched_clock(struct s + s64 delta = now - scd->tick_raw; + u64 clock, min_clock, max_clock; + +- WARN_ON_ONCE(!irqs_disabled()); +- + if (unlikely(delta < 0)) + delta = 0; + + /* + * scd->clock = clamp(scd->tick_gtod + delta, +- * max(scd->tick_gtod, scd->clock), +- * scd->tick_gtod + TICK_NSEC); ++ * max(scd->tick_gtod, scd->clock), ++ * scd->tick_gtod + TICK_NSEC); + */ + + clock = scd->tick_gtod + delta; +@@ -148,8 +148,20 @@ static void lock_double_clock(struct sch + + u64 sched_clock_cpu(int cpu) + { +- struct sched_clock_data *scd = cpu_sdc(cpu); + u64 now, clock, this_clock, remote_clock; ++ struct sched_clock_data *scd; ++ ++ if (sched_clock_stable) ++ return sched_clock(); ++ ++ scd = cpu_sdc(cpu); ++ ++ /* ++ * Normally this is not called in NMI context - but if it is, ++ * trying to do any locking here is totally lethal. ++ */ ++ if (unlikely(in_nmi())) ++ return scd->clock; + + if (unlikely(!sched_clock_running)) + return 0ull; +@@ -195,14 +207,18 @@ u64 sched_clock_cpu(int cpu) + + void sched_clock_tick(void) + { +- struct sched_clock_data *scd = this_scd(); ++ struct sched_clock_data *scd; + u64 now, now_gtod; + ++ if (sched_clock_stable) ++ return; ++ + if (unlikely(!sched_clock_running)) + return; + + WARN_ON_ONCE(!irqs_disabled()); + ++ scd = this_scd(); + now_gtod = ktime_to_ns(ktime_get()); + now = sched_clock(); + +@@ -250,7 +266,7 @@ u64 sched_clock_cpu(int cpu) + return sched_clock(); + } + +-#endif ++#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + + unsigned long long cpu_clock(int cpu) + { +Index: linux-2.6-tip/kernel/sched_debug.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sched_debug.c ++++ linux-2.6-tip/kernel/sched_debug.c +@@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m + P(nr_switches); + P(nr_load_updates); + P(nr_uninterruptible); +- SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); + PN(next_balance); + P(curr->pid); + PN(clock); +@@ -281,15 +280,25 @@ static void print_cpu(struct seq_file *m + P(cpu_load[2]); + P(cpu_load[3]); + P(cpu_load[4]); ++#ifdef CONFIG_PREEMPT_RT ++ /* Print rt related rq stats */ ++ P(rt.rt_nr_running); ++ P(rt.rt_nr_uninterruptible); ++# ifdef CONFIG_SCHEDSTATS ++ P(rto_schedule); ++ P(rto_schedule_tail); ++ P(rto_wakeup); ++ P(rto_pulled); ++ P(rto_pushed); ++# endif ++#endif ++ + #undef P + #undef PN + + #ifdef CONFIG_SCHEDSTATS + #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + +- P(yld_exp_empty); +- P(yld_act_empty); +- P(yld_both_empty); + P(yld_count); + + P(sched_switch); +@@ -314,7 +323,7 @@ static int sched_debug_show(struct seq_f + u64 now = ktime_to_ns(ktime_get()); + int cpu; + +- SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", ++ SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + 
init_utsname()->version); +@@ -325,6 +334,7 @@ static int sched_debug_show(struct seq_f + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) + #define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) ++ P(jiffies); + PN(sysctl_sched_latency); + PN(sysctl_sched_min_granularity); + PN(sysctl_sched_wakeup_granularity); +@@ -397,6 +407,7 @@ void proc_sched_show_task(struct task_st + PN(se.vruntime); + PN(se.sum_exec_runtime); + PN(se.avg_overlap); ++ PN(se.avg_wakeup); + + nr_switches = p->nvcsw + p->nivcsw; + +Index: linux-2.6-tip/kernel/sched_fair.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sched_fair.c ++++ linux-2.6-tip/kernel/sched_fair.c +@@ -1314,16 +1314,63 @@ out: + } + #endif /* CONFIG_SMP */ + +-static unsigned long wakeup_gran(struct sched_entity *se) ++/* ++ * Adaptive granularity ++ * ++ * se->avg_wakeup gives the average time a task runs until it does a wakeup, ++ * with the limit of wakeup_gran -- when it never does a wakeup. ++ * ++ * So the smaller avg_wakeup is the faster we want this task to preempt, ++ * but we don't want to treat the preemptee unfairly and therefore allow it ++ * to run for at least the amount of time we'd like to run. ++ * ++ * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one ++ * ++ * NOTE: we use *nr_running to scale with load, this nicely matches the ++ * degrading latency on load. ++ */ ++static unsigned long ++adaptive_gran(struct sched_entity *curr, struct sched_entity *se) ++{ ++ u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; ++ u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running; ++ u64 gran = 0; ++ ++ if (this_run < expected_wakeup) ++ gran = expected_wakeup - this_run; ++ ++ return min_t(s64, gran, sysctl_sched_wakeup_granularity); ++} ++ ++static unsigned long ++wakeup_gran(struct sched_entity *curr, struct sched_entity *se) + { + unsigned long gran = sysctl_sched_wakeup_granularity; + ++ if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN)) ++ gran = adaptive_gran(curr, se); ++ + /* +- * More easily preempt - nice tasks, while not making it harder for +- * + nice tasks. ++ * Since its curr running now, convert the gran from real-time ++ * to virtual-time in his units. + */ +- if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD) +- gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); ++ if (sched_feat(ASYM_GRAN)) { ++ /* ++ * By using 'se' instead of 'curr' we penalize light tasks, so ++ * they get preempted easier. That is, if 'se' < 'curr' then ++ * the resulting gran will be larger, therefore penalizing the ++ * lighter, if otoh 'se' > 'curr' then the resulting gran will ++ * be smaller, again penalizing the lighter task. ++ * ++ * This is especially important for buddies when the leftmost ++ * task is higher priority than the buddy. 
++ */ ++ if (unlikely(se->load.weight != NICE_0_LOAD)) ++ gran = calc_delta_fair(gran, se); ++ } else { ++ if (unlikely(curr->load.weight != NICE_0_LOAD)) ++ gran = calc_delta_fair(gran, curr); ++ } + + return gran; + } +@@ -1350,7 +1397,7 @@ wakeup_preempt_entity(struct sched_entit + if (vdiff <= 0) + return -1; + +- gran = wakeup_gran(curr); ++ gran = wakeup_gran(curr, se); + if (vdiff > gran) + return 1; + +Index: linux-2.6-tip/kernel/sched_features.h +=================================================================== +--- linux-2.6-tip.orig/kernel/sched_features.h ++++ linux-2.6-tip/kernel/sched_features.h +@@ -1,5 +1,6 @@ + SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +-SCHED_FEAT(NORMALIZED_SLEEPER, 1) ++SCHED_FEAT(NORMALIZED_SLEEPER, 0) ++SCHED_FEAT(ADAPTIVE_GRAN, 1) + SCHED_FEAT(WAKEUP_PREEMPT, 1) + SCHED_FEAT(START_DEBIT, 1) + SCHED_FEAT(AFFINE_WAKEUPS, 1) +@@ -13,3 +14,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) + SCHED_FEAT(ASYM_EFF_LOAD, 1) + SCHED_FEAT(WAKEUP_OVERLAP, 0) + SCHED_FEAT(LAST_BUDDY, 1) ++SCHED_FEAT(OWNER_SPIN, 1) +Index: linux-2.6-tip/kernel/sched_idletask.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sched_idletask.c ++++ linux-2.6-tip/kernel/sched_idletask.c +@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(stru + static struct task_struct *pick_next_task_idle(struct rq *rq) + { + schedstat_inc(rq, sched_goidle); +- ++ /* adjust the active tasks as we might go into a long sleep */ ++ calc_load_account_active(rq); + return rq->idle; + } + +Index: linux-2.6-tip/kernel/sched_rt.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sched_rt.c ++++ linux-2.6-tip/kernel/sched_rt.c +@@ -3,6 +3,44 @@ + * policies) + */ + ++static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) ++{ ++ return container_of(rt_se, struct task_struct, rt); ++} ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ ++#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) ++ ++static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) ++{ ++ return rt_rq->rq; ++} ++ ++static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) ++{ ++ return rt_se->rt_rq; ++} ++ ++#else /* CONFIG_RT_GROUP_SCHED */ ++ ++#define rt_entity_is_task(rt_se) (1) ++ ++static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) ++{ ++ return container_of(rt_rq, struct rq, rt); ++} ++ ++static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) ++{ ++ struct task_struct *p = rt_task_of(rt_se); ++ struct rq *rq = task_rq(p); ++ ++ return &rq->rt; ++} ++ ++#endif /* CONFIG_RT_GROUP_SCHED */ ++ + #ifdef CONFIG_SMP + + static inline int rt_overloaded(struct rq *rq) +@@ -37,25 +75,79 @@ static inline void rt_clear_overload(str + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); + } + +-static void update_rt_migration(struct rq *rq) ++static void update_rt_migration(struct rt_rq *rt_rq) + { +- if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { +- if (!rq->rt.overloaded) { +- rt_set_overload(rq); +- rq->rt.overloaded = 1; ++ if (rt_rq->rt_nr_migratory > 1) { ++ if (!rt_rq->overloaded) { ++ rt_set_overload(rq_of_rt_rq(rt_rq)); ++ rt_rq->overloaded = 1; + } +- } else if (rq->rt.overloaded) { +- rt_clear_overload(rq); +- rq->rt.overloaded = 0; ++ } else if (rt_rq->overloaded) { ++ rt_clear_overload(rq_of_rt_rq(rt_rq)); ++ rt_rq->overloaded = 0; + } + } +-#endif /* CONFIG_SMP */ + +-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) ++static void inc_rt_migration(struct sched_rt_entity *rt_se, 
struct rt_rq *rt_rq) + { +- return container_of(rt_se, struct task_struct, rt); ++ if (!rt_entity_is_task(rt_se)) ++ return; ++ ++ rt_rq = &rq_of_rt_rq(rt_rq)->rt; ++ ++ if (rt_se->nr_cpus_allowed > 1) ++ rt_rq->rt_nr_migratory++; ++ ++ update_rt_migration(rt_rq); ++} ++ ++static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++{ ++ if (!rt_entity_is_task(rt_se)) ++ return; ++ ++ rt_rq = &rq_of_rt_rq(rt_rq)->rt; ++ ++ if (rt_se->nr_cpus_allowed > 1) ++ rt_rq->rt_nr_migratory--; ++ ++ update_rt_migration(rt_rq); ++} ++ ++static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) ++{ ++ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); ++ plist_node_init(&p->pushable_tasks, p->prio); ++ plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); ++} ++ ++static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) ++{ ++ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + } + ++#else ++ ++static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++static inline ++void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++{ ++} ++ ++static inline ++void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++{ ++} ++ ++#endif /* CONFIG_SMP */ ++ + static inline int on_rt_rq(struct sched_rt_entity *rt_se) + { + return !list_empty(&rt_se->run_list); +@@ -79,16 +171,6 @@ static inline u64 sched_rt_period(struct + #define for_each_leaf_rt_rq(rt_rq, rq) \ + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + +-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +-{ +- return rt_rq->rq; +-} +- +-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +-{ +- return rt_se->rt_rq; +-} +- + #define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = rt_se->parent) + +@@ -108,7 +190,7 @@ static void sched_rt_rq_enqueue(struct r + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se); +- if (rt_rq->highest_prio < curr->prio) ++ if (rt_rq->highest_prio.curr < curr->prio) + resched_task(curr); + } + } +@@ -176,19 +258,6 @@ static inline u64 sched_rt_period(struct + #define for_each_leaf_rt_rq(rt_rq, rq) \ + for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +-{ +- return container_of(rt_rq, struct rq, rt); +-} +- +-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +-{ +- struct task_struct *p = rt_task_of(rt_se); +- struct rq *rq = task_rq(p); +- +- return &rq->rt; +-} +- + #define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +@@ -473,7 +542,7 @@ static inline int rt_se_prio(struct sche + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) +- return rt_rq->highest_prio; ++ return rt_rq->highest_prio.curr; + #endif + + return rt_task_of(rt_se)->prio; +@@ -547,91 +616,174 @@ static void update_curr_rt(struct rq *rq + } + } + +-static inline +-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++#if defined CONFIG_SMP ++ ++static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); ++ ++static inline int next_prio(struct rq *rq) + { +- WARN_ON(!rt_prio(rt_se_prio(rt_se))); +- rt_rq->rt_nr_running++; +-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +- if (rt_se_prio(rt_se) < rt_rq->highest_prio) { +-#ifdef CONFIG_SMP +- struct rq *rq = rq_of_rt_rq(rt_rq); +-#endif ++ struct 
task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); ++ ++ if (next && rt_prio(next->prio)) ++ return next->prio; ++ else ++ return MAX_RT_PRIO; ++} ++ ++static void ++inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) ++{ ++ struct rq *rq = rq_of_rt_rq(rt_rq); ++ ++ if (prio < prev_prio) { ++ ++ /* ++ * If the new task is higher in priority than anything on the ++ * run-queue, we know that the previous high becomes our ++ * next-highest. ++ */ ++ rt_rq->highest_prio.next = prev_prio; + +- rt_rq->highest_prio = rt_se_prio(rt_se); +-#ifdef CONFIG_SMP + if (rq->online) +- cpupri_set(&rq->rd->cpupri, rq->cpu, +- rt_se_prio(rt_se)); +-#endif +- } +-#endif +-#ifdef CONFIG_SMP +- if (rt_se->nr_cpus_allowed > 1) { +- struct rq *rq = rq_of_rt_rq(rt_rq); ++ cpupri_set(&rq->rd->cpupri, rq->cpu, prio); + +- rq->rt.rt_nr_migratory++; +- } ++ } else if (prio == rt_rq->highest_prio.curr) ++ /* ++ * If the next task is equal in priority to the highest on ++ * the run-queue, then we implicitly know that the next highest ++ * task cannot be any lower than current ++ */ ++ rt_rq->highest_prio.next = prio; ++ else if (prio < rt_rq->highest_prio.next) ++ /* ++ * Otherwise, we need to recompute next-highest ++ */ ++ rt_rq->highest_prio.next = next_prio(rq); ++} + +- update_rt_migration(rq_of_rt_rq(rt_rq)); +-#endif +-#ifdef CONFIG_RT_GROUP_SCHED +- if (rt_se_boosted(rt_se)) +- rt_rq->rt_nr_boosted++; ++static void ++dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) ++{ ++ struct rq *rq = rq_of_rt_rq(rt_rq); + +- if (rt_rq->tg) +- start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +-#else +- start_rt_bandwidth(&def_rt_bandwidth); +-#endif ++ if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) ++ rt_rq->highest_prio.next = next_prio(rq); ++ ++ if (rq->online && rt_rq->highest_prio.curr != prev_prio) ++ cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); + } + ++#else /* CONFIG_SMP */ ++ + static inline +-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +-{ +-#ifdef CONFIG_SMP +- int highest_prio = rt_rq->highest_prio; +-#endif ++void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} ++static inline ++void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} ++ ++#endif /* CONFIG_SMP */ + +- WARN_ON(!rt_prio(rt_se_prio(rt_se))); +- WARN_ON(!rt_rq->rt_nr_running); +- rt_rq->rt_nr_running--; + #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED ++static void ++inc_rt_prio(struct rt_rq *rt_rq, int prio) ++{ ++ int prev_prio = rt_rq->highest_prio.curr; ++ ++ if (prio < prev_prio) ++ rt_rq->highest_prio.curr = prio; ++ ++ inc_rt_prio_smp(rt_rq, prio, prev_prio); ++} ++ ++static void ++dec_rt_prio(struct rt_rq *rt_rq, int prio) ++{ ++ int prev_prio = rt_rq->highest_prio.curr; ++ + if (rt_rq->rt_nr_running) { +- struct rt_prio_array *array; + +- WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); +- if (rt_se_prio(rt_se) == rt_rq->highest_prio) { +- /* recalculate */ +- array = &rt_rq->active; +- rt_rq->highest_prio = ++ WARN_ON(prio < prev_prio); ++ ++ /* ++ * This may have been our highest task, and therefore ++ * we may have some recomputation to do ++ */ ++ if (prio == prev_prio) { ++ struct rt_prio_array *array = &rt_rq->active; ++ ++ rt_rq->highest_prio.curr = + sched_find_first_bit(array->bitmap); +- } /* otherwise leave rq->highest prio alone */ ++ } ++ + } else +- rt_rq->highest_prio = MAX_RT_PRIO; +-#endif +-#ifdef CONFIG_SMP +- if (rt_se->nr_cpus_allowed > 1) { +- struct rq *rq = rq_of_rt_rq(rt_rq); +- 
rq->rt.rt_nr_migratory--; +- } ++ rt_rq->highest_prio.curr = MAX_RT_PRIO; + +- if (rt_rq->highest_prio != highest_prio) { +- struct rq *rq = rq_of_rt_rq(rt_rq); ++ dec_rt_prio_smp(rt_rq, prio, prev_prio); ++} + +- if (rq->online) +- cpupri_set(&rq->rd->cpupri, rq->cpu, +- rt_rq->highest_prio); +- } ++#else ++ ++static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} ++static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} ++ ++#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ + +- update_rt_migration(rq_of_rt_rq(rt_rq)); +-#endif /* CONFIG_SMP */ + #ifdef CONFIG_RT_GROUP_SCHED ++ ++static void ++inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++{ ++ if (rt_se_boosted(rt_se)) ++ rt_rq->rt_nr_boosted++; ++ ++ if (rt_rq->tg) ++ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); ++} ++ ++static void ++dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++{ + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted--; + + WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); +-#endif ++} ++ ++#else /* CONFIG_RT_GROUP_SCHED */ ++ ++static void ++inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++{ ++ start_rt_bandwidth(&def_rt_bandwidth); ++} ++ ++static inline ++void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} ++ ++#endif /* CONFIG_RT_GROUP_SCHED */ ++ ++static inline ++void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++{ ++ int prio = rt_se_prio(rt_se); ++ ++ WARN_ON(!rt_prio(prio)); ++ rt_rq->rt_nr_running++; ++ ++ inc_rt_prio(rt_rq, prio); ++ inc_rt_migration(rt_se, rt_rq); ++ inc_rt_group(rt_se, rt_rq); ++} ++ ++static inline ++void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) ++{ ++ WARN_ON(!rt_prio(rt_se_prio(rt_se))); ++ WARN_ON(!rt_rq->rt_nr_running); ++ rt_rq->rt_nr_running--; ++ ++ dec_rt_prio(rt_rq, rt_se_prio(rt_se)); ++ dec_rt_migration(rt_se, rt_rq); ++ dec_rt_group(rt_se, rt_rq); + } + + static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) +@@ -706,6 +858,55 @@ static void dequeue_rt_entity(struct sch + } + } + ++static inline void incr_rt_nr_uninterruptible(struct task_struct *p, ++ struct rq *rq) ++{ ++ rq->rt.rt_nr_uninterruptible++; ++} ++ ++static inline void decr_rt_nr_uninterruptible(struct task_struct *p, ++ struct rq *rq) ++{ ++ rq->rt.rt_nr_uninterruptible--; ++} ++ ++unsigned long rt_nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->rt.rt_nr_running; ++ ++ return sum; ++} ++ ++unsigned long rt_nr_running_cpu(int cpu) ++{ ++ return cpu_rq(cpu)->rt.rt_nr_running; ++} ++ ++unsigned long rt_nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->rt.rt_nr_uninterruptible; ++ ++ /* ++ * Since we read the counters lockless, it might be slightly ++ * inaccurate. 
Do not allow it to go below zero though: ++ */ ++ if (unlikely((long)sum < 0)) ++ sum = 0; ++ ++ return sum; ++} ++ ++unsigned long rt_nr_uninterruptible_cpu(int cpu) ++{ ++ return cpu_rq(cpu)->rt.rt_nr_uninterruptible; ++} ++ + /* + * Adding/removing a task to/from a priority array: + */ +@@ -718,6 +919,12 @@ static void enqueue_task_rt(struct rq *r + + enqueue_rt_entity(rt_se); + ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ decr_rt_nr_uninterruptible(p, rq); ++ ++ if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) ++ enqueue_pushable_task(rq, p); ++ + inc_cpu_load(rq, p->se.load.weight); + } + +@@ -726,8 +933,14 @@ static void dequeue_task_rt(struct rq *r + struct sched_rt_entity *rt_se = &p->rt; + + update_curr_rt(rq); ++ ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ incr_rt_nr_uninterruptible(p, rq); ++ + dequeue_rt_entity(rt_se); + ++ dequeue_pushable_task(rq, p); ++ + dec_cpu_load(rq, p->se.load.weight); + } + +@@ -878,7 +1091,7 @@ static struct sched_rt_entity *pick_next + return next; + } + +-static struct task_struct *pick_next_task_rt(struct rq *rq) ++static struct task_struct *_pick_next_task_rt(struct rq *rq) + { + struct sched_rt_entity *rt_se; + struct task_struct *p; +@@ -900,6 +1113,18 @@ static struct task_struct *pick_next_tas + + p = rt_task_of(rt_se); + p->se.exec_start = rq->clock; ++ ++ return p; ++} ++ ++static struct task_struct *pick_next_task_rt(struct rq *rq) ++{ ++ struct task_struct *p = _pick_next_task_rt(rq); ++ ++ /* The running task is never eligible for pushing */ ++ if (p) ++ dequeue_pushable_task(rq, p); ++ + return p; + } + +@@ -907,6 +1132,13 @@ static void put_prev_task_rt(struct rq * + { + update_curr_rt(rq); + p->se.exec_start = 0; ++ ++ /* ++ * The previous task needs to be made eligible for pushing ++ * if it is still active ++ */ ++ if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) ++ enqueue_pushable_task(rq, p); + } + + #ifdef CONFIG_SMP +@@ -960,12 +1192,13 @@ static struct task_struct *pick_next_hig + + static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); + +-static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) ++static inline int pick_optimal_cpu(int this_cpu, ++ const struct cpumask *mask) + { + int first; + + /* "this_cpu" is cheaper to preempt than a remote processor */ +- if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) ++ if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) + return this_cpu; + + first = cpumask_first(mask); +@@ -981,6 +1214,7 @@ static int find_lowest_rq(struct task_st + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + int this_cpu = smp_processor_id(); + int cpu = task_cpu(task); ++ cpumask_var_t domain_mask; + + if (task->rt.nr_cpus_allowed == 1) + return -1; /* No other targets possible */ +@@ -1013,19 +1247,25 @@ static int find_lowest_rq(struct task_st + if (this_cpu == cpu) + this_cpu = -1; /* Skip this_cpu opt if the same */ + +- for_each_domain(cpu, sd) { +- if (sd->flags & SD_WAKE_AFFINE) { +- cpumask_t domain_mask; +- int best_cpu; +- +- cpumask_and(&domain_mask, sched_domain_span(sd), +- lowest_mask); +- +- best_cpu = pick_optimal_cpu(this_cpu, +- &domain_mask); +- if (best_cpu != -1) +- return best_cpu; ++ if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { ++ for_each_domain(cpu, sd) { ++ if (sd->flags & SD_WAKE_AFFINE) { ++ int best_cpu; ++ ++ cpumask_and(domain_mask, ++ sched_domain_span(sd), ++ lowest_mask); ++ ++ best_cpu = pick_optimal_cpu(this_cpu, ++ domain_mask); ++ ++ if (best_cpu != -1) { ++ free_cpumask_var(domain_mask); ++ return best_cpu; ++ } ++ } + } ++ 
free_cpumask_var(domain_mask); + } + + /* +@@ -1072,7 +1312,7 @@ static struct rq *find_lock_lowest_rq(st + } + + /* If this rq is still suitable use it. */ +- if (lowest_rq->rt.highest_prio > task->prio) ++ if (lowest_rq->rt.highest_prio.curr > task->prio) + break; + + /* try again */ +@@ -1083,6 +1323,31 @@ static struct rq *find_lock_lowest_rq(st + return lowest_rq; + } + ++static inline int has_pushable_tasks(struct rq *rq) ++{ ++ return !plist_head_empty(&rq->rt.pushable_tasks); ++} ++ ++static struct task_struct *pick_next_pushable_task(struct rq *rq) ++{ ++ struct task_struct *p; ++ ++ if (!has_pushable_tasks(rq)) ++ return NULL; ++ ++ p = plist_first_entry(&rq->rt.pushable_tasks, ++ struct task_struct, pushable_tasks); ++ ++ BUG_ON(rq->cpu != task_cpu(p)); ++ BUG_ON(task_current(rq, p)); ++ BUG_ON(p->rt.nr_cpus_allowed <= 1); ++ ++ BUG_ON(!p->se.on_rq); ++ BUG_ON(!rt_task(p)); ++ ++ return p; ++} ++ + /* + * If the current CPU has more than one RT task, see if the non + * running task can migrate over to a CPU that is running a task +@@ -1092,13 +1357,11 @@ static int push_rt_task(struct rq *rq) + { + struct task_struct *next_task; + struct rq *lowest_rq; +- int ret = 0; +- int paranoid = RT_MAX_TRIES; + + if (!rq->rt.overloaded) + return 0; + +- next_task = pick_next_highest_task_rt(rq, -1); ++ next_task = pick_next_pushable_task(rq); + if (!next_task) + return 0; + +@@ -1127,16 +1390,34 @@ static int push_rt_task(struct rq *rq) + struct task_struct *task; + /* + * find lock_lowest_rq releases rq->lock +- * so it is possible that next_task has changed. +- * If it has, then try again. ++ * so it is possible that next_task has migrated. ++ * ++ * We need to make sure that the task is still on the same ++ * run-queue and is also still the next task eligible for ++ * pushing. + */ +- task = pick_next_highest_task_rt(rq, -1); +- if (unlikely(task != next_task) && task && paranoid--) { +- put_task_struct(next_task); +- next_task = task; +- goto retry; ++ task = pick_next_pushable_task(rq); ++ if (task_cpu(next_task) == rq->cpu && task == next_task) { ++ /* ++ * If we get here, the task hasnt moved at all, but ++ * it has failed to push. We will not try again, ++ * since the other cpus will pull from us when they ++ * are ready. ++ */ ++ dequeue_pushable_task(rq, next_task); ++ goto out; + } +- goto out; ++ ++ if (!task) ++ /* No more tasks, just exit */ ++ goto out; ++ ++ /* ++ * Something has shifted, try again. ++ */ ++ put_task_struct(next_task); ++ next_task = task; ++ goto retry; + } + + deactivate_task(rq, next_task, 0); +@@ -1147,23 +1428,12 @@ static int push_rt_task(struct rq *rq) + + double_unlock_balance(rq, lowest_rq); + +- ret = 1; + out: + put_task_struct(next_task); + +- return ret; ++ return 1; + } + +-/* +- * TODO: Currently we just use the second highest prio task on +- * the queue, and stop when it can't migrate (or there's +- * no more RT tasks). There may be a case where a lower +- * priority RT task has a different affinity than the +- * higher RT task. In this case the lower RT task could +- * possibly be able to migrate where as the higher priority +- * RT task could not. We currently ignore this issue. +- * Enhancements are welcome! 
+- */ + static void push_rt_tasks(struct rq *rq) + { + /* push_rt_task will return true if it moved an RT */ +@@ -1174,33 +1444,35 @@ static void push_rt_tasks(struct rq *rq) + static int pull_rt_task(struct rq *this_rq) + { + int this_cpu = this_rq->cpu, ret = 0, cpu; +- struct task_struct *p, *next; ++ struct task_struct *p; + struct rq *src_rq; + + if (likely(!rt_overloaded(this_rq))) + return 0; + +- next = pick_next_task_rt(this_rq); +- + for_each_cpu(cpu, this_rq->rd->rto_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); ++ ++ /* ++ * Don't bother taking the src_rq->lock if the next highest ++ * task is known to be lower-priority than our current task. ++ * This may look racy, but if this value is about to go ++ * logically higher, the src_rq will push this task away. ++ * And if its going logically lower, we do not care ++ */ ++ if (src_rq->rt.highest_prio.next >= ++ this_rq->rt.highest_prio.curr) ++ continue; ++ + /* + * We can potentially drop this_rq's lock in + * double_lock_balance, and another CPU could +- * steal our next task - hence we must cause +- * the caller to recalculate the next task +- * in that case: ++ * alter this_rq + */ +- if (double_lock_balance(this_rq, src_rq)) { +- struct task_struct *old_next = next; +- +- next = pick_next_task_rt(this_rq); +- if (next != old_next) +- ret = 1; +- } ++ double_lock_balance(this_rq, src_rq); + + /* + * Are there still pullable RT tasks? +@@ -1214,7 +1486,7 @@ static int pull_rt_task(struct rq *this_ + * Do we have an RT task that preempts + * the to-be-scheduled task? + */ +- if (p && (!next || (p->prio < next->prio))) { ++ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->se.on_rq); + +@@ -1224,12 +1496,9 @@ static int pull_rt_task(struct rq *this_ + * This is just that p is wakeing up and hasn't + * had a chance to schedule. We only pull + * p if it is lower in priority than the +- * current task on the run queue or +- * this_rq next task is lower in prio than +- * the current task on that rq. ++ * current task on the run queue + */ +- if (p->prio < src_rq->curr->prio || +- (next && next->prio < src_rq->curr->prio)) ++ if (p->prio < src_rq->curr->prio) + goto skip; + + ret = 1; +@@ -1242,13 +1511,7 @@ static int pull_rt_task(struct rq *this_ + * case there's an even higher prio task + * in another runqueue. (low likelyhood + * but possible) +- * +- * Update next so that we won't pick a task +- * on another cpu with a priority lower (or equal) +- * than the one we just picked. + */ +- next = p; +- + } + skip: + double_unlock_balance(this_rq, src_rq); +@@ -1260,24 +1523,29 @@ static int pull_rt_task(struct rq *this_ + static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) + { + /* Try to pull RT tasks here if we lower this rq's prio */ +- if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) ++ if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) { + pull_rt_task(rq); ++ schedstat_inc(rq, rto_schedule); ++ } ++} ++ ++/* ++ * assumes rq->lock is held ++ */ ++static int needs_post_schedule_rt(struct rq *rq) ++{ ++ return has_pushable_tasks(rq); + } + + static void post_schedule_rt(struct rq *rq) + { + /* +- * If we have more than one rt_task queued, then +- * see if we can push the other rt_tasks off to other CPUS. +- * Note we may release the rq lock, and since +- * the lock was owned by prev, we need to release it +- * first via finish_lock_switch and then reaquire it here. 
++ * This is only called if needs_post_schedule_rt() indicates that ++ * we need to push tasks away + */ +- if (unlikely(rq->rt.overloaded)) { +- spin_lock_irq(&rq->lock); +- push_rt_tasks(rq); +- spin_unlock_irq(&rq->lock); +- } ++ spin_lock_irq(&rq->lock); ++ push_rt_tasks(rq); ++ spin_unlock_irq(&rq->lock); + } + + /* +@@ -1288,7 +1556,8 @@ static void task_wake_up_rt(struct rq *r + { + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && +- rq->rt.overloaded) ++ has_pushable_tasks(rq) && ++ p->rt.nr_cpus_allowed > 1) + push_rt_tasks(rq); + } + +@@ -1324,6 +1593,23 @@ static void set_cpus_allowed_rt(struct t + if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { + struct rq *rq = task_rq(p); + ++ if (!task_current(rq, p)) { ++ /* ++ * Make sure we dequeue this task from the pushable list ++ * before going further. It will either remain off of ++ * the list because we are no longer pushable, or it ++ * will be requeued. ++ */ ++ if (p->rt.nr_cpus_allowed > 1) ++ dequeue_pushable_task(rq, p); ++ ++ /* ++ * Requeue if our weight is changing and still > 1 ++ */ ++ if (weight > 1) ++ enqueue_pushable_task(rq, p); ++ } ++ + if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { + rq->rt.rt_nr_migratory++; + } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { +@@ -1331,7 +1617,7 @@ static void set_cpus_allowed_rt(struct t + rq->rt.rt_nr_migratory--; + } + +- update_rt_migration(rq); ++ update_rt_migration(&rq->rt); + } + + cpumask_copy(&p->cpus_allowed, new_mask); +@@ -1346,7 +1632,7 @@ static void rq_online_rt(struct rq *rq) + + __enable_runtime(rq); + +- cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); ++ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); + } + + /* Assumes rq->lock is held */ +@@ -1438,7 +1724,7 @@ static void prio_changed_rt(struct rq *r + * can release the rq lock and p could migrate. + * Only reschedule if p is still on the same runqueue. 
+ */ +- if (p->prio > rq->rt.highest_prio && rq->curr == p) ++ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) + resched_task(p); + #else + /* For UP simply resched on drop of prio */ +@@ -1509,6 +1795,9 @@ static void set_curr_task_rt(struct rq * + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock; ++ ++ /* The running task is never eligible for pushing */ ++ dequeue_pushable_task(rq, p); + } + + static const struct sched_class rt_sched_class = { +@@ -1531,6 +1820,7 @@ static const struct sched_class rt_sched + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, + .pre_schedule = pre_schedule_rt, ++ .needs_post_schedule = needs_post_schedule_rt, + .post_schedule = post_schedule_rt, + .task_wake_up = task_wake_up_rt, + .switched_from = switched_from_rt, +Index: linux-2.6-tip/kernel/sched_stats.h +=================================================================== +--- linux-2.6-tip.orig/kernel/sched_stats.h ++++ linux-2.6-tip/kernel/sched_stats.h +@@ -4,7 +4,7 @@ + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +-#define SCHEDSTAT_VERSION 14 ++#define SCHEDSTAT_VERSION 15 + + static int show_schedstat(struct seq_file *seq, void *v) + { +@@ -26,9 +26,8 @@ static int show_schedstat(struct seq_fil + + /* runqueue-specific stats */ + seq_printf(seq, +- "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu", +- cpu, rq->yld_both_empty, +- rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, ++ "cpu%d %u %u %u %u %u %u %llu %llu %lu", ++ cpu, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, + rq->rq_cpu_time, +Index: linux-2.6-tip/kernel/signal.c +=================================================================== +--- linux-2.6-tip.orig/kernel/signal.c ++++ linux-2.6-tip/kernel/signal.c +@@ -179,13 +179,46 @@ int next_signal(struct sigpending *pendi + return sig; + } + ++#ifdef __HAVE_ARCH_CMPXCHG ++static inline struct sigqueue *get_task_cache(struct task_struct *t) ++{ ++ struct sigqueue *q = t->sigqueue_cache; ++ ++ if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) ++ return NULL; ++ ++ return q; ++} ++ ++static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) ++{ ++ if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) ++ return 0; ++ ++ return 1; ++} ++ ++#else ++ ++static inline struct sigqueue *get_task_cache(struct task_struct *t) ++{ ++ return NULL; ++} ++ ++static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) ++{ ++ return 1; ++} ++ ++#endif ++ + /* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appopriate lock must be held to stop the target task from exiting + */ +-static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, +- int override_rlimit) ++static struct sigqueue *__sigqueue_do_alloc(struct task_struct *t, gfp_t flags, ++ int override_rlimit, int fromslab) + { + struct sigqueue *q = NULL; + struct user_struct *user; +@@ -200,8 +233,14 @@ static struct sigqueue *__sigqueue_alloc + atomic_inc(&user->sigpending); + if (override_rlimit || + atomic_read(&user->sigpending) <= +- t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) +- q = kmem_cache_alloc(sigqueue_cachep, flags); ++ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { ++ ++ if (!fromslab) ++ q = get_task_cache(t); ++ if (!q) ++ q = kmem_cache_alloc(sigqueue_cachep, flags); ++ } ++ + if (unlikely(q == NULL)) { + 
atomic_dec(&user->sigpending); + free_uid(user); +@@ -214,6 +253,12 @@ static struct sigqueue *__sigqueue_alloc + return q; + } + ++static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, ++ int override_rlimit) ++{ ++ return __sigqueue_do_alloc(t, flags, override_rlimit, 0); ++} ++ + static void __sigqueue_free(struct sigqueue *q) + { + if (q->flags & SIGQUEUE_PREALLOC) +@@ -223,6 +268,21 @@ static void __sigqueue_free(struct sigqu + kmem_cache_free(sigqueue_cachep, q); + } + ++static void sigqueue_free_current(struct sigqueue *q) ++{ ++ struct user_struct *up; ++ ++ if (q->flags & SIGQUEUE_PREALLOC) ++ return; ++ ++ up = q->user; ++ if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { ++ atomic_dec(&up->sigpending); ++ free_uid(up); ++ } else ++ __sigqueue_free(q); ++} ++ + void flush_sigqueue(struct sigpending *queue) + { + struct sigqueue *q; +@@ -236,6 +296,21 @@ void flush_sigqueue(struct sigpending *q + } + + /* ++ * Called from __exit_signal. Flush tsk->pending and ++ * tsk->sigqueue_cache ++ */ ++void flush_task_sigqueue(struct task_struct *tsk) ++{ ++ struct sigqueue *q; ++ ++ flush_sigqueue(&tsk->pending); ++ ++ q = get_task_cache(tsk); ++ if (q) ++ kmem_cache_free(sigqueue_cachep, q); ++} ++ ++/* + * Flush all pending signals for a task. + */ + void flush_signals(struct task_struct *t) +@@ -378,7 +453,7 @@ static void collect_signal(int sig, stru + still_pending: + list_del_init(&first->list); + copy_siginfo(info, &first->info); +- __sigqueue_free(first); ++ sigqueue_free_current(first); + } else { + /* Ok, it wasn't in the queue. This must be + a fast-pathed signal or we must have been +@@ -423,6 +498,8 @@ int dequeue_signal(struct task_struct *t + { + int signr; + ++ WARN_ON_ONCE(tsk != current); ++ + /* We only dequeue private signals from ourselves, we don't let + * signalfd steal them + */ +@@ -505,6 +582,9 @@ void signal_wake_up(struct task_struct * + + set_tsk_thread_flag(t, TIF_SIGPENDING); + ++ if (unlikely(t == current)) ++ return; ++ + /* + * For SIGKILL, we want to wake it up in the stopped/traced/killable + * case. We don't check t->state here because there is a race with it +@@ -821,7 +901,9 @@ static int send_signal(int sig, struct s + + trace_sched_signal_send(sig, t); + ++#ifdef CONFIG_SMP + assert_spin_locked(&t->sighand->siglock); ++#endif + if (!prepare_signal(sig, t)) + return 0; + +@@ -1276,7 +1358,8 @@ struct sigqueue *sigqueue_alloc(void) + { + struct sigqueue *q; + +- if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) ++ /* Preallocated sigqueue objects always from the slabcache ! */ ++ if ((q = __sigqueue_do_alloc(current, GFP_KERNEL, 0, 1))) + q->flags |= SIGQUEUE_PREALLOC; + return(q); + } +@@ -1575,15 +1658,9 @@ static void ptrace_stop(int exit_code, i + read_lock(&tasklist_lock); + if (may_ptrace_stop()) { + do_notify_parent_cldstop(current, CLD_TRAPPED); +- /* +- * Don't want to allow preemption here, because +- * sys_ptrace() needs this task to be inactive. +- * +- * XXX: implement read_unlock_no_resched(). 
+- */ +- preempt_disable(); ++ ++ current->flags &= ~PF_NOSCHED; + read_unlock(&tasklist_lock); +- preempt_enable_no_resched(); + schedule(); + } else { + /* +@@ -1652,6 +1729,7 @@ finish_stop(int stop_count) + } + + do { ++ current->flags &= ~PF_NOSCHED; + schedule(); + } while (try_to_freeze()); + /* +@@ -2243,24 +2321,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, s + return kill_something_info(sig, &info, pid); + } + +-static int do_tkill(pid_t tgid, pid_t pid, int sig) ++static int ++do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) + { +- int error; +- struct siginfo info; + struct task_struct *p; + unsigned long flags; +- +- error = -ESRCH; +- info.si_signo = sig; +- info.si_errno = 0; +- info.si_code = SI_TKILL; +- info.si_pid = task_tgid_vnr(current); +- info.si_uid = current_uid(); ++ int error = -ESRCH; + + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { +- error = check_kill_permission(sig, &info, p); ++ error = check_kill_permission(sig, info, p); + /* + * The null signal is a permissions and process existence + * probe. No signal is actually delivered. +@@ -2270,7 +2341,7 @@ static int do_tkill(pid_t tgid, pid_t pi + * signal is private anyway. + */ + if (!error && sig && lock_task_sighand(p, &flags)) { +- error = specific_send_sig_info(sig, &info, p); ++ error = specific_send_sig_info(sig, info, p); + unlock_task_sighand(p, &flags); + } + } +@@ -2279,6 +2350,19 @@ static int do_tkill(pid_t tgid, pid_t pi + return error; + } + ++static int do_tkill(pid_t tgid, pid_t pid, int sig) ++{ ++ struct siginfo info; ++ ++ info.si_signo = sig; ++ info.si_errno = 0; ++ info.si_code = SI_TKILL; ++ info.si_pid = task_tgid_vnr(current); ++ info.si_uid = current_uid(); ++ ++ return do_send_specific(tgid, pid, sig, &info); ++} ++ + /** + * sys_tgkill - send signal to one specific thread + * @tgid: the thread group ID of the thread +@@ -2328,6 +2412,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, + return kill_proc_info(sig, &info, pid); + } + ++long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) ++{ ++ /* This is only valid for single tasks */ ++ if (pid <= 0 || tgid <= 0) ++ return -EINVAL; ++ ++ /* Not even root can pretend to send signals from the kernel. ++ Nor can they impersonate a kill(), which adds source info. 
*/ ++ if (info->si_code >= 0) ++ return -EPERM; ++ info->si_signo = sig; ++ ++ return do_send_specific(tgid, pid, sig, info); ++} ++ ++SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, ++ siginfo_t __user *, uinfo) ++{ ++ siginfo_t info; ++ ++ if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) ++ return -EFAULT; ++ ++ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); ++} ++ + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) + { + struct task_struct *t = current; +Index: linux-2.6-tip/kernel/smp.c +=================================================================== +--- linux-2.6-tip.orig/kernel/smp.c ++++ linux-2.6-tip/kernel/smp.c +@@ -2,40 +2,82 @@ + * Generic helpers for smp ipi calls + * + * (C) Jens Axboe 2008 +- * + */ +-#include +-#include +-#include + #include + #include ++#include ++#include ++#include ++#include + #include ++#include + + static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); +-static LIST_HEAD(call_function_queue); +-__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); ++ ++static struct { ++ struct list_head queue; ++ raw_spinlock_t lock; ++} call_function __cacheline_aligned_in_smp = ++ { ++ .queue = LIST_HEAD_INIT(call_function.queue), ++ .lock = RAW_SPIN_LOCK_UNLOCKED(call_function.lock), ++ }; + + enum { +- CSD_FLAG_WAIT = 0x01, +- CSD_FLAG_ALLOC = 0x02, +- CSD_FLAG_LOCK = 0x04, ++ CSD_FLAG_LOCK = 0x01, + }; + + struct call_function_data { +- struct call_single_data csd; +- spinlock_t lock; +- unsigned int refs; +- struct rcu_head rcu_head; +- unsigned long cpumask_bits[]; ++ struct call_single_data csd; ++ raw_spinlock_t lock; ++ unsigned int refs; ++ cpumask_var_t cpumask; + }; + + struct call_single_queue { +- struct list_head list; +- spinlock_t lock; ++ struct list_head list; ++ raw_spinlock_t lock; ++}; ++ ++static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { ++ .lock = RAW_SPIN_LOCK_UNLOCKED(cfd_data.lock), ++}; ++ ++static int ++hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) ++{ ++ long cpu = (long)hcpu; ++ struct call_function_data *cfd = &per_cpu(cfd_data, cpu); ++ ++ switch (action) { ++ case CPU_UP_PREPARE: ++ case CPU_UP_PREPARE_FROZEN: ++ if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, ++ cpu_to_node(cpu))) ++ return NOTIFY_BAD; ++ break; ++ ++#ifdef CONFIG_CPU_HOTPLUG ++ case CPU_UP_CANCELED: ++ case CPU_UP_CANCELED_FROZEN: ++ ++ case CPU_DEAD: ++ case CPU_DEAD_FROZEN: ++ free_cpumask_var(cfd->cpumask); ++ break; ++#endif ++ }; ++ ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { ++ .notifier_call = hotplug_cfd, + }; + + static int __cpuinit init_call_single_data(void) + { ++ void *cpu = (void *)(long)smp_processor_id(); + int i; + + for_each_possible_cpu(i) { +@@ -44,29 +86,63 @@ static int __cpuinit init_call_single_da + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->list); + } ++ ++ hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); ++ register_cpu_notifier(&hotplug_cfd_notifier); ++ + return 0; + } + early_initcall(init_call_single_data); + +-static void csd_flag_wait(struct call_single_data *data) ++/* ++ * csd_lock/csd_unlock used to serialize access to per-cpu csd resources ++ * ++ * For non-synchronous ipi calls the csd can still be in use by the ++ * previous function call. For multi-cpu calls its even more interesting ++ * as we'll have to ensure no other cpu is observing our csd. 
++ */ ++static void csd_lock_wait(struct call_single_data *data) + { +- /* Wait for response */ +- do { +- if (!(data->flags & CSD_FLAG_WAIT)) +- break; ++ while (data->flags & CSD_FLAG_LOCK) + cpu_relax(); +- } while (1); ++} ++ ++static void csd_lock(struct call_single_data *data) ++{ ++ csd_lock_wait(data); ++ data->flags = CSD_FLAG_LOCK; ++ ++ /* ++ * prevent CPU from reordering the above assignment ++ * to ->flags with any subsequent assignments to other ++ * fields of the specified call_single_data structure: ++ */ ++ smp_mb(); ++} ++ ++static void csd_unlock(struct call_single_data *data) ++{ ++ WARN_ON(!(data->flags & CSD_FLAG_LOCK)); ++ ++ /* ++ * ensure we're all done before releasing data: ++ */ ++ smp_mb(); ++ ++ data->flags &= ~CSD_FLAG_LOCK; + } + + /* +- * Insert a previously allocated call_single_data element for execution +- * on the given CPU. data must already have ->func, ->info, and ->flags set. ++ * Insert a previously allocated call_single_data element ++ * for execution on the given CPU. data must already have ++ * ->func, ->info, and ->flags set. + */ +-static void generic_exec_single(int cpu, struct call_single_data *data) ++static ++void generic_exec_single(int cpu, struct call_single_data *data, int wait) + { + struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); +- int wait = data->flags & CSD_FLAG_WAIT, ipi; + unsigned long flags; ++ int ipi; + + spin_lock_irqsave(&dst->lock, flags); + ipi = list_empty(&dst->list); +@@ -74,24 +150,21 @@ static void generic_exec_single(int cpu, + spin_unlock_irqrestore(&dst->lock, flags); + + /* +- * Make the list addition visible before sending the ipi. ++ * The list addition should be visible before sending the IPI ++ * handler locks the list to pull the entry off it because of ++ * normal cache coherency rules implied by spinlocks. ++ * ++ * If IPIs can go out of order to the cache coherency protocol ++ * in an architecture, sufficient synchronisation should be added ++ * to arch code to make it appear to obey cache coherency WRT ++ * locking and barrier primitives. Generic code isn't really ++ * equipped to do the right thing... + */ +- smp_mb(); +- + if (ipi) + arch_send_call_function_single_ipi(cpu); + + if (wait) +- csd_flag_wait(data); +-} +- +-static void rcu_free_call_data(struct rcu_head *head) +-{ +- struct call_function_data *data; +- +- data = container_of(head, struct call_function_data, rcu_head); +- +- kfree(data); ++ csd_lock_wait(data); + } + + /* +@@ -104,99 +177,83 @@ void generic_smp_call_function_interrupt + int cpu = get_cpu(); + + /* +- * It's ok to use list_for_each_rcu() here even though we may delete +- * 'pos', since list_del_rcu() doesn't clear ->next ++ * Ensure entry is visible on call_function_queue after we have ++ * entered the IPI. See comment in smp_call_function_many. ++ * If we don't have this, then we may miss an entry on the list ++ * and never get another IPI to process it. 
++ */ ++ smp_mb(); ++ ++ /* ++ * It's ok to use list_for_each_rcu() here even though we may ++ * delete 'pos', since list_del_rcu() doesn't clear ->next + */ +- rcu_read_lock(); +- list_for_each_entry_rcu(data, &call_function_queue, csd.list) { ++ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { + int refs; + +- if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) ++ spin_lock(&data->lock); ++ if (!cpumask_test_cpu(cpu, data->cpumask)) { ++ spin_unlock(&data->lock); + continue; ++ } ++ cpumask_clear_cpu(cpu, data->cpumask); ++ spin_unlock(&data->lock); + + data->csd.func(data->csd.info); + + spin_lock(&data->lock); +- cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits)); + WARN_ON(data->refs == 0); +- data->refs--; +- refs = data->refs; ++ refs = --data->refs; ++ if (!refs) { ++ spin_lock(&call_function.lock); ++ list_del_rcu(&data->csd.list); ++ spin_unlock(&call_function.lock); ++ } + spin_unlock(&data->lock); + + if (refs) + continue; + +- spin_lock(&call_function_lock); +- list_del_rcu(&data->csd.list); +- spin_unlock(&call_function_lock); +- +- if (data->csd.flags & CSD_FLAG_WAIT) { +- /* +- * serialize stores to data with the flag clear +- * and wakeup +- */ +- smp_wmb(); +- data->csd.flags &= ~CSD_FLAG_WAIT; +- } +- if (data->csd.flags & CSD_FLAG_ALLOC) +- call_rcu(&data->rcu_head, rcu_free_call_data); ++ csd_unlock(&data->csd); + } +- rcu_read_unlock(); + + put_cpu(); + } + + /* +- * Invoked by arch to handle an IPI for call function single. Must be called +- * from the arch with interrupts disabled. ++ * Invoked by arch to handle an IPI for call function single. Must be ++ * called from the arch with interrupts disabled. + */ + void generic_smp_call_function_single_interrupt(void) + { + struct call_single_queue *q = &__get_cpu_var(call_single_queue); ++ unsigned int data_flags; + LIST_HEAD(list); + +- /* +- * Need to see other stores to list head for checking whether +- * list is empty without holding q->lock +- */ +- smp_read_barrier_depends(); +- while (!list_empty(&q->list)) { +- unsigned int data_flags; +- +- spin_lock(&q->lock); +- list_replace_init(&q->list, &list); +- spin_unlock(&q->lock); +- +- while (!list_empty(&list)) { +- struct call_single_data *data; +- +- data = list_entry(list.next, struct call_single_data, +- list); +- list_del(&data->list); +- +- /* +- * 'data' can be invalid after this call if +- * flags == 0 (when called through +- * generic_exec_single(), so save them away before +- * making the call. 
+- */ +- data_flags = data->flags; +- +- data->func(data->info); +- +- if (data_flags & CSD_FLAG_WAIT) { +- smp_wmb(); +- data->flags &= ~CSD_FLAG_WAIT; +- } else if (data_flags & CSD_FLAG_LOCK) { +- smp_wmb(); +- data->flags &= ~CSD_FLAG_LOCK; +- } else if (data_flags & CSD_FLAG_ALLOC) +- kfree(data); +- } ++ spin_lock(&q->lock); ++ list_replace_init(&q->list, &list); ++ spin_unlock(&q->lock); ++ ++ while (!list_empty(&list)) { ++ struct call_single_data *data; ++ ++ data = list_entry(list.next, struct call_single_data, list); ++ list_del(&data->list); ++ + /* +- * See comment on outer loop ++ * 'data' can be invalid after this call if flags == 0 ++ * (when called through generic_exec_single()), ++ * so save them away before making the call: + */ +- smp_read_barrier_depends(); ++ data_flags = data->flags; ++ ++ data->func(data->info); ++ ++ /* ++ * Unlocked CSDs are valid through generic_exec_single(): ++ */ ++ if (data_flags & CSD_FLAG_LOCK) ++ csd_unlock(data); + } + } + +@@ -215,65 +272,45 @@ static DEFINE_PER_CPU(struct call_single + int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) + { +- struct call_single_data d; ++ struct call_single_data d = { ++ .flags = 0, ++ }; + unsigned long flags; +- /* prevent preemption and reschedule on another processor, +- as well as CPU removal */ +- int me = get_cpu(); ++ int this_cpu; + int err = 0; + ++ /* ++ * prevent preemption and reschedule on another processor, ++ * as well as CPU removal ++ */ ++ this_cpu = get_cpu(); ++ + /* Can deadlock when called with interrupts disabled */ +- WARN_ON(irqs_disabled()); ++ WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + +- if (cpu == me) { ++ if (cpu == this_cpu) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); +- } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { +- struct call_single_data *data; ++ } else { ++ if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { ++ struct call_single_data *data = &d; ++ ++ if (!wait) ++ data = &__get_cpu_var(csd_data); ++ ++ csd_lock(data); + +- if (!wait) { +- /* +- * We are calling a function on a single CPU +- * and we are not going to wait for it to finish. +- * We first try to allocate the data, but if we +- * fail, we fall back to use a per cpu data to pass +- * the information to that CPU. Since all callers +- * of this code will use the same data, we must +- * synchronize the callers to prevent a new caller +- * from corrupting the data before the callee +- * can access it. +- * +- * The CSD_FLAG_LOCK is used to let us know when +- * the IPI handler is done with the data. +- * The first caller will set it, and the callee +- * will clear it. The next caller must wait for +- * it to clear before we set it again. This +- * will make sure the callee is done with the +- * data before a new caller will use it. 
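For callers, smp_call_function_single() keeps its signature through this rework; only the internal bookkeeping changes. A hedged usage sketch follows, assuming a 2.6.29-era tree; bump_counter, my_counter and bump_on_cpu are invented names, not part of the patch:

#include <linux/smp.h>
#include <asm/atomic.h>

static atomic_t my_counter = ATOMIC_INIT(0);

static void bump_counter(void *info)
{
    atomic_inc(&my_counter);    /* runs on the target CPU with IRQs off */
}

static int bump_on_cpu(int cpu)
{
    /* wait=1: returns only after bump_counter() ran on @cpu, or -ENXIO
     * if the CPU is not online */
    return smp_call_function_single(cpu, bump_counter, NULL, 1);
}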
+- */ +- data = kmalloc(sizeof(*data), GFP_ATOMIC); +- if (data) +- data->flags = CSD_FLAG_ALLOC; +- else { +- data = &per_cpu(csd_data, me); +- while (data->flags & CSD_FLAG_LOCK) +- cpu_relax(); +- data->flags = CSD_FLAG_LOCK; +- } ++ data->func = func; ++ data->info = info; ++ generic_exec_single(cpu, data, wait); + } else { +- data = &d; +- data->flags = CSD_FLAG_WAIT; ++ err = -ENXIO; /* CPU not online */ + } +- +- data->func = func; +- data->info = info; +- generic_exec_single(cpu, data); +- } else { +- err = -ENXIO; /* CPU not online */ + } + + put_cpu(); ++ + return err; + } + EXPORT_SYMBOL(smp_call_function_single); +@@ -283,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single); + * @cpu: The CPU to run on. + * @data: Pre-allocated and setup data structure + * +- * Like smp_call_function_single(), but allow caller to pass in a pre-allocated +- * data structure. Useful for embedding @data inside other structures, for +- * instance. +- * ++ * Like smp_call_function_single(), but allow caller to pass in a ++ * pre-allocated data structure. Useful for embedding @data inside ++ * other structures, for instance. + */ +-void __smp_call_function_single(int cpu, struct call_single_data *data) ++void __smp_call_function_single(int cpu, struct call_single_data *data, ++ int wait) + { ++ csd_lock(data); ++ + /* Can deadlock when called with interrupts disabled */ +- WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); ++ WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); + +- generic_exec_single(cpu, data); ++ generic_exec_single(cpu, data, wait); + } + +-/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ ++/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ ++ + #ifndef arch_send_call_function_ipi_mask +-#define arch_send_call_function_ipi_mask(maskp) \ +- arch_send_call_function_ipi(*(maskp)) ++# define arch_send_call_function_ipi_mask(maskp) \ ++ arch_send_call_function_ipi(*(maskp)) + #endif + + /** +@@ -307,7 +347,8 @@ void __smp_call_function_single(int cpu, + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. +- * @wait: If true, wait (atomically) until function has completed on other CPUs. ++ * @wait: If true, wait (atomically) until function has completed ++ * on other CPUs. + * + * If @wait is true, then returns once @func has returned. Note that @wait + * will be implicitly turned on in case of allocation failures, since +@@ -318,27 +359,27 @@ void __smp_call_function_single(int cpu, + * must be disabled when calling this function. + */ + void smp_call_function_many(const struct cpumask *mask, +- void (*func)(void *), void *info, +- bool wait) ++ void (*func)(void *), void *info, bool wait) + { + struct call_function_data *data; + unsigned long flags; +- int cpu, next_cpu; ++ int cpu, next_cpu, this_cpu = smp_processor_id(); + + /* Can deadlock when called with interrupts disabled */ +- WARN_ON(irqs_disabled()); ++ WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + +- /* So, what's a CPU they want? Ignoring this one. */ ++ /* So, what's a CPU they want? Ignoring this one. */ + cpu = cpumask_first_and(mask, cpu_online_mask); +- if (cpu == smp_processor_id()) ++ if (cpu == this_cpu) + cpu = cpumask_next_and(cpu, mask, cpu_online_mask); ++ + /* No online cpus? We're done. */ + if (cpu >= nr_cpu_ids) + return; + + /* Do we have another CPU which isn't us? 
*/ + next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); +- if (next_cpu == smp_processor_id()) ++ if (next_cpu == this_cpu) + next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); + + /* Fastpath: do that cpu by itself. */ +@@ -347,43 +388,40 @@ void smp_call_function_many(const struct + return; + } + +- data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); +- if (unlikely(!data)) { +- /* Slow path. */ +- for_each_online_cpu(cpu) { +- if (cpu == smp_processor_id()) +- continue; +- if (cpumask_test_cpu(cpu, mask)) +- smp_call_function_single(cpu, func, info, wait); +- } +- return; +- } ++ data = &__get_cpu_var(cfd_data); ++ csd_lock(&data->csd); + +- spin_lock_init(&data->lock); +- data->csd.flags = CSD_FLAG_ALLOC; +- if (wait) +- data->csd.flags |= CSD_FLAG_WAIT; ++ spin_lock_irqsave(&data->lock, flags); + data->csd.func = func; + data->csd.info = info; +- cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); +- cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); +- data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); +- +- spin_lock_irqsave(&call_function_lock, flags); +- list_add_tail_rcu(&data->csd.list, &call_function_queue); +- spin_unlock_irqrestore(&call_function_lock, flags); ++ cpumask_and(data->cpumask, mask, cpu_online_mask); ++ cpumask_clear_cpu(this_cpu, data->cpumask); ++ data->refs = cpumask_weight(data->cpumask); ++ ++ spin_lock(&call_function.lock); ++ /* ++ * Place entry at the _HEAD_ of the list, so that any cpu still ++ * observing the entry in generic_smp_call_function_interrupt() ++ * will not miss any other list entries: ++ */ ++ list_add_rcu(&data->csd.list, &call_function.queue); ++ spin_unlock(&call_function.lock); ++ ++ spin_unlock_irqrestore(&data->lock, flags); + + /* + * Make the list addition visible before sending the ipi. ++ * (IPIs must obey or appear to obey normal Linux cache ++ * coherency rules -- see comment in generic_exec_single). + */ + smp_mb(); + + /* Send a message to all CPUs in the map */ +- arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); ++ arch_send_call_function_ipi_mask(data->cpumask); + +- /* optionally wait for the CPUs to complete */ ++ /* Optionally wait for the CPUs to complete */ + if (wait) +- csd_flag_wait(&data->csd); ++ csd_lock_wait(&data->csd); + } + EXPORT_SYMBOL(smp_call_function_many); + +@@ -391,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many); + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. +- * @wait: If true, wait (atomically) until function has completed on other CPUs. ++ * @wait: If true, wait (atomically) until function has completed ++ * on other CPUs. + * + * Returns 0. 
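smp_call_function_many() likewise keeps its external contract: preemption must be disabled, interrupts must be enabled, and wait=true blocks until every targeted CPU has run the function. A hedged usage sketch under those assumptions (drain_local_cache and drain_other_cpus are invented names):

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/slab.h>

static void drain_local_cache(void *unused)
{
    /* per-CPU work goes here; must be fast and non-blocking */
}

static int drain_other_cpus(void)
{
    cpumask_var_t mask;

    if (!alloc_cpumask_var(&mask, GFP_KERNEL))
        return -ENOMEM;

    preempt_disable();                  /* caller must not migrate */
    cpumask_copy(mask, cpu_online_mask);
    cpumask_clear_cpu(smp_processor_id(), mask);
    smp_call_function_many(mask, drain_local_cache, NULL, true);
    preempt_enable();

    free_cpumask_var(mask);
    return 0;
}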
+ * +@@ -407,26 +446,27 @@ int smp_call_function(void (*func)(void + preempt_disable(); + smp_call_function_many(cpu_online_mask, func, info, wait); + preempt_enable(); ++ + return 0; + } + EXPORT_SYMBOL(smp_call_function); + + void ipi_call_lock(void) + { +- spin_lock(&call_function_lock); ++ spin_lock(&call_function.lock); + } + + void ipi_call_unlock(void) + { +- spin_unlock(&call_function_lock); ++ spin_unlock(&call_function.lock); + } + + void ipi_call_lock_irq(void) + { +- spin_lock_irq(&call_function_lock); ++ spin_lock_irq(&call_function.lock); + } + + void ipi_call_unlock_irq(void) + { +- spin_unlock_irq(&call_function_lock); ++ spin_unlock_irq(&call_function.lock); + } +Index: linux-2.6-tip/kernel/softirq.c +=================================================================== +--- linux-2.6-tip.orig/kernel/softirq.c ++++ linux-2.6-tip/kernel/softirq.c +@@ -8,21 +8,31 @@ + * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. ++ * ++ * Softirq-split implemetation by ++ * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + */ + + #include ++#include ++#include ++#include + #include + #include + #include ++#include + #include + #include + #include ++#include + #include + #include + #include + #include ++#include + #include + #include ++#include + + #include + /* +@@ -50,7 +60,116 @@ EXPORT_SYMBOL(irq_stat); + + static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; + +-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); ++struct softirqdata { ++ int nr; ++ unsigned long cpu; ++ struct task_struct *tsk; ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ wait_queue_head_t wait; ++ int running; ++#endif ++}; ++ ++static DEFINE_PER_CPU(struct softirqdata [MAX_SOFTIRQ], ksoftirqd); ++ ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++/* ++ * Preempting the softirq causes cases that would not be a ++ * problem when the softirq is not preempted. That is a ++ * process may have code to spin while waiting for a softirq ++ * to finish on another CPU. But if it happens that the ++ * process has preempted the softirq, this could cause a ++ * deadlock. ++ */ ++void wait_for_softirq(int softirq) ++{ ++ struct softirqdata *data = &__get_cpu_var(ksoftirqd)[softirq]; ++ ++ if (data->running) { ++ DECLARE_WAITQUEUE(wait, current); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ add_wait_queue(&data->wait, &wait); ++ if (data->running) ++ schedule(); ++ remove_wait_queue(&data->wait, &wait); ++ __set_current_state(TASK_RUNNING); ++ } ++} ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * On preempt-rt a softirq might be blocked on a lock. There might be ++ * no other runnable task on this CPU because the lock owner runs on ++ * some other CPU. So we have to go into idle with the pending bit ++ * set. Therefor we need to check this otherwise we warn about false ++ * positives which confuses users and defeats the whole purpose of ++ * this test. ++ * ++ * This code is called with interrupts disabled. ++ */ ++void softirq_check_pending_idle(void) ++{ ++ static int rate_limit; ++ u32 warnpending = 0, pending = local_softirq_pending(); ++ int curr = 0; ++ ++ if (rate_limit >= 10) ++ return; ++ ++ while (pending) { ++ if (pending & 1) { ++ struct task_struct *tsk; ++ ++ tsk = __get_cpu_var(ksoftirqd)[curr].tsk; ++ /* ++ * The wakeup code in rtmutex.c wakes up the ++ * task _before_ it sets pi_blocked_on to NULL ++ * under tsk->pi_lock. So we need to check for ++ * both: state and pi_blocked_on. 
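wait_for_softirq() above lets a task sleep on the per-softirq wait queue instead of spinning while the (now preemptible) softirq thread finishes. A userspace model of that running-flag/wait-queue handshake, using a condition variable in place of the kernel wait queue and a mutex in place of the lock-free check:

#include <pthread.h>
#include <stdio.h>

struct softirq_model {
    int running;                        /* data->running in the patch */
    pthread_mutex_t lock;
    pthread_cond_t wait;                /* data->wait in the patch */
};

static void model_wait_for_softirq(struct softirq_model *d)
{
    pthread_mutex_lock(&d->lock);
    while (d->running)                  /* sleep instead of busy-waiting */
        pthread_cond_wait(&d->wait, &d->lock);
    pthread_mutex_unlock(&d->lock);
}

static void model_softirq_done(struct softirq_model *d)
{
    pthread_mutex_lock(&d->lock);
    d->running = 0;
    pthread_cond_broadcast(&d->wait);   /* wake_up(&data->wait) */
    pthread_mutex_unlock(&d->lock);
}

int main(void)
{
    struct softirq_model d = {
        .running = 0,
        .lock    = PTHREAD_MUTEX_INITIALIZER,
        .wait    = PTHREAD_COND_INITIALIZER,
    };

    model_softirq_done(&d);             /* no-op wake */
    model_wait_for_softirq(&d);         /* returns immediately */
    printf("softirq idle\n");
    return 0;
}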
++ */ ++ spin_lock(&tsk->pi_lock); ++ ++ if (!tsk->pi_blocked_on && ++ !(tsk->state == TASK_RUNNING) && ++ !(tsk->state & TASK_RUNNING_MUTEX)) ++ warnpending |= 1 << curr; ++ ++ spin_unlock(&tsk->pi_lock); ++ } ++ pending >>= 1; ++ curr++; ++ } ++ ++ if (warnpending) { ++ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", ++ warnpending); ++ rate_limit++; ++ } ++} ++ ++#else ++/* ++ * On !PREEMPT_RT we just printk rate limited: ++ */ ++void softirq_check_pending_idle(void) ++{ ++ static int ratelimit; ++ ++ if (ratelimit < 10) { ++ printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", ++ local_softirq_pending()); ++ rate_limit++; ++ } ++} ++ ++#endif ++ ++char *softirq_to_name[NR_SOFTIRQS] = { ++ "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", ++ "TASKLET", "SCHED", "HRTIMER", "RCU" ++}; + + /* + * we cannot loop indefinitely here to avoid userspace starvation, +@@ -58,16 +177,34 @@ static DEFINE_PER_CPU(struct task_struct + * to the pending events, so lets the scheduler to balance + * the softirq load for us. + */ +-void wakeup_softirqd(void) ++static void wakeup_softirqd(int softirq) + { + /* Interrupts are disabled: no need to stop preemption */ +- struct task_struct *tsk = __get_cpu_var(ksoftirqd); ++ struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; + + if (tsk && tsk->state != TASK_RUNNING) + wake_up_process(tsk); + } + + /* ++ * Wake up the softirq threads which have work ++ */ ++static void trigger_softirqs(void) ++{ ++ u32 pending = local_softirq_pending(); ++ int curr = 0; ++ ++ while (pending) { ++ if (pending & 1) ++ wakeup_softirqd(curr); ++ pending >>= 1; ++ curr++; ++ } ++} ++ ++#ifndef CONFIG_PREEMPT_HARDIRQS ++ ++/* + * This one is for softirq.c-internal use, + * where hardirqs are disabled legitimately: + */ +@@ -79,13 +216,23 @@ static void __local_bh_disable(unsigned + WARN_ON_ONCE(in_irq()); + + raw_local_irq_save(flags); +- add_preempt_count(SOFTIRQ_OFFSET); ++ /* ++ * The preempt tracer hooks into add_preempt_count and will break ++ * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET ++ * is set and before current->softirq_enabled is cleared. ++ * We must manually increment preempt_count here and manually ++ * call the trace_preempt_off later. 
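trigger_softirqs() simply walks the pending bitmask and wakes one thread per set bit. (As an aside, the !PREEMPT_RT softirq_check_pending_idle() above declares a local counter named ratelimit but increments rate_limit, which looks like it was meant to be the same variable.) A userspace model of the bit-walk, reusing the softirq names from the patch:

#include <stdio.h>
#include <stdint.h>

static const char *softirq_to_name[] = {
    "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
    "TASKLET", "SCHED", "HRTIMER", "RCU"
};

static void wakeup_softirqd(int nr)
{
    printf("wake sirq-%s thread\n", softirq_to_name[nr]);
}

/* Walk the pending mask, lowest bit first, and wake each owning thread. */
static void trigger_softirqs(uint32_t pending)
{
    int curr = 0;

    while (pending) {
        if (pending & 1)
            wakeup_softirqd(curr);
        pending >>= 1;
        curr++;
    }
}

int main(void)
{
    trigger_softirqs((1u << 1) | (1u << 3));    /* TIMER and NET_RX pending */
    return 0;
}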
++ */ ++ preempt_count() += SOFTIRQ_OFFSET; + /* + * Were softirqs turned off above: + */ + if (softirq_count() == SOFTIRQ_OFFSET) + trace_softirqs_off(ip); + raw_local_irq_restore(flags); ++ ++ if (preempt_count() == SOFTIRQ_OFFSET) ++ trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + } + #else /* !CONFIG_TRACE_IRQFLAGS */ + static inline void __local_bh_disable(unsigned long ip) +@@ -109,7 +256,6 @@ EXPORT_SYMBOL(local_bh_disable); + */ + void _local_bh_enable(void) + { +- WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (softirq_count() == SOFTIRQ_OFFSET) +@@ -119,17 +265,22 @@ void _local_bh_enable(void) + + EXPORT_SYMBOL(_local_bh_enable); + +-static inline void _local_bh_enable_ip(unsigned long ip) ++void local_bh_enable(void) + { +- WARN_ON_ONCE(in_irq() || irqs_disabled()); + #ifdef CONFIG_TRACE_IRQFLAGS +- local_irq_disable(); ++ unsigned long flags; ++ ++ WARN_ON_ONCE(in_irq()); ++#endif ++ ++#ifdef CONFIG_TRACE_IRQFLAGS ++ local_irq_save(flags); + #endif + /* + * Are softirqs going to be turned on now: + */ + if (softirq_count() == SOFTIRQ_OFFSET) +- trace_softirqs_on(ip); ++ trace_softirqs_on((unsigned long)__builtin_return_address(0)); + /* + * Keep preemption disabled until we are done with + * softirq processing: +@@ -141,23 +292,45 @@ static inline void _local_bh_enable_ip(u + + dec_preempt_count(); + #ifdef CONFIG_TRACE_IRQFLAGS +- local_irq_enable(); ++ local_irq_restore(flags); + #endif + preempt_check_resched(); + } +- +-void local_bh_enable(void) +-{ +- _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); +-} + EXPORT_SYMBOL(local_bh_enable); + + void local_bh_enable_ip(unsigned long ip) + { +- _local_bh_enable_ip(ip); ++#ifdef CONFIG_TRACE_IRQFLAGS ++ unsigned long flags; ++ ++ WARN_ON_ONCE(in_irq()); ++ ++ local_irq_save(flags); ++#endif ++ /* ++ * Are softirqs going to be turned on now: ++ */ ++ if (softirq_count() == SOFTIRQ_OFFSET) ++ trace_softirqs_on(ip); ++ /* ++ * Keep preemption disabled until we are done with ++ * softirq processing: ++ */ ++ sub_preempt_count(SOFTIRQ_OFFSET - 1); ++ ++ if (unlikely(!in_interrupt() && local_softirq_pending())) ++ do_softirq(); ++ ++ dec_preempt_count(); ++#ifdef CONFIG_TRACE_IRQFLAGS ++ local_irq_restore(flags); ++#endif ++ preempt_check_resched(); + } + EXPORT_SYMBOL(local_bh_enable_ip); + ++#endif ++ + /* + * We restart softirq processing MAX_SOFTIRQ_RESTART times, + * and we fall back to softirqd after that. +@@ -167,63 +340,146 @@ EXPORT_SYMBOL(local_bh_enable_ip); + * we want to handle softirqs as soon as possible, but they + * should not be able to lock up the box. 
+ */ +-#define MAX_SOFTIRQ_RESTART 10 ++#define MAX_SOFTIRQ_RESTART 20 + +-asmlinkage void __do_softirq(void) ++DEFINE_TRACE(softirq_entry); ++DEFINE_TRACE(softirq_exit); ++ ++static DEFINE_PER_CPU(u32, softirq_running); ++ ++/* ++ * Debug check for leaking preempt counts in h->action handlers: ++ */ ++ ++static inline void debug_check_preempt_count_start(__u32 *preempt_count) + { +- struct softirq_action *h; +- __u32 pending; ++#ifdef CONFIG_DEBUG_PREEMPT ++ *preempt_count = preempt_count(); ++#endif ++} ++ ++static inline void ++ debug_check_preempt_count_stop(__u32 *preempt_count, struct softirq_action *h) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (*preempt_count == preempt_count()) ++ return; ++ ++ print_symbol("BUG: %Ps exited with wrong preemption count!\n", ++ (unsigned long)h->action); ++ printk("=> enter: %08x, exit: %08x.\n", *preempt_count, preempt_count()); ++ preempt_count() = *preempt_count; ++#endif ++} ++ ++/* ++ * Execute softirq handlers: ++ */ ++static void ___do_softirq(const int same_prio_only) ++{ ++ __u32 pending, available_mask, same_prio_skipped, preempt_count; + int max_restart = MAX_SOFTIRQ_RESTART; +- int cpu; ++ struct softirq_action *h; ++ int cpu, softirq; + + pending = local_softirq_pending(); + account_system_vtime(current); + +- __local_bh_disable((unsigned long)__builtin_return_address(0)); +- trace_softirq_enter(); +- + cpu = smp_processor_id(); + restart: ++ available_mask = -1; ++ softirq = 0; ++ same_prio_skipped = 0; + /* Reset the pending bitmask before enabling irqs */ + set_softirq_pending(0); + +- local_irq_enable(); +- + h = softirq_vec; + + do { +- if (pending & 1) { +- int prev_count = preempt_count(); ++ u32 softirq_mask = 1 << softirq; + +- h->action(h); ++ if (!(pending & 1)) ++ goto next; + +- if (unlikely(prev_count != preempt_count())) { +- printk(KERN_ERR "huh, entered softirq %td %p" +- "with preempt_count %08x," +- " exited with %08x?\n", h - softirq_vec, +- h->action, prev_count, preempt_count()); +- preempt_count() = prev_count; +- } ++ debug_check_preempt_count_start(&preempt_count); + +- rcu_bh_qsctr_inc(cpu); ++#if defined(CONFIG_PREEMPT_SOFTIRQS) && defined(CONFIG_PREEMPT_HARDIRQS) ++ /* ++ * If executed by a same-prio hardirq thread ++ * then skip pending softirqs that belong ++ * to softirq threads with different priority: ++ */ ++ if (same_prio_only) { ++ struct task_struct *tsk; ++ ++ tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; ++ if (tsk && tsk->normal_prio != current->normal_prio) { ++ same_prio_skipped |= softirq_mask; ++ available_mask &= ~softirq_mask; ++ goto next; ++ } + } ++#endif ++ /* ++ * Is this softirq already being processed? ++ */ ++ if (per_cpu(softirq_running, cpu) & softirq_mask) { ++ available_mask &= ~softirq_mask; ++ goto next; ++ } ++ per_cpu(softirq_running, cpu) |= softirq_mask; ++ local_irq_enable(); ++ ++ h->action(h); ++ ++ debug_check_preempt_count_stop(&preempt_count, h); ++ ++ rcu_bh_qsctr_inc(cpu); ++ cond_resched_softirq_context(); ++ local_irq_disable(); ++ per_cpu(softirq_running, cpu) &= ~softirq_mask; ++next: + h++; ++ softirq++; + pending >>= 1; + } while (pending); + +- local_irq_disable(); +- ++ or_softirq_pending(same_prio_skipped); + pending = local_softirq_pending(); +- if (pending && --max_restart) +- goto restart; ++ if (pending & available_mask) { ++ if (--max_restart) ++ goto restart; ++ } + + if (pending) +- wakeup_softirqd(); ++ trigger_softirqs(); ++} ++ ++asmlinkage void __do_softirq(void) ++{ ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ /* ++ * 'preempt harder'. 
Push all softirq processing off to ksoftirqd. ++ */ ++ if (softirq_preemption) { ++ if (local_softirq_pending()) ++ trigger_softirqs(); ++ return; ++ } ++#endif ++ /* ++ * 'immediate' softirq execution: ++ */ ++ __local_bh_disable((unsigned long)__builtin_return_address(0)); ++ lockdep_softirq_enter(); ++ ++ ___do_softirq(0); + +- trace_softirq_exit(); ++ lockdep_softirq_exit(); + + account_system_vtime(current); + _local_bh_enable(); ++ + } + + #ifndef __ARCH_HAS_DO_SOFTIRQ +@@ -286,7 +542,7 @@ void irq_exit(void) + if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) + tick_nohz_stop_sched_tick(0); + #endif +- preempt_enable_no_resched(); ++ __preempt_enable_no_resched(); + } + + /* +@@ -294,19 +550,11 @@ void irq_exit(void) + */ + inline void raise_softirq_irqoff(unsigned int nr) + { +- __raise_softirq_irqoff(nr); ++ __do_raise_softirq_irqoff(nr); + +- /* +- * If we're in an interrupt or softirq, we're done +- * (this also catches softirq-disabled code). We will +- * actually run the softirq once we return from +- * the irq or softirq. +- * +- * Otherwise we wake up ksoftirqd to make sure we +- * schedule the softirq soon. +- */ +- if (!in_interrupt()) +- wakeup_softirqd(); ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ wakeup_softirqd(nr); ++#endif + } + + void raise_softirq(unsigned int nr) +@@ -333,15 +581,45 @@ struct tasklet_head + static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); + static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); + ++static void inline ++__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) ++{ ++ if (tasklet_trylock(t)) { ++again: ++ /* We may have been preempted before tasklet_trylock ++ * and __tasklet_action may have already run. ++ * So double check the sched bit while the takslet ++ * is locked before adding it to the list. ++ */ ++ if (test_bit(TASKLET_STATE_SCHED, &t->state)) { ++ t->next = NULL; ++ *head->tail = t; ++ head->tail = &(t->next); ++ raise_softirq_irqoff(nr); ++ tasklet_unlock(t); ++ } else { ++ /* This is subtle. If we hit the corner case above ++ * It is possible that we get preempted right here, ++ * and another task has successfully called ++ * tasklet_schedule(), then this function, and ++ * failed on the trylock. Thus we must be sure ++ * before releasing the tasklet lock, that the ++ * SCHED_BIT is clear. 
Otherwise the tasklet ++ * may get its SCHED_BIT set, but not added to the ++ * list ++ */ ++ if (!tasklet_tryunlock(t)) ++ goto again; ++ } ++ } ++} ++ + void __tasklet_schedule(struct tasklet_struct *t) + { + unsigned long flags; + + local_irq_save(flags); +- t->next = NULL; +- *__get_cpu_var(tasklet_vec).tail = t; +- __get_cpu_var(tasklet_vec).tail = &(t->next); +- raise_softirq_irqoff(TASKLET_SOFTIRQ); ++ __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); + local_irq_restore(flags); + } + +@@ -352,50 +630,127 @@ void __tasklet_hi_schedule(struct taskle + unsigned long flags; + + local_irq_save(flags); +- t->next = NULL; +- *__get_cpu_var(tasklet_hi_vec).tail = t; +- __get_cpu_var(tasklet_hi_vec).tail = &(t->next); +- raise_softirq_irqoff(HI_SOFTIRQ); ++ __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); + local_irq_restore(flags); + } + + EXPORT_SYMBOL(__tasklet_hi_schedule); + +-static void tasklet_action(struct softirq_action *a) ++void __tasklet_hi_schedule_first(struct tasklet_struct *t) + { +- struct tasklet_struct *list; ++ __tasklet_hi_schedule(t); ++} + +- local_irq_disable(); +- list = __get_cpu_var(tasklet_vec).head; +- __get_cpu_var(tasklet_vec).head = NULL; +- __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; +- local_irq_enable(); ++EXPORT_SYMBOL(__tasklet_hi_schedule_first); ++ ++void tasklet_enable(struct tasklet_struct *t) ++{ ++ if (!atomic_dec_and_test(&t->count)) ++ return; ++ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) ++ tasklet_schedule(t); ++} ++ ++EXPORT_SYMBOL(tasklet_enable); ++ ++void tasklet_hi_enable(struct tasklet_struct *t) ++{ ++ if (!atomic_dec_and_test(&t->count)) ++ return; ++ if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) ++ tasklet_hi_schedule(t); ++} ++ ++EXPORT_SYMBOL(tasklet_hi_enable); ++ ++static void ++__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) ++{ ++ int loops = 1000000; + + while (list) { + struct tasklet_struct *t = list; + + list = list->next; + +- if (tasklet_trylock(t)) { +- if (!atomic_read(&t->count)) { +- if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) +- BUG(); +- t->func(t->data); +- tasklet_unlock(t); +- continue; +- } +- tasklet_unlock(t); ++ /* ++ * Should always succeed - after a tasklist got on the ++ * list (after getting the SCHED bit set from 0 to 1), ++ * nothing but the tasklet softirq it got queued to can ++ * lock it: ++ */ ++ if (!tasklet_trylock(t)) { ++ WARN_ON(1); ++ continue; + } + +- local_irq_disable(); + t->next = NULL; +- *__get_cpu_var(tasklet_vec).tail = t; +- __get_cpu_var(tasklet_vec).tail = &(t->next); +- __raise_softirq_irqoff(TASKLET_SOFTIRQ); +- local_irq_enable(); ++ ++ /* ++ * If we cannot handle the tasklet because it's disabled, ++ * mark it as pending. tasklet_enable() will later ++ * re-schedule the tasklet. ++ */ ++ if (unlikely(atomic_read(&t->count))) { ++out_disabled: ++ /* implicit unlock: */ ++ wmb(); ++ t->state = TASKLET_STATEF_PENDING; ++ continue; ++ } ++ ++ /* ++ * After this point on the tasklet might be rescheduled ++ * on another CPU, but it can only be added to another ++ * CPU's tasklet list if we unlock the tasklet (which we ++ * dont do yet). ++ */ ++ if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) ++ WARN_ON(1); ++ ++again: ++ t->func(t->data); ++ ++ /* ++ * Try to unlock the tasklet. We must use cmpxchg, because ++ * another CPU might have scheduled or disabled the tasklet. ++ * We only allow the STATE_RUN -> 0 transition here. 
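The rewritten __tasklet_action() relies on two rules: tasklet_trylock() succeeds only if the RUN bit was clear, and tasklet_tryunlock() may only perform the exact RUN -> 0 transition, so a concurrent reschedule or disable is never lost. A userspace model of that state machine with C11 atomics (only the state word is modelled; the disable count and the lists are left out):

#include <stdatomic.h>
#include <stdio.h>

#define TASKLET_STATE_SCHED  (1u << 0)
#define TASKLET_STATE_RUN    (1u << 1)

struct tasklet_model { atomic_uint state; };

static int model_trylock(struct tasklet_model *t)
{
    /* set RUN; succeed only if it was previously clear */
    return !(atomic_fetch_or(&t->state, TASKLET_STATE_RUN) & TASKLET_STATE_RUN);
}

static int model_tryunlock(struct tasklet_model *t)
{
    /* only the exact RUN -> 0 transition is allowed, as in the patch */
    unsigned int expected = TASKLET_STATE_RUN;

    return atomic_compare_exchange_strong(&t->state, &expected, 0);
}

int main(void)
{
    struct tasklet_model t = { .state = TASKLET_STATE_SCHED };

    printf("trylock: %d\n", model_trylock(&t));                    /* 1: we own RUN */
    printf("tryunlock, SCHED still set: %d\n", model_tryunlock(&t)); /* 0: must re-run */
    atomic_fetch_and(&t.state, ~TASKLET_STATE_SCHED);
    printf("tryunlock after SCHED cleared: %d\n", model_tryunlock(&t)); /* 1 */
    return 0;
}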
++ */ ++ while (!tasklet_tryunlock(t)) { ++ /* ++ * If it got disabled meanwhile, bail out: ++ */ ++ if (atomic_read(&t->count)) ++ goto out_disabled; ++ /* ++ * If it got scheduled meanwhile, re-execute ++ * the tasklet function: ++ */ ++ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) ++ goto again; ++ if (!--loops) { ++ printk("hm, tasklet state: %08lx\n", t->state); ++ WARN_ON(1); ++ tasklet_unlock(t); ++ break; ++ } ++ } + } + } + ++static void tasklet_action(struct softirq_action *a) ++{ ++ struct tasklet_struct *list; ++ ++ local_irq_disable(); ++ list = __get_cpu_var(tasklet_vec).head; ++ __get_cpu_var(tasklet_vec).head = NULL; ++ __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; ++ local_irq_enable(); ++ ++ __tasklet_action(a, list); ++} ++ + static void tasklet_hi_action(struct softirq_action *a) + { + struct tasklet_struct *list; +@@ -406,29 +761,7 @@ static void tasklet_hi_action(struct sof + __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; + local_irq_enable(); + +- while (list) { +- struct tasklet_struct *t = list; +- +- list = list->next; +- +- if (tasklet_trylock(t)) { +- if (!atomic_read(&t->count)) { +- if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) +- BUG(); +- t->func(t->data); +- tasklet_unlock(t); +- continue; +- } +- tasklet_unlock(t); +- } +- +- local_irq_disable(); +- t->next = NULL; +- *__get_cpu_var(tasklet_hi_vec).tail = t; +- __get_cpu_var(tasklet_hi_vec).tail = &(t->next); +- __raise_softirq_irqoff(HI_SOFTIRQ); +- local_irq_enable(); +- } ++ __tasklet_action(a, list); + } + + +@@ -451,7 +784,7 @@ void tasklet_kill(struct tasklet_struct + + while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { + do +- yield(); ++ msleep(1); + while (test_bit(TASKLET_STATE_SCHED, &t->state)); + } + tasklet_unlock_wait(t); +@@ -496,7 +829,7 @@ static int __try_remote_softirq(struct c + cp->flags = 0; + cp->priv = softirq; + +- __smp_call_function_single(cpu, cp); ++ __smp_call_function_single(cpu, cp, 0); + return 0; + } + return 1; +@@ -602,34 +935,100 @@ void __init softirq_init(void) + open_softirq(HI_SOFTIRQ, tasklet_hi_action); + } + +-static int ksoftirqd(void * __bind_cpu) ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) ++ ++void tasklet_unlock_wait(struct tasklet_struct *t) + { ++ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { ++ /* ++ * Hack for now to avoid this busy-loop: ++ */ ++#ifdef CONFIG_PREEMPT_RT ++ msleep(1); ++#else ++ barrier(); ++#endif ++ } ++} ++EXPORT_SYMBOL(tasklet_unlock_wait); ++ ++#endif ++ ++static int ksoftirqd(void * __data) ++{ ++ /* Priority needs to be below hardirqs */ ++ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 - 1}; ++ struct softirqdata *data = __data; ++ u32 softirq_mask = (1 << data->nr); ++ struct softirq_action *h; ++ int cpu = data->cpu; ++ ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ init_waitqueue_head(&data->wait); ++#endif ++ ++ sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); ++ current->flags |= PF_SOFTIRQ; + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + preempt_disable(); +- if (!local_softirq_pending()) { +- preempt_enable_no_resched(); ++ if (!(local_softirq_pending() & softirq_mask)) { ++sleep_more: ++ __preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } + + __set_current_state(TASK_RUNNING); + +- while (local_softirq_pending()) { ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ data->running = 1; ++#endif ++ ++ while (local_softirq_pending() & softirq_mask) { + /* Preempt disable stops cpu 
going offline. + If already offline, we'll be on wrong CPU: + don't process */ +- if (cpu_is_offline((long)__bind_cpu)) ++ if (cpu_is_offline(cpu)) + goto wait_to_die; +- do_softirq(); +- preempt_enable_no_resched(); ++ ++ local_irq_disable(); ++ /* ++ * Is the softirq already being executed by ++ * a hardirq context? ++ */ ++ if (per_cpu(softirq_running, cpu) & softirq_mask) { ++ local_irq_enable(); ++ set_current_state(TASK_INTERRUPTIBLE); ++ goto sleep_more; ++ } ++ per_cpu(softirq_running, cpu) |= softirq_mask; ++ __preempt_enable_no_resched(); ++ set_softirq_pending(local_softirq_pending() & ~softirq_mask); ++ local_bh_disable(); ++ local_irq_enable(); ++ ++ h = &softirq_vec[data->nr]; ++ if (h) ++ h->action(h); ++ rcu_bh_qsctr_inc(data->cpu); ++ ++ local_irq_disable(); ++ per_cpu(softirq_running, cpu) &= ~softirq_mask; ++ _local_bh_enable(); ++ local_irq_enable(); ++ + cond_resched(); + preempt_disable(); +- rcu_qsctr_inc((long)__bind_cpu); ++ rcu_qsctr_inc(data->cpu); + } + preempt_enable(); + set_current_state(TASK_INTERRUPTIBLE); ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ data->running = 0; ++ wake_up(&data->wait); ++#endif + } + __set_current_state(TASK_RUNNING); + return 0; +@@ -679,7 +1078,7 @@ void tasklet_kill_immediate(struct taskl + BUG(); + } + +-static void takeover_tasklets(unsigned int cpu) ++void takeover_tasklets(unsigned int cpu) + { + /* CPU is dead, so no lock needed. */ + local_irq_disable(); +@@ -705,49 +1104,83 @@ static void takeover_tasklets(unsigned i + } + #endif /* CONFIG_HOTPLUG_CPU */ + ++static const char *softirq_names [] = ++{ ++ [HI_SOFTIRQ] = "high", ++ [SCHED_SOFTIRQ] = "sched", ++ [TIMER_SOFTIRQ] = "timer", ++ [NET_TX_SOFTIRQ] = "net-tx", ++ [NET_RX_SOFTIRQ] = "net-rx", ++ [BLOCK_SOFTIRQ] = "block", ++ [TASKLET_SOFTIRQ] = "tasklet", ++#ifdef CONFIG_HIGH_RES_TIMERS ++ [HRTIMER_SOFTIRQ] = "hrtimer", ++#endif ++ [RCU_SOFTIRQ] = "rcu", ++}; ++ + static int __cpuinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) + { +- int hotcpu = (unsigned long)hcpu; ++ int hotcpu = (unsigned long)hcpu, i; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: +- p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); +- if (IS_ERR(p)) { +- printk("ksoftirqd for %i failed\n", hotcpu); +- return NOTIFY_BAD; +- } +- kthread_bind(p, hotcpu); +- per_cpu(ksoftirqd, hotcpu) = p; +- break; ++ for (i = 0; i < MAX_SOFTIRQ; i++) { ++ per_cpu(ksoftirqd, hotcpu)[i].nr = i; ++ per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; ++ per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; ++ } ++ for (i = 0; i < MAX_SOFTIRQ; i++) { ++ p = kthread_create(ksoftirqd, ++ &per_cpu(ksoftirqd, hotcpu)[i], ++ "sirq-%s/%d", softirq_names[i], ++ hotcpu); ++ if (IS_ERR(p)) { ++ printk("ksoftirqd %d for %i failed\n", i, ++ hotcpu); ++ return NOTIFY_BAD; ++ } ++ kthread_bind(p, hotcpu); ++ per_cpu(ksoftirqd, hotcpu)[i].tsk = p; ++ } ++ break; ++ break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: +- wake_up_process(per_cpu(ksoftirqd, hotcpu)); ++ for (i = 0; i < MAX_SOFTIRQ; i++) ++ wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk); + break; + #ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: +- if (!per_cpu(ksoftirqd, hotcpu)) +- break; +- /* Unbind so it can run. Fall thru. 
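The CPU_UP_PREPARE case above creates one bound, SCHED_FIFO "sirq-<name>/<cpu>" thread per softirq. A hedged kernel-style sketch of that create/bind/wake sequence, assuming a 2.6.29-era tree; my_softirq_thread and spawn_one are invented names and the thread body is a placeholder:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_softirq_thread(void *data)
{
    struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 - 1 };

    sched_setscheduler(current, SCHED_FIFO, &param);
    while (!kthread_should_stop())
        schedule_timeout_interruptible(HZ);     /* real code waits for work */
    return 0;
}

static int spawn_one(const char *name, int cpu)
{
    struct task_struct *p;

    p = kthread_create(my_softirq_thread, NULL, "sirq-%s/%d", name, cpu);
    if (IS_ERR(p))
        return PTR_ERR(p);
    kthread_bind(p, cpu);                       /* bind before first wakeup */
    wake_up_process(p);
    return 0;
}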
*/ +- kthread_bind(per_cpu(ksoftirqd, hotcpu), +- cpumask_any(cpu_online_mask)); ++#if 0 ++ for (i = 0; i < MAX_SOFTIRQ; i++) { ++ if (!per_cpu(ksoftirqd, hotcpu)[i].tsk) ++ continue; ++ kthread_bind(per_cpu(ksoftirqd, hotcpu)[i].tsk, ++ any_online_cpu(cpu_online_map)); ++ } ++#endif + case CPU_DEAD: + case CPU_DEAD_FROZEN: { +- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; ++ struct sched_param param; + +- p = per_cpu(ksoftirqd, hotcpu); +- per_cpu(ksoftirqd, hotcpu) = NULL; +- sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); +- kthread_stop(p); ++ for (i = 0; i < MAX_SOFTIRQ; i++) { ++ param.sched_priority = MAX_RT_PRIO-1; ++ p = per_cpu(ksoftirqd, hotcpu)[i].tsk; ++ sched_setscheduler(p, SCHED_FIFO, ¶m); ++ per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; ++ kthread_stop(p); ++ } + takeover_tasklets(hotcpu); + break; +- } +-#endif /* CONFIG_HOTPLUG_CPU */ + } ++#endif /* CONFIG_HOTPLUG_CPU */ ++ } + return NOTIFY_OK; + } + +@@ -767,6 +1200,34 @@ static __init int spawn_ksoftirqd(void) + } + early_initcall(spawn_ksoftirqd); + ++ ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ ++int softirq_preemption = 1; ++ ++EXPORT_SYMBOL(softirq_preemption); ++ ++/* ++ * Real-Time Preemption depends on softirq threading: ++ */ ++#ifndef CONFIG_PREEMPT_RT ++ ++static int __init softirq_preempt_setup (char *str) ++{ ++ if (!strncmp(str, "off", 3)) ++ softirq_preemption = 0; ++ else ++ get_option(&str, &softirq_preemption); ++ if (!softirq_preemption) ++ printk("turning off softirq preemption!\n"); ++ ++ return 1; ++} ++ ++__setup("softirq-preempt=", softirq_preempt_setup); ++#endif ++#endif ++ + #ifdef CONFIG_SMP + /* + * Call a function on all processors +@@ -796,6 +1257,11 @@ int __init __weak early_irq_init(void) + return 0; + } + ++int __init __weak arch_probe_nr_irqs(void) ++{ ++ return 0; ++} ++ + int __init __weak arch_early_irq_init(void) + { + return 0; +Index: linux-2.6-tip/kernel/softlockup.c +=================================================================== +--- linux-2.6-tip.orig/kernel/softlockup.c ++++ linux-2.6-tip/kernel/softlockup.c +@@ -20,7 +20,7 @@ + + #include + +-static DEFINE_SPINLOCK(print_lock); ++static DEFINE_RAW_SPINLOCK(print_lock); + + static DEFINE_PER_CPU(unsigned long, touch_timestamp); + static DEFINE_PER_CPU(unsigned long, print_timestamp); +@@ -166,97 +166,11 @@ void softlockup_tick(void) + } + + /* +- * Have a reasonable limit on the number of tasks checked: +- */ +-unsigned long __read_mostly sysctl_hung_task_check_count = 1024; +- +-/* +- * Zero means infinite timeout - no checking done: +- */ +-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; +- +-unsigned long __read_mostly sysctl_hung_task_warnings = 10; +- +-/* +- * Only do the hung-tasks check on one CPU: +- */ +-static int check_cpu __read_mostly = -1; +- +-static void check_hung_task(struct task_struct *t, unsigned long now) +-{ +- unsigned long switch_count = t->nvcsw + t->nivcsw; +- +- if (t->flags & PF_FROZEN) +- return; +- +- if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { +- t->last_switch_count = switch_count; +- t->last_switch_timestamp = now; +- return; +- } +- if ((long)(now - t->last_switch_timestamp) < +- sysctl_hung_task_timeout_secs) +- return; +- if (!sysctl_hung_task_warnings) +- return; +- sysctl_hung_task_warnings--; +- +- /* +- * Ok, the task did not get scheduled for more than 2 minutes, +- * complain: +- */ +- printk(KERN_ERR "INFO: task %s:%d blocked for more than " +- "%ld seconds.\n", t->comm, t->pid, +- sysctl_hung_task_timeout_secs); +- 
printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" +- " disables this message.\n"); +- sched_show_task(t); +- __debug_show_held_locks(t); +- +- t->last_switch_timestamp = now; +- touch_nmi_watchdog(); +- +- if (softlockup_panic) +- panic("softlockup: blocked tasks"); +-} +- +-/* +- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for +- * a really long time (120 seconds). If that happens, print out +- * a warning. +- */ +-static void check_hung_uninterruptible_tasks(int this_cpu) +-{ +- int max_count = sysctl_hung_task_check_count; +- unsigned long now = get_timestamp(this_cpu); +- struct task_struct *g, *t; +- +- /* +- * If the system crashed already then all bets are off, +- * do not report extra hung tasks: +- */ +- if (test_taint(TAINT_DIE) || did_panic) +- return; +- +- read_lock(&tasklist_lock); +- do_each_thread(g, t) { +- if (!--max_count) +- goto unlock; +- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ +- if (t->state == TASK_UNINTERRUPTIBLE) +- check_hung_task(t, now); +- } while_each_thread(g, t); +- unlock: +- read_unlock(&tasklist_lock); +-} +- +-/* + * The watchdog thread - runs every second and touches the timestamp. + */ + static int watchdog(void *__bind_cpu) + { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +- int this_cpu = (long)__bind_cpu; + + sched_setscheduler(current, SCHED_FIFO, ¶m); + +@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu) + if (kthread_should_stop()) + break; + +- if (this_cpu == check_cpu) { +- if (sysctl_hung_task_timeout_secs) +- check_hung_uninterruptible_tasks(this_cpu); +- } +- + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: +- check_cpu = cpumask_any(cpu_online_mask); + wake_up_process(per_cpu(watchdog_task, hotcpu)); + break; + #ifdef CONFIG_HOTPLUG_CPU +- case CPU_DOWN_PREPARE: +- case CPU_DOWN_PREPARE_FROZEN: +- if (hotcpu == check_cpu) { +- /* Pick any other online cpu. */ +- check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); +- } +- break; +- + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!per_cpu(watchdog_task, hotcpu)) +Index: linux-2.6-tip/kernel/stop_machine.c +=================================================================== +--- linux-2.6-tip.orig/kernel/stop_machine.c ++++ linux-2.6-tip/kernel/stop_machine.c +@@ -40,6 +40,8 @@ static atomic_t thread_ack; + static DEFINE_MUTEX(lock); + /* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ + static DEFINE_MUTEX(setup_lock); ++/* do not start up until all worklets have been placed: */ ++static DEFINE_MUTEX(startup_lock); + /* Users of stop_machine. */ + static int refcount; + static struct workqueue_struct *stop_machine_wq; +@@ -71,6 +73,15 @@ static void stop_cpu(struct work_struct + int cpu = smp_processor_id(); + int err; + ++ /* ++ * Wait for the startup loop to finish: ++ */ ++ mutex_lock(&startup_lock); ++ /* ++ * Let other threads continue too: ++ */ ++ mutex_unlock(&startup_lock); ++ + if (!active_cpus) { + if (cpu == cpumask_first(cpu_online_mask)) + smdata = &active; +@@ -166,16 +177,21 @@ int __stop_machine(int (*fn)(void *), vo + + set_state(STOPMACHINE_PREPARE); + +- /* Schedule the stop_cpu work on all cpus: hold this CPU so one +- * doesn't hit this CPU until we're ready. 
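The startup_lock added to stop_machine.c is a simple gate: every stop_cpu() worker blocks on it first, and __stop_machine() releases it only after a work item has been queued on every CPU, so no CPU can start the state machine early. A userspace model of that gate with a plain mutex:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t startup_lock = PTHREAD_MUTEX_INITIALIZER;

static void *stop_cpu_model(void *arg)
{
    /* Wait for the startup loop to finish, then let the others continue */
    pthread_mutex_lock(&startup_lock);
    pthread_mutex_unlock(&startup_lock);

    printf("worker %ld running\n", (long)arg);
    return NULL;
}

int main(void)
{
    pthread_t workers[4];

    pthread_mutex_lock(&startup_lock);          /* hold the gate */
    for (long i = 0; i < 4; i++)
        pthread_create(&workers[i], NULL, stop_cpu_model, (void *)i);
    pthread_mutex_unlock(&startup_lock);        /* release all workers at once */

    for (int i = 0; i < 4; i++)
        pthread_join(workers[i], NULL);
    return 0;
}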
*/ +- get_cpu(); ++ /* ++ * Schedule the stop_cpu work on all cpus before allowing any ++ * of the CPUs to execute it: ++ */ ++ mutex_lock(&startup_lock); ++ + for_each_online_cpu(i) { +- sm_work = percpu_ptr(stop_machine_work, i); ++ sm_work = per_cpu_ptr(stop_machine_work, i); + INIT_WORK(sm_work, stop_cpu); + queue_work_on(i, stop_machine_wq, sm_work); + } +- /* This will release the thread on our CPU. */ +- put_cpu(); ++ ++ /* This will release the thread on all CPUs: */ ++ mutex_unlock(&startup_lock); ++ + flush_workqueue(stop_machine_wq); + ret = active.fnret; + mutex_unlock(&lock); +Index: linux-2.6-tip/kernel/sys.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sys.c ++++ linux-2.6-tip/kernel/sys.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -32,11 +33,13 @@ + #include + #include + #include ++#include + #include + #include + + #include + #include ++#include + #include + #include + +@@ -278,6 +281,15 @@ out_unlock: + */ + void emergency_restart(void) + { ++ /* ++ * Call the notifier chain if we are not in an ++ * atomic context: ++ */ ++#ifdef CONFIG_PREEMPT ++ if (!in_atomic() && !irqs_disabled()) ++ blocking_notifier_call_chain(&reboot_notifier_list, ++ SYS_RESTART, NULL); ++#endif + machine_emergency_restart(); + } + EXPORT_SYMBOL_GPL(emergency_restart); +@@ -1800,6 +1812,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsi + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; ++ case PR_TASK_PERF_COUNTERS_DISABLE: ++ error = perf_counter_task_disable(); ++ break; ++ case PR_TASK_PERF_COUNTERS_ENABLE: ++ error = perf_counter_task_enable(); ++ break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; +Index: linux-2.6-tip/kernel/sys_ni.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sys_ni.c ++++ linux-2.6-tip/kernel/sys_ni.c +@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime) + cond_syscall(compat_sys_timerfd_gettime); + cond_syscall(sys_eventfd); + cond_syscall(sys_eventfd2); ++ ++/* performance counters: */ ++cond_syscall(sys_perf_counter_open); +Index: linux-2.6-tip/kernel/sysctl.c +=================================================================== +--- linux-2.6-tip.orig/kernel/sysctl.c ++++ linux-2.6-tip/kernel/sysctl.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -812,6 +813,19 @@ static struct ctl_table kern_table[] = { + .extra1 = &neg_one, + .extra2 = &sixty, + }, ++#endif ++#ifdef CONFIG_DETECT_HUNG_TASK ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "hung_task_panic", ++ .data = &sysctl_hung_task_panic, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_check_count", +@@ -827,7 +841,7 @@ static struct ctl_table kern_table[] = { + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, +- .proc_handler = &proc_doulongvec_minmax, ++ .proc_handler = &proc_dohung_task_timeout_secs, + .strategy = &sysctl_intvec, + }, + { +@@ -887,6 +901,16 @@ static struct ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + #endif ++#ifdef CONFIG_KMEMCHECK ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "kmemcheck", ++ .data = &kmemcheck_enabled, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ 
}, ++#endif + #ifdef CONFIG_UNEVICTABLE_LRU + { + .ctl_name = CTL_UNNUMBERED, +Index: linux-2.6-tip/kernel/time/clockevents.c +=================================================================== +--- linux-2.6-tip.orig/kernel/time/clockevents.c ++++ linux-2.6-tip/kernel/time/clockevents.c +@@ -27,7 +27,7 @@ static LIST_HEAD(clockevents_released); + static RAW_NOTIFIER_HEAD(clockevents_chain); + + /* Protection for the above */ +-static DEFINE_SPINLOCK(clockevents_lock); ++static DEFINE_RAW_SPINLOCK(clockevents_lock); + + /** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds +@@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_e + if (dev->mode != mode) { + dev->set_mode(mode, dev); + dev->mode = mode; ++ ++ /* ++ * A nsec2cyc multiplicator of 0 is invalid and we'd crash ++ * on it, so fix it up and emit a warning: ++ */ ++ if (mode == CLOCK_EVT_MODE_ONESHOT) { ++ if (unlikely(!dev->mult)) { ++ dev->mult = 1; ++ WARN_ON(1); ++ } ++ } + } + } + +@@ -168,15 +179,6 @@ void clockevents_register_device(struct + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(!dev->cpumask); + +- /* +- * A nsec2cyc multiplicator of 0 is invalid and we'd crash +- * on it, so fix it up and emit a warning: +- */ +- if (unlikely(!dev->mult)) { +- dev->mult = 1; +- WARN_ON(1); +- } +- + spin_lock(&clockevents_lock); + + list_add(&dev->list, &clockevent_devices); +Index: linux-2.6-tip/kernel/time/ntp.c +=================================================================== +--- linux-2.6-tip.orig/kernel/time/ntp.c ++++ linux-2.6-tip/kernel/time/ntp.c +@@ -1,71 +1,129 @@ + /* +- * linux/kernel/time/ntp.c +- * + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. 
+ */ +- +-#include +-#include +-#include +-#include +-#include + #include +-#include + #include + #include +-#include ++#include ++#include ++#include ++#include ++#include ++#include + + /* +- * Timekeeping variables ++ * NTP timekeeping variables: + */ +-unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ +-unsigned long tick_nsec; /* ACTHZ period (nsec) */ +-u64 tick_length; +-static u64 tick_length_base; + +-static struct hrtimer leap_timer; ++/* USER_HZ period (usecs): */ ++unsigned long tick_usec = TICK_USEC; ++ ++/* ACTHZ period (nsecs): */ ++unsigned long tick_nsec; ++ ++u64 tick_length; ++static u64 tick_length_base; ++ ++static struct hrtimer leap_timer; + +-#define MAX_TICKADJ 500 /* microsecs */ +-#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ +- NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) ++#define MAX_TICKADJ 500LL /* usecs */ ++#define MAX_TICKADJ_SCALED \ ++ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) + + /* + * phase-lock loop variables + */ +-/* TIME_ERROR prevents overwriting the CMOS clock */ +-static int time_state = TIME_OK; /* clock synchronization status */ +-int time_status = STA_UNSYNC; /* clock status bits */ +-static long time_tai; /* TAI offset (s) */ +-static s64 time_offset; /* time adjustment (ns) */ +-static long time_constant = 2; /* pll time constant */ +-long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +-long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +-static s64 time_freq; /* frequency offset (scaled ns/s)*/ +-static long time_reftime; /* time at last adjustment (s) */ +-long time_adjust; +-static long ntp_tick_adj; + ++/* ++ * clock synchronization status ++ * ++ * (TIME_ERROR prevents overwriting the CMOS clock) ++ */ ++static int time_state = TIME_OK; ++ ++/* clock status bits: */ ++int time_status = STA_UNSYNC; ++ ++/* TAI offset (secs): */ ++static long time_tai; ++ ++/* time adjustment (nsecs): */ ++static s64 time_offset; ++ ++/* pll time constant: */ ++static long time_constant = 2; ++ ++/* maximum error (usecs): */ ++long time_maxerror = NTP_PHASE_LIMIT; ++ ++/* estimated error (usecs): */ ++long time_esterror = NTP_PHASE_LIMIT; ++ ++/* frequency offset (scaled nsecs/secs): */ ++static s64 time_freq; ++ ++/* time at last adjustment (secs): */ ++static long time_reftime; ++ ++long time_adjust; ++ ++/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ ++static s64 ntp_tick_adj; ++ ++/* ++ * NTP methods: ++ */ ++ ++/* ++ * Update (tick_length, tick_length_base, tick_nsec), based ++ * on (tick_usec, ntp_tick_adj, time_freq): ++ */ + static void ntp_update_frequency(void) + { +- u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) +- << NTP_SCALE_SHIFT; +- second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; +- second_length += time_freq; ++ u64 second_length; ++ u64 new_base; ++ ++ second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) ++ << NTP_SCALE_SHIFT; + +- tick_length_base = second_length; ++ second_length += ntp_tick_adj; ++ second_length += time_freq; + +- tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; +- tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); ++ tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; ++ new_base = div_u64(second_length, NTP_INTERVAL_FREQ); ++ ++ /* ++ * Don't wait for the next second_overflow, apply ++ * the change to the tick length immediately: ++ */ ++ tick_length += new_base - tick_length_base; ++ tick_length_base = new_base; ++} ++ ++static inline s64 
ntp_update_offset_fll(s64 offset64, long secs) ++{ ++ time_status &= ~STA_MODE; ++ ++ if (secs < MINSEC) ++ return 0; ++ ++ if (!(time_status & STA_FLL) && (secs <= MAXSEC)) ++ return 0; ++ ++ time_status |= STA_MODE; ++ ++ return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); + } + + static void ntp_update_offset(long offset) + { +- long mtemp; + s64 freq_adj; ++ s64 offset64; ++ long secs; + + if (!(time_status & STA_PLL)) + return; +@@ -84,24 +142,23 @@ static void ntp_update_offset(long offse + * Select how the frequency is to be controlled + * and in which mode (PLL or FLL). + */ +- if (time_status & STA_FREQHOLD || time_reftime == 0) +- time_reftime = xtime.tv_sec; +- mtemp = xtime.tv_sec - time_reftime; ++ secs = xtime.tv_sec - time_reftime; ++ if (unlikely(time_status & STA_FREQHOLD)) ++ secs = 0; ++ + time_reftime = xtime.tv_sec; + +- freq_adj = (s64)offset * mtemp; +- freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); +- time_status &= ~STA_MODE; +- if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { +- freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL), +- mtemp); +- time_status |= STA_MODE; +- } +- freq_adj += time_freq; +- freq_adj = min(freq_adj, MAXFREQ_SCALED); +- time_freq = max(freq_adj, -MAXFREQ_SCALED); ++ offset64 = offset; ++ freq_adj = (offset64 * secs) << ++ (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + +- time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); ++ freq_adj += ntp_update_offset_fll(offset64, secs); ++ ++ freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); ++ ++ time_freq = max(freq_adj, -MAXFREQ_SCALED); ++ ++ time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); + } + + /** +@@ -111,15 +168,15 @@ static void ntp_update_offset(long offse + */ + void ntp_clear(void) + { +- time_adjust = 0; /* stop active adjtime() */ +- time_status |= STA_UNSYNC; +- time_maxerror = NTP_PHASE_LIMIT; +- time_esterror = NTP_PHASE_LIMIT; ++ time_adjust = 0; /* stop active adjtime() */ ++ time_status |= STA_UNSYNC; ++ time_maxerror = NTP_PHASE_LIMIT; ++ time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + +- tick_length = tick_length_base; +- time_offset = 0; ++ tick_length = tick_length_base; ++ time_offset = 0; + } + + /* +@@ -140,8 +197,8 @@ static enum hrtimer_restart ntp_leap_sec + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + time_state = TIME_OOP; +- printk(KERN_NOTICE "Clock: " +- "inserting leap second 23:59:60 UTC\n"); ++ printk(KERN_NOTICE ++ "Clock: inserting leap second 23:59:60 UTC\n"); + hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); + res = HRTIMER_RESTART; + break; +@@ -150,8 +207,8 @@ static enum hrtimer_restart ntp_leap_sec + time_tai--; + wall_to_monotonic.tv_sec--; + time_state = TIME_WAIT; +- printk(KERN_NOTICE "Clock: " +- "deleting leap second 23:59:59 UTC\n"); ++ printk(KERN_NOTICE ++ "Clock: deleting leap second 23:59:59 UTC\n"); + break; + case TIME_OOP: + time_tai++; +@@ -179,7 +236,7 @@ static enum hrtimer_restart ntp_leap_sec + */ + void second_overflow(void) + { +- s64 time_adj; ++ s64 delta; + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ / NSEC_PER_USEC; +@@ -192,24 +249,30 @@ void second_overflow(void) + * Compute the phase adjustment for the next second. The offset is + * reduced by a fixed factor times the time constant. 
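ntp_update_offset_fll() contributes a frequency-lock term only when enough time has passed since the last adjustment. A small model of that decision and of the scaled division it returns; the constants mirror include/linux/timex.h of that era and are assumed here, not taken from this patch:

#include <stdio.h>
#include <stdint.h>

#define NTP_SCALE_SHIFT 32
#define SHIFT_FLL       2
#define MINSEC          256
#define MAXSEC          2048

static int64_t fll_term(int64_t offset_ns, long secs, int sta_fll)
{
    if (secs < MINSEC)
        return 0;                       /* too soon since the last update */
    if (!sta_fll && secs <= MAXSEC)
        return 0;                       /* PLL mode and interval short enough */

    /* scaled ns/s contribution folded into freq_adj by the caller */
    return (offset_ns << (NTP_SCALE_SHIFT - SHIFT_FLL)) / secs;
}

int main(void)
{
    /* 1 ms offset, 4096 s since the last adjustment, PLL mode */
    printf("FLL term: %lld\n", (long long)fll_term(1000000, 4096, 0));
    return 0;
}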
+ */ +- tick_length = tick_length_base; +- time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); +- time_offset -= time_adj; +- tick_length += time_adj; +- +- if (unlikely(time_adjust)) { +- if (time_adjust > MAX_TICKADJ) { +- time_adjust -= MAX_TICKADJ; +- tick_length += MAX_TICKADJ_SCALED; +- } else if (time_adjust < -MAX_TICKADJ) { +- time_adjust += MAX_TICKADJ; +- tick_length -= MAX_TICKADJ_SCALED; +- } else { +- tick_length += (s64)(time_adjust * NSEC_PER_USEC / +- NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; +- time_adjust = 0; +- } ++ tick_length = tick_length_base; ++ ++ delta = shift_right(time_offset, SHIFT_PLL + time_constant); ++ time_offset -= delta; ++ tick_length += delta; ++ ++ if (!time_adjust) ++ return; ++ ++ if (time_adjust > MAX_TICKADJ) { ++ time_adjust -= MAX_TICKADJ; ++ tick_length += MAX_TICKADJ_SCALED; ++ return; + } ++ ++ if (time_adjust < -MAX_TICKADJ) { ++ time_adjust += MAX_TICKADJ; ++ tick_length -= MAX_TICKADJ_SCALED; ++ return; ++ } ++ ++ tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) ++ << NTP_SCALE_SHIFT; ++ time_adjust = 0; + } + + #ifdef CONFIG_GENERIC_CMOS_UPDATE +@@ -233,12 +296,13 @@ static void sync_cmos_clock(struct work_ + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ +- if (!ntp_synced()) ++ if (!ntp_synced()) { + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). + */ + return; ++ } + + getnstimeofday(&now); + if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) +@@ -270,7 +334,116 @@ static void notify_cmos_timer(void) + static inline void notify_cmos_timer(void) { } + #endif + +-/* adjtimex mainly allows reading (and writing, if superuser) of ++/* ++ * Start the leap seconds timer: ++ */ ++static inline void ntp_start_leap_timer(struct timespec *ts) ++{ ++ long now = ts->tv_sec; ++ ++ if (time_status & STA_INS) { ++ time_state = TIME_INS; ++ now += 86400 - now % 86400; ++ hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); ++ ++ return; ++ } ++ ++ if (time_status & STA_DEL) { ++ time_state = TIME_DEL; ++ now += 86400 - (now + 1) % 86400; ++ hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); ++ } ++} ++ ++/* ++ * Propagate a new txc->status value into the NTP state: ++ */ ++static inline void process_adj_status(struct timex *txc, struct timespec *ts) ++{ ++ if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { ++ time_state = TIME_OK; ++ time_status = STA_UNSYNC; ++ } ++ ++ /* ++ * If we turn on PLL adjustments then reset the ++ * reference time to current time. 
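The straightened-out second_overflow() applies at most MAX_TICKADJ microseconds of an outstanding adjtime() request per second. A userspace model of that slewing, with the NTP_SCALE_SHIFT scaling left out for clarity:

#include <stdio.h>

#define MAX_TICKADJ 500                 /* usecs per second, as in the patch */

static long time_adjust;                /* usecs still to slew */

static long slew_one_second(void)
{
    long applied;

    if (time_adjust > MAX_TICKADJ)
        applied = MAX_TICKADJ;
    else if (time_adjust < -MAX_TICKADJ)
        applied = -MAX_TICKADJ;
    else
        applied = time_adjust;          /* final partial step */

    time_adjust -= applied;
    return applied;                     /* usecs added to this second's ticks */
}

int main(void)
{
    time_adjust = 1200;                 /* e.g. adjtime() asked for +1.2 ms */
    while (time_adjust)
        printf("apply %ld us this second\n", slew_one_second());
    return 0;
}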
++ */ ++ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) ++ time_reftime = xtime.tv_sec; ++ ++ /* only set allowed bits */ ++ time_status &= STA_RONLY; ++ time_status |= txc->status & ~STA_RONLY; ++ ++ switch (time_state) { ++ case TIME_OK: ++ ntp_start_leap_timer(ts); ++ break; ++ case TIME_INS: ++ case TIME_DEL: ++ time_state = TIME_OK; ++ ntp_start_leap_timer(ts); ++ case TIME_WAIT: ++ if (!(time_status & (STA_INS | STA_DEL))) ++ time_state = TIME_OK; ++ break; ++ case TIME_OOP: ++ hrtimer_restart(&leap_timer); ++ break; ++ } ++} ++/* ++ * Called with the xtime lock held, so we can access and modify ++ * all the global NTP state: ++ */ ++static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) ++{ ++ if (txc->modes & ADJ_STATUS) ++ process_adj_status(txc, ts); ++ ++ if (txc->modes & ADJ_NANO) ++ time_status |= STA_NANO; ++ ++ if (txc->modes & ADJ_MICRO) ++ time_status &= ~STA_NANO; ++ ++ if (txc->modes & ADJ_FREQUENCY) { ++ time_freq = txc->freq * PPM_SCALE; ++ time_freq = min(time_freq, MAXFREQ_SCALED); ++ time_freq = max(time_freq, -MAXFREQ_SCALED); ++ } ++ ++ if (txc->modes & ADJ_MAXERROR) ++ time_maxerror = txc->maxerror; ++ ++ if (txc->modes & ADJ_ESTERROR) ++ time_esterror = txc->esterror; ++ ++ if (txc->modes & ADJ_TIMECONST) { ++ time_constant = txc->constant; ++ if (!(time_status & STA_NANO)) ++ time_constant += 4; ++ time_constant = min(time_constant, (long)MAXTC); ++ time_constant = max(time_constant, 0l); ++ } ++ ++ if (txc->modes & ADJ_TAI && txc->constant > 0) ++ time_tai = txc->constant; ++ ++ if (txc->modes & ADJ_OFFSET) ++ ntp_update_offset(txc->offset); ++ ++ if (txc->modes & ADJ_TICK) ++ tick_usec = txc->tick; ++ ++ if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) ++ ntp_update_frequency(); ++} ++ ++/* ++ * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ + int do_adjtimex(struct timex *txc) +@@ -291,11 +464,14 @@ int do_adjtimex(struct timex *txc) + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + +- /* if the quartz is off by more than 10% something is VERY wrong! */ ++ /* ++ * if the quartz is off by more than 10% then ++ * something is VERY wrong! 
++ */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) +- return -EINVAL; ++ return -EINVAL; + + if (txc->modes & ADJ_STATUS && time_state != TIME_OK) + hrtimer_cancel(&leap_timer); +@@ -305,7 +481,6 @@ int do_adjtimex(struct timex *txc) + + write_seqlock_irq(&xtime_lock); + +- /* If there are input parameters, then process them */ + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + +@@ -315,98 +490,24 @@ int do_adjtimex(struct timex *txc) + ntp_update_frequency(); + } + txc->offset = save_adjust; +- goto adj_done; +- } +- if (txc->modes) { +- long sec; +- +- if (txc->modes & ADJ_STATUS) { +- if ((time_status & STA_PLL) && +- !(txc->status & STA_PLL)) { +- time_state = TIME_OK; +- time_status = STA_UNSYNC; +- } +- /* only set allowed bits */ +- time_status &= STA_RONLY; +- time_status |= txc->status & ~STA_RONLY; +- +- switch (time_state) { +- case TIME_OK: +- start_timer: +- sec = ts.tv_sec; +- if (time_status & STA_INS) { +- time_state = TIME_INS; +- sec += 86400 - sec % 86400; +- hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); +- } else if (time_status & STA_DEL) { +- time_state = TIME_DEL; +- sec += 86400 - (sec + 1) % 86400; +- hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); +- } +- break; +- case TIME_INS: +- case TIME_DEL: +- time_state = TIME_OK; +- goto start_timer; +- break; +- case TIME_WAIT: +- if (!(time_status & (STA_INS | STA_DEL))) +- time_state = TIME_OK; +- break; +- case TIME_OOP: +- hrtimer_restart(&leap_timer); +- break; +- } +- } +- +- if (txc->modes & ADJ_NANO) +- time_status |= STA_NANO; +- if (txc->modes & ADJ_MICRO) +- time_status &= ~STA_NANO; +- +- if (txc->modes & ADJ_FREQUENCY) { +- time_freq = (s64)txc->freq * PPM_SCALE; +- time_freq = min(time_freq, MAXFREQ_SCALED); +- time_freq = max(time_freq, -MAXFREQ_SCALED); +- } +- +- if (txc->modes & ADJ_MAXERROR) +- time_maxerror = txc->maxerror; +- if (txc->modes & ADJ_ESTERROR) +- time_esterror = txc->esterror; +- +- if (txc->modes & ADJ_TIMECONST) { +- time_constant = txc->constant; +- if (!(time_status & STA_NANO)) +- time_constant += 4; +- time_constant = min(time_constant, (long)MAXTC); +- time_constant = max(time_constant, 0l); +- } +- +- if (txc->modes & ADJ_TAI && txc->constant > 0) +- time_tai = txc->constant; +- +- if (txc->modes & ADJ_OFFSET) +- ntp_update_offset(txc->offset); +- if (txc->modes & ADJ_TICK) +- tick_usec = txc->tick; ++ } else { + +- if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) +- ntp_update_frequency(); +- } ++ /* If there are input parameters, then process them: */ ++ if (txc->modes) ++ process_adjtimex_modes(txc, &ts); + +- txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, ++ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); +- if (!(time_status & STA_NANO)) +- txc->offset /= NSEC_PER_USEC; ++ if (!(time_status & STA_NANO)) ++ txc->offset /= NSEC_PER_USEC; ++ } + +-adj_done: + result = time_state; /* mostly `TIME_OK' */ + if (time_status & (STA_UNSYNC|STA_CLOCKERR)) + result = TIME_ERROR; + + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * +- (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); ++ PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; +@@ -425,6 +526,7 @@ adj_done: + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; ++ + write_sequnlock_irq(&xtime_lock); + + txc->time.tv_sec = ts.tv_sec; +@@ -440,6 +542,8 @@ adj_done: + static int __init 
ntp_tick_adj_setup(char *str) + { + ntp_tick_adj = simple_strtol(str, NULL, 0); ++ ntp_tick_adj <<= NTP_SCALE_SHIFT; ++ + return 1; + } + +Index: linux-2.6-tip/kernel/time/timekeeping.c +=================================================================== +--- linux-2.6-tip.orig/kernel/time/timekeeping.c ++++ linux-2.6-tip/kernel/time/timekeeping.c +@@ -22,9 +22,9 @@ + + /* + * This read-write spinlock protects us from races in SMP while +- * playing with xtime and avenrun. ++ * playing with xtime. + */ +-__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); ++__cacheline_aligned_in_smp DEFINE_RAW_SEQLOCK(xtime_lock); + + + /* +Index: linux-2.6-tip/kernel/timer.c +=================================================================== +--- linux-2.6-tip.orig/kernel/timer.c ++++ linux-2.6-tip/kernel/timer.c +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -69,6 +70,7 @@ struct tvec_root { + struct tvec_base { + spinlock_t lock; + struct timer_list *running_timer; ++ wait_queue_head_t wait_for_running_timer; + unsigned long timer_jiffies; + struct tvec_root tv1; + struct tvec tv2; +@@ -316,9 +318,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relat + static inline void set_running_timer(struct tvec_base *base, + struct timer_list *timer) + { +-#ifdef CONFIG_SMP + base->running_timer = timer; +-#endif + } + + static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +@@ -491,14 +491,18 @@ static inline void debug_timer_free(stru + debug_object_free(timer, &timer_debug_descr); + } + +-static void __init_timer(struct timer_list *timer); +- +-void init_timer_on_stack(struct timer_list *timer) ++static void __init_timer(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key); ++ ++void init_timer_on_stack_key(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key) + { + debug_object_init_on_stack(timer, &timer_debug_descr); +- __init_timer(timer); ++ __init_timer(timer, name, key); + } +-EXPORT_SYMBOL_GPL(init_timer_on_stack); ++EXPORT_SYMBOL_GPL(init_timer_on_stack_key); + + void destroy_timer_on_stack(struct timer_list *timer) + { +@@ -512,7 +516,9 @@ static inline void debug_timer_activate( + static inline void debug_timer_deactivate(struct timer_list *timer) { } + #endif + +-static void __init_timer(struct timer_list *timer) ++static void __init_timer(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key) + { + timer->entry.next = NULL; + timer->base = __raw_get_cpu_var(tvec_bases); +@@ -521,6 +527,7 @@ static void __init_timer(struct timer_li + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); + #endif ++ lockdep_init_map(&timer->lockdep_map, name, key, 0); + } + + /** +@@ -530,19 +537,23 @@ static void __init_timer(struct timer_li + * init_timer() must be done to a timer prior calling *any* of the + * other timer functions. 
+ */ +-void init_timer(struct timer_list *timer) ++void init_timer_key(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key) + { + debug_timer_init(timer); +- __init_timer(timer); ++ __init_timer(timer, name, key); + } +-EXPORT_SYMBOL(init_timer); ++EXPORT_SYMBOL(init_timer_key); + +-void init_timer_deferrable(struct timer_list *timer) ++void init_timer_deferrable_key(struct timer_list *timer, ++ const char *name, ++ struct lock_class_key *key) + { +- init_timer(timer); ++ init_timer_key(timer, name, key); + timer_set_deferrable(timer); + } +-EXPORT_SYMBOL(init_timer_deferrable); ++EXPORT_SYMBOL(init_timer_deferrable_key); + + static inline void detach_timer(struct timer_list *timer, + int clear_pending) +@@ -589,11 +600,12 @@ static struct tvec_base *lock_timer_base + } + } + +-int __mod_timer(struct timer_list *timer, unsigned long expires) ++static inline int ++__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) + { + struct tvec_base *base, *new_base; + unsigned long flags; +- int ret = 0; ++ int cpu, ret = 0; + + timer_stats_timer_set_start_info(timer); + BUG_ON(!timer->function); +@@ -603,11 +615,15 @@ int __mod_timer(struct timer_list *timer + if (timer_pending(timer)) { + detach_timer(timer, 0); + ret = 1; ++ } else { ++ if (pending_only) ++ goto out_unlock; + } + + debug_timer_activate(timer); + +- new_base = __get_cpu_var(tvec_bases); ++ cpu = raw_smp_processor_id(); ++ new_base = per_cpu(tvec_bases, cpu); + + if (base != new_base) { + /* +@@ -629,42 +645,28 @@ int __mod_timer(struct timer_list *timer + + timer->expires = expires; + internal_add_timer(base, timer); ++ ++out_unlock: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; + } + +-EXPORT_SYMBOL(__mod_timer); +- + /** +- * add_timer_on - start a timer on a particular CPU +- * @timer: the timer to be added +- * @cpu: the CPU to start it on ++ * mod_timer_pending - modify a pending timer's timeout ++ * @timer: the pending timer to be modified ++ * @expires: new timeout in jiffies + * +- * This is not very scalable on SMP. Double adds are not possible. ++ * mod_timer_pending() is the same for pending timers as mod_timer(), ++ * but will not re-activate and modify already deleted timers. ++ * ++ * It is useful for unserialized use of timers. + */ +-void add_timer_on(struct timer_list *timer, int cpu) ++int mod_timer_pending(struct timer_list *timer, unsigned long expires) + { +- struct tvec_base *base = per_cpu(tvec_bases, cpu); +- unsigned long flags; +- +- timer_stats_timer_set_start_info(timer); +- BUG_ON(timer_pending(timer) || !timer->function); +- spin_lock_irqsave(&base->lock, flags); +- timer_set_base(timer, base); +- debug_timer_activate(timer); +- internal_add_timer(base, timer); +- /* +- * Check whether the other CPU is idle and needs to be +- * triggered to reevaluate the timer wheel when nohz is +- * active. We are protected against the other CPU fiddling +- * with the timer by holding the timer base lock. This also +- * makes sure that a CPU on the way to idle can not evaluate +- * the timer wheel. 
+- */ +- wake_up_idle_cpu(cpu); +- spin_unlock_irqrestore(&base->lock, flags); ++ return __mod_timer(timer, expires, true); + } ++EXPORT_SYMBOL(mod_timer_pending); + + /** + * mod_timer - modify a timer's timeout +@@ -688,9 +690,6 @@ void add_timer_on(struct timer_list *tim + */ + int mod_timer(struct timer_list *timer, unsigned long expires) + { +- BUG_ON(!timer->function); +- +- timer_stats_timer_set_start_info(timer); + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified +@@ -699,12 +698,74 @@ int mod_timer(struct timer_list *timer, + if (timer->expires == expires && timer_pending(timer)) + return 1; + +- return __mod_timer(timer, expires); ++ return __mod_timer(timer, expires, false); + } +- + EXPORT_SYMBOL(mod_timer); + + /** ++ * add_timer - start a timer ++ * @timer: the timer to be added ++ * ++ * The kernel will do a ->function(->data) callback from the ++ * timer interrupt at the ->expires point in the future. The ++ * current time is 'jiffies'. ++ * ++ * The timer's ->expires, ->function (and if the handler uses it, ->data) ++ * fields must be set prior calling this function. ++ * ++ * Timers with an ->expires field in the past will be executed in the next ++ * timer tick. ++ */ ++void add_timer(struct timer_list *timer) ++{ ++ BUG_ON(timer_pending(timer)); ++ mod_timer(timer, timer->expires); ++} ++EXPORT_SYMBOL(add_timer); ++ ++/** ++ * add_timer_on - start a timer on a particular CPU ++ * @timer: the timer to be added ++ * @cpu: the CPU to start it on ++ * ++ * This is not very scalable on SMP. Double adds are not possible. ++ */ ++void add_timer_on(struct timer_list *timer, int cpu) ++{ ++ struct tvec_base *base = per_cpu(tvec_bases, cpu); ++ unsigned long flags; ++ ++ timer_stats_timer_set_start_info(timer); ++ BUG_ON(timer_pending(timer) || !timer->function); ++ spin_lock_irqsave(&base->lock, flags); ++ timer_set_base(timer, base); ++ debug_timer_activate(timer); ++ internal_add_timer(base, timer); ++ /* ++ * Check whether the other CPU is idle and needs to be ++ * triggered to reevaluate the timer wheel when nohz is ++ * active. We are protected against the other CPU fiddling ++ * with the timer by holding the timer base lock. This also ++ * makes sure that a CPU on the way to idle can not evaluate ++ * the timer wheel. ++ */ ++ wake_up_idle_cpu(cpu); ++ spin_unlock_irqrestore(&base->lock, flags); ++} ++ ++/* ++ * Wait for a running timer ++ */ ++void wait_for_running_timer(struct timer_list *timer) ++{ ++ struct tvec_base *base = timer->base; ++ ++ if (base->running_timer == timer) ++ wait_event(base->wait_for_running_timer, ++ base->running_timer != timer); ++} ++ ++/** + * del_timer - deactive a timer. + * @timer: the timer to be deactivated + * +@@ -733,10 +794,36 @@ int del_timer(struct timer_list *timer) + + return ret; + } +- + EXPORT_SYMBOL(del_timer); + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) ++/* ++ * This function checks whether a timer is active and not running on any ++ * CPU. Upon successful (ret >= 0) exit the timer is not queued and the ++ * handler is not running on any CPU. ++ * ++ * It must not be called from interrupt contexts. 
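For reference, a minimal sketch of how a driver typically uses the timer API being reworked here (2.6-era unsigned long callback signature; the demo_* names are illustrative only):

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timer_fn(unsigned long data)
{
	pr_info("demo timer fired, data=%lu\n", data);
}

static void demo_start(void)
{
	setup_timer(&demo_timer, demo_timer_fn, 0UL);
	mod_timer(&demo_timer, jiffies + HZ);	/* fire in ~1 second */
}

static void demo_stop(void)
{
	/*
	 * Waits for a running handler to finish; with this patch the wait
	 * sleeps in wait_for_running_timer() instead of spinning.
	 */
	del_timer_sync(&demo_timer);
}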
++ */ ++int timer_pending_sync(struct timer_list *timer) ++{ ++ struct tvec_base *base; ++ unsigned long flags; ++ int ret = -1; ++ ++ base = lock_timer_base(timer, &flags); ++ ++ if (base->running_timer == timer) ++ goto out; ++ ++ ret = 0; ++ if (timer_pending(timer)) ++ ret = 1; ++out: ++ spin_unlock_irqrestore(&base->lock, flags); ++ ++ return ret; ++} ++ + /** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del +@@ -767,7 +854,6 @@ out: + + return ret; + } +- + EXPORT_SYMBOL(try_to_del_timer_sync); + + /** +@@ -789,14 +875,22 @@ EXPORT_SYMBOL(try_to_del_timer_sync); + */ + int del_timer_sync(struct timer_list *timer) + { ++#ifdef CONFIG_LOCKDEP ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ lock_map_acquire(&timer->lockdep_map); ++ lock_map_release(&timer->lockdep_map); ++ local_irq_restore(flags); ++#endif ++ + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; +- cpu_relax(); ++ wait_for_running_timer(timer); + } + } +- + EXPORT_SYMBOL(del_timer_sync); + #endif + +@@ -839,6 +933,20 @@ static inline void __run_timers(struct t + struct list_head *head = &work_list; + int index = base->timer_jiffies & TVR_MASK; + ++ if (softirq_need_resched()) { ++ spin_unlock_irq(&base->lock); ++ wake_up(&base->wait_for_running_timer); ++ cond_resched_softirq_context(); ++ cpu_relax(); ++ spin_lock_irq(&base->lock); ++ /* ++ * We can simply continue after preemption, nobody ++ * else can touch timer_jiffies so 'index' is still ++ * valid. Any new jiffy will be taken care of in ++ * subsequent loops: ++ */ ++ } ++ + /* + * Cascade timers: + */ +@@ -861,23 +969,48 @@ static inline void __run_timers(struct t + + set_running_timer(base, timer); + detach_timer(timer, 1); ++ + spin_unlock_irq(&base->lock); + { + int preempt_count = preempt_count(); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * It is permissible to free the timer from ++ * inside the function that is called from ++ * it, this we need to take into account for ++ * lockdep too. To avoid bogus "held lock ++ * freed" warnings as well as problems when ++ * looking into timer->lockdep_map, make a ++ * copy and use that here. ++ */ ++ struct lockdep_map lockdep_map = ++ timer->lockdep_map; ++#endif ++ /* ++ * Couple the lock chain with the lock chain at ++ * del_timer_sync() by acquiring the lock_map ++ * around the fn() call here and in ++ * del_timer_sync(). ++ */ ++ lock_map_acquire(&lockdep_map); ++ + fn(data); ++ ++ lock_map_release(&lockdep_map); ++ + if (preempt_count != preempt_count()) { +- printk(KERN_ERR "huh, entered %p " +- "with preempt_count %08x, exited" +- " with %08x?\n", +- fn, preempt_count, +- preempt_count()); +- BUG(); ++ print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); ++ printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); ++ preempt_count() = preempt_count; + } + } ++ set_running_timer(base, NULL); ++ cond_resched_softirq_context(); + spin_lock_irq(&base->lock); + } + } +- set_running_timer(base, NULL); ++ wake_up(&base->wait_for_running_timer); + spin_unlock_irq(&base->lock); + } + +@@ -1007,9 +1140,22 @@ unsigned long get_next_timer_interrupt(u + struct tvec_base *base = __get_cpu_var(tvec_bases); + unsigned long expires; + ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * On PREEMPT_RT we cannot sleep here. 
If the trylock does not ++ * succeed then we return the worst-case 'expires in 1 tick' ++ * value: ++ */ ++ if (spin_trylock(&base->lock)) { ++ expires = __next_timer_interrupt(base); ++ spin_unlock(&base->lock); ++ } else ++ expires = now + 1; ++#else + spin_lock(&base->lock); + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); ++#endif + + if (time_before_eq(expires, now)) + return now; +@@ -1029,62 +1175,21 @@ void update_process_times(int user_tick) + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); ++ scheduler_tick(); + run_local_timers(); + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_tick); +- printk_tick(); +- scheduler_tick(); + run_posix_cpu_timers(p); + } + + /* +- * Nr of active tasks - counted in fixed-point numbers +- */ +-static unsigned long count_active_tasks(void) +-{ +- return nr_active() * FIXED_1; +-} +- +-/* +- * Hmm.. Changed this, as the GNU make sources (load.c) seems to +- * imply that avenrun[] is the standard name for this kind of thing. +- * Nothing else seems to be standardized: the fractional size etc +- * all seem to differ on different machines. +- * +- * Requires xtime_lock to access. +- */ +-unsigned long avenrun[3]; +- +-EXPORT_SYMBOL(avenrun); +- +-/* +- * calc_load - given tick count, update the avenrun load estimates. +- * This is called while holding a write_lock on xtime_lock. +- */ +-static inline void calc_load(unsigned long ticks) +-{ +- unsigned long active_tasks; /* fixed-point */ +- static int count = LOAD_FREQ; +- +- count -= ticks; +- if (unlikely(count < 0)) { +- active_tasks = count_active_tasks(); +- do { +- CALC_LOAD(avenrun[0], EXP_1, active_tasks); +- CALC_LOAD(avenrun[1], EXP_5, active_tasks); +- CALC_LOAD(avenrun[2], EXP_15, active_tasks); +- count += LOAD_FREQ; +- } while (count < 0); +- } +-} +- +-/* + * This function runs timers and the timer-tq in bottom half context. + */ + static void run_timer_softirq(struct softirq_action *h) + { +- struct tvec_base *base = __get_cpu_var(tvec_bases); ++ struct tvec_base *base = per_cpu(tvec_bases, raw_smp_processor_id()); + ++ printk_tick(); + hrtimer_run_pending(); + + if (time_after_eq(jiffies, base->timer_jiffies)) +@@ -1102,16 +1207,6 @@ void run_local_timers(void) + } + + /* +- * Called by the timer interrupt. xtime_lock must already be taken +- * by the timer IRQ! +- */ +-static inline void update_times(unsigned long ticks) +-{ +- update_wall_time(); +- calc_load(ticks); +-} +- +-/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... +@@ -1120,7 +1215,8 @@ static inline void update_times(unsigned + void do_timer(unsigned long ticks) + { + jiffies_64 += ticks; +- update_times(ticks); ++ update_wall_time(); ++ calc_global_load(); + } + + #ifdef __ARCH_WANT_SYS_ALARM +@@ -1268,7 +1364,7 @@ signed long __sched schedule_timeout(sig + expire = timeout + jiffies; + + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); +- __mod_timer(&timer, expire); ++ __mod_timer(&timer, expire, false); + schedule(); + del_singleshot_timer_sync(&timer); + +@@ -1321,37 +1417,17 @@ int do_sysinfo(struct sysinfo *info) + { + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; +- unsigned long seq; ++ struct timespec tp; + + memset(info, 0, sizeof(struct sysinfo)); + +- do { +- struct timespec tp; +- seq = read_seqbegin(&xtime_lock); +- +- /* +- * This is annoying. 
The below is the same thing +- * posix_get_clock_monotonic() does, but it wants to +- * take the lock which we want to cover the loads stuff +- * too. +- */ +- +- getnstimeofday(&tp); +- tp.tv_sec += wall_to_monotonic.tv_sec; +- tp.tv_nsec += wall_to_monotonic.tv_nsec; +- monotonic_to_bootbased(&tp); +- if (tp.tv_nsec - NSEC_PER_SEC >= 0) { +- tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; +- tp.tv_sec++; +- } +- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); ++ ktime_get_ts(&tp); ++ monotonic_to_bootbased(&tp); ++ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + +- info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); +- info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); +- info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); ++ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); + +- info->procs = nr_threads; +- } while (read_seqretry(&xtime_lock, seq)); ++ info->procs = nr_threads; + + si_meminfo(info); + si_swapinfo(info); +@@ -1454,6 +1530,7 @@ static int __cpuinit init_timers_cpu(int + } + + spin_lock_init(&base->lock); ++ init_waitqueue_head(&base->wait_for_running_timer); + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); +@@ -1485,6 +1562,7 @@ static void __cpuinit migrate_timers(int + { + struct tvec_base *old_base; + struct tvec_base *new_base; ++ unsigned long flags; + int i; + + BUG_ON(cpu_online(cpu)); +@@ -1494,8 +1572,11 @@ static void __cpuinit migrate_timers(int + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ +- spin_lock_irq(&new_base->lock); +- spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); ++ local_irq_save(flags); ++ while (!spin_trylock(&new_base->lock)) ++ cpu_relax(); ++ while (!spin_trylock(&old_base->lock)) ++ cpu_relax(); + + BUG_ON(old_base->running_timer); + +@@ -1509,7 +1590,9 @@ static void __cpuinit migrate_timers(int + } + + spin_unlock(&old_base->lock); +- spin_unlock_irq(&new_base->lock); ++ spin_unlock(&new_base->lock); ++ local_irq_restore(flags); ++ + put_cpu_var(tvec_bases); + } + #endif /* CONFIG_HOTPLUG_CPU */ +Index: linux-2.6-tip/kernel/trace/Kconfig +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/Kconfig ++++ linux-2.6-tip/kernel/trace/Kconfig +@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT + config NOP_TRACER + bool + ++config HAVE_FTRACE_NMI_ENTER ++ bool ++ + config HAVE_FUNCTION_TRACER + bool + +@@ -31,12 +34,20 @@ config HAVE_FTRACE_MCOUNT_RECORD + config HAVE_HW_BRANCH_TRACER + bool + ++config HAVE_FTRACE_SYSCALLS ++ bool ++ + config TRACER_MAX_TRACE + bool + + config RING_BUFFER + bool + ++config FTRACE_NMI_ENTER ++ bool ++ depends on HAVE_FTRACE_NMI_ENTER ++ default y ++ + config TRACING + bool + select DEBUG_FS +@@ -44,13 +55,29 @@ config TRACING + select STACKTRACE if STACKTRACE_SUPPORT + select TRACEPOINTS + select NOP_TRACER ++ select BINARY_PRINTF ++ ++# ++# Minimum requirements an architecture has to meet for us to ++# be able to offer generic tracing facilities: ++# ++config TRACING_SUPPORT ++ bool ++ # PPC32 has no irqflags tracing support, but it can use most of the ++ # tracers anyway, they were tested to build and work. Note that new ++ # exceptions to this list aren't welcomed, better implement the ++ # irqflags tracing for your architecture. 
++ depends on TRACE_IRQFLAGS_SUPPORT || PPC32 ++ depends on STACKTRACE_SUPPORT ++ default y ++ ++if TRACING_SUPPORT + + menu "Tracers" + + config FUNCTION_TRACER + bool "Kernel Function Tracer" + depends on HAVE_FUNCTION_TRACER +- depends on DEBUG_KERNEL + select FRAME_POINTER + select KALLSYMS + select TRACING +@@ -83,7 +110,6 @@ config IRQSOFF_TRACER + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME +- depends on DEBUG_KERNEL + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE +@@ -106,7 +132,6 @@ config PREEMPT_TRACER + default n + depends on GENERIC_TIME + depends on PREEMPT +- depends on DEBUG_KERNEL + select TRACING + select TRACER_MAX_TRACE + help +@@ -127,13 +152,13 @@ config SYSPROF_TRACER + bool "Sysprof Tracer" + depends on X86 + select TRACING ++ select CONTEXT_SWITCH_TRACER + help + This tracer provides the trace needed by the 'Sysprof' userspace + tool. + + config SCHED_TRACER + bool "Scheduling Latency Tracer" +- depends on DEBUG_KERNEL + select TRACING + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE +@@ -143,16 +168,30 @@ config SCHED_TRACER + + config CONTEXT_SWITCH_TRACER + bool "Trace process context switches" +- depends on DEBUG_KERNEL + select TRACING + select MARKERS + help + This tracer gets called from the context switch and records + all switching of tasks. + ++config EVENT_TRACER ++ bool "Trace various events in the kernel" ++ select TRACING ++ help ++ This tracer hooks to various trace points in the kernel ++ allowing the user to pick and choose which trace point they ++ want to trace. ++ ++config FTRACE_SYSCALLS ++ bool "Trace syscalls" ++ depends on HAVE_FTRACE_SYSCALLS ++ select TRACING ++ select KALLSYMS ++ help ++ Basic tracer to catch the syscall entry and exit events. ++ + config BOOT_TRACER + bool "Trace boot initcalls" +- depends on DEBUG_KERNEL + select TRACING + select CONTEXT_SWITCH_TRACER + help +@@ -165,13 +204,11 @@ config BOOT_TRACER + representation of the delays during initcalls - but the raw + /debug/tracing/trace text output is readable too. + +- ( Note that tracing self tests can't be enabled if this tracer is +- selected, because the self-tests are an initcall as well and that +- would invalidate the boot trace. ) ++ You must pass in ftrace=initcall to the kernel command line ++ to enable this on bootup. + + config TRACE_BRANCH_PROFILING + bool "Trace likely/unlikely profiler" +- depends on DEBUG_KERNEL + select TRACING + help + This tracer profiles all the the likely and unlikely macros +@@ -224,7 +261,6 @@ config BRANCH_TRACER + + config POWER_TRACER + bool "Trace power consumption behavior" +- depends on DEBUG_KERNEL + depends on X86 + select TRACING + help +@@ -236,7 +272,6 @@ config POWER_TRACER + config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FUNCTION_TRACER +- depends on DEBUG_KERNEL + select FUNCTION_TRACER + select STACKTRACE + select KALLSYMS +@@ -260,17 +295,73 @@ config STACK_TRACER + + config HW_BRANCH_TRACER + depends on HAVE_HW_BRANCH_TRACER ++ depends on !PREEMPT_RT + bool "Trace hw branches" + select TRACING + help + This tracer records all branches on the system in a circular + buffer giving access to the last N branches for each cpu. + ++config KMEMTRACE ++ bool "Trace SLAB allocations" ++ select TRACING ++ help ++ kmemtrace provides tracing for slab allocator functions, such as ++ kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. 
Collected ++ data is then fed to the userspace application in order to analyse ++ allocation hotspots, internal fragmentation and so on, making it ++ possible to see how well an allocator performs, as well as debug ++ and profile kernel code. ++ ++ This requires an userspace application to use. See ++ Documentation/vm/kmemtrace.txt for more information. ++ ++ Saying Y will make the kernel somewhat larger and slower. However, ++ if you disable kmemtrace at run-time or boot-time, the performance ++ impact is minimal (depending on the arch the kernel is built for). ++ ++ If unsure, say N. ++ ++config WORKQUEUE_TRACER ++ bool "Trace workqueues" if !PREEMPT_RT ++ select TRACING ++ help ++ The workqueue tracer provides some statistical informations ++ about each cpu workqueue thread such as the number of the ++ works inserted and executed since their creation. It can help ++ to evaluate the amount of work each of them have to perform. ++ For example it can help a developer to decide whether he should ++ choose a per cpu workqueue instead of a singlethreaded one. ++ ++config BLK_DEV_IO_TRACE ++ bool "Support for tracing block io actions" ++ depends on SYSFS ++ depends on BLOCK ++ select RELAY ++ select DEBUG_FS ++ select TRACEPOINTS ++ select TRACING ++ select STACKTRACE ++ help ++ Say Y here if you want to be able to trace the block layer actions ++ on a given queue. Tracing allows you to see any traffic happening ++ on a block device queue. For more information (and the userspace ++ support tools needed), fetch the blktrace tools from: ++ ++ git://git.kernel.dk/blktrace.git ++ ++ Tracing also is possible using the ftrace interface, e.g.: ++ ++ echo 1 > /sys/block/sda/sda1/trace/enable ++ echo blk > /sys/kernel/debug/tracing/current_tracer ++ cat /sys/kernel/debug/tracing/trace_pipe ++ ++ If unsure, say N. ++ + config DYNAMIC_FTRACE + bool "enable/disable ftrace tracepoints dynamically" + depends on FUNCTION_TRACER + depends on HAVE_DYNAMIC_FTRACE +- depends on DEBUG_KERNEL + default y + help + This option will modify all the calls to ftrace dynamically +@@ -286,6 +377,20 @@ config DYNAMIC_FTRACE + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. + ++config FUNCTION_PROFILER ++ bool "Kernel function profiler" ++ depends on FUNCTION_TRACER ++ default n ++ help ++ This option enables the kernel function profiler. A file is created ++ in debugfs called function_profile_enabled which defaults to zero. ++ When a 1 is echoed into this file profiling begins, and when a ++ zero is entered, profiling stops. A file in the trace_stats ++ directory called functions, that show the list of functions that ++ have been hit and their counters. ++ ++ If in doubt, say N ++ + config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE +@@ -296,7 +401,7 @@ config FTRACE_SELFTEST + + config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" +- depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER ++ depends on TRACING + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup +@@ -304,9 +409,23 @@ config FTRACE_STARTUP_TEST + functioning properly. It will do tests on all the configured + tracers of ftrace. + ++config INTERRUPT_OFF_HIST ++ bool "Interrupts off critical timings histogram" ++ depends on IRQSOFF_TRACER ++ help ++ This option uses the infrastructure of the critical ++ irqs off timings to create a histogram of latencies. 
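These latency-histogram options (this one and PREEMPT_OFF_HIST just below) conceptually account each measured latency into a per-bucket counter; a minimal sketch of the idea (not the trace_hist.c implementation):

#define HIST_MAX_US	10240

static unsigned long latency_hist[HIST_MAX_US + 1];

static void hist_account(unsigned long latency_us)
{
	if (latency_us > HIST_MAX_US)
		latency_us = HIST_MAX_US;	/* overflow bucket */
	latency_hist[latency_us]++;
}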
++ ++config PREEMPT_OFF_HIST ++ bool "Preempt off critical timings histogram" ++ depends on PREEMPT_TRACER ++ help ++ This option uses the infrastructure of the critical ++ preemption off timings to create a histogram of latencies. ++ + config MMIOTRACE + bool "Memory mapped IO tracing" +- depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI ++ depends on HAVE_MMIOTRACE_SUPPORT && PCI + select TRACING + help + Mmiotrace traces Memory Mapped I/O access and is meant for +@@ -328,3 +447,6 @@ config MMIOTRACE_TEST + Say N, unless you absolutely know what you are doing. + + endmenu ++ ++endif # TRACING_SUPPORT ++ +Index: linux-2.6-tip/kernel/trace/Makefile +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/Makefile ++++ linux-2.6-tip/kernel/trace/Makefile +@@ -19,6 +19,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftra + obj-$(CONFIG_RING_BUFFER) += ring_buffer.o + + obj-$(CONFIG_TRACING) += trace.o ++obj-$(CONFIG_TRACING) += trace_clock.o ++obj-$(CONFIG_TRACING) += trace_output.o ++obj-$(CONFIG_TRACING) += trace_stat.o ++obj-$(CONFIG_TRACING) += trace_printk.o + obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o + obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o + obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +@@ -33,5 +37,17 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += t + obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o + obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o + obj-$(CONFIG_POWER_TRACER) += trace_power.o ++obj-$(CONFIG_KMEMTRACE) += kmemtrace.o ++obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o ++obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ++obj-$(CONFIG_EVENT_TRACER) += trace_events.o ++obj-$(CONFIG_EVENT_TRACER) += events.o ++obj-$(CONFIG_EVENT_TRACER) += trace_export.o ++obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o ++obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o ++obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o ++ ++obj-$(CONFIG_INTERRUPT_OFF_HIST) += trace_hist.o ++obj-$(CONFIG_PREEMPT_OFF_HIST) += trace_hist.o + + libftrace-y := ftrace.o +Index: linux-2.6-tip/kernel/trace/blktrace.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/blktrace.c +@@ -0,0 +1,1515 @@ ++/* ++ * Copyright (C) 2006 Jens Axboe ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "trace_output.h" ++ ++static unsigned int blktrace_seq __read_mostly = 1; ++ ++static struct trace_array *blk_tr; ++static bool blk_tracer_enabled __read_mostly; ++ ++/* Select an alternative, minimalistic output than the original one */ ++#define TRACE_BLK_OPT_CLASSIC 0x1 ++ ++static struct tracer_opt blk_tracer_opts[] = { ++ /* Default disable the minimalistic output */ ++ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, ++ { } ++}; ++ ++static struct tracer_flags blk_tracer_flags = { ++ .val = 0, ++ .opts = blk_tracer_opts, ++}; ++ ++/* Global reference count of probes */ ++static atomic_t blk_probes_ref = ATOMIC_INIT(0); ++ ++static void blk_register_tracepoints(void); ++static void blk_unregister_tracepoints(void); ++ ++/* ++ * Send out a notify message. ++ */ ++static void trace_note(struct blk_trace *bt, pid_t pid, int action, ++ const void *data, size_t len) ++{ ++ struct blk_io_trace *t; ++ ++ if (!bt->rchan) ++ return; ++ ++ t = relay_reserve(bt->rchan, sizeof(*t) + len); ++ if (t) { ++ const int cpu = smp_processor_id(); ++ ++ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; ++ t->time = ktime_to_ns(ktime_get()); ++ t->device = bt->dev; ++ t->action = action; ++ t->pid = pid; ++ t->cpu = cpu; ++ t->pdu_len = len; ++ memcpy((void *) t + sizeof(*t), data, len); ++ } ++} ++ ++/* ++ * Send out a notify for this process, if we haven't done so since a trace ++ * started ++ */ ++static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) ++{ ++ tsk->btrace_seq = blktrace_seq; ++ trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); ++} ++ ++static void trace_note_time(struct blk_trace *bt) ++{ ++ struct timespec now; ++ unsigned long flags; ++ u32 words[2]; ++ ++ getnstimeofday(&now); ++ words[0] = now.tv_sec; ++ words[1] = now.tv_nsec; ++ ++ local_irq_save(flags); ++ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); ++ local_irq_restore(flags); ++} ++ ++void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) 
++{ ++ int n; ++ va_list args; ++ unsigned long flags; ++ char *buf; ++ ++ if (blk_tracer_enabled) { ++ va_start(args, fmt); ++ ftrace_vprintk(fmt, args); ++ va_end(args); ++ return; ++ } ++ ++ if (!bt->msg_data) ++ return; ++ ++ local_irq_save(flags); ++ buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); ++ va_start(args, fmt); ++ n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); ++ va_end(args); ++ ++ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL_GPL(__trace_note_message); ++ ++static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, ++ pid_t pid) ++{ ++ if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) ++ return 1; ++ if (sector < bt->start_lba || sector > bt->end_lba) ++ return 1; ++ if (bt->pid && pid != bt->pid) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Data direction bit lookup ++ */ ++static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), ++ BLK_TC_ACT(BLK_TC_WRITE) }; ++ ++/* The ilog2() calls fall out because they're constant */ ++#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ ++ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) ++ ++/* ++ * The worker for the various blk_add_trace*() types. Fills out a ++ * blk_io_trace structure and places it in a per-cpu subbuffer. ++ */ ++static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, ++ int rw, u32 what, int error, int pdu_len, void *pdu_data) ++{ ++ struct task_struct *tsk = current; ++ struct ring_buffer_event *event = NULL; ++ struct blk_io_trace *t; ++ unsigned long flags = 0; ++ unsigned long *sequence; ++ pid_t pid; ++ int cpu, pc = 0; ++ ++ if (unlikely(bt->trace_state != Blktrace_running && ++ !blk_tracer_enabled)) ++ return; ++ ++ what |= ddir_act[rw & WRITE]; ++ what |= MASK_TC_BIT(rw, BARRIER); ++ what |= MASK_TC_BIT(rw, SYNCIO); ++ what |= MASK_TC_BIT(rw, AHEAD); ++ what |= MASK_TC_BIT(rw, META); ++ what |= MASK_TC_BIT(rw, DISCARD); ++ ++ pid = tsk->pid; ++ if (unlikely(act_log_check(bt, what, sector, pid))) ++ return; ++ cpu = raw_smp_processor_id(); ++ ++ if (blk_tracer_enabled) { ++ tracing_record_cmdline(current); ++ ++ pc = preempt_count(); ++ event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, ++ sizeof(*t) + pdu_len, ++ 0, pc); ++ if (!event) ++ return; ++ t = ring_buffer_event_data(event); ++ goto record_it; ++ } ++ ++ /* ++ * A word about the locking here - we disable interrupts to reserve ++ * some space in the relay per-cpu buffer, to prevent an irq ++ * from coming in and stepping on our toes. ++ */ ++ local_irq_save(flags); ++ ++ if (unlikely(tsk->btrace_seq != blktrace_seq)) ++ trace_note_tsk(bt, tsk); ++ ++ t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); ++ if (t) { ++ sequence = per_cpu_ptr(bt->sequence, cpu); ++ ++ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; ++ t->sequence = ++(*sequence); ++ t->time = ktime_to_ns(ktime_get()); ++record_it: ++ /* ++ * These two are not needed in ftrace as they are in the ++ * generic trace_entry, filled by tracing_generic_entry_update, ++ * but for the trace_event->bin() synthesizer benefit we do it ++ * here too. 
++ */ ++ t->cpu = cpu; ++ t->pid = pid; ++ ++ t->sector = sector; ++ t->bytes = bytes; ++ t->action = what; ++ t->device = bt->dev; ++ t->error = error; ++ t->pdu_len = pdu_len; ++ ++ if (pdu_len) ++ memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); ++ ++ if (blk_tracer_enabled) { ++ trace_buffer_unlock_commit(blk_tr, event, 0, pc); ++ return; ++ } ++ } ++ ++ local_irq_restore(flags); ++} ++ ++static struct dentry *blk_tree_root; ++static DEFINE_MUTEX(blk_tree_mutex); ++ ++static void blk_trace_cleanup(struct blk_trace *bt) ++{ ++ debugfs_remove(bt->msg_file); ++ debugfs_remove(bt->dropped_file); ++ relay_close(bt->rchan); ++ free_percpu(bt->sequence); ++ free_percpu(bt->msg_data); ++ kfree(bt); ++ if (atomic_dec_and_test(&blk_probes_ref)) ++ blk_unregister_tracepoints(); ++} ++ ++int blk_trace_remove(struct request_queue *q) ++{ ++ struct blk_trace *bt; ++ ++ bt = xchg(&q->blk_trace, NULL); ++ if (!bt) ++ return -EINVAL; ++ ++ if (bt->trace_state != Blktrace_running) ++ blk_trace_cleanup(bt); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(blk_trace_remove); ++ ++static int blk_dropped_open(struct inode *inode, struct file *filp) ++{ ++ filp->private_data = inode->i_private; ++ ++ return 0; ++} ++ ++static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, ++ size_t count, loff_t *ppos) ++{ ++ struct blk_trace *bt = filp->private_data; ++ char buf[16]; ++ ++ snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); ++ ++ return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); ++} ++ ++static const struct file_operations blk_dropped_fops = { ++ .owner = THIS_MODULE, ++ .open = blk_dropped_open, ++ .read = blk_dropped_read, ++}; ++ ++static int blk_msg_open(struct inode *inode, struct file *filp) ++{ ++ filp->private_data = inode->i_private; ++ ++ return 0; ++} ++ ++static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, ++ size_t count, loff_t *ppos) ++{ ++ char *msg; ++ struct blk_trace *bt; ++ ++ if (count > BLK_TN_MAX_MSG) ++ return -EINVAL; ++ ++ msg = kmalloc(count, GFP_KERNEL); ++ if (msg == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(msg, buffer, count)) { ++ kfree(msg); ++ return -EFAULT; ++ } ++ ++ bt = filp->private_data; ++ __trace_note_message(bt, "%s", msg); ++ kfree(msg); ++ ++ return count; ++} ++ ++static const struct file_operations blk_msg_fops = { ++ .owner = THIS_MODULE, ++ .open = blk_msg_open, ++ .write = blk_msg_write, ++}; ++ ++/* ++ * Keep track of how many times we encountered a full subbuffer, to aid ++ * the user space app in telling how many lost events there were. ++ */ ++static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, ++ void *prev_subbuf, size_t prev_padding) ++{ ++ struct blk_trace *bt; ++ ++ if (!relay_buf_full(buf)) ++ return 1; ++ ++ bt = buf->chan->private_data; ++ atomic_inc(&bt->dropped); ++ return 0; ++} ++ ++static int blk_remove_buf_file_callback(struct dentry *dentry) ++{ ++ struct dentry *parent = dentry->d_parent; ++ debugfs_remove(dentry); ++ ++ /* ++ * this will fail for all but the last file, but that is ok. what we ++ * care about is the top level buts->name directory going away, when ++ * the last trace file is gone. Then we don't have to rmdir() that ++ * manually on trace stop, so it nicely solves the issue with ++ * force killing of running traces. 
++ */ ++ ++ debugfs_remove(parent); ++ return 0; ++} ++ ++static struct dentry *blk_create_buf_file_callback(const char *filename, ++ struct dentry *parent, ++ int mode, ++ struct rchan_buf *buf, ++ int *is_global) ++{ ++ return debugfs_create_file(filename, mode, parent, buf, ++ &relay_file_operations); ++} ++ ++static struct rchan_callbacks blk_relay_callbacks = { ++ .subbuf_start = blk_subbuf_start_callback, ++ .create_buf_file = blk_create_buf_file_callback, ++ .remove_buf_file = blk_remove_buf_file_callback, ++}; ++ ++/* ++ * Setup everything required to start tracing ++ */ ++int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ++ struct blk_user_trace_setup *buts) ++{ ++ struct blk_trace *old_bt, *bt = NULL; ++ struct dentry *dir = NULL; ++ int ret, i; ++ ++ if (!buts->buf_size || !buts->buf_nr) ++ return -EINVAL; ++ ++ strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); ++ buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; ++ ++ /* ++ * some device names have larger paths - convert the slashes ++ * to underscores for this to work as expected ++ */ ++ for (i = 0; i < strlen(buts->name); i++) ++ if (buts->name[i] == '/') ++ buts->name[i] = '_'; ++ ++ ret = -ENOMEM; ++ bt = kzalloc(sizeof(*bt), GFP_KERNEL); ++ if (!bt) ++ goto err; ++ ++ bt->sequence = alloc_percpu(unsigned long); ++ if (!bt->sequence) ++ goto err; ++ ++ bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); ++ if (!bt->msg_data) ++ goto err; ++ ++ ret = -ENOENT; ++ ++ mutex_lock(&blk_tree_mutex); ++ if (!blk_tree_root) { ++ blk_tree_root = debugfs_create_dir("block", NULL); ++ if (!blk_tree_root) { ++ mutex_unlock(&blk_tree_mutex); ++ goto err; ++ } ++ } ++ mutex_unlock(&blk_tree_mutex); ++ ++ dir = debugfs_create_dir(buts->name, blk_tree_root); ++ ++ if (!dir) ++ goto err; ++ ++ bt->dir = dir; ++ bt->dev = dev; ++ atomic_set(&bt->dropped, 0); ++ ++ ret = -EIO; ++ bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, ++ &blk_dropped_fops); ++ if (!bt->dropped_file) ++ goto err; ++ ++ bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); ++ if (!bt->msg_file) ++ goto err; ++ ++ bt->rchan = relay_open("trace", dir, buts->buf_size, ++ buts->buf_nr, &blk_relay_callbacks, bt); ++ if (!bt->rchan) ++ goto err; ++ ++ bt->act_mask = buts->act_mask; ++ if (!bt->act_mask) ++ bt->act_mask = (u16) -1; ++ ++ bt->start_lba = buts->start_lba; ++ bt->end_lba = buts->end_lba; ++ if (!bt->end_lba) ++ bt->end_lba = -1ULL; ++ ++ bt->pid = buts->pid; ++ bt->trace_state = Blktrace_setup; ++ ++ ret = -EBUSY; ++ old_bt = xchg(&q->blk_trace, bt); ++ if (old_bt) { ++ (void) xchg(&q->blk_trace, old_bt); ++ goto err; ++ } ++ ++ if (atomic_add_return(1, &blk_probes_ref) == 1) ++ blk_register_tracepoints(); ++ ++ return 0; ++err: ++ if (bt) { ++ if (bt->msg_file) ++ debugfs_remove(bt->msg_file); ++ if (bt->dropped_file) ++ debugfs_remove(bt->dropped_file); ++ free_percpu(bt->sequence); ++ free_percpu(bt->msg_data); ++ if (bt->rchan) ++ relay_close(bt->rchan); ++ kfree(bt); ++ } ++ return ret; ++} ++ ++int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ++ char __user *arg) ++{ ++ struct blk_user_trace_setup buts; ++ int ret; ++ ++ ret = copy_from_user(&buts, arg, sizeof(buts)); ++ if (ret) ++ return -EFAULT; ++ ++ ret = do_blk_trace_setup(q, name, dev, &buts); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user(arg, &buts, sizeof(buts))) ++ return -EFAULT; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(blk_trace_setup); ++ ++int blk_trace_startstop(struct request_queue *q, int start) ++{ ++ int 
ret; ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (bt == NULL) ++ return -EINVAL; ++ ++ /* ++ * For starting a trace, we can transition from a setup or stopped ++ * trace. For stopping a trace, the state must be running ++ */ ++ ret = -EINVAL; ++ if (start) { ++ if (bt->trace_state == Blktrace_setup || ++ bt->trace_state == Blktrace_stopped) { ++ blktrace_seq++; ++ smp_mb(); ++ bt->trace_state = Blktrace_running; ++ ++ trace_note_time(bt); ++ ret = 0; ++ } ++ } else { ++ if (bt->trace_state == Blktrace_running) { ++ bt->trace_state = Blktrace_stopped; ++ relay_flush(bt->rchan); ++ ret = 0; ++ } ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(blk_trace_startstop); ++ ++/** ++ * blk_trace_ioctl: - handle the ioctls associated with tracing ++ * @bdev: the block device ++ * @cmd: the ioctl cmd ++ * @arg: the argument data, if any ++ * ++ **/ ++int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) ++{ ++ struct request_queue *q; ++ int ret, start = 0; ++ char b[BDEVNAME_SIZE]; ++ ++ q = bdev_get_queue(bdev); ++ if (!q) ++ return -ENXIO; ++ ++ mutex_lock(&bdev->bd_mutex); ++ ++ switch (cmd) { ++ case BLKTRACESETUP: ++ bdevname(bdev, b); ++ ret = blk_trace_setup(q, b, bdev->bd_dev, arg); ++ break; ++ case BLKTRACESTART: ++ start = 1; ++ case BLKTRACESTOP: ++ ret = blk_trace_startstop(q, start); ++ break; ++ case BLKTRACETEARDOWN: ++ ret = blk_trace_remove(q); ++ break; ++ default: ++ ret = -ENOTTY; ++ break; ++ } ++ ++ mutex_unlock(&bdev->bd_mutex); ++ return ret; ++} ++ ++/** ++ * blk_trace_shutdown: - stop and cleanup trace structures ++ * @q: the request queue associated with the device ++ * ++ **/ ++void blk_trace_shutdown(struct request_queue *q) ++{ ++ if (q->blk_trace) { ++ blk_trace_startstop(q, 0); ++ blk_trace_remove(q); ++ } ++} ++ ++/* ++ * blktrace probes ++ */ ++ ++/** ++ * blk_add_trace_rq - Add a trace for a request oriented action ++ * @q: queue the io is for ++ * @rq: the source request ++ * @what: the action ++ * ++ * Description: ++ * Records an action against a request. Will log the bio offset + size. 
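For reference, a user-space sketch of driving the ioctl interface handled by blk_trace_ioctl() above (this is roughly what blktrace(8) does; start_trace() is illustrative and error handling is trimmed):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>

static int start_trace(const char *dev)
{
	struct blk_user_trace_setup buts;
	int fd = open(dev, O_RDONLY);

	if (fd < 0)
		return -1;

	memset(&buts, 0, sizeof(buts));
	buts.buf_size = 512 * 1024;	/* relay sub-buffer size */
	buts.buf_nr   = 4;		/* number of sub-buffers */
	/* act_mask/start_lba/end_lba/pid left 0: defaults, trace everything */

	if (ioctl(fd, BLKTRACESETUP, &buts) < 0 ||
	    ioctl(fd, BLKTRACESTART) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* later: BLKTRACESTOP, BLKTRACETEARDOWN, close(fd) */
}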
++ * ++ **/ ++static void blk_add_trace_rq(struct request_queue *q, struct request *rq, ++ u32 what) ++{ ++ struct blk_trace *bt = q->blk_trace; ++ int rw = rq->cmd_flags & 0x03; ++ ++ if (likely(!bt)) ++ return; ++ ++ if (blk_discard_rq(rq)) ++ rw |= (1 << BIO_RW_DISCARD); ++ ++ if (blk_pc_request(rq)) { ++ what |= BLK_TC_ACT(BLK_TC_PC); ++ __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, ++ sizeof(rq->cmd), rq->cmd); ++ } else { ++ what |= BLK_TC_ACT(BLK_TC_FS); ++ __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, ++ rw, what, rq->errors, 0, NULL); ++ } ++} ++ ++static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) ++{ ++ blk_add_trace_rq(q, rq, BLK_TA_ABORT); ++} ++ ++static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) ++{ ++ blk_add_trace_rq(q, rq, BLK_TA_INSERT); ++} ++ ++static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) ++{ ++ blk_add_trace_rq(q, rq, BLK_TA_ISSUE); ++} ++ ++static void blk_add_trace_rq_requeue(struct request_queue *q, ++ struct request *rq) ++{ ++ blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); ++} ++ ++static void blk_add_trace_rq_complete(struct request_queue *q, ++ struct request *rq) ++{ ++ blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); ++} ++ ++/** ++ * blk_add_trace_bio - Add a trace for a bio oriented action ++ * @q: queue the io is for ++ * @bio: the source bio ++ * @what: the action ++ * ++ * Description: ++ * Records an action against a bio. Will log the bio offset + size. ++ * ++ **/ ++static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, ++ u32 what) ++{ ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (likely(!bt)) ++ return; ++ ++ __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, ++ !bio_flagged(bio, BIO_UPTODATE), 0, NULL); ++} ++ ++static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) ++{ ++ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); ++} ++ ++static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) ++{ ++ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); ++} ++ ++static void blk_add_trace_bio_backmerge(struct request_queue *q, ++ struct bio *bio) ++{ ++ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); ++} ++ ++static void blk_add_trace_bio_frontmerge(struct request_queue *q, ++ struct bio *bio) ++{ ++ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); ++} ++ ++static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) ++{ ++ blk_add_trace_bio(q, bio, BLK_TA_QUEUE); ++} ++ ++static void blk_add_trace_getrq(struct request_queue *q, ++ struct bio *bio, int rw) ++{ ++ if (bio) ++ blk_add_trace_bio(q, bio, BLK_TA_GETRQ); ++ else { ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (bt) ++ __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); ++ } ++} ++ ++ ++static void blk_add_trace_sleeprq(struct request_queue *q, ++ struct bio *bio, int rw) ++{ ++ if (bio) ++ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); ++ else { ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (bt) ++ __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, ++ 0, 0, NULL); ++ } ++} ++ ++static void blk_add_trace_plug(struct request_queue *q) ++{ ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (bt) ++ __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); ++} ++ ++static void blk_add_trace_unplug_io(struct request_queue *q) ++{ ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (bt) { ++ unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; ++ __be64 rpdu = cpu_to_be64(pdu); ++ ++ __blk_add_trace(bt, 0, 0, 0, 
BLK_TA_UNPLUG_IO, 0, ++ sizeof(rpdu), &rpdu); ++ } ++} ++ ++static void blk_add_trace_unplug_timer(struct request_queue *q) ++{ ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (bt) { ++ unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; ++ __be64 rpdu = cpu_to_be64(pdu); ++ ++ __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, ++ sizeof(rpdu), &rpdu); ++ } ++} ++ ++static void blk_add_trace_split(struct request_queue *q, struct bio *bio, ++ unsigned int pdu) ++{ ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (bt) { ++ __be64 rpdu = cpu_to_be64(pdu); ++ ++ __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, ++ BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), ++ sizeof(rpdu), &rpdu); ++ } ++} ++ ++/** ++ * blk_add_trace_remap - Add a trace for a remap operation ++ * @q: queue the io is for ++ * @bio: the source bio ++ * @dev: target device ++ * @from: source sector ++ * @to: target sector ++ * ++ * Description: ++ * Device mapper or raid target sometimes need to split a bio because ++ * it spans a stripe (or similar). Add a trace for that action. ++ * ++ **/ ++static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, ++ dev_t dev, sector_t from, sector_t to) ++{ ++ struct blk_trace *bt = q->blk_trace; ++ struct blk_io_trace_remap r; ++ ++ if (likely(!bt)) ++ return; ++ ++ r.device = cpu_to_be32(dev); ++ r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); ++ r.sector = cpu_to_be64(to); ++ ++ __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, ++ !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); ++} ++ ++/** ++ * blk_add_driver_data - Add binary message with driver-specific data ++ * @q: queue the io is for ++ * @rq: io request ++ * @data: driver-specific data ++ * @len: length of driver-specific data ++ * ++ * Description: ++ * Some drivers might want to write driver-specific data per request. 
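For illustration, a hypothetical driver-side caller of this hook might look like the sketch below (mydrv_complete_rq() is made up; only the blk_add_driver_data() prototype comes from the code that follows):

#include <linux/blkdev.h>
#include <linux/blktrace_api.h>

static void mydrv_complete_rq(struct request_queue *q, struct request *rq,
			      u32 hw_status)
{
	__be32 status = cpu_to_be32(hw_status);

	/* attach a small per-request status blob to the trace stream */
	blk_add_driver_data(q, rq, &status, sizeof(status));
}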
++ * ++ **/ ++void blk_add_driver_data(struct request_queue *q, ++ struct request *rq, ++ void *data, size_t len) ++{ ++ struct blk_trace *bt = q->blk_trace; ++ ++ if (likely(!bt)) ++ return; ++ ++ if (blk_pc_request(rq)) ++ __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, ++ rq->errors, len, data); ++ else ++ __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, ++ 0, BLK_TA_DRV_DATA, rq->errors, len, data); ++} ++EXPORT_SYMBOL_GPL(blk_add_driver_data); ++ ++static void blk_register_tracepoints(void) ++{ ++ int ret; ++ ++ ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); ++ WARN_ON(ret); ++ ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); ++ WARN_ON(ret); ++ ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); ++ WARN_ON(ret); ++ ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); ++ WARN_ON(ret); ++ ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); ++ WARN_ON(ret); ++ ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); ++ WARN_ON(ret); ++ ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); ++ WARN_ON(ret); ++ ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); ++ WARN_ON(ret); ++ ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); ++ WARN_ON(ret); ++ ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); ++ WARN_ON(ret); ++ ret = register_trace_block_getrq(blk_add_trace_getrq); ++ WARN_ON(ret); ++ ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); ++ WARN_ON(ret); ++ ret = register_trace_block_plug(blk_add_trace_plug); ++ WARN_ON(ret); ++ ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); ++ WARN_ON(ret); ++ ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); ++ WARN_ON(ret); ++ ret = register_trace_block_split(blk_add_trace_split); ++ WARN_ON(ret); ++ ret = register_trace_block_remap(blk_add_trace_remap); ++ WARN_ON(ret); ++} ++ ++static void blk_unregister_tracepoints(void) ++{ ++ unregister_trace_block_remap(blk_add_trace_remap); ++ unregister_trace_block_split(blk_add_trace_split); ++ unregister_trace_block_unplug_io(blk_add_trace_unplug_io); ++ unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); ++ unregister_trace_block_plug(blk_add_trace_plug); ++ unregister_trace_block_sleeprq(blk_add_trace_sleeprq); ++ unregister_trace_block_getrq(blk_add_trace_getrq); ++ unregister_trace_block_bio_queue(blk_add_trace_bio_queue); ++ unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); ++ unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); ++ unregister_trace_block_bio_complete(blk_add_trace_bio_complete); ++ unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); ++ unregister_trace_block_rq_complete(blk_add_trace_rq_complete); ++ unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); ++ unregister_trace_block_rq_issue(blk_add_trace_rq_issue); ++ unregister_trace_block_rq_insert(blk_add_trace_rq_insert); ++ unregister_trace_block_rq_abort(blk_add_trace_rq_abort); ++ ++ tracepoint_synchronize_unregister(); ++} ++ ++/* ++ * struct blk_io_tracer formatting routines ++ */ ++ ++static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) ++{ ++ int i = 0; ++ int tc = t->action >> BLK_TC_SHIFT; ++ ++ if (tc & BLK_TC_DISCARD) ++ rwbs[i++] = 'D'; ++ else if (tc & BLK_TC_WRITE) ++ rwbs[i++] = 'W'; ++ else if (t->bytes) ++ rwbs[i++] = 'R'; ++ else ++ rwbs[i++] = 'N'; ++ ++ if (tc & BLK_TC_AHEAD) ++ rwbs[i++] = 'A'; ++ if (tc & BLK_TC_BARRIER) ++ rwbs[i++] = 
'B'; ++ if (tc & BLK_TC_SYNC) ++ rwbs[i++] = 'S'; ++ if (tc & BLK_TC_META) ++ rwbs[i++] = 'M'; ++ ++ rwbs[i] = '\0'; ++} ++ ++static inline ++const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) ++{ ++ return (const struct blk_io_trace *)ent; ++} ++ ++static inline const void *pdu_start(const struct trace_entry *ent) ++{ ++ return te_blk_io_trace(ent) + 1; ++} ++ ++static inline u32 t_sec(const struct trace_entry *ent) ++{ ++ return te_blk_io_trace(ent)->bytes >> 9; ++} ++ ++static inline unsigned long long t_sector(const struct trace_entry *ent) ++{ ++ return te_blk_io_trace(ent)->sector; ++} ++ ++static inline __u16 t_error(const struct trace_entry *ent) ++{ ++ return te_blk_io_trace(ent)->error; ++} ++ ++static __u64 get_pdu_int(const struct trace_entry *ent) ++{ ++ const __u64 *val = pdu_start(ent); ++ return be64_to_cpu(*val); ++} ++ ++static void get_pdu_remap(const struct trace_entry *ent, ++ struct blk_io_trace_remap *r) ++{ ++ const struct blk_io_trace_remap *__r = pdu_start(ent); ++ __u64 sector = __r->sector; ++ ++ r->device = be32_to_cpu(__r->device); ++ r->device_from = be32_to_cpu(__r->device_from); ++ r->sector = be64_to_cpu(sector); ++} ++ ++static int blk_log_action_iter(struct trace_iterator *iter, const char *act) ++{ ++ char rwbs[6]; ++ unsigned long long ts = ns2usecs(iter->ts); ++ unsigned long usec_rem = do_div(ts, USEC_PER_SEC); ++ unsigned secs = (unsigned long)ts; ++ const struct trace_entry *ent = iter->ent; ++ const struct blk_io_trace *t = (const struct blk_io_trace *)ent; ++ ++ fill_rwbs(rwbs, t); ++ ++ return trace_seq_printf(&iter->seq, ++ "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", ++ MAJOR(t->device), MINOR(t->device), iter->cpu, ++ secs, usec_rem, ent->pid, act, rwbs); ++} ++ ++static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, ++ const char *act) ++{ ++ char rwbs[6]; ++ fill_rwbs(rwbs, t); ++ return trace_seq_printf(s, "%3d,%-3d %2s %3s ", ++ MAJOR(t->device), MINOR(t->device), act, rwbs); ++} ++ ++static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) ++{ ++ char cmd[TASK_COMM_LEN]; ++ ++ trace_find_cmdline(ent->pid, cmd); ++ ++ if (t_sec(ent)) ++ return trace_seq_printf(s, "%llu + %u [%s]\n", ++ t_sector(ent), t_sec(ent), cmd); ++ return trace_seq_printf(s, "[%s]\n", cmd); ++} ++ ++static int blk_log_with_error(struct trace_seq *s, ++ const struct trace_entry *ent) ++{ ++ if (t_sec(ent)) ++ return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), ++ t_sec(ent), t_error(ent)); ++ return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); ++} ++ ++static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) ++{ ++ struct blk_io_trace_remap r = { .device = 0, }; ++ ++ get_pdu_remap(ent, &r); ++ return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", ++ t_sector(ent), ++ t_sec(ent), MAJOR(r.device), MINOR(r.device), ++ (unsigned long long)r.sector); ++} ++ ++static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) ++{ ++ char cmd[TASK_COMM_LEN]; ++ ++ trace_find_cmdline(ent->pid, cmd); ++ ++ return trace_seq_printf(s, "[%s]\n", cmd); ++} ++ ++static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) ++{ ++ char cmd[TASK_COMM_LEN]; ++ ++ trace_find_cmdline(ent->pid, cmd); ++ ++ return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); ++} ++ ++static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) ++{ ++ char cmd[TASK_COMM_LEN]; ++ ++ trace_find_cmdline(ent->pid, cmd); ++ ++ return 
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), ++ get_pdu_int(ent), cmd); ++} ++ ++/* ++ * struct tracer operations ++ */ ++ ++static void blk_tracer_print_header(struct seq_file *m) ++{ ++ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) ++ return; ++ seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" ++ "# | | | | | |\n"); ++} ++ ++static void blk_tracer_start(struct trace_array *tr) ++{ ++ if (atomic_add_return(1, &blk_probes_ref) == 1) ++ blk_register_tracepoints(); ++ trace_flags &= ~TRACE_ITER_CONTEXT_INFO; ++} ++ ++static int blk_tracer_init(struct trace_array *tr) ++{ ++ blk_tr = tr; ++ blk_tracer_start(tr); ++ blk_tracer_enabled = true; ++ return 0; ++} ++ ++static void blk_tracer_stop(struct trace_array *tr) ++{ ++ trace_flags |= TRACE_ITER_CONTEXT_INFO; ++ if (atomic_dec_and_test(&blk_probes_ref)) ++ blk_unregister_tracepoints(); ++} ++ ++static void blk_tracer_reset(struct trace_array *tr) ++{ ++ if (!atomic_read(&blk_probes_ref)) ++ return; ++ ++ blk_tracer_enabled = false; ++ blk_tracer_stop(tr); ++} ++ ++static const struct { ++ const char *act[2]; ++ int (*print)(struct trace_seq *s, const struct trace_entry *ent); ++} what2act[] = { ++ [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, ++ [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, ++ [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, ++ [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, ++ [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, ++ [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, ++ [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, ++ [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, ++ [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, ++ [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, ++ [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, ++ [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, ++ [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, ++ [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, ++ [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, ++}; ++ ++static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, ++ int flags) ++{ ++ struct trace_seq *s = &iter->seq; ++ const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; ++ const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); ++ int ret; ++ ++ if (!trace_print_context(iter)) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ++ ret = trace_seq_printf(s, "Bad pc action %x\n", what); ++ else { ++ const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); ++ ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); ++ if (ret) ++ ret = what2act[what].print(s, iter->ent); ++ } ++ ++ return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; ++} ++ ++static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; ++ const int offset = offsetof(struct blk_io_trace, sector); ++ struct blk_io_trace old = { ++ .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, ++ .time = iter->ts, ++ }; ++ ++ if (!trace_seq_putmem(s, &old, offset)) ++ return 0; ++ return trace_seq_putmem(s, &t->sector, ++ sizeof(old) - offset + t->pdu_len); ++} ++ ++static enum print_line_t ++blk_trace_event_print_binary(struct trace_iterator *iter, int flags) ++{ ++ return blk_trace_synthesize_old_trace(iter) ? 
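As an illustrative aside (not part of the patch): the what2act[] table above maps the low bits of the action code to a terse and a verbose name plus a print callback, and anything outside the table is reported as a bad action instead of being dereferenced. A self-contained model of that table-driven dispatch, with invented action codes and handlers:

/* Illustrative userspace sketch, not part of the patch. */
#include <stdio.h>

enum { TA_QUEUE = 1, TA_ISSUE, TA_COMPLETE };

static int print_generic(int code)
{
	return printf("generic action %d\n", code);
}

static int print_with_error(int code)
{
	return printf("action %d (may carry an error code)\n", code);
}

static const struct {
	const char *act[2];		/* [0] = terse, [1] = verbose */
	int (*print)(int code);
} handlers[] = {
	[TA_QUEUE]    = { { "Q", "queue" },    print_generic },
	[TA_ISSUE]    = { { "D", "issue" },    print_generic },
	[TA_COMPLETE] = { { "C", "complete" }, print_with_error },
};

static void show(unsigned int what, int verbose)
{
	if (what == 0 || what >= sizeof(handlers) / sizeof(handlers[0]) ||
	    !handlers[what].print) {
		printf("Bad action %x\n", what);
		return;
	}
	printf("%s ", handlers[what].act[!!verbose]);
	handlers[what].print(what);
}

int main(void)
{
	show(TA_COMPLETE, 0);
	show(TA_COMPLETE, 1);
	show(42, 1);			/* out of range: reported, not dereferenced */
	return 0;
}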
++ TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; ++} ++ ++static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) ++{ ++ const struct blk_io_trace *t; ++ u16 what; ++ int ret; ++ ++ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) ++ return TRACE_TYPE_UNHANDLED; ++ ++ t = (const struct blk_io_trace *)iter->ent; ++ what = t->action & ((1 << BLK_TC_SHIFT) - 1); ++ ++ if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ++ ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); ++ else { ++ const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); ++ ret = blk_log_action_iter(iter, what2act[what].act[long_act]); ++ if (ret) ++ ret = what2act[what].print(&iter->seq, iter->ent); ++ } ++ ++ return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; ++} ++ ++static struct tracer blk_tracer __read_mostly = { ++ .name = "blk", ++ .init = blk_tracer_init, ++ .reset = blk_tracer_reset, ++ .start = blk_tracer_start, ++ .stop = blk_tracer_stop, ++ .print_header = blk_tracer_print_header, ++ .print_line = blk_tracer_print_line, ++ .flags = &blk_tracer_flags, ++}; ++ ++static struct trace_event trace_blk_event = { ++ .type = TRACE_BLK, ++ .trace = blk_trace_event_print, ++ .binary = blk_trace_event_print_binary, ++}; ++ ++static int __init init_blk_tracer(void) ++{ ++ if (!register_ftrace_event(&trace_blk_event)) { ++ pr_warning("Warning: could not register block events\n"); ++ return 1; ++ } ++ ++ if (register_tracer(&blk_tracer) != 0) { ++ pr_warning("Warning: could not register the block tracer\n"); ++ unregister_ftrace_event(&trace_blk_event); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++device_initcall(init_blk_tracer); ++ ++static int blk_trace_remove_queue(struct request_queue *q) ++{ ++ struct blk_trace *bt; ++ ++ bt = xchg(&q->blk_trace, NULL); ++ if (bt == NULL) ++ return -EINVAL; ++ ++ kfree(bt); ++ return 0; ++} ++ ++/* ++ * Setup everything required to start tracing ++ */ ++static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) ++{ ++ struct blk_trace *old_bt, *bt = NULL; ++ ++ bt = kzalloc(sizeof(*bt), GFP_KERNEL); ++ if (!bt) ++ return -ENOMEM; ++ ++ bt->dev = dev; ++ bt->act_mask = (u16)-1; ++ bt->end_lba = -1ULL; ++ ++ old_bt = xchg(&q->blk_trace, bt); ++ if (old_bt != NULL) { ++ (void)xchg(&q->blk_trace, old_bt); ++ kfree(bt); ++ return -EBUSY; ++ } ++ ++ return 0; ++} ++ ++/* ++ * sysfs interface to enable and configure tracing ++ */ ++ ++static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf); ++static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count); ++#define BLK_TRACE_DEVICE_ATTR(_name) \ ++ DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ ++ sysfs_blk_trace_attr_show, \ ++ sysfs_blk_trace_attr_store) ++ ++static BLK_TRACE_DEVICE_ATTR(enable); ++static BLK_TRACE_DEVICE_ATTR(act_mask); ++static BLK_TRACE_DEVICE_ATTR(pid); ++static BLK_TRACE_DEVICE_ATTR(start_lba); ++static BLK_TRACE_DEVICE_ATTR(end_lba); ++ ++static struct attribute *blk_trace_attrs[] = { ++ &dev_attr_enable.attr, ++ &dev_attr_act_mask.attr, ++ &dev_attr_pid.attr, ++ &dev_attr_start_lba.attr, ++ &dev_attr_end_lba.attr, ++ NULL ++}; ++ ++struct attribute_group blk_trace_attr_group = { ++ .name = "trace", ++ .attrs = blk_trace_attrs, ++}; ++ ++static const struct { ++ int mask; ++ const char *str; ++} mask_maps[] = { ++ { BLK_TC_READ, "read" }, ++ { BLK_TC_WRITE, "write" }, ++ { BLK_TC_BARRIER, "barrier" }, ++ { BLK_TC_SYNC, "sync" }, ++ { BLK_TC_QUEUE, 
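As an illustrative aside (not part of the patch): blk_trace_setup_queue() above installs its new blk_trace with xchg(); if another tracer was already attached, the old pointer is immediately swapped back and -EBUSY is returned, so a queue is only ever claimed once. A userspace model of that claim-or-fail install using C11 atomics (all names invented):

/* Illustrative userspace sketch, not part of the patch. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct trace_state { int act_mask; };

static _Atomic(struct trace_state *) queue_trace;	/* plays the role of q->blk_trace */

static int setup_trace(void)
{
	struct trace_state *old, *bt;

	bt = calloc(1, sizeof(*bt));
	if (!bt)
		return -1;				/* like -ENOMEM */
	bt->act_mask = -1;

	old = atomic_exchange(&queue_trace, bt);
	if (old) {
		atomic_exchange(&queue_trace, old);	/* put the old tracer back */
		free(bt);
		return -2;				/* like -EBUSY */
	}
	return 0;
}

int main(void)
{
	printf("first setup:  %d\n", setup_trace());	/* succeeds */
	printf("second setup: %d\n", setup_trace());	/* already claimed */
	return 0;
}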
"queue" }, ++ { BLK_TC_REQUEUE, "requeue" }, ++ { BLK_TC_ISSUE, "issue" }, ++ { BLK_TC_COMPLETE, "complete" }, ++ { BLK_TC_FS, "fs" }, ++ { BLK_TC_PC, "pc" }, ++ { BLK_TC_AHEAD, "ahead" }, ++ { BLK_TC_META, "meta" }, ++ { BLK_TC_DISCARD, "discard" }, ++ { BLK_TC_DRV_DATA, "drv_data" }, ++}; ++ ++static int blk_trace_str2mask(const char *str) ++{ ++ int i; ++ int mask = 0; ++ char *s, *token; ++ ++ s = kstrdup(str, GFP_KERNEL); ++ if (s == NULL) ++ return -ENOMEM; ++ s = strstrip(s); ++ ++ while (1) { ++ token = strsep(&s, ","); ++ if (token == NULL) ++ break; ++ ++ if (*token == '\0') ++ continue; ++ ++ for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { ++ if (strcasecmp(token, mask_maps[i].str) == 0) { ++ mask |= mask_maps[i].mask; ++ break; ++ } ++ } ++ if (i == ARRAY_SIZE(mask_maps)) { ++ mask = -EINVAL; ++ break; ++ } ++ } ++ kfree(s); ++ ++ return mask; ++} ++ ++static ssize_t blk_trace_mask2str(char *buf, int mask) ++{ ++ int i; ++ char *p = buf; ++ ++ for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { ++ if (mask & mask_maps[i].mask) { ++ p += sprintf(p, "%s%s", ++ (p == buf) ? "" : ",", mask_maps[i].str); ++ } ++ } ++ *p++ = '\n'; ++ ++ return p - buf; ++} ++ ++static struct request_queue *blk_trace_get_queue(struct block_device *bdev) ++{ ++ if (bdev->bd_disk == NULL) ++ return NULL; ++ ++ return bdev_get_queue(bdev); ++} ++ ++static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct hd_struct *p = dev_to_part(dev); ++ struct request_queue *q; ++ struct block_device *bdev; ++ ssize_t ret = -ENXIO; ++ ++ lock_kernel(); ++ bdev = bdget(part_devt(p)); ++ if (bdev == NULL) ++ goto out_unlock_kernel; ++ ++ q = blk_trace_get_queue(bdev); ++ if (q == NULL) ++ goto out_bdput; ++ ++ mutex_lock(&bdev->bd_mutex); ++ ++ if (attr == &dev_attr_enable) { ++ ret = sprintf(buf, "%u\n", !!q->blk_trace); ++ goto out_unlock_bdev; ++ } ++ ++ if (q->blk_trace == NULL) ++ ret = sprintf(buf, "disabled\n"); ++ else if (attr == &dev_attr_act_mask) ++ ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); ++ else if (attr == &dev_attr_pid) ++ ret = sprintf(buf, "%u\n", q->blk_trace->pid); ++ else if (attr == &dev_attr_start_lba) ++ ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); ++ else if (attr == &dev_attr_end_lba) ++ ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); ++ ++out_unlock_bdev: ++ mutex_unlock(&bdev->bd_mutex); ++out_bdput: ++ bdput(bdev); ++out_unlock_kernel: ++ unlock_kernel(); ++ return ret; ++} ++ ++static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct block_device *bdev; ++ struct request_queue *q; ++ struct hd_struct *p; ++ u64 value; ++ ssize_t ret = -EINVAL; ++ ++ if (count == 0) ++ goto out; ++ ++ if (attr == &dev_attr_act_mask) { ++ if (sscanf(buf, "%llx", &value) != 1) { ++ /* Assume it is a list of trace category names */ ++ ret = blk_trace_str2mask(buf); ++ if (ret < 0) ++ goto out; ++ value = ret; ++ } ++ } else if (sscanf(buf, "%llu", &value) != 1) ++ goto out; ++ ++ ret = -ENXIO; ++ ++ lock_kernel(); ++ p = dev_to_part(dev); ++ bdev = bdget(part_devt(p)); ++ if (bdev == NULL) ++ goto out_unlock_kernel; ++ ++ q = blk_trace_get_queue(bdev); ++ if (q == NULL) ++ goto out_bdput; ++ ++ mutex_lock(&bdev->bd_mutex); ++ ++ if (attr == &dev_attr_enable) { ++ if (value) ++ ret = blk_trace_setup_queue(q, bdev->bd_dev); ++ else ++ ret = blk_trace_remove_queue(q); ++ goto out_unlock_bdev; ++ } ++ ++ ret = 0; ++ if (q->blk_trace == NULL) ++ 
ret = blk_trace_setup_queue(q, bdev->bd_dev); ++ ++ if (ret == 0) { ++ if (attr == &dev_attr_act_mask) ++ q->blk_trace->act_mask = value; ++ else if (attr == &dev_attr_pid) ++ q->blk_trace->pid = value; ++ else if (attr == &dev_attr_start_lba) ++ q->blk_trace->start_lba = value; ++ else if (attr == &dev_attr_end_lba) ++ q->blk_trace->end_lba = value; ++ } ++ ++out_unlock_bdev: ++ mutex_unlock(&bdev->bd_mutex); ++out_bdput: ++ bdput(bdev); ++out_unlock_kernel: ++ unlock_kernel(); ++out: ++ return ret ? ret : count; ++} ++ +Index: linux-2.6-tip/kernel/trace/events.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/events.c +@@ -0,0 +1,14 @@ ++/* ++ * This is the place to register all trace points as events. ++ */ ++ ++#include ++ ++#include ++ ++#include "trace_output.h" ++ ++#include "trace_events_stage_1.h" ++#include "trace_events_stage_2.h" ++#include "trace_events_stage_3.h" ++ +Index: linux-2.6-tip/kernel/trace/ftrace.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/ftrace.c ++++ linux-2.6-tip/kernel/trace/ftrace.c +@@ -27,10 +27,14 @@ + #include + #include + #include ++#include ++ ++#include + + #include + +-#include "trace.h" ++#include "trace_output.h" ++#include "trace_stat.h" + + #define FTRACE_WARN_ON(cond) \ + do { \ +@@ -44,14 +48,14 @@ + ftrace_kill(); \ + } while (0) + ++/* hash bits for specific function selection */ ++#define FTRACE_HASH_BITS 7 ++#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) ++ + /* ftrace_enabled is a method to turn ftrace on or off */ + int ftrace_enabled __read_mostly; + static int last_ftrace_enabled; + +-/* set when tracing only a pid */ +-struct pid *ftrace_pid_trace; +-static struct pid * const ftrace_swapper_pid = &init_struct_pid; +- + /* Quick disabling of function tracer. 
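As an illustrative aside (not part of the patch): blk_trace_str2mask() and blk_trace_mask2str() above translate between the act_mask bitmask and a comma-separated list of category names, and the store path falls back to name parsing when the input is not a hex number; one unknown name rejects the whole string. The userspace sketch below round-trips a mask the same way; the category names and bit values are invented.

/* Illustrative userspace sketch, not part of the patch. */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

static const struct { int mask; const char *str; } maps[] = {
	{ 1 << 0, "read" }, { 1 << 1, "write" }, { 1 << 2, "sync" }, { 1 << 3, "meta" },
};
#define NMAPS (sizeof(maps) / sizeof(maps[0]))

static int str2mask(const char *str)
{
	char *dup = strdup(str), *s = dup, *tok;
	int mask = 0;
	size_t i;

	if (!dup)
		return -1;
	while ((tok = strsep(&s, ",")) != NULL) {
		if (*tok == '\0')
			continue;
		for (i = 0; i < NMAPS; i++)
			if (strcasecmp(tok, maps[i].str) == 0) {
				mask |= maps[i].mask;
				break;
			}
		if (i == NMAPS) {		/* unknown name: reject the whole string */
			mask = -1;
			break;
		}
	}
	free(dup);
	return mask;
}

static void mask2str(int mask, char *buf)
{
	char *p = buf;
	size_t i;

	for (i = 0; i < NMAPS; i++)
		if (mask & maps[i].mask)
			p += sprintf(p, "%s%s", p == buf ? "" : ",", maps[i].str);
	*p = '\0';
}

int main(void)
{
	char buf[64];
	int mask = str2mask("read,SYNC");

	mask2str(mask, buf);
	printf("mask %#x -> \"%s\"\n", (unsigned)mask, buf);	/* 0x5 -> "read,sync" */
	printf("unknown name -> %d\n", str2mask("read,bogus"));
	return 0;
}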
*/ + int function_trace_stop; + +@@ -61,9 +65,7 @@ int function_trace_stop; + */ + static int ftrace_disabled __read_mostly; + +-static DEFINE_SPINLOCK(ftrace_lock); +-static DEFINE_MUTEX(ftrace_sysctl_lock); +-static DEFINE_MUTEX(ftrace_start_lock); ++static DEFINE_MUTEX(ftrace_lock); + + static struct ftrace_ops ftrace_list_end __read_mostly = + { +@@ -134,9 +136,6 @@ static void ftrace_test_stop_func(unsign + + static int __register_ftrace_function(struct ftrace_ops *ops) + { +- /* should not be called from interrupt context */ +- spin_lock(&ftrace_lock); +- + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another +@@ -172,18 +171,12 @@ static int __register_ftrace_function(st + #endif + } + +- spin_unlock(&ftrace_lock); +- + return 0; + } + + static int __unregister_ftrace_function(struct ftrace_ops *ops) + { + struct ftrace_ops **p; +- int ret = 0; +- +- /* should not be called from interrupt context */ +- spin_lock(&ftrace_lock); + + /* + * If we are removing the last function, then simply point +@@ -192,17 +185,15 @@ static int __unregister_ftrace_function( + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; +- goto out; ++ return 0; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + +- if (*p != ops) { +- ret = -1; +- goto out; +- } ++ if (*p != ops) ++ return -1; + + *p = (*p)->next; + +@@ -223,21 +214,15 @@ static int __unregister_ftrace_function( + } + } + +- out: +- spin_unlock(&ftrace_lock); +- +- return ret; ++ return 0; + } + + static void ftrace_update_pid_func(void) + { + ftrace_func_t func; + +- /* should not be called from interrupt context */ +- spin_lock(&ftrace_lock); +- + if (ftrace_trace_function == ftrace_stub) +- goto out; ++ return; + + func = ftrace_trace_function; + +@@ -254,498 +239,1103 @@ static void ftrace_update_pid_func(void) + #else + __ftrace_trace_function = func; + #endif +- +- out: +- spin_unlock(&ftrace_lock); + } + +-#ifdef CONFIG_DYNAMIC_FTRACE +-#ifndef CONFIG_FTRACE_MCOUNT_RECORD +-# error Dynamic ftrace depends on MCOUNT_RECORD ++#ifdef CONFIG_FUNCTION_PROFILER ++struct ftrace_profile { ++ struct hlist_node node; ++ unsigned long ip; ++ unsigned long counter; ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ unsigned long long time; + #endif +- +-/* +- * Since MCOUNT_ADDR may point to mcount itself, we do not want +- * to get it confused by reading a reference in the code as we +- * are parsing on objcopy output of text. Use a variable for +- * it instead. 
+- */ +-static unsigned long mcount_addr = MCOUNT_ADDR; +- +-enum { +- FTRACE_ENABLE_CALLS = (1 << 0), +- FTRACE_DISABLE_CALLS = (1 << 1), +- FTRACE_UPDATE_TRACE_FUNC = (1 << 2), +- FTRACE_ENABLE_MCOUNT = (1 << 3), +- FTRACE_DISABLE_MCOUNT = (1 << 4), +- FTRACE_START_FUNC_RET = (1 << 5), +- FTRACE_STOP_FUNC_RET = (1 << 6), + }; + +-static int ftrace_filtered; +- +-static LIST_HEAD(ftrace_new_addrs); +- +-static DEFINE_MUTEX(ftrace_regex_lock); ++struct ftrace_profile_page { ++ struct ftrace_profile_page *next; ++ unsigned long index; ++ struct ftrace_profile records[]; ++}; + +-struct ftrace_page { +- struct ftrace_page *next; +- unsigned long index; +- struct dyn_ftrace records[]; ++struct ftrace_profile_stat { ++ atomic_t disabled; ++ struct hlist_head *hash; ++ struct ftrace_profile_page *pages; ++ struct ftrace_profile_page *start; ++ struct tracer_stat stat; + }; + +-#define ENTRIES_PER_PAGE \ +- ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) ++#define PROFILE_RECORDS_SIZE \ ++ (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +-/* estimate from running different kernels */ +-#define NR_TO_INIT 10000 ++#define PROFILES_PER_PAGE \ ++ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +-static struct ftrace_page *ftrace_pages_start; +-static struct ftrace_page *ftrace_pages; ++static int ftrace_profile_bits; ++static int ftrace_profile_enabled; ++static DEFINE_MUTEX(ftrace_profile_lock); + +-static struct dyn_ftrace *ftrace_free_records; ++static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); + ++#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +-#ifdef CONFIG_KPROBES ++static void * ++function_stat_next(void *v, int idx) ++{ ++ struct ftrace_profile *rec = v; ++ struct ftrace_profile_page *pg; + +-static int frozen_record_count; ++ pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + +-static inline void freeze_record(struct dyn_ftrace *rec) +-{ +- if (!(rec->flags & FTRACE_FL_FROZEN)) { +- rec->flags |= FTRACE_FL_FROZEN; +- frozen_record_count++; ++ again: ++ rec++; ++ if ((void *)rec >= (void *)&pg->records[pg->index]) { ++ pg = pg->next; ++ if (!pg) ++ return NULL; ++ rec = &pg->records[0]; ++ if (!rec->counter) ++ goto again; + } ++ ++ return rec; + } + +-static inline void unfreeze_record(struct dyn_ftrace *rec) ++static void *function_stat_start(struct tracer_stat *trace) + { +- if (rec->flags & FTRACE_FL_FROZEN) { +- rec->flags &= ~FTRACE_FL_FROZEN; +- frozen_record_count--; +- } ++ struct ftrace_profile_stat *stat = ++ container_of(trace, struct ftrace_profile_stat, stat); ++ ++ if (!stat || !stat->start) ++ return NULL; ++ ++ return function_stat_next(&stat->start->records[0], 0); + } + +-static inline int record_frozen(struct dyn_ftrace *rec) ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++/* function graph compares on total time */ ++static int function_stat_cmp(void *p1, void *p2) + { +- return rec->flags & FTRACE_FL_FROZEN; ++ struct ftrace_profile *a = p1; ++ struct ftrace_profile *b = p2; ++ ++ if (a->time < b->time) ++ return -1; ++ if (a->time > b->time) ++ return 1; ++ else ++ return 0; + } + #else +-# define freeze_record(rec) ({ 0; }) +-# define unfreeze_record(rec) ({ 0; }) +-# define record_frozen(rec) ({ 0; }) +-#endif /* CONFIG_KPROBES */ ++/* not function graph compares against hits */ ++static int function_stat_cmp(void *p1, void *p2) ++{ ++ struct ftrace_profile *a = p1; ++ struct ftrace_profile *b = p2; + +-static void ftrace_free_rec(struct dyn_ftrace *rec) ++ if 
(a->counter < b->counter) ++ return -1; ++ if (a->counter > b->counter) ++ return 1; ++ else ++ return 0; ++} ++#endif ++ ++static int function_stat_headers(struct seq_file *m) + { +- rec->ip = (unsigned long)ftrace_free_records; +- ftrace_free_records = rec; +- rec->flags |= FTRACE_FL_FREE; ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ seq_printf(m, " Function " ++ "Hit Time Avg\n" ++ " -------- " ++ "--- ---- ---\n"); ++#else ++ seq_printf(m, " Function Hit\n" ++ " -------- ---\n"); ++#endif ++ return 0; + } + +-void ftrace_release(void *start, unsigned long size) ++static int function_stat_show(struct seq_file *m, void *v) + { +- struct dyn_ftrace *rec; +- struct ftrace_page *pg; +- unsigned long s = (unsigned long)start; +- unsigned long e = s + size; +- int i; ++ struct ftrace_profile *rec = v; ++ char str[KSYM_SYMBOL_LEN]; ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ static DEFINE_MUTEX(mutex); ++ static struct trace_seq s; ++ unsigned long long avg; ++#endif + +- if (ftrace_disabled || !start) +- return; ++ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); ++ seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +- /* should not be called from interrupt context */ +- spin_lock(&ftrace_lock); ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ seq_printf(m, " "); ++ avg = rec->time; ++ if (rec->counter) ++ do_div(avg, rec->counter); ++ ++ mutex_lock(&mutex); ++ trace_seq_init(&s); ++ trace_print_graph_duration(rec->time, &s); ++ trace_seq_puts(&s, " "); ++ trace_print_graph_duration(avg, &s); ++ trace_print_seq(m, &s); ++ mutex_unlock(&mutex); ++#endif ++ seq_putc(m, '\n'); + +- for (pg = ftrace_pages_start; pg; pg = pg->next) { +- for (i = 0; i < pg->index; i++) { +- rec = &pg->records[i]; ++ return 0; ++} + +- if ((rec->ip >= s) && (rec->ip < e)) +- ftrace_free_rec(rec); +- } ++static void ftrace_profile_reset(struct ftrace_profile_stat *stat) ++{ ++ struct ftrace_profile_page *pg; ++ ++ pg = stat->pages = stat->start; ++ ++ while (pg) { ++ memset(pg->records, 0, PROFILE_RECORDS_SIZE); ++ pg->index = 0; ++ pg = pg->next; + } +- spin_unlock(&ftrace_lock); ++ ++ memset(stat->hash, 0, ++ FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); + } + +-static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) ++int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) + { +- struct dyn_ftrace *rec; ++ struct ftrace_profile_page *pg; ++ int functions; ++ int pages; ++ int i; + +- /* First check for freed records */ +- if (ftrace_free_records) { +- rec = ftrace_free_records; ++ /* If we already allocated, do nothing */ ++ if (stat->pages) ++ return 0; + +- if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { +- FTRACE_WARN_ON_ONCE(1); +- ftrace_free_records = NULL; +- return NULL; +- } ++ stat->pages = (void *)get_zeroed_page(GFP_KERNEL); ++ if (!stat->pages) ++ return -ENOMEM; + +- ftrace_free_records = (void *)rec->ip; +- memset(rec, 0, sizeof(*rec)); +- return rec; ++#ifdef CONFIG_DYNAMIC_FTRACE ++ functions = ftrace_update_tot_cnt; ++#else ++ /* ++ * We do not know the number of functions that exist because ++ * dynamic tracing is what counts them. With past experience ++ * we have around 20K functions. That should be more than enough. ++ * It is highly unlikely we will execute every function in ++ * the kernel. 
++ */ ++ functions = 20000; ++#endif ++ ++ pg = stat->start = stat->pages; ++ ++ pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); ++ ++ for (i = 0; i < pages; i++) { ++ pg->next = (void *)get_zeroed_page(GFP_KERNEL); ++ if (!pg->next) ++ goto out_free; ++ pg = pg->next; + } + +- if (ftrace_pages->index == ENTRIES_PER_PAGE) { +- if (!ftrace_pages->next) { +- /* allocate another page */ +- ftrace_pages->next = +- (void *)get_zeroed_page(GFP_KERNEL); +- if (!ftrace_pages->next) +- return NULL; +- } +- ftrace_pages = ftrace_pages->next; ++ return 0; ++ ++ out_free: ++ pg = stat->start; ++ while (pg) { ++ unsigned long tmp = (unsigned long)pg; ++ ++ pg = pg->next; ++ free_page(tmp); + } + +- return &ftrace_pages->records[ftrace_pages->index++]; ++ free_page((unsigned long)stat->pages); ++ stat->pages = NULL; ++ stat->start = NULL; ++ ++ return -ENOMEM; + } + +-static struct dyn_ftrace * +-ftrace_record_ip(unsigned long ip) ++static int ftrace_profile_init_cpu(int cpu) + { +- struct dyn_ftrace *rec; ++ struct ftrace_profile_stat *stat; ++ int size; + +- if (ftrace_disabled) +- return NULL; ++ stat = &per_cpu(ftrace_profile_stats, cpu); + +- rec = ftrace_alloc_dyn_node(ip); +- if (!rec) +- return NULL; ++ if (stat->hash) { ++ /* If the profile is already created, simply reset it */ ++ ftrace_profile_reset(stat); ++ return 0; ++ } + +- rec->ip = ip; ++ /* ++ * We are profiling all functions, but usually only a few thousand ++ * functions are hit. We'll make a hash of 1024 items. ++ */ ++ size = FTRACE_PROFILE_HASH_SIZE; + +- list_add(&rec->list, &ftrace_new_addrs); ++ stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + +- return rec; +-} ++ if (!stat->hash) ++ return -ENOMEM; + +-static void print_ip_ins(const char *fmt, unsigned char *p) +-{ +- int i; ++ if (!ftrace_profile_bits) { ++ size--; + +- printk(KERN_CONT "%s", fmt); ++ for (; size; size >>= 1) ++ ftrace_profile_bits++; ++ } + +- for (i = 0; i < MCOUNT_INSN_SIZE; i++) +- printk(KERN_CONT "%s%02x", i ? 
":" : "", p[i]); ++ /* Preallocate the function profiling pages */ ++ if (ftrace_profile_pages_init(stat) < 0) { ++ kfree(stat->hash); ++ stat->hash = NULL; ++ return -ENOMEM; ++ } ++ ++ return 0; + } + +-static void ftrace_bug(int failed, unsigned long ip) ++static int ftrace_profile_init(void) + { +- switch (failed) { +- case -EFAULT: +- FTRACE_WARN_ON_ONCE(1); +- pr_info("ftrace faulted on modifying "); +- print_ip_sym(ip); +- break; +- case -EINVAL: +- FTRACE_WARN_ON_ONCE(1); +- pr_info("ftrace failed to modify "); +- print_ip_sym(ip); +- print_ip_ins(" actual: ", (unsigned char *)ip); +- printk(KERN_CONT "\n"); +- break; +- case -EPERM: +- FTRACE_WARN_ON_ONCE(1); +- pr_info("ftrace faulted on writing "); +- print_ip_sym(ip); +- break; +- default: +- FTRACE_WARN_ON_ONCE(1); +- pr_info("ftrace faulted on unknown error "); +- print_ip_sym(ip); ++ int cpu; ++ int ret = 0; ++ ++ for_each_online_cpu(cpu) { ++ ret = ftrace_profile_init_cpu(cpu); ++ if (ret) ++ break; + } ++ ++ return ret; + } + ++/* interrupts must be disabled */ ++static struct ftrace_profile * ++ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) ++{ ++ struct ftrace_profile *rec; ++ struct hlist_head *hhd; ++ struct hlist_node *n; ++ unsigned long key; + +-static int +-__ftrace_replace_code(struct dyn_ftrace *rec, int enable) ++ key = hash_long(ip, ftrace_profile_bits); ++ hhd = &stat->hash[key]; ++ ++ if (hlist_empty(hhd)) ++ return NULL; ++ ++ hlist_for_each_entry_rcu(rec, n, hhd, node) { ++ if (rec->ip == ip) ++ return rec; ++ } ++ ++ return NULL; ++} ++ ++static void ftrace_add_profile(struct ftrace_profile_stat *stat, ++ struct ftrace_profile *rec) + { +- unsigned long ip, fl; +- unsigned long ftrace_addr; ++ unsigned long key; + +- ftrace_addr = (unsigned long)ftrace_caller; ++ key = hash_long(rec->ip, ftrace_profile_bits); ++ hlist_add_head_rcu(&rec->node, &stat->hash[key]); ++} + +- ip = rec->ip; ++/* ++ * The memory is already allocated, this simply finds a new record to use. ++ */ ++static struct ftrace_profile * ++ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) ++{ ++ struct ftrace_profile *rec = NULL; ++ ++ /* prevent recursion (from NMIs) */ ++ if (atomic_inc_return(&stat->disabled) != 1) ++ goto out; + + /* +- * If this record is not to be traced and +- * it is not enabled then do nothing. +- * +- * If this record is not to be traced and +- * it is enabled then disabled it. 
+- * ++ * Try to find the function again since an NMI ++ * could have added it + */ +- if (rec->flags & FTRACE_FL_NOTRACE) { +- if (rec->flags & FTRACE_FL_ENABLED) +- rec->flags &= ~FTRACE_FL_ENABLED; +- else +- return 0; ++ rec = ftrace_find_profiled_func(stat, ip); ++ if (rec) ++ goto out; + +- } else if (ftrace_filtered && enable) { +- /* +- * Filtering is on: +- */ +- +- fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); ++ if (stat->pages->index == PROFILES_PER_PAGE) { ++ if (!stat->pages->next) ++ goto out; ++ stat->pages = stat->pages->next; ++ } + +- /* Record is filtered and enabled, do nothing */ +- if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) +- return 0; ++ rec = &stat->pages->records[stat->pages->index++]; ++ rec->ip = ip; ++ ftrace_add_profile(stat, rec); + +- /* Record is not filtered and is not enabled do nothing */ +- if (!fl) +- return 0; ++ out: ++ atomic_dec(&stat->disabled); + +- /* Record is not filtered but enabled, disable it */ +- if (fl == FTRACE_FL_ENABLED) +- rec->flags &= ~FTRACE_FL_ENABLED; +- else +- /* Otherwise record is filtered but not enabled, enable it */ +- rec->flags |= FTRACE_FL_ENABLED; +- } else { +- /* Disable or not filtered */ ++ return rec; ++} + +- if (enable) { +- /* if record is enabled, do nothing */ +- if (rec->flags & FTRACE_FL_ENABLED) +- return 0; ++static void ++function_profile_call(unsigned long ip, unsigned long parent_ip) ++{ ++ struct ftrace_profile_stat *stat; ++ struct ftrace_profile *rec; ++ unsigned long flags; + +- rec->flags |= FTRACE_FL_ENABLED; ++ if (!ftrace_profile_enabled) ++ return; + +- } else { ++ local_irq_save(flags); + +- /* if record is not enabled do nothing */ +- if (!(rec->flags & FTRACE_FL_ENABLED)) +- return 0; ++ stat = &__get_cpu_var(ftrace_profile_stats); ++ if (!stat->hash || !ftrace_profile_enabled) ++ goto out; + +- rec->flags &= ~FTRACE_FL_ENABLED; +- } ++ rec = ftrace_find_profiled_func(stat, ip); ++ if (!rec) { ++ rec = ftrace_profile_alloc(stat, ip); ++ if (!rec) ++ goto out; + } + +- if (rec->flags & FTRACE_FL_ENABLED) +- return ftrace_make_call(rec, ftrace_addr); +- else +- return ftrace_make_nop(NULL, rec, ftrace_addr); ++ rec->counter++; ++ out: ++ local_irq_restore(flags); + } + +-static void ftrace_replace_code(int enable) ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++static int profile_graph_entry(struct ftrace_graph_ent *trace) + { +- int i, failed; +- struct dyn_ftrace *rec; +- struct ftrace_page *pg; +- +- for (pg = ftrace_pages_start; pg; pg = pg->next) { +- for (i = 0; i < pg->index; i++) { +- rec = &pg->records[i]; +- +- /* +- * Skip over free records and records that have +- * failed. 
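As an illustrative aside (not part of the patch): function_profile_call() above looks each instruction pointer up with ftrace_find_profiled_func(), which hashes the address and walks a single bucket, and ftrace_add_profile() inserts new records at the bucket head. A toy userspace version of that per-ip hash (the hash function and sizes are invented; the kernel uses hash_long() and hlists):

/* Illustrative userspace sketch, not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

#define HASH_SIZE 1024				/* must stay a power of two */

struct profile {
	struct profile *next;
	unsigned long ip;
	unsigned long counter;
};

static struct profile *hash[HASH_SIZE];

static unsigned long hash_ip(unsigned long ip)
{
	return (ip >> 4) & (HASH_SIZE - 1);	/* toy hash; the kernel uses hash_long() */
}

static struct profile *find(unsigned long ip)
{
	struct profile *rec;

	for (rec = hash[hash_ip(ip)]; rec; rec = rec->next)
		if (rec->ip == ip)
			return rec;
	return NULL;
}

static struct profile *find_or_add(unsigned long ip)
{
	struct profile *rec = find(ip);

	if (!rec) {
		rec = calloc(1, sizeof(*rec));	/* allocation assumed to succeed here */
		rec->ip = ip;
		rec->next = hash[hash_ip(ip)];
		hash[hash_ip(ip)] = rec;
	}
	return rec;
}

int main(void)
{
	find_or_add(0xc0de)->counter++;
	find_or_add(0xc0de)->counter++;
	printf("0xc0de hit %lu times\n", find(0xc0de)->counter);
	return 0;
}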
+- */ +- if (rec->flags & FTRACE_FL_FREE || +- rec->flags & FTRACE_FL_FAILED) +- continue; +- +- /* ignore updates to this record's mcount site */ +- if (get_kprobe((void *)rec->ip)) { +- freeze_record(rec); +- continue; +- } else { +- unfreeze_record(rec); +- } +- +- failed = __ftrace_replace_code(rec, enable); +- if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { +- rec->flags |= FTRACE_FL_FAILED; +- if ((system_state == SYSTEM_BOOTING) || +- !core_kernel_text(rec->ip)) { +- ftrace_free_rec(rec); +- } else +- ftrace_bug(failed, rec->ip); +- } +- } +- } ++ function_profile_call(trace->func, 0); ++ return 1; + } + +-static int +-ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ++static void profile_graph_return(struct ftrace_graph_ret *trace) + { +- unsigned long ip; +- int ret; ++ struct ftrace_profile_stat *stat; ++ unsigned long long calltime; ++ struct ftrace_profile *rec; ++ unsigned long flags; + +- ip = rec->ip; ++ local_irq_save(flags); ++ stat = &__get_cpu_var(ftrace_profile_stats); ++ if (!stat->hash || !ftrace_profile_enabled) ++ goto out; + +- ret = ftrace_make_nop(mod, rec, mcount_addr); +- if (ret) { +- ftrace_bug(ret, ip); +- rec->flags |= FTRACE_FL_FAILED; +- return 0; +- } +- return 1; +-} ++ calltime = trace->rettime - trace->calltime; + +-static int __ftrace_modify_code(void *data) +-{ +- int *command = data; ++ if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { ++ int index; + +- if (*command & FTRACE_ENABLE_CALLS) +- ftrace_replace_code(1); +- else if (*command & FTRACE_DISABLE_CALLS) +- ftrace_replace_code(0); ++ index = trace->depth; + +- if (*command & FTRACE_UPDATE_TRACE_FUNC) +- ftrace_update_ftrace_func(ftrace_trace_function); ++ /* Append this call time to the parent time to subtract */ ++ if (index) ++ current->ret_stack[index - 1].subtime += calltime; + +- if (*command & FTRACE_START_FUNC_RET) +- ftrace_enable_ftrace_graph_caller(); +- else if (*command & FTRACE_STOP_FUNC_RET) +- ftrace_disable_ftrace_graph_caller(); ++ if (current->ret_stack[index].subtime < calltime) ++ calltime -= current->ret_stack[index].subtime; ++ else ++ calltime = 0; ++ } + +- return 0; ++ rec = ftrace_find_profiled_func(stat, trace->func); ++ if (rec) ++ rec->time += calltime; ++ ++ out: ++ local_irq_restore(flags); + } + +-static void ftrace_run_update_code(int command) ++static int register_ftrace_profiler(void) + { +- stop_machine(__ftrace_modify_code, &command, NULL); ++ return register_ftrace_graph(&profile_graph_return, ++ &profile_graph_entry); + } + +-static ftrace_func_t saved_ftrace_func; +-static int ftrace_start_up; +- +-static void ftrace_startup_enable(int command) ++static void unregister_ftrace_profiler(void) + { +- if (saved_ftrace_func != ftrace_trace_function) { +- saved_ftrace_func = ftrace_trace_function; +- command |= FTRACE_UPDATE_TRACE_FUNC; +- } ++ unregister_ftrace_graph(); ++} ++#else ++static struct ftrace_ops ftrace_profile_ops __read_mostly = ++{ ++ .func = function_profile_call, ++}; + +- if (!command || !ftrace_enabled) +- return; ++static int register_ftrace_profiler(void) ++{ ++ return register_ftrace_function(&ftrace_profile_ops); ++} + +- ftrace_run_update_code(command); ++static void unregister_ftrace_profiler(void) ++{ ++ unregister_ftrace_function(&ftrace_profile_ops); + } ++#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +-static void ftrace_startup(int command) ++static ssize_t ++ftrace_profile_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) + { +- if (unlikely(ftrace_disabled)) +- return; ++ unsigned long 
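As an illustrative aside (not part of the patch): profile_graph_return() above charges each completed call's total time to the parent frame's subtime and subtracts its own accumulated subtime, so every function is credited with self time rather than the time spent in its children (this is the path taken when graph time is not folded in). A small model of that bookkeeping with invented timestamps:

/* Illustrative userspace sketch, not part of the patch. */
#include <stdio.h>

struct frame {
	unsigned long long calltime;
	unsigned long long subtime;
};

static struct frame stack[16];

/* called when the function at depth 'depth' returns at time 'now' */
static unsigned long long on_return(int depth, unsigned long long now)
{
	unsigned long long total = now - stack[depth].calltime;
	unsigned long long self = total;

	if (self > stack[depth].subtime)
		self -= stack[depth].subtime;		/* drop time spent in children */
	else
		self = 0;

	if (depth > 0)
		stack[depth - 1].subtime += total;	/* charge the whole call to the parent */

	stack[depth].subtime = 0;			/* the frame slot can be reused */
	return self;
}

int main(void)
{
	stack[0] = (struct frame){ .calltime = 0 };	/* parent enters at t=0  */
	stack[1] = (struct frame){ .calltime = 10 };	/* child enters at t=10 */

	printf("child self time:  %llu\n", on_return(1, 40));	/* 30 */
	printf("parent self time: %llu\n", on_return(0, 50));	/* 50 - 30 = 20 */
	return 0;
}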
val; ++ char buf[64]; ++ int ret; + +- mutex_lock(&ftrace_start_lock); +- ftrace_start_up++; +- command |= FTRACE_ENABLE_CALLS; ++ if (cnt >= sizeof(buf)) ++ return -EINVAL; + +- ftrace_startup_enable(command); ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; + +- mutex_unlock(&ftrace_start_lock); +-} ++ buf[cnt] = 0; + +-static void ftrace_shutdown(int command) +-{ +- if (unlikely(ftrace_disabled)) +- return; ++ ret = strict_strtoul(buf, 10, &val); ++ if (ret < 0) ++ return ret; + +- mutex_lock(&ftrace_start_lock); +- ftrace_start_up--; +- if (!ftrace_start_up) +- command |= FTRACE_DISABLE_CALLS; ++ val = !!val; + +- if (saved_ftrace_func != ftrace_trace_function) { +- saved_ftrace_func = ftrace_trace_function; +- command |= FTRACE_UPDATE_TRACE_FUNC; ++ mutex_lock(&ftrace_profile_lock); ++ if (ftrace_profile_enabled ^ val) { ++ if (val) { ++ ret = ftrace_profile_init(); ++ if (ret < 0) { ++ cnt = ret; ++ goto out; ++ } ++ ++ ret = register_ftrace_profiler(); ++ if (ret < 0) { ++ cnt = ret; ++ goto out; ++ } ++ ftrace_profile_enabled = 1; ++ } else { ++ ftrace_profile_enabled = 0; ++ /* ++ * unregister_ftrace_profiler calls stop_machine ++ * so this acts like an synchronize_sched. ++ */ ++ unregister_ftrace_profiler(); ++ } + } ++ out: ++ mutex_unlock(&ftrace_profile_lock); + +- if (!command || !ftrace_enabled) +- goto out; ++ filp->f_pos += cnt; + +- ftrace_run_update_code(command); +- out: +- mutex_unlock(&ftrace_start_lock); ++ return cnt; + } + +-static void ftrace_startup_sysctl(void) ++static ssize_t ++ftrace_profile_read(struct file *filp, char __user *ubuf, ++ size_t cnt, loff_t *ppos) + { +- int command = FTRACE_ENABLE_MCOUNT; ++ char buf[64]; ++ int r; + +- if (unlikely(ftrace_disabled)) +- return; ++ r = sprintf(buf, "%u\n", ftrace_profile_enabled); ++ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); ++} + +- mutex_lock(&ftrace_start_lock); +- /* Force update next time */ +- saved_ftrace_func = NULL; +- /* ftrace_start_up is true if we want ftrace running */ +- if (ftrace_start_up) +- command |= FTRACE_ENABLE_CALLS; ++static const struct file_operations ftrace_profile_fops = { ++ .open = tracing_open_generic, ++ .read = ftrace_profile_read, ++ .write = ftrace_profile_write, ++}; + +- ftrace_run_update_code(command); +- mutex_unlock(&ftrace_start_lock); +-} ++/* used to initialize the real stat files */ ++static struct tracer_stat function_stats __initdata = { ++ .name = "functions", ++ .stat_start = function_stat_start, ++ .stat_next = function_stat_next, ++ .stat_cmp = function_stat_cmp, ++ .stat_headers = function_stat_headers, ++ .stat_show = function_stat_show ++}; + +-static void ftrace_shutdown_sysctl(void) ++static void ftrace_profile_debugfs(struct dentry *d_tracer) + { +- int command = FTRACE_DISABLE_MCOUNT; ++ struct ftrace_profile_stat *stat; ++ struct dentry *entry; ++ char *name; ++ int ret; ++ int cpu; + +- if (unlikely(ftrace_disabled)) +- return; ++ for_each_possible_cpu(cpu) { ++ stat = &per_cpu(ftrace_profile_stats, cpu); + +- mutex_lock(&ftrace_start_lock); +- /* ftrace_start_up is true if ftrace is running */ +- if (ftrace_start_up) +- command |= FTRACE_DISABLE_CALLS; ++ /* allocate enough for function name + cpu number */ ++ name = kmalloc(32, GFP_KERNEL); ++ if (!name) { ++ /* ++ * The files created are permanent, if something happens ++ * we still do not free memory. 
++ */ ++ kfree(stat); ++ WARN(1, ++ "Could not allocate stat file for cpu %d\n", ++ cpu); ++ return; ++ } ++ stat->stat = function_stats; ++ snprintf(name, 32, "function%d", cpu); ++ stat->stat.name = name; ++ ret = register_stat_tracer(&stat->stat); ++ if (ret) { ++ WARN(1, ++ "Could not register function stat for cpu %d\n", ++ cpu); ++ kfree(name); ++ return; ++ } ++ } + +- ftrace_run_update_code(command); +- mutex_unlock(&ftrace_start_lock); ++ entry = debugfs_create_file("function_profile_enabled", 0644, ++ d_tracer, NULL, &ftrace_profile_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs " ++ "'function_profile_enabled' entry\n"); + } + +-static cycle_t ftrace_update_time; +-static unsigned long ftrace_update_cnt; +-unsigned long ftrace_update_tot_cnt; +- +-static int ftrace_update_code(struct module *mod) ++#else /* CONFIG_FUNCTION_PROFILER */ ++static void ftrace_profile_debugfs(struct dentry *d_tracer) + { +- struct dyn_ftrace *p, *t; +- cycle_t start, stop; ++} ++#endif /* CONFIG_FUNCTION_PROFILER */ + +- start = ftrace_now(raw_smp_processor_id()); +- ftrace_update_cnt = 0; ++/* set when tracing only a pid */ ++struct pid *ftrace_pid_trace; ++static struct pid * const ftrace_swapper_pid = &init_struct_pid; + +- list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { ++#ifdef CONFIG_DYNAMIC_FTRACE + +- /* If something went wrong, bail without enabling anything */ +- if (unlikely(ftrace_disabled)) +- return -1; ++#ifndef CONFIG_FTRACE_MCOUNT_RECORD ++# error Dynamic ftrace depends on MCOUNT_RECORD ++#endif + +- list_del_init(&p->list); ++static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +- /* convert record (i.e, patch mcount-call with NOP) */ +- if (ftrace_code_disable(mod, p)) { +- p->flags |= FTRACE_FL_CONVERTED; +- ftrace_update_cnt++; +- } else +- ftrace_free_rec(p); +- } ++struct ftrace_func_probe { ++ struct hlist_node node; ++ struct ftrace_probe_ops *ops; ++ unsigned long flags; ++ unsigned long ip; ++ void *data; ++ struct rcu_head rcu; ++}; + +- stop = ftrace_now(raw_smp_processor_id()); +- ftrace_update_time = stop - start; +- ftrace_update_tot_cnt += ftrace_update_cnt; ++enum { ++ FTRACE_ENABLE_CALLS = (1 << 0), ++ FTRACE_DISABLE_CALLS = (1 << 1), ++ FTRACE_UPDATE_TRACE_FUNC = (1 << 2), ++ FTRACE_ENABLE_MCOUNT = (1 << 3), ++ FTRACE_DISABLE_MCOUNT = (1 << 4), ++ FTRACE_START_FUNC_RET = (1 << 5), ++ FTRACE_STOP_FUNC_RET = (1 << 6), ++}; + +- return 0; +-} ++static int ftrace_filtered; + +-static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +-{ +- struct ftrace_page *pg; +- int cnt; +- int i; ++static struct dyn_ftrace *ftrace_new_addrs; + +- /* allocate a few pages */ +- ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); +- if (!ftrace_pages_start) +- return -1; ++static DEFINE_MUTEX(ftrace_regex_lock); + +- /* +- * Allocate a few more pages. +- * ++struct ftrace_page { ++ struct ftrace_page *next; ++ int index; ++ struct dyn_ftrace records[]; ++}; ++ ++#define ENTRIES_PER_PAGE \ ++ ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) ++ ++/* estimate from running different kernels */ ++#define NR_TO_INIT 10000 ++ ++static struct ftrace_page *ftrace_pages_start; ++static struct ftrace_page *ftrace_pages; ++ ++static struct dyn_ftrace *ftrace_free_records; ++ ++/* ++ * This is a double for. Do not use 'break' to break out of the loop, ++ * you must use a goto. 
++ */ ++#define do_for_each_ftrace_rec(pg, rec) \ ++ for (pg = ftrace_pages_start; pg; pg = pg->next) { \ ++ int _____i; \ ++ for (_____i = 0; _____i < pg->index; _____i++) { \ ++ rec = &pg->records[_____i]; ++ ++#define while_for_each_ftrace_rec() \ ++ } \ ++ } ++ ++#ifdef CONFIG_KPROBES ++ ++static int frozen_record_count; ++ ++static inline void freeze_record(struct dyn_ftrace *rec) ++{ ++ if (!(rec->flags & FTRACE_FL_FROZEN)) { ++ rec->flags |= FTRACE_FL_FROZEN; ++ frozen_record_count++; ++ } ++} ++ ++static inline void unfreeze_record(struct dyn_ftrace *rec) ++{ ++ if (rec->flags & FTRACE_FL_FROZEN) { ++ rec->flags &= ~FTRACE_FL_FROZEN; ++ frozen_record_count--; ++ } ++} ++ ++static inline int record_frozen(struct dyn_ftrace *rec) ++{ ++ return rec->flags & FTRACE_FL_FROZEN; ++} ++#else ++# define freeze_record(rec) ({ 0; }) ++# define unfreeze_record(rec) ({ 0; }) ++# define record_frozen(rec) ({ 0; }) ++#endif /* CONFIG_KPROBES */ ++ ++static void ftrace_free_rec(struct dyn_ftrace *rec) ++{ ++ rec->freelist = ftrace_free_records; ++ ftrace_free_records = rec; ++ rec->flags |= FTRACE_FL_FREE; ++} ++ ++void ftrace_release(void *start, unsigned long size) ++{ ++ struct dyn_ftrace *rec; ++ struct ftrace_page *pg; ++ unsigned long s = (unsigned long)start; ++ unsigned long e = s + size; ++ ++ if (ftrace_disabled || !start) ++ return; ++ ++ mutex_lock(&ftrace_lock); ++ do_for_each_ftrace_rec(pg, rec) { ++ if ((rec->ip >= s) && (rec->ip < e) && ++ !(rec->flags & FTRACE_FL_FREE)) ++ ftrace_free_rec(rec); ++ } while_for_each_ftrace_rec(); ++ mutex_unlock(&ftrace_lock); ++} ++ ++static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) ++{ ++ struct dyn_ftrace *rec; ++ ++ /* First check for freed records */ ++ if (ftrace_free_records) { ++ rec = ftrace_free_records; ++ ++ if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { ++ FTRACE_WARN_ON_ONCE(1); ++ ftrace_free_records = NULL; ++ return NULL; ++ } ++ ++ ftrace_free_records = rec->freelist; ++ memset(rec, 0, sizeof(*rec)); ++ return rec; ++ } ++ ++ if (ftrace_pages->index == ENTRIES_PER_PAGE) { ++ if (!ftrace_pages->next) { ++ /* allocate another page */ ++ ftrace_pages->next = ++ (void *)get_zeroed_page(GFP_KERNEL); ++ if (!ftrace_pages->next) ++ return NULL; ++ } ++ ftrace_pages = ftrace_pages->next; ++ } ++ ++ return &ftrace_pages->records[ftrace_pages->index++]; ++} ++ ++static struct dyn_ftrace * ++ftrace_record_ip(unsigned long ip) ++{ ++ struct dyn_ftrace *rec; ++ ++ if (ftrace_disabled) ++ return NULL; ++ ++ rec = ftrace_alloc_dyn_node(ip); ++ if (!rec) ++ return NULL; ++ ++ rec->ip = ip; ++ rec->newlist = ftrace_new_addrs; ++ ftrace_new_addrs = rec; ++ ++ return rec; ++} ++ ++static void print_ip_ins(const char *fmt, unsigned char *p) ++{ ++ int i; ++ ++ printk(KERN_CONT "%s", fmt); ++ ++ for (i = 0; i < MCOUNT_INSN_SIZE; i++) ++ printk(KERN_CONT "%s%02x", i ? 
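As an illustrative aside (not part of the patch): do_for_each_ftrace_rec()/while_for_each_ftrace_rec() above hide a nested loop over record pages, which is exactly why the comment insists on goto rather than break. The userspace re-creation below shows the macro shape and the goto that actually ends the walk (all names and data invented):

/* Illustrative userspace sketch, not part of the patch. */
#include <stdio.h>

struct rec  { unsigned long ip; };
struct page { struct page *next; int index; struct rec records[4]; };

static struct page page2 = { NULL,   2, { { 30 }, { 40 } } };
static struct page page1 = { &page2, 2, { { 10 }, { 20 } } };
static struct page *pages = &page1;

/* a double for: 'break' only leaves the inner loop, use a goto to stop the walk */
#define do_for_each_rec(pg, rec)				\
	for (pg = pages; pg; pg = pg->next) {			\
		int _i;						\
		for (_i = 0; _i < pg->index; _i++) {		\
			rec = &pg->records[_i];

#define while_for_each_rec()					\
		}						\
	}

int main(void)
{
	struct page *pg;
	struct rec *rec;

	do_for_each_rec(pg, rec) {
		printf("ip %lu\n", rec->ip);
		if (rec->ip == 30)
			goto done;	/* a 'break' here would only leave the inner loop */
	} while_for_each_rec();
done:
	return 0;
}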
":" : "", p[i]); ++} ++ ++static void ftrace_bug(int failed, unsigned long ip) ++{ ++ switch (failed) { ++ case -EFAULT: ++ FTRACE_WARN_ON_ONCE(1); ++ pr_info("ftrace faulted on modifying "); ++ print_ip_sym(ip); ++ break; ++ case -EINVAL: ++ FTRACE_WARN_ON_ONCE(1); ++ pr_info("ftrace failed to modify "); ++ print_ip_sym(ip); ++ print_ip_ins(" actual: ", (unsigned char *)ip); ++ printk(KERN_CONT "\n"); ++ break; ++ case -EPERM: ++ FTRACE_WARN_ON_ONCE(1); ++ pr_info("ftrace faulted on writing "); ++ print_ip_sym(ip); ++ break; ++ default: ++ FTRACE_WARN_ON_ONCE(1); ++ pr_info("ftrace faulted on unknown error "); ++ print_ip_sym(ip); ++ } ++} ++ ++ ++static int ++__ftrace_replace_code(struct dyn_ftrace *rec, int enable) ++{ ++ unsigned long ftrace_addr; ++ unsigned long ip, fl; ++ ++ ftrace_addr = (unsigned long)FTRACE_ADDR; ++ ++ ip = rec->ip; ++ ++ /* ++ * If this record is not to be traced and ++ * it is not enabled then do nothing. ++ * ++ * If this record is not to be traced and ++ * it is enabled then disable it. ++ * ++ */ ++ if (rec->flags & FTRACE_FL_NOTRACE) { ++ if (rec->flags & FTRACE_FL_ENABLED) ++ rec->flags &= ~FTRACE_FL_ENABLED; ++ else ++ return 0; ++ ++ } else if (ftrace_filtered && enable) { ++ /* ++ * Filtering is on: ++ */ ++ ++ fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); ++ ++ /* Record is filtered and enabled, do nothing */ ++ if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ++ return 0; ++ ++ /* Record is not filtered or enabled, do nothing */ ++ if (!fl) ++ return 0; ++ ++ /* Record is not filtered but enabled, disable it */ ++ if (fl == FTRACE_FL_ENABLED) ++ rec->flags &= ~FTRACE_FL_ENABLED; ++ else ++ /* Otherwise record is filtered but not enabled, enable it */ ++ rec->flags |= FTRACE_FL_ENABLED; ++ } else { ++ /* Disable or not filtered */ ++ ++ if (enable) { ++ /* if record is enabled, do nothing */ ++ if (rec->flags & FTRACE_FL_ENABLED) ++ return 0; ++ ++ rec->flags |= FTRACE_FL_ENABLED; ++ ++ } else { ++ ++ /* if record is not enabled, do nothing */ ++ if (!(rec->flags & FTRACE_FL_ENABLED)) ++ return 0; ++ ++ rec->flags &= ~FTRACE_FL_ENABLED; ++ } ++ } ++ ++ if (rec->flags & FTRACE_FL_ENABLED) ++ return ftrace_make_call(rec, ftrace_addr); ++ else ++ return ftrace_make_nop(NULL, rec, ftrace_addr); ++} ++ ++static void ftrace_replace_code(int enable) ++{ ++ struct dyn_ftrace *rec; ++ struct ftrace_page *pg; ++ int failed; ++ ++ do_for_each_ftrace_rec(pg, rec) { ++ /* ++ * Skip over free records, records that have ++ * failed and not converted. 
++ */ ++ if (rec->flags & FTRACE_FL_FREE || ++ rec->flags & FTRACE_FL_FAILED || ++ !(rec->flags & FTRACE_FL_CONVERTED)) ++ continue; ++ ++ /* ignore updates to this record's mcount site */ ++ if (get_kprobe((void *)rec->ip)) { ++ freeze_record(rec); ++ continue; ++ } else { ++ unfreeze_record(rec); ++ } ++ ++ failed = __ftrace_replace_code(rec, enable); ++ if (failed) { ++ rec->flags |= FTRACE_FL_FAILED; ++ if ((system_state == SYSTEM_BOOTING) || ++ !core_kernel_text(rec->ip)) { ++ ftrace_free_rec(rec); ++ } else { ++ ftrace_bug(failed, rec->ip); ++ /* Stop processing */ ++ return; ++ } ++ } ++ } while_for_each_ftrace_rec(); ++} ++ ++static int ++ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ++{ ++ unsigned long ip; ++ int ret; ++ ++ ip = rec->ip; ++ ++ ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); ++ if (ret) { ++ ftrace_bug(ret, ip); ++ rec->flags |= FTRACE_FL_FAILED; ++ return 0; ++ } ++ return 1; ++} ++ ++/* ++ * archs can override this function if they must do something ++ * before the modifying code is performed. ++ */ ++int __weak ftrace_arch_code_modify_prepare(void) ++{ ++ return 0; ++} ++ ++/* ++ * archs can override this function if they must do something ++ * after the modifying code is performed. ++ */ ++int __weak ftrace_arch_code_modify_post_process(void) ++{ ++ return 0; ++} ++ ++static int __ftrace_modify_code(void *data) ++{ ++ int *command = data; ++ ++ if (*command & FTRACE_ENABLE_CALLS) ++ ftrace_replace_code(1); ++ else if (*command & FTRACE_DISABLE_CALLS) ++ ftrace_replace_code(0); ++ ++ if (*command & FTRACE_UPDATE_TRACE_FUNC) ++ ftrace_update_ftrace_func(ftrace_trace_function); ++ ++ if (*command & FTRACE_START_FUNC_RET) ++ ftrace_enable_ftrace_graph_caller(); ++ else if (*command & FTRACE_STOP_FUNC_RET) ++ ftrace_disable_ftrace_graph_caller(); ++ ++ return 0; ++} ++ ++static void ftrace_run_update_code(int command) ++{ ++ int ret; ++ ++ ret = ftrace_arch_code_modify_prepare(); ++ FTRACE_WARN_ON(ret); ++ if (ret) ++ return; ++ ++ stop_machine(__ftrace_modify_code, &command, NULL); ++ ++ ret = ftrace_arch_code_modify_post_process(); ++ FTRACE_WARN_ON(ret); ++} ++ ++static ftrace_func_t saved_ftrace_func; ++static int ftrace_start_up; ++ ++static void ftrace_startup_enable(int command) ++{ ++ if (saved_ftrace_func != ftrace_trace_function) { ++ saved_ftrace_func = ftrace_trace_function; ++ command |= FTRACE_UPDATE_TRACE_FUNC; ++ } ++ ++ if (!command || !ftrace_enabled) ++ return; ++ ++ ftrace_run_update_code(command); ++} ++ ++static void ftrace_startup(int command) ++{ ++ if (unlikely(ftrace_disabled)) ++ return; ++ ++ ftrace_start_up++; ++ command |= FTRACE_ENABLE_CALLS; ++ ++ ftrace_startup_enable(command); ++} ++ ++static void ftrace_shutdown(int command) ++{ ++ if (unlikely(ftrace_disabled)) ++ return; ++ ++ ftrace_start_up--; ++ if (!ftrace_start_up) ++ command |= FTRACE_DISABLE_CALLS; ++ ++ if (saved_ftrace_func != ftrace_trace_function) { ++ saved_ftrace_func = ftrace_trace_function; ++ command |= FTRACE_UPDATE_TRACE_FUNC; ++ } ++ ++ if (!command || !ftrace_enabled) ++ return; ++ ++ ftrace_run_update_code(command); ++} ++ ++static void ftrace_startup_sysctl(void) ++{ ++ int command = FTRACE_ENABLE_MCOUNT; ++ ++ if (unlikely(ftrace_disabled)) ++ return; ++ ++ /* Force update next time */ ++ saved_ftrace_func = NULL; ++ /* ftrace_start_up is true if we want ftrace running */ ++ if (ftrace_start_up) ++ command |= FTRACE_ENABLE_CALLS; ++ ++ ftrace_run_update_code(command); ++} ++ ++static void ftrace_shutdown_sysctl(void) ++{ ++ int command = 
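As an illustrative aside (not part of the patch): the long if/else chain in __ftrace_replace_code() above reduces to a small decision, given the record's NOTRACE/FILTER/ENABLED flags and whether tracing is being switched on, either leave the call site alone, patch in the tracer call, or patch the site back to a nop when its state actually changes. A condensed userspace model of that decision (flag values invented):

/* Illustrative userspace sketch, not part of the patch. */
#include <stdio.h>

enum { FL_ENABLED = 1, FL_FILTER = 2, FL_NOTRACE = 4 };
enum { LEAVE_ALONE, MAKE_CALL, MAKE_NOP };

static int replace(unsigned int *flags, int enable, int filtering)
{
	unsigned int fl;

	if (*flags & FL_NOTRACE) {
		if (!(*flags & FL_ENABLED))
			return LEAVE_ALONE;
		*flags &= ~FL_ENABLED;			/* was enabled: turn it off */
	} else if (filtering && enable) {
		fl = *flags & (FL_FILTER | FL_ENABLED);
		if (fl == (FL_FILTER | FL_ENABLED) || fl == 0)
			return LEAVE_ALONE;		/* already in the right state */
		if (fl == FL_ENABLED)
			*flags &= ~FL_ENABLED;		/* enabled but not filtered: off */
		else
			*flags |= FL_ENABLED;		/* filtered but not enabled: on */
	} else {
		if (enable == !!(*flags & FL_ENABLED))
			return LEAVE_ALONE;		/* nothing to change */
		*flags ^= FL_ENABLED;
	}

	return (*flags & FL_ENABLED) ? MAKE_CALL : MAKE_NOP;
}

int main(void)
{
	unsigned int a = FL_FILTER, b = FL_ENABLED, c = FL_NOTRACE | FL_ENABLED;

	printf("%d %d %d\n",
	       replace(&a, 1, 1),	/* filtered, not yet on  -> MAKE_CALL */
	       replace(&b, 1, 1),	/* on but not filtered   -> MAKE_NOP  */
	       replace(&c, 1, 0));	/* notrace and still on  -> MAKE_NOP  */
	return 0;
}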
FTRACE_DISABLE_MCOUNT; ++ ++ if (unlikely(ftrace_disabled)) ++ return; ++ ++ /* ftrace_start_up is true if ftrace is running */ ++ if (ftrace_start_up) ++ command |= FTRACE_DISABLE_CALLS; ++ ++ ftrace_run_update_code(command); ++} ++ ++static cycle_t ftrace_update_time; ++static unsigned long ftrace_update_cnt; ++unsigned long ftrace_update_tot_cnt; ++ ++static int ftrace_update_code(struct module *mod) ++{ ++ struct dyn_ftrace *p; ++ cycle_t start, stop; ++ ++ start = ftrace_now(raw_smp_processor_id()); ++ ftrace_update_cnt = 0; ++ ++ while (ftrace_new_addrs) { ++ ++ /* If something went wrong, bail without enabling anything */ ++ if (unlikely(ftrace_disabled)) ++ return -1; ++ ++ p = ftrace_new_addrs; ++ ftrace_new_addrs = p->newlist; ++ p->flags = 0L; ++ ++ /* convert record (i.e, patch mcount-call with NOP) */ ++ if (ftrace_code_disable(mod, p)) { ++ p->flags |= FTRACE_FL_CONVERTED; ++ ftrace_update_cnt++; ++ } else ++ ftrace_free_rec(p); ++ } ++ ++ stop = ftrace_now(raw_smp_processor_id()); ++ ftrace_update_time = stop - start; ++ ftrace_update_tot_cnt += ftrace_update_cnt; ++ ++ return 0; ++} ++ ++static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) ++{ ++ struct ftrace_page *pg; ++ int cnt; ++ int i; ++ ++ /* allocate a few pages */ ++ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); ++ if (!ftrace_pages_start) ++ return -1; ++ ++ /* ++ * Allocate a few more pages. ++ * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: +@@ -759,365 +1349,904 @@ static int __init ftrace_dyn_table_alloc + + pg = ftrace_pages = ftrace_pages_start; + +- cnt = num_to_init / ENTRIES_PER_PAGE; +- pr_info("ftrace: allocating %ld entries in %d pages\n", +- num_to_init, cnt + 1); ++ cnt = num_to_init / ENTRIES_PER_PAGE; ++ pr_info("ftrace: allocating %ld entries in %d pages\n", ++ num_to_init, cnt + 1); ++ ++ for (i = 0; i < cnt; i++) { ++ pg->next = (void *)get_zeroed_page(GFP_KERNEL); ++ ++ /* If we fail, we'll try later anyway */ ++ if (!pg->next) ++ break; ++ ++ pg = pg->next; ++ } ++ ++ return 0; ++} ++ ++enum { ++ FTRACE_ITER_FILTER = (1 << 0), ++ FTRACE_ITER_CONT = (1 << 1), ++ FTRACE_ITER_NOTRACE = (1 << 2), ++ FTRACE_ITER_FAILURES = (1 << 3), ++ FTRACE_ITER_PRINTALL = (1 << 4), ++ FTRACE_ITER_HASH = (1 << 5), ++}; ++ ++#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ ++ ++struct ftrace_iterator { ++ struct ftrace_page *pg; ++ int hidx; ++ int idx; ++ unsigned flags; ++ unsigned char buffer[FTRACE_BUFF_MAX+1]; ++ unsigned buffer_idx; ++ unsigned filtered; ++}; ++ ++static void * ++t_hash_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct ftrace_iterator *iter = m->private; ++ struct hlist_node *hnd = v; ++ struct hlist_head *hhd; ++ ++ WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); ++ ++ (*pos)++; ++ ++ retry: ++ if (iter->hidx >= FTRACE_FUNC_HASHSIZE) ++ return NULL; ++ ++ hhd = &ftrace_func_hash[iter->hidx]; ++ ++ if (hlist_empty(hhd)) { ++ iter->hidx++; ++ hnd = NULL; ++ goto retry; ++ } ++ ++ if (!hnd) ++ hnd = hhd->first; ++ else { ++ hnd = hnd->next; ++ if (!hnd) { ++ iter->hidx++; ++ goto retry; ++ } ++ } ++ ++ return hnd; ++} ++ ++static void *t_hash_start(struct seq_file *m, loff_t *pos) ++{ ++ struct ftrace_iterator *iter = m->private; ++ void *p = NULL; ++ ++ iter->flags |= FTRACE_ITER_HASH; ++ ++ return t_hash_next(m, p, pos); ++} ++ ++static int t_hash_show(struct seq_file *m, void *v) ++{ ++ struct ftrace_func_probe *rec; ++ struct hlist_node *hnd = v; ++ char 
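As an illustrative aside (not part of the patch): ftrace_update_code() above drains the ftrace_new_addrs list by popping records off the head through rec->newlist, converting each mcount call site and freeing only the records that could not be converted. A userspace model of that drain loop with invented record contents:

/* Illustrative userspace sketch, not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

struct rec {
	struct rec *newlist;
	unsigned long ip;
};

static struct rec *new_addrs;			/* head of the pending list */

static void record_ip(unsigned long ip)		/* analogue of ftrace_record_ip() */
{
	struct rec *r = calloc(1, sizeof(*r));

	if (!r)
		return;
	r->ip = ip;
	r->newlist = new_addrs;
	new_addrs = r;
}

static int convert(struct rec *r)		/* pretend to patch the call site */
{
	return r->ip != 0;			/* fail on an obviously bogus address */
}

int main(void)
{
	unsigned long converted = 0;
	struct rec *p;

	record_ip(0x1000);
	record_ip(0);
	record_ip(0x2000);

	while (new_addrs) {			/* drain the pending list */
		p = new_addrs;
		new_addrs = p->newlist;
		if (convert(p))
			converted++;		/* kept; in the kernel it stays in the record pages */
		else
			free(p);		/* analogue of ftrace_free_rec() */
	}
	printf("converted %lu records\n", converted);
	return 0;
}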
str[KSYM_SYMBOL_LEN]; ++ ++ rec = hlist_entry(hnd, struct ftrace_func_probe, node); ++ ++ if (rec->ops->print) ++ return rec->ops->print(m, rec->ip, rec->ops, rec->data); ++ ++ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); ++ seq_printf(m, "%s:", str); ++ ++ kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); ++ seq_printf(m, "%s", str); ++ ++ if (rec->data) ++ seq_printf(m, ":%p", rec->data); ++ seq_putc(m, '\n'); ++ ++ return 0; ++} ++ ++static void * ++t_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct ftrace_iterator *iter = m->private; ++ struct dyn_ftrace *rec = NULL; ++ ++ if (iter->flags & FTRACE_ITER_HASH) ++ return t_hash_next(m, v, pos); ++ ++ (*pos)++; ++ ++ if (iter->flags & FTRACE_ITER_PRINTALL) ++ return NULL; ++ ++ retry: ++ if (iter->idx >= iter->pg->index) { ++ if (iter->pg->next) { ++ iter->pg = iter->pg->next; ++ iter->idx = 0; ++ goto retry; ++ } else { ++ iter->idx = -1; ++ } ++ } else { ++ rec = &iter->pg->records[iter->idx++]; ++ if ((rec->flags & FTRACE_FL_FREE) || ++ ++ (!(iter->flags & FTRACE_ITER_FAILURES) && ++ (rec->flags & FTRACE_FL_FAILED)) || ++ ++ ((iter->flags & FTRACE_ITER_FAILURES) && ++ !(rec->flags & FTRACE_FL_FAILED)) || ++ ++ ((iter->flags & FTRACE_ITER_FILTER) && ++ !(rec->flags & FTRACE_FL_FILTER)) || ++ ++ ((iter->flags & FTRACE_ITER_NOTRACE) && ++ !(rec->flags & FTRACE_FL_NOTRACE))) { ++ rec = NULL; ++ goto retry; ++ } ++ } ++ ++ return rec; ++} ++ ++static void *t_start(struct seq_file *m, loff_t *pos) ++{ ++ struct ftrace_iterator *iter = m->private; ++ void *p = NULL; ++ ++ mutex_lock(&ftrace_lock); ++ /* ++ * For set_ftrace_filter reading, if we have the filter ++ * off, we can short cut and just print out that all ++ * functions are enabled. ++ */ ++ if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { ++ if (*pos > 0) ++ return t_hash_start(m, pos); ++ iter->flags |= FTRACE_ITER_PRINTALL; ++ (*pos)++; ++ return iter; ++ } ++ ++ if (iter->flags & FTRACE_ITER_HASH) ++ return t_hash_start(m, pos); ++ ++ if (*pos > 0) { ++ if (iter->idx < 0) ++ return p; ++ (*pos)--; ++ iter->idx--; ++ } ++ ++ p = t_next(m, p, pos); ++ ++ if (!p) ++ return t_hash_start(m, pos); ++ ++ return p; ++} ++ ++static void t_stop(struct seq_file *m, void *p) ++{ ++ mutex_unlock(&ftrace_lock); ++} ++ ++static int t_show(struct seq_file *m, void *v) ++{ ++ struct ftrace_iterator *iter = m->private; ++ struct dyn_ftrace *rec = v; ++ char str[KSYM_SYMBOL_LEN]; ++ ++ if (iter->flags & FTRACE_ITER_HASH) ++ return t_hash_show(m, v); ++ ++ if (iter->flags & FTRACE_ITER_PRINTALL) { ++ seq_printf(m, "#### all functions enabled ####\n"); ++ return 0; ++ } ++ ++ if (!rec) ++ return 0; ++ ++ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); ++ ++ seq_printf(m, "%s\n", str); ++ ++ return 0; ++} ++ ++static struct seq_operations show_ftrace_seq_ops = { ++ .start = t_start, ++ .next = t_next, ++ .stop = t_stop, ++ .show = t_show, ++}; ++ ++static int ++ftrace_avail_open(struct inode *inode, struct file *file) ++{ ++ struct ftrace_iterator *iter; ++ int ret; ++ ++ if (unlikely(ftrace_disabled)) ++ return -ENODEV; ++ ++ iter = kzalloc(sizeof(*iter), GFP_KERNEL); ++ if (!iter) ++ return -ENOMEM; ++ ++ iter->pg = ftrace_pages_start; ++ ++ ret = seq_open(file, &show_ftrace_seq_ops); ++ if (!ret) { ++ struct seq_file *m = file->private_data; ++ ++ m->private = iter; ++ } else { ++ kfree(iter); ++ } ++ ++ return ret; ++} ++ ++int ftrace_avail_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *m = (struct seq_file 
*)file->private_data; ++ struct ftrace_iterator *iter = m->private; ++ ++ seq_release(inode, file); ++ kfree(iter); + +- for (i = 0; i < cnt; i++) { +- pg->next = (void *)get_zeroed_page(GFP_KERNEL); ++ return 0; ++} + +- /* If we fail, we'll try later anyway */ +- if (!pg->next) +- break; ++static int ++ftrace_failures_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ struct seq_file *m; ++ struct ftrace_iterator *iter; + +- pg = pg->next; ++ ret = ftrace_avail_open(inode, file); ++ if (!ret) { ++ m = (struct seq_file *)file->private_data; ++ iter = (struct ftrace_iterator *)m->private; ++ iter->flags = FTRACE_ITER_FAILURES; + } + +- return 0; ++ return ret; + } + +-enum { +- FTRACE_ITER_FILTER = (1 << 0), +- FTRACE_ITER_CONT = (1 << 1), +- FTRACE_ITER_NOTRACE = (1 << 2), +- FTRACE_ITER_FAILURES = (1 << 3), +-}; + +-#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ ++static void ftrace_filter_reset(int enable) ++{ ++ struct ftrace_page *pg; ++ struct dyn_ftrace *rec; ++ unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + +-struct ftrace_iterator { +- struct ftrace_page *pg; +- unsigned idx; +- unsigned flags; +- unsigned char buffer[FTRACE_BUFF_MAX+1]; +- unsigned buffer_idx; +- unsigned filtered; +-}; ++ mutex_lock(&ftrace_lock); ++ if (enable) ++ ftrace_filtered = 0; ++ do_for_each_ftrace_rec(pg, rec) { ++ if (rec->flags & FTRACE_FL_FAILED) ++ continue; ++ rec->flags &= ~type; ++ } while_for_each_ftrace_rec(); ++ mutex_unlock(&ftrace_lock); ++} + +-static void * +-t_next(struct seq_file *m, void *v, loff_t *pos) ++static int ++ftrace_regex_open(struct inode *inode, struct file *file, int enable) + { +- struct ftrace_iterator *iter = m->private; +- struct dyn_ftrace *rec = NULL; ++ struct ftrace_iterator *iter; ++ int ret = 0; + +- (*pos)++; ++ if (unlikely(ftrace_disabled)) ++ return -ENODEV; + +- /* should not be called from interrupt context */ +- spin_lock(&ftrace_lock); +- retry: +- if (iter->idx >= iter->pg->index) { +- if (iter->pg->next) { +- iter->pg = iter->pg->next; +- iter->idx = 0; +- goto retry; +- } else { +- iter->idx = -1; +- } +- } else { +- rec = &iter->pg->records[iter->idx++]; +- if ((rec->flags & FTRACE_FL_FREE) || ++ iter = kzalloc(sizeof(*iter), GFP_KERNEL); ++ if (!iter) ++ return -ENOMEM; + +- (!(iter->flags & FTRACE_ITER_FAILURES) && +- (rec->flags & FTRACE_FL_FAILED)) || ++ mutex_lock(&ftrace_regex_lock); ++ if ((file->f_mode & FMODE_WRITE) && ++ !(file->f_flags & O_APPEND)) ++ ftrace_filter_reset(enable); + +- ((iter->flags & FTRACE_ITER_FAILURES) && +- !(rec->flags & FTRACE_FL_FAILED)) || ++ if (file->f_mode & FMODE_READ) { ++ iter->pg = ftrace_pages_start; ++ iter->flags = enable ? 
FTRACE_ITER_FILTER : ++ FTRACE_ITER_NOTRACE; + +- ((iter->flags & FTRACE_ITER_FILTER) && +- !(rec->flags & FTRACE_FL_FILTER)) || ++ ret = seq_open(file, &show_ftrace_seq_ops); ++ if (!ret) { ++ struct seq_file *m = file->private_data; ++ m->private = iter; ++ } else ++ kfree(iter); ++ } else ++ file->private_data = iter; ++ mutex_unlock(&ftrace_regex_lock); + +- ((iter->flags & FTRACE_ITER_NOTRACE) && +- !(rec->flags & FTRACE_FL_NOTRACE))) { +- rec = NULL; +- goto retry; ++ return ret; ++} ++ ++static int ++ftrace_filter_open(struct inode *inode, struct file *file) ++{ ++ return ftrace_regex_open(inode, file, 1); ++} ++ ++static int ++ftrace_notrace_open(struct inode *inode, struct file *file) ++{ ++ return ftrace_regex_open(inode, file, 0); ++} ++ ++static loff_t ++ftrace_regex_lseek(struct file *file, loff_t offset, int origin) ++{ ++ loff_t ret; ++ ++ if (file->f_mode & FMODE_READ) ++ ret = seq_lseek(file, offset, origin); ++ else ++ file->f_pos = ret = 1; ++ ++ return ret; ++} ++ ++enum { ++ MATCH_FULL, ++ MATCH_FRONT_ONLY, ++ MATCH_MIDDLE_ONLY, ++ MATCH_END_ONLY, ++}; ++ ++/* ++ * (static function - no need for kernel doc) ++ * ++ * Pass in a buffer containing a glob and this function will ++ * set search to point to the search part of the buffer and ++ * return the type of search it is (see enum above). ++ * This does modify buff. ++ * ++ * Returns enum type. ++ * search returns the pointer to use for comparison. ++ * not returns 1 if buff started with a '!' ++ * 0 otherwise. ++ */ ++static int ++ftrace_setup_glob(char *buff, int len, char **search, int *not) ++{ ++ int type = MATCH_FULL; ++ int i; ++ ++ if (buff[0] == '!') { ++ *not = 1; ++ buff++; ++ len--; ++ } else ++ *not = 0; ++ ++ *search = buff; ++ ++ for (i = 0; i < len; i++) { ++ if (buff[i] == '*') { ++ if (!i) { ++ *search = buff + 1; ++ type = MATCH_END_ONLY; ++ } else { ++ if (type == MATCH_END_ONLY) ++ type = MATCH_MIDDLE_ONLY; ++ else ++ type = MATCH_FRONT_ONLY; ++ buff[i] = 0; ++ break; ++ } + } + } +- spin_unlock(&ftrace_lock); + +- return rec; ++ return type; + } + +-static void *t_start(struct seq_file *m, loff_t *pos) ++static int ftrace_match(char *str, char *regex, int len, int type) + { +- struct ftrace_iterator *iter = m->private; +- void *p = NULL; ++ int matched = 0; ++ char *ptr; + +- if (*pos > 0) { +- if (iter->idx < 0) +- return p; +- (*pos)--; +- iter->idx--; ++ switch (type) { ++ case MATCH_FULL: ++ if (strcmp(str, regex) == 0) ++ matched = 1; ++ break; ++ case MATCH_FRONT_ONLY: ++ if (strncmp(str, regex, len) == 0) ++ matched = 1; ++ break; ++ case MATCH_MIDDLE_ONLY: ++ if (strstr(str, regex)) ++ matched = 1; ++ break; ++ case MATCH_END_ONLY: ++ ptr = strstr(str, regex); ++ if (ptr && (ptr[len] == 0)) ++ matched = 1; ++ break; + } + +- p = t_next(m, p, pos); ++ return matched; ++} + +- return p; ++static int ++ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) ++{ ++ char str[KSYM_SYMBOL_LEN]; ++ ++ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); ++ return ftrace_match(str, regex, len, type); + } + +-static void t_stop(struct seq_file *m, void *p) ++static void ftrace_match_records(char *buff, int len, int enable) + { ++ unsigned int search_len; ++ struct ftrace_page *pg; ++ struct dyn_ftrace *rec; ++ unsigned long flag; ++ char *search; ++ int type; ++ int not; ++ ++ flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; ++ type = ftrace_setup_glob(buff, len, &search, ¬); ++ ++ search_len = strlen(search); ++ ++ mutex_lock(&ftrace_lock); ++ do_for_each_ftrace_rec(pg, rec) { ++ ++ if (rec->flags & FTRACE_FL_FAILED) ++ continue; ++ ++ if (ftrace_match_record(rec, search, search_len, type)) { ++ if (not) ++ rec->flags &= ~flag; ++ else ++ rec->flags |= flag; ++ } ++ /* ++ * Only enable filtering if we have a function that ++ * is filtered on. ++ */ ++ if (enable && (rec->flags & FTRACE_FL_FILTER)) ++ ftrace_filtered = 1; ++ } while_for_each_ftrace_rec(); ++ mutex_unlock(&ftrace_lock); + } + +-static int t_show(struct seq_file *m, void *v) ++static int ++ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, ++ char *regex, int len, int type) + { +- struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; ++ char *modname; + +- if (!rec) ++ kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); ++ ++ if (!modname || strcmp(modname, mod)) + return 0; + +- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); ++ /* blank search means to match all funcs in the mod */ ++ if (len) ++ return ftrace_match(str, regex, len, type); ++ else ++ return 1; ++} + +- seq_printf(m, "%s\n", str); ++static void ftrace_match_module_records(char *buff, char *mod, int enable) ++{ ++ unsigned search_len = 0; ++ struct ftrace_page *pg; ++ struct dyn_ftrace *rec; ++ int type = MATCH_FULL; ++ char *search = buff; ++ unsigned long flag; ++ int not = 0; ++ ++ flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; ++ ++ /* blank or '*' mean the same */ ++ if (strcmp(buff, "*") == 0) ++ buff[0] = 0; ++ ++ /* handle the case of 'dont filter this module' */ ++ if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { ++ buff[0] = 0; ++ not = 1; ++ } ++ ++ if (strlen(buff)) { ++ type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); ++ search_len = strlen(search); ++ } ++ ++ mutex_lock(&ftrace_lock); ++ do_for_each_ftrace_rec(pg, rec) { ++ ++ if (rec->flags & FTRACE_FL_FAILED) ++ continue; ++ ++ if (ftrace_match_module_record(rec, mod, ++ search, search_len, type)) { ++ if (not) ++ rec->flags &= ~flag; ++ else ++ rec->flags |= flag; ++ } ++ if (enable && (rec->flags & FTRACE_FL_FILTER)) ++ ftrace_filtered = 1; ++ ++ } while_for_each_ftrace_rec(); ++ mutex_unlock(&ftrace_lock); ++} ++ ++/* ++ * We register the module command as a template to show others how ++ * to register the a command as well. ++ */ ++ ++static int ++ftrace_mod_callback(char *func, char *cmd, char *param, int enable) ++{ ++ char *mod; ++ ++ /* ++ * cmd == 'mod' because we only registered this func ++ * for the 'mod' ftrace_func_command. ++ * But if you register one func with multiple commands, ++ * you can tell which command was used by the cmd ++ * parameter. 
++ */ ++ ++ /* we must have a module name */ ++ if (!param) ++ return -EINVAL; ++ ++ mod = strsep(¶m, ":"); ++ if (!strlen(mod)) ++ return -EINVAL; + ++ ftrace_match_module_records(func, mod, enable); + return 0; + } + +-static struct seq_operations show_ftrace_seq_ops = { +- .start = t_start, +- .next = t_next, +- .stop = t_stop, +- .show = t_show, ++static struct ftrace_func_command ftrace_mod_cmd = { ++ .name = "mod", ++ .func = ftrace_mod_callback, + }; + +-static int +-ftrace_avail_open(struct inode *inode, struct file *file) ++static int __init ftrace_mod_cmd_init(void) ++{ ++ return register_ftrace_command(&ftrace_mod_cmd); ++} ++device_initcall(ftrace_mod_cmd_init); ++ ++static void ++function_trace_probe_call(unsigned long ip, unsigned long parent_ip) ++{ ++ struct ftrace_func_probe *entry; ++ struct hlist_head *hhd; ++ struct hlist_node *n; ++ unsigned long key; ++ int resched; ++ ++ key = hash_long(ip, FTRACE_HASH_BITS); ++ ++ hhd = &ftrace_func_hash[key]; ++ ++ if (hlist_empty(hhd)) ++ return; ++ ++ /* ++ * Disable preemption for these calls to prevent a RCU grace ++ * period. This syncs the hash iteration and freeing of items ++ * on the hash. rcu_read_lock is too dangerous here. ++ */ ++ resched = ftrace_preempt_disable(); ++ hlist_for_each_entry_rcu(entry, n, hhd, node) { ++ if (entry->ip == ip) ++ entry->ops->func(ip, parent_ip, &entry->data); ++ } ++ ftrace_preempt_enable(resched); ++} ++ ++static struct ftrace_ops trace_probe_ops __read_mostly = + { +- struct ftrace_iterator *iter; +- int ret; +- +- if (unlikely(ftrace_disabled)) +- return -ENODEV; ++ .func = function_trace_probe_call, ++}; + +- iter = kzalloc(sizeof(*iter), GFP_KERNEL); +- if (!iter) +- return -ENOMEM; ++static int ftrace_probe_registered; + +- iter->pg = ftrace_pages_start; ++static void __enable_ftrace_function_probe(void) ++{ ++ int i; + +- ret = seq_open(file, &show_ftrace_seq_ops); +- if (!ret) { +- struct seq_file *m = file->private_data; ++ if (ftrace_probe_registered) ++ return; + +- m->private = iter; +- } else { +- kfree(iter); ++ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { ++ struct hlist_head *hhd = &ftrace_func_hash[i]; ++ if (hhd->first) ++ break; + } ++ /* Nothing registered? 
*/ ++ if (i == FTRACE_FUNC_HASHSIZE) ++ return; + +- return ret; ++ __register_ftrace_function(&trace_probe_ops); ++ ftrace_startup(0); ++ ftrace_probe_registered = 1; + } + +-int ftrace_avail_release(struct inode *inode, struct file *file) ++static void __disable_ftrace_function_probe(void) + { +- struct seq_file *m = (struct seq_file *)file->private_data; +- struct ftrace_iterator *iter = m->private; ++ int i; + +- seq_release(inode, file); +- kfree(iter); ++ if (!ftrace_probe_registered) ++ return; + +- return 0; ++ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { ++ struct hlist_head *hhd = &ftrace_func_hash[i]; ++ if (hhd->first) ++ return; ++ } ++ ++ /* no more funcs left */ ++ __unregister_ftrace_function(&trace_probe_ops); ++ ftrace_shutdown(0); ++ ftrace_probe_registered = 0; + } + +-static int +-ftrace_failures_open(struct inode *inode, struct file *file) +-{ +- int ret; +- struct seq_file *m; +- struct ftrace_iterator *iter; + +- ret = ftrace_avail_open(inode, file); +- if (!ret) { +- m = (struct seq_file *)file->private_data; +- iter = (struct ftrace_iterator *)m->private; +- iter->flags = FTRACE_ITER_FAILURES; +- } ++static void ftrace_free_entry_rcu(struct rcu_head *rhp) ++{ ++ struct ftrace_func_probe *entry = ++ container_of(rhp, struct ftrace_func_probe, rcu); + +- return ret; ++ if (entry->ops->free) ++ entry->ops->free(&entry->data); ++ kfree(entry); + } + + +-static void ftrace_filter_reset(int enable) ++int ++register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ++ void *data) + { ++ struct ftrace_func_probe *entry; + struct ftrace_page *pg; + struct dyn_ftrace *rec; +- unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; +- unsigned i; ++ int type, len, not; ++ unsigned long key; ++ int count = 0; ++ char *search; + +- /* should not be called from interrupt context */ +- spin_lock(&ftrace_lock); +- if (enable) +- ftrace_filtered = 0; +- pg = ftrace_pages_start; +- while (pg) { +- for (i = 0; i < pg->index; i++) { +- rec = &pg->records[i]; +- if (rec->flags & FTRACE_FL_FAILED) ++ type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); ++ len = strlen(search); ++ ++ /* we do not support '!' for function probes */ ++ if (WARN_ON(not)) ++ return -EINVAL; ++ ++ mutex_lock(&ftrace_lock); ++ do_for_each_ftrace_rec(pg, rec) { ++ ++ if (rec->flags & FTRACE_FL_FAILED) ++ continue; ++ ++ if (!ftrace_match_record(rec, search, len, type)) ++ continue; ++ ++ entry = kmalloc(sizeof(*entry), GFP_KERNEL); ++ if (!entry) { ++ /* If we did not process any, then return error */ ++ if (!count) ++ count = -ENOMEM; ++ goto out_unlock; ++ } ++ ++ count++; ++ ++ entry->data = data; ++ ++ /* ++ * The caller might want to do something special ++ * for each function we find. We call the callback ++ * to give the caller an opportunity to do so. 
++ */ ++ if (ops->callback) { ++ if (ops->callback(rec->ip, &entry->data) < 0) { ++ /* caller does not like this func */ ++ kfree(entry); + continue; +- rec->flags &= ~type; ++ } + } +- pg = pg->next; +- } +- spin_unlock(&ftrace_lock); ++ ++ entry->ops = ops; ++ entry->ip = rec->ip; ++ ++ key = hash_long(entry->ip, FTRACE_HASH_BITS); ++ hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); ++ ++ } while_for_each_ftrace_rec(); ++ __enable_ftrace_function_probe(); ++ ++ out_unlock: ++ mutex_unlock(&ftrace_lock); ++ ++ return count; + } + +-static int +-ftrace_regex_open(struct inode *inode, struct file *file, int enable) ++enum { ++ PROBE_TEST_FUNC = 1, ++ PROBE_TEST_DATA = 2 ++}; ++ ++static void ++__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ++ void *data, int flags) + { +- struct ftrace_iterator *iter; +- int ret = 0; ++ struct ftrace_func_probe *entry; ++ struct hlist_node *n, *tmp; ++ char str[KSYM_SYMBOL_LEN]; ++ int type = MATCH_FULL; ++ int i, len = 0; ++ char *search; + +- if (unlikely(ftrace_disabled)) +- return -ENODEV; ++ if (glob && (strcmp(glob, "*") || !strlen(glob))) ++ glob = NULL; ++ else { ++ int not; + +- iter = kzalloc(sizeof(*iter), GFP_KERNEL); +- if (!iter) +- return -ENOMEM; ++ type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); ++ len = strlen(search); + +- mutex_lock(&ftrace_regex_lock); +- if ((file->f_mode & FMODE_WRITE) && +- !(file->f_flags & O_APPEND)) +- ftrace_filter_reset(enable); ++ /* we do not support '!' for function probes */ ++ if (WARN_ON(not)) ++ return; ++ } + +- if (file->f_mode & FMODE_READ) { +- iter->pg = ftrace_pages_start; +- iter->flags = enable ? FTRACE_ITER_FILTER : +- FTRACE_ITER_NOTRACE; ++ mutex_lock(&ftrace_lock); ++ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { ++ struct hlist_head *hhd = &ftrace_func_hash[i]; + +- ret = seq_open(file, &show_ftrace_seq_ops); +- if (!ret) { +- struct seq_file *m = file->private_data; +- m->private = iter; +- } else +- kfree(iter); +- } else +- file->private_data = iter; +- mutex_unlock(&ftrace_regex_lock); ++ hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + +- return ret; ++ /* break up if statements for readability */ ++ if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) ++ continue; ++ ++ if ((flags & PROBE_TEST_DATA) && entry->data != data) ++ continue; ++ ++ /* do this last, since it is the most expensive */ ++ if (glob) { ++ kallsyms_lookup(entry->ip, NULL, NULL, ++ NULL, str); ++ if (!ftrace_match(str, glob, len, type)) ++ continue; ++ } ++ ++ hlist_del(&entry->node); ++ call_rcu(&entry->rcu, ftrace_free_entry_rcu); ++ } ++ } ++ __disable_ftrace_function_probe(); ++ mutex_unlock(&ftrace_lock); + } + +-static int +-ftrace_filter_open(struct inode *inode, struct file *file) ++void ++unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ++ void *data) + { +- return ftrace_regex_open(inode, file, 1); ++ __unregister_ftrace_function_probe(glob, ops, data, ++ PROBE_TEST_FUNC | PROBE_TEST_DATA); + } + +-static int +-ftrace_notrace_open(struct inode *inode, struct file *file) ++void ++unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) + { +- return ftrace_regex_open(inode, file, 0); ++ __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); + } + +-static ssize_t +-ftrace_regex_read(struct file *file, char __user *ubuf, +- size_t cnt, loff_t *ppos) ++void unregister_ftrace_function_probe_all(char *glob) + { +- if (file->f_mode & FMODE_READ) +- return seq_read(file, ubuf, cnt, ppos); +- else +- 
return -EPERM; ++ __unregister_ftrace_function_probe(glob, NULL, NULL, 0); + } + +-static loff_t +-ftrace_regex_lseek(struct file *file, loff_t offset, int origin) ++static LIST_HEAD(ftrace_commands); ++static DEFINE_MUTEX(ftrace_cmd_mutex); ++ ++int register_ftrace_command(struct ftrace_func_command *cmd) + { +- loff_t ret; ++ struct ftrace_func_command *p; ++ int ret = 0; + +- if (file->f_mode & FMODE_READ) +- ret = seq_lseek(file, offset, origin); +- else +- file->f_pos = ret = 1; ++ mutex_lock(&ftrace_cmd_mutex); ++ list_for_each_entry(p, &ftrace_commands, list) { ++ if (strcmp(cmd->name, p->name) == 0) { ++ ret = -EBUSY; ++ goto out_unlock; ++ } ++ } ++ list_add(&cmd->list, &ftrace_commands); ++ out_unlock: ++ mutex_unlock(&ftrace_cmd_mutex); + + return ret; + } + +-enum { +- MATCH_FULL, +- MATCH_FRONT_ONLY, +- MATCH_MIDDLE_ONLY, +- MATCH_END_ONLY, +-}; +- +-static void +-ftrace_match(unsigned char *buff, int len, int enable) ++int unregister_ftrace_command(struct ftrace_func_command *cmd) + { +- char str[KSYM_SYMBOL_LEN]; +- char *search = NULL; +- struct ftrace_page *pg; +- struct dyn_ftrace *rec; +- int type = MATCH_FULL; +- unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; +- unsigned i, match = 0, search_len = 0; +- int not = 0; ++ struct ftrace_func_command *p, *n; ++ int ret = -ENODEV; + +- if (buff[0] == '!') { +- not = 1; +- buff++; +- len--; ++ mutex_lock(&ftrace_cmd_mutex); ++ list_for_each_entry_safe(p, n, &ftrace_commands, list) { ++ if (strcmp(cmd->name, p->name) == 0) { ++ ret = 0; ++ list_del_init(&p->list); ++ goto out_unlock; ++ } + } ++ out_unlock: ++ mutex_unlock(&ftrace_cmd_mutex); + +- for (i = 0; i < len; i++) { +- if (buff[i] == '*') { +- if (!i) { +- search = buff + i + 1; +- type = MATCH_END_ONLY; +- search_len = len - (i + 1); +- } else { +- if (type == MATCH_END_ONLY) { +- type = MATCH_MIDDLE_ONLY; +- } else { +- match = i; +- type = MATCH_FRONT_ONLY; +- } +- buff[i] = 0; +- break; +- } +- } ++ return ret; ++} ++ ++static int ftrace_process_regex(char *buff, int len, int enable) ++{ ++ char *func, *command, *next = buff; ++ struct ftrace_func_command *p; ++ int ret = -EINVAL; ++ ++ func = strsep(&next, ":"); ++ ++ if (!next) { ++ ftrace_match_records(func, len, enable); ++ return 0; + } + +- /* should not be called from interrupt context */ +- spin_lock(&ftrace_lock); +- if (enable) +- ftrace_filtered = 1; +- pg = ftrace_pages_start; +- while (pg) { +- for (i = 0; i < pg->index; i++) { +- int matched = 0; +- char *ptr; ++ /* command found */ + +- rec = &pg->records[i]; +- if (rec->flags & FTRACE_FL_FAILED) +- continue; +- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); +- switch (type) { +- case MATCH_FULL: +- if (strcmp(str, buff) == 0) +- matched = 1; +- break; +- case MATCH_FRONT_ONLY: +- if (memcmp(str, buff, match) == 0) +- matched = 1; +- break; +- case MATCH_MIDDLE_ONLY: +- if (strstr(str, search)) +- matched = 1; +- break; +- case MATCH_END_ONLY: +- ptr = strstr(str, search); +- if (ptr && (ptr[search_len] == 0)) +- matched = 1; +- break; +- } +- if (matched) { +- if (not) +- rec->flags &= ~flag; +- else +- rec->flags |= flag; +- } ++ command = strsep(&next, ":"); ++ ++ mutex_lock(&ftrace_cmd_mutex); ++ list_for_each_entry(p, &ftrace_commands, list) { ++ if (strcmp(p->name, command) == 0) { ++ ret = p->func(func, command, next, enable); ++ goto out_unlock; + } +- pg = pg->next; + } +- spin_unlock(&ftrace_lock); ++ out_unlock: ++ mutex_unlock(&ftrace_cmd_mutex); ++ ++ return ret; + } + + static ssize_t +@@ -1187,7 +2316,10 @@ 
ftrace_regex_write(struct file *file, co + if (isspace(ch)) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; +- ftrace_match(iter->buffer, iter->buffer_idx, enable); ++ ret = ftrace_process_regex(iter->buffer, ++ iter->buffer_idx, enable); ++ if (ret) ++ goto out; + iter->buffer_idx = 0; + } else + iter->flags |= FTRACE_ITER_CONT; +@@ -1226,7 +2358,7 @@ ftrace_set_regex(unsigned char *buf, int + if (reset) + ftrace_filter_reset(enable); + if (buf) +- ftrace_match(buf, len, enable); ++ ftrace_match_records(buf, len, enable); + mutex_unlock(&ftrace_regex_lock); + } + +@@ -1276,15 +2408,13 @@ ftrace_regex_release(struct inode *inode + if (iter->buffer_idx) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; +- ftrace_match(iter->buffer, iter->buffer_idx, enable); ++ ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + } + +- mutex_lock(&ftrace_sysctl_lock); +- mutex_lock(&ftrace_start_lock); ++ mutex_lock(&ftrace_lock); + if (ftrace_start_up && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); +- mutex_unlock(&ftrace_start_lock); +- mutex_unlock(&ftrace_sysctl_lock); ++ mutex_unlock(&ftrace_lock); + + kfree(iter); + mutex_unlock(&ftrace_regex_lock); +@@ -1303,31 +2433,31 @@ ftrace_notrace_release(struct inode *ino + return ftrace_regex_release(inode, file, 0); + } + +-static struct file_operations ftrace_avail_fops = { ++static const struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, + }; + +-static struct file_operations ftrace_failures_fops = { ++static const struct file_operations ftrace_failures_fops = { + .open = ftrace_failures_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, + }; + +-static struct file_operations ftrace_filter_fops = { ++static const struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, +- .read = ftrace_regex_read, ++ .read = seq_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_filter_release, + }; + +-static struct file_operations ftrace_notrace_fops = { ++static const struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, +- .read = ftrace_regex_read, ++ .read = seq_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_notrace_release, +@@ -1360,6 +2490,10 @@ static void *g_start(struct seq_file *m, + + mutex_lock(&graph_lock); + ++ /* Nothing, tell g_show to print all functions are enabled */ ++ if (!ftrace_graph_count && !*pos) ++ return (void *)1; ++ + p = g_next(m, p, pos); + + return p; +@@ -1378,6 +2512,11 @@ static int g_show(struct seq_file *m, vo + if (!ptr) + return 0; + ++ if (ptr == (unsigned long *)1) { ++ seq_printf(m, "#### all functions enabled ####\n"); ++ return 0; ++ } ++ + kallsyms_lookup(*ptr, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); +@@ -1420,53 +2559,53 @@ ftrace_graph_open(struct inode *inode, s + return ret; + } + +-static ssize_t +-ftrace_graph_read(struct file *file, char __user *ubuf, +- size_t cnt, loff_t *ppos) +-{ +- if (file->f_mode & FMODE_READ) +- return seq_read(file, ubuf, cnt, ppos); +- else +- return -EPERM; +-} +- + static int +-ftrace_set_func(unsigned long *array, int idx, char *buffer) ++ftrace_set_func(unsigned long *array, int *idx, char *buffer) + { +- char str[KSYM_SYMBOL_LEN]; + struct dyn_ftrace *rec; + struct ftrace_page *pg; ++ int search_len; + int found = 0; +- int i, j; ++ int type, not; ++ char 
*search; ++ bool exists; ++ int i; + + if (ftrace_disabled) + return -ENODEV; + +- /* should not be called from interrupt context */ +- spin_lock(&ftrace_lock); ++ /* decode regex */ ++ type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); ++ if (not) ++ return -EINVAL; + +- for (pg = ftrace_pages_start; pg; pg = pg->next) { +- for (i = 0; i < pg->index; i++) { +- rec = &pg->records[i]; ++ search_len = strlen(search); + +- if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) +- continue; ++ mutex_lock(&ftrace_lock); ++ do_for_each_ftrace_rec(pg, rec) { ++ ++ if (*idx >= FTRACE_GRAPH_MAX_FUNCS) ++ break; + +- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); +- if (strcmp(str, buffer) == 0) { ++ if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) ++ continue; ++ ++ if (ftrace_match_record(rec, search, search_len, type)) { ++ /* ensure it is not already in the array */ ++ exists = false; ++ for (i = 0; i < *idx; i++) ++ if (array[i] == rec->ip) { ++ exists = true; ++ break; ++ } ++ if (!exists) { ++ array[(*idx)++] = rec->ip; + found = 1; +- for (j = 0; j < idx; j++) +- if (array[j] == rec->ip) { +- found = 0; +- break; +- } +- if (found) +- array[idx] = rec->ip; +- break; + } + } +- } +- spin_unlock(&ftrace_lock); ++ } while_for_each_ftrace_rec(); ++ ++ mutex_unlock(&ftrace_lock); + + return found ? 0 : -EINVAL; + } +@@ -1534,13 +2673,11 @@ ftrace_graph_write(struct file *file, co + } + buffer[index] = 0; + +- /* we allow only one at a time */ +- ret = ftrace_set_func(array, ftrace_graph_count, buffer); ++ /* we allow only one expression at a time */ ++ ret = ftrace_set_func(array, &ftrace_graph_count, buffer); + if (ret) + goto out; + +- ftrace_graph_count++; +- + file->f_pos += read; + + ret = read; +@@ -1552,7 +2689,7 @@ ftrace_graph_write(struct file *file, co + + static const struct file_operations ftrace_graph_fops = { + .open = ftrace_graph_open, +- .read = ftrace_graph_read, ++ .read = seq_read, + .write = ftrace_graph_write, + }; + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +@@ -1604,7 +2741,7 @@ static int ftrace_convert_nops(struct mo + unsigned long addr; + unsigned long flags; + +- mutex_lock(&ftrace_start_lock); ++ mutex_lock(&ftrace_lock); + p = start; + while (p < end) { + addr = ftrace_call_adjust(*p++); +@@ -1623,7 +2760,7 @@ static int ftrace_convert_nops(struct mo + local_irq_save(flags); + ftrace_update_code(mod); + local_irq_restore(flags); +- mutex_unlock(&ftrace_start_lock); ++ mutex_unlock(&ftrace_lock); + + return 0; + } +@@ -1700,7 +2837,7 @@ ftrace_pid_read(struct file *file, char + if (ftrace_pid_trace == ftrace_swapper_pid) + r = sprintf(buf, "swapper tasks\n"); + else if (ftrace_pid_trace) +- r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); ++ r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); + else + r = sprintf(buf, "no pid\n"); + +@@ -1796,7 +2933,7 @@ ftrace_pid_write(struct file *filp, cons + if (ret < 0) + return ret; + +- mutex_lock(&ftrace_start_lock); ++ mutex_lock(&ftrace_lock); + if (val < 0) { + /* disable pid tracing */ + if (!ftrace_pid_trace) +@@ -1835,12 +2972,12 @@ ftrace_pid_write(struct file *filp, cons + ftrace_startup_enable(0); + + out: +- mutex_unlock(&ftrace_start_lock); ++ mutex_unlock(&ftrace_lock); + + return cnt; + } + +-static struct file_operations ftrace_pid_fops = { ++static const struct file_operations ftrace_pid_fops = { + .read = ftrace_pid_read, + .write = ftrace_pid_write, + }; +@@ -1861,9 +2998,11 @@ static __init int ftrace_init_debugfs(vo + if (!entry) + pr_warning("Could not create debugfs " + 
"'set_ftrace_pid' entry\n"); ++ ++ ftrace_profile_debugfs(d_tracer); ++ + return 0; + } +- + fs_initcall(ftrace_init_debugfs); + + /** +@@ -1898,17 +3037,17 @@ int register_ftrace_function(struct ftra + if (unlikely(ftrace_disabled)) + return -1; + +- mutex_lock(&ftrace_sysctl_lock); ++ mutex_lock(&ftrace_lock); + + ret = __register_ftrace_function(ops); + ftrace_startup(0); + +- mutex_unlock(&ftrace_sysctl_lock); ++ mutex_unlock(&ftrace_lock); + return ret; + } + + /** +- * unregister_ftrace_function - unresgister a function for profiling. ++ * unregister_ftrace_function - unregister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. +@@ -1917,10 +3056,10 @@ int unregister_ftrace_function(struct ft + { + int ret; + +- mutex_lock(&ftrace_sysctl_lock); ++ mutex_lock(&ftrace_lock); + ret = __unregister_ftrace_function(ops); + ftrace_shutdown(0); +- mutex_unlock(&ftrace_sysctl_lock); ++ mutex_unlock(&ftrace_lock); + + return ret; + } +@@ -1935,7 +3074,7 @@ ftrace_enable_sysctl(struct ctl_table *t + if (unlikely(ftrace_disabled)) + return -ENODEV; + +- mutex_lock(&ftrace_sysctl_lock); ++ mutex_lock(&ftrace_lock); + + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + +@@ -1964,7 +3103,7 @@ ftrace_enable_sysctl(struct ctl_table *t + } + + out: +- mutex_unlock(&ftrace_sysctl_lock); ++ mutex_unlock(&ftrace_lock); + return ret; + } + +@@ -2029,6 +3168,38 @@ free: + return ret; + } + ++static void ++ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ unsigned long long timestamp; ++ int index; ++ ++ /* ++ * Does the user want to count the time a function was asleep. ++ * If so, do not update the time stamps. ++ */ ++ if (trace_flags & TRACE_ITER_SLEEP_TIME) ++ return; ++ ++ timestamp = trace_clock_local(); ++ ++ prev->ftrace_timestamp = timestamp; ++ ++ /* only process tasks that we timestamped */ ++ if (!next->ftrace_timestamp) ++ return; ++ ++ /* ++ * Update all the counters in next to make up for the ++ * time next was sleeping. 
++ */ ++ timestamp -= next->ftrace_timestamp; ++ ++ for (index = next->curr_ret_stack; index >= 0; index--) ++ next->ret_stack[index].calltime += timestamp; ++} ++ + /* Allocate a return stack for each task */ + static int start_graph_tracing(void) + { +@@ -2050,6 +3221,13 @@ static int start_graph_tracing(void) + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + ++ if (!ret) { ++ ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); ++ if (ret) ++ pr_info("ftrace_graph: Couldn't activate tracepoint" ++ " probe to kernel_sched_switch\n"); ++ } ++ + kfree(ret_stack_list); + return ret; + } +@@ -2080,7 +3258,13 @@ int register_ftrace_graph(trace_func_gra + { + int ret = 0; + +- mutex_lock(&ftrace_sysctl_lock); ++ mutex_lock(&ftrace_lock); ++ ++ /* we currently allow only one tracer registered at a time */ ++ if (atomic_read(&ftrace_graph_active)) { ++ ret = -EBUSY; ++ goto out; ++ } + + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; + register_pm_notifier(&ftrace_suspend_notifier); +@@ -2098,21 +3282,22 @@ int register_ftrace_graph(trace_func_gra + ftrace_startup(FTRACE_START_FUNC_RET); + + out: +- mutex_unlock(&ftrace_sysctl_lock); ++ mutex_unlock(&ftrace_lock); + return ret; + } + + void unregister_ftrace_graph(void) + { +- mutex_lock(&ftrace_sysctl_lock); ++ mutex_lock(&ftrace_lock); + + atomic_dec(&ftrace_graph_active); ++ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); + +- mutex_unlock(&ftrace_sysctl_lock); ++ mutex_unlock(&ftrace_lock); + } + + /* Allocate a return stack for newly created task */ +@@ -2127,6 +3312,7 @@ void ftrace_graph_init_task(struct task_ + t->curr_ret_stack = -1; + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); ++ t->ftrace_timestamp = 0; + } else + t->ret_stack = NULL; + } +Index: linux-2.6-tip/kernel/trace/kmemtrace.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/kmemtrace.c +@@ -0,0 +1,464 @@ ++/* ++ * Memory allocator tracing ++ * ++ * Copyright (C) 2008 Eduard - Gabriel Munteanu ++ * Copyright (C) 2008 Pekka Enberg ++ * Copyright (C) 2008 Frederic Weisbecker ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "trace_output.h" ++#include "trace.h" ++ ++/* Select an alternative, minimalistic output than the original one */ ++#define TRACE_KMEM_OPT_MINIMAL 0x1 ++ ++static struct tracer_opt kmem_opts[] = { ++ /* Default disable the minimalistic output */ ++ { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, ++ { } ++}; ++ ++static struct tracer_flags kmem_tracer_flags = { ++ .val = 0, ++ .opts = kmem_opts ++}; ++ ++static struct trace_array *kmemtrace_array; ++ ++/* Trace allocations */ ++static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, ++ unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags, ++ int node) ++{ ++ struct trace_array *tr = kmemtrace_array; ++ struct kmemtrace_alloc_entry *entry; ++ struct ring_buffer_event *event; ++ ++ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); ++ if (!event) ++ return; ++ ++ entry = ring_buffer_event_data(event); ++ tracing_generic_entry_update(&entry->ent, 0, 0); ++ ++ entry->ent.type = TRACE_KMEM_ALLOC; ++ 
entry->type_id = type_id; ++ entry->call_site = call_site; ++ entry->ptr = ptr; ++ entry->bytes_req = bytes_req; ++ entry->bytes_alloc = bytes_alloc; ++ entry->gfp_flags = gfp_flags; ++ entry->node = node; ++ ++ ring_buffer_unlock_commit(tr->buffer, event); ++ ++ trace_wake_up(); ++} ++ ++static inline void kmemtrace_free(enum kmemtrace_type_id type_id, ++ unsigned long call_site, ++ const void *ptr) ++{ ++ struct trace_array *tr = kmemtrace_array; ++ struct kmemtrace_free_entry *entry; ++ struct ring_buffer_event *event; ++ ++ event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); ++ if (!event) ++ return; ++ entry = ring_buffer_event_data(event); ++ tracing_generic_entry_update(&entry->ent, 0, 0); ++ ++ entry->ent.type = TRACE_KMEM_FREE; ++ entry->type_id = type_id; ++ entry->call_site = call_site; ++ entry->ptr = ptr; ++ ++ ring_buffer_unlock_commit(tr->buffer, event); ++ ++ trace_wake_up(); ++} ++ ++static void kmemtrace_kmalloc(unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags) ++{ ++ kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, ++ bytes_req, bytes_alloc, gfp_flags, -1); ++} ++ ++static void kmemtrace_kmem_cache_alloc(unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags) ++{ ++ kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, ++ bytes_req, bytes_alloc, gfp_flags, -1); ++} ++ ++static void kmemtrace_kmalloc_node(unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags, ++ int node) ++{ ++ kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, ++ bytes_req, bytes_alloc, gfp_flags, node); ++} ++ ++static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, ++ const void *ptr, ++ size_t bytes_req, ++ size_t bytes_alloc, ++ gfp_t gfp_flags, ++ int node) ++{ ++ kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, ++ bytes_req, bytes_alloc, gfp_flags, node); ++} ++ ++static void kmemtrace_kfree(unsigned long call_site, const void *ptr) ++{ ++ kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); ++} ++ ++static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) ++{ ++ kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); ++} ++ ++static int kmemtrace_start_probes(void) ++{ ++ int err; ++ ++ err = register_trace_kmalloc(kmemtrace_kmalloc); ++ if (err) ++ return err; ++ err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); ++ if (err) ++ return err; ++ err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); ++ if (err) ++ return err; ++ err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); ++ if (err) ++ return err; ++ err = register_trace_kfree(kmemtrace_kfree); ++ if (err) ++ return err; ++ err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); ++ ++ return err; ++} ++ ++static void kmemtrace_stop_probes(void) ++{ ++ unregister_trace_kmalloc(kmemtrace_kmalloc); ++ unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); ++ unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); ++ unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); ++ unregister_trace_kfree(kmemtrace_kfree); ++ unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); ++} ++ ++static int kmem_trace_init(struct trace_array *tr) ++{ ++ int cpu; ++ kmemtrace_array = tr; ++ ++ for_each_cpu_mask(cpu, cpu_possible_map) ++ tracing_reset(tr, cpu); ++ ++ kmemtrace_start_probes(); ++ ++ return 0; ++} ++ ++static void kmem_trace_reset(struct 
trace_array *tr) ++{ ++ kmemtrace_stop_probes(); ++} ++ ++static void kmemtrace_headers(struct seq_file *s) ++{ ++ /* Don't need headers for the original kmemtrace output */ ++ if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) ++ return; ++ ++ seq_printf(s, "#\n"); ++ seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " ++ " POINTER NODE CALLER\n"); ++ seq_printf(s, "# FREE | | | | " ++ " | | | |\n"); ++ seq_printf(s, "# |\n\n"); ++} ++ ++/* ++ * The following functions give the original output from kmemtrace, ++ * plus the origin CPU, since reordering occurs in-kernel now. ++ */ ++ ++#define KMEMTRACE_USER_ALLOC 0 ++#define KMEMTRACE_USER_FREE 1 ++ ++struct kmemtrace_user_event { ++ u8 event_id; ++ u8 type_id; ++ u16 event_size; ++ u32 cpu; ++ u64 timestamp; ++ unsigned long call_site; ++ unsigned long ptr; ++}; ++ ++struct kmemtrace_user_event_alloc { ++ size_t bytes_req; ++ size_t bytes_alloc; ++ unsigned gfp_flags; ++ int node; ++}; ++ ++static enum print_line_t ++kmemtrace_print_alloc_user(struct trace_iterator *iter, ++ struct kmemtrace_alloc_entry *entry) ++{ ++ struct kmemtrace_user_event_alloc *ev_alloc; ++ struct trace_seq *s = &iter->seq; ++ struct kmemtrace_user_event *ev; ++ ++ ev = trace_seq_reserve(s, sizeof(*ev)); ++ if (!ev) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ ev->event_id = KMEMTRACE_USER_ALLOC; ++ ev->type_id = entry->type_id; ++ ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); ++ ev->cpu = iter->cpu; ++ ev->timestamp = iter->ts; ++ ev->call_site = entry->call_site; ++ ev->ptr = (unsigned long)entry->ptr; ++ ++ ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); ++ if (!ev_alloc) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ ev_alloc->bytes_req = entry->bytes_req; ++ ev_alloc->bytes_alloc = entry->bytes_alloc; ++ ev_alloc->gfp_flags = entry->gfp_flags; ++ ev_alloc->node = entry->node; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t ++kmemtrace_print_free_user(struct trace_iterator *iter, ++ struct kmemtrace_free_entry *entry) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct kmemtrace_user_event *ev; ++ ++ ev = trace_seq_reserve(s, sizeof(*ev)); ++ if (!ev) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ ev->event_id = KMEMTRACE_USER_FREE; ++ ev->type_id = entry->type_id; ++ ev->event_size = sizeof(*ev); ++ ev->cpu = iter->cpu; ++ ev->timestamp = iter->ts; ++ ev->call_site = entry->call_site; ++ ev->ptr = (unsigned long)entry->ptr; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++/* The two other following provide a more minimalistic output */ ++static enum print_line_t ++kmemtrace_print_alloc_compress(struct trace_iterator *iter, ++ struct kmemtrace_alloc_entry *entry) ++{ ++ struct trace_seq *s = &iter->seq; ++ int ret; ++ ++ /* Alloc entry */ ++ ret = trace_seq_printf(s, " + "); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Type */ ++ switch (entry->type_id) { ++ case KMEMTRACE_TYPE_KMALLOC: ++ ret = trace_seq_printf(s, "K "); ++ break; ++ case KMEMTRACE_TYPE_CACHE: ++ ret = trace_seq_printf(s, "C "); ++ break; ++ case KMEMTRACE_TYPE_PAGES: ++ ret = trace_seq_printf(s, "P "); ++ break; ++ default: ++ ret = trace_seq_printf(s, "? 
"); ++ } ++ ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Requested */ ++ ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Allocated */ ++ ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Flags ++ * TODO: would be better to see the name of the GFP flag names ++ */ ++ ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Pointer to allocated */ ++ ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Node */ ++ ret = trace_seq_printf(s, "%4d ", entry->node); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Call site */ ++ ret = seq_print_ip_sym(s, entry->call_site, 0); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ if (!trace_seq_printf(s, "\n")) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t ++kmemtrace_print_free_compress(struct trace_iterator *iter, ++ struct kmemtrace_free_entry *entry) ++{ ++ struct trace_seq *s = &iter->seq; ++ int ret; ++ ++ /* Free entry */ ++ ret = trace_seq_printf(s, " - "); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Type */ ++ switch (entry->type_id) { ++ case KMEMTRACE_TYPE_KMALLOC: ++ ret = trace_seq_printf(s, "K "); ++ break; ++ case KMEMTRACE_TYPE_CACHE: ++ ret = trace_seq_printf(s, "C "); ++ break; ++ case KMEMTRACE_TYPE_PAGES: ++ ret = trace_seq_printf(s, "P "); ++ break; ++ default: ++ ret = trace_seq_printf(s, "? "); ++ } ++ ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Skip requested/allocated/flags */ ++ ret = trace_seq_printf(s, " "); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Pointer to allocated */ ++ ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Skip node */ ++ ret = trace_seq_printf(s, " "); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Call site */ ++ ret = seq_print_ip_sym(s, entry->call_site, 0); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ if (!trace_seq_printf(s, "\n")) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) ++{ ++ struct trace_entry *entry = iter->ent; ++ ++ switch (entry->type) { ++ case TRACE_KMEM_ALLOC: { ++ struct kmemtrace_alloc_entry *field; ++ ++ trace_assign_type(field, entry); ++ if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) ++ return kmemtrace_print_alloc_compress(iter, field); ++ else ++ return kmemtrace_print_alloc_user(iter, field); ++ } ++ ++ case TRACE_KMEM_FREE: { ++ struct kmemtrace_free_entry *field; ++ ++ trace_assign_type(field, entry); ++ if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) ++ return kmemtrace_print_free_compress(iter, field); ++ else ++ return kmemtrace_print_free_user(iter, field); ++ } ++ ++ default: ++ return TRACE_TYPE_UNHANDLED; ++ } ++} ++ ++static struct tracer kmem_tracer __read_mostly = { ++ .name = "kmemtrace", ++ .init = kmem_trace_init, ++ .reset = kmem_trace_reset, ++ .print_line = kmemtrace_print_line, ++ .print_header = kmemtrace_headers, ++ .flags = &kmem_tracer_flags ++}; ++ ++void kmemtrace_init(void) ++{ ++ /* earliest opportunity to start kmem tracing */ ++} ++ ++static int __init init_kmem_tracer(void) ++{ ++ return register_tracer(&kmem_tracer); ++} ++device_initcall(init_kmem_tracer); +Index: 
linux-2.6-tip/kernel/trace/ring_buffer.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/ring_buffer.c ++++ linux-2.6-tip/kernel/trace/ring_buffer.c +@@ -4,21 +4,93 @@ + * Copyright (C) 2008 Steven Rostedt + */ + #include ++#include ++#include ++#include + #include + #include + #include ++#include + #include + #include + #include +-#include /* used for sched_clock() (for now) */ + #include + #include + #include ++#include + #include + + #include "trace.h" + + /* ++ * The ring buffer is made up of a list of pages. A separate list of pages is ++ * allocated for each CPU. A writer may only write to a buffer that is ++ * associated with the CPU it is currently executing on. A reader may read ++ * from any per cpu buffer. ++ * ++ * The reader is special. For each per cpu buffer, the reader has its own ++ * reader page. When a reader has read the entire reader page, this reader ++ * page is swapped with another page in the ring buffer. ++ * ++ * Now, as long as the writer is off the reader page, the reader can do what ++ * ever it wants with that page. The writer will never write to that page ++ * again (as long as it is out of the ring buffer). ++ * ++ * Here's some silly ASCII art. ++ * ++ * +------+ ++ * |reader| RING BUFFER ++ * |page | ++ * +------+ +---+ +---+ +---+ ++ * | |-->| |-->| | ++ * +---+ +---+ +---+ ++ * ^ | ++ * | | ++ * +---------------+ ++ * ++ * ++ * +------+ ++ * |reader| RING BUFFER ++ * |page |------------------v ++ * +------+ +---+ +---+ +---+ ++ * | |-->| |-->| | ++ * +---+ +---+ +---+ ++ * ^ | ++ * | | ++ * +---------------+ ++ * ++ * ++ * +------+ ++ * |reader| RING BUFFER ++ * |page |------------------v ++ * +------+ +---+ +---+ +---+ ++ * ^ | |-->| |-->| | ++ * | +---+ +---+ +---+ ++ * | | ++ * | | ++ * +------------------------------+ ++ * ++ * ++ * +------+ ++ * |buffer| RING BUFFER ++ * |page |------------------v ++ * +------+ +---+ +---+ +---+ ++ * ^ | | | |-->| | ++ * | New +---+ +---+ +---+ ++ * | Reader------^ | ++ * | page | ++ * +------------------------------+ ++ * ++ * ++ * After we make this swap, the reader can hand this page off to the splice ++ * code and be done with it. It can even allocate a new page if it needs to ++ * and swap that into the ring buffer. ++ * ++ * We will be using cmpxchg soon to make all this lockless. ++ * ++ */ ++ ++/* + * A fast way to enable or disable all ring buffers is to + * call tracing_on or tracing_off. Turning off the ring buffers + * prevents all ring buffers from being recorded to. +@@ -57,7 +129,9 @@ enum { + RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, + }; + +-static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; ++static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; ++ ++#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + + /** + * tracing_on - enable all tracing buffers +@@ -89,59 +163,92 @@ EXPORT_SYMBOL_GPL(tracing_off); + * tracing_off_permanent - permanently disable ring buffers + * + * This function, once called, will disable all ring buffers +- * permanenty. ++ * permanently. 
+ */ + void tracing_off_permanent(void) + { + set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); + } + ++/** ++ * tracing_is_on - show state of ring buffers enabled ++ */ ++int tracing_is_on(void) ++{ ++ return ring_buffer_flags == RB_BUFFERS_ON; ++} ++EXPORT_SYMBOL_GPL(tracing_is_on); ++ + #include "trace.h" + +-/* Up this if you want to test the TIME_EXTENTS and normalization */ +-#define DEBUG_SHIFT 0 ++#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) ++#define RB_ALIGNMENT 4U ++#define RB_MAX_SMALL_DATA 28 ++ ++enum { ++ RB_LEN_TIME_EXTEND = 8, ++ RB_LEN_TIME_STAMP = 16, ++}; + +-/* FIXME!!! */ +-u64 ring_buffer_time_stamp(int cpu) ++static inline int rb_null_event(struct ring_buffer_event *event) + { +- u64 time; ++ return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; ++} + +- preempt_disable_notrace(); +- /* shift to debug/test normalization and TIME_EXTENTS */ +- time = sched_clock() << DEBUG_SHIFT; +- preempt_enable_no_resched_notrace(); ++static inline int rb_discarded_event(struct ring_buffer_event *event) ++{ ++ return event->type == RINGBUF_TYPE_PADDING && event->time_delta; ++} + +- return time; ++static void rb_event_set_padding(struct ring_buffer_event *event) ++{ ++ event->type = RINGBUF_TYPE_PADDING; ++ event->time_delta = 0; + } +-EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +-void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) ++/** ++ * ring_buffer_event_discard - discard an event in the ring buffer ++ * @buffer: the ring buffer ++ * @event: the event to discard ++ * ++ * Sometimes a event that is in the ring buffer needs to be ignored. ++ * This function lets the user discard an event in the ring buffer ++ * and then that event will not be read later. ++ * ++ * Note, it is up to the user to be careful with this, and protect ++ * against races. If the user discards an event that has been consumed ++ * it is possible that it could corrupt the ring buffer. 
++ */ ++void ring_buffer_event_discard(struct ring_buffer_event *event) + { +- /* Just stupid testing the normalize function and deltas */ +- *ts >>= DEBUG_SHIFT; ++ event->type = RINGBUF_TYPE_PADDING; ++ /* time delta must be non zero */ ++ if (!event->time_delta) ++ event->time_delta = 1; + } +-EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + +-#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +-#define RB_ALIGNMENT_SHIFT 2 +-#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) +-#define RB_MAX_SMALL_DATA 28 ++static unsigned ++rb_event_data_length(struct ring_buffer_event *event) ++{ ++ unsigned length; + +-enum { +- RB_LEN_TIME_EXTEND = 8, +- RB_LEN_TIME_STAMP = 16, +-}; ++ if (event->len) ++ length = event->len * RB_ALIGNMENT; ++ else ++ length = event->array[0]; ++ return length + RB_EVNT_HDR_SIZE; ++} + + /* inline for ring buffer fast paths */ +-static inline unsigned ++static unsigned + rb_event_length(struct ring_buffer_event *event) + { +- unsigned length; +- + switch (event->type) { + case RINGBUF_TYPE_PADDING: +- /* undefined */ +- return -1; ++ if (rb_null_event(event)) ++ /* undefined */ ++ return -1; ++ return rb_event_data_length(event); + + case RINGBUF_TYPE_TIME_EXTEND: + return RB_LEN_TIME_EXTEND; +@@ -150,11 +257,7 @@ rb_event_length(struct ring_buffer_event + return RB_LEN_TIME_STAMP; + + case RINGBUF_TYPE_DATA: +- if (event->len) +- length = event->len << RB_ALIGNMENT_SHIFT; +- else +- length = event->array[0]; +- return length + RB_EVNT_HDR_SIZE; ++ return rb_event_data_length(event); + default: + BUG(); + } +@@ -179,7 +282,7 @@ unsigned ring_buffer_event_length(struct + EXPORT_SYMBOL_GPL(ring_buffer_event_length); + + /* inline for ring buffer fast paths */ +-static inline void * ++static void * + rb_event_data(struct ring_buffer_event *event) + { + BUG_ON(event->type != RINGBUF_TYPE_DATA); +@@ -209,7 +312,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data + + struct buffer_data_page { + u64 time_stamp; /* page time stamp */ +- local_t commit; /* write commited index */ ++ local_t commit; /* write committed index */ + unsigned char data[]; /* data of buffer page */ + }; + +@@ -225,14 +328,25 @@ static void rb_init_page(struct buffer_d + local_set(&bpage->commit, 0); + } + ++/** ++ * ring_buffer_page_len - the size of data on the page. ++ * @page: The page to read ++ * ++ * Returns the amount of data on the page, including buffer page header. ++ */ ++size_t ring_buffer_page_len(void *page) ++{ ++ return local_read(&((struct buffer_data_page *)page)->commit) ++ + BUF_PAGE_HDR_SIZE; ++} ++ + /* + * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing + * this issue out. + */ +-static inline void free_buffer_page(struct buffer_page *bpage) ++static void free_buffer_page(struct buffer_page *bpage) + { +- if (bpage->page) +- free_page((unsigned long)bpage->page); ++ free_page((unsigned long)bpage->page); + kfree(bpage); + } + +@@ -246,7 +360,7 @@ static inline int test_time_stamp(u64 de + return 0; + } + +-#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) ++#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) + + /* + * head_page == tail_page && head == tail then buffer is empty. 
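/*
 * Editor's sketch (illustrative only, not part of the patch itself):
 * later hunks in this ring_buffer.c diff drop the saved-interrupt-flags
 * argument from ring_buffer_lock_reserve() and
 * ring_buffer_unlock_commit(), so a producer now reserves, fills and
 * commits an event as shown below. "my_payload" and "my_record" are
 * placeholder names; the call sequence mirrors the kmemtrace probes
 * added earlier in this patch.
 */
#include <linux/ring_buffer.h>

struct my_payload {
	unsigned long	value;
};

static void my_record(struct ring_buffer *buffer, unsigned long value)
{
	struct ring_buffer_event *event;
	struct my_payload *entry;

	/* reserve room for one payload; NULL means the buffer refused it */
	event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->value = value;

	/* publish the event; no flags value is handed back any more */
	ring_buffer_unlock_commit(buffer, event);
}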
+@@ -254,13 +368,13 @@ static inline int test_time_stamp(u64 de + struct ring_buffer_per_cpu { + int cpu; + struct ring_buffer *buffer; +- spinlock_t reader_lock; /* serialize readers */ +- raw_spinlock_t lock; ++ raw_spinlock_t reader_lock; /* serialize readers */ ++ __raw_spinlock_t lock; + struct lock_class_key lock_key; + struct list_head pages; + struct buffer_page *head_page; /* read from head */ + struct buffer_page *tail_page; /* write to tail */ +- struct buffer_page *commit_page; /* commited pages */ ++ struct buffer_page *commit_page; /* committed pages */ + struct buffer_page *reader_page; + unsigned long overrun; + unsigned long entries; +@@ -273,12 +387,17 @@ struct ring_buffer { + unsigned pages; + unsigned flags; + int cpus; +- cpumask_var_t cpumask; + atomic_t record_disabled; ++ cpumask_var_t cpumask; + + struct mutex mutex; + + struct ring_buffer_per_cpu **buffers; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ struct notifier_block cpu_notify; ++#endif ++ u64 (*clock)(void); + }; + + struct ring_buffer_iter { +@@ -299,11 +418,35 @@ struct ring_buffer_iter { + _____ret; \ + }) + ++/* Up this if you want to test the TIME_EXTENTS and normalization */ ++#define DEBUG_SHIFT 0 ++ ++u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) ++{ ++ u64 time; ++ ++ preempt_disable_notrace(); ++ /* shift to debug/test normalization and TIME_EXTENTS */ ++ time = buffer->clock() << DEBUG_SHIFT; ++ preempt_enable_no_resched_notrace(); ++ ++ return time; ++} ++EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); ++ ++void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, ++ int cpu, u64 *ts) ++{ ++ /* Just stupid testing the normalize function and deltas */ ++ *ts >>= DEBUG_SHIFT; ++} ++EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); ++ + /** + * check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * +- * As a safty measure we check to make sure the data pages have not ++ * As a safety measure we check to make sure the data pages have not + * been corrupted. + */ + static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +@@ -381,7 +524,7 @@ rb_allocate_cpu_buffer(struct ring_buffe + cpu_buffer->cpu = cpu; + cpu_buffer->buffer = buffer; + spin_lock_init(&cpu_buffer->reader_lock); +- cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; ++ cpu_buffer->lock = (__raw_spinlock_t) __RAW_SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&cpu_buffer->pages); + + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), +@@ -437,6 +580,11 @@ static void rb_free_cpu_buffer(struct ri + */ + extern int ring_buffer_page_too_big(void); + ++#ifdef CONFIG_HOTPLUG_CPU ++static int rb_cpu_notify(struct notifier_block *self, ++ unsigned long action, void *hcpu); ++#endif ++ + /** + * ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes per cpu that is needed. +@@ -469,12 +617,23 @@ struct ring_buffer *ring_buffer_alloc(un + + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + buffer->flags = flags; ++ buffer->clock = trace_clock_local; + + /* need at least two pages */ + if (buffer->pages == 1) + buffer->pages++; + ++ /* ++ * In case of non-hotplug cpu, if the ring-buffer is allocated ++ * in early initcall, it will not be notified of secondary cpus. ++ * In that off case, we need to allocate for all possible cpus. 
++ */ ++#ifdef CONFIG_HOTPLUG_CPU ++ get_online_cpus(); ++ cpumask_copy(buffer->cpumask, cpu_online_mask); ++#else + cpumask_copy(buffer->cpumask, cpu_possible_mask); ++#endif + buffer->cpus = nr_cpu_ids; + + bsize = sizeof(void *) * nr_cpu_ids; +@@ -490,6 +649,13 @@ struct ring_buffer *ring_buffer_alloc(un + goto fail_free_buffers; + } + ++#ifdef CONFIG_HOTPLUG_CPU ++ buffer->cpu_notify.notifier_call = rb_cpu_notify; ++ buffer->cpu_notify.priority = 0; ++ register_cpu_notifier(&buffer->cpu_notify); ++#endif ++ ++ put_online_cpus(); + mutex_init(&buffer->mutex); + + return buffer; +@@ -503,6 +669,7 @@ struct ring_buffer *ring_buffer_alloc(un + + fail_free_cpumask: + free_cpumask_var(buffer->cpumask); ++ put_online_cpus(); + + fail_free_buffer: + kfree(buffer); +@@ -519,15 +686,29 @@ ring_buffer_free(struct ring_buffer *buf + { + int cpu; + ++ get_online_cpus(); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ unregister_cpu_notifier(&buffer->cpu_notify); ++#endif ++ + for_each_buffer_cpu(buffer, cpu) + rb_free_cpu_buffer(buffer->buffers[cpu]); + ++ put_online_cpus(); ++ + free_cpumask_var(buffer->cpumask); + + kfree(buffer); + } + EXPORT_SYMBOL_GPL(ring_buffer_free); + ++void ring_buffer_set_clock(struct ring_buffer *buffer, ++ u64 (*clock)(void)) ++{ ++ buffer->clock = clock; ++} ++ + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + + static void +@@ -627,16 +808,15 @@ int ring_buffer_resize(struct ring_buffe + return size; + + mutex_lock(&buffer->mutex); ++ get_online_cpus(); + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + + if (size < buffer_size) { + + /* easy case, just free pages */ +- if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { +- mutex_unlock(&buffer->mutex); +- return -1; +- } ++ if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) ++ goto out_fail; + + rm_pages = buffer->pages - nr_pages; + +@@ -655,10 +835,8 @@ int ring_buffer_resize(struct ring_buffe + * add these pages to the cpu_buffers. Otherwise we just free + * them all and return -ENOMEM; + */ +- if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { +- mutex_unlock(&buffer->mutex); +- return -1; +- } ++ if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) ++ goto out_fail; + + new_pages = nr_pages - buffer->pages; + +@@ -683,13 +861,12 @@ int ring_buffer_resize(struct ring_buffe + rb_insert_pages(cpu_buffer, &pages, new_pages); + } + +- if (RB_WARN_ON(buffer, !list_empty(&pages))) { +- mutex_unlock(&buffer->mutex); +- return -1; +- } ++ if (RB_WARN_ON(buffer, !list_empty(&pages))) ++ goto out_fail; + + out: + buffer->pages = nr_pages; ++ put_online_cpus(); + mutex_unlock(&buffer->mutex); + + return size; +@@ -699,15 +876,20 @@ int ring_buffer_resize(struct ring_buffe + list_del_init(&bpage->list); + free_buffer_page(bpage); + } ++ put_online_cpus(); + mutex_unlock(&buffer->mutex); + return -ENOMEM; +-} +-EXPORT_SYMBOL_GPL(ring_buffer_resize); + +-static inline int rb_null_event(struct ring_buffer_event *event) +-{ +- return event->type == RINGBUF_TYPE_PADDING; ++ /* ++ * Something went totally wrong, and we are too paranoid ++ * to even clean up the mess. 
++ */ ++ out_fail: ++ put_online_cpus(); ++ mutex_unlock(&buffer->mutex); ++ return -1; + } ++EXPORT_SYMBOL_GPL(ring_buffer_resize); + + static inline void * + __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) +@@ -811,7 +993,7 @@ rb_event_index(struct ring_buffer_event + return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); + } + +-static inline int ++static int + rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) + { +@@ -825,7 +1007,7 @@ rb_is_commit(struct ring_buffer_per_cpu + rb_commit_index(cpu_buffer) == index; + } + +-static inline void ++static void + rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) + { +@@ -850,7 +1032,7 @@ rb_set_commit_event(struct ring_buffer_p + local_set(&cpu_buffer->commit_page->page->commit, index); + } + +-static inline void ++static void + rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) + { + /* +@@ -896,7 +1078,7 @@ static void rb_reset_reader_page(struct + cpu_buffer->reader_page->read = 0; + } + +-static inline void rb_inc_iter(struct ring_buffer_iter *iter) ++static void rb_inc_iter(struct ring_buffer_iter *iter) + { + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + +@@ -926,7 +1108,7 @@ static inline void rb_inc_iter(struct ri + * and with this, we can determine what to place into the + * data field. + */ +-static inline void ++static void + rb_update_event(struct ring_buffer_event *event, + unsigned type, unsigned length) + { +@@ -938,15 +1120,11 @@ rb_update_event(struct ring_buffer_event + break; + + case RINGBUF_TYPE_TIME_EXTEND: +- event->len = +- (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) +- >> RB_ALIGNMENT_SHIFT; ++ event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); + break; + + case RINGBUF_TYPE_TIME_STAMP: +- event->len = +- (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) +- >> RB_ALIGNMENT_SHIFT; ++ event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); + break; + + case RINGBUF_TYPE_DATA: +@@ -955,16 +1133,14 @@ rb_update_event(struct ring_buffer_event + event->len = 0; + event->array[0] = length; + } else +- event->len = +- (length + (RB_ALIGNMENT-1)) +- >> RB_ALIGNMENT_SHIFT; ++ event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); + break; + default: + BUG(); + } + } + +-static inline unsigned rb_calculate_event_length(unsigned length) ++static unsigned rb_calculate_event_length(unsigned length) + { + struct ring_buffer_event event; /* Used only for sizeof array */ + +@@ -990,6 +1166,7 @@ __rb_reserve_next(struct ring_buffer_per + struct ring_buffer *buffer = cpu_buffer->buffer; + struct ring_buffer_event *event; + unsigned long flags; ++ bool lock_taken = false; + + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ +@@ -1003,7 +1180,30 @@ __rb_reserve_next(struct ring_buffer_per + struct buffer_page *next_page = tail_page; + + local_irq_save(flags); +- __raw_spin_lock(&cpu_buffer->lock); ++ /* ++ * Since the write to the buffer is still not ++ * fully lockless, we must be careful with NMIs. ++ * The locks in the writers are taken when a write ++ * crosses to a new page. The locks protect against ++ * races with the readers (this will soon be fixed ++ * with a lockless solution). ++ * ++ * Because we can not protect against NMIs, and we ++ * want to keep traces reentrant, we need to manage ++ * what happens when we are in an NMI. ++ * ++ * NMIs can happen after we take the lock. ++ * If we are in an NMI, only take the lock ++ * if it is not already taken. 
Otherwise ++ * simply fail. ++ */ ++ if (unlikely(in_nmi())) { ++ if (!__raw_spin_trylock(&cpu_buffer->lock)) ++ goto out_reset; ++ } else ++ __raw_spin_lock(&cpu_buffer->lock); ++ ++ lock_taken = true; + + rb_inc_page(cpu_buffer, &next_page); + +@@ -1012,7 +1212,7 @@ __rb_reserve_next(struct ring_buffer_per + + /* we grabbed the lock before incrementing */ + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) +- goto out_unlock; ++ goto out_reset; + + /* + * If for some reason, we had an interrupt storm that made +@@ -1021,12 +1221,12 @@ __rb_reserve_next(struct ring_buffer_per + */ + if (unlikely(next_page == commit_page)) { + WARN_ON_ONCE(1); +- goto out_unlock; ++ goto out_reset; + } + + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) +- goto out_unlock; ++ goto out_reset; + + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { +@@ -1050,7 +1250,7 @@ __rb_reserve_next(struct ring_buffer_per + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ +- *ts = ring_buffer_time_stamp(cpu_buffer->cpu); ++ *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + cpu_buffer->tail_page->page->time_stamp = *ts; + } + +@@ -1060,7 +1260,8 @@ __rb_reserve_next(struct ring_buffer_per + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); +- event->type = RINGBUF_TYPE_PADDING; ++ kmemcheck_annotate_bitfield(event->bitfield); ++ rb_event_set_padding(event); + } + + if (tail <= BUF_PAGE_SIZE) +@@ -1089,6 +1290,7 @@ __rb_reserve_next(struct ring_buffer_per + return NULL; + + event = __rb_page_index(tail_page, tail); ++ kmemcheck_annotate_bitfield(event->bitfield); + rb_update_event(event, type, length); + + /* +@@ -1100,12 +1302,13 @@ __rb_reserve_next(struct ring_buffer_per + + return event; + +- out_unlock: ++ out_reset: + /* reset write */ + if (tail <= BUF_PAGE_SIZE) + local_set(&tail_page->write, tail); + +- __raw_spin_unlock(&cpu_buffer->lock); ++ if (likely(lock_taken)) ++ __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + return NULL; + } +@@ -1192,7 +1395,7 @@ rb_reserve_next_event(struct ring_buffer + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) + return NULL; + +- ts = ring_buffer_time_stamp(cpu_buffer->cpu); ++ ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + + /* + * Only the first commit can update the timestamp. +@@ -1265,7 +1468,6 @@ static DEFINE_PER_CPU(int, rb_need_resch + * ring_buffer_lock_reserve - reserve a part of the buffer + * @buffer: the ring buffer to reserve from + * @length: the length of the data to reserve (excluding event header) +- * @flags: a pointer to save the interrupt flags + * + * Returns a reseverd event on the ring buffer to copy directly to. + * The user of this interface will need to get the body to write into +@@ -1278,9 +1480,7 @@ static DEFINE_PER_CPU(int, rb_need_resch + * If NULL is returned, then nothing has been allocated or locked. + */ + struct ring_buffer_event * +-ring_buffer_lock_reserve(struct ring_buffer *buffer, +- unsigned long length, +- unsigned long *flags) ++ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) + { + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; +@@ -1347,15 +1547,13 @@ static void rb_commit(struct ring_buffer + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. +- * @flags: the interrupt flags received from ring_buffer_lock_reserve. 
+ * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ + int ring_buffer_unlock_commit(struct ring_buffer *buffer, +- struct ring_buffer_event *event, +- unsigned long flags) ++ struct ring_buffer_event *event) + { + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); +@@ -1438,7 +1636,7 @@ int ring_buffer_write(struct ring_buffer + } + EXPORT_SYMBOL_GPL(ring_buffer_write); + +-static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) ++static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) + { + struct buffer_page *reader = cpu_buffer->reader_page; + struct buffer_page *head = cpu_buffer->head_page; +@@ -1528,12 +1726,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_ena + unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) + { + struct ring_buffer_per_cpu *cpu_buffer; ++ unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; +- return cpu_buffer->entries; ++ ret = cpu_buffer->entries; ++ ++ return ret; + } + EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); + +@@ -1545,12 +1746,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cp + unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) + { + struct ring_buffer_per_cpu *cpu_buffer; ++ unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; +- return cpu_buffer->overrun; ++ ret = cpu_buffer->overrun; ++ ++ return ret; + } + EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); + +@@ -1627,9 +1831,14 @@ static void rb_iter_reset(struct ring_bu + */ + void ring_buffer_iter_reset(struct ring_buffer_iter *iter) + { +- struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; ++ struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + ++ if (!iter) ++ return; ++ ++ cpu_buffer = iter->cpu_buffer; ++ + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_iter_reset(iter); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +@@ -1803,7 +2012,7 @@ static void rb_advance_reader(struct rin + + event = rb_reader_event(cpu_buffer); + +- if (event->type == RINGBUF_TYPE_DATA) ++ if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) + cpu_buffer->entries--; + + rb_update_read_stamp(cpu_buffer, event); +@@ -1864,9 +2073,6 @@ rb_buffer_peek(struct ring_buffer *buffe + struct buffer_page *reader; + int nr_loops = 0; + +- if (!cpumask_test_cpu(cpu, buffer->cpumask)) +- return NULL; +- + cpu_buffer = buffer->buffers[cpu]; + + again: +@@ -1889,9 +2095,18 @@ rb_buffer_peek(struct ring_buffer *buffe + + switch (event->type) { + case RINGBUF_TYPE_PADDING: +- RB_WARN_ON(cpu_buffer, 1); ++ if (rb_null_event(event)) ++ RB_WARN_ON(cpu_buffer, 1); ++ /* ++ * Because the writer could be discarding every ++ * event it creates (which would probably be bad) ++ * if we were to go back to "again" then we may never ++ * catch up, and will trigger the warn on, or lock ++ * the box. Return the padding, and we will release ++ * the current locks, and try again. 
++ */ + rb_advance_reader(cpu_buffer); +- return NULL; ++ return event; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ +@@ -1906,7 +2121,8 @@ rb_buffer_peek(struct ring_buffer *buffe + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = cpu_buffer->read_stamp + event->time_delta; +- ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); ++ ring_buffer_normalize_time_stamp(buffer, ++ cpu_buffer->cpu, ts); + } + return event; + +@@ -1951,8 +2167,12 @@ rb_iter_peek(struct ring_buffer_iter *it + + switch (event->type) { + case RINGBUF_TYPE_PADDING: +- rb_inc_iter(iter); +- goto again; ++ if (rb_null_event(event)) { ++ rb_inc_iter(iter); ++ goto again; ++ } ++ rb_advance_iter(iter); ++ return event; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ +@@ -1967,7 +2187,8 @@ rb_iter_peek(struct ring_buffer_iter *it + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = iter->read_stamp + event->time_delta; +- ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); ++ ring_buffer_normalize_time_stamp(buffer, ++ cpu_buffer->cpu, ts); + } + return event; + +@@ -1995,10 +2216,19 @@ ring_buffer_peek(struct ring_buffer *buf + struct ring_buffer_event *event; + unsigned long flags; + ++ if (!cpumask_test_cpu(cpu, buffer->cpumask)) ++ return NULL; ++ ++ again: + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_buffer_peek(buffer, cpu, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + ++ if (event && event->type == RINGBUF_TYPE_PADDING) { ++ cpu_relax(); ++ goto again; ++ } ++ + return event; + } + +@@ -2017,10 +2247,16 @@ ring_buffer_iter_peek(struct ring_buffer + struct ring_buffer_event *event; + unsigned long flags; + ++ again: + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + ++ if (event && event->type == RINGBUF_TYPE_PADDING) { ++ cpu_relax(); ++ goto again; ++ } ++ + return event; + } + +@@ -2035,24 +2271,37 @@ ring_buffer_iter_peek(struct ring_buffer + struct ring_buffer_event * + ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) + { +- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; +- struct ring_buffer_event *event; ++ struct ring_buffer_per_cpu *cpu_buffer; ++ struct ring_buffer_event *event = NULL; + unsigned long flags; + ++ again: ++ /* might be called in atomic */ ++ preempt_disable(); ++ + if (!cpumask_test_cpu(cpu, buffer->cpumask)) +- return NULL; ++ goto out; + ++ cpu_buffer = buffer->buffers[cpu]; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + event = rb_buffer_peek(buffer, cpu, ts); + if (!event) +- goto out; ++ goto out_unlock; + + rb_advance_reader(cpu_buffer); + +- out: ++ out_unlock: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + ++ out: ++ preempt_enable(); ++ ++ if (event && event->type == RINGBUF_TYPE_PADDING) { ++ cpu_relax(); ++ goto again; ++ } ++ + return event; + } + EXPORT_SYMBOL_GPL(ring_buffer_consume); +@@ -2131,6 +2380,7 @@ ring_buffer_read(struct ring_buffer_iter + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + unsigned long flags; + ++ again: + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + event = rb_iter_peek(iter, ts); + if (!event) +@@ -2140,6 +2390,11 @@ ring_buffer_read(struct ring_buffer_iter + out: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + ++ if (event && event->type == RINGBUF_TYPE_PADDING) { ++ cpu_relax(); ++ goto again; ++ } ++ + return event; + } + EXPORT_SYMBOL_GPL(ring_buffer_read); +@@ -2232,6 
+2487,7 @@ int ring_buffer_empty(struct ring_buffer + if (!rb_per_cpu_empty(cpu_buffer)) + return 0; + } ++ + return 1; + } + EXPORT_SYMBOL_GPL(ring_buffer_empty); +@@ -2244,12 +2500,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); + int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) + { + struct ring_buffer_per_cpu *cpu_buffer; ++ int ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 1; + + cpu_buffer = buffer->buffers[cpu]; +- return rb_per_cpu_empty(cpu_buffer); ++ ret = rb_per_cpu_empty(cpu_buffer); ++ ++ ++ return ret; + } + EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); + +@@ -2268,18 +2528,36 @@ int ring_buffer_swap_cpu(struct ring_buf + { + struct ring_buffer_per_cpu *cpu_buffer_a; + struct ring_buffer_per_cpu *cpu_buffer_b; ++ int ret = -EINVAL; + + if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || + !cpumask_test_cpu(cpu, buffer_b->cpumask)) +- return -EINVAL; ++ goto out; + + /* At least make sure the two buffers are somewhat the same */ + if (buffer_a->pages != buffer_b->pages) +- return -EINVAL; ++ goto out; ++ ++ ret = -EAGAIN; ++ ++ if (ring_buffer_flags != RB_BUFFERS_ON) ++ goto out; ++ ++ if (atomic_read(&buffer_a->record_disabled)) ++ goto out; ++ ++ if (atomic_read(&buffer_b->record_disabled)) ++ goto out; + + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + ++ if (atomic_read(&cpu_buffer_a->record_disabled)) ++ goto out; ++ ++ if (atomic_read(&cpu_buffer_b->record_disabled)) ++ goto out; ++ + /* + * We can't do a synchronize_sched here because this + * function can be called in atomic context. +@@ -2298,18 +2576,21 @@ int ring_buffer_swap_cpu(struct ring_buf + atomic_dec(&cpu_buffer_a->record_disabled); + atomic_dec(&cpu_buffer_b->record_disabled); + +- return 0; ++ ret = 0; ++out: ++ return ret; + } + EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); + + static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, +- struct buffer_data_page *bpage) ++ struct buffer_data_page *bpage, ++ unsigned int offset) + { + struct ring_buffer_event *event; + unsigned long head; + + __raw_spin_lock(&cpu_buffer->lock); +- for (head = 0; head < local_read(&bpage->commit); ++ for (head = offset; head < local_read(&bpage->commit); + head += rb_event_length(event)) { + + event = __rb_data_page_index(bpage, head); +@@ -2340,8 +2621,8 @@ static void rb_remove_entries(struct rin + */ + void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) + { +- unsigned long addr; + struct buffer_data_page *bpage; ++ unsigned long addr; + + addr = __get_free_page(GFP_KERNEL); + if (!addr) +@@ -2349,6 +2630,8 @@ void *ring_buffer_alloc_read_page(struct + + bpage = (void *)addr; + ++ rb_init_page(bpage); ++ + return bpage; + } + +@@ -2368,6 +2651,7 @@ void ring_buffer_free_read_page(struct r + * ring_buffer_read_page - extract a page from the ring buffer + * @buffer: buffer to extract from + * @data_page: the page to use allocated from ring_buffer_alloc_read_page ++ * @len: amount to extract + * @cpu: the cpu of the buffer to extract + * @full: should the extraction only happen when the page is full. + * +@@ -2377,12 +2661,12 @@ void ring_buffer_free_read_page(struct r + * to swap with a page in the ring buffer. 
+ * + * for example: +- * rpage = ring_buffer_alloc_page(buffer); ++ * rpage = ring_buffer_alloc_read_page(buffer); + * if (!rpage) + * return error; +- * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); +- * if (ret) +- * process_page(rpage); ++ * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); ++ * if (ret >= 0) ++ * process_page(rpage, ret); + * + * When @full is set, the function will not return true unless + * the writer is off the reader page. +@@ -2393,72 +2677,118 @@ void ring_buffer_free_read_page(struct r + * responsible for that. + * + * Returns: +- * 1 if data has been transferred +- * 0 if no data has been transferred. ++ * >=0 if data has been transferred, returns the offset of consumed data. ++ * <0 if no data has been transferred. + */ + int ring_buffer_read_page(struct ring_buffer *buffer, +- void **data_page, int cpu, int full) ++ void **data_page, size_t len, int cpu, int full) + { + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *bpage; ++ struct buffer_page *reader; + unsigned long flags; +- int ret = 0; ++ unsigned int commit; ++ unsigned int read; ++ u64 save_timestamp; ++ int ret = -1; ++ ++ if (!cpumask_test_cpu(cpu, buffer->cpumask)) ++ goto out; ++ ++ /* ++ * If len is not big enough to hold the page header, then ++ * we can not copy anything. ++ */ ++ if (len <= BUF_PAGE_HDR_SIZE) ++ goto out; ++ ++ len -= BUF_PAGE_HDR_SIZE; + + if (!data_page) +- return 0; ++ goto out; + + bpage = *data_page; + if (!bpage) +- return 0; ++ goto out; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + +- /* +- * rb_buffer_peek will get the next ring buffer if +- * the current reader page is empty. +- */ +- event = rb_buffer_peek(buffer, cpu, NULL); +- if (!event) +- goto out; ++ reader = rb_get_reader_page(cpu_buffer); ++ if (!reader) ++ goto out_unlock; ++ ++ event = rb_reader_event(cpu_buffer); ++ ++ read = reader->read; ++ commit = rb_page_commit(reader); + +- /* check for data */ +- if (!local_read(&cpu_buffer->reader_page->page->commit)) +- goto out; + /* +- * If the writer is already off of the read page, then simply +- * switch the read page with the given page. Otherwise +- * we need to copy the data from the reader to the writer. +- */ +- if (cpu_buffer->reader_page == cpu_buffer->commit_page) { +- unsigned int read = cpu_buffer->reader_page->read; ++ * If this page has been partially read or ++ * if len is not big enough to read the rest of the page or ++ * a writer is still on the page, then ++ * we must copy the data from the page to the buffer. ++ * Otherwise, we can simply swap the page with the one passed in. 
++ */ ++ if (read || (len < (commit - read)) || ++ cpu_buffer->reader_page == cpu_buffer->commit_page) { ++ struct buffer_data_page *rpage = cpu_buffer->reader_page->page; ++ unsigned int rpos = read; ++ unsigned int pos = 0; ++ unsigned int size; + + if (full) +- goto out; +- /* The writer is still on the reader page, we must copy */ +- bpage = cpu_buffer->reader_page->page; +- memcpy(bpage->data, +- cpu_buffer->reader_page->page->data + read, +- local_read(&bpage->commit) - read); ++ goto out_unlock; ++ ++ if (len > (commit - read)) ++ len = (commit - read); ++ ++ size = rb_event_length(event); ++ ++ if (len < size) ++ goto out_unlock; ++ ++ /* save the current timestamp, since the user will need it */ ++ save_timestamp = cpu_buffer->read_stamp; ++ ++ /* Need to copy one event at a time */ ++ do { ++ memcpy(bpage->data + pos, rpage->data + rpos, size); ++ ++ len -= size; + +- /* consume what was read */ +- cpu_buffer->reader_page += read; ++ rb_advance_reader(cpu_buffer); ++ rpos = reader->read; ++ pos += size; + ++ event = rb_reader_event(cpu_buffer); ++ size = rb_event_length(event); ++ } while (len > size); ++ ++ /* update bpage */ ++ local_set(&bpage->commit, pos); ++ bpage->time_stamp = save_timestamp; ++ ++ /* we copied everything to the beginning */ ++ read = 0; + } else { + /* swap the pages */ + rb_init_page(bpage); +- bpage = cpu_buffer->reader_page->page; +- cpu_buffer->reader_page->page = *data_page; +- cpu_buffer->reader_page->read = 0; ++ bpage = reader->page; ++ reader->page = *data_page; ++ local_set(&reader->write, 0); ++ reader->read = 0; + *data_page = bpage; ++ ++ /* update the entry counter */ ++ rb_remove_entries(cpu_buffer, bpage, read); + } +- ret = 1; ++ ret = read; + +- /* update the entry counter */ +- rb_remove_entries(cpu_buffer, bpage); +- out: ++ out_unlock: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + ++ out: + return ret; + } + +@@ -2466,7 +2796,7 @@ static ssize_t + rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) + { +- long *p = filp->private_data; ++ unsigned long *p = filp->private_data; + char buf[64]; + int r; + +@@ -2482,9 +2812,9 @@ static ssize_t + rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +- long *p = filp->private_data; ++ unsigned long *p = filp->private_data; + char buf[64]; +- long val; ++ unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) +@@ -2509,7 +2839,7 @@ rb_simple_write(struct file *filp, const + return cnt; + } + +-static struct file_operations rb_simple_fops = { ++static const struct file_operations rb_simple_fops = { + .open = tracing_open_generic, + .read = rb_simple_read, + .write = rb_simple_write, +@@ -2532,3 +2862,42 @@ static __init int rb_init_debugfs(void) + } + + fs_initcall(rb_init_debugfs); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static int rb_cpu_notify(struct notifier_block *self, ++ unsigned long action, void *hcpu) ++{ ++ struct ring_buffer *buffer = ++ container_of(self, struct ring_buffer, cpu_notify); ++ long cpu = (long)hcpu; ++ ++ switch (action) { ++ case CPU_UP_PREPARE: ++ case CPU_UP_PREPARE_FROZEN: ++ if (cpu_isset(cpu, *buffer->cpumask)) ++ return NOTIFY_OK; ++ ++ buffer->buffers[cpu] = ++ rb_allocate_cpu_buffer(buffer, cpu); ++ if (!buffer->buffers[cpu]) { ++ WARN(1, "failed to allocate ring buffer on CPU %ld\n", ++ cpu); ++ return NOTIFY_OK; ++ } ++ smp_wmb(); ++ cpu_set(cpu, *buffer->cpumask); ++ break; ++ case CPU_DOWN_PREPARE: ++ case CPU_DOWN_PREPARE_FROZEN: ++ /* ++ * Do nothing. 
++ * If we were to free the buffer, then the user would ++ * lose any trace that was in the buffer. ++ */ ++ break; ++ default: ++ break; ++ } ++ return NOTIFY_OK; ++} ++#endif +Index: linux-2.6-tip/kernel/trace/trace.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace.c ++++ linux-2.6-tip/kernel/trace/trace.c +@@ -11,32 +11,33 @@ + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ ++#include + #include ++#include ++#include + #include + #include + #include ++#include + #include + #include + #include + #include + #include ++#include + #include + #include + #include ++#include + #include + #include + #include + #include + #include + #include +-#include +-#include +- +-#include +-#include +-#include + + #include "trace.h" ++#include "trace_output.h" + + #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) + +@@ -44,14 +45,25 @@ unsigned long __read_mostly tracing_max_ + unsigned long __read_mostly tracing_thresh; + + /* ++ * On boot up, the ring buffer is set to the minimum size, so that ++ * we do not waste memory on systems that are not using tracing. ++ */ ++static int ring_buffer_expanded; ++ ++/* + * We need to change this state when a selftest is running. + * A selftest will lurk into the ring-buffer to count the + * entries inserted during the selftest although some concurrent +- * insertions into the ring-buffer such as ftrace_printk could occurred ++ * insertions into the ring-buffer such as trace_printk could occurred + * at the same time, giving false positive or negative results. + */ + static bool __read_mostly tracing_selftest_running; + ++/* ++ * If a tracer is running, we do not want to run SELFTEST. ++ */ ++static bool __read_mostly tracing_selftest_disabled; ++ + /* For tracers that don't implement custom flags */ + static struct tracer_opt dummy_tracer_opt[] = { + { } +@@ -73,7 +85,7 @@ static int dummy_set_flag(u32 old_flags, + * of the tracer is successful. But that is the only place that sets + * this back to zero. + */ +-int tracing_disabled = 1; ++static int tracing_disabled = 1; + + static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); + +@@ -91,6 +103,9 @@ static inline void ftrace_enable_cpu(voi + + static cpumask_var_t __read_mostly tracing_buffer_mask; + ++/* Define which cpu buffers are currently read in trace_pipe */ ++static cpumask_var_t tracing_reader_cpumask; ++ + #define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) + +@@ -109,14 +124,21 @@ static cpumask_var_t __read_mostly traci + */ + int ftrace_dump_on_oops; + +-static int tracing_set_tracer(char *buf); ++static int tracing_set_tracer(const char *buf); ++ ++#define BOOTUP_TRACER_SIZE 100 ++static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; ++static char *default_bootup_tracer; + + static int __init set_ftrace(char *str) + { +- tracing_set_tracer(str); ++ strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); ++ default_bootup_tracer = bootup_tracer_buf; ++ /* We are using ftrace early, expand it */ ++ ring_buffer_expanded = 1; + return 1; + } +-__setup("ftrace", set_ftrace); ++__setup("ftrace=", set_ftrace); + + static int __init set_ftrace_dump_on_oops(char *str) + { +@@ -133,13 +155,6 @@ ns2usecs(cycle_t nsec) + return nsec; + } + +-cycle_t ftrace_now(int cpu) +-{ +- u64 ts = ring_buffer_time_stamp(cpu); +- ring_buffer_normalize_time_stamp(cpu, &ts); +- return ts; +-} +- + /* + * The global_trace is the descriptor that holds the tracing + * buffers for the live tracing. 
For each CPU, it contains +@@ -156,6 +171,20 @@ static struct trace_array global_trace; + + static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); + ++cycle_t ftrace_now(int cpu) ++{ ++ u64 ts; ++ ++ /* Early boot up does not have a buffer yet */ ++ if (!global_trace.buffer) ++ return trace_clock_local(); ++ ++ ts = ring_buffer_time_stamp(global_trace.buffer, cpu); ++ ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); ++ ++ return ts; ++} ++ + /* + * The max_tr is used to snapshot the global_trace when a maximum + * latency is reached. Some tracers will use this to store a maximum +@@ -186,9 +215,6 @@ int tracing_is_enabled(void) + return tracer_enabled; + } + +-/* function tracing enabled */ +-int ftrace_function_enabled; +- + /* + * trace_buf_size is the size in bytes that is allocated + * for a buffer. Note, the number of bytes is always rounded +@@ -229,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wai + + /* trace_flags holds trace_options default values */ + unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | +- TRACE_ITER_ANNOTATE; ++ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | ++ TRACE_ITER_GRAPH_TIME; + + /** + * trace_wake_up - wake up tasks waiting for trace input +@@ -239,6 +266,10 @@ unsigned long trace_flags = TRACE_ITER_P + */ + void trace_wake_up(void) + { ++#ifdef CONFIG_PREEMPT_RT ++ if (in_atomic() || irqs_disabled()) ++ return; ++#endif + /* + * The runqueue_is_locked() can fail, but this is the best we + * have for now: +@@ -280,13 +311,18 @@ static const char *trace_options[] = { + "block", + "stacktrace", + "sched-tree", +- "ftrace_printk", ++ "trace_printk", + "ftrace_preempt", + "branch", + "annotate", + "userstacktrace", + "sym-userobj", + "printk-msg-only", ++ "context-info", ++ "latency-format", ++ "global-clock", ++ "sleep-time", ++ "graph-time", + NULL + }; + +@@ -299,8 +335,7 @@ static const char *trace_options[] = { + * This is defined as a raw_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + */ +-static raw_spinlock_t ftrace_max_lock = +- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; ++static __raw_spinlock_t ftrace_max_lock = __RAW_SPIN_LOCK_UNLOCKED; + + /* + * Copy the new maximum trace into the separate maximum-trace +@@ -326,146 +361,37 @@ __update_max_tr(struct trace_array *tr, + data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ +- tracing_record_cmdline(current); ++ tracing_record_cmdline(tsk); + } + +-/** +- * trace_seq_printf - sequence printing of trace information +- * @s: trace sequence descriptor +- * @fmt: printf format string +- * +- * The tracer may use either sequence operations or its own +- * copy to user routines. To simplify formating of a trace +- * trace_seq_printf is used to store strings into a special +- * buffer (@s). Then the output may be either used by +- * the sequencer or pulled into another buffer. +- */ +-int +-trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
++ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) + { +- int len = (PAGE_SIZE - 1) - s->len; +- va_list ap; ++ int len; + int ret; + +- if (!len) +- return 0; +- +- va_start(ap, fmt); +- ret = vsnprintf(s->buffer + s->len, len, fmt, ap); +- va_end(ap); +- +- /* If we can't write it all, don't bother writing anything */ +- if (ret >= len) +- return 0; +- +- s->len += ret; +- +- return len; +-} +- +-/** +- * trace_seq_puts - trace sequence printing of simple string +- * @s: trace sequence descriptor +- * @str: simple string to record +- * +- * The tracer may use either the sequence operations or its own +- * copy to user routines. This function records a simple string +- * into a special buffer (@s) for later retrieval by a sequencer +- * or other mechanism. +- */ +-static int +-trace_seq_puts(struct trace_seq *s, const char *str) +-{ +- int len = strlen(str); +- +- if (len > ((PAGE_SIZE - 1) - s->len)) +- return 0; +- +- memcpy(s->buffer + s->len, str, len); +- s->len += len; +- +- return len; +-} +- +-static int +-trace_seq_putc(struct trace_seq *s, unsigned char c) +-{ +- if (s->len >= (PAGE_SIZE - 1)) +- return 0; +- +- s->buffer[s->len++] = c; +- +- return 1; +-} +- +-static int +-trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +-{ +- if (len > ((PAGE_SIZE - 1) - s->len)) ++ if (!cnt) + return 0; + +- memcpy(s->buffer + s->len, mem, len); +- s->len += len; +- +- return len; +-} +- +-#define MAX_MEMHEX_BYTES 8 +-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) +- +-static int +-trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +-{ +- unsigned char hex[HEX_CHARS]; +- unsigned char *data = mem; +- int i, j; +- +-#ifdef __BIG_ENDIAN +- for (i = 0, j = 0; i < len; i++) { +-#else +- for (i = len-1, j = 0; i >= 0; i--) { +-#endif +- hex[j++] = hex_asc_hi(data[i]); +- hex[j++] = hex_asc_lo(data[i]); +- } +- hex[j++] = ' '; +- +- return trace_seq_putmem(s, hex, j); +-} +- +-static int +-trace_seq_path(struct trace_seq *s, struct path *path) +-{ +- unsigned char *p; ++ if (s->len <= s->readpos) ++ return -EBUSY; + +- if (s->len >= (PAGE_SIZE - 1)) +- return 0; +- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); +- if (!IS_ERR(p)) { +- p = mangle_path(s->buffer + s->len, p, "\n"); +- if (p) { +- s->len = p - s->buffer; +- return 1; +- } +- } else { +- s->buffer[s->len++] = '?'; +- return 1; +- } ++ len = s->len - s->readpos; ++ if (cnt > len) ++ cnt = len; ++ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); ++ if (ret == cnt) ++ return -EFAULT; + +- return 0; +-} ++ cnt -= ret; + +-static void +-trace_seq_reset(struct trace_seq *s) +-{ +- s->len = 0; +- s->readpos = 0; ++ s->readpos += cnt; ++ return cnt; + } + +-ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) ++static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) + { + int len; +- int ret; ++ void *ret; + + if (s->len <= s->readpos) + return -EBUSY; +@@ -473,25 +399,14 @@ ssize_t trace_seq_to_user(struct trace_s + len = s->len - s->readpos; + if (cnt > len) + cnt = len; +- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); +- if (ret) ++ ret = memcpy(buf, s->buffer + s->readpos, cnt); ++ if (!ret) + return -EFAULT; + +- s->readpos += len; ++ s->readpos += cnt; + return cnt; + } + +-static void +-trace_print_seq(struct seq_file *m, struct trace_seq *s) +-{ +- int len = s->len >= PAGE_SIZE ? 
PAGE_SIZE - 1 : s->len; +- +- s->buffer[len] = 0; +- seq_puts(m, s->buffer); +- +- trace_seq_reset(s); +-} +- + /** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer +@@ -543,7 +458,7 @@ update_max_tr_single(struct trace_array + + ftrace_enable_cpu(); + +- WARN_ON_ONCE(ret); ++ WARN_ON_ONCE(ret && ret != -EAGAIN); + + __update_max_tr(tr, tsk, cpu); + __raw_spin_unlock(&ftrace_max_lock); +@@ -556,6 +471,8 @@ update_max_tr_single(struct trace_array + * Register a new plugin tracer. + */ + int register_tracer(struct tracer *type) ++__releases(kernel_lock) ++__acquires(kernel_lock) + { + struct tracer *t; + int len; +@@ -594,9 +511,12 @@ int register_tracer(struct tracer *type) + else + if (!type->flags->opts) + type->flags->opts = dummy_tracer_opt; ++ if (!type->wait_pipe) ++ type->wait_pipe = default_wait_pipe; ++ + + #ifdef CONFIG_FTRACE_STARTUP_TEST +- if (type->selftest) { ++ if (type->selftest && !tracing_selftest_disabled) { + struct tracer *saved_tracer = current_trace; + struct trace_array *tr = &global_trace; + int i; +@@ -638,8 +558,26 @@ int register_tracer(struct tracer *type) + out: + tracing_selftest_running = false; + mutex_unlock(&trace_types_lock); +- lock_kernel(); + ++ if (ret || !default_bootup_tracer) ++ goto out_unlock; ++ ++ if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) ++ goto out_unlock; ++ ++ printk(KERN_INFO "Starting tracer '%s'\n", type->name); ++ /* Do we want this tracer to start on bootup? */ ++ tracing_set_tracer(type->name); ++ default_bootup_tracer = NULL; ++ /* disable other selftests, since this will break it. */ ++ tracing_selftest_disabled = 1; ++#ifdef CONFIG_FTRACE_STARTUP_TEST ++ printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", ++ type->name); ++#endif ++ ++ out_unlock: ++ lock_kernel(); + return ret; + } + +@@ -658,6 +596,15 @@ void unregister_tracer(struct tracer *ty + + found: + *t = (*t)->next; ++ ++ if (type == current_trace && tracer_enabled) { ++ tracer_enabled = 0; ++ tracing_stop(); ++ if (current_trace->stop) ++ current_trace->stop(&global_trace); ++ current_trace = &nop_trace; ++ } ++ + if (strlen(type->name) != max_tracer_type_len) + goto out; + +@@ -689,24 +636,25 @@ void tracing_reset_online_cpus(struct tr + } + + #define SAVED_CMDLINES 128 ++#define NO_CMDLINE_MAP UINT_MAX + static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; + static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; + static int cmdline_idx; +-static DEFINE_SPINLOCK(trace_cmdline_lock); ++static __raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; + + /* temporary disable recording */ +-atomic_t trace_record_cmdline_disabled __read_mostly; ++static atomic_t trace_record_cmdline_disabled __read_mostly; + + static void trace_init_cmdlines(void) + { +- memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); +- memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); ++ memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); ++ memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); + cmdline_idx = 0; + } + + static int trace_stop_count; +-static DEFINE_SPINLOCK(tracing_start_lock); ++static DEFINE_RAW_SPINLOCK(tracing_start_lock); + + /** + * ftrace_off_permanent - disable all ftrace code permanently +@@ -738,13 +686,12 @@ void tracing_start(void) + return; + + spin_lock_irqsave(&tracing_start_lock, flags); +- if (--trace_stop_count) +- goto out; +- +- if (trace_stop_count 
< 0) { +- /* Someone screwed up their debugging */ +- WARN_ON_ONCE(1); +- trace_stop_count = 0; ++ if (--trace_stop_count) { ++ if (trace_stop_count < 0) { ++ /* Someone screwed up their debugging */ ++ WARN_ON_ONCE(1); ++ trace_stop_count = 0; ++ } + goto out; + } + +@@ -794,8 +741,7 @@ void trace_stop_cmdline_recording(void); + + static void trace_save_cmdline(struct task_struct *tsk) + { +- unsigned map; +- unsigned idx; ++ unsigned pid, idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return; +@@ -806,17 +752,24 @@ static void trace_save_cmdline(struct ta + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + */ +- if (!spin_trylock(&trace_cmdline_lock)) ++ if (!__raw_spin_trylock(&trace_cmdline_lock)) + return; + + idx = map_pid_to_cmdline[tsk->pid]; +- if (idx >= SAVED_CMDLINES) { ++ if (idx == NO_CMDLINE_MAP) { + idx = (cmdline_idx + 1) % SAVED_CMDLINES; + +- map = map_cmdline_to_pid[idx]; +- if (map <= PID_MAX_DEFAULT) +- map_pid_to_cmdline[map] = (unsigned)-1; ++ /* ++ * Check whether the cmdline buffer at idx has a pid ++ * mapped. We are going to overwrite that entry so we ++ * need to clear the map_pid_to_cmdline. Otherwise we ++ * would read the new comm for the old pid. ++ */ ++ pid = map_cmdline_to_pid[idx]; ++ if (pid != NO_CMDLINE_MAP) ++ map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + ++ map_cmdline_to_pid[idx] = tsk->pid; + map_pid_to_cmdline[tsk->pid] = idx; + + cmdline_idx = idx; +@@ -824,33 +777,37 @@ static void trace_save_cmdline(struct ta + + memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + +- spin_unlock(&trace_cmdline_lock); ++ __raw_spin_unlock(&trace_cmdline_lock); + } + +-char *trace_find_cmdline(int pid) ++void trace_find_cmdline(int pid, char comm[]) + { +- char *cmdline = "<...>"; + unsigned map; + +- if (!pid) +- return ""; ++ if (!pid) { ++ strcpy(comm, ""); ++ return; ++ } + +- if (pid > PID_MAX_DEFAULT) +- goto out; ++ if (pid > PID_MAX_DEFAULT) { ++ strcpy(comm, "<...>"); ++ return; ++ } + ++ __raw_spin_lock(&trace_cmdline_lock); + map = map_pid_to_cmdline[pid]; +- if (map >= SAVED_CMDLINES) +- goto out; +- +- cmdline = saved_cmdlines[map]; ++ if (map != NO_CMDLINE_MAP) ++ strcpy(comm, saved_cmdlines[map]); ++ else ++ strcpy(comm, "<...>"); + +- out: +- return cmdline; ++ __raw_spin_unlock(&trace_cmdline_lock); + } + + void tracing_record_cmdline(struct task_struct *tsk) + { +- if (atomic_read(&trace_record_cmdline_disabled)) ++ if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || ++ !tracing_is_on()) + return; + + trace_save_cmdline(tsk); +@@ -864,7 +821,7 @@ tracing_generic_entry_update(struct trac + + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; +- entry->tgid = (tsk) ? tsk->tgid : 0; ++ entry->tgid = (tsk) ? tsk->tgid : 0; + entry->flags = + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | +@@ -876,78 +833,132 @@ tracing_generic_entry_update(struct trac + (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); + } + ++struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, ++ unsigned char type, ++ unsigned long len, ++ unsigned long flags, int pc) ++{ ++ struct ring_buffer_event *event; ++ ++ event = ring_buffer_lock_reserve(tr->buffer, len); ++ if (event != NULL) { ++ struct trace_entry *ent = ring_buffer_event_data(event); ++ ++ tracing_generic_entry_update(ent, flags, pc); ++ ent->type = type; ++ } ++ ++ return event; ++} ++static void ftrace_trace_stack(struct trace_array *tr, ++ unsigned long flags, int skip, int pc); ++static void ftrace_trace_userstack(struct trace_array *tr, ++ unsigned long flags, int pc); ++ ++static inline void __trace_buffer_unlock_commit(struct trace_array *tr, ++ struct ring_buffer_event *event, ++ unsigned long flags, int pc, ++ int wake) ++{ ++ ring_buffer_unlock_commit(tr->buffer, event); ++ ++ ftrace_trace_stack(tr, flags, 6, pc); ++ ftrace_trace_userstack(tr, flags, pc); ++ ++ if (wake) ++ trace_wake_up(); ++} ++ ++void trace_buffer_unlock_commit(struct trace_array *tr, ++ struct ring_buffer_event *event, ++ unsigned long flags, int pc) ++{ ++ __trace_buffer_unlock_commit(tr, event, flags, pc, 1); ++} ++ ++struct ring_buffer_event * ++trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, ++ unsigned long flags, int pc) ++{ ++ return trace_buffer_lock_reserve(&global_trace, ++ type, len, flags, pc); ++} ++ ++void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, ++ unsigned long flags, int pc) ++{ ++ return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); ++} ++ ++void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, ++ unsigned long flags, int pc) ++{ ++ return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); ++} ++ + void +-trace_function(struct trace_array *tr, struct trace_array_cpu *data, ++trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) + { + struct ring_buffer_event *event; + struct ftrace_entry *entry; +- unsigned long irq_flags; + + /* If we are reading the ring buffer, don't trace */ + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), ++ flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, flags, pc); +- entry->ent.type = TRACE_FN; + entry->ip = ip; + entry->parent_ip = parent_ip; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ++ ring_buffer_unlock_commit(tr->buffer, event); + } + + #ifdef CONFIG_FUNCTION_GRAPH_TRACER +-static void __trace_graph_entry(struct trace_array *tr, +- struct trace_array_cpu *data, ++static int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) + { + struct ring_buffer_event *event; + struct ftrace_graph_ent_entry *entry; +- unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) +- return; ++ return 0; + +- event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, ++ sizeof(*entry), flags, pc); + if (!event) +- return; ++ return 0; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, flags, pc); +- entry->ent.type = TRACE_GRAPH_ENT; + entry->graph_ent 
= *trace; +- ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); ++ ring_buffer_unlock_commit(global_trace.buffer, event); ++ ++ return 1; + } + + static void __trace_graph_return(struct trace_array *tr, +- struct trace_array_cpu *data, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) + { + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *entry; +- unsigned long irq_flags; + + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + +- event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET, ++ sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, flags, pc); +- entry->ent.type = TRACE_GRAPH_RET; + entry->ret = *trace; +- ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); ++ ring_buffer_unlock_commit(global_trace.buffer, event); + } + #endif + +@@ -957,31 +968,23 @@ ftrace(struct trace_array *tr, struct tr + int pc) + { + if (likely(!atomic_read(&data->disabled))) +- trace_function(tr, data, ip, parent_ip, flags, pc); ++ trace_function(tr, ip, parent_ip, flags, pc); + } + +-static void ftrace_trace_stack(struct trace_array *tr, +- struct trace_array_cpu *data, +- unsigned long flags, +- int skip, int pc) ++static void __ftrace_trace_stack(struct trace_array *tr, ++ unsigned long flags, ++ int skip, int pc) + { + #ifdef CONFIG_STACKTRACE + struct ring_buffer_event *event; + struct stack_entry *entry; + struct stack_trace trace; +- unsigned long irq_flags; + +- if (!(trace_flags & TRACE_ITER_STACKTRACE)) +- return; +- +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_STACK, ++ sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, flags, pc); +- entry->ent.type = TRACE_STACK; +- + memset(&entry->caller, 0, sizeof(entry->caller)); + + trace.nr_entries = 0; +@@ -990,38 +993,43 @@ static void ftrace_trace_stack(struct tr + trace.entries = entry->caller; + + save_stack_trace(&trace); +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ++ ring_buffer_unlock_commit(tr->buffer, event); + #endif + } + ++static void ftrace_trace_stack(struct trace_array *tr, ++ unsigned long flags, ++ int skip, int pc) ++{ ++ if (!(trace_flags & TRACE_ITER_STACKTRACE)) ++ return; ++ ++ __ftrace_trace_stack(tr, flags, skip, pc); ++} ++ + void __trace_stack(struct trace_array *tr, +- struct trace_array_cpu *data, + unsigned long flags, +- int skip) ++ int skip, int pc) + { +- ftrace_trace_stack(tr, data, flags, skip, preempt_count()); ++ __ftrace_trace_stack(tr, flags, skip, pc); + } + + static void ftrace_trace_userstack(struct trace_array *tr, +- struct trace_array_cpu *data, +- unsigned long flags, int pc) ++ unsigned long flags, int pc) + { + #ifdef CONFIG_STACKTRACE + struct ring_buffer_event *event; + struct userstack_entry *entry; + struct stack_trace trace; +- unsigned long irq_flags; + + if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, ++ sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, flags, pc); +- entry->ent.type = TRACE_USER_STACK; + + 
memset(&entry->caller, 0, sizeof(entry->caller)); + +@@ -1031,70 +1039,58 @@ static void ftrace_trace_userstack(struc + trace.entries = entry->caller; + + save_stack_trace_user(&trace); +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ++ ring_buffer_unlock_commit(tr->buffer, event); + #endif + } + +-void __trace_userstack(struct trace_array *tr, +- struct trace_array_cpu *data, +- unsigned long flags) ++#ifdef UNUSED ++static void __trace_userstack(struct trace_array *tr, unsigned long flags) + { +- ftrace_trace_userstack(tr, data, flags, preempt_count()); ++ ftrace_trace_userstack(tr, flags, preempt_count()); + } ++#endif /* UNUSED */ + + static void +-ftrace_trace_special(void *__tr, void *__data, ++ftrace_trace_special(void *__tr, + unsigned long arg1, unsigned long arg2, unsigned long arg3, + int pc) + { + struct ring_buffer_event *event; +- struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; + struct special_entry *entry; +- unsigned long irq_flags; + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, ++ sizeof(*entry), 0, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, 0, pc); +- entry->ent.type = TRACE_SPECIAL; + entry->arg1 = arg1; + entry->arg2 = arg2; + entry->arg3 = arg3; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- ftrace_trace_stack(tr, data, irq_flags, 4, pc); +- ftrace_trace_userstack(tr, data, irq_flags, pc); +- +- trace_wake_up(); ++ trace_buffer_unlock_commit(tr, event, 0, pc); + } + + void + __trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) + { +- ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); ++ ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); + } + + void + tracing_sched_switch_trace(struct trace_array *tr, +- struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) + { + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; +- unsigned long irq_flags; + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_CTX, ++ sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, flags, pc); +- entry->ent.type = TRACE_CTX; + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; +@@ -1102,29 +1098,23 @@ tracing_sched_switch_trace(struct trace_ + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- ftrace_trace_stack(tr, data, flags, 5, pc); +- ftrace_trace_userstack(tr, data, flags, pc); ++ trace_buffer_unlock_commit(tr, event, flags, pc); + } + + void + tracing_sched_wakeup_trace(struct trace_array *tr, +- struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) + { + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; +- unsigned long irq_flags; + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_WAKE, ++ sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, flags, pc); +- 
entry->ent.type = TRACE_WAKE; + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; +@@ -1132,11 +1122,10 @@ tracing_sched_wakeup_trace(struct trace_ + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- ftrace_trace_stack(tr, data, flags, 6, pc); +- ftrace_trace_userstack(tr, data, flags, pc); + +- trace_wake_up(); ++ ring_buffer_unlock_commit(tr->buffer, event); ++ ftrace_trace_stack(tr, flags, 6, pc); ++ ftrace_trace_userstack(tr, flags, pc); + } + + void +@@ -1157,66 +1146,7 @@ ftrace_special(unsigned long arg1, unsig + data = tr->data[cpu]; + + if (likely(atomic_inc_return(&data->disabled) == 1)) +- ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); +- +- atomic_dec(&data->disabled); +- local_irq_restore(flags); +-} +- +-#ifdef CONFIG_FUNCTION_TRACER +-static void +-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +-{ +- struct trace_array *tr = &global_trace; +- struct trace_array_cpu *data; +- unsigned long flags; +- long disabled; +- int cpu, resched; +- int pc; +- +- if (unlikely(!ftrace_function_enabled)) +- return; +- +- pc = preempt_count(); +- resched = ftrace_preempt_disable(); +- local_save_flags(flags); +- cpu = raw_smp_processor_id(); +- data = tr->data[cpu]; +- disabled = atomic_inc_return(&data->disabled); +- +- if (likely(disabled == 1)) +- trace_function(tr, data, ip, parent_ip, flags, pc); +- +- atomic_dec(&data->disabled); +- ftrace_preempt_enable(resched); +-} +- +-static void +-function_trace_call(unsigned long ip, unsigned long parent_ip) +-{ +- struct trace_array *tr = &global_trace; +- struct trace_array_cpu *data; +- unsigned long flags; +- long disabled; +- int cpu; +- int pc; +- +- if (unlikely(!ftrace_function_enabled)) +- return; +- +- /* +- * Need to use raw, since this must be called before the +- * recursive protection is performed. 
+- */ +- local_irq_save(flags); +- cpu = raw_smp_processor_id(); +- data = tr->data[cpu]; +- disabled = atomic_inc_return(&data->disabled); +- +- if (likely(disabled == 1)) { +- pc = preempt_count(); +- trace_function(tr, data, ip, parent_ip, flags, pc); +- } ++ ftrace_trace_special(tr, arg1, arg2, arg3, pc); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +@@ -1229,6 +1159,7 @@ int trace_graph_entry(struct ftrace_grap + struct trace_array_cpu *data; + unsigned long flags; + long disabled; ++ int ret; + int cpu; + int pc; + +@@ -1244,15 +1175,18 @@ int trace_graph_entry(struct ftrace_grap + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); +- __trace_graph_entry(tr, data, trace, flags, pc); ++ ret = __trace_graph_entry(tr, trace, flags, pc); ++ } else { ++ ret = 0; + } + /* Only do the atomic if it is not already set */ + if (!test_tsk_trace_graph(current)) + set_tsk_trace_graph(current); ++ + atomic_dec(&data->disabled); + local_irq_restore(flags); + +- return 1; ++ return ret; + } + + void trace_graph_return(struct ftrace_graph_ret *trace) +@@ -1270,7 +1204,7 @@ void trace_graph_return(struct ftrace_gr + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); +- __trace_graph_return(tr, data, trace, flags, pc); ++ __trace_graph_return(tr, trace, flags, pc); + } + if (!trace->depth) + clear_tsk_trace_graph(current); +@@ -1279,30 +1213,121 @@ void trace_graph_return(struct ftrace_gr + } + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +-static struct ftrace_ops trace_ops __read_mostly = +-{ +- .func = function_trace_call, +-}; + +-void tracing_start_function_trace(void) ++/** ++ * trace_vbprintk - write binary msg to tracing buffer ++ * ++ */ ++int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + { +- ftrace_function_enabled = 0; ++ static __raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; ++ static u32 trace_buf[TRACE_BUF_SIZE]; + +- if (trace_flags & TRACE_ITER_PREEMPTONLY) +- trace_ops.func = function_trace_call_preempt_only; +- else +- trace_ops.func = function_trace_call; ++ struct ring_buffer_event *event; ++ struct trace_array *tr = &global_trace; ++ struct trace_array_cpu *data; ++ struct bprint_entry *entry; ++ unsigned long flags; ++ int resched; ++ int cpu, len = 0, size, pc; ++ ++ if (unlikely(tracing_selftest_running || tracing_disabled)) ++ return 0; ++ ++ /* Don't pollute graph traces with trace_vprintk internals */ ++ pause_graph_tracing(); ++ ++ pc = preempt_count(); ++ resched = ftrace_preempt_disable(); ++ cpu = raw_smp_processor_id(); ++ data = tr->data[cpu]; ++ ++ if (unlikely(atomic_read(&data->disabled))) ++ goto out; ++ ++ /* Lockdep uses trace_printk for lock tracing */ ++ local_irq_save(flags); ++ __raw_spin_lock(&trace_buf_lock); ++ len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); ++ ++ if (len > TRACE_BUF_SIZE || len < 0) ++ goto out_unlock; ++ ++ size = sizeof(*entry) + sizeof(u32) * len; ++ event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); ++ if (!event) ++ goto out_unlock; ++ entry = ring_buffer_event_data(event); ++ entry->ip = ip; ++ entry->fmt = fmt; ++ ++ memcpy(entry->buf, trace_buf, sizeof(u32) * len); ++ ring_buffer_unlock_commit(tr->buffer, event); ++ ++out_unlock: ++ __raw_spin_unlock(&trace_buf_lock); ++ local_irq_restore(flags); ++ ++out: ++ ftrace_preempt_enable(resched); ++ unpause_graph_tracing(); + +- register_ftrace_function(&trace_ops); +- ftrace_function_enabled = 1; ++ return len; 
+ } ++EXPORT_SYMBOL_GPL(trace_vbprintk); + +-void tracing_stop_function_trace(void) ++int trace_vprintk(unsigned long ip, const char *fmt, va_list args) + { +- ftrace_function_enabled = 0; +- unregister_ftrace_function(&trace_ops); ++ static __raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; ++ static char trace_buf[TRACE_BUF_SIZE]; ++ ++ struct ring_buffer_event *event; ++ struct trace_array *tr = &global_trace; ++ struct trace_array_cpu *data; ++ int cpu, len = 0, size, pc; ++ struct print_entry *entry; ++ unsigned long irq_flags; ++ ++ if (tracing_disabled || tracing_selftest_running) ++ return 0; ++ ++ pc = preempt_count(); ++ preempt_disable_notrace(); ++ cpu = raw_smp_processor_id(); ++ data = tr->data[cpu]; ++ ++ if (unlikely(atomic_read(&data->disabled))) ++ goto out; ++ ++ pause_graph_tracing(); ++ raw_local_irq_save(irq_flags); ++ __raw_spin_lock(&trace_buf_lock); ++ len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); ++ ++ len = min(len, TRACE_BUF_SIZE-1); ++ trace_buf[len] = 0; ++ ++ size = sizeof(*entry) + len + 1; ++ event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); ++ if (!event) ++ goto out_unlock; ++ entry = ring_buffer_event_data(event); ++ entry->ip = ip; ++ ++ memcpy(&entry->buf, trace_buf, len); ++ entry->buf[len] = 0; ++ ring_buffer_unlock_commit(tr->buffer, event); ++ ++ out_unlock: ++ __raw_spin_unlock(&trace_buf_lock); ++ raw_local_irq_restore(irq_flags); ++ unpause_graph_tracing(); ++ out: ++ preempt_enable_notrace(); ++ ++ return len; + } +-#endif ++EXPORT_SYMBOL_GPL(trace_vprintk); + + enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, +@@ -1345,10 +1370,25 @@ __find_next_entry(struct trace_iterator + { + struct ring_buffer *buffer = iter->tr->buffer; + struct trace_entry *ent, *next = NULL; ++ int cpu_file = iter->cpu_file; + u64 next_ts = 0, ts; + int next_cpu = -1; + int cpu; + ++ /* ++ * If we are in a per_cpu trace file, don't bother by iterating over ++ * all cpu and peek directly. ++ */ ++ if (cpu_file > TRACE_PIPE_ALL_CPU) { ++ if (ring_buffer_empty_cpu(buffer, cpu_file)) ++ return NULL; ++ ent = peek_next_entry(iter, cpu_file, ent_ts); ++ if (ent_cpu) ++ *ent_cpu = cpu_file; ++ ++ return ent; ++ } ++ + for_each_tracing_cpu(cpu) { + + if (ring_buffer_empty_cpu(buffer, cpu)) +@@ -1376,8 +1416,8 @@ __find_next_entry(struct trace_iterator + } + + /* Find the next real entry, without updating the iterator itself */ +-static struct trace_entry * +-find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) ++struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, ++ int *ent_cpu, u64 *ent_ts) + { + return __find_next_entry(iter, ent_cpu, ent_ts); + } +@@ -1426,19 +1466,32 @@ static void *s_next(struct seq_file *m, + return ent; + } + ++/* ++ * No necessary locking here. The worst thing which can ++ * happen is loosing events consumed at the same time ++ * by a trace_pipe reader. ++ * Other than that, we don't risk to crash the ring buffer ++ * because it serializes the readers. ++ * ++ * The current tracer is copied to avoid a global locking ++ * all around. 
++ */ + static void *s_start(struct seq_file *m, loff_t *pos) + { + struct trace_iterator *iter = m->private; ++ static struct tracer *old_tracer; ++ int cpu_file = iter->cpu_file; + void *p = NULL; + loff_t l = 0; + int cpu; + ++ /* copy the tracer to avoid using a global lock all around */ + mutex_lock(&trace_types_lock); +- +- if (!current_trace || current_trace != iter->trace) { +- mutex_unlock(&trace_types_lock); +- return NULL; ++ if (unlikely(old_tracer != current_trace && current_trace)) { ++ old_tracer = current_trace; ++ *iter->trace = *current_trace; + } ++ mutex_unlock(&trace_types_lock); + + atomic_inc(&trace_record_cmdline_disabled); + +@@ -1449,9 +1502,12 @@ static void *s_start(struct seq_file *m, + + ftrace_disable_cpu(); + +- for_each_tracing_cpu(cpu) { +- ring_buffer_iter_reset(iter->buffer_iter[cpu]); +- } ++ if (cpu_file == TRACE_PIPE_ALL_CPU) { ++ for_each_tracing_cpu(cpu) ++ ring_buffer_iter_reset(iter->buffer_iter[cpu]); ++ } else ++ ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); ++ + + ftrace_enable_cpu(); + +@@ -1469,155 +1525,6 @@ static void *s_start(struct seq_file *m, + static void s_stop(struct seq_file *m, void *p) + { + atomic_dec(&trace_record_cmdline_disabled); +- mutex_unlock(&trace_types_lock); +-} +- +-#ifdef CONFIG_KRETPROBES +-static inline const char *kretprobed(const char *name) +-{ +- static const char tramp_name[] = "kretprobe_trampoline"; +- int size = sizeof(tramp_name); +- +- if (strncmp(tramp_name, name, size) == 0) +- return "[unknown/kretprobe'd]"; +- return name; +-} +-#else +-static inline const char *kretprobed(const char *name) +-{ +- return name; +-} +-#endif /* CONFIG_KRETPROBES */ +- +-static int +-seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +-{ +-#ifdef CONFIG_KALLSYMS +- char str[KSYM_SYMBOL_LEN]; +- const char *name; +- +- kallsyms_lookup(address, NULL, NULL, NULL, str); +- +- name = kretprobed(str); +- +- return trace_seq_printf(s, fmt, name); +-#endif +- return 1; +-} +- +-static int +-seq_print_sym_offset(struct trace_seq *s, const char *fmt, +- unsigned long address) +-{ +-#ifdef CONFIG_KALLSYMS +- char str[KSYM_SYMBOL_LEN]; +- const char *name; +- +- sprint_symbol(str, address); +- name = kretprobed(str); +- +- return trace_seq_printf(s, fmt, name); +-#endif +- return 1; +-} +- +-#ifndef CONFIG_64BIT +-# define IP_FMT "%08lx" +-#else +-# define IP_FMT "%016lx" +-#endif +- +-int +-seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +-{ +- int ret; +- +- if (!ip) +- return trace_seq_printf(s, "0"); +- +- if (sym_flags & TRACE_ITER_SYM_OFFSET) +- ret = seq_print_sym_offset(s, "%s", ip); +- else +- ret = seq_print_sym_short(s, "%s", ip); +- +- if (!ret) +- return 0; +- +- if (sym_flags & TRACE_ITER_SYM_ADDR) +- ret = trace_seq_printf(s, " <" IP_FMT ">", ip); +- return ret; +-} +- +-static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, +- unsigned long ip, unsigned long sym_flags) +-{ +- struct file *file = NULL; +- unsigned long vmstart = 0; +- int ret = 1; +- +- if (mm) { +- const struct vm_area_struct *vma; +- +- down_read(&mm->mmap_sem); +- vma = find_vma(mm, ip); +- if (vma) { +- file = vma->vm_file; +- vmstart = vma->vm_start; +- } +- if (file) { +- ret = trace_seq_path(s, &file->f_path); +- if (ret) +- ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); +- } +- up_read(&mm->mmap_sem); +- } +- if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) +- ret = trace_seq_printf(s, " <" IP_FMT ">", ip); +- return ret; +-} +- 
+-static int +-seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, +- unsigned long sym_flags) +-{ +- struct mm_struct *mm = NULL; +- int ret = 1; +- unsigned int i; +- +- if (trace_flags & TRACE_ITER_SYM_USEROBJ) { +- struct task_struct *task; +- /* +- * we do the lookup on the thread group leader, +- * since individual threads might have already quit! +- */ +- rcu_read_lock(); +- task = find_task_by_vpid(entry->ent.tgid); +- if (task) +- mm = get_task_mm(task); +- rcu_read_unlock(); +- } +- +- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { +- unsigned long ip = entry->caller[i]; +- +- if (ip == ULONG_MAX || !ret) +- break; +- if (i && ret) +- ret = trace_seq_puts(s, " <- "); +- if (!ip) { +- if (ret) +- ret = trace_seq_puts(s, "??"); +- continue; +- } +- if (!ret) +- break; +- if (ret) +- ret = seq_print_user_ip(s, mm, ip, sym_flags); +- } +- +- if (mm) +- mmput(mm); +- return ret; + } + + static void print_lat_help_header(struct seq_file *m) +@@ -1658,11 +1565,11 @@ print_trace_header(struct seq_file *m, s + total = entries + + ring_buffer_overruns(iter->tr->buffer); + +- seq_printf(m, "%s latency trace v1.1.5 on %s\n", ++ seq_printf(m, "# %s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); +- seq_puts(m, "-----------------------------------" ++ seq_puts(m, "# -----------------------------------" + "---------------------------------\n"); +- seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" ++ seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + nsecs_to_usecs(data->saved_latency), + entries, +@@ -1684,121 +1591,24 @@ print_trace_header(struct seq_file *m, s + #else + seq_puts(m, ")\n"); + #endif +- seq_puts(m, " -----------------\n"); +- seq_printf(m, " | task: %.16s-%d " ++ seq_puts(m, "# -----------------\n"); ++ seq_printf(m, "# | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, data->uid, data->nice, + data->policy, data->rt_priority); +- seq_puts(m, " -----------------\n"); ++ seq_puts(m, "# -----------------\n"); + + if (data->critical_start) { +- seq_puts(m, " => started at: "); ++ seq_puts(m, "# => started at: "); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); +- seq_puts(m, "\n => ended at: "); ++ seq_puts(m, "\n# => ended at: "); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); +- seq_puts(m, "\n"); +- } +- +- seq_puts(m, "\n"); +-} +- +-static void +-lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +-{ +- int hardirq, softirq; +- char *comm; +- +- comm = trace_find_cmdline(entry->pid); +- +- trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); +- trace_seq_printf(s, "%3d", cpu); +- trace_seq_printf(s, "%c%c", +- (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : +- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', +- ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.')); +- +- hardirq = entry->flags & TRACE_FLAG_HARDIRQ; +- softirq = entry->flags & TRACE_FLAG_SOFTIRQ; +- if (hardirq && softirq) { +- trace_seq_putc(s, 'H'); +- } else { +- if (hardirq) { +- trace_seq_putc(s, 'h'); +- } else { +- if (softirq) +- trace_seq_putc(s, 's'); +- else +- trace_seq_putc(s, '.'); +- } +- } +- +- if (entry->preempt_count) +- trace_seq_printf(s, "%x", entry->preempt_count); +- else +- trace_seq_puts(s, "."); +-} +- +-unsigned long preempt_mark_thresh = 100; +- +-static void +-lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, +- unsigned long rel_usecs) +-{ +- trace_seq_printf(s, " %4lldus", abs_usecs); +- if (rel_usecs > preempt_mark_thresh) +- trace_seq_puts(s, "!: "); +- else if (rel_usecs > 1) +- trace_seq_puts(s, "+: "); +- else +- trace_seq_puts(s, " : "); +-} +- +-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +- +-static int task_state_char(unsigned long state) +-{ +- int bit = state ? __ffs(state) + 1 : 0; +- +- return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +-} +- +-/* +- * The message is supposed to contain an ending newline. +- * If the printing stops prematurely, try to add a newline of our own. +- */ +-void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) +-{ +- struct trace_entry *ent; +- struct trace_field_cont *cont; +- bool ok = true; +- +- ent = peek_next_entry(iter, iter->cpu, NULL); +- if (!ent || ent->type != TRACE_CONT) { +- trace_seq_putc(s, '\n'); +- return; ++ seq_puts(m, "#\n"); + } + +- do { +- cont = (struct trace_field_cont *)ent; +- if (ok) +- ok = (trace_seq_printf(s, "%s", cont->buf) > 0); +- +- ftrace_disable_cpu(); +- +- if (iter->buffer_iter[iter->cpu]) +- ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); +- else +- ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); +- +- ftrace_enable_cpu(); +- +- ent = peek_next_entry(iter, iter->cpu, NULL); +- } while (ent && ent->type == TRACE_CONT); +- +- if (!ok) +- trace_seq_putc(s, '\n'); ++ seq_puts(m, "#\n"); + } + + static void test_cpu_buff_start(struct trace_iterator *iter) +@@ -1818,533 +1628,128 @@ static void test_cpu_buff_start(struct t + trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); + } + +-static enum print_line_t +-print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) ++static enum print_line_t print_trace_fmt(struct trace_iterator *iter) + { + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); +- struct trace_entry *next_entry; +- unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); +- struct trace_entry *entry = iter->ent; +- unsigned long abs_usecs; +- unsigned long rel_usecs; +- u64 next_ts; +- char *comm; +- int S, T; +- int i; ++ struct trace_entry *entry; ++ struct trace_event *event; + +- if (entry->type == TRACE_CONT) +- return TRACE_TYPE_HANDLED; ++ entry = iter->ent; + + test_cpu_buff_start(iter); + +- next_entry = find_next_entry(iter, NULL, &next_ts); +- if (!next_entry) +- next_ts = iter->ts; +- rel_usecs = ns2usecs(next_ts - iter->ts); +- abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); +- +- if (verbose) { +- comm = trace_find_cmdline(entry->pid); +- trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]" +- " %ld.%03ldms (+%ld.%03ldms): ", +- comm, +- entry->pid, cpu, entry->flags, +- entry->preempt_count, trace_idx, +- ns2usecs(iter->ts), +- abs_usecs/1000, +- abs_usecs % 1000, rel_usecs/1000, +- rel_usecs % 1000); +- } else { +- lat_print_generic(s, entry, cpu); +- 
lat_print_timestamp(s, abs_usecs, rel_usecs); +- } +- switch (entry->type) { +- case TRACE_FN: { +- struct ftrace_entry *field; +- +- trace_assign_type(field, entry); +- +- seq_print_ip_sym(s, field->ip, sym_flags); +- trace_seq_puts(s, " ("); +- seq_print_ip_sym(s, field->parent_ip, sym_flags); +- trace_seq_puts(s, ")\n"); +- break; +- } +- case TRACE_CTX: +- case TRACE_WAKE: { +- struct ctx_switch_entry *field; +- +- trace_assign_type(field, entry); +- +- T = task_state_char(field->next_state); +- S = task_state_char(field->prev_state); +- comm = trace_find_cmdline(field->next_pid); +- trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", +- field->prev_pid, +- field->prev_prio, +- S, entry->type == TRACE_CTX ? "==>" : " +", +- field->next_cpu, +- field->next_pid, +- field->next_prio, +- T, comm); +- break; +- } +- case TRACE_SPECIAL: { +- struct special_entry *field; +- +- trace_assign_type(field, entry); ++ event = ftrace_find_event(entry->type); + +- trace_seq_printf(s, "# %ld %ld %ld\n", +- field->arg1, +- field->arg2, +- field->arg3); +- break; ++ if (trace_flags & TRACE_ITER_CONTEXT_INFO) { ++ if (iter->iter_flags & TRACE_FILE_LAT_FMT) { ++ if (!trace_print_lat_context(iter)) ++ goto partial; ++ } else { ++ if (!trace_print_context(iter)) ++ goto partial; ++ } + } +- case TRACE_STACK: { +- struct stack_entry *field; + +- trace_assign_type(field, entry); ++ if (event) ++ return event->trace(iter, sym_flags); + +- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { +- if (i) +- trace_seq_puts(s, " <= "); +- seq_print_ip_sym(s, field->caller[i], sym_flags); +- } +- trace_seq_puts(s, "\n"); +- break; +- } +- case TRACE_PRINT: { +- struct print_entry *field; ++ if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) ++ goto partial; + +- trace_assign_type(field, entry); ++ return TRACE_TYPE_HANDLED; ++partial: ++ return TRACE_TYPE_PARTIAL_LINE; ++} + +- seq_print_ip_sym(s, field->ip, sym_flags); +- trace_seq_printf(s, ": %s", field->buf); +- if (entry->flags & TRACE_FLAG_CONT) +- trace_seq_print_cont(s, iter); +- break; +- } +- case TRACE_BRANCH: { +- struct trace_branch *field; ++static enum print_line_t print_raw_fmt(struct trace_iterator *iter) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct trace_entry *entry; ++ struct trace_event *event; + +- trace_assign_type(field, entry); ++ entry = iter->ent; + +- trace_seq_printf(s, "[%s] %s:%s:%d\n", +- field->correct ? 
" ok " : " MISS ", +- field->func, +- field->file, +- field->line); +- break; ++ if (trace_flags & TRACE_ITER_CONTEXT_INFO) { ++ if (!trace_seq_printf(s, "%d %d %llu ", ++ entry->pid, iter->cpu, iter->ts)) ++ goto partial; + } +- case TRACE_USER_STACK: { +- struct userstack_entry *field; + +- trace_assign_type(field, entry); ++ event = ftrace_find_event(entry->type); ++ if (event) ++ return event->raw(iter, 0); ++ ++ if (!trace_seq_printf(s, "%d ?\n", entry->type)) ++ goto partial; + +- seq_print_userip_objs(field, s, sym_flags); +- trace_seq_putc(s, '\n'); +- break; +- } +- default: +- trace_seq_printf(s, "Unknown type %d\n", entry->type); +- } + return TRACE_TYPE_HANDLED; ++partial: ++ return TRACE_TYPE_PARTIAL_LINE; + } + +-static enum print_line_t print_trace_fmt(struct trace_iterator *iter) ++static enum print_line_t print_hex_fmt(struct trace_iterator *iter) + { + struct trace_seq *s = &iter->seq; +- unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); ++ unsigned char newline = '\n'; + struct trace_entry *entry; +- unsigned long usec_rem; +- unsigned long long t; +- unsigned long secs; +- char *comm; +- int ret; +- int S, T; +- int i; ++ struct trace_event *event; + + entry = iter->ent; + +- if (entry->type == TRACE_CONT) +- return TRACE_TYPE_HANDLED; +- +- test_cpu_buff_start(iter); ++ if (trace_flags & TRACE_ITER_CONTEXT_INFO) { ++ SEQ_PUT_HEX_FIELD_RET(s, entry->pid); ++ SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); ++ SEQ_PUT_HEX_FIELD_RET(s, iter->ts); ++ } + +- comm = trace_find_cmdline(iter->ent->pid); ++ event = ftrace_find_event(entry->type); ++ if (event) { ++ enum print_line_t ret = event->hex(iter, 0); ++ if (ret != TRACE_TYPE_HANDLED) ++ return ret; ++ } + +- t = ns2usecs(iter->ts); +- usec_rem = do_div(t, 1000000ULL); +- secs = (unsigned long)t; ++ SEQ_PUT_FIELD_RET(s, newline); + +- ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- ret = trace_seq_printf(s, "[%03d] ", iter->cpu); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; ++ return TRACE_TYPE_HANDLED; ++} + +- switch (entry->type) { +- case TRACE_FN: { +- struct ftrace_entry *field; +- +- trace_assign_type(field, entry); +- +- ret = seq_print_ip_sym(s, field->ip, sym_flags); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- if ((sym_flags & TRACE_ITER_PRINT_PARENT) && +- field->parent_ip) { +- ret = trace_seq_printf(s, " <-"); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- ret = seq_print_ip_sym(s, +- field->parent_ip, +- sym_flags); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- } +- ret = trace_seq_printf(s, "\n"); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- break; +- } +- case TRACE_CTX: +- case TRACE_WAKE: { +- struct ctx_switch_entry *field; +- +- trace_assign_type(field, entry); +- +- T = task_state_char(field->next_state); +- S = task_state_char(field->prev_state); +- ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", +- field->prev_pid, +- field->prev_prio, +- S, +- entry->type == TRACE_CTX ? 
"==>" : " +", +- field->next_cpu, +- field->next_pid, +- field->next_prio, +- T); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- break; +- } +- case TRACE_SPECIAL: { +- struct special_entry *field; ++static enum print_line_t print_bin_fmt(struct trace_iterator *iter) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct trace_entry *entry; ++ struct trace_event *event; + +- trace_assign_type(field, entry); ++ entry = iter->ent; + +- ret = trace_seq_printf(s, "# %ld %ld %ld\n", +- field->arg1, +- field->arg2, +- field->arg3); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- break; ++ if (trace_flags & TRACE_ITER_CONTEXT_INFO) { ++ SEQ_PUT_FIELD_RET(s, entry->pid); ++ SEQ_PUT_FIELD_RET(s, iter->cpu); ++ SEQ_PUT_FIELD_RET(s, iter->ts); + } +- case TRACE_STACK: { +- struct stack_entry *field; + +- trace_assign_type(field, entry); +- +- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { +- if (i) { +- ret = trace_seq_puts(s, " <= "); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- } +- ret = seq_print_ip_sym(s, field->caller[i], +- sym_flags); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- } +- ret = trace_seq_puts(s, "\n"); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- break; +- } +- case TRACE_PRINT: { +- struct print_entry *field; +- +- trace_assign_type(field, entry); +- +- seq_print_ip_sym(s, field->ip, sym_flags); +- trace_seq_printf(s, ": %s", field->buf); +- if (entry->flags & TRACE_FLAG_CONT) +- trace_seq_print_cont(s, iter); +- break; +- } +- case TRACE_GRAPH_RET: { +- return print_graph_function(iter); +- } +- case TRACE_GRAPH_ENT: { +- return print_graph_function(iter); +- } +- case TRACE_BRANCH: { +- struct trace_branch *field; +- +- trace_assign_type(field, entry); +- +- trace_seq_printf(s, "[%s] %s:%s:%d\n", +- field->correct ? " ok " : " MISS ", +- field->func, +- field->file, +- field->line); +- break; +- } +- case TRACE_USER_STACK: { +- struct userstack_entry *field; +- +- trace_assign_type(field, entry); +- +- ret = seq_print_userip_objs(field, s, sym_flags); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- ret = trace_seq_putc(s, '\n'); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- break; +- } +- } +- return TRACE_TYPE_HANDLED; +-} +- +-static enum print_line_t print_raw_fmt(struct trace_iterator *iter) +-{ +- struct trace_seq *s = &iter->seq; +- struct trace_entry *entry; +- int ret; +- int S, T; +- +- entry = iter->ent; +- +- if (entry->type == TRACE_CONT) +- return TRACE_TYPE_HANDLED; +- +- ret = trace_seq_printf(s, "%d %d %llu ", +- entry->pid, iter->cpu, iter->ts); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- +- switch (entry->type) { +- case TRACE_FN: { +- struct ftrace_entry *field; +- +- trace_assign_type(field, entry); +- +- ret = trace_seq_printf(s, "%x %x\n", +- field->ip, +- field->parent_ip); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- break; +- } +- case TRACE_CTX: +- case TRACE_WAKE: { +- struct ctx_switch_entry *field; +- +- trace_assign_type(field, entry); +- +- T = task_state_char(field->next_state); +- S = entry->type == TRACE_WAKE ? 
'+' : +- task_state_char(field->prev_state); +- ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", +- field->prev_pid, +- field->prev_prio, +- S, +- field->next_cpu, +- field->next_pid, +- field->next_prio, +- T); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- break; +- } +- case TRACE_SPECIAL: +- case TRACE_USER_STACK: +- case TRACE_STACK: { +- struct special_entry *field; +- +- trace_assign_type(field, entry); +- +- ret = trace_seq_printf(s, "# %ld %ld %ld\n", +- field->arg1, +- field->arg2, +- field->arg3); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- break; +- } +- case TRACE_PRINT: { +- struct print_entry *field; +- +- trace_assign_type(field, entry); +- +- trace_seq_printf(s, "# %lx %s", field->ip, field->buf); +- if (entry->flags & TRACE_FLAG_CONT) +- trace_seq_print_cont(s, iter); +- break; +- } +- } +- return TRACE_TYPE_HANDLED; +-} +- +-#define SEQ_PUT_FIELD_RET(s, x) \ +-do { \ +- if (!trace_seq_putmem(s, &(x), sizeof(x))) \ +- return 0; \ +-} while (0) +- +-#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +-do { \ +- BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ +- if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ +- return 0; \ +-} while (0) +- +-static enum print_line_t print_hex_fmt(struct trace_iterator *iter) +-{ +- struct trace_seq *s = &iter->seq; +- unsigned char newline = '\n'; +- struct trace_entry *entry; +- int S, T; +- +- entry = iter->ent; +- +- if (entry->type == TRACE_CONT) +- return TRACE_TYPE_HANDLED; +- +- SEQ_PUT_HEX_FIELD_RET(s, entry->pid); +- SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); +- SEQ_PUT_HEX_FIELD_RET(s, iter->ts); +- +- switch (entry->type) { +- case TRACE_FN: { +- struct ftrace_entry *field; +- +- trace_assign_type(field, entry); +- +- SEQ_PUT_HEX_FIELD_RET(s, field->ip); +- SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); +- break; +- } +- case TRACE_CTX: +- case TRACE_WAKE: { +- struct ctx_switch_entry *field; +- +- trace_assign_type(field, entry); +- +- T = task_state_char(field->next_state); +- S = entry->type == TRACE_WAKE ? 
'+' : +- task_state_char(field->prev_state); +- SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); +- SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); +- SEQ_PUT_HEX_FIELD_RET(s, S); +- SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); +- SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); +- SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); +- SEQ_PUT_HEX_FIELD_RET(s, T); +- break; +- } +- case TRACE_SPECIAL: +- case TRACE_USER_STACK: +- case TRACE_STACK: { +- struct special_entry *field; +- +- trace_assign_type(field, entry); +- +- SEQ_PUT_HEX_FIELD_RET(s, field->arg1); +- SEQ_PUT_HEX_FIELD_RET(s, field->arg2); +- SEQ_PUT_HEX_FIELD_RET(s, field->arg3); +- break; +- } +- } +- SEQ_PUT_FIELD_RET(s, newline); +- +- return TRACE_TYPE_HANDLED; +-} +- +-static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) +-{ +- struct trace_seq *s = &iter->seq; +- struct trace_entry *entry = iter->ent; +- struct print_entry *field; +- int ret; +- +- trace_assign_type(field, entry); +- +- ret = trace_seq_printf(s, field->buf); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- +- if (entry->flags & TRACE_FLAG_CONT) +- trace_seq_print_cont(s, iter); +- +- return TRACE_TYPE_HANDLED; +-} +- +-static enum print_line_t print_bin_fmt(struct trace_iterator *iter) +-{ +- struct trace_seq *s = &iter->seq; +- struct trace_entry *entry; +- +- entry = iter->ent; +- +- if (entry->type == TRACE_CONT) +- return TRACE_TYPE_HANDLED; +- +- SEQ_PUT_FIELD_RET(s, entry->pid); +- SEQ_PUT_FIELD_RET(s, entry->cpu); +- SEQ_PUT_FIELD_RET(s, iter->ts); +- +- switch (entry->type) { +- case TRACE_FN: { +- struct ftrace_entry *field; +- +- trace_assign_type(field, entry); +- +- SEQ_PUT_FIELD_RET(s, field->ip); +- SEQ_PUT_FIELD_RET(s, field->parent_ip); +- break; +- } +- case TRACE_CTX: { +- struct ctx_switch_entry *field; +- +- trace_assign_type(field, entry); +- +- SEQ_PUT_FIELD_RET(s, field->prev_pid); +- SEQ_PUT_FIELD_RET(s, field->prev_prio); +- SEQ_PUT_FIELD_RET(s, field->prev_state); +- SEQ_PUT_FIELD_RET(s, field->next_pid); +- SEQ_PUT_FIELD_RET(s, field->next_prio); +- SEQ_PUT_FIELD_RET(s, field->next_state); +- break; +- } +- case TRACE_SPECIAL: +- case TRACE_USER_STACK: +- case TRACE_STACK: { +- struct special_entry *field; +- +- trace_assign_type(field, entry); +- +- SEQ_PUT_FIELD_RET(s, field->arg1); +- SEQ_PUT_FIELD_RET(s, field->arg2); +- SEQ_PUT_FIELD_RET(s, field->arg3); +- break; +- } +- } +- return 1; +-} ++ event = ftrace_find_event(entry->type); ++ return event ? 
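/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * The rewritten print_*_fmt() helpers above no longer switch on the entry
 * type; they look up the handler registered for that type via
 * ftrace_find_event() and call its trace/raw/hex/binary callback.  A new
 * entry type therefore plugs in roughly like this (assuming the
 * struct trace_event and register_ftrace_event() interface this patch
 * adds in kernel/trace/trace_output.c; TRACE_MY_ENTRY is hypothetical):
 *
 *	static enum print_line_t my_entry_trace(struct trace_iterator *iter,
 *						int flags)
 *	{
 *		return trace_seq_printf(&iter->seq, "my entry\n") ?
 *			TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
 *	}
 *
 *	static struct trace_event my_trace_event = {
 *		.type	= TRACE_MY_ENTRY,	// hypothetical entry type
 *		.trace	= my_entry_trace,	// used by print_trace_fmt()
 *	};
 *
 *	// at init time:  register_ftrace_event(&my_trace_event);
 */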
event->binary(iter, 0) : TRACE_TYPE_HANDLED; ++} + + static int trace_empty(struct trace_iterator *iter) + { + int cpu; + ++ /* If we are looking at one CPU buffer, only check that one */ ++ if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { ++ cpu = iter->cpu_file; ++ if (iter->buffer_iter[cpu]) { ++ if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) ++ return 0; ++ } else { ++ if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) ++ return 0; ++ } ++ return 1; ++ } ++ + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) +@@ -2368,10 +1773,15 @@ static enum print_line_t print_trace_lin + return ret; + } + ++ if (iter->ent->type == TRACE_BPRINT && ++ trace_flags & TRACE_ITER_PRINTK && ++ trace_flags & TRACE_ITER_PRINTK_MSGONLY) ++ return trace_print_bprintk_msg_only(iter); ++ + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) +- return print_printk_msg_only(iter); ++ return trace_print_printk_msg_only(iter); + + if (trace_flags & TRACE_ITER_BIN) + return print_bin_fmt(iter); +@@ -2382,9 +1792,6 @@ static enum print_line_t print_trace_lin + if (trace_flags & TRACE_ITER_RAW) + return print_raw_fmt(iter); + +- if (iter->iter_flags & TRACE_FILE_LAT_FMT) +- return print_lat_fmt(iter, iter->idx, iter->cpu); +- + return print_trace_fmt(iter); + } + +@@ -2426,30 +1833,40 @@ static struct seq_operations tracer_seq_ + }; + + static struct trace_iterator * +-__tracing_open(struct inode *inode, struct file *file, int *ret) ++__tracing_open(struct inode *inode, struct file *file) + { ++ long cpu_file = (long) inode->i_private; ++ void *fail_ret = ERR_PTR(-ENOMEM); + struct trace_iterator *iter; + struct seq_file *m; +- int cpu; ++ int cpu, ret; + +- if (tracing_disabled) { +- *ret = -ENODEV; +- return NULL; +- } ++ if (tracing_disabled) ++ return ERR_PTR(-ENODEV); + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); +- if (!iter) { +- *ret = -ENOMEM; +- goto out; +- } ++ if (!iter) ++ return ERR_PTR(-ENOMEM); + ++ /* ++ * We make a copy of the current tracer to avoid concurrent ++ * changes on it while we are reading. ++ */ + mutex_lock(&trace_types_lock); ++ iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); ++ if (!iter->trace) ++ goto fail; ++ ++ if (current_trace) ++ *iter->trace = *current_trace; ++ + if (current_trace && current_trace->print_max) + iter->tr = &max_tr; + else +- iter->tr = inode->i_private; +- iter->trace = current_trace; ++ iter->tr = &global_trace; + iter->pos = -1; ++ mutex_init(&iter->mutex); ++ iter->cpu_file = cpu_file; + + /* Notify the tracer early; before we stop tracing. 
*/ + if (iter->trace && iter->trace->open) +@@ -2459,20 +1876,24 @@ __tracing_open(struct inode *inode, stru + if (ring_buffer_overruns(iter->tr->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + ++ if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { ++ for_each_tracing_cpu(cpu) { + +- for_each_tracing_cpu(cpu) { +- ++ iter->buffer_iter[cpu] = ++ ring_buffer_read_start(iter->tr->buffer, cpu); ++ } ++ } else { ++ cpu = iter->cpu_file; + iter->buffer_iter[cpu] = +- ring_buffer_read_start(iter->tr->buffer, cpu); +- +- if (!iter->buffer_iter[cpu]) +- goto fail_buffer; ++ ring_buffer_read_start(iter->tr->buffer, cpu); + } + + /* TODO stop tracer */ +- *ret = seq_open(file, &tracer_seq_ops); +- if (*ret) ++ ret = seq_open(file, &tracer_seq_ops); ++ if (ret < 0) { ++ fail_ret = ERR_PTR(ret); + goto fail_buffer; ++ } + + m = file->private_data; + m->private = iter; +@@ -2482,7 +1903,6 @@ __tracing_open(struct inode *inode, stru + + mutex_unlock(&trace_types_lock); + +- out: + return iter; + + fail_buffer: +@@ -2490,10 +1910,12 @@ __tracing_open(struct inode *inode, stru + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } ++ fail: + mutex_unlock(&trace_types_lock); ++ kfree(iter->trace); + kfree(iter); + +- return ERR_PTR(-ENOMEM); ++ return fail_ret; + } + + int tracing_open_generic(struct inode *inode, struct file *filp) +@@ -2505,12 +1927,17 @@ int tracing_open_generic(struct inode *i + return 0; + } + +-int tracing_release(struct inode *inode, struct file *file) ++static int tracing_release(struct inode *inode, struct file *file) + { + struct seq_file *m = (struct seq_file *)file->private_data; +- struct trace_iterator *iter = m->private; ++ struct trace_iterator *iter; + int cpu; + ++ if (!(file->f_mode & FMODE_READ)) ++ return 0; ++ ++ iter = m->private; ++ + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) +@@ -2525,33 +1952,38 @@ int tracing_release(struct inode *inode, + mutex_unlock(&trace_types_lock); + + seq_release(inode, file); ++ mutex_destroy(&iter->mutex); ++ kfree(iter->trace); + kfree(iter); + return 0; + } + + static int tracing_open(struct inode *inode, struct file *file) + { +- int ret; +- +- __tracing_open(inode, file, &ret); +- +- return ret; +-} +- +-static int tracing_lt_open(struct inode *inode, struct file *file) +-{ + struct trace_iterator *iter; +- int ret; ++ int ret = 0; + +- iter = __tracing_open(inode, file, &ret); ++ /* If this file was open for write, then erase contents */ ++ if ((file->f_mode & FMODE_WRITE) && ++ !(file->f_flags & O_APPEND)) { ++ long cpu = (long) inode->i_private; + +- if (!ret) +- iter->iter_flags |= TRACE_FILE_LAT_FMT; ++ if (cpu == TRACE_PIPE_ALL_CPU) ++ tracing_reset_online_cpus(&global_trace); ++ else ++ tracing_reset(&global_trace, cpu); ++ } + ++ if (file->f_mode & FMODE_READ) { ++ iter = __tracing_open(inode, file); ++ if (IS_ERR(iter)) ++ ret = PTR_ERR(iter); ++ else if (trace_flags & TRACE_ITER_LATENCY_FMT) ++ iter->iter_flags |= TRACE_FILE_LAT_FMT; ++ } + return ret; + } + +- + static void * + t_next(struct seq_file *m, void *v, loff_t *pos) + { +@@ -2623,21 +2055,22 @@ static int show_traces_open(struct inode + return ret; + } + +-static struct file_operations tracing_fops = { +- .open = tracing_open, +- .read = seq_read, +- .llseek = seq_lseek, +- .release = tracing_release, +-}; ++static ssize_t ++tracing_write_stub(struct file *filp, const char __user *ubuf, ++ size_t count, loff_t *ppos) ++{ ++ return count; ++} + +-static struct file_operations 
tracing_lt_fops = { +- .open = tracing_lt_open, ++static const struct file_operations tracing_fops = { ++ .open = tracing_open, + .read = seq_read, ++ .write = tracing_write_stub, + .llseek = seq_lseek, + .release = tracing_release, + }; + +-static struct file_operations show_traces_fops = { ++static const struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, +@@ -2730,7 +2163,7 @@ err_unlock: + return err; + } + +-static struct file_operations tracing_cpumask_fops = { ++static const struct file_operations tracing_cpumask_fops = { + .open = tracing_open_generic, + .read = tracing_cpumask_read, + .write = tracing_cpumask_write, +@@ -2740,57 +2173,62 @@ static ssize_t + tracing_trace_options_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) + { +- int i; ++ struct tracer_opt *trace_opts; ++ u32 tracer_flags; ++ int len = 0; + char *buf; + int r = 0; +- int len = 0; +- u32 tracer_flags = current_trace->flags->val; +- struct tracer_opt *trace_opts = current_trace->flags->opts; ++ int i; + + +- /* calulate max size */ ++ /* calculate max size */ + for (i = 0; trace_options[i]; i++) { + len += strlen(trace_options[i]); +- len += 3; /* "no" and space */ ++ len += 3; /* "no" and newline */ + } + ++ mutex_lock(&trace_types_lock); ++ tracer_flags = current_trace->flags->val; ++ trace_opts = current_trace->flags->opts; ++ + /* + * Increase the size with names of options specific + * of the current tracer. + */ + for (i = 0; trace_opts[i].name; i++) { + len += strlen(trace_opts[i].name); +- len += 3; /* "no" and space */ ++ len += 3; /* "no" and newline */ + } + + /* +2 for \n and \0 */ + buf = kmalloc(len + 2, GFP_KERNEL); +- if (!buf) ++ if (!buf) { ++ mutex_unlock(&trace_types_lock); + return -ENOMEM; ++ } + + for (i = 0; trace_options[i]; i++) { + if (trace_flags & (1 << i)) +- r += sprintf(buf + r, "%s ", trace_options[i]); ++ r += sprintf(buf + r, "%s\n", trace_options[i]); + else +- r += sprintf(buf + r, "no%s ", trace_options[i]); ++ r += sprintf(buf + r, "no%s\n", trace_options[i]); + } + + for (i = 0; trace_opts[i].name; i++) { + if (tracer_flags & trace_opts[i].bit) +- r += sprintf(buf + r, "%s ", ++ r += sprintf(buf + r, "%s\n", + trace_opts[i].name); + else +- r += sprintf(buf + r, "no%s ", ++ r += sprintf(buf + r, "no%s\n", + trace_opts[i].name); + } ++ mutex_unlock(&trace_types_lock); + +- r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); +- + return r; + } + +@@ -2828,6 +2266,34 @@ static int set_tracer_option(struct trac + return 0; + } + ++static void set_tracer_flags(unsigned int mask, int enabled) ++{ ++ /* do nothing if flag is already set */ ++ if (!!(trace_flags & mask) == !!enabled) ++ return; ++ ++ if (enabled) ++ trace_flags |= mask; ++ else ++ trace_flags &= ~mask; ++ ++ if (mask == TRACE_ITER_GLOBAL_CLK) { ++ u64 (*func)(void); ++ ++ if (enabled) ++ func = trace_clock_global; ++ else ++ func = trace_clock_local; ++ ++ mutex_lock(&trace_types_lock); ++ ring_buffer_set_clock(global_trace.buffer, func); ++ ++ if (max_tr.buffer) ++ ring_buffer_set_clock(max_tr.buffer, func); ++ mutex_unlock(&trace_types_lock); ++ } ++} ++ + static ssize_t + tracing_trace_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +@@ -2855,17 +2321,16 @@ tracing_trace_options_write(struct file + int len = strlen(trace_options[i]); + + if (strncmp(cmp, trace_options[i], len) == 0) { +- if (neg) +- trace_flags 
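/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * The new set_tracer_flags() above is what a write to trace_options ends
 * up calling; toggling the option that corresponds to
 * TRACE_ITER_GLOBAL_CLK ("global-clk" in mainline) swaps the ring-buffer
 * clock between trace_clock_local() and trace_clock_global().  From user
 * space (assuming debugfs is mounted at /sys/kernel/debug) that looks
 * roughly like:
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	static int set_trace_option(const char *opt)	// e.g. "global-clk"
 *	{
 *		int fd = open("/sys/kernel/debug/tracing/trace_options",
 *			      O_WRONLY);
 *		if (fd < 0)
 *			return -1;
 *		(void)write(fd, opt, strlen(opt));	// "noglobal-clk" clears it
 *		close(fd);
 *		return 0;
 *	}
 */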
&= ~(1 << i); +- else +- trace_flags |= (1 << i); ++ set_tracer_flags(1 << i, !neg); + break; + } + } + + /* If no option could be set, test the specific tracer options */ + if (!trace_options[i]) { ++ mutex_lock(&trace_types_lock); + ret = set_tracer_option(current_trace, cmp, neg); ++ mutex_unlock(&trace_types_lock); + if (ret) + return ret; + } +@@ -2875,7 +2340,7 @@ tracing_trace_options_write(struct file + return cnt; + } + +-static struct file_operations tracing_iter_fops = { ++static const struct file_operations tracing_iter_fops = { + .open = tracing_open_generic, + .read = tracing_trace_options_read, + .write = tracing_trace_options_write, +@@ -2908,7 +2373,7 @@ tracing_readme_read(struct file *filp, c + readme_msg, strlen(readme_msg)); + } + +-static struct file_operations tracing_readme_fops = { ++static const struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, + }; +@@ -2930,7 +2395,7 @@ tracing_ctrl_write(struct file *filp, co + { + struct trace_array *tr = filp->private_data; + char buf[64]; +- long val; ++ unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) +@@ -2985,13 +2450,105 @@ tracing_set_trace_read(struct file *filp + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + } + +-static int tracing_set_tracer(char *buf) ++int tracer_init(struct tracer *t, struct trace_array *tr) + { ++ tracing_reset_online_cpus(tr); ++ return t->init(tr); ++} ++ ++static int tracing_resize_ring_buffer(unsigned long size) ++{ ++ int ret; ++ ++ /* ++ * If kernel or user changes the size of the ring buffer ++ * we use the size that was given, and we can forget about ++ * expanding it later. ++ */ ++ ring_buffer_expanded = 1; ++ ++ ret = ring_buffer_resize(global_trace.buffer, size); ++ if (ret < 0) ++ return ret; ++ ++ ret = ring_buffer_resize(max_tr.buffer, size); ++ if (ret < 0) { ++ int r; ++ ++ r = ring_buffer_resize(global_trace.buffer, ++ global_trace.entries); ++ if (r < 0) { ++ /* ++ * AARGH! We are left with different ++ * size max buffer!!!! ++ * The max buffer is our "snapshot" buffer. ++ * When a tracer needs a snapshot (one of the ++ * latency tracers), it swaps the max buffer ++ * with the saved snap shot. We succeeded to ++ * update the size of the main buffer, but failed to ++ * update the size of the max buffer. But when we tried ++ * to reset the main buffer to the original size, we ++ * failed there too. This is very unlikely to ++ * happen, but if it does, warn and kill all ++ * tracing. ++ */ ++ WARN_ON(1); ++ tracing_disabled = 1; ++ } ++ return ret; ++ } ++ ++ global_trace.entries = size; ++ ++ return ret; ++} ++ ++/** ++ * tracing_update_buffers - used by tracing facility to expand ring buffers ++ * ++ * To save on memory when the tracing is never used on a system with it ++ * configured in. The ring buffers are set to a minimum size. But once ++ * a user starts to use the tracing facility, then they need to grow ++ * to their default size. ++ * ++ * This function is to be called when a tracer is about to be used. 
++ */ ++int tracing_update_buffers(void) ++{ ++ int ret = 0; ++ ++ mutex_lock(&trace_types_lock); ++ if (!ring_buffer_expanded) ++ ret = tracing_resize_ring_buffer(trace_buf_size); ++ mutex_unlock(&trace_types_lock); ++ ++ return ret; ++} ++ ++struct trace_option_dentry; ++ ++static struct trace_option_dentry * ++create_trace_option_files(struct tracer *tracer); ++ ++static void ++destroy_trace_option_files(struct trace_option_dentry *topts); ++ ++static int tracing_set_tracer(const char *buf) ++{ ++ static struct trace_option_dentry *topts; + struct trace_array *tr = &global_trace; + struct tracer *t; + int ret = 0; + + mutex_lock(&trace_types_lock); ++ ++ if (!ring_buffer_expanded) { ++ ret = tracing_resize_ring_buffer(trace_buf_size); ++ if (ret < 0) ++ goto out; ++ ret = 0; ++ } ++ + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; +@@ -3007,9 +2564,14 @@ static int tracing_set_tracer(char *buf) + if (current_trace && current_trace->reset) + current_trace->reset(tr); + ++ destroy_trace_option_files(topts); ++ + current_trace = t; ++ ++ topts = create_trace_option_files(current_trace); ++ + if (t->init) { +- ret = t->init(tr); ++ ret = tracer_init(t, tr); + if (ret) + goto out; + } +@@ -3072,9 +2634,9 @@ static ssize_t + tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +- long *ptr = filp->private_data; ++ unsigned long *ptr = filp->private_data; + char buf[64]; +- long val; ++ unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) +@@ -3094,54 +2656,96 @@ tracing_max_lat_write(struct file *filp, + return cnt; + } + +-static atomic_t tracing_reader; +- + static int tracing_open_pipe(struct inode *inode, struct file *filp) + { ++ long cpu_file = (long) inode->i_private; + struct trace_iterator *iter; ++ int ret = 0; + + if (tracing_disabled) + return -ENODEV; + +- /* We only allow for reader of the pipe */ +- if (atomic_inc_return(&tracing_reader) != 1) { +- atomic_dec(&tracing_reader); +- return -EBUSY; ++ mutex_lock(&trace_types_lock); ++ ++ /* We only allow one reader per cpu */ ++ if (cpu_file == TRACE_PIPE_ALL_CPU) { ++ if (!cpumask_empty(tracing_reader_cpumask)) { ++ ret = -EBUSY; ++ goto out; ++ } ++ cpumask_setall(tracing_reader_cpumask); ++ } else { ++ if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) ++ cpumask_set_cpu(cpu_file, tracing_reader_cpumask); ++ else { ++ ret = -EBUSY; ++ goto out; ++ } + } + + /* create a buffer to store the information to pass to userspace */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); +- if (!iter) +- return -ENOMEM; +- +- if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { +- kfree(iter); +- return -ENOMEM; ++ if (!iter) { ++ ret = -ENOMEM; ++ goto out; + } + +- mutex_lock(&trace_types_lock); ++ /* ++ * We make a copy of the current tracer to avoid concurrent ++ * changes on it while we are reading. 
++ */ ++ iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); ++ if (!iter->trace) { ++ ret = -ENOMEM; ++ goto fail; ++ } ++ if (current_trace) ++ *iter->trace = *current_trace; ++ ++ if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ++ ret = -ENOMEM; ++ goto fail; ++ } + + /* trace pipe does not show start of buffer */ + cpumask_setall(iter->started); + ++ iter->cpu_file = cpu_file; + iter->tr = &global_trace; +- iter->trace = current_trace; ++ mutex_init(&iter->mutex); + filp->private_data = iter; + + if (iter->trace->pipe_open) + iter->trace->pipe_open(iter); ++ ++out: + mutex_unlock(&trace_types_lock); ++ return ret; + +- return 0; ++fail: ++ kfree(iter->trace); ++ kfree(iter); ++ mutex_unlock(&trace_types_lock); ++ return ret; + } + + static int tracing_release_pipe(struct inode *inode, struct file *file) + { + struct trace_iterator *iter = file->private_data; + ++ mutex_lock(&trace_types_lock); ++ ++ if (iter->cpu_file == TRACE_PIPE_ALL_CPU) ++ cpumask_clear(tracing_reader_cpumask); ++ else ++ cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); ++ ++ mutex_unlock(&trace_types_lock); ++ + free_cpumask_var(iter->started); ++ mutex_destroy(&iter->mutex); ++ kfree(iter->trace); + kfree(iter); +- atomic_dec(&tracing_reader); + + return 0; + } +@@ -3167,67 +2771,57 @@ tracing_poll_pipe(struct file *filp, pol + } + } + +-/* +- * Consumer reader. +- */ +-static ssize_t +-tracing_read_pipe(struct file *filp, char __user *ubuf, +- size_t cnt, loff_t *ppos) ++ ++void default_wait_pipe(struct trace_iterator *iter) + { +- struct trace_iterator *iter = filp->private_data; +- ssize_t sret; ++ DEFINE_WAIT(wait); + +- /* return any leftover data */ +- sret = trace_seq_to_user(&iter->seq, ubuf, cnt); +- if (sret != -EBUSY) +- return sret; ++ prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); + +- trace_seq_reset(&iter->seq); ++ if (trace_empty(iter)) ++ schedule(); + +- mutex_lock(&trace_types_lock); +- if (iter->trace->read) { +- sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); +- if (sret) +- goto out; +- } ++ finish_wait(&trace_wait, &wait); ++} ++ ++/* ++ * This is a make-shift waitqueue. ++ * A tracer might use this callback on some rare cases: ++ * ++ * 1) the current tracer might hold the runqueue lock when it wakes up ++ * a reader, hence a deadlock (sched, function, and function graph tracers) ++ * 2) the function tracers, trace all functions, we don't want ++ * the overhead of calling wake_up and friends ++ * (and tracing them too) ++ * ++ * Anyway, this is really very primitive wakeup. ++ */ ++void poll_wait_pipe(struct trace_iterator *iter) ++{ ++ set_current_state(TASK_INTERRUPTIBLE); ++ /* sleep for 100 msecs, and try again. */ ++ schedule_timeout(HZ / 10); ++} ++ ++/* Must be called with trace_types_lock mutex held. */ ++static int tracing_wait_pipe(struct file *filp) ++{ ++ struct trace_iterator *iter = filp->private_data; + +-waitagain: +- sret = 0; + while (trace_empty(iter)) { + + if ((filp->f_flags & O_NONBLOCK)) { +- sret = -EAGAIN; +- goto out; ++ return -EAGAIN; + } + +- /* +- * This is a make-shift waitqueue. The reason we don't use +- * an actual wait queue is because: +- * 1) we only ever have one waiter +- * 2) the tracing, traces all functions, we don't want +- * the overhead of calling wake_up and friends +- * (and tracing them too) +- * Anyway, this is really very primitive wakeup. 
+- */ +- set_current_state(TASK_INTERRUPTIBLE); +- iter->tr->waiter = current; ++ mutex_unlock(&iter->mutex); + +- mutex_unlock(&trace_types_lock); ++ iter->trace->wait_pipe(iter); + +- /* sleep for 100 msecs, and try again. */ +- schedule_timeout(HZ/10); ++ mutex_lock(&iter->mutex); + +- mutex_lock(&trace_types_lock); +- +- iter->tr->waiter = NULL; +- +- if (signal_pending(current)) { +- sret = -EINTR; +- goto out; +- } +- +- if (iter->trace != current_trace) +- goto out; ++ if (signal_pending(current)) ++ return -EINTR; + + /* + * We block until we read something and tracing is disabled. +@@ -3240,13 +2834,59 @@ waitagain: + */ + if (!tracer_enabled && iter->pos) + break; ++ } ++ ++ return 1; ++} ++ ++/* ++ * Consumer reader. ++ */ ++static ssize_t ++tracing_read_pipe(struct file *filp, char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ struct trace_iterator *iter = filp->private_data; ++ static struct tracer *old_tracer; ++ ssize_t sret; + +- continue; ++ /* return any leftover data */ ++ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); ++ if (sret != -EBUSY) ++ return sret; ++ ++ trace_seq_init(&iter->seq); ++ ++ /* copy the tracer to avoid using a global lock all around */ ++ mutex_lock(&trace_types_lock); ++ if (unlikely(old_tracer != current_trace && current_trace)) { ++ old_tracer = current_trace; ++ *iter->trace = *current_trace; + } ++ mutex_unlock(&trace_types_lock); ++ ++ /* ++ * Avoid more than one consumer on a single file descriptor ++ * This is just a matter of traces coherency, the ring buffer itself ++ * is protected. ++ */ ++ mutex_lock(&iter->mutex); ++ if (iter->trace->read) { ++ sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); ++ if (sret) ++ goto out; ++ } ++ ++waitagain: ++ sret = tracing_wait_pipe(filp); ++ if (sret <= 0) ++ goto out; + + /* stop when tracing is finished */ +- if (trace_empty(iter)) ++ if (trace_empty(iter)) { ++ sret = 0; + goto out; ++ } + + if (cnt >= PAGE_SIZE) + cnt = PAGE_SIZE - 1; +@@ -3267,8 +2907,8 @@ waitagain: + iter->seq.len = len; + break; + } +- +- trace_consume(iter); ++ if (ret != TRACE_TYPE_NO_CONSUME) ++ trace_consume(iter); + + if (iter->seq.len >= cnt) + break; +@@ -3277,7 +2917,7 @@ waitagain: + /* Now copy what we have to the user */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (iter->seq.readpos >= iter->seq.len) +- trace_seq_reset(&iter->seq); ++ trace_seq_init(&iter->seq); + + /* + * If there was nothing to send to user, inspite of consuming trace +@@ -3287,20 +2927,165 @@ waitagain: + goto waitagain; + + out: +- mutex_unlock(&trace_types_lock); ++ mutex_unlock(&iter->mutex); + + return sret; + } + ++static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ __free_page(buf->page); ++} ++ ++static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, ++ unsigned int idx) ++{ ++ __free_page(spd->pages[idx]); ++} ++ ++static struct pipe_buf_operations tracing_pipe_buf_ops = { ++ .can_merge = 0, ++ .map = generic_pipe_buf_map, ++ .unmap = generic_pipe_buf_unmap, ++ .confirm = generic_pipe_buf_confirm, ++ .release = tracing_pipe_buf_release, ++ .steal = generic_pipe_buf_steal, ++ .get = generic_pipe_buf_get, ++}; ++ ++static size_t ++tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) ++{ ++ size_t count; ++ int ret; ++ ++ /* Seq buffer is page-sized, exactly what we need. 
*/ ++ for (;;) { ++ count = iter->seq.len; ++ ret = print_trace_line(iter); ++ count = iter->seq.len - count; ++ if (rem < count) { ++ rem = 0; ++ iter->seq.len -= count; ++ break; ++ } ++ if (ret == TRACE_TYPE_PARTIAL_LINE) { ++ iter->seq.len -= count; ++ break; ++ } ++ ++ trace_consume(iter); ++ rem -= count; ++ if (!find_next_entry_inc(iter)) { ++ rem = 0; ++ iter->ent = NULL; ++ break; ++ } ++ } ++ ++ return rem; ++} ++ ++static ssize_t tracing_splice_read_pipe(struct file *filp, ++ loff_t *ppos, ++ struct pipe_inode_info *pipe, ++ size_t len, ++ unsigned int flags) ++{ ++ struct page *pages[PIPE_BUFFERS]; ++ struct partial_page partial[PIPE_BUFFERS]; ++ struct trace_iterator *iter = filp->private_data; ++ struct splice_pipe_desc spd = { ++ .pages = pages, ++ .partial = partial, ++ .nr_pages = 0, /* This gets updated below. */ ++ .flags = flags, ++ .ops = &tracing_pipe_buf_ops, ++ .spd_release = tracing_spd_release_pipe, ++ }; ++ static struct tracer *old_tracer; ++ ssize_t ret; ++ size_t rem; ++ unsigned int i; ++ ++ /* copy the tracer to avoid using a global lock all around */ ++ mutex_lock(&trace_types_lock); ++ if (unlikely(old_tracer != current_trace && current_trace)) { ++ old_tracer = current_trace; ++ *iter->trace = *current_trace; ++ } ++ mutex_unlock(&trace_types_lock); ++ ++ mutex_lock(&iter->mutex); ++ ++ if (iter->trace->splice_read) { ++ ret = iter->trace->splice_read(iter, filp, ++ ppos, pipe, len, flags); ++ if (ret) ++ goto out_err; ++ } ++ ++ ret = tracing_wait_pipe(filp); ++ if (ret <= 0) ++ goto out_err; ++ ++ if (!iter->ent && !find_next_entry_inc(iter)) { ++ ret = -EFAULT; ++ goto out_err; ++ } ++ ++ /* Fill as many pages as possible. */ ++ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ break; ++ ++ rem = tracing_fill_pipe_page(rem, iter); ++ ++ /* Copy the data into the page, so we can start over. */ ++ ret = trace_seq_to_buffer(&iter->seq, ++ page_address(pages[i]), ++ iter->seq.len); ++ if (ret < 0) { ++ __free_page(pages[i]); ++ break; ++ } ++ partial[i].offset = 0; ++ partial[i].len = iter->seq.len; ++ ++ trace_seq_init(&iter->seq); ++ } ++ ++ mutex_unlock(&iter->mutex); ++ ++ spd.nr_pages = i; ++ ++ return splice_to_pipe(pipe, &spd); ++ ++out_err: ++ mutex_unlock(&iter->mutex); ++ ++ return ret; ++} ++ + static ssize_t + tracing_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) + { + struct trace_array *tr = filp->private_data; +- char buf[64]; ++ char buf[96]; + int r; + +- r = sprintf(buf, "%lu\n", tr->entries >> 10); ++ mutex_lock(&trace_types_lock); ++ if (!ring_buffer_expanded) ++ r = sprintf(buf, "%lu (expanded: %lu)\n", ++ tr->entries >> 10, ++ trace_buf_size >> 10); ++ else ++ r = sprintf(buf, "%lu\n", tr->entries >> 10); ++ mutex_unlock(&trace_types_lock); ++ + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + } + +@@ -3344,197 +3129,764 @@ tracing_entries_write(struct file *filp, + val <<= 10; + + if (val != global_trace.entries) { +- ret = ring_buffer_resize(global_trace.buffer, val); ++ ret = tracing_resize_ring_buffer(val); + if (ret < 0) { + cnt = ret; + goto out; + } ++ } + +- ret = ring_buffer_resize(max_tr.buffer, val); +- if (ret < 0) { +- int r; +- cnt = ret; +- r = ring_buffer_resize(global_trace.buffer, +- global_trace.entries); +- if (r < 0) { +- /* AARGH! We are left with different +- * size max buffer!!!! 
*/ +- WARN_ON(1); +- tracing_disabled = 1; +- } +- goto out; ++ filp->f_pos += cnt; ++ ++ /* If check pages failed, return ENOMEM */ ++ if (tracing_disabled) ++ cnt = -ENOMEM; ++ out: ++ for_each_tracing_cpu(cpu) { ++ if (global_trace.data[cpu]) ++ atomic_dec(&global_trace.data[cpu]->disabled); ++ if (max_tr.data[cpu]) ++ atomic_dec(&max_tr.data[cpu]->disabled); ++ } ++ ++ tracing_start(); ++ max_tr.entries = global_trace.entries; ++ mutex_unlock(&trace_types_lock); ++ ++ return cnt; ++} ++ ++static int mark_printk(const char *fmt, ...) ++{ ++ int ret; ++ va_list args; ++ va_start(args, fmt); ++ ret = trace_vprintk(0, fmt, args); ++ va_end(args); ++ return ret; ++} ++ ++static ssize_t ++tracing_mark_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *fpos) ++{ ++ char *buf; ++ char *end; ++ ++ if (tracing_disabled) ++ return -EINVAL; ++ ++ if (cnt > TRACE_BUF_SIZE) ++ cnt = TRACE_BUF_SIZE; ++ ++ buf = kmalloc(cnt + 1, GFP_KERNEL); ++ if (buf == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(buf, ubuf, cnt)) { ++ kfree(buf); ++ return -EFAULT; ++ } ++ ++ /* Cut from the first nil or newline. */ ++ buf[cnt] = '\0'; ++ end = strchr(buf, '\n'); ++ if (end) ++ *end = '\0'; ++ ++ cnt = mark_printk("%s\n", buf); ++ kfree(buf); ++ *fpos += cnt; ++ ++ return cnt; ++} ++ ++static const struct file_operations tracing_max_lat_fops = { ++ .open = tracing_open_generic, ++ .read = tracing_max_lat_read, ++ .write = tracing_max_lat_write, ++}; ++ ++static const struct file_operations tracing_ctrl_fops = { ++ .open = tracing_open_generic, ++ .read = tracing_ctrl_read, ++ .write = tracing_ctrl_write, ++}; ++ ++static const struct file_operations set_tracer_fops = { ++ .open = tracing_open_generic, ++ .read = tracing_set_trace_read, ++ .write = tracing_set_trace_write, ++}; ++ ++static const struct file_operations tracing_pipe_fops = { ++ .open = tracing_open_pipe, ++ .poll = tracing_poll_pipe, ++ .read = tracing_read_pipe, ++ .splice_read = tracing_splice_read_pipe, ++ .release = tracing_release_pipe, ++}; ++ ++static const struct file_operations tracing_entries_fops = { ++ .open = tracing_open_generic, ++ .read = tracing_entries_read, ++ .write = tracing_entries_write, ++}; ++ ++static const struct file_operations tracing_mark_fops = { ++ .open = tracing_open_generic, ++ .write = tracing_mark_write, ++}; ++ ++struct ftrace_buffer_info { ++ struct trace_array *tr; ++ void *spare; ++ int cpu; ++ unsigned int read; ++}; ++ ++static int tracing_buffers_open(struct inode *inode, struct file *filp) ++{ ++ int cpu = (int)(long)inode->i_private; ++ struct ftrace_buffer_info *info; ++ ++ if (tracing_disabled) ++ return -ENODEV; ++ ++ info = kzalloc(sizeof(*info), GFP_KERNEL); ++ if (!info) ++ return -ENOMEM; ++ ++ info->tr = &global_trace; ++ info->cpu = cpu; ++ info->spare = ring_buffer_alloc_read_page(info->tr->buffer); ++ /* Force reading ring buffer for first read */ ++ info->read = (unsigned int)-1; ++ if (!info->spare) ++ goto out; ++ ++ filp->private_data = info; ++ ++ return 0; ++ ++ out: ++ kfree(info); ++ return -ENOMEM; ++} ++ ++static ssize_t ++tracing_buffers_read(struct file *filp, char __user *ubuf, ++ size_t count, loff_t *ppos) ++{ ++ struct ftrace_buffer_info *info = filp->private_data; ++ unsigned int pos; ++ ssize_t ret; ++ size_t size; ++ ++ if (!count) ++ return 0; ++ ++ /* Do we have previous read data to read? 
*/ ++ if (info->read < PAGE_SIZE) ++ goto read; ++ ++ info->read = 0; ++ ++ ret = ring_buffer_read_page(info->tr->buffer, ++ &info->spare, ++ count, ++ info->cpu, 0); ++ if (ret < 0) ++ return 0; ++ ++ pos = ring_buffer_page_len(info->spare); ++ ++ if (pos < PAGE_SIZE) ++ memset(info->spare + pos, 0, PAGE_SIZE - pos); ++ ++read: ++ size = PAGE_SIZE - info->read; ++ if (size > count) ++ size = count; ++ ++ ret = copy_to_user(ubuf, info->spare + info->read, size); ++ if (ret == size) ++ return -EFAULT; ++ size -= ret; ++ ++ *ppos += size; ++ info->read += size; ++ ++ return size; ++} ++ ++static int tracing_buffers_release(struct inode *inode, struct file *file) ++{ ++ struct ftrace_buffer_info *info = file->private_data; ++ ++ ring_buffer_free_read_page(info->tr->buffer, info->spare); ++ kfree(info); ++ ++ return 0; ++} ++ ++struct buffer_ref { ++ struct ring_buffer *buffer; ++ void *page; ++ int ref; ++}; ++ ++static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ struct buffer_ref *ref = (struct buffer_ref *)buf->private; ++ ++ if (--ref->ref) ++ return; ++ ++ ring_buffer_free_read_page(ref->buffer, ref->page); ++ kfree(ref); ++ buf->private = 0; ++} ++ ++static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ return 1; ++} ++ ++static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ struct buffer_ref *ref = (struct buffer_ref *)buf->private; ++ ++ ref->ref++; ++} ++ ++/* Pipe buffer operations for a buffer. */ ++static struct pipe_buf_operations buffer_pipe_buf_ops = { ++ .can_merge = 0, ++ .map = generic_pipe_buf_map, ++ .unmap = generic_pipe_buf_unmap, ++ .confirm = generic_pipe_buf_confirm, ++ .release = buffer_pipe_buf_release, ++ .steal = buffer_pipe_buf_steal, ++ .get = buffer_pipe_buf_get, ++}; ++ ++/* ++ * Callback from splice_to_pipe(), if we need to release some pages ++ * at the end of the spd in case we error'ed out in filling the pipe. ++ */ ++static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) ++{ ++ struct buffer_ref *ref = ++ (struct buffer_ref *)spd->partial[i].private; ++ ++ if (--ref->ref) ++ return; ++ ++ ring_buffer_free_read_page(ref->buffer, ref->page); ++ kfree(ref); ++ spd->partial[i].private = 0; ++} ++ ++static ssize_t ++tracing_buffers_splice_read(struct file *file, loff_t *ppos, ++ struct pipe_inode_info *pipe, size_t len, ++ unsigned int flags) ++{ ++ struct ftrace_buffer_info *info = file->private_data; ++ struct partial_page partial[PIPE_BUFFERS]; ++ struct page *pages[PIPE_BUFFERS]; ++ struct splice_pipe_desc spd = { ++ .pages = pages, ++ .partial = partial, ++ .flags = flags, ++ .ops = &buffer_pipe_buf_ops, ++ .spd_release = buffer_spd_release, ++ }; ++ struct buffer_ref *ref; ++ int size, i; ++ size_t ret; ++ ++ /* ++ * We can't seek on a buffer input ++ */ ++ if (unlikely(*ppos)) ++ return -ESPIPE; ++ ++ ++ for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) { ++ struct page *page; ++ int r; ++ ++ ref = kzalloc(sizeof(*ref), GFP_KERNEL); ++ if (!ref) ++ break; ++ ++ ref->buffer = info->tr->buffer; ++ ref->page = ring_buffer_alloc_read_page(ref->buffer); ++ if (!ref->page) { ++ kfree(ref); ++ break; ++ } ++ ++ r = ring_buffer_read_page(ref->buffer, &ref->page, ++ len, info->cpu, 0); ++ if (r < 0) { ++ ring_buffer_free_read_page(ref->buffer, ++ ref->page); ++ kfree(ref); ++ break; + } + +- global_trace.entries = val; ++ /* ++ * zero out any left over data, this is going to ++ * user land. 
++ */ ++ size = ring_buffer_page_len(ref->page); ++ if (size < PAGE_SIZE) ++ memset(ref->page + size, 0, PAGE_SIZE - size); ++ ++ page = virt_to_page(ref->page); ++ ++ spd.pages[i] = page; ++ spd.partial[i].len = PAGE_SIZE; ++ spd.partial[i].offset = 0; ++ spd.partial[i].private = (unsigned long)ref; ++ spd.nr_pages++; + } + +- filp->f_pos += cnt; ++ spd.nr_pages = i; ++ ++ /* did we read anything? */ ++ if (!spd.nr_pages) { ++ if (flags & SPLICE_F_NONBLOCK) ++ ret = -EAGAIN; ++ else ++ ret = 0; ++ /* TODO: block */ ++ return ret; ++ } ++ ++ ret = splice_to_pipe(pipe, &spd); ++ ++ return ret; ++} ++ ++static const struct file_operations tracing_buffers_fops = { ++ .open = tracing_buffers_open, ++ .read = tracing_buffers_read, ++ .release = tracing_buffers_release, ++ .splice_read = tracing_buffers_splice_read, ++ .llseek = no_llseek, ++}; ++ ++#ifdef CONFIG_DYNAMIC_FTRACE ++ ++int __weak ftrace_arch_read_dyn_info(char *buf, int size) ++{ ++ return 0; ++} ++ ++static ssize_t ++tracing_read_dyn_info(struct file *filp, char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ static char ftrace_dyn_info_buffer[1024]; ++ static DEFINE_MUTEX(dyn_info_mutex); ++ unsigned long *p = filp->private_data; ++ char *buf = ftrace_dyn_info_buffer; ++ int size = ARRAY_SIZE(ftrace_dyn_info_buffer); ++ int r; ++ ++ mutex_lock(&dyn_info_mutex); ++ r = sprintf(buf, "%ld ", *p); ++ ++ r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); ++ buf[r++] = '\n'; ++ ++ r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); ++ ++ mutex_unlock(&dyn_info_mutex); ++ ++ return r; ++} ++ ++static const struct file_operations tracing_dyn_info_fops = { ++ .open = tracing_open_generic, ++ .read = tracing_read_dyn_info, ++}; ++#endif ++ ++static struct dentry *d_tracer; ++ ++struct dentry *tracing_init_dentry(void) ++{ ++ static int once; ++ ++ if (d_tracer) ++ return d_tracer; ++ ++ if (!debugfs_initialized()) ++ return NULL; ++ ++ d_tracer = debugfs_create_dir("tracing", NULL); ++ ++ if (!d_tracer && !once) { ++ once = 1; ++ pr_warning("Could not create debugfs directory 'tracing'\n"); ++ return NULL; ++ } ++ ++ return d_tracer; ++} ++ ++static struct dentry *d_percpu; ++ ++struct dentry *tracing_dentry_percpu(void) ++{ ++ static int once; ++ struct dentry *d_tracer; ++ ++ if (d_percpu) ++ return d_percpu; ++ ++ d_tracer = tracing_init_dentry(); ++ ++ if (!d_tracer) ++ return NULL; ++ ++ d_percpu = debugfs_create_dir("per_cpu", d_tracer); ++ ++ if (!d_percpu && !once) { ++ once = 1; ++ pr_warning("Could not create debugfs directory 'per_cpu'\n"); ++ return NULL; ++ } ++ ++ return d_percpu; ++} ++ ++static void tracing_init_debugfs_percpu(long cpu) ++{ ++ struct dentry *d_percpu = tracing_dentry_percpu(); ++ struct dentry *entry, *d_cpu; ++ /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ ++ char cpu_dir[7]; ++ ++ if (cpu > 999 || cpu < 0) ++ return; ++ ++ sprintf(cpu_dir, "cpu%ld", cpu); ++ d_cpu = debugfs_create_dir(cpu_dir, d_percpu); ++ if (!d_cpu) { ++ pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); ++ return; ++ } ++ ++ /* per cpu trace_pipe */ ++ entry = debugfs_create_file("trace_pipe", 0444, d_cpu, ++ (void *) cpu, &tracing_pipe_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs 'trace_pipe' entry\n"); ++ ++ /* per cpu trace */ ++ entry = debugfs_create_file("trace", 0644, d_cpu, ++ (void *) cpu, &tracing_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs 'trace' entry\n"); ++ ++ entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, ++ (void *) cpu, &tracing_buffers_fops); ++ if (!entry) 
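/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * tracing_init_debugfs_percpu() below gives every CPU its own trace,
 * trace_pipe and trace_pipe_raw file under per_cpu/cpuN/.  A consumer
 * interested in a single CPU can read just that buffer, e.g. (assuming
 * debugfs is mounted at /sys/kernel/debug):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char line[4096];
 *		FILE *f = fopen("/sys/kernel/debug/tracing/per_cpu/cpu0/trace_pipe", "r");
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))	// blocks until data arrives
 *			fputs(line, stdout);
 *		fclose(f);
 *		return 0;
 *	}
 */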
++ pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); ++} ++ ++#ifdef CONFIG_FTRACE_SELFTEST ++/* Let selftest have access to static functions in this file */ ++#include "trace_selftest.c" ++#endif ++ ++struct trace_option_dentry { ++ struct tracer_opt *opt; ++ struct tracer_flags *flags; ++ struct dentry *entry; ++}; ++ ++static ssize_t ++trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct trace_option_dentry *topt = filp->private_data; ++ char *buf; ++ ++ if (topt->flags->val & topt->opt->bit) ++ buf = "1\n"; ++ else ++ buf = "0\n"; ++ ++ return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); ++} ++ ++static ssize_t ++trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct trace_option_dentry *topt = filp->private_data; ++ unsigned long val; ++ char buf[64]; ++ int ret; ++ ++ if (cnt >= sizeof(buf)) ++ return -EINVAL; ++ ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; ++ ++ buf[cnt] = 0; ++ ++ ret = strict_strtoul(buf, 10, &val); ++ if (ret < 0) ++ return ret; ++ ++ ret = 0; ++ switch (val) { ++ case 0: ++ /* do nothing if already cleared */ ++ if (!(topt->flags->val & topt->opt->bit)) ++ break; ++ ++ mutex_lock(&trace_types_lock); ++ if (current_trace->set_flag) ++ ret = current_trace->set_flag(topt->flags->val, ++ topt->opt->bit, 0); ++ mutex_unlock(&trace_types_lock); ++ if (ret) ++ return ret; ++ topt->flags->val &= ~topt->opt->bit; ++ break; ++ case 1: ++ /* do nothing if already set */ ++ if (topt->flags->val & topt->opt->bit) ++ break; ++ ++ mutex_lock(&trace_types_lock); ++ if (current_trace->set_flag) ++ ret = current_trace->set_flag(topt->flags->val, ++ topt->opt->bit, 1); ++ mutex_unlock(&trace_types_lock); ++ if (ret) ++ return ret; ++ topt->flags->val |= topt->opt->bit; ++ break; + +- /* If check pages failed, return ENOMEM */ +- if (tracing_disabled) +- cnt = -ENOMEM; +- out: +- for_each_tracing_cpu(cpu) { +- if (global_trace.data[cpu]) +- atomic_dec(&global_trace.data[cpu]->disabled); +- if (max_tr.data[cpu]) +- atomic_dec(&max_tr.data[cpu]->disabled); ++ default: ++ return -EINVAL; + } + +- tracing_start(); +- max_tr.entries = global_trace.entries; +- mutex_unlock(&trace_types_lock); ++ *ppos += cnt; + + return cnt; + } + +-static int mark_printk(const char *fmt, ...) 
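/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * create_trace_option_files() further down turns each tracer_opt of the
 * current tracer into a 0/1 file under options/; writing that file lands
 * in the tracer's ->set_flag() callback via trace_options_write() above.
 * A tracer exposes such an option roughly like this (assuming the
 * tracer_opt/tracer_flags definitions in kernel/trace/trace.h;
 * MY_OPT_VERBOSE is hypothetical):
 *
 *	#define MY_OPT_VERBOSE	0x1		// hypothetical option bit
 *
 *	static struct tracer_opt my_opts[] = {
 *		{ .name = "verbose", .bit = MY_OPT_VERBOSE },
 *		{ }				// terminator
 *	};
 *
 *	static struct tracer_flags my_flags = {
 *		.val  = 0,
 *		.opts = my_opts,
 *	};
 *
 *	static int my_set_flag(u32 old_flags, u32 bit, int set)
 *	{
 *		return 0;	// accept the change; non-zero rejects it
 *	}
 *
 *	// in the struct tracer:  .flags = &my_flags, .set_flag = my_set_flag,
 */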
++ ++static const struct file_operations trace_options_fops = { ++ .open = tracing_open_generic, ++ .read = trace_options_read, ++ .write = trace_options_write, ++}; ++ ++static ssize_t ++trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, ++ loff_t *ppos) + { +- int ret; +- va_list args; +- va_start(args, fmt); +- ret = trace_vprintk(0, -1, fmt, args); +- va_end(args); +- return ret; ++ long index = (long)filp->private_data; ++ char *buf; ++ ++ if (trace_flags & (1 << index)) ++ buf = "1\n"; ++ else ++ buf = "0\n"; ++ ++ return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + } + + static ssize_t +-tracing_mark_write(struct file *filp, const char __user *ubuf, +- size_t cnt, loff_t *fpos) ++trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, ++ loff_t *ppos) + { +- char *buf; +- char *end; ++ long index = (long)filp->private_data; ++ char buf[64]; ++ unsigned long val; ++ int ret; + +- if (tracing_disabled) ++ if (cnt >= sizeof(buf)) + return -EINVAL; + +- if (cnt > TRACE_BUF_SIZE) +- cnt = TRACE_BUF_SIZE; ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; + +- buf = kmalloc(cnt + 1, GFP_KERNEL); +- if (buf == NULL) +- return -ENOMEM; ++ buf[cnt] = 0; + +- if (copy_from_user(buf, ubuf, cnt)) { +- kfree(buf); +- return -EFAULT; +- } ++ ret = strict_strtoul(buf, 10, &val); ++ if (ret < 0) ++ return ret; + +- /* Cut from the first nil or newline. */ +- buf[cnt] = '\0'; +- end = strchr(buf, '\n'); +- if (end) +- *end = '\0'; ++ switch (val) { ++ case 0: ++ trace_flags &= ~(1 << index); ++ break; ++ case 1: ++ trace_flags |= 1 << index; ++ break; + +- cnt = mark_printk("%s\n", buf); +- kfree(buf); +- *fpos += cnt; ++ default: ++ return -EINVAL; ++ } ++ ++ *ppos += cnt; + + return cnt; + } + +-static struct file_operations tracing_max_lat_fops = { +- .open = tracing_open_generic, +- .read = tracing_max_lat_read, +- .write = tracing_max_lat_write, +-}; +- +-static struct file_operations tracing_ctrl_fops = { +- .open = tracing_open_generic, +- .read = tracing_ctrl_read, +- .write = tracing_ctrl_write, ++static const struct file_operations trace_options_core_fops = { ++ .open = tracing_open_generic, ++ .read = trace_options_core_read, ++ .write = trace_options_core_write, + }; + +-static struct file_operations set_tracer_fops = { +- .open = tracing_open_generic, +- .read = tracing_set_trace_read, +- .write = tracing_set_trace_write, +-}; ++static struct dentry *trace_options_init_dentry(void) ++{ ++ struct dentry *d_tracer; ++ static struct dentry *t_options; + +-static struct file_operations tracing_pipe_fops = { +- .open = tracing_open_pipe, +- .poll = tracing_poll_pipe, +- .read = tracing_read_pipe, +- .release = tracing_release_pipe, +-}; ++ if (t_options) ++ return t_options; + +-static struct file_operations tracing_entries_fops = { +- .open = tracing_open_generic, +- .read = tracing_entries_read, +- .write = tracing_entries_write, +-}; ++ d_tracer = tracing_init_dentry(); ++ if (!d_tracer) ++ return NULL; + +-static struct file_operations tracing_mark_fops = { +- .open = tracing_open_generic, +- .write = tracing_mark_write, +-}; ++ t_options = debugfs_create_dir("options", d_tracer); ++ if (!t_options) { ++ pr_warning("Could not create debugfs directory 'options'\n"); ++ return NULL; ++ } + +-#ifdef CONFIG_DYNAMIC_FTRACE ++ return t_options; ++} + +-int __weak ftrace_arch_read_dyn_info(char *buf, int size) ++static void ++create_trace_option_file(struct trace_option_dentry *topt, ++ struct tracer_flags *flags, ++ struct 
tracer_opt *opt) + { +- return 0; ++ struct dentry *t_options; ++ struct dentry *entry; ++ ++ t_options = trace_options_init_dentry(); ++ if (!t_options) ++ return; ++ ++ topt->flags = flags; ++ topt->opt = opt; ++ ++ entry = debugfs_create_file(opt->name, 0644, t_options, topt, ++ &trace_options_fops); ++ ++ topt->entry = entry; ++ + } + +-static ssize_t +-tracing_read_dyn_info(struct file *filp, char __user *ubuf, +- size_t cnt, loff_t *ppos) ++static struct trace_option_dentry * ++create_trace_option_files(struct tracer *tracer) + { +- static char ftrace_dyn_info_buffer[1024]; +- static DEFINE_MUTEX(dyn_info_mutex); +- unsigned long *p = filp->private_data; +- char *buf = ftrace_dyn_info_buffer; +- int size = ARRAY_SIZE(ftrace_dyn_info_buffer); +- int r; ++ struct trace_option_dentry *topts; ++ struct tracer_flags *flags; ++ struct tracer_opt *opts; ++ int cnt; + +- mutex_lock(&dyn_info_mutex); +- r = sprintf(buf, "%ld ", *p); ++ if (!tracer) ++ return NULL; + +- r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); +- buf[r++] = '\n'; ++ flags = tracer->flags; + +- r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); ++ if (!flags || !flags->opts) ++ return NULL; + +- mutex_unlock(&dyn_info_mutex); ++ opts = flags->opts; + +- return r; +-} ++ for (cnt = 0; opts[cnt].name; cnt++) ++ ; + +-static struct file_operations tracing_dyn_info_fops = { +- .open = tracing_open_generic, +- .read = tracing_read_dyn_info, +-}; +-#endif ++ topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); ++ if (!topts) ++ return NULL; + +-static struct dentry *d_tracer; ++ for (cnt = 0; opts[cnt].name; cnt++) ++ create_trace_option_file(&topts[cnt], flags, ++ &opts[cnt]); + +-struct dentry *tracing_init_dentry(void) ++ return topts; ++} ++ ++static void ++destroy_trace_option_files(struct trace_option_dentry *topts) + { +- static int once; ++ int cnt; + +- if (d_tracer) +- return d_tracer; ++ if (!topts) ++ return; + +- d_tracer = debugfs_create_dir("tracing", NULL); ++ for (cnt = 0; topts[cnt].opt; cnt++) { ++ if (topts[cnt].entry) ++ debugfs_remove(topts[cnt].entry); ++ } + +- if (!d_tracer && !once) { +- once = 1; +- pr_warning("Could not create debugfs directory 'tracing'\n"); ++ kfree(topts); ++} ++ ++static struct dentry * ++create_trace_option_core_file(const char *option, long index) ++{ ++ struct dentry *t_options; ++ struct dentry *entry; ++ ++ t_options = trace_options_init_dentry(); ++ if (!t_options) + return NULL; +- } + +- return d_tracer; ++ entry = debugfs_create_file(option, 0644, t_options, (void *)index, ++ &trace_options_core_fops); ++ ++ return entry; + } + +-#ifdef CONFIG_FTRACE_SELFTEST +-/* Let selftest have access to static functions in this file */ +-#include "trace_selftest.c" +-#endif ++static __init void create_trace_options_dir(void) ++{ ++ struct dentry *t_options; ++ struct dentry *entry; ++ int i; ++ ++ t_options = trace_options_init_dentry(); ++ if (!t_options) ++ return; ++ ++ for (i = 0; trace_options[i]; i++) { ++ entry = create_trace_option_core_file(trace_options[i], i); ++ if (!entry) ++ pr_warning("Could not create debugfs %s entry\n", ++ trace_options[i]); ++ } ++} + + static __init int tracer_init_debugfs(void) + { + struct dentry *d_tracer; + struct dentry *entry; ++ int cpu; + + d_tracer = tracing_init_dentry(); + +@@ -3548,18 +3900,15 @@ static __init int tracer_init_debugfs(vo + if (!entry) + pr_warning("Could not create debugfs 'trace_options' entry\n"); + ++ create_trace_options_dir(); ++ + entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, + NULL, 
&tracing_cpumask_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); + +- entry = debugfs_create_file("latency_trace", 0444, d_tracer, +- &global_trace, &tracing_lt_fops); +- if (!entry) +- pr_warning("Could not create debugfs 'latency_trace' entry\n"); +- +- entry = debugfs_create_file("trace", 0444, d_tracer, +- &global_trace, &tracing_fops); ++ entry = debugfs_create_file("trace", 0644, d_tracer, ++ (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace' entry\n"); + +@@ -3590,8 +3939,8 @@ static __init int tracer_init_debugfs(vo + if (!entry) + pr_warning("Could not create debugfs 'README' entry\n"); + +- entry = debugfs_create_file("trace_pipe", 0644, d_tracer, +- NULL, &tracing_pipe_fops); ++ entry = debugfs_create_file("trace_pipe", 0444, d_tracer, ++ (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'trace_pipe' entry\n"); +@@ -3619,77 +3968,12 @@ static __init int tracer_init_debugfs(vo + #ifdef CONFIG_SYSPROF_TRACER + init_tracer_sysprof_debugfs(d_tracer); + #endif +- return 0; +-} +- +-int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +-{ +- static DEFINE_SPINLOCK(trace_buf_lock); +- static char trace_buf[TRACE_BUF_SIZE]; +- +- struct ring_buffer_event *event; +- struct trace_array *tr = &global_trace; +- struct trace_array_cpu *data; +- int cpu, len = 0, size, pc; +- struct print_entry *entry; +- unsigned long irq_flags; +- +- if (tracing_disabled || tracing_selftest_running) +- return 0; +- +- pc = preempt_count(); +- preempt_disable_notrace(); +- cpu = raw_smp_processor_id(); +- data = tr->data[cpu]; +- +- if (unlikely(atomic_read(&data->disabled))) +- goto out; +- +- pause_graph_tracing(); +- spin_lock_irqsave(&trace_buf_lock, irq_flags); +- len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); +- +- len = min(len, TRACE_BUF_SIZE-1); +- trace_buf[len] = 0; +- +- size = sizeof(*entry) + len + 1; +- event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags); +- if (!event) +- goto out_unlock; +- entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, irq_flags, pc); +- entry->ent.type = TRACE_PRINT; +- entry->ip = ip; +- entry->depth = depth; + +- memcpy(&entry->buf, trace_buf, len); +- entry->buf[len] = 0; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- +- out_unlock: +- spin_unlock_irqrestore(&trace_buf_lock, irq_flags); +- unpause_graph_tracing(); +- out: +- preempt_enable_notrace(); +- +- return len; +-} +-EXPORT_SYMBOL_GPL(trace_vprintk); +- +-int __ftrace_printk(unsigned long ip, const char *fmt, ...) 
+-{ +- int ret; +- va_list ap; +- +- if (!(trace_flags & TRACE_ITER_PRINTK)) +- return 0; ++ for_each_tracing_cpu(cpu) ++ tracing_init_debugfs_percpu(cpu); + +- va_start(ap, fmt); +- ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +- va_end(ap); +- return ret; ++ return 0; + } +-EXPORT_SYMBOL_GPL(__ftrace_printk); + + static int trace_panic_handler(struct notifier_block *this, + unsigned long event, void *unused) +@@ -3750,14 +4034,15 @@ trace_printk_seq(struct trace_seq *s) + + printk(KERN_TRACE "%s", s->buffer); + +- trace_seq_reset(s); ++ trace_seq_init(s); + } + +-void ftrace_dump(void) ++static void __ftrace_dump(bool disable_tracing) + { +- static DEFINE_SPINLOCK(ftrace_dump_lock); ++ static DEFINE_RAW_SPINLOCK(ftrace_dump_lock); + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; ++ unsigned int old_userobj; + static int dump_ran; + unsigned long flags; + int cnt = 0, cpu; +@@ -3769,21 +4054,26 @@ void ftrace_dump(void) + + dump_ran = 1; + +- /* No turning back! */ + tracing_off(); +- ftrace_kill(); ++ ++ if (disable_tracing) ++ ftrace_kill(); + + for_each_tracing_cpu(cpu) { + atomic_inc(&global_trace.data[cpu]->disabled); + } + ++ old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; ++ + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + ++ /* Simulate the iterator */ + iter.tr = &global_trace; + iter.trace = current_trace; ++ iter.cpu_file = TRACE_PIPE_ALL_CPU; + + /* + * We need to stop all tracing on all CPUS to read the +@@ -3819,13 +4109,30 @@ void ftrace_dump(void) + else + printk(KERN_TRACE "---------------------------------\n"); + ++ /* Re-enable tracing if requested */ ++ if (!disable_tracing) { ++ trace_flags |= old_userobj; ++ ++ for_each_tracing_cpu(cpu) { ++ atomic_dec(&global_trace.data[cpu]->disabled); ++ } ++ tracing_on(); ++ } ++ + out: + spin_unlock_irqrestore(&ftrace_dump_lock, flags); + } + ++/* By default: disable tracing after the dump */ ++void ftrace_dump(void) ++{ ++ __ftrace_dump(true); ++} ++ + __init static int tracer_alloc_buffers(void) + { + struct trace_array_cpu *data; ++ int ring_buf_size; + int i; + int ret = -ENOMEM; + +@@ -3835,11 +4142,21 @@ __init static int tracer_alloc_buffers(v + if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + goto out_free_buffer_mask; + ++ if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) ++ goto out_free_tracing_cpumask; ++ ++ /* To save memory, keep the ring buffer size to its minimum */ ++ if (ring_buffer_expanded) ++ ring_buf_size = trace_buf_size; ++ else ++ ring_buf_size = 1; ++ + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); + cpumask_copy(tracing_cpumask, cpu_all_mask); ++ cpumask_clear(tracing_reader_cpumask); + + /* TODO: make the number of buffers hot pluggable with CPUS */ +- global_trace.buffer = ring_buffer_alloc(trace_buf_size, ++ global_trace.buffer = ring_buffer_alloc(ring_buf_size, + TRACE_BUFFER_FLAGS); + if (!global_trace.buffer) { + printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); +@@ -3850,7 +4167,7 @@ __init static int tracer_alloc_buffers(v + + + #ifdef CONFIG_TRACER_MAX_TRACE +- max_tr.buffer = ring_buffer_alloc(trace_buf_size, ++ max_tr.buffer = ring_buffer_alloc(ring_buf_size, + TRACE_BUFFER_FLAGS); + if (!max_tr.buffer) { + printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); +@@ -3871,14 +4188,10 @@ __init static int tracer_alloc_buffers(v + trace_init_cmdlines(); + + 
register_tracer(&nop_trace); ++ current_trace = &nop_trace; + #ifdef CONFIG_BOOT_TRACER + register_tracer(&boot_tracer); +- current_trace = &boot_tracer; +- current_trace->init(&global_trace); +-#else +- current_trace = &nop_trace; + #endif +- + /* All seems OK, enable tracing */ + tracing_disabled = 0; + +@@ -3890,11 +4203,34 @@ __init static int tracer_alloc_buffers(v + return 0; + + out_free_cpumask: ++ free_cpumask_var(tracing_reader_cpumask); ++out_free_tracing_cpumask: + free_cpumask_var(tracing_cpumask); + out_free_buffer_mask: + free_cpumask_var(tracing_buffer_mask); + out: + return ret; + } ++ ++__init static int clear_boot_tracer(void) ++{ ++ /* ++ * The default tracer at boot buffer is an init section. ++ * This function is called in lateinit. If we did not ++ * find the boot tracer, then clear it out, to prevent ++ * later registration from accessing the buffer that is ++ * about to be freed. ++ */ ++ if (!default_bootup_tracer) ++ return 0; ++ ++ printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", ++ default_bootup_tracer); ++ default_bootup_tracer = NULL; ++ ++ return 0; ++} ++ + early_initcall(tracer_alloc_buffers); + fs_initcall(tracer_init_debugfs); ++late_initcall(clear_boot_tracer); +Index: linux-2.6-tip/kernel/trace/trace.h +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace.h ++++ linux-2.6-tip/kernel/trace/trace.h +@@ -9,6 +9,8 @@ + #include + #include + #include ++#include ++#include + + enum trace_type { + __TRACE_FIRST_TYPE = 0, +@@ -16,9 +18,9 @@ enum trace_type { + TRACE_FN, + TRACE_CTX, + TRACE_WAKE, +- TRACE_CONT, + TRACE_STACK, + TRACE_PRINT, ++ TRACE_BPRINT, + TRACE_SPECIAL, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, +@@ -29,9 +31,14 @@ enum trace_type { + TRACE_GRAPH_ENT, + TRACE_USER_STACK, + TRACE_HW_BRANCHES, ++ TRACE_SYSCALL_ENTER, ++ TRACE_SYSCALL_EXIT, ++ TRACE_KMEM_ALLOC, ++ TRACE_KMEM_FREE, + TRACE_POWER, ++ TRACE_BLK, + +- __TRACE_LAST_TYPE ++ __TRACE_LAST_TYPE, + }; + + /* +@@ -42,7 +49,6 @@ enum trace_type { + */ + struct trace_entry { + unsigned char type; +- unsigned char cpu; + unsigned char flags; + unsigned char preempt_count; + int pid; +@@ -60,13 +66,13 @@ struct ftrace_entry { + + /* Function call entry */ + struct ftrace_graph_ent_entry { +- struct trace_entry ent; ++ struct trace_entry ent; + struct ftrace_graph_ent graph_ent; + }; + + /* Function return entry */ + struct ftrace_graph_ret_entry { +- struct trace_entry ent; ++ struct trace_entry ent; + struct ftrace_graph_ret ret; + }; + extern struct tracer boot_tracer; +@@ -112,12 +118,18 @@ struct userstack_entry { + }; + + /* +- * ftrace_printk entry: ++ * trace_printk entry: + */ ++struct bprint_entry { ++ struct trace_entry ent; ++ unsigned long ip; ++ const char *fmt; ++ u32 buf[]; ++}; ++ + struct print_entry { + struct trace_entry ent; + unsigned long ip; +- int depth; + char buf[]; + }; + +@@ -170,15 +182,51 @@ struct trace_power { + struct power_trace state_data; + }; + ++enum kmemtrace_type_id { ++ KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ ++ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ ++ KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. 
*/ ++}; ++ ++struct kmemtrace_alloc_entry { ++ struct trace_entry ent; ++ enum kmemtrace_type_id type_id; ++ unsigned long call_site; ++ const void *ptr; ++ size_t bytes_req; ++ size_t bytes_alloc; ++ gfp_t gfp_flags; ++ int node; ++}; ++ ++struct kmemtrace_free_entry { ++ struct trace_entry ent; ++ enum kmemtrace_type_id type_id; ++ unsigned long call_site; ++ const void *ptr; ++}; ++ ++struct syscall_trace_enter { ++ struct trace_entry ent; ++ int nr; ++ unsigned long args[]; ++}; ++ ++struct syscall_trace_exit { ++ struct trace_entry ent; ++ int nr; ++ unsigned long ret; ++}; ++ ++ + /* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled +- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags ++ * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler +- * CONT - multiple entries hold the trace item + */ + enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, +@@ -186,7 +234,6 @@ enum trace_flag_type { + TRACE_FLAG_NEED_RESCHED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, +- TRACE_FLAG_CONT = 0x20, + }; + + #define TRACE_BUF_SIZE 1024 +@@ -198,6 +245,7 @@ enum trace_flag_type { + */ + struct trace_array_cpu { + atomic_t disabled; ++ void *buffer_page; /* ring buffer spare */ + + /* these fields get copied into max-trace: */ + unsigned long trace_idx; +@@ -262,10 +310,10 @@ extern void __ftrace_bad_type(void); + do { \ + IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ + IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ +- IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ + IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ + IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ ++ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct special_entry, 0); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ + TRACE_MMIO_RW); \ +@@ -279,7 +327,15 @@ extern void __ftrace_bad_type(void); + IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ + TRACE_GRAPH_RET); \ + IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ +- IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ ++ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ ++ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ ++ TRACE_KMEM_ALLOC); \ ++ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ ++ TRACE_KMEM_FREE); \ ++ IF_ASSIGN(var, ent, struct syscall_trace_enter, \ ++ TRACE_SYSCALL_ENTER); \ ++ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ ++ TRACE_SYSCALL_EXIT); \ + __ftrace_bad_type(); \ + } while (0) + +@@ -287,7 +343,8 @@ extern void __ftrace_bad_type(void); + enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, +- TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ ++ TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ ++ TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ + }; + + +@@ -297,8 +354,8 @@ enum print_line_t { + * flags value in struct tracer_flags. 
+ */ + struct tracer_opt { +- const char *name; /* Will appear on the trace_options file */ +- u32 bit; /* Mask assigned in val field in tracer_flags */ ++ const char *name; /* Will appear on the trace_options file */ ++ u32 bit; /* Mask assigned in val field in tracer_flags */ + }; + + /* +@@ -307,28 +364,51 @@ struct tracer_opt { + */ + struct tracer_flags { + u32 val; +- struct tracer_opt *opts; ++ struct tracer_opt *opts; + }; + + /* Makes more easy to define a tracer opt */ + #define TRACER_OPT(s, b) .name = #s, .bit = b + +-/* +- * A specific tracer, represented by methods that operate on a trace array: ++ ++/** ++ * struct tracer - a specific tracer and its callbacks to interact with debugfs ++ * @name: the name chosen to select it on the available_tracers file ++ * @init: called when one switches to this tracer (echo name > current_tracer) ++ * @reset: called when one switches to another tracer ++ * @start: called when tracing is unpaused (echo 1 > tracing_enabled) ++ * @stop: called when tracing is paused (echo 0 > tracing_enabled) ++ * @open: called when the trace file is opened ++ * @pipe_open: called when the trace_pipe file is opened ++ * @wait_pipe: override how the user waits for traces on trace_pipe ++ * @close: called when the trace file is released ++ * @read: override the default read callback on trace_pipe ++ * @splice_read: override the default splice_read callback on trace_pipe ++ * @selftest: selftest to run on boot (see trace_selftest.c) ++ * @print_headers: override the first lines that describe your columns ++ * @print_line: callback that prints a trace ++ * @set_flag: signals one of your private flags changed (trace_options file) ++ * @flags: your private flags + */ + struct tracer { + const char *name; +- /* Your tracer should raise a warning if init fails */ + int (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*start)(struct trace_array *tr); + void (*stop)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); ++ void (*wait_pipe)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator *iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); ++ ssize_t (*splice_read)(struct trace_iterator *iter, ++ struct file *filp, ++ loff_t *ppos, ++ struct pipe_inode_info *pipe, ++ size_t len, ++ unsigned int flags); + #ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +@@ -339,7 +419,8 @@ struct tracer { + int (*set_flag)(u32 old_flags, u32 bit, int set); + struct tracer *next; + int print_max; +- struct tracer_flags *flags; ++ struct tracer_flags *flags; ++ struct tracer_stat *stats; + }; + + struct trace_seq { +@@ -348,6 +429,16 @@ struct trace_seq { + unsigned int readpos; + }; + ++static inline void ++trace_seq_init(struct trace_seq *s) ++{ ++ s->len = 0; ++ s->readpos = 0; ++} ++ ++ ++#define TRACE_PIPE_ALL_CPU -1 ++ + /* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: +@@ -356,6 +447,8 @@ struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + void *private; ++ int cpu_file; ++ struct mutex mutex; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; + + /* The below is zeroed out in pipe_read */ +@@ -371,6 +464,7 @@ struct trace_iterator { + cpumask_var_t started; + }; + ++int tracer_init(struct tracer *t, struct trace_array *tr); + int 
tracing_is_enabled(void); + void trace_wake_up(void); + void tracing_reset(struct trace_array *tr, int cpu); +@@ -379,26 +473,50 @@ int tracing_open_generic(struct inode *i + struct dentry *tracing_init_dentry(void); + void init_tracer_sysprof_debugfs(struct dentry *d_tracer); + ++struct ring_buffer_event; ++ ++struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, ++ unsigned char type, ++ unsigned long len, ++ unsigned long flags, ++ int pc); ++void trace_buffer_unlock_commit(struct trace_array *tr, ++ struct ring_buffer_event *event, ++ unsigned long flags, int pc); ++ ++struct ring_buffer_event * ++trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, ++ unsigned long flags, int pc); ++void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, ++ unsigned long flags, int pc); ++void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, ++ unsigned long flags, int pc); ++ + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); ++ ++struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, ++ int *ent_cpu, u64 *ent_ts); ++ + void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); + ++void default_wait_pipe(struct trace_iterator *iter); ++void poll_wait_pipe(struct trace_iterator *iter); ++ + void ftrace(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); + void tracing_sched_switch_trace(struct trace_array *tr, +- struct trace_array_cpu *data, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc); + void tracing_record_cmdline(struct task_struct *tsk); + + void tracing_sched_wakeup_trace(struct trace_array *tr, +- struct trace_array_cpu *data, + struct task_struct *wakee, + struct task_struct *cur, + unsigned long flags, int pc); +@@ -408,14 +526,12 @@ void trace_special(struct trace_array *t + unsigned long arg2, + unsigned long arg3, int pc); + void trace_function(struct trace_array *tr, +- struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); + + void trace_graph_return(struct ftrace_graph_ret *trace); + int trace_graph_entry(struct ftrace_graph_ent *trace); +-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); + + void tracing_start_cmdline_record(void); + void tracing_stop_cmdline_record(void); +@@ -434,15 +550,11 @@ void update_max_tr(struct trace_array *t + void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); + +-extern cycle_t ftrace_now(int cpu); ++void __trace_stack(struct trace_array *tr, ++ unsigned long flags, ++ int skip, int pc); + +-#ifdef CONFIG_FUNCTION_TRACER +-void tracing_start_function_trace(void); +-void tracing_stop_function_trace(void); +-#else +-# define tracing_start_function_trace() do { } while (0) +-# define tracing_stop_function_trace() do { } while (0) +-#endif ++extern cycle_t ftrace_now(int cpu); + + #ifdef CONFIG_CONTEXT_SWITCH_TRACER + typedef void +@@ -456,10 +568,10 @@ struct tracer_switch_ops { + void *private; + struct tracer_switch_ops *next; + }; +- +-char *trace_find_cmdline(int pid); + #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + ++extern void trace_find_cmdline(int pid, char comm[]); ++ + #ifdef CONFIG_DYNAMIC_FTRACE + extern unsigned long ftrace_update_tot_cnt; + #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +@@ -469,6 +581,8 @@ extern int 
DYN_FTRACE_TEST_NAME(void); + #ifdef CONFIG_FTRACE_STARTUP_TEST + extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); ++extern int trace_selftest_startup_function_graph(struct tracer *trace, ++ struct trace_array *tr); + extern int trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); + extern int trace_selftest_startup_preemptoff(struct tracer *trace, +@@ -488,24 +602,19 @@ extern int trace_selftest_startup_branch + #endif /* CONFIG_FTRACE_STARTUP_TEST */ + + extern void *head_page(struct trace_array_cpu *data); +-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +-extern void trace_seq_print_cont(struct trace_seq *s, +- struct trace_iterator *iter); +- +-extern int +-seq_print_ip_sym(struct trace_seq *s, unsigned long ip, +- unsigned long sym_flags); +-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, +- size_t cnt); + extern long ns2usecs(cycle_t nsec); + extern int +-trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); ++trace_vbprintk(unsigned long ip, const char *fmt, va_list args); ++extern int ++trace_vprintk(unsigned long ip, const char *fmt, va_list args); + + extern unsigned long trace_flags; + + /* Standard output formatting function used for function return traces */ + #ifdef CONFIG_FUNCTION_GRAPH_TRACER + extern enum print_line_t print_graph_function(struct trace_iterator *iter); ++extern enum print_line_t ++trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); + + #ifdef CONFIG_DYNAMIC_FTRACE + /* TODO: make this variable */ +@@ -537,7 +646,6 @@ static inline int ftrace_graph_addr(unsi + return 1; + } + #endif /* CONFIG_DYNAMIC_FTRACE */ +- + #else /* CONFIG_FUNCTION_GRAPH_TRACER */ + static inline enum print_line_t + print_graph_function(struct trace_iterator *iter) +@@ -580,7 +688,12 @@ enum trace_iterator_flags { + TRACE_ITER_ANNOTATE = 0x2000, + TRACE_ITER_USERSTACKTRACE = 0x4000, + TRACE_ITER_SYM_USEROBJ = 0x8000, +- TRACE_ITER_PRINTK_MSGONLY = 0x10000 ++ TRACE_ITER_PRINTK_MSGONLY = 0x10000, ++ TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ ++ TRACE_ITER_LATENCY_FMT = 0x40000, ++ TRACE_ITER_GLOBAL_CLK = 0x80000, ++ TRACE_ITER_SLEEP_TIME = 0x100000, ++ TRACE_ITER_GRAPH_TIME = 0x200000, + }; + + /* +@@ -601,12 +714,12 @@ extern struct tracer nop_trace; + * preempt_enable (after a disable), a schedule might take place + * causing an infinite recursion. + * +- * To prevent this, we read the need_recshed flag before ++ * To prevent this, we read the need_resched flag before + * disabling preemption. When we want to enable preemption we + * check the flag, if it is set, then we call preempt_enable_no_resched. + * Otherwise, we call preempt_enable. + * +- * The rational for doing the above is that if need resched is set ++ * The rational for doing the above is that if need_resched is set + * and we have yet to reschedule, we are either in an atomic location + * (where we do not need to check for scheduling) or we are inside + * the scheduler and do not want to resched. +@@ -627,7 +740,7 @@ static inline int ftrace_preempt_disable + * + * This is a scheduler safe way to enable preemption and not miss + * any preemption checks. The disabled saved the state of preemption. +- * If resched is set, then we were either inside an atomic or ++ * If resched is set, then we are either inside an atomic or + * are inside the scheduler (we would have already scheduled + * otherwise). 
In this case, we do not want to call normal + * preempt_enable, but preempt_enable_no_resched instead. +@@ -664,4 +777,118 @@ static inline void trace_branch_disable( + } + #endif /* CONFIG_BRANCH_TRACER */ + ++/* set ring buffers to default size if not already done so */ ++int tracing_update_buffers(void); ++ ++/* trace event type bit fields, not numeric */ ++enum { ++ TRACE_EVENT_TYPE_PRINTF = 1, ++ TRACE_EVENT_TYPE_RAW = 2, ++}; ++ ++struct ftrace_event_field { ++ struct list_head link; ++ char *name; ++ char *type; ++ int offset; ++ int size; ++}; ++ ++struct ftrace_event_call { ++ char *name; ++ char *system; ++ struct dentry *dir; ++ int enabled; ++ int (*regfunc)(void); ++ void (*unregfunc)(void); ++ int id; ++ int (*raw_init)(void); ++ int (*show_format)(struct trace_seq *s); ++ int (*define_fields)(void); ++ struct list_head fields; ++ struct filter_pred **preds; ++ ++#ifdef CONFIG_EVENT_PROFILE ++ atomic_t profile_count; ++ int (*profile_enable)(struct ftrace_event_call *); ++ void (*profile_disable)(struct ftrace_event_call *); ++#endif ++}; ++ ++struct event_subsystem { ++ struct list_head list; ++ const char *name; ++ struct dentry *entry; ++ struct filter_pred **preds; ++}; ++ ++#define events_for_each(event) \ ++ for (event = __start_ftrace_events; \ ++ (unsigned long)event < (unsigned long)__stop_ftrace_events; \ ++ event++) ++ ++#define MAX_FILTER_PRED 8 ++ ++struct filter_pred; ++ ++typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); ++ ++struct filter_pred { ++ filter_pred_fn_t fn; ++ u64 val; ++ char *str_val; ++ int str_len; ++ char *field_name; ++ int offset; ++ int not; ++ int or; ++ int compound; ++ int clear; ++}; ++ ++int trace_define_field(struct ftrace_event_call *call, char *type, ++ char *name, int offset, int size); ++extern void filter_free_pred(struct filter_pred *pred); ++extern void filter_print_preds(struct filter_pred **preds, ++ struct trace_seq *s); ++extern int filter_parse(char **pbuf, struct filter_pred *pred); ++extern int filter_add_pred(struct ftrace_event_call *call, ++ struct filter_pred *pred); ++extern void filter_free_preds(struct ftrace_event_call *call); ++extern int filter_match_preds(struct ftrace_event_call *call, void *rec); ++extern void filter_free_subsystem_preds(struct event_subsystem *system); ++extern int filter_add_subsystem_pred(struct event_subsystem *system, ++ struct filter_pred *pred); ++ ++void event_trace_printk(unsigned long ip, const char *fmt, ...); ++extern struct ftrace_event_call __start_ftrace_events[]; ++extern struct ftrace_event_call __stop_ftrace_events[]; ++ ++#define for_each_event(event) \ ++ for (event = __start_ftrace_events; \ ++ (unsigned long)event < (unsigned long)__stop_ftrace_events; \ ++ event++) ++ ++extern const char *__start___trace_bprintk_fmt[]; ++extern const char *__stop___trace_bprintk_fmt[]; ++ ++/* ++ * The double __builtin_constant_p is because gcc will give us an error ++ * if we try to allocate the static variable to fmt if it is not a ++ * constant. Even with the outer if statement optimizing out. ++ */ ++#define event_trace_printk(ip, fmt, args...) \ ++do { \ ++ __trace_printk_check_format(fmt, ##args); \ ++ tracing_record_cmdline(current); \ ++ if (__builtin_constant_p(fmt)) { \ ++ static const char *trace_printk_fmt \ ++ __attribute__((section("__trace_printk_fmt"))) = \ ++ __builtin_constant_p(fmt) ? 
fmt : NULL; \ ++ \ ++ __trace_bprintk(ip, trace_printk_fmt, ##args); \ ++ } else \ ++ __trace_printk(ip, fmt, ##args); \ ++} while (0) ++ + #endif /* _LINUX_KERNEL_TRACE_H */ +Index: linux-2.6-tip/kernel/trace/trace_boot.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_boot.c ++++ linux-2.6-tip/kernel/trace/trace_boot.c +@@ -11,6 +11,7 @@ + #include + + #include "trace.h" ++#include "trace_output.h" + + static struct trace_array *boot_trace; + static bool pre_initcalls_finished; +@@ -27,13 +28,13 @@ void start_boot_trace(void) + + void enable_boot_trace(void) + { +- if (pre_initcalls_finished) ++ if (boot_trace && pre_initcalls_finished) + tracing_start_sched_switch_record(); + } + + void disable_boot_trace(void) + { +- if (pre_initcalls_finished) ++ if (boot_trace && pre_initcalls_finished) + tracing_stop_sched_switch_record(); + } + +@@ -42,6 +43,9 @@ static int boot_trace_init(struct trace_ + int cpu; + boot_trace = tr; + ++ if (!tr) ++ return 0; ++ + for_each_cpu(cpu, cpu_possible_mask) + tracing_reset(tr, cpu); + +@@ -128,10 +132,9 @@ void trace_boot_call(struct boot_trace_c + { + struct ring_buffer_event *event; + struct trace_boot_call *entry; +- unsigned long irq_flags; + struct trace_array *tr = boot_trace; + +- if (!pre_initcalls_finished) ++ if (!tr || !pre_initcalls_finished) + return; + + /* Get its name now since this function could +@@ -140,18 +143,13 @@ void trace_boot_call(struct boot_trace_c + sprint_symbol(bt->func, (unsigned long)fn); + preempt_disable(); + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, ++ sizeof(*entry), 0, 0); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, 0, 0); +- entry->ent.type = TRACE_BOOT_CALL; + entry->boot_call = *bt; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- +- trace_wake_up(); +- ++ trace_buffer_unlock_commit(tr, event, 0, 0); + out: + preempt_enable(); + } +@@ -160,27 +158,21 @@ void trace_boot_ret(struct boot_trace_re + { + struct ring_buffer_event *event; + struct trace_boot_ret *entry; +- unsigned long irq_flags; + struct trace_array *tr = boot_trace; + +- if (!pre_initcalls_finished) ++ if (!tr || !pre_initcalls_finished) + return; + + sprint_symbol(bt->func, (unsigned long)fn); + preempt_disable(); + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, ++ sizeof(*entry), 0, 0); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, 0, 0); +- entry->ent.type = TRACE_BOOT_RET; + entry->boot_ret = *bt; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- +- trace_wake_up(); +- ++ trace_buffer_unlock_commit(tr, event, 0, 0); + out: + preempt_enable(); + } +Index: linux-2.6-tip/kernel/trace/trace_branch.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_branch.c ++++ linux-2.6-tip/kernel/trace/trace_branch.c +@@ -14,12 +14,17 @@ + #include + #include + #include ++ + #include "trace.h" ++#include "trace_stat.h" ++#include "trace_output.h" + + #ifdef CONFIG_BRANCH_TRACER + ++static struct tracer branch_trace; + static int branch_tracing_enabled __read_mostly; + static DEFINE_MUTEX(branch_tracing_mutex); ++ + static struct trace_array *branch_tracer; + + static void +@@ -28,7 +33,7 @@ 
probe_likely_condition(struct ftrace_bra + struct trace_array *tr = branch_tracer; + struct ring_buffer_event *event; + struct trace_branch *entry; +- unsigned long flags, irq_flags; ++ unsigned long flags; + int cpu, pc; + const char *p; + +@@ -47,15 +52,13 @@ probe_likely_condition(struct ftrace_bra + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ pc = preempt_count(); ++ event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, ++ sizeof(*entry), flags, pc); + if (!event) + goto out; + +- pc = preempt_count(); + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, flags, pc); +- entry->ent.type = TRACE_BRANCH; + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); +@@ -70,7 +73,7 @@ probe_likely_condition(struct ftrace_bra + entry->line = f->line; + entry->correct = val == expect; + +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); ++ ring_buffer_unlock_commit(tr->buffer, event); + + out: + atomic_dec(&tr->data[cpu]->disabled); +@@ -88,8 +91,6 @@ void trace_likely_condition(struct ftrac + + int enable_branch_tracing(struct trace_array *tr) + { +- int ret = 0; +- + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; + /* +@@ -100,7 +101,7 @@ int enable_branch_tracing(struct trace_a + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); + +- return ret; ++ return 0; + } + + void disable_branch_tracing(void) +@@ -128,11 +129,6 @@ static void stop_branch_trace(struct tra + + static int branch_trace_init(struct trace_array *tr) + { +- int cpu; +- +- for_each_online_cpu(cpu) +- tracing_reset(tr, cpu); +- + start_branch_trace(tr); + return 0; + } +@@ -142,22 +138,53 @@ static void branch_trace_reset(struct tr + stop_branch_trace(tr); + } + +-struct tracer branch_trace __read_mostly = ++static enum print_line_t trace_branch_print(struct trace_iterator *iter, ++ int flags) ++{ ++ struct trace_branch *field; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", ++ field->correct ? 
" ok " : " MISS ", ++ field->func, ++ field->file, ++ field->line)) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++ ++static struct trace_event trace_branch_event = { ++ .type = TRACE_BRANCH, ++ .trace = trace_branch_print, ++}; ++ ++static struct tracer branch_trace __read_mostly = + { + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, + #ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +-#endif ++#endif /* CONFIG_FTRACE_SELFTEST */ + }; + +-__init static int init_branch_trace(void) ++__init static int init_branch_tracer(void) + { ++ int ret; ++ ++ ret = register_ftrace_event(&trace_branch_event); ++ if (!ret) { ++ printk(KERN_WARNING "Warning: could not register " ++ "branch events\n"); ++ return 1; ++ } + return register_tracer(&branch_trace); + } ++device_initcall(init_branch_tracer); + +-device_initcall(init_branch_trace); + #else + static inline + void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +@@ -183,66 +210,39 @@ void ftrace_likely_update(struct ftrace_ + } + EXPORT_SYMBOL(ftrace_likely_update); + +-struct ftrace_pointer { +- void *start; +- void *stop; +- int hit; +-}; ++extern unsigned long __start_annotated_branch_profile[]; ++extern unsigned long __stop_annotated_branch_profile[]; + +-static void * +-t_next(struct seq_file *m, void *v, loff_t *pos) ++static int annotated_branch_stat_headers(struct seq_file *m) + { +- const struct ftrace_pointer *f = m->private; +- struct ftrace_branch_data *p = v; +- +- (*pos)++; +- +- if (v == (void *)1) +- return f->start; +- +- ++p; +- +- if ((void *)p >= (void *)f->stop) +- return NULL; +- +- return p; ++ seq_printf(m, " correct incorrect %% "); ++ seq_printf(m, " Function " ++ " File Line\n" ++ " ------- --------- - " ++ " -------- " ++ " ---- ----\n"); ++ return 0; + } + +-static void *t_start(struct seq_file *m, loff_t *pos) ++static inline long get_incorrect_percent(struct ftrace_branch_data *p) + { +- void *t = (void *)1; +- loff_t l = 0; +- +- for (; t && l < *pos; t = t_next(m, t, &l)) +- ; ++ long percent; + +- return t; +-} ++ if (p->correct) { ++ percent = p->incorrect * 100; ++ percent /= p->correct + p->incorrect; ++ } else ++ percent = p->incorrect ? 100 : -1; + +-static void t_stop(struct seq_file *m, void *p) +-{ ++ return percent; + } + +-static int t_show(struct seq_file *m, void *v) ++static int branch_stat_show(struct seq_file *m, void *v) + { +- const struct ftrace_pointer *fp = m->private; + struct ftrace_branch_data *p = v; + const char *f; + long percent; + +- if (v == (void *)1) { +- if (fp->hit) +- seq_printf(m, " miss hit %% "); +- else +- seq_printf(m, " correct incorrect %% "); +- seq_printf(m, " Function " +- " File Line\n" +- " ------- --------- - " +- " -------- " +- " ---- ----\n"); +- return 0; +- } +- + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') +@@ -252,11 +252,7 @@ static int t_show(struct seq_file *m, vo + /* + * The miss is overlayed on correct, and hit on incorrect. + */ +- if (p->correct) { +- percent = p->incorrect * 100; +- percent /= p->correct + p->incorrect; +- } else +- percent = p->incorrect ? 
100 : -1; ++ percent = get_incorrect_percent(p); + + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + if (percent < 0) +@@ -267,76 +263,118 @@ static int t_show(struct seq_file *m, vo + return 0; + } + +-static struct seq_operations tracing_likely_seq_ops = { +- .start = t_start, +- .next = t_next, +- .stop = t_stop, +- .show = t_show, ++static void *annotated_branch_stat_start(void) ++{ ++ return __start_annotated_branch_profile; ++} ++ ++static void * ++annotated_branch_stat_next(void *v, int idx) ++{ ++ struct ftrace_branch_data *p = v; ++ ++ ++p; ++ ++ if ((void *)p >= (void *)__stop_annotated_branch_profile) ++ return NULL; ++ ++ return p; ++} ++ ++static int annotated_branch_stat_cmp(void *p1, void *p2) ++{ ++ struct ftrace_branch_data *a = p1; ++ struct ftrace_branch_data *b = p2; ++ ++ long percent_a, percent_b; ++ ++ percent_a = get_incorrect_percent(a); ++ percent_b = get_incorrect_percent(b); ++ ++ if (percent_a < percent_b) ++ return -1; ++ if (percent_a > percent_b) ++ return 1; ++ else ++ return 0; ++} ++ ++static struct tracer_stat annotated_branch_stats = { ++ .name = "branch_annotated", ++ .stat_start = annotated_branch_stat_start, ++ .stat_next = annotated_branch_stat_next, ++ .stat_cmp = annotated_branch_stat_cmp, ++ .stat_headers = annotated_branch_stat_headers, ++ .stat_show = branch_stat_show + }; + +-static int tracing_branch_open(struct inode *inode, struct file *file) ++__init static int init_annotated_branch_stats(void) + { + int ret; + +- ret = seq_open(file, &tracing_likely_seq_ops); ++ ret = register_stat_tracer(&annotated_branch_stats); + if (!ret) { +- struct seq_file *m = file->private_data; +- m->private = (void *)inode->i_private; ++ printk(KERN_WARNING "Warning: could not register " ++ "annotated branches stats\n"); ++ return 1; + } +- +- return ret; ++ return 0; + } +- +-static const struct file_operations tracing_branch_fops = { +- .open = tracing_branch_open, +- .read = seq_read, +- .llseek = seq_lseek, +-}; ++fs_initcall(init_annotated_branch_stats); + + #ifdef CONFIG_PROFILE_ALL_BRANCHES ++ + extern unsigned long __start_branch_profile[]; + extern unsigned long __stop_branch_profile[]; + +-static const struct ftrace_pointer ftrace_branch_pos = { +- .start = __start_branch_profile, +- .stop = __stop_branch_profile, +- .hit = 1, +-}; ++static int all_branch_stat_headers(struct seq_file *m) ++{ ++ seq_printf(m, " miss hit %% "); ++ seq_printf(m, " Function " ++ " File Line\n" ++ " ------- --------- - " ++ " -------- " ++ " ---- ----\n"); ++ return 0; ++} + +-#endif /* CONFIG_PROFILE_ALL_BRANCHES */ ++static void *all_branch_stat_start(void) ++{ ++ return __start_branch_profile; ++} + +-extern unsigned long __start_annotated_branch_profile[]; +-extern unsigned long __stop_annotated_branch_profile[]; ++static void * ++all_branch_stat_next(void *v, int idx) ++{ ++ struct ftrace_branch_data *p = v; + +-static const struct ftrace_pointer ftrace_annotated_branch_pos = { +- .start = __start_annotated_branch_profile, +- .stop = __stop_annotated_branch_profile, +-}; ++ ++p; + +-static __init int ftrace_branch_init(void) +-{ +- struct dentry *d_tracer; +- struct dentry *entry; ++ if ((void *)p >= (void *)__stop_branch_profile) ++ return NULL; + +- d_tracer = tracing_init_dentry(); ++ return p; ++} + +- entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, +- (void *)&ftrace_annotated_branch_pos, +- &tracing_branch_fops); +- if (!entry) +- pr_warning("Could not create debugfs " +- "'profile_annotatet_branch' entry\n"); ++static struct 
tracer_stat all_branch_stats = { ++ .name = "branch_all", ++ .stat_start = all_branch_stat_start, ++ .stat_next = all_branch_stat_next, ++ .stat_headers = all_branch_stat_headers, ++ .stat_show = branch_stat_show ++}; + +-#ifdef CONFIG_PROFILE_ALL_BRANCHES +- entry = debugfs_create_file("profile_branch", 0444, d_tracer, +- (void *)&ftrace_branch_pos, +- &tracing_branch_fops); +- if (!entry) +- pr_warning("Could not create debugfs" +- " 'profile_branch' entry\n"); +-#endif ++__init static int all_annotated_branch_stats(void) ++{ ++ int ret; + ++ ret = register_stat_tracer(&all_branch_stats); ++ if (!ret) { ++ printk(KERN_WARNING "Warning: could not register " ++ "all branches stats\n"); ++ return 1; ++ } + return 0; + } +- +-device_initcall(ftrace_branch_init); ++fs_initcall(all_annotated_branch_stats); ++#endif /* CONFIG_PROFILE_ALL_BRANCHES */ +Index: linux-2.6-tip/kernel/trace/trace_clock.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_clock.c +@@ -0,0 +1,109 @@ ++/* ++ * tracing clocks ++ * ++ * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar ++ * ++ * Implements 3 trace clock variants, with differing scalability/precision ++ * tradeoffs: ++ * ++ * - local: CPU-local trace clock ++ * - medium: scalable global clock with some jitter ++ * - global: globally monotonic, serialized clock ++ * ++ * Tracer plugins will chose a default from these clocks. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * trace_clock_local(): the simplest and least coherent tracing clock. ++ * ++ * Useful for tracing that does not cross to other CPUs nor ++ * does it go through idle events. ++ */ ++u64 notrace trace_clock_local(void) ++{ ++ unsigned long flags; ++ u64 clock; ++ ++ /* ++ * sched_clock() is an architecture implemented, fast, scalable, ++ * lockless clock. It is not guaranteed to be coherent across ++ * CPUs, nor across CPU idle events. ++ */ ++ raw_local_irq_save(flags); ++ clock = sched_clock(); ++ raw_local_irq_restore(flags); ++ ++ return clock; ++} ++ ++/* ++ * trace_clock(): 'inbetween' trace clock. Not completely serialized, ++ * but not completely incorrect when crossing CPUs either. ++ * ++ * This is based on cpu_clock(), which will allow at most ~1 jiffy of ++ * jitter between CPUs. So it's a pretty scalable clock, but there ++ * can be offsets in the trace data. ++ */ ++u64 notrace trace_clock(void) ++{ ++ return cpu_clock(raw_smp_processor_id()); ++} ++ ++ ++/* ++ * trace_clock_global(): special globally coherent trace clock ++ * ++ * It has higher overhead than the other trace clocks but is still ++ * an order of magnitude faster than GTOD derived hardware clocks. ++ * ++ * Used by plugins that need globally coherent timestamps. ++ */ ++ ++static u64 prev_trace_clock_time; ++ ++static __raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = ++ __RAW_SPIN_LOCK_UNLOCKED; ++ ++u64 notrace trace_clock_global(void) ++{ ++ unsigned long flags; ++ int this_cpu; ++ u64 now; ++ ++ raw_local_irq_save(flags); ++ ++ this_cpu = raw_smp_processor_id(); ++ now = cpu_clock(this_cpu); ++ /* ++ * If in an NMI context then dont risk lockups and return the ++ * cpu_clock() time: ++ */ ++ if (unlikely(in_nmi())) ++ goto out; ++ ++ __raw_spin_lock(&trace_clock_lock); ++ ++ /* ++ * TODO: if this happens often then maybe we should reset ++ * my_scd->clock to prev_trace_clock_time+1, to make sure ++ * we start ticking with the local clock from now on? 
++ */ ++ if ((s64)(now - prev_trace_clock_time) < 0) ++ now = prev_trace_clock_time + 1; ++ ++ prev_trace_clock_time = now; ++ ++ __raw_spin_unlock(&trace_clock_lock); ++ ++ out: ++ raw_local_irq_restore(flags); ++ ++ return now; ++} +Index: linux-2.6-tip/kernel/trace/trace_event_profile.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_event_profile.c +@@ -0,0 +1,31 @@ ++/* ++ * trace event based perf counter profiling ++ * ++ * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra ++ * ++ */ ++ ++#include "trace.h" ++ ++int ftrace_profile_enable(int event_id) ++{ ++ struct ftrace_event_call *event; ++ ++ for_each_event(event) { ++ if (event->id == event_id) ++ return event->profile_enable(event); ++ } ++ ++ return -EINVAL; ++} ++ ++void ftrace_profile_disable(int event_id) ++{ ++ struct ftrace_event_call *event; ++ ++ for_each_event(event) { ++ if (event->id == event_id) ++ return event->profile_disable(event); ++ } ++} ++ +Index: linux-2.6-tip/kernel/trace/trace_event_types.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_event_types.h +@@ -0,0 +1,173 @@ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM ftrace ++ ++/* ++ * We cheat and use the proto type field as the ID ++ * and args as the entry type (minus 'struct') ++ */ ++TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned long, ip, ip) ++ TRACE_FIELD(unsigned long, parent_ip, parent_ip) ++ ), ++ TP_RAW_FMT(" %lx <-- %lx") ++); ++ ++TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, ++ ftrace_graph_ent_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned long, graph_ent.func, func) ++ TRACE_FIELD(int, graph_ent.depth, depth) ++ ), ++ TP_RAW_FMT("--> %lx (%d)") ++); ++ ++TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, ++ ftrace_graph_ret_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned long, ret.func, func) ++ TRACE_FIELD(int, ret.depth, depth) ++ ), ++ TP_RAW_FMT("<-- %lx (%d)") ++); ++ ++TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned int, prev_pid, prev_pid) ++ TRACE_FIELD(unsigned char, prev_prio, prev_prio) ++ TRACE_FIELD(unsigned char, prev_state, prev_state) ++ TRACE_FIELD(unsigned int, next_pid, next_pid) ++ TRACE_FIELD(unsigned char, next_prio, next_prio) ++ TRACE_FIELD(unsigned char, next_state, next_state) ++ TRACE_FIELD(unsigned int, next_cpu, next_cpu) ++ ), ++ TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ++); ++ ++TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned int, prev_pid, prev_pid) ++ TRACE_FIELD(unsigned char, prev_prio, prev_prio) ++ TRACE_FIELD(unsigned char, prev_state, prev_state) ++ TRACE_FIELD(unsigned int, next_pid, next_pid) ++ TRACE_FIELD(unsigned char, next_prio, next_prio) ++ TRACE_FIELD(unsigned char, next_state, next_state) ++ TRACE_FIELD(unsigned int, next_cpu, next_cpu) ++ ), ++ TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ++); ++ ++TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned long, arg1, arg1) ++ TRACE_FIELD(unsigned long, arg2, arg2) ++ TRACE_FIELD(unsigned long, arg3, arg3) ++ ), ++ TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") ++); ++ ++/* ++ * Stack-trace entry: ++ */ ++ ++/* #define FTRACE_STACK_ENTRIES 8 */ ++ ++TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned long, 
caller[0], stack0) ++ TRACE_FIELD(unsigned long, caller[1], stack1) ++ TRACE_FIELD(unsigned long, caller[2], stack2) ++ TRACE_FIELD(unsigned long, caller[3], stack3) ++ TRACE_FIELD(unsigned long, caller[4], stack4) ++ TRACE_FIELD(unsigned long, caller[5], stack5) ++ TRACE_FIELD(unsigned long, caller[6], stack6) ++ TRACE_FIELD(unsigned long, caller[7], stack7) ++ ), ++ TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" ++ "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ++); ++ ++TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned long, caller[0], stack0) ++ TRACE_FIELD(unsigned long, caller[1], stack1) ++ TRACE_FIELD(unsigned long, caller[2], stack2) ++ TRACE_FIELD(unsigned long, caller[3], stack3) ++ TRACE_FIELD(unsigned long, caller[4], stack4) ++ TRACE_FIELD(unsigned long, caller[5], stack5) ++ TRACE_FIELD(unsigned long, caller[6], stack6) ++ TRACE_FIELD(unsigned long, caller[7], stack7) ++ ), ++ TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" ++ "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ++); ++ ++TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned long, ip, ip) ++ TRACE_FIELD(char *, fmt, fmt) ++ TRACE_FIELD_ZERO_CHAR(buf) ++ ), ++ TP_RAW_FMT("%08lx (%d) fmt:%p %s") ++); ++ ++TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned long, ip, ip) ++ TRACE_FIELD_ZERO_CHAR(buf) ++ ), ++ TP_RAW_FMT("%08lx (%d) fmt:%p %s") ++); ++ ++TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(unsigned int, line, line) ++ TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) ++ TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) ++ TRACE_FIELD(char, correct, correct) ++ ), ++ TP_RAW_FMT("%u:%s:%s (%u)") ++); ++ ++TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(u64, from, from) ++ TRACE_FIELD(u64, to, to) ++ ), ++ TP_RAW_FMT("from: %llx to: %llx") ++); ++ ++TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(ktime_t, state_data.stamp, stamp) ++ TRACE_FIELD(ktime_t, state_data.end, end) ++ TRACE_FIELD(int, state_data.type, type) ++ TRACE_FIELD(int, state_data.state, state) ++ ), ++ TP_RAW_FMT("%llx->%llx type:%u state:%u") ++); ++ ++TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) ++ TRACE_FIELD(unsigned long, call_site, call_site) ++ TRACE_FIELD(const void *, ptr, ptr) ++ TRACE_FIELD(size_t, bytes_req, bytes_req) ++ TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) ++ TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) ++ TRACE_FIELD(int, node, node) ++ ), ++ TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" ++ " flags:%x node:%d") ++); ++ ++TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, ++ TRACE_STRUCT( ++ TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) ++ TRACE_FIELD(unsigned long, call_site, call_site) ++ TRACE_FIELD(const void *, ptr, ptr) ++ ), ++ TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ++); ++ ++#undef TRACE_SYSTEM +Index: linux-2.6-tip/kernel/trace/trace_events.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_events.c +@@ -0,0 +1,832 @@ ++/* ++ * event tracer ++ * ++ * Copyright (C) 2008 Red Hat Inc, 
Steven Rostedt ++ * ++ * - Added format output of fields of the trace point. ++ * This was based off of work by Tom Zanussi . ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "trace_output.h" ++ ++#define TRACE_SYSTEM "TRACE_SYSTEM" ++ ++static DEFINE_MUTEX(event_mutex); ++ ++int trace_define_field(struct ftrace_event_call *call, char *type, ++ char *name, int offset, int size) ++{ ++ struct ftrace_event_field *field; ++ ++ field = kzalloc(sizeof(*field), GFP_KERNEL); ++ if (!field) ++ goto err; ++ ++ field->name = kstrdup(name, GFP_KERNEL); ++ if (!field->name) ++ goto err; ++ ++ field->type = kstrdup(type, GFP_KERNEL); ++ if (!field->type) ++ goto err; ++ ++ field->offset = offset; ++ field->size = size; ++ list_add(&field->link, &call->fields); ++ ++ return 0; ++ ++err: ++ if (field) { ++ kfree(field->name); ++ kfree(field->type); ++ } ++ kfree(field); ++ ++ return -ENOMEM; ++} ++ ++static void ftrace_clear_events(void) ++{ ++ struct ftrace_event_call *call = (void *)__start_ftrace_events; ++ ++ ++ while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { ++ ++ if (call->enabled) { ++ call->enabled = 0; ++ call->unregfunc(); ++ } ++ call++; ++ } ++} ++ ++static void ftrace_event_enable_disable(struct ftrace_event_call *call, ++ int enable) ++{ ++ ++ switch (enable) { ++ case 0: ++ if (call->enabled) { ++ call->enabled = 0; ++ call->unregfunc(); ++ } ++ break; ++ case 1: ++ if (!call->enabled) { ++ call->enabled = 1; ++ call->regfunc(); ++ } ++ break; ++ } ++} ++ ++static int ftrace_set_clr_event(char *buf, int set) ++{ ++ struct ftrace_event_call *call = __start_ftrace_events; ++ char *event = NULL, *sub = NULL, *match; ++ int ret = -EINVAL; ++ ++ /* ++ * The buf format can be : ++ * *: means any event by that name. ++ * : is the same. ++ * ++ * :* means all events in that subsystem ++ * : means the same. ++ * ++ * (no ':') means all events in a subsystem with ++ * the name or any event that matches ++ */ ++ ++ match = strsep(&buf, ":"); ++ if (buf) { ++ sub = match; ++ event = buf; ++ match = NULL; ++ ++ if (!strlen(sub) || strcmp(sub, "*") == 0) ++ sub = NULL; ++ if (!strlen(event) || strcmp(event, "*") == 0) ++ event = NULL; ++ } ++ ++ mutex_lock(&event_mutex); ++ for_each_event(call) { ++ ++ if (!call->name || !call->regfunc) ++ continue; ++ ++ if (match && ++ strcmp(match, call->name) != 0 && ++ strcmp(match, call->system) != 0) ++ continue; ++ ++ if (sub && strcmp(sub, call->system) != 0) ++ continue; ++ ++ if (event && strcmp(event, call->name) != 0) ++ continue; ++ ++ ftrace_event_enable_disable(call, set); ++ ++ ret = 0; ++ } ++ mutex_unlock(&event_mutex); ++ ++ return ret; ++} ++ ++/* 128 should be much more than enough */ ++#define EVENT_BUF_SIZE 127 ++ ++static ssize_t ++ftrace_event_write(struct file *file, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ size_t read = 0; ++ int i, set = 1; ++ ssize_t ret; ++ char *buf; ++ char ch; ++ ++ if (!cnt || cnt < 0) ++ return 0; ++ ++ ret = tracing_update_buffers(); ++ if (ret < 0) ++ return ret; ++ ++ ret = get_user(ch, ubuf++); ++ if (ret) ++ return ret; ++ read++; ++ cnt--; ++ ++ /* skip white space */ ++ while (cnt && isspace(ch)) { ++ ret = get_user(ch, ubuf++); ++ if (ret) ++ return ret; ++ read++; ++ cnt--; ++ } ++ ++ /* Only white space found? 
*/ ++ if (isspace(ch)) { ++ file->f_pos += read; ++ ret = read; ++ return ret; ++ } ++ ++ buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ if (cnt > EVENT_BUF_SIZE) ++ cnt = EVENT_BUF_SIZE; ++ ++ i = 0; ++ while (cnt && !isspace(ch)) { ++ if (!i && ch == '!') ++ set = 0; ++ else ++ buf[i++] = ch; ++ ++ ret = get_user(ch, ubuf++); ++ if (ret) ++ goto out_free; ++ read++; ++ cnt--; ++ } ++ buf[i] = 0; ++ ++ file->f_pos += read; ++ ++ ret = ftrace_set_clr_event(buf, set); ++ if (ret) ++ goto out_free; ++ ++ ret = read; ++ ++ out_free: ++ kfree(buf); ++ ++ return ret; ++} ++ ++static void * ++t_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct ftrace_event_call *call = m->private; ++ struct ftrace_event_call *next = call; ++ ++ (*pos)++; ++ ++ for (;;) { ++ if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) ++ return NULL; ++ ++ /* ++ * The ftrace subsystem is for showing formats only. ++ * They can not be enabled or disabled via the event files. ++ */ ++ if (call->regfunc) ++ break; ++ ++ call++; ++ next = call; ++ } ++ ++ m->private = ++next; ++ ++ return call; ++} ++ ++static void *t_start(struct seq_file *m, loff_t *pos) ++{ ++ return t_next(m, NULL, pos); ++} ++ ++static void * ++s_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct ftrace_event_call *call = m->private; ++ struct ftrace_event_call *next; ++ ++ (*pos)++; ++ ++ retry: ++ if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) ++ return NULL; ++ ++ if (!call->enabled) { ++ call++; ++ goto retry; ++ } ++ ++ next = call; ++ m->private = ++next; ++ ++ return call; ++} ++ ++static void *s_start(struct seq_file *m, loff_t *pos) ++{ ++ return s_next(m, NULL, pos); ++} ++ ++static int t_show(struct seq_file *m, void *v) ++{ ++ struct ftrace_event_call *call = v; ++ ++ if (strcmp(call->system, TRACE_SYSTEM) != 0) ++ seq_printf(m, "%s:", call->system); ++ seq_printf(m, "%s\n", call->name); ++ ++ return 0; ++} ++ ++static void t_stop(struct seq_file *m, void *p) ++{ ++} ++ ++static int ++ftrace_event_seq_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ const struct seq_operations *seq_ops; ++ ++ if ((file->f_mode & FMODE_WRITE) && ++ !(file->f_flags & O_APPEND)) ++ ftrace_clear_events(); ++ ++ seq_ops = inode->i_private; ++ ret = seq_open(file, seq_ops); ++ if (!ret) { ++ struct seq_file *m = file->private_data; ++ ++ m->private = __start_ftrace_events; ++ } ++ return ret; ++} ++ ++static ssize_t ++event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct ftrace_event_call *call = filp->private_data; ++ char *buf; ++ ++ if (call->enabled) ++ buf = "1\n"; ++ else ++ buf = "0\n"; ++ ++ return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); ++} ++ ++static ssize_t ++event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct ftrace_event_call *call = filp->private_data; ++ char buf[64]; ++ unsigned long val; ++ int ret; ++ ++ if (cnt >= sizeof(buf)) ++ return -EINVAL; ++ ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; ++ ++ buf[cnt] = 0; ++ ++ ret = strict_strtoul(buf, 10, &val); ++ if (ret < 0) ++ return ret; ++ ++ ret = tracing_update_buffers(); ++ if (ret < 0) ++ return ret; ++ ++ switch (val) { ++ case 0: ++ case 1: ++ mutex_lock(&event_mutex); ++ ftrace_event_enable_disable(call, val); ++ mutex_unlock(&event_mutex); ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ ++ *ppos += cnt; ++ ++ return cnt; ++} ++ ++#undef FIELD ++#define FIELD(type, 
name) \ ++ #type, "common_" #name, offsetof(typeof(field), name), \ ++ sizeof(field.name) ++ ++static int trace_write_header(struct trace_seq *s) ++{ ++ struct trace_entry field; ++ ++ /* struct trace_entry */ ++ return trace_seq_printf(s, ++ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" ++ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" ++ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" ++ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" ++ "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" ++ "\n", ++ FIELD(unsigned char, type), ++ FIELD(unsigned char, flags), ++ FIELD(unsigned char, preempt_count), ++ FIELD(int, pid), ++ FIELD(int, tgid)); ++} ++ ++static ssize_t ++event_format_read(struct file *filp, char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct ftrace_event_call *call = filp->private_data; ++ struct trace_seq *s; ++ char *buf; ++ int r; ++ ++ if (*ppos) ++ return 0; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ trace_seq_init(s); ++ ++ /* If any of the first writes fail, so will the show_format. */ ++ ++ trace_seq_printf(s, "name: %s\n", call->name); ++ trace_seq_printf(s, "ID: %d\n", call->id); ++ trace_seq_printf(s, "format:\n"); ++ trace_write_header(s); ++ ++ r = call->show_format(s); ++ if (!r) { ++ /* ++ * ug! The format output is bigger than a PAGE!! ++ */ ++ buf = "FORMAT TOO BIG\n"; ++ r = simple_read_from_buffer(ubuf, cnt, ppos, ++ buf, strlen(buf)); ++ goto out; ++ } ++ ++ r = simple_read_from_buffer(ubuf, cnt, ppos, ++ s->buffer, s->len); ++ out: ++ kfree(s); ++ return r; ++} ++ ++static ssize_t ++event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) ++{ ++ struct ftrace_event_call *call = filp->private_data; ++ struct trace_seq *s; ++ int r; ++ ++ if (*ppos) ++ return 0; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ trace_seq_init(s); ++ trace_seq_printf(s, "%d\n", call->id); ++ ++ r = simple_read_from_buffer(ubuf, cnt, ppos, ++ s->buffer, s->len); ++ kfree(s); ++ return r; ++} ++ ++static ssize_t ++event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct ftrace_event_call *call = filp->private_data; ++ struct trace_seq *s; ++ int r; ++ ++ if (*ppos) ++ return 0; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ trace_seq_init(s); ++ ++ filter_print_preds(call->preds, s); ++ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); ++ ++ kfree(s); ++ ++ return r; ++} ++ ++static ssize_t ++event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct ftrace_event_call *call = filp->private_data; ++ char buf[64], *pbuf = buf; ++ struct filter_pred *pred; ++ int err; ++ ++ if (cnt >= sizeof(buf)) ++ return -EINVAL; ++ ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; ++ ++ pred = kzalloc(sizeof(*pred), GFP_KERNEL); ++ if (!pred) ++ return -ENOMEM; ++ ++ err = filter_parse(&pbuf, pred); ++ if (err < 0) { ++ filter_free_pred(pred); ++ return err; ++ } ++ ++ if (pred->clear) { ++ filter_free_preds(call); ++ filter_free_pred(pred); ++ return cnt; ++ } ++ ++ if (filter_add_pred(call, pred)) { ++ filter_free_pred(pred); ++ return -EINVAL; ++ } ++ ++ *ppos += cnt; ++ ++ return cnt; ++} ++ ++static ssize_t ++subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct event_subsystem *system = filp->private_data; ++ struct trace_seq *s; ++ int r; ++ ++ if (*ppos) ++ return 0; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ 
return -ENOMEM; ++ ++ trace_seq_init(s); ++ ++ filter_print_preds(system->preds, s); ++ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); ++ ++ kfree(s); ++ ++ return r; ++} ++ ++static ssize_t ++subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, ++ loff_t *ppos) ++{ ++ struct event_subsystem *system = filp->private_data; ++ char buf[64], *pbuf = buf; ++ struct filter_pred *pred; ++ int err; ++ ++ if (cnt >= sizeof(buf)) ++ return -EINVAL; ++ ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; ++ ++ pred = kzalloc(sizeof(*pred), GFP_KERNEL); ++ if (!pred) ++ return -ENOMEM; ++ ++ err = filter_parse(&pbuf, pred); ++ if (err < 0) { ++ filter_free_pred(pred); ++ return err; ++ } ++ ++ if (pred->clear) { ++ filter_free_subsystem_preds(system); ++ filter_free_pred(pred); ++ return cnt; ++ } ++ ++ if (filter_add_subsystem_pred(system, pred)) { ++ filter_free_subsystem_preds(system); ++ filter_free_pred(pred); ++ return -EINVAL; ++ } ++ ++ *ppos += cnt; ++ ++ return cnt; ++} ++ ++static const struct seq_operations show_event_seq_ops = { ++ .start = t_start, ++ .next = t_next, ++ .show = t_show, ++ .stop = t_stop, ++}; ++ ++static const struct seq_operations show_set_event_seq_ops = { ++ .start = s_start, ++ .next = s_next, ++ .show = t_show, ++ .stop = t_stop, ++}; ++ ++static const struct file_operations ftrace_avail_fops = { ++ .open = ftrace_event_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static const struct file_operations ftrace_set_event_fops = { ++ .open = ftrace_event_seq_open, ++ .read = seq_read, ++ .write = ftrace_event_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static const struct file_operations ftrace_enable_fops = { ++ .open = tracing_open_generic, ++ .read = event_enable_read, ++ .write = event_enable_write, ++}; ++ ++static const struct file_operations ftrace_event_format_fops = { ++ .open = tracing_open_generic, ++ .read = event_format_read, ++}; ++ ++static const struct file_operations ftrace_event_id_fops = { ++ .open = tracing_open_generic, ++ .read = event_id_read, ++}; ++ ++static const struct file_operations ftrace_event_filter_fops = { ++ .open = tracing_open_generic, ++ .read = event_filter_read, ++ .write = event_filter_write, ++}; ++ ++static const struct file_operations ftrace_subsystem_filter_fops = { ++ .open = tracing_open_generic, ++ .read = subsystem_filter_read, ++ .write = subsystem_filter_write, ++}; ++ ++static struct dentry *event_trace_events_dir(void) ++{ ++ static struct dentry *d_tracer; ++ static struct dentry *d_events; ++ ++ if (d_events) ++ return d_events; ++ ++ d_tracer = tracing_init_dentry(); ++ if (!d_tracer) ++ return NULL; ++ ++ d_events = debugfs_create_dir("events", d_tracer); ++ if (!d_events) ++ pr_warning("Could not create debugfs " ++ "'events' directory\n"); ++ ++ return d_events; ++} ++ ++static LIST_HEAD(event_subsystems); ++ ++static struct dentry * ++event_subsystem_dir(const char *name, struct dentry *d_events) ++{ ++ struct event_subsystem *system; ++ struct dentry *entry; ++ ++ /* First see if we did not already create this dir */ ++ list_for_each_entry(system, &event_subsystems, list) { ++ if (strcmp(system->name, name) == 0) ++ return system->entry; ++ } ++ ++ /* need to create new entry */ ++ system = kmalloc(sizeof(*system), GFP_KERNEL); ++ if (!system) { ++ pr_warning("No memory to create event subsystem %s\n", ++ name); ++ return d_events; ++ } ++ ++ system->entry = debugfs_create_dir(name, d_events); 
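
The per-event and per-subsystem "filter" files wired up in this hunk accept the small grammar that filter_parse() implements: "field == value" or "field != value", optionally chained with "&&" / "||", and a bare "0" to clear every predicate; the sibling "enable" file switches the event on and off. The following is only a rough userspace sketch of driving that interface, not part of the patch itself: the debugfs mount point and the sched/sched_switch event with its next_pid field are assumed examples.

/*
 * Userspace sketch only: write a predicate to an event's "filter" file
 * and turn the event on via "enable".  The debugfs path and the
 * sched:sched_switch / next_pid names are assumptions for illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);
        ssize_t n;

        if (fd < 0) {
                perror(path);
                return -1;
        }
        n = write(fd, val, strlen(val));
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        const char *ev = "/sys/kernel/debug/tracing/events/sched/sched_switch";
        char path[256];

        /* filter_parse() grammar: "field ==/!= value", chained with && or || */
        snprintf(path, sizeof(path), "%s/filter", ev);
        if (write_str(path, "next_pid == 1\n"))
                return 1;

        /* writing "0" instead would set pred->clear and drop all predicates */

        snprintf(path, sizeof(path), "%s/enable", ev);
        return write_str(path, "1\n") ? 1 : 0;
}
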
++ if (!system->entry) { ++ pr_warning("Could not create event subsystem %s\n", ++ name); ++ kfree(system); ++ return d_events; ++ } ++ ++ system->name = name; ++ list_add(&system->list, &event_subsystems); ++ ++ system->preds = NULL; ++ ++ entry = debugfs_create_file("filter", 0644, system->entry, system, ++ &ftrace_subsystem_filter_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs " ++ "'%s/filter' entry\n", name); ++ ++ return system->entry; ++} ++ ++static int ++event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) ++{ ++ struct dentry *entry; ++ int ret; ++ ++ /* ++ * If the trace point header did not define TRACE_SYSTEM ++ * then the system would be called "TRACE_SYSTEM". ++ */ ++ if (strcmp(call->system, "TRACE_SYSTEM") != 0) ++ d_events = event_subsystem_dir(call->system, d_events); ++ ++ if (call->raw_init) { ++ ret = call->raw_init(); ++ if (ret < 0) { ++ pr_warning("Could not initialize trace point" ++ " events/%s\n", call->name); ++ return ret; ++ } ++ } ++ ++ call->dir = debugfs_create_dir(call->name, d_events); ++ if (!call->dir) { ++ pr_warning("Could not create debugfs " ++ "'%s' directory\n", call->name); ++ return -1; ++ } ++ ++ if (call->regfunc) { ++ entry = debugfs_create_file("enable", 0644, call->dir, call, ++ &ftrace_enable_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs " ++ "'%s/enable' entry\n", call->name); ++ } ++ ++ if (call->id) { ++ entry = debugfs_create_file("id", 0444, call->dir, call, ++ &ftrace_event_id_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs '%s/id' entry\n", ++ call->name); ++ } ++ ++ if (call->define_fields) { ++ ret = call->define_fields(); ++ if (ret < 0) { ++ pr_warning("Could not initialize trace point" ++ " events/%s\n", call->name); ++ return ret; ++ } ++ } ++ ++ entry = debugfs_create_file("filter", 0644, call->dir, call, ++ &ftrace_event_filter_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs " ++ "'%s/filter' entry\n", call->name); ++ ++ /* A trace may not want to export its format */ ++ if (!call->show_format) ++ return 0; ++ ++ entry = debugfs_create_file("format", 0444, call->dir, call, ++ &ftrace_event_format_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs " ++ "'%s/format' entry\n", call->name); ++ ++ return 0; ++} ++ ++static __init int event_trace_init(void) ++{ ++ struct ftrace_event_call *call = __start_ftrace_events; ++ struct dentry *d_tracer; ++ struct dentry *entry; ++ struct dentry *d_events; ++ ++ d_tracer = tracing_init_dentry(); ++ if (!d_tracer) ++ return 0; ++ ++ entry = debugfs_create_file("available_events", 0444, d_tracer, ++ (void *)&show_event_seq_ops, ++ &ftrace_avail_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs " ++ "'available_events' entry\n"); ++ ++ entry = debugfs_create_file("set_event", 0644, d_tracer, ++ (void *)&show_set_event_seq_ops, ++ &ftrace_set_event_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs " ++ "'set_event' entry\n"); ++ ++ d_events = event_trace_events_dir(); ++ if (!d_events) ++ return 0; ++ ++ for_each_event(call) { ++ /* The linker may leave blanks */ ++ if (!call->name) ++ continue; ++ event_create_dir(call, d_events); ++ } ++ ++ return 0; ++} ++fs_initcall(event_trace_init); +Index: linux-2.6-tip/kernel/trace/trace_events_filter.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_events_filter.c +@@ -0,0 +1,427 @@ ++/* ++ * trace_events_filter - generic event filtering ++ * ++ * This program is 
free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++ * ++ * Copyright (C) 2009 Tom Zanussi ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "trace.h" ++#include "trace_output.h" ++ ++static int filter_pred_64(struct filter_pred *pred, void *event) ++{ ++ u64 *addr = (u64 *)(event + pred->offset); ++ u64 val = (u64)pred->val; ++ int match; ++ ++ match = (val == *addr) ^ pred->not; ++ ++ return match; ++} ++ ++static int filter_pred_32(struct filter_pred *pred, void *event) ++{ ++ u32 *addr = (u32 *)(event + pred->offset); ++ u32 val = (u32)pred->val; ++ int match; ++ ++ match = (val == *addr) ^ pred->not; ++ ++ return match; ++} ++ ++static int filter_pred_16(struct filter_pred *pred, void *event) ++{ ++ u16 *addr = (u16 *)(event + pred->offset); ++ u16 val = (u16)pred->val; ++ int match; ++ ++ match = (val == *addr) ^ pred->not; ++ ++ return match; ++} ++ ++static int filter_pred_8(struct filter_pred *pred, void *event) ++{ ++ u8 *addr = (u8 *)(event + pred->offset); ++ u8 val = (u8)pred->val; ++ int match; ++ ++ match = (val == *addr) ^ pred->not; ++ ++ return match; ++} ++ ++static int filter_pred_string(struct filter_pred *pred, void *event) ++{ ++ char *addr = (char *)(event + pred->offset); ++ int cmp, match; ++ ++ cmp = strncmp(addr, pred->str_val, pred->str_len); ++ ++ match = (!cmp) ^ pred->not; ++ ++ return match; ++} ++ ++/* return 1 if event matches, 0 otherwise (discard) */ ++int filter_match_preds(struct ftrace_event_call *call, void *rec) ++{ ++ int i, matched, and_failed = 0; ++ struct filter_pred *pred; ++ ++ for (i = 0; i < MAX_FILTER_PRED; i++) { ++ if (call->preds[i]) { ++ pred = call->preds[i]; ++ if (and_failed && !pred->or) ++ continue; ++ matched = pred->fn(pred, rec); ++ if (!matched && !pred->or) { ++ and_failed = 1; ++ continue; ++ } else if (matched && pred->or) ++ return 1; ++ } else ++ break; ++ } ++ ++ if (and_failed) ++ return 0; ++ ++ return 1; ++} ++ ++void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) ++{ ++ char *field_name; ++ struct filter_pred *pred; ++ int i; ++ ++ if (!preds) { ++ trace_seq_printf(s, "none\n"); ++ return; ++ } ++ ++ for (i = 0; i < MAX_FILTER_PRED; i++) { ++ if (preds[i]) { ++ pred = preds[i]; ++ field_name = pred->field_name; ++ if (i) ++ trace_seq_printf(s, pred->or ? "|| " : "&& "); ++ trace_seq_printf(s, "%s ", field_name); ++ trace_seq_printf(s, pred->not ? 
"!= " : "== "); ++ if (pred->str_val) ++ trace_seq_printf(s, "%s\n", pred->str_val); ++ else ++ trace_seq_printf(s, "%llu\n", pred->val); ++ } else ++ break; ++ } ++} ++ ++static struct ftrace_event_field * ++find_event_field(struct ftrace_event_call *call, char *name) ++{ ++ struct ftrace_event_field *field; ++ ++ list_for_each_entry(field, &call->fields, link) { ++ if (!strcmp(field->name, name)) ++ return field; ++ } ++ ++ return NULL; ++} ++ ++void filter_free_pred(struct filter_pred *pred) ++{ ++ if (!pred) ++ return; ++ ++ kfree(pred->field_name); ++ kfree(pred->str_val); ++ kfree(pred); ++} ++ ++void filter_free_preds(struct ftrace_event_call *call) ++{ ++ int i; ++ ++ if (call->preds) { ++ for (i = 0; i < MAX_FILTER_PRED; i++) ++ filter_free_pred(call->preds[i]); ++ kfree(call->preds); ++ call->preds = NULL; ++ } ++} ++ ++void filter_free_subsystem_preds(struct event_subsystem *system) ++{ ++ struct ftrace_event_call *call = __start_ftrace_events; ++ int i; ++ ++ if (system->preds) { ++ for (i = 0; i < MAX_FILTER_PRED; i++) ++ filter_free_pred(system->preds[i]); ++ kfree(system->preds); ++ system->preds = NULL; ++ } ++ ++ events_for_each(call) { ++ if (!call->name || !call->regfunc) ++ continue; ++ ++ if (!strcmp(call->system, system->name)) ++ filter_free_preds(call); ++ } ++} ++ ++static int __filter_add_pred(struct ftrace_event_call *call, ++ struct filter_pred *pred) ++{ ++ int i; ++ ++ if (call->preds && !pred->compound) ++ filter_free_preds(call); ++ ++ if (!call->preds) { ++ call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), ++ GFP_KERNEL); ++ if (!call->preds) ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < MAX_FILTER_PRED; i++) { ++ if (!call->preds[i]) { ++ call->preds[i] = pred; ++ return 0; ++ } ++ } ++ ++ return -ENOMEM; ++} ++ ++static int is_string_field(const char *type) ++{ ++ if (strchr(type, '[') && strstr(type, "char")) ++ return 1; ++ ++ return 0; ++} ++ ++int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) ++{ ++ struct ftrace_event_field *field; ++ ++ field = find_event_field(call, pred->field_name); ++ if (!field) ++ return -EINVAL; ++ ++ pred->offset = field->offset; ++ ++ if (is_string_field(field->type)) { ++ if (!pred->str_val) ++ return -EINVAL; ++ pred->fn = filter_pred_string; ++ pred->str_len = field->size; ++ return __filter_add_pred(call, pred); ++ } else { ++ if (pred->str_val) ++ return -EINVAL; ++ } ++ ++ switch (field->size) { ++ case 8: ++ pred->fn = filter_pred_64; ++ break; ++ case 4: ++ pred->fn = filter_pred_32; ++ break; ++ case 2: ++ pred->fn = filter_pred_16; ++ break; ++ case 1: ++ pred->fn = filter_pred_8; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return __filter_add_pred(call, pred); ++} ++ ++static struct filter_pred *copy_pred(struct filter_pred *pred) ++{ ++ struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); ++ if (!new_pred) ++ return NULL; ++ ++ memcpy(new_pred, pred, sizeof(*pred)); ++ ++ if (pred->field_name) { ++ new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); ++ if (!new_pred->field_name) { ++ kfree(new_pred); ++ return NULL; ++ } ++ } ++ ++ if (pred->str_val) { ++ new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); ++ if (!new_pred->str_val) { ++ filter_free_pred(new_pred); ++ return NULL; ++ } ++ } ++ ++ return new_pred; ++} ++ ++int filter_add_subsystem_pred(struct event_subsystem *system, ++ struct filter_pred *pred) ++{ ++ struct ftrace_event_call *call = __start_ftrace_events; ++ struct filter_pred *event_pred; ++ int i; ++ ++ if (system->preds && 
!pred->compound) ++ filter_free_subsystem_preds(system); ++ ++ if (!system->preds) { ++ system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), ++ GFP_KERNEL); ++ if (!system->preds) ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < MAX_FILTER_PRED; i++) { ++ if (!system->preds[i]) { ++ system->preds[i] = pred; ++ break; ++ } ++ } ++ ++ if (i == MAX_FILTER_PRED) ++ return -EINVAL; ++ ++ events_for_each(call) { ++ int err; ++ ++ if (!call->name || !call->regfunc) ++ continue; ++ ++ if (strcmp(call->system, system->name)) ++ continue; ++ ++ if (!find_event_field(call, pred->field_name)) ++ continue; ++ ++ event_pred = copy_pred(pred); ++ if (!event_pred) ++ goto oom; ++ ++ err = filter_add_pred(call, event_pred); ++ if (err) ++ filter_free_pred(event_pred); ++ if (err == -ENOMEM) ++ goto oom; ++ } ++ ++ return 0; ++ ++oom: ++ system->preds[i] = NULL; ++ return -ENOMEM; ++} ++ ++int filter_parse(char **pbuf, struct filter_pred *pred) ++{ ++ char *tmp, *tok, *val_str = NULL; ++ int tok_n = 0; ++ ++ /* field ==/!= number, or/and field ==/!= number, number */ ++ while ((tok = strsep(pbuf, " \n"))) { ++ if (tok_n == 0) { ++ if (!strcmp(tok, "0")) { ++ pred->clear = 1; ++ return 0; ++ } else if (!strcmp(tok, "&&")) { ++ pred->or = 0; ++ pred->compound = 1; ++ } else if (!strcmp(tok, "||")) { ++ pred->or = 1; ++ pred->compound = 1; ++ } else ++ pred->field_name = tok; ++ tok_n = 1; ++ continue; ++ } ++ if (tok_n == 1) { ++ if (!pred->field_name) ++ pred->field_name = tok; ++ else if (!strcmp(tok, "!=")) ++ pred->not = 1; ++ else if (!strcmp(tok, "==")) ++ pred->not = 0; ++ else { ++ pred->field_name = NULL; ++ return -EINVAL; ++ } ++ tok_n = 2; ++ continue; ++ } ++ if (tok_n == 2) { ++ if (pred->compound) { ++ if (!strcmp(tok, "!=")) ++ pred->not = 1; ++ else if (!strcmp(tok, "==")) ++ pred->not = 0; ++ else { ++ pred->field_name = NULL; ++ return -EINVAL; ++ } ++ } else { ++ val_str = tok; ++ break; /* done */ ++ } ++ tok_n = 3; ++ continue; ++ } ++ if (tok_n == 3) { ++ val_str = tok; ++ break; /* done */ ++ } ++ } ++ ++ pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); ++ if (!pred->field_name) ++ return -ENOMEM; ++ ++ pred->val = simple_strtoull(val_str, &tmp, 10); ++ if (tmp == val_str) { ++ pred->str_val = kstrdup(val_str, GFP_KERNEL); ++ if (!pred->str_val) ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++ +Index: linux-2.6-tip/kernel/trace/trace_events_stage_1.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_events_stage_1.h +@@ -0,0 +1,39 @@ ++/* ++ * Stage 1 of the trace events. ++ * ++ * Override the macros in to include the following: ++ * ++ * struct ftrace_raw_ { ++ * struct trace_entry ent; ++ * ; ++ * []; ++ * [...] ++ * }; ++ * ++ * The is created by the __field(type, item) macro or ++ * the __array(type2, item2, len) macro. ++ * We simply do "type item;", and that will create the fields ++ * in the structure. ++ */ ++ ++#undef TRACE_FORMAT ++#define TRACE_FORMAT(call, proto, args, fmt) ++ ++#undef __array ++#define __array(type, item, len) type item[len]; ++ ++#undef __field ++#define __field(type, item) type item; ++ ++#undef TP_STRUCT__entry ++#define TP_STRUCT__entry(args...) 
args ++ ++#undef TRACE_EVENT ++#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ ++ struct ftrace_raw_##name { \ ++ struct trace_entry ent; \ ++ tstruct \ ++ }; \ ++ static struct ftrace_event_call event_##name ++ ++#include +Index: linux-2.6-tip/kernel/trace/trace_events_stage_2.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_events_stage_2.h +@@ -0,0 +1,176 @@ ++/* ++ * Stage 2 of the trace events. ++ * ++ * Override the macros in to include the following: ++ * ++ * enum print_line_t ++ * ftrace_raw_output_(struct trace_iterator *iter, int flags) ++ * { ++ * struct trace_seq *s = &iter->seq; ++ * struct ftrace_raw_ *field; <-- defined in stage 1 ++ * struct trace_entry *entry; ++ * int ret; ++ * ++ * entry = iter->ent; ++ * ++ * if (entry->type != event_.id) { ++ * WARN_ON_ONCE(1); ++ * return TRACE_TYPE_UNHANDLED; ++ * } ++ * ++ * field = (typeof(field))entry; ++ * ++ * ret = trace_seq_printf(s, "\n"); ++ * if (!ret) ++ * return TRACE_TYPE_PARTIAL_LINE; ++ * ++ * return TRACE_TYPE_HANDLED; ++ * } ++ * ++ * This is the method used to print the raw event to the trace ++ * output format. Note, this is not needed if the data is read ++ * in binary. ++ */ ++ ++#undef __entry ++#define __entry field ++ ++#undef TP_printk ++#define TP_printk(fmt, args...) fmt "\n", args ++ ++#undef TRACE_EVENT ++#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ ++enum print_line_t \ ++ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ ++{ \ ++ struct trace_seq *s = &iter->seq; \ ++ struct ftrace_raw_##call *field; \ ++ struct trace_entry *entry; \ ++ int ret; \ ++ \ ++ entry = iter->ent; \ ++ \ ++ if (entry->type != event_##call.id) { \ ++ WARN_ON_ONCE(1); \ ++ return TRACE_TYPE_UNHANDLED; \ ++ } \ ++ \ ++ field = (typeof(field))entry; \ ++ \ ++ ret = trace_seq_printf(s, #call ": " print); \ ++ if (!ret) \ ++ return TRACE_TYPE_PARTIAL_LINE; \ ++ \ ++ return TRACE_TYPE_HANDLED; \ ++} ++ ++#include ++ ++/* ++ * Setup the showing format of trace point. ++ * ++ * int ++ * ftrace_format_##call(struct trace_seq *s) ++ * { ++ * struct ftrace_raw_##call field; ++ * int ret; ++ * ++ * ret = trace_seq_printf(s, #type " " #item ";" ++ * " offset:%u; size:%u;\n", ++ * offsetof(struct ftrace_raw_##call, item), ++ * sizeof(field.type)); ++ * ++ * } ++ */ ++ ++#undef TP_STRUCT__entry ++#define TP_STRUCT__entry(args...) args ++ ++#undef __field ++#define __field(type, item) \ ++ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ ++ "offset:%u;\tsize:%u;\n", \ ++ (unsigned int)offsetof(typeof(field), item), \ ++ (unsigned int)sizeof(field.item)); \ ++ if (!ret) \ ++ return 0; ++ ++#undef __array ++#define __array(type, item, len) \ ++ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ ++ "offset:%u;\tsize:%u;\n", \ ++ (unsigned int)offsetof(typeof(field), item), \ ++ (unsigned int)sizeof(field.item)); \ ++ if (!ret) \ ++ return 0; ++ ++#undef __entry ++#define __entry "REC" ++ ++#undef TP_printk ++#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args ++ ++#undef TP_fast_assign ++#define TP_fast_assign(args...) 
args ++ ++#undef TRACE_EVENT ++#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ ++static int \ ++ftrace_format_##call(struct trace_seq *s) \ ++{ \ ++ struct ftrace_raw_##call field; \ ++ int ret; \ ++ \ ++ tstruct; \ ++ \ ++ trace_seq_printf(s, "\nprint fmt: " print); \ ++ \ ++ return ret; \ ++} ++ ++#include ++ ++#undef __field ++#define __field(type, item) \ ++ ret = trace_define_field(event_call, #type, #item, \ ++ offsetof(typeof(field), item), \ ++ sizeof(field.item)); \ ++ if (ret) \ ++ return ret; ++ ++#undef __array ++#define __array(type, item, len) \ ++ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ ++ offsetof(typeof(field), item), \ ++ sizeof(field.item)); \ ++ if (ret) \ ++ return ret; ++ ++#define __common_field(type, item) \ ++ ret = trace_define_field(event_call, #type, "common_" #item, \ ++ offsetof(typeof(field.ent), item), \ ++ sizeof(field.ent.item)); \ ++ if (ret) \ ++ return ret; ++ ++#undef TRACE_EVENT ++#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ ++int \ ++ftrace_define_fields_##call(void) \ ++{ \ ++ struct ftrace_raw_##call field; \ ++ struct ftrace_event_call *event_call = &event_##call; \ ++ int ret; \ ++ \ ++ __common_field(unsigned char, type); \ ++ __common_field(unsigned char, flags); \ ++ __common_field(unsigned char, preempt_count); \ ++ __common_field(int, pid); \ ++ __common_field(int, tgid); \ ++ \ ++ tstruct; \ ++ \ ++ return ret; \ ++} ++ ++#include +Index: linux-2.6-tip/kernel/trace/trace_events_stage_3.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_events_stage_3.h +@@ -0,0 +1,281 @@ ++/* ++ * Stage 3 of the trace events. ++ * ++ * Override the macros in to include the following: ++ * ++ * static void ftrace_event_(proto) ++ * { ++ * event_trace_printk(_RET_IP_, ": " ); ++ * } ++ * ++ * static int ftrace_reg_event_(void) ++ * { ++ * int ret; ++ * ++ * ret = register_trace_(ftrace_event_); ++ * if (!ret) ++ * pr_info("event trace: Could not activate trace point " ++ * "probe to "); ++ * return ret; ++ * } ++ * ++ * static void ftrace_unreg_event_(void) ++ * { ++ * unregister_trace_(ftrace_event_); ++ * } ++ * ++ * For those macros defined with TRACE_FORMAT: ++ * ++ * static struct ftrace_event_call __used ++ * __attribute__((__aligned__(4))) ++ * __attribute__((section("_ftrace_events"))) event_ = { ++ * .name = "", ++ * .regfunc = ftrace_reg_event_, ++ * .unregfunc = ftrace_unreg_event_, ++ * } ++ * ++ * ++ * For those macros defined with TRACE_EVENT: ++ * ++ * static struct ftrace_event_call event_; ++ * ++ * static void ftrace_raw_event_(proto) ++ * { ++ * struct ring_buffer_event *event; ++ * struct ftrace_raw_ *entry; <-- defined in stage 1 ++ * unsigned long irq_flags; ++ * int pc; ++ * ++ * local_save_flags(irq_flags); ++ * pc = preempt_count(); ++ * ++ * event = trace_current_buffer_lock_reserve(event_.id, ++ * sizeof(struct ftrace_raw_), ++ * irq_flags, pc); ++ * if (!event) ++ * return; ++ * entry = ring_buffer_event_data(event); ++ * ++ * ; <-- Here we assign the entries by the __field and ++ * __array macros. 
++ * ++ * trace_current_buffer_unlock_commit(event, irq_flags, pc); ++ * } ++ * ++ * static int ftrace_raw_reg_event_(void) ++ * { ++ * int ret; ++ * ++ * ret = register_trace_(ftrace_raw_event_); ++ * if (!ret) ++ * pr_info("event trace: Could not activate trace point " ++ * "probe to "); ++ * return ret; ++ * } ++ * ++ * static void ftrace_unreg_event_(void) ++ * { ++ * unregister_trace_(ftrace_raw_event_); ++ * } ++ * ++ * static struct trace_event ftrace_event_type_ = { ++ * .trace = ftrace_raw_output_, <-- stage 2 ++ * }; ++ * ++ * static int ftrace_raw_init_event_(void) ++ * { ++ * int id; ++ * ++ * id = register_ftrace_event(&ftrace_event_type_); ++ * if (!id) ++ * return -ENODEV; ++ * event_.id = id; ++ * return 0; ++ * } ++ * ++ * static struct ftrace_event_call __used ++ * __attribute__((__aligned__(4))) ++ * __attribute__((section("_ftrace_events"))) event_ = { ++ * .name = "", ++ * .system = "", ++ * .raw_init = ftrace_raw_init_event_, ++ * .regfunc = ftrace_reg_event_, ++ * .unregfunc = ftrace_unreg_event_, ++ * .show_format = ftrace_format_, ++ * } ++ * ++ */ ++ ++#undef TP_FMT ++#define TP_FMT(fmt, args...) fmt "\n", ##args ++ ++#ifdef CONFIG_EVENT_PROFILE ++#define _TRACE_PROFILE(call, proto, args) \ ++static void ftrace_profile_##call(proto) \ ++{ \ ++ extern void perf_tpcounter_event(int); \ ++ perf_tpcounter_event(event_##call.id); \ ++} \ ++ \ ++static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ ++{ \ ++ int ret = 0; \ ++ \ ++ if (!atomic_inc_return(&call->profile_count)) \ ++ ret = register_trace_##call(ftrace_profile_##call); \ ++ \ ++ return ret; \ ++} \ ++ \ ++static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ ++{ \ ++ if (atomic_add_negative(-1, &call->profile_count)) \ ++ unregister_trace_##call(ftrace_profile_##call); \ ++} ++ ++#define _TRACE_PROFILE_INIT(call) \ ++ .profile_count = ATOMIC_INIT(-1), \ ++ .profile_enable = ftrace_profile_enable_##call, \ ++ .profile_disable = ftrace_profile_disable_##call, ++ ++#else ++#define _TRACE_PROFILE(call, proto, args) ++#define _TRACE_PROFILE_INIT(call) ++#endif ++ ++#define _TRACE_FORMAT(call, proto, args, fmt) \ ++static void ftrace_event_##call(proto) \ ++{ \ ++ event_trace_printk(_RET_IP_, #call ": " fmt); \ ++} \ ++ \ ++static int ftrace_reg_event_##call(void) \ ++{ \ ++ int ret; \ ++ \ ++ ret = register_trace_##call(ftrace_event_##call); \ ++ if (ret) \ ++ pr_info("event trace: Could not activate trace point " \ ++ "probe to " #call "\n"); \ ++ return ret; \ ++} \ ++ \ ++static void ftrace_unreg_event_##call(void) \ ++{ \ ++ unregister_trace_##call(ftrace_event_##call); \ ++} \ ++ \ ++static struct ftrace_event_call event_##call; \ ++ \ ++static int ftrace_init_event_##call(void) \ ++{ \ ++ int id; \ ++ \ ++ id = register_ftrace_event(NULL); \ ++ if (!id) \ ++ return -ENODEV; \ ++ event_##call.id = id; \ ++ return 0; \ ++} ++ ++#undef TRACE_FORMAT ++#define TRACE_FORMAT(call, proto, args, fmt) \ ++_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ ++_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ ++static struct ftrace_event_call __used \ ++__attribute__((__aligned__(4))) \ ++__attribute__((section("_ftrace_events"))) event_##call = { \ ++ .name = #call, \ ++ .system = __stringify(TRACE_SYSTEM), \ ++ .raw_init = ftrace_init_event_##call, \ ++ .regfunc = ftrace_reg_event_##call, \ ++ .unregfunc = ftrace_unreg_event_##call, \ ++ _TRACE_PROFILE_INIT(call) \ ++} ++ ++#undef __entry ++#define __entry entry ++ ++#undef TRACE_EVENT ++#define 
TRACE_EVENT(call, proto, args, tstruct, assign, print) \ ++_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ ++ \ ++static struct ftrace_event_call event_##call; \ ++ \ ++static void ftrace_raw_event_##call(proto) \ ++{ \ ++ struct ftrace_event_call *call = &event_##call; \ ++ struct ring_buffer_event *event; \ ++ struct ftrace_raw_##call *entry; \ ++ unsigned long irq_flags; \ ++ int pc; \ ++ \ ++ local_save_flags(irq_flags); \ ++ pc = preempt_count(); \ ++ \ ++ event = trace_current_buffer_lock_reserve(event_##call.id, \ ++ sizeof(struct ftrace_raw_##call), \ ++ irq_flags, pc); \ ++ if (!event) \ ++ return; \ ++ entry = ring_buffer_event_data(event); \ ++ \ ++ assign; \ ++ \ ++ if (call->preds && !filter_match_preds(call, entry)) \ ++ ring_buffer_event_discard(event); \ ++ \ ++ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ ++ \ ++} \ ++ \ ++static int ftrace_raw_reg_event_##call(void) \ ++{ \ ++ int ret; \ ++ \ ++ ret = register_trace_##call(ftrace_raw_event_##call); \ ++ if (ret) \ ++ pr_info("event trace: Could not activate trace point " \ ++ "probe to " #call "\n"); \ ++ return ret; \ ++} \ ++ \ ++static void ftrace_raw_unreg_event_##call(void) \ ++{ \ ++ unregister_trace_##call(ftrace_raw_event_##call); \ ++} \ ++ \ ++static struct trace_event ftrace_event_type_##call = { \ ++ .trace = ftrace_raw_output_##call, \ ++}; \ ++ \ ++static int ftrace_raw_init_event_##call(void) \ ++{ \ ++ int id; \ ++ \ ++ id = register_ftrace_event(&ftrace_event_type_##call); \ ++ if (!id) \ ++ return -ENODEV; \ ++ event_##call.id = id; \ ++ INIT_LIST_HEAD(&event_##call.fields); \ ++ return 0; \ ++} \ ++ \ ++static struct ftrace_event_call __used \ ++__attribute__((__aligned__(4))) \ ++__attribute__((section("_ftrace_events"))) event_##call = { \ ++ .name = #call, \ ++ .system = __stringify(TRACE_SYSTEM), \ ++ .raw_init = ftrace_raw_init_event_##call, \ ++ .regfunc = ftrace_raw_reg_event_##call, \ ++ .unregfunc = ftrace_raw_unreg_event_##call, \ ++ .show_format = ftrace_format_##call, \ ++ .define_fields = ftrace_define_fields_##call, \ ++ _TRACE_PROFILE_INIT(call) \ ++} ++ ++#include ++ ++#undef _TRACE_PROFILE ++#undef _TRACE_PROFILE_INIT ++ +Index: linux-2.6-tip/kernel/trace/trace_export.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_export.c +@@ -0,0 +1,102 @@ ++/* ++ * trace_export.c - export basic ftrace utilities to user space ++ * ++ * Copyright (C) 2009 Steven Rostedt ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "trace_output.h" ++ ++ ++#undef TRACE_STRUCT ++#define TRACE_STRUCT(args...) 
args ++ ++#undef TRACE_FIELD ++#define TRACE_FIELD(type, item, assign) \ ++ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ ++ "offset:%u;\tsize:%u;\n", \ ++ (unsigned int)offsetof(typeof(field), item), \ ++ (unsigned int)sizeof(field.item)); \ ++ if (!ret) \ ++ return 0; ++ ++ ++#undef TRACE_FIELD_SPECIAL ++#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ ++ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ ++ "offset:%u;\tsize:%u;\n", \ ++ (unsigned int)offsetof(typeof(field), item), \ ++ (unsigned int)sizeof(field.item)); \ ++ if (!ret) \ ++ return 0; ++ ++#undef TRACE_FIELD_ZERO_CHAR ++#define TRACE_FIELD_ZERO_CHAR(item) \ ++ ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ ++ "offset:%u;\tsize:0;\n", \ ++ (unsigned int)offsetof(typeof(field), item)); \ ++ if (!ret) \ ++ return 0; ++ ++ ++#undef TP_RAW_FMT ++#define TP_RAW_FMT(args...) args ++ ++#undef TRACE_EVENT_FORMAT ++#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ ++static int \ ++ftrace_format_##call(struct trace_seq *s) \ ++{ \ ++ struct args field; \ ++ int ret; \ ++ \ ++ tstruct; \ ++ \ ++ trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ ++ \ ++ return ret; \ ++} ++ ++#include "trace_event_types.h" ++ ++#undef TRACE_ZERO_CHAR ++#define TRACE_ZERO_CHAR(arg) ++ ++#undef TRACE_FIELD ++#define TRACE_FIELD(type, item, assign)\ ++ entry->item = assign; ++ ++#undef TRACE_FIELD ++#define TRACE_FIELD(type, item, assign)\ ++ entry->item = assign; ++ ++#undef TP_CMD ++#define TP_CMD(cmd...) cmd ++ ++#undef TRACE_ENTRY ++#define TRACE_ENTRY entry ++ ++#undef TRACE_FIELD_SPECIAL ++#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ ++ cmd; ++ ++#undef TRACE_EVENT_FORMAT ++#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ ++ \ ++static struct ftrace_event_call __used \ ++__attribute__((__aligned__(4))) \ ++__attribute__((section("_ftrace_events"))) event_##call = { \ ++ .name = #call, \ ++ .id = proto, \ ++ .system = __stringify(TRACE_SYSTEM), \ ++ .show_format = ftrace_format_##call, \ ++} ++#include "trace_event_types.h" +Index: linux-2.6-tip/kernel/trace/trace_functions.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_functions.c ++++ linux-2.6-tip/kernel/trace/trace_functions.c +@@ -9,6 +9,7 @@ + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ ++#include + #include + #include + #include +@@ -16,52 +17,388 @@ + + #include "trace.h" + +-static void start_function_trace(struct trace_array *tr) ++/* function tracing enabled */ ++static int ftrace_function_enabled; ++ ++static struct trace_array *func_trace; ++ ++static void tracing_start_function_trace(void); ++static void tracing_stop_function_trace(void); ++ ++static int function_trace_init(struct trace_array *tr) + { ++ func_trace = tr; + tr->cpu = get_cpu(); +- tracing_reset_online_cpus(tr); + put_cpu(); + + tracing_start_cmdline_record(); + tracing_start_function_trace(); ++ return 0; + } + +-static void stop_function_trace(struct trace_array *tr) ++static void function_trace_reset(struct trace_array *tr) + { + tracing_stop_function_trace(); + tracing_stop_cmdline_record(); + } + +-static int function_trace_init(struct trace_array *tr) ++static void function_trace_start(struct trace_array *tr) + { +- start_function_trace(tr); +- return 0; ++ tracing_reset_online_cpus(tr); + } + +-static void function_trace_reset(struct trace_array *tr) ++static void ++function_trace_call_preempt_only(unsigned long 
ip, unsigned long parent_ip) ++{ ++ struct trace_array *tr = func_trace; ++ struct trace_array_cpu *data; ++ unsigned long flags; ++ long disabled; ++ int cpu, resched; ++ int pc; ++ ++ if (unlikely(!ftrace_function_enabled)) ++ return; ++ ++ pc = preempt_count(); ++ resched = ftrace_preempt_disable(); ++ local_save_flags(flags); ++ cpu = raw_smp_processor_id(); ++ data = tr->data[cpu]; ++ disabled = atomic_inc_return(&data->disabled); ++ ++ if (likely(disabled == 1)) ++ trace_function(tr, ip, parent_ip, flags, pc); ++ ++ atomic_dec(&data->disabled); ++ ftrace_preempt_enable(resched); ++} ++ ++static void ++function_trace_call(unsigned long ip, unsigned long parent_ip) + { +- stop_function_trace(tr); ++ struct trace_array *tr = func_trace; ++ struct trace_array_cpu *data; ++ unsigned long flags; ++ long disabled; ++ int cpu; ++ int pc; ++ ++ if (unlikely(!ftrace_function_enabled)) ++ return; ++ ++ /* ++ * Need to use raw, since this must be called before the ++ * recursive protection is performed. ++ */ ++ local_irq_save(flags); ++ cpu = raw_smp_processor_id(); ++ data = tr->data[cpu]; ++ disabled = atomic_inc_return(&data->disabled); ++ ++ if (likely(disabled == 1)) { ++ pc = preempt_count(); ++ trace_function(tr, ip, parent_ip, flags, pc); ++ } ++ ++ atomic_dec(&data->disabled); ++ local_irq_restore(flags); + } + +-static void function_trace_start(struct trace_array *tr) ++static void ++function_stack_trace_call(unsigned long ip, unsigned long parent_ip) + { +- tracing_reset_online_cpus(tr); ++ struct trace_array *tr = func_trace; ++ struct trace_array_cpu *data; ++ unsigned long flags; ++ long disabled; ++ int cpu; ++ int pc; ++ ++ if (unlikely(!ftrace_function_enabled)) ++ return; ++ ++ /* ++ * Need to use raw, since this must be called before the ++ * recursive protection is performed. 
++ */ ++ local_irq_save(flags); ++ cpu = raw_smp_processor_id(); ++ data = tr->data[cpu]; ++ disabled = atomic_inc_return(&data->disabled); ++ ++ if (likely(disabled == 1)) { ++ pc = preempt_count(); ++ trace_function(tr, ip, parent_ip, flags, pc); ++ /* ++ * skip over 5 funcs: ++ * __ftrace_trace_stack, ++ * __trace_stack, ++ * function_stack_trace_call ++ * ftrace_list_func ++ * ftrace_call ++ */ ++ __trace_stack(tr, flags, 5, pc); ++ } ++ ++ atomic_dec(&data->disabled); ++ local_irq_restore(flags); ++} ++ ++ ++static struct ftrace_ops trace_ops __read_mostly = ++{ ++ .func = function_trace_call, ++}; ++ ++static struct ftrace_ops trace_stack_ops __read_mostly = ++{ ++ .func = function_stack_trace_call, ++}; ++ ++/* Our two options */ ++enum { ++ TRACE_FUNC_OPT_STACK = 0x1, ++}; ++ ++static struct tracer_opt func_opts[] = { ++#ifdef CONFIG_STACKTRACE ++ { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, ++#endif ++ { } /* Always set a last empty entry */ ++}; ++ ++static struct tracer_flags func_flags = { ++ .val = 0, /* By default: all flags disabled */ ++ .opts = func_opts ++}; ++ ++static void tracing_start_function_trace(void) ++{ ++ ftrace_function_enabled = 0; ++ ++ if (trace_flags & TRACE_ITER_PREEMPTONLY) ++ trace_ops.func = function_trace_call_preempt_only; ++ else ++ trace_ops.func = function_trace_call; ++ ++ if (func_flags.val & TRACE_FUNC_OPT_STACK) ++ register_ftrace_function(&trace_stack_ops); ++ else ++ register_ftrace_function(&trace_ops); ++ ++ ftrace_function_enabled = 1; ++} ++ ++static void tracing_stop_function_trace(void) ++{ ++ ftrace_function_enabled = 0; ++ /* OK if they are not registered */ ++ unregister_ftrace_function(&trace_stack_ops); ++ unregister_ftrace_function(&trace_ops); ++} ++ ++static int func_set_flag(u32 old_flags, u32 bit, int set) ++{ ++ if (bit == TRACE_FUNC_OPT_STACK) { ++ /* do nothing if already set */ ++ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) ++ return 0; ++ ++ if (set) { ++ unregister_ftrace_function(&trace_ops); ++ register_ftrace_function(&trace_stack_ops); ++ } else { ++ unregister_ftrace_function(&trace_stack_ops); ++ register_ftrace_function(&trace_ops); ++ } ++ ++ return 0; ++ } ++ ++ return -EINVAL; + } + + static struct tracer function_trace __read_mostly = + { +- .name = "function", +- .init = function_trace_init, +- .reset = function_trace_reset, +- .start = function_trace_start, ++ .name = "function", ++ .init = function_trace_init, ++ .reset = function_trace_reset, ++ .start = function_trace_start, ++ .wait_pipe = poll_wait_pipe, ++ .flags = &func_flags, ++ .set_flag = func_set_flag, + #ifdef CONFIG_FTRACE_SELFTEST +- .selftest = trace_selftest_startup_function, ++ .selftest = trace_selftest_startup_function, + #endif + }; + ++#ifdef CONFIG_DYNAMIC_FTRACE ++static void ++ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) ++{ ++ long *count = (long *)data; ++ ++ if (tracing_is_on()) ++ return; ++ ++ if (!*count) ++ return; ++ ++ if (*count != -1) ++ (*count)--; ++ ++ tracing_on(); ++} ++ ++static void ++ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) ++{ ++ long *count = (long *)data; ++ ++ if (!tracing_is_on()) ++ return; ++ ++ if (!*count) ++ return; ++ ++ if (*count != -1) ++ (*count)--; ++ ++ tracing_off(); ++} ++ ++static int ++ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, ++ struct ftrace_probe_ops *ops, void *data); ++ ++static struct ftrace_probe_ops traceon_probe_ops = { ++ .func = ftrace_traceon, ++ .print = ftrace_trace_onoff_print, 
++}; ++ ++static struct ftrace_probe_ops traceoff_probe_ops = { ++ .func = ftrace_traceoff, ++ .print = ftrace_trace_onoff_print, ++}; ++ ++static int ++ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, ++ struct ftrace_probe_ops *ops, void *data) ++{ ++ char str[KSYM_SYMBOL_LEN]; ++ long count = (long)data; ++ ++ kallsyms_lookup(ip, NULL, NULL, NULL, str); ++ seq_printf(m, "%s:", str); ++ ++ if (ops == &traceon_probe_ops) ++ seq_printf(m, "traceon"); ++ else ++ seq_printf(m, "traceoff"); ++ ++ if (count == -1) ++ seq_printf(m, ":unlimited\n"); ++ else ++ seq_printf(m, ":count=%ld", count); ++ seq_putc(m, '\n'); ++ ++ return 0; ++} ++ ++static int ++ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) ++{ ++ struct ftrace_probe_ops *ops; ++ ++ /* we register both traceon and traceoff to this callback */ ++ if (strcmp(cmd, "traceon") == 0) ++ ops = &traceon_probe_ops; ++ else ++ ops = &traceoff_probe_ops; ++ ++ unregister_ftrace_function_probe_func(glob, ops); ++ ++ return 0; ++} ++ ++static int ++ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) ++{ ++ struct ftrace_probe_ops *ops; ++ void *count = (void *)-1; ++ char *number; ++ int ret; ++ ++ /* hash funcs only work with set_ftrace_filter */ ++ if (!enable) ++ return -EINVAL; ++ ++ if (glob[0] == '!') ++ return ftrace_trace_onoff_unreg(glob+1, cmd, param); ++ ++ /* we register both traceon and traceoff to this callback */ ++ if (strcmp(cmd, "traceon") == 0) ++ ops = &traceon_probe_ops; ++ else ++ ops = &traceoff_probe_ops; ++ ++ if (!param) ++ goto out_reg; ++ ++ number = strsep(¶m, ":"); ++ ++ if (!strlen(number)) ++ goto out_reg; ++ ++ /* ++ * We use the callback data field (which is a pointer) ++ * as our counter. ++ */ ++ ret = strict_strtoul(number, 0, (unsigned long *)&count); ++ if (ret) ++ return ret; ++ ++ out_reg: ++ ret = register_ftrace_function_probe(glob, ops, count); ++ ++ return ret; ++} ++ ++static struct ftrace_func_command ftrace_traceon_cmd = { ++ .name = "traceon", ++ .func = ftrace_trace_onoff_callback, ++}; ++ ++static struct ftrace_func_command ftrace_traceoff_cmd = { ++ .name = "traceoff", ++ .func = ftrace_trace_onoff_callback, ++}; ++ ++static int __init init_func_cmd_traceon(void) ++{ ++ int ret; ++ ++ ret = register_ftrace_command(&ftrace_traceoff_cmd); ++ if (ret) ++ return ret; ++ ++ ret = register_ftrace_command(&ftrace_traceon_cmd); ++ if (ret) ++ unregister_ftrace_command(&ftrace_traceoff_cmd); ++ return ret; ++} ++#else ++static inline int init_func_cmd_traceon(void) ++{ ++ return 0; ++} ++#endif /* CONFIG_DYNAMIC_FTRACE */ ++ + static __init int init_function_trace(void) + { ++ init_func_cmd_traceon(); + return register_tracer(&function_trace); + } +- + device_initcall(init_function_trace); ++ +Index: linux-2.6-tip/kernel/trace/trace_functions_graph.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_functions_graph.c ++++ linux-2.6-tip/kernel/trace/trace_functions_graph.c +@@ -1,7 +1,7 @@ + /* + * + * Function graph tracer. 
+- * Copyright (c) 2008 Frederic Weisbecker ++ * Copyright (c) 2008-2009 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * +@@ -12,6 +12,12 @@ + #include + + #include "trace.h" ++#include "trace_output.h" ++ ++struct fgraph_data { ++ pid_t last_pid; ++ int depth; ++}; + + #define TRACE_GRAPH_INDENT 2 + +@@ -20,9 +26,11 @@ + #define TRACE_GRAPH_PRINT_CPU 0x2 + #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 + #define TRACE_GRAPH_PRINT_PROC 0x8 ++#define TRACE_GRAPH_PRINT_DURATION 0x10 ++#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 + + static struct tracer_opt trace_opts[] = { +- /* Display overruns ? */ ++ /* Display overruns? (for self-debug purpose) */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, +@@ -30,26 +38,103 @@ static struct tracer_opt trace_opts[] = + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, ++ /* Display duration of execution */ ++ { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, ++ /* Display absolute time of an entry */ ++ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + { } /* Empty entry */ + }; + + static struct tracer_flags tracer_flags = { + /* Don't display overruns and proc by default */ +- .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, ++ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | ++ TRACE_GRAPH_PRINT_DURATION, + .opts = trace_opts + }; + + /* pid on the last trace processed */ +-static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; + +-static int graph_trace_init(struct trace_array *tr) ++ ++/* Add a function return address to the trace stack on thread info.*/ ++int ++ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) ++{ ++ unsigned long long calltime; ++ int index; ++ ++ if (!current->ret_stack) ++ return -EBUSY; ++ ++ /* The return trace stack is full */ ++ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { ++ atomic_inc(¤t->trace_overrun); ++ return -EBUSY; ++ } ++ ++ calltime = trace_clock_local(); ++ ++ index = ++current->curr_ret_stack; ++ barrier(); ++ current->ret_stack[index].ret = ret; ++ current->ret_stack[index].func = func; ++ current->ret_stack[index].calltime = calltime; ++ current->ret_stack[index].subtime = 0; ++ *depth = index; ++ ++ return 0; ++} ++ ++/* Retrieve a function return address to the trace stack on thread info.*/ ++static void ++ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) ++{ ++ int index; ++ ++ index = current->curr_ret_stack; ++ ++ if (unlikely(index < 0)) { ++ ftrace_graph_stop(); ++ WARN_ON(1); ++ /* Might as well panic, otherwise we have no where to go */ ++ *ret = (unsigned long)panic; ++ return; ++ } ++ ++ *ret = current->ret_stack[index].ret; ++ trace->func = current->ret_stack[index].func; ++ trace->calltime = current->ret_stack[index].calltime; ++ trace->overrun = atomic_read(¤t->trace_overrun); ++ trace->depth = index; ++} ++ ++/* ++ * Send the trace to the ring-buffer. ++ * @return the original return address. 
++ */ ++unsigned long ftrace_return_to_handler(void) + { +- int cpu, ret; ++ struct ftrace_graph_ret trace; ++ unsigned long ret; ++ ++ ftrace_pop_return_trace(&trace, &ret); ++ trace.rettime = trace_clock_local(); ++ ftrace_graph_return(&trace); ++ barrier(); ++ current->curr_ret_stack--; ++ ++ if (unlikely(!ret)) { ++ ftrace_graph_stop(); ++ WARN_ON(1); ++ /* Might as well panic. What else to do? */ ++ ret = (unsigned long)panic; ++ } + +- for_each_online_cpu(cpu) +- tracing_reset(tr, cpu); ++ return ret; ++} + +- ret = register_ftrace_graph(&trace_graph_return, ++static int graph_trace_init(struct trace_array *tr) ++{ ++ int ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); + if (ret) + return ret; +@@ -112,15 +197,15 @@ print_graph_cpu(struct trace_seq *s, int + static enum print_line_t + print_graph_proc(struct trace_seq *s, pid_t pid) + { +- int i; +- int ret; +- int len; +- char comm[8]; +- int spaces = 0; ++ char comm[TASK_COMM_LEN]; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; ++ int spaces = 0; ++ int ret; ++ int len; ++ int i; + +- strncpy(comm, trace_find_cmdline(pid), 7); ++ trace_find_cmdline(pid, comm); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + +@@ -153,17 +238,25 @@ print_graph_proc(struct trace_seq *s, pi + + /* If the pid changed since the last trace, output this event */ + static enum print_line_t +-verif_pid(struct trace_seq *s, pid_t pid, int cpu) ++verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) + { + pid_t prev_pid; ++ pid_t *last_pid; + int ret; + +- if (last_pid[cpu] != -1 && last_pid[cpu] == pid) ++ if (!data) ++ return TRACE_TYPE_HANDLED; ++ ++ last_pid = &(per_cpu_ptr(data, cpu)->last_pid); ++ ++ if (*last_pid == pid) + return TRACE_TYPE_HANDLED; + +- prev_pid = last_pid[cpu]; +- last_pid[cpu] = pid; ++ prev_pid = *last_pid; ++ *last_pid = pid; + ++ if (prev_pid == -1) ++ return TRACE_TYPE_HANDLED; + /* + * Context-switch trace line: + +@@ -175,34 +268,34 @@ verif_pid(struct trace_seq *s, pid_t pid + ret = trace_seq_printf(s, + " ------------------------------------------\n"); + if (!ret) +- TRACE_TYPE_PARTIAL_LINE; ++ return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) +- TRACE_TYPE_PARTIAL_LINE; ++ return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, prev_pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) +- TRACE_TYPE_PARTIAL_LINE; ++ return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " => "); + if (!ret) +- TRACE_TYPE_PARTIAL_LINE; ++ return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) +- TRACE_TYPE_PARTIAL_LINE; ++ return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, + "\n ------------------------------------------\n\n"); + if (!ret) +- TRACE_TYPE_PARTIAL_LINE; ++ return TRACE_TYPE_PARTIAL_LINE; + +- return ret; ++ return TRACE_TYPE_HANDLED; + } + +-static bool +-trace_branch_is_leaf(struct trace_iterator *iter, ++static struct ftrace_graph_ret_entry * ++get_return_for_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) + { + struct ring_buffer_iter *ring_iter; +@@ -211,72 +304,130 @@ trace_branch_is_leaf(struct trace_iterat + + ring_iter = iter->buffer_iter[iter->cpu]; + +- if (!ring_iter) +- return false; +- +- event = ring_buffer_iter_peek(ring_iter, NULL); ++ /* First peek to compare current entry and the next one */ ++ if (ring_iter) ++ event = ring_buffer_iter_peek(ring_iter, NULL); ++ else { ++ /* We need to consume the current 
entry to see the next one */ ++ ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); ++ event = ring_buffer_peek(iter->tr->buffer, iter->cpu, ++ NULL); ++ } + + if (!event) +- return false; ++ return NULL; + + next = ring_buffer_event_data(event); + + if (next->ent.type != TRACE_GRAPH_RET) +- return false; ++ return NULL; + + if (curr->ent.pid != next->ent.pid || + curr->graph_ent.func != next->ret.func) +- return false; ++ return NULL; ++ ++ /* this is a leaf, now advance the iterator */ ++ if (ring_iter) ++ ring_buffer_read(ring_iter, NULL); ++ ++ return next; ++} ++ ++/* Signal a overhead of time execution to the output */ ++static int ++print_graph_overhead(unsigned long long duration, struct trace_seq *s) ++{ ++ /* If duration disappear, we don't need anything */ ++ if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) ++ return 1; ++ ++ /* Non nested entry or return */ ++ if (duration == -1) ++ return trace_seq_printf(s, " "); ++ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { ++ /* Duration exceeded 100 msecs */ ++ if (duration > 100000ULL) ++ return trace_seq_printf(s, "! "); ++ ++ /* Duration exceeded 10 msecs */ ++ if (duration > 10000ULL) ++ return trace_seq_printf(s, "+ "); ++ } ++ ++ return trace_seq_printf(s, " "); ++} ++ ++static int print_graph_abs_time(u64 t, struct trace_seq *s) ++{ ++ unsigned long usecs_rem; ++ ++ usecs_rem = do_div(t, NSEC_PER_SEC); ++ usecs_rem /= 1000; + +- return true; ++ return trace_seq_printf(s, "%5lu.%06lu | ", ++ (unsigned long)t, usecs_rem); + } + + static enum print_line_t +-print_graph_irq(struct trace_seq *s, unsigned long addr, +- enum trace_type type, int cpu, pid_t pid) ++print_graph_irq(struct trace_iterator *iter, unsigned long addr, ++ enum trace_type type, int cpu, pid_t pid) + { + int ret; ++ struct trace_seq *s = &iter->seq; + + if (addr < (unsigned long)__irqentry_text_start || + addr >= (unsigned long)__irqentry_text_end) + return TRACE_TYPE_UNHANDLED; + +- if (type == TRACE_GRAPH_ENT) { +- ret = trace_seq_printf(s, "==========> | "); +- } else { +- /* Cpu */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { +- ret = print_graph_cpu(s, cpu); +- if (ret == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; +- } +- /* Proc */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { +- ret = print_graph_proc(s, pid); +- if (ret == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; ++ /* Absolute time */ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { ++ ret = print_graph_abs_time(iter->ts, s); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } + +- ret = trace_seq_printf(s, " | "); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- } ++ /* Cpu */ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ++ ret = print_graph_cpu(s, cpu); ++ if (ret == TRACE_TYPE_PARTIAL_LINE) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } ++ /* Proc */ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { ++ ret = print_graph_proc(s, pid); ++ if (ret == TRACE_TYPE_PARTIAL_LINE) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ret = trace_seq_printf(s, " | "); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } + +- /* No overhead */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { +- ret = trace_seq_printf(s, " "); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- } ++ /* No overhead */ ++ ret = print_graph_overhead(-1, s); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ if (type == TRACE_GRAPH_ENT) ++ ret = trace_seq_printf(s, "==========>"); ++ else ++ ret = trace_seq_printf(s, "<=========="); ++ ++ if (!ret) ++ return 
TRACE_TYPE_PARTIAL_LINE; ++ ++ /* Don't close the duration column if haven't one */ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) ++ trace_seq_printf(s, " |"); ++ ret = trace_seq_printf(s, "\n"); + +- ret = trace_seq_printf(s, "<========== |\n"); +- } + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } + +-static enum print_line_t +-print_graph_duration(unsigned long long duration, struct trace_seq *s) ++enum print_line_t ++trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) + { + unsigned long nsecs_rem = do_div(duration, 1000); + /* log10(ULONG_MAX) + '\0' */ +@@ -285,10 +436,10 @@ print_graph_duration(unsigned long long + int ret, len; + int i; + +- sprintf(msecs_str, "%lu", (unsigned long) duration); ++ snprintf(msecs_str, sizeof(msecs_str), "%lu", (unsigned long) duration); + + /* Print msecs */ +- ret = trace_seq_printf(s, msecs_str); ++ ret = trace_seq_printf(s, "%s", msecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + +@@ -313,60 +464,66 @@ print_graph_duration(unsigned long long + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } +- +- ret = trace_seq_printf(s, "| "); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +- + } + +-/* Signal a overhead of time execution to the output */ +-static int +-print_graph_overhead(unsigned long long duration, struct trace_seq *s) ++static enum print_line_t ++print_graph_duration(unsigned long long duration, struct trace_seq *s) + { +- /* Duration exceeded 100 msecs */ +- if (duration > 100000ULL) +- return trace_seq_printf(s, "! "); +- +- /* Duration exceeded 10 msecs */ +- if (duration > 10000ULL) +- return trace_seq_printf(s, "+ "); ++ int ret; + +- return trace_seq_printf(s, " "); ++ ret = trace_print_graph_duration(duration, s); ++ if (ret != TRACE_TYPE_HANDLED) ++ return ret; ++ ++ ret = trace_seq_printf(s, "| "); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; + } + + /* Case of a leaf function on its call entry */ + static enum print_line_t + print_graph_entry_leaf(struct trace_iterator *iter, +- struct ftrace_graph_ent_entry *entry, struct trace_seq *s) ++ struct ftrace_graph_ent_entry *entry, ++ struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) + { +- struct ftrace_graph_ret_entry *ret_entry; ++ struct fgraph_data *data = iter->private; + struct ftrace_graph_ret *graph_ret; +- struct ring_buffer_event *event; + struct ftrace_graph_ent *call; + unsigned long long duration; + int ret; + int i; + +- event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); +- ret_entry = ring_buffer_event_data(event); + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; + +- /* Overhead */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { +- ret = print_graph_overhead(duration, s); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; ++ if (data) { ++ int cpu = iter->cpu; ++ int *depth = &(per_cpu_ptr(data, cpu)->depth); ++ ++ /* ++ * Comments display at + 1 to depth. Since ++ * this is a leaf function, keep the comments ++ * equal to this depth. 
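++		 * The per-cpu depth recorded here is what print_graph_comment()
++		 * reads back when it computes the indentation of comment lines.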
++ */ ++ *depth = call->depth - 1; + } + +- /* Duration */ +- ret = print_graph_duration(duration, s); +- if (ret == TRACE_TYPE_PARTIAL_LINE) ++ /* Overhead */ ++ ret = print_graph_overhead(duration, s); ++ if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + ++ /* Duration */ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { ++ ret = print_graph_duration(duration, s); ++ if (ret == TRACE_TYPE_PARTIAL_LINE) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } ++ + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); +@@ -386,33 +543,34 @@ print_graph_entry_leaf(struct trace_iter + } + + static enum print_line_t +-print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, +- struct trace_seq *s, pid_t pid, int cpu) ++print_graph_entry_nested(struct trace_iterator *iter, ++ struct ftrace_graph_ent_entry *entry, ++ struct trace_seq *s, int cpu) + { +- int i; +- int ret; + struct ftrace_graph_ent *call = &entry->graph_ent; ++ struct fgraph_data *data = iter->private; ++ int ret; ++ int i; + +- /* No overhead */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { +- ret = trace_seq_printf(s, " "); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; ++ if (data) { ++ int cpu = iter->cpu; ++ int *depth = &(per_cpu_ptr(data, cpu)->depth); ++ ++ *depth = call->depth; + } + +- /* Interrupt */ +- ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); +- if (ret == TRACE_TYPE_UNHANDLED) { +- /* No time */ ++ /* No overhead */ ++ ret = print_graph_overhead(-1, s); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* No time */ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; +- } else { +- if (ret == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; + } + +- + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); +@@ -428,20 +586,40 @@ print_graph_entry_nested(struct ftrace_g + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + +- return TRACE_TYPE_HANDLED; ++ /* ++ * we already consumed the current entry to check the next one ++ * and see if this is a leaf. 
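++	 * Hence TRACE_TYPE_NO_CONSUME is returned below instead of
++	 * TRACE_TYPE_HANDLED.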
++ */ ++ return TRACE_TYPE_NO_CONSUME; + } + + static enum print_line_t +-print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, +- struct trace_iterator *iter, int cpu) ++print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, ++ int type, unsigned long addr) + { +- int ret; ++ struct fgraph_data *data = iter->private; + struct trace_entry *ent = iter->ent; ++ int cpu = iter->cpu; ++ int ret; + + /* Pid */ +- if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) ++ if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ++ if (type) { ++ /* Interrupt */ ++ ret = print_graph_irq(iter, addr, type, cpu, ent->pid); ++ if (ret == TRACE_TYPE_PARTIAL_LINE) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } ++ ++ /* Absolute time */ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { ++ ret = print_graph_abs_time(iter->ts, s); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } ++ + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); +@@ -460,54 +638,65 @@ print_graph_entry(struct ftrace_graph_en + return TRACE_TYPE_PARTIAL_LINE; + } + +- if (trace_branch_is_leaf(iter, field)) +- return print_graph_entry_leaf(iter, field, s); ++ return 0; ++} ++ ++static enum print_line_t ++print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, ++ struct trace_iterator *iter) ++{ ++ int cpu = iter->cpu; ++ struct ftrace_graph_ent *call = &field->graph_ent; ++ struct ftrace_graph_ret_entry *leaf_ret; ++ ++ if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ leaf_ret = get_return_for_leaf(iter, field); ++ if (leaf_ret) ++ return print_graph_entry_leaf(iter, field, leaf_ret, s); + else +- return print_graph_entry_nested(field, s, iter->ent->pid, cpu); ++ return print_graph_entry_nested(iter, field, s, cpu); + + } + + static enum print_line_t + print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, +- struct trace_entry *ent, int cpu) ++ struct trace_entry *ent, struct trace_iterator *iter) + { +- int i; +- int ret; + unsigned long long duration = trace->rettime - trace->calltime; ++ struct fgraph_data *data = iter->private; ++ pid_t pid = ent->pid; ++ int cpu = iter->cpu; ++ int ret; ++ int i; + +- /* Pid */ +- if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; +- +- /* Cpu */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { +- ret = print_graph_cpu(s, cpu); +- if (ret == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; ++ if (data) { ++ int cpu = iter->cpu; ++ int *depth = &(per_cpu_ptr(data, cpu)->depth); ++ ++ /* ++ * Comments display at + 1 to depth. This is the ++ * return from a function, we now want the comments ++ * to display at the same level of the bracket. 
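++		 * Hence the per-cpu depth is set to trace->depth - 1 below.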
++ */ ++ *depth = trace->depth - 1; + } + +- /* Proc */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { +- ret = print_graph_proc(s, ent->pid); +- if (ret == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; +- +- ret = trace_seq_printf(s, " | "); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- } ++ if (print_graph_prologue(iter, s, 0, 0)) ++ return TRACE_TYPE_PARTIAL_LINE; + + /* Overhead */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { +- ret = print_graph_overhead(duration, s); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- } ++ ret = print_graph_overhead(duration, s); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; + + /* Duration */ +- ret = print_graph_duration(duration, s); +- if (ret == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { ++ ret = print_graph_duration(duration, s); ++ if (ret == TRACE_TYPE_PARTIAL_LINE) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } + + /* Closing brace */ + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { +@@ -528,7 +717,7 @@ print_graph_return(struct ftrace_graph_r + return TRACE_TYPE_PARTIAL_LINE; + } + +- ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); ++ ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + +@@ -536,61 +725,73 @@ print_graph_return(struct ftrace_graph_r + } + + static enum print_line_t +-print_graph_comment(struct print_entry *trace, struct trace_seq *s, +- struct trace_entry *ent, struct trace_iterator *iter) ++print_graph_comment(struct trace_seq *s, struct trace_entry *ent, ++ struct trace_iterator *iter) + { +- int i; ++ unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); ++ struct fgraph_data *data = iter->private; ++ struct trace_event *event; ++ int depth = 0; + int ret; ++ int i; + +- /* Pid */ +- if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; +- +- /* Cpu */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { +- ret = print_graph_cpu(s, iter->cpu); +- if (ret == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; +- } +- +- /* Proc */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { +- ret = print_graph_proc(s, ent->pid); +- if (ret == TRACE_TYPE_PARTIAL_LINE) +- return TRACE_TYPE_PARTIAL_LINE; ++ if (data) ++ depth = per_cpu_ptr(data, iter->cpu)->depth; + +- ret = trace_seq_printf(s, " | "); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- } ++ if (print_graph_prologue(iter, s, 0, 0)) ++ return TRACE_TYPE_PARTIAL_LINE; + + /* No overhead */ +- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { +- ret = trace_seq_printf(s, " "); ++ ret = print_graph_overhead(-1, s); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ /* No time */ ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { ++ ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + +- /* No time */ +- ret = trace_seq_printf(s, " | "); +- if (!ret) +- return TRACE_TYPE_PARTIAL_LINE; +- + /* Indentation */ +- if (trace->depth > 0) +- for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { ++ if (depth > 0) ++ for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* The comment */ +- ret = trace_seq_printf(s, "/* %s", trace->buf); ++ ret = trace_seq_printf(s, "/* "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + +- if (ent->flags & 
TRACE_FLAG_CONT) +- trace_seq_print_cont(s, iter); ++ switch (iter->ent->type) { ++ case TRACE_BPRINT: ++ ret = trace_print_bprintk_msg_only(iter); ++ if (ret != TRACE_TYPE_HANDLED) ++ return ret; ++ break; ++ case TRACE_PRINT: ++ ret = trace_print_printk_msg_only(iter); ++ if (ret != TRACE_TYPE_HANDLED) ++ return ret; ++ break; ++ default: ++ event = ftrace_find_event(ent->type); ++ if (!event) ++ return TRACE_TYPE_UNHANDLED; ++ ++ ret = event->trace(iter, sym_flags); ++ if (ret != TRACE_TYPE_HANDLED) ++ return ret; ++ } ++ ++ /* Strip ending newline */ ++ if (s->buffer[s->len - 1] == '\n') { ++ s->buffer[s->len - 1] = '\0'; ++ s->len--; ++ } + + ret = trace_seq_printf(s, " */\n"); + if (!ret) +@@ -603,62 +804,91 @@ print_graph_comment(struct print_entry * + enum print_line_t + print_graph_function(struct trace_iterator *iter) + { +- struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; ++ struct trace_seq *s = &iter->seq; + + switch (entry->type) { + case TRACE_GRAPH_ENT: { + struct ftrace_graph_ent_entry *field; + trace_assign_type(field, entry); +- return print_graph_entry(field, s, iter, +- iter->cpu); ++ return print_graph_entry(field, s, iter); + } + case TRACE_GRAPH_RET: { + struct ftrace_graph_ret_entry *field; + trace_assign_type(field, entry); +- return print_graph_return(&field->ret, s, entry, iter->cpu); +- } +- case TRACE_PRINT: { +- struct print_entry *field; +- trace_assign_type(field, entry); +- return print_graph_comment(field, s, entry, iter); ++ return print_graph_return(&field->ret, s, entry, iter); + } + default: +- return TRACE_TYPE_UNHANDLED; ++ return print_graph_comment(s, entry, iter); + } ++ ++ return TRACE_TYPE_HANDLED; + } + + static void print_graph_headers(struct seq_file *s) + { + /* 1st line */ + seq_printf(s, "# "); ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) ++ seq_printf(s, " TIME "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) +- seq_printf(s, "CPU "); ++ seq_printf(s, "CPU"); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) +- seq_printf(s, "TASK/PID "); +- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) +- seq_printf(s, "OVERHEAD/"); +- seq_printf(s, "DURATION FUNCTION CALLS\n"); ++ seq_printf(s, " TASK/PID "); ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) ++ seq_printf(s, " DURATION "); ++ seq_printf(s, " FUNCTION CALLS\n"); + + /* 2nd line */ + seq_printf(s, "# "); ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) ++ seq_printf(s, " | "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) +- seq_printf(s, "| "); ++ seq_printf(s, "| "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) +- seq_printf(s, "| | "); +- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { +- seq_printf(s, "| "); +- seq_printf(s, "| | | | |\n"); +- } else +- seq_printf(s, " | | | | |\n"); ++ seq_printf(s, " | | "); ++ if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) ++ seq_printf(s, " | | "); ++ seq_printf(s, " | | | |\n"); ++} ++ ++static void graph_trace_open(struct trace_iterator *iter) ++{ ++ /* pid and depth on the last trace processed */ ++ struct fgraph_data *data = alloc_percpu(struct fgraph_data); ++ int cpu; ++ ++ if (!data) ++ pr_warning("function graph tracer: not enough memory\n"); ++ else ++ for_each_possible_cpu(cpu) { ++ pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); ++ int *depth = &(per_cpu_ptr(data, cpu)->depth); ++ *pid = -1; ++ *depth = 0; ++ } ++ ++ iter->private = data; + } ++ ++static void graph_trace_close(struct trace_iterator *iter) ++{ ++ free_percpu(iter->private); ++} ++ + static struct tracer 
graph_trace __read_mostly = { +- .name = "function_graph", +- .init = graph_trace_init, +- .reset = graph_trace_reset, ++ .name = "function_graph", ++ .open = graph_trace_open, ++ .close = graph_trace_close, ++ .wait_pipe = poll_wait_pipe, ++ .init = graph_trace_init, ++ .reset = graph_trace_reset, + .print_line = print_graph_function, + .print_header = print_graph_headers, + .flags = &tracer_flags, ++#ifdef CONFIG_FTRACE_SELFTEST ++ .selftest = trace_selftest_startup_function_graph, ++#endif + }; + + static __init int init_graph_trace(void) +Index: linux-2.6-tip/kernel/trace/trace_hw_branches.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_hw_branches.c ++++ linux-2.6-tip/kernel/trace/trace_hw_branches.c +@@ -1,30 +1,53 @@ + /* + * h/w branch tracer for x86 based on bts + * +- * Copyright (C) 2008 Markus Metzger +- * ++ * Copyright (C) 2008-2009 Intel Corporation. ++ * Markus Metzger , 2008-2009 + */ +- +-#include +-#include ++#include ++#include + #include + #include +-#include ++#include ++#include ++#include ++#include + + #include + + #include "trace.h" ++#include "trace_output.h" + + + #define SIZEOF_BTS (1 << 13) + ++/* ++ * The tracer lock protects the below per-cpu tracer array. ++ * It needs to be held to: ++ * - start tracing on all cpus ++ * - stop tracing on all cpus ++ * - start tracing on a single hotplug cpu ++ * - stop tracing on a single hotplug cpu ++ * - read the trace from all cpus ++ * - read the trace from a single cpu ++ */ ++static DEFINE_SPINLOCK(bts_tracer_lock); + static DEFINE_PER_CPU(struct bts_tracer *, tracer); + static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); + + #define this_tracer per_cpu(tracer, smp_processor_id()) + #define this_buffer per_cpu(buffer, smp_processor_id()) + ++static int __read_mostly trace_hw_branches_enabled; ++static struct trace_array *hw_branch_trace __read_mostly; ++ + ++/* ++ * Start tracing on the current cpu. ++ * The argument is ignored. ++ * ++ * pre: bts_tracer_lock must be locked. ++ */ + static void bts_trace_start_cpu(void *arg) + { + if (this_tracer) +@@ -42,14 +65,20 @@ static void bts_trace_start_cpu(void *ar + + static void bts_trace_start(struct trace_array *tr) + { +- int cpu; ++ spin_lock(&bts_tracer_lock); + +- tracing_reset_online_cpus(tr); ++ on_each_cpu(bts_trace_start_cpu, NULL, 1); ++ trace_hw_branches_enabled = 1; + +- for_each_cpu(cpu, cpu_possible_mask) +- smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); ++ spin_unlock(&bts_tracer_lock); + } + ++/* ++ * Stop tracing on the current cpu. ++ * The argument is ignored. ++ * ++ * pre: bts_tracer_lock must be locked. 
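++ * (The lock also serializes these per-cpu start/stop calls against
++ * the cpu-hotplug notifier registered below.)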
++ */ + static void bts_trace_stop_cpu(void *arg) + { + if (this_tracer) { +@@ -60,26 +89,60 @@ static void bts_trace_stop_cpu(void *arg + + static void bts_trace_stop(struct trace_array *tr) + { +- int cpu; ++ spin_lock(&bts_tracer_lock); ++ ++ trace_hw_branches_enabled = 0; ++ on_each_cpu(bts_trace_stop_cpu, NULL, 1); ++ ++ spin_unlock(&bts_tracer_lock); ++} ++ ++static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, ++ unsigned long action, void *hcpu) ++{ ++ unsigned int cpu = (unsigned long)hcpu; + +- for_each_cpu(cpu, cpu_possible_mask) ++ spin_lock(&bts_tracer_lock); ++ ++ if (!trace_hw_branches_enabled) ++ goto out; ++ ++ switch (action) { ++ case CPU_ONLINE: ++ case CPU_DOWN_FAILED: ++ smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); ++ break; ++ case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); ++ break; ++ } ++ ++ out: ++ spin_unlock(&bts_tracer_lock); ++ return NOTIFY_DONE; + } + ++static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { ++ .notifier_call = bts_hotcpu_handler ++}; ++ + static int bts_trace_init(struct trace_array *tr) + { +- tracing_reset_online_cpus(tr); ++ hw_branch_trace = tr; ++ + bts_trace_start(tr); + + return 0; + } + ++static void bts_trace_reset(struct trace_array *tr) ++{ ++ bts_trace_stop(tr); ++} ++ + static void bts_trace_print_header(struct seq_file *m) + { +- seq_puts(m, +- "# CPU# FROM TO FUNCTION\n"); +- seq_puts(m, +- "# | | | |\n"); ++ seq_puts(m, "# CPU# TO <- FROM\n"); + } + + static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) +@@ -87,15 +150,15 @@ static enum print_line_t bts_trace_print + struct trace_entry *entry = iter->ent; + struct trace_seq *seq = &iter->seq; + struct hw_branch_entry *it; ++ unsigned long symflags = TRACE_ITER_SYM_OFFSET; + + trace_assign_type(it, entry); + + if (entry->type == TRACE_HW_BRANCHES) { +- if (trace_seq_printf(seq, "%4d ", entry->cpu) && +- trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", +- it->from, it->to) && +- (!it->from || +- seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) && ++ if (trace_seq_printf(seq, "%4d ", iter->cpu) && ++ seq_print_ip_sym(seq, it->to, symflags) && ++ trace_seq_printf(seq, "\t <- ") && ++ seq_print_ip_sym(seq, it->from, symflags) && + trace_seq_printf(seq, "\n")) + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE;; +@@ -103,26 +166,42 @@ static enum print_line_t bts_trace_print + return TRACE_TYPE_UNHANDLED; + } + +-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) ++void trace_hw_branch(u64 from, u64 to) + { ++ struct trace_array *tr = hw_branch_trace; + struct ring_buffer_event *event; + struct hw_branch_entry *entry; +- unsigned long irq; ++ unsigned long irq1; ++ int cpu; + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); +- if (!event) ++ if (unlikely(!tr)) + return; ++ ++ if (unlikely(!trace_hw_branches_enabled)) ++ return; ++ ++ local_irq_save(irq1); ++ cpu = raw_smp_processor_id(); ++ if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) ++ goto out; ++ ++ event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, ++ sizeof(*entry), 0, 0); ++ if (!event) ++ goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, from); + entry->ent.type = TRACE_HW_BRANCHES; +- entry->ent.cpu = smp_processor_id(); + entry->from = from; + entry->to = to; +- ring_buffer_unlock_commit(tr->buffer, event, irq); ++ trace_buffer_unlock_commit(tr, event, 0, 0); ++ ++ out: ++ atomic_dec(&tr->data[cpu]->disabled); 
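++	/* irqs were disabled with local_irq_save() above; restore them last */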
++ local_irq_restore(irq1); + } + +-static void trace_bts_at(struct trace_array *tr, +- const struct bts_trace *trace, void *at) ++static void trace_bts_at(const struct bts_trace *trace, void *at) + { + struct bts_struct bts; + int err = 0; +@@ -137,18 +216,29 @@ static void trace_bts_at(struct trace_ar + + switch (bts.qualifier) { + case BTS_BRANCH: +- trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); ++ trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); + break; + } + } + ++/* ++ * Collect the trace on the current cpu and write it into the ftrace buffer. ++ * ++ * pre: bts_tracer_lock must be locked ++ */ + static void trace_bts_cpu(void *arg) + { + struct trace_array *tr = (struct trace_array *) arg; + const struct bts_trace *trace; + unsigned char *at; + +- if (!this_tracer) ++ if (unlikely(!tr)) ++ return; ++ ++ if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) ++ return; ++ ++ if (unlikely(!this_tracer)) + return; + + ds_suspend_bts(this_tracer); +@@ -158,11 +248,11 @@ static void trace_bts_cpu(void *arg) + + for (at = trace->ds.top; (void *)at < trace->ds.end; + at += trace->ds.size) +- trace_bts_at(tr, trace, at); ++ trace_bts_at(trace, at); + + for (at = trace->ds.begin; (void *)at < trace->ds.top; + at += trace->ds.size) +- trace_bts_at(tr, trace, at); ++ trace_bts_at(trace, at); + + out: + ds_resume_bts(this_tracer); +@@ -170,26 +260,43 @@ out: + + static void trace_bts_prepare(struct trace_iterator *iter) + { +- int cpu; ++ spin_lock(&bts_tracer_lock); ++ ++ on_each_cpu(trace_bts_cpu, iter->tr, 1); ++ ++ spin_unlock(&bts_tracer_lock); ++} ++ ++static void trace_bts_close(struct trace_iterator *iter) ++{ ++ tracing_reset_online_cpus(iter->tr); ++} ++ ++void trace_hw_branch_oops(void) ++{ ++ spin_lock(&bts_tracer_lock); ++ ++ trace_bts_cpu(hw_branch_trace); + +- for_each_cpu(cpu, cpu_possible_mask) +- smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); ++ spin_unlock(&bts_tracer_lock); + } + + struct tracer bts_tracer __read_mostly = + { + .name = "hw-branch-tracer", + .init = bts_trace_init, +- .reset = bts_trace_stop, ++ .reset = bts_trace_reset, + .print_header = bts_trace_print_header, + .print_line = bts_trace_print_line, + .start = bts_trace_start, + .stop = bts_trace_stop, +- .open = trace_bts_prepare ++ .open = trace_bts_prepare, ++ .close = trace_bts_close + }; + + __init static int init_bts_trace(void) + { ++ register_hotcpu_notifier(&bts_hotcpu_notifier); + return register_tracer(&bts_tracer); + } + device_initcall(init_bts_trace); +Index: linux-2.6-tip/kernel/trace/trace_irqsoff.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_irqsoff.c ++++ linux-2.6-tip/kernel/trace/trace_irqsoff.c +@@ -1,5 +1,5 @@ + /* +- * trace irqs off criticall timings ++ * trace irqs off critical timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar +@@ -17,13 +17,14 @@ + #include + + #include "trace.h" ++#include "trace_hist.h" + + static struct trace_array *irqsoff_trace __read_mostly; + static int tracer_enabled __read_mostly; + + static DEFINE_PER_CPU(int, tracing_cpu); + +-static DEFINE_SPINLOCK(max_trace_lock); ++static DEFINE_RAW_SPINLOCK(max_trace_lock); + + enum { + TRACER_IRQS_OFF = (1 << 1), +@@ -32,6 +33,8 @@ enum { + + static int trace_type __read_mostly; + ++static int save_lat_flag; ++ + #ifdef CONFIG_PREEMPT_TRACER + static inline int + preempt_trace(void) +@@ -95,7 +98,7 @@ irqsoff_tracer_call(unsigned long ip, un + disabled = 
atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) +- trace_function(tr, data, ip, parent_ip, flags, preempt_count()); ++ trace_function(tr, ip, parent_ip, flags, preempt_count()); + + atomic_dec(&data->disabled); + } +@@ -153,7 +156,7 @@ check_critical_timing(struct trace_array + if (!report_latency(delta)) + goto out_unlock; + +- trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); ++ trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + + latency = nsecs_to_usecs(delta); + +@@ -177,7 +180,7 @@ out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + tracing_reset(tr, cpu); +- trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); ++ trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + } + + static inline void +@@ -210,7 +213,7 @@ start_critical_timing(unsigned long ip, + + local_save_flags(flags); + +- trace_function(tr, data, ip, parent_ip, flags, preempt_count()); ++ trace_function(tr, ip, parent_ip, flags, preempt_count()); + + per_cpu(tracing_cpu, cpu) = 1; + +@@ -244,7 +247,7 @@ stop_critical_timing(unsigned long ip, u + atomic_inc(&data->disabled); + + local_save_flags(flags); +- trace_function(tr, data, ip, parent_ip, flags, preempt_count()); ++ trace_function(tr, ip, parent_ip, flags, preempt_count()); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +@@ -253,6 +256,7 @@ stop_critical_timing(unsigned long ip, u + /* start and stop critical timings used to for stoppage (in idle) */ + void start_critical_timings(void) + { ++ tracing_hist_preempt_start(); + if (preempt_trace() || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +@@ -260,6 +264,7 @@ EXPORT_SYMBOL_GPL(start_critical_timings + + void stop_critical_timings(void) + { ++ tracing_hist_preempt_stop(TRACE_STOP); + if (preempt_trace() || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +@@ -269,12 +274,14 @@ EXPORT_SYMBOL_GPL(stop_critical_timings) + #ifdef CONFIG_PROVE_LOCKING + void time_hardirqs_on(unsigned long a0, unsigned long a1) + { ++ tracing_hist_preempt_stop(1); + if (!preempt_trace() && irq_trace()) + stop_critical_timing(a0, a1); + } + + void time_hardirqs_off(unsigned long a0, unsigned long a1) + { ++ tracing_hist_preempt_start(); + if (!preempt_trace() && irq_trace()) + start_critical_timing(a0, a1); + } +@@ -310,6 +317,7 @@ inline void print_irqtrace_events(struct + */ + void trace_hardirqs_on(void) + { ++ tracing_hist_preempt_stop(1); + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +@@ -317,6 +325,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); + + void trace_hardirqs_off(void) + { ++ tracing_hist_preempt_start(); + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +@@ -324,6 +333,7 @@ EXPORT_SYMBOL(trace_hardirqs_off); + + void trace_hardirqs_on_caller(unsigned long caller_addr) + { ++ tracing_hist_preempt_stop(1); + if (!preempt_trace() && irq_trace()) + stop_critical_timing(CALLER_ADDR0, caller_addr); + } +@@ -331,6 +341,7 @@ EXPORT_SYMBOL(trace_hardirqs_on_caller); + + void trace_hardirqs_off_caller(unsigned long caller_addr) + { ++ tracing_hist_preempt_start(); + if (!preempt_trace() && irq_trace()) + start_critical_timing(CALLER_ADDR0, caller_addr); + } +@@ -342,44 +353,39 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller) + #ifdef CONFIG_PREEMPT_TRACER + void trace_preempt_on(unsigned long a0, unsigned long a1) + { ++ 
tracing_hist_preempt_stop(0); + if (preempt_trace()) + stop_critical_timing(a0, a1); + } + + void trace_preempt_off(unsigned long a0, unsigned long a1) + { ++ tracing_hist_preempt_start(); + if (preempt_trace()) + start_critical_timing(a0, a1); + } + #endif /* CONFIG_PREEMPT_TRACER */ + +-/* +- * save_tracer_enabled is used to save the state of the tracer_enabled +- * variable when we disable it when we open a trace output file. +- */ +-static int save_tracer_enabled; +- + static void start_irqsoff_tracer(struct trace_array *tr) + { + register_ftrace_function(&trace_ops); +- if (tracing_is_enabled()) { ++ if (tracing_is_enabled()) + tracer_enabled = 1; +- save_tracer_enabled = 1; +- } else { ++ else + tracer_enabled = 0; +- save_tracer_enabled = 0; +- } + } + + static void stop_irqsoff_tracer(struct trace_array *tr) + { + tracer_enabled = 0; +- save_tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); + } + + static void __irqsoff_tracer_init(struct trace_array *tr) + { ++ save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; ++ trace_flags |= TRACE_ITER_LATENCY_FMT; ++ + tracing_max_latency = 0; + irqsoff_trace = tr; + /* make sure that the tracer is visible */ +@@ -390,30 +396,19 @@ static void __irqsoff_tracer_init(struct + static void irqsoff_tracer_reset(struct trace_array *tr) + { + stop_irqsoff_tracer(tr); ++ ++ if (!save_lat_flag) ++ trace_flags &= ~TRACE_ITER_LATENCY_FMT; + } + + static void irqsoff_tracer_start(struct trace_array *tr) + { + tracer_enabled = 1; +- save_tracer_enabled = 1; + } + + static void irqsoff_tracer_stop(struct trace_array *tr) + { + tracer_enabled = 0; +- save_tracer_enabled = 0; +-} +- +-static void irqsoff_tracer_open(struct trace_iterator *iter) +-{ +- /* stop the trace while dumping */ +- tracer_enabled = 0; +-} +- +-static void irqsoff_tracer_close(struct trace_iterator *iter) +-{ +- /* restart tracing */ +- tracer_enabled = save_tracer_enabled; + } + + #ifdef CONFIG_IRQSOFF_TRACER +@@ -431,8 +426,6 @@ static struct tracer irqsoff_tracer __re + .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, +- .open = irqsoff_tracer_open, +- .close = irqsoff_tracer_close, + .print_max = 1, + #ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_irqsoff, +@@ -459,8 +452,6 @@ static struct tracer preemptoff_tracer _ + .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, +- .open = irqsoff_tracer_open, +- .close = irqsoff_tracer_close, + .print_max = 1, + #ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptoff, +@@ -489,8 +480,6 @@ static struct tracer preemptirqsoff_trac + .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, +- .open = irqsoff_tracer_open, +- .close = irqsoff_tracer_close, + .print_max = 1, + #ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +Index: linux-2.6-tip/kernel/trace/trace_mmiotrace.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_mmiotrace.c ++++ linux-2.6-tip/kernel/trace/trace_mmiotrace.c +@@ -12,6 +12,7 @@ + #include + + #include "trace.h" ++#include "trace_output.h" + + struct header_iter { + struct pci_dev *dev; +@@ -183,21 +184,22 @@ static enum print_line_t mmio_print_rw(s + switch (rw->opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, +- "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", ++ "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, 
rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, +- "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", ++ "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, +- "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", ++ "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," ++ "%02lx 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, +@@ -229,14 +231,14 @@ static enum print_line_t mmio_print_map( + switch (m->opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, +- "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", ++ "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, + 0UL, 0); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, +- "UNMAP %lu.%06lu %d 0x%lx %d\n", ++ "UNMAP %u.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, 0); + break; + default: +@@ -255,18 +257,15 @@ static enum print_line_t mmio_print_mark + const char *msg = print->buf; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); +- unsigned long usec_rem = do_div(t, 1000000ULL); ++ unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + int ret; + + /* The trailing newline must be in the message. */ +- ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); ++ ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + +- if (entry->flags & TRACE_FLAG_CONT) +- trace_seq_print_cont(s, iter); +- + return TRACE_TYPE_HANDLED; + } + +@@ -308,21 +307,17 @@ static void __trace_mmiotrace_rw(struct + { + struct ring_buffer_event *event; + struct trace_mmiotrace_rw *entry; +- unsigned long irq_flags; ++ int pc = preempt_count(); + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, ++ sizeof(*entry), 0, pc); + if (!event) { + atomic_inc(&dropped_count); + return; + } + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, 0, preempt_count()); +- entry->ent.type = TRACE_MMIO_RW; + entry->rw = *rw; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- +- trace_wake_up(); ++ trace_buffer_unlock_commit(tr, event, 0, pc); + } + + void mmio_trace_rw(struct mmiotrace_rw *rw) +@@ -338,21 +333,17 @@ static void __trace_mmiotrace_map(struct + { + struct ring_buffer_event *event; + struct trace_mmiotrace_map *entry; +- unsigned long irq_flags; ++ int pc = preempt_count(); + +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); ++ event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, ++ sizeof(*entry), 0, pc); + if (!event) { + atomic_inc(&dropped_count); + return; + } + entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, 0, preempt_count()); +- entry->ent.type = TRACE_MMIO_MAP; + entry->map = *map; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- +- trace_wake_up(); ++ trace_buffer_unlock_commit(tr, event, 0, pc); + } + + void mmio_trace_mapping(struct mmiotrace_map *map) +@@ -368,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace + + int mmio_trace_printk(const char *fmt, va_list args) + { +- return trace_vprintk(0, -1, fmt, args); ++ return 
trace_vprintk(0, fmt, args); + } +Index: linux-2.6-tip/kernel/trace/trace_nop.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_nop.c ++++ linux-2.6-tip/kernel/trace/trace_nop.c +@@ -47,12 +47,7 @@ static void stop_nop_trace(struct trace_ + + static int nop_trace_init(struct trace_array *tr) + { +- int cpu; + ctx_trace = tr; +- +- for_each_online_cpu(cpu) +- tracing_reset(tr, cpu); +- + start_nop_trace(tr); + return 0; + } +@@ -96,6 +91,7 @@ struct tracer nop_trace __read_mostly = + .name = "nop", + .init = nop_trace_init, + .reset = nop_trace_reset, ++ .wait_pipe = poll_wait_pipe, + #ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_nop, + #endif +Index: linux-2.6-tip/kernel/trace/trace_output.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_output.c +@@ -0,0 +1,1027 @@ ++/* ++ * trace_output.c ++ * ++ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "trace_output.h" ++ ++/* must be a power of 2 */ ++#define EVENT_HASHSIZE 128 ++ ++static DEFINE_MUTEX(trace_event_mutex); ++static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; ++ ++static int next_event_type = __TRACE_LAST_TYPE + 1; ++ ++void trace_print_seq(struct seq_file *m, struct trace_seq *s) ++{ ++ int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; ++ ++ s->buffer[len] = 0; ++ seq_puts(m, s->buffer); ++ ++ trace_seq_init(s); ++} ++ ++enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct trace_entry *entry = iter->ent; ++ struct bprint_entry *field; ++ int ret; ++ ++ trace_assign_type(field, entry); ++ ++ ret = trace_seq_bprintf(s, field->fmt, field->buf); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct trace_entry *entry = iter->ent; ++ struct print_entry *field; ++ int ret; ++ ++ trace_assign_type(field, entry); ++ ++ ret = trace_seq_printf(s, "%s", field->buf); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++/** ++ * trace_seq_printf - sequence printing of trace information ++ * @s: trace sequence descriptor ++ * @fmt: printf format string ++ * ++ * The tracer may use either sequence operations or its own ++ * copy to user routines. To simplify formating of a trace ++ * trace_seq_printf is used to store strings into a special ++ * buffer (@s). Then the output may be either used by ++ * the sequencer or pulled into another buffer. ++ */ ++int ++trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
++{ ++ int len = (PAGE_SIZE - 1) - s->len; ++ va_list ap; ++ int ret; ++ ++ if (!len) ++ return 0; ++ ++ va_start(ap, fmt); ++ ret = vsnprintf(s->buffer + s->len, len, fmt, ap); ++ va_end(ap); ++ ++ /* If we can't write it all, don't bother writing anything */ ++ if (ret >= len) ++ return 0; ++ ++ s->len += ret; ++ ++ return len; ++} ++ ++int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) ++{ ++ int len = (PAGE_SIZE - 1) - s->len; ++ int ret; ++ ++ if (!len) ++ return 0; ++ ++ ret = bstr_printf(s->buffer + s->len, len, fmt, binary); ++ ++ /* If we can't write it all, don't bother writing anything */ ++ if (ret >= len) ++ return 0; ++ ++ s->len += ret; ++ ++ return len; ++} ++ ++/** ++ * trace_seq_puts - trace sequence printing of simple string ++ * @s: trace sequence descriptor ++ * @str: simple string to record ++ * ++ * The tracer may use either the sequence operations or its own ++ * copy to user routines. This function records a simple string ++ * into a special buffer (@s) for later retrieval by a sequencer ++ * or other mechanism. ++ */ ++int trace_seq_puts(struct trace_seq *s, const char *str) ++{ ++ int len = strlen(str); ++ ++ if (len > ((PAGE_SIZE - 1) - s->len)) ++ return 0; ++ ++ memcpy(s->buffer + s->len, str, len); ++ s->len += len; ++ ++ return len; ++} ++ ++int trace_seq_putc(struct trace_seq *s, unsigned char c) ++{ ++ if (s->len >= (PAGE_SIZE - 1)) ++ return 0; ++ ++ s->buffer[s->len++] = c; ++ ++ return 1; ++} ++ ++int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) ++{ ++ if (len > ((PAGE_SIZE - 1) - s->len)) ++ return 0; ++ ++ memcpy(s->buffer + s->len, mem, len); ++ s->len += len; ++ ++ return len; ++} ++ ++int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) ++{ ++ unsigned char hex[HEX_CHARS]; ++ unsigned char *data = mem; ++ int i, j; ++ ++#ifdef __BIG_ENDIAN ++ for (i = 0, j = 0; i < len; i++) { ++#else ++ for (i = len-1, j = 0; i >= 0; i--) { ++#endif ++ hex[j++] = hex_asc_hi(data[i]); ++ hex[j++] = hex_asc_lo(data[i]); ++ } ++ hex[j++] = ' '; ++ ++ return trace_seq_putmem(s, hex, j); ++} ++ ++void *trace_seq_reserve(struct trace_seq *s, size_t len) ++{ ++ void *ret; ++ ++ if (len > ((PAGE_SIZE - 1) - s->len)) ++ return NULL; ++ ++ ret = s->buffer + s->len; ++ s->len += len; ++ ++ return ret; ++} ++ ++int trace_seq_path(struct trace_seq *s, struct path *path) ++{ ++ unsigned char *p; ++ ++ if (s->len >= (PAGE_SIZE - 1)) ++ return 0; ++ p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); ++ if (!IS_ERR(p)) { ++ p = mangle_path(s->buffer + s->len, p, "\n"); ++ if (p) { ++ s->len = p - s->buffer; ++ return 1; ++ } ++ } else { ++ s->buffer[s->len++] = '?'; ++ return 1; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_KRETPROBES ++static inline const char *kretprobed(const char *name) ++{ ++ static const char tramp_name[] = "kretprobe_trampoline"; ++ int size = sizeof(tramp_name); ++ ++ if (strncmp(tramp_name, name, size) == 0) ++ return "[unknown/kretprobe'd]"; ++ return name; ++} ++#else ++static inline const char *kretprobed(const char *name) ++{ ++ return name; ++} ++#endif /* CONFIG_KRETPROBES */ ++ ++static int ++seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) ++{ ++#ifdef CONFIG_KALLSYMS ++ char str[KSYM_SYMBOL_LEN]; ++ const char *name; ++ ++ kallsyms_lookup(address, NULL, NULL, NULL, str); ++ ++ name = kretprobed(str); ++ ++ return trace_seq_printf(s, fmt, name); ++#endif ++ return 1; ++} ++ ++static int ++seq_print_sym_offset(struct trace_seq *s, const char *fmt, ++ 
unsigned long address) ++{ ++#ifdef CONFIG_KALLSYMS ++ char str[KSYM_SYMBOL_LEN]; ++ const char *name; ++ ++ sprint_symbol(str, address); ++ name = kretprobed(str); ++ ++ return trace_seq_printf(s, fmt, name); ++#endif ++ return 1; ++} ++ ++#ifndef CONFIG_64BIT ++# define IP_FMT "%08lx" ++#else ++# define IP_FMT "%016lx" ++#endif ++ ++int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, ++ unsigned long ip, unsigned long sym_flags) ++{ ++ struct file *file = NULL; ++ unsigned long vmstart = 0; ++ int ret = 1; ++ ++ if (mm) { ++ const struct vm_area_struct *vma; ++ ++ down_read(&mm->mmap_sem); ++ vma = find_vma(mm, ip); ++ if (vma) { ++ file = vma->vm_file; ++ vmstart = vma->vm_start; ++ } ++ if (file) { ++ ret = trace_seq_path(s, &file->f_path); ++ if (ret) ++ ret = trace_seq_printf(s, "[+0x%lx]", ++ ip - vmstart); ++ } ++ up_read(&mm->mmap_sem); ++ } ++ if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) ++ ret = trace_seq_printf(s, " <" IP_FMT ">", ip); ++ return ret; ++} ++ ++int ++seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, ++ unsigned long sym_flags) ++{ ++ struct mm_struct *mm = NULL; ++ int ret = 1; ++ unsigned int i; ++ ++ if (trace_flags & TRACE_ITER_SYM_USEROBJ) { ++ struct task_struct *task; ++ /* ++ * we do the lookup on the thread group leader, ++ * since individual threads might have already quit! ++ */ ++ rcu_read_lock(); ++ task = find_task_by_vpid(entry->ent.tgid); ++ if (task) ++ mm = get_task_mm(task); ++ rcu_read_unlock(); ++ } ++ ++ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { ++ unsigned long ip = entry->caller[i]; ++ ++ if (ip == ULONG_MAX || !ret) ++ break; ++ if (i && ret) ++ ret = trace_seq_puts(s, " <- "); ++ if (!ip) { ++ if (ret) ++ ret = trace_seq_puts(s, "??"); ++ continue; ++ } ++ if (!ret) ++ break; ++ if (ret) ++ ret = seq_print_user_ip(s, mm, ip, sym_flags); ++ } ++ ++ if (mm) ++ mmput(mm); ++ return ret; ++} ++ ++int ++seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) ++{ ++ int ret; ++ ++ if (!ip) ++ return trace_seq_printf(s, "0"); ++ ++ if (sym_flags & TRACE_ITER_SYM_OFFSET) ++ ret = seq_print_sym_offset(s, "%s", ip); ++ else ++ ret = seq_print_sym_short(s, "%s", ip); ++ ++ if (!ret) ++ return 0; ++ ++ if (sym_flags & TRACE_ITER_SYM_ADDR) ++ ret = trace_seq_printf(s, " <" IP_FMT ">", ip); ++ return ret; ++} ++ ++static int ++lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) ++{ ++ int hardirq, softirq; ++ char comm[TASK_COMM_LEN]; ++ ++ trace_find_cmdline(entry->pid, comm); ++ hardirq = entry->flags & TRACE_FLAG_HARDIRQ; ++ softirq = entry->flags & TRACE_FLAG_SOFTIRQ; ++ ++ if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", ++ comm, entry->pid, cpu, ++ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : ++ (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? ++ 'X' : '.', ++ (entry->flags & TRACE_FLAG_NEED_RESCHED) ? ++ 'N' : '.', ++ (hardirq && softirq) ? 'H' : ++ hardirq ? 'h' : softirq ? 's' : '.')) ++ return 0; ++ ++ if (entry->preempt_count) ++ return trace_seq_printf(s, "%x", entry->preempt_count); ++ return trace_seq_puts(s, "."); ++} ++ ++static unsigned long preempt_mark_thresh = 100; ++ ++static int ++lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, ++ unsigned long rel_usecs) ++{ ++ return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, ++ rel_usecs > preempt_mark_thresh ? '!' : ++ rel_usecs > 1 ? 
'+' : ' '); ++} ++ ++int trace_print_context(struct trace_iterator *iter) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct trace_entry *entry = iter->ent; ++ unsigned long long t = ns2usecs(iter->ts); ++ unsigned long usec_rem = do_div(t, USEC_PER_SEC); ++ unsigned long secs = (unsigned long)t; ++ char comm[TASK_COMM_LEN]; ++ ++ trace_find_cmdline(entry->pid, comm); ++ ++ return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", ++ comm, entry->pid, iter->cpu, secs, usec_rem); ++} ++ ++int trace_print_lat_context(struct trace_iterator *iter) ++{ ++ u64 next_ts; ++ int ret; ++ struct trace_seq *s = &iter->seq; ++ struct trace_entry *entry = iter->ent, ++ *next_entry = trace_find_next_entry(iter, NULL, ++ &next_ts); ++ unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); ++ unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); ++ unsigned long rel_usecs; ++ ++ if (!next_entry) ++ next_ts = iter->ts; ++ rel_usecs = ns2usecs(next_ts - iter->ts); ++ ++ if (verbose) { ++ char comm[TASK_COMM_LEN]; ++ ++ trace_find_cmdline(entry->pid, comm); ++ ++ ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" ++ " %ld.%03ldms (+%ld.%03ldms): ", comm, ++ entry->pid, iter->cpu, entry->flags, ++ entry->preempt_count, iter->idx, ++ ns2usecs(iter->ts), ++ abs_usecs / USEC_PER_MSEC, ++ abs_usecs % USEC_PER_MSEC, ++ rel_usecs / USEC_PER_MSEC, ++ rel_usecs % USEC_PER_MSEC); ++ } else { ++ ret = lat_print_generic(s, entry, iter->cpu); ++ if (ret) ++ ret = lat_print_timestamp(s, abs_usecs, rel_usecs); ++ } ++ ++ return ret; ++} ++ ++static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; ++ ++static int task_state_char(unsigned long state) ++{ ++ int bit = state ? __ffs(state) + 1 : 0; ++ ++ return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; ++} ++ ++/** ++ * ftrace_find_event - find a registered event ++ * @type: the type of event to look for ++ * ++ * Returns an event of type @type otherwise NULL ++ */ ++struct trace_event *ftrace_find_event(int type) ++{ ++ struct trace_event *event; ++ struct hlist_node *n; ++ unsigned key; ++ ++ key = type & (EVENT_HASHSIZE - 1); ++ ++ hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { ++ if (event->type == type) ++ return event; ++ } ++ ++ return NULL; ++} ++ ++/** ++ * register_ftrace_event - register output for an event type ++ * @event: the event type to register ++ * ++ * Event types are stored in a hash and this hash is used to ++ * find a way to print an event. If the @event->type is set ++ * then it will use that type, otherwise it will assign a ++ * type to use. ++ * ++ * If you assign your own type, please make sure it is added ++ * to the trace_type enum in trace.h, to avoid collisions ++ * with the dynamic types. ++ * ++ * Returns the event type number or zero on error. 
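++ * (Calling this with a NULL @event only reserves and returns the next
++ * free event type number.)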
++ */ ++int register_ftrace_event(struct trace_event *event) ++{ ++ unsigned key; ++ int ret = 0; ++ ++ mutex_lock(&trace_event_mutex); ++ ++ if (!event) { ++ ret = next_event_type++; ++ goto out; ++ } ++ ++ if (!event->type) ++ event->type = next_event_type++; ++ else if (event->type > __TRACE_LAST_TYPE) { ++ printk(KERN_WARNING "Need to add type to trace.h\n"); ++ WARN_ON(1); ++ } ++ ++ if (ftrace_find_event(event->type)) ++ goto out; ++ ++ if (event->trace == NULL) ++ event->trace = trace_nop_print; ++ if (event->raw == NULL) ++ event->raw = trace_nop_print; ++ if (event->hex == NULL) ++ event->hex = trace_nop_print; ++ if (event->binary == NULL) ++ event->binary = trace_nop_print; ++ ++ key = event->type & (EVENT_HASHSIZE - 1); ++ ++ hlist_add_head_rcu(&event->node, &event_hash[key]); ++ ++ ret = event->type; ++ out: ++ mutex_unlock(&trace_event_mutex); ++ ++ return ret; ++} ++ ++/** ++ * unregister_ftrace_event - remove a no longer used event ++ * @event: the event to remove ++ */ ++int unregister_ftrace_event(struct trace_event *event) ++{ ++ mutex_lock(&trace_event_mutex); ++ hlist_del(&event->node); ++ mutex_unlock(&trace_event_mutex); ++ ++ return 0; ++} ++ ++/* ++ * Standard events ++ */ ++ ++enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) ++{ ++ return TRACE_TYPE_HANDLED; ++} ++ ++/* TRACE_FN */ ++static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) ++{ ++ struct ftrace_entry *field; ++ struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!seq_print_ip_sym(s, field->ip, flags)) ++ goto partial; ++ ++ if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { ++ if (!trace_seq_printf(s, " <-")) ++ goto partial; ++ if (!seq_print_ip_sym(s, ++ field->parent_ip, ++ flags)) ++ goto partial; ++ } ++ if (!trace_seq_printf(s, "\n")) ++ goto partial; ++ ++ return TRACE_TYPE_HANDLED; ++ ++ partial: ++ return TRACE_TYPE_PARTIAL_LINE; ++} ++ ++static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) ++{ ++ struct ftrace_entry *field; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!trace_seq_printf(&iter->seq, "%lx %lx\n", ++ field->ip, ++ field->parent_ip)) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) ++{ ++ struct ftrace_entry *field; ++ struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ SEQ_PUT_HEX_FIELD_RET(s, field->ip); ++ SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) ++{ ++ struct ftrace_entry *field; ++ struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ SEQ_PUT_FIELD_RET(s, field->ip); ++ SEQ_PUT_FIELD_RET(s, field->parent_ip); ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static struct trace_event trace_fn_event = { ++ .type = TRACE_FN, ++ .trace = trace_fn_trace, ++ .raw = trace_fn_raw, ++ .hex = trace_fn_hex, ++ .binary = trace_fn_bin, ++}; ++ ++/* TRACE_CTX an TRACE_WAKE */ ++static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, ++ char *delim) ++{ ++ struct ctx_switch_entry *field; ++ char comm[TASK_COMM_LEN]; ++ int S, T; ++ ++ ++ trace_assign_type(field, iter->ent); ++ ++ T = task_state_char(field->next_state); ++ S = task_state_char(field->prev_state); ++ trace_find_cmdline(field->next_pid, comm); ++ if (!trace_seq_printf(&iter->seq, ++ " %5d:%3d:%c %s [%03d] 
%5d:%3d:%c %s\n", ++ field->prev_pid, ++ field->prev_prio, ++ S, delim, ++ field->next_cpu, ++ field->next_pid, ++ field->next_prio, ++ T, comm)) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) ++{ ++ return trace_ctxwake_print(iter, "==>"); ++} ++ ++static enum print_line_t trace_wake_print(struct trace_iterator *iter, ++ int flags) ++{ ++ return trace_ctxwake_print(iter, " +"); ++} ++ ++static int trace_ctxwake_raw(struct trace_iterator *iter, char S) ++{ ++ struct ctx_switch_entry *field; ++ int T; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!S) ++ task_state_char(field->prev_state); ++ T = task_state_char(field->next_state); ++ if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", ++ field->prev_pid, ++ field->prev_prio, ++ S, ++ field->next_cpu, ++ field->next_pid, ++ field->next_prio, ++ T)) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) ++{ ++ return trace_ctxwake_raw(iter, 0); ++} ++ ++static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) ++{ ++ return trace_ctxwake_raw(iter, '+'); ++} ++ ++ ++static int trace_ctxwake_hex(struct trace_iterator *iter, char S) ++{ ++ struct ctx_switch_entry *field; ++ struct trace_seq *s = &iter->seq; ++ int T; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!S) ++ task_state_char(field->prev_state); ++ T = task_state_char(field->next_state); ++ ++ SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); ++ SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); ++ SEQ_PUT_HEX_FIELD_RET(s, S); ++ SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); ++ SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); ++ SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); ++ SEQ_PUT_HEX_FIELD_RET(s, T); ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) ++{ ++ return trace_ctxwake_hex(iter, 0); ++} ++ ++static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) ++{ ++ return trace_ctxwake_hex(iter, '+'); ++} ++ ++static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, ++ int flags) ++{ ++ struct ctx_switch_entry *field; ++ struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ SEQ_PUT_FIELD_RET(s, field->prev_pid); ++ SEQ_PUT_FIELD_RET(s, field->prev_prio); ++ SEQ_PUT_FIELD_RET(s, field->prev_state); ++ SEQ_PUT_FIELD_RET(s, field->next_pid); ++ SEQ_PUT_FIELD_RET(s, field->next_prio); ++ SEQ_PUT_FIELD_RET(s, field->next_state); ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static struct trace_event trace_ctx_event = { ++ .type = TRACE_CTX, ++ .trace = trace_ctx_print, ++ .raw = trace_ctx_raw, ++ .hex = trace_ctx_hex, ++ .binary = trace_ctxwake_bin, ++}; ++ ++static struct trace_event trace_wake_event = { ++ .type = TRACE_WAKE, ++ .trace = trace_wake_print, ++ .raw = trace_wake_raw, ++ .hex = trace_wake_hex, ++ .binary = trace_ctxwake_bin, ++}; ++ ++/* TRACE_SPECIAL */ ++static enum print_line_t trace_special_print(struct trace_iterator *iter, ++ int flags) ++{ ++ struct special_entry *field; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", ++ field->arg1, ++ field->arg2, ++ field->arg3)) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t trace_special_hex(struct trace_iterator *iter, ++ int flags) ++{ ++ struct special_entry 
*field; ++ struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ SEQ_PUT_HEX_FIELD_RET(s, field->arg1); ++ SEQ_PUT_HEX_FIELD_RET(s, field->arg2); ++ SEQ_PUT_HEX_FIELD_RET(s, field->arg3); ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static enum print_line_t trace_special_bin(struct trace_iterator *iter, ++ int flags) ++{ ++ struct special_entry *field; ++ struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ SEQ_PUT_FIELD_RET(s, field->arg1); ++ SEQ_PUT_FIELD_RET(s, field->arg2); ++ SEQ_PUT_FIELD_RET(s, field->arg3); ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++static struct trace_event trace_special_event = { ++ .type = TRACE_SPECIAL, ++ .trace = trace_special_print, ++ .raw = trace_special_print, ++ .hex = trace_special_hex, ++ .binary = trace_special_bin, ++}; ++ ++/* TRACE_STACK */ ++ ++static enum print_line_t trace_stack_print(struct trace_iterator *iter, ++ int flags) ++{ ++ struct stack_entry *field; ++ struct trace_seq *s = &iter->seq; ++ int i; ++ ++ trace_assign_type(field, iter->ent); ++ ++ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { ++ if (i) { ++ if (!trace_seq_puts(s, " <= ")) ++ goto partial; ++ ++ if (!seq_print_ip_sym(s, field->caller[i], flags)) ++ goto partial; ++ } ++ if (!trace_seq_puts(s, "\n")) ++ goto partial; ++ } ++ ++ return TRACE_TYPE_HANDLED; ++ ++ partial: ++ return TRACE_TYPE_PARTIAL_LINE; ++} ++ ++static struct trace_event trace_stack_event = { ++ .type = TRACE_STACK, ++ .trace = trace_stack_print, ++ .raw = trace_special_print, ++ .hex = trace_special_hex, ++ .binary = trace_special_bin, ++}; ++ ++/* TRACE_USER_STACK */ ++static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, ++ int flags) ++{ ++ struct userstack_entry *field; ++ struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!seq_print_userip_objs(field, s, flags)) ++ goto partial; ++ ++ if (!trace_seq_putc(s, '\n')) ++ goto partial; ++ ++ return TRACE_TYPE_HANDLED; ++ ++ partial: ++ return TRACE_TYPE_PARTIAL_LINE; ++} ++ ++static struct trace_event trace_user_stack_event = { ++ .type = TRACE_USER_STACK, ++ .trace = trace_user_stack_print, ++ .raw = trace_special_print, ++ .hex = trace_special_hex, ++ .binary = trace_special_bin, ++}; ++ ++/* TRACE_BPRINT */ ++static enum print_line_t ++trace_bprint_print(struct trace_iterator *iter, int flags) ++{ ++ struct trace_entry *entry = iter->ent; ++ struct trace_seq *s = &iter->seq; ++ struct bprint_entry *field; ++ ++ trace_assign_type(field, entry); ++ ++ if (!seq_print_ip_sym(s, field->ip, flags)) ++ goto partial; ++ ++ if (!trace_seq_puts(s, ": ")) ++ goto partial; ++ ++ if (!trace_seq_bprintf(s, field->fmt, field->buf)) ++ goto partial; ++ ++ return TRACE_TYPE_HANDLED; ++ ++ partial: ++ return TRACE_TYPE_PARTIAL_LINE; ++} ++ ++ ++static enum print_line_t ++trace_bprint_raw(struct trace_iterator *iter, int flags) ++{ ++ struct bprint_entry *field; ++ struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!trace_seq_printf(s, ": %lx : ", field->ip)) ++ goto partial; ++ ++ if (!trace_seq_bprintf(s, field->fmt, field->buf)) ++ goto partial; ++ ++ return TRACE_TYPE_HANDLED; ++ ++ partial: ++ return TRACE_TYPE_PARTIAL_LINE; ++} ++ ++ ++static struct trace_event trace_bprint_event = { ++ .type = TRACE_BPRINT, ++ .trace = trace_bprint_print, ++ .raw = trace_bprint_raw, ++}; ++ ++/* TRACE_PRINT */ ++static enum print_line_t trace_print_print(struct trace_iterator *iter, ++ int flags) ++{ ++ struct print_entry *field; ++ 
struct trace_seq *s = &iter->seq; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!seq_print_ip_sym(s, field->ip, flags)) ++ goto partial; ++ ++ if (!trace_seq_printf(s, ": %s", field->buf)) ++ goto partial; ++ ++ return TRACE_TYPE_HANDLED; ++ ++ partial: ++ return TRACE_TYPE_PARTIAL_LINE; ++} ++ ++static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) ++{ ++ struct print_entry *field; ++ ++ trace_assign_type(field, iter->ent); ++ ++ if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) ++ goto partial; ++ ++ return TRACE_TYPE_HANDLED; ++ ++ partial: ++ return TRACE_TYPE_PARTIAL_LINE; ++} ++ ++static struct trace_event trace_print_event = { ++ .type = TRACE_PRINT, ++ .trace = trace_print_print, ++ .raw = trace_print_raw, ++}; ++ ++ ++static struct trace_event *events[] __initdata = { ++ &trace_fn_event, ++ &trace_ctx_event, ++ &trace_wake_event, ++ &trace_special_event, ++ &trace_stack_event, ++ &trace_user_stack_event, ++ &trace_bprint_event, ++ &trace_print_event, ++ NULL ++}; ++ ++__init static int init_events(void) ++{ ++ struct trace_event *event; ++ int i, ret; ++ ++ for (i = 0; events[i]; i++) { ++ event = events[i]; ++ ++ ret = register_ftrace_event(event); ++ if (!ret) { ++ printk(KERN_WARNING "event %d failed to register\n", ++ event->type); ++ WARN_ON_ONCE(1); ++ } ++ } ++ ++ return 0; ++} ++device_initcall(init_events); +Index: linux-2.6-tip/kernel/trace/trace_output.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_output.h +@@ -0,0 +1,72 @@ ++#ifndef __TRACE_EVENTS_H ++#define __TRACE_EVENTS_H ++ ++#include "trace.h" ++ ++typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, ++ int flags); ++ ++struct trace_event { ++ struct hlist_node node; ++ int type; ++ trace_print_func trace; ++ trace_print_func raw; ++ trace_print_func hex; ++ trace_print_func binary; ++}; ++ ++extern enum print_line_t ++trace_print_bprintk_msg_only(struct trace_iterator *iter); ++extern enum print_line_t ++trace_print_printk_msg_only(struct trace_iterator *iter); ++ ++extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); ++ ++extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
++ __attribute__ ((format (printf, 2, 3))); ++extern int ++trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); ++extern int ++seq_print_ip_sym(struct trace_seq *s, unsigned long ip, ++ unsigned long sym_flags); ++extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, ++ size_t cnt); ++extern int trace_seq_puts(struct trace_seq *s, const char *str); ++extern int trace_seq_putc(struct trace_seq *s, unsigned char c); ++extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); ++extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); ++extern void *trace_seq_reserve(struct trace_seq *s, size_t len); ++extern int trace_seq_path(struct trace_seq *s, struct path *path); ++extern int seq_print_userip_objs(const struct userstack_entry *entry, ++ struct trace_seq *s, unsigned long sym_flags); ++extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, ++ unsigned long ip, unsigned long sym_flags); ++ ++extern int trace_print_context(struct trace_iterator *iter); ++extern int trace_print_lat_context(struct trace_iterator *iter); ++ ++extern struct trace_event *ftrace_find_event(int type); ++extern int register_ftrace_event(struct trace_event *event); ++extern int unregister_ftrace_event(struct trace_event *event); ++ ++extern enum print_line_t trace_nop_print(struct trace_iterator *iter, ++ int flags); ++ ++#define MAX_MEMHEX_BYTES 8 ++#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) ++ ++#define SEQ_PUT_FIELD_RET(s, x) \ ++do { \ ++ if (!trace_seq_putmem(s, &(x), sizeof(x))) \ ++ return TRACE_TYPE_PARTIAL_LINE; \ ++} while (0) ++ ++#define SEQ_PUT_HEX_FIELD_RET(s, x) \ ++do { \ ++ BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ ++ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ ++ return TRACE_TYPE_PARTIAL_LINE; \ ++} while (0) ++ ++#endif ++ +Index: linux-2.6-tip/kernel/trace/trace_power.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_power.c ++++ linux-2.6-tip/kernel/trace/trace_power.c +@@ -11,15 +11,113 @@ + + #include + #include +-#include ++#include + #include + #include + + #include "trace.h" ++#include "trace_output.h" + + static struct trace_array *power_trace; + static int __read_mostly trace_power_enabled; + ++static void probe_power_start(struct power_trace *it, unsigned int type, ++ unsigned int level) ++{ ++ if (!trace_power_enabled) ++ return; ++ ++ memset(it, 0, sizeof(struct power_trace)); ++ it->state = level; ++ it->type = type; ++ it->stamp = ktime_get(); ++} ++ ++ ++static void probe_power_end(struct power_trace *it) ++{ ++ struct ring_buffer_event *event; ++ struct trace_power *entry; ++ struct trace_array_cpu *data; ++ struct trace_array *tr = power_trace; ++ ++ if (!trace_power_enabled) ++ return; ++ ++ preempt_disable(); ++ it->end = ktime_get(); ++ data = tr->data[smp_processor_id()]; ++ ++ event = trace_buffer_lock_reserve(tr, TRACE_POWER, ++ sizeof(*entry), 0, 0); ++ if (!event) ++ goto out; ++ entry = ring_buffer_event_data(event); ++ entry->state_data = *it; ++ trace_buffer_unlock_commit(tr, event, 0, 0); ++ out: ++ preempt_enable(); ++} ++ ++static void probe_power_mark(struct power_trace *it, unsigned int type, ++ unsigned int level) ++{ ++ struct ring_buffer_event *event; ++ struct trace_power *entry; ++ struct trace_array_cpu *data; ++ struct trace_array *tr = power_trace; ++ ++ if (!trace_power_enabled) ++ return; ++ ++ memset(it, 0, sizeof(struct power_trace)); ++ it->state = level; ++ it->type = type; ++ it->stamp 
= ktime_get(); ++ preempt_disable(); ++ it->end = it->stamp; ++ data = tr->data[smp_processor_id()]; ++ ++ event = trace_buffer_lock_reserve(tr, TRACE_POWER, ++ sizeof(*entry), 0, 0); ++ if (!event) ++ goto out; ++ entry = ring_buffer_event_data(event); ++ entry->state_data = *it; ++ trace_buffer_unlock_commit(tr, event, 0, 0); ++ out: ++ preempt_enable(); ++} ++ ++static int tracing_power_register(void) ++{ ++ int ret; ++ ++ ret = register_trace_power_start(probe_power_start); ++ if (ret) { ++ pr_info("power trace: Couldn't activate tracepoint" ++ " probe to trace_power_start\n"); ++ return ret; ++ } ++ ret = register_trace_power_end(probe_power_end); ++ if (ret) { ++ pr_info("power trace: Couldn't activate tracepoint" ++ " probe to trace_power_end\n"); ++ goto fail_start; ++ } ++ ret = register_trace_power_mark(probe_power_mark); ++ if (ret) { ++ pr_info("power trace: Couldn't activate tracepoint" ++ " probe to trace_power_mark\n"); ++ goto fail_end; ++ } ++ return ret; ++fail_end: ++ unregister_trace_power_end(probe_power_end); ++fail_start: ++ unregister_trace_power_start(probe_power_start); ++ return ret; ++} + + static void start_power_trace(struct trace_array *tr) + { +@@ -31,6 +129,14 @@ static void stop_power_trace(struct trac + trace_power_enabled = 0; + } + ++static void power_trace_reset(struct trace_array *tr) ++{ ++ trace_power_enabled = 0; ++ unregister_trace_power_start(probe_power_start); ++ unregister_trace_power_end(probe_power_end); ++ unregister_trace_power_mark(probe_power_mark); ++} ++ + + static int power_trace_init(struct trace_array *tr) + { +@@ -38,6 +144,7 @@ static int power_trace_init(struct trace + power_trace = tr; + + trace_power_enabled = 1; ++ tracing_power_register(); + + for_each_cpu(cpu, cpu_possible_mask) + tracing_reset(tr, cpu); +@@ -85,7 +192,7 @@ static struct tracer power_tracer __read + .init = power_trace_init, + .start = start_power_trace, + .stop = stop_power_trace, +- .reset = stop_power_trace, ++ .reset = power_trace_reset, + .print_line = power_print_line, + }; + +@@ -94,86 +201,3 @@ static int init_power_trace(void) + return register_tracer(&power_tracer); + } + device_initcall(init_power_trace); +- +-void trace_power_start(struct power_trace *it, unsigned int type, +- unsigned int level) +-{ +- if (!trace_power_enabled) +- return; +- +- memset(it, 0, sizeof(struct power_trace)); +- it->state = level; +- it->type = type; +- it->stamp = ktime_get(); +-} +-EXPORT_SYMBOL_GPL(trace_power_start); +- +- +-void trace_power_end(struct power_trace *it) +-{ +- struct ring_buffer_event *event; +- struct trace_power *entry; +- struct trace_array_cpu *data; +- unsigned long irq_flags; +- struct trace_array *tr = power_trace; +- +- if (!trace_power_enabled) +- return; +- +- preempt_disable(); +- it->end = ktime_get(); +- data = tr->data[smp_processor_id()]; +- +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); +- if (!event) +- goto out; +- entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, 0, 0); +- entry->ent.type = TRACE_POWER; +- entry->state_data = *it; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- +- trace_wake_up(); +- +- out: +- preempt_enable(); +-} +-EXPORT_SYMBOL_GPL(trace_power_end); +- +-void trace_power_mark(struct power_trace *it, unsigned int type, +- unsigned int level) +-{ +- struct ring_buffer_event *event; +- struct trace_power *entry; +- struct trace_array_cpu *data; +- unsigned long irq_flags; +- struct trace_array *tr = power_trace; +- +- if 
(!trace_power_enabled) +- return; +- +- memset(it, 0, sizeof(struct power_trace)); +- it->state = level; +- it->type = type; +- it->stamp = ktime_get(); +- preempt_disable(); +- it->end = it->stamp; +- data = tr->data[smp_processor_id()]; +- +- event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), +- &irq_flags); +- if (!event) +- goto out; +- entry = ring_buffer_event_data(event); +- tracing_generic_entry_update(&entry->ent, 0, 0); +- entry->ent.type = TRACE_POWER; +- entry->state_data = *it; +- ring_buffer_unlock_commit(tr->buffer, event, irq_flags); +- +- trace_wake_up(); +- +- out: +- preempt_enable(); +-} +-EXPORT_SYMBOL_GPL(trace_power_mark); +Index: linux-2.6-tip/kernel/trace/trace_printk.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_printk.c +@@ -0,0 +1,270 @@ ++/* ++ * trace binary printk ++ * ++ * Copyright (C) 2008 Lai Jiangshan ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "trace.h" ++ ++#ifdef CONFIG_MODULES ++ ++/* ++ * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt ++ * which are queued on trace_bprintk_fmt_list. ++ */ ++static LIST_HEAD(trace_bprintk_fmt_list); ++ ++/* serialize accesses to trace_bprintk_fmt_list */ ++static DEFINE_MUTEX(btrace_mutex); ++ ++struct trace_bprintk_fmt { ++ struct list_head list; ++ char fmt[0]; ++}; ++ ++static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) ++{ ++ struct trace_bprintk_fmt *pos; ++ list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { ++ if (!strcmp(pos->fmt, fmt)) ++ return pos; ++ } ++ return NULL; ++} ++ ++static ++void hold_module_trace_bprintk_format(const char **start, const char **end) ++{ ++ const char **iter; ++ ++ mutex_lock(&btrace_mutex); ++ for (iter = start; iter < end; iter++) { ++ struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); ++ if (tb_fmt) { ++ *iter = tb_fmt->fmt; ++ continue; ++ } ++ ++ tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) ++ + strlen(*iter) + 1, GFP_KERNEL); ++ if (tb_fmt) { ++ list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); ++ strcpy(tb_fmt->fmt, *iter); ++ *iter = tb_fmt->fmt; ++ } else ++ *iter = NULL; ++ } ++ mutex_unlock(&btrace_mutex); ++} ++ ++static int module_trace_bprintk_format_notify(struct notifier_block *self, ++ unsigned long val, void *data) ++{ ++ struct module *mod = data; ++ if (mod->num_trace_bprintk_fmt) { ++ const char **start = mod->trace_bprintk_fmt_start; ++ const char **end = start + mod->num_trace_bprintk_fmt; ++ ++ if (val == MODULE_STATE_COMING) ++ hold_module_trace_bprintk_format(start, end); ++ } ++ return 0; ++} ++ ++#else /* !CONFIG_MODULES */ ++__init static int ++module_trace_bprintk_format_notify(struct notifier_block *self, ++ unsigned long val, void *data) ++{ ++ return 0; ++} ++#endif /* CONFIG_MODULES */ ++ ++ ++__initdata_or_module static ++struct notifier_block module_trace_bprintk_format_nb = { ++ .notifier_call = module_trace_bprintk_format_notify, ++}; ++ ++int __trace_bprintk(unsigned long ip, const char *fmt, ...) 
++ { ++ int ret; ++ va_list ap; ++ ++ if (unlikely(!fmt)) ++ return 0; ++ ++ if (!(trace_flags & TRACE_ITER_PRINTK)) ++ return 0; ++ ++ va_start(ap, fmt); ++ ret = trace_vbprintk(ip, fmt, ap); ++ va_end(ap); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(__trace_bprintk); ++ ++int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) ++ { ++ if (unlikely(!fmt)) ++ return 0; ++ ++ if (!(trace_flags & TRACE_ITER_PRINTK)) ++ return 0; ++ ++ return trace_vbprintk(ip, fmt, ap); ++} ++EXPORT_SYMBOL_GPL(__ftrace_vbprintk); ++ ++int __trace_printk(unsigned long ip, const char *fmt, ...) ++{ ++ int ret; ++ va_list ap; ++ ++ if (!(trace_flags & TRACE_ITER_PRINTK)) ++ return 0; ++ ++ va_start(ap, fmt); ++ ret = trace_vprintk(ip, fmt, ap); ++ va_end(ap); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(__trace_printk); ++ ++int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) ++{ ++ if (!(trace_flags & TRACE_ITER_PRINTK)) ++ return 0; ++ ++ return trace_vprintk(ip, fmt, ap); ++} ++EXPORT_SYMBOL_GPL(__ftrace_vprintk); ++ ++static void * ++t_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ const char **fmt = m->private; ++ const char **next = fmt; ++ ++ (*pos)++; ++ ++ if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) ++ return NULL; ++ ++ next = fmt; ++ m->private = ++next; ++ ++ return fmt; ++} ++ ++static void *t_start(struct seq_file *m, loff_t *pos) ++{ ++ return t_next(m, NULL, pos); ++} ++ ++static int t_show(struct seq_file *m, void *v) ++{ ++ const char **fmt = v; ++ const char *str = *fmt; ++ int i; ++ ++ seq_printf(m, "0x%lx : \"", (unsigned long)fmt); ++ ++ /* ++ * Tabs and new lines need to be converted. ++ */ ++ for (i = 0; str[i]; i++) { ++ switch (str[i]) { ++ case '\n': ++ seq_puts(m, "\\n"); ++ break; ++ case '\t': ++ seq_puts(m, "\\t"); ++ break; ++ case '\\': ++ seq_puts(m, "\\"); ++ break; ++ case '"': ++ seq_puts(m, "\\\""); ++ break; ++ default: ++ seq_putc(m, str[i]); ++ } ++ } ++ seq_puts(m, "\"\n"); ++ ++ return 0; ++} ++ ++static void t_stop(struct seq_file *m, void *p) ++{ ++} ++ ++static const struct seq_operations show_format_seq_ops = { ++ .start = t_start, ++ .next = t_next, ++ .show = t_show, ++ .stop = t_stop, ++}; ++ ++static int ++ftrace_formats_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ ++ ret = seq_open(file, &show_format_seq_ops); ++ if (!ret) { ++ struct seq_file *m = file->private_data; ++ ++ m->private = __start___trace_bprintk_fmt; ++ } ++ return ret; ++} ++ ++static const struct file_operations ftrace_formats_fops = { ++ .open = ftrace_formats_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static __init int init_trace_printk_function_export(void) ++{ ++ struct dentry *d_tracer; ++ struct dentry *entry; ++ ++ d_tracer = tracing_init_dentry(); ++ if (!d_tracer) ++ return 0; ++ ++ entry = debugfs_create_file("printk_formats", 0444, d_tracer, ++ NULL, &ftrace_formats_fops); ++ if (!entry) ++ pr_warning("Could not create debugfs " ++ "'printk_formats' entry\n"); ++ ++ return 0; ++} ++ ++fs_initcall(init_trace_printk_function_export); ++ ++static __init int init_trace_printk(void) ++{ ++ return register_module_notifier(&module_trace_bprintk_format_nb); ++} ++ ++early_initcall(init_trace_printk); +Index: linux-2.6-tip/kernel/trace/trace_sched_switch.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_sched_switch.c ++++ linux-2.6-tip/kernel/trace/trace_sched_switch.c +@@ -18,6 +18,7 @@ static struct trace_array 
*ctx_trace; + static int __read_mostly tracer_enabled; + static int sched_ref; + static DEFINE_MUTEX(sched_register_mutex); ++static int sched_stopped; + + static void + probe_sched_switch(struct rq *__rq, struct task_struct *prev, +@@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, stru + int cpu; + int pc; + +- if (!sched_ref) ++ if (!sched_ref || sched_stopped) + return; + + tracing_record_cmdline(prev); +@@ -43,7 +44,7 @@ probe_sched_switch(struct rq *__rq, stru + data = ctx_trace->data[cpu]; + + if (likely(!atomic_read(&data->disabled))) +- tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); ++ tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); + + local_irq_restore(flags); + } +@@ -66,7 +67,7 @@ probe_sched_wakeup(struct rq *__rq, stru + data = ctx_trace->data[cpu]; + + if (likely(!atomic_read(&data->disabled))) +- tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, ++ tracing_sched_wakeup_trace(ctx_trace, wakee, current, + flags, pc); + + local_irq_restore(flags); +@@ -93,7 +94,7 @@ static int tracing_sched_register(void) + ret = register_trace_sched_switch(probe_sched_switch); + if (ret) { + pr_info("sched trace: Couldn't activate tracepoint" +- " probe to kernel_sched_schedule\n"); ++ " probe to kernel_sched_switch\n"); + goto fail_deprobe_wake_new; + } + +@@ -185,12 +186,6 @@ void tracing_sched_switch_assign_trace(s + ctx_trace = tr; + } + +-static void start_sched_trace(struct trace_array *tr) +-{ +- tracing_reset_online_cpus(tr); +- tracing_start_sched_switch_record(); +-} +- + static void stop_sched_trace(struct trace_array *tr) + { + tracing_stop_sched_switch_record(); +@@ -199,7 +194,8 @@ static void stop_sched_trace(struct trac + static int sched_switch_trace_init(struct trace_array *tr) + { + ctx_trace = tr; +- start_sched_trace(tr); ++ tracing_reset_online_cpus(tr); ++ tracing_start_sched_switch_record(); + return 0; + } + +@@ -211,13 +207,12 @@ static void sched_switch_trace_reset(str + + static void sched_switch_trace_start(struct trace_array *tr) + { +- tracing_reset_online_cpus(tr); +- tracing_start_sched_switch(); ++ sched_stopped = 0; + } + + static void sched_switch_trace_stop(struct trace_array *tr) + { +- tracing_stop_sched_switch(); ++ sched_stopped = 1; + } + + static struct tracer sched_switch_trace __read_mostly = +@@ -227,6 +222,7 @@ static struct tracer sched_switch_trace + .reset = sched_switch_trace_reset, + .start = sched_switch_trace_start, + .stop = sched_switch_trace_stop, ++ .wait_pipe = poll_wait_pipe, + #ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_sched_switch, + #endif +Index: linux-2.6-tip/kernel/trace/trace_sched_wakeup.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_sched_wakeup.c ++++ linux-2.6-tip/kernel/trace/trace_sched_wakeup.c +@@ -25,12 +25,14 @@ static int __read_mostly tracer_enabled; + static struct task_struct *wakeup_task; + static int wakeup_cpu; + static unsigned wakeup_prio = -1; ++static int wakeup_rt; + +-static raw_spinlock_t wakeup_lock = +- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; ++static __raw_spinlock_t wakeup_lock = __RAW_SPIN_LOCK_UNLOCKED; + + static void __wakeup_reset(struct trace_array *tr); + ++static int save_lat_flag; ++ + #ifdef CONFIG_FUNCTION_TRACER + /* + * irqsoff uses its own tracer function to keep the overhead down: +@@ -71,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, uns + if (task_cpu(wakeup_task) != cpu) + goto unlock; + +- trace_function(tr, data, ip, parent_ip, flags, pc); 
++ trace_function(tr, ip, parent_ip, flags, pc); + + unlock: + __raw_spin_unlock(&wakeup_lock); +@@ -151,7 +153,8 @@ probe_wakeup_sched_switch(struct rq *rq, + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + +- trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); ++ trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); ++ tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); + + /* + * usecs conversion is slow so we try to delay the conversion +@@ -182,13 +185,10 @@ out: + + static void __wakeup_reset(struct trace_array *tr) + { +- struct trace_array_cpu *data; + int cpu; + +- for_each_possible_cpu(cpu) { +- data = tr->data[cpu]; ++ for_each_possible_cpu(cpu) + tracing_reset(tr, cpu); +- } + + wakeup_cpu = -1; + wakeup_prio = -1; +@@ -213,6 +213,7 @@ static void wakeup_reset(struct trace_ar + static void + probe_wakeup(struct rq *rq, struct task_struct *p, int success) + { ++ struct trace_array_cpu *data; + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; +@@ -224,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_ + tracing_record_cmdline(p); + tracing_record_cmdline(current); + +- if (likely(!rt_task(p)) || ++ if ((wakeup_rt && !rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= current->prio) + return; +@@ -252,9 +253,10 @@ probe_wakeup(struct rq *rq, struct task_ + + local_save_flags(flags); + +- wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); +- trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], +- CALLER_ADDR1, CALLER_ADDR2, flags, pc); ++ data = wakeup_trace->data[wakeup_cpu]; ++ data->preempt_timestamp = ftrace_now(cpu); ++ tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); ++ trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + + out_locked: + __raw_spin_unlock(&wakeup_lock); +@@ -262,12 +264,6 @@ out: + atomic_dec(&wakeup_trace->data[cpu]->disabled); + } + +-/* +- * save_tracer_enabled is used to save the state of the tracer_enabled +- * variable when we disable it when we open a trace output file. 
+- */ +-static int save_tracer_enabled; +- + static void start_wakeup_tracer(struct trace_array *tr) + { + int ret; +@@ -289,7 +285,7 @@ static void start_wakeup_tracer(struct t + ret = register_trace_sched_switch(probe_wakeup_sched_switch); + if (ret) { + pr_info("sched trace: Couldn't activate tracepoint" +- " probe to kernel_sched_schedule\n"); ++ " probe to kernel_sched_switch\n"); + goto fail_deprobe_wake_new; + } + +@@ -306,13 +302,10 @@ static void start_wakeup_tracer(struct t + + register_ftrace_function(&trace_ops); + +- if (tracing_is_enabled()) { ++ if (tracing_is_enabled()) + tracer_enabled = 1; +- save_tracer_enabled = 1; +- } else { ++ else + tracer_enabled = 0; +- save_tracer_enabled = 0; +- } + + return; + fail_deprobe_wake_new: +@@ -324,54 +317,54 @@ fail_deprobe: + static void stop_wakeup_tracer(struct trace_array *tr) + { + tracer_enabled = 0; +- save_tracer_enabled = 0; + unregister_ftrace_function(&trace_ops); + unregister_trace_sched_switch(probe_wakeup_sched_switch); + unregister_trace_sched_wakeup_new(probe_wakeup); + unregister_trace_sched_wakeup(probe_wakeup); + } + +-static int wakeup_tracer_init(struct trace_array *tr) ++static int __wakeup_tracer_init(struct trace_array *tr) + { ++ save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; ++ trace_flags |= TRACE_ITER_LATENCY_FMT; ++ + tracing_max_latency = 0; + wakeup_trace = tr; + start_wakeup_tracer(tr); + return 0; + } + ++static int wakeup_tracer_init(struct trace_array *tr) ++{ ++ wakeup_rt = 0; ++ return __wakeup_tracer_init(tr); ++} ++ ++static int wakeup_rt_tracer_init(struct trace_array *tr) ++{ ++ wakeup_rt = 1; ++ return __wakeup_tracer_init(tr); ++} ++ + static void wakeup_tracer_reset(struct trace_array *tr) + { + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); ++ ++ if (!save_lat_flag) ++ trace_flags &= ~TRACE_ITER_LATENCY_FMT; + } + + static void wakeup_tracer_start(struct trace_array *tr) + { + wakeup_reset(tr); + tracer_enabled = 1; +- save_tracer_enabled = 1; + } + + static void wakeup_tracer_stop(struct trace_array *tr) + { + tracer_enabled = 0; +- save_tracer_enabled = 0; +-} +- +-static void wakeup_tracer_open(struct trace_iterator *iter) +-{ +- /* stop the trace while dumping */ +- tracer_enabled = 0; +-} +- +-static void wakeup_tracer_close(struct trace_iterator *iter) +-{ +- /* forget about any processes we were recording */ +- if (save_tracer_enabled) { +- wakeup_reset(iter->tr); +- tracer_enabled = 1; +- } + } + + static struct tracer wakeup_tracer __read_mostly = +@@ -381,8 +374,20 @@ static struct tracer wakeup_tracer __rea + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, +- .open = wakeup_tracer_open, +- .close = wakeup_tracer_close, ++ .print_max = 1, ++#ifdef CONFIG_FTRACE_SELFTEST ++ .selftest = trace_selftest_startup_wakeup, ++#endif ++}; ++ ++static struct tracer wakeup_rt_tracer __read_mostly = ++{ ++ .name = "wakeup_rt", ++ .init = wakeup_rt_tracer_init, ++ .reset = wakeup_tracer_reset, ++ .start = wakeup_tracer_start, ++ .stop = wakeup_tracer_stop, ++ .wait_pipe = poll_wait_pipe, + .print_max = 1, + #ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +@@ -397,6 +402,10 @@ __init static int init_wakeup_tracer(voi + if (ret) + return ret; + ++ ret = register_tracer(&wakeup_rt_tracer); ++ if (ret) ++ return ret; ++ + return 0; + } + device_initcall(init_wakeup_tracer); +Index: linux-2.6-tip/kernel/trace/trace_selftest.c 
+=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_selftest.c ++++ linux-2.6-tip/kernel/trace/trace_selftest.c +@@ -1,5 +1,6 @@ + /* Include in trace.c */ + ++#include + #include + #include + +@@ -9,11 +10,12 @@ static inline int trace_valid_entry(stru + case TRACE_FN: + case TRACE_CTX: + case TRACE_WAKE: +- case TRACE_CONT: + case TRACE_STACK: + case TRACE_PRINT: + case TRACE_SPECIAL: + case TRACE_BRANCH: ++ case TRACE_GRAPH_ENT: ++ case TRACE_GRAPH_RET: + return 1; + } + return 0; +@@ -99,9 +101,6 @@ static inline void warn_failed_init_trac + + #ifdef CONFIG_DYNAMIC_FTRACE + +-#define __STR(x) #x +-#define STR(x) __STR(x) +- + /* Test dynamic code modification and ftrace filters */ + int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, +@@ -125,17 +124,17 @@ int trace_selftest_startup_dynamic_traci + func(); + + /* +- * Some archs *cough*PowerPC*cough* add charachters to the ++ * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to +- * accomodate them. ++ * accommodate them. + */ +- func_name = "*" STR(DYN_FTRACE_TEST_NAME); ++ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + + /* filter only on our function */ + ftrace_set_filter(func_name, strlen(func_name), 1); + + /* enable tracing */ +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; +@@ -209,7 +208,7 @@ trace_selftest_startup_function(struct t + ftrace_enabled = 1; + tracer_enabled = 1; + +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; +@@ -247,6 +246,90 @@ trace_selftest_startup_function(struct t + } + #endif /* CONFIG_FUNCTION_TRACER */ + ++ ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ ++/* Maximum number of functions to trace before diagnosing a hang */ ++#define GRAPH_MAX_FUNC_TEST 100000000 ++ ++static void __ftrace_dump(bool disable_tracing); ++static unsigned int graph_hang_thresh; ++ ++/* Wrap the real function entry probe to avoid possible hanging */ ++static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) ++{ ++ /* This is harmlessly racy, we want to approximately detect a hang */ ++ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ++ ftrace_graph_stop(); ++ printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); ++ if (ftrace_dump_on_oops) ++ __ftrace_dump(false); ++ return 0; ++ } ++ ++ return trace_graph_entry(trace); ++} ++ ++/* ++ * Pretty much the same than for the function tracer from which the selftest ++ * has been borrowed. ++ */ ++int ++trace_selftest_startup_function_graph(struct tracer *trace, ++ struct trace_array *tr) ++{ ++ int ret; ++ unsigned long count; ++ ++ /* ++ * Simulate the init() callback but we attach a watchdog callback ++ * to detect and recover from possible hangs ++ */ ++ tracing_reset_online_cpus(tr); ++ ret = register_ftrace_graph(&trace_graph_return, ++ &trace_graph_entry_watchdog); ++ if (ret) { ++ warn_failed_init_tracer(trace, ret); ++ goto out; ++ } ++ tracing_start_cmdline_record(); ++ ++ /* Sleep for a 1/10 of a second */ ++ msleep(100); ++ ++ /* Have we just recovered from a hang? 
*/ ++ if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { ++ tracing_selftest_disabled = true; ++ ret = -1; ++ goto out; ++ } ++ ++ tracing_stop(); ++ ++ /* check the trace buffer */ ++ ret = trace_test_buffer(tr, &count); ++ ++ trace->reset(tr); ++ tracing_start(); ++ ++ if (!ret && !count) { ++ printk(KERN_CONT ".. no entries found .."); ++ ret = -1; ++ goto out; ++ } ++ ++ /* Don't test dynamic tracing, the function tracer already did */ ++ ++out: ++ /* Stop it if we failed */ ++ if (ret) ++ ftrace_graph_stop(); ++ ++ return ret; ++} ++#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ ++ ++ + #ifdef CONFIG_IRQSOFF_TRACER + int + trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +@@ -256,7 +339,7 @@ trace_selftest_startup_irqsoff(struct tr + int ret; + + /* start the tracing */ +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; +@@ -268,6 +351,14 @@ trace_selftest_startup_irqsoff(struct tr + local_irq_disable(); + udelay(100); + local_irq_enable(); ++ ++ /* ++ * Stop the tracer to avoid a warning subsequent ++ * to buffer flipping failure because tracing_stop() ++ * disables the tr and max buffers, making flipping impossible ++ * in case of parallels max irqs off latencies. ++ */ ++ trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ +@@ -310,7 +401,7 @@ trace_selftest_startup_preemptoff(struct + } + + /* start the tracing */ +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; +@@ -322,6 +413,14 @@ trace_selftest_startup_preemptoff(struct + preempt_disable(); + udelay(100); + preempt_enable(); ++ ++ /* ++ * Stop the tracer to avoid a warning subsequent ++ * to buffer flipping failure because tracing_stop() ++ * disables the tr and max buffers, making flipping impossible ++ * in case of parallels max preempt off latencies. ++ */ ++ trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ +@@ -364,10 +463,10 @@ trace_selftest_startup_preemptirqsoff(st + } + + /* start the tracing */ +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); +- goto out; ++ goto out_no_start; + } + + /* reset the max latency */ +@@ -381,31 +480,35 @@ trace_selftest_startup_preemptirqsoff(st + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + ++ /* ++ * Stop the tracer to avoid a warning subsequent ++ * to buffer flipping failure because tracing_stop() ++ * disables the tr and max buffers, making flipping impossible ++ * in case of parallels max irqs/preempt off latencies. ++ */ ++ trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); +- if (ret) { +- tracing_start(); ++ if (ret) + goto out; +- } + + ret = trace_test_buffer(&max_tr, &count); +- if (ret) { +- tracing_start(); ++ if (ret) + goto out; +- } + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; +- tracing_start(); + goto out; + } + + /* do the test by disabling interrupts first this time */ + tracing_max_latency = 0; + tracing_start(); ++ trace->start(tr); ++ + preempt_disable(); + local_irq_disable(); + udelay(100); +@@ -413,6 +516,7 @@ trace_selftest_startup_preemptirqsoff(st + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + ++ trace->stop(tr); + /* stop the tracing. 
*/ + tracing_stop(); + /* check both trace buffers */ +@@ -428,9 +532,10 @@ trace_selftest_startup_preemptirqsoff(st + goto out; + } + +- out: +- trace->reset(tr); ++out: + tracing_start(); ++out_no_start: ++ trace->reset(tr); + tracing_max_latency = save_max; + + return ret; +@@ -496,7 +601,7 @@ trace_selftest_startup_wakeup(struct tra + wait_for_completion(&isrt); + + /* start the tracing */ +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; +@@ -557,7 +662,7 @@ trace_selftest_startup_sched_switch(stru + int ret; + + /* start the tracing */ +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; +@@ -589,10 +694,10 @@ trace_selftest_startup_sysprof(struct tr + int ret; + + /* start the tracing */ +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); +- return 0; ++ return ret; + } + + /* Sleep for a 1/10 of a second */ +@@ -604,6 +709,11 @@ trace_selftest_startup_sysprof(struct tr + trace->reset(tr); + tracing_start(); + ++ if (!ret && !count) { ++ printk(KERN_CONT ".. no entries found .."); ++ ret = -1; ++ } ++ + return ret; + } + #endif /* CONFIG_SYSPROF_TRACER */ +@@ -616,7 +726,7 @@ trace_selftest_startup_branch(struct tra + int ret; + + /* start the tracing */ +- ret = trace->init(tr); ++ ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; +@@ -631,6 +741,11 @@ trace_selftest_startup_branch(struct tra + trace->reset(tr); + tracing_start(); + ++ if (!ret && !count) { ++ printk(KERN_CONT ".. no entries found .."); ++ ret = -1; ++ } ++ + return ret; + } + #endif /* CONFIG_BRANCH_TRACER */ +Index: linux-2.6-tip/kernel/trace/trace_stack.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_stack.c ++++ linux-2.6-tip/kernel/trace/trace_stack.c +@@ -27,8 +27,7 @@ static struct stack_trace max_stack_trac + }; + + static unsigned long max_stack_size; +-static raw_spinlock_t max_stack_lock = +- (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; ++static __raw_spinlock_t max_stack_lock = __RAW_SPIN_LOCK_UNLOCKED; + + static int stack_trace_disabled __read_mostly; + static DEFINE_PER_CPU(int, trace_active); +@@ -245,16 +244,31 @@ static int trace_lookup_stack(struct seq + #endif + } + ++static void print_disabled(struct seq_file *m) ++{ ++ seq_puts(m, "#\n" ++ "# Stack tracer disabled\n" ++ "#\n" ++ "# To enable the stack tracer, either add 'stacktrace' to the\n" ++ "# kernel command line\n" ++ "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n" ++ "#\n"); ++} ++ + static int t_show(struct seq_file *m, void *v) + { + long i; + int size; + + if (v == SEQ_START_TOKEN) { +- seq_printf(m, " Depth Size Location" ++ seq_printf(m, " Depth Size Location" + " (%d entries)\n" +- " ----- ---- --------\n", ++ " ----- ---- --------\n", + max_stack_trace.nr_entries); ++ ++ if (!stack_tracer_enabled && !max_stack_size) ++ print_disabled(m); ++ + return 0; + } + +Index: linux-2.6-tip/kernel/trace/trace_stat.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_stat.c +@@ -0,0 +1,326 @@ ++/* ++ * Infrastructure for statistic tracing (histogram output). 
++ * ++ * Copyright (C) 2008 Frederic Weisbecker ++ * ++ * Based on the code from trace_branch.c which is ++ * Copyright (C) 2008 Steven Rostedt ++ * ++ */ ++ ++ ++#include ++#include ++#include "trace_stat.h" ++#include "trace.h" ++ ++ ++/* List of stat entries from a tracer */ ++struct trace_stat_list { ++ struct list_head list; ++ void *stat; ++}; ++ ++/* A stat session is the stats output in one file */ ++struct tracer_stat_session { ++ struct list_head session_list; ++ struct tracer_stat *ts; ++ struct list_head stat_list; ++ struct mutex stat_mutex; ++ struct dentry *file; ++}; ++ ++/* All of the sessions currently in use. Each stat file embed one session */ ++static LIST_HEAD(all_stat_sessions); ++static DEFINE_MUTEX(all_stat_sessions_mutex); ++ ++/* The root directory for all stat files */ ++static struct dentry *stat_dir; ++ ++ ++static void reset_stat_session(struct tracer_stat_session *session) ++{ ++ struct trace_stat_list *node, *next; ++ ++ list_for_each_entry_safe(node, next, &session->stat_list, list) ++ kfree(node); ++ ++ INIT_LIST_HEAD(&session->stat_list); ++} ++ ++static void destroy_session(struct tracer_stat_session *session) ++{ ++ debugfs_remove(session->file); ++ reset_stat_session(session); ++ mutex_destroy(&session->stat_mutex); ++ kfree(session); ++} ++ ++/* ++ * For tracers that don't provide a stat_cmp callback. ++ * This one will force an immediate insertion on tail of ++ * the list. ++ */ ++static int dummy_cmp(void *p1, void *p2) ++{ ++ return 1; ++} ++ ++/* ++ * Initialize the stat list at each trace_stat file opening. ++ * All of these copies and sorting are required on all opening ++ * since the stats could have changed between two file sessions. ++ */ ++static int stat_seq_init(struct tracer_stat_session *session) ++{ ++ struct trace_stat_list *iter_entry, *new_entry; ++ struct tracer_stat *ts = session->ts; ++ void *stat; ++ int ret = 0; ++ int i; ++ ++ mutex_lock(&session->stat_mutex); ++ reset_stat_session(session); ++ ++ if (!ts->stat_cmp) ++ ts->stat_cmp = dummy_cmp; ++ ++ stat = ts->stat_start(ts); ++ if (!stat) ++ goto exit; ++ ++ /* ++ * The first entry. Actually this is the second, but the first ++ * one (the stat_list head) is pointless. ++ */ ++ new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); ++ if (!new_entry) { ++ ret = -ENOMEM; ++ goto exit; ++ } ++ ++ INIT_LIST_HEAD(&new_entry->list); ++ ++ list_add(&new_entry->list, &session->stat_list); ++ ++ new_entry->stat = stat; ++ ++ /* ++ * Iterate over the tracer stat entries and store them in a sorted ++ * list. 
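++ * Entries are pulled with stat_next() and inserted by walking the
++ * already-sorted list in reverse: the new entry is placed after the
++ * first existing entry that stat_cmp() reports as greater than or
++ * equal to it, keeping the list in descending order; if no such entry
++ * is found, the new entry becomes the new head.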
++ */ ++ for (i = 1; ; i++) { ++ stat = ts->stat_next(stat, i); ++ ++ /* End of insertion */ ++ if (!stat) ++ break; ++ ++ new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); ++ if (!new_entry) { ++ ret = -ENOMEM; ++ goto exit_free_list; ++ } ++ ++ INIT_LIST_HEAD(&new_entry->list); ++ new_entry->stat = stat; ++ ++ list_for_each_entry_reverse(iter_entry, &session->stat_list, ++ list) { ++ ++ /* Insertion with a descendent sorting */ ++ if (ts->stat_cmp(iter_entry->stat, ++ new_entry->stat) >= 0) { ++ ++ list_add(&new_entry->list, &iter_entry->list); ++ break; ++ } ++ } ++ ++ /* The current larger value */ ++ if (list_empty(&new_entry->list)) ++ list_add(&new_entry->list, &session->stat_list); ++ } ++exit: ++ mutex_unlock(&session->stat_mutex); ++ return ret; ++ ++exit_free_list: ++ reset_stat_session(session); ++ mutex_unlock(&session->stat_mutex); ++ return ret; ++} ++ ++ ++static void *stat_seq_start(struct seq_file *s, loff_t *pos) ++{ ++ struct tracer_stat_session *session = s->private; ++ ++ /* Prevent from tracer switch or stat_list modification */ ++ mutex_lock(&session->stat_mutex); ++ ++ /* If we are in the beginning of the file, print the headers */ ++ if (!*pos && session->ts->stat_headers) ++ return SEQ_START_TOKEN; ++ ++ return seq_list_start(&session->stat_list, *pos); ++} ++ ++static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) ++{ ++ struct tracer_stat_session *session = s->private; ++ ++ if (p == SEQ_START_TOKEN) ++ return seq_list_start(&session->stat_list, *pos); ++ ++ return seq_list_next(p, &session->stat_list, pos); ++} ++ ++static void stat_seq_stop(struct seq_file *s, void *p) ++{ ++ struct tracer_stat_session *session = s->private; ++ mutex_unlock(&session->stat_mutex); ++} ++ ++static int stat_seq_show(struct seq_file *s, void *v) ++{ ++ struct tracer_stat_session *session = s->private; ++ struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); ++ ++ if (v == SEQ_START_TOKEN) ++ return session->ts->stat_headers(s); ++ ++ return session->ts->stat_show(s, l->stat); ++} ++ ++static const struct seq_operations trace_stat_seq_ops = { ++ .start = stat_seq_start, ++ .next = stat_seq_next, ++ .stop = stat_seq_stop, ++ .show = stat_seq_show ++}; ++ ++/* The session stat is refilled and resorted at each stat file opening */ ++static int tracing_stat_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ ++ struct tracer_stat_session *session = inode->i_private; ++ ++ ret = seq_open(file, &trace_stat_seq_ops); ++ if (!ret) { ++ struct seq_file *m = file->private_data; ++ m->private = session; ++ ret = stat_seq_init(session); ++ } ++ ++ return ret; ++} ++ ++/* ++ * Avoid consuming memory with our now useless list. 
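++ * The sorted snapshot is rebuilt by stat_seq_init() on the next open(),
++ * so dropping it here only costs a re-sort, not any statistics.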
++ */ ++static int tracing_stat_release(struct inode *i, struct file *f) ++{ ++ struct tracer_stat_session *session = i->i_private; ++ ++ mutex_lock(&session->stat_mutex); ++ reset_stat_session(session); ++ mutex_unlock(&session->stat_mutex); ++ ++ return 0; ++} ++ ++static const struct file_operations tracing_stat_fops = { ++ .open = tracing_stat_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = tracing_stat_release ++}; ++ ++static int tracing_stat_init(void) ++{ ++ struct dentry *d_tracing; ++ ++ d_tracing = tracing_init_dentry(); ++ ++ stat_dir = debugfs_create_dir("trace_stat", d_tracing); ++ if (!stat_dir) ++ pr_warning("Could not create debugfs " ++ "'trace_stat' entry\n"); ++ return 0; ++} ++ ++static int init_stat_file(struct tracer_stat_session *session) ++{ ++ if (!stat_dir && tracing_stat_init()) ++ return -ENODEV; ++ ++ session->file = debugfs_create_file(session->ts->name, 0644, ++ stat_dir, ++ session, &tracing_stat_fops); ++ if (!session->file) ++ return -ENOMEM; ++ return 0; ++} ++ ++int register_stat_tracer(struct tracer_stat *trace) ++{ ++ struct tracer_stat_session *session, *node, *tmp; ++ int ret; ++ ++ if (!trace) ++ return -EINVAL; ++ ++ if (!trace->stat_start || !trace->stat_next || !trace->stat_show) ++ return -EINVAL; ++ ++ /* Already registered? */ ++ mutex_lock(&all_stat_sessions_mutex); ++ list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { ++ if (node->ts == trace) { ++ mutex_unlock(&all_stat_sessions_mutex); ++ return -EINVAL; ++ } ++ } ++ mutex_unlock(&all_stat_sessions_mutex); ++ ++ /* Init the session */ ++ session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); ++ if (!session) ++ return -ENOMEM; ++ ++ session->ts = trace; ++ INIT_LIST_HEAD(&session->session_list); ++ INIT_LIST_HEAD(&session->stat_list); ++ mutex_init(&session->stat_mutex); ++ session->file = NULL; ++ ++ ret = init_stat_file(session); ++ if (ret) { ++ destroy_session(session); ++ return ret; ++ } ++ ++ /* Register */ ++ mutex_lock(&all_stat_sessions_mutex); ++ list_add_tail(&session->session_list, &all_stat_sessions); ++ mutex_unlock(&all_stat_sessions_mutex); ++ ++ return 0; ++} ++ ++void unregister_stat_tracer(struct tracer_stat *trace) ++{ ++ struct tracer_stat_session *node, *tmp; ++ ++ mutex_lock(&all_stat_sessions_mutex); ++ list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { ++ if (node->ts == trace) { ++ list_del(&node->session_list); ++ destroy_session(node); ++ break; ++ } ++ } ++ mutex_unlock(&all_stat_sessions_mutex); ++} +Index: linux-2.6-tip/kernel/trace/trace_stat.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_stat.h +@@ -0,0 +1,31 @@ ++#ifndef __TRACE_STAT_H ++#define __TRACE_STAT_H ++ ++#include ++ ++/* ++ * If you want to provide a stat file (one-shot statistics), fill ++ * an iterator with stat_start/stat_next and a stat_show callbacks. ++ * The others callbacks are optional. 
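++ *
++ * A minimal usage sketch (illustrative only; the my_* names below are
++ * hypothetical -- see the workqueue stats in trace_workqueue.c of this
++ * patch for a real user):
++ *
++ *	static struct tracer_stat my_stats = {
++ *		.name		= "my_stats",
++ *		.stat_start	= my_stat_start,
++ *		.stat_next	= my_stat_next,
++ *		.stat_show	= my_stat_show,
++ *		.stat_headers	= my_stat_headers,
++ *	};
++ *
++ *	register_stat_tracer(&my_stats);
++ *		(creates the debugfs file tracing/trace_stat/my_stats)
++ *	unregister_stat_tracer(&my_stats);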
++ */ ++struct tracer_stat { ++ /* The name of your stat file */ ++ const char *name; ++ /* Iteration over statistic entries */ ++ void *(*stat_start)(struct tracer_stat *trace); ++ void *(*stat_next)(void *prev, int idx); ++ /* Compare two entries for stats sorting */ ++ int (*stat_cmp)(void *p1, void *p2); ++ /* Print a stat entry */ ++ int (*stat_show)(struct seq_file *s, void *p); ++ /* Print the headers of your stat entries */ ++ int (*stat_headers)(struct seq_file *s); ++}; ++ ++/* ++ * Destroy or create a stat file ++ */ ++extern int register_stat_tracer(struct tracer_stat *trace); ++extern void unregister_stat_tracer(struct tracer_stat *trace); ++ ++#endif /* __TRACE_STAT_H */ +Index: linux-2.6-tip/kernel/trace/trace_syscalls.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_syscalls.c +@@ -0,0 +1,250 @@ ++#include ++#include ++#include ++ ++#include "trace_output.h" ++#include "trace.h" ++ ++/* Keep a counter of the syscall tracing users */ ++static int refcount; ++ ++/* Prevent from races on thread flags toggling */ ++static DEFINE_MUTEX(syscall_trace_lock); ++ ++/* Option to display the parameters types */ ++enum { ++ TRACE_SYSCALLS_OPT_TYPES = 0x1, ++}; ++ ++static struct tracer_opt syscalls_opts[] = { ++ { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, ++ { } ++}; ++ ++static struct tracer_flags syscalls_flags = { ++ .val = 0, /* By default: no parameters types */ ++ .opts = syscalls_opts ++}; ++ ++enum print_line_t ++print_syscall_enter(struct trace_iterator *iter, int flags) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct trace_entry *ent = iter->ent; ++ struct syscall_trace_enter *trace; ++ struct syscall_metadata *entry; ++ int i, ret, syscall; ++ ++ trace_assign_type(trace, ent); ++ ++ syscall = trace->nr; ++ ++ entry = syscall_nr_to_meta(syscall); ++ if (!entry) ++ goto end; ++ ++ ret = trace_seq_printf(s, "%s(", entry->name); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ for (i = 0; i < entry->nb_args; i++) { ++ /* parameter types */ ++ if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { ++ ret = trace_seq_printf(s, "%s ", entry->types[i]); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } ++ /* parameter values */ ++ ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], ++ trace->args[i], ++ i == entry->nb_args - 1 ? 
")" : ","); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ } ++ ++end: ++ trace_seq_printf(s, "\n"); ++ return TRACE_TYPE_HANDLED; ++} ++ ++enum print_line_t ++print_syscall_exit(struct trace_iterator *iter, int flags) ++{ ++ struct trace_seq *s = &iter->seq; ++ struct trace_entry *ent = iter->ent; ++ struct syscall_trace_exit *trace; ++ int syscall; ++ struct syscall_metadata *entry; ++ int ret; ++ ++ trace_assign_type(trace, ent); ++ ++ syscall = trace->nr; ++ ++ entry = syscall_nr_to_meta(syscall); ++ if (!entry) { ++ trace_seq_printf(s, "\n"); ++ return TRACE_TYPE_HANDLED; ++ } ++ ++ ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, ++ trace->ret); ++ if (!ret) ++ return TRACE_TYPE_PARTIAL_LINE; ++ ++ return TRACE_TYPE_HANDLED; ++} ++ ++void start_ftrace_syscalls(void) ++{ ++ unsigned long flags; ++ struct task_struct *g, *t; ++ ++ mutex_lock(&syscall_trace_lock); ++ ++ /* Don't enable the flag on the tasks twice */ ++ if (++refcount != 1) ++ goto unlock; ++ ++ arch_init_ftrace_syscalls(); ++ read_lock_irqsave(&tasklist_lock, flags); ++ ++ do_each_thread(g, t) { ++ set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); ++ } while_each_thread(g, t); ++ ++ read_unlock_irqrestore(&tasklist_lock, flags); ++ ++unlock: ++ mutex_unlock(&syscall_trace_lock); ++} ++ ++void stop_ftrace_syscalls(void) ++{ ++ unsigned long flags; ++ struct task_struct *g, *t; ++ ++ mutex_lock(&syscall_trace_lock); ++ ++ /* There are perhaps still some users */ ++ if (--refcount) ++ goto unlock; ++ ++ read_lock_irqsave(&tasklist_lock, flags); ++ ++ do_each_thread(g, t) { ++ clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); ++ } while_each_thread(g, t); ++ ++ read_unlock_irqrestore(&tasklist_lock, flags); ++ ++unlock: ++ mutex_unlock(&syscall_trace_lock); ++} ++ ++void ftrace_syscall_enter(struct pt_regs *regs) ++{ ++ struct syscall_trace_enter *entry; ++ struct syscall_metadata *sys_data; ++ struct ring_buffer_event *event; ++ int size; ++ int syscall_nr; ++ ++ syscall_nr = syscall_get_nr(current, regs); ++ ++ sys_data = syscall_nr_to_meta(syscall_nr); ++ if (!sys_data) ++ return; ++ ++ size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; ++ ++ event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, ++ 0, 0); ++ if (!event) ++ return; ++ ++ entry = ring_buffer_event_data(event); ++ entry->nr = syscall_nr; ++ syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); ++ ++ trace_current_buffer_unlock_commit(event, 0, 0); ++ trace_wake_up(); ++} ++ ++void ftrace_syscall_exit(struct pt_regs *regs) ++{ ++ struct syscall_trace_exit *entry; ++ struct syscall_metadata *sys_data; ++ struct ring_buffer_event *event; ++ int syscall_nr; ++ ++ syscall_nr = syscall_get_nr(current, regs); ++ ++ sys_data = syscall_nr_to_meta(syscall_nr); ++ if (!sys_data) ++ return; ++ ++ event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, ++ sizeof(*entry), 0, 0); ++ if (!event) ++ return; ++ ++ entry = ring_buffer_event_data(event); ++ entry->nr = syscall_nr; ++ entry->ret = syscall_get_return_value(current, regs); ++ ++ trace_current_buffer_unlock_commit(event, 0, 0); ++ trace_wake_up(); ++} ++ ++static int init_syscall_tracer(struct trace_array *tr) ++{ ++ start_ftrace_syscalls(); ++ ++ return 0; ++} ++ ++static void reset_syscall_tracer(struct trace_array *tr) ++{ ++ stop_ftrace_syscalls(); ++ tracing_reset_online_cpus(tr); ++} ++ ++static struct trace_event syscall_enter_event = { ++ .type = TRACE_SYSCALL_ENTER, ++ .trace = print_syscall_enter, ++}; ++ ++static struct trace_event 
syscall_exit_event = { ++ .type = TRACE_SYSCALL_EXIT, ++ .trace = print_syscall_exit, ++}; ++ ++static struct tracer syscall_tracer __read_mostly = { ++ .name = "syscall", ++ .init = init_syscall_tracer, ++ .reset = reset_syscall_tracer, ++ .flags = &syscalls_flags, ++}; ++ ++__init int register_ftrace_syscalls(void) ++{ ++ int ret; ++ ++ ret = register_ftrace_event(&syscall_enter_event); ++ if (!ret) { ++ printk(KERN_WARNING "event %d failed to register\n", ++ syscall_enter_event.type); ++ WARN_ON_ONCE(1); ++ } ++ ++ ret = register_ftrace_event(&syscall_exit_event); ++ if (!ret) { ++ printk(KERN_WARNING "event %d failed to register\n", ++ syscall_exit_event.type); ++ WARN_ON_ONCE(1); ++ } ++ ++ return register_tracer(&syscall_tracer); ++} ++device_initcall(register_ftrace_syscalls); +Index: linux-2.6-tip/kernel/trace/trace_sysprof.c +=================================================================== +--- linux-2.6-tip.orig/kernel/trace/trace_sysprof.c ++++ linux-2.6-tip/kernel/trace/trace_sysprof.c +@@ -88,7 +88,7 @@ static void backtrace_address(void *data + } + } + +-const static struct stacktrace_ops backtrace_ops = { ++static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, +@@ -226,15 +226,6 @@ static void stop_stack_timers(void) + stop_stack_timer(cpu); + } + +-static void start_stack_trace(struct trace_array *tr) +-{ +- mutex_lock(&sample_timer_lock); +- tracing_reset_online_cpus(tr); +- start_stack_timers(); +- tracer_enabled = 1; +- mutex_unlock(&sample_timer_lock); +-} +- + static void stop_stack_trace(struct trace_array *tr) + { + mutex_lock(&sample_timer_lock); +@@ -247,12 +238,18 @@ static int stack_trace_init(struct trace + { + sysprof_trace = tr; + +- start_stack_trace(tr); ++ tracing_start_cmdline_record(); ++ ++ mutex_lock(&sample_timer_lock); ++ start_stack_timers(); ++ tracer_enabled = 1; ++ mutex_unlock(&sample_timer_lock); + return 0; + } + + static void stack_trace_reset(struct trace_array *tr) + { ++ tracing_stop_cmdline_record(); + stop_stack_trace(tr); + } + +@@ -317,7 +314,7 @@ sysprof_sample_write(struct file *filp, + return cnt; + } + +-static struct file_operations sysprof_sample_fops = { ++static const struct file_operations sysprof_sample_fops = { + .read = sysprof_sample_read, + .write = sysprof_sample_write, + }; +@@ -330,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct + d_tracer, NULL, &sysprof_sample_fops); + if (entry) + return; +- pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); ++ pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); + } +Index: linux-2.6-tip/kernel/trace/trace_workqueue.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/trace/trace_workqueue.c +@@ -0,0 +1,288 @@ ++/* ++ * Workqueue statistical tracer. ++ * ++ * Copyright (C) 2008 Frederic Weisbecker ++ * ++ */ ++ ++ ++#include ++#include ++#include ++#include "trace_stat.h" ++#include "trace.h" ++ ++ ++/* A cpu workqueue thread */ ++struct cpu_workqueue_stats { ++ struct list_head list; ++/* Useful to know if we print the cpu headers */ ++ bool first_entry; ++ int cpu; ++ pid_t pid; ++/* Can be inserted from interrupt or user context, need to be atomic */ ++ atomic_t inserted; ++/* ++ * Don't need to be atomic, works are serialized in a single workqueue thread ++ * on a single CPU. 
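++ * (Insertion and removal of these per-cpu list entries are serialized
++ * by workqueue_cpu_stat(cpu)->lock.)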
++ */ ++ unsigned int executed; ++}; ++ ++/* List of workqueue threads on one cpu */ ++struct workqueue_global_stats { ++ struct list_head list; ++ spinlock_t lock; ++}; ++ ++/* Don't need a global lock because allocated before the workqueues, and ++ * never freed. ++ */ ++static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); ++#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) ++ ++/* Insertion of a work */ ++static void ++probe_workqueue_insertion(struct task_struct *wq_thread, ++ struct work_struct *work) ++{ ++ int cpu = cpumask_first(&wq_thread->cpus_allowed); ++ struct cpu_workqueue_stats *node, *next; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); ++ list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, ++ list) { ++ if (node->pid == wq_thread->pid) { ++ atomic_inc(&node->inserted); ++ goto found; ++ } ++ } ++ pr_debug("trace_workqueue: entry not found\n"); ++found: ++ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); ++} ++ ++/* Execution of a work */ ++static void ++probe_workqueue_execution(struct task_struct *wq_thread, ++ struct work_struct *work) ++{ ++ int cpu = cpumask_first(&wq_thread->cpus_allowed); ++ struct cpu_workqueue_stats *node, *next; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); ++ list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, ++ list) { ++ if (node->pid == wq_thread->pid) { ++ node->executed++; ++ goto found; ++ } ++ } ++ pr_debug("trace_workqueue: entry not found\n"); ++found: ++ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); ++} ++ ++/* Creation of a cpu workqueue thread */ ++static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) ++{ ++ struct cpu_workqueue_stats *cws; ++ unsigned long flags; ++ ++ WARN_ON(cpu < 0); ++ ++ /* Workqueues are sometimes created in atomic context */ ++ cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); ++ if (!cws) { ++ pr_warning("trace_workqueue: not enough memory\n"); ++ return; ++ } ++ INIT_LIST_HEAD(&cws->list); ++ cws->cpu = cpu; ++ ++ cws->pid = wq_thread->pid; ++ ++ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); ++ if (list_empty(&workqueue_cpu_stat(cpu)->list)) ++ cws->first_entry = true; ++ list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); ++ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); ++} ++ ++/* Destruction of a cpu workqueue thread */ ++static void probe_workqueue_destruction(struct task_struct *wq_thread) ++{ ++ /* Workqueue only execute on one cpu */ ++ int cpu = cpumask_first(&wq_thread->cpus_allowed); ++ struct cpu_workqueue_stats *node, *next; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); ++ list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, ++ list) { ++ if (node->pid == wq_thread->pid) { ++ list_del(&node->list); ++ kfree(node); ++ goto found; ++ } ++ } ++ ++ pr_debug("trace_workqueue: don't find workqueue to destroy\n"); ++found: ++ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); ++ ++} ++ ++static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) ++{ ++ unsigned long flags; ++ struct cpu_workqueue_stats *ret = NULL; ++ ++ ++ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); ++ ++ if (!list_empty(&workqueue_cpu_stat(cpu)->list)) ++ ret = list_entry(workqueue_cpu_stat(cpu)->list.next, ++ struct cpu_workqueue_stats, list); ++ ++ 
spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); ++ ++ return ret; ++} ++ ++static void *workqueue_stat_start(void) ++{ ++ int cpu; ++ void *ret = NULL; ++ ++ for_each_possible_cpu(cpu) { ++ ret = workqueue_stat_start_cpu(cpu); ++ if (ret) ++ return ret; ++ } ++ return NULL; ++} ++ ++static void *workqueue_stat_next(void *prev, int idx) ++{ ++ struct cpu_workqueue_stats *prev_cws = prev; ++ int cpu = prev_cws->cpu; ++ unsigned long flags; ++ void *ret = NULL; ++ ++ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); ++ if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { ++ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); ++ do { ++ cpu = cpumask_next(cpu, cpu_possible_mask); ++ if (cpu >= nr_cpu_ids) ++ return NULL; ++ } while (!(ret = workqueue_stat_start_cpu(cpu))); ++ return ret; ++ } ++ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); ++ ++ return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, ++ list); ++} ++ ++static int workqueue_stat_show(struct seq_file *s, void *p) ++{ ++ struct cpu_workqueue_stats *cws = p; ++ unsigned long flags; ++ int cpu = cws->cpu; ++ struct pid *pid; ++ struct task_struct *tsk; ++ ++ spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); ++ if (&cws->list == workqueue_cpu_stat(cpu)->list.next) ++ seq_printf(s, "\n"); ++ spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); ++ ++ pid = find_get_pid(cws->pid); ++ if (pid) { ++ tsk = get_pid_task(pid, PIDTYPE_PID); ++ if (tsk) { ++ seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, ++ atomic_read(&cws->inserted), cws->executed, ++ tsk->comm); ++ put_task_struct(tsk); ++ } ++ put_pid(pid); ++ } ++ ++ return 0; ++} ++ ++static int workqueue_stat_headers(struct seq_file *s) ++{ ++ seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); ++ seq_printf(s, "# | | | |\n"); ++ return 0; ++} ++ ++struct tracer_stat workqueue_stats __read_mostly = { ++ .name = "workqueues", ++ .stat_start = workqueue_stat_start, ++ .stat_next = workqueue_stat_next, ++ .stat_show = workqueue_stat_show, ++ .stat_headers = workqueue_stat_headers ++}; ++ ++ ++int __init stat_workqueue_init(void) ++{ ++ if (register_stat_tracer(&workqueue_stats)) { ++ pr_warning("Unable to register workqueue stat tracer\n"); ++ return 1; ++ } ++ ++ return 0; ++} ++fs_initcall(stat_workqueue_init); ++ ++/* ++ * Workqueues are created very early, just after pre-smp initcalls. ++ * So we must register our tracepoints at this stage. 
++ */ ++int __init trace_workqueue_early_init(void) ++{ ++ int ret, cpu; ++ ++ ret = register_trace_workqueue_insertion(probe_workqueue_insertion); ++ if (ret) ++ goto out; ++ ++ ret = register_trace_workqueue_execution(probe_workqueue_execution); ++ if (ret) ++ goto no_insertion; ++ ++ ret = register_trace_workqueue_creation(probe_workqueue_creation); ++ if (ret) ++ goto no_execution; ++ ++ ret = register_trace_workqueue_destruction(probe_workqueue_destruction); ++ if (ret) ++ goto no_creation; ++ ++ for_each_possible_cpu(cpu) { ++ spin_lock_init(&workqueue_cpu_stat(cpu)->lock); ++ INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); ++ } ++ ++ return 0; ++ ++no_creation: ++ unregister_trace_workqueue_creation(probe_workqueue_creation); ++no_execution: ++ unregister_trace_workqueue_execution(probe_workqueue_execution); ++no_insertion: ++ unregister_trace_workqueue_insertion(probe_workqueue_insertion); ++out: ++ pr_warning("trace_workqueue: unable to trace workqueues\n"); ++ ++ return 1; ++} ++early_initcall(trace_workqueue_early_init); +Index: linux-2.6-tip/kernel/tracepoint.c +=================================================================== +--- linux-2.6-tip.orig/kernel/tracepoint.c ++++ linux-2.6-tip/kernel/tracepoint.c +@@ -272,12 +272,15 @@ static void disable_tracepoint(struct tr + * + * Updates the probe callback corresponding to a range of tracepoints. + */ +-void tracepoint_update_probe_range(struct tracepoint *begin, +- struct tracepoint *end) ++void ++tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) + { + struct tracepoint *iter; + struct tracepoint_entry *mark_entry; + ++ if (!begin) ++ return; ++ + mutex_lock(&tracepoints_mutex); + for (iter = begin; iter < end; iter++) { + mark_entry = get_tracepoint(iter->name); +Index: linux-2.6-tip/kernel/workqueue.c +=================================================================== +--- linux-2.6-tip.orig/kernel/workqueue.c ++++ linux-2.6-tip/kernel/workqueue.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -33,6 +34,9 @@ + #include + #include + #include ++#include ++ ++#include + + /* + * The per-CPU workqueue (if single thread, we always use the first +@@ -125,9 +129,13 @@ struct cpu_workqueue_struct *get_wq_data + return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); + } + ++DEFINE_TRACE(workqueue_insertion); ++ + static void insert_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work, struct list_head *head) + { ++ trace_workqueue_insertion(cwq->thread, work); ++ + set_wq_data(work, cwq); + /* + * Ensure that we get the right work->data if we see the +@@ -157,13 +165,14 @@ static void __queue_work(struct cpu_work + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. ++ * ++ * Especially no such guarantee on PREEMPT_RT. 
+ */ + int queue_work(struct workqueue_struct *wq, struct work_struct *work) + { +- int ret; ++ int ret = 0, cpu = raw_smp_processor_id(); + +- ret = queue_work_on(get_cpu(), wq, work); +- put_cpu(); ++ ret = queue_work_on(cpu, wq, work); + + return ret; + } +@@ -200,7 +209,7 @@ static void delayed_work_timer_fn(unsign + struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); + struct workqueue_struct *wq = cwq->wq; + +- __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); ++ __queue_work(wq_per_cpu(wq, raw_smp_processor_id()), &dwork->work); + } + + /** +@@ -259,6 +268,8 @@ int queue_delayed_work_on(int cpu, struc + } + EXPORT_SYMBOL_GPL(queue_delayed_work_on); + ++DEFINE_TRACE(workqueue_execution); ++ + static void run_workqueue(struct cpu_workqueue_struct *cwq) + { + spin_lock_irq(&cwq->lock); +@@ -284,7 +295,7 @@ static void run_workqueue(struct cpu_wor + */ + struct lockdep_map lockdep_map = work->lockdep_map; + #endif +- ++ trace_workqueue_execution(cwq->thread, work); + cwq->current_work = work; + list_del_init(cwq->worklist.next); + spin_unlock_irq(&cwq->lock); +@@ -765,6 +776,8 @@ init_cpu_workqueue(struct workqueue_stru + return cwq; + } + ++DEFINE_TRACE(workqueue_creation); ++ + static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) + { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +@@ -787,6 +800,8 @@ static int create_workqueue_thread(struc + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); + cwq->thread = p; + ++ trace_workqueue_creation(cwq->thread, cpu); ++ + return 0; + } + +@@ -868,6 +883,8 @@ struct workqueue_struct *__create_workqu + } + EXPORT_SYMBOL_GPL(__create_workqueue_key); + ++DEFINE_TRACE(workqueue_destruction); ++ + static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) + { + /* +@@ -891,10 +908,54 @@ static void cleanup_workqueue_thread(str + * checks list_empty(), and a "normal" queue_work() can't use + * a dead CPU. + */ ++ trace_workqueue_destruction(cwq->thread); + kthread_stop(cwq->thread); + cwq->thread = NULL; + } + ++void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu, ++ int policy, int rt_priority, int nice) ++{ ++ struct sched_param param = { .sched_priority = rt_priority }; ++ struct cpu_workqueue_struct *cwq; ++ mm_segment_t oldfs = get_fs(); ++ struct task_struct *p; ++ unsigned long flags; ++ int ret; ++ ++ cwq = per_cpu_ptr(wq->cpu_wq, cpu); ++ spin_lock_irqsave(&cwq->lock, flags); ++ p = cwq->thread; ++ spin_unlock_irqrestore(&cwq->lock, flags); ++ ++ set_user_nice(p, nice); ++ ++ set_fs(KERNEL_DS); ++ ret = sys_sched_setscheduler(p->pid, policy, ¶m); ++ set_fs(oldfs); ++ ++ WARN_ON(ret); ++} ++ ++void set_workqueue_prio(struct workqueue_struct *wq, int policy, ++ int rt_priority, int nice) ++{ ++ int cpu; ++ ++ /* We don't need the distraction of CPUs appearing and vanishing. 
*/ ++ get_online_cpus(); ++ spin_lock(&workqueue_lock); ++ if (is_wq_single_threaded(wq)) ++ set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice); ++ else { ++ for_each_online_cpu(cpu) ++ set_workqueue_thread_prio(wq, cpu, policy, ++ rt_priority, nice); ++ } ++ spin_unlock(&workqueue_lock); ++ put_online_cpus(); ++} ++ + /** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue +@@ -1021,6 +1082,7 @@ void __init init_workqueues(void) + hotcpu_notifier(workqueue_cpu_callback, 0); + keventd_wq = create_workqueue("events"); + BUG_ON(!keventd_wq); ++ set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20); + #ifdef CONFIG_SMP + work_on_cpu_wq = create_workqueue("work_on_cpu"); + BUG_ON(!work_on_cpu_wq); +Index: linux-2.6-tip/lib/Kconfig +=================================================================== +--- linux-2.6-tip.orig/lib/Kconfig ++++ linux-2.6-tip/lib/Kconfig +@@ -2,6 +2,9 @@ + # Library configuration + # + ++config BINARY_PRINTF ++ def_bool n ++ + menu "Library routines" + + config BITREVERSE +@@ -98,6 +101,20 @@ config LZO_DECOMPRESS + tristate + + # ++# These all provide a common interface (hence the apparent duplication with ++# ZLIB_INFLATE; DECOMPRESS_GZIP is just a wrapper.) ++# ++config DECOMPRESS_GZIP ++ select ZLIB_INFLATE ++ tristate ++ ++config DECOMPRESS_BZIP2 ++ tristate ++ ++config DECOMPRESS_LZMA ++ tristate ++ ++# + # Generic allocator support is selected if needed + # + config GENERIC_ALLOCATOR +@@ -136,12 +153,6 @@ config TEXTSEARCH_BM + config TEXTSEARCH_FSM + tristate + +-# +-# plist support is select#ed if needed +-# +-config PLIST +- boolean +- + config HAS_IOMEM + boolean + depends on !NO_IOMEM +@@ -165,6 +176,7 @@ config HAVE_LMB + + config CPUMASK_OFFSTACK + bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS ++ depends on !PREEMPT_RT && BROKEN + help + Use dynamic allocation for cpumask_var_t, instead of putting + them on the stack. This is a bit more expensive, but avoids +Index: linux-2.6-tip/lib/Kconfig.debug +=================================================================== +--- linux-2.6-tip.orig/lib/Kconfig.debug ++++ linux-2.6-tip/lib/Kconfig.debug +@@ -9,8 +9,20 @@ config PRINTK_TIME + operations. This is useful for identifying long delays + in kernel startup. + ++config ALLOW_WARNINGS ++ bool "Continue building despite compiler warnings" ++ default y ++ help ++ By disabling this option you will enable -Werror on building C ++ files. This causes all warnings to abort the compilation, just as ++ errors do. (It is generally not recommended to disable this option as ++ the overwhelming majority of warnings is harmless and also gcc puts ++ out false-positive warnings. It is useful for automated testing ++ though.) ++ + config ENABLE_WARN_DEPRECATED + bool "Enable __deprecated logic" ++ depends on ALLOW_WARNINGS + default y + help + Enable the __deprecated logic in the kernel build. +@@ -19,12 +31,13 @@ config ENABLE_WARN_DEPRECATED + + config ENABLE_MUST_CHECK + bool "Enable __must_check logic" +- default y ++ depends on ALLOW_WARNINGS + help + Enable the __must_check logic in the kernel build. Disable this to + suppress the "warning: ignoring return value of 'foo', declared with + attribute warn_unused_result" messages. + ++ + config FRAME_WARN + int "Warn for stack frames larger than (needs gcc 4.4)" + range 0 8192 +@@ -95,7 +108,6 @@ config HEADERS_CHECK + + config DEBUG_SECTION_MISMATCH + bool "Enable full Section mismatch analysis" +- depends on UNDEFINED + # This option is on purpose disabled for now. 
+ # It will be enabled when we are down to a resonable number
+ # of section mismatch warnings (< 10 for an allyesconfig build)
+@@ -186,6 +198,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
+ default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
+ default 1 if BOOTPARAM_SOFTLOCKUP_PANIC
+ 
++config DETECT_HUNG_TASK
++ bool "Detect Hung Tasks"
++ depends on DEBUG_KERNEL
++ default y
++ help
++ Say Y here to enable the kernel to detect "hung tasks",
++ which are bugs that cause the task to be stuck in
++ uninterruptible "D" state indefinitely.
++
++ When a hung task is detected, the kernel will print the
++ current stack trace (which you should report), but the
++ task will stay in uninterruptible state. If lockdep is
++ enabled then all held locks will also be reported. This
++ feature has negligible overhead.
++
++config BOOTPARAM_HUNG_TASK_PANIC
++ bool "Panic (Reboot) On Hung Tasks"
++ depends on DETECT_HUNG_TASK
++ help
++ Say Y here to enable the kernel to panic on "hung tasks",
++ which are bugs that cause the kernel to leave a task stuck
++ in uninterruptible "D" state.
++
++ The panic can be used in combination with panic_timeout,
++ to cause the system to reboot automatically after a
++ hung task has been detected. This feature is useful for
++ high-availability systems that have uptime guarantees and
++ where hung tasks must be resolved ASAP.
++
++ Say N if unsure.
++
++config BOOTPARAM_HUNG_TASK_PANIC_VALUE
++ int
++ depends on DETECT_HUNG_TASK
++ range 0 1
++ default 0 if !BOOTPARAM_HUNG_TASK_PANIC
++ default 1 if BOOTPARAM_HUNG_TASK_PANIC
++
+ config SCHED_DEBUG
+ bool "Collect scheduler debugging info"
+ depends on DEBUG_KERNEL && PROC_FS
+@@ -262,7 +312,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
+ 
+ config DEBUG_SLAB
+ bool "Debug slab memory allocations"
+- depends on DEBUG_KERNEL && SLAB
++ depends on DEBUG_KERNEL && SLAB && !KMEMCHECK
+ help
+ Say Y here to have the kernel do limited verification on memory
+ allocation as well as poisoning memory on free to catch use of freed
+@@ -274,7 +324,7 @@ config DEBUG_SLAB_LEAK
+ 
+ config SLUB_DEBUG_ON
+ bool "SLUB debugging on by default"
+- depends on SLUB && SLUB_DEBUG
++ depends on SLUB && SLUB_DEBUG && !KMEMCHECK
+ default n
+ help
+ Boot with debugging on by default. SLUB boots by default with
+@@ -314,6 +364,8 @@ config DEBUG_RT_MUTEXES
+ help
+ This allows rt mutex semantics violations and rt mutex related
+ deadlocks (lockups) to be detected and reported automatically.
++ When realtime preemption is enabled this includes spinlocks,
++ rwlocks, mutexes and (rw)semaphores.
+ 
+ config DEBUG_PI_LIST
+ bool
+@@ -337,7 +389,7 @@ config DEBUG_SPINLOCK
+ 
+ config DEBUG_MUTEXES
+ bool "Mutex debugging: basic checks"
+- depends on DEBUG_KERNEL
++ depends on DEBUG_KERNEL && !PREEMPT_RT
+ help
+ This feature allows mutex semantics violations to be detected and
+ reported.
+@@ -402,7 +454,7 @@ config LOCKDEP
+ bool
+ depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
+ select STACKTRACE
+- select FRAME_POINTER if !X86 && !MIPS && !PPC
++ select FRAME_POINTER if !MIPS && !PPC
+ select KALLSYMS
+ select KALLSYMS_ALL
+ 
+@@ -902,6 +954,19 @@ config DYNAMIC_PRINTK_DEBUG
+ debugging for all modules. This mode can be turned off via the above
+ disable command.
+ 
++config DMA_API_DEBUG
++ bool "Enable debugging of DMA-API usage"
++ depends on HAVE_DMA_API_DEBUG
++ help
++ Enable this option to debug the use of the DMA API by device drivers.
++ With this option you will be able to detect common bugs in device
++ drivers like double-freeing of DMA mappings or freeing mappings that
++ were never allocated.
++ This option causes a performance degradation. Use only if you want
++ to debug device drivers. If unsure, say N.
++
+ source "samples/Kconfig"
+ 
+ source "lib/Kconfig.kgdb"
++
++source "lib/Kconfig.kmemcheck"
+Index: linux-2.6-tip/lib/Kconfig.kmemcheck
+===================================================================
+--- /dev/null
++++ linux-2.6-tip/lib/Kconfig.kmemcheck
+@@ -0,0 +1,91 @@
++config HAVE_ARCH_KMEMCHECK
++ bool
++
++menuconfig KMEMCHECK
++ bool "kmemcheck: trap use of uninitialized memory"
++ depends on DEBUG_KERNEL
++ depends on !X86_USE_3DNOW
++ depends on SLUB || SLAB
++ depends on !CC_OPTIMIZE_FOR_SIZE
++ depends on !FUNCTION_TRACER
++ select FRAME_POINTER
++ select STACKTRACE
++ default n
++ help
++ This option enables tracing of dynamically allocated kernel memory
++ to see if memory is used before it has been given an initial value.
++ Be aware that this requires half of your memory for bookkeeping and
++ will insert extra code at *every* read and write to tracked memory,
++ thus slowing down the kernel code (but user code is unaffected).
++
++ The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable
++ or enable kmemcheck at boot-time. If the kernel is started with
++ kmemcheck=0, the large memory and CPU overhead is not incurred.
++
++choice
++ prompt "kmemcheck: default mode at boot"
++ depends on KMEMCHECK
++ default KMEMCHECK_ONESHOT_BY_DEFAULT
++ help
++ This option controls the default behaviour of kmemcheck when the
++ kernel boots and no kmemcheck= parameter is given.
++
++config KMEMCHECK_DISABLED_BY_DEFAULT
++ bool "disabled"
++ depends on KMEMCHECK
++
++config KMEMCHECK_ENABLED_BY_DEFAULT
++ bool "enabled"
++ depends on KMEMCHECK
++
++config KMEMCHECK_ONESHOT_BY_DEFAULT
++ bool "one-shot"
++ depends on KMEMCHECK
++ help
++ In one-shot mode, only the first error detected is reported before
++ kmemcheck is disabled.
++
++endchoice
++
++config KMEMCHECK_QUEUE_SIZE
++ int "kmemcheck: error queue size"
++ depends on KMEMCHECK
++ default 64
++ help
++ Select the maximum number of errors to store in the queue. Since
++ errors can occur virtually anywhere and in any context, we need a
++ temporary storage area which is guaranteed not to generate any
++ other faults. The queue will be emptied as soon as a tasklet may
++ be scheduled. If the queue is full, new error reports will be
++ lost.
++
++config KMEMCHECK_SHADOW_COPY_SHIFT
++ int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)"
++ depends on KMEMCHECK
++ range 2 8
++ default 5
++ help
++ Select the number of shadow bytes to save along with each entry of
++ the queue. These bytes indicate what parts of an allocation are
++ initialized, uninitialized, etc. and will be displayed when an
++ error is detected to help the debugging of a particular problem.
++
++config KMEMCHECK_PARTIAL_OK
++ bool "kmemcheck: allow partially uninitialized memory"
++ depends on KMEMCHECK
++ default y
++ help
++ This option works around certain GCC optimizations that produce
++ 32-bit reads from 16-bit variables where the upper 16 bits are
++ thrown away afterwards. This may of course also hide some real
++ bugs.
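Illustrative aside, not part of the patch: a minimal C sketch of the access pattern the KMEMCHECK_PARTIAL_OK help text above describes. The struct and function names are hypothetical; the point is that the C source only reads an initialized 16-bit member, while the compiler may emit a wider load that also covers neighbouring, still-uninitialized bytes and then discards them.

#include <linux/types.h>

/* Hypothetical layout: only "len" has been written so far. */
struct pkt_hdr {
	u16 len;	/* initialized by the caller */
	u16 flags;	/* not yet initialized */
};

static u16 pkt_len(const struct pkt_hdr *hdr)
{
	/*
	 * The source touches only hdr->len, but GCC may load the full
	 * 32-bit word containing both members and mask off the upper
	 * half.  Without KMEMCHECK_PARTIAL_OK, kmemcheck would report
	 * the uninitialized "flags" bytes covered by that wider read.
	 */
	return hdr->len;
}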
++ ++config KMEMCHECK_BITOPS_OK ++ bool "kmemcheck: allow bit-field manipulation" ++ depends on KMEMCHECK ++ default n ++ help ++ This option silences warnings that would be generated for bit-field ++ accesses where not all the bits are initialized at the same time. ++ This may also hide some real bugs. ++ +Index: linux-2.6-tip/lib/Makefile +=================================================================== +--- linux-2.6-tip.orig/lib/Makefile ++++ linux-2.6-tip/lib/Makefile +@@ -11,7 +11,8 @@ lib-y := ctype.o string.o vsprintf.o cmd + rbtree.o radix-tree.o dump_stack.o \ + idr.o int_sqrt.o extable.o prio_tree.o \ + sha1.o irq_regs.o reciprocal_div.o argv_split.o \ +- proportions.o prio_heap.o ratelimit.o show_mem.o is_single_threaded.o ++ proportions.o prio_heap.o ratelimit.o show_mem.o \ ++ is_single_threaded.o plist.o decompress.o + + lib-$(CONFIG_MMU) += ioremap.o + lib-$(CONFIG_SMP) += cpumask.o +@@ -33,14 +34,14 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o + obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o + obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o + obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +-lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o ++obj-$(CONFIG_PREEMPT_RT) += plist.o ++obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o + lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o + lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o + lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o + lib-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o + obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o + obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o +-obj-$(CONFIG_PLIST) += plist.o + obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o + obj-$(CONFIG_DEBUG_LIST) += list_debug.o + obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o +@@ -65,6 +66,10 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solom + obj-$(CONFIG_LZO_COMPRESS) += lzo/ + obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ + ++lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o ++lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o ++lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o ++ + obj-$(CONFIG_TEXTSEARCH) += textsearch.o + obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o + obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o +@@ -84,6 +89,8 @@ obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += sys + + obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o + ++obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o ++ + hostprogs-y := gen_crc32table + clean-files := crc32table.h + +Index: linux-2.6-tip/lib/debugobjects.c +=================================================================== +--- linux-2.6-tip.orig/lib/debugobjects.c ++++ linux-2.6-tip/lib/debugobjects.c +@@ -25,14 +25,14 @@ + + struct debug_bucket { + struct hlist_head list; +- spinlock_t lock; ++ raw_spinlock_t lock; + }; + + static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; + +-static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE]; ++static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; + +-static DEFINE_SPINLOCK(pool_lock); ++static DEFINE_RAW_SPINLOCK(pool_lock); + + static HLIST_HEAD(obj_pool); + +@@ -50,12 +50,23 @@ static int debug_objects_enabled __rea + + static struct debug_obj_descr *descr_test __read_mostly; + ++static void free_obj_work(struct work_struct *work); ++static DECLARE_WORK(debug_obj_work, free_obj_work); ++ + static int __init enable_object_debug(char *str) + { + debug_objects_enabled = 1; + return 0; + } ++ ++static int __init disable_object_debug(char *str) ++{ ++ debug_objects_enabled = 0; ++ return 0; ++} ++ + 
early_param("debug_objects", enable_object_debug); ++early_param("no_debug_objects", disable_object_debug); + + static const char *obj_states[ODEBUG_STATE_MAX] = { + [ODEBUG_STATE_NONE] = "none", +@@ -146,25 +157,51 @@ alloc_object(void *addr, struct debug_bu + } + + /* +- * Put the object back into the pool or give it back to kmem_cache: ++ * workqueue function to free objects. + */ +-static void free_object(struct debug_obj *obj) ++static void free_obj_work(struct work_struct *work) + { +- unsigned long idx = (unsigned long)(obj - obj_static_pool); ++ struct debug_obj *obj; + unsigned long flags; + +- if (obj_pool_free < ODEBUG_POOL_SIZE || idx < ODEBUG_POOL_SIZE) { +- spin_lock_irqsave(&pool_lock, flags); +- hlist_add_head(&obj->node, &obj_pool); +- obj_pool_free++; +- obj_pool_used--; +- spin_unlock_irqrestore(&pool_lock, flags); +- } else { +- spin_lock_irqsave(&pool_lock, flags); +- obj_pool_used--; ++ spin_lock_irqsave(&pool_lock, flags); ++ while (obj_pool_free > ODEBUG_POOL_SIZE) { ++ obj = hlist_entry(obj_pool.first, typeof(*obj), node); ++ hlist_del(&obj->node); ++ obj_pool_free--; ++ /* ++ * We release pool_lock across kmem_cache_free() to ++ * avoid contention on pool_lock. ++ */ + spin_unlock_irqrestore(&pool_lock, flags); + kmem_cache_free(obj_cache, obj); ++ spin_lock_irqsave(&pool_lock, flags); + } ++ spin_unlock_irqrestore(&pool_lock, flags); ++} ++ ++/* ++ * Put the object back into the pool and schedule work to free objects ++ * if necessary. ++ */ ++static void free_object(struct debug_obj *obj) ++{ ++ unsigned long flags; ++ int sched = 0; ++ ++ spin_lock_irqsave(&pool_lock, flags); ++ /* ++ * schedule work when the pool is filled and the cache is ++ * initialized: ++ */ ++ if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache) ++ sched = !work_pending(&debug_obj_work); ++ hlist_add_head(&obj->node, &obj_pool); ++ obj_pool_free++; ++ obj_pool_used--; ++ spin_unlock_irqrestore(&pool_lock, flags); ++ if (sched) ++ schedule_work(&debug_obj_work); + } + + /* +@@ -876,6 +913,63 @@ void __init debug_objects_early_init(voi + } + + /* ++ * Convert the statically allocated objects to dynamic ones: ++ */ ++static int debug_objects_replace_static_objects(void) ++{ ++ struct debug_bucket *db = obj_hash; ++ struct hlist_node *node, *tmp; ++ struct debug_obj *obj, *new; ++ HLIST_HEAD(objects); ++ int i, cnt = 0; ++ ++ for (i = 0; i < ODEBUG_POOL_SIZE; i++) { ++ obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL); ++ if (!obj) ++ goto free; ++ hlist_add_head(&obj->node, &objects); ++ } ++ ++ /* ++ * When debug_objects_mem_init() is called we know that only ++ * one CPU is up, so disabling interrupts is enough ++ * protection. This avoids the lockdep hell of lock ordering. 
++ */ ++ local_irq_disable(); ++ ++ /* Remove the statically allocated objects from the pool */ ++ hlist_for_each_entry_safe(obj, node, tmp, &obj_pool, node) ++ hlist_del(&obj->node); ++ /* Move the allocated objects to the pool */ ++ hlist_move_list(&objects, &obj_pool); ++ ++ /* Replace the active object references */ ++ for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { ++ hlist_move_list(&db->list, &objects); ++ ++ hlist_for_each_entry(obj, node, &objects, node) { ++ new = hlist_entry(obj_pool.first, typeof(*obj), node); ++ hlist_del(&new->node); ++ /* copy object data */ ++ *new = *obj; ++ hlist_add_head(&new->node, &db->list); ++ cnt++; ++ } ++ } ++ ++ printk(KERN_DEBUG "ODEBUG: %d of %d active objects replaced\n", cnt, ++ obj_pool_used); ++ local_irq_enable(); ++ return 0; ++free: ++ hlist_for_each_entry_safe(obj, node, tmp, &objects, node) { ++ hlist_del(&obj->node); ++ kmem_cache_free(obj_cache, obj); ++ } ++ return -ENOMEM; ++} ++ ++/* + * Called after the kmem_caches are functional to setup a dedicated + * cache pool, which has the SLAB_DEBUG_OBJECTS flag set. This flag + * prevents that the debug code is called on kmem_cache_free() for the +@@ -890,8 +984,11 @@ void __init debug_objects_mem_init(void) + sizeof (struct debug_obj), 0, + SLAB_DEBUG_OBJECTS, NULL); + +- if (!obj_cache) ++ if (!obj_cache || debug_objects_replace_static_objects()) { + debug_objects_enabled = 0; +- else ++ if (obj_cache) ++ kmem_cache_destroy(obj_cache); ++ printk(KERN_WARNING "ODEBUG: out of memory.\n"); ++ } else + debug_objects_selftest(); + } +Index: linux-2.6-tip/lib/decompress.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/lib/decompress.c +@@ -0,0 +1,54 @@ ++/* ++ * decompress.c ++ * ++ * Detect the decompression method based on magic number ++ */ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++#ifndef CONFIG_DECOMPRESS_GZIP ++# define gunzip NULL ++#endif ++#ifndef CONFIG_DECOMPRESS_BZIP2 ++# define bunzip2 NULL ++#endif ++#ifndef CONFIG_DECOMPRESS_LZMA ++# define unlzma NULL ++#endif ++ ++static const struct compress_format { ++ unsigned char magic[2]; ++ const char *name; ++ decompress_fn decompressor; ++} compressed_formats[] = { ++ { {037, 0213}, "gzip", gunzip }, ++ { {037, 0236}, "gzip", gunzip }, ++ { {0x42, 0x5a}, "bzip2", bunzip2 }, ++ { {0x5d, 0x00}, "lzma", unlzma }, ++ { {0, 0}, NULL, NULL } ++}; ++ ++decompress_fn decompress_method(const unsigned char *inbuf, int len, ++ const char **name) ++{ ++ const struct compress_format *cf; ++ ++ if (len < 2) ++ return NULL; /* Need at least this much... */ ++ ++ for (cf = compressed_formats; cf->name; cf++) { ++ if (!memcmp(inbuf, cf->magic, 2)) ++ break; ++ ++ } ++ if (name) ++ *name = cf->name; ++ return cf->decompressor; ++} +Index: linux-2.6-tip/lib/decompress_bunzip2.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/lib/decompress_bunzip2.c +@@ -0,0 +1,736 @@ ++/* vi: set sw = 4 ts = 4: */ ++/* Small bzip2 deflate implementation, by Rob Landley (rob@landley.net). ++ ++ Based on bzip2 decompression code by Julian R Seward (jseward@acm.org), ++ which also acknowledges contributions by Mike Burrows, David Wheeler, ++ Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten, ++ Robert Sedgewick, and Jon L. Bentley. ++ ++ This code is licensed under the LGPLv2: ++ LGPL (http://www.gnu.org/copyleft/lgpl.html ++*/ ++ ++/* ++ Size and speed optimizations by Manuel Novoa III (mjn3@codepoet.org). 
++ ++ More efficient reading of Huffman codes, a streamlined read_bunzip() ++ function, and various other tweaks. In (limited) tests, approximately ++ 20% faster than bzcat on x86 and about 10% faster on arm. ++ ++ Note that about 2/3 of the time is spent in read_unzip() reversing ++ the Burrows-Wheeler transformation. Much of that time is delay ++ resulting from cache misses. ++ ++ I would ask that anyone benefiting from this work, especially those ++ using it in commercial products, consider making a donation to my local ++ non-profit hospice organization in the name of the woman I loved, who ++ passed away Feb. 12, 2003. ++ ++ In memory of Toni W. Hagan ++ ++ Hospice of Acadiana, Inc. ++ 2600 Johnston St., Suite 200 ++ Lafayette, LA 70503-3240 ++ ++ Phone (337) 232-1234 or 1-800-738-2226 ++ Fax (337) 232-1297 ++ ++ http://www.hospiceacadiana.com/ ++ ++ Manuel ++ */ ++ ++/* ++ Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu) ++*/ ++ ++ ++#ifndef STATIC ++#include ++#endif /* !STATIC */ ++ ++#include ++#include ++ ++#ifndef INT_MAX ++#define INT_MAX 0x7fffffff ++#endif ++ ++/* Constants for Huffman coding */ ++#define MAX_GROUPS 6 ++#define GROUP_SIZE 50 /* 64 would have been more efficient */ ++#define MAX_HUFCODE_BITS 20 /* Longest Huffman code allowed */ ++#define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */ ++#define SYMBOL_RUNA 0 ++#define SYMBOL_RUNB 1 ++ ++/* Status return values */ ++#define RETVAL_OK 0 ++#define RETVAL_LAST_BLOCK (-1) ++#define RETVAL_NOT_BZIP_DATA (-2) ++#define RETVAL_UNEXPECTED_INPUT_EOF (-3) ++#define RETVAL_UNEXPECTED_OUTPUT_EOF (-4) ++#define RETVAL_DATA_ERROR (-5) ++#define RETVAL_OUT_OF_MEMORY (-6) ++#define RETVAL_OBSOLETE_INPUT (-7) ++ ++/* Other housekeeping constants */ ++#define BZIP2_IOBUF_SIZE 4096 ++ ++/* This is what we know about each Huffman coding group */ ++struct group_data { ++ /* We have an extra slot at the end of limit[] for a sentinal value. */ ++ int limit[MAX_HUFCODE_BITS+1]; ++ int base[MAX_HUFCODE_BITS]; ++ int permute[MAX_SYMBOLS]; ++ int minLen, maxLen; ++}; ++ ++/* Structure holding all the housekeeping data, including IO buffers and ++ memory that persists between calls to bunzip */ ++struct bunzip_data { ++ /* State for interrupting output loop */ ++ int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent; ++ /* I/O tracking data (file handles, buffers, positions, etc.) */ ++ int (*fill)(void*, unsigned int); ++ int inbufCount, inbufPos /*, outbufPos*/; ++ unsigned char *inbuf /*,*outbuf*/; ++ unsigned int inbufBitCount, inbufBits; ++ /* The CRC values stored in the block header and calculated from the ++ data */ ++ unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC; ++ /* Intermediate buffer and its size (in bytes) */ ++ unsigned int *dbuf, dbufSize; ++ /* These things are a bit too big to go on the stack */ ++ unsigned char selectors[32768]; /* nSelectors = 15 bits */ ++ struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */ ++ int io_error; /* non-zero if we have IO error */ ++}; ++ ++ ++/* Return the next nnn bits of input. All reads from the compressed input ++ are done through this function. All reads are big endian */ ++static unsigned int INIT get_bits(struct bunzip_data *bd, char bits_wanted) ++{ ++ unsigned int bits = 0; ++ ++ /* If we need to get more data from the byte buffer, do so. ++ (Loop getting one byte at a time to enforce endianness and avoid ++ unaligned access.) 
*/ ++ while (bd->inbufBitCount < bits_wanted) { ++ /* If we need to read more data from file into byte buffer, do ++ so */ ++ if (bd->inbufPos == bd->inbufCount) { ++ if (bd->io_error) ++ return 0; ++ bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE); ++ if (bd->inbufCount <= 0) { ++ bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF; ++ return 0; ++ } ++ bd->inbufPos = 0; ++ } ++ /* Avoid 32-bit overflow (dump bit buffer to top of output) */ ++ if (bd->inbufBitCount >= 24) { ++ bits = bd->inbufBits&((1 << bd->inbufBitCount)-1); ++ bits_wanted -= bd->inbufBitCount; ++ bits <<= bits_wanted; ++ bd->inbufBitCount = 0; ++ } ++ /* Grab next 8 bits of input from buffer. */ ++ bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++]; ++ bd->inbufBitCount += 8; ++ } ++ /* Calculate result */ ++ bd->inbufBitCount -= bits_wanted; ++ bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1); ++ ++ return bits; ++} ++ ++/* Unpacks the next block and sets up for the inverse burrows-wheeler step. */ ++ ++static int INIT get_next_block(struct bunzip_data *bd) ++{ ++ struct group_data *hufGroup = NULL; ++ int *base = NULL; ++ int *limit = NULL; ++ int dbufCount, nextSym, dbufSize, groupCount, selector, ++ i, j, k, t, runPos, symCount, symTotal, nSelectors, ++ byteCount[256]; ++ unsigned char uc, symToByte[256], mtfSymbol[256], *selectors; ++ unsigned int *dbuf, origPtr; ++ ++ dbuf = bd->dbuf; ++ dbufSize = bd->dbufSize; ++ selectors = bd->selectors; ++ ++ /* Read in header signature and CRC, then validate signature. ++ (last block signature means CRC is for whole file, return now) */ ++ i = get_bits(bd, 24); ++ j = get_bits(bd, 24); ++ bd->headerCRC = get_bits(bd, 32); ++ if ((i == 0x177245) && (j == 0x385090)) ++ return RETVAL_LAST_BLOCK; ++ if ((i != 0x314159) || (j != 0x265359)) ++ return RETVAL_NOT_BZIP_DATA; ++ /* We can add support for blockRandomised if anybody complains. ++ There was some code for this in busybox 1.0.0-pre3, but nobody ever ++ noticed that it didn't actually work. */ ++ if (get_bits(bd, 1)) ++ return RETVAL_OBSOLETE_INPUT; ++ origPtr = get_bits(bd, 24); ++ if (origPtr > dbufSize) ++ return RETVAL_DATA_ERROR; ++ /* mapping table: if some byte values are never used (encoding things ++ like ascii text), the compression code removes the gaps to have fewer ++ symbols to deal with, and writes a sparse bitfield indicating which ++ values were present. We make a translation table to convert the ++ symbols back to the corresponding bytes. */ ++ t = get_bits(bd, 16); ++ symTotal = 0; ++ for (i = 0; i < 16; i++) { ++ if (t&(1 << (15-i))) { ++ k = get_bits(bd, 16); ++ for (j = 0; j < 16; j++) ++ if (k&(1 << (15-j))) ++ symToByte[symTotal++] = (16*i)+j; ++ } ++ } ++ /* How many different Huffman coding groups does this block use? */ ++ groupCount = get_bits(bd, 3); ++ if (groupCount < 2 || groupCount > MAX_GROUPS) ++ return RETVAL_DATA_ERROR; ++ /* nSelectors: Every GROUP_SIZE many symbols we select a new ++ Huffman coding group. Read in the group selector list, ++ which is stored as MTF encoded bit runs. (MTF = Move To ++ Front, as each value is used it's moved to the start of the ++ list.) 
*/ ++ nSelectors = get_bits(bd, 15); ++ if (!nSelectors) ++ return RETVAL_DATA_ERROR; ++ for (i = 0; i < groupCount; i++) ++ mtfSymbol[i] = i; ++ for (i = 0; i < nSelectors; i++) { ++ /* Get next value */ ++ for (j = 0; get_bits(bd, 1); j++) ++ if (j >= groupCount) ++ return RETVAL_DATA_ERROR; ++ /* Decode MTF to get the next selector */ ++ uc = mtfSymbol[j]; ++ for (; j; j--) ++ mtfSymbol[j] = mtfSymbol[j-1]; ++ mtfSymbol[0] = selectors[i] = uc; ++ } ++ /* Read the Huffman coding tables for each group, which code ++ for symTotal literal symbols, plus two run symbols (RUNA, ++ RUNB) */ ++ symCount = symTotal+2; ++ for (j = 0; j < groupCount; j++) { ++ unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1]; ++ int minLen, maxLen, pp; ++ /* Read Huffman code lengths for each symbol. They're ++ stored in a way similar to mtf; record a starting ++ value for the first symbol, and an offset from the ++ previous value for everys symbol after that. ++ (Subtracting 1 before the loop and then adding it ++ back at the end is an optimization that makes the ++ test inside the loop simpler: symbol length 0 ++ becomes negative, so an unsigned inequality catches ++ it.) */ ++ t = get_bits(bd, 5)-1; ++ for (i = 0; i < symCount; i++) { ++ for (;;) { ++ if (((unsigned)t) > (MAX_HUFCODE_BITS-1)) ++ return RETVAL_DATA_ERROR; ++ ++ /* If first bit is 0, stop. Else ++ second bit indicates whether to ++ increment or decrement the value. ++ Optimization: grab 2 bits and unget ++ the second if the first was 0. */ ++ ++ k = get_bits(bd, 2); ++ if (k < 2) { ++ bd->inbufBitCount++; ++ break; ++ } ++ /* Add one if second bit 1, else ++ * subtract 1. Avoids if/else */ ++ t += (((k+1)&2)-1); ++ } ++ /* Correct for the initial -1, to get the ++ * final symbol length */ ++ length[i] = t+1; ++ } ++ /* Find largest and smallest lengths in this group */ ++ minLen = maxLen = length[0]; ++ ++ for (i = 1; i < symCount; i++) { ++ if (length[i] > maxLen) ++ maxLen = length[i]; ++ else if (length[i] < minLen) ++ minLen = length[i]; ++ } ++ ++ /* Calculate permute[], base[], and limit[] tables from ++ * length[]. ++ * ++ * permute[] is the lookup table for converting ++ * Huffman coded symbols into decoded symbols. base[] ++ * is the amount to subtract from the value of a ++ * Huffman symbol of a given length when using ++ * permute[]. ++ * ++ * limit[] indicates the largest numerical value a ++ * symbol with a given number of bits can have. This ++ * is how the Huffman codes can vary in length: each ++ * code with a value > limit[length] needs another ++ * bit. ++ */ ++ hufGroup = bd->groups+j; ++ hufGroup->minLen = minLen; ++ hufGroup->maxLen = maxLen; ++ /* Note that minLen can't be smaller than 1, so we ++ adjust the base and limit array pointers so we're ++ not always wasting the first entry. We do this ++ again when using them (during symbol decoding).*/ ++ base = hufGroup->base-1; ++ limit = hufGroup->limit-1; ++ /* Calculate permute[]. Concurently, initialize ++ * temp[] and limit[]. 
*/ ++ pp = 0; ++ for (i = minLen; i <= maxLen; i++) { ++ temp[i] = limit[i] = 0; ++ for (t = 0; t < symCount; t++) ++ if (length[t] == i) ++ hufGroup->permute[pp++] = t; ++ } ++ /* Count symbols coded for at each bit length */ ++ for (i = 0; i < symCount; i++) ++ temp[length[i]]++; ++ /* Calculate limit[] (the largest symbol-coding value ++ *at each bit length, which is (previous limit << ++ *1)+symbols at this level), and base[] (number of ++ *symbols to ignore at each bit length, which is limit ++ *minus the cumulative count of symbols coded for ++ *already). */ ++ pp = t = 0; ++ for (i = minLen; i < maxLen; i++) { ++ pp += temp[i]; ++ /* We read the largest possible symbol size ++ and then unget bits after determining how ++ many we need, and those extra bits could be ++ set to anything. (They're noise from ++ future symbols.) At each level we're ++ really only interested in the first few ++ bits, so here we set all the trailing ++ to-be-ignored bits to 1 so they don't ++ affect the value > limit[length] ++ comparison. */ ++ limit[i] = (pp << (maxLen - i)) - 1; ++ pp <<= 1; ++ base[i+1] = pp-(t += temp[i]); ++ } ++ limit[maxLen+1] = INT_MAX; /* Sentinal value for ++ * reading next sym. */ ++ limit[maxLen] = pp+temp[maxLen]-1; ++ base[minLen] = 0; ++ } ++ /* We've finished reading and digesting the block header. Now ++ read this block's Huffman coded symbols from the file and ++ undo the Huffman coding and run length encoding, saving the ++ result into dbuf[dbufCount++] = uc */ ++ ++ /* Initialize symbol occurrence counters and symbol Move To ++ * Front table */ ++ for (i = 0; i < 256; i++) { ++ byteCount[i] = 0; ++ mtfSymbol[i] = (unsigned char)i; ++ } ++ /* Loop through compressed symbols. */ ++ runPos = dbufCount = symCount = selector = 0; ++ for (;;) { ++ /* Determine which Huffman coding group to use. */ ++ if (!(symCount--)) { ++ symCount = GROUP_SIZE-1; ++ if (selector >= nSelectors) ++ return RETVAL_DATA_ERROR; ++ hufGroup = bd->groups+selectors[selector++]; ++ base = hufGroup->base-1; ++ limit = hufGroup->limit-1; ++ } ++ /* Read next Huffman-coded symbol. */ ++ /* Note: It is far cheaper to read maxLen bits and ++ back up than it is to read minLen bits and then an ++ additional bit at a time, testing as we go. ++ Because there is a trailing last block (with file ++ CRC), there is no danger of the overread causing an ++ unexpected EOF for a valid compressed file. As a ++ further optimization, we do the read inline ++ (falling back to a call to get_bits if the buffer ++ runs dry). 
The following (up to got_huff_bits:) is ++ equivalent to j = get_bits(bd, hufGroup->maxLen); ++ */ ++ while (bd->inbufBitCount < hufGroup->maxLen) { ++ if (bd->inbufPos == bd->inbufCount) { ++ j = get_bits(bd, hufGroup->maxLen); ++ goto got_huff_bits; ++ } ++ bd->inbufBits = ++ (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++]; ++ bd->inbufBitCount += 8; ++ }; ++ bd->inbufBitCount -= hufGroup->maxLen; ++ j = (bd->inbufBits >> bd->inbufBitCount)& ++ ((1 << hufGroup->maxLen)-1); ++got_huff_bits: ++ /* Figure how how many bits are in next symbol and ++ * unget extras */ ++ i = hufGroup->minLen; ++ while (j > limit[i]) ++ ++i; ++ bd->inbufBitCount += (hufGroup->maxLen - i); ++ /* Huffman decode value to get nextSym (with bounds checking) */ ++ if ((i > hufGroup->maxLen) ++ || (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i])) ++ >= MAX_SYMBOLS)) ++ return RETVAL_DATA_ERROR; ++ nextSym = hufGroup->permute[j]; ++ /* We have now decoded the symbol, which indicates ++ either a new literal byte, or a repeated run of the ++ most recent literal byte. First, check if nextSym ++ indicates a repeated run, and if so loop collecting ++ how many times to repeat the last literal. */ ++ if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */ ++ /* If this is the start of a new run, zero out ++ * counter */ ++ if (!runPos) { ++ runPos = 1; ++ t = 0; ++ } ++ /* Neat trick that saves 1 symbol: instead of ++ or-ing 0 or 1 at each bit position, add 1 ++ or 2 instead. For example, 1011 is 1 << 0 ++ + 1 << 1 + 2 << 2. 1010 is 2 << 0 + 2 << 1 ++ + 1 << 2. You can make any bit pattern ++ that way using 1 less symbol than the basic ++ or 0/1 method (except all bits 0, which ++ would use no symbols, but a run of length 0 ++ doesn't mean anything in this context). ++ Thus space is saved. */ ++ t += (runPos << nextSym); ++ /* +runPos if RUNA; +2*runPos if RUNB */ ++ ++ runPos <<= 1; ++ continue; ++ } ++ /* When we hit the first non-run symbol after a run, ++ we now know how many times to repeat the last ++ literal, so append that many copies to our buffer ++ of decoded symbols (dbuf) now. (The last literal ++ used is the one at the head of the mtfSymbol ++ array.) */ ++ if (runPos) { ++ runPos = 0; ++ if (dbufCount+t >= dbufSize) ++ return RETVAL_DATA_ERROR; ++ ++ uc = symToByte[mtfSymbol[0]]; ++ byteCount[uc] += t; ++ while (t--) ++ dbuf[dbufCount++] = uc; ++ } ++ /* Is this the terminating symbol? */ ++ if (nextSym > symTotal) ++ break; ++ /* At this point, nextSym indicates a new literal ++ character. Subtract one to get the position in the ++ MTF array at which this literal is currently to be ++ found. (Note that the result can't be -1 or 0, ++ because 0 and 1 are RUNA and RUNB. But another ++ instance of the first symbol in the mtf array, ++ position 0, would have been handled as part of a ++ run above. Therefore 1 unused mtf position minus 2 ++ non-literal nextSym values equals -1.) */ ++ if (dbufCount >= dbufSize) ++ return RETVAL_DATA_ERROR; ++ i = nextSym - 1; ++ uc = mtfSymbol[i]; ++ /* Adjust the MTF array. Since we typically expect to ++ *move only a small number of symbols, and are bound ++ *by 256 in any case, using memmove here would ++ *typically be bigger and slower due to function call ++ *overhead and other assorted setup costs. */ ++ do { ++ mtfSymbol[i] = mtfSymbol[i-1]; ++ } while (--i); ++ mtfSymbol[0] = uc; ++ uc = symToByte[uc]; ++ /* We have our literal byte. Save it into dbuf. 
*/ ++ byteCount[uc]++; ++ dbuf[dbufCount++] = (unsigned int)uc; ++ } ++ /* At this point, we've read all the Huffman-coded symbols ++ (and repeated runs) for this block from the input stream, ++ and decoded them into the intermediate buffer. There are ++ dbufCount many decoded bytes in dbuf[]. Now undo the ++ Burrows-Wheeler transform on dbuf. See ++ http://dogma.net/markn/articles/bwt/bwt.htm ++ */ ++ /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */ ++ j = 0; ++ for (i = 0; i < 256; i++) { ++ k = j+byteCount[i]; ++ byteCount[i] = j; ++ j = k; ++ } ++ /* Figure out what order dbuf would be in if we sorted it. */ ++ for (i = 0; i < dbufCount; i++) { ++ uc = (unsigned char)(dbuf[i] & 0xff); ++ dbuf[byteCount[uc]] |= (i << 8); ++ byteCount[uc]++; ++ } ++ /* Decode first byte by hand to initialize "previous" byte. ++ Note that it doesn't get output, and if the first three ++ characters are identical it doesn't qualify as a run (hence ++ writeRunCountdown = 5). */ ++ if (dbufCount) { ++ if (origPtr >= dbufCount) ++ return RETVAL_DATA_ERROR; ++ bd->writePos = dbuf[origPtr]; ++ bd->writeCurrent = (unsigned char)(bd->writePos&0xff); ++ bd->writePos >>= 8; ++ bd->writeRunCountdown = 5; ++ } ++ bd->writeCount = dbufCount; ++ ++ return RETVAL_OK; ++} ++ ++/* Undo burrows-wheeler transform on intermediate buffer to produce output. ++ If start_bunzip was initialized with out_fd =-1, then up to len bytes of ++ data are written to outbuf. Return value is number of bytes written or ++ error (all errors are negative numbers). If out_fd!=-1, outbuf and len ++ are ignored, data is written to out_fd and return is RETVAL_OK or error. ++*/ ++ ++static int INIT read_bunzip(struct bunzip_data *bd, char *outbuf, int len) ++{ ++ const unsigned int *dbuf; ++ int pos, xcurrent, previous, gotcount; ++ ++ /* If last read was short due to end of file, return last block now */ ++ if (bd->writeCount < 0) ++ return bd->writeCount; ++ ++ gotcount = 0; ++ dbuf = bd->dbuf; ++ pos = bd->writePos; ++ xcurrent = bd->writeCurrent; ++ ++ /* We will always have pending decoded data to write into the output ++ buffer unless this is the very first call (in which case we haven't ++ Huffman-decoded a block into the intermediate buffer yet). */ ++ ++ if (bd->writeCopies) { ++ /* Inside the loop, writeCopies means extra copies (beyond 1) */ ++ --bd->writeCopies; ++ /* Loop outputting bytes */ ++ for (;;) { ++ /* If the output buffer is full, snapshot ++ * state and return */ ++ if (gotcount >= len) { ++ bd->writePos = pos; ++ bd->writeCurrent = xcurrent; ++ bd->writeCopies++; ++ return len; ++ } ++ /* Write next byte into output buffer, updating CRC */ ++ outbuf[gotcount++] = xcurrent; ++ bd->writeCRC = (((bd->writeCRC) << 8) ++ ^bd->crc32Table[((bd->writeCRC) >> 24) ++ ^xcurrent]); ++ /* Loop now if we're outputting multiple ++ * copies of this byte */ ++ if (bd->writeCopies) { ++ --bd->writeCopies; ++ continue; ++ } ++decode_next_byte: ++ if (!bd->writeCount--) ++ break; ++ /* Follow sequence vector to undo ++ * Burrows-Wheeler transform */ ++ previous = xcurrent; ++ pos = dbuf[pos]; ++ xcurrent = pos&0xff; ++ pos >>= 8; ++ /* After 3 consecutive copies of the same ++ byte, the 4th is a repeat count. 
We count ++ down from 4 instead *of counting up because ++ testing for non-zero is faster */ ++ if (--bd->writeRunCountdown) { ++ if (xcurrent != previous) ++ bd->writeRunCountdown = 4; ++ } else { ++ /* We have a repeated run, this byte ++ * indicates the count */ ++ bd->writeCopies = xcurrent; ++ xcurrent = previous; ++ bd->writeRunCountdown = 5; ++ /* Sometimes there are just 3 bytes ++ * (run length 0) */ ++ if (!bd->writeCopies) ++ goto decode_next_byte; ++ /* Subtract the 1 copy we'd output ++ * anyway to get extras */ ++ --bd->writeCopies; ++ } ++ } ++ /* Decompression of this block completed successfully */ ++ bd->writeCRC = ~bd->writeCRC; ++ bd->totalCRC = ((bd->totalCRC << 1) | ++ (bd->totalCRC >> 31)) ^ bd->writeCRC; ++ /* If this block had a CRC error, force file level CRC error. */ ++ if (bd->writeCRC != bd->headerCRC) { ++ bd->totalCRC = bd->headerCRC+1; ++ return RETVAL_LAST_BLOCK; ++ } ++ } ++ ++ /* Refill the intermediate buffer by Huffman-decoding next ++ * block of input */ ++ /* (previous is just a convenient unused temp variable here) */ ++ previous = get_next_block(bd); ++ if (previous) { ++ bd->writeCount = previous; ++ return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount; ++ } ++ bd->writeCRC = 0xffffffffUL; ++ pos = bd->writePos; ++ xcurrent = bd->writeCurrent; ++ goto decode_next_byte; ++} ++ ++static int INIT nofill(void *buf, unsigned int len) ++{ ++ return -1; ++} ++ ++/* Allocate the structure, read file header. If in_fd ==-1, inbuf must contain ++ a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are ++ ignored, and data is read from file handle into temporary buffer. */ ++static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len, ++ int (*fill)(void*, unsigned int)) ++{ ++ struct bunzip_data *bd; ++ unsigned int i, j, c; ++ const unsigned int BZh0 = ++ (((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16) ++ +(((unsigned int)'h') << 8)+(unsigned int)'0'; ++ ++ /* Figure out how much data to allocate */ ++ i = sizeof(struct bunzip_data); ++ ++ /* Allocate bunzip_data. Most fields initialize to zero. */ ++ bd = *bdp = malloc(i); ++ memset(bd, 0, sizeof(struct bunzip_data)); ++ /* Setup input buffer */ ++ bd->inbuf = inbuf; ++ bd->inbufCount = len; ++ if (fill != NULL) ++ bd->fill = fill; ++ else ++ bd->fill = nofill; ++ ++ /* Init the CRC32 table (big endian) */ ++ for (i = 0; i < 256; i++) { ++ c = i << 24; ++ for (j = 8; j; j--) ++ c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1); ++ bd->crc32Table[i] = c; ++ } ++ ++ /* Ensure that file starts with "BZh['1'-'9']." */ ++ i = get_bits(bd, 32); ++ if (((unsigned int)(i-BZh0-1)) >= 9) ++ return RETVAL_NOT_BZIP_DATA; ++ ++ /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of ++ uncompressed data. Allocate intermediate buffer for block. */ ++ bd->dbufSize = 100000*(i-BZh0); ++ ++ bd->dbuf = large_malloc(bd->dbufSize * sizeof(int)); ++ return RETVAL_OK; ++} ++ ++/* Example usage: decompress src_fd to dst_fd. (Stops at end of bzip2 data, ++ not end of file.) 
*/ ++STATIC int INIT bunzip2(unsigned char *buf, int len, ++ int(*fill)(void*, unsigned int), ++ int(*flush)(void*, unsigned int), ++ unsigned char *outbuf, ++ int *pos, ++ void(*error_fn)(char *x)) ++{ ++ struct bunzip_data *bd; ++ int i = -1; ++ unsigned char *inbuf; ++ ++ set_error_fn(error_fn); ++ if (flush) ++ outbuf = malloc(BZIP2_IOBUF_SIZE); ++ else ++ len -= 4; /* Uncompressed size hack active in pre-boot ++ environment */ ++ if (!outbuf) { ++ error("Could not allocate output bufer"); ++ return -1; ++ } ++ if (buf) ++ inbuf = buf; ++ else ++ inbuf = malloc(BZIP2_IOBUF_SIZE); ++ if (!inbuf) { ++ error("Could not allocate input bufer"); ++ goto exit_0; ++ } ++ i = start_bunzip(&bd, inbuf, len, fill); ++ if (!i) { ++ for (;;) { ++ i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE); ++ if (i <= 0) ++ break; ++ if (!flush) ++ outbuf += i; ++ else ++ if (i != flush(outbuf, i)) { ++ i = RETVAL_UNEXPECTED_OUTPUT_EOF; ++ break; ++ } ++ } ++ } ++ /* Check CRC and release memory */ ++ if (i == RETVAL_LAST_BLOCK) { ++ if (bd->headerCRC != bd->totalCRC) ++ error("Data integrity error when decompressing."); ++ else ++ i = RETVAL_OK; ++ } else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) { ++ error("Compressed file ends unexpectedly"); ++ } ++ if (bd->dbuf) ++ large_free(bd->dbuf); ++ if (pos) ++ *pos = bd->inbufPos; ++ free(bd); ++ if (!buf) ++ free(inbuf); ++exit_0: ++ if (flush) ++ free(outbuf); ++ return i; ++} ++ ++#define decompress bunzip2 +Index: linux-2.6-tip/lib/decompress_inflate.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/lib/decompress_inflate.c +@@ -0,0 +1,168 @@ ++#ifdef STATIC ++/* Pre-boot environment: included */ ++ ++/* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots ++ * errors about console_printk etc... on ARM */ ++#define _LINUX_KERNEL_H ++ ++#include "zlib_inflate/inftrees.c" ++#include "zlib_inflate/inffast.c" ++#include "zlib_inflate/inflate.c" ++ ++#else /* STATIC */ ++/* initramfs et al: linked */ ++ ++#include ++ ++#include "zlib_inflate/inftrees.h" ++#include "zlib_inflate/inffast.h" ++#include "zlib_inflate/inflate.h" ++ ++#include "zlib_inflate/infutil.h" ++ ++#endif /* STATIC */ ++ ++#include ++#include ++ ++#define INBUF_LEN (16*1024) ++ ++/* Included from initramfs et al code */ ++STATIC int INIT gunzip(unsigned char *buf, int len, ++ int(*fill)(void*, unsigned int), ++ int(*flush)(void*, unsigned int), ++ unsigned char *out_buf, ++ int *pos, ++ void(*error_fn)(char *x)) { ++ u8 *zbuf; ++ struct z_stream_s *strm; ++ int rc; ++ size_t out_len; ++ ++ set_error_fn(error_fn); ++ rc = -1; ++ if (flush) { ++ out_len = 0x8000; /* 32 K */ ++ out_buf = malloc(out_len); ++ } else { ++ out_len = 0x7fffffff; /* no limit */ ++ } ++ if (!out_buf) { ++ error("Out of memory while allocating output buffer"); ++ goto gunzip_nomem1; ++ } ++ ++ if (buf) ++ zbuf = buf; ++ else { ++ zbuf = malloc(INBUF_LEN); ++ len = 0; ++ } ++ if (!zbuf) { ++ error("Out of memory while allocating input buffer"); ++ goto gunzip_nomem2; ++ } ++ ++ strm = malloc(sizeof(*strm)); ++ if (strm == NULL) { ++ error("Out of memory while allocating z_stream"); ++ goto gunzip_nomem3; ++ } ++ ++ strm->workspace = malloc(flush ? 
zlib_inflate_workspacesize() : ++ sizeof(struct inflate_state)); ++ if (strm->workspace == NULL) { ++ error("Out of memory while allocating workspace"); ++ goto gunzip_nomem4; ++ } ++ ++ if (len == 0) ++ len = fill(zbuf, INBUF_LEN); ++ ++ /* verify the gzip header */ ++ if (len < 10 || ++ zbuf[0] != 0x1f || zbuf[1] != 0x8b || zbuf[2] != 0x08) { ++ if (pos) ++ *pos = 0; ++ error("Not a gzip file"); ++ goto gunzip_5; ++ } ++ ++ /* skip over gzip header (1f,8b,08... 10 bytes total + ++ * possible asciz filename) ++ */ ++ strm->next_in = zbuf + 10; ++ /* skip over asciz filename */ ++ if (zbuf[3] & 0x8) { ++ while (strm->next_in[0]) ++ strm->next_in++; ++ strm->next_in++; ++ } ++ strm->avail_in = len - (strm->next_in - zbuf); ++ ++ strm->next_out = out_buf; ++ strm->avail_out = out_len; ++ ++ rc = zlib_inflateInit2(strm, -MAX_WBITS); ++ ++ if (!flush) { ++ WS(strm)->inflate_state.wsize = 0; ++ WS(strm)->inflate_state.window = NULL; ++ } ++ ++ while (rc == Z_OK) { ++ if (strm->avail_in == 0) { ++ /* TODO: handle case where both pos and fill are set */ ++ len = fill(zbuf, INBUF_LEN); ++ if (len < 0) { ++ rc = -1; ++ error("read error"); ++ break; ++ } ++ strm->next_in = zbuf; ++ strm->avail_in = len; ++ } ++ rc = zlib_inflate(strm, 0); ++ ++ /* Write any data generated */ ++ if (flush && strm->next_out > out_buf) { ++ int l = strm->next_out - out_buf; ++ if (l != flush(out_buf, l)) { ++ rc = -1; ++ error("write error"); ++ break; ++ } ++ strm->next_out = out_buf; ++ strm->avail_out = out_len; ++ } ++ ++ /* after Z_FINISH, only Z_STREAM_END is "we unpacked it all" */ ++ if (rc == Z_STREAM_END) { ++ rc = 0; ++ break; ++ } else if (rc != Z_OK) { ++ error("uncompression error"); ++ rc = -1; ++ } ++ } ++ ++ zlib_inflateEnd(strm); ++ if (pos) ++ /* add + 8 to skip over trailer */ ++ *pos = strm->next_in - zbuf+8; ++ ++gunzip_5: ++ free(strm->workspace); ++gunzip_nomem4: ++ free(strm); ++gunzip_nomem3: ++ if (!buf) ++ free(zbuf); ++gunzip_nomem2: ++ if (flush) ++ free(out_buf); ++gunzip_nomem1: ++ return rc; /* returns Z_OK (0) if successful */ ++} ++ ++#define decompress gunzip +Index: linux-2.6-tip/lib/decompress_unlzma.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/lib/decompress_unlzma.c +@@ -0,0 +1,648 @@ ++/* Lzma decompressor for Linux kernel. Shamelessly snarfed ++ *from busybox 1.1.1 ++ * ++ *Linux kernel adaptation ++ *Copyright (C) 2006 Alain < alain@knaff.lu > ++ * ++ *Based on small lzma deflate implementation/Small range coder ++ *implementation for lzma. ++ *Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > ++ * ++ *Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) ++ *Copyright (C) 1999-2005 Igor Pavlov ++ * ++ *Copyrights of the parts, see headers below. ++ * ++ * ++ *This program is free software; you can redistribute it and/or ++ *modify it under the terms of the GNU Lesser General Public ++ *License as published by the Free Software Foundation; either ++ *version 2.1 of the License, or (at your option) any later version. ++ * ++ *This program is distributed in the hope that it will be useful, ++ *but WITHOUT ANY WARRANTY; without even the implied warranty of ++ *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ *Lesser General Public License for more details. 
++ * ++ *You should have received a copy of the GNU Lesser General Public ++ *License along with this library; if not, write to the Free Software ++ *Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef STATIC ++#include ++#endif /* STATIC */ ++ ++#include ++#include ++ ++#define MIN(a, b) (((a) < (b)) ? (a) : (b)) ++ ++static long long INIT read_int(unsigned char *ptr, int size) ++{ ++ int i; ++ long long ret = 0; ++ ++ for (i = 0; i < size; i++) ++ ret = (ret << 8) | ptr[size-i-1]; ++ return ret; ++} ++ ++#define ENDIAN_CONVERT(x) \ ++ x = (typeof(x))read_int((unsigned char *)&x, sizeof(x)) ++ ++ ++/* Small range coder implementation for lzma. ++ *Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > ++ * ++ *Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) ++ *Copyright (c) 1999-2005 Igor Pavlov ++ */ ++ ++#include ++ ++#define LZMA_IOBUF_SIZE 0x10000 ++ ++struct rc { ++ int (*fill)(void*, unsigned int); ++ uint8_t *ptr; ++ uint8_t *buffer; ++ uint8_t *buffer_end; ++ int buffer_size; ++ uint32_t code; ++ uint32_t range; ++ uint32_t bound; ++}; ++ ++ ++#define RC_TOP_BITS 24 ++#define RC_MOVE_BITS 5 ++#define RC_MODEL_TOTAL_BITS 11 ++ ++ ++/* Called twice: once at startup and once in rc_normalize() */ ++static void INIT rc_read(struct rc *rc) ++{ ++ rc->buffer_size = rc->fill((char *)rc->buffer, LZMA_IOBUF_SIZE); ++ if (rc->buffer_size <= 0) ++ error("unexpected EOF"); ++ rc->ptr = rc->buffer; ++ rc->buffer_end = rc->buffer + rc->buffer_size; ++} ++ ++/* Called once */ ++static inline void INIT rc_init(struct rc *rc, ++ int (*fill)(void*, unsigned int), ++ char *buffer, int buffer_size) ++{ ++ rc->fill = fill; ++ rc->buffer = (uint8_t *)buffer; ++ rc->buffer_size = buffer_size; ++ rc->buffer_end = rc->buffer + rc->buffer_size; ++ rc->ptr = rc->buffer; ++ ++ rc->code = 0; ++ rc->range = 0xFFFFFFFF; ++} ++ ++static inline void INIT rc_init_code(struct rc *rc) ++{ ++ int i; ++ ++ for (i = 0; i < 5; i++) { ++ if (rc->ptr >= rc->buffer_end) ++ rc_read(rc); ++ rc->code = (rc->code << 8) | *rc->ptr++; ++ } ++} ++ ++ ++/* Called once. TODO: bb_maybe_free() */ ++static inline void INIT rc_free(struct rc *rc) ++{ ++ free(rc->buffer); ++} ++ ++/* Called twice, but one callsite is in inline'd rc_is_bit_0_helper() */ ++static void INIT rc_do_normalize(struct rc *rc) ++{ ++ if (rc->ptr >= rc->buffer_end) ++ rc_read(rc); ++ rc->range <<= 8; ++ rc->code = (rc->code << 8) | *rc->ptr++; ++} ++static inline void INIT rc_normalize(struct rc *rc) ++{ ++ if (rc->range < (1 << RC_TOP_BITS)) ++ rc_do_normalize(rc); ++} ++ ++/* Called 9 times */ ++/* Why rc_is_bit_0_helper exists? 
++ *Because we want to always expose (rc->code < rc->bound) to optimizer ++ */ ++static inline uint32_t INIT rc_is_bit_0_helper(struct rc *rc, uint16_t *p) ++{ ++ rc_normalize(rc); ++ rc->bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS); ++ return rc->bound; ++} ++static inline int INIT rc_is_bit_0(struct rc *rc, uint16_t *p) ++{ ++ uint32_t t = rc_is_bit_0_helper(rc, p); ++ return rc->code < t; ++} ++ ++/* Called ~10 times, but very small, thus inlined */ ++static inline void INIT rc_update_bit_0(struct rc *rc, uint16_t *p) ++{ ++ rc->range = rc->bound; ++ *p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS; ++} ++static inline void rc_update_bit_1(struct rc *rc, uint16_t *p) ++{ ++ rc->range -= rc->bound; ++ rc->code -= rc->bound; ++ *p -= *p >> RC_MOVE_BITS; ++} ++ ++/* Called 4 times in unlzma loop */ ++static int INIT rc_get_bit(struct rc *rc, uint16_t *p, int *symbol) ++{ ++ if (rc_is_bit_0(rc, p)) { ++ rc_update_bit_0(rc, p); ++ *symbol *= 2; ++ return 0; ++ } else { ++ rc_update_bit_1(rc, p); ++ *symbol = *symbol * 2 + 1; ++ return 1; ++ } ++} ++ ++/* Called once */ ++static inline int INIT rc_direct_bit(struct rc *rc) ++{ ++ rc_normalize(rc); ++ rc->range >>= 1; ++ if (rc->code >= rc->range) { ++ rc->code -= rc->range; ++ return 1; ++ } ++ return 0; ++} ++ ++/* Called twice */ ++static inline void INIT ++rc_bit_tree_decode(struct rc *rc, uint16_t *p, int num_levels, int *symbol) ++{ ++ int i = num_levels; ++ ++ *symbol = 1; ++ while (i--) ++ rc_get_bit(rc, p + *symbol, symbol); ++ *symbol -= 1 << num_levels; ++} ++ ++ ++/* ++ * Small lzma deflate implementation. ++ * Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > ++ * ++ * Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) ++ * Copyright (C) 1999-2005 Igor Pavlov ++ */ ++ ++ ++struct lzma_header { ++ uint8_t pos; ++ uint32_t dict_size; ++ uint64_t dst_size; ++} __attribute__ ((packed)) ; ++ ++ ++#define LZMA_BASE_SIZE 1846 ++#define LZMA_LIT_SIZE 768 ++ ++#define LZMA_NUM_POS_BITS_MAX 4 ++ ++#define LZMA_LEN_NUM_LOW_BITS 3 ++#define LZMA_LEN_NUM_MID_BITS 3 ++#define LZMA_LEN_NUM_HIGH_BITS 8 ++ ++#define LZMA_LEN_CHOICE 0 ++#define LZMA_LEN_CHOICE_2 (LZMA_LEN_CHOICE + 1) ++#define LZMA_LEN_LOW (LZMA_LEN_CHOICE_2 + 1) ++#define LZMA_LEN_MID (LZMA_LEN_LOW \ ++ + (1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_LOW_BITS))) ++#define LZMA_LEN_HIGH (LZMA_LEN_MID \ ++ +(1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_MID_BITS))) ++#define LZMA_NUM_LEN_PROBS (LZMA_LEN_HIGH + (1 << LZMA_LEN_NUM_HIGH_BITS)) ++ ++#define LZMA_NUM_STATES 12 ++#define LZMA_NUM_LIT_STATES 7 ++ ++#define LZMA_START_POS_MODEL_INDEX 4 ++#define LZMA_END_POS_MODEL_INDEX 14 ++#define LZMA_NUM_FULL_DISTANCES (1 << (LZMA_END_POS_MODEL_INDEX >> 1)) ++ ++#define LZMA_NUM_POS_SLOT_BITS 6 ++#define LZMA_NUM_LEN_TO_POS_STATES 4 ++ ++#define LZMA_NUM_ALIGN_BITS 4 ++ ++#define LZMA_MATCH_MIN_LEN 2 ++ ++#define LZMA_IS_MATCH 0 ++#define LZMA_IS_REP (LZMA_IS_MATCH + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX)) ++#define LZMA_IS_REP_G0 (LZMA_IS_REP + LZMA_NUM_STATES) ++#define LZMA_IS_REP_G1 (LZMA_IS_REP_G0 + LZMA_NUM_STATES) ++#define LZMA_IS_REP_G2 (LZMA_IS_REP_G1 + LZMA_NUM_STATES) ++#define LZMA_IS_REP_0_LONG (LZMA_IS_REP_G2 + LZMA_NUM_STATES) ++#define LZMA_POS_SLOT (LZMA_IS_REP_0_LONG \ ++ + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX)) ++#define LZMA_SPEC_POS (LZMA_POS_SLOT \ ++ +(LZMA_NUM_LEN_TO_POS_STATES << LZMA_NUM_POS_SLOT_BITS)) ++#define LZMA_ALIGN (LZMA_SPEC_POS \ ++ + LZMA_NUM_FULL_DISTANCES - LZMA_END_POS_MODEL_INDEX) ++#define LZMA_LEN_CODER 
(LZMA_ALIGN + (1 << LZMA_NUM_ALIGN_BITS)) ++#define LZMA_REP_LEN_CODER (LZMA_LEN_CODER + LZMA_NUM_LEN_PROBS) ++#define LZMA_LITERAL (LZMA_REP_LEN_CODER + LZMA_NUM_LEN_PROBS) ++ ++ ++struct writer { ++ uint8_t *buffer; ++ uint8_t previous_byte; ++ size_t buffer_pos; ++ int bufsize; ++ size_t global_pos; ++ int(*flush)(void*, unsigned int); ++ struct lzma_header *header; ++}; ++ ++struct cstate { ++ int state; ++ uint32_t rep0, rep1, rep2, rep3; ++}; ++ ++static inline size_t INIT get_pos(struct writer *wr) ++{ ++ return ++ wr->global_pos + wr->buffer_pos; ++} ++ ++static inline uint8_t INIT peek_old_byte(struct writer *wr, ++ uint32_t offs) ++{ ++ if (!wr->flush) { ++ int32_t pos; ++ while (offs > wr->header->dict_size) ++ offs -= wr->header->dict_size; ++ pos = wr->buffer_pos - offs; ++ return wr->buffer[pos]; ++ } else { ++ uint32_t pos = wr->buffer_pos - offs; ++ while (pos >= wr->header->dict_size) ++ pos += wr->header->dict_size; ++ return wr->buffer[pos]; ++ } ++ ++} ++ ++static inline void INIT write_byte(struct writer *wr, uint8_t byte) ++{ ++ wr->buffer[wr->buffer_pos++] = wr->previous_byte = byte; ++ if (wr->flush && wr->buffer_pos == wr->header->dict_size) { ++ wr->buffer_pos = 0; ++ wr->global_pos += wr->header->dict_size; ++ wr->flush((char *)wr->buffer, wr->header->dict_size); ++ } ++} ++ ++ ++static inline void INIT copy_byte(struct writer *wr, uint32_t offs) ++{ ++ write_byte(wr, peek_old_byte(wr, offs)); ++} ++ ++static inline void INIT copy_bytes(struct writer *wr, ++ uint32_t rep0, int len) ++{ ++ do { ++ copy_byte(wr, rep0); ++ len--; ++ } while (len != 0 && wr->buffer_pos < wr->header->dst_size); ++} ++ ++static inline void INIT process_bit0(struct writer *wr, struct rc *rc, ++ struct cstate *cst, uint16_t *p, ++ int pos_state, uint16_t *prob, ++ int lc, uint32_t literal_pos_mask) { ++ int mi = 1; ++ rc_update_bit_0(rc, prob); ++ prob = (p + LZMA_LITERAL + ++ (LZMA_LIT_SIZE ++ * (((get_pos(wr) & literal_pos_mask) << lc) ++ + (wr->previous_byte >> (8 - lc)))) ++ ); ++ ++ if (cst->state >= LZMA_NUM_LIT_STATES) { ++ int match_byte = peek_old_byte(wr, cst->rep0); ++ do { ++ int bit; ++ uint16_t *prob_lit; ++ ++ match_byte <<= 1; ++ bit = match_byte & 0x100; ++ prob_lit = prob + 0x100 + bit + mi; ++ if (rc_get_bit(rc, prob_lit, &mi)) { ++ if (!bit) ++ break; ++ } else { ++ if (bit) ++ break; ++ } ++ } while (mi < 0x100); ++ } ++ while (mi < 0x100) { ++ uint16_t *prob_lit = prob + mi; ++ rc_get_bit(rc, prob_lit, &mi); ++ } ++ write_byte(wr, mi); ++ if (cst->state < 4) ++ cst->state = 0; ++ else if (cst->state < 10) ++ cst->state -= 3; ++ else ++ cst->state -= 6; ++} ++ ++static inline void INIT process_bit1(struct writer *wr, struct rc *rc, ++ struct cstate *cst, uint16_t *p, ++ int pos_state, uint16_t *prob) { ++ int offset; ++ uint16_t *prob_len; ++ int num_bits; ++ int len; ++ ++ rc_update_bit_1(rc, prob); ++ prob = p + LZMA_IS_REP + cst->state; ++ if (rc_is_bit_0(rc, prob)) { ++ rc_update_bit_0(rc, prob); ++ cst->rep3 = cst->rep2; ++ cst->rep2 = cst->rep1; ++ cst->rep1 = cst->rep0; ++ cst->state = cst->state < LZMA_NUM_LIT_STATES ? 0 : 3; ++ prob = p + LZMA_LEN_CODER; ++ } else { ++ rc_update_bit_1(rc, prob); ++ prob = p + LZMA_IS_REP_G0 + cst->state; ++ if (rc_is_bit_0(rc, prob)) { ++ rc_update_bit_0(rc, prob); ++ prob = (p + LZMA_IS_REP_0_LONG ++ + (cst->state << ++ LZMA_NUM_POS_BITS_MAX) + ++ pos_state); ++ if (rc_is_bit_0(rc, prob)) { ++ rc_update_bit_0(rc, prob); ++ ++ cst->state = cst->state < LZMA_NUM_LIT_STATES ? 
++ 9 : 11; ++ copy_byte(wr, cst->rep0); ++ return; ++ } else { ++ rc_update_bit_1(rc, prob); ++ } ++ } else { ++ uint32_t distance; ++ ++ rc_update_bit_1(rc, prob); ++ prob = p + LZMA_IS_REP_G1 + cst->state; ++ if (rc_is_bit_0(rc, prob)) { ++ rc_update_bit_0(rc, prob); ++ distance = cst->rep1; ++ } else { ++ rc_update_bit_1(rc, prob); ++ prob = p + LZMA_IS_REP_G2 + cst->state; ++ if (rc_is_bit_0(rc, prob)) { ++ rc_update_bit_0(rc, prob); ++ distance = cst->rep2; ++ } else { ++ rc_update_bit_1(rc, prob); ++ distance = cst->rep3; ++ cst->rep3 = cst->rep2; ++ } ++ cst->rep2 = cst->rep1; ++ } ++ cst->rep1 = cst->rep0; ++ cst->rep0 = distance; ++ } ++ cst->state = cst->state < LZMA_NUM_LIT_STATES ? 8 : 11; ++ prob = p + LZMA_REP_LEN_CODER; ++ } ++ ++ prob_len = prob + LZMA_LEN_CHOICE; ++ if (rc_is_bit_0(rc, prob_len)) { ++ rc_update_bit_0(rc, prob_len); ++ prob_len = (prob + LZMA_LEN_LOW ++ + (pos_state << ++ LZMA_LEN_NUM_LOW_BITS)); ++ offset = 0; ++ num_bits = LZMA_LEN_NUM_LOW_BITS; ++ } else { ++ rc_update_bit_1(rc, prob_len); ++ prob_len = prob + LZMA_LEN_CHOICE_2; ++ if (rc_is_bit_0(rc, prob_len)) { ++ rc_update_bit_0(rc, prob_len); ++ prob_len = (prob + LZMA_LEN_MID ++ + (pos_state << ++ LZMA_LEN_NUM_MID_BITS)); ++ offset = 1 << LZMA_LEN_NUM_LOW_BITS; ++ num_bits = LZMA_LEN_NUM_MID_BITS; ++ } else { ++ rc_update_bit_1(rc, prob_len); ++ prob_len = prob + LZMA_LEN_HIGH; ++ offset = ((1 << LZMA_LEN_NUM_LOW_BITS) ++ + (1 << LZMA_LEN_NUM_MID_BITS)); ++ num_bits = LZMA_LEN_NUM_HIGH_BITS; ++ } ++ } ++ ++ rc_bit_tree_decode(rc, prob_len, num_bits, &len); ++ len += offset; ++ ++ if (cst->state < 4) { ++ int pos_slot; ++ ++ cst->state += LZMA_NUM_LIT_STATES; ++ prob = ++ p + LZMA_POS_SLOT + ++ ((len < ++ LZMA_NUM_LEN_TO_POS_STATES ? len : ++ LZMA_NUM_LEN_TO_POS_STATES - 1) ++ << LZMA_NUM_POS_SLOT_BITS); ++ rc_bit_tree_decode(rc, prob, ++ LZMA_NUM_POS_SLOT_BITS, ++ &pos_slot); ++ if (pos_slot >= LZMA_START_POS_MODEL_INDEX) { ++ int i, mi; ++ num_bits = (pos_slot >> 1) - 1; ++ cst->rep0 = 2 | (pos_slot & 1); ++ if (pos_slot < LZMA_END_POS_MODEL_INDEX) { ++ cst->rep0 <<= num_bits; ++ prob = p + LZMA_SPEC_POS + ++ cst->rep0 - pos_slot - 1; ++ } else { ++ num_bits -= LZMA_NUM_ALIGN_BITS; ++ while (num_bits--) ++ cst->rep0 = (cst->rep0 << 1) | ++ rc_direct_bit(rc); ++ prob = p + LZMA_ALIGN; ++ cst->rep0 <<= LZMA_NUM_ALIGN_BITS; ++ num_bits = LZMA_NUM_ALIGN_BITS; ++ } ++ i = 1; ++ mi = 1; ++ while (num_bits--) { ++ if (rc_get_bit(rc, prob + mi, &mi)) ++ cst->rep0 |= i; ++ i <<= 1; ++ } ++ } else ++ cst->rep0 = pos_slot; ++ if (++(cst->rep0) == 0) ++ return; ++ } ++ ++ len += LZMA_MATCH_MIN_LEN; ++ ++ copy_bytes(wr, cst->rep0, len); ++} ++ ++ ++ ++STATIC inline int INIT unlzma(unsigned char *buf, int in_len, ++ int(*fill)(void*, unsigned int), ++ int(*flush)(void*, unsigned int), ++ unsigned char *output, ++ int *posp, ++ void(*error_fn)(char *x) ++ ) ++{ ++ struct lzma_header header; ++ int lc, pb, lp; ++ uint32_t pos_state_mask; ++ uint32_t literal_pos_mask; ++ uint16_t *p; ++ int num_probs; ++ struct rc rc; ++ int i, mi; ++ struct writer wr; ++ struct cstate cst; ++ unsigned char *inbuf; ++ int ret = -1; ++ ++ set_error_fn(error_fn); ++ if (!flush) ++ in_len -= 4; /* Uncompressed size hack active in pre-boot ++ environment */ ++ if (buf) ++ inbuf = buf; ++ else ++ inbuf = malloc(LZMA_IOBUF_SIZE); ++ if (!inbuf) { ++ error("Could not allocate input bufer"); ++ goto exit_0; ++ } ++ ++ cst.state = 0; ++ cst.rep0 = cst.rep1 = cst.rep2 = cst.rep3 = 1; ++ ++ wr.header = &header; ++ wr.flush = flush; ++ 
wr.global_pos = 0; ++ wr.previous_byte = 0; ++ wr.buffer_pos = 0; ++ ++ rc_init(&rc, fill, inbuf, in_len); ++ ++ for (i = 0; i < sizeof(header); i++) { ++ if (rc.ptr >= rc.buffer_end) ++ rc_read(&rc); ++ ((unsigned char *)&header)[i] = *rc.ptr++; ++ } ++ ++ if (header.pos >= (9 * 5 * 5)) ++ error("bad header"); ++ ++ mi = 0; ++ lc = header.pos; ++ while (lc >= 9) { ++ mi++; ++ lc -= 9; ++ } ++ pb = 0; ++ lp = mi; ++ while (lp >= 5) { ++ pb++; ++ lp -= 5; ++ } ++ pos_state_mask = (1 << pb) - 1; ++ literal_pos_mask = (1 << lp) - 1; ++ ++ ENDIAN_CONVERT(header.dict_size); ++ ENDIAN_CONVERT(header.dst_size); ++ ++ if (header.dict_size == 0) ++ header.dict_size = 1; ++ ++ if (output) ++ wr.buffer = output; ++ else { ++ wr.bufsize = MIN(header.dst_size, header.dict_size); ++ wr.buffer = large_malloc(wr.bufsize); ++ } ++ if (wr.buffer == NULL) ++ goto exit_1; ++ ++ num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp)); ++ p = (uint16_t *) large_malloc(num_probs * sizeof(*p)); ++ if (p == 0) ++ goto exit_2; ++ num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp)); ++ for (i = 0; i < num_probs; i++) ++ p[i] = (1 << RC_MODEL_TOTAL_BITS) >> 1; ++ ++ rc_init_code(&rc); ++ ++ while (get_pos(&wr) < header.dst_size) { ++ int pos_state = get_pos(&wr) & pos_state_mask; ++ uint16_t *prob = p + LZMA_IS_MATCH + ++ (cst.state << LZMA_NUM_POS_BITS_MAX) + pos_state; ++ if (rc_is_bit_0(&rc, prob)) ++ process_bit0(&wr, &rc, &cst, p, pos_state, prob, ++ lc, literal_pos_mask); ++ else { ++ process_bit1(&wr, &rc, &cst, p, pos_state, prob); ++ if (cst.rep0 == 0) ++ break; ++ } ++ } ++ ++ if (posp) ++ *posp = rc.ptr-rc.buffer; ++ if (wr.flush) ++ wr.flush(wr.buffer, wr.buffer_pos); ++ ret = 0; ++ large_free(p); ++exit_2: ++ if (!output) ++ large_free(wr.buffer); ++exit_1: ++ if (!buf) ++ free(inbuf); ++exit_0: ++ return ret; ++} ++ ++#define decompress unlzma +Index: linux-2.6-tip/lib/dma-debug.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/lib/dma-debug.c +@@ -0,0 +1,955 @@ ++/* ++ * Copyright (C) 2008 Advanced Micro Devices, Inc. ++ * ++ * Author: Joerg Roedel ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published ++ * by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
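/*
 * Illustrative sketch, not part of the patch: the loop above unpacks the
 * LZMA properties byte, which encodes pos = (pb * 5 + lp) * 9 + lc (hence
 * the "bad header" limit of 9 * 5 * 5). The common default byte 0x5d gives
 * lc=3, lp=0, pb=2; the divide/modulo form below is equivalent:
 */
#include <stdio.h>

int main(void)
{
        int pos = 0x5d;                 /* default LZMA properties byte */
        int lc = pos % 9;               /* literal context bits */
        int lp = (pos / 9) % 5;         /* literal position bits */
        int pb = (pos / 9) / 5;         /* position bits */

        printf("lc=%d lp=%d pb=%d\n", lc, lp, pb);      /* lc=3 lp=0 pb=2 */
        return 0;
}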
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define HASH_SIZE 1024ULL ++#define HASH_FN_SHIFT 13 ++#define HASH_FN_MASK (HASH_SIZE - 1) ++ ++enum { ++ dma_debug_single, ++ dma_debug_page, ++ dma_debug_sg, ++ dma_debug_coherent, ++}; ++ ++#define DMA_DEBUG_STACKTRACE_ENTRIES 5 ++ ++struct dma_debug_entry { ++ struct list_head list; ++ struct device *dev; ++ int type; ++ phys_addr_t paddr; ++ u64 dev_addr; ++ u64 size; ++ int direction; ++ int sg_call_ents; ++ int sg_mapped_ents; ++#ifdef CONFIG_STACKTRACE ++ struct stack_trace stacktrace; ++ unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; ++#endif ++}; ++ ++struct hash_bucket { ++ struct list_head list; ++ spinlock_t lock; ++} ____cacheline_aligned_in_smp; ++ ++/* Hash list to save the allocated dma addresses */ ++static struct hash_bucket dma_entry_hash[HASH_SIZE]; ++/* List of pre-allocated dma_debug_entry's */ ++static LIST_HEAD(free_entries); ++/* Lock for the list above */ ++static DEFINE_SPINLOCK(free_entries_lock); ++ ++/* Global disable flag - will be set in case of an error */ ++static bool global_disable __read_mostly; ++ ++/* Global error count */ ++static u32 error_count; ++ ++/* Global error show enable*/ ++static u32 show_all_errors __read_mostly; ++/* Number of errors to show */ ++static u32 show_num_errors = 1; ++ ++static u32 num_free_entries; ++static u32 min_free_entries; ++ ++/* number of preallocated entries requested by kernel cmdline */ ++static u32 req_entries; ++ ++/* debugfs dentry's for the stuff above */ ++static struct dentry *dma_debug_dent __read_mostly; ++static struct dentry *global_disable_dent __read_mostly; ++static struct dentry *error_count_dent __read_mostly; ++static struct dentry *show_all_errors_dent __read_mostly; ++static struct dentry *show_num_errors_dent __read_mostly; ++static struct dentry *num_free_entries_dent __read_mostly; ++static struct dentry *min_free_entries_dent __read_mostly; ++ ++static const char *type2name[4] = { "single", "page", ++ "scather-gather", "coherent" }; ++ ++static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", ++ "DMA_FROM_DEVICE", "DMA_NONE" }; ++ ++/* ++ * The access to some variables in this macro is racy. We can't use atomic_t ++ * here because all these variables are exported to debugfs. Some of them even ++ * writeable. This is also the reason why a lock won't help much. But anyway, ++ * the races are no big deal. Here is why: ++ * ++ * error_count: the addition is racy, but the worst thing that can happen is ++ * that we don't count some errors ++ * show_num_errors: the subtraction is racy. Also no big deal because in ++ * worst case this will result in one warning more in the ++ * system log than the user configured. This variable is ++ * writeable via debugfs. ++ */ ++static inline void dump_entry_trace(struct dma_debug_entry *entry) ++{ ++#ifdef CONFIG_STACKTRACE ++ if (entry) { ++ printk(KERN_WARNING "Mapped at:\n"); ++ print_stack_trace(&entry->stacktrace, 0); ++ } ++#endif ++} ++ ++#define err_printk(dev, entry, format, arg...) 
do { \ ++ error_count += 1; \ ++ if (show_all_errors || show_num_errors > 0) { \ ++ WARN(1, "%s %s: " format, \ ++ dev_driver_string(dev), \ ++ dev_name(dev) , ## arg); \ ++ dump_entry_trace(entry); \ ++ } \ ++ if (!show_all_errors && show_num_errors > 0) \ ++ show_num_errors -= 1; \ ++ } while (0); ++ ++/* ++ * Hash related functions ++ * ++ * Every DMA-API request is saved into a struct dma_debug_entry. To ++ * have quick access to these structs they are stored into a hash. ++ */ ++static int hash_fn(struct dma_debug_entry *entry) ++{ ++ /* ++ * Hash function is based on the dma address. ++ * We use bits 20-27 here as the index into the hash ++ */ ++ return (entry->dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK; ++} ++ ++/* ++ * Request exclusive access to a hash bucket for a given dma_debug_entry. ++ */ ++static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, ++ unsigned long *flags) ++{ ++ int idx = hash_fn(entry); ++ unsigned long __flags; ++ ++ spin_lock_irqsave(&dma_entry_hash[idx].lock, __flags); ++ *flags = __flags; ++ return &dma_entry_hash[idx]; ++} ++ ++/* ++ * Give up exclusive access to the hash bucket ++ */ ++static void put_hash_bucket(struct hash_bucket *bucket, ++ unsigned long *flags) ++{ ++ unsigned long __flags = *flags; ++ ++ spin_unlock_irqrestore(&bucket->lock, __flags); ++} ++ ++/* ++ * Search a given entry in the hash bucket list ++ */ ++static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket, ++ struct dma_debug_entry *ref) ++{ ++ struct dma_debug_entry *entry; ++ ++ list_for_each_entry(entry, &bucket->list, list) { ++ if ((entry->dev_addr == ref->dev_addr) && ++ (entry->dev == ref->dev)) ++ return entry; ++ } ++ ++ return NULL; ++} ++ ++/* ++ * Add an entry to a hash bucket ++ */ ++static void hash_bucket_add(struct hash_bucket *bucket, ++ struct dma_debug_entry *entry) ++{ ++ list_add_tail(&entry->list, &bucket->list); ++} ++ ++/* ++ * Remove entry from a hash bucket list ++ */ ++static void hash_bucket_del(struct dma_debug_entry *entry) ++{ ++ list_del(&entry->list); ++} ++ ++/* ++ * Dump mapping entries for debugging purposes ++ */ ++void debug_dma_dump_mappings(struct device *dev) ++{ ++ int idx; ++ ++ for (idx = 0; idx < HASH_SIZE; idx++) { ++ struct hash_bucket *bucket = &dma_entry_hash[idx]; ++ struct dma_debug_entry *entry; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&bucket->lock, flags); ++ ++ list_for_each_entry(entry, &bucket->list, list) { ++ if (!dev || dev == entry->dev) { ++ dev_info(entry->dev, ++ "%s idx %d P=%Lx D=%Lx L=%Lx %s\n", ++ type2name[entry->type], idx, ++ (unsigned long long)entry->paddr, ++ entry->dev_addr, entry->size, ++ dir2name[entry->direction]); ++ } ++ } ++ ++ spin_unlock_irqrestore(&bucket->lock, flags); ++ } ++} ++EXPORT_SYMBOL(debug_dma_dump_mappings); ++ ++/* ++ * Wrapper function for adding an entry to the hash. ++ * This function takes care of locking itself. ++ */ ++static void add_dma_entry(struct dma_debug_entry *entry) ++{ ++ struct hash_bucket *bucket; ++ unsigned long flags; ++ ++ bucket = get_hash_bucket(entry, &flags); ++ hash_bucket_add(bucket, entry); ++ put_hash_bucket(bucket, &flags); ++} ++ ++/* struct dma_entry allocator ++ * ++ * The next two functions implement the allocator for ++ * struct dma_debug_entries. 
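/*
 * Illustrative sketch, not part of the patch: with HASH_SIZE == 1024 and
 * HASH_FN_SHIFT == 13, hash_fn() above picks the bucket from bits 13-22 of
 * the DMA address (the "bits 20-27" note in its comment does not match the
 * current constants), so addresses within the same 8 KiB window land in the
 * same bucket:
 */
#include <stdint.h>
#include <stdio.h>

#define HASH_SIZE     1024ULL
#define HASH_FN_SHIFT 13
#define HASH_FN_MASK  (HASH_SIZE - 1)

static int bucket_index(uint64_t dev_addr)
{
        return (dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK;
}

int main(void)
{
        /* both addresses lie in the same 8 KiB window -> same bucket */
        printf("%d %d\n", bucket_index(0x12340000ULL),
               bucket_index(0x12341fffULL));
        return 0;
}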
++ */ ++static struct dma_debug_entry *dma_entry_alloc(void) ++{ ++ struct dma_debug_entry *entry = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&free_entries_lock, flags); ++ ++ if (list_empty(&free_entries)) { ++ printk(KERN_ERR "DMA-API: debugging out of memory " ++ "- disabling\n"); ++ global_disable = true; ++ goto out; ++ } ++ ++ entry = list_entry(free_entries.next, struct dma_debug_entry, list); ++ list_del(&entry->list); ++ memset(entry, 0, sizeof(*entry)); ++ ++#ifdef CONFIG_STACKTRACE ++ entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; ++ entry->stacktrace.entries = entry->st_entries; ++ entry->stacktrace.skip = 2; ++ save_stack_trace(&entry->stacktrace); ++#endif ++ num_free_entries -= 1; ++ if (num_free_entries < min_free_entries) ++ min_free_entries = num_free_entries; ++ ++out: ++ spin_unlock_irqrestore(&free_entries_lock, flags); ++ ++ return entry; ++} ++ ++static void dma_entry_free(struct dma_debug_entry *entry) ++{ ++ unsigned long flags; ++ ++ /* ++ * add to beginning of the list - this way the entries are ++ * more likely cache hot when they are reallocated. ++ */ ++ spin_lock_irqsave(&free_entries_lock, flags); ++ list_add(&entry->list, &free_entries); ++ num_free_entries += 1; ++ spin_unlock_irqrestore(&free_entries_lock, flags); ++} ++ ++/* ++ * DMA-API debugging init code ++ * ++ * The init code does two things: ++ * 1. Initialize core data structures ++ * 2. Preallocate a given number of dma_debug_entry structs ++ */ ++ ++static int prealloc_memory(u32 num_entries) ++{ ++ struct dma_debug_entry *entry, *next_entry; ++ int i; ++ ++ for (i = 0; i < num_entries; ++i) { ++ entry = kzalloc(sizeof(*entry), GFP_KERNEL); ++ if (!entry) ++ goto out_err; ++ ++ list_add_tail(&entry->list, &free_entries); ++ } ++ ++ num_free_entries = num_entries; ++ min_free_entries = num_entries; ++ ++ printk(KERN_INFO "DMA-API: preallocated %d debug entries\n", ++ num_entries); ++ ++ return 0; ++ ++out_err: ++ ++ list_for_each_entry_safe(entry, next_entry, &free_entries, list) { ++ list_del(&entry->list); ++ kfree(entry); ++ } ++ ++ return -ENOMEM; ++} ++ ++static int dma_debug_fs_init(void) ++{ ++ dma_debug_dent = debugfs_create_dir("dma-api", NULL); ++ if (!dma_debug_dent) { ++ printk(KERN_ERR "DMA-API: can not create debugfs directory\n"); ++ return -ENOMEM; ++ } ++ ++ global_disable_dent = debugfs_create_bool("disabled", 0444, ++ dma_debug_dent, ++ (u32 *)&global_disable); ++ if (!global_disable_dent) ++ goto out_err; ++ ++ error_count_dent = debugfs_create_u32("error_count", 0444, ++ dma_debug_dent, &error_count); ++ if (!error_count_dent) ++ goto out_err; ++ ++ show_all_errors_dent = debugfs_create_u32("all_errors", 0644, ++ dma_debug_dent, ++ &show_all_errors); ++ if (!show_all_errors_dent) ++ goto out_err; ++ ++ show_num_errors_dent = debugfs_create_u32("num_errors", 0644, ++ dma_debug_dent, ++ &show_num_errors); ++ if (!show_num_errors_dent) ++ goto out_err; ++ ++ num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, ++ dma_debug_dent, ++ &num_free_entries); ++ if (!num_free_entries_dent) ++ goto out_err; ++ ++ min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, ++ dma_debug_dent, ++ &min_free_entries); ++ if (!min_free_entries_dent) ++ goto out_err; ++ ++ return 0; ++ ++out_err: ++ debugfs_remove_recursive(dma_debug_dent); ++ ++ return -ENOMEM; ++} ++ ++static int device_dma_allocations(struct device *dev) ++{ ++ struct dma_debug_entry *entry; ++ unsigned long flags; ++ int count = 0, i; ++ ++ for (i = 0; i < HASH_SIZE; ++i) 
{ ++ spin_lock_irqsave(&dma_entry_hash[i].lock, flags); ++ list_for_each_entry(entry, &dma_entry_hash[i].list, list) { ++ if (entry->dev == dev) ++ count += 1; ++ } ++ spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); ++ } ++ ++ return count; ++} ++ ++static int dma_debug_device_change(struct notifier_block *nb, ++ unsigned long action, void *data) ++{ ++ struct device *dev = data; ++ int count; ++ ++ ++ switch (action) { ++ case BUS_NOTIFY_UNBIND_DRIVER: ++ count = device_dma_allocations(dev); ++ if (count == 0) ++ break; ++ err_printk(dev, NULL, "DMA-API: device driver has pending " ++ "DMA allocations while released from device " ++ "[count=%d]\n", count); ++ break; ++ default: ++ break; ++ } ++ ++ return 0; ++} ++ ++void dma_debug_add_bus(struct bus_type *bus) ++{ ++ struct notifier_block *nb; ++ ++ nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); ++ if (nb == NULL) { ++ printk(KERN_ERR "dma_debug_add_bus: out of memory\n"); ++ return; ++ } ++ ++ nb->notifier_call = dma_debug_device_change; ++ ++ bus_register_notifier(bus, nb); ++} ++ ++/* ++ * Let the architectures decide how many entries should be preallocated. ++ */ ++void dma_debug_init(u32 num_entries) ++{ ++ int i; ++ ++ if (global_disable) ++ return; ++ ++ for (i = 0; i < HASH_SIZE; ++i) { ++ INIT_LIST_HEAD(&dma_entry_hash[i].list); ++ dma_entry_hash[i].lock = SPIN_LOCK_UNLOCKED; ++ } ++ ++ if (dma_debug_fs_init() != 0) { ++ printk(KERN_ERR "DMA-API: error creating debugfs entries " ++ "- disabling\n"); ++ global_disable = true; ++ ++ return; ++ } ++ ++ if (req_entries) ++ num_entries = req_entries; ++ ++ if (prealloc_memory(num_entries) != 0) { ++ printk(KERN_ERR "DMA-API: debugging out of memory error " ++ "- disabled\n"); ++ global_disable = true; ++ ++ return; ++ } ++ ++ printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n"); ++} ++ ++static __init int dma_debug_cmdline(char *str) ++{ ++ if (!str) ++ return -EINVAL; ++ ++ if (strncmp(str, "off", 3) == 0) { ++ printk(KERN_INFO "DMA-API: debugging disabled on kernel " ++ "command line\n"); ++ global_disable = true; ++ } ++ ++ return 0; ++} ++ ++static __init int dma_debug_entries_cmdline(char *str) ++{ ++ int res; ++ ++ if (!str) ++ return -EINVAL; ++ ++ res = get_option(&str, &req_entries); ++ ++ if (!res) ++ req_entries = 0; ++ ++ return 0; ++} ++ ++__setup("dma_debug=", dma_debug_cmdline); ++__setup("dma_debug_entries=", dma_debug_entries_cmdline); ++ ++static void check_unmap(struct dma_debug_entry *ref) ++{ ++ struct dma_debug_entry *entry; ++ struct hash_bucket *bucket; ++ unsigned long flags; ++ ++ if (dma_mapping_error(ref->dev, ref->dev_addr)) { ++ err_printk(ref->dev, NULL, "DMA-API: device driver tries " ++ "to free an invalid DMA memory address\n"); ++ return; ++ } ++ ++ bucket = get_hash_bucket(ref, &flags); ++ entry = hash_bucket_find(bucket, ref); ++ ++ if (!entry) { ++ err_printk(ref->dev, NULL, "DMA-API: device driver tries " ++ "to free DMA memory it has not allocated " ++ "[device address=0x%016llx] [size=%llu bytes]\n", ++ ref->dev_addr, ref->size); ++ goto out; ++ } ++ ++ if (ref->size != entry->size) { ++ err_printk(ref->dev, entry, "DMA-API: device driver frees " ++ "DMA memory with different size " ++ "[device address=0x%016llx] [map size=%llu bytes] " ++ "[unmap size=%llu bytes]\n", ++ ref->dev_addr, entry->size, ref->size); ++ } ++ ++ if (ref->type != entry->type) { ++ err_printk(ref->dev, entry, "DMA-API: device driver frees " ++ "DMA memory with wrong function " ++ "[device address=0x%016llx] [size=%llu bytes] " ++ "[mapped 
as %s] [unmapped as %s]\n", ++ ref->dev_addr, ref->size, ++ type2name[entry->type], type2name[ref->type]); ++ } else if ((entry->type == dma_debug_coherent) && ++ (ref->paddr != entry->paddr)) { ++ err_printk(ref->dev, entry, "DMA-API: device driver frees " ++ "DMA memory with different CPU address " ++ "[device address=0x%016llx] [size=%llu bytes] " ++ "[cpu alloc address=%p] [cpu free address=%p]", ++ ref->dev_addr, ref->size, ++ (void *)entry->paddr, (void *)ref->paddr); ++ } ++ ++ if (ref->sg_call_ents && ref->type == dma_debug_sg && ++ ref->sg_call_ents != entry->sg_call_ents) { ++ err_printk(ref->dev, entry, "DMA-API: device driver frees " ++ "DMA sg list with different entry count " ++ "[map count=%d] [unmap count=%d]\n", ++ entry->sg_call_ents, ref->sg_call_ents); ++ } ++ ++ /* ++ * This may be no bug in reality - but most implementations of the ++ * DMA API don't handle this properly, so check for it here ++ */ ++ if (ref->direction != entry->direction) { ++ err_printk(ref->dev, entry, "DMA-API: device driver frees " ++ "DMA memory with different direction " ++ "[device address=0x%016llx] [size=%llu bytes] " ++ "[mapped with %s] [unmapped with %s]\n", ++ ref->dev_addr, ref->size, ++ dir2name[entry->direction], ++ dir2name[ref->direction]); ++ } ++ ++ hash_bucket_del(entry); ++ dma_entry_free(entry); ++ ++out: ++ put_hash_bucket(bucket, &flags); ++} ++ ++static void check_for_stack(struct device *dev, void *addr) ++{ ++ if (object_is_on_stack(addr)) ++ err_printk(dev, NULL, "DMA-API: device driver maps memory from" ++ "stack [addr=%p]\n", addr); ++} ++ ++static inline bool overlap(void *addr, u64 size, void *start, void *end) ++{ ++ void *addr2 = (char *)addr + size; ++ ++ return ((addr >= start && addr < end) || ++ (addr2 >= start && addr2 < end) || ++ ((addr < start) && (addr2 >= end))); ++} ++ ++static void check_for_illegal_area(struct device *dev, void *addr, u64 size) ++{ ++ if (overlap(addr, size, _text, _etext) || ++ overlap(addr, size, __start_rodata, __end_rodata)) ++ err_printk(dev, NULL, "DMA-API: device driver maps " ++ "memory from kernel text or rodata " ++ "[addr=%p] [size=%llu]\n", addr, size); ++} ++ ++static void check_sync(struct device *dev, dma_addr_t addr, ++ u64 size, u64 offset, int direction, bool to_cpu) ++{ ++ struct dma_debug_entry ref = { ++ .dev = dev, ++ .dev_addr = addr, ++ .size = size, ++ .direction = direction, ++ }; ++ struct dma_debug_entry *entry; ++ struct hash_bucket *bucket; ++ unsigned long flags; ++ ++ bucket = get_hash_bucket(&ref, &flags); ++ ++ entry = hash_bucket_find(bucket, &ref); ++ ++ if (!entry) { ++ err_printk(dev, NULL, "DMA-API: device driver tries " ++ "to sync DMA memory it has not allocated " ++ "[device address=0x%016llx] [size=%llu bytes]\n", ++ addr, size); ++ goto out; ++ } ++ ++ if ((offset + size) > entry->size) { ++ err_printk(dev, entry, "DMA-API: device driver syncs" ++ " DMA memory outside allocated range " ++ "[device address=0x%016llx] " ++ "[allocation size=%llu bytes] [sync offset=%llu] " ++ "[sync size=%llu]\n", entry->dev_addr, entry->size, ++ offset, size); ++ } ++ ++ if (direction != entry->direction) { ++ err_printk(dev, entry, "DMA-API: device driver syncs " ++ "DMA memory with different direction " ++ "[device address=0x%016llx] [size=%llu bytes] " ++ "[mapped with %s] [synced with %s]\n", ++ addr, entry->size, ++ dir2name[entry->direction], ++ dir2name[direction]); ++ } ++ ++ if (entry->direction == DMA_BIDIRECTIONAL) ++ goto out; ++ ++ if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) && ++ 
!(direction == DMA_TO_DEVICE)) ++ err_printk(dev, entry, "DMA-API: device driver syncs " ++ "device read-only DMA memory for cpu " ++ "[device address=0x%016llx] [size=%llu bytes] " ++ "[mapped with %s] [synced with %s]\n", ++ addr, entry->size, ++ dir2name[entry->direction], ++ dir2name[direction]); ++ ++ if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) && ++ !(direction == DMA_FROM_DEVICE)) ++ err_printk(dev, entry, "DMA-API: device driver syncs " ++ "device write-only DMA memory to device " ++ "[device address=0x%016llx] [size=%llu bytes] " ++ "[mapped with %s] [synced with %s]\n", ++ addr, entry->size, ++ dir2name[entry->direction], ++ dir2name[direction]); ++ ++out: ++ put_hash_bucket(bucket, &flags); ++ ++} ++ ++void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, ++ size_t size, int direction, dma_addr_t dma_addr, ++ bool map_single) ++{ ++ struct dma_debug_entry *entry; ++ ++ if (unlikely(global_disable)) ++ return; ++ ++ if (unlikely(dma_mapping_error(dev, dma_addr))) ++ return; ++ ++ entry = dma_entry_alloc(); ++ if (!entry) ++ return; ++ ++ entry->dev = dev; ++ entry->type = dma_debug_page; ++ entry->paddr = page_to_phys(page) + offset; ++ entry->dev_addr = dma_addr; ++ entry->size = size; ++ entry->direction = direction; ++ ++ if (map_single) ++ entry->type = dma_debug_single; ++ ++ if (!PageHighMem(page)) { ++ void *addr = ((char *)page_address(page)) + offset; ++ check_for_stack(dev, addr); ++ check_for_illegal_area(dev, addr, size); ++ } ++ ++ add_dma_entry(entry); ++} ++EXPORT_SYMBOL(debug_dma_map_page); ++ ++void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, ++ size_t size, int direction, bool map_single) ++{ ++ struct dma_debug_entry ref = { ++ .type = dma_debug_page, ++ .dev = dev, ++ .dev_addr = addr, ++ .size = size, ++ .direction = direction, ++ }; ++ ++ if (unlikely(global_disable)) ++ return; ++ ++ if (map_single) ++ ref.type = dma_debug_single; ++ ++ check_unmap(&ref); ++} ++EXPORT_SYMBOL(debug_dma_unmap_page); ++ ++void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, ++ int nents, int mapped_ents, int direction) ++{ ++ struct dma_debug_entry *entry; ++ struct scatterlist *s; ++ int i; ++ ++ if (unlikely(global_disable)) ++ return; ++ ++ for_each_sg(sg, s, mapped_ents, i) { ++ entry = dma_entry_alloc(); ++ if (!entry) ++ return; ++ ++ entry->type = dma_debug_sg; ++ entry->dev = dev; ++ entry->paddr = sg_phys(s); ++ entry->size = s->length; ++ entry->dev_addr = s->dma_address; ++ entry->direction = direction; ++ entry->sg_call_ents = nents; ++ entry->sg_mapped_ents = mapped_ents; ++ ++ if (!PageHighMem(sg_page(s))) { ++ check_for_stack(dev, sg_virt(s)); ++ check_for_illegal_area(dev, sg_virt(s), s->length); ++ } ++ ++ add_dma_entry(entry); ++ } ++} ++EXPORT_SYMBOL(debug_dma_map_sg); ++ ++void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, ++ int nelems, int dir) ++{ ++ struct dma_debug_entry *entry; ++ struct scatterlist *s; ++ int mapped_ents = 0, i; ++ unsigned long flags; ++ ++ if (unlikely(global_disable)) ++ return; ++ ++ for_each_sg(sglist, s, nelems, i) { ++ ++ struct dma_debug_entry ref = { ++ .type = dma_debug_sg, ++ .dev = dev, ++ .paddr = sg_phys(s), ++ .dev_addr = s->dma_address, ++ .size = s->length, ++ .direction = dir, ++ .sg_call_ents = 0, ++ }; ++ ++ if (mapped_ents && i >= mapped_ents) ++ break; ++ ++ if (mapped_ents == 0) { ++ struct hash_bucket *bucket; ++ ref.sg_call_ents = nelems; ++ bucket = get_hash_bucket(&ref, &flags); ++ entry = hash_bucket_find(bucket, &ref); ++ if 
(entry) ++ mapped_ents = entry->sg_mapped_ents; ++ put_hash_bucket(bucket, &flags); ++ } ++ ++ check_unmap(&ref); ++ } ++} ++EXPORT_SYMBOL(debug_dma_unmap_sg); ++ ++void debug_dma_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t dma_addr, void *virt) ++{ ++ struct dma_debug_entry *entry; ++ ++ if (unlikely(global_disable)) ++ return; ++ ++ if (unlikely(virt == NULL)) ++ return; ++ ++ entry = dma_entry_alloc(); ++ if (!entry) ++ return; ++ ++ entry->type = dma_debug_coherent; ++ entry->dev = dev; ++ entry->paddr = virt_to_phys(virt); ++ entry->size = size; ++ entry->dev_addr = dma_addr; ++ entry->direction = DMA_BIDIRECTIONAL; ++ ++ add_dma_entry(entry); ++} ++EXPORT_SYMBOL(debug_dma_alloc_coherent); ++ ++void debug_dma_free_coherent(struct device *dev, size_t size, ++ void *virt, dma_addr_t addr) ++{ ++ struct dma_debug_entry ref = { ++ .type = dma_debug_coherent, ++ .dev = dev, ++ .paddr = virt_to_phys(virt), ++ .dev_addr = addr, ++ .size = size, ++ .direction = DMA_BIDIRECTIONAL, ++ }; ++ ++ if (unlikely(global_disable)) ++ return; ++ ++ check_unmap(&ref); ++} ++EXPORT_SYMBOL(debug_dma_free_coherent); ++ ++void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, ++ size_t size, int direction) ++{ ++ if (unlikely(global_disable)) ++ return; ++ ++ check_sync(dev, dma_handle, size, 0, direction, true); ++} ++EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); ++ ++void debug_dma_sync_single_for_device(struct device *dev, ++ dma_addr_t dma_handle, size_t size, ++ int direction) ++{ ++ if (unlikely(global_disable)) ++ return; ++ ++ check_sync(dev, dma_handle, size, 0, direction, false); ++} ++EXPORT_SYMBOL(debug_dma_sync_single_for_device); ++ ++void debug_dma_sync_single_range_for_cpu(struct device *dev, ++ dma_addr_t dma_handle, ++ unsigned long offset, size_t size, ++ int direction) ++{ ++ if (unlikely(global_disable)) ++ return; ++ ++ check_sync(dev, dma_handle, size, offset, direction, true); ++} ++EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu); ++ ++void debug_dma_sync_single_range_for_device(struct device *dev, ++ dma_addr_t dma_handle, ++ unsigned long offset, ++ size_t size, int direction) ++{ ++ if (unlikely(global_disable)) ++ return; ++ ++ check_sync(dev, dma_handle, size, offset, direction, false); ++} ++EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); ++ ++void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, ++ int nelems, int direction) ++{ ++ struct scatterlist *s; ++ int i; ++ ++ if (unlikely(global_disable)) ++ return; ++ ++ for_each_sg(sg, s, nelems, i) { ++ check_sync(dev, s->dma_address, s->dma_length, 0, ++ direction, true); ++ } ++} ++EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); ++ ++void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, ++ int nelems, int direction) ++{ ++ struct scatterlist *s; ++ int i; ++ ++ if (unlikely(global_disable)) ++ return; ++ ++ for_each_sg(sg, s, nelems, i) { ++ check_sync(dev, s->dma_address, s->dma_length, 0, ++ direction, false); ++ } ++} ++EXPORT_SYMBOL(debug_dma_sync_sg_for_device); ++ +Index: linux-2.6-tip/lib/kernel_lock.c +=================================================================== +--- linux-2.6-tip.orig/lib/kernel_lock.c ++++ linux-2.6-tip/lib/kernel_lock.c +@@ -11,121 +11,90 @@ + #include + + /* +- * The 'big kernel lock' ++ * The 'big kernel semaphore' + * +- * This spinlock is taken and released recursively by lock_kernel() ++ * This mutex is taken and released recursively by lock_kernel() + * and unlock_kernel(). 
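/*
 * Illustrative sketch, not part of the patch: the lock_kernel() /
 * unlock_kernel() rework in this hunk makes recursion cheap by keeping a
 * per-task lock_depth and only taking the underlying semaphore on the
 * outermost acquisition. The same idea in userspace, with a pthread mutex
 * standing in for kernel_sem and a thread-local counter for lock_depth:
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread int lock_depth = -1;    /* -1 == not held by this thread */

static void sketch_lock_kernel(void)
{
        if (++lock_depth == 0)          /* outermost acquisition only */
                pthread_mutex_lock(&big_lock);
}

static void sketch_unlock_kernel(void)
{
        if (--lock_depth == -1)         /* outermost release only */
                pthread_mutex_unlock(&big_lock);
}

int main(void)
{
        sketch_lock_kernel();
        sketch_lock_kernel();           /* recursive call does not deadlock */
        printf("nested depth: %d\n", lock_depth);       /* prints 1 */
        sketch_unlock_kernel();
        sketch_unlock_kernel();
        return 0;
}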
It is transparently dropped and reacquired + * over schedule(). It is used to protect legacy code that hasn't + * been migrated to a proper locking design yet. + * ++ * Note: code locked by this semaphore will only be serialized against ++ * other code using the same locking facility. The code guarantees that ++ * the task remains on the same CPU. ++ * + * Don't use in new code. + */ +-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag); +- ++DECLARE_MUTEX(kernel_sem); + + /* +- * Acquire/release the underlying lock from the scheduler. ++ * Re-acquire the kernel semaphore. + * +- * This is called with preemption disabled, and should +- * return an error value if it cannot get the lock and +- * TIF_NEED_RESCHED gets set. ++ * This function is called with preemption off. + * +- * If it successfully gets the lock, it should increment +- * the preemption count like any spinlock does. ++ * We are executing in schedule() so the code must be extremely careful ++ * about recursion, both due to the down() and due to the enabling of ++ * preemption. schedule() will re-check the preemption flag after ++ * reacquiring the semaphore. + * +- * (This works on UP too - _raw_spin_trylock will never +- * return false in that case) ++ * Called with interrupts disabled. + */ + int __lockfunc __reacquire_kernel_lock(void) + { +- while (!_raw_spin_trylock(&kernel_flag)) { +- if (test_thread_flag(TIF_NEED_RESCHED)) +- return -EAGAIN; +- cpu_relax(); +- } +- preempt_disable(); ++ struct task_struct *task = current; ++ int saved_lock_depth = task->lock_depth; ++ ++ local_irq_enable(); ++ BUG_ON(saved_lock_depth < 0); ++ ++ task->lock_depth = -1; ++ ++ down(&kernel_sem); ++ ++ task->lock_depth = saved_lock_depth; ++ ++ local_irq_disable(); ++ + return 0; + } + + void __lockfunc __release_kernel_lock(void) + { +- _raw_spin_unlock(&kernel_flag); +- preempt_enable_no_resched(); ++ up(&kernel_sem); + } + + /* +- * These are the BKL spinlocks - we try to be polite about preemption. +- * If SMP is not on (ie UP preemption), this all goes away because the +- * _raw_spin_trylock() will always succeed. ++ * Getting the big kernel semaphore. + */ +-#ifdef CONFIG_PREEMPT +-static inline void __lock_kernel(void) ++void __lockfunc lock_kernel(void) + { +- preempt_disable(); +- if (unlikely(!_raw_spin_trylock(&kernel_flag))) { +- /* +- * If preemption was disabled even before this +- * was called, there's nothing we can be polite +- * about - just spin. +- */ +- if (preempt_count() > 1) { +- _raw_spin_lock(&kernel_flag); +- return; +- } ++ struct task_struct *task = current; ++ int depth = task->lock_depth + 1; + ++ if (likely(!depth)) { + /* +- * Otherwise, let's wait for the kernel lock +- * with preemption enabled.. 
++ * No recursion worries - we set up lock_depth _after_ + */ +- do { +- preempt_enable(); +- while (spin_is_locked(&kernel_flag)) +- cpu_relax(); +- preempt_disable(); +- } while (!_raw_spin_trylock(&kernel_flag)); ++ down(&kernel_sem); ++#ifdef CONFIG_DEBUG_RT_MUTEXES ++ current->last_kernel_lock = __builtin_return_address(0); ++#endif + } +-} + +-#else +- +-/* +- * Non-preemption case - just get the spinlock +- */ +-static inline void __lock_kernel(void) +-{ +- _raw_spin_lock(&kernel_flag); ++ task->lock_depth = depth; + } +-#endif + +-static inline void __unlock_kernel(void) ++void __lockfunc unlock_kernel(void) + { +- /* +- * the BKL is not covered by lockdep, so we open-code the +- * unlocking sequence (and thus avoid the dep-chain ops): +- */ +- _raw_spin_unlock(&kernel_flag); +- preempt_enable(); +-} ++ struct task_struct *task = current; + +-/* +- * Getting the big kernel lock. +- * +- * This cannot happen asynchronously, so we only need to +- * worry about other CPU's. +- */ +-void __lockfunc lock_kernel(void) +-{ +- int depth = current->lock_depth+1; +- if (likely(!depth)) +- __lock_kernel(); +- current->lock_depth = depth; +-} ++ BUG_ON(task->lock_depth < 0); + +-void __lockfunc unlock_kernel(void) +-{ +- BUG_ON(current->lock_depth < 0); +- if (likely(--current->lock_depth < 0)) +- __unlock_kernel(); ++ if (likely(--task->lock_depth == -1)) { ++#ifdef CONFIG_DEBUG_RT_MUTEXES ++ current->last_kernel_lock = NULL; ++#endif ++ up(&kernel_sem); ++ } + } + + EXPORT_SYMBOL(lock_kernel); +Index: linux-2.6-tip/lib/locking-selftest.c +=================================================================== +--- linux-2.6-tip.orig/lib/locking-selftest.c ++++ linux-2.6-tip/lib/locking-selftest.c +@@ -157,11 +157,11 @@ static void init_shared_classes(void) + #define SOFTIRQ_ENTER() \ + local_bh_disable(); \ + local_irq_disable(); \ +- trace_softirq_enter(); \ +- WARN_ON(!in_softirq()); ++ lockdep_softirq_enter(); \ ++ /* FIXME: preemptible softirqs. 
WARN_ON(!in_softirq()); */ + + #define SOFTIRQ_EXIT() \ +- trace_softirq_exit(); \ ++ lockdep_softirq_exit(); \ + local_irq_enable(); \ + local_bh_enable(); + +@@ -550,6 +550,11 @@ GENERATE_TESTCASE(init_held_rsem) + #undef E + + /* ++ * FIXME: turns these into raw-spinlock tests on -rt ++ */ ++#ifndef CONFIG_PREEMPT_RT ++ ++/* + * locking an irq-safe lock with irqs enabled: + */ + #define E1() \ +@@ -890,6 +895,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_ + #include "locking-selftest-softirq.h" + // GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) + ++#endif /* !CONFIG_PREEMPT_RT */ ++ + #ifdef CONFIG_DEBUG_LOCK_ALLOC + # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) + # define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) +@@ -940,6 +947,9 @@ static void dotest(void (*testcase_fn)(v + { + unsigned long saved_preempt_count = preempt_count(); + int expected_failure = 0; ++#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES) ++ long saved_lock_count = atomic_read(¤t->lock_count); ++#endif + + WARN_ON(irqs_disabled()); + +@@ -989,6 +999,9 @@ static void dotest(void (*testcase_fn)(v + #endif + + reset_locks(); ++#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_DEBUG_RT_MUTEXES) ++ atomic_set(¤t->lock_count, saved_lock_count); ++#endif + } + + static inline void print_testname(const char *testname) +@@ -998,7 +1011,7 @@ static inline void print_testname(const + + #define DO_TESTCASE_1(desc, name, nr) \ + print_testname(desc"/"#nr); \ +- dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ ++ dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + printk("\n"); + + #define DO_TESTCASE_1B(desc, name, nr) \ +@@ -1006,17 +1019,17 @@ static inline void print_testname(const + dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + printk("\n"); + +-#define DO_TESTCASE_3(desc, name, nr) \ +- print_testname(desc"/"#nr); \ +- dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ +- dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ ++#define DO_TESTCASE_3(desc, name, nr) \ ++ print_testname(desc"/"#nr); \ ++ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ ++ dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + printk("\n"); + +-#define DO_TESTCASE_3RW(desc, name, nr) \ +- print_testname(desc"/"#nr); \ ++#define DO_TESTCASE_3RW(desc, name, nr) \ ++ print_testname(desc"/"#nr); \ + dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ +- dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ ++ dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + printk("\n"); + +@@ -1047,7 +1060,7 @@ static inline void print_testname(const + print_testname(desc); \ + dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ +- dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ ++ dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ + dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ + dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ +@@ -1179,6 +1192,7 @@ void locking_selftest(void) + /* + * irq-context testcases: + */ ++#ifndef CONFIG_PREEMPT_RT + DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); + DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); + DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); +@@ -1188,6 +1202,7 @@ void locking_selftest(void) + + DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); + // DO_TESTCASE_6x2B("irq read-recursion #2", 
irq_read_recursion2); ++#endif + + if (unexpected_testcase_failures) { + printk("-----------------------------------------------------------------\n"); +Index: linux-2.6-tip/lib/swiotlb.c +=================================================================== +--- linux-2.6-tip.orig/lib/swiotlb.c ++++ linux-2.6-tip/lib/swiotlb.c +@@ -145,7 +145,7 @@ static void *swiotlb_bus_to_virt(dma_add + return phys_to_virt(swiotlb_bus_to_phys(address)); + } + +-int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) ++int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) + { + return 0; + } +@@ -315,9 +315,9 @@ address_needs_mapping(struct device *hwd + return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); + } + +-static inline int range_needs_mapping(void *ptr, size_t size) ++static inline int range_needs_mapping(phys_addr_t paddr, size_t size) + { +- return swiotlb_force || swiotlb_arch_range_needs_mapping(ptr, size); ++ return swiotlb_force || swiotlb_arch_range_needs_mapping(paddr, size); + } + + static int is_swiotlb_buffer(char *addr) +@@ -636,11 +636,14 @@ swiotlb_full(struct device *dev, size_t + * Once the device is given the dma address, the device owns this memory until + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. + */ +-dma_addr_t +-swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, +- int dir, struct dma_attrs *attrs) +-{ +- dma_addr_t dev_addr = swiotlb_virt_to_bus(hwdev, ptr); ++dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ phys_addr_t phys = page_to_phys(page) + offset; ++ void *ptr = page_address(page) + offset; ++ dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys); + void *map; + + BUG_ON(dir == DMA_NONE); +@@ -649,37 +652,30 @@ swiotlb_map_single_attrs(struct device * + * we can safely return the device addr and not worry about bounce + * buffering it. + */ +- if (!address_needs_mapping(hwdev, dev_addr, size) && +- !range_needs_mapping(ptr, size)) ++ if (!address_needs_mapping(dev, dev_addr, size) && ++ !range_needs_mapping(virt_to_phys(ptr), size)) + return dev_addr; + + /* + * Oh well, have to allocate and map a bounce buffer. + */ +- map = map_single(hwdev, virt_to_phys(ptr), size, dir); ++ map = map_single(dev, phys, size, dir); + if (!map) { +- swiotlb_full(hwdev, size, dir, 1); ++ swiotlb_full(dev, size, dir, 1); + map = io_tlb_overflow_buffer; + } + +- dev_addr = swiotlb_virt_to_bus(hwdev, map); ++ dev_addr = swiotlb_virt_to_bus(dev, map); + + /* + * Ensure that the address returned is DMA'ble + */ +- if (address_needs_mapping(hwdev, dev_addr, size)) ++ if (address_needs_mapping(dev, dev_addr, size)) + panic("map_single: bounce buffer is not DMA'ble"); + + return dev_addr; + } +-EXPORT_SYMBOL(swiotlb_map_single_attrs); +- +-dma_addr_t +-swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) +-{ +- return swiotlb_map_single_attrs(hwdev, ptr, size, dir, NULL); +-} +-EXPORT_SYMBOL(swiotlb_map_single); ++EXPORT_SYMBOL_GPL(swiotlb_map_page); + + /* + * Unmap a single streaming mode DMA translation. The dma_addr and size must +@@ -689,9 +685,9 @@ EXPORT_SYMBOL(swiotlb_map_single); + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. 
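/*
 * Illustrative sketch, not part of the patch: swiotlb_map_page() above
 * returns the buffer's own bus address when the device can reach it and
 * only falls back to a bounce buffer when it cannot (or when bouncing is
 * forced). The reachability test is essentially the range-vs-mask check
 * below; the 32-bit mask is just an example:
 */
#include <stdint.h>
#include <stdio.h>

/* 1 if [addr, addr + size) fits under the device's DMA mask */
static int dma_reachable(uint64_t mask, uint64_t addr, uint64_t size)
{
        return addr + size - 1 <= mask;
}

int main(void)
{
        uint64_t mask = 0xffffffffULL;  /* e.g. a 32-bit-only device */

        printf("%d\n", dma_reachable(mask, 0x00100000ULL, 4096));  /* 1: direct */
        printf("%d\n", dma_reachable(mask, 0x100000000ULL, 4096)); /* 0: bounce */
        return 0;
}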
+ */ +-void +-swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, +- size_t size, int dir, struct dma_attrs *attrs) ++void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + char *dma_addr = swiotlb_bus_to_virt(dev_addr); + +@@ -701,15 +697,7 @@ swiotlb_unmap_single_attrs(struct device + else if (dir == DMA_FROM_DEVICE) + dma_mark_clean(dma_addr, size); + } +-EXPORT_SYMBOL(swiotlb_unmap_single_attrs); +- +-void +-swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, +- int dir) +-{ +- return swiotlb_unmap_single_attrs(hwdev, dev_addr, size, dir, NULL); +-} +-EXPORT_SYMBOL(swiotlb_unmap_single); ++EXPORT_SYMBOL_GPL(swiotlb_unmap_page); + + /* + * Make physical memory consistent for a single streaming mode DMA translation +@@ -736,7 +724,7 @@ swiotlb_sync_single(struct device *hwdev + + void + swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, +- size_t size, int dir) ++ size_t size, enum dma_data_direction dir) + { + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); + } +@@ -744,7 +732,7 @@ EXPORT_SYMBOL(swiotlb_sync_single_for_cp + + void + swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, +- size_t size, int dir) ++ size_t size, enum dma_data_direction dir) + { + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); + } +@@ -769,7 +757,8 @@ swiotlb_sync_single_range(struct device + + void + swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, +- unsigned long offset, size_t size, int dir) ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir) + { + swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, + SYNC_FOR_CPU); +@@ -778,7 +767,8 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra + + void + swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr, +- unsigned long offset, size_t size, int dir) ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir) + { + swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, + SYNC_FOR_DEVICE); +@@ -803,7 +793,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_ra + */ + int + swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, +- int dir, struct dma_attrs *attrs) ++ enum dma_data_direction dir, struct dma_attrs *attrs) + { + struct scatterlist *sg; + int i; +@@ -811,10 +801,10 @@ swiotlb_map_sg_attrs(struct device *hwde + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { +- void *addr = sg_virt(sg); +- dma_addr_t dev_addr = swiotlb_virt_to_bus(hwdev, addr); ++ phys_addr_t paddr = sg_phys(sg); ++ dma_addr_t dev_addr = swiotlb_phys_to_bus(hwdev, paddr); + +- if (range_needs_mapping(addr, sg->length) || ++ if (range_needs_mapping(paddr, sg->length) || + address_needs_mapping(hwdev, dev_addr, sg->length)) { + void *map = map_single(hwdev, sg_phys(sg), + sg->length, dir); +@@ -850,7 +840,7 @@ EXPORT_SYMBOL(swiotlb_map_sg); + */ + void + swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, +- int nelems, int dir, struct dma_attrs *attrs) ++ int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) + { + struct scatterlist *sg; + int i; +@@ -858,11 +848,11 @@ swiotlb_unmap_sg_attrs(struct device *hw + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { +- if (sg->dma_address != swiotlb_virt_to_bus(hwdev, sg_virt(sg))) ++ if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) + unmap_single(hwdev, 
swiotlb_bus_to_virt(sg->dma_address), + sg->dma_length, dir); + else if (dir == DMA_FROM_DEVICE) +- dma_mark_clean(sg_virt(sg), sg->dma_length); ++ dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length); + } + } + EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); +@@ -892,17 +882,17 @@ swiotlb_sync_sg(struct device *hwdev, st + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { +- if (sg->dma_address != swiotlb_virt_to_bus(hwdev, sg_virt(sg))) ++ if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) + sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), + sg->dma_length, dir, target); + else if (dir == DMA_FROM_DEVICE) +- dma_mark_clean(sg_virt(sg), sg->dma_length); ++ dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length); + } + } + + void + swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, +- int nelems, int dir) ++ int nelems, enum dma_data_direction dir) + { + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); + } +@@ -910,7 +900,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); + + void + swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, +- int nelems, int dir) ++ int nelems, enum dma_data_direction dir) + { + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); + } +Index: linux-2.6-tip/lib/vsprintf.c +=================================================================== +--- linux-2.6-tip.orig/lib/vsprintf.c ++++ linux-2.6-tip/lib/vsprintf.c +@@ -396,7 +396,38 @@ static noinline char* put_dec(char *buf, + #define SMALL 32 /* Must be 32 == 0x20 */ + #define SPECIAL 64 /* 0x */ + +-static char *number(char *buf, char *end, unsigned long long num, int base, int size, int precision, int type) ++enum format_type { ++ FORMAT_TYPE_NONE, /* Just a string part */ ++ FORMAT_TYPE_WIDTH, ++ FORMAT_TYPE_PRECISION, ++ FORMAT_TYPE_CHAR, ++ FORMAT_TYPE_STR, ++ FORMAT_TYPE_PTR, ++ FORMAT_TYPE_PERCENT_CHAR, ++ FORMAT_TYPE_INVALID, ++ FORMAT_TYPE_LONG_LONG, ++ FORMAT_TYPE_ULONG, ++ FORMAT_TYPE_LONG, ++ FORMAT_TYPE_USHORT, ++ FORMAT_TYPE_SHORT, ++ FORMAT_TYPE_UINT, ++ FORMAT_TYPE_INT, ++ FORMAT_TYPE_NRCHARS, ++ FORMAT_TYPE_SIZE_T, ++ FORMAT_TYPE_PTRDIFF ++}; ++ ++struct printf_spec { ++ enum format_type type; ++ int flags; /* flags to number() */ ++ int field_width; /* width of output field */ ++ int base; ++ int precision; /* # of digits/chars */ ++ int qualifier; ++}; ++ ++static char *number(char *buf, char *end, unsigned long long num, ++ struct printf_spec spec) + { + /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ + static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ +@@ -404,32 +435,32 @@ static char *number(char *buf, char *end + char tmp[66]; + char sign; + char locase; +- int need_pfx = ((type & SPECIAL) && base != 10); ++ int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); + int i; + + /* locase = 0 or 0x20. 
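/*
 * Illustrative aside, not part of the patch: the 'locase' trick works
 * because in ASCII OR-ing an upper-case letter with 0x20 lower-cases it,
 * while the digits 0x30-0x39 already have that bit set and pass through
 * unchanged:
 */
#include <stdio.h>

int main(void)
{
        printf("%c %c\n", 'F' | 0x20, '7' | 0x20);      /* prints "f 7" */
        return 0;
}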
ORing digits or letters with 'locase' + * produces same digits or (maybe lowercased) letters */ +- locase = (type & SMALL); +- if (type & LEFT) +- type &= ~ZEROPAD; ++ locase = (spec.flags & SMALL); ++ if (spec.flags & LEFT) ++ spec.flags &= ~ZEROPAD; + sign = 0; +- if (type & SIGN) { ++ if (spec.flags & SIGN) { + if ((signed long long) num < 0) { + sign = '-'; + num = - (signed long long) num; +- size--; +- } else if (type & PLUS) { ++ spec.field_width--; ++ } else if (spec.flags & PLUS) { + sign = '+'; +- size--; +- } else if (type & SPACE) { ++ spec.field_width--; ++ } else if (spec.flags & SPACE) { + sign = ' '; +- size--; ++ spec.field_width--; + } + } + if (need_pfx) { +- size--; +- if (base == 16) +- size--; ++ spec.field_width--; ++ if (spec.base == 16) ++ spec.field_width--; + } + + /* generate full string in tmp[], in reverse order */ +@@ -441,10 +472,10 @@ static char *number(char *buf, char *end + tmp[i++] = (digits[do_div(num,base)] | locase); + } while (num != 0); + */ +- else if (base != 10) { /* 8 or 16 */ +- int mask = base - 1; ++ else if (spec.base != 10) { /* 8 or 16 */ ++ int mask = spec.base - 1; + int shift = 3; +- if (base == 16) shift = 4; ++ if (spec.base == 16) shift = 4; + do { + tmp[i++] = (digits[((unsigned char)num) & mask] | locase); + num >>= shift; +@@ -454,12 +485,12 @@ static char *number(char *buf, char *end + } + + /* printing 100 using %2d gives "100", not "00" */ +- if (i > precision) +- precision = i; ++ if (i > spec.precision) ++ spec.precision = i; + /* leading space padding */ +- size -= precision; +- if (!(type & (ZEROPAD+LEFT))) { +- while(--size >= 0) { ++ spec.field_width -= spec.precision; ++ if (!(spec.flags & (ZEROPAD+LEFT))) { ++ while(--spec.field_width >= 0) { + if (buf < end) + *buf = ' '; + ++buf; +@@ -476,23 +507,23 @@ static char *number(char *buf, char *end + if (buf < end) + *buf = '0'; + ++buf; +- if (base == 16) { ++ if (spec.base == 16) { + if (buf < end) + *buf = ('X' | locase); + ++buf; + } + } + /* zero or space padding */ +- if (!(type & LEFT)) { +- char c = (type & ZEROPAD) ? '0' : ' '; +- while (--size >= 0) { ++ if (!(spec.flags & LEFT)) { ++ char c = (spec.flags & ZEROPAD) ? '0' : ' '; ++ while (--spec.field_width >= 0) { + if (buf < end) + *buf = c; + ++buf; + } + } + /* hmm even more zero padding? 
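/*
 * Illustrative aside, not part of the patch: number() above follows the
 * usual printf rules for integers - precision pads with zeros first, the
 * field width then pads with spaces (or with zeros when ZEROPAD is set).
 * The same behaviour from userspace printf:
 */
#include <stdio.h>

int main(void)
{
        printf("[%8.5d]\n", 42);        /* [   00042]  width 8, precision 5 */
        printf("[%-8.5d]\n", 42);       /* [00042   ]  LEFT flag */
        printf("[%08d]\n", 42);         /* [00000042]  ZEROPAD */
        return 0;
}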
*/ +- while (i <= --precision) { ++ while (i <= --spec.precision) { + if (buf < end) + *buf = '0'; + ++buf; +@@ -504,7 +535,7 @@ static char *number(char *buf, char *end + ++buf; + } + /* trailing space padding */ +- while (--size >= 0) { ++ while (--spec.field_width >= 0) { + if (buf < end) + *buf = ' '; + ++buf; +@@ -512,17 +543,17 @@ static char *number(char *buf, char *end + return buf; + } + +-static char *string(char *buf, char *end, char *s, int field_width, int precision, int flags) ++static char *string(char *buf, char *end, char *s, struct printf_spec spec) + { + int len, i; + + if ((unsigned long)s < PAGE_SIZE) + s = ""; + +- len = strnlen(s, precision); ++ len = strnlen(s, spec.precision); + +- if (!(flags & LEFT)) { +- while (len < field_width--) { ++ if (!(spec.flags & LEFT)) { ++ while (len < spec.field_width--) { + if (buf < end) + *buf = ' '; + ++buf; +@@ -533,7 +564,7 @@ static char *string(char *buf, char *end + *buf = *s; + ++buf; ++s; + } +- while (len < field_width--) { ++ while (len < spec.field_width--) { + if (buf < end) + *buf = ' '; + ++buf; +@@ -541,21 +572,24 @@ static char *string(char *buf, char *end + return buf; + } + +-static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int precision, int flags) ++static char *symbol_string(char *buf, char *end, void *ptr, ++ struct printf_spec spec) + { + unsigned long value = (unsigned long) ptr; + #ifdef CONFIG_KALLSYMS + char sym[KSYM_SYMBOL_LEN]; + sprint_symbol(sym, value); +- return string(buf, end, sym, field_width, precision, flags); ++ return string(buf, end, sym, spec); + #else +- field_width = 2*sizeof(void *); +- flags |= SPECIAL | SMALL | ZEROPAD; +- return number(buf, end, value, 16, field_width, precision, flags); ++ spec.field_width = 2*sizeof(void *); ++ spec.flags |= SPECIAL | SMALL | ZEROPAD; ++ spec.base = 16; ++ return number(buf, end, value, spec); + #endif + } + +-static char *resource_string(char *buf, char *end, struct resource *res, int field_width, int precision, int flags) ++static char *resource_string(char *buf, char *end, struct resource *res, ++ struct printf_spec spec) + { + #ifndef IO_RSRC_PRINTK_SIZE + #define IO_RSRC_PRINTK_SIZE 4 +@@ -564,7 +598,11 @@ static char *resource_string(char *buf, + #ifndef MEM_RSRC_PRINTK_SIZE + #define MEM_RSRC_PRINTK_SIZE 8 + #endif +- ++ struct printf_spec num_spec = { ++ .base = 16, ++ .precision = -1, ++ .flags = SPECIAL | SMALL | ZEROPAD, ++ }; + /* room for the actual numbers, the two "0x", -, [, ] and the final zero */ + char sym[4*sizeof(resource_size_t) + 8]; + char *p = sym, *pend = sym + sizeof(sym); +@@ -576,17 +614,18 @@ static char *resource_string(char *buf, + size = MEM_RSRC_PRINTK_SIZE; + + *p++ = '['; +- p = number(p, pend, res->start, 16, size, -1, SPECIAL | SMALL | ZEROPAD); ++ num_spec.field_width = size; ++ p = number(p, pend, res->start, num_spec); + *p++ = '-'; +- p = number(p, pend, res->end, 16, size, -1, SPECIAL | SMALL | ZEROPAD); ++ p = number(p, pend, res->end, num_spec); + *p++ = ']'; + *p = 0; + +- return string(buf, end, sym, field_width, precision, flags); ++ return string(buf, end, sym, spec); + } + +-static char *mac_address_string(char *buf, char *end, u8 *addr, int field_width, +- int precision, int flags) ++static char *mac_address_string(char *buf, char *end, u8 *addr, ++ struct printf_spec spec) + { + char mac_addr[6 * 3]; /* (6 * 2 hex digits), 5 colons and trailing zero */ + char *p = mac_addr; +@@ -594,16 +633,17 @@ static char *mac_address_string(char *bu + + for (i = 0; i < 6; i++) { + p 
= pack_hex_byte(p, addr[i]); +- if (!(flags & SPECIAL) && i != 5) ++ if (!(spec.flags & SPECIAL) && i != 5) + *p++ = ':'; + } + *p = '\0'; ++ spec.flags &= ~SPECIAL; + +- return string(buf, end, mac_addr, field_width, precision, flags & ~SPECIAL); ++ return string(buf, end, mac_addr, spec); + } + +-static char *ip6_addr_string(char *buf, char *end, u8 *addr, int field_width, +- int precision, int flags) ++static char *ip6_addr_string(char *buf, char *end, u8 *addr, ++ struct printf_spec spec) + { + char ip6_addr[8 * 5]; /* (8 * 4 hex digits), 7 colons and trailing zero */ + char *p = ip6_addr; +@@ -612,16 +652,17 @@ static char *ip6_addr_string(char *buf, + for (i = 0; i < 8; i++) { + p = pack_hex_byte(p, addr[2 * i]); + p = pack_hex_byte(p, addr[2 * i + 1]); +- if (!(flags & SPECIAL) && i != 7) ++ if (!(spec.flags & SPECIAL) && i != 7) + *p++ = ':'; + } + *p = '\0'; ++ spec.flags &= ~SPECIAL; + +- return string(buf, end, ip6_addr, field_width, precision, flags & ~SPECIAL); ++ return string(buf, end, ip6_addr, spec); + } + +-static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width, +- int precision, int flags) ++static char *ip4_addr_string(char *buf, char *end, u8 *addr, ++ struct printf_spec spec) + { + char ip4_addr[4 * 4]; /* (4 * 3 decimal digits), 3 dots and trailing zero */ + char temp[3]; /* hold each IP quad in reverse order */ +@@ -637,8 +678,9 @@ static char *ip4_addr_string(char *buf, + *p++ = '.'; + } + *p = '\0'; ++ spec.flags &= ~SPECIAL; + +- return string(buf, end, ip4_addr, field_width, precision, flags & ~SPECIAL); ++ return string(buf, end, ip4_addr, spec); + } + + /* +@@ -663,41 +705,233 @@ static char *ip4_addr_string(char *buf, + * function pointers are really function descriptors, which contain a + * pointer to the real address. + */ +-static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field_width, int precision, int flags) ++static char *pointer(const char *fmt, char *buf, char *end, void *ptr, ++ struct printf_spec spec) + { + if (!ptr) +- return string(buf, end, "(null)", field_width, precision, flags); ++ return string(buf, end, "(null)", spec); + + switch (*fmt) { + case 'F': + ptr = dereference_function_descriptor(ptr); + /* Fallthrough */ + case 'S': +- return symbol_string(buf, end, ptr, field_width, precision, flags); ++ return symbol_string(buf, end, ptr, spec); + case 'R': +- return resource_string(buf, end, ptr, field_width, precision, flags); ++ return resource_string(buf, end, ptr, spec); + case 'm': +- flags |= SPECIAL; ++ spec.flags |= SPECIAL; + /* Fallthrough */ + case 'M': +- return mac_address_string(buf, end, ptr, field_width, precision, flags); ++ return mac_address_string(buf, end, ptr, spec); + case 'i': +- flags |= SPECIAL; ++ spec.flags |= SPECIAL; + /* Fallthrough */ + case 'I': + if (fmt[1] == '6') +- return ip6_addr_string(buf, end, ptr, field_width, precision, flags); ++ return ip6_addr_string(buf, end, ptr, spec); + if (fmt[1] == '4') +- return ip4_addr_string(buf, end, ptr, field_width, precision, flags); +- flags &= ~SPECIAL; ++ return ip4_addr_string(buf, end, ptr, spec); ++ spec.flags &= ~SPECIAL; ++ break; ++ } ++ spec.flags |= SMALL; ++ if (spec.field_width == -1) { ++ spec.field_width = 2*sizeof(void *); ++ spec.flags |= ZEROPAD; ++ } ++ spec.base = 16; ++ ++ return number(buf, end, (unsigned long) ptr, spec); ++} ++ ++/* ++ * Helper function to decode printf style format. 
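
In the hunk above, pointer() dispatches on the character that follows %p ('F', 'S', 'R', 'M'/'m', 'I'/'i') and now simply hands the shared printf_spec to the matching helper. Below is a minimal userspace sketch of that kind of dispatch; print_pointer() and its single 'M' case are illustrative only, not the kernel's formatter.

#include <stdio.h>

/* Print a 6-byte MAC address as "aa:bb:cc:dd:ee:ff". */
static void print_mac(const unsigned char *mac)
{
    int i;

    for (i = 0; i < 6; i++)
        printf("%02x%c", (unsigned int)mac[i], i == 5 ? '\n' : ':');
}

/* Toy %p-style dispatcher: look at the character after 'p'. */
static void print_pointer(char ext, void *ptr)
{
    switch (ext) {
    case 'M':
        print_mac(ptr);
        break;
    default:
        printf("%p\n", ptr);  /* plain pointer value */
        break;
    }
}

int main(void)
{
    unsigned char mac[6] = { 0x00, 0x1a, 0x2b, 0x3c, 0x4d, 0x5e };

    print_pointer('M', mac);
    print_pointer(0, mac);
    return 0;
}
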
++ * Each call decode a token from the format and return the ++ * number of characters read (or likely the delta where it wants ++ * to go on the next call). ++ * The decoded token is returned through the parameters ++ * ++ * 'h', 'l', or 'L' for integer fields ++ * 'z' support added 23/7/1999 S.H. ++ * 'z' changed to 'Z' --davidm 1/25/99 ++ * 't' added for ptrdiff_t ++ * ++ * @fmt: the format string ++ * @type of the token returned ++ * @flags: various flags such as +, -, # tokens.. ++ * @field_width: overwritten width ++ * @base: base of the number (octal, hex, ...) ++ * @precision: precision of a number ++ * @qualifier: qualifier of a number (long, size_t, ...) ++ */ ++static int format_decode(const char *fmt, struct printf_spec *spec) ++{ ++ const char *start = fmt; ++ ++ /* we finished early by reading the field width */ ++ if (spec->type == FORMAT_TYPE_WIDTH) { ++ if (spec->field_width < 0) { ++ spec->field_width = -spec->field_width; ++ spec->flags |= LEFT; ++ } ++ spec->type = FORMAT_TYPE_NONE; ++ goto precision; ++ } ++ ++ /* we finished early by reading the precision */ ++ if (spec->type == FORMAT_TYPE_PRECISION) { ++ if (spec->precision < 0) ++ spec->precision = 0; ++ ++ spec->type = FORMAT_TYPE_NONE; ++ goto qualifier; ++ } ++ ++ /* By default */ ++ spec->type = FORMAT_TYPE_NONE; ++ ++ for (; *fmt ; ++fmt) { ++ if (*fmt == '%') ++ break; ++ } ++ ++ /* Return the current non-format string */ ++ if (fmt != start || !*fmt) ++ return fmt - start; ++ ++ /* Process flags */ ++ spec->flags = 0; ++ ++ while (1) { /* this also skips first '%' */ ++ bool found = true; ++ ++ ++fmt; ++ ++ switch (*fmt) { ++ case '-': spec->flags |= LEFT; break; ++ case '+': spec->flags |= PLUS; break; ++ case ' ': spec->flags |= SPACE; break; ++ case '#': spec->flags |= SPECIAL; break; ++ case '0': spec->flags |= ZEROPAD; break; ++ default: found = false; ++ } ++ ++ if (!found) ++ break; ++ } ++ ++ /* get field width */ ++ spec->field_width = -1; ++ ++ if (isdigit(*fmt)) ++ spec->field_width = skip_atoi(&fmt); ++ else if (*fmt == '*') { ++ /* it's the next argument */ ++ spec->type = FORMAT_TYPE_WIDTH; ++ return ++fmt - start; ++ } ++ ++precision: ++ /* get the precision */ ++ spec->precision = -1; ++ if (*fmt == '.') { ++ ++fmt; ++ if (isdigit(*fmt)) { ++ spec->precision = skip_atoi(&fmt); ++ if (spec->precision < 0) ++ spec->precision = 0; ++ } else if (*fmt == '*') { ++ /* it's the next argument */ ++ spec->type = FORMAT_TYPE_PRECISION; ++ return ++fmt - start; ++ } ++ } ++ ++qualifier: ++ /* get the conversion qualifier */ ++ spec->qualifier = -1; ++ if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || ++ *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { ++ spec->qualifier = *fmt; ++ ++fmt; ++ if (spec->qualifier == 'l' && *fmt == 'l') { ++ spec->qualifier = 'L'; ++ ++fmt; ++ } ++ } ++ ++ /* default base */ ++ spec->base = 10; ++ switch (*fmt) { ++ case 'c': ++ spec->type = FORMAT_TYPE_CHAR; ++ return ++fmt - start; ++ ++ case 's': ++ spec->type = FORMAT_TYPE_STR; ++ return ++fmt - start; ++ ++ case 'p': ++ spec->type = FORMAT_TYPE_PTR; ++ return fmt - start; ++ /* skip alnum */ ++ ++ case 'n': ++ spec->type = FORMAT_TYPE_NRCHARS; ++ return ++fmt - start; ++ ++ case '%': ++ spec->type = FORMAT_TYPE_PERCENT_CHAR; ++ return ++fmt - start; ++ ++ /* integer number formats - set up the flags and "break" */ ++ case 'o': ++ spec->base = 8; ++ break; ++ ++ case 'x': ++ spec->flags |= SMALL; ++ ++ case 'X': ++ spec->base = 16; ++ break; ++ ++ case 'd': ++ case 'i': ++ spec->flags |= SIGN; ++ case 'u': + break; ++ ++ 
default: ++ spec->type = FORMAT_TYPE_INVALID; ++ return fmt - start; + } +- flags |= SMALL; +- if (field_width == -1) { +- field_width = 2*sizeof(void *); +- flags |= ZEROPAD; ++ ++ if (spec->qualifier == 'L') ++ spec->type = FORMAT_TYPE_LONG_LONG; ++ else if (spec->qualifier == 'l') { ++ if (spec->flags & SIGN) ++ spec->type = FORMAT_TYPE_LONG; ++ else ++ spec->type = FORMAT_TYPE_ULONG; ++ } else if (spec->qualifier == 'Z' || spec->qualifier == 'z') { ++ spec->type = FORMAT_TYPE_SIZE_T; ++ } else if (spec->qualifier == 't') { ++ spec->type = FORMAT_TYPE_PTRDIFF; ++ } else if (spec->qualifier == 'h') { ++ if (spec->flags & SIGN) ++ spec->type = FORMAT_TYPE_SHORT; ++ else ++ spec->type = FORMAT_TYPE_USHORT; ++ } else { ++ if (spec->flags & SIGN) ++ spec->type = FORMAT_TYPE_INT; ++ else ++ spec->type = FORMAT_TYPE_UINT; + } +- return number(buf, end, (unsigned long) ptr, 16, field_width, precision, flags); ++ ++ return ++fmt - start; + } + + /** +@@ -726,18 +960,9 @@ static char *pointer(const char *fmt, ch + int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + { + unsigned long long num; +- int base; + char *str, *end, c; +- +- int flags; /* flags to number() */ +- +- int field_width; /* width of output field */ +- int precision; /* min. # of digits for integers; max +- number of chars for from string */ +- int qualifier; /* 'h', 'l', or 'L' for integer fields */ +- /* 'z' support added 23/7/1999 S.H. */ +- /* 'z' changed to 'Z' --davidm 1/25/99 */ +- /* 't' added for ptrdiff_t */ ++ int read; ++ struct printf_spec spec = {0}; + + /* Reject out-of-range values early. Large positive sizes are + used for unknown buffer sizes. */ +@@ -758,184 +983,144 @@ int vsnprintf(char *buf, size_t size, co + size = end - buf; + } + +- for (; *fmt ; ++fmt) { +- if (*fmt != '%') { +- if (str < end) +- *str = *fmt; +- ++str; +- continue; +- } ++ while (*fmt) { ++ const char *old_fmt = fmt; + +- /* process flags */ +- flags = 0; +- repeat: +- ++fmt; /* this also skips first '%' */ +- switch (*fmt) { +- case '-': flags |= LEFT; goto repeat; +- case '+': flags |= PLUS; goto repeat; +- case ' ': flags |= SPACE; goto repeat; +- case '#': flags |= SPECIAL; goto repeat; +- case '0': flags |= ZEROPAD; goto repeat; +- } ++ read = format_decode(fmt, &spec); + +- /* get field width */ +- field_width = -1; +- if (isdigit(*fmt)) +- field_width = skip_atoi(&fmt); +- else if (*fmt == '*') { +- ++fmt; +- /* it's the next argument */ +- field_width = va_arg(args, int); +- if (field_width < 0) { +- field_width = -field_width; +- flags |= LEFT; +- } +- } ++ fmt += read; + +- /* get the precision */ +- precision = -1; +- if (*fmt == '.') { +- ++fmt; +- if (isdigit(*fmt)) +- precision = skip_atoi(&fmt); +- else if (*fmt == '*') { +- ++fmt; +- /* it's the next argument */ +- precision = va_arg(args, int); ++ switch (spec.type) { ++ case FORMAT_TYPE_NONE: { ++ int copy = read; ++ if (str < end) { ++ if (copy > end - str) ++ copy = end - str; ++ memcpy(str, old_fmt, copy); + } +- if (precision < 0) +- precision = 0; ++ str += read; ++ break; + } + +- /* get the conversion qualifier */ +- qualifier = -1; +- if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || +- *fmt =='Z' || *fmt == 'z' || *fmt == 't') { +- qualifier = *fmt; +- ++fmt; +- if (qualifier == 'l' && *fmt == 'l') { +- qualifier = 'L'; +- ++fmt; +- } +- } ++ case FORMAT_TYPE_WIDTH: ++ spec.field_width = va_arg(args, int); ++ break; + +- /* default base */ +- base = 10; ++ case FORMAT_TYPE_PRECISION: ++ spec.precision = va_arg(args, int); ++ break; + +- 
switch (*fmt) { +- case 'c': +- if (!(flags & LEFT)) { +- while (--field_width > 0) { +- if (str < end) +- *str = ' '; +- ++str; +- } +- } +- c = (unsigned char) va_arg(args, int); +- if (str < end) +- *str = c; +- ++str; +- while (--field_width > 0) { ++ case FORMAT_TYPE_CHAR: ++ if (!(spec.flags & LEFT)) { ++ while (--spec.field_width > 0) { + if (str < end) + *str = ' '; + ++str; +- } +- continue; + +- case 's': +- str = string(str, end, va_arg(args, char *), field_width, precision, flags); +- continue; +- +- case 'p': +- str = pointer(fmt+1, str, end, +- va_arg(args, void *), +- field_width, precision, flags); +- /* Skip all alphanumeric pointer suffixes */ +- while (isalnum(fmt[1])) +- fmt++; +- continue; +- +- case 'n': +- /* FIXME: +- * What does C99 say about the overflow case here? */ +- if (qualifier == 'l') { +- long * ip = va_arg(args, long *); +- *ip = (str - buf); +- } else if (qualifier == 'Z' || qualifier == 'z') { +- size_t * ip = va_arg(args, size_t *); +- *ip = (str - buf); +- } else { +- int * ip = va_arg(args, int *); +- *ip = (str - buf); + } +- continue; +- +- case '%': ++ } ++ c = (unsigned char) va_arg(args, int); ++ if (str < end) ++ *str = c; ++ ++str; ++ while (--spec.field_width > 0) { + if (str < end) +- *str = '%'; ++ *str = ' '; + ++str; +- continue; ++ } ++ break; + +- /* integer number formats - set up the flags and "break" */ +- case 'o': +- base = 8; +- break; ++ case FORMAT_TYPE_STR: ++ str = string(str, end, va_arg(args, char *), spec); ++ break; + +- case 'x': +- flags |= SMALL; +- case 'X': +- base = 16; +- break; ++ case FORMAT_TYPE_PTR: ++ str = pointer(fmt+1, str, end, va_arg(args, void *), ++ spec); ++ while (isalnum(*fmt)) ++ fmt++; ++ break; + +- case 'd': +- case 'i': +- flags |= SIGN; +- case 'u': +- break; ++ case FORMAT_TYPE_PERCENT_CHAR: ++ if (str < end) ++ *str = '%'; ++ ++str; ++ break; + +- default: ++ case FORMAT_TYPE_INVALID: ++ if (str < end) ++ *str = '%'; ++ ++str; ++ if (*fmt) { + if (str < end) +- *str = '%'; ++ *str = *fmt; + ++str; +- if (*fmt) { +- if (str < end) +- *str = *fmt; +- ++str; +- } else { +- --fmt; +- } +- continue; ++ } else { ++ --fmt; ++ } ++ break; ++ ++ case FORMAT_TYPE_NRCHARS: { ++ int qualifier = spec.qualifier; ++ ++ if (qualifier == 'l') { ++ long *ip = va_arg(args, long *); ++ *ip = (str - buf); ++ } else if (qualifier == 'Z' || ++ qualifier == 'z') { ++ size_t *ip = va_arg(args, size_t *); ++ *ip = (str - buf); ++ } else { ++ int *ip = va_arg(args, int *); ++ *ip = (str - buf); ++ } ++ break; + } +- if (qualifier == 'L') +- num = va_arg(args, long long); +- else if (qualifier == 'l') { +- num = va_arg(args, unsigned long); +- if (flags & SIGN) +- num = (signed long) num; +- } else if (qualifier == 'Z' || qualifier == 'z') { +- num = va_arg(args, size_t); +- } else if (qualifier == 't') { +- num = va_arg(args, ptrdiff_t); +- } else if (qualifier == 'h') { +- num = (unsigned short) va_arg(args, int); +- if (flags & SIGN) +- num = (signed short) num; +- } else { +- num = va_arg(args, unsigned int); +- if (flags & SIGN) +- num = (signed int) num; ++ ++ default: ++ switch (spec.type) { ++ case FORMAT_TYPE_LONG_LONG: ++ num = va_arg(args, long long); ++ break; ++ case FORMAT_TYPE_ULONG: ++ num = va_arg(args, unsigned long); ++ break; ++ case FORMAT_TYPE_LONG: ++ num = va_arg(args, long); ++ break; ++ case FORMAT_TYPE_SIZE_T: ++ num = va_arg(args, size_t); ++ break; ++ case FORMAT_TYPE_PTRDIFF: ++ num = va_arg(args, ptrdiff_t); ++ break; ++ case FORMAT_TYPE_USHORT: ++ num = (unsigned short) va_arg(args, int); 
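
The FORMAT_TYPE_CHAR and FORMAT_TYPE_USHORT/SHORT cases above fetch the argument with va_arg(args, int) and cast afterwards because C's default argument promotions widen char and short to int in variadic calls; reading them as the narrow type would be undefined. A small self-contained demonstration of that rule (ordinary userspace C, unrelated to the kernel sources):

#include <stdarg.h>
#include <stdio.h>

/* Variadic callee: narrow integer arguments arrive promoted to int,
 * so they must be fetched as int and narrowed explicitly. */
static void show_short(int count, ...)
{
    va_list ap;
    int i;

    va_start(ap, count);
    for (i = 0; i < count; i++) {
        short val = (short)va_arg(ap, int);
        printf("%hd\n", val);
    }
    va_end(ap);
}

int main(void)
{
    short a = -5, b = 300;

    show_short(2, a, b);    /* prints -5 and 300 */
    return 0;
}
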
++ break; ++ case FORMAT_TYPE_SHORT: ++ num = (short) va_arg(args, int); ++ break; ++ case FORMAT_TYPE_INT: ++ num = (int) va_arg(args, int); ++ break; ++ default: ++ num = va_arg(args, unsigned int); ++ } ++ ++ str = number(str, end, num, spec); + } +- str = number(str, end, num, base, +- field_width, precision, flags); + } ++ + if (size > 0) { + if (str < end) + *str = '\0'; + else + end[-1] = '\0'; + } ++ + /* the trailing null byte doesn't count towards the total */ + return str-buf; ++ + } + EXPORT_SYMBOL(vsnprintf); + +@@ -1058,6 +1243,372 @@ int sprintf(char * buf, const char *fmt, + } + EXPORT_SYMBOL(sprintf); + ++#ifdef CONFIG_BINARY_PRINTF ++/* ++ * bprintf service: ++ * vbin_printf() - VA arguments to binary data ++ * bstr_printf() - Binary data to text string ++ */ ++ ++/** ++ * vbin_printf - Parse a format string and place args' binary value in a buffer ++ * @bin_buf: The buffer to place args' binary value ++ * @size: The size of the buffer(by words(32bits), not characters) ++ * @fmt: The format string to use ++ * @args: Arguments for the format string ++ * ++ * The format follows C99 vsnprintf, except %n is ignored, and its argument ++ * is skiped. ++ * ++ * The return value is the number of words(32bits) which would be generated for ++ * the given input. ++ * ++ * NOTE: ++ * If the return value is greater than @size, the resulting bin_buf is NOT ++ * valid for bstr_printf(). ++ */ ++int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) ++{ ++ struct printf_spec spec = {0}; ++ char *str, *end; ++ int read; ++ ++ str = (char *)bin_buf; ++ end = (char *)(bin_buf + size); ++ ++#define save_arg(type) \ ++do { \ ++ if (sizeof(type) == 8) { \ ++ unsigned long long value; \ ++ str = PTR_ALIGN(str, sizeof(u32)); \ ++ value = va_arg(args, unsigned long long); \ ++ if (str + sizeof(type) <= end) { \ ++ *(u32 *)str = *(u32 *)&value; \ ++ *(u32 *)(str + 4) = *((u32 *)&value + 1); \ ++ } \ ++ } else { \ ++ unsigned long value; \ ++ str = PTR_ALIGN(str, sizeof(type)); \ ++ value = va_arg(args, int); \ ++ if (str + sizeof(type) <= end) \ ++ *(typeof(type) *)str = (type)value; \ ++ } \ ++ str += sizeof(type); \ ++} while (0) ++ ++ ++ while (*fmt) { ++ read = format_decode(fmt, &spec); ++ ++ fmt += read; ++ ++ switch (spec.type) { ++ case FORMAT_TYPE_NONE: ++ break; ++ ++ case FORMAT_TYPE_WIDTH: ++ case FORMAT_TYPE_PRECISION: ++ save_arg(int); ++ break; ++ ++ case FORMAT_TYPE_CHAR: ++ save_arg(char); ++ break; ++ ++ case FORMAT_TYPE_STR: { ++ const char *save_str = va_arg(args, char *); ++ size_t len; ++ if ((unsigned long)save_str > (unsigned long)-PAGE_SIZE ++ || (unsigned long)save_str < PAGE_SIZE) ++ save_str = ""; ++ len = strlen(save_str); ++ if (str + len + 1 < end) ++ memcpy(str, save_str, len + 1); ++ str += len + 1; ++ break; ++ } ++ ++ case FORMAT_TYPE_PTR: ++ save_arg(void *); ++ /* skip all alphanumeric pointer suffixes */ ++ while (isalnum(*fmt)) ++ fmt++; ++ break; ++ ++ case FORMAT_TYPE_PERCENT_CHAR: ++ break; ++ ++ case FORMAT_TYPE_INVALID: ++ if (!*fmt) ++ --fmt; ++ break; ++ ++ case FORMAT_TYPE_NRCHARS: { ++ /* skip %n 's argument */ ++ int qualifier = spec.qualifier; ++ void *skip_arg; ++ if (qualifier == 'l') ++ skip_arg = va_arg(args, long *); ++ else if (qualifier == 'Z' || qualifier == 'z') ++ skip_arg = va_arg(args, size_t *); ++ else ++ skip_arg = va_arg(args, int *); ++ break; ++ } ++ ++ default: ++ switch (spec.type) { ++ ++ case FORMAT_TYPE_LONG_LONG: ++ save_arg(long long); ++ break; ++ case FORMAT_TYPE_ULONG: ++ case FORMAT_TYPE_LONG: ++ 
save_arg(unsigned long); ++ break; ++ case FORMAT_TYPE_SIZE_T: ++ save_arg(size_t); ++ break; ++ case FORMAT_TYPE_PTRDIFF: ++ save_arg(ptrdiff_t); ++ break; ++ case FORMAT_TYPE_USHORT: ++ case FORMAT_TYPE_SHORT: ++ save_arg(short); ++ break; ++ default: ++ save_arg(int); ++ } ++ } ++ } ++ return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; ++ ++#undef save_arg ++} ++EXPORT_SYMBOL_GPL(vbin_printf); ++ ++/** ++ * bstr_printf - Format a string from binary arguments and place it in a buffer ++ * @buf: The buffer to place the result into ++ * @size: The size of the buffer, including the trailing null space ++ * @fmt: The format string to use ++ * @bin_buf: Binary arguments for the format string ++ * ++ * This function like C99 vsnprintf, but the difference is that vsnprintf gets ++ * arguments from stack, and bstr_printf gets arguments from @bin_buf which is ++ * a binary buffer that generated by vbin_printf. ++ * ++ * The format follows C99 vsnprintf, but has some extensions: ++ * %pS output the name of a text symbol ++ * %pF output the name of a function pointer ++ * %pR output the address range in a struct resource ++ * %n is ignored ++ * ++ * The return value is the number of characters which would ++ * be generated for the given input, excluding the trailing ++ * '\0', as per ISO C99. If you want to have the exact ++ * number of characters written into @buf as return value ++ * (not including the trailing '\0'), use vscnprintf(). If the ++ * return is greater than or equal to @size, the resulting ++ * string is truncated. ++ */ ++int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) ++{ ++ unsigned long long num; ++ char *str, *end, c; ++ const char *args = (const char *)bin_buf; ++ ++ struct printf_spec spec = {0}; ++ ++ if (unlikely((int) size < 0)) { ++ /* There can be only one.. 
*/ ++ static char warn = 1; ++ WARN_ON(warn); ++ warn = 0; ++ return 0; ++ } ++ ++ str = buf; ++ end = buf + size; ++ ++#define get_arg(type) \ ++({ \ ++ typeof(type) value; \ ++ if (sizeof(type) == 8) { \ ++ args = PTR_ALIGN(args, sizeof(u32)); \ ++ *(u32 *)&value = *(u32 *)args; \ ++ *((u32 *)&value + 1) = *(u32 *)(args + 4); \ ++ } else { \ ++ args = PTR_ALIGN(args, sizeof(type)); \ ++ value = *(typeof(type) *)args; \ ++ } \ ++ args += sizeof(type); \ ++ value; \ ++}) ++ ++ /* Make sure end is always >= buf */ ++ if (end < buf) { ++ end = ((void *)-1); ++ size = end - buf; ++ } ++ ++ while (*fmt) { ++ int read; ++ const char *old_fmt = fmt; ++ ++ read = format_decode(fmt, &spec); ++ ++ fmt += read; ++ ++ switch (spec.type) { ++ case FORMAT_TYPE_NONE: { ++ int copy = read; ++ if (str < end) { ++ if (copy > end - str) ++ copy = end - str; ++ memcpy(str, old_fmt, copy); ++ } ++ str += read; ++ break; ++ } ++ ++ case FORMAT_TYPE_WIDTH: ++ spec.field_width = get_arg(int); ++ break; ++ ++ case FORMAT_TYPE_PRECISION: ++ spec.precision = get_arg(int); ++ break; ++ ++ case FORMAT_TYPE_CHAR: ++ if (!(spec.flags & LEFT)) { ++ while (--spec.field_width > 0) { ++ if (str < end) ++ *str = ' '; ++ ++str; ++ } ++ } ++ c = (unsigned char) get_arg(char); ++ if (str < end) ++ *str = c; ++ ++str; ++ while (--spec.field_width > 0) { ++ if (str < end) ++ *str = ' '; ++ ++str; ++ } ++ break; ++ ++ case FORMAT_TYPE_STR: { ++ const char *str_arg = args; ++ size_t len = strlen(str_arg); ++ args += len + 1; ++ str = string(str, end, (char *)str_arg, spec); ++ break; ++ } ++ ++ case FORMAT_TYPE_PTR: ++ str = pointer(fmt+1, str, end, get_arg(void *), spec); ++ while (isalnum(*fmt)) ++ fmt++; ++ break; ++ ++ case FORMAT_TYPE_PERCENT_CHAR: ++ if (str < end) ++ *str = '%'; ++ ++str; ++ break; ++ ++ case FORMAT_TYPE_INVALID: ++ if (str < end) ++ *str = '%'; ++ ++str; ++ if (*fmt) { ++ if (str < end) ++ *str = *fmt; ++ ++str; ++ } else { ++ --fmt; ++ } ++ break; ++ ++ case FORMAT_TYPE_NRCHARS: ++ /* skip */ ++ break; ++ ++ default: ++ switch (spec.type) { ++ ++ case FORMAT_TYPE_LONG_LONG: ++ num = get_arg(long long); ++ break; ++ case FORMAT_TYPE_ULONG: ++ num = get_arg(unsigned long); ++ break; ++ case FORMAT_TYPE_LONG: ++ num = get_arg(unsigned long); ++ break; ++ case FORMAT_TYPE_SIZE_T: ++ num = get_arg(size_t); ++ break; ++ case FORMAT_TYPE_PTRDIFF: ++ num = get_arg(ptrdiff_t); ++ break; ++ case FORMAT_TYPE_USHORT: ++ num = get_arg(unsigned short); ++ break; ++ case FORMAT_TYPE_SHORT: ++ num = get_arg(short); ++ break; ++ case FORMAT_TYPE_UINT: ++ num = get_arg(unsigned int); ++ break; ++ default: ++ num = get_arg(int); ++ } ++ ++ str = number(str, end, num, spec); ++ } ++ } ++ ++ if (size > 0) { ++ if (str < end) ++ *str = '\0'; ++ else ++ end[-1] = '\0'; ++ } ++ ++#undef get_arg ++ ++ /* the trailing null byte doesn't count towards the total */ ++ return str - buf; ++} ++EXPORT_SYMBOL_GPL(bstr_printf); ++ ++/** ++ * bprintf - Parse a format string and place args' binary value in a buffer ++ * @bin_buf: The buffer to place args' binary value ++ * @size: The size of the buffer(by words(32bits), not characters) ++ * @fmt: The format string to use ++ * @...: Arguments for the format string ++ * ++ * The function returns the number of words(u32) written ++ * into @bin_buf. ++ */ ++int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) 
++{ ++ va_list args; ++ int ret; ++ ++ va_start(args, fmt); ++ ret = vbin_printf(bin_buf, size, fmt, args); ++ va_end(args); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(bprintf); ++ ++#endif /* CONFIG_BINARY_PRINTF */ ++ + /** + * vsscanf - Unformat a buffer into a list of arguments + * @buf: input buffer +Index: linux-2.6-tip/lib/zlib_inflate/inflate.h +=================================================================== +--- linux-2.6-tip.orig/lib/zlib_inflate/inflate.h ++++ linux-2.6-tip/lib/zlib_inflate/inflate.h +@@ -1,3 +1,6 @@ ++#ifndef INFLATE_H ++#define INFLATE_H ++ + /* inflate.h -- internal inflate state definition + * Copyright (C) 1995-2004 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h +@@ -105,3 +108,4 @@ struct inflate_state { + unsigned short work[288]; /* work area for code table building */ + code codes[ENOUGH]; /* space for code tables */ + }; ++#endif +Index: linux-2.6-tip/lib/zlib_inflate/inftrees.h +=================================================================== +--- linux-2.6-tip.orig/lib/zlib_inflate/inftrees.h ++++ linux-2.6-tip/lib/zlib_inflate/inftrees.h +@@ -1,3 +1,6 @@ ++#ifndef INFTREES_H ++#define INFTREES_H ++ + /* inftrees.h -- header to use inftrees.c + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h +@@ -53,3 +56,4 @@ typedef enum { + extern int zlib_inflate_table (codetype type, unsigned short *lens, + unsigned codes, code **table, + unsigned *bits, unsigned short *work); ++#endif +Index: linux-2.6-tip/mm/Makefile +=================================================================== +--- linux-2.6-tip.orig/mm/Makefile ++++ linux-2.6-tip/mm/Makefile +@@ -26,10 +26,15 @@ obj-$(CONFIG_SLOB) += slob.o + obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o + obj-$(CONFIG_SLAB) += slab.o + obj-$(CONFIG_SLUB) += slub.o ++obj-$(CONFIG_KMEMCHECK) += kmemcheck.o + obj-$(CONFIG_FAILSLAB) += failslab.o + obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + obj-$(CONFIG_FS_XIP) += filemap_xip.o + obj-$(CONFIG_MIGRATION) += migrate.o ++ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA ++obj-$(CONFIG_SMP) += percpu.o ++else + obj-$(CONFIG_SMP) += allocpercpu.o ++endif + obj-$(CONFIG_QUICKLIST) += quicklist.o + obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +Index: linux-2.6-tip/mm/allocpercpu.c +=================================================================== +--- linux-2.6-tip.orig/mm/allocpercpu.c ++++ linux-2.6-tip/mm/allocpercpu.c +@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void * + __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) + + /** +- * percpu_alloc_mask - initial setup of per-cpu data ++ * alloc_percpu - initial setup of per-cpu data + * @size: size of per-cpu object +- * @gfp: may sleep or not etc. +- * @mask: populate per-data for cpu's selected through mask bits ++ * @align: alignment + * +- * Populating per-cpu data for all online cpu's would be a typical use case, +- * which is simplified by the percpu_alloc() wrapper. +- * Per-cpu objects are populated with zeroed buffers. ++ * Allocate dynamic percpu area. Percpu objects are populated with ++ * zeroed buffers. 
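
The allocpercpu.c rework around this point keeps the older per-CPU scheme: __alloc_percpu() builds an array with one pointer per possible CPU, populates each slot with a zeroed object, and returns a disguised pointer to that array. A loose userspace analog of the layout, with calloc() and a fixed NR_CPUS standing in for the kernel allocators (all names below are invented for the sketch):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

/* Allocate one zeroed object of 'size' bytes per CPU and return the
 * array of per-CPU pointers (the kernel additionally "disguises" it). */
static void **alloc_percpu_array(size_t size)
{
    void **pdata = calloc(NR_CPUS, sizeof(void *));
    int cpu;

    if (!pdata)
        return NULL;
    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        pdata[cpu] = calloc(1, size);
        if (!pdata[cpu])
            goto fail;
    }
    return pdata;
fail:
    while (cpu--)
        free(pdata[cpu]);
    free(pdata);
    return NULL;
}

static void free_percpu_array(void **pdata)
{
    int cpu;

    if (!pdata)
        return;
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        free(pdata[cpu]);
    free(pdata);
}

int main(void)
{
    void **counters = alloc_percpu_array(sizeof(long));

    if (!counters)
        return 1;
    *(long *)counters[2] += 1;  /* touch "CPU 2"'s copy */
    printf("cpu2 counter = %ld\n", *(long *)counters[2]);
    free_percpu_array(counters);
    return 0;
}
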
+ */ +-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) ++void *__alloc_percpu(size_t size, size_t align) + { + /* + * We allocate whole cache lines to avoid false sharing + */ + size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); +- void *pdata = kzalloc(sz, gfp); ++ void *pdata = kzalloc(sz, GFP_KERNEL); + void *__pdata = __percpu_disguise(pdata); + ++ /* ++ * Can't easily make larger alignment work with kmalloc. WARN ++ * on it. Larger alignment should only be used for module ++ * percpu sections on SMP for which this path isn't used. ++ */ ++ WARN_ON_ONCE(align > SMP_CACHE_BYTES); ++ + if (unlikely(!pdata)) + return NULL; +- if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) ++ if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, ++ &cpu_possible_map))) + return __pdata; + kfree(pdata); + return NULL; + } +-EXPORT_SYMBOL_GPL(__percpu_alloc_mask); ++EXPORT_SYMBOL_GPL(__alloc_percpu); + + /** +- * percpu_free - final cleanup of per-cpu data ++ * free_percpu - final cleanup of per-cpu data + * @__pdata: object to clean up + * + * We simply clean up any per-cpu object left. No need for the client to + * track and specify through a bis mask which per-cpu objects are to free. + */ +-void percpu_free(void *__pdata) ++void free_percpu(void *__pdata) + { + if (unlikely(!__pdata)) + return; + __percpu_depopulate_mask(__pdata, &cpu_possible_map); + kfree(__percpu_disguise(__pdata)); + } +-EXPORT_SYMBOL_GPL(percpu_free); ++EXPORT_SYMBOL_GPL(free_percpu); +Index: linux-2.6-tip/mm/bootmem.c +=================================================================== +--- linux-2.6-tip.orig/mm/bootmem.c ++++ linux-2.6-tip/mm/bootmem.c +@@ -318,6 +318,8 @@ static int __init mark_bootmem(unsigned + pos = bdata->node_low_pfn; + } + BUG(); ++ ++ return 0; + } + + /** +@@ -382,7 +384,6 @@ int __init reserve_bootmem_node(pg_data_ + return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); + } + +-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE + /** + * reserve_bootmem - mark a page range as usable + * @addr: starting address of the range +@@ -403,7 +404,6 @@ int __init reserve_bootmem(unsigned long + + return mark_bootmem(start, end, 1, flags); + } +-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ + + static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, + unsigned long step) +@@ -429,8 +429,8 @@ static unsigned long align_off(struct bo + } + + static void * __init alloc_bootmem_core(struct bootmem_data *bdata, +- unsigned long size, unsigned long align, +- unsigned long goal, unsigned long limit) ++ unsigned long size, unsigned long align, ++ unsigned long goal, unsigned long limit) + { + unsigned long fallback = 0; + unsigned long min, max, start, sidx, midx, step; +@@ -530,17 +530,34 @@ find_block: + return NULL; + } + ++static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, ++ unsigned long size, unsigned long align, ++ unsigned long goal, unsigned long limit) ++{ ++#ifdef CONFIG_HAVE_ARCH_BOOTMEM ++ bootmem_data_t *p_bdata; ++ ++ p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); ++ if (p_bdata) ++ return alloc_bootmem_core(p_bdata, size, align, goal, limit); ++#endif ++ return NULL; ++} ++ + static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) + { + bootmem_data_t *bdata; ++ void *region; + + restart: +- list_for_each_entry(bdata, &bdata_list, list) { +- void *region; ++ region = alloc_arch_preferred_bootmem(NULL, size, 
align, goal, limit); ++ if (region) ++ return region; + ++ list_for_each_entry(bdata, &bdata_list, list) { + if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) + continue; + if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) +@@ -618,6 +635,10 @@ static void * __init ___alloc_bootmem_no + { + void *ptr; + ++ ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); ++ if (ptr) ++ return ptr; ++ + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); + if (ptr) + return ptr; +@@ -674,6 +695,10 @@ void * __init __alloc_bootmem_node_nopan + { + void *ptr; + ++ ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); ++ if (ptr) ++ return ptr; ++ + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; +Index: linux-2.6-tip/mm/failslab.c +=================================================================== +--- linux-2.6-tip.orig/mm/failslab.c ++++ linux-2.6-tip/mm/failslab.c +@@ -1,4 +1,5 @@ + #include ++#include + + static struct { + struct fault_attr attr; +Index: linux-2.6-tip/mm/filemap.c +=================================================================== +--- linux-2.6-tip.orig/mm/filemap.c ++++ linux-2.6-tip/mm/filemap.c +@@ -1823,7 +1823,7 @@ static size_t __iovec_copy_from_user_ina + int copy = min(bytes, iov->iov_len - base); + + base = 0; +- left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); ++ left = __copy_from_user_inatomic(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; +@@ -1846,13 +1846,12 @@ size_t iov_iter_copy_from_user_atomic(st + char *kaddr; + size_t copied; + +- BUG_ON(!in_atomic()); ++// BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page, KM_USER0); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; +- left = __copy_from_user_inatomic_nocache(kaddr + offset, +- buf, bytes); ++ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, +@@ -1880,7 +1879,7 @@ size_t iov_iter_copy_from_user(struct pa + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; +- left = __copy_from_user_nocache(kaddr + offset, buf, bytes); ++ left = __copy_from_user(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, +Index: linux-2.6-tip/mm/kmemcheck.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/mm/kmemcheck.c +@@ -0,0 +1,122 @@ ++#include ++#include ++#include ++#include ++#include ++ ++void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) ++{ ++ struct page *shadow; ++ int pages; ++ int i; ++ ++ pages = 1 << order; ++ ++ /* ++ * With kmemcheck enabled, we need to allocate a memory area for the ++ * shadow bits as well. ++ */ ++ shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); ++ if (!shadow) { ++ if (printk_ratelimit()) ++ printk(KERN_ERR "kmemcheck: failed to allocate " ++ "shadow bitmap\n"); ++ return; ++ } ++ ++ for(i = 0; i < pages; ++i) ++ page[i].shadow = page_address(&shadow[i]); ++ ++ /* ++ * Mark it as non-present for the MMU so that our accesses to ++ * this memory will trigger a page fault and let us analyze ++ * the memory accesses. 
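
kmemcheck_alloc_shadow() above pairs every tracked page with a freshly allocated shadow page and then hides the data pages from the MMU so each access faults and can be analyzed. Below is a toy userspace sketch of the shadow bookkeeping only, with one shadow byte recording whether the matching data byte was initialized; the page-fault interception that does the real work is left out, and all names are invented.

#include <stdio.h>
#include <string.h>

#define TRACKED_SIZE 16

static unsigned char data[TRACKED_SIZE];
static unsigned char shadow[TRACKED_SIZE];  /* 0 = uninitialized, 1 = initialized */

static void mark_initialized(size_t off, size_t len)
{
    memset(shadow + off, 1, len);
}

/* Report a read of an uninitialized byte, as kmemcheck would. */
static unsigned char checked_read(size_t off)
{
    if (!shadow[off])
        fprintf(stderr, "warning: read of uninitialized byte %zu\n", off);
    return data[off];
}

int main(void)
{
    data[0] = 42;
    mark_initialized(0, 1);

    checked_read(0);    /* fine: byte 0 was written first */
    checked_read(5);    /* warns: byte 5 never initialized */
    return 0;
}
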
++ */ ++ kmemcheck_hide_pages(page, pages); ++} ++ ++void kmemcheck_free_shadow(struct page *page, int order) ++{ ++ struct page *shadow; ++ int pages; ++ int i; ++ ++ if (!kmemcheck_page_is_tracked(page)) ++ return; ++ ++ pages = 1 << order; ++ ++ kmemcheck_show_pages(page, pages); ++ ++ shadow = virt_to_page(page[0].shadow); ++ ++ for(i = 0; i < pages; ++i) ++ page[i].shadow = NULL; ++ ++ __free_pages(shadow, order); ++} ++ ++void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, ++ size_t size) ++{ ++ /* ++ * Has already been memset(), which initializes the shadow for us ++ * as well. ++ */ ++ if (gfpflags & __GFP_ZERO) ++ return; ++ ++ /* No need to initialize the shadow of a non-tracked slab. */ ++ if (s->flags & SLAB_NOTRACK) ++ return; ++ ++ if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { ++ /* ++ * Allow notracked objects to be allocated from ++ * tracked caches. Note however that these objects ++ * will still get page faults on access, they just ++ * won't ever be flagged as uninitialized. If page ++ * faults are not acceptable, the slab cache itself ++ * should be marked NOTRACK. ++ */ ++ kmemcheck_mark_initialized(object, size); ++ } else if (!s->ctor) { ++ /* ++ * New objects should be marked uninitialized before ++ * they're returned to the called. ++ */ ++ kmemcheck_mark_uninitialized(object, size); ++ } ++} ++ ++void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) ++{ ++ /* TODO: RCU freeing is unsupported for now; hide false positives. */ ++ if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) ++ kmemcheck_mark_freed(object, size); ++} ++ ++void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, ++ gfp_t gfpflags) ++{ ++ int pages; ++ ++ if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) ++ return; ++ ++ pages = 1 << order; ++ ++ /* ++ * NOTE: We choose to track GFP_ZERO pages too; in fact, they ++ * can become uninitialized by copying uninitialized memory ++ * into them. ++ */ ++ ++ /* XXX: Can use zone->node for node? */ ++ kmemcheck_alloc_shadow(page, order, gfpflags, -1); ++ ++ if (gfpflags & __GFP_ZERO) ++ kmemcheck_mark_initialized_pages(page, pages); ++ else ++ kmemcheck_mark_uninitialized_pages(page, pages); ++} +Index: linux-2.6-tip/mm/memory.c +=================================================================== +--- linux-2.6-tip.orig/mm/memory.c ++++ linux-2.6-tip/mm/memory.c +@@ -48,6 +48,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -922,10 +924,13 @@ static unsigned long unmap_page_range(st + return addr; + } + +-#ifdef CONFIG_PREEMPT ++#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT) + # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) + #else +-/* No preempt: go for improved straight-line efficiency */ ++/* ++ * No preempt: go for improved straight-line efficiency ++ * on PREEMPT_RT this is not a critical latency-path. ++ */ + # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) + #endif + +@@ -955,17 +960,14 @@ static unsigned long unmap_page_range(st + * ensure that any thus-far unmapped pages are flushed before unmap_vmas() + * drops the lock and schedules. 
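
For scale, assuming 4 KiB pages, the preemptible setting of 8 * PAGE_SIZE lets unmap_vmas() cover roughly 32 KiB of address space between rescheduling checks, while the 1024 * PAGE_SIZE value used otherwise (including, per the new comment, PREEMPT_RT) batches roughly 4 MiB at a time.
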
+ */ +-unsigned long unmap_vmas(struct mmu_gather **tlbp, ++unsigned long unmap_vmas(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted, + struct zap_details *details) + { + long zap_work = ZAP_BLOCK_SIZE; +- unsigned long tlb_start = 0; /* For tlb_finish_mmu */ +- int tlb_start_valid = 0; + unsigned long start = start_addr; + spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; +- int fullmm = (*tlbp)->fullmm; + struct mm_struct *mm = vma->vm_mm; + + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); +@@ -986,11 +988,6 @@ unsigned long unmap_vmas(struct mmu_gath + untrack_pfn_vma(vma, 0, 0); + + while (start != end) { +- if (!tlb_start_valid) { +- tlb_start = start; +- tlb_start_valid = 1; +- } +- + if (unlikely(is_vm_hugetlb_page(vma))) { + /* + * It is undesirable to test vma->vm_file as it +@@ -1011,7 +1008,7 @@ unsigned long unmap_vmas(struct mmu_gath + + start = end; + } else +- start = unmap_page_range(*tlbp, vma, ++ start = unmap_page_range(tlb, vma, + start, end, &zap_work, details); + + if (zap_work > 0) { +@@ -1019,19 +1016,13 @@ unsigned long unmap_vmas(struct mmu_gath + break; + } + +- tlb_finish_mmu(*tlbp, tlb_start, start); +- + if (need_resched() || + (i_mmap_lock && spin_needbreak(i_mmap_lock))) { +- if (i_mmap_lock) { +- *tlbp = NULL; ++ if (i_mmap_lock) + goto out; +- } + cond_resched(); + } + +- *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); +- tlb_start_valid = 0; + zap_work = ZAP_BLOCK_SIZE; + } + } +@@ -1051,16 +1042,15 @@ unsigned long zap_page_range(struct vm_a + unsigned long size, struct zap_details *details) + { + struct mm_struct *mm = vma->vm_mm; +- struct mmu_gather *tlb; ++ struct mmu_gather tlb; + unsigned long end = address + size; + unsigned long nr_accounted = 0; + + lru_add_drain(); +- tlb = tlb_gather_mmu(mm, 0); ++ tlb_gather_mmu(&tlb, mm, 0); + update_hiwater_rss(mm); + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); +- if (tlb) +- tlb_finish_mmu(tlb, address, end); ++ tlb_finish_mmu(&tlb, address, end); + return end; + } + +@@ -1667,7 +1657,7 @@ int remap_pfn_range(struct vm_area_struc + */ + if (addr == vma->vm_start && end == vma->vm_end) { + vma->vm_pgoff = pfn; +- vma->vm_flags |= VM_PFNMAP_AT_MMAP; ++ vma->vm_flags |= VM_PFN_AT_MMAP; + } else if (is_cow_mapping(vma->vm_flags)) + return -EINVAL; + +@@ -1680,7 +1670,7 @@ int remap_pfn_range(struct vm_area_struc + * needed from higher level routine calling unmap_vmas + */ + vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); +- vma->vm_flags &= ~VM_PFNMAP_AT_MMAP; ++ vma->vm_flags &= ~VM_PFN_AT_MMAP; + return -EINVAL; + } + +@@ -2902,6 +2892,28 @@ unlock: + return 0; + } + ++void pagefault_disable(void) ++{ ++ current->pagefault_disabled++; ++ /* ++ * make sure to have issued the store before a pagefault ++ * can hit. ++ */ ++ barrier(); ++} ++EXPORT_SYMBOL(pagefault_disable); ++ ++void pagefault_enable(void) ++{ ++ /* ++ * make sure to issue those last loads/stores before enabling ++ * the pagefault handler again. 
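
pagefault_disable()/pagefault_enable() above only increment and decrement current->pagefault_disabled, with barrier() on the inner side so the compiler cannot reorder the counter update past the code it protects. A userspace analog using a thread-local counter and a compiler-only barrier; the macro and variable names here are invented for the sketch.

#include <stdio.h>

/* Compiler-only barrier, same idea as the kernel's barrier(). */
#define compiler_barrier() __asm__ __volatile__("" ::: "memory")

static __thread int faults_disabled;

static void fault_disable(void)
{
    faults_disabled++;
    /* keep the compiler from sinking the increment below the protected code */
    compiler_barrier();
}

static void fault_enable(void)
{
    /* keep the compiler from hoisting protected code below the decrement */
    compiler_barrier();
    faults_disabled--;
}

int main(void)
{
    fault_disable();
    printf("disabled count: %d\n", faults_disabled);   /* 1 */
    fault_enable();
    printf("disabled count: %d\n", faults_disabled);   /* 0 */
    return 0;
}
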
++ */ ++ barrier(); ++ current->pagefault_disabled--; ++} ++EXPORT_SYMBOL(pagefault_enable); ++ + /* + * By the time we get here, we already hold the mm semaphore + */ +Index: linux-2.6-tip/mm/mempolicy.c +=================================================================== +--- linux-2.6-tip.orig/mm/mempolicy.c ++++ linux-2.6-tip/mm/mempolicy.c +@@ -1421,7 +1421,7 @@ unsigned slab_node(struct mempolicy *pol + } + + default: +- BUG(); ++ panic("slab_node: bad policy mode!"); + } + } + +Index: linux-2.6-tip/mm/page_alloc.c +=================================================================== +--- linux-2.6-tip.orig/mm/page_alloc.c ++++ linux-2.6-tip/mm/page_alloc.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -162,6 +163,53 @@ static unsigned long __meminitdata dma_r + EXPORT_SYMBOL(movable_zone); + #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + ++#ifdef CONFIG_PREEMPT_RT ++static DEFINE_PER_CPU_LOCKED(int, pcp_locks); ++#endif ++ ++static inline void __lock_cpu_pcp(unsigned long *flags, int cpu) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ spin_lock(&__get_cpu_lock(pcp_locks, cpu)); ++ flags = 0; ++#else ++ local_irq_save(*flags); ++#endif ++} ++ ++static inline void lock_cpu_pcp(unsigned long *flags, int *this_cpu) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ (void)get_cpu_var_locked(pcp_locks, this_cpu); ++ flags = 0; ++#else ++ local_irq_save(*flags); ++ *this_cpu = smp_processor_id(); ++#endif ++} ++ ++static inline void unlock_cpu_pcp(unsigned long flags, int this_cpu) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ put_cpu_var_locked(pcp_locks, this_cpu); ++#else ++ local_irq_restore(flags); ++#endif ++} ++ ++static struct per_cpu_pageset * ++get_zone_pcp(struct zone *zone, unsigned long *flags, int *this_cpu) ++{ ++ lock_cpu_pcp(flags, this_cpu); ++ return zone_pcp(zone, *this_cpu); ++} ++ ++static void ++put_zone_pcp(struct zone *zone, unsigned long flags, int this_cpu) ++{ ++ unlock_cpu_pcp(flags, this_cpu); ++} ++ + #if MAX_NUMNODES > 1 + int nr_node_ids __read_mostly = MAX_NUMNODES; + EXPORT_SYMBOL(nr_node_ids); +@@ -516,38 +564,45 @@ static inline int free_pages_check(struc + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. 
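
lock_cpu_pcp()/unlock_cpu_pcp() above replace local_irq_save() with a per-CPU spinlock when PREEMPT_RT is enabled, so the per-CPU page lists stay serialized while the section remains preemptible. A very loose userspace analog of the "one lock per CPU, take the lock of the CPU you are on" pattern, with pthread mutexes and sched_getcpu() standing in for the kernel primitives (build with -pthread; names invented):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NR_CPUS 8

static pthread_mutex_t pcp_locks[NR_CPUS];
static long pcp_count[NR_CPUS];

/* Serialize against other users of this CPU's data while staying
 * preemptible: take only the lock that belongs to the current CPU. */
static int lock_cpu(void)
{
    int cpu = sched_getcpu();

    if (cpu < 0 || cpu >= NR_CPUS)
        cpu = 0;            /* fall back for the sketch */
    pthread_mutex_lock(&pcp_locks[cpu]);
    return cpu;
}

static void unlock_cpu(int cpu)
{
    pthread_mutex_unlock(&pcp_locks[cpu]);
}

int main(void)
{
    int i, cpu;

    for (i = 0; i < NR_CPUS; i++)
        pthread_mutex_init(&pcp_locks[i], NULL);

    cpu = lock_cpu();
    pcp_count[cpu]++;
    printf("cpu %d count %ld\n", cpu, pcp_count[cpu]);
    unlock_cpu(cpu);
    return 0;
}
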
+ */ +-static void free_pages_bulk(struct zone *zone, int count, +- struct list_head *list, int order) ++static void ++free_pages_bulk(struct zone *zone, struct list_head *list, int order) + { +- spin_lock(&zone->lock); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&zone->lock, flags); + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->pages_scanned = 0; +- while (count--) { +- struct page *page; + +- VM_BUG_ON(list_empty(list)); +- page = list_entry(list->prev, struct page, lru); +- /* have to delete it as __free_one_page list manipulates */ ++ while (!list_empty(list)) { ++ struct page *page = list_first_entry(list, struct page, lru); ++ + list_del(&page->lru); + __free_one_page(page, zone, order); ++#ifdef CONFIG_PREEMPT_RT ++ cond_resched_lock(&zone->lock); ++#endif + } +- spin_unlock(&zone->lock); ++ spin_unlock_irqrestore(&zone->lock, flags); + } + + static void free_one_page(struct zone *zone, struct page *page, int order) + { +- spin_lock(&zone->lock); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&zone->lock, flags); ++ + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->pages_scanned = 0; + __free_one_page(page, zone, order); +- spin_unlock(&zone->lock); ++ spin_unlock_irqrestore(&zone->lock, flags); + } + + static void __free_pages_ok(struct page *page, unsigned int order) + { + unsigned long flags; +- int i; +- int bad = 0; ++ int i, this_cpu, bad = 0; ++ ++ kmemcheck_free_shadow(page, order); + + for (i = 0 ; i < (1 << order) ; ++i) + bad += free_pages_check(page + i); +@@ -562,10 +617,10 @@ static void __free_pages_ok(struct page + arch_free_page(page, order); + kernel_map_pages(page, 1 << order, 0); + +- local_irq_save(flags); +- __count_vm_events(PGFREE, 1 << order); ++ lock_cpu_pcp(&flags, &this_cpu); ++ count_vm_events(PGFREE, 1 << order); ++ unlock_cpu_pcp(flags, this_cpu); + free_one_page(page_zone(page), page, order); +- local_irq_restore(flags); + } + + /* +@@ -885,6 +940,16 @@ static int rmqueue_bulk(struct zone *zon + return i; + } + ++static void ++isolate_pcp_pages(int count, struct list_head *src, struct list_head *dst) ++{ ++ while (count--) { ++ struct page *page = list_last_entry(src, struct page, lru); ++ list_move(&page->lru, dst); ++ } ++} ++ ++ + #ifdef CONFIG_NUMA + /* + * Called from the vmstat counter updater to drain pagesets of this +@@ -896,17 +961,20 @@ static int rmqueue_bulk(struct zone *zon + */ + void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) + { ++ LIST_HEAD(free_list); + unsigned long flags; + int to_drain; ++ int this_cpu; + +- local_irq_save(flags); ++ lock_cpu_pcp(&flags, &this_cpu); + if (pcp->count >= pcp->batch) + to_drain = pcp->batch; + else + to_drain = pcp->count; +- free_pages_bulk(zone, to_drain, &pcp->list, 0); ++ isolate_pcp_pages(to_drain, &pcp->list, &free_list); + pcp->count -= to_drain; +- local_irq_restore(flags); ++ unlock_cpu_pcp(flags, this_cpu); ++ free_pages_bulk(zone, &free_list, 0); + } + #endif + +@@ -925,17 +993,23 @@ static void drain_pages(unsigned int cpu + for_each_zone(zone) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; ++ LIST_HEAD(free_list); + + if (!populated_zone(zone)) + continue; + ++ __lock_cpu_pcp(&flags, cpu); + pset = zone_pcp(zone, cpu); +- ++ if (!pset) { ++ unlock_cpu_pcp(flags, cpu); ++ WARN_ON(1); ++ continue; ++ } + pcp = &pset->pcp; +- local_irq_save(flags); +- free_pages_bulk(zone, pcp->count, &pcp->list, 0); ++ isolate_pcp_pages(pcp->count, &pcp->list, &free_list); + pcp->count = 0; +- local_irq_restore(flags); ++ unlock_cpu_pcp(flags, 
cpu); ++ free_pages_bulk(zone, &free_list, 0); + } + } + +@@ -947,12 +1021,52 @@ void drain_local_pages(void *arg) + drain_pages(smp_processor_id()); + } + ++#ifdef CONFIG_PREEMPT_RT ++static void drain_local_pages_work(struct work_struct *wrk) ++{ ++ drain_pages(smp_processor_id()); ++} ++#endif ++ + /* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator + */ + void drain_all_pages(void) + { ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * HACK!!!!! ++ * For RT we can't use IPIs to run drain_local_pages, since ++ * that code will call spin_locks that will now sleep. ++ * But, schedule_on_each_cpu will call kzalloc, which will ++ * call page_alloc which was what calls this. ++ * ++ * Luckily, there's a condition to get here, and that is if ++ * the order passed in to alloc_pages is greater than 0 ++ * (alloced more than a page size). The slabs only allocate ++ * what is needed, and the allocation made by schedule_on_each_cpu ++ * does an alloc of "sizeof(void *)*nr_cpu_ids". ++ * ++ * So we can safely call schedule_on_each_cpu if that number ++ * is less than a page. Otherwise don't bother. At least warn of ++ * this issue. ++ * ++ * And yes, this is one big hack. Please fix ;-) ++ */ ++ if (sizeof(void *)*nr_cpu_ids < PAGE_SIZE) ++ schedule_on_each_cpu(drain_local_pages_work); ++ else { ++ static int once; ++ if (!once) { ++ printk(KERN_ERR "Can't drain all CPUS due to possible recursion\n"); ++ once = 1; ++ } ++ drain_local_pages(NULL); ++ } ++ ++#else + on_each_cpu(drain_local_pages, NULL, 1); ++#endif + } + + #ifdef CONFIG_HIBERNATION +@@ -997,8 +1111,12 @@ void mark_free_pages(struct zone *zone) + static void free_hot_cold_page(struct page *page, int cold) + { + struct zone *zone = page_zone(page); ++ struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + unsigned long flags; ++ int this_cpu; ++ ++ kmemcheck_free_shadow(page, 0); + + if (PageAnon(page)) + page->mapping = NULL; +@@ -1012,9 +1130,11 @@ static void free_hot_cold_page(struct pa + arch_free_page(page, 0); + kernel_map_pages(page, 1, 0); + +- pcp = &zone_pcp(zone, get_cpu())->pcp; +- local_irq_save(flags); +- __count_vm_event(PGFREE); ++ pset = get_zone_pcp(zone, &flags, &this_cpu); ++ pcp = &pset->pcp; ++ ++ count_vm_event(PGFREE); ++ + if (cold) + list_add_tail(&page->lru, &pcp->list); + else +@@ -1022,11 +1142,14 @@ static void free_hot_cold_page(struct pa + set_page_private(page, get_pageblock_migratetype(page)); + pcp->count++; + if (pcp->count >= pcp->high) { +- free_pages_bulk(zone, pcp->batch, &pcp->list, 0); ++ LIST_HEAD(free_list); ++ ++ isolate_pcp_pages(pcp->batch, &pcp->list, &free_list); + pcp->count -= pcp->batch; +- } +- local_irq_restore(flags); +- put_cpu(); ++ put_zone_pcp(zone, flags, this_cpu); ++ free_pages_bulk(zone, &free_list, 0); ++ } else ++ put_zone_pcp(zone, flags, this_cpu); + } + + void free_hot_page(struct page *page) +@@ -1053,6 +1176,16 @@ void split_page(struct page *page, unsig + + VM_BUG_ON(PageCompound(page)); + VM_BUG_ON(!page_count(page)); ++ ++#ifdef CONFIG_KMEMCHECK ++ /* ++ * Split shadow pages too, because free(page[0]) would ++ * otherwise free the whole shadow. 
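
free_hot_cold_page() and the drain paths above share one pattern: while the per-CPU lock is held, pages are only detached onto a private list via isolate_pcp_pages(), and free_pages_bulk() does the heavier buddy work after that lock is dropped. A small sketch of the same "detach under the lock, process outside it" idea with a plain singly linked list (userspace stand-in, not the kernel list API):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
    struct node *next;
    int val;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pending;    /* protected by list_lock */

static void add_pending(int val)
{
    struct node *n = malloc(sizeof(*n));

    if (!n)
        return;
    n->val = val;
    pthread_mutex_lock(&list_lock);
    n->next = pending;
    pending = n;
    pthread_mutex_unlock(&list_lock);
}

static void drain_pending(void)
{
    struct node *batch;

    /* detach the whole list while holding the lock... */
    pthread_mutex_lock(&list_lock);
    batch = pending;
    pending = NULL;
    pthread_mutex_unlock(&list_lock);

    /* ...and do the slow work after dropping it */
    while (batch) {
        struct node *next = batch->next;

        printf("freeing %d\n", batch->val);
        free(batch);
        batch = next;
    }
}

int main(void)
{
    add_pending(1);
    add_pending(2);
    drain_pending();
    return 0;
}
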
++ */ ++ if (kmemcheck_page_is_tracked(page)) ++ split_page(virt_to_page(page[0].shadow), order); ++#endif ++ + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); + } +@@ -1068,16 +1201,15 @@ static struct page *buffered_rmqueue(str + unsigned long flags; + struct page *page; + int cold = !!(gfp_flags & __GFP_COLD); +- int cpu; ++ struct per_cpu_pageset *pset; + int migratetype = allocflags_to_migratetype(gfp_flags); ++ int this_cpu; + + again: +- cpu = get_cpu(); ++ pset = get_zone_pcp(zone, &flags, &this_cpu); + if (likely(order == 0)) { +- struct per_cpu_pages *pcp; ++ struct per_cpu_pages *pcp = &pset->pcp; + +- pcp = &zone_pcp(zone, cpu)->pcp; +- local_irq_save(flags); + if (!pcp->count) { + pcp->count = rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list, migratetype); +@@ -1106,7 +1238,7 @@ again: + list_del(&page->lru); + pcp->count--; + } else { +- spin_lock_irqsave(&zone->lock, flags); ++ spin_lock(&zone->lock); + page = __rmqueue(zone, order, migratetype); + spin_unlock(&zone->lock); + if (!page) +@@ -1115,8 +1247,7 @@ again: + + __count_zone_vm_events(PGALLOC, zone, 1 << order); + zone_statistics(preferred_zone, zone); +- local_irq_restore(flags); +- put_cpu(); ++ put_zone_pcp(zone, flags, this_cpu); + + VM_BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order, gfp_flags)) +@@ -1124,8 +1255,7 @@ again: + return page; + + failed: +- local_irq_restore(flags); +- put_cpu(); ++ put_zone_pcp(zone, flags, this_cpu); + return NULL; + } + +@@ -1479,6 +1609,8 @@ __alloc_pages_internal(gfp_t gfp_mask, u + unsigned long did_some_progress; + unsigned long pages_reclaimed = 0; + ++ lockdep_trace_alloc(gfp_mask); ++ + might_sleep_if(wait); + + if (should_fail_alloc_page(gfp_mask, order)) +@@ -1578,12 +1710,15 @@ nofail_alloc: + */ + cpuset_update_task_memory_state(); + p->flags |= PF_MEMALLOC; ++ ++ lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + + p->reclaim_state = NULL; ++ lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; + + cond_resched(); +@@ -1667,7 +1802,10 @@ nopage: + dump_stack(); + show_mem(); + } ++ return page; + got_pg: ++ if (kmemcheck_enabled) ++ kmemcheck_pagealloc_alloc(page, order, gfp_mask); + return page; + } + EXPORT_SYMBOL(__alloc_pages_internal); +@@ -2134,7 +2272,7 @@ static int find_next_best_node(int node, + int n, val; + int min_val = INT_MAX; + int best_node = -1; +- node_to_cpumask_ptr(tmp, 0); ++ const struct cpumask *tmp = cpumask_of_node(0); + + /* Use the local node if we haven't already */ + if (!node_isset(node, *used_node_mask)) { +@@ -2155,8 +2293,8 @@ static int find_next_best_node(int node, + val += (n < node); + + /* Give preference to headless and unused nodes */ +- node_to_cpumask_ptr_next(tmp, n); +- if (!cpus_empty(*tmp)) ++ tmp = cpumask_of_node(n); ++ if (!cpumask_empty(tmp)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ +@@ -2814,12 +2952,27 @@ static inline void free_zone_pagesets(in + struct zone *zone; + + for_each_zone(zone) { +- struct per_cpu_pageset *pset = zone_pcp(zone, cpu); ++ unsigned long flags; ++ struct per_cpu_pageset *pset; ++ ++ /* ++ * On PREEMPT_RT the allocator is preemptible, therefore ++ * kstopmachine can preempt a process in the middle of an ++ * allocation, freeing the pset underneath such a process ++ * isn't a good idea. 
++ * ++ * Take the per-cpu pcp lock to allow the task to complete ++ * before we free it. New tasks will be held off by the ++ * cpu_online() check in get_cpu_var_locked(). ++ */ ++ __lock_cpu_pcp(&flags, cpu); ++ pset = zone_pcp(zone, cpu); ++ zone_pcp(zone, cpu) = NULL; ++ unlock_cpu_pcp(flags, cpu); + + /* Free per_cpu_pageset if it is slab allocated */ + if (pset != &boot_pageset[cpu]) + kfree(pset); +- zone_pcp(zone, cpu) = NULL; + } + } + +Index: linux-2.6-tip/mm/percpu.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/mm/percpu.c +@@ -0,0 +1,1326 @@ ++/* ++ * linux/mm/percpu.c - percpu memory allocator ++ * ++ * Copyright (C) 2009 SUSE Linux Products GmbH ++ * Copyright (C) 2009 Tejun Heo ++ * ++ * This file is released under the GPLv2. ++ * ++ * This is percpu allocator which can handle both static and dynamic ++ * areas. Percpu areas are allocated in chunks in vmalloc area. Each ++ * chunk is consisted of num_possible_cpus() units and the first chunk ++ * is used for static percpu variables in the kernel image (special ++ * boot time alloc/init handling necessary as these areas need to be ++ * brought up before allocation services are running). Unit grows as ++ * necessary and all units grow or shrink in unison. When a chunk is ++ * filled up, another chunk is allocated. ie. in vmalloc area ++ * ++ * c0 c1 c2 ++ * ------------------- ------------------- ------------ ++ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u ++ * ------------------- ...... ------------------- .... ------------ ++ * ++ * Allocation is done in offset-size areas of single unit space. Ie, ++ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, ++ * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring ++ * percpu base registers UNIT_SIZE apart. ++ * ++ * There are usually many small percpu allocations many of them as ++ * small as 4 bytes. The allocator organizes chunks into lists ++ * according to free size and tries to allocate from the fullest one. ++ * Each chunk keeps the maximum contiguous area size hint which is ++ * guaranteed to be eqaul to or larger than the maximum contiguous ++ * area in the chunk. This helps the allocator not to iterate the ++ * chunk maps unnecessarily. ++ * ++ * Allocation state in each chunk is kept using an array of integers ++ * on chunk->map. A positive value in the map represents a free ++ * region and negative allocated. Allocation inside a chunk is done ++ * by scanning this map sequentially and serving the first matching ++ * entry. This is mostly copied from the percpu_modalloc() allocator. ++ * Chunks are also linked into a rb tree to ease address to chunk ++ * mapping during free. ++ * ++ * To use this allocator, arch code should do the followings. 
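
The header comment above lays out chunks holding one equally sized unit per possible CPU, so CPU c's copy of an area allocated at offset off sits at chunk_base + c * unit_size + off. A tiny worked sketch of that address arithmetic, with calloc() playing the role of the vmalloc-backed chunk and the sizes chosen only for the example:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS   4
#define UNIT_SIZE 4096    /* bytes reserved per CPU in one chunk */

/* CPU 'cpu' sees the object allocated at 'offset' here. */
static void *per_cpu_ptr(void *chunk_base, int cpu, size_t offset)
{
    return (char *)chunk_base + cpu * UNIT_SIZE + offset;
}

int main(void)
{
    void *chunk = calloc(NR_CPUS, UNIT_SIZE);
    size_t offset = 512;   /* pretend the allocator handed out this area */
    int cpu;

    if (!chunk)
        return 1;

    /* each CPU's copy is UNIT_SIZE apart from the next one */
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        *(long *)per_cpu_ptr(chunk, cpu, offset) = cpu * 10;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu%d copy at +%zu bytes: %ld\n", cpu,
               cpu * (size_t)UNIT_SIZE + offset,
               *(long *)per_cpu_ptr(chunk, cpu, offset));

    free(chunk);
    return 0;
}
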
++ * ++ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA ++ * ++ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate ++ * regular address to percpu pointer and back if they need to be ++ * different from the default ++ * ++ * - use pcpu_setup_first_chunk() during percpu area initialization to ++ * setup the first chunk containing the kernel static percpu area ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ ++#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ ++ ++/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ ++#ifndef __addr_to_pcpu_ptr ++#define __addr_to_pcpu_ptr(addr) \ ++ (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ ++ + (unsigned long)__per_cpu_start) ++#endif ++#ifndef __pcpu_ptr_to_addr ++#define __pcpu_ptr_to_addr(ptr) \ ++ (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ ++ - (unsigned long)__per_cpu_start) ++#endif ++ ++struct pcpu_chunk { ++ struct list_head list; /* linked to pcpu_slot lists */ ++ struct rb_node rb_node; /* key is chunk->vm->addr */ ++ int free_size; /* free bytes in the chunk */ ++ int contig_hint; /* max contiguous size hint */ ++ struct vm_struct *vm; /* mapped vmalloc region */ ++ int map_used; /* # of map entries used */ ++ int map_alloc; /* # of map entries allocated */ ++ int *map; /* allocation map */ ++ bool immutable; /* no [de]population allowed */ ++ struct page **page; /* points to page array */ ++ struct page *page_ar[]; /* #cpus * UNIT_PAGES */ ++}; ++ ++static int pcpu_unit_pages __read_mostly; ++static int pcpu_unit_size __read_mostly; ++static int pcpu_chunk_size __read_mostly; ++static int pcpu_nr_slots __read_mostly; ++static size_t pcpu_chunk_struct_size __read_mostly; ++ ++/* the address of the first chunk which starts with the kernel static area */ ++void *pcpu_base_addr __read_mostly; ++EXPORT_SYMBOL_GPL(pcpu_base_addr); ++ ++/* optional reserved chunk, only accessible for reserved allocations */ ++static struct pcpu_chunk *pcpu_reserved_chunk; ++/* offset limit of the reserved chunk */ ++static int pcpu_reserved_chunk_limit; ++ ++/* ++ * Synchronization rules. ++ * ++ * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former ++ * protects allocation/reclaim paths, chunks and chunk->page arrays. ++ * The latter is a spinlock and protects the index data structures - ++ * chunk slots, rbtree, chunks and area maps in chunks. ++ * ++ * During allocation, pcpu_alloc_mutex is kept locked all the time and ++ * pcpu_lock is grabbed and released as necessary. All actual memory ++ * allocations are done using GFP_KERNEL with pcpu_lock released. ++ * ++ * Free path accesses and alters only the index data structures, so it ++ * can be safely called from atomic context. When memory needs to be ++ * returned to the system, free path schedules reclaim_work which ++ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be ++ * reclaimed, release both locks and frees the chunks. Note that it's ++ * necessary to grab both locks to remove a chunk from circulation as ++ * allocation path might be referencing the chunk with only ++ * pcpu_alloc_mutex locked. 
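The header comment above describes a chunk as one unit per possible CPU laid out back to back, with __addr_to_pcpu_ptr()/__pcpu_ptr_to_addr() translating between canonical percpu pointers and unit addresses. The stand-alone C sketch below only models that addressing scheme (base + cpu * unit_size + offset); the CPU count, unit size and offset are made-up demo values, not anything taken from this patch.

/*
 * Toy model of the chunk/unit addressing described above: a chunk holds
 * one unit per CPU, and an allocation at offset `off' occupies that same
 * offset inside every CPU's unit.
 * Build: cc -o pcpu-layout pcpu-layout.c && ./pcpu-layout
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS    4            /* assumed CPU count for the demo */
#define UNIT_SIZE  (16 * 1024)  /* assumed unit size: 16K per CPU  */

static char *chunk_base;        /* plays the role of chunk->vm->addr */

/* address of @off inside @cpu's unit: base + cpu * UNIT_SIZE + off */
static void *pcpu_addr(unsigned int cpu, size_t off)
{
        return chunk_base + (size_t)cpu * UNIT_SIZE + off;
}

int main(void)
{
        size_t off = 512;       /* pretend the allocator returned offset 512 */
        unsigned int cpu;

        chunk_base = calloc(NR_CPUS, UNIT_SIZE);
        if (!chunk_base)
                return 1;

        /* each CPU sees its own copy of the area at the same offset */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                sprintf(pcpu_addr(cpu, off), "cpu%u's private data", cpu);

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %u, offset %zu -> \"%s\"\n",
                       cpu, off, (char *)pcpu_addr(cpu, off));

        free(chunk_base);
        return 0;
}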
++ */ ++static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ ++static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ ++ ++static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ ++static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ ++ ++/* reclaim work to release fully free chunks, scheduled from free path */ ++static void pcpu_reclaim(struct work_struct *work); ++static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); ++ ++static int __pcpu_size_to_slot(int size) ++{ ++ int highbit = fls(size); /* size is in bytes */ ++ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); ++} ++ ++static int pcpu_size_to_slot(int size) ++{ ++ if (size == pcpu_unit_size) ++ return pcpu_nr_slots - 1; ++ return __pcpu_size_to_slot(size); ++} ++ ++static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) ++{ ++ if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) ++ return 0; ++ ++ return pcpu_size_to_slot(chunk->free_size); ++} ++ ++static int pcpu_page_idx(unsigned int cpu, int page_idx) ++{ ++ return cpu * pcpu_unit_pages + page_idx; ++} ++ ++static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, ++ unsigned int cpu, int page_idx) ++{ ++ return &chunk->page[pcpu_page_idx(cpu, page_idx)]; ++} ++ ++static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, ++ unsigned int cpu, int page_idx) ++{ ++ return (unsigned long)chunk->vm->addr + ++ (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); ++} ++ ++static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, ++ int page_idx) ++{ ++ return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; ++} ++ ++/** ++ * pcpu_mem_alloc - allocate memory ++ * @size: bytes to allocate ++ * ++ * Allocate @size bytes. If @size is smaller than PAGE_SIZE, ++ * kzalloc() is used; otherwise, vmalloc() is used. The returned ++ * memory is always zeroed. ++ * ++ * CONTEXT: ++ * Does GFP_KERNEL allocation. ++ * ++ * RETURNS: ++ * Pointer to the allocated area on success, NULL on failure. ++ */ ++static void *pcpu_mem_alloc(size_t size) ++{ ++ if (size <= PAGE_SIZE) ++ return kzalloc(size, GFP_KERNEL); ++ else { ++ void *ptr = vmalloc(size); ++ if (ptr) ++ memset(ptr, 0, size); ++ return ptr; ++ } ++} ++ ++/** ++ * pcpu_mem_free - free memory ++ * @ptr: memory to free ++ * @size: size of the area ++ * ++ * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). ++ */ ++static void pcpu_mem_free(void *ptr, size_t size) ++{ ++ if (size <= PAGE_SIZE) ++ kfree(ptr); ++ else ++ vfree(ptr); ++} ++ ++/** ++ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot ++ * @chunk: chunk of interest ++ * @oslot: the previous slot it was on ++ * ++ * This function is called after an allocation or free changed @chunk. ++ * New slot according to the changed state is determined and @chunk is ++ * moved to the slot. Note that the reserved chunk is never put on ++ * chunk slots. ++ * ++ * CONTEXT: ++ * pcpu_lock. 
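As a rough illustration of the free-size bucketing done by __pcpu_size_to_slot() and pcpu_size_to_slot() above, the sketch below reproduces the fls()-based calculation in user space. The kernel's fls() is approximated with GCC's __builtin_clz() and the 64K unit size is a made-up example, so treat this as a model rather than a copy of the kernel code.

/*
 * Stand-alone model of the free-size -> list-slot mapping: sizes that
 * share the same highest set bit (offset by PCPU_SLOT_BASE_SHIFT) land
 * in the same slot, and a completely free chunk goes to the last slot.
 */
#include <stdio.h>

#define PCPU_SLOT_BASE_SHIFT    5       /* 1-31 bytes share slot 1, as above */

static int unit_size = 64 * 1024;       /* assumed pcpu_unit_size */
static int nr_slots;

/* user-space stand-in for the kernel's fls(): index of highest set bit */
static int fls_model(int x)
{
        return x ? 32 - __builtin_clz((unsigned int)x) : 0;
}

static int __size_to_slot(int size)
{
        int slot = fls_model(size) - PCPU_SLOT_BASE_SHIFT + 2;

        return slot > 1 ? slot : 1;
}

static int size_to_slot(int size)
{
        if (size == unit_size)          /* fully free chunk: last slot */
                return nr_slots - 1;
        return __size_to_slot(size);
}

int main(void)
{
        int sizes[] = { 4, 16, 31, 32, 100, 4096, 64 * 1024 };
        unsigned int i;

        nr_slots = __size_to_slot(unit_size) + 2;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                printf("free_size %6d -> slot %d (of %d)\n",
                       sizes[i], size_to_slot(sizes[i]), nr_slots);
        return 0;
}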
++ */ ++static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) ++{ ++ int nslot = pcpu_chunk_slot(chunk); ++ ++ if (chunk != pcpu_reserved_chunk && oslot != nslot) { ++ if (oslot < nslot) ++ list_move(&chunk->list, &pcpu_slot[nslot]); ++ else ++ list_move_tail(&chunk->list, &pcpu_slot[nslot]); ++ } ++} ++ ++static struct rb_node **pcpu_chunk_rb_search(void *addr, ++ struct rb_node **parentp) ++{ ++ struct rb_node **p = &pcpu_addr_root.rb_node; ++ struct rb_node *parent = NULL; ++ struct pcpu_chunk *chunk; ++ ++ while (*p) { ++ parent = *p; ++ chunk = rb_entry(parent, struct pcpu_chunk, rb_node); ++ ++ if (addr < chunk->vm->addr) ++ p = &(*p)->rb_left; ++ else if (addr > chunk->vm->addr) ++ p = &(*p)->rb_right; ++ else ++ break; ++ } ++ ++ if (parentp) ++ *parentp = parent; ++ return p; ++} ++ ++/** ++ * pcpu_chunk_addr_search - search for chunk containing specified address ++ * @addr: address to search for ++ * ++ * Look for chunk which might contain @addr. More specifically, it ++ * searchs for the chunk with the highest start address which isn't ++ * beyond @addr. ++ * ++ * CONTEXT: ++ * pcpu_lock. ++ * ++ * RETURNS: ++ * The address of the found chunk. ++ */ ++static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) ++{ ++ struct rb_node *n, *parent; ++ struct pcpu_chunk *chunk; ++ ++ /* is it in the reserved chunk? */ ++ if (pcpu_reserved_chunk) { ++ void *start = pcpu_reserved_chunk->vm->addr; ++ ++ if (addr >= start && addr < start + pcpu_reserved_chunk_limit) ++ return pcpu_reserved_chunk; ++ } ++ ++ /* nah... search the regular ones */ ++ n = *pcpu_chunk_rb_search(addr, &parent); ++ if (!n) { ++ /* no exactly matching chunk, the parent is the closest */ ++ n = parent; ++ BUG_ON(!n); ++ } ++ chunk = rb_entry(n, struct pcpu_chunk, rb_node); ++ ++ if (addr < chunk->vm->addr) { ++ /* the parent was the next one, look for the previous one */ ++ n = rb_prev(n); ++ BUG_ON(!n); ++ chunk = rb_entry(n, struct pcpu_chunk, rb_node); ++ } ++ ++ return chunk; ++} ++ ++/** ++ * pcpu_chunk_addr_insert - insert chunk into address rb tree ++ * @new: chunk to insert ++ * ++ * Insert @new into address rb tree. ++ * ++ * CONTEXT: ++ * pcpu_lock. ++ */ ++static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) ++{ ++ struct rb_node **p, *parent; ++ ++ p = pcpu_chunk_rb_search(new->vm->addr, &parent); ++ BUG_ON(*p); ++ rb_link_node(&new->rb_node, parent, p); ++ rb_insert_color(&new->rb_node, &pcpu_addr_root); ++} ++ ++/** ++ * pcpu_extend_area_map - extend area map for allocation ++ * @chunk: target chunk ++ * ++ * Extend area map of @chunk so that it can accomodate an allocation. ++ * A single allocation can split an area into three areas, so this ++ * function makes sure that @chunk->map has at least two extra slots. ++ * ++ * CONTEXT: ++ * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired ++ * if area map is extended. ++ * ++ * RETURNS: ++ * 0 if noop, 1 if successfully extended, -errno on failure. ++ */ ++static int pcpu_extend_area_map(struct pcpu_chunk *chunk) ++{ ++ int new_alloc; ++ int *new; ++ size_t size; ++ ++ /* has enough? */ ++ if (chunk->map_alloc >= chunk->map_used + 2) ++ return 0; ++ ++ spin_unlock_irq(&pcpu_lock); ++ ++ new_alloc = PCPU_DFL_MAP_ALLOC; ++ while (new_alloc < chunk->map_used + 2) ++ new_alloc *= 2; ++ ++ new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); ++ if (!new) { ++ spin_lock_irq(&pcpu_lock); ++ return -ENOMEM; ++ } ++ ++ /* ++ * Acquire pcpu_lock and switch to new area map. 
Only free ++ * could have happened inbetween, so map_used couldn't have ++ * grown. ++ */ ++ spin_lock_irq(&pcpu_lock); ++ BUG_ON(new_alloc < chunk->map_used + 2); ++ ++ size = chunk->map_alloc * sizeof(chunk->map[0]); ++ memcpy(new, chunk->map, size); ++ ++ /* ++ * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is ++ * one of the first chunks and still using static map. ++ */ ++ if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) ++ pcpu_mem_free(chunk->map, size); ++ ++ chunk->map_alloc = new_alloc; ++ chunk->map = new; ++ return 0; ++} ++ ++/** ++ * pcpu_split_block - split a map block ++ * @chunk: chunk of interest ++ * @i: index of map block to split ++ * @head: head size in bytes (can be 0) ++ * @tail: tail size in bytes (can be 0) ++ * ++ * Split the @i'th map block into two or three blocks. If @head is ++ * non-zero, @head bytes block is inserted before block @i moving it ++ * to @i+1 and reducing its size by @head bytes. ++ * ++ * If @tail is non-zero, the target block, which can be @i or @i+1 ++ * depending on @head, is reduced by @tail bytes and @tail byte block ++ * is inserted after the target block. ++ * ++ * @chunk->map must have enough free slots to accomodate the split. ++ * ++ * CONTEXT: ++ * pcpu_lock. ++ */ ++static void pcpu_split_block(struct pcpu_chunk *chunk, int i, ++ int head, int tail) ++{ ++ int nr_extra = !!head + !!tail; ++ ++ BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); ++ ++ /* insert new subblocks */ ++ memmove(&chunk->map[i + nr_extra], &chunk->map[i], ++ sizeof(chunk->map[0]) * (chunk->map_used - i)); ++ chunk->map_used += nr_extra; ++ ++ if (head) { ++ chunk->map[i + 1] = chunk->map[i] - head; ++ chunk->map[i++] = head; ++ } ++ if (tail) { ++ chunk->map[i++] -= tail; ++ chunk->map[i] = tail; ++ } ++} ++ ++/** ++ * pcpu_alloc_area - allocate area from a pcpu_chunk ++ * @chunk: chunk of interest ++ * @size: wanted size in bytes ++ * @align: wanted align ++ * ++ * Try to allocate @size bytes area aligned at @align from @chunk. ++ * Note that this function only allocates the offset. It doesn't ++ * populate or map the area. ++ * ++ * @chunk->map must have at least two free slots. ++ * ++ * CONTEXT: ++ * pcpu_lock. ++ * ++ * RETURNS: ++ * Allocated offset in @chunk on success, -1 if no matching area is ++ * found. ++ */ ++static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) ++{ ++ int oslot = pcpu_chunk_slot(chunk); ++ int max_contig = 0; ++ int i, off; ++ ++ for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { ++ bool is_last = i + 1 == chunk->map_used; ++ int head, tail; ++ ++ /* extra for alignment requirement */ ++ head = ALIGN(off, align) - off; ++ BUG_ON(i == 0 && head != 0); ++ ++ if (chunk->map[i] < 0) ++ continue; ++ if (chunk->map[i] < head + size) { ++ max_contig = max(chunk->map[i], max_contig); ++ continue; ++ } ++ ++ /* ++ * If head is small or the previous block is free, ++ * merge'em. Note that 'small' is defined as smaller ++ * than sizeof(int), which is very small but isn't too ++ * uncommon for percpu allocations. 
++ */ ++ if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { ++ if (chunk->map[i - 1] > 0) ++ chunk->map[i - 1] += head; ++ else { ++ chunk->map[i - 1] -= head; ++ chunk->free_size -= head; ++ } ++ chunk->map[i] -= head; ++ off += head; ++ head = 0; ++ } ++ ++ /* if tail is small, just keep it around */ ++ tail = chunk->map[i] - head - size; ++ if (tail < sizeof(int)) ++ tail = 0; ++ ++ /* split if warranted */ ++ if (head || tail) { ++ pcpu_split_block(chunk, i, head, tail); ++ if (head) { ++ i++; ++ off += head; ++ max_contig = max(chunk->map[i - 1], max_contig); ++ } ++ if (tail) ++ max_contig = max(chunk->map[i + 1], max_contig); ++ } ++ ++ /* update hint and mark allocated */ ++ if (is_last) ++ chunk->contig_hint = max_contig; /* fully scanned */ ++ else ++ chunk->contig_hint = max(chunk->contig_hint, ++ max_contig); ++ ++ chunk->free_size -= chunk->map[i]; ++ chunk->map[i] = -chunk->map[i]; ++ ++ pcpu_chunk_relocate(chunk, oslot); ++ return off; ++ } ++ ++ chunk->contig_hint = max_contig; /* fully scanned */ ++ pcpu_chunk_relocate(chunk, oslot); ++ ++ /* tell the upper layer that this chunk has no matching area */ ++ return -1; ++} ++ ++/** ++ * pcpu_free_area - free area to a pcpu_chunk ++ * @chunk: chunk of interest ++ * @freeme: offset of area to free ++ * ++ * Free area starting from @freeme to @chunk. Note that this function ++ * only modifies the allocation map. It doesn't depopulate or unmap ++ * the area. ++ * ++ * CONTEXT: ++ * pcpu_lock. ++ */ ++static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) ++{ ++ int oslot = pcpu_chunk_slot(chunk); ++ int i, off; ++ ++ for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) ++ if (off == freeme) ++ break; ++ BUG_ON(off != freeme); ++ BUG_ON(chunk->map[i] > 0); ++ ++ chunk->map[i] = -chunk->map[i]; ++ chunk->free_size += chunk->map[i]; ++ ++ /* merge with previous? */ ++ if (i > 0 && chunk->map[i - 1] >= 0) { ++ chunk->map[i - 1] += chunk->map[i]; ++ chunk->map_used--; ++ memmove(&chunk->map[i], &chunk->map[i + 1], ++ (chunk->map_used - i) * sizeof(chunk->map[0])); ++ i--; ++ } ++ /* merge with next? */ ++ if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { ++ chunk->map[i] += chunk->map[i + 1]; ++ chunk->map_used--; ++ memmove(&chunk->map[i + 1], &chunk->map[i + 2], ++ (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); ++ } ++ ++ chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); ++ pcpu_chunk_relocate(chunk, oslot); ++} ++ ++/** ++ * pcpu_unmap - unmap pages out of a pcpu_chunk ++ * @chunk: chunk of interest ++ * @page_start: page index of the first page to unmap ++ * @page_end: page index of the last page to unmap + 1 ++ * @flush: whether to flush cache and tlb or not ++ * ++ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. ++ * If @flush is true, vcache is flushed before unmapping and tlb ++ * after. ++ */ ++static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, ++ bool flush) ++{ ++ unsigned int last = num_possible_cpus() - 1; ++ unsigned int cpu; ++ ++ /* unmap must not be done on immutable chunk */ ++ WARN_ON(chunk->immutable); ++ ++ /* ++ * Each flushing trial can be very expensive, issue flush on ++ * the whole region at once rather than doing it for each cpu. ++ * This could be an overkill but is more scalable. 
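The area map handled by pcpu_alloc_area() and pcpu_free_area() above is an array of signed sizes: a positive entry is a free run, a negative entry is an allocated run, and freeing merges the entry with adjacent free neighbours. Below is a deliberately minimal user-space model of that encoding, first-fit with a simple tail split and merge-on-free only; alignment handling, the contiguity hint and slot relocation from the real code are left out.

/*
 * Minimal model of the chunk->map encoding: map[i] > 0 is a free area
 * of that many bytes, map[i] < 0 is an allocated area of -map[i] bytes.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define UNIT_SIZE       1024            /* assumed tiny unit for the demo */
#define MAP_MAX         16

static int map[MAP_MAX] = { UNIT_SIZE };        /* one big free area */
static int map_used = 1;

/* return offset of an allocated area of @size bytes, or -1 */
static int alloc_area(int size)
{
        int i, off = 0;

        for (i = 0; i < map_used; off += abs(map[i++])) {
                if (map[i] < 0 || map[i] < size)
                        continue;
                if (map[i] > size) {            /* split off the tail */
                        memmove(&map[i + 1], &map[i],
                                (map_used - i) * sizeof(map[0]));
                        map_used++;
                        map[i + 1] -= size;
                }
                map[i] = -size;                 /* mark allocated */
                return off;
        }
        return -1;
}

/* free the area starting at @freeme and merge with free neighbours */
static void free_area(int freeme)
{
        int i, off = 0;

        for (i = 0; i < map_used && off != freeme; off += abs(map[i++]))
                ;
        map[i] = -map[i];                       /* mark free again */

        if (i + 1 < map_used && map[i + 1] > 0) {       /* merge with next */
                map[i] += map[i + 1];
                memmove(&map[i + 1], &map[i + 2],
                        (map_used - i - 2) * sizeof(map[0]));
                map_used--;
        }
        if (i > 0 && map[i - 1] > 0) {                  /* merge with prev */
                map[i - 1] += map[i];
                memmove(&map[i], &map[i + 1],
                        (map_used - i - 1) * sizeof(map[0]));
                map_used--;
        }
}

int main(void)
{
        int a = alloc_area(128), b = alloc_area(64);

        printf("a at %d, b at %d, entries in map: %d\n", a, b, map_used);
        free_area(a);
        free_area(b);
        printf("after freeing both, entries in map: %d (one free area)\n",
               map_used);
        return 0;
}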
++ */ ++ if (flush) ++ flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), ++ pcpu_chunk_addr(chunk, last, page_end)); ++ ++ for_each_possible_cpu(cpu) ++ unmap_kernel_range_noflush( ++ pcpu_chunk_addr(chunk, cpu, page_start), ++ (page_end - page_start) << PAGE_SHIFT); ++ ++ /* ditto as flush_cache_vunmap() */ ++ if (flush) ++ flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), ++ pcpu_chunk_addr(chunk, last, page_end)); ++} ++ ++/** ++ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk ++ * @chunk: chunk to depopulate ++ * @off: offset to the area to depopulate ++ * @size: size of the area to depopulate in bytes ++ * @flush: whether to flush cache and tlb or not ++ * ++ * For each cpu, depopulate and unmap pages [@page_start,@page_end) ++ * from @chunk. If @flush is true, vcache is flushed before unmapping ++ * and tlb after. ++ * ++ * CONTEXT: ++ * pcpu_alloc_mutex. ++ */ ++static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, ++ bool flush) ++{ ++ int page_start = PFN_DOWN(off); ++ int page_end = PFN_UP(off + size); ++ int unmap_start = -1; ++ int uninitialized_var(unmap_end); ++ unsigned int cpu; ++ int i; ++ ++ for (i = page_start; i < page_end; i++) { ++ for_each_possible_cpu(cpu) { ++ struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); ++ ++ if (!*pagep) ++ continue; ++ ++ __free_page(*pagep); ++ ++ /* ++ * If it's partial depopulation, it might get ++ * populated or depopulated again. Mark the ++ * page gone. ++ */ ++ *pagep = NULL; ++ ++ unmap_start = unmap_start < 0 ? i : unmap_start; ++ unmap_end = i + 1; ++ } ++ } ++ ++ if (unmap_start >= 0) ++ pcpu_unmap(chunk, unmap_start, unmap_end, flush); ++} ++ ++/** ++ * pcpu_map - map pages into a pcpu_chunk ++ * @chunk: chunk of interest ++ * @page_start: page index of the first page to map ++ * @page_end: page index of the last page to map + 1 ++ * ++ * For each cpu, map pages [@page_start,@page_end) into @chunk. ++ * vcache is flushed afterwards. ++ */ ++static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) ++{ ++ unsigned int last = num_possible_cpus() - 1; ++ unsigned int cpu; ++ int err; ++ ++ /* map must not be done on immutable chunk */ ++ WARN_ON(chunk->immutable); ++ ++ for_each_possible_cpu(cpu) { ++ err = map_kernel_range_noflush( ++ pcpu_chunk_addr(chunk, cpu, page_start), ++ (page_end - page_start) << PAGE_SHIFT, ++ PAGE_KERNEL, ++ pcpu_chunk_pagep(chunk, cpu, page_start)); ++ if (err < 0) ++ return err; ++ } ++ ++ /* flush at once, please read comments in pcpu_unmap() */ ++ flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), ++ pcpu_chunk_addr(chunk, last, page_end)); ++ return 0; ++} ++ ++/** ++ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk ++ * @chunk: chunk of interest ++ * @off: offset to the area to populate ++ * @size: size of the area to populate in bytes ++ * ++ * For each cpu, populate and map pages [@page_start,@page_end) into ++ * @chunk. The area is cleared on return. ++ * ++ * CONTEXT: ++ * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
++ */ ++static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) ++{ ++ const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; ++ int page_start = PFN_DOWN(off); ++ int page_end = PFN_UP(off + size); ++ int map_start = -1; ++ int uninitialized_var(map_end); ++ unsigned int cpu; ++ int i; ++ ++ for (i = page_start; i < page_end; i++) { ++ if (pcpu_chunk_page_occupied(chunk, i)) { ++ if (map_start >= 0) { ++ if (pcpu_map(chunk, map_start, map_end)) ++ goto err; ++ map_start = -1; ++ } ++ continue; ++ } ++ ++ map_start = map_start < 0 ? i : map_start; ++ map_end = i + 1; ++ ++ for_each_possible_cpu(cpu) { ++ struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); ++ ++ *pagep = alloc_pages_node(cpu_to_node(cpu), ++ alloc_mask, 0); ++ if (!*pagep) ++ goto err; ++ } ++ } ++ ++ if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) ++ goto err; ++ ++ for_each_possible_cpu(cpu) ++ memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, ++ size); ++ ++ return 0; ++err: ++ /* likely under heavy memory pressure, give memory back */ ++ pcpu_depopulate_chunk(chunk, off, size, true); ++ return -ENOMEM; ++} ++ ++static void free_pcpu_chunk(struct pcpu_chunk *chunk) ++{ ++ if (!chunk) ++ return; ++ if (chunk->vm) ++ free_vm_area(chunk->vm); ++ pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); ++ kfree(chunk); ++} ++ ++static struct pcpu_chunk *alloc_pcpu_chunk(void) ++{ ++ struct pcpu_chunk *chunk; ++ ++ chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); ++ if (!chunk) ++ return NULL; ++ ++ chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); ++ chunk->map_alloc = PCPU_DFL_MAP_ALLOC; ++ chunk->map[chunk->map_used++] = pcpu_unit_size; ++ chunk->page = chunk->page_ar; ++ ++ chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); ++ if (!chunk->vm) { ++ free_pcpu_chunk(chunk); ++ return NULL; ++ } ++ ++ INIT_LIST_HEAD(&chunk->list); ++ chunk->free_size = pcpu_unit_size; ++ chunk->contig_hint = pcpu_unit_size; ++ ++ return chunk; ++} ++ ++/** ++ * pcpu_alloc - the percpu allocator ++ * @size: size of area to allocate in bytes ++ * @align: alignment of area (max PAGE_SIZE) ++ * @reserved: allocate from the reserved chunk if available ++ * ++ * Allocate percpu area of @size bytes aligned at @align. ++ * ++ * CONTEXT: ++ * Does GFP_KERNEL allocation. ++ * ++ * RETURNS: ++ * Percpu pointer to the allocated area on success, NULL on failure. 
++ */ ++static void *pcpu_alloc(size_t size, size_t align, bool reserved) ++{ ++ struct pcpu_chunk *chunk; ++ int slot, off; ++ ++ if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { ++ WARN(true, "illegal size (%zu) or align (%zu) for " ++ "percpu allocation\n", size, align); ++ return NULL; ++ } ++ ++ mutex_lock(&pcpu_alloc_mutex); ++ spin_lock_irq(&pcpu_lock); ++ ++ /* serve reserved allocations from the reserved chunk if available */ ++ if (reserved && pcpu_reserved_chunk) { ++ chunk = pcpu_reserved_chunk; ++ if (size > chunk->contig_hint || ++ pcpu_extend_area_map(chunk) < 0) ++ goto fail_unlock; ++ off = pcpu_alloc_area(chunk, size, align); ++ if (off >= 0) ++ goto area_found; ++ goto fail_unlock; ++ } ++ ++restart: ++ /* search through normal chunks */ ++ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { ++ list_for_each_entry(chunk, &pcpu_slot[slot], list) { ++ if (size > chunk->contig_hint) ++ continue; ++ ++ switch (pcpu_extend_area_map(chunk)) { ++ case 0: ++ break; ++ case 1: ++ goto restart; /* pcpu_lock dropped, restart */ ++ default: ++ goto fail_unlock; ++ } ++ ++ off = pcpu_alloc_area(chunk, size, align); ++ if (off >= 0) ++ goto area_found; ++ } ++ } ++ ++ /* hmmm... no space left, create a new chunk */ ++ spin_unlock_irq(&pcpu_lock); ++ ++ chunk = alloc_pcpu_chunk(); ++ if (!chunk) ++ goto fail_unlock_mutex; ++ ++ spin_lock_irq(&pcpu_lock); ++ pcpu_chunk_relocate(chunk, -1); ++ pcpu_chunk_addr_insert(chunk); ++ goto restart; ++ ++area_found: ++ spin_unlock_irq(&pcpu_lock); ++ ++ /* populate, map and clear the area */ ++ if (pcpu_populate_chunk(chunk, off, size)) { ++ spin_lock_irq(&pcpu_lock); ++ pcpu_free_area(chunk, off); ++ goto fail_unlock; ++ } ++ ++ mutex_unlock(&pcpu_alloc_mutex); ++ ++ return __addr_to_pcpu_ptr(chunk->vm->addr + off); ++ ++fail_unlock: ++ spin_unlock_irq(&pcpu_lock); ++fail_unlock_mutex: ++ mutex_unlock(&pcpu_alloc_mutex); ++ return NULL; ++} ++ ++/** ++ * __alloc_percpu - allocate dynamic percpu area ++ * @size: size of area to allocate in bytes ++ * @align: alignment of area (max PAGE_SIZE) ++ * ++ * Allocate percpu area of @size bytes aligned at @align. Might ++ * sleep. Might trigger writeouts. ++ * ++ * CONTEXT: ++ * Does GFP_KERNEL allocation. ++ * ++ * RETURNS: ++ * Percpu pointer to the allocated area on success, NULL on failure. ++ */ ++void *__alloc_percpu(size_t size, size_t align) ++{ ++ return pcpu_alloc(size, align, false); ++} ++EXPORT_SYMBOL_GPL(__alloc_percpu); ++ ++/** ++ * __alloc_reserved_percpu - allocate reserved percpu area ++ * @size: size of area to allocate in bytes ++ * @align: alignment of area (max PAGE_SIZE) ++ * ++ * Allocate percpu area of @size bytes aligned at @align from reserved ++ * percpu area if arch has set it up; otherwise, allocation is served ++ * from the same dynamic area. Might sleep. Might trigger writeouts. ++ * ++ * CONTEXT: ++ * Does GFP_KERNEL allocation. ++ * ++ * RETURNS: ++ * Percpu pointer to the allocated area on success, NULL on failure. ++ */ ++void *__alloc_reserved_percpu(size_t size, size_t align) ++{ ++ return pcpu_alloc(size, align, true); ++} ++ ++/** ++ * pcpu_reclaim - reclaim fully free chunks, workqueue function ++ * @work: unused ++ * ++ * Reclaim all fully free chunks except for the first one. ++ * ++ * CONTEXT: ++ * workqueue context. 
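For orientation, a caller of the interface above might use __alloc_percpu() together with free_percpu() (defined further down in this file) along the lines of the hypothetical kernel-style fragment below. It is a sketch only and has not been compiled; per_cpu_ptr() and for_each_possible_cpu() are the usual kernel helpers assumed here, not something defined in this file.

/*
 * Hypothetical usage sketch (not part of the patch, not compiled):
 * allocate one 'struct foo' per possible CPU, touch each CPU's copy,
 * then release the area.  free_percpu() accepts NULL, as the
 * implementation in this file shows.
 */
struct foo {
        unsigned long hits;
};

static struct foo *foo_stats;

static int foo_stats_init(void)
{
        unsigned int cpu;

        foo_stats = __alloc_percpu(sizeof(struct foo),
                                   __alignof__(struct foo));
        if (!foo_stats)
                return -ENOMEM;

        for_each_possible_cpu(cpu)
                per_cpu_ptr(foo_stats, cpu)->hits = 0;

        return 0;
}

static void foo_stats_exit(void)
{
        free_percpu(foo_stats);
}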
++ */ ++static void pcpu_reclaim(struct work_struct *work) ++{ ++ LIST_HEAD(todo); ++ struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; ++ struct pcpu_chunk *chunk, *next; ++ ++ mutex_lock(&pcpu_alloc_mutex); ++ spin_lock_irq(&pcpu_lock); ++ ++ list_for_each_entry_safe(chunk, next, head, list) { ++ WARN_ON(chunk->immutable); ++ ++ /* spare the first one */ ++ if (chunk == list_first_entry(head, struct pcpu_chunk, list)) ++ continue; ++ ++ rb_erase(&chunk->rb_node, &pcpu_addr_root); ++ list_move(&chunk->list, &todo); ++ } ++ ++ spin_unlock_irq(&pcpu_lock); ++ mutex_unlock(&pcpu_alloc_mutex); ++ ++ list_for_each_entry_safe(chunk, next, &todo, list) { ++ pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); ++ free_pcpu_chunk(chunk); ++ } ++} ++ ++/** ++ * free_percpu - free percpu area ++ * @ptr: pointer to area to free ++ * ++ * Free percpu area @ptr. ++ * ++ * CONTEXT: ++ * Can be called from atomic context. ++ */ ++void free_percpu(void *ptr) ++{ ++ void *addr = __pcpu_ptr_to_addr(ptr); ++ struct pcpu_chunk *chunk; ++ unsigned long flags; ++ int off; ++ ++ if (!ptr) ++ return; ++ ++ spin_lock_irqsave(&pcpu_lock, flags); ++ ++ chunk = pcpu_chunk_addr_search(addr); ++ off = addr - chunk->vm->addr; ++ ++ pcpu_free_area(chunk, off); ++ ++ /* if there are more than one fully free chunks, wake up grim reaper */ ++ if (chunk->free_size == pcpu_unit_size) { ++ struct pcpu_chunk *pos; ++ ++ list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) ++ if (pos != chunk) { ++ schedule_work(&pcpu_reclaim_work); ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcpu_lock, flags); ++} ++EXPORT_SYMBOL_GPL(free_percpu); ++ ++/** ++ * pcpu_setup_first_chunk - initialize the first percpu chunk ++ * @get_page_fn: callback to fetch page pointer ++ * @static_size: the size of static percpu area in bytes ++ * @reserved_size: the size of reserved percpu area in bytes ++ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto ++ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto ++ * @base_addr: mapped address, NULL for auto ++ * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary ++ * ++ * Initialize the first percpu chunk which contains the kernel static ++ * perpcu area. This function is to be called from arch percpu area ++ * setup path. The first two parameters are mandatory. The rest are ++ * optional. ++ * ++ * @get_page_fn() should return pointer to percpu page given cpu ++ * number and page number. It should at least return enough pages to ++ * cover the static area. The returned pages for static area should ++ * have been initialized with valid data. If @unit_size is specified, ++ * it can also return pages after the static area. NULL return ++ * indicates end of pages for the cpu. Note that @get_page_fn() must ++ * return the same number of pages for all cpus. ++ * ++ * @reserved_size, if non-zero, specifies the amount of bytes to ++ * reserve after the static area in the first chunk. This reserves ++ * the first chunk such that it's available only through reserved ++ * percpu allocation. This is primarily used to serve module percpu ++ * static areas on architectures where the addressing model has ++ * limited offset range for symbol relocations to guarantee module ++ * percpu symbols fall inside the relocatable range. ++ * ++ * @dyn_size, if non-negative, determines the number of bytes ++ * available for dynamic allocation in the first chunk. 
Specifying ++ * non-negative value makes percpu leave alone the area beyond ++ * @static_size + @reserved_size + @dyn_size. ++ * ++ * @unit_size, if non-negative, specifies unit size and must be ++ * aligned to PAGE_SIZE and equal to or larger than @static_size + ++ * @reserved_size + if non-negative, @dyn_size. ++ * ++ * Non-null @base_addr means that the caller already allocated virtual ++ * region for the first chunk and mapped it. percpu must not mess ++ * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL ++ * @populate_pte_fn doesn't make any sense. ++ * ++ * @populate_pte_fn is used to populate the pagetable. NULL means the ++ * caller already populated the pagetable. ++ * ++ * If the first chunk ends up with both reserved and dynamic areas, it ++ * is served by two chunks - one to serve the core static and reserved ++ * areas and the other for the dynamic area. They share the same vm ++ * and page map but uses different area allocation map to stay away ++ * from each other. The latter chunk is circulated in the chunk slots ++ * and available for dynamic allocation like any other chunks. ++ * ++ * RETURNS: ++ * The determined pcpu_unit_size which can be used to initialize ++ * percpu access. ++ */ ++size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, ++ size_t static_size, size_t reserved_size, ++ ssize_t dyn_size, ssize_t unit_size, ++ void *base_addr, ++ pcpu_populate_pte_fn_t populate_pte_fn) ++{ ++ static struct vm_struct first_vm; ++ static int smap[2], dmap[2]; ++ size_t size_sum = static_size + reserved_size + ++ (dyn_size >= 0 ? dyn_size : 0); ++ struct pcpu_chunk *schunk, *dchunk = NULL; ++ unsigned int cpu; ++ int nr_pages; ++ int err, i; ++ ++ /* santiy checks */ ++ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ++ ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); ++ BUG_ON(!static_size); ++ if (unit_size >= 0) { ++ BUG_ON(unit_size < size_sum); ++ BUG_ON(unit_size & ~PAGE_MASK); ++ BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); ++ } else ++ BUG_ON(base_addr); ++ BUG_ON(base_addr && populate_pte_fn); ++ ++ if (unit_size >= 0) ++ pcpu_unit_pages = unit_size >> PAGE_SHIFT; ++ else ++ pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, ++ PFN_UP(size_sum)); ++ ++ pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; ++ pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; ++ pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) ++ + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); ++ ++ if (dyn_size < 0) ++ dyn_size = pcpu_unit_size - static_size - reserved_size; ++ ++ /* ++ * Allocate chunk slots. The additional last slot is for ++ * empty chunks. ++ */ ++ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; ++ pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); ++ for (i = 0; i < pcpu_nr_slots; i++) ++ INIT_LIST_HEAD(&pcpu_slot[i]); ++ ++ /* ++ * Initialize static chunk. If reserved_size is zero, the ++ * static chunk covers static area + dynamic allocation area ++ * in the first chunk. If reserved_size is not zero, it ++ * covers static area + reserved area (mostly used for module ++ * static percpu allocation). 
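To make the sizing done in pcpu_setup_first_chunk() above concrete, the small stand-alone calculation below follows the same formula: the unit covers the page-aligned sum of static, reserved and dynamic sizes but never less than the minimum unit size, and the chunk is one unit per possible CPU. The page size, minimum unit size and example inputs are assumptions of this sketch, not values taken from the patch.

/*
 * Back-of-the-envelope model of the first-chunk sizing done above.
 * PAGE_SIZE and PCPU_MIN_UNIT_SIZE are assumed example values
 * (4K pages, 64K minimum unit).
 */
#include <stdio.h>

#define PAGE_SHIFT              12
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define PFN_UP(x)               (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PCPU_MIN_UNIT_SIZE      (64UL * 1024)   /* assumed */

int main(void)
{
        unsigned long static_size = 300 * 1024; /* example static percpu data */
        unsigned long reserved_size = 8 * 1024; /* example module reserve */
        unsigned long dyn_size = 20 * 1024;     /* example dynamic area */
        unsigned long num_possible_cpus = 4;    /* example CPU count */

        unsigned long size_sum = static_size + reserved_size + dyn_size;
        unsigned long min_pages = PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT;
        unsigned long unit_pages = PFN_UP(size_sum);

        if (unit_pages < min_pages)     /* max(min unit, PFN_UP(size_sum)) */
                unit_pages = min_pages;

        printf("unit:  %lu pages (%lu bytes)\n",
               unit_pages, unit_pages << PAGE_SHIFT);
        printf("chunk: %lu bytes for %lu possible CPUs\n",
               num_possible_cpus * (unit_pages << PAGE_SHIFT),
               num_possible_cpus);
        return 0;
}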
++ */ ++ schunk = alloc_bootmem(pcpu_chunk_struct_size); ++ INIT_LIST_HEAD(&schunk->list); ++ schunk->vm = &first_vm; ++ schunk->map = smap; ++ schunk->map_alloc = ARRAY_SIZE(smap); ++ schunk->page = schunk->page_ar; ++ ++ if (reserved_size) { ++ schunk->free_size = reserved_size; ++ pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ ++ } else { ++ schunk->free_size = dyn_size; ++ dyn_size = 0; /* dynamic area covered */ ++ } ++ schunk->contig_hint = schunk->free_size; ++ ++ schunk->map[schunk->map_used++] = -static_size; ++ if (schunk->free_size) ++ schunk->map[schunk->map_used++] = schunk->free_size; ++ ++ pcpu_reserved_chunk_limit = static_size + schunk->free_size; ++ ++ /* init dynamic chunk if necessary */ ++ if (dyn_size) { ++ dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); ++ INIT_LIST_HEAD(&dchunk->list); ++ dchunk->vm = &first_vm; ++ dchunk->map = dmap; ++ dchunk->map_alloc = ARRAY_SIZE(dmap); ++ dchunk->page = schunk->page_ar; /* share page map with schunk */ ++ ++ dchunk->contig_hint = dchunk->free_size = dyn_size; ++ dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; ++ dchunk->map[dchunk->map_used++] = dchunk->free_size; ++ } ++ ++ /* allocate vm address */ ++ first_vm.flags = VM_ALLOC; ++ first_vm.size = pcpu_chunk_size; ++ ++ if (!base_addr) ++ vm_area_register_early(&first_vm, PAGE_SIZE); ++ else { ++ /* ++ * Pages already mapped. No need to remap into ++ * vmalloc area. In this case the first chunks can't ++ * be mapped or unmapped by percpu and are marked ++ * immutable. ++ */ ++ first_vm.addr = base_addr; ++ schunk->immutable = true; ++ if (dchunk) ++ dchunk->immutable = true; ++ } ++ ++ /* assign pages */ ++ nr_pages = -1; ++ for_each_possible_cpu(cpu) { ++ for (i = 0; i < pcpu_unit_pages; i++) { ++ struct page *page = get_page_fn(cpu, i); ++ ++ if (!page) ++ break; ++ *pcpu_chunk_pagep(schunk, cpu, i) = page; ++ } ++ ++ BUG_ON(i < PFN_UP(static_size)); ++ ++ if (nr_pages < 0) ++ nr_pages = i; ++ else ++ BUG_ON(nr_pages != i); ++ } ++ ++ /* map them */ ++ if (populate_pte_fn) { ++ for_each_possible_cpu(cpu) ++ for (i = 0; i < nr_pages; i++) ++ populate_pte_fn(pcpu_chunk_addr(schunk, ++ cpu, i)); ++ ++ err = pcpu_map(schunk, 0, nr_pages); ++ if (err) ++ panic("failed to setup static percpu area, err=%d\n", ++ err); ++ } ++ ++ /* link the first chunk in */ ++ if (!dchunk) { ++ pcpu_chunk_relocate(schunk, -1); ++ pcpu_chunk_addr_insert(schunk); ++ } else { ++ pcpu_chunk_relocate(dchunk, -1); ++ pcpu_chunk_addr_insert(dchunk); ++ } ++ ++ /* we're done */ ++ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); ++ return pcpu_unit_size; ++} ++ ++/* ++ * Embedding first chunk setup helper. ++ */ ++static void *pcpue_ptr __initdata; ++static size_t pcpue_size __initdata; ++static size_t pcpue_unit_size __initdata; ++ ++static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) ++{ ++ size_t off = (size_t)pageno << PAGE_SHIFT; ++ ++ if (off >= pcpue_size) ++ return NULL; ++ ++ return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); ++} ++ ++/** ++ * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem ++ * @static_size: the size of static percpu area in bytes ++ * @reserved_size: the size of reserved percpu area in bytes ++ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto ++ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto ++ * ++ * This is a helper to ease setting up embedded first percpu chunk and ++ * can be called where pcpu_setup_first_chunk() is expected. 
++ * ++ * If this function is used to setup the first chunk, it is allocated ++ * as a contiguous area using bootmem allocator and used as-is without ++ * being mapped into vmalloc area. This enables the first chunk to ++ * piggy back on the linear physical mapping which often uses larger ++ * page size. ++ * ++ * When @dyn_size is positive, dynamic area might be larger than ++ * specified to fill page alignment. Also, when @dyn_size is auto, ++ * @dyn_size does not fill the whole first chunk but only what's ++ * necessary for page alignment after static and reserved areas. ++ * ++ * If the needed size is smaller than the minimum or specified unit ++ * size, the leftover is returned to the bootmem allocator. ++ * ++ * RETURNS: ++ * The determined pcpu_unit_size which can be used to initialize ++ * percpu access on success, -errno on failure. ++ */ ++ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ++ ssize_t dyn_size, ssize_t unit_size) ++{ ++ unsigned int cpu; ++ ++ /* determine parameters and allocate */ ++ pcpue_size = PFN_ALIGN(static_size + reserved_size + ++ (dyn_size >= 0 ? dyn_size : 0)); ++ if (dyn_size != 0) ++ dyn_size = pcpue_size - static_size - reserved_size; ++ ++ if (unit_size >= 0) { ++ BUG_ON(unit_size < pcpue_size); ++ pcpue_unit_size = unit_size; ++ } else ++ pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); ++ ++ pcpue_ptr = __alloc_bootmem_nopanic( ++ num_possible_cpus() * pcpue_unit_size, ++ PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); ++ if (!pcpue_ptr) ++ return -ENOMEM; ++ ++ /* return the leftover and copy */ ++ for_each_possible_cpu(cpu) { ++ void *ptr = pcpue_ptr + cpu * pcpue_unit_size; ++ ++ free_bootmem(__pa(ptr + pcpue_size), ++ pcpue_unit_size - pcpue_size); ++ memcpy(ptr, __per_cpu_load, static_size); ++ } ++ ++ /* we're ready, commit */ ++ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", ++ pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); ++ ++ return pcpu_setup_first_chunk(pcpue_get_page, static_size, ++ reserved_size, dyn_size, ++ pcpue_unit_size, pcpue_ptr, NULL); ++} +Index: linux-2.6-tip/mm/quicklist.c +=================================================================== +--- linux-2.6-tip.orig/mm/quicklist.c ++++ linux-2.6-tip/mm/quicklist.c +@@ -19,7 +19,7 @@ + #include + #include + +-DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; ++DEFINE_PER_CPU_LOCKED(struct quicklist, quicklist)[CONFIG_NR_QUICK]; + + #define FRACTION_OF_NODE_MEM 16 + +@@ -29,7 +29,7 @@ static unsigned long max_pages(unsigned + int node = numa_node_id(); + struct zone *zones = NODE_DATA(node)->node_zones; + int num_cpus_on_node; +- node_to_cpumask_ptr(cpumask_on_node, node); ++ const struct cpumask *cpumask_on_node = cpumask_of_node(node); + + node_free_pages = + #ifdef CONFIG_ZONE_DMA +@@ -66,17 +66,14 @@ void quicklist_trim(int nr, void (*dtor) + { + long pages_to_free; + struct quicklist *q; ++ int cpu; + +- q = &get_cpu_var(quicklist)[nr]; ++ q = &get_cpu_var_locked(quicklist, &cpu)[nr]; + if (q->nr_pages > min_pages) { + pages_to_free = min_pages_to_free(q, min_pages, max_free); + + while (pages_to_free > 0) { +- /* +- * We pass a gfp_t of 0 to quicklist_alloc here +- * because we will never call into the page allocator. 
+- */ +- void *p = quicklist_alloc(nr, 0, NULL); ++ void *p = __quicklist_alloc(q); + + if (dtor) + dtor(p); +@@ -84,7 +81,7 @@ void quicklist_trim(int nr, void (*dtor) + pages_to_free--; + } + } +- put_cpu_var(quicklist); ++ put_cpu_var_locked(quicklist, cpu); + } + + unsigned long quicklist_total_size(void) +@@ -94,7 +91,7 @@ unsigned long quicklist_total_size(void) + struct quicklist *ql, *q; + + for_each_online_cpu(cpu) { +- ql = per_cpu(quicklist, cpu); ++ ql = per_cpu_var_locked(quicklist, cpu); + for (q = ql; q < ql + CONFIG_NR_QUICK; q++) + count += q->nr_pages; + } +Index: linux-2.6-tip/mm/slab.c +=================================================================== +--- linux-2.6-tip.orig/mm/slab.c ++++ linux-2.6-tip/mm/slab.c +@@ -102,6 +102,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -112,12 +113,145 @@ + #include + #include + #include ++#include + + #include + #include + #include + + /* ++ * On !PREEMPT_RT, raw irq flags are used as a per-CPU locking ++ * mechanism. ++ * ++ * On PREEMPT_RT, we use per-CPU locks for this. That's why the ++ * calling convention is changed slightly: a new 'flags' argument ++ * is passed to 'irq disable/enable' - the PREEMPT_RT code stores ++ * the CPU number of the lock there. ++ */ ++#ifndef CONFIG_PREEMPT_RT ++ ++# define slab_irq_disable(cpu) \ ++ do { local_irq_disable(); (cpu) = smp_processor_id(); } while (0) ++# define slab_irq_enable(cpu) local_irq_enable() ++ ++static inline void slab_irq_disable_this_rt(int cpu) ++{ ++} ++ ++static inline void slab_irq_enable_rt(int cpu) ++{ ++} ++ ++# define slab_irq_save(flags, cpu) \ ++ do { local_irq_save(flags); (cpu) = smp_processor_id(); } while (0) ++# define slab_irq_restore(flags, cpu) local_irq_restore(flags) ++ ++/* ++ * In the __GFP_WAIT case we enable/disable interrupts on !PREEMPT_RT, ++ * which has no per-CPU locking effect since we are holding the cache ++ * lock in that case already. ++ */ ++static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu) ++{ ++ if (flags & __GFP_WAIT) ++ local_irq_enable(); ++} ++ ++static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu) ++{ ++ if (flags & __GFP_WAIT) ++ local_irq_disable(); ++} ++ ++# define slab_spin_lock_irq(lock, cpu) \ ++ do { spin_lock_irq(lock); (cpu) = smp_processor_id(); } while (0) ++# define slab_spin_unlock_irq(lock, cpu) spin_unlock_irq(lock) ++ ++# define slab_spin_lock_irqsave(lock, flags, cpu) \ ++ do { spin_lock_irqsave(lock, flags); (cpu) = smp_processor_id(); } while (0) ++# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ ++ do { spin_unlock_irqrestore(lock, flags); } while (0) ++ ++#else /* CONFIG_PREEMPT_RT */ ++ ++/* ++ * Instead of serializing the per-cpu state by disabling interrupts we do so ++ * by a lock. This keeps the code preemptable - albeit at the cost of remote ++ * memory access when the task does get migrated away. 
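The comment above sums up the PREEMPT_RT approach: keep the section preemptible by taking a per-CPU lock instead of disabling interrupts, accepting an occasional remote access after migration. A rough user-space model of such a "per-CPU locked" variable, in the spirit of get_cpu_var_locked()/put_cpu_var_locked() but using POSIX threads purely for illustration, could look like this (build with -pthread):

/*
 * Toy model of a per-CPU variable protected by a per-CPU lock rather
 * than by disabling interrupts: get() picks a slot and takes that
 * slot's lock, put() releases it.  Purely illustrative; the kernel's
 * get_cpu_var_locked() derives the CPU number differently.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS         4
#define NR_THREADS      8
#define ITERATIONS      100000

struct percpu_locked {
        pthread_mutex_t lock;   /* stands in for the per-CPU lock */
        long count;             /* stands in for the per-CPU data */
};

static struct percpu_locked pcp[NR_CPUS];

/* "get_cpu_var_locked": pick a slot and lock it, returning the slot */
static int get_locked(unsigned long hint)
{
        int cpu = hint % NR_CPUS;

        pthread_mutex_lock(&pcp[cpu].lock);
        return cpu;
}

/* "put_cpu_var_locked": drop the slot's lock */
static void put_locked(int cpu)
{
        pthread_mutex_unlock(&pcp[cpu].lock);
}

static void *worker(void *arg)
{
        unsigned long hint = (unsigned long)arg;
        int i;

        for (i = 0; i < ITERATIONS; i++) {
                int cpu = get_locked(hint + i);

                pcp[cpu].count++;       /* safely update "per-CPU" state */
                put_locked(cpu);
        }
        return NULL;
}

int main(void)
{
        pthread_t threads[NR_THREADS];
        long total = 0;
        int i;

        for (i = 0; i < NR_CPUS; i++)
                pthread_mutex_init(&pcp[i].lock, NULL);
        for (i = 0; i < NR_THREADS; i++)
                pthread_create(&threads[i], NULL, worker,
                               (void *)(unsigned long)i);
        for (i = 0; i < NR_THREADS; i++)
                pthread_join(threads[i], NULL);
        for (i = 0; i < NR_CPUS; i++)
                total += pcp[i].count;

        printf("total updates: %ld (expected %d)\n",
               total, NR_THREADS * ITERATIONS);
        return 0;
}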
++ */ ++DEFINE_PER_CPU_LOCKED(struct list_head, slab) = { 0, }; ++ ++static void _slab_irq_disable(int *cpu) ++{ ++ (void)get_cpu_var_locked(slab, cpu); ++} ++ ++#define slab_irq_disable(cpu) _slab_irq_disable(&(cpu)) ++ ++static inline void slab_irq_enable(int cpu) ++{ ++ LIST_HEAD(list); ++ ++ list_splice_init(&__get_cpu_var_locked(slab, cpu), &list); ++ put_cpu_var_locked(slab, cpu); ++ ++ while (!list_empty(&list)) { ++ struct page *page = list_first_entry(&list, struct page, lru); ++ list_del(&page->lru); ++ __free_pages(page, page->index); ++ } ++} ++ ++static inline void slab_irq_disable_this_rt(int cpu) ++{ ++ spin_lock(&__get_cpu_lock(slab, cpu)); ++} ++ ++static inline void slab_irq_enable_rt(int cpu) ++{ ++ LIST_HEAD(list); ++ ++ list_splice_init(&__get_cpu_var_locked(slab, cpu), &list); ++ spin_unlock(&__get_cpu_lock(slab, cpu)); ++ ++ while (!list_empty(&list)) { ++ struct page *page = list_first_entry(&list, struct page, lru); ++ list_del(&page->lru); ++ __free_pages(page, page->index); ++ } ++} ++ ++# define slab_irq_save(flags, cpu) \ ++ do { slab_irq_disable(cpu); (void) (flags); } while (0) ++# define slab_irq_restore(flags, cpu) \ ++ do { slab_irq_enable(cpu); (void) (flags); } while (0) ++ ++/* ++ * On PREEMPT_RT we have to drop the locks unconditionally to avoid lock ++ * recursion on the cache_grow()->alloc_slabmgmt() path. ++ */ ++static void slab_irq_enable_GFP_WAIT(gfp_t flags, int *cpu) ++{ ++ slab_irq_enable(*cpu); ++} ++ ++static void slab_irq_disable_GFP_WAIT(gfp_t flags, int *cpu) ++{ ++ slab_irq_disable(*cpu); ++} ++ ++# define slab_spin_lock_irq(lock, cpu) \ ++ do { slab_irq_disable(cpu); spin_lock(lock); } while (0) ++# define slab_spin_unlock_irq(lock, cpu) \ ++ do { spin_unlock(lock); slab_irq_enable(cpu); } while (0) ++# define slab_spin_lock_irqsave(lock, flags, cpu) \ ++ do { slab_irq_disable(cpu); spin_lock_irqsave(lock, flags); } while (0) ++# define slab_spin_unlock_irqrestore(lock, flags, cpu) \ ++ do { spin_unlock_irqrestore(lock, flags); slab_irq_enable(cpu); } while (0) ++ ++#endif /* CONFIG_PREEMPT_RT */ ++ ++/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * +@@ -177,13 +311,13 @@ + SLAB_STORE_USER | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ +- SLAB_DEBUG_OBJECTS) ++ SLAB_DEBUG_OBJECTS | SLAB_NOTRACK) + #else + # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ +- SLAB_DEBUG_OBJECTS) ++ SLAB_DEBUG_OBJECTS | SLAB_NOTRACK) + #endif + + /* +@@ -313,7 +447,7 @@ struct kmem_list3 __initdata initkmem_li + static int drain_freelist(struct kmem_cache *cache, + struct kmem_list3 *l3, int tofree); + static void free_block(struct kmem_cache *cachep, void **objpp, int len, +- int node); ++ int node, int *this_cpu); + static int enable_cpucache(struct kmem_cache *cachep); + static void cache_reap(struct work_struct *unused); + +@@ -372,87 +506,6 @@ static void kmem_list3_init(struct kmem_ + MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ + } while (0) + +-/* +- * struct kmem_cache +- * +- * manages a cache. +- */ +- +-struct kmem_cache { +-/* 1) per-cpu data, touched during every alloc/free */ +- struct array_cache *array[NR_CPUS]; +-/* 2) Cache tunables. 
Protected by cache_chain_mutex */ +- unsigned int batchcount; +- unsigned int limit; +- unsigned int shared; +- +- unsigned int buffer_size; +- u32 reciprocal_buffer_size; +-/* 3) touched by every alloc & free from the backend */ +- +- unsigned int flags; /* constant flags */ +- unsigned int num; /* # of objs per slab */ +- +-/* 4) cache_grow/shrink */ +- /* order of pgs per slab (2^n) */ +- unsigned int gfporder; +- +- /* force GFP flags, e.g. GFP_DMA */ +- gfp_t gfpflags; +- +- size_t colour; /* cache colouring range */ +- unsigned int colour_off; /* colour offset */ +- struct kmem_cache *slabp_cache; +- unsigned int slab_size; +- unsigned int dflags; /* dynamic flags */ +- +- /* constructor func */ +- void (*ctor)(void *obj); +- +-/* 5) cache creation/removal */ +- const char *name; +- struct list_head next; +- +-/* 6) statistics */ +-#if STATS +- unsigned long num_active; +- unsigned long num_allocations; +- unsigned long high_mark; +- unsigned long grown; +- unsigned long reaped; +- unsigned long errors; +- unsigned long max_freeable; +- unsigned long node_allocs; +- unsigned long node_frees; +- unsigned long node_overflow; +- atomic_t allochit; +- atomic_t allocmiss; +- atomic_t freehit; +- atomic_t freemiss; +-#endif +-#if DEBUG +- /* +- * If debugging is enabled, then the allocator can add additional +- * fields and/or padding to every object. buffer_size contains the total +- * object size including these internal fields, the following two +- * variables contain the offset to the user object and its size. +- */ +- int obj_offset; +- int obj_size; +-#endif +- /* +- * We put nodelists[] at the end of kmem_cache, because we want to size +- * this array to nr_node_ids slots instead of MAX_NUMNODES +- * (see kmem_cache_init()) +- * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache +- * is statically defined, so we reserve the max number of nodes. +- */ +- struct kmem_list3 *nodelists[MAX_NUMNODES]; +- /* +- * Do not add fields after nodelists[] +- */ +-}; +- + #define CFLGS_OFF_SLAB (0x80000000UL) + #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) + +@@ -568,6 +621,14 @@ static void **dbg_userword(struct kmem_c + + #endif + ++#ifdef CONFIG_KMEMTRACE ++size_t slab_buffer_size(struct kmem_cache *cachep) ++{ ++ return cachep->buffer_size; ++} ++EXPORT_SYMBOL(slab_buffer_size); ++#endif ++ + /* + * Do not go above this order unless 0 objects fit into the slab. 
+ */ +@@ -756,9 +817,10 @@ int slab_is_available(void) + + static DEFINE_PER_CPU(struct delayed_work, reap_work); + +-static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) ++static inline struct array_cache * ++cpu_cache_get(struct kmem_cache *cachep, int this_cpu) + { +- return cachep->array[smp_processor_id()]; ++ return cachep->array[this_cpu]; + } + + static inline struct kmem_cache *__find_general_cachep(size_t size, +@@ -992,7 +1054,7 @@ static int transfer_objects(struct array + #ifndef CONFIG_NUMA + + #define drain_alien_cache(cachep, alien) do { } while (0) +-#define reap_alien(cachep, l3) do { } while (0) ++#define reap_alien(cachep, l3, this_cpu) 0 + + static inline struct array_cache **alloc_alien_cache(int node, int limit) + { +@@ -1003,27 +1065,29 @@ static inline void free_alien_cache(stru + { + } + +-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) ++static inline int ++cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) + { + return 0; + } + + static inline void *alternate_node_alloc(struct kmem_cache *cachep, +- gfp_t flags) ++ gfp_t flags, int *this_cpu) + { + return NULL; + } + + static inline void *____cache_alloc_node(struct kmem_cache *cachep, +- gfp_t flags, int nodeid) ++ gfp_t flags, int nodeid, int *this_cpu) + { + return NULL; + } + + #else /* CONFIG_NUMA */ + +-static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); +-static void *alternate_node_alloc(struct kmem_cache *, gfp_t); ++static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, ++ int nodeid, int *this_cpu); ++static void *alternate_node_alloc(struct kmem_cache *, gfp_t, int *); + + static struct array_cache **alloc_alien_cache(int node, int limit) + { +@@ -1064,7 +1128,8 @@ static void free_alien_cache(struct arra + } + + static void __drain_alien_cache(struct kmem_cache *cachep, +- struct array_cache *ac, int node) ++ struct array_cache *ac, int node, ++ int *this_cpu) + { + struct kmem_list3 *rl3 = cachep->nodelists[node]; + +@@ -1078,7 +1143,7 @@ static void __drain_alien_cache(struct k + if (rl3->shared) + transfer_objects(rl3->shared, ac, ac->limit); + +- free_block(cachep, ac->entry, ac->avail, node); ++ free_block(cachep, ac->entry, ac->avail, node, this_cpu); + ac->avail = 0; + spin_unlock(&rl3->list_lock); + } +@@ -1087,38 +1152,42 @@ static void __drain_alien_cache(struct k + /* + * Called from cache_reap() to regularly drain alien caches round robin. 
+ */ +-static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) ++static int ++reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3, int *this_cpu) + { +- int node = __get_cpu_var(reap_node); ++ int node = per_cpu(reap_node, *this_cpu); + + if (l3->alien) { + struct array_cache *ac = l3->alien[node]; + + if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { +- __drain_alien_cache(cachep, ac, node); ++ __drain_alien_cache(cachep, ac, node, this_cpu); + spin_unlock_irq(&ac->lock); ++ return 1; + } + } ++ return 0; + } + + static void drain_alien_cache(struct kmem_cache *cachep, + struct array_cache **alien) + { +- int i = 0; ++ int i = 0, this_cpu; + struct array_cache *ac; + unsigned long flags; + + for_each_online_node(i) { + ac = alien[i]; + if (ac) { +- spin_lock_irqsave(&ac->lock, flags); +- __drain_alien_cache(cachep, ac, i); +- spin_unlock_irqrestore(&ac->lock, flags); ++ slab_spin_lock_irqsave(&ac->lock, flags, this_cpu); ++ __drain_alien_cache(cachep, ac, i, &this_cpu); ++ slab_spin_unlock_irqrestore(&ac->lock, flags, this_cpu); + } + } + } + +-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) ++static inline int ++cache_free_alien(struct kmem_cache *cachep, void *objp, int *this_cpu) + { + struct slab *slabp = virt_to_slab(objp); + int nodeid = slabp->nodeid; +@@ -1126,7 +1195,7 @@ static inline int cache_free_alien(struc + struct array_cache *alien = NULL; + int node; + +- node = numa_node_id(); ++ node = cpu_to_node(*this_cpu); + + /* + * Make sure we are not freeing a object from another node to the array +@@ -1142,30 +1211,31 @@ static inline int cache_free_alien(struc + spin_lock(&alien->lock); + if (unlikely(alien->avail == alien->limit)) { + STATS_INC_ACOVERFLOW(cachep); +- __drain_alien_cache(cachep, alien, nodeid); ++ __drain_alien_cache(cachep, alien, nodeid, this_cpu); + } + alien->entry[alien->avail++] = objp; + spin_unlock(&alien->lock); + } else { + spin_lock(&(cachep->nodelists[nodeid])->list_lock); +- free_block(cachep, &objp, 1, nodeid); ++ free_block(cachep, &objp, 1, nodeid, this_cpu); + spin_unlock(&(cachep->nodelists[nodeid])->list_lock); + } + return 1; + } + #endif + +-static void __cpuinit cpuup_canceled(long cpu) ++static void __cpuinit cpuup_canceled(int cpu) + { + struct kmem_cache *cachep; + struct kmem_list3 *l3 = NULL; + int node = cpu_to_node(cpu); +- node_to_cpumask_ptr(mask, node); ++ const struct cpumask *mask = cpumask_of_node(node); + + list_for_each_entry(cachep, &cache_chain, next) { + struct array_cache *nc; + struct array_cache *shared; + struct array_cache **alien; ++ int orig_cpu = cpu; + + /* cpu is dead; no one can alloc from it. 
*/ + nc = cachep->array[cpu]; +@@ -1180,7 +1250,8 @@ static void __cpuinit cpuup_canceled(lon + /* Free limit for this kmem_list3 */ + l3->free_limit -= cachep->batchcount; + if (nc) +- free_block(cachep, nc->entry, nc->avail, node); ++ free_block(cachep, nc->entry, nc->avail, node, ++ &cpu); + + if (!cpus_empty(*mask)) { + spin_unlock_irq(&l3->list_lock); +@@ -1190,7 +1261,7 @@ static void __cpuinit cpuup_canceled(lon + shared = l3->shared; + if (shared) { + free_block(cachep, shared->entry, +- shared->avail, node); ++ shared->avail, node, &cpu); + l3->shared = NULL; + } + +@@ -1206,6 +1277,7 @@ static void __cpuinit cpuup_canceled(lon + } + free_array_cache: + kfree(nc); ++ BUG_ON(cpu != orig_cpu); + } + /* + * In the previous loop, all the objects were freed to +@@ -1220,7 +1292,7 @@ free_array_cache: + } + } + +-static int __cpuinit cpuup_prepare(long cpu) ++static int __cpuinit cpuup_prepare(int cpu) + { + struct kmem_cache *cachep; + struct kmem_list3 *l3 = NULL; +@@ -1328,10 +1400,19 @@ static int __cpuinit cpuup_callback(stru + long cpu = (long)hcpu; + int err = 0; + ++ + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + mutex_lock(&cache_chain_mutex); ++ /* ++ * lock/unlock cycle to push any holders away -- no new ones ++ * can come in due to the cpu still being offline. ++ * ++ * XXX -- weird case anyway, can it happen? ++ */ ++ slab_irq_disable_this_rt(cpu); ++ slab_irq_enable_rt(cpu); + err = cpuup_prepare(cpu); + mutex_unlock(&cache_chain_mutex); + break; +@@ -1371,10 +1452,14 @@ static int __cpuinit cpuup_callback(stru + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + mutex_lock(&cache_chain_mutex); ++ slab_irq_disable_this_rt(cpu); + cpuup_canceled(cpu); ++ slab_irq_enable_rt(cpu); + mutex_unlock(&cache_chain_mutex); + break; + } ++ ++ + return err ? 
NOTIFY_BAD : NOTIFY_OK; + } + +@@ -1389,11 +1474,13 @@ static void init_list(struct kmem_cache + int nodeid) + { + struct kmem_list3 *ptr; ++ int this_cpu; + + ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); + BUG_ON(!ptr); + +- local_irq_disable(); ++ WARN_ON(spin_is_locked(&list->list_lock)); ++ slab_irq_disable(this_cpu); + memcpy(ptr, list, sizeof(struct kmem_list3)); + /* + * Do not assume that spinlocks can be initialized via memcpy: +@@ -1402,7 +1489,7 @@ static void init_list(struct kmem_cache + + MAKE_ALL_LISTS(cachep, ptr, nodeid); + cachep->nodelists[nodeid] = ptr; +- local_irq_enable(); ++ slab_irq_enable(this_cpu); + } + + /* +@@ -1434,6 +1521,12 @@ void __init kmem_cache_init(void) + int order; + int node; + ++#ifdef CONFIG_PREEMPT_RT ++ for_each_possible_cpu(i) { ++ INIT_LIST_HEAD(&__get_cpu_var_locked(slab, i)); ++ } ++#endif ++ + if (num_possible_nodes() == 1) { + use_alien_caches = 0; + numa_platform = 0; +@@ -1565,36 +1658,34 @@ void __init kmem_cache_init(void) + /* 4) Replace the bootstrap head arrays */ + { + struct array_cache *ptr; ++ int this_cpu; + + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + +- local_irq_disable(); +- BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); +- memcpy(ptr, cpu_cache_get(&cache_cache), +- sizeof(struct arraycache_init)); ++ slab_irq_disable(this_cpu); ++ BUG_ON(cpu_cache_get(&cache_cache, this_cpu) != &initarray_cache.cache); ++ memcpy(ptr, cpu_cache_get(&cache_cache, this_cpu), ++ sizeof(struct arraycache_init)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->lock); +- +- cache_cache.array[smp_processor_id()] = ptr; +- local_irq_enable(); ++ cache_cache.array[this_cpu] = ptr; ++ slab_irq_enable(this_cpu); + + ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + +- local_irq_disable(); +- BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) +- != &initarray_generic.cache); +- memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), +- sizeof(struct arraycache_init)); ++ slab_irq_disable(this_cpu); ++ BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu) ++ != &initarray_generic.cache); ++ memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep, this_cpu), ++ sizeof(struct arraycache_init)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->lock); +- +- malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = +- ptr; +- local_irq_enable(); ++ malloc_sizes[INDEX_AC].cs_cachep->array[this_cpu] = ptr; ++ slab_irq_enable(this_cpu); + } + /* 5) Replace the bootstrap kmem_list3's */ + { +@@ -1680,7 +1771,7 @@ static void *kmem_getpages(struct kmem_c + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + flags |= __GFP_RECLAIMABLE; + +- page = alloc_pages_node(nodeid, flags, cachep->gfporder); ++ page = alloc_pages_node(nodeid, flags & ~__GFP_NOTRACK, cachep->gfporder); + if (!page) + return NULL; + +@@ -1693,24 +1784,39 @@ static void *kmem_getpages(struct kmem_c + NR_SLAB_UNRECLAIMABLE, nr_pages); + for (i = 0; i < nr_pages; i++) + __SetPageSlab(page + i); ++ ++ if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { ++ kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); ++ ++ if (cachep->ctor) ++ kmemcheck_mark_uninitialized_pages(page, nr_pages); ++ else ++ kmemcheck_mark_unallocated_pages(page, nr_pages); ++ } ++ + return page_address(page); + } + + /* + * Interface to system's page release. 
+ */ +-static void kmem_freepages(struct kmem_cache *cachep, void *addr) ++static void kmem_freepages(struct kmem_cache *cachep, void *addr, int cpu) + { + unsigned long i = (1 << cachep->gfporder); +- struct page *page = virt_to_page(addr); ++ struct page *page, *basepage = virt_to_page(addr); + const unsigned long nr_freed = i; + ++ page = basepage; ++ ++ kmemcheck_free_shadow(page, cachep->gfporder); ++ + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + sub_zone_page_state(page_zone(page), + NR_SLAB_RECLAIMABLE, nr_freed); + else + sub_zone_page_state(page_zone(page), + NR_SLAB_UNRECLAIMABLE, nr_freed); ++ + while (i--) { + BUG_ON(!PageSlab(page)); + __ClearPageSlab(page); +@@ -1718,6 +1824,13 @@ static void kmem_freepages(struct kmem_c + } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += nr_freed; ++ ++#ifdef CONFIG_PREEMPT_RT ++ if (cpu >= 0) { ++ basepage->index = cachep->gfporder; ++ list_add(&basepage->lru, &__get_cpu_var_locked(slab, cpu)); ++ } else ++#endif + free_pages((unsigned long)addr, cachep->gfporder); + } + +@@ -1726,7 +1839,7 @@ static void kmem_rcu_free(struct rcu_hea + struct slab_rcu *slab_rcu = (struct slab_rcu *)head; + struct kmem_cache *cachep = slab_rcu->cachep; + +- kmem_freepages(cachep, slab_rcu->addr); ++ kmem_freepages(cachep, slab_rcu->addr, -1); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slab_rcu); + } +@@ -1746,7 +1859,7 @@ static void store_stackinfo(struct kmem_ + + *addr++ = 0x12345678; + *addr++ = caller; +- *addr++ = smp_processor_id(); ++ *addr++ = raw_smp_processor_id(); + size -= 3 * sizeof(unsigned long); + { + unsigned long *sptr = &caller; +@@ -1936,6 +2049,10 @@ static void slab_destroy_debugcheck(stru + } + #endif + ++static void ++__cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu); ++ ++ + /** + * slab_destroy - destroy and release all objects in a slab + * @cachep: cache pointer being destroyed +@@ -1945,7 +2062,8 @@ static void slab_destroy_debugcheck(stru + * Before calling the slab must have been unlinked from the cache. The + * cache-lock is not held/needed. 
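
Editorial sketch (not part of the patch): kmem_freepages() above, when given a valid CPU index on PREEMPT_RT, does not hand the pages back to the page allocator immediately. It parks the compound page on the per-CPU locked list that kmem_cache_init() initialises, stashing the order in page->index, and leaves the actual free for later; the RCU path and other callers pass -1 and free right away. The drain side is outside this hunk, so the following is only a hypothetical illustration of what it could look like, using just the fields the patch itself stores (page->index for the order, page->lru for the linkage):

/* hypothetical drain of one CPU's deferred page list; it would have to
 * run under that CPU's slab lock (or after the CPU has been quiesced) */
static void drain_deferred_slab_pages(int cpu)
{
        struct list_head *head = &__get_cpu_var_locked(slab, cpu);
        struct page *page, *tmp;

        list_for_each_entry_safe(page, tmp, head, lru) {
                list_del(&page->lru);
                /* kmem_freepages() stashed the order in page->index */
                __free_pages(page, page->index);
        }
}
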
+ */ +-static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) ++static void ++slab_destroy(struct kmem_cache *cachep, struct slab *slabp, int *this_cpu) + { + void *addr = slabp->s_mem - slabp->colouroff; + +@@ -1958,9 +2076,13 @@ static void slab_destroy(struct kmem_cac + slab_rcu->addr = addr; + call_rcu(&slab_rcu->head, kmem_rcu_free); + } else { +- kmem_freepages(cachep, addr); +- if (OFF_SLAB(cachep)) +- kmem_cache_free(cachep->slabp_cache, slabp); ++ kmem_freepages(cachep, addr, *this_cpu); ++ if (OFF_SLAB(cachep)) { ++ if (this_cpu) ++ __cache_free(cachep->slabp_cache, slabp, this_cpu); ++ else ++ kmem_cache_free(cachep->slabp_cache, slabp); ++ } + } + } + +@@ -2057,6 +2179,8 @@ static size_t calculate_slab_order(struc + + static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) + { ++ int this_cpu; ++ + if (g_cpucache_up == FULL) + return enable_cpucache(cachep); + +@@ -2100,10 +2224,12 @@ static int __init_refok setup_cpu_cache( + jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + +- cpu_cache_get(cachep)->avail = 0; +- cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; +- cpu_cache_get(cachep)->batchcount = 1; +- cpu_cache_get(cachep)->touched = 0; ++ this_cpu = raw_smp_processor_id(); ++ ++ cpu_cache_get(cachep, this_cpu)->avail = 0; ++ cpu_cache_get(cachep, this_cpu)->limit = BOOT_CPUCACHE_ENTRIES; ++ cpu_cache_get(cachep, this_cpu)->batchcount = 1; ++ cpu_cache_get(cachep, this_cpu)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; + return 0; +@@ -2394,19 +2520,19 @@ EXPORT_SYMBOL(kmem_cache_create); + #if DEBUG + static void check_irq_off(void) + { ++/* ++ * On PREEMPT_RT we use locks to protect the per-CPU lists, ++ * and keep interrupts enabled. ++ */ ++#ifndef CONFIG_PREEMPT_RT + BUG_ON(!irqs_disabled()); ++#endif + } + + static void check_irq_on(void) + { ++#ifndef CONFIG_PREEMPT_RT + BUG_ON(irqs_disabled()); +-} +- +-static void check_spinlock_acquired(struct kmem_cache *cachep) +-{ +-#ifdef CONFIG_SMP +- check_irq_off(); +- assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); + #endif + } + +@@ -2421,34 +2547,67 @@ static void check_spinlock_acquired_node + #else + #define check_irq_off() do { } while(0) + #define check_irq_on() do { } while(0) +-#define check_spinlock_acquired(x) do { } while(0) + #define check_spinlock_acquired_node(x, y) do { } while(0) + #endif + +-static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, ++static int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, + int force, int node); + +-static void do_drain(void *arg) ++static void __do_drain(void *arg, int this_cpu) + { + struct kmem_cache *cachep = arg; ++ int node = cpu_to_node(this_cpu); + struct array_cache *ac; +- int node = numa_node_id(); + + check_irq_off(); +- ac = cpu_cache_get(cachep); ++ ac = cpu_cache_get(cachep, this_cpu); + spin_lock(&cachep->nodelists[node]->list_lock); +- free_block(cachep, ac->entry, ac->avail, node); ++ free_block(cachep, ac->entry, ac->avail, node, &this_cpu); + spin_unlock(&cachep->nodelists[node]->list_lock); + ac->avail = 0; + } + ++#ifdef CONFIG_PREEMPT_RT ++static void do_drain(void *arg, int this_cpu) ++{ ++ __do_drain(arg, this_cpu); ++} ++#else ++static void do_drain(void *arg) ++{ ++ __do_drain(arg, smp_processor_id()); ++} ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * execute func() for all CPUs. 
On PREEMPT_RT we dont actually have ++ * to run on the remote CPUs - we only have to take their CPU-locks. ++ * (This is a rare operation, so cacheline bouncing is not an issue.) ++ */ ++static void ++slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg) ++{ ++ unsigned int i; ++ ++ check_irq_on(); ++ for_each_online_cpu(i) { ++ spin_lock(&__get_cpu_lock(slab, i)); ++ func(arg, i); ++ spin_unlock(&__get_cpu_lock(slab, i)); ++ } ++} ++#else ++# define slab_on_each_cpu(func, cachep) on_each_cpu(func, cachep, 1) ++#endif ++ + static void drain_cpu_caches(struct kmem_cache *cachep) + { + struct kmem_list3 *l3; + int node; + +- on_each_cpu(do_drain, cachep, 1); ++ slab_on_each_cpu(do_drain, cachep); + check_irq_on(); + for_each_online_node(node) { + l3 = cachep->nodelists[node]; +@@ -2473,16 +2632,16 @@ static int drain_freelist(struct kmem_ca + struct kmem_list3 *l3, int tofree) + { + struct list_head *p; +- int nr_freed; ++ int nr_freed, this_cpu; + struct slab *slabp; + + nr_freed = 0; + while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { + +- spin_lock_irq(&l3->list_lock); ++ slab_spin_lock_irq(&l3->list_lock, this_cpu); + p = l3->slabs_free.prev; + if (p == &l3->slabs_free) { +- spin_unlock_irq(&l3->list_lock); ++ slab_spin_unlock_irq(&l3->list_lock, this_cpu); + goto out; + } + +@@ -2491,13 +2650,9 @@ static int drain_freelist(struct kmem_ca + BUG_ON(slabp->inuse); + #endif + list_del(&slabp->list); +- /* +- * Safe to drop the lock. The slab is no longer linked +- * to the cache. +- */ + l3->free_objects -= cache->num; +- spin_unlock_irq(&l3->list_lock); +- slab_destroy(cache, slabp); ++ slab_destroy(cache, slabp, &this_cpu); ++ slab_spin_unlock_irq(&l3->list_lock, this_cpu); + nr_freed++; + } + out: +@@ -2753,8 +2908,8 @@ static void slab_map_pages(struct kmem_c + * Grow (by 1) the number of slabs within a cache. This is called by + * kmem_cache_alloc() when there are no active objs left in a cache. 
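
Editorial sketch (not part of the patch): slab_on_each_cpu() and the __do_drain()/do_drain() pair above establish the pattern for any per-CPU callback in this file. The worker takes an explicit CPU argument, and a thin wrapper adapts it to the calling convention of the configuration: on_each_cpu() (IPI, callback runs on every CPU with IRQs off) without PREEMPT_RT, versus a local walk that takes each CPU's slab lock with PREEMPT_RT. A new callback would follow the same shape; the names below are hypothetical:

static void __do_touch(void *arg, int this_cpu)
{
        struct kmem_cache *cachep = arg;

        /* safe: either IRQs are off on this_cpu (!PREEMPT_RT) or
         * this_cpu's slab lock is held by slab_on_each_cpu() (PREEMPT_RT) */
        cpu_cache_get(cachep, this_cpu)->touched = 0;
}

#ifdef CONFIG_PREEMPT_RT
static void do_touch(void *arg, int this_cpu)
{
        __do_touch(arg, this_cpu);
}
#else
static void do_touch(void *arg)
{
        __do_touch(arg, smp_processor_id());
}
#endif

/* caller side, identical in both configurations:
 *      slab_on_each_cpu(do_touch, cachep);
 */
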
+ */ +-static int cache_grow(struct kmem_cache *cachep, +- gfp_t flags, int nodeid, void *objp) ++static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, ++ void *objp, int *this_cpu) + { + struct slab *slabp; + size_t offset; +@@ -2782,8 +2937,7 @@ static int cache_grow(struct kmem_cache + + offset *= cachep->colour_off; + +- if (local_flags & __GFP_WAIT) +- local_irq_enable(); ++ slab_irq_enable_GFP_WAIT(local_flags, this_cpu); + + /* + * The test for missing atomic flag is performed here, rather than +@@ -2812,8 +2966,8 @@ static int cache_grow(struct kmem_cache + + cache_init_objs(cachep, slabp); + +- if (local_flags & __GFP_WAIT) +- local_irq_disable(); ++ slab_irq_disable_GFP_WAIT(local_flags, this_cpu); ++ + check_irq_off(); + spin_lock(&l3->list_lock); + +@@ -2824,10 +2978,9 @@ static int cache_grow(struct kmem_cache + spin_unlock(&l3->list_lock); + return 1; + opps1: +- kmem_freepages(cachep, objp); ++ kmem_freepages(cachep, objp, -1); + failed: +- if (local_flags & __GFP_WAIT) +- local_irq_disable(); ++ slab_irq_disable_GFP_WAIT(local_flags, this_cpu); + return 0; + } + +@@ -2949,7 +3102,8 @@ bad: + #define check_slabp(x,y) do { } while(0) + #endif + +-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) ++static void * ++cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) + { + int batchcount; + struct kmem_list3 *l3; +@@ -2959,7 +3113,7 @@ static void *cache_alloc_refill(struct k + retry: + check_irq_off(); + node = numa_node_id(); +- ac = cpu_cache_get(cachep); ++ ac = cpu_cache_get(cachep, *this_cpu); + batchcount = ac->batchcount; + if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { + /* +@@ -2969,7 +3123,7 @@ retry: + */ + batchcount = BATCHREFILL_LIMIT; + } +- l3 = cachep->nodelists[node]; ++ l3 = cachep->nodelists[cpu_to_node(*this_cpu)]; + + BUG_ON(ac->avail > 0 || !l3); + spin_lock(&l3->list_lock); +@@ -2992,7 +3146,7 @@ retry: + + slabp = list_entry(entry, struct slab, list); + check_slabp(cachep, slabp); +- check_spinlock_acquired(cachep); ++ check_spinlock_acquired_node(cachep, cpu_to_node(*this_cpu)); + + /* + * The slab was either on partial or free list so +@@ -3006,8 +3160,9 @@ retry: + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + +- ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, +- node); ++ ac->entry[ac->avail++] = ++ slab_get_obj(cachep, slabp, ++ cpu_to_node(*this_cpu)); + } + check_slabp(cachep, slabp); + +@@ -3026,10 +3181,10 @@ alloc_done: + + if (unlikely(!ac->avail)) { + int x; +- x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); ++ x = cache_grow(cachep, flags | GFP_THISNODE, cpu_to_node(*this_cpu), NULL, this_cpu); + + /* cache_grow can reenable interrupts, then ac could change. */ +- ac = cpu_cache_get(cachep); ++ ac = cpu_cache_get(cachep, *this_cpu); + if (!x && ac->avail == 0) /* no objects in sight? 
abort */ + return NULL; + +@@ -3116,21 +3271,22 @@ static bool slab_should_failslab(struct + return should_failslab(obj_size(cachep), flags); + } + +-static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) ++static inline void * ++____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) + { + void *objp; + struct array_cache *ac; + + check_irq_off(); + +- ac = cpu_cache_get(cachep); ++ ac = cpu_cache_get(cachep, *this_cpu); + if (likely(ac->avail)) { + STATS_INC_ALLOCHIT(cachep); + ac->touched = 1; + objp = ac->entry[--ac->avail]; + } else { + STATS_INC_ALLOCMISS(cachep); +- objp = cache_alloc_refill(cachep, flags); ++ objp = cache_alloc_refill(cachep, flags, this_cpu); + } + return objp; + } +@@ -3142,7 +3298,8 @@ static inline void *____cache_alloc(stru + * If we are in_interrupt, then process context, including cpusets and + * mempolicy, may not apply and should not be used for allocation policy. + */ +-static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) ++static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags, ++ int *this_cpu) + { + int nid_alloc, nid_here; + +@@ -3154,7 +3311,7 @@ static void *alternate_node_alloc(struct + else if (current->mempolicy) + nid_alloc = slab_node(current->mempolicy); + if (nid_alloc != nid_here) +- return ____cache_alloc_node(cachep, flags, nid_alloc); ++ return ____cache_alloc_node(cachep, flags, nid_alloc, this_cpu); + return NULL; + } + +@@ -3166,7 +3323,7 @@ static void *alternate_node_alloc(struct + * allocator to do its reclaim / fallback magic. We then insert the + * slab into the proper nodelist and then allocate from it. + */ +-static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) ++static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) + { + struct zonelist *zonelist; + gfp_t local_flags; +@@ -3194,7 +3351,8 @@ retry: + cache->nodelists[nid] && + cache->nodelists[nid]->free_objects) { + obj = ____cache_alloc_node(cache, +- flags | GFP_THISNODE, nid); ++ flags | GFP_THISNODE, nid, ++ this_cpu); + if (obj) + break; + } +@@ -3207,20 +3365,21 @@ retry: + * We may trigger various forms of reclaim on the allowed + * set and go into memory reserves if necessary. 
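
Editorial sketch (not part of the patch): slab_irq_enable_GFP_WAIT()/slab_irq_disable_GFP_WAIT(), used in cache_grow() above and again in fallback_alloc() below, replace the open-coded "if (local_flags & __GFP_WAIT) local_irq_enable();" pairs: the section may only be left around the call into the page allocator when the allocation is allowed to sleep. The non-RT shape follows directly from the code being replaced; the RT side is assumed to drop and retake the per-CPU slab lock instead, which is presumably why the callers re-fetch their array_cache afterwards:

/* assumed shape of the helpers (the real ones live earlier in the patch) */
#ifndef CONFIG_PREEMPT_RT
# define slab_irq_enable_GFP_WAIT(flags, this_cpu)                      \
        do { if ((flags) & __GFP_WAIT) local_irq_enable(); } while (0)
# define slab_irq_disable_GFP_WAIT(flags, this_cpu)                     \
        do { if ((flags) & __GFP_WAIT) local_irq_disable(); } while (0)
#else
/* on PREEMPT_RT: assumed to release/reacquire the CPU's slab lock,
 * refreshing *this_cpu since the task may land on another CPU */
#endif
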
+ */ +- if (local_flags & __GFP_WAIT) +- local_irq_enable(); ++ slab_irq_enable_GFP_WAIT(local_flags, this_cpu); ++ + kmem_flagcheck(cache, flags); + obj = kmem_getpages(cache, local_flags, -1); +- if (local_flags & __GFP_WAIT) +- local_irq_disable(); ++ ++ slab_irq_disable_GFP_WAIT(local_flags, this_cpu); ++ + if (obj) { + /* + * Insert into the appropriate per node queues + */ + nid = page_to_nid(virt_to_page(obj)); +- if (cache_grow(cache, flags, nid, obj)) { ++ if (cache_grow(cache, flags, nid, obj, this_cpu)) { + obj = ____cache_alloc_node(cache, +- flags | GFP_THISNODE, nid); ++ flags | GFP_THISNODE, nid, this_cpu); + if (!obj) + /* + * Another processor may allocate the +@@ -3241,7 +3400,7 @@ retry: + * A interface to enable slab creation on nodeid + */ + static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, +- int nodeid) ++ int nodeid, int *this_cpu) + { + struct list_head *entry; + struct slab *slabp; +@@ -3289,11 +3448,11 @@ retry: + + must_grow: + spin_unlock(&l3->list_lock); +- x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); ++ x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, this_cpu); + if (x) + goto retry; + +- return fallback_alloc(cachep, flags); ++ return fallback_alloc(cachep, flags, this_cpu); + + done: + return obj; +@@ -3316,40 +3475,47 @@ __cache_alloc_node(struct kmem_cache *ca + void *caller) + { + unsigned long save_flags; ++ int this_cpu; + void *ptr; + ++ lockdep_trace_alloc(flags); ++ + if (slab_should_failslab(cachep, flags)) + return NULL; + + cache_alloc_debugcheck_before(cachep, flags); +- local_irq_save(save_flags); ++ ++ slab_irq_save(save_flags, this_cpu); + + if (unlikely(nodeid == -1)) +- nodeid = numa_node_id(); ++ nodeid = cpu_to_node(this_cpu); + + if (unlikely(!cachep->nodelists[nodeid])) { + /* Node not bootstrapped yet */ +- ptr = fallback_alloc(cachep, flags); ++ ptr = fallback_alloc(cachep, flags, &this_cpu); + goto out; + } + +- if (nodeid == numa_node_id()) { ++ if (nodeid == cpu_to_node(this_cpu)) { + /* + * Use the locally cached objects if possible. + * However ____cache_alloc does not allow fallback + * to other nodes. It may fail while we still have + * objects on other nodes available. + */ +- ptr = ____cache_alloc(cachep, flags); ++ ptr = ____cache_alloc(cachep, flags, &this_cpu); + if (ptr) + goto out; + } + /* ___cache_alloc_node can fall back to other nodes */ +- ptr = ____cache_alloc_node(cachep, flags, nodeid); ++ ptr = ____cache_alloc_node(cachep, flags, nodeid, &this_cpu); + out: +- local_irq_restore(save_flags); ++ slab_irq_restore(save_flags, this_cpu); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + ++ if (likely(ptr)) ++ kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); ++ + if (unlikely((flags & __GFP_ZERO) && ptr)) + memset(ptr, 0, obj_size(cachep)); + +@@ -3357,33 +3523,33 @@ __cache_alloc_node(struct kmem_cache *ca + } + + static __always_inline void * +-__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) ++__do_cache_alloc(struct kmem_cache *cache, gfp_t flags, int *this_cpu) + { + void *objp; + + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { +- objp = alternate_node_alloc(cache, flags); ++ objp = alternate_node_alloc(cache, flags, this_cpu); + if (objp) + goto out; + } +- objp = ____cache_alloc(cache, flags); + ++ objp = ____cache_alloc(cache, flags, this_cpu); + /* + * We may just have run out of memory on the local node. 
+ * ____cache_alloc_node() knows how to locate memory on other nodes + */ +- if (!objp) +- objp = ____cache_alloc_node(cache, flags, numa_node_id()); +- ++ if (!objp) ++ objp = ____cache_alloc_node(cache, flags, ++ cpu_to_node(*this_cpu), this_cpu); + out: + return objp; + } + #else + + static __always_inline void * +-__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) ++__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu) + { +- return ____cache_alloc(cachep, flags); ++ return ____cache_alloc(cachep, flags, this_cpu); + } + + #endif /* CONFIG_NUMA */ +@@ -3392,18 +3558,24 @@ static __always_inline void * + __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) + { + unsigned long save_flags; ++ int this_cpu; + void *objp; + ++ lockdep_trace_alloc(flags); ++ + if (slab_should_failslab(cachep, flags)) + return NULL; + + cache_alloc_debugcheck_before(cachep, flags); +- local_irq_save(save_flags); +- objp = __do_cache_alloc(cachep, flags); +- local_irq_restore(save_flags); ++ slab_irq_save(save_flags, this_cpu); ++ objp = __do_cache_alloc(cachep, flags, &this_cpu); ++ slab_irq_restore(save_flags, this_cpu); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + prefetchw(objp); + ++ if (likely(objp)) ++ kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); ++ + if (unlikely((flags & __GFP_ZERO) && objp)) + memset(objp, 0, obj_size(cachep)); + +@@ -3414,7 +3586,7 @@ __cache_alloc(struct kmem_cache *cachep, + * Caller needs to acquire correct kmem_list's list_lock + */ + static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, +- int node) ++ int node, int *this_cpu) + { + int i; + struct kmem_list3 *l3; +@@ -3443,7 +3615,7 @@ static void free_block(struct kmem_cache + * a different cache, refer to comments before + * alloc_slabmgmt. + */ +- slab_destroy(cachep, slabp); ++ slab_destroy(cachep, slabp, this_cpu); + } else { + list_add(&slabp->list, &l3->slabs_free); + } +@@ -3457,11 +3629,12 @@ static void free_block(struct kmem_cache + } + } + +-static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) ++static void ++cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac, int *this_cpu) + { + int batchcount; + struct kmem_list3 *l3; +- int node = numa_node_id(); ++ int node = cpu_to_node(*this_cpu); + + batchcount = ac->batchcount; + #if DEBUG +@@ -3483,7 +3656,7 @@ static void cache_flusharray(struct kmem + } + } + +- free_block(cachep, ac->entry, batchcount, node); ++ free_block(cachep, ac->entry, batchcount, node, this_cpu); + free_done: + #if STATS + { +@@ -3512,13 +3685,15 @@ free_done: + * Release an obj back to its cache. If the obj has a constructed state, it must + * be in this state _before_ it is released. Called with disabled ints. + */ +-static inline void __cache_free(struct kmem_cache *cachep, void *objp) ++static void __cache_free(struct kmem_cache *cachep, void *objp, int *this_cpu) + { +- struct array_cache *ac = cpu_cache_get(cachep); ++ struct array_cache *ac = cpu_cache_get(cachep, *this_cpu); + + check_irq_off(); + objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + ++ kmemcheck_slab_free(cachep, objp, obj_size(cachep)); ++ + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which +@@ -3526,7 +3701,7 @@ static inline void __cache_free(struct k + * variable to skip the call, which is mostly likely to be present in + * the cache. 
+ */ +- if (numa_platform && cache_free_alien(cachep, objp)) ++ if (numa_platform && cache_free_alien(cachep, objp, this_cpu)) + return; + + if (likely(ac->avail < ac->limit)) { +@@ -3535,7 +3710,7 @@ static inline void __cache_free(struct k + return; + } else { + STATS_INC_FREEMISS(cachep); +- cache_flusharray(cachep, ac); ++ cache_flusharray(cachep, ac, this_cpu); + ac->entry[ac->avail++] = objp; + } + } +@@ -3550,10 +3725,23 @@ static inline void __cache_free(struct k + */ + void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) + { +- return __cache_alloc(cachep, flags, __builtin_return_address(0)); ++ void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); ++ ++ trace_kmem_cache_alloc(_RET_IP_, ret, ++ obj_size(cachep), cachep->buffer_size, flags); ++ ++ return ret; + } + EXPORT_SYMBOL(kmem_cache_alloc); + ++#ifdef CONFIG_KMEMTRACE ++void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) ++{ ++ return __cache_alloc(cachep, flags, __builtin_return_address(0)); ++} ++EXPORT_SYMBOL(kmem_cache_alloc_notrace); ++#endif ++ + /** + * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. + * @cachep: the cache we're checking against +@@ -3598,23 +3786,46 @@ out: + #ifdef CONFIG_NUMA + void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) + { +- return __cache_alloc_node(cachep, flags, nodeid, +- __builtin_return_address(0)); ++ void *ret = __cache_alloc_node(cachep, flags, nodeid, ++ __builtin_return_address(0)); ++ ++ trace_kmem_cache_alloc_node(_RET_IP_, ret, ++ obj_size(cachep), cachep->buffer_size, ++ flags, nodeid); ++ ++ return ret; + } + EXPORT_SYMBOL(kmem_cache_alloc_node); + ++#ifdef CONFIG_KMEMTRACE ++void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, ++ gfp_t flags, ++ int nodeid) ++{ ++ return __cache_alloc_node(cachep, flags, nodeid, ++ __builtin_return_address(0)); ++} ++EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); ++#endif ++ + static __always_inline void * + __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) + { + struct kmem_cache *cachep; ++ void *ret; + + cachep = kmem_find_general_cachep(size, flags); + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; +- return kmem_cache_alloc_node(cachep, flags, node); ++ ret = kmem_cache_alloc_node_notrace(cachep, flags, node); ++ ++ trace_kmalloc_node((unsigned long) caller, ret, ++ size, cachep->buffer_size, flags, node); ++ ++ return ret; + } + +-#ifdef CONFIG_DEBUG_SLAB ++#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) + void *__kmalloc_node(size_t size, gfp_t flags, int node) + { + return __do_kmalloc_node(size, flags, node, +@@ -3647,6 +3858,7 @@ static __always_inline void *__do_kmallo + void *caller) + { + struct kmem_cache *cachep; ++ void *ret; + + /* If you want to save a few bytes .text space: replace + * __ with kmem_. 
+@@ -3656,11 +3868,16 @@ static __always_inline void *__do_kmallo + cachep = __find_general_cachep(size, flags); + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; +- return __cache_alloc(cachep, flags, caller); ++ ret = __cache_alloc(cachep, flags, caller); ++ ++ trace_kmalloc((unsigned long) caller, ret, ++ size, cachep->buffer_size, flags); ++ ++ return ret; + } + + +-#ifdef CONFIG_DEBUG_SLAB ++#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) + void *__kmalloc(size_t size, gfp_t flags) + { + return __do_kmalloc(size, flags, __builtin_return_address(0)); +@@ -3692,13 +3909,16 @@ EXPORT_SYMBOL(__kmalloc); + void kmem_cache_free(struct kmem_cache *cachep, void *objp) + { + unsigned long flags; ++ int this_cpu; + +- local_irq_save(flags); ++ slab_irq_save(flags, this_cpu); + debug_check_no_locks_freed(objp, obj_size(cachep)); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, obj_size(cachep)); +- __cache_free(cachep, objp); +- local_irq_restore(flags); ++ __cache_free(cachep, objp, &this_cpu); ++ slab_irq_restore(flags, this_cpu); ++ ++ trace_kmem_cache_free(_RET_IP_, objp); + } + EXPORT_SYMBOL(kmem_cache_free); + +@@ -3715,16 +3935,19 @@ void kfree(const void *objp) + { + struct kmem_cache *c; + unsigned long flags; ++ int this_cpu; ++ ++ trace_kfree(_RET_IP_, objp); + + if (unlikely(ZERO_OR_NULL_PTR(objp))) + return; +- local_irq_save(flags); ++ slab_irq_save(flags, this_cpu); + kfree_debugcheck(objp); + c = virt_to_cache(objp); + debug_check_no_locks_freed(objp, obj_size(c)); + debug_check_no_obj_freed(objp, obj_size(c)); +- __cache_free(c, (void *)objp); +- local_irq_restore(flags); ++ __cache_free(c, (void *)objp, &this_cpu); ++ slab_irq_restore(flags, this_cpu); + } + EXPORT_SYMBOL(kfree); + +@@ -3745,7 +3968,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); + */ + static int alloc_kmemlist(struct kmem_cache *cachep) + { +- int node; ++ int node, this_cpu; + struct kmem_list3 *l3; + struct array_cache *new_shared; + struct array_cache **new_alien = NULL; +@@ -3773,11 +3996,11 @@ static int alloc_kmemlist(struct kmem_ca + if (l3) { + struct array_cache *shared = l3->shared; + +- spin_lock_irq(&l3->list_lock); ++ slab_spin_lock_irq(&l3->list_lock, this_cpu); + + if (shared) + free_block(cachep, shared->entry, +- shared->avail, node); ++ shared->avail, node, &this_cpu); + + l3->shared = new_shared; + if (!l3->alien) { +@@ -3786,7 +4009,7 @@ static int alloc_kmemlist(struct kmem_ca + } + l3->free_limit = (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; +- spin_unlock_irq(&l3->list_lock); ++ slab_spin_unlock_irq(&l3->list_lock, this_cpu); + kfree(shared); + free_alien_cache(new_alien); + continue; +@@ -3833,42 +4056,50 @@ struct ccupdate_struct { + struct array_cache *new[NR_CPUS]; + }; + +-static void do_ccupdate_local(void *info) ++static void __do_ccupdate_local(void *info, int this_cpu) + { + struct ccupdate_struct *new = info; + struct array_cache *old; + + check_irq_off(); +- old = cpu_cache_get(new->cachep); ++ old = cpu_cache_get(new->cachep, this_cpu); ++ ++ new->cachep->array[this_cpu] = new->new[this_cpu]; ++ new->new[this_cpu] = old; ++} + +- new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; +- new->new[smp_processor_id()] = old; ++#ifdef CONFIG_PREEMPT_RT ++static void do_ccupdate_local(void *arg, int this_cpu) ++{ ++ __do_ccupdate_local(arg, this_cpu); + } ++#else ++static void do_ccupdate_local(void *arg) ++{ ++ __do_ccupdate_local(arg, smp_processor_id()); ++} ++#endif + + /* Always called with the 
cache_chain_mutex held */ + static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared) + { +- struct ccupdate_struct *new; +- int i; +- +- new = kzalloc(sizeof(*new), GFP_KERNEL); +- if (!new) +- return -ENOMEM; ++ struct ccupdate_struct new; ++ int i, this_cpu; + ++ memset(&new.new, 0, sizeof(new.new)); + for_each_online_cpu(i) { +- new->new[i] = alloc_arraycache(cpu_to_node(i), limit, ++ new.new[i] = alloc_arraycache(cpu_to_node(i), limit, + batchcount); +- if (!new->new[i]) { ++ if (!new.new[i]) { + for (i--; i >= 0; i--) +- kfree(new->new[i]); +- kfree(new); ++ kfree(new.new[i]); + return -ENOMEM; + } + } +- new->cachep = cachep; ++ new.cachep = cachep; + +- on_each_cpu(do_ccupdate_local, (void *)new, 1); ++ slab_on_each_cpu(do_ccupdate_local, (void *)&new); + + check_irq_on(); + cachep->batchcount = batchcount; +@@ -3876,15 +4107,15 @@ static int do_tune_cpucache(struct kmem_ + cachep->shared = shared; + + for_each_online_cpu(i) { +- struct array_cache *ccold = new->new[i]; ++ struct array_cache *ccold = new.new[i]; + if (!ccold) + continue; +- spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); +- free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); +- spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); ++ slab_spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu); ++ free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i), &this_cpu); ++ slab_spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock, this_cpu); + kfree(ccold); + } +- kfree(new); ++ + return alloc_kmemlist(cachep); + } + +@@ -3946,29 +4177,31 @@ static int enable_cpucache(struct kmem_c + * Drain an array if it contains any elements taking the l3 lock only if + * necessary. Note that the l3 listlock also protects the array_cache + * if drain_array() is used on the shared array. ++ * returns non-zero if some work is done + */ +-void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +- struct array_cache *ac, int force, int node) ++int drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, ++ struct array_cache *ac, int force, int node) + { +- int tofree; ++ int tofree, this_cpu; + + if (!ac || !ac->avail) +- return; ++ return 0; + if (ac->touched && !force) { + ac->touched = 0; + } else { +- spin_lock_irq(&l3->list_lock); ++ slab_spin_lock_irq(&l3->list_lock, this_cpu); + if (ac->avail) { + tofree = force ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; +- free_block(cachep, ac->entry, tofree, node); ++ free_block(cachep, ac->entry, tofree, node, &this_cpu); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), + sizeof(void *) * ac->avail); + } +- spin_unlock_irq(&l3->list_lock); ++ slab_spin_unlock_irq(&l3->list_lock, this_cpu); + } ++ return 1; + } + + /** +@@ -3985,11 +4218,12 @@ void drain_array(struct kmem_cache *cach + */ + static void cache_reap(struct work_struct *w) + { ++ int this_cpu = raw_smp_processor_id(), node = cpu_to_node(this_cpu); + struct kmem_cache *searchp; + struct kmem_list3 *l3; +- int node = numa_node_id(); + struct delayed_work *work = + container_of(w, struct delayed_work, work); ++ int work_done = 0; + + if (!mutex_trylock(&cache_chain_mutex)) + /* Give up. Setup the next iteration. 
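
Editorial note (not part of the patch): drain_array() now returns whether it actually freed anything, and cache_reap() sums those results into work_done; the payoff is the rescheduling at the end of the function, in the next hunk, which doubles the reap interval after a pass that found nothing to do. Assuming REAPTIMEOUT_CPUC keeps its mainline value of 2*HZ, the arithmetic is:

/*   (1 + !work_done) * REAPTIMEOUT_CPUC
 *
 *   work_done != 0:  (1 + 0) * 2*HZ  -> next pass in ~2 seconds
 *   work_done == 0:  (1 + 1) * 2*HZ  -> idle caches revisited in ~4 seconds
 */
schedule_delayed_work(work,
        round_jiffies_relative((1 + !work_done) * REAPTIMEOUT_CPUC));
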
*/ +@@ -4005,9 +4239,12 @@ static void cache_reap(struct work_struc + */ + l3 = searchp->nodelists[node]; + +- reap_alien(searchp, l3); ++ work_done += reap_alien(searchp, l3, &this_cpu); + +- drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); ++ node = cpu_to_node(this_cpu); ++ ++ work_done += drain_array(searchp, l3, ++ cpu_cache_get(searchp, this_cpu), 0, node); + + /* + * These are racy checks but it does not matter +@@ -4018,7 +4255,7 @@ static void cache_reap(struct work_struc + + l3->next_reap = jiffies + REAPTIMEOUT_LIST3; + +- drain_array(searchp, l3, l3->shared, 0, node); ++ work_done += drain_array(searchp, l3, l3->shared, 0, node); + + if (l3->free_touched) + l3->free_touched = 0; +@@ -4037,7 +4274,8 @@ next: + next_reap_node(); + out: + /* Set up the next iteration */ +- schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); ++ schedule_delayed_work(work, ++ round_jiffies_relative((1+!work_done) * REAPTIMEOUT_CPUC)); + } + + #ifdef CONFIG_SLABINFO +@@ -4096,7 +4334,7 @@ static int s_show(struct seq_file *m, vo + unsigned long num_slabs, free_objects = 0, shared_avail = 0; + const char *name; + char *error = NULL; +- int node; ++ int this_cpu, node; + struct kmem_list3 *l3; + + active_objs = 0; +@@ -4107,7 +4345,7 @@ static int s_show(struct seq_file *m, vo + continue; + + check_irq_on(); +- spin_lock_irq(&l3->list_lock); ++ slab_spin_lock_irq(&l3->list_lock, this_cpu); + + list_for_each_entry(slabp, &l3->slabs_full, list) { + if (slabp->inuse != cachep->num && !error) +@@ -4132,7 +4370,7 @@ static int s_show(struct seq_file *m, vo + if (l3->shared) + shared_avail += l3->shared->avail; + +- spin_unlock_irq(&l3->list_lock); ++ slab_spin_unlock_irq(&l3->list_lock, this_cpu); + } + num_slabs += active_slabs; + num_objs = num_slabs * cachep->num; +@@ -4341,7 +4579,7 @@ static int leaks_show(struct seq_file *m + struct kmem_list3 *l3; + const char *name; + unsigned long *n = m->private; +- int node; ++ int node, this_cpu; + int i; + + if (!(cachep->flags & SLAB_STORE_USER)) +@@ -4359,13 +4597,13 @@ static int leaks_show(struct seq_file *m + continue; + + check_irq_on(); +- spin_lock_irq(&l3->list_lock); ++ slab_spin_lock_irq(&l3->list_lock, this_cpu); + + list_for_each_entry(slabp, &l3->slabs_full, list) + handle_slab(n, cachep, slabp); + list_for_each_entry(slabp, &l3->slabs_partial, list) + handle_slab(n, cachep, slabp); +- spin_unlock_irq(&l3->list_lock); ++ slab_spin_unlock_irq(&l3->list_lock, this_cpu); + } + name = cachep->name; + if (n[0] == n[1]) { +Index: linux-2.6-tip/mm/slob.c +=================================================================== +--- linux-2.6-tip.orig/mm/slob.c ++++ linux-2.6-tip/mm/slob.c +@@ -65,6 +65,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -463,27 +464,38 @@ void *__kmalloc_node(size_t size, gfp_t + { + unsigned int *m; + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); ++ void *ret; ++ ++ lockdep_trace_alloc(gfp); + + if (size < PAGE_SIZE - align) { + if (!size) + return ZERO_SIZE_PTR; + + m = slob_alloc(size + align, gfp, align, node); ++ + if (!m) + return NULL; + *m = size; +- return (void *)m + align; ++ ret = (void *)m + align; ++ ++ trace_kmalloc_node(_RET_IP_, ret, ++ size, size + align, gfp, node); + } else { +- void *ret; ++ unsigned int order = get_order(size); + +- ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); ++ ret = slob_new_page(gfp | __GFP_COMP, order, node); + if (ret) { + struct page *page; + page = virt_to_page(ret); + page->private = size; + } +- 
return ret; ++ ++ trace_kmalloc_node(_RET_IP_, ret, ++ size, PAGE_SIZE << order, gfp, node); + } ++ ++ return ret; + } + EXPORT_SYMBOL(__kmalloc_node); + +@@ -491,6 +503,8 @@ void kfree(const void *block) + { + struct slob_page *sp; + ++ trace_kfree(_RET_IP_, block); ++ + if (unlikely(ZERO_OR_NULL_PTR(block))) + return; + +@@ -570,10 +584,17 @@ void *kmem_cache_alloc_node(struct kmem_ + { + void *b; + +- if (c->size < PAGE_SIZE) ++ if (c->size < PAGE_SIZE) { + b = slob_alloc(c->size, flags, c->align, node); +- else ++ trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, ++ SLOB_UNITS(c->size) * SLOB_UNIT, ++ flags, node); ++ } else { + b = slob_new_page(flags, get_order(c->size), node); ++ trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, ++ PAGE_SIZE << get_order(c->size), ++ flags, node); ++ } + + if (c->ctor) + c->ctor(b); +@@ -609,6 +630,8 @@ void kmem_cache_free(struct kmem_cache * + } else { + __kmem_cache_free(b, c->size); + } ++ ++ trace_kmem_cache_free(_RET_IP_, b); + } + EXPORT_SYMBOL(kmem_cache_free); + +Index: linux-2.6-tip/mm/slub.c +=================================================================== +--- linux-2.6-tip.orig/mm/slub.c ++++ linux-2.6-tip/mm/slub.c +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -25,6 +26,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -145,7 +147,7 @@ + SLAB_TRACE | SLAB_DESTROY_BY_RCU) + + #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ +- SLAB_CACHE_DMA) ++ SLAB_CACHE_DMA | SLAB_NOTRACK) + + #ifndef ARCH_KMALLOC_MINALIGN + #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +@@ -1069,6 +1071,8 @@ static inline struct page *alloc_slab_pa + { + int order = oo_order(oo); + ++ flags |= __GFP_NOTRACK; ++ + if (node == -1) + return alloc_pages(flags, order); + else +@@ -1096,6 +1100,24 @@ static struct page *allocate_slab(struct + + stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + } ++ ++ if (kmemcheck_enabled ++ && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) ++ { ++ int pages = 1 << oo_order(oo); ++ ++ kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); ++ ++ /* ++ * Objects from caches that have a constructor don't get ++ * cleared when they're allocated, so we need to do it here. ++ */ ++ if (s->ctor) ++ kmemcheck_mark_uninitialized_pages(page, pages); ++ else ++ kmemcheck_mark_unallocated_pages(page, pages); ++ } ++ + page->objects = oo_objects(oo); + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? +@@ -1169,6 +1191,8 @@ static void __free_slab(struct kmem_cach + __ClearPageSlubDebug(page); + } + ++ kmemcheck_free_shadow(page, compound_order(page)); ++ + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, +@@ -1599,6 +1623,7 @@ static __always_inline void *slab_alloc( + unsigned long flags; + unsigned int objsize; + ++ lockdep_trace_alloc(gfpflags); + might_sleep_if(gfpflags & __GFP_WAIT); + + if (should_failslab(s->objsize, gfpflags)) +@@ -1621,23 +1646,51 @@ static __always_inline void *slab_alloc( + if (unlikely((gfpflags & __GFP_ZERO) && object)) + memset(object, 0, objsize); + ++ kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); + return object; + } + + void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) + { +- return slab_alloc(s, gfpflags, -1, _RET_IP_); ++ void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); ++ ++ trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); ++ ++ return ret; + } + EXPORT_SYMBOL(kmem_cache_alloc); + ++#ifdef CONFIG_KMEMTRACE ++void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) ++{ ++ return slab_alloc(s, gfpflags, -1, _RET_IP_); ++} ++EXPORT_SYMBOL(kmem_cache_alloc_notrace); ++#endif ++ + #ifdef CONFIG_NUMA + void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) + { +- return slab_alloc(s, gfpflags, node, _RET_IP_); ++ void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); ++ ++ trace_kmem_cache_alloc_node(_RET_IP_, ret, ++ s->objsize, s->size, gfpflags, node); ++ ++ return ret; + } + EXPORT_SYMBOL(kmem_cache_alloc_node); + #endif + ++#ifdef CONFIG_KMEMTRACE ++void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, ++ gfp_t gfpflags, ++ int node) ++{ ++ return slab_alloc(s, gfpflags, node, _RET_IP_); ++} ++EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); ++#endif ++ + /* + * Slow patch handling. This may still be called frequently since objects + * have a longer lifetime than the cpu slabs in most processing loads. 
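
Editorial sketch (not part of the patch): the tracing hunks in slab.c and slub.c follow one convention. The exported entry point calls the internal allocator and then fires the matching tracepoint with the call site, the returned object, the object size, the cache's allocated size and the gfp flags; a *_notrace sibling, built under CONFIG_KMEMTRACE (and presumably mapped back to the traced version in the headers otherwise), skips the event so that kmalloc(), which emits trace_kmalloc()/trace_kmalloc_node() with the size the caller actually requested, does not report the same allocation twice. The pairing, collected in one place for orientation (mirroring the SLUB hunk above):

void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
        void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);

        /* call site, object, object size, cache entry size, gfp flags */
        trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);

        return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc);

#ifdef CONFIG_KMEMTRACE
/* identical allocation path, no event: kmalloc() uses this variant and
 * emits trace_kmalloc() itself with the caller's requested size */
void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
{
        return slab_alloc(s, gfpflags, -1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_alloc_notrace);
#endif
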
+@@ -1725,6 +1778,7 @@ static __always_inline void slab_free(st + + local_irq_save(flags); + c = get_cpu_slab(s, smp_processor_id()); ++ kmemcheck_slab_free(s, object, c->objsize); + debug_check_no_locks_freed(object, c->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, s->objsize); +@@ -1745,6 +1799,8 @@ void kmem_cache_free(struct kmem_cache * + page = virt_to_head_page(x); + + slab_free(s, page, x, _RET_IP_); ++ ++ trace_kmem_cache_free(_RET_IP_, x); + } + EXPORT_SYMBOL(kmem_cache_free); + +@@ -2478,7 +2534,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); + * Kmalloc subsystem + *******************************************************************/ + +-struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; ++struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; + EXPORT_SYMBOL(kmalloc_caches); + + static int __init setup_slub_min_order(char *str) +@@ -2540,7 +2596,7 @@ panic: + } + + #ifdef CONFIG_ZONE_DMA +-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; ++static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; + + static void sysfs_add_func(struct work_struct *w) + { +@@ -2586,7 +2642,8 @@ static noinline struct kmem_cache *dma_k + + if (!s || !text || !kmem_cache_open(s, flags, text, + realsize, ARCH_KMALLOC_MINALIGN, +- SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { ++ SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED, ++ NULL)) { + kfree(s); + kfree(text); + goto unlock_out; +@@ -2660,8 +2717,9 @@ static struct kmem_cache *get_slab(size_ + void *__kmalloc(size_t size, gfp_t flags) + { + struct kmem_cache *s; ++ void *ret; + +- if (unlikely(size > PAGE_SIZE)) ++ if (unlikely(size > SLUB_MAX_SIZE)) + return kmalloc_large(size, flags); + + s = get_slab(size, flags); +@@ -2669,15 +2727,20 @@ void *__kmalloc(size_t size, gfp_t flags + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + +- return slab_alloc(s, flags, -1, _RET_IP_); ++ ret = slab_alloc(s, flags, -1, _RET_IP_); ++ ++ trace_kmalloc(_RET_IP_, ret, size, s->size, flags); ++ ++ return ret; + } + EXPORT_SYMBOL(__kmalloc); + + static void *kmalloc_large_node(size_t size, gfp_t flags, int node) + { +- struct page *page = alloc_pages_node(node, flags | __GFP_COMP, +- get_order(size)); ++ struct page *page; + ++ flags |= __GFP_COMP | __GFP_NOTRACK; ++ page = alloc_pages_node(node, flags, get_order(size)); + if (page) + return page_address(page); + else +@@ -2688,16 +2751,28 @@ static void *kmalloc_large_node(size_t s + void *__kmalloc_node(size_t size, gfp_t flags, int node) + { + struct kmem_cache *s; ++ void *ret; ++ ++ if (unlikely(size > SLUB_MAX_SIZE)) { ++ ret = kmalloc_large_node(size, flags, node); + +- if (unlikely(size > PAGE_SIZE)) +- return kmalloc_large_node(size, flags, node); ++ trace_kmalloc_node(_RET_IP_, ret, ++ size, PAGE_SIZE << get_order(size), ++ flags, node); ++ ++ return ret; ++ } + + s = get_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + +- return slab_alloc(s, flags, node, _RET_IP_); ++ ret = slab_alloc(s, flags, node, _RET_IP_); ++ ++ trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); ++ ++ return ret; + } + EXPORT_SYMBOL(__kmalloc_node); + #endif +@@ -2746,6 +2821,8 @@ void kfree(const void *x) + struct page *page; + void *object = (void *)x; + ++ trace_kfree(_RET_IP_, x); ++ + if (unlikely(ZERO_OR_NULL_PTR(x))) + return; + +@@ -2989,7 +3066,7 @@ void __init kmem_cache_init(void) + caches++; + } + +- for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { ++ for (i = KMALLOC_SHIFT_LOW; i < 
SLUB_PAGE_SHIFT; i++) { + create_kmalloc_cache(&kmalloc_caches[i], + "kmalloc", 1 << i, GFP_KERNEL); + caches++; +@@ -3026,7 +3103,7 @@ void __init kmem_cache_init(void) + slab_state = UP; + + /* Provide the correct kmalloc names now that the caches are up */ +- for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) ++ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) + kmalloc_caches[i]. name = + kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + +@@ -3225,8 +3302,9 @@ static struct notifier_block __cpuinitda + void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) + { + struct kmem_cache *s; ++ void *ret; + +- if (unlikely(size > PAGE_SIZE)) ++ if (unlikely(size > SLUB_MAX_SIZE)) + return kmalloc_large(size, gfpflags); + + s = get_slab(size, gfpflags); +@@ -3234,15 +3312,21 @@ void *__kmalloc_track_caller(size_t size + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + +- return slab_alloc(s, gfpflags, -1, caller); ++ ret = slab_alloc(s, gfpflags, -1, caller); ++ ++ /* Honor the call site pointer we recieved. */ ++ trace_kmalloc(caller, ret, size, s->size, gfpflags); ++ ++ return ret; + } + + void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + int node, unsigned long caller) + { + struct kmem_cache *s; ++ void *ret; + +- if (unlikely(size > PAGE_SIZE)) ++ if (unlikely(size > SLUB_MAX_SIZE)) + return kmalloc_large_node(size, gfpflags, node); + + s = get_slab(size, gfpflags); +@@ -3250,7 +3334,12 @@ void *__kmalloc_node_track_caller(size_t + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + +- return slab_alloc(s, gfpflags, node, caller); ++ ret = slab_alloc(s, gfpflags, node, caller); ++ ++ /* Honor the call site pointer we recieved. */ ++ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); ++ ++ return ret; + } + + #ifdef CONFIG_SLUB_DEBUG +@@ -4305,6 +4394,8 @@ static char *create_unique_id(struct kme + *p++ = 'a'; + if (s->flags & SLAB_DEBUG_FREE) + *p++ = 'F'; ++ if (!(s->flags & SLAB_NOTRACK)) ++ *p++ = 't'; + if (p != name + 1) + *p++ = '-'; + p += sprintf(p, "%07d", s->size); +Index: linux-2.6-tip/mm/swapfile.c +=================================================================== +--- linux-2.6-tip.orig/mm/swapfile.c ++++ linux-2.6-tip/mm/swapfile.c +@@ -585,13 +585,14 @@ int free_swap_and_cache(swp_entry_t entr + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, entry) == 1) { ++ spin_unlock(&swap_lock); + page = find_get_page(&swapper_space, entry.val); + if (page && !trylock_page(page)) { + page_cache_release(page); + page = NULL; + } +- } +- spin_unlock(&swap_lock); ++ } else ++ spin_unlock(&swap_lock); + } + if (page) { + /* +@@ -1649,7 +1650,7 @@ SYSCALL_DEFINE2(swapon, const char __use + union swap_header *swap_header = NULL; + unsigned int nr_good_pages = 0; + int nr_extents = 0; +- sector_t span; ++ sector_t uninitialized_var(span); + unsigned long maxpages = 1; + unsigned long swapfilepages; + unsigned short *swap_map = NULL; +Index: linux-2.6-tip/mm/util.c +=================================================================== +--- linux-2.6-tip.orig/mm/util.c ++++ linux-2.6-tip/mm/util.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + + /** +@@ -206,3 +207,18 @@ int __attribute__((weak)) get_user_pages + return ret; + } + EXPORT_SYMBOL_GPL(get_user_pages_fast); ++ ++/* Tracepoints definitions. 
*/ ++DEFINE_TRACE(kmalloc); ++DEFINE_TRACE(kmem_cache_alloc); ++DEFINE_TRACE(kmalloc_node); ++DEFINE_TRACE(kmem_cache_alloc_node); ++DEFINE_TRACE(kfree); ++DEFINE_TRACE(kmem_cache_free); ++ ++EXPORT_TRACEPOINT_SYMBOL(kmalloc); ++EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); ++EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); ++EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); ++EXPORT_TRACEPOINT_SYMBOL(kfree); ++EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +Index: linux-2.6-tip/mm/vmalloc.c +=================================================================== +--- linux-2.6-tip.orig/mm/vmalloc.c ++++ linux-2.6-tip/mm/vmalloc.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, un + * + * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + */ +-static int vmap_page_range(unsigned long start, unsigned long end, +- pgprot_t prot, struct page **pages) ++static int vmap_page_range_noflush(unsigned long start, unsigned long end, ++ pgprot_t prot, struct page **pages) + { + pgd_t *pgd; + unsigned long next; +@@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long + if (err) + break; + } while (pgd++, addr = next, addr != end); +- flush_cache_vmap(start, end); + + if (unlikely(err)) + return err; + return nr; + } + ++static int vmap_page_range(unsigned long start, unsigned long end, ++ pgprot_t prot, struct page **pages) ++{ ++ int ret; ++ ++ ret = vmap_page_range_noflush(start, end, prot, pages); ++ flush_cache_vmap(start, end); ++ return ret; ++} ++ + static inline int is_vmalloc_or_module_addr(const void *x) + { + /* +@@ -990,6 +1000,32 @@ void *vm_map_ram(struct page **pages, un + } + EXPORT_SYMBOL(vm_map_ram); + ++/** ++ * vm_area_register_early - register vmap area early during boot ++ * @vm: vm_struct to register ++ * @align: requested alignment ++ * ++ * This function is used to register kernel vm area before ++ * vmalloc_init() is called. @vm->size and @vm->flags should contain ++ * proper values on entry and other fields should be zero. On return, ++ * vm->addr contains the allocated address. ++ * ++ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. ++ */ ++void __init vm_area_register_early(struct vm_struct *vm, size_t align) ++{ ++ static size_t vm_init_off __initdata; ++ unsigned long addr; ++ ++ addr = ALIGN(VMALLOC_START + vm_init_off, align); ++ vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; ++ ++ vm->addr = (void *)addr; ++ ++ vm->next = vmlist; ++ vmlist = vm; ++} ++ + void __init vmalloc_init(void) + { + struct vmap_area *va; +@@ -1017,6 +1053,58 @@ void __init vmalloc_init(void) + vmap_initialized = true; + } + ++/** ++ * map_kernel_range_noflush - map kernel VM area with the specified pages ++ * @addr: start of the VM area to map ++ * @size: size of the VM area to map ++ * @prot: page protection flags to use ++ * @pages: pages to map ++ * ++ * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size ++ * specify should have been allocated using get_vm_area() and its ++ * friends. ++ * ++ * NOTE: ++ * This function does NOT do any cache flushing. The caller is ++ * responsible for calling flush_cache_vmap() on to-be-mapped areas ++ * before calling this function. ++ * ++ * RETURNS: ++ * The number of pages mapped on success, -errno on failure. 
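
Editorial sketch (not part of the patch): vm_area_register_early() lets early boot code, and only early boot code as its kerneldoc warns, claim part of the vmalloc address space before vmalloc_init() has run, by carving addresses out of VMALLOC_START and chaining the caller's vm_struct onto vmlist directly. A hypothetical early caller, following the contract stated in the kerneldoc above (names and surrounding context are illustrative assumptions):

/* static, so the fields the kerneldoc wants zeroed start out zero */
static struct vm_struct early_vm;

void __init reserve_early_vm_area(unsigned long size)
{
        early_vm.flags = VM_ALLOC;
        early_vm.size  = size;

        vm_area_register_early(&early_vm, PAGE_SIZE);

        /* early_vm.addr now points into the vmalloc area; actual pages can
         * be installed later with map_kernel_range_noflush() (next hunk),
         * followed by flush_cache_vmap() on the mapped range */
}
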
++ */ ++int map_kernel_range_noflush(unsigned long addr, unsigned long size, ++ pgprot_t prot, struct page **pages) ++{ ++ return vmap_page_range_noflush(addr, addr + size, prot, pages); ++} ++ ++/** ++ * unmap_kernel_range_noflush - unmap kernel VM area ++ * @addr: start of the VM area to unmap ++ * @size: size of the VM area to unmap ++ * ++ * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size ++ * specify should have been allocated using get_vm_area() and its ++ * friends. ++ * ++ * NOTE: ++ * This function does NOT do any cache flushing. The caller is ++ * responsible for calling flush_cache_vunmap() on to-be-mapped areas ++ * before calling this function and flush_tlb_kernel_range() after. ++ */ ++void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) ++{ ++ vunmap_page_range(addr, addr + size); ++} ++ ++/** ++ * unmap_kernel_range - unmap kernel VM area and flush cache and TLB ++ * @addr: start of the VM area to unmap ++ * @size: size of the VM area to unmap ++ * ++ * Similar to unmap_kernel_range_noflush() but flushes vcache before ++ * the unmapping and tlb after. ++ */ + void unmap_kernel_range(unsigned long addr, unsigned long size) + { + unsigned long end = addr + size; +@@ -1267,6 +1355,7 @@ EXPORT_SYMBOL(vfree); + void vunmap(const void *addr) + { + BUG_ON(in_interrupt()); ++ might_sleep(); + __vunmap(addr, 0); + } + EXPORT_SYMBOL(vunmap); +@@ -1286,6 +1375,8 @@ void *vmap(struct page **pages, unsigned + { + struct vm_struct *area; + ++ might_sleep(); ++ + if (count > num_physpages) + return NULL; + +Index: linux-2.6-tip/mm/vmscan.c +=================================================================== +--- linux-2.6-tip.orig/mm/vmscan.c ++++ linux-2.6-tip/mm/vmscan.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include /* for try_to_release_page(), + buffer_heads_over_limit */ + #include +@@ -1125,7 +1126,7 @@ static unsigned long shrink_inactive_lis + } + + nr_reclaimed += nr_freed; +- local_irq_disable(); ++ local_irq_disable_nort(); + if (current_is_kswapd()) { + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + __count_vm_events(KSWAPD_STEAL, nr_freed); +@@ -1166,9 +1167,14 @@ static unsigned long shrink_inactive_lis + } + } + } while (nr_scanned < max_scan); ++ /* ++ * Non-PREEMPT_RT relies on IRQs-off protecting the page_states ++ * per-CPU data. PREEMPT_RT has that data protected even in ++ * __mod_page_state(), so no need to keep IRQs disabled. 
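
Editorial sketch (not part of the patch): the _nort ("no RT") variants used in shrink_inactive_list() in the vmscan.c hunk below keep an IRQ-disable that is only needed by the !PREEMPT_RT locking model: they compile to the ordinary operation on non-RT kernels and to a no-op on PREEMPT_RT, where the per-CPU statistics are protected by other means, as the accompanying comment explains. Their assumed shape, defined elsewhere in the -rt series and shown here only for orientation:

#ifdef CONFIG_PREEMPT_RT
# define local_irq_disable_nort()       do { } while (0)
# define local_irq_enable_nort()        do { } while (0)
#else
# define local_irq_disable_nort()       local_irq_disable()
# define local_irq_enable_nort()        local_irq_enable()
#endif
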
++ */ + spin_unlock(&zone->lru_lock); + done: +- local_irq_enable(); ++ local_irq_enable_nort(); + pagevec_release(&pvec); + return nr_reclaimed; + } +@@ -1963,7 +1969,9 @@ static int kswapd(void *p) + struct reclaim_state reclaim_state = { + .reclaimed_slab = 0, + }; +- node_to_cpumask_ptr(cpumask, pgdat->node_id); ++ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); ++ ++ lockdep_set_current_reclaim_state(GFP_KERNEL); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); +@@ -2198,7 +2206,9 @@ static int __devinit cpu_callback(struct + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_HIGH_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); +- node_to_cpumask_ptr(mask, pgdat->node_id); ++ const struct cpumask *mask; ++ ++ mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ +Index: linux-2.6-tip/net/9p/Kconfig +=================================================================== +--- linux-2.6-tip.orig/net/9p/Kconfig ++++ linux-2.6-tip/net/9p/Kconfig +@@ -4,6 +4,8 @@ + + menuconfig NET_9P + depends on NET && EXPERIMENTAL ++ # build breakage ++ depends on 0 + tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" + help + If you say Y here, you will get experimental support for +Index: linux-2.6-tip/net/core/skbuff.c +=================================================================== +--- linux-2.6-tip.orig/net/core/skbuff.c ++++ linux-2.6-tip/net/core/skbuff.c +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -197,6 +198,8 @@ struct sk_buff *__alloc_skb(unsigned int + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; ++ kmemcheck_annotate_bitfield(skb->flags1); ++ kmemcheck_annotate_bitfield(skb->flags2); + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); +@@ -211,6 +214,8 @@ struct sk_buff *__alloc_skb(unsigned int + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); + ++ kmemcheck_annotate_bitfield(child->flags1); ++ kmemcheck_annotate_bitfield(child->flags2); + skb->fclone = SKB_FCLONE_ORIG; + atomic_set(fclone_ref, 1); + +@@ -240,7 +245,7 @@ nodata: + struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) + { +- int node = dev->dev.parent ? 
dev_to_node(dev->dev.parent) : -1; ++ int node = dev_to_node(&dev->dev); + struct sk_buff *skb; + + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node); +@@ -378,7 +383,7 @@ static void skb_release_head_state(struc + secpath_put(skb->sp); + #endif + if (skb->destructor) { +- WARN_ON(in_irq()); ++// WARN_ON(in_irq()); + skb->destructor(skb); + } + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +@@ -600,6 +605,9 @@ struct sk_buff *skb_clone(struct sk_buff + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; ++ ++ kmemcheck_annotate_bitfield(n->flags1); ++ kmemcheck_annotate_bitfield(n->flags2); + n->fclone = SKB_FCLONE_UNAVAILABLE; + } + +Index: linux-2.6-tip/net/core/sock.c +=================================================================== +--- linux-2.6-tip.orig/net/core/sock.c ++++ linux-2.6-tip/net/core/sock.c +@@ -894,6 +894,8 @@ static struct sock *sk_prot_alloc(struct + sk = kmalloc(prot->obj_size, priority); + + if (sk != NULL) { ++ kmemcheck_annotate_bitfield(sk->flags); ++ + if (security_sk_alloc(sk, family, priority)) + goto out_free; + +@@ -1947,8 +1949,9 @@ static DECLARE_BITMAP(proto_inuse_idx, P + #ifdef CONFIG_NET_NS + void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) + { +- int cpu = smp_processor_id(); ++ int cpu = get_cpu(); + per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val; ++ put_cpu(); + } + EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +@@ -1994,7 +1997,9 @@ static DEFINE_PER_CPU(struct prot_inuse, + + void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) + { +- __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val; ++ int cpu = get_cpu(); ++ per_cpu(prot_inuse, cpu).val[prot->inuse_idx] += val; ++ put_cpu(); + } + EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +Index: linux-2.6-tip/net/ipv4/af_inet.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv4/af_inet.c ++++ linux-2.6-tip/net/ipv4/af_inet.c +@@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field); + int snmp_mib_init(void *ptr[2], size_t mibsize) + { + BUG_ON(ptr == NULL); +- ptr[0] = __alloc_percpu(mibsize); ++ ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); + if (!ptr[0]) + goto err0; +- ptr[1] = __alloc_percpu(mibsize); ++ ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); + if (!ptr[1]) + goto err1; + return 0; +Index: linux-2.6-tip/net/ipv4/inet_timewait_sock.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv4/inet_timewait_sock.c ++++ linux-2.6-tip/net/ipv4/inet_timewait_sock.c +@@ -9,6 +9,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -117,6 +118,8 @@ struct inet_timewait_sock *inet_twsk_all + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + ++ kmemcheck_annotate_bitfield(tw->flags); ++ + /* Give us an identity. 
*/ + tw->tw_daddr = inet->daddr; + tw->tw_rcv_saddr = inet->rcv_saddr; +Index: linux-2.6-tip/net/ipv4/route.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv4/route.c ++++ linux-2.6-tip/net/ipv4/route.c +@@ -204,13 +204,13 @@ struct rt_hash_bucket { + }; + + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ +- defined(CONFIG_PROVE_LOCKING) ++ defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT) + /* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks + * The size of this table is a power of two and depends on the number of CPUS. + * (on lockdep we have a quite big spinlock_t, so keep the size down there) + */ +-#ifdef CONFIG_LOCKDEP ++#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) + # define RT_HASH_LOCK_SZ 256 + #else + # if NR_CPUS >= 32 +@@ -242,7 +242,7 @@ static __init void rt_hash_lock_init(voi + spin_lock_init(&rt_hash_locks[i]); + } + #else +-# define rt_hash_lock_addr(slot) NULL ++# define rt_hash_lock_addr(slot) ((spinlock_t *)NULL) + + static inline void rt_hash_lock_init(void) + { +@@ -3356,7 +3356,7 @@ int __init ip_rt_init(void) + int rc = 0; + + #ifdef CONFIG_NET_CLS_ROUTE +- ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); ++ ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); + if (!ip_rt_acct) + panic("IP: failed to allocate ip_rt_acct\n"); + #endif +Index: linux-2.6-tip/net/netfilter/ipvs/ip_vs_ctl.c +=================================================================== +--- linux-2.6-tip.orig/net/netfilter/ipvs/ip_vs_ctl.c ++++ linux-2.6-tip/net/netfilter/ipvs/ip_vs_ctl.c +@@ -2315,6 +2315,7 @@ __ip_vs_get_dest_entries(const struct ip + static inline void + __ip_vs_get_timeouts(struct ip_vs_timeout_user *u) + { ++ memset(u, 0, sizeof(*u)); + #ifdef CONFIG_IP_VS_PROTO_TCP + u->tcp_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; +Index: linux-2.6-tip/net/netfilter/nf_conntrack_ftp.c +=================================================================== +--- linux-2.6-tip.orig/net/netfilter/nf_conntrack_ftp.c ++++ linux-2.6-tip/net/netfilter/nf_conntrack_ftp.c +@@ -588,3 +588,4 @@ static int __init nf_conntrack_ftp_init( + + module_init(nf_conntrack_ftp_init); + module_exit(nf_conntrack_ftp_fini); ++ +Index: linux-2.6-tip/net/netfilter/nf_conntrack_proto_sctp.c +=================================================================== +--- linux-2.6-tip.orig/net/netfilter/nf_conntrack_proto_sctp.c ++++ linux-2.6-tip/net/netfilter/nf_conntrack_proto_sctp.c +@@ -373,6 +373,9 @@ static int sctp_packet(struct nf_conn *c + } + write_unlock_bh(&sctp_lock); + ++ if (new_state == SCTP_CONNTRACK_MAX) ++ goto out; ++ + nf_ct_refresh_acct(ct, ctinfo, skb, sctp_timeouts[new_state]); + + if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && +Index: linux-2.6-tip/net/packet/af_packet.c +=================================================================== +--- linux-2.6-tip.orig/net/packet/af_packet.c ++++ linux-2.6-tip/net/packet/af_packet.c +@@ -711,7 +711,7 @@ static int tpacket_rcv(struct sk_buff *s + hdrlen = sizeof(*h.h2); + break; + default: +- BUG(); ++ panic("AF_PACKET: bad tp->version"); + } + + sll = h.raw + TPACKET_ALIGN(hdrlen); +Index: linux-2.6-tip/net/rfkill/rfkill.c +=================================================================== +--- linux-2.6-tip.orig/net/rfkill/rfkill.c ++++ linux-2.6-tip/net/rfkill/rfkill.c +@@ -387,6 +387,7 @@ static const char *rfkill_get_type_str(e + return 
"wwan"; + default: + BUG(); ++ return NULL; + } + } + +Index: linux-2.6-tip/net/sunrpc/svc.c +=================================================================== +--- linux-2.6-tip.orig/net/sunrpc/svc.c ++++ linux-2.6-tip/net/sunrpc/svc.c +@@ -317,8 +317,7 @@ svc_pool_map_set_cpumask(struct task_str + } + case SVC_POOL_PERNODE: + { +- node_to_cpumask_ptr(nodecpumask, node); +- set_cpus_allowed_ptr(task, nodecpumask); ++ set_cpus_allowed_ptr(task, cpumask_of_node(node)); + break; + } + } +Index: linux-2.6-tip/net/sunrpc/svcauth_unix.c +=================================================================== +--- linux-2.6-tip.orig/net/sunrpc/svcauth_unix.c ++++ linux-2.6-tip/net/sunrpc/svcauth_unix.c +@@ -682,7 +682,7 @@ svcauth_unix_set_client(struct svc_rqst + sin6 = svc_addr_in6(rqstp); + break; + default: +- BUG(); ++ panic("svcauth_unix_set_client: bad address family!"); + } + + rqstp->rq_client = NULL; +@@ -863,3 +863,4 @@ struct auth_ops svcauth_unix = { + .set_client = svcauth_unix_set_client, + }; + ++ +Index: linux-2.6-tip/samples/tracepoints/tp-samples-trace.h +=================================================================== +--- linux-2.6-tip.orig/samples/tracepoints/tp-samples-trace.h ++++ linux-2.6-tip/samples/tracepoints/tp-samples-trace.h +@@ -5,9 +5,9 @@ + #include + + DECLARE_TRACE(subsys_event, +- TPPROTO(struct inode *inode, struct file *file), +- TPARGS(inode, file)); ++ TP_PROTO(struct inode *inode, struct file *file), ++ TP_ARGS(inode, file)); + DECLARE_TRACE(subsys_eventb, +- TPPROTO(void), +- TPARGS()); ++ TP_PROTO(void), ++ TP_ARGS()); + #endif +Index: linux-2.6-tip/samples/tracepoints/tracepoint-sample.c +=================================================================== +--- linux-2.6-tip.orig/samples/tracepoints/tracepoint-sample.c ++++ linux-2.6-tip/samples/tracepoints/tracepoint-sample.c +@@ -1,6 +1,6 @@ + /* tracepoint-sample.c + * +- * Executes a tracepoint when /proc/tracepoint-example is opened. ++ * Executes a tracepoint when /proc/tracepoint-sample is opened. 
+ * + * (C) Copyright 2007 Mathieu Desnoyers + * +@@ -16,7 +16,7 @@ + DEFINE_TRACE(subsys_event); + DEFINE_TRACE(subsys_eventb); + +-struct proc_dir_entry *pentry_example; ++struct proc_dir_entry *pentry_sample; + + static int my_open(struct inode *inode, struct file *file) + { +@@ -32,25 +32,25 @@ static struct file_operations mark_ops = + .open = my_open, + }; + +-static int __init example_init(void) ++static int __init sample_init(void) + { +- printk(KERN_ALERT "example init\n"); +- pentry_example = proc_create("tracepoint-example", 0444, NULL, ++ printk(KERN_ALERT "sample init\n"); ++ pentry_sample = proc_create("tracepoint-sample", 0444, NULL, + &mark_ops); +- if (!pentry_example) ++ if (!pentry_sample) + return -EPERM; + return 0; + } + +-static void __exit example_exit(void) ++static void __exit sample_exit(void) + { +- printk(KERN_ALERT "example exit\n"); +- remove_proc_entry("tracepoint-example", NULL); ++ printk(KERN_ALERT "sample exit\n"); ++ remove_proc_entry("tracepoint-sample", NULL); + } + +-module_init(example_init) +-module_exit(example_exit) ++module_init(sample_init) ++module_exit(sample_exit) + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Mathieu Desnoyers"); +-MODULE_DESCRIPTION("Tracepoint example"); ++MODULE_DESCRIPTION("Tracepoint sample"); +Index: linux-2.6-tip/scripts/Makefile.build +=================================================================== +--- linux-2.6-tip.orig/scripts/Makefile.build ++++ linux-2.6-tip/scripts/Makefile.build +@@ -112,13 +112,13 @@ endif + # --------------------------------------------------------------------------- + + # Default is built-in, unless we know otherwise +-modkern_cflags := $(CFLAGS_KERNEL) ++modkern_cflags = $(if $(part-of-module), $(CFLAGS_MODULE), $(CFLAGS_KERNEL)) + quiet_modtag := $(empty) $(empty) + +-$(real-objs-m) : modkern_cflags := $(CFLAGS_MODULE) +-$(real-objs-m:.o=.i) : modkern_cflags := $(CFLAGS_MODULE) +-$(real-objs-m:.o=.s) : modkern_cflags := $(CFLAGS_MODULE) +-$(real-objs-m:.o=.lst): modkern_cflags := $(CFLAGS_MODULE) ++$(real-objs-m) : part-of-module := y ++$(real-objs-m:.o=.i) : part-of-module := y ++$(real-objs-m:.o=.s) : part-of-module := y ++$(real-objs-m:.o=.lst): part-of-module := y + + $(real-objs-m) : quiet_modtag := [M] + $(real-objs-m:.o=.i) : quiet_modtag := [M] +@@ -205,7 +205,8 @@ endif + ifdef CONFIG_FTRACE_MCOUNT_RECORD + cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ + "$(if $(CONFIG_64BIT),64,32)" \ +- "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)"; ++ "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" \ ++ "$(if $(part-of-module),1,0)" "$(@)"; + endif + + define rule_cc_o_c +Index: linux-2.6-tip/scripts/Makefile.lib +=================================================================== +--- linux-2.6-tip.orig/scripts/Makefile.lib ++++ linux-2.6-tip/scripts/Makefile.lib +@@ -186,3 +186,17 @@ quiet_cmd_gzip = GZIP $@ + cmd_gzip = gzip -f -9 < $< > $@ + + ++# Bzip2 ++# --------------------------------------------------------------------------- ++ ++# Bzip2 does not include size in file... 
so we have to fake that ++size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size ++ ++quiet_cmd_bzip2 = BZIP2 $@ ++cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false) ++ ++# Lzma ++# --------------------------------------------------------------------------- ++ ++quiet_cmd_lzma = LZMA $@ ++cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false) +Index: linux-2.6-tip/scripts/bin_size +=================================================================== +--- /dev/null ++++ linux-2.6-tip/scripts/bin_size +@@ -0,0 +1,10 @@ ++#!/bin/sh ++ ++if [ $# = 0 ] ; then ++ echo Usage: $0 file ++fi ++ ++size_dec=`stat -c "%s" $1` ++size_hex_echo_string=`printf "%08x" $size_dec | ++ sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'` ++/bin/echo -ne $size_hex_echo_string +Index: linux-2.6-tip/scripts/gcc-x86_32-has-stack-protector.sh +=================================================================== +--- /dev/null ++++ linux-2.6-tip/scripts/gcc-x86_32-has-stack-protector.sh +@@ -0,0 +1,8 @@ ++#!/bin/sh ++ ++echo "int foo(void) { char X[200]; return 3; }" | $* -S -xc -c -O0 -fstack-protector - -o - 2> /dev/null | grep -q "%gs" ++if [ "$?" -eq "0" ] ; then ++ echo y ++else ++ echo n ++fi +Index: linux-2.6-tip/scripts/gcc-x86_64-has-stack-protector.sh +=================================================================== +--- linux-2.6-tip.orig/scripts/gcc-x86_64-has-stack-protector.sh ++++ linux-2.6-tip/scripts/gcc-x86_64-has-stack-protector.sh +@@ -1,6 +1,8 @@ + #!/bin/sh + +-echo "int foo(void) { char X[200]; return 3; }" | $1 -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs" ++echo "int foo(void) { char X[200]; return 3; }" | $* -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs" + if [ "$?" -eq "0" ] ; then +- echo $2 ++ echo y ++else ++ echo n + fi +Index: linux-2.6-tip/scripts/gen_initramfs_list.sh +=================================================================== +--- linux-2.6-tip.orig/scripts/gen_initramfs_list.sh ++++ linux-2.6-tip/scripts/gen_initramfs_list.sh +@@ -5,7 +5,7 @@ + # Released under the terms of the GNU GPL + # + # Generate a cpio packed initramfs. It uses gen_init_cpio to generate +-# the cpio archive, and gzip to pack it. ++# the cpio archive, and then compresses it. + # The script may also be used to generate the inputfile used for gen_init_cpio + # This script assumes that gen_init_cpio is located in usr/ directory + +@@ -16,8 +16,8 @@ usage() { + cat << EOF + Usage: + $0 [-o ] [-u ] [-g ] {-d | } ... +- -o Create gzipped initramfs file named using +- gen_init_cpio and gzip ++ -o Create compressed initramfs file named using ++ gen_init_cpio and compressor depending on the extension + -u User ID to map to user ID 0 (root). + is only meaningful if is a + directory. "squash" forces all files to uid 0. 
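
The bzip2 and lzma rules added to scripts/Makefile.lib above pipe the compressed stream through scripts/bin_size, which appends the size of the uncompressed cpio archive as four little-endian bytes; as the Makefile comment notes, those formats (unlike gzip) do not carry the original length, so the build fakes it with this footer. Below is a minimal user-space sketch of what the footer encodes; the reader program, its default file name and the assumption that the footer is simply the last four bytes of the image are illustrative only and not part of the patch.

/* Sketch: decode the 4-byte little-endian size footer that scripts/bin_size
 * appends to a bzip2/lzma-compressed initramfs image (illustrative only). */
#include <stdio.h>
#include <stdint.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "usr/initramfs_data.cpio.bz2";
	unsigned char footer[4];
	uint32_t size;
	FILE *f = fopen(path, "rb");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fseek(f, -4L, SEEK_END) != 0 || fread(footer, 1, 4, f) != 4) {
		perror(path);
		fclose(f);
		return 1;
	}
	fclose(f);

	/* bin_size emits the least significant byte first */
	size = (uint32_t)footer[0] | ((uint32_t)footer[1] << 8) |
	       ((uint32_t)footer[2] << 16) | ((uint32_t)footer[3] << 24);
	printf("uncompressed cpio size: %u bytes\n", (unsigned int)size);
	return 0;
}
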
+@@ -225,6 +225,7 @@ cpio_list= + output="/dev/stdout" + output_file="" + is_cpio_compressed= ++compr="gzip -9 -f" + + arg="$1" + case "$arg" in +@@ -233,11 +234,15 @@ case "$arg" in + echo "deps_initramfs := \\" + shift + ;; +- "-o") # generate gzipped cpio image named $1 ++ "-o") # generate compressed cpio image named $1 + shift + output_file="$1" + cpio_list="$(mktemp ${TMPDIR:-/tmp}/cpiolist.XXXXXX)" + output=${cpio_list} ++ echo "$output_file" | grep -q "\.gz$" && compr="gzip -9 -f" ++ echo "$output_file" | grep -q "\.bz2$" && compr="bzip2 -9 -f" ++ echo "$output_file" | grep -q "\.lzma$" && compr="lzma -9 -f" ++ echo "$output_file" | grep -q "\.cpio$" && compr="cat" + shift + ;; + esac +@@ -274,7 +279,7 @@ while [ $# -gt 0 ]; do + esac + done + +-# If output_file is set we will generate cpio archive and gzip it ++# If output_file is set we will generate cpio archive and compress it + # we are carefull to delete tmp files + if [ ! -z ${output_file} ]; then + if [ -z ${cpio_file} ]; then +@@ -287,7 +292,8 @@ if [ ! -z ${output_file} ]; then + if [ "${is_cpio_compressed}" = "compressed" ]; then + cat ${cpio_tfile} > ${output_file} + else +- cat ${cpio_tfile} | gzip -f -9 - > ${output_file} ++ (cat ${cpio_tfile} | ${compr} - > ${output_file}) \ ++ || (rm -f ${output_file} ; false) + fi + [ -z ${cpio_file} ] && rm ${cpio_tfile} + fi +Index: linux-2.6-tip/scripts/headers_check.pl +=================================================================== +--- linux-2.6-tip.orig/scripts/headers_check.pl ++++ linux-2.6-tip/scripts/headers_check.pl +@@ -38,7 +38,7 @@ foreach my $file (@files) { + &check_asm_types(); + &check_sizetypes(); + &check_prototypes(); +- &check_config(); ++ # Dropped for now. Too much noise &check_config(); + } + close FH; + } +Index: linux-2.6-tip/scripts/kallsyms.c +=================================================================== +--- linux-2.6-tip.orig/scripts/kallsyms.c ++++ linux-2.6-tip/scripts/kallsyms.c +@@ -500,6 +500,51 @@ static void optimize_token_table(void) + optimize_result(); + } + ++/* guess for "linker script provide" symbol */ ++static int may_be_linker_script_provide_symbol(const struct sym_entry *se) ++{ ++ const char *symbol = (char *)se->sym + 1; ++ int len = se->len - 1; ++ ++ if (len < 8) ++ return 0; ++ ++ if (symbol[0] != '_' || symbol[1] != '_') ++ return 0; ++ ++ /* __start_XXXXX */ ++ if (!memcmp(symbol + 2, "start_", 6)) ++ return 1; ++ ++ /* __stop_XXXXX */ ++ if (!memcmp(symbol + 2, "stop_", 5)) ++ return 1; ++ ++ /* __end_XXXXX */ ++ if (!memcmp(symbol + 2, "end_", 4)) ++ return 1; ++ ++ /* __XXXXX_start */ ++ if (!memcmp(symbol + len - 6, "_start", 6)) ++ return 1; ++ ++ /* __XXXXX_end */ ++ if (!memcmp(symbol + len - 4, "_end", 4)) ++ return 1; ++ ++ return 0; ++} ++ ++static int prefix_underscores_count(const char *str) ++{ ++ const char *tail = str; ++ ++ while (*tail != '_') ++ tail++; ++ ++ return tail - str; ++} ++ + static int compare_symbols(const void *a, const void *b) + { + const struct sym_entry *sa; +@@ -521,6 +566,18 @@ static int compare_symbols(const void *a + if (wa != wb) + return wa - wb; + ++ /* sort by "linker script provide" type */ ++ wa = may_be_linker_script_provide_symbol(sa); ++ wb = may_be_linker_script_provide_symbol(sb); ++ if (wa != wb) ++ return wa - wb; ++ ++ /* sort by the number of prefix underscores */ ++ wa = prefix_underscores_count((const char *)sa->sym + 1); ++ wb = prefix_underscores_count((const char *)sb->sym + 1); ++ if (wa != wb) ++ return wa - wb; ++ + /* sort by initial order, so that other 
symbols are left undisturbed */ + return sa->start_pos - sb->start_pos; + } +Index: linux-2.6-tip/scripts/mod/modpost.c +=================================================================== +--- linux-2.6-tip.orig/scripts/mod/modpost.c ++++ linux-2.6-tip/scripts/mod/modpost.c +@@ -415,8 +415,9 @@ static int parse_elf(struct elf_info *in + const char *secstrings + = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + const char *secname; ++ int nobits = sechdrs[i].sh_type == SHT_NOBITS; + +- if (sechdrs[i].sh_offset > info->size) { ++ if (!nobits && sechdrs[i].sh_offset > info->size) { + fatal("%s is truncated. sechdrs[i].sh_offset=%lu > " + "sizeof(*hrd)=%zu\n", filename, + (unsigned long)sechdrs[i].sh_offset, +@@ -425,6 +426,8 @@ static int parse_elf(struct elf_info *in + } + secname = secstrings + sechdrs[i].sh_name; + if (strcmp(secname, ".modinfo") == 0) { ++ if (nobits) ++ fatal("%s has NOBITS .modinfo\n", filename); + info->modinfo = (void *)hdr + sechdrs[i].sh_offset; + info->modinfo_len = sechdrs[i].sh_size; + } else if (strcmp(secname, "__ksymtab") == 0) +Index: linux-2.6-tip/scripts/recordmcount.pl +=================================================================== +--- linux-2.6-tip.orig/scripts/recordmcount.pl ++++ linux-2.6-tip/scripts/recordmcount.pl +@@ -100,14 +100,19 @@ $P =~ s@.*/@@g; + + my $V = '0.1'; + +-if ($#ARGV < 6) { +- print "usage: $P arch objdump objcopy cc ld nm rm mv inputfile\n"; ++if ($#ARGV < 7) { ++ print "usage: $P arch bits objdump objcopy cc ld nm rm mv is_module inputfile\n"; + print "version: $V\n"; + exit(1); + } + + my ($arch, $bits, $objdump, $objcopy, $cc, +- $ld, $nm, $rm, $mv, $inputfile) = @ARGV; ++ $ld, $nm, $rm, $mv, $is_module, $inputfile) = @ARGV; ++ ++# This file refers to mcount and shouldn't be ftraced, so lets' ignore it ++if ($inputfile eq "kernel/trace/ftrace.o") { ++ exit(0); ++} + + # Acceptable sections to record. + my %text_sections = ( +@@ -201,6 +206,13 @@ if ($arch eq "x86_64") { + $alignment = 2; + $section_type = '%progbits'; + ++} elsif ($arch eq "ia64") { ++ $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; ++ $type = "data8"; ++ ++ if ($is_module eq "0") { ++ $cc .= " -mconstant-gp"; ++ } + } else { + die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; + } +@@ -263,7 +275,6 @@ if (!$found_version) { + "\tDisabling local function references.\n"; + } + +- + # + # Step 1: find all the local (static functions) and weak symbols. + # 't' is local, 'w/W' is weak (we never use a weak function) +@@ -331,13 +342,16 @@ sub update_funcs + # + # Step 2: find the sections and mcount call sites + # +-open(IN, "$objdump -dr $inputfile|") || die "error running $objdump"; ++open(IN, "$objdump -hdr $inputfile|") || die "error running $objdump"; + + my $text; + ++my $read_headers = 1; ++ + while () { + # is it a section? + if (/$section_regex/) { ++ $read_headers = 0; + + # Only record text sections that we know are safe + if (defined($text_sections{$1})) { +@@ -371,6 +385,19 @@ while () { + $ref_func = $text; + } + } ++ } elsif ($read_headers && /$mcount_section/) { ++ # ++ # Somehow the make process can execute this script on an ++ # object twice. If it does, we would duplicate the mcount ++ # section and it will cause the function tracer self test ++ # to fail. Check if the mcount section exists, and if it does, ++ # warn and exit. ++ # ++ print STDERR "ERROR: $mcount_section already in $inputfile\n" . ++ "\tThis may be an indication that your build is corrupted.\n" . ++ "\tDelete $inputfile and try again. 
If the same object file\n" . ++ "\tstill causes an issue, then disable CONFIG_DYNAMIC_FTRACE.\n"; ++ exit(-1); + } + + # is this a call site to mcount? If so, record it to print later +Index: linux-2.6-tip/security/capability.c +=================================================================== +--- linux-2.6-tip.orig/security/capability.c ++++ linux-2.6-tip/security/capability.c +@@ -11,6 +11,7 @@ + */ + + #include ++#include + + static int cap_acct(struct file *file) + { +@@ -680,6 +681,9 @@ static int cap_socket_getpeersec_dgram(s + + static int cap_sk_alloc_security(struct sock *sk, int family, gfp_t priority) + { ++#ifdef CONFIG_SECURITY_NETWORK ++ sk->sk_security = NULL; ++#endif + return 0; + } + +Index: linux-2.6-tip/security/keys/keyctl.c +=================================================================== +--- linux-2.6-tip.orig/security/keys/keyctl.c ++++ linux-2.6-tip/security/keys/keyctl.c +@@ -896,7 +896,7 @@ long keyctl_instantiate_key(key_serial_t + { + const struct cred *cred = current_cred(); + struct request_key_auth *rka; +- struct key *instkey, *dest_keyring; ++ struct key *instkey, *uninitialized_var(dest_keyring); + void *payload; + long ret; + bool vm = false; +@@ -974,7 +974,7 @@ long keyctl_negate_key(key_serial_t id, + { + const struct cred *cred = current_cred(); + struct request_key_auth *rka; +- struct key *instkey, *dest_keyring; ++ struct key *instkey, *uninitialized_var(dest_keyring); + long ret; + + kenter("%d,%u,%d", id, timeout, ringid); +Index: linux-2.6-tip/security/selinux/netnode.c +=================================================================== +--- linux-2.6-tip.orig/security/selinux/netnode.c ++++ linux-2.6-tip/security/selinux/netnode.c +@@ -140,6 +140,7 @@ static struct sel_netnode *sel_netnode_f + break; + default: + BUG(); ++ return NULL; + } + + list_for_each_entry_rcu(node, &sel_netnode_hash[idx].list, list) +Index: linux-2.6-tip/sound/drivers/Kconfig +=================================================================== +--- linux-2.6-tip.orig/sound/drivers/Kconfig ++++ linux-2.6-tip/sound/drivers/Kconfig +@@ -33,7 +33,7 @@ if SND_DRIVERS + + config SND_PCSP + tristate "PC-Speaker support (READ HELP!)" +- depends on PCSPKR_PLATFORM && X86_PC && HIGH_RES_TIMERS ++ depends on PCSPKR_PLATFORM && X86 && HIGH_RES_TIMERS + depends on INPUT + depends on EXPERIMENTAL + select SND_PCM +@@ -91,6 +91,8 @@ config SND_VIRMIDI + + config SND_MTPAV + tristate "MOTU MidiTimePiece AV multiport MIDI" ++ # sometimes crashes ++ depends on 0 + select SND_RAWMIDI + help + To use a MOTU MidiTimePiece AV multiport MIDI adapter +Index: linux-2.6-tip/sound/isa/sb/sb8.c +=================================================================== +--- linux-2.6-tip.orig/sound/isa/sb/sb8.c ++++ linux-2.6-tip/sound/isa/sb/sb8.c +@@ -101,7 +101,7 @@ static int __devinit snd_sb8_probe(struc + struct snd_card *card; + struct snd_sb8 *acard; + struct snd_opl3 *opl3; +- int err; ++ int uninitialized_var(err); + + card = snd_card_new(index[dev], id[dev], THIS_MODULE, + sizeof(struct snd_sb8)); +Index: linux-2.6-tip/sound/oss/ad1848.c +=================================================================== +--- linux-2.6-tip.orig/sound/oss/ad1848.c ++++ linux-2.6-tip/sound/oss/ad1848.c +@@ -2879,7 +2879,7 @@ static struct isapnp_device_id id_table[ + {0} + }; + +-MODULE_DEVICE_TABLE(isapnp, id_table); ++MODULE_STATIC_DEVICE_TABLE(isapnp, id_table); + + static struct pnp_dev *activate_dev(char *devname, char *resname, struct pnp_dev *dev) + { +Index: 
linux-2.6-tip/sound/pci/pcxhr/pcxhr.c +=================================================================== +--- linux-2.6-tip.orig/sound/pci/pcxhr/pcxhr.c ++++ linux-2.6-tip/sound/pci/pcxhr/pcxhr.c +@@ -224,7 +224,7 @@ static int pcxhr_pll_freq_register(unsig + static int pcxhr_get_clock_reg(struct pcxhr_mgr *mgr, unsigned int rate, + unsigned int *reg, unsigned int *freq) + { +- unsigned int val, realfreq, pllreg; ++ unsigned int val, realfreq, uninitialized_var(pllreg); + struct pcxhr_rmh rmh; + int err; + +@@ -298,7 +298,9 @@ static int pcxhr_sub_set_clock(struct pc + unsigned int rate, + int *changed) + { +- unsigned int val, realfreq, speed; ++ unsigned int uninitialized_var(val), ++ uninitialized_var(realfreq), ++ speed; + struct pcxhr_rmh rmh; + int err; + +@@ -681,7 +683,7 @@ static void pcxhr_trigger_tasklet(unsign + { + unsigned long flags; + int i, j, err; +- struct pcxhr_pipe *pipe; ++ struct pcxhr_pipe *uninitialized_var(pipe); + struct snd_pcxhr *chip; + struct pcxhr_mgr *mgr = (struct pcxhr_mgr*)(arg); + int capture_mask = 0; +Index: linux-2.6-tip/sound/pci/pcxhr/pcxhr_mixer.c +=================================================================== +--- linux-2.6-tip.orig/sound/pci/pcxhr/pcxhr_mixer.c ++++ linux-2.6-tip/sound/pci/pcxhr/pcxhr_mixer.c +@@ -936,7 +936,7 @@ static int pcxhr_iec958_get(struct snd_k + struct snd_ctl_elem_value *ucontrol) + { + struct snd_pcxhr *chip = snd_kcontrol_chip(kcontrol); +- unsigned char aes_bits; ++ unsigned char uninitialized_var(aes_bits); + int i, err; + + mutex_lock(&chip->mgr->mixer_mutex); +@@ -1264,3 +1264,4 @@ int pcxhr_create_mixer(struct pcxhr_mgr + + return 0; + } ++ +Index: linux-2.6-tip/sound/pci/via82xx.c +=================================================================== +--- linux-2.6-tip.orig/sound/pci/via82xx.c ++++ linux-2.6-tip/sound/pci/via82xx.c +@@ -2428,7 +2428,7 @@ static int __devinit snd_via82xx_probe(s + const struct pci_device_id *pci_id) + { + struct snd_card *card; +- struct via82xx *chip; ++ struct via82xx *uninitialized_var(chip); + int chip_type = 0, card_type; + unsigned int i; + int err; +Index: linux-2.6-tip/sound/pci/via82xx_modem.c +=================================================================== +--- linux-2.6-tip.orig/sound/pci/via82xx_modem.c ++++ linux-2.6-tip/sound/pci/via82xx_modem.c +@@ -1162,7 +1162,7 @@ static int __devinit snd_via82xx_probe(s + const struct pci_device_id *pci_id) + { + struct snd_card *card; +- struct via82xx_modem *chip; ++ struct via82xx_modem *uninitialized_var(chip); + int chip_type = 0, card_type; + unsigned int i; + int err; +Index: linux-2.6-tip/sound/pci/vx222/vx222.c +=================================================================== +--- linux-2.6-tip.orig/sound/pci/vx222/vx222.c ++++ linux-2.6-tip/sound/pci/vx222/vx222.c +@@ -194,7 +194,7 @@ static int __devinit snd_vx222_probe(str + static int dev; + struct snd_card *card; + struct snd_vx_hardware *hw; +- struct snd_vx222 *vx; ++ struct snd_vx222 *uninitialized_var(vx); + int err; + + if (dev >= SNDRV_CARDS) +Index: linux-2.6-tip/usr/Kconfig +=================================================================== +--- linux-2.6-tip.orig/usr/Kconfig ++++ linux-2.6-tip/usr/Kconfig +@@ -44,3 +44,92 @@ config INITRAMFS_ROOT_GID + owned by group root in the initial ramdisk image. + + If you are not sure, leave it set to "0". 
++ ++config RD_GZIP ++ bool "Initial ramdisk compressed using gzip" ++ default y ++ depends on BLK_DEV_INITRD=y ++ select DECOMPRESS_GZIP ++ help ++ Support loading of a gzip encoded initial ramdisk or cpio buffer. ++ If unsure, say Y. ++ ++config RD_BZIP2 ++ bool "Initial ramdisk compressed using bzip2" ++ default n ++ depends on BLK_DEV_INITRD=y ++ select DECOMPRESS_BZIP2 ++ help ++ Support loading of a bzip2 encoded initial ramdisk or cpio buffer ++ If unsure, say N. ++ ++config RD_LZMA ++ bool "Initial ramdisk compressed using lzma" ++ default n ++ depends on BLK_DEV_INITRD=y ++ select DECOMPRESS_LZMA ++ help ++ Support loading of a lzma encoded initial ramdisk or cpio buffer ++ If unsure, say N. ++ ++choice ++ prompt "Built-in initramfs compression mode" ++ help ++ This setting is only meaningful if the INITRAMFS_SOURCE is ++ set. It decides by which algorithm the INITRAMFS_SOURCE will ++ be compressed. ++ Several compression algorithms are available, which differ ++ in efficiency, compression and decompression speed. ++ Compression speed is only relevant when building a kernel. ++ Decompression speed is relevant at each boot. ++ ++ If you have any problems with bzip2 or lzma compressed ++ initramfs, mail me (Alain Knaff) . ++ ++ High compression options are mostly useful for users who ++ are low on disk space (embedded systems), but for whom ram ++ size matters less. ++ ++ If in doubt, select 'gzip' ++ ++config INITRAMFS_COMPRESSION_NONE ++ bool "None" ++ help ++ Do not compress the built-in initramfs at all. This may ++ sound wasteful in space, but, you should be aware that the ++ built-in initramfs will be compressed at a later stage ++ anyways along with the rest of the kernel, on those ++ architectures that support this. ++ However, not compressing the initramfs may lead to slightly ++ higher memory consumption during a short time at boot, while ++ both the cpio image and the unpacked filesystem image will ++ be present in memory simultaneously ++ ++config INITRAMFS_COMPRESSION_GZIP ++ bool "Gzip" ++ depends on RD_GZIP ++ help ++ The old and tried gzip compression. Its compression ratio is ++ the poorest among the 3 choices; however its speed (both ++ compression and decompression) is the fastest. ++ ++config INITRAMFS_COMPRESSION_BZIP2 ++ bool "Bzip2" ++ depends on RD_BZIP2 ++ help ++ Its compression ratio and speed is intermediate. ++ Decompression speed is slowest among the three. The initramfs ++ size is about 10% smaller with bzip2, in comparison to gzip. ++ Bzip2 uses a large amount of memory. For modern kernels you ++ will need at least 8MB RAM or more for booting. ++ ++config INITRAMFS_COMPRESSION_LZMA ++ bool "LZMA" ++ depends on RD_LZMA ++ help ++ The most recent compression algorithm. ++ Its ratio is best, decompression speed is between the other ++ two. Compression is slowest. The initramfs size is about 33% ++ smaller with LZMA in comparison to gzip. 
++ ++endchoice +Index: linux-2.6-tip/usr/Makefile +=================================================================== +--- linux-2.6-tip.orig/usr/Makefile ++++ linux-2.6-tip/usr/Makefile +@@ -6,13 +6,25 @@ klibcdirs:; + PHONY += klibcdirs + + ++# No compression ++suffix_$(CONFIG_INITRAMFS_COMPRESSION_NONE) = ++ ++# Gzip, but no bzip2 ++suffix_$(CONFIG_INITRAMFS_COMPRESSION_GZIP) = .gz ++ ++# Bzip2 ++suffix_$(CONFIG_INITRAMFS_COMPRESSION_BZIP2) = .bz2 ++ ++# Lzma ++suffix_$(CONFIG_INITRAMFS_COMPRESSION_LZMA) = .lzma ++ + # Generate builtin.o based on initramfs_data.o +-obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o ++obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data$(suffix_y).o + +-# initramfs_data.o contains the initramfs_data.cpio.gz image. ++# initramfs_data.o contains the compressed initramfs_data.cpio image. + # The image is included using .incbin, a dependency which is not + # tracked automatically. +-$(obj)/initramfs_data.o: $(obj)/initramfs_data.cpio.gz FORCE ++$(obj)/initramfs_data$(suffix_y).o: $(obj)/initramfs_data.cpio$(suffix_y) FORCE + + ##### + # Generate the initramfs cpio archive +@@ -25,28 +37,28 @@ ramfs-args := \ + $(if $(CONFIG_INITRAMFS_ROOT_UID), -u $(CONFIG_INITRAMFS_ROOT_UID)) \ + $(if $(CONFIG_INITRAMFS_ROOT_GID), -g $(CONFIG_INITRAMFS_ROOT_GID)) + +-# .initramfs_data.cpio.gz.d is used to identify all files included ++# .initramfs_data.cpio.d is used to identify all files included + # in initramfs and to detect if any files are added/removed. + # Removed files are identified by directory timestamp being updated + # The dependency list is generated by gen_initramfs.sh -l +-ifneq ($(wildcard $(obj)/.initramfs_data.cpio.gz.d),) +- include $(obj)/.initramfs_data.cpio.gz.d ++ifneq ($(wildcard $(obj)/.initramfs_data.cpio.d),) ++ include $(obj)/.initramfs_data.cpio.d + endif + + quiet_cmd_initfs = GEN $@ + cmd_initfs = $(initramfs) -o $@ $(ramfs-args) $(ramfs-input) + +-targets := initramfs_data.cpio.gz ++targets := initramfs_data.cpio.gz initramfs_data.cpio.bz2 initramfs_data.cpio.lzma initramfs_data.cpio + # do not try to update files included in initramfs + $(deps_initramfs): ; + + $(deps_initramfs): klibcdirs +-# We rebuild initramfs_data.cpio.gz if: +-# 1) Any included file is newer then initramfs_data.cpio.gz ++# We rebuild initramfs_data.cpio if: ++# 1) Any included file is newer then initramfs_data.cpio + # 2) There are changes in which files are included (added or deleted) +-# 3) If gen_init_cpio are newer than initramfs_data.cpio.gz ++# 3) If gen_init_cpio are newer than initramfs_data.cpio + # 4) arguments to gen_initramfs.sh changes +-$(obj)/initramfs_data.cpio.gz: $(obj)/gen_init_cpio $(deps_initramfs) klibcdirs +- $(Q)$(initramfs) -l $(ramfs-input) > $(obj)/.initramfs_data.cpio.gz.d ++$(obj)/initramfs_data.cpio$(suffix_y): $(obj)/gen_init_cpio $(deps_initramfs) klibcdirs ++ $(Q)$(initramfs) -l $(ramfs-input) > $(obj)/.initramfs_data.cpio.d + $(call if_changed,initfs) + +Index: linux-2.6-tip/usr/initramfs_data.S +=================================================================== +--- linux-2.6-tip.orig/usr/initramfs_data.S ++++ linux-2.6-tip/usr/initramfs_data.S +@@ -26,5 +26,5 @@ SECTIONS + */ + + .section .init.ramfs,"a" +-.incbin "usr/initramfs_data.cpio.gz" ++.incbin "usr/initramfs_data.cpio" + +Index: linux-2.6-tip/usr/initramfs_data.bz2.S +=================================================================== +--- /dev/null ++++ linux-2.6-tip/usr/initramfs_data.bz2.S +@@ -0,0 +1,29 @@ ++/* ++ initramfs_data includes the compressed binary that is the ++ 
filesystem used for early user space. ++ Note: Older versions of "as" (prior to binutils 2.11.90.0.23 ++ released on 2001-07-14) dit not support .incbin. ++ If you are forced to use older binutils than that then the ++ following trick can be applied to create the resulting binary: ++ ++ ++ ld -m elf_i386 --format binary --oformat elf32-i386 -r \ ++ -T initramfs_data.scr initramfs_data.cpio.gz -o initramfs_data.o ++ ld -m elf_i386 -r -o built-in.o initramfs_data.o ++ ++ initramfs_data.scr looks like this: ++SECTIONS ++{ ++ .init.ramfs : { *(.data) } ++} ++ ++ The above example is for i386 - the parameters vary from architectures. ++ Eventually look up LDFLAGS_BLOB in an older version of the ++ arch/$(ARCH)/Makefile to see the flags used before .incbin was introduced. ++ ++ Using .incbin has the advantage over ld that the correct flags are set ++ in the ELF header, as required by certain architectures. ++*/ ++ ++.section .init.ramfs,"a" ++.incbin "usr/initramfs_data.cpio.bz2" +Index: linux-2.6-tip/usr/initramfs_data.gz.S +=================================================================== +--- /dev/null ++++ linux-2.6-tip/usr/initramfs_data.gz.S +@@ -0,0 +1,29 @@ ++/* ++ initramfs_data includes the compressed binary that is the ++ filesystem used for early user space. ++ Note: Older versions of "as" (prior to binutils 2.11.90.0.23 ++ released on 2001-07-14) dit not support .incbin. ++ If you are forced to use older binutils than that then the ++ following trick can be applied to create the resulting binary: ++ ++ ++ ld -m elf_i386 --format binary --oformat elf32-i386 -r \ ++ -T initramfs_data.scr initramfs_data.cpio.gz -o initramfs_data.o ++ ld -m elf_i386 -r -o built-in.o initramfs_data.o ++ ++ initramfs_data.scr looks like this: ++SECTIONS ++{ ++ .init.ramfs : { *(.data) } ++} ++ ++ The above example is for i386 - the parameters vary from architectures. ++ Eventually look up LDFLAGS_BLOB in an older version of the ++ arch/$(ARCH)/Makefile to see the flags used before .incbin was introduced. ++ ++ Using .incbin has the advantage over ld that the correct flags are set ++ in the ELF header, as required by certain architectures. ++*/ ++ ++.section .init.ramfs,"a" ++.incbin "usr/initramfs_data.cpio.gz" +Index: linux-2.6-tip/usr/initramfs_data.lzma.S +=================================================================== +--- /dev/null ++++ linux-2.6-tip/usr/initramfs_data.lzma.S +@@ -0,0 +1,29 @@ ++/* ++ initramfs_data includes the compressed binary that is the ++ filesystem used for early user space. ++ Note: Older versions of "as" (prior to binutils 2.11.90.0.23 ++ released on 2001-07-14) dit not support .incbin. ++ If you are forced to use older binutils than that then the ++ following trick can be applied to create the resulting binary: ++ ++ ++ ld -m elf_i386 --format binary --oformat elf32-i386 -r \ ++ -T initramfs_data.scr initramfs_data.cpio.gz -o initramfs_data.o ++ ld -m elf_i386 -r -o built-in.o initramfs_data.o ++ ++ initramfs_data.scr looks like this: ++SECTIONS ++{ ++ .init.ramfs : { *(.data) } ++} ++ ++ The above example is for i386 - the parameters vary from architectures. ++ Eventually look up LDFLAGS_BLOB in an older version of the ++ arch/$(ARCH)/Makefile to see the flags used before .incbin was introduced. ++ ++ Using .incbin has the advantage over ld that the correct flags are set ++ in the ELF header, as required by certain architectures. 
++*/ ++ ++.section .init.ramfs,"a" ++.incbin "usr/initramfs_data.cpio.lzma" +Index: linux-2.6-tip/scripts/Kbuild.include +=================================================================== +--- linux-2.6-tip.orig/scripts/Kbuild.include ++++ linux-2.6-tip/scripts/Kbuild.include +@@ -98,8 +98,9 @@ as-option = $(call try-run,\ + # as-instr + # Usage: cflags-y += $(call as-instr,instr,option1,option2) + +-as-instr = $(call try-run,\ +- echo -e "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" -,$(2),$(3)) ++as-instr = $(call try-run, \ ++ echo -e "$(1)" > "$$TMP"; \ ++ $(CC) $(KBUILD_AFLAGS) -c -xassembler -o /dev/null "$$TMP",$(2),$(3)) + + # cc-option + # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) +Index: linux-2.6-tip/arch/mn10300/Kconfig +=================================================================== +--- linux-2.6-tip.orig/arch/mn10300/Kconfig ++++ linux-2.6-tip/arch/mn10300/Kconfig +@@ -186,6 +186,17 @@ config PREEMPT + Say Y here if you are building a kernel for a desktop, embedded + or real-time system. Say N if you are unsure. + ++config PREEMPT_BKL ++ bool "Preempt The Big Kernel Lock" ++ depends on PREEMPT ++ default y ++ help ++ This option reduces the latency of the kernel by making the ++ big kernel lock preemptible. ++ ++ Say Y here if you are building a kernel for a desktop system. ++ Say N if you are unsure. ++ + config MN10300_CURRENT_IN_E2 + bool "Hold current task address in E2 register" + default y +Index: linux-2.6-tip/kernel/posix-timers.c +=================================================================== +--- linux-2.6-tip.orig/kernel/posix-timers.c ++++ linux-2.6-tip/kernel/posix-timers.c +@@ -420,6 +420,7 @@ static enum hrtimer_restart posix_timer_ + static struct pid *good_sigevent(sigevent_t * event) + { + struct task_struct *rtn = current->group_leader; ++ int sig = event->sigev_signo; + + if ((event->sigev_notify & SIGEV_THREAD_ID ) && + (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || +@@ -428,7 +429,8 @@ static struct pid *good_sigevent(sigeven + return NULL; + + if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && +- ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) ++ (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) || ++ sig_kernel_coredump(sig))) + return NULL; + + return task_pid(rtn); +@@ -787,6 +789,7 @@ retry: + + unlock_timer(timr, flag); + if (error == TIMER_RETRY) { ++ hrtimer_wait_for_timer(&timr->it.real.timer); + rtn = NULL; // We already got the old time... + goto retry; + } +@@ -825,6 +828,7 @@ retry_delete: + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); ++ hrtimer_wait_for_timer(&timer->it.real.timer); + goto retry_delete; + } + +@@ -854,6 +858,7 @@ retry_delete: + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); ++ hrtimer_wait_for_timer(&timer->it.real.timer); + goto retry_delete; + } + list_del(&timer->list); +Index: linux-2.6-tip/include/linux/srcu.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/srcu.h ++++ linux-2.6-tip/include/linux/srcu.h +@@ -27,6 +27,8 @@ + #ifndef _LINUX_SRCU_H + #define _LINUX_SRCU_H + ++#include ++ + struct srcu_struct_array { + int c[2]; + }; +@@ -50,4 +52,24 @@ void srcu_read_unlock(struct srcu_struct + void synchronize_srcu(struct srcu_struct *sp); + long srcu_batches_completed(struct srcu_struct *sp); + ++/* ++ * fully compatible with srcu, but optimized for writers. 
++ */ ++ ++struct qrcu_struct { ++ int completed; ++ atomic_t ctr[2]; ++ wait_queue_head_t wq; ++ struct mutex mutex; ++}; ++ ++int init_qrcu_struct(struct qrcu_struct *qp); ++int qrcu_read_lock(struct qrcu_struct *qp); ++void qrcu_read_unlock(struct qrcu_struct *qp, int idx); ++void synchronize_qrcu(struct qrcu_struct *qp); ++ ++static inline void cleanup_qrcu_struct(struct qrcu_struct *qp) ++{ ++} ++ + #endif +Index: linux-2.6-tip/kernel/srcu.c +=================================================================== +--- linux-2.6-tip.orig/kernel/srcu.c ++++ linux-2.6-tip/kernel/srcu.c +@@ -255,3 +255,89 @@ EXPORT_SYMBOL_GPL(srcu_read_lock); + EXPORT_SYMBOL_GPL(srcu_read_unlock); + EXPORT_SYMBOL_GPL(synchronize_srcu); + EXPORT_SYMBOL_GPL(srcu_batches_completed); ++ ++int init_qrcu_struct(struct qrcu_struct *qp) ++{ ++ qp->completed = 0; ++ atomic_set(qp->ctr + 0, 1); ++ atomic_set(qp->ctr + 1, 0); ++ init_waitqueue_head(&qp->wq); ++ mutex_init(&qp->mutex); ++ ++ return 0; ++} ++ ++int qrcu_read_lock(struct qrcu_struct *qp) ++{ ++ for (;;) { ++ int idx = qp->completed & 0x1; ++ if (likely(atomic_inc_not_zero(qp->ctr + idx))) ++ return idx; ++ } ++} ++ ++void qrcu_read_unlock(struct qrcu_struct *qp, int idx) ++{ ++ if (atomic_dec_and_test(qp->ctr + idx)) ++ wake_up(&qp->wq); ++} ++ ++void synchronize_qrcu(struct qrcu_struct *qp) ++{ ++ int idx; ++ ++ smp_mb(); /* Force preceding change to happen before fastpath check. */ ++ ++ /* ++ * Fastpath: If the two counters sum to "1" at a given point in ++ * time, there are no readers. However, it takes two separate ++ * loads to sample both counters, which won't occur simultaneously. ++ * So we might race with a counter switch, so that we might see ++ * ctr[0]==0, then the counter might switch, then we might see ++ * ctr[1]==1 (unbeknownst to us because there is a reader still ++ * there). So we do a read memory barrier and recheck. If the ++ * same race happens again, there must have been a second counter ++ * switch. This second counter switch could not have happened ++ * until all preceding readers finished, so if the condition ++ * is true both times, we may safely proceed. ++ * ++ * This relies critically on the atomic increment and atomic ++ * decrement being seen as executing in order. ++ */ ++ ++ if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) { ++ smp_rmb(); /* Keep two checks independent. */ ++ if (atomic_read(&qp->ctr[0]) + atomic_read(&qp->ctr[1]) <= 1) ++ goto out; ++ } ++ ++ mutex_lock(&qp->mutex); ++ ++ idx = qp->completed & 0x1; ++ if (atomic_read(qp->ctr + idx) == 1) ++ goto out_unlock; ++ ++ atomic_inc(qp->ctr + (idx ^ 0x1)); ++ ++ /* ++ * Prevent subsequent decrement from being seen before previous ++ * increment -- such an inversion could cause the fastpath ++ * above to falsely conclude that there were no readers. Also, ++ * reduce the likelihood that qrcu_read_lock() will loop. ++ */ ++ ++ smp_mb__after_atomic_inc(); ++ qp->completed++; ++ ++ atomic_dec(qp->ctr + idx); ++ __wait_event(qp->wq, !atomic_read(qp->ctr + idx)); ++out_unlock: ++ mutex_unlock(&qp->mutex); ++out: ++ smp_mb(); /* force subsequent free after qrcu_read_unlock(). 
*/ ++} ++ ++EXPORT_SYMBOL_GPL(init_qrcu_struct); ++EXPORT_SYMBOL_GPL(qrcu_read_lock); ++EXPORT_SYMBOL_GPL(qrcu_read_unlock); ++EXPORT_SYMBOL_GPL(synchronize_qrcu); +Index: linux-2.6-tip/drivers/net/sungem.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/sungem.c ++++ linux-2.6-tip/drivers/net/sungem.c +@@ -1032,12 +1032,8 @@ static int gem_start_xmit(struct sk_buff + (csum_stuff_off << 21)); + } + +- local_irq_save(flags); +- if (!spin_trylock(&gp->tx_lock)) { +- /* Tell upper layer to requeue */ +- local_irq_restore(flags); +- return NETDEV_TX_LOCKED; +- } ++ spin_lock_irqsave(&gp->tx_lock, flags); ++ + /* We raced with gem_do_stop() */ + if (!gp->running) { + spin_unlock_irqrestore(&gp->tx_lock, flags); +Index: linux-2.6-tip/arch/x86/kernel/tsc_sync.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/tsc_sync.c ++++ linux-2.6-tip/arch/x86/kernel/tsc_sync.c +@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count + * we want to have the fastest, inlined, non-debug version + * of a critical section, to be able to prove TSC time-warps: + */ +-static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; ++static __cpuinitdata __raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; + static __cpuinitdata cycles_t last_tsc; + static __cpuinitdata cycles_t max_warp; + static __cpuinitdata int nr_warps; +@@ -103,6 +103,7 @@ static __cpuinit void check_tsc_warp(voi + */ + void __cpuinit check_tsc_sync_source(int cpu) + { ++ unsigned long flags; + int cpus = 2; + + /* +@@ -129,8 +130,11 @@ void __cpuinit check_tsc_sync_source(int + /* + * Wait for the target to arrive: + */ ++ local_save_flags(flags); ++ local_irq_enable(); + while (atomic_read(&start_count) != cpus-1) + cpu_relax(); ++ local_irq_restore(flags); + /* + * Trigger the target to continue into the measurement too: + */ +Index: linux-2.6-tip/drivers/input/keyboard/atkbd.c +=================================================================== +--- linux-2.6-tip.orig/drivers/input/keyboard/atkbd.c ++++ linux-2.6-tip/drivers/input/keyboard/atkbd.c +@@ -1556,8 +1556,23 @@ static struct dmi_system_id atkbd_dmi_qu + { } + }; + ++static int __read_mostly noatkbd; ++ ++static int __init noatkbd_setup(char *str) ++{ ++ noatkbd = 1; ++ printk(KERN_INFO "debug: not setting up AT keyboard.\n"); ++ ++ return 1; ++} ++ ++__setup("noatkbd", noatkbd_setup); ++ + static int __init atkbd_init(void) + { ++ if (noatkbd) ++ return 0; ++ + dmi_check_system(atkbd_dmi_quirk_table); + + return serio_register_driver(&atkbd_drv); +Index: linux-2.6-tip/drivers/input/mouse/psmouse-base.c +=================================================================== +--- linux-2.6-tip.orig/drivers/input/mouse/psmouse-base.c ++++ linux-2.6-tip/drivers/input/mouse/psmouse-base.c +@@ -1645,10 +1645,25 @@ static int psmouse_get_maxproto(char *bu + return sprintf(buffer, "%s\n", psmouse_protocol_by_type(type)->name); + } + ++static int __read_mostly nopsmouse; ++ ++static int __init nopsmouse_setup(char *str) ++{ ++ nopsmouse = 1; ++ printk(KERN_INFO "debug: not setting up psmouse.\n"); ++ ++ return 1; ++} ++ ++__setup("nopsmouse", nopsmouse_setup); ++ + static int __init psmouse_init(void) + { + int err; + ++ if (nopsmouse) ++ return 0; ++ + kpsmoused_wq = create_singlethread_workqueue("kpsmoused"); + if (!kpsmoused_wq) { + printk(KERN_ERR "psmouse: failed to create kpsmoused workqueue\n"); +Index: linux-2.6-tip/kernel/rtmutex-debug.h 
+=================================================================== +--- linux-2.6-tip.orig/kernel/rtmutex-debug.h ++++ linux-2.6-tip/kernel/rtmutex-debug.h +@@ -17,17 +17,17 @@ extern void debug_rt_mutex_free_waiter(s + extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); + extern void debug_rt_mutex_lock(struct rt_mutex *lock); + extern void debug_rt_mutex_unlock(struct rt_mutex *lock); +-extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, +- struct task_struct *powner); ++extern void ++debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); + extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); + extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, + struct rt_mutex *lock); + extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); +-# define debug_rt_mutex_reset_waiter(w) \ ++# define debug_rt_mutex_reset_waiter(w) \ + do { (w)->deadlock_lock = NULL; } while (0) + +-static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, +- int detect) ++static inline int ++debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, int detect) + { +- return (waiter != NULL); ++ return waiter != NULL; + } +Index: linux-2.6-tip/drivers/net/8139too.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/8139too.c ++++ linux-2.6-tip/drivers/net/8139too.c +@@ -2209,7 +2209,11 @@ static irqreturn_t rtl8139_interrupt (in + */ + static void rtl8139_poll_controller(struct net_device *dev) + { +- disable_irq(dev->irq); ++ /* ++ * use _nosync() variant - might be used by netconsole ++ * from atomic contexts: ++ */ ++ disable_irq_nosync(dev->irq); + rtl8139_interrupt(dev->irq, dev); + enable_irq(dev->irq); + } +Index: linux-2.6-tip/drivers/pci/msi.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/msi.c ++++ linux-2.6-tip/drivers/pci/msi.c +@@ -323,6 +323,10 @@ static void __pci_restore_msi_state(stru + return; + + entry = get_irq_msi(dev->irq); ++ if (!entry) { ++ WARN_ON(1); ++ return; ++ } + pos = entry->msi_attrib.pos; + + pci_intx_for_msi(dev, 0); +Index: linux-2.6-tip/drivers/block/floppy.c +=================================================================== +--- linux-2.6-tip.orig/drivers/block/floppy.c ++++ linux-2.6-tip/drivers/block/floppy.c +@@ -4148,6 +4148,28 @@ static void floppy_device_release(struct + { + } + ++static int floppy_suspend(struct platform_device *dev, pm_message_t state) ++{ ++ floppy_release_irq_and_dma(); ++ ++ return 0; ++} ++ ++static int floppy_resume(struct platform_device *dev) ++{ ++ floppy_grab_irq_and_dma(); ++ ++ return 0; ++} ++ ++static struct platform_driver floppy_driver = { ++ .suspend = floppy_suspend, ++ .resume = floppy_resume, ++ .driver = { ++ .name = "floppy", ++ }, ++}; ++ + static struct platform_device floppy_device[N_DRIVE]; + + static struct kobject *floppy_find(dev_t dev, int *part, void *data) +@@ -4196,10 +4218,14 @@ static int __init floppy_init(void) + if (err) + goto out_put_disk; + ++ err = platform_driver_register(&floppy_driver); ++ if (err) ++ goto out_unreg_blkdev; ++ + floppy_queue = blk_init_queue(do_fd_request, &floppy_lock); + if (!floppy_queue) { + err = -ENOMEM; +- goto out_unreg_blkdev; ++ goto out_unreg_driver; + } + blk_queue_max_sectors(floppy_queue, 64); + +@@ -4346,6 +4372,8 @@ out_flush_work: + out_unreg_region: + blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); + 
blk_cleanup_queue(floppy_queue); ++out_unreg_driver: ++ platform_driver_unregister(&floppy_driver); + out_unreg_blkdev: + unregister_blkdev(FLOPPY_MAJOR, "fd"); + out_put_disk: +@@ -4567,6 +4595,7 @@ static void __exit floppy_module_exit(vo + blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); + unregister_blkdev(FLOPPY_MAJOR, "fd"); + ++ platform_driver_unregister(&floppy_driver); + for (drive = 0; drive < N_DRIVE; drive++) { + del_timer_sync(&motor_off_timer[drive]); + +Index: linux-2.6-tip/net/core/flow.c +=================================================================== +--- linux-2.6-tip.orig/net/core/flow.c ++++ linux-2.6-tip/net/core/flow.c +@@ -39,9 +39,10 @@ atomic_t flow_cache_genid = ATOMIC_INIT( + + static u32 flow_hash_shift; + #define flow_hash_size (1 << flow_hash_shift) +-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; + +-#define flow_table(cpu) (per_cpu(flow_tables, cpu)) ++static DEFINE_PER_CPU_LOCKED(struct flow_cache_entry **, flow_tables); ++ ++#define flow_table(cpu) (per_cpu_var_locked(flow_tables, cpu)) + + static struct kmem_cache *flow_cachep __read_mostly; + +@@ -168,24 +169,24 @@ static int flow_key_compare(struct flowi + void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver) + { +- struct flow_cache_entry *fle, **head; ++ struct flow_cache_entry **table, *fle, **head = NULL /* shut up GCC */; + unsigned int hash; + int cpu; + + local_bh_disable(); +- cpu = smp_processor_id(); ++ table = get_cpu_var_locked(flow_tables, &cpu); + + fle = NULL; + /* Packet really early in init? Making flow_cache_init a + * pre-smp initcall would solve this. --RR */ +- if (!flow_table(cpu)) ++ if (!table) + goto nocache; + + if (flow_hash_rnd_recalc(cpu)) + flow_new_hash_rnd(cpu); + hash = flow_hash_code(key, cpu); + +- head = &flow_table(cpu)[hash]; ++ head = &table[hash]; + for (fle = *head; fle; fle = fle->next) { + if (fle->family == family && + fle->dir == dir && +@@ -195,6 +196,7 @@ void *flow_cache_lookup(struct net *net, + + if (ret) + atomic_inc(fle->object_ref); ++ put_cpu_var_locked(flow_tables, cpu); + local_bh_enable(); + + return ret; +@@ -220,6 +222,8 @@ void *flow_cache_lookup(struct net *net, + } + + nocache: ++ put_cpu_var_locked(flow_tables, cpu); ++ + { + int err; + void *obj; +@@ -249,14 +253,15 @@ nocache: + static void flow_cache_flush_tasklet(unsigned long data) + { + struct flow_flush_info *info = (void *)data; ++ struct flow_cache_entry **table; + int i; + int cpu; + +- cpu = smp_processor_id(); ++ table = get_cpu_var_locked(flow_tables, &cpu); + for (i = 0; i < flow_hash_size; i++) { + struct flow_cache_entry *fle; + +- fle = flow_table(cpu)[i]; ++ fle = table[i]; + for (; fle; fle = fle->next) { + unsigned genid = atomic_read(&flow_cache_genid); + +@@ -267,6 +272,7 @@ static void flow_cache_flush_tasklet(uns + atomic_dec(fle->object_ref); + } + } ++ put_cpu_var_locked(flow_tables, cpu); + + if (atomic_dec_and_test(&info->cpuleft)) + complete(&info->completion); +Index: linux-2.6-tip/fs/nfs/iostat.h +=================================================================== +--- linux-2.6-tip.orig/fs/nfs/iostat.h ++++ linux-2.6-tip/fs/nfs/iostat.h +@@ -28,7 +28,7 @@ static inline void nfs_inc_server_stats( + cpu = get_cpu(); + iostats = per_cpu_ptr(server->io_stats, cpu); + iostats->events[stat]++; +- put_cpu_no_resched(); ++ put_cpu(); + } + + static inline void nfs_inc_stats(const struct inode *inode, +@@ -47,7 +47,7 @@ static inline void nfs_add_server_stats( + cpu = get_cpu(); + 
iostats = per_cpu_ptr(server->io_stats, cpu); + iostats->bytes[stat] += addend; +- put_cpu_no_resched(); ++ put_cpu(); + } + + static inline void nfs_add_stats(const struct inode *inode, +Index: linux-2.6-tip/drivers/net/loopback.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/loopback.c ++++ linux-2.6-tip/drivers/net/loopback.c +@@ -76,13 +76,13 @@ static int loopback_xmit(struct sk_buff + + skb->protocol = eth_type_trans(skb,dev); + +- /* it's OK to use per_cpu_ptr() because BHs are off */ + pcpu_lstats = dev->ml_priv; +- lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id()); ++ lb_stats = per_cpu_ptr(pcpu_lstats, get_cpu()); + lb_stats->bytes += skb->len; + lb_stats->packets++; ++ put_cpu(); + +- netif_rx(skb); ++ netif_rx_ni(skb); + + return 0; + } +Index: linux-2.6-tip/include/asm-generic/cmpxchg-local.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/cmpxchg-local.h ++++ linux-2.6-tip/include/asm-generic/cmpxchg-local.h +@@ -20,7 +20,7 @@ static inline unsigned long __cmpxchg_lo + if (size == 8 && sizeof(unsigned long) != 8) + wrong_size_cmpxchg(ptr); + +- local_irq_save(flags); ++ raw_local_irq_save(flags); + switch (size) { + case 1: prev = *(u8 *)ptr; + if (prev == old) +@@ -41,7 +41,7 @@ static inline unsigned long __cmpxchg_lo + default: + wrong_size_cmpxchg(ptr); + } +- local_irq_restore(flags); ++ raw_local_irq_restore(flags); + return prev; + } + +@@ -54,11 +54,11 @@ static inline u64 __cmpxchg64_local_gene + u64 prev; + unsigned long flags; + +- local_irq_save(flags); ++ raw_local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; +- local_irq_restore(flags); ++ raw_local_irq_restore(flags); + return prev; + } + +Index: linux-2.6-tip/kernel/Kconfig.preempt +=================================================================== +--- linux-2.6-tip.orig/kernel/Kconfig.preempt ++++ linux-2.6-tip/kernel/Kconfig.preempt +@@ -1,14 +1,13 @@ +- + choice +- prompt "Preemption Model" +- default PREEMPT_NONE ++ prompt "Preemption Mode" ++ default PREEMPT_RT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" + help +- This is the traditional Linux preemption model, geared towards ++ This is the traditional Linux preemption model geared towards + throughput. It will still provide good latencies most of the +- time, but there are no guarantees and occasional longer delays ++ time but there are no guarantees and occasional long delays + are possible. + + Select this option if you are building a kernel for a server or +@@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new +- preemption points have been selected to reduce the maximum ++ preemption points have been selected to minimize the maximum + latency of rescheduling, providing faster application reactions, + at the cost of slightly lower throughput. + +@@ -33,22 +32,91 @@ config PREEMPT_VOLUNTARY + + Select this if you are building a kernel for a desktop system. + +-config PREEMPT ++config PREEMPT_DESKTOP + bool "Preemptible Kernel (Low-Latency Desktop)" + help + This option reduces the latency of the kernel by making +- all kernel code (that is not executing in a critical section) ++ all kernel code that is not executing in a critical section + preemptible. 
This allows reaction to interactive events by + permitting a low priority process to be preempted involuntarily + even if it is in kernel mode executing a system call and would +- otherwise not be about to reach a natural preemption point. +- This allows applications to run more 'smoothly' even when the +- system is under load, at the cost of slightly lower throughput +- and a slight runtime overhead to kernel code. ++ otherwise not about to reach a preemption point. This allows ++ applications to run more 'smoothly' even when the system is ++ under load, at the cost of slighly lower throughput and a ++ slight runtime overhead to kernel code. ++ ++ (According to profiles, when this mode is selected then even ++ during kernel-intense workloads the system is in an immediately ++ preemptible state more than 50% of the time.) + + Select this if you are building a kernel for a desktop or + embedded system with latency requirements in the milliseconds + range. + ++config PREEMPT_RT ++ bool "Complete Preemption (Real-Time)" ++ select PREEMPT_SOFTIRQS ++ select PREEMPT_HARDIRQS ++ select PREEMPT_RCU ++ select RT_MUTEXES ++ help ++ This option further reduces the scheduling latency of the ++ kernel by replacing almost every spinlock used by the kernel ++ with preemptible mutexes and thus making all but the most ++ critical kernel code involuntarily preemptible. The remaining ++ handful of lowlevel non-preemptible codepaths are short and ++ have a deterministic latency of a couple of tens of ++ microseconds (depending on the hardware). This also allows ++ applications to run more 'smoothly' even when the system is ++ under load, at the cost of lower throughput and runtime ++ overhead to kernel code. ++ ++ (According to profiles, when this mode is selected then even ++ during kernel-intense workloads the system is in an immediately ++ preemptible state more than 95% of the time.) ++ ++ Select this if you are building a kernel for a desktop, ++ embedded or real-time system with guaranteed latency ++ requirements of 100 usecs or lower. ++ + endchoice + ++config PREEMPT ++ bool ++ default y ++ depends on PREEMPT_DESKTOP || PREEMPT_RT ++ ++config PREEMPT_SOFTIRQS ++ bool "Thread Softirqs" ++ default n ++# depends on PREEMPT ++ help ++ This option reduces the latency of the kernel by 'threading' ++ soft interrupts. This means that all softirqs will execute ++ in softirqd's context. While this helps latency, it can also ++ reduce performance. ++ ++ The threading of softirqs can also be controlled via ++ /proc/sys/kernel/softirq_preemption runtime flag and the ++ sofirq-preempt=0/1 boot-time option. ++ ++ Say N if you are unsure. ++ ++config PREEMPT_HARDIRQS ++ bool "Thread Hardirqs" ++ default n ++ depends on GENERIC_HARDIRQS_NO__DO_IRQ ++ select PREEMPT_SOFTIRQS ++ help ++ This option reduces the latency of the kernel by 'threading' ++ hardirqs. This means that all (or selected) hardirqs will run ++ in their own kernel thread context. While this helps latency, ++ this feature can also reduce performance. ++ ++ The threading of hardirqs can also be controlled via the ++ /proc/sys/kernel/hardirq_preemption runtime flag and the ++ hardirq-preempt=0/1 boot-time option. Per-irq threading can ++ be enabled/disable via the /proc/irq///threaded ++ runtime flags. ++ ++ Say N if you are unsure. 
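
A pattern that recurs in the networking and NFS hunks above (net/core/sock.c, fs/nfs/iostat.h, drivers/net/loopback.c) is replacing a bare smp_processor_id() or a put_cpu_no_resched() with a get_cpu()/put_cpu() pair, so that preemption stays disabled for exactly as long as the per-CPU data is in use; with the preemption models added here, a task could otherwise migrate between reading the CPU number and touching the per-CPU slot. The following is a minimal sketch of that pattern only; the my_stats structure and my_stats_inc() helper are invented for illustration and do not appear in the patch.

#include <linux/percpu.h>
#include <linux/smp.h>

/* Invented per-CPU counter used only to illustrate the pattern. */
struct my_stats {
	unsigned long packets;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);

static void my_stats_inc(void)
{
	int cpu = get_cpu();			/* disables preemption */

	per_cpu(my_stats, cpu).packets++;	/* safe: we cannot migrate here */
	put_cpu();				/* re-enables preemption */
}

The net/core/flow.c hunk applies the same idea with the -rt-specific get_cpu_var_locked()/put_cpu_var_locked() helpers, which additionally take a per-CPU lock rather than relying on preemption being disabled.
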
+Index: linux-2.6-tip/kernel/irq/autoprobe.c +=================================================================== +--- linux-2.6-tip.orig/kernel/irq/autoprobe.c ++++ linux-2.6-tip/kernel/irq/autoprobe.c +@@ -7,6 +7,7 @@ + */ + + #include ++#include + #include + #include + #include +Index: linux-2.6-tip/include/linux/hrtimer.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/hrtimer.h ++++ linux-2.6-tip/include/linux/hrtimer.h +@@ -105,6 +105,7 @@ struct hrtimer { + struct hrtimer_clock_base *base; + unsigned long state; + struct list_head cb_entry; ++ int irqsafe; + #ifdef CONFIG_TIMER_STATS + int start_pid; + void *start_site; +@@ -140,6 +141,7 @@ struct hrtimer_clock_base { + struct hrtimer_cpu_base *cpu_base; + clockid_t index; + struct rb_root active; ++ struct list_head expired; + struct rb_node *first; + ktime_t resolution; + ktime_t (*get_time)(void); +@@ -166,13 +168,16 @@ struct hrtimer_clock_base { + * @nr_events: Total number of timer interrupt events + */ + struct hrtimer_cpu_base { +- spinlock_t lock; ++ raw_spinlock_t lock; + struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; + #ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires_next; + int hres_active; + unsigned long nr_events; + #endif ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ wait_queue_head_t wait; ++#endif + }; + + static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) +@@ -360,6 +365,13 @@ static inline int hrtimer_restart(struct + return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + } + ++/* Softirq preemption could deadlock timer removal */ ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ extern void hrtimer_wait_for_timer(const struct hrtimer *timer); ++#else ++# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) ++#endif ++ + /* Query timers: */ + extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); + extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); +Index: linux-2.6-tip/kernel/hrtimer.c +=================================================================== +--- linux-2.6-tip.orig/kernel/hrtimer.c ++++ linux-2.6-tip/kernel/hrtimer.c +@@ -476,9 +476,9 @@ static inline int hrtimer_is_hres_enable + /* + * Is the high resolution mode active ? + */ +-static inline int hrtimer_hres_active(void) ++static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) + { +- return __get_cpu_var(hrtimer_bases).hres_active; ++ return cpu_base->hres_active; + } + + /* +@@ -538,8 +538,7 @@ static int hrtimer_reprogram(struct hrti + * When the callback is running, we do not reprogram the clock event + * device. The timer callback is either running on a different CPU or + * the callback is executed in the hrtimer_interrupt context. The +- * reprogramming is handled either by the softirq, which called the +- * callback or at the end of the hrtimer_interrupt. ++ * reprogramming is handled at the end of the hrtimer_interrupt. 
+ */ + if (hrtimer_callback_running(timer)) + return 0; +@@ -573,11 +572,12 @@ static int hrtimer_reprogram(struct hrti + */ + static void retrigger_next_event(void *arg) + { +- struct hrtimer_cpu_base *base; ++ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); ++ + struct timespec realtime_offset; + unsigned long seq; + +- if (!hrtimer_hres_active()) ++ if (!hrtimer_hres_active(base)) + return; + + do { +@@ -587,8 +587,6 @@ static void retrigger_next_event(void *a + -wall_to_monotonic.tv_nsec); + } while (read_seqretry(&xtime_lock, seq)); + +- base = &__get_cpu_var(hrtimer_bases); +- + /* Adjust CLOCK_REALTIME offset */ + spin_lock(&base->lock); + base->clock_base[CLOCK_REALTIME].offset = +@@ -643,6 +641,8 @@ static inline void hrtimer_init_timer_hr + { + } + ++static void __run_hrtimer(struct hrtimer *timer); ++static int hrtimer_rt_defer(struct hrtimer *timer); + + /* + * When High resolution timers are active, try to reprogram. Note, that in case +@@ -654,7 +654,27 @@ static inline int hrtimer_enqueue_reprog + struct hrtimer_clock_base *base, + int wakeup) + { ++#ifdef CONFIG_PREEMPT_RT ++again: ++#endif + if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * Move softirq based timers away from the rbtree in ++ * case it expired already. Otherwise we would have a ++ * stale base->first entry until the softirq runs. ++ */ ++ if (!hrtimer_rt_defer(timer)) { ++ __run_hrtimer(timer); ++ /* ++ * __run_hrtimer might have requeued timer and ++ * it could be base->first again. ++ */ ++ if (base->first == &timer->node) ++ goto again; ++ return 1; ++ } ++#endif + if (wakeup) { + spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); +@@ -671,10 +691,8 @@ static inline int hrtimer_enqueue_reprog + /* + * Switch to high resolution mode + */ +-static int hrtimer_switch_to_hres(void) ++static int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) + { +- int cpu = smp_processor_id(); +- struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); + unsigned long flags; + + if (base->hres_active) +@@ -685,7 +703,7 @@ static int hrtimer_switch_to_hres(void) + if (tick_init_highres()) { + local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " +- "mode on CPU %d\n", cpu); ++ "mode on CPU %d\n", raw_smp_processor_id()); + return 0; + } + base->hres_active = 1; +@@ -697,16 +715,20 @@ static int hrtimer_switch_to_hres(void) + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); +- printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", +- smp_processor_id()); + return 1; + } + + #else + +-static inline int hrtimer_hres_active(void) { return 0; } ++static inline int hrtimer_hres_active(struct hrtimer_cpu_base *base) ++{ ++ return 0; ++} + static inline int hrtimer_is_hres_enabled(void) { return 0; } +-static inline int hrtimer_switch_to_hres(void) { return 0; } ++static inline int hrtimer_switch_to_hres(struct hrtimer_cpu_base *base) ++{ ++ return 0; ++} + static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } + static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base, +@@ -714,6 +736,13 @@ static inline int hrtimer_enqueue_reprog + { + return 0; + } ++ ++static inline int hrtimer_reprogram(struct hrtimer *timer, ++ struct hrtimer_clock_base *base) ++{ ++ return 0; ++} ++ + static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } + static 
inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } + +@@ -836,6 +865,32 @@ static int enqueue_hrtimer(struct hrtime + return leftmost; + } + ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++# define wake_up_timer_waiters(b) wake_up(&(b)->wait) ++ ++/** ++ * hrtimer_wait_for_timer - Wait for a running timer ++ * ++ * @timer: timer to wait for ++ * ++ * The function waits in case the timers callback function is ++ * currently executed on the waitqueue of the timer base. The ++ * waitqueue is woken up after the timer callback function has ++ * finished execution. ++ */ ++void hrtimer_wait_for_timer(const struct hrtimer *timer) ++{ ++ struct hrtimer_clock_base *base = timer->base; ++ ++ if (base && base->cpu_base && !timer->irqsafe) ++ wait_event(base->cpu_base->wait, ++ !(timer->state & HRTIMER_STATE_CALLBACK)); ++} ++ ++#else ++# define wake_up_timer_waiters(b) do { } while (0) ++#endif ++ + /* + * __remove_hrtimer - internal function to remove a timer + * +@@ -851,6 +906,11 @@ static void __remove_hrtimer(struct hrti + unsigned long newstate, int reprogram) + { + if (timer->state & HRTIMER_STATE_ENQUEUED) { ++ ++ if (unlikely(!list_empty(&timer->cb_entry))) { ++ list_del_init(&timer->cb_entry); ++ goto out; ++ } + /* + * Remove the timer from the rbtree and replace the + * first entry pointer if necessary. +@@ -858,11 +918,12 @@ static void __remove_hrtimer(struct hrti + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); + /* Reprogram the clock event device. if enabled */ +- if (reprogram && hrtimer_hres_active()) ++ if (reprogram && hrtimer_hres_active(base->cpu_base)) + hrtimer_force_reprogram(base->cpu_base); + } + rb_erase(&timer->node, &base->active); + } ++out: + timer->state = newstate; + } + +@@ -1022,7 +1083,7 @@ int hrtimer_cancel(struct hrtimer *timer + + if (ret >= 0) + return ret; +- cpu_relax(); ++ hrtimer_wait_for_timer(timer); + } + } + EXPORT_SYMBOL_GPL(hrtimer_cancel); +@@ -1062,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void) + + spin_lock_irqsave(&cpu_base->lock, flags); + +- if (!hrtimer_hres_active()) { ++ if (!hrtimer_hres_active(cpu_base)) { + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + +@@ -1176,6 +1237,114 @@ static void __run_hrtimer(struct hrtimer + timer->state &= ~HRTIMER_STATE_CALLBACK; + } + ++#ifdef CONFIG_PREEMPT_RT ++ ++static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer, ++ struct hrtimer_clock_base *base) ++{ ++ /* ++ * Note, we clear the callback flag before we requeue the ++ * timer otherwise we trigger the callback_running() check ++ * in hrtimer_reprogram(). ++ */ ++ timer->state &= ~HRTIMER_STATE_CALLBACK; ++ ++ if (restart != HRTIMER_NORESTART) { ++ BUG_ON(hrtimer_active(timer)); ++ /* ++ * Enqueue the timer, if it's the leftmost timer then ++ * we need to reprogram it. ++ */ ++ if (!enqueue_hrtimer(timer, base)) ++ return; ++ ++ if (hrtimer_reprogram(timer, base)) ++ goto requeue; ++ ++ } else if (hrtimer_active(timer)) { ++ /* ++ * If the timer was rearmed on another CPU, reprogram ++ * the event device. ++ */ ++ if (base->first == &timer->node && ++ hrtimer_reprogram(timer, base)) ++ goto requeue; ++ } ++ return; ++ ++requeue: ++ /* ++ * Timer is expired. Thus move it from tree to pending list ++ * again. ++ */ ++ __remove_hrtimer(timer, base, timer->state, 0); ++ list_add_tail(&timer->cb_entry, &base->expired); ++} ++ ++/* ++ * The changes in mainline which removed the callback modes from ++ * hrtimer are not yet working with -rt. 
The non wakeup_process() ++ * based callbacks which involve sleeping locks need to be treated ++ * seperately. ++ */ ++static void hrtimer_rt_run_pending(void) ++{ ++ enum hrtimer_restart (*fn)(struct hrtimer *); ++ struct hrtimer_cpu_base *cpu_base; ++ struct hrtimer_clock_base *base; ++ struct hrtimer *timer; ++ int index, restart; ++ ++ local_irq_disable(); ++ cpu_base = &per_cpu(hrtimer_bases, smp_processor_id()); ++ ++ spin_lock(&cpu_base->lock); ++ ++ for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { ++ base = &cpu_base->clock_base[index]; ++ ++ while (!list_empty(&base->expired)) { ++ timer = list_first_entry(&base->expired, ++ struct hrtimer, cb_entry); ++ ++ /* ++ * Same as the above __run_hrtimer function ++ * just we run with interrupts enabled. ++ */ ++ debug_hrtimer_deactivate(timer); ++ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); ++ timer_stats_account_hrtimer(timer); ++ fn = timer->function; ++ ++ spin_unlock_irq(&cpu_base->lock); ++ restart = fn(timer); ++ spin_lock_irq(&cpu_base->lock); ++ ++ hrtimer_rt_reprogram(restart, timer, base); ++ } ++ } ++ spin_unlock_irq(&cpu_base->lock); ++ ++ wake_up_timer_waiters(cpu_base); ++} ++ ++static int hrtimer_rt_defer(struct hrtimer *timer) ++{ ++ if (timer->irqsafe) ++ return 0; ++ ++ __remove_hrtimer(timer, timer->base, timer->state, 0); ++ list_add_tail(&timer->cb_entry, &timer->base->expired); ++ return 1; ++} ++ ++#else ++ ++static inline void hrtimer_rt_run_pending(void) { } ++static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; } ++ ++#endif ++ + #ifdef CONFIG_HIGH_RES_TIMERS + + static int force_clock_reprogram; +@@ -1211,7 +1380,7 @@ void hrtimer_interrupt(struct clock_even + struct hrtimer_clock_base *base; + ktime_t expires_next, now; + int nr_retries = 0; +- int i; ++ int i, raise = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; +@@ -1264,7 +1433,10 @@ void hrtimer_interrupt(struct clock_even + break; + } + +- __run_hrtimer(timer); ++ if (!hrtimer_rt_defer(timer)) ++ __run_hrtimer(timer); ++ else ++ raise = 1; + } + spin_unlock(&cpu_base->lock); + base++; +@@ -1277,6 +1449,9 @@ void hrtimer_interrupt(struct clock_even + if (tick_program_event(expires_next, force_clock_reprogram)) + goto retry; + } ++ ++ if (raise) ++ raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + /* +@@ -1285,9 +1460,11 @@ void hrtimer_interrupt(struct clock_even + */ + static void __hrtimer_peek_ahead_timers(void) + { ++ struct hrtimer_cpu_base *cpu_base; + struct tick_device *td; + +- if (!hrtimer_hres_active()) ++ cpu_base = &__get_cpu_var(hrtimer_bases); ++ if (!hrtimer_hres_active(cpu_base)) + return; + + td = &__get_cpu_var(tick_cpu_device); +@@ -1313,17 +1490,17 @@ void hrtimer_peek_ahead_timers(void) + local_irq_restore(flags); + } + +-static void run_hrtimer_softirq(struct softirq_action *h) +-{ +- hrtimer_peek_ahead_timers(); +-} +- + #else /* CONFIG_HIGH_RES_TIMERS */ + + static inline void __hrtimer_peek_ahead_timers(void) { } + + #endif /* !CONFIG_HIGH_RES_TIMERS */ + ++static void run_hrtimer_softirq(struct softirq_action *h) ++{ ++ hrtimer_rt_run_pending(); ++} ++ + /* + * Called from timer softirq every jiffy, expire hrtimers: + * +@@ -1333,7 +1510,9 @@ static inline void __hrtimer_peek_ahead_ + */ + void hrtimer_run_pending(void) + { +- if (hrtimer_hres_active()) ++ struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ++ ++ if (hrtimer_hres_active(cpu_base)) + return; + + /* +@@ -1345,7 +1524,7 @@ void hrtimer_run_pending(void) + * deadlock vs. xtime_lock. 
+ */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) +- hrtimer_switch_to_hres(); ++ hrtimer_switch_to_hres(cpu_base); + } + + /* +@@ -1354,11 +1533,12 @@ void hrtimer_run_pending(void) + void hrtimer_run_queues(void) + { + struct rb_node *node; +- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ++ struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; +- int index, gettime = 1; ++ int index, gettime = 1, raise = 0; + +- if (hrtimer_hres_active()) ++ cpu_base = &per_cpu(hrtimer_bases, raw_smp_processor_id()); ++ if (hrtimer_hres_active(cpu_base)) + return; + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { +@@ -1382,10 +1562,16 @@ void hrtimer_run_queues(void) + hrtimer_get_expires_tv64(timer)) + break; + +- __run_hrtimer(timer); ++ if (!hrtimer_rt_defer(timer)) ++ __run_hrtimer(timer); ++ else ++ raise = 1; + } + spin_unlock(&cpu_base->lock); + } ++ ++ if (raise) ++ raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + /* +@@ -1407,6 +1593,7 @@ static enum hrtimer_restart hrtimer_wake + void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) + { + sl->timer.function = hrtimer_wakeup; ++ sl->timer.irqsafe = 1; + sl->task = task; + } + +@@ -1541,10 +1728,15 @@ static void __cpuinit init_hrtimers_cpu( + + spin_lock_init(&cpu_base->lock); + +- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) ++ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + cpu_base->clock_base[i].cpu_base = cpu_base; ++ INIT_LIST_HEAD(&cpu_base->clock_base[i].expired); ++ } + + hrtimer_init_hres(cpu_base); ++#ifdef CONFIG_PREEMPT_RT ++ init_waitqueue_head(&cpu_base->wait); ++#endif + } + + #ifdef CONFIG_HOTPLUG_CPU +@@ -1657,9 +1849,7 @@ void __init hrtimers_init(void) + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&hrtimers_nb); +-#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +-#endif + } + + /** +Index: linux-2.6-tip/kernel/itimer.c +=================================================================== +--- linux-2.6-tip.orig/kernel/itimer.c ++++ linux-2.6-tip/kernel/itimer.c +@@ -161,6 +161,7 @@ again: + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); ++ hrtimer_wait_for_timer(&tsk->signal->real_timer); + goto again; + } + expires = timeval_to_ktime(value->it_value); +Index: linux-2.6-tip/include/linux/bottom_half.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/bottom_half.h ++++ linux-2.6-tip/include/linux/bottom_half.h +@@ -1,9 +1,17 @@ + #ifndef _LINUX_BH_H + #define _LINUX_BH_H + ++#ifdef CONFIG_PREEMPT_HARDIRQS ++# define local_bh_disable() do { } while (0) ++# define __local_bh_disable(ip) do { } while (0) ++# define _local_bh_enable() do { } while (0) ++# define local_bh_enable() do { } while (0) ++# define local_bh_enable_ip(ip) do { } while (0) ++#else + extern void local_bh_disable(void); + extern void _local_bh_enable(void); + extern void local_bh_enable(void); + extern void local_bh_enable_ip(unsigned long ip); ++#endif + + #endif /* _LINUX_BH_H */ +Index: linux-2.6-tip/include/linux/preempt.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/preempt.h ++++ linux-2.6-tip/include/linux/preempt.h +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + + #if defined(CONFIG_DEBUG_PREEMPT) || 
defined(CONFIG_PREEMPT_TRACER) + extern void add_preempt_count(int val); +@@ -21,11 +22,12 @@ + #define inc_preempt_count() add_preempt_count(1) + #define dec_preempt_count() sub_preempt_count(1) + +-#define preempt_count() (current_thread_info()->preempt_count) ++#define preempt_count() (current_thread_info()->preempt_count) + + #ifdef CONFIG_PREEMPT + + asmlinkage void preempt_schedule(void); ++asmlinkage void preempt_schedule_irq(void); + + #define preempt_disable() \ + do { \ +@@ -33,12 +35,19 @@ do { \ + barrier(); \ + } while (0) + +-#define preempt_enable_no_resched() \ ++#define __preempt_enable_no_resched() \ + do { \ + barrier(); \ + dec_preempt_count(); \ + } while (0) + ++ ++#ifdef CONFIG_DEBUG_PREEMPT ++extern void notrace preempt_enable_no_resched(void); ++#else ++# define preempt_enable_no_resched() __preempt_enable_no_resched() ++#endif ++ + #define preempt_check_resched() \ + do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ +@@ -47,7 +56,7 @@ do { \ + + #define preempt_enable() \ + do { \ +- preempt_enable_no_resched(); \ ++ __preempt_enable_no_resched(); \ + barrier(); \ + preempt_check_resched(); \ + } while (0) +@@ -84,6 +93,7 @@ do { \ + + #define preempt_disable() do { } while (0) + #define preempt_enable_no_resched() do { } while (0) ++#define __preempt_enable_no_resched() do { } while (0) + #define preempt_enable() do { } while (0) + #define preempt_check_resched() do { } while (0) + +@@ -91,6 +101,8 @@ do { \ + #define preempt_enable_no_resched_notrace() do { } while (0) + #define preempt_enable_notrace() do { } while (0) + ++#define preempt_schedule_irq() do { } while (0) ++ + #endif + + #ifdef CONFIG_PREEMPT_NOTIFIERS +Index: linux-2.6-tip/net/ipv4/tcp.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv4/tcp.c ++++ linux-2.6-tip/net/ipv4/tcp.c +@@ -1323,11 +1323,11 @@ int tcp_recvmsg(struct kiocb *iocb, stru + (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && + dma_find_channel(DMA_MEMCPY)) { +- preempt_enable_no_resched(); ++ preempt_enable(); + tp->ucopy.pinned_list = + dma_pin_iovec_pages(msg->msg_iov, len); + } else { +- preempt_enable_no_resched(); ++ preempt_enable(); + } + } + #endif +Index: linux-2.6-tip/arch/x86/include/asm/rwsem.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/rwsem.h ++++ linux-2.6-tip/arch/x86/include/asm/rwsem.h +@@ -44,14 +44,14 @@ + + struct rwsem_waiter; + +-extern asmregparm struct rw_semaphore * +- rwsem_down_read_failed(struct rw_semaphore *sem); +-extern asmregparm struct rw_semaphore * +- rwsem_down_write_failed(struct rw_semaphore *sem); +-extern asmregparm struct rw_semaphore * +- rwsem_wake(struct rw_semaphore *); +-extern asmregparm struct rw_semaphore * +- rwsem_downgrade_wake(struct rw_semaphore *sem); ++extern asmregparm struct compat_rw_semaphore * ++ rwsem_down_read_failed(struct compat_rw_semaphore *sem); ++extern asmregparm struct compat_rw_semaphore * ++ rwsem_down_write_failed(struct compat_rw_semaphore *sem); ++extern asmregparm struct compat_rw_semaphore * ++ rwsem_wake(struct compat_rw_semaphore *); ++extern asmregparm struct compat_rw_semaphore * ++ rwsem_downgrade_wake(struct compat_rw_semaphore *sem); + + /* + * the semaphore definition +@@ -64,7 +64,7 @@ extern asmregparm struct rw_semaphore * + #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS + #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + +-struct rw_semaphore 
{ ++struct compat_rw_semaphore { + signed long count; + spinlock_t wait_lock; + struct list_head wait_list; +@@ -86,23 +86,23 @@ struct rw_semaphore { + LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \ + } + +-#define DECLARE_RWSEM(name) \ +- struct rw_semaphore name = __RWSEM_INITIALIZER(name) ++#define COMPAT_DECLARE_RWSEM(name) \ ++ struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name) + +-extern void __init_rwsem(struct rw_semaphore *sem, const char *name, ++extern void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, + struct lock_class_key *key); + +-#define init_rwsem(sem) \ ++#define compat_init_rwsem(sem) \ + do { \ + static struct lock_class_key __key; \ + \ +- __init_rwsem((sem), #sem, &__key); \ ++ __compat_init_rwsem((sem), #sem, &__key); \ + } while (0) + + /* + * lock for reading + */ +-static inline void __down_read(struct rw_semaphore *sem) ++static inline void __down_read(struct compat_rw_semaphore *sem) + { + asm volatile("# beginning down_read\n\t" + LOCK_PREFIX " incl (%%eax)\n\t" +@@ -119,7 +119,7 @@ static inline void __down_read(struct rw + /* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +-static inline int __down_read_trylock(struct rw_semaphore *sem) ++static inline int __down_read_trylock(struct compat_rw_semaphore *sem) + { + __s32 result, tmp; + asm volatile("# beginning __down_read_trylock\n\t" +@@ -141,7 +141,8 @@ static inline int __down_read_trylock(st + /* + * lock for writing + */ +-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) ++static inline void ++__down_write_nested(struct compat_rw_semaphore *sem, int subclass) + { + int tmp; + +@@ -160,7 +161,7 @@ static inline void __down_write_nested(s + : "memory", "cc"); + } + +-static inline void __down_write(struct rw_semaphore *sem) ++static inline void __down_write(struct compat_rw_semaphore *sem) + { + __down_write_nested(sem, 0); + } +@@ -168,7 +169,7 @@ static inline void __down_write(struct r + /* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +-static inline int __down_write_trylock(struct rw_semaphore *sem) ++static inline int __down_write_trylock(struct compat_rw_semaphore *sem) + { + signed long ret = cmpxchg(&sem->count, + RWSEM_UNLOCKED_VALUE, +@@ -181,7 +182,7 @@ static inline int __down_write_trylock(s + /* + * unlock after reading + */ +-static inline void __up_read(struct rw_semaphore *sem) ++static inline void __up_read(struct compat_rw_semaphore *sem) + { + __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + asm volatile("# beginning __up_read\n\t" +@@ -199,7 +200,7 @@ static inline void __up_read(struct rw_s + /* + * unlock after writing + */ +-static inline void __up_write(struct rw_semaphore *sem) ++static inline void __up_write(struct compat_rw_semaphore *sem) + { + asm volatile("# beginning __up_write\n\t" + " movl %2,%%edx\n\t" +@@ -218,7 +219,7 @@ static inline void __up_write(struct rw_ + /* + * downgrade write lock to read lock + */ +-static inline void __downgrade_write(struct rw_semaphore *sem) ++static inline void __downgrade_write(struct compat_rw_semaphore *sem) + { + asm volatile("# beginning __downgrade_write\n\t" + LOCK_PREFIX " addl %2,(%%eax)\n\t" +@@ -235,7 +236,7 @@ static inline void __downgrade_write(str + /* + * implement atomic add functionality + */ +-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) ++static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem) + { + asm volatile(LOCK_PREFIX "addl %1,%0" + : 
"+m" (sem->count) +@@ -245,7 +246,7 @@ static inline void rwsem_atomic_add(int + /* + * implement exchange and add functionality + */ +-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) ++static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem) + { + int tmp = delta; + +@@ -256,7 +257,7 @@ static inline int rwsem_atomic_update(in + return tmp + delta; + } + +-static inline int rwsem_is_locked(struct rw_semaphore *sem) ++static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) + { + return (sem->count != 0); + } +Index: linux-2.6-tip/arch/x86/include/asm/spinlock_types.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/spinlock_types.h ++++ linux-2.6-tip/arch/x86/include/asm/spinlock_types.h +@@ -7,13 +7,13 @@ + + typedef struct raw_spinlock { + unsigned int slock; +-} raw_spinlock_t; ++} __raw_spinlock_t; + + #define __RAW_SPIN_LOCK_UNLOCKED { 0 } + + typedef struct { + unsigned int lock; +-} raw_rwlock_t; ++} __raw_rwlock_t; + + #define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } + +Index: linux-2.6-tip/arch/x86/kernel/vsyscall_64.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/vsyscall_64.c ++++ linux-2.6-tip/arch/x86/kernel/vsyscall_64.c +@@ -59,7 +59,7 @@ int __vgetcpu_mode __section_vgetcpu_mod + + struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = + { +- .lock = SEQLOCK_UNLOCKED, ++ .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), + .sysctl_enabled = 1, + }; + +@@ -78,14 +78,40 @@ void update_vsyscall(struct timespec *wa + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); ++ ++ if (likely(vsyscall_gtod_data.sysctl_enabled == 2)) { ++ struct timespec tmp = *(wall_time); ++ cycle_t (*vread)(void); ++ cycle_t now; ++ ++ vread = vsyscall_gtod_data.clock.vread; ++ if (likely(vread)) ++ now = vread(); ++ else ++ now = clock->read(); ++ ++ /* calculate interval: */ ++ now = (now - clock->cycle_last) & clock->mask; ++ /* convert to nsecs: */ ++ tmp.tv_nsec += ( now * clock->mult) >> clock->shift; ++ ++ while (tmp.tv_nsec >= NSEC_PER_SEC) { ++ tmp.tv_sec += 1; ++ tmp.tv_nsec -= NSEC_PER_SEC; ++ } ++ ++ vsyscall_gtod_data.wall_time_sec = tmp.tv_sec; ++ vsyscall_gtod_data.wall_time_nsec = tmp.tv_nsec; ++ } else { ++ vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; ++ vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; ++ } + /* copy vsyscall data */ + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.shift = clock->shift; +- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; +- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + } +@@ -123,6 +149,26 @@ static __always_inline void do_vgettimeo + unsigned seq; + unsigned long mult, shift, nsec; + cycle_t (*vread)(void); ++ ++ if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) { ++ struct timeval tmp; ++ ++ do { ++ barrier(); ++ tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; ++ tv->tv_usec = __vsyscall_gtod_data.wall_time_nsec; ++ barrier(); ++ tmp.tv_sec = __vsyscall_gtod_data.wall_time_sec; ++ tmp.tv_usec = __vsyscall_gtod_data.wall_time_nsec; ++ ++ } while (tmp.tv_usec != 
tv->tv_usec || ++ tmp.tv_sec != tv->tv_sec); ++ ++ tv->tv_usec /= NSEC_PER_MSEC; ++ tv->tv_usec *= USEC_PER_MSEC; ++ return; ++ } ++ + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + +@@ -138,7 +184,6 @@ static __always_inline void do_vgettimeo + * does not cause time warps: + */ + rdtsc_barrier(); +- now = vread(); + rdtsc_barrier(); + + base = __vsyscall_gtod_data.clock.cycle_last; +@@ -150,6 +195,7 @@ static __always_inline void do_vgettimeo + nsec = __vsyscall_gtod_data.wall_time_nsec; + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + ++ now = vread(); + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ +Index: linux-2.6-tip/drivers/input/ff-memless.c +=================================================================== +--- linux-2.6-tip.orig/drivers/input/ff-memless.c ++++ linux-2.6-tip/drivers/input/ff-memless.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + #include + #include + +Index: linux-2.6-tip/fs/proc/array.c +=================================================================== +--- linux-2.6-tip.orig/fs/proc/array.c ++++ linux-2.6-tip/fs/proc/array.c +@@ -134,12 +134,13 @@ static inline void task_name(struct seq_ + */ + static const char *task_state_array[] = { + "R (running)", /* 0 */ +- "S (sleeping)", /* 1 */ +- "D (disk sleep)", /* 2 */ +- "T (stopped)", /* 4 */ +- "T (tracing stop)", /* 8 */ +- "Z (zombie)", /* 16 */ +- "X (dead)" /* 32 */ ++ "M (running-mutex)", /* 1 */ ++ "S (sleeping)", /* 2 */ ++ "D (disk sleep)", /* 4 */ ++ "T (stopped)", /* 8 */ ++ "T (tracing stop)", /* 16 */ ++ "Z (zombie)", /* 32 */ ++ "X (dead)" /* 64 */ + }; + + static inline const char *get_task_state(struct task_struct *tsk) +@@ -321,6 +322,19 @@ static inline void task_context_switch_c + p->nivcsw); + } + ++#define get_blocked_on(t) (-1) ++ ++static inline void show_blocked_on(struct seq_file *m, struct task_struct *p) ++{ ++ pid_t pid = get_blocked_on(p); ++ ++ if (pid < 0) ++ return; ++ ++ seq_printf(m, "BlckOn: %d\n", pid); ++} ++ ++ + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) + { +@@ -340,6 +354,7 @@ int proc_pid_status(struct seq_file *m, + task_show_regs(m, task); + #endif + task_context_switch_counts(m, task); ++ show_blocked_on(m, task); + return 0; + } + +Index: linux-2.6-tip/include/linux/bit_spinlock.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/bit_spinlock.h ++++ linux-2.6-tip/include/linux/bit_spinlock.h +@@ -1,6 +1,8 @@ + #ifndef __LINUX_BIT_SPINLOCK_H + #define __LINUX_BIT_SPINLOCK_H + ++#if 0 ++ + /* + * bit-based spin_lock() + * +@@ -91,5 +93,7 @@ static inline int bit_spin_is_locked(int + #endif + } + ++#endif ++ + #endif /* __LINUX_BIT_SPINLOCK_H */ + +Index: linux-2.6-tip/include/linux/pickop.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/pickop.h +@@ -0,0 +1,32 @@ ++#ifndef _LINUX_PICKOP_H ++#define _LINUX_PICKOP_H ++ ++#undef PICK_TYPE_EQUAL ++#define PICK_TYPE_EQUAL(var, type) \ ++ __builtin_types_compatible_p(typeof(var), type) ++ ++extern int __bad_func_type(void); ++ ++#define PICK_FUNCTION(type1, type2, func1, func2, arg0, ...) 
\ ++do { \ ++ if (PICK_TYPE_EQUAL((arg0), type1)) \ ++ func1((type1)(arg0), ##__VA_ARGS__); \ ++ else if (PICK_TYPE_EQUAL((arg0), type2)) \ ++ func2((type2)(arg0), ##__VA_ARGS__); \ ++ else __bad_func_type(); \ ++} while (0) ++ ++#define PICK_FUNCTION_RET(type1, type2, func1, func2, arg0, ...) \ ++({ \ ++ unsigned long __ret; \ ++ \ ++ if (PICK_TYPE_EQUAL((arg0), type1)) \ ++ __ret = func1((type1)(arg0), ##__VA_ARGS__); \ ++ else if (PICK_TYPE_EQUAL((arg0), type2)) \ ++ __ret = func2((type2)(arg0), ##__VA_ARGS__); \ ++ else __ret = __bad_func_type(); \ ++ \ ++ __ret; \ ++}) ++ ++#endif /* _LINUX_PICKOP_H */ +Index: linux-2.6-tip/include/linux/rt_lock.h +=================================================================== +--- /dev/null ++++ linux-2.6-tip/include/linux/rt_lock.h +@@ -0,0 +1,274 @@ ++#ifndef __LINUX_RT_LOCK_H ++#define __LINUX_RT_LOCK_H ++ ++/* ++ * Real-Time Preemption Support ++ * ++ * started by Ingo Molnar: ++ * ++ * Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar ++ * ++ * This file contains the main data structure definitions. ++ */ ++#include ++#include ++#include ++ ++#ifdef CONFIG_PREEMPT_RT ++# define preempt_rt 1 ++/* ++ * spinlocks - an RT mutex plus lock-break field: ++ */ ++typedef struct { ++ struct rt_mutex lock; ++ unsigned int break_lock; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++} spinlock_t; ++ ++#ifdef CONFIG_DEBUG_RT_MUTEXES ++# define __RT_SPIN_INITIALIZER(name) \ ++ { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ ++ .save_state = 1, \ ++ .file = __FILE__, \ ++ .line = __LINE__, } ++#else ++# define __RT_SPIN_INITIALIZER(name) \ ++ { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) } ++#endif ++ ++#define __SPIN_LOCK_UNLOCKED(name) (spinlock_t) \ ++ { .lock = __RT_SPIN_INITIALIZER(name), \ ++ SPIN_DEP_MAP_INIT(name) } ++ ++#else /* !PREEMPT_RT */ ++ ++typedef raw_spinlock_t spinlock_t; ++ ++#define __SPIN_LOCK_UNLOCKED _RAW_SPIN_LOCK_UNLOCKED ++ ++#endif ++ ++#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(spin_old_style) ++ ++ ++#define __DEFINE_SPINLOCK(name) \ ++ spinlock_t name = __SPIN_LOCK_UNLOCKED(name) ++ ++#define DEFINE_SPINLOCK(name) \ ++ spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name) ++ ++#ifdef CONFIG_PREEMPT_RT ++ ++/* ++ * RW-semaphores are a spinlock plus a reader-depth count. ++ * ++ * Note that the semantics are different from the usual ++ * Linux rw-sems, in PREEMPT_RT mode we do not allow ++ * multiple readers to hold the lock at once, we only allow ++ * a read-lock owner to read-lock recursively. 
This is ++ * better for latency, makes the implementation inherently ++ * fair and makes it simpler as well: ++ */ ++struct rw_semaphore { ++ struct rt_mutex lock; ++ int read_depth; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++/* ++ * rwlocks - an RW semaphore plus lock-break field: ++ */ ++typedef struct { ++ struct rt_mutex lock; ++ int read_depth; ++ unsigned int break_lock; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++} rwlock_t; ++ ++#define __RW_LOCK_UNLOCKED(name) (rwlock_t) \ ++ { .lock = __RT_SPIN_INITIALIZER(name), \ ++ RW_DEP_MAP_INIT(name) } ++#else /* !PREEMPT_RT */ ++ ++typedef raw_rwlock_t rwlock_t; ++ ++#define __RW_LOCK_UNLOCKED _RAW_RW_LOCK_UNLOCKED ++ ++#endif ++ ++#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(rw_old_style) ++ ++ ++#define DEFINE_RWLOCK(name) \ ++ rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name) ++ ++#ifdef CONFIG_PREEMPT_RT ++ ++/* ++ * Semaphores - a spinlock plus the semaphore count: ++ */ ++struct semaphore { ++ atomic_t count; ++ struct rt_mutex lock; ++}; ++ ++#define DECLARE_MUTEX(name) \ ++struct semaphore name = \ ++ { .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) } ++ ++extern void ++__sema_init(struct semaphore *sem, int val, char *name, char *file, int line); ++ ++#define rt_sema_init(sem, val) \ ++ __sema_init(sem, val, #sem, __FILE__, __LINE__) ++ ++extern void ++__init_MUTEX(struct semaphore *sem, char *name, char *file, int line); ++#define rt_init_MUTEX(sem) \ ++ __init_MUTEX(sem, #sem, __FILE__, __LINE__) ++ ++extern void there_is_no_init_MUTEX_LOCKED_for_RT_semaphores(void); ++ ++/* ++ * No locked initialization for RT semaphores ++ */ ++#define rt_init_MUTEX_LOCKED(sem) \ ++ there_is_no_init_MUTEX_LOCKED_for_RT_semaphores() ++extern void rt_down(struct semaphore *sem); ++extern int rt_down_interruptible(struct semaphore *sem); ++extern int rt_down_timeout(struct semaphore *sem, long jiffies); ++extern int rt_down_trylock(struct semaphore *sem); ++extern void rt_up(struct semaphore *sem); ++ ++#define rt_sem_is_locked(s) rt_mutex_is_locked(&(s)->lock) ++#define rt_sema_count(s) atomic_read(&(s)->count) ++ ++extern int __bad_func_type(void); ++ ++#include ++ ++/* ++ * PICK_SEM_OP() is a small redirector to allow less typing of the lock ++ * types struct compat_semaphore, struct semaphore, at the front of the ++ * PICK_FUNCTION macro. ++ */ ++#define PICK_SEM_OP(...) PICK_FUNCTION(struct compat_semaphore *, \ ++ struct semaphore *, ##__VA_ARGS__) ++#define PICK_SEM_OP_RET(...) 
PICK_FUNCTION_RET(struct compat_semaphore *,\ ++ struct semaphore *, ##__VA_ARGS__) ++ ++#define sema_init(sem, val) \ ++ PICK_SEM_OP(compat_sema_init, rt_sema_init, sem, val) ++ ++#define init_MUTEX(sem) PICK_SEM_OP(compat_init_MUTEX, rt_init_MUTEX, sem) ++ ++#define init_MUTEX_LOCKED(sem) \ ++ PICK_SEM_OP(compat_init_MUTEX_LOCKED, rt_init_MUTEX_LOCKED, sem) ++ ++#define down(sem) PICK_SEM_OP(compat_down, rt_down, sem) ++ ++#define down_timeout(sem, jiff) \ ++ PICK_SEM_OP_RET(compat_down_timeout, rt_down_timeout, sem, jiff) ++ ++#define down_interruptible(sem) \ ++ PICK_SEM_OP_RET(compat_down_interruptible, rt_down_interruptible, sem) ++ ++#define down_trylock(sem) \ ++ PICK_SEM_OP_RET(compat_down_trylock, rt_down_trylock, sem) ++ ++#define up(sem) PICK_SEM_OP(compat_up, rt_up, sem) ++ ++/* ++ * rwsems: ++ */ ++ ++#define __RWSEM_INITIALIZER(name) \ ++ { .lock = __RT_MUTEX_INITIALIZER(name.lock), \ ++ RW_DEP_MAP_INIT(name) } ++ ++#define DECLARE_RWSEM(lockname) \ ++ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) ++ ++extern void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, ++ struct lock_class_key *key); ++ ++# define rt_init_rwsem(sem) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __rt_rwsem_init((sem), #sem, &__key); \ ++} while (0) ++ ++extern void __dont_do_this_in_rt(struct rw_semaphore *rwsem); ++ ++#define rt_down_read_non_owner(rwsem) __dont_do_this_in_rt(rwsem) ++#define rt_up_read_non_owner(rwsem) __dont_do_this_in_rt(rwsem) ++ ++extern void rt_down_write(struct rw_semaphore *rwsem); ++extern void ++rt_down_read_nested(struct rw_semaphore *rwsem, int subclass); ++extern void ++rt_down_write_nested(struct rw_semaphore *rwsem, int subclass); ++extern void rt_down_read(struct rw_semaphore *rwsem); ++extern int rt_down_write_trylock(struct rw_semaphore *rwsem); ++extern int rt_down_read_trylock(struct rw_semaphore *rwsem); ++extern void rt_up_read(struct rw_semaphore *rwsem); ++extern void rt_up_write(struct rw_semaphore *rwsem); ++extern void rt_downgrade_write(struct rw_semaphore *rwsem); ++ ++# define rt_rwsem_is_locked(rws) (rt_mutex_is_locked(&(rws)->lock)) ++ ++#define PICK_RWSEM_OP(...) PICK_FUNCTION(struct compat_rw_semaphore *, \ ++ struct rw_semaphore *, ##__VA_ARGS__) ++#define PICK_RWSEM_OP_RET(...) 
PICK_FUNCTION_RET(struct compat_rw_semaphore *,\ ++ struct rw_semaphore *, ##__VA_ARGS__) ++ ++#define init_rwsem(rwsem) PICK_RWSEM_OP(compat_init_rwsem, rt_init_rwsem, rwsem) ++ ++#define down_read(rwsem) PICK_RWSEM_OP(compat_down_read, rt_down_read, rwsem) ++ ++#define down_read_non_owner(rwsem) \ ++ PICK_RWSEM_OP(compat_down_read_non_owner, rt_down_read_non_owner, rwsem) ++ ++#define down_read_trylock(rwsem) \ ++ PICK_RWSEM_OP_RET(compat_down_read_trylock, rt_down_read_trylock, rwsem) ++ ++#define down_write(rwsem) PICK_RWSEM_OP(compat_down_write, rt_down_write, rwsem) ++ ++#define down_read_nested(rwsem, subclass) \ ++ PICK_RWSEM_OP(compat_down_read_nested, rt_down_read_nested, \ ++ rwsem, subclass) ++ ++#define down_write_nested(rwsem, subclass) \ ++ PICK_RWSEM_OP(compat_down_write_nested, rt_down_write_nested, \ ++ rwsem, subclass) ++ ++#define down_write_trylock(rwsem) \ ++ PICK_RWSEM_OP_RET(compat_down_write_trylock, rt_down_write_trylock,\ ++ rwsem) ++ ++#define up_read(rwsem) PICK_RWSEM_OP(compat_up_read, rt_up_read, rwsem) ++ ++#define up_read_non_owner(rwsem) \ ++ PICK_RWSEM_OP(compat_up_read_non_owner, rt_up_read_non_owner, rwsem) ++ ++#define up_write(rwsem) PICK_RWSEM_OP(compat_up_write, rt_up_write, rwsem) ++ ++#define downgrade_write(rwsem) \ ++ PICK_RWSEM_OP(compat_downgrade_write, rt_downgrade_write, rwsem) ++ ++#define rwsem_is_locked(rwsem) \ ++ PICK_RWSEM_OP_RET(compat_rwsem_is_locked, rt_rwsem_is_locked, rwsem) ++ ++#else ++# define preempt_rt 0 ++#endif /* CONFIG_PREEMPT_RT */ ++ ++#endif ++ +Index: linux-2.6-tip/include/linux/rtmutex.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/rtmutex.h ++++ linux-2.6-tip/include/linux/rtmutex.h +@@ -24,7 +24,7 @@ + * @owner: the mutex owner + */ + struct rt_mutex { +- spinlock_t wait_lock; ++ raw_spinlock_t wait_lock; + struct plist_head wait_list; + struct task_struct *owner; + #ifdef CONFIG_DEBUG_RT_MUTEXES +@@ -63,8 +63,8 @@ struct hrtimer_sleeper; + #endif + + #define __RT_MUTEX_INITIALIZER(mutexname) \ +- { .wait_lock = __SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ +- , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \ ++ { .wait_lock = RAW_SPIN_LOCK_UNLOCKED(mutexname) \ ++ , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, &mutexname.wait_lock) \ + , .owner = NULL \ + __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + +@@ -88,6 +88,8 @@ extern void rt_mutex_destroy(struct rt_m + extern void rt_mutex_lock(struct rt_mutex *lock); + extern int rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock); ++extern int rt_mutex_lock_killable(struct rt_mutex *lock, ++ int detect_deadlock); + extern int rt_mutex_timed_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *timeout, + int detect_deadlock); +@@ -98,7 +100,7 @@ extern void rt_mutex_unlock(struct rt_mu + + #ifdef CONFIG_RT_MUTEXES + # define INIT_RT_MUTEXES(tsk) \ +- .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \ ++ .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters, &tsk.pi_lock), \ + INIT_RT_MUTEX_DEBUG(tsk) + #else + # define INIT_RT_MUTEXES(tsk) +Index: linux-2.6-tip/include/linux/rwsem-spinlock.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/rwsem-spinlock.h ++++ linux-2.6-tip/include/linux/rwsem-spinlock.h +@@ -28,7 +28,7 @@ struct rwsem_waiter; + * - if activity is -1 then there is one active writer + * - if wait_list is not empty, then there are processes waiting for the semaphore + */ +-struct 
rw_semaphore { ++struct compat_rw_semaphore { + __s32 activity; + spinlock_t wait_lock; + struct list_head wait_list; +@@ -43,33 +43,32 @@ struct rw_semaphore { + # define __RWSEM_DEP_MAP_INIT(lockname) + #endif + +-#define __RWSEM_INITIALIZER(name) \ +-{ 0, __SPIN_LOCK_UNLOCKED(name.wait_lock), LIST_HEAD_INIT((name).wait_list) \ +- __RWSEM_DEP_MAP_INIT(name) } ++#define __COMPAT_RWSEM_INITIALIZER(name) \ ++{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) } + +-#define DECLARE_RWSEM(name) \ +- struct rw_semaphore name = __RWSEM_INITIALIZER(name) ++#define COMPAT_DECLARE_RWSEM(name) \ ++ struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name) + +-extern void __init_rwsem(struct rw_semaphore *sem, const char *name, ++extern void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, + struct lock_class_key *key); + +-#define init_rwsem(sem) \ ++#define compat_init_rwsem(sem) \ + do { \ + static struct lock_class_key __key; \ + \ +- __init_rwsem((sem), #sem, &__key); \ ++ __compat_init_rwsem((sem), #sem, &__key); \ + } while (0) + +-extern void __down_read(struct rw_semaphore *sem); +-extern int __down_read_trylock(struct rw_semaphore *sem); +-extern void __down_write(struct rw_semaphore *sem); +-extern void __down_write_nested(struct rw_semaphore *sem, int subclass); +-extern int __down_write_trylock(struct rw_semaphore *sem); +-extern void __up_read(struct rw_semaphore *sem); +-extern void __up_write(struct rw_semaphore *sem); +-extern void __downgrade_write(struct rw_semaphore *sem); ++extern void __down_read(struct compat_rw_semaphore *sem); ++extern int __down_read_trylock(struct compat_rw_semaphore *sem); ++extern void __down_write(struct compat_rw_semaphore *sem); ++extern void __down_write_nested(struct compat_rw_semaphore *sem, int subclass); ++extern int __down_write_trylock(struct compat_rw_semaphore *sem); ++extern void __up_read(struct compat_rw_semaphore *sem); ++extern void __up_write(struct compat_rw_semaphore *sem); ++extern void __downgrade_write(struct compat_rw_semaphore *sem); + +-static inline int rwsem_is_locked(struct rw_semaphore *sem) ++static inline int compat_rwsem_is_locked(struct compat_rw_semaphore *sem) + { + return (sem->activity != 0); + } +Index: linux-2.6-tip/include/linux/rwsem.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/rwsem.h ++++ linux-2.6-tip/include/linux/rwsem.h +@@ -9,53 +9,68 @@ + + #include + ++#ifdef CONFIG_PREEMPT_RT ++# include ++#endif ++ + #include + #include + #include + #include + +-struct rw_semaphore; ++#ifndef CONFIG_PREEMPT_RT ++/* ++ * On !PREEMPT_RT all rw-semaphores are compat: ++ */ ++#define compat_rw_semaphore rw_semaphore ++#endif ++ ++struct compat_rw_semaphore; + + #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +-#include /* use a generic implementation */ ++# include /* use a generic implementation */ ++# ifndef CONFIG_PREEMPT_RT ++# define __RWSEM_INITIALIZER __COMPAT_RWSEM_INITIALIZER ++# define DECLARE_RWSEM COMPAT_DECLARE_RWSEM ++# endif + #else +-#include /* use an arch-specific implementation */ ++# include /* use an arch-specific implementation */ + #endif + + /* + * lock for reading + */ +-extern void down_read(struct rw_semaphore *sem); ++extern void compat_down_read(struct compat_rw_semaphore *sem); + + /* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +-extern int down_read_trylock(struct rw_semaphore *sem); ++extern int compat_down_read_trylock(struct compat_rw_semaphore 
*sem); + + /* + * lock for writing + */ +-extern void down_write(struct rw_semaphore *sem); ++extern void compat_down_write(struct compat_rw_semaphore *sem); + + /* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +-extern int down_write_trylock(struct rw_semaphore *sem); ++extern int compat_down_write_trylock(struct compat_rw_semaphore *sem); + + /* + * release a read lock + */ +-extern void up_read(struct rw_semaphore *sem); ++extern void compat_up_read(struct compat_rw_semaphore *sem); + + /* + * release a write lock + */ +-extern void up_write(struct rw_semaphore *sem); ++extern void compat_up_write(struct compat_rw_semaphore *sem); + + /* + * downgrade write lock to read lock + */ +-extern void downgrade_write(struct rw_semaphore *sem); ++extern void compat_downgrade_write(struct compat_rw_semaphore *sem); + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + /* +@@ -71,21 +86,78 @@ extern void downgrade_write(struct rw_se + * lockdep_set_class() at lock initialization time. + * See Documentation/lockdep-design.txt for more details.) + */ +-extern void down_read_nested(struct rw_semaphore *sem, int subclass); +-extern void down_write_nested(struct rw_semaphore *sem, int subclass); ++extern void ++compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass); ++extern void ++compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass); + /* + * Take/release a lock when not the owner will release it. + * + * [ This API should be avoided as much as possible - the + * proper abstraction for this case is completions. ] + */ +-extern void down_read_non_owner(struct rw_semaphore *sem); +-extern void up_read_non_owner(struct rw_semaphore *sem); ++extern void ++compat_down_read_non_owner(struct compat_rw_semaphore *sem); ++extern void ++compat_up_read_non_owner(struct compat_rw_semaphore *sem); + #else +-# define down_read_nested(sem, subclass) down_read(sem) +-# define down_write_nested(sem, subclass) down_write(sem) +-# define down_read_non_owner(sem) down_read(sem) +-# define up_read_non_owner(sem) up_read(sem) ++# define compat_down_read_nested(sem, subclass) compat_down_read(sem) ++# define compat_down_write_nested(sem, subclass) compat_down_write(sem) ++# define compat_down_read_non_owner(sem) compat_down_read(sem) ++# define compat_up_read_non_owner(sem) compat_up_read(sem) + #endif + ++#ifndef CONFIG_PREEMPT_RT ++ ++#define DECLARE_RWSEM COMPAT_DECLARE_RWSEM ++ ++/* ++ * NOTE, lockdep: this has to be a macro, so that separate class-keys ++ * get generated by the compiler, if the same function does multiple ++ * init_rwsem() calls to different rwsems. 
++ */ ++#define init_rwsem(rwsem) compat_init_rwsem(rwsem) ++ ++static inline void down_read(struct compat_rw_semaphore *rwsem) ++{ ++ compat_down_read(rwsem); ++} ++static inline int down_read_trylock(struct compat_rw_semaphore *rwsem) ++{ ++ return compat_down_read_trylock(rwsem); ++} ++static inline void down_write(struct compat_rw_semaphore *rwsem) ++{ ++ compat_down_write(rwsem); ++} ++static inline int down_write_trylock(struct compat_rw_semaphore *rwsem) ++{ ++ return compat_down_write_trylock(rwsem); ++} ++static inline void up_read(struct compat_rw_semaphore *rwsem) ++{ ++ compat_up_read(rwsem); ++} ++static inline void up_write(struct compat_rw_semaphore *rwsem) ++{ ++ compat_up_write(rwsem); ++} ++static inline void downgrade_write(struct compat_rw_semaphore *rwsem) ++{ ++ compat_downgrade_write(rwsem); ++} ++static inline int rwsem_is_locked(struct compat_rw_semaphore *sem) ++{ ++ return compat_rwsem_is_locked(sem); ++} ++# define down_read_nested(sem, subclass) \ ++ compat_down_read_nested(sem, subclass) ++# define down_write_nested(sem, subclass) \ ++ compat_down_write_nested(sem, subclass) ++# define down_read_non_owner(sem) \ ++ compat_down_read_non_owner(sem) ++# define up_read_non_owner(sem) \ ++ compat_up_read_non_owner(sem) ++#endif /* !CONFIG_PREEMPT_RT */ ++ + #endif /* _LINUX_RWSEM_H */ +Index: linux-2.6-tip/include/linux/semaphore.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/semaphore.h ++++ linux-2.6-tip/include/linux/semaphore.h +@@ -9,41 +9,88 @@ + #ifndef __LINUX_SEMAPHORE_H + #define __LINUX_SEMAPHORE_H + +-#include +-#include ++#ifndef CONFIG_PREEMPT_RT ++# define compat_semaphore semaphore ++#endif ++ ++# include ++# include + + /* Please don't access any members of this structure directly */ +-struct semaphore { ++struct compat_semaphore { + spinlock_t lock; + unsigned int count; + struct list_head wait_list; + }; + +-#define __SEMAPHORE_INITIALIZER(name, n) \ ++#define __COMPAT_SEMAPHORE_INITIALIZER(name, n) \ + { \ + .lock = __SPIN_LOCK_UNLOCKED((name).lock), \ + .count = n, \ + .wait_list = LIST_HEAD_INIT((name).wait_list), \ + } + +-#define DECLARE_MUTEX(name) \ +- struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1) ++#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \ ++ struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name, count) + +-static inline void sema_init(struct semaphore *sem, int val) ++#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1) ++static inline void compat_sema_init(struct compat_semaphore *sem, int val) + { + static struct lock_class_key __key; +- *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val); ++ *sem = (struct compat_semaphore) __COMPAT_SEMAPHORE_INITIALIZER(*sem, val); ++ ++ spin_lock_init(&sem->lock); + lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0); + } + +-#define init_MUTEX(sem) sema_init(sem, 1) +-#define init_MUTEX_LOCKED(sem) sema_init(sem, 0) ++#define compat_init_MUTEX(sem) compat_sema_init(sem, 1) ++#define compat_init_MUTEX_LOCKED(sem) compat_sema_init(sem, 0) ++ ++extern void compat_down(struct compat_semaphore *sem); ++extern int __must_check compat_down_interruptible(struct compat_semaphore *sem); ++extern int __must_check compat_down_killable(struct compat_semaphore *sem); ++extern int __must_check compat_down_trylock(struct compat_semaphore *sem); ++extern int __must_check compat_down_timeout(struct compat_semaphore *sem, long jiffies); ++extern void compat_up(struct 
compat_semaphore *sem); ++ ++#ifdef CONFIG_PREEMPT_RT ++# include ++#else ++#define DECLARE_MUTEX COMPAT_DECLARE_MUTEX ++ ++static inline void sema_init(struct compat_semaphore *sem, int val) ++{ ++ compat_sema_init(sem, val); ++} ++static inline void init_MUTEX(struct compat_semaphore *sem) ++{ ++ compat_init_MUTEX(sem); ++} ++static inline void init_MUTEX_LOCKED(struct compat_semaphore *sem) ++{ ++ compat_init_MUTEX_LOCKED(sem); ++} ++static inline void down(struct compat_semaphore *sem) ++{ ++ compat_down(sem); ++} ++static inline int down_interruptible(struct compat_semaphore *sem) ++{ ++ return compat_down_interruptible(sem); ++} ++static inline int down_trylock(struct compat_semaphore *sem) ++{ ++ return compat_down_trylock(sem); ++} ++static inline int down_timeout(struct compat_semaphore *sem, long jiffies) ++{ ++ return compat_down_timeout(sem, jiffies); ++} + +-extern void down(struct semaphore *sem); +-extern int __must_check down_interruptible(struct semaphore *sem); +-extern int __must_check down_killable(struct semaphore *sem); +-extern int __must_check down_trylock(struct semaphore *sem); +-extern int __must_check down_timeout(struct semaphore *sem, long jiffies); +-extern void up(struct semaphore *sem); ++static inline void up(struct compat_semaphore *sem) ++{ ++ compat_up(sem); ++} ++#endif /* CONFIG_PREEMPT_RT */ + + #endif /* __LINUX_SEMAPHORE_H */ +Index: linux-2.6-tip/include/linux/seqlock.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/seqlock.h ++++ linux-2.6-tip/include/linux/seqlock.h +@@ -3,9 +3,11 @@ + /* + * Reader/writer consistent mechanism without starving writers. This type of + * lock for data where the reader wants a consistent set of information +- * and is willing to retry if the information changes. Readers never +- * block but they may have to retry if a writer is in +- * progress. Writers do not wait for readers. ++ * and is willing to retry if the information changes. Readers block ++ * on write contention (and where applicable, pi-boost the writer). ++ * Readers without contention on entry acquire the critical section ++ * without any atomic operations, but they may have to retry if a writer ++ * enters before the critical section ends. Writers do not wait for readers. + * + * This is not as cache friendly as brlock. Also, this will not work + * for data that contains pointers, because any writer could +@@ -24,56 +26,110 @@ + * + * Based on x86_64 vsyscall gettimeofday + * by Keith Owens and Andrea Arcangeli ++ * ++ * Priority inheritance and live-lock avoidance by Gregory Haskins + */ + ++#include + #include + #include + + typedef struct { + unsigned sequence; +- spinlock_t lock; +-} seqlock_t; ++ rwlock_t lock; ++} __seqlock_t; ++ ++typedef struct { ++ unsigned sequence; ++ raw_spinlock_t lock; ++} __raw_seqlock_t; ++ ++#define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock) ++ ++#ifdef CONFIG_PREEMPT_RT ++typedef __seqlock_t seqlock_t; ++#else ++typedef __raw_seqlock_t seqlock_t; ++#endif ++ ++typedef __raw_seqlock_t raw_seqlock_t; + + /* + * These macros triggered gcc-3.x compile-time problems. We think these are + * OK now. Be cautious. 
+ */ +-#define __SEQLOCK_UNLOCKED(lockname) \ +- { 0, __SPIN_LOCK_UNLOCKED(lockname) } ++#define __RAW_SEQLOCK_UNLOCKED(lockname) \ ++ { 0, RAW_SPIN_LOCK_UNLOCKED(lockname) } ++ ++#ifdef CONFIG_PREEMPT_RT ++# define __SEQLOCK_UNLOCKED(lockname) { 0, __RW_LOCK_UNLOCKED(lockname) } ++#else ++# define __SEQLOCK_UNLOCKED(lockname) __RAW_SEQLOCK_UNLOCKED(lockname) ++#endif + + #define SEQLOCK_UNLOCKED \ + __SEQLOCK_UNLOCKED(old_style_seqlock_init) + +-#define seqlock_init(x) \ +- do { \ +- (x)->sequence = 0; \ +- spin_lock_init(&(x)->lock); \ +- } while (0) ++static inline void __raw_seqlock_init(raw_seqlock_t *seqlock) ++{ ++ *seqlock = (raw_seqlock_t) __RAW_SEQLOCK_UNLOCKED(x); ++ spin_lock_init(&seqlock->lock); ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++static inline void __seqlock_init(seqlock_t *seqlock) ++{ ++ *seqlock = (seqlock_t) __SEQLOCK_UNLOCKED(seqlock); ++ rwlock_init(&seqlock->lock); ++} ++#else ++extern void __seqlock_init(seqlock_t *seqlock); ++#endif ++ ++#define seqlock_init(seq) \ ++ PICK_FUNCTION(raw_seqlock_t *, seqlock_t *, \ ++ __raw_seqlock_init, __seqlock_init, seq); + + #define DEFINE_SEQLOCK(x) \ + seqlock_t x = __SEQLOCK_UNLOCKED(x) + ++#define DEFINE_RAW_SEQLOCK(name) \ ++ raw_seqlock_t name __cacheline_aligned_in_smp = \ ++ __RAW_SEQLOCK_UNLOCKED(name) ++ ++ + /* Lock out other writers and update the count. + * Acts like a normal spin_lock/unlock. + * Don't need preempt_disable() because that is in the spin_lock already. + */ +-static inline void write_seqlock(seqlock_t *sl) ++static inline void __write_seqlock(seqlock_t *sl) + { +- spin_lock(&sl->lock); ++ write_lock(&sl->lock); + ++sl->sequence; + smp_wmb(); + } + +-static inline void write_sequnlock(seqlock_t *sl) ++static __always_inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) ++{ ++ unsigned long flags; ++ ++ local_save_flags(flags); ++ __write_seqlock(sl); ++ return flags; ++} ++ ++static inline void __write_sequnlock(seqlock_t *sl) + { + smp_wmb(); + sl->sequence++; +- spin_unlock(&sl->lock); ++ write_unlock(&sl->lock); + } + +-static inline int write_tryseqlock(seqlock_t *sl) ++#define __write_sequnlock_irqrestore(sl, flags) __write_sequnlock(sl) ++ ++static inline int __write_tryseqlock(seqlock_t *sl) + { +- int ret = spin_trylock(&sl->lock); ++ int ret = write_trylock(&sl->lock); + + if (ret) { + ++sl->sequence; +@@ -83,18 +139,25 @@ static inline int write_tryseqlock(seqlo + } + + /* Start of read calculation -- fetch last complete writer token */ +-static __always_inline unsigned read_seqbegin(const seqlock_t *sl) ++static __always_inline unsigned __read_seqbegin(seqlock_t *sl) + { + unsigned ret; + +-repeat: + ret = sl->sequence; + smp_rmb(); + if (unlikely(ret & 1)) { +- cpu_relax(); +- goto repeat; ++ /* ++ * Serialze with the writer which will ensure they are ++ * pi-boosted if necessary and prevent us from starving ++ * them. ++ */ ++ read_lock(&sl->lock); ++ ret = sl->sequence; ++ read_unlock(&sl->lock); + } + ++ BUG_ON(ret & 1); ++ + return ret; + } + +@@ -103,13 +166,192 @@ repeat: + * + * If sequence value changed then writer changed data while in section. 
+ */ +-static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start) ++static inline int __read_seqretry(seqlock_t *sl, unsigned iv) ++{ ++ smp_rmb(); ++ return (sl->sequence != iv); ++} ++ ++static __always_inline void __write_seqlock_raw(raw_seqlock_t *sl) ++{ ++ spin_lock(&sl->lock); ++ ++sl->sequence; ++ smp_wmb(); ++} ++ ++static __always_inline unsigned long ++__write_seqlock_irqsave_raw(raw_seqlock_t *sl) ++{ ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ __write_seqlock_raw(sl); ++ return flags; ++} ++ ++static __always_inline void __write_seqlock_irq_raw(raw_seqlock_t *sl) ++{ ++ local_irq_disable(); ++ __write_seqlock_raw(sl); ++} ++ ++static __always_inline void __write_seqlock_bh_raw(raw_seqlock_t *sl) ++{ ++ local_bh_disable(); ++ __write_seqlock_raw(sl); ++} ++ ++static __always_inline void __write_sequnlock_raw(raw_seqlock_t *sl) ++{ ++ smp_wmb(); ++ sl->sequence++; ++ spin_unlock(&sl->lock); ++} ++ ++static __always_inline void ++__write_sequnlock_irqrestore_raw(raw_seqlock_t *sl, unsigned long flags) ++{ ++ __write_sequnlock_raw(sl); ++ local_irq_restore(flags); ++ preempt_check_resched(); ++} ++ ++static __always_inline void __write_sequnlock_irq_raw(raw_seqlock_t *sl) ++{ ++ __write_sequnlock_raw(sl); ++ local_irq_enable(); ++ preempt_check_resched(); ++} ++ ++static __always_inline void __write_sequnlock_bh_raw(raw_seqlock_t *sl) ++{ ++ __write_sequnlock_raw(sl); ++ local_bh_enable(); ++} ++ ++static __always_inline int __write_tryseqlock_raw(raw_seqlock_t *sl) ++{ ++ int ret = spin_trylock(&sl->lock); ++ ++ if (ret) { ++ ++sl->sequence; ++ smp_wmb(); ++ } ++ return ret; ++} ++ ++static __always_inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl) ++{ ++ unsigned ret; ++ ++repeat: ++ ret = sl->sequence; ++ smp_rmb(); ++ if (unlikely(ret & 1)) { ++ cpu_relax(); ++ goto repeat; ++ } ++ ++ return ret; ++} ++ ++static __always_inline int __read_seqretry_raw(const raw_seqlock_t *sl, unsigned start) + { + smp_rmb(); + + return (sl->sequence != start); + } + ++extern int __bad_seqlock_type(void); ++ ++/* ++ * PICK_SEQ_OP() is a small redirector to allow less typing of the lock ++ * types raw_seqlock_t, seqlock_t, at the front of the PICK_FUNCTION ++ * macro. ++ */ ++#define PICK_SEQ_OP(...) \ ++ PICK_FUNCTION(raw_seqlock_t *, seqlock_t *, ##__VA_ARGS__) ++#define PICK_SEQ_OP_RET(...) 
\ ++ PICK_FUNCTION_RET(raw_seqlock_t *, seqlock_t *, ##__VA_ARGS__) ++ ++#define write_seqlock(sl) PICK_SEQ_OP(__write_seqlock_raw, __write_seqlock, sl) ++ ++#define write_sequnlock(sl) \ ++ PICK_SEQ_OP(__write_sequnlock_raw, __write_sequnlock, sl) ++ ++#define write_tryseqlock(sl) \ ++ PICK_SEQ_OP_RET(__write_tryseqlock_raw, __write_tryseqlock, sl) ++ ++#define read_seqbegin(sl) \ ++ PICK_SEQ_OP_RET(__read_seqbegin_raw, __read_seqbegin, sl) ++ ++#define read_seqretry(sl, iv) \ ++ PICK_SEQ_OP_RET(__read_seqretry_raw, __read_seqretry, sl, iv) ++ ++#define write_seqlock_irqsave(lock, flags) \ ++do { \ ++ flags = PICK_SEQ_OP_RET(__write_seqlock_irqsave_raw, \ ++ __write_seqlock_irqsave, lock); \ ++} while (0) ++ ++#define write_seqlock_irq(lock) \ ++ PICK_SEQ_OP(__write_seqlock_irq_raw, __write_seqlock, lock) ++ ++#define write_seqlock_bh(lock) \ ++ PICK_SEQ_OP(__write_seqlock_bh_raw, __write_seqlock, lock) ++ ++#define write_sequnlock_irqrestore(lock, flags) \ ++ PICK_SEQ_OP(__write_sequnlock_irqrestore_raw, \ ++ __write_sequnlock_irqrestore, lock, flags) ++ ++#define write_sequnlock_bh(lock) \ ++ PICK_SEQ_OP(__write_sequnlock_bh_raw, __write_sequnlock, lock) ++ ++#define write_sequnlock_irq(lock) \ ++ PICK_SEQ_OP(__write_sequnlock_irq_raw, __write_sequnlock, lock) ++ ++static __always_inline ++unsigned long __seq_irqsave_raw(raw_seqlock_t *sl) ++{ ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ return flags; ++} ++ ++static __always_inline unsigned long __seq_irqsave(seqlock_t *sl) ++{ ++ unsigned long flags; ++ ++ local_save_flags(flags); ++ return flags; ++} ++ ++#define read_seqbegin_irqsave(lock, flags) \ ++({ \ ++ flags = PICK_SEQ_OP_RET(__seq_irqsave_raw, __seq_irqsave, lock);\ ++ read_seqbegin(lock); \ ++}) ++ ++static __always_inline int ++__read_seqretry_irqrestore(seqlock_t *sl, unsigned iv, unsigned long flags) ++{ ++ return __read_seqretry(sl, iv); ++} ++ ++static __always_inline int ++__read_seqretry_irqrestore_raw(raw_seqlock_t *sl, unsigned iv, ++ unsigned long flags) ++{ ++ int ret = read_seqretry(sl, iv); ++ local_irq_restore(flags); ++ preempt_check_resched(); ++ return ret; ++} ++ ++#define read_seqretry_irqrestore(lock, iv, flags) \ ++ PICK_SEQ_OP_RET(__read_seqretry_irqrestore_raw, \ ++ __read_seqretry_irqrestore, lock, iv, flags) + + /* + * Version using sequence counter only. +@@ -166,32 +408,4 @@ static inline void write_seqcount_end(se + smp_wmb(); + s->sequence++; + } +- +-/* +- * Possible sw/hw IRQ protected versions of the interfaces. 
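The PICK_SEQ_OP() wrappers above keep one calling convention for both seqlock flavours: raw_seqlock_t keeps the classic busy-retry reader, while a PREEMPT_RT seqlock_t reader that observes an odd count briefly takes the rwlock so the writer gets priority-boosted instead of being starved. A minimal usage sketch under those assumptions (the lock and data names below are illustrative only, not part of this patch):

        static DEFINE_RAW_SEQLOCK(sample_lock);         /* hypothetical lock  */
        static unsigned long sample_a, sample_b;        /* data it protects   */

        static void sample_update(unsigned long a, unsigned long b)
        {
                write_seqlock(&sample_lock);            /* ++sequence: now odd    */
                sample_a = a;
                sample_b = b;
                write_sequnlock(&sample_lock);          /* sequence++: even again */
        }

        static unsigned long sample_read(void)
        {
                unsigned long a, b;
                unsigned seq;

                do {
                        seq = read_seqbegin(&sample_lock);
                        a = sample_a;
                        b = sample_b;
                } while (read_seqretry(&sample_lock, seq));  /* retry if a writer hit us */

                return a + b;
        }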
+- */ +-#define write_seqlock_irqsave(lock, flags) \ +- do { local_irq_save(flags); write_seqlock(lock); } while (0) +-#define write_seqlock_irq(lock) \ +- do { local_irq_disable(); write_seqlock(lock); } while (0) +-#define write_seqlock_bh(lock) \ +- do { local_bh_disable(); write_seqlock(lock); } while (0) +- +-#define write_sequnlock_irqrestore(lock, flags) \ +- do { write_sequnlock(lock); local_irq_restore(flags); } while(0) +-#define write_sequnlock_irq(lock) \ +- do { write_sequnlock(lock); local_irq_enable(); } while(0) +-#define write_sequnlock_bh(lock) \ +- do { write_sequnlock(lock); local_bh_enable(); } while(0) +- +-#define read_seqbegin_irqsave(lock, flags) \ +- ({ local_irq_save(flags); read_seqbegin(lock); }) +- +-#define read_seqretry_irqrestore(lock, iv, flags) \ +- ({ \ +- int ret = read_seqretry(lock, iv); \ +- local_irq_restore(flags); \ +- ret; \ +- }) +- + #endif /* __LINUX_SEQLOCK_H */ +Index: linux-2.6-tip/include/linux/spinlock.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/spinlock.h ++++ linux-2.6-tip/include/linux/spinlock.h +@@ -44,6 +44,42 @@ + * builds the _spin_*() APIs. + * + * linux/spinlock.h: builds the final spin_*() APIs. ++ * ++ * ++ * Public types and naming conventions: ++ * ------------------------------------ ++ * spinlock_t: type: sleep-lock ++ * raw_spinlock_t: type: spin-lock (debug) ++ * ++ * spin_lock([raw_]spinlock_t): API: acquire lock, both types ++ * ++ * ++ * Internal types and naming conventions: ++ * ------------------------------------- ++ * __raw_spinlock_t: type: lowlevel spin-lock ++ * ++ * _spin_lock(struct rt_mutex): API: acquire sleep-lock ++ * __spin_lock(raw_spinlock_t): API: acquire spin-lock (highlevel) ++ * _raw_spin_lock(raw_spinlock_t): API: acquire spin-lock (debug) ++ * __raw_spin_lock(__raw_spinlock_t): API: acquire spin-lock (lowlevel) ++ * ++ * ++ * spin_lock(raw_spinlock_t) translates into the following chain of ++ * calls/inlines/macros, if spin-lock debugging is enabled: ++ * ++ * spin_lock() [include/linux/spinlock.h] ++ * -> __spin_lock() [kernel/spinlock.c] ++ * -> _raw_spin_lock() [lib/spinlock_debug.c] ++ * -> __raw_spin_lock() [include/asm/spinlock.h] ++ * ++ * spin_lock(spinlock_t) translates into the following chain of ++ * calls/inlines/macros: ++ * ++ * spin_lock() [include/linux/spinlock.h] ++ * -> _spin_lock() [include/linux/spinlock.h] ++ * -> rt_spin_lock() [kernel/rtmutex.c] ++ * -> rt_spin_lock_fastlock() [kernel/rtmutex.c] ++ * -> rt_spin_lock_slowlock() [kernel/rtmutex.c] + */ + + #include +@@ -52,29 +88,15 @@ + #include + #include + #include ++#include + #include + #include ++#include ++#include + + #include + + /* +- * Must define these before including other files, inline functions need them +- */ +-#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME +- +-#define LOCK_SECTION_START(extra) \ +- ".subsection 1\n\t" \ +- extra \ +- ".ifndef " LOCK_SECTION_NAME "\n\t" \ +- LOCK_SECTION_NAME ":\n\t" \ +- ".endif\n" +- +-#define LOCK_SECTION_END \ +- ".previous\n\t" +- +-#define __lockfunc __attribute__((section(".spinlock.text"))) +- +-/* + * Pull the raw_spinlock_t and raw_rwlock_t definitions: + */ + #include +@@ -90,36 +112,10 @@ extern int __lockfunc generic__raw_read_ + # include + #endif + +-#ifdef CONFIG_DEBUG_SPINLOCK +- extern void __spin_lock_init(spinlock_t *lock, const char *name, +- struct lock_class_key *key); +-# define spin_lock_init(lock) \ +-do { \ +- static struct lock_class_key __key; \ +- \ +- 
__spin_lock_init((lock), #lock, &__key); \ +-} while (0) +- +-#else +-# define spin_lock_init(lock) \ +- do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0) +-#endif +- +-#ifdef CONFIG_DEBUG_SPINLOCK +- extern void __rwlock_init(rwlock_t *lock, const char *name, +- struct lock_class_key *key); +-# define rwlock_init(lock) \ +-do { \ +- static struct lock_class_key __key; \ +- \ +- __rwlock_init((lock), #lock, &__key); \ +-} while (0) +-#else +-# define rwlock_init(lock) \ +- do { *(lock) = RW_LOCK_UNLOCKED; } while (0) +-#endif +- +-#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) ++/* ++ * Pull the RT types: ++ */ ++#include + + #ifdef CONFIG_GENERIC_LOCKBREAK + #define spin_is_contended(lock) ((lock)->break_lock) +@@ -132,12 +128,6 @@ do { \ + #endif /*__raw_spin_is_contended*/ + #endif + +-/** +- * spin_unlock_wait - wait until the spinlock gets unlocked +- * @lock: the spinlock in question. +- */ +-#define spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) +- + /* + * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: + */ +@@ -148,16 +138,16 @@ do { \ + #endif + + #ifdef CONFIG_DEBUG_SPINLOCK +- extern void _raw_spin_lock(spinlock_t *lock); +-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) +- extern int _raw_spin_trylock(spinlock_t *lock); +- extern void _raw_spin_unlock(spinlock_t *lock); +- extern void _raw_read_lock(rwlock_t *lock); +- extern int _raw_read_trylock(rwlock_t *lock); +- extern void _raw_read_unlock(rwlock_t *lock); +- extern void _raw_write_lock(rwlock_t *lock); +- extern int _raw_write_trylock(rwlock_t *lock); +- extern void _raw_write_unlock(rwlock_t *lock); ++ extern __lockfunc void _raw_spin_lock(raw_spinlock_t *lock); ++# define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock) ++ extern __lockfunc int _raw_spin_trylock(raw_spinlock_t *lock); ++ extern __lockfunc void _raw_spin_unlock(raw_spinlock_t *lock); ++ extern __lockfunc void _raw_read_lock(raw_rwlock_t *lock); ++ extern __lockfunc int _raw_read_trylock(raw_rwlock_t *lock); ++ extern __lockfunc void _raw_read_unlock(raw_rwlock_t *lock); ++ extern __lockfunc void _raw_write_lock(raw_rwlock_t *lock); ++ extern __lockfunc int _raw_write_trylock(raw_rwlock_t *lock); ++ extern __lockfunc void _raw_write_unlock(raw_rwlock_t *lock); + #else + # define _raw_spin_lock(lock) __raw_spin_lock(&(lock)->raw_lock) + # define _raw_spin_lock_flags(lock, flags) \ +@@ -172,179 +162,440 @@ do { \ + # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) + #endif + +-#define read_can_lock(rwlock) __raw_read_can_lock(&(rwlock)->raw_lock) +-#define write_can_lock(rwlock) __raw_write_can_lock(&(rwlock)->raw_lock) ++extern int __bad_spinlock_type(void); ++extern int __bad_rwlock_type(void); ++ ++extern void ++__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key); ++ ++extern void __lockfunc rt_spin_lock(spinlock_t *lock); ++extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); ++extern void __lockfunc rt_spin_unlock(spinlock_t *lock); ++extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); ++extern int __lockfunc ++rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); ++extern int __lockfunc rt_spin_trylock(spinlock_t *lock); ++extern int _atomic_dec_and_spin_lock(spinlock_t *lock, atomic_t *atomic); ++ ++/* ++ * lockdep-less calls, for derived types like rwlock: ++ * (for trylock they can use rt_mutex_trylock() directly. 
++ */ ++extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); ++extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); ++ ++#ifdef CONFIG_PREEMPT_RT ++# define _spin_lock(l) rt_spin_lock(l) ++# define _spin_lock_nested(l, s) rt_spin_lock_nested(l, s) ++# define _spin_lock_bh(l) rt_spin_lock(l) ++# define _spin_lock_irq(l) rt_spin_lock(l) ++# define _spin_unlock(l) rt_spin_unlock(l) ++# define _spin_unlock_no_resched(l) rt_spin_unlock(l) ++# define _spin_unlock_bh(l) rt_spin_unlock(l) ++# define _spin_unlock_irq(l) rt_spin_unlock(l) ++# define _spin_unlock_irqrestore(l, f) rt_spin_unlock(l) ++static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) ++{ ++ rt_spin_lock(lock); ++ return 0; ++} ++static inline unsigned long __lockfunc ++_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) ++{ ++ rt_spin_lock_nested(lock, subclass); ++ return 0; ++} ++#else ++static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) ++{ ++ return 0; ++} ++static inline unsigned long __lockfunc ++_spin_lock_irqsave_nested(spinlock_t *lock, int subclass) ++{ ++ return 0; ++} ++# define _spin_lock(l) do { } while (0) ++# define _spin_lock_nested(l, s) do { } while (0) ++# define _spin_lock_bh(l) do { } while (0) ++# define _spin_lock_irq(l) do { } while (0) ++# define _spin_unlock(l) do { } while (0) ++# define _spin_unlock_no_resched(l) do { } while (0) ++# define _spin_unlock_bh(l) do { } while (0) ++# define _spin_unlock_irq(l) do { } while (0) ++# define _spin_unlock_irqrestore(l, f) do { } while (0) ++#endif ++ ++#define _spin_lock_init(sl, n, f, l) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __rt_spin_lock_init(sl, n, &__key); \ ++} while (0) ++ ++# ifdef CONFIG_PREEMPT_RT ++# define _spin_can_lock(l) (!rt_mutex_is_locked(&(l)->lock)) ++# define _spin_is_locked(l) rt_mutex_is_locked(&(l)->lock) ++# define _spin_unlock_wait(l) rt_spin_unlock_wait(l) ++ ++# define _spin_trylock(l) rt_spin_trylock(l) ++# define _spin_trylock_bh(l) rt_spin_trylock(l) ++# define _spin_trylock_irq(l) rt_spin_trylock(l) ++# define _spin_trylock_irqsave(l,f) rt_spin_trylock_irqsave(l, f) ++# else ++ ++ extern int this_should_never_be_called_on_non_rt(spinlock_t *lock); ++# define TSNBCONRT(l) this_should_never_be_called_on_non_rt(l) ++# define _spin_can_lock(l) TSNBCONRT(l) ++# define _spin_is_locked(l) TSNBCONRT(l) ++# define _spin_unlock_wait(l) TSNBCONRT(l) ++ ++# define _spin_trylock(l) TSNBCONRT(l) ++# define _spin_trylock_bh(l) TSNBCONRT(l) ++# define _spin_trylock_irq(l) TSNBCONRT(l) ++# define _spin_trylock_irqsave(l,f) TSNBCONRT(l) ++#endif ++ ++extern void __lockfunc rt_write_lock(rwlock_t *rwlock); ++extern void __lockfunc rt_read_lock(rwlock_t *rwlock); ++extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); ++extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, ++ unsigned long *flags); ++extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); ++extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); ++extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); ++extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock); ++extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock); ++extern void ++__rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); ++ ++#define _rwlock_init(rwl, n, f, l) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __rt_rwlock_init(rwl, n, &__key); \ ++} while (0) ++ ++#ifdef CONFIG_PREEMPT_RT ++# define rt_read_can_lock(rwl) 
(!rt_mutex_is_locked(&(rwl)->lock)) ++# define rt_write_can_lock(rwl) (!rt_mutex_is_locked(&(rwl)->lock)) ++#else ++ extern int rt_rwlock_can_lock_never_call_on_non_rt(rwlock_t *rwlock); ++# define rt_read_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) ++# define rt_write_can_lock(rwl) rt_rwlock_can_lock_never_call_on_non_rt(rwl) ++#endif ++ ++# define _read_can_lock(rwl) rt_read_can_lock(rwl) ++# define _write_can_lock(rwl) rt_write_can_lock(rwl) ++ ++# define _read_trylock(rwl) rt_read_trylock(rwl) ++# define _write_trylock(rwl) rt_write_trylock(rwl) ++# define _write_trylock_irqsave(rwl, flags) \ ++ rt_write_trylock_irqsave(rwl, flags) ++ ++# define _read_lock(rwl) rt_read_lock(rwl) ++# define _write_lock(rwl) rt_write_lock(rwl) ++# define _read_unlock(rwl) rt_read_unlock(rwl) ++# define _write_unlock(rwl) rt_write_unlock(rwl) ++ ++# define _read_lock_bh(rwl) rt_read_lock(rwl) ++# define _write_lock_bh(rwl) rt_write_lock(rwl) ++# define _read_unlock_bh(rwl) rt_read_unlock(rwl) ++# define _write_unlock_bh(rwl) rt_write_unlock(rwl) ++ ++# define _read_lock_irq(rwl) rt_read_lock(rwl) ++# define _write_lock_irq(rwl) rt_write_lock(rwl) ++# define _read_unlock_irq(rwl) rt_read_unlock(rwl) ++# define _write_unlock_irq(rwl) rt_write_unlock(rwl) ++ ++# define _read_lock_irqsave(rwl) rt_read_lock_irqsave(rwl) ++# define _write_lock_irqsave(rwl) rt_write_lock_irqsave(rwl) ++ ++# define _read_unlock_irqrestore(rwl, f) rt_read_unlock(rwl) ++# define _write_unlock_irqrestore(rwl, f) rt_write_unlock(rwl) ++ ++#ifdef CONFIG_DEBUG_SPINLOCK ++ extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, ++ struct lock_class_key *key); ++# define _raw_spin_lock_init(lock, name, file, line) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __raw_spin_lock_init((lock), #lock, &__key); \ ++} while (0) ++ ++#else ++#define __raw_spin_lock_init(lock) \ ++ do { *(lock) = RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) ++# define _raw_spin_lock_init(lock, name, file, line) __raw_spin_lock_init(lock) ++#endif ++ ++/* ++ * PICK_SPIN_OP()/PICK_RW_OP() are simple redirectors for PICK_FUNCTION ++ */ ++#define PICK_SPIN_OP(...) \ ++ PICK_FUNCTION(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__) ++#define PICK_SPIN_OP_RET(...) \ ++ PICK_FUNCTION_RET(raw_spinlock_t *, spinlock_t *, ##__VA_ARGS__) ++#define PICK_RW_OP(...) PICK_FUNCTION(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__) ++#define PICK_RW_OP_RET(...) 
\ ++ PICK_FUNCTION_RET(raw_rwlock_t *, rwlock_t *, ##__VA_ARGS__) ++ ++#define spin_lock_init(lock) \ ++ PICK_SPIN_OP(_raw_spin_lock_init, _spin_lock_init, lock, #lock, \ ++ __FILE__, __LINE__) ++ ++#ifdef CONFIG_DEBUG_SPINLOCK ++ extern void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, ++ struct lock_class_key *key); ++# define _raw_rwlock_init(lock, name, file, line) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __raw_rwlock_init((lock), #lock, &__key); \ ++} while (0) ++#else ++#define __raw_rwlock_init(lock) \ ++ do { *(lock) = RAW_RW_LOCK_UNLOCKED(lock); } while (0) ++# define _raw_rwlock_init(lock, name, file, line) __raw_rwlock_init(lock) ++#endif ++ ++#define rwlock_init(lock) \ ++ PICK_RW_OP(_raw_rwlock_init, _rwlock_init, lock, #lock, \ ++ __FILE__, __LINE__) ++ ++#define __spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock) ++ ++#define spin_is_locked(lock) \ ++ PICK_SPIN_OP_RET(__spin_is_locked, _spin_is_locked, lock) ++ ++#define __spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock) ++ ++#define spin_unlock_wait(lock) \ ++ PICK_SPIN_OP(__spin_unlock_wait, _spin_unlock_wait, lock) + + /* + * Define the various spin_lock and rw_lock methods. Note we define these + * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various + * methods are defined as nops in the case they are not required. + */ +-#define spin_trylock(lock) __cond_lock(lock, _spin_trylock(lock)) +-#define read_trylock(lock) __cond_lock(lock, _read_trylock(lock)) +-#define write_trylock(lock) __cond_lock(lock, _write_trylock(lock)) ++#define spin_trylock(lock) \ ++ __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock, _spin_trylock, lock)) ++ ++#define read_trylock(lock) \ ++ __cond_lock(lock, PICK_RW_OP_RET(__read_trylock, _read_trylock, lock)) + +-#define spin_lock(lock) _spin_lock(lock) ++#define write_trylock(lock) \ ++ __cond_lock(lock, PICK_RW_OP_RET(__write_trylock, _write_trylock, lock)) ++ ++#define write_trylock_irqsave(lock, flags) \ ++ __cond_lock(lock, PICK_RW_OP_RET(__write_trylock_irqsave, \ ++ _write_trylock_irqsave, lock, &flags)) ++ ++#define __spin_can_lock(lock) __raw_spin_can_lock(&(lock)->raw_lock) ++#define __read_can_lock(lock) __raw_read_can_lock(&(lock)->raw_lock) ++#define __write_can_lock(lock) __raw_write_can_lock(&(lock)->raw_lock) ++ ++#define read_can_lock(lock) \ ++ __cond_lock(lock, PICK_RW_OP_RET(__read_can_lock, _read_can_lock, lock)) ++ ++#define write_can_lock(lock) \ ++ __cond_lock(lock, PICK_RW_OP_RET(__write_can_lock, _write_can_lock,\ ++ lock)) ++ ++#define spin_lock(lock) PICK_SPIN_OP(__spin_lock, _spin_lock, lock) + + #ifdef CONFIG_DEBUG_LOCK_ALLOC +-# define spin_lock_nested(lock, subclass) _spin_lock_nested(lock, subclass) +-# define spin_lock_nest_lock(lock, nest_lock) \ +- do { \ +- typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\ +- _spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ +- } while (0) ++# define spin_lock_nested(lock, subclass) \ ++ PICK_SPIN_OP(__spin_lock_nested, _spin_lock_nested, lock, subclass) + #else +-# define spin_lock_nested(lock, subclass) _spin_lock(lock) +-# define spin_lock_nest_lock(lock, nest_lock) _spin_lock(lock) ++# define spin_lock_nested(lock, subclass) spin_lock(lock) + #endif + +-#define write_lock(lock) _write_lock(lock) +-#define read_lock(lock) _read_lock(lock) ++#define write_lock(lock) PICK_RW_OP(__write_lock, _write_lock, lock) + +-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) ++#define read_lock(lock) PICK_RW_OP(__read_lock, _read_lock, lock) + 
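With spin_lock(), the rwlock operations and the *_init() macros all funnelled through PICK_SPIN_OP()/PICK_RW_OP(), the same source line compiles against either lock flavour and the backend is chosen from the pointer type. A rough caller-side sketch (lock names are invented for illustration; DEFINE_RAW_SPINLOCK comes from the spinlock_types.h changes further down):

        static DEFINE_RAW_SPINLOCK(hw_fifo_lock);  /* true spinning lock              */
        static spinlock_t stats_lock;              /* rt-mutex based under PREEMPT_RT */

        static void locks_example_init(void)
        {
                spin_lock_init(&stats_lock);  /* picks _spin_lock_init() via PICK_SPIN_OP() */
        }

        static void locks_example_use(void)
        {
                unsigned long flags;

                /* raw lock: interrupts really are disabled across the section */
                spin_lock_irqsave(&hw_fifo_lock, flags);
                spin_unlock_irqrestore(&hw_fifo_lock, flags);

                /*
                 * sleeping "spinlock": on PREEMPT_RT this ends up in
                 * rt_spin_lock()/rt_spin_unlock(); the irqsave variant does not
                 * disable interrupts and simply hands back flags == 0.
                 */
                spin_lock_irqsave(&stats_lock, flags);
                spin_unlock_irqrestore(&stats_lock, flags);
        }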
+-#define spin_lock_irqsave(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- flags = _spin_lock_irqsave(lock); \ +- } while (0) +-#define read_lock_irqsave(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- flags = _read_lock_irqsave(lock); \ +- } while (0) +-#define write_lock_irqsave(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- flags = _write_lock_irqsave(lock); \ +- } while (0) ++# define spin_lock_irqsave(lock, flags) \ ++do { \ ++ BUILD_CHECK_IRQ_FLAGS(flags); \ ++ flags = PICK_SPIN_OP_RET(__spin_lock_irqsave, _spin_lock_irqsave, \ ++ lock); \ ++} while (0) + + #ifdef CONFIG_DEBUG_LOCK_ALLOC +-#define spin_lock_irqsave_nested(lock, flags, subclass) \ +- do { \ +- typecheck(unsigned long, flags); \ +- flags = _spin_lock_irqsave_nested(lock, subclass); \ +- } while (0) +-#else +-#define spin_lock_irqsave_nested(lock, flags, subclass) \ +- do { \ +- typecheck(unsigned long, flags); \ +- flags = _spin_lock_irqsave(lock); \ +- } while (0) +-#endif +- +-#else +- +-#define spin_lock_irqsave(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- _spin_lock_irqsave(lock, flags); \ +- } while (0) +-#define read_lock_irqsave(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- _read_lock_irqsave(lock, flags); \ +- } while (0) +-#define write_lock_irqsave(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- _write_lock_irqsave(lock, flags); \ +- } while (0) +-#define spin_lock_irqsave_nested(lock, flags, subclass) \ +- spin_lock_irqsave(lock, flags) +- +-#endif +- +-#define spin_lock_irq(lock) _spin_lock_irq(lock) +-#define spin_lock_bh(lock) _spin_lock_bh(lock) +- +-#define read_lock_irq(lock) _read_lock_irq(lock) +-#define read_lock_bh(lock) _read_lock_bh(lock) +- +-#define write_lock_irq(lock) _write_lock_irq(lock) +-#define write_lock_bh(lock) _write_lock_bh(lock) +- +-/* +- * We inline the unlock functions in the nondebug case: +- */ +-#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) || \ +- !defined(CONFIG_SMP) +-# define spin_unlock(lock) _spin_unlock(lock) +-# define read_unlock(lock) _read_unlock(lock) +-# define write_unlock(lock) _write_unlock(lock) +-# define spin_unlock_irq(lock) _spin_unlock_irq(lock) +-# define read_unlock_irq(lock) _read_unlock_irq(lock) +-# define write_unlock_irq(lock) _write_unlock_irq(lock) +-#else +-# define spin_unlock(lock) \ +- do {__raw_spin_unlock(&(lock)->raw_lock); __release(lock); } while (0) +-# define read_unlock(lock) \ +- do {__raw_read_unlock(&(lock)->raw_lock); __release(lock); } while (0) +-# define write_unlock(lock) \ +- do {__raw_write_unlock(&(lock)->raw_lock); __release(lock); } while (0) +-# define spin_unlock_irq(lock) \ +-do { \ +- __raw_spin_unlock(&(lock)->raw_lock); \ +- __release(lock); \ +- local_irq_enable(); \ +-} while (0) +-# define read_unlock_irq(lock) \ +-do { \ +- __raw_read_unlock(&(lock)->raw_lock); \ +- __release(lock); \ +- local_irq_enable(); \ +-} while (0) +-# define write_unlock_irq(lock) \ +-do { \ +- __raw_write_unlock(&(lock)->raw_lock); \ +- __release(lock); \ +- local_irq_enable(); \ +-} while (0) +-#endif +- +-#define spin_unlock_irqrestore(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- _spin_unlock_irqrestore(lock, flags); \ +- } while (0) +-#define spin_unlock_bh(lock) _spin_unlock_bh(lock) +- +-#define read_unlock_irqrestore(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- _read_unlock_irqrestore(lock, flags); \ +- } while (0) +-#define read_unlock_bh(lock) 
_read_unlock_bh(lock) +- +-#define write_unlock_irqrestore(lock, flags) \ +- do { \ +- typecheck(unsigned long, flags); \ +- _write_unlock_irqrestore(lock, flags); \ +- } while (0) +-#define write_unlock_bh(lock) _write_unlock_bh(lock) +- +-#define spin_trylock_bh(lock) __cond_lock(lock, _spin_trylock_bh(lock)) +- +-#define spin_trylock_irq(lock) \ +-({ \ +- local_irq_disable(); \ +- spin_trylock(lock) ? \ +- 1 : ({ local_irq_enable(); 0; }); \ +-}) ++# define spin_lock_irqsave_nested(lock, flags, subclass) \ ++do { \ ++ BUILD_CHECK_IRQ_FLAGS(flags); \ ++ flags = PICK_SPIN_OP_RET(__spin_lock_irqsave_nested, \ ++ _spin_lock_irqsave_nested, lock, subclass); \ ++} while (0) ++#else ++# define spin_lock_irqsave_nested(lock, flags, subclass) \ ++ spin_lock_irqsave(lock, flags) ++#endif ++ ++# define read_lock_irqsave(lock, flags) \ ++do { \ ++ BUILD_CHECK_IRQ_FLAGS(flags); \ ++ flags = PICK_RW_OP_RET(__read_lock_irqsave, _read_lock_irqsave, lock);\ ++} while (0) ++ ++# define write_lock_irqsave(lock, flags) \ ++do { \ ++ BUILD_CHECK_IRQ_FLAGS(flags); \ ++ flags = PICK_RW_OP_RET(__write_lock_irqsave, _write_lock_irqsave,lock);\ ++} while (0) ++ ++#define spin_lock_irq(lock) PICK_SPIN_OP(__spin_lock_irq, _spin_lock_irq, lock) ++ ++#define spin_lock_bh(lock) PICK_SPIN_OP(__spin_lock_bh, _spin_lock_bh, lock) ++ ++#define read_lock_irq(lock) PICK_RW_OP(__read_lock_irq, _read_lock_irq, lock) ++ ++#define read_lock_bh(lock) PICK_RW_OP(__read_lock_bh, _read_lock_bh, lock) ++ ++#define write_lock_irq(lock) PICK_RW_OP(__write_lock_irq, _write_lock_irq, lock) ++ ++#define write_lock_bh(lock) PICK_RW_OP(__write_lock_bh, _write_lock_bh, lock) ++ ++#define spin_unlock(lock) PICK_SPIN_OP(__spin_unlock, _spin_unlock, lock) ++ ++#define read_unlock(lock) PICK_RW_OP(__read_unlock, _read_unlock, lock) ++ ++#define write_unlock(lock) PICK_RW_OP(__write_unlock, _write_unlock, lock) ++ ++#define spin_unlock_no_resched(lock) \ ++ PICK_SPIN_OP(__spin_unlock_no_resched, _spin_unlock_no_resched, lock) ++ ++#define spin_unlock_irqrestore(lock, flags) \ ++do { \ ++ BUILD_CHECK_IRQ_FLAGS(flags); \ ++ PICK_SPIN_OP(__spin_unlock_irqrestore, _spin_unlock_irqrestore, \ ++ lock, flags); \ ++} while (0) ++ ++#define spin_unlock_irq(lock) \ ++ PICK_SPIN_OP(__spin_unlock_irq, _spin_unlock_irq, lock) ++#define spin_unlock_bh(lock) \ ++ PICK_SPIN_OP(__spin_unlock_bh, _spin_unlock_bh, lock) ++ ++#define read_unlock_irqrestore(lock, flags) \ ++do { \ ++ BUILD_CHECK_IRQ_FLAGS(flags); \ ++ PICK_RW_OP(__read_unlock_irqrestore, _read_unlock_irqrestore, \ ++ lock, flags); \ ++} while (0) ++ ++#define read_unlock_irq(lock) \ ++ PICK_RW_OP(__read_unlock_irq, _read_unlock_irq, lock) ++#define read_unlock_bh(lock) PICK_RW_OP(__read_unlock_bh, _read_unlock_bh, lock) ++ ++#define write_unlock_irqrestore(lock, flags) \ ++do { \ ++ BUILD_CHECK_IRQ_FLAGS(flags); \ ++ PICK_RW_OP(__write_unlock_irqrestore, _write_unlock_irqrestore, \ ++ lock, flags); \ ++} while (0) ++#define write_unlock_irq(lock) \ ++ PICK_RW_OP(__write_unlock_irq, _write_unlock_irq, lock) ++ ++#define write_unlock_bh(lock) \ ++ PICK_RW_OP(__write_unlock_bh, _write_unlock_bh, lock) ++ ++#define spin_trylock_bh(lock) \ ++ __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_bh, _spin_trylock_bh,\ ++ lock)) ++ ++#define spin_trylock_irq(lock) \ ++ __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irq, \ ++ _spin_trylock_irq, lock)) + + #define spin_trylock_irqsave(lock, flags) \ +-({ \ +- local_irq_save(flags); \ +- spin_trylock(lock) ? 
\ +- 1 : ({ local_irq_restore(flags); 0; }); \ +-}) ++ __cond_lock(lock, PICK_SPIN_OP_RET(__spin_trylock_irqsave, \ ++ _spin_trylock_irqsave, lock, &flags)) + +-#define write_trylock_irqsave(lock, flags) \ +-({ \ +- local_irq_save(flags); \ +- write_trylock(lock) ? \ +- 1 : ({ local_irq_restore(flags); 0; }); \ +-}) ++/* ++ * bit-based spin_lock() ++ * ++ * Don't use this unless you really need to: spin_lock() and spin_unlock() ++ * are significantly faster. ++ */ ++static inline void bit_spin_lock(int bitnum, unsigned long *addr) ++{ ++ /* ++ * Assuming the lock is uncontended, this never enters ++ * the body of the outer loop. If it is contended, then ++ * within the inner loop a non-atomic test is used to ++ * busywait with less bus contention for a good time to ++ * attempt to acquire the lock bit. ++ */ ++#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) ++ while (unlikely(test_and_set_bit_lock(bitnum, addr))) ++ while (test_bit(bitnum, addr)) ++ cpu_relax(); ++#endif ++ __acquire(bitlock); ++} ++ ++/* ++ * Return true if it was acquired ++ */ ++static inline int bit_spin_trylock(int bitnum, unsigned long *addr) ++{ ++#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) ++ if (unlikely(test_and_set_bit_lock(bitnum, addr))) ++ return 0; ++#endif ++ __acquire(bitlock); ++ return 1; ++} ++ ++/* ++ * bit-based spin_unlock(): ++ */ ++static inline void bit_spin_unlock(int bitnum, unsigned long *addr) ++{ ++#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) ++# ifdef CONFIG_DEBUG_SPINLOCK ++ BUG_ON(!test_bit(bitnum, addr)); ++# endif ++ clear_bit_unlock(bitnum, addr); ++#endif ++ __release(bitlock); ++} ++ ++/* ++ * bit-based spin_unlock() - non-atomic version: ++ */ ++static inline void __bit_spin_unlock(int bitnum, unsigned long *addr) ++{ ++#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) ++# ifdef CONFIG_DEBUG_SPINLOCK ++ BUG_ON(!test_bit(bitnum, addr)); ++# endif ++ __clear_bit_unlock(bitnum, addr); ++#endif ++ __release(bitlock); ++} ++ ++/* ++ * Return true if the lock is held. ++ */ ++static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) ++{ ++#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT) ++ return test_bit(bitnum, addr); ++#else ++ return 1; ++#endif ++} ++ ++/** ++ * __raw_spin_can_lock - would __raw_spin_trylock() succeed? ++ * @lock: the spinlock in question. ++ */ ++#define __raw_spin_can_lock(lock) (!__raw_spin_is_locked(lock)) + + /* + * Pull the atomic_t declaration: +@@ -359,14 +610,25 @@ do { \ + * Decrements @atomic by 1. If the result is 0, returns true and locks + * @lock. Returns false for all other cases. + */ +-extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); +-#define atomic_dec_and_lock(atomic, lock) \ +- __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) ++/* "lock on reference count zero" */ ++#ifndef ATOMIC_DEC_AND_LOCK ++# include ++ extern int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic); ++#endif ++ ++#define atomic_dec_and_lock(atomic, lock) \ ++ __cond_lock(lock, PICK_SPIN_OP_RET(__atomic_dec_and_spin_lock, \ ++ _atomic_dec_and_spin_lock, lock, atomic)) + + /** + * spin_can_lock - would spin_trylock() succeed? + * @lock: the spinlock in question. 
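bit_spin_lock() above trades speed for space: the lock lives in a single bit of a word the caller already owns, which is why the comment steers normal code towards a real spinlock. A minimal sketch of the intended pattern (the structure, field and bit number are hypothetical):

        #define SAMPLE_LOCK_BIT 0                       /* hypothetical bit number */

        struct sample_obj {
                unsigned long   state;                  /* bit 0 doubles as the lock */
                int             count;
        };

        static void sample_obj_inc(struct sample_obj *obj)
        {
                bit_spin_lock(SAMPLE_LOCK_BIT, &obj->state);
                obj->count++;
                bit_spin_unlock(SAMPLE_LOCK_BIT, &obj->state);
        }

        static int sample_obj_inc_if_free(struct sample_obj *obj)
        {
                if (!bit_spin_trylock(SAMPLE_LOCK_BIT, &obj->state))
                        return 0;                       /* bit already held elsewhere */
                obj->count++;
                bit_spin_unlock(SAMPLE_LOCK_BIT, &obj->state);
                return 1;
        }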
+ */ +-#define spin_can_lock(lock) (!spin_is_locked(lock)) ++#define spin_can_lock(lock) \ ++ __cond_lock(lock, PICK_SPIN_OP_RET(__spin_can_lock, _spin_can_lock,\ ++ lock)) ++ ++/* FIXME: porting hack! */ ++#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0) + + #endif /* __LINUX_SPINLOCK_H */ +Index: linux-2.6-tip/include/linux/spinlock_api_smp.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/spinlock_api_smp.h ++++ linux-2.6-tip/include/linux/spinlock_api_smp.h +@@ -19,45 +19,60 @@ int in_lock_functions(unsigned long addr + + #define assert_spin_locked(x) BUG_ON(!spin_is_locked(x)) + +-void __lockfunc _spin_lock(spinlock_t *lock) __acquires(lock); +-void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) ++void __lockfunc __spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map) + __acquires(lock); +-void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *map) +- __acquires(lock); +-void __lockfunc _read_lock(rwlock_t *lock) __acquires(lock); +-void __lockfunc _write_lock(rwlock_t *lock) __acquires(lock); +-void __lockfunc _spin_lock_bh(spinlock_t *lock) __acquires(lock); +-void __lockfunc _read_lock_bh(rwlock_t *lock) __acquires(lock); +-void __lockfunc _write_lock_bh(rwlock_t *lock) __acquires(lock); +-void __lockfunc _spin_lock_irq(spinlock_t *lock) __acquires(lock); +-void __lockfunc _read_lock_irq(rwlock_t *lock) __acquires(lock); +-void __lockfunc _write_lock_irq(rwlock_t *lock) __acquires(lock); +-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +- __acquires(lock); +-unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +- __acquires(lock); +-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +- __acquires(lock); +-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +- __acquires(lock); +-int __lockfunc _spin_trylock(spinlock_t *lock); +-int __lockfunc _read_trylock(rwlock_t *lock); +-int __lockfunc _write_trylock(rwlock_t *lock); +-int __lockfunc _spin_trylock_bh(spinlock_t *lock); +-void __lockfunc _spin_unlock(spinlock_t *lock) __releases(lock); +-void __lockfunc _read_unlock(rwlock_t *lock) __releases(lock); +-void __lockfunc _write_unlock(rwlock_t *lock) __releases(lock); +-void __lockfunc _spin_unlock_bh(spinlock_t *lock) __releases(lock); +-void __lockfunc _read_unlock_bh(rwlock_t *lock) __releases(lock); +-void __lockfunc _write_unlock_bh(rwlock_t *lock) __releases(lock); +-void __lockfunc _spin_unlock_irq(spinlock_t *lock) __releases(lock); +-void __lockfunc _read_unlock_irq(rwlock_t *lock) __releases(lock); +-void __lockfunc _write_unlock_irq(rwlock_t *lock) __releases(lock); +-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +- __releases(lock); +-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +- __releases(lock); +-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +- __releases(lock); ++#define ACQUIRE_SPIN __acquires(lock) ++#define ACQUIRE_RW __acquires(lock) ++#define RELEASE_SPIN __releases(lock) ++#define RELEASE_RW __releases(lock) ++ ++void __lockfunc __spin_lock(raw_spinlock_t *lock) ACQUIRE_SPIN; ++void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass) ++ ACQUIRE_SPIN; ++void __lockfunc __read_lock(raw_rwlock_t *lock) ACQUIRE_RW; ++void __lockfunc __write_lock(raw_rwlock_t *lock) ACQUIRE_RW; ++void __lockfunc __spin_lock_bh(raw_spinlock_t *lock) ACQUIRE_SPIN; ++void __lockfunc 
__read_lock_bh(raw_rwlock_t *lock) ACQUIRE_RW; ++void __lockfunc __write_lock_bh(raw_rwlock_t *lock) ACQUIRE_RW; ++void __lockfunc __spin_lock_irq(raw_spinlock_t *lock) ACQUIRE_SPIN; ++void __lockfunc __read_lock_irq(raw_rwlock_t *lock) ACQUIRE_RW; ++void __lockfunc __write_lock_irq(raw_rwlock_t *lock) ACQUIRE_RW; ++unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock) ++ ACQUIRE_SPIN; ++unsigned long __lockfunc ++__spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) ACQUIRE_SPIN; ++unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock) ++ ACQUIRE_RW; ++unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock) ++ ACQUIRE_RW; ++int __lockfunc __spin_trylock(raw_spinlock_t *lock); ++int __lockfunc ++__spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags); ++int __lockfunc __read_trylock(raw_rwlock_t *lock); ++int __lockfunc __write_trylock(raw_rwlock_t *lock); ++int __lockfunc ++__write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags); ++int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock); ++int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock); ++void __lockfunc __spin_unlock(raw_spinlock_t *lock) RELEASE_SPIN; ++void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock) ++ RELEASE_SPIN; ++void __lockfunc __read_unlock(raw_rwlock_t *lock) RELEASE_RW; ++void __lockfunc __write_unlock(raw_rwlock_t *lock) RELEASE_RW; ++void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock) RELEASE_SPIN; ++void __lockfunc __read_unlock_bh(raw_rwlock_t *lock) RELEASE_RW; ++void __lockfunc __write_unlock_bh(raw_rwlock_t *lock) RELEASE_RW; ++void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock) RELEASE_SPIN; ++void __lockfunc __read_unlock_irq(raw_rwlock_t *lock) RELEASE_RW; ++void __lockfunc __write_unlock_irq(raw_rwlock_t *lock) RELEASE_RW; ++void __lockfunc ++__spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) ++ RELEASE_SPIN; ++void __lockfunc ++__read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) ++ RELEASE_RW; ++void ++__lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) ++ RELEASE_RW; + + #endif /* __LINUX_SPINLOCK_API_SMP_H */ +Index: linux-2.6-tip/include/linux/spinlock_api_up.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/spinlock_api_up.h ++++ linux-2.6-tip/include/linux/spinlock_api_up.h +@@ -33,12 +33,22 @@ + #define __LOCK_IRQ(lock) \ + do { local_irq_disable(); __LOCK(lock); } while (0) + +-#define __LOCK_IRQSAVE(lock, flags) \ +- do { local_irq_save(flags); __LOCK(lock); } while (0) ++#define __LOCK_IRQSAVE(lock) \ ++ ({ unsigned long __flags; local_irq_save(__flags); __LOCK(lock); __flags; }) ++ ++#define __TRYLOCK_IRQSAVE(lock, flags) \ ++ ({ local_irq_save(*(flags)); __LOCK(lock); 1; }) ++ ++#define __spin_trylock_irqsave(lock, flags) __TRYLOCK_IRQSAVE(lock, flags) ++ ++#define __write_trylock_irqsave(lock, flags) __TRYLOCK_IRQSAVE(lock, flags) + + #define __UNLOCK(lock) \ + do { preempt_enable(); __release(lock); (void)(lock); } while (0) + ++#define __UNLOCK_NO_RESCHED(lock) \ ++ do { __preempt_enable_no_resched(); __release(lock); (void)(lock); } while (0) ++ + #define __UNLOCK_BH(lock) \ + do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0) + +@@ -48,34 +58,36 @@ + #define __UNLOCK_IRQRESTORE(lock, flags) \ + do { local_irq_restore(flags); __UNLOCK(lock); } while (0) + +-#define _spin_lock(lock) __LOCK(lock) +-#define _spin_lock_nested(lock, subclass) __LOCK(lock) 
+-#define _read_lock(lock) __LOCK(lock) +-#define _write_lock(lock) __LOCK(lock) +-#define _spin_lock_bh(lock) __LOCK_BH(lock) +-#define _read_lock_bh(lock) __LOCK_BH(lock) +-#define _write_lock_bh(lock) __LOCK_BH(lock) +-#define _spin_lock_irq(lock) __LOCK_IRQ(lock) +-#define _read_lock_irq(lock) __LOCK_IRQ(lock) +-#define _write_lock_irq(lock) __LOCK_IRQ(lock) +-#define _spin_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) +-#define _read_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) +-#define _write_lock_irqsave(lock, flags) __LOCK_IRQSAVE(lock, flags) +-#define _spin_trylock(lock) ({ __LOCK(lock); 1; }) +-#define _read_trylock(lock) ({ __LOCK(lock); 1; }) +-#define _write_trylock(lock) ({ __LOCK(lock); 1; }) +-#define _spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) +-#define _spin_unlock(lock) __UNLOCK(lock) +-#define _read_unlock(lock) __UNLOCK(lock) +-#define _write_unlock(lock) __UNLOCK(lock) +-#define _spin_unlock_bh(lock) __UNLOCK_BH(lock) +-#define _write_unlock_bh(lock) __UNLOCK_BH(lock) +-#define _read_unlock_bh(lock) __UNLOCK_BH(lock) +-#define _spin_unlock_irq(lock) __UNLOCK_IRQ(lock) +-#define _read_unlock_irq(lock) __UNLOCK_IRQ(lock) +-#define _write_unlock_irq(lock) __UNLOCK_IRQ(lock) +-#define _spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +-#define _read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) +-#define _write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) ++#define __spin_lock(lock) __LOCK(lock) ++#define __spin_lock_nested(lock, subclass) __LOCK(lock) ++#define __read_lock(lock) __LOCK(lock) ++#define __write_lock(lock) __LOCK(lock) ++#define __spin_lock_bh(lock) __LOCK_BH(lock) ++#define __read_lock_bh(lock) __LOCK_BH(lock) ++#define __write_lock_bh(lock) __LOCK_BH(lock) ++#define __spin_lock_irq(lock) __LOCK_IRQ(lock) ++#define __read_lock_irq(lock) __LOCK_IRQ(lock) ++#define __write_lock_irq(lock) __LOCK_IRQ(lock) ++#define __spin_lock_irqsave(lock) __LOCK_IRQSAVE(lock) ++#define __read_lock_irqsave(lock) __LOCK_IRQSAVE(lock) ++#define __write_lock_irqsave(lock) __LOCK_IRQSAVE(lock) ++#define __spin_trylock(lock) ({ __LOCK(lock); 1; }) ++#define __read_trylock(lock) ({ __LOCK(lock); 1; }) ++#define __write_trylock(lock) ({ __LOCK(lock); 1; }) ++#define __spin_trylock_bh(lock) ({ __LOCK_BH(lock); 1; }) ++#define __spin_trylock_irq(lock) ({ __LOCK_IRQ(lock); 1; }) ++#define __spin_unlock(lock) __UNLOCK(lock) ++#define __spin_unlock_no_resched(lock) __UNLOCK_NO_RESCHED(lock) ++#define __read_unlock(lock) __UNLOCK(lock) ++#define __write_unlock(lock) __UNLOCK(lock) ++#define __spin_unlock_bh(lock) __UNLOCK_BH(lock) ++#define __write_unlock_bh(lock) __UNLOCK_BH(lock) ++#define __read_unlock_bh(lock) __UNLOCK_BH(lock) ++#define __spin_unlock_irq(lock) __UNLOCK_IRQ(lock) ++#define __read_unlock_irq(lock) __UNLOCK_IRQ(lock) ++#define __write_unlock_irq(lock) __UNLOCK_IRQ(lock) ++#define __spin_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) ++#define __read_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) ++#define __write_unlock_irqrestore(lock, flags) __UNLOCK_IRQRESTORE(lock, flags) + + #endif /* __LINUX_SPINLOCK_API_UP_H */ +Index: linux-2.6-tip/include/linux/spinlock_types.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/spinlock_types.h ++++ linux-2.6-tip/include/linux/spinlock_types.h +@@ -15,10 +15,27 @@ + # include + #endif + ++/* ++ * Must define these before including other files, inline 
functions need them ++ */ ++#define LOCK_SECTION_NAME ".text.lock."KBUILD_BASENAME ++ ++#define LOCK_SECTION_START(extra) \ ++ ".subsection 1\n\t" \ ++ extra \ ++ ".ifndef " LOCK_SECTION_NAME "\n\t" \ ++ LOCK_SECTION_NAME ":\n\t" \ ++ ".endif\n" ++ ++#define LOCK_SECTION_END \ ++ ".previous\n\t" ++ ++#define __lockfunc __attribute__((section(".spinlock.text"))) ++ + #include + + typedef struct { +- raw_spinlock_t raw_lock; ++ __raw_spinlock_t raw_lock; + #ifdef CONFIG_GENERIC_LOCKBREAK + unsigned int break_lock; + #endif +@@ -29,12 +46,12 @@ typedef struct { + #ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + #endif +-} spinlock_t; ++} raw_spinlock_t; + + #define SPINLOCK_MAGIC 0xdead4ead + + typedef struct { +- raw_rwlock_t raw_lock; ++ __raw_rwlock_t raw_lock; + #ifdef CONFIG_GENERIC_LOCKBREAK + unsigned int break_lock; + #endif +@@ -45,7 +62,7 @@ typedef struct { + #ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + #endif +-} rwlock_t; ++} raw_rwlock_t; + + #define RWLOCK_MAGIC 0xdeaf1eed + +@@ -64,24 +81,24 @@ typedef struct { + #endif + + #ifdef CONFIG_DEBUG_SPINLOCK +-# define __SPIN_LOCK_UNLOCKED(lockname) \ +- (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ ++# define _RAW_SPIN_LOCK_UNLOCKED(lockname) \ ++ { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ + .magic = SPINLOCK_MAGIC, \ + .owner = SPINLOCK_OWNER_INIT, \ + .owner_cpu = -1, \ + SPIN_DEP_MAP_INIT(lockname) } +-#define __RW_LOCK_UNLOCKED(lockname) \ +- (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ ++#define _RAW_RW_LOCK_UNLOCKED(lockname) \ ++ { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ + .magic = RWLOCK_MAGIC, \ + .owner = SPINLOCK_OWNER_INIT, \ + .owner_cpu = -1, \ + RW_DEP_MAP_INIT(lockname) } + #else +-# define __SPIN_LOCK_UNLOCKED(lockname) \ +- (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ ++# define _RAW_SPIN_LOCK_UNLOCKED(lockname) \ ++ { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ + SPIN_DEP_MAP_INIT(lockname) } +-#define __RW_LOCK_UNLOCKED(lockname) \ +- (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ ++# define _RAW_RW_LOCK_UNLOCKED(lockname) \ ++ { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ + RW_DEP_MAP_INIT(lockname) } + #endif + +@@ -91,10 +108,22 @@ typedef struct { + * Please use DEFINE_SPINLOCK()/DEFINE_RWLOCK() or + * __SPIN_LOCK_UNLOCKED()/__RW_LOCK_UNLOCKED() as appropriate. 
+ */ +-#define SPIN_LOCK_UNLOCKED __SPIN_LOCK_UNLOCKED(old_style_spin_init) +-#define RW_LOCK_UNLOCKED __RW_LOCK_UNLOCKED(old_style_rw_init) + +-#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) +-#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x) ++# define RAW_SPIN_LOCK_UNLOCKED(lockname) \ ++ (raw_spinlock_t) _RAW_SPIN_LOCK_UNLOCKED(lockname) ++ ++# define RAW_RW_LOCK_UNLOCKED(lockname) \ ++ (raw_rwlock_t) _RAW_RW_LOCK_UNLOCKED(lockname) ++ ++#define DEFINE_RAW_SPINLOCK(name) \ ++ raw_spinlock_t name __cacheline_aligned_in_smp = \ ++ RAW_SPIN_LOCK_UNLOCKED(name) ++ ++#define __DEFINE_RAW_SPINLOCK(name) \ ++ raw_spinlock_t name = RAW_SPIN_LOCK_UNLOCKED(name) ++ ++#define DEFINE_RAW_RWLOCK(name) \ ++ raw_rwlock_t name __cacheline_aligned_in_smp = \ ++ RAW_RW_LOCK_UNLOCKED(name) + + #endif /* __LINUX_SPINLOCK_TYPES_H */ +Index: linux-2.6-tip/include/linux/spinlock_types_up.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/spinlock_types_up.h ++++ linux-2.6-tip/include/linux/spinlock_types_up.h +@@ -16,13 +16,13 @@ + + typedef struct { + volatile unsigned int slock; +-} raw_spinlock_t; ++} __raw_spinlock_t; + + #define __RAW_SPIN_LOCK_UNLOCKED { 1 } + + #else + +-typedef struct { } raw_spinlock_t; ++typedef struct { } __raw_spinlock_t; + + #define __RAW_SPIN_LOCK_UNLOCKED { } + +@@ -30,7 +30,7 @@ typedef struct { } raw_spinlock_t; + + typedef struct { + /* no debug version on UP */ +-} raw_rwlock_t; ++} __raw_rwlock_t; + + #define __RAW_RW_LOCK_UNLOCKED { } + +Index: linux-2.6-tip/include/linux/spinlock_up.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/spinlock_up.h ++++ linux-2.6-tip/include/linux/spinlock_up.h +@@ -20,19 +20,19 @@ + #ifdef CONFIG_DEBUG_SPINLOCK + #define __raw_spin_is_locked(x) ((x)->slock == 0) + +-static inline void __raw_spin_lock(raw_spinlock_t *lock) ++static inline void __raw_spin_lock(__raw_spinlock_t *lock) + { + lock->slock = 0; + } + + static inline void +-__raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) ++__raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags) + { + local_irq_save(flags); + lock->slock = 0; + } + +-static inline int __raw_spin_trylock(raw_spinlock_t *lock) ++static inline int __raw_spin_trylock(__raw_spinlock_t *lock) + { + char oldval = lock->slock; + +@@ -41,7 +41,7 @@ static inline int __raw_spin_trylock(raw + return oldval > 0; + } + +-static inline void __raw_spin_unlock(raw_spinlock_t *lock) ++static inline void __raw_spin_unlock(__raw_spinlock_t *lock) + { + lock->slock = 1; + } +Index: linux-2.6-tip/kernel/rt.c +=================================================================== +--- /dev/null ++++ linux-2.6-tip/kernel/rt.c +@@ -0,0 +1,528 @@ ++/* ++ * kernel/rt.c ++ * ++ * Real-Time Preemption Support ++ * ++ * started by Ingo Molnar: ++ * ++ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar ++ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner ++ * ++ * historic credit for proving that Linux spinlocks can be implemented via ++ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow ++ * and others) who prototyped it on 2.4 and did lots of comparative ++ * research and analysis; TimeSys, for proving that you can implement a ++ * fully preemptible kernel via the use of IRQ threading and mutexes; ++ * Bill Huey for persuasively arguing on lkml that the mutex model is the ++ * right one; and to MontaVista, who ported pmutexes to 2.6. 
++ * ++ * This code is a from-scratch implementation and is not based on pmutexes, ++ * but the idea of converting spinlocks to mutexes is used here too. ++ * ++ * lock debugging, locking tree, deadlock detection: ++ * ++ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey ++ * Released under the General Public License (GPL). ++ * ++ * Includes portions of the generic R/W semaphore implementation from: ++ * ++ * Copyright (c) 2001 David Howells (dhowells@redhat.com). ++ * - Derived partially from idea by Andrea Arcangeli ++ * - Derived also from comments by Linus ++ * ++ * Pending ownership of locks and ownership stealing: ++ * ++ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt ++ * ++ * (also by Steven Rostedt) ++ * - Converted single pi_lock to individual task locks. ++ * ++ * By Esben Nielsen: ++ * Doing priority inheritance with help of the scheduler. ++ * ++ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner ++ * - major rework based on Esben Nielsens initial patch ++ * - replaced thread_info references by task_struct refs ++ * - removed task->pending_owner dependency ++ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks ++ * in the scheduler return path as discussed with Steven Rostedt ++ * ++ * Copyright (C) 2006, Kihon Technologies Inc. ++ * Steven Rostedt ++ * - debugged and patched Thomas Gleixner's rework. ++ * - added back the cmpxchg to the rework. ++ * - turned atomic require back on for SMP. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "rtmutex_common.h" ++ ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * Unlock these on crash: ++ */ ++void zap_rt_locks(void) ++{ ++ //trace_lock_init(); ++} ++#endif ++ ++/* ++ * struct mutex functions ++ */ ++void __mutex_init(struct mutex *lock, char *name, struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held lock: ++ */ ++ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++ __rt_mutex_init(&lock->lock, name); ++} ++EXPORT_SYMBOL(__mutex_init); ++ ++void __lockfunc _mutex_lock(struct mutex *lock) ++{ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ rt_mutex_lock(&lock->lock); ++} ++EXPORT_SYMBOL(_mutex_lock); ++ ++int __lockfunc _mutex_lock_interruptible(struct mutex *lock) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ ret = rt_mutex_lock_interruptible(&lock->lock, 0); ++ if (ret) ++ mutex_release(&lock->dep_map, 1, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_interruptible); ++ ++int __lockfunc _mutex_lock_killable(struct mutex *lock) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ ret = rt_mutex_lock_killable(&lock->lock, 0); ++ if (ret) ++ mutex_release(&lock->dep_map, 1, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_killable); ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) ++{ ++ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ rt_mutex_lock(&lock->lock); ++} ++EXPORT_SYMBOL(_mutex_lock_nested); ++ ++int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ ret = rt_mutex_lock_interruptible(&lock->lock, 0); ++ if (ret) ++ mutex_release(&lock->dep_map, 1, _RET_IP_); ++ return ret; ++} 
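The struct mutex wrappers above only re-route the blocking into rt_mutex_lock() and friends; callers keep the stock mutex API and error conventions. A caller-side sketch, assuming the usual mutex_lock_interruptible()/mutex_unlock() entry points map onto the _mutex_*() functions here (the names below are hypothetical):

        static DEFINE_MUTEX(cfg_mutex);                 /* hypothetical mutex */
        static int cfg_value;

        static int cfg_store(int val)
        {
                if (mutex_lock_interruptible(&cfg_mutex))
                        return -ERESTARTSYS;    /* signal while sleeping on the rt-mutex */
                cfg_value = val;
                mutex_unlock(&cfg_mutex);
                return 0;
        }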
++EXPORT_SYMBOL(_mutex_lock_interruptible_nested); ++ ++int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ ret = rt_mutex_lock_killable(&lock->lock, 0); ++ if (ret) ++ mutex_release(&lock->dep_map, 1, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_killable_nested); ++#endif ++ ++int __lockfunc _mutex_trylock(struct mutex *lock) ++{ ++ int ret = rt_mutex_trylock(&lock->lock); ++ ++ if (ret) ++ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_trylock); ++ ++void __lockfunc _mutex_unlock(struct mutex *lock) ++{ ++ mutex_release(&lock->dep_map, 1, _RET_IP_); ++ rt_mutex_unlock(&lock->lock); ++} ++EXPORT_SYMBOL(_mutex_unlock); ++ ++/* ++ * rwlock_t functions ++ */ ++int __lockfunc rt_write_trylock(rwlock_t *rwlock) ++{ ++ int ret = rt_mutex_trylock(&rwlock->lock); ++ ++ if (ret) ++ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); ++ ++ return ret; ++} ++EXPORT_SYMBOL(rt_write_trylock); ++ ++int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags) ++{ ++ *flags = 0; ++ return rt_write_trylock(rwlock); ++} ++EXPORT_SYMBOL(rt_write_trylock_irqsave); ++ ++int __lockfunc rt_read_trylock(rwlock_t *rwlock) ++{ ++ struct rt_mutex *lock = &rwlock->lock; ++ int ret = 1; ++ ++ /* ++ * recursive read locks succeed when current owns the lock ++ */ ++ if (rt_mutex_real_owner(lock) != current || !rwlock->read_depth) ++ ret = rt_mutex_trylock(lock); ++ ++ if (ret) { ++ rwlock->read_depth++; ++ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); ++ } ++ ++ return ret; ++} ++EXPORT_SYMBOL(rt_read_trylock); ++ ++void __lockfunc rt_write_lock(rwlock_t *rwlock) ++{ ++ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); ++ __rt_spin_lock(&rwlock->lock); ++} ++EXPORT_SYMBOL(rt_write_lock); ++ ++void __lockfunc rt_read_lock(rwlock_t *rwlock) ++{ ++ struct rt_mutex *lock = &rwlock->lock; ++ ++ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); ++ ++ /* ++ * recursive read locks succeed when current owns the lock ++ */ ++ if (rt_mutex_real_owner(lock) != current || !rwlock->read_depth) ++ __rt_spin_lock(lock); ++ rwlock->read_depth++; ++} ++ ++EXPORT_SYMBOL(rt_read_lock); ++ ++void __lockfunc rt_write_unlock(rwlock_t *rwlock) ++{ ++ /* NOTE: we always pass in '1' for nested, for simplicity */ ++ rwlock_release(&rwlock->dep_map, 1, _RET_IP_); ++ __rt_spin_unlock(&rwlock->lock); ++} ++EXPORT_SYMBOL(rt_write_unlock); ++ ++void __lockfunc rt_read_unlock(rwlock_t *rwlock) ++{ ++ rwlock_release(&rwlock->dep_map, 1, _RET_IP_); ++ ++ BUG_ON(rwlock->read_depth <= 0); ++ ++ /* Release the lock only when read_depth is down to 0 */ ++ if (--rwlock->read_depth == 0) ++ __rt_spin_unlock(&rwlock->lock); ++} ++EXPORT_SYMBOL(rt_read_unlock); ++ ++unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock) ++{ ++ rt_write_lock(rwlock); ++ ++ return 0; ++} ++EXPORT_SYMBOL(rt_write_lock_irqsave); ++ ++unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock) ++{ ++ rt_read_lock(rwlock); ++ ++ return 0; ++} ++EXPORT_SYMBOL(rt_read_lock_irqsave); ++ ++void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held lock: ++ */ ++ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock)); ++ lockdep_init_map(&rwlock->dep_map, name, key, 0); ++#endif ++ __rt_mutex_init(&rwlock->lock, name); ++ rwlock->read_depth = 0; ++} 
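rt_read_lock()/rt_read_unlock() above make the rwlock_t read side recursive for its owner: read_depth counts nested acquisitions and the underlying rt-mutex is only released when it drops back to zero. In sketch form (lock and function names invented; the rwlock is assumed to have been set up with rwlock_init()):

        static rwlock_t table_lock;     /* assumed initialised with rwlock_init(&table_lock) */

        static int table_lookup_nested(int key)
        {
                int hit;

                read_lock(&table_lock);         /* read_depth 0 -> 1, takes the rt-mutex */
                read_lock(&table_lock);         /* same owner: only read_depth 1 -> 2    */
                hit = (key == 0);               /* stand-in for the real lookup          */
                read_unlock(&table_lock);       /* read_depth 2 -> 1, lock still held    */
                read_unlock(&table_lock);       /* read_depth 1 -> 0, rt-mutex released  */
                return hit;
        }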
++EXPORT_SYMBOL(__rt_rwlock_init); ++ ++/* ++ * rw_semaphores ++ */ ++ ++void rt_up_write(struct rw_semaphore *rwsem) ++{ ++ rwsem_release(&rwsem->dep_map, 1, _RET_IP_); ++ rt_mutex_unlock(&rwsem->lock); ++} ++EXPORT_SYMBOL(rt_up_write); ++ ++void rt_up_read(struct rw_semaphore *rwsem) ++{ ++ rwsem_release(&rwsem->dep_map, 1, _RET_IP_); ++ rt_mutex_unlock(&rwsem->lock); ++} ++EXPORT_SYMBOL(rt_up_read); ++ ++/* ++ * downgrade a write lock into a read lock ++ * - just wake up any readers at the front of the queue ++ */ ++void rt_downgrade_write(struct rw_semaphore *rwsem) ++{ ++ BUG_ON(rt_mutex_real_owner(&rwsem->lock) != current); ++} ++EXPORT_SYMBOL(rt_downgrade_write); ++ ++int rt_down_write_trylock(struct rw_semaphore *rwsem) ++{ ++ int ret = rt_mutex_trylock(&rwsem->lock); ++ ++ if (ret) ++ rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(rt_down_write_trylock); ++ ++void rt_down_write(struct rw_semaphore *rwsem) ++{ ++ rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_); ++ rt_mutex_lock(&rwsem->lock); ++} ++EXPORT_SYMBOL(rt_down_write); ++ ++void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass) ++{ ++ rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_); ++ rt_mutex_lock(&rwsem->lock); ++} ++EXPORT_SYMBOL(rt_down_write_nested); ++ ++int rt_down_read_trylock(struct rw_semaphore *rwsem) ++{ ++ int ret = rt_mutex_trylock(&rwsem->lock); ++ ++ if (ret) ++ rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(rt_down_read_trylock); ++ ++static void __rt_down_read(struct rw_semaphore *rwsem, int subclass) ++{ ++ rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_); ++ rt_mutex_lock(&rwsem->lock); ++} ++ ++void rt_down_read(struct rw_semaphore *rwsem) ++{ ++ __rt_down_read(rwsem, 0); ++} ++EXPORT_SYMBOL(rt_down_read); ++ ++void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass) ++{ ++ __rt_down_read(rwsem, subclass); ++} ++EXPORT_SYMBOL(rt_down_read_nested); ++ ++void __rt_rwsem_init(struct rw_semaphore *rwsem, char *name, ++ struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held lock: ++ */ ++ debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem)); ++ lockdep_init_map(&rwsem->dep_map, name, key, 0); ++#endif ++ __rt_mutex_init(&rwsem->lock, name); ++} ++EXPORT_SYMBOL(__rt_rwsem_init); ++ ++/* ++ * Semaphores ++ */ ++/* ++ * Linux Semaphores implemented via RT-mutexes. ++ * ++ * In the down() variants we use the mutex as the semaphore blocking ++ * object: we always acquire it, decrease the counter and keep the lock ++ * locked if we did the 1->0 transition. The next down() will then block. ++ * ++ * In the up() path we atomically increase the counter and do the ++ * unlock if we were the one doing the 0->1 transition. 
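The counting scheme described in this comment is implemented just below by __down_complete(), rt_down() and rt_up(): only the task that takes the counter from 1 to 0 keeps the embedded rt-mutex, and only the up() that moves it from 0 back to 1 releases it. A small trace, assuming a hypothetical semaphore initialised with a count of 2:

        static struct semaphore pool_sem;       /* assumed: sema_init(&pool_sem, 2) */

        static void pool_sem_trace(void)
        {
                rt_down(&pool_sem);     /* count 2 -> 1: rt-mutex taken, then dropped again */
                rt_down(&pool_sem);     /* count 1 -> 0: this task keeps the rt-mutex held  */
                /* a third rt_down() here would now block on the rt-mutex */
                rt_up(&pool_sem);       /* count 0 -> 1: we did that transition, so we
                                           are the one to unlock the rt-mutex */
                rt_up(&pool_sem);       /* count 1 -> 2: counter bump only */
        }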
++ */ ++ ++static inline void __down_complete(struct semaphore *sem) ++{ ++ int count = atomic_dec_return(&sem->count); ++ ++ if (unlikely(count > 0)) ++ rt_mutex_unlock(&sem->lock); ++} ++ ++void rt_down(struct semaphore *sem) ++{ ++ rt_mutex_lock(&sem->lock); ++ __down_complete(sem); ++} ++EXPORT_SYMBOL(rt_down); ++ ++int rt_down_interruptible(struct semaphore *sem) ++{ ++ int ret; ++ ++ ret = rt_mutex_lock_interruptible(&sem->lock, 0); ++ if (ret) ++ return ret; ++ __down_complete(sem); ++ return 0; ++} ++EXPORT_SYMBOL(rt_down_interruptible); ++ ++int rt_down_timeout(struct semaphore *sem, long jiff) ++{ ++ struct hrtimer_sleeper t; ++ struct timespec ts; ++ unsigned long expires = jiffies + jiff + 1; ++ int ret; ++ ++ /* ++ * rt_mutex_slowlock can use an interruptible, but this needs to ++ * be TASK_INTERRUPTIBLE. The down_timeout uses TASK_UNINTERRUPTIBLE. ++ * To handle this we loop if a signal caused the timeout and the ++ * we recalculate the new timeout. ++ * Yes Thomas, this is a hack! But we can fix it right later. ++ */ ++ do { ++ jiffies_to_timespec(jiff, &ts); ++ hrtimer_init_on_stack(&t.timer, HRTIMER_MODE_REL, CLOCK_MONOTONIC); ++ t.timer._expires = timespec_to_ktime(ts); ++ ++ ret = rt_mutex_timed_lock(&sem->lock, &t, 0); ++ if (ret != -EINTR) ++ break; ++ ++ /* signal occured, but the down_timeout doesn't handle them */ ++ jiff = expires - jiffies; ++ ++ } while (jiff > 0); ++ ++ if (!ret) ++ __down_complete(sem); ++ else ++ ret = -ETIME; ++ ++ return ret; ++} ++EXPORT_SYMBOL(rt_down_timeout); ++ ++/* ++ * try to down the semaphore, 0 on success and 1 on failure. (inverted) ++ */ ++int rt_down_trylock(struct semaphore *sem) ++{ ++ /* ++ * Here we are a tiny bit different from ordinary Linux semaphores, ++ * because we can get 'transient' locking-failures when say a ++ * process decreases the count from 9 to 8 and locks/releases the ++ * embedded mutex internally. 
It would be quite complex to remove ++ * these transient failures so lets try it the simple way first: ++ */ ++ if (rt_mutex_trylock(&sem->lock)) { ++ __down_complete(sem); ++ return 0; ++ } ++ return 1; ++} ++EXPORT_SYMBOL(rt_down_trylock); ++ ++void rt_up(struct semaphore *sem) ++{ ++ int count; ++ ++ /* ++ * Disable preemption to make sure a highprio trylock-er cannot ++ * preempt us here and get into an infinite loop: ++ */ ++ preempt_disable(); ++ count = atomic_inc_return(&sem->count); ++ /* ++ * If we did the 0 -> 1 transition then we are the ones to unlock it: ++ */ ++ if (likely(count == 1)) ++ rt_mutex_unlock(&sem->lock); ++ preempt_enable(); ++} ++EXPORT_SYMBOL(rt_up); ++ ++void __sema_init(struct semaphore *sem, int val, ++ char *name, char *file, int line) ++{ ++ atomic_set(&sem->count, val); ++ switch (val) { ++ case 0: ++ __rt_mutex_init(&sem->lock, name); ++ rt_mutex_lock(&sem->lock); ++ break; ++ default: ++ __rt_mutex_init(&sem->lock, name); ++ break; ++ } ++} ++EXPORT_SYMBOL(__sema_init); ++ ++void __init_MUTEX(struct semaphore *sem, char *name, char *file, ++ int line) ++{ ++ __sema_init(sem, 1, name, file, line); ++} ++EXPORT_SYMBOL(__init_MUTEX); ++ +Index: linux-2.6-tip/kernel/rtmutex-debug.c +=================================================================== +--- linux-2.6-tip.orig/kernel/rtmutex-debug.c ++++ linux-2.6-tip/kernel/rtmutex-debug.c +@@ -16,6 +16,7 @@ + * + * See rt.c in preempt-rt for proper credits and further information + */ ++#include + #include + #include + #include +@@ -29,61 +30,6 @@ + + #include "rtmutex_common.h" + +-# define TRACE_WARN_ON(x) WARN_ON(x) +-# define TRACE_BUG_ON(x) BUG_ON(x) +- +-# define TRACE_OFF() \ +-do { \ +- if (rt_trace_on) { \ +- rt_trace_on = 0; \ +- console_verbose(); \ +- if (spin_is_locked(¤t->pi_lock)) \ +- spin_unlock(¤t->pi_lock); \ +- } \ +-} while (0) +- +-# define TRACE_OFF_NOLOCK() \ +-do { \ +- if (rt_trace_on) { \ +- rt_trace_on = 0; \ +- console_verbose(); \ +- } \ +-} while (0) +- +-# define TRACE_BUG_LOCKED() \ +-do { \ +- TRACE_OFF(); \ +- BUG(); \ +-} while (0) +- +-# define TRACE_WARN_ON_LOCKED(c) \ +-do { \ +- if (unlikely(c)) { \ +- TRACE_OFF(); \ +- WARN_ON(1); \ +- } \ +-} while (0) +- +-# define TRACE_BUG_ON_LOCKED(c) \ +-do { \ +- if (unlikely(c)) \ +- TRACE_BUG_LOCKED(); \ +-} while (0) +- +-#ifdef CONFIG_SMP +-# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) +-#else +-# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) +-#endif +- +-/* +- * deadlock detection flag. 
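The rt.c semaphore code above hangs the whole counting semantics off one embedded rt_mutex plus an atomic counter: down() keeps the mutex locked when it performs the 1->0 transition, and up() releases it again from whichever task performs the 0->1 transition. As a rough illustration only (not kernel code and not part of this patch), the same protocol can be modelled in user space; all names below are invented, and a POSIX binary semaphore stands in for the embedded lock so the model does not trip over POSIX mutex ownership rules:

/* illustrative user-space model of the rt.c counting protocol */
#include <semaphore.h>
#include <stdatomic.h>

struct model_sem {
	atomic_int count;
	sem_t      lock;	/* binary; plays the role of sem->lock */
};

static void model_sem_init(struct model_sem *s, int val)
{
	atomic_init(&s->count, val);
	/* mirror __sema_init(): a zero-count semaphore starts "locked" */
	sem_init(&s->lock, 0, val > 0 ? 1 : 0);
}

static void model_down(struct model_sem *s)
{
	sem_wait(&s->lock);
	/* keep the lock held only if this was the 1 -> 0 transition */
	if (atomic_fetch_sub(&s->count, 1) > 1)
		sem_post(&s->lock);
}

static void model_up(struct model_sem *s)
{
	/*
	 * Whoever performs the 0 -> 1 transition releases the lock; the
	 * kernel additionally disables preemption here so a high-priority
	 * trylock loop cannot starve the waker.
	 */
	if (atomic_fetch_add(&s->count, 1) == 0)
		sem_post(&s->lock);
}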
We turn it off when we detect +- * the first problem because we dont want to recurse back +- * into the tracing code when doing error printk or +- * executing a BUG(): +- */ +-static int rt_trace_on = 1; +- + static void printk_task(struct task_struct *p) + { + if (p) +@@ -111,8 +57,8 @@ static void printk_lock(struct rt_mutex + + void rt_mutex_debug_task_free(struct task_struct *task) + { +- WARN_ON(!plist_head_empty(&task->pi_waiters)); +- WARN_ON(task->pi_blocked_on); ++ DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); ++ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); + } + + /* +@@ -125,7 +71,7 @@ void debug_rt_mutex_deadlock(int detect, + { + struct task_struct *task; + +- if (!rt_trace_on || detect || !act_waiter) ++ if (!debug_locks || detect || !act_waiter) + return; + + task = rt_mutex_owner(act_waiter->lock); +@@ -139,7 +85,7 @@ void debug_rt_mutex_print_deadlock(struc + { + struct task_struct *task; + +- if (!waiter->deadlock_lock || !rt_trace_on) ++ if (!waiter->deadlock_lock || !debug_locks) + return; + + rcu_read_lock(); +@@ -149,7 +95,8 @@ void debug_rt_mutex_print_deadlock(struc + return; + } + +- TRACE_OFF_NOLOCK(); ++ if (!debug_locks_off()) ++ return; + + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! ]\n"); +@@ -180,7 +127,6 @@ void debug_rt_mutex_print_deadlock(struc + + printk("[ turning off deadlock detection." + "Please report this trace. ]\n\n"); +- local_irq_disable(); + } + + void debug_rt_mutex_lock(struct rt_mutex *lock) +@@ -189,7 +135,8 @@ void debug_rt_mutex_lock(struct rt_mutex + + void debug_rt_mutex_unlock(struct rt_mutex *lock) + { +- TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); ++ if (debug_locks) ++ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); + } + + void +@@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mute + + void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) + { +- TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); ++ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); + } + + void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +@@ -213,9 +160,9 @@ void debug_rt_mutex_init_waiter(struct r + void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) + { + put_pid(waiter->deadlock_task_pid); +- TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); +- TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); +- TRACE_WARN_ON(waiter->task); ++ DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); ++ DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); ++ DEBUG_LOCKS_WARN_ON(waiter->task); + memset(waiter, 0x22, sizeof(*waiter)); + } + +@@ -231,9 +178,36 @@ void debug_rt_mutex_init(struct rt_mutex + void + rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) + { ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (atomic_read(&task->lock_count) >= MAX_LOCK_STACK) { ++ if (!debug_locks_off()) ++ return; ++ printk("BUG: %s/%d: lock count overflow!\n", ++ task->comm, task->pid); ++ dump_stack(); ++ return; ++ } ++#ifdef CONFIG_PREEMPT_RT ++ task->owned_lock[atomic_read(&task->lock_count)] = lock; ++#endif ++ atomic_inc(&task->lock_count); ++#endif + } + + void rt_mutex_deadlock_account_unlock(struct task_struct *task) + { ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!atomic_read(&task->lock_count)) { ++ if (!debug_locks_off()) ++ return; ++ printk("BUG: %s/%d: lock count underflow!\n", ++ task->comm, task->pid); ++ dump_stack(); ++ return; ++ } ++ atomic_dec(&task->lock_count); ++#ifdef CONFIG_PREEMPT_RT ++ 
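The deadlock-accounting hooks above (under CONFIG_DEBUG_PREEMPT) keep a bounded per-task count of held rt-mutexes, remember which locks are held when PREEMPT_RT is enabled, and warn on overflow or underflow. A loose user-space analogue is sketched below; MODEL_MAX_LOCK_STACK and every name are invented, and the kernel uses an atomic counter where this sketch gets away with a plain thread-local one:

/* illustrative per-thread lock bookkeeping with bounds checks */
#include <stdio.h>

#define MODEL_MAX_LOCK_STACK 48

static _Thread_local void *owned_lock[MODEL_MAX_LOCK_STACK];
static _Thread_local int lock_count;

static void account_lock(void *lock)
{
	if (lock_count >= MODEL_MAX_LOCK_STACK) {
		fprintf(stderr, "BUG: lock count overflow!\n");
		return;
	}
	owned_lock[lock_count++] = lock;	/* remember what we hold */
}

static void account_unlock(void)
{
	if (lock_count == 0) {
		fprintf(stderr, "BUG: lock count underflow!\n");
		return;
	}
	owned_lock[--lock_count] = NULL;	/* forget the most recent lock */
}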
task->owned_lock[atomic_read(&task->lock_count)] = NULL; ++#endif ++#endif + } +- +Index: linux-2.6-tip/kernel/rwsem.c +=================================================================== +--- linux-2.6-tip.orig/kernel/rwsem.c ++++ linux-2.6-tip/kernel/rwsem.c +@@ -16,7 +16,7 @@ + /* + * lock for reading + */ +-void __sched down_read(struct rw_semaphore *sem) ++void __sched compat_down_read(struct compat_rw_semaphore *sem) + { + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); +@@ -24,12 +24,12 @@ void __sched down_read(struct rw_semapho + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + } + +-EXPORT_SYMBOL(down_read); ++EXPORT_SYMBOL(compat_down_read); + + /* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +-int down_read_trylock(struct rw_semaphore *sem) ++int compat_down_read_trylock(struct compat_rw_semaphore *sem) + { + int ret = __down_read_trylock(sem); + +@@ -38,12 +38,12 @@ int down_read_trylock(struct rw_semaphor + return ret; + } + +-EXPORT_SYMBOL(down_read_trylock); ++EXPORT_SYMBOL(compat_down_read_trylock); + + /* + * lock for writing + */ +-void __sched down_write(struct rw_semaphore *sem) ++void __sched compat_down_write(struct compat_rw_semaphore *sem) + { + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); +@@ -51,12 +51,12 @@ void __sched down_write(struct rw_semaph + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + } + +-EXPORT_SYMBOL(down_write); ++EXPORT_SYMBOL(compat_down_write); + + /* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +-int down_write_trylock(struct rw_semaphore *sem) ++int compat_down_write_trylock(struct compat_rw_semaphore *sem) + { + int ret = __down_write_trylock(sem); + +@@ -65,36 +65,36 @@ int down_write_trylock(struct rw_semapho + return ret; + } + +-EXPORT_SYMBOL(down_write_trylock); ++EXPORT_SYMBOL(compat_down_write_trylock); + + /* + * release a read lock + */ +-void up_read(struct rw_semaphore *sem) ++void compat_up_read(struct compat_rw_semaphore *sem) + { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_read(sem); + } + +-EXPORT_SYMBOL(up_read); ++EXPORT_SYMBOL(compat_up_read); + + /* + * release a write lock + */ +-void up_write(struct rw_semaphore *sem) ++void compat_up_write(struct compat_rw_semaphore *sem) + { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_write(sem); + } + +-EXPORT_SYMBOL(up_write); ++EXPORT_SYMBOL(compat_up_write); + + /* + * downgrade write lock to read lock + */ +-void downgrade_write(struct rw_semaphore *sem) ++void compat_downgrade_write(struct compat_rw_semaphore *sem) + { + /* + * lockdep: a downgraded write will live on as a write +@@ -103,11 +103,11 @@ void downgrade_write(struct rw_semaphore + __downgrade_write(sem); + } + +-EXPORT_SYMBOL(downgrade_write); ++EXPORT_SYMBOL(compat_downgrade_write); + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + +-void down_read_nested(struct rw_semaphore *sem, int subclass) ++void compat_down_read_nested(struct compat_rw_semaphore *sem, int subclass) + { + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); +@@ -115,18 +115,18 @@ void down_read_nested(struct rw_semaphor + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + } + +-EXPORT_SYMBOL(down_read_nested); ++EXPORT_SYMBOL(compat_down_read_nested); + +-void down_read_non_owner(struct rw_semaphore *sem) ++void compat_down_read_non_owner(struct compat_rw_semaphore *sem) + { + might_sleep(); + + __down_read(sem); + } + +-EXPORT_SYMBOL(down_read_non_owner); 
++EXPORT_SYMBOL(compat_down_read_non_owner); + +-void down_write_nested(struct rw_semaphore *sem, int subclass) ++void compat_down_write_nested(struct compat_rw_semaphore *sem, int subclass) + { + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); +@@ -134,14 +134,14 @@ void down_write_nested(struct rw_semapho + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + } + +-EXPORT_SYMBOL(down_write_nested); ++EXPORT_SYMBOL(compat_down_write_nested); + +-void up_read_non_owner(struct rw_semaphore *sem) ++void compat_up_read_non_owner(struct compat_rw_semaphore *sem) + { + __up_read(sem); + } + +-EXPORT_SYMBOL(up_read_non_owner); ++EXPORT_SYMBOL(compat_up_read_non_owner); + + #endif + +Index: linux-2.6-tip/kernel/semaphore.c +=================================================================== +--- linux-2.6-tip.orig/kernel/semaphore.c ++++ linux-2.6-tip/kernel/semaphore.c +@@ -33,11 +33,11 @@ + #include + #include + +-static noinline void __down(struct semaphore *sem); +-static noinline int __down_interruptible(struct semaphore *sem); +-static noinline int __down_killable(struct semaphore *sem); +-static noinline int __down_timeout(struct semaphore *sem, long jiffies); +-static noinline void __up(struct semaphore *sem); ++static noinline void __down(struct compat_semaphore *sem); ++static noinline int __down_interruptible(struct compat_semaphore *sem); ++static noinline int __down_killable(struct compat_semaphore *sem); ++static noinline int __down_timeout(struct compat_semaphore *sem, long jiffies); ++static noinline void __up(struct compat_semaphore *sem); + + /** + * down - acquire the semaphore +@@ -50,7 +50,7 @@ static noinline void __up(struct semapho + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. + */ +-void down(struct semaphore *sem) ++void compat_down(struct compat_semaphore *sem) + { + unsigned long flags; + +@@ -61,7 +61,7 @@ void down(struct semaphore *sem) + __down(sem); + spin_unlock_irqrestore(&sem->lock, flags); + } +-EXPORT_SYMBOL(down); ++EXPORT_SYMBOL(compat_down); + + /** + * down_interruptible - acquire the semaphore unless interrupted +@@ -72,7 +72,7 @@ EXPORT_SYMBOL(down); + * If the sleep is interrupted by a signal, this function will return -EINTR. + * If the semaphore is successfully acquired, this function returns 0. + */ +-int down_interruptible(struct semaphore *sem) ++int compat_down_interruptible(struct compat_semaphore *sem) + { + unsigned long flags; + int result = 0; +@@ -86,7 +86,7 @@ int down_interruptible(struct semaphore + + return result; + } +-EXPORT_SYMBOL(down_interruptible); ++EXPORT_SYMBOL(compat_down_interruptible); + + /** + * down_killable - acquire the semaphore unless killed +@@ -98,7 +98,7 @@ EXPORT_SYMBOL(down_interruptible); + * -EINTR. If the semaphore is successfully acquired, this function returns + * 0. + */ +-int down_killable(struct semaphore *sem) ++int compat_down_killable(struct compat_semaphore *sem) + { + unsigned long flags; + int result = 0; +@@ -112,7 +112,7 @@ int down_killable(struct semaphore *sem) + + return result; + } +-EXPORT_SYMBOL(down_killable); ++EXPORT_SYMBOL(compat_down_killable); + + /** + * down_trylock - try to acquire the semaphore, without waiting +@@ -127,7 +127,7 @@ EXPORT_SYMBOL(down_killable); + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. 
+ */ +-int down_trylock(struct semaphore *sem) ++int compat_down_trylock(struct compat_semaphore *sem) + { + unsigned long flags; + int count; +@@ -140,7 +140,7 @@ int down_trylock(struct semaphore *sem) + + return (count < 0); + } +-EXPORT_SYMBOL(down_trylock); ++EXPORT_SYMBOL(compat_down_trylock); + + /** + * down_timeout - acquire the semaphore within a specified time +@@ -152,7 +152,7 @@ EXPORT_SYMBOL(down_trylock); + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME. It returns 0 if the semaphore was acquired. + */ +-int down_timeout(struct semaphore *sem, long jiffies) ++int compat_down_timeout(struct compat_semaphore *sem, long jiffies) + { + unsigned long flags; + int result = 0; +@@ -166,7 +166,7 @@ int down_timeout(struct semaphore *sem, + + return result; + } +-EXPORT_SYMBOL(down_timeout); ++EXPORT_SYMBOL(compat_down_timeout); + + /** + * up - release the semaphore +@@ -175,7 +175,7 @@ EXPORT_SYMBOL(down_timeout); + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ +-void up(struct semaphore *sem) ++void compat_up(struct compat_semaphore *sem) + { + unsigned long flags; + +@@ -186,7 +186,7 @@ void up(struct semaphore *sem) + __up(sem); + spin_unlock_irqrestore(&sem->lock, flags); + } +-EXPORT_SYMBOL(up); ++EXPORT_SYMBOL(compat_up); + + /* Functions for the contended case */ + +@@ -201,7 +201,7 @@ struct semaphore_waiter { + * constant, and thus optimised away by the compiler. Likewise the + * 'timeout' parameter for the cases without timeouts. + */ +-static inline int __sched __down_common(struct semaphore *sem, long state, ++static inline int __sched __down_common(struct compat_semaphore *sem, long state, + long timeout) + { + struct task_struct *task = current; +@@ -233,27 +233,27 @@ static inline int __sched __down_common( + return -EINTR; + } + +-static noinline void __sched __down(struct semaphore *sem) ++static noinline void __sched __down(struct compat_semaphore *sem) + { + __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + +-static noinline int __sched __down_interruptible(struct semaphore *sem) ++static noinline int __sched __down_interruptible(struct compat_semaphore *sem) + { + return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + +-static noinline int __sched __down_killable(struct semaphore *sem) ++static noinline int __sched __down_killable(struct compat_semaphore *sem) + { + return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); + } + +-static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) ++static noinline int __sched __down_timeout(struct compat_semaphore *sem, long jiffies) + { + return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); + } + +-static noinline void __sched __up(struct semaphore *sem) ++static noinline void __sched __up(struct compat_semaphore *sem) + { + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); +Index: linux-2.6-tip/kernel/spinlock.c +=================================================================== +--- linux-2.6-tip.orig/kernel/spinlock.c ++++ linux-2.6-tip/kernel/spinlock.c +@@ -21,7 +21,7 @@ + #include + #include + +-int __lockfunc _spin_trylock(spinlock_t *lock) ++int __lockfunc __spin_trylock(raw_spinlock_t *lock) + { + preempt_disable(); + if (_raw_spin_trylock(lock)) { +@@ -32,9 +32,46 @@ int __lockfunc _spin_trylock(spinlock_t + preempt_enable(); + return 0; 
+ } +-EXPORT_SYMBOL(_spin_trylock); ++EXPORT_SYMBOL(__spin_trylock); + +-int __lockfunc _read_trylock(rwlock_t *lock) ++int __lockfunc __spin_trylock_irq(raw_spinlock_t *lock) ++{ ++ local_irq_disable(); ++ preempt_disable(); ++ ++ if (_raw_spin_trylock(lock)) { ++ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ return 1; ++ } ++ ++ __preempt_enable_no_resched(); ++ local_irq_enable(); ++ preempt_check_resched(); ++ ++ return 0; ++} ++EXPORT_SYMBOL(__spin_trylock_irq); ++ ++int __lockfunc __spin_trylock_irqsave(raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ local_irq_save(*flags); ++ preempt_disable(); ++ ++ if (_raw_spin_trylock(lock)) { ++ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ return 1; ++ } ++ ++ __preempt_enable_no_resched(); ++ local_irq_restore(*flags); ++ preempt_check_resched(); ++ ++ return 0; ++} ++EXPORT_SYMBOL(__spin_trylock_irqsave); ++ ++int __lockfunc __read_trylock(raw_rwlock_t *lock) + { + preempt_disable(); + if (_raw_read_trylock(lock)) { +@@ -45,9 +82,9 @@ int __lockfunc _read_trylock(rwlock_t *l + preempt_enable(); + return 0; + } +-EXPORT_SYMBOL(_read_trylock); ++EXPORT_SYMBOL(__read_trylock); + +-int __lockfunc _write_trylock(rwlock_t *lock) ++int __lockfunc __write_trylock(raw_rwlock_t *lock) + { + preempt_disable(); + if (_raw_write_trylock(lock)) { +@@ -58,7 +95,21 @@ int __lockfunc _write_trylock(rwlock_t * + preempt_enable(); + return 0; + } +-EXPORT_SYMBOL(_write_trylock); ++EXPORT_SYMBOL(__write_trylock); ++ ++int __lockfunc __write_trylock_irqsave(raw_rwlock_t *lock, unsigned long *flags) ++{ ++ int ret; ++ ++ local_irq_save(*flags); ++ ret = __write_trylock(lock); ++ if (ret) ++ return ret; ++ ++ local_irq_restore(*flags); ++ return 0; ++} ++EXPORT_SYMBOL(__write_trylock_irqsave); + + /* + * If lockdep is enabled then we use the non-preemption spin-ops +@@ -67,15 +118,15 @@ EXPORT_SYMBOL(_write_trylock); + */ + #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) + +-void __lockfunc _read_lock(rwlock_t *lock) ++void __lockfunc __read_lock(raw_rwlock_t *lock) + { + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + } +-EXPORT_SYMBOL(_read_lock); ++EXPORT_SYMBOL(__read_lock); + +-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) ++unsigned long __lockfunc __spin_lock_irqsave(raw_spinlock_t *lock) + { + unsigned long flags; + +@@ -94,27 +145,27 @@ unsigned long __lockfunc _spin_lock_irqs + #endif + return flags; + } +-EXPORT_SYMBOL(_spin_lock_irqsave); ++EXPORT_SYMBOL(__spin_lock_irqsave); + +-void __lockfunc _spin_lock_irq(spinlock_t *lock) ++void __lockfunc __spin_lock_irq(raw_spinlock_t *lock) + { + local_irq_disable(); + preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + } +-EXPORT_SYMBOL(_spin_lock_irq); ++EXPORT_SYMBOL(__spin_lock_irq); + +-void __lockfunc _spin_lock_bh(spinlock_t *lock) ++void __lockfunc __spin_lock_bh(raw_spinlock_t *lock) + { + local_bh_disable(); + preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + } +-EXPORT_SYMBOL(_spin_lock_bh); ++EXPORT_SYMBOL(__spin_lock_bh); + +-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) ++unsigned long __lockfunc __read_lock_irqsave(raw_rwlock_t *lock) + { + unsigned long flags; + +@@ -124,27 +175,27 @@ unsigned long __lockfunc _read_lock_irqs + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + 
return flags; + } +-EXPORT_SYMBOL(_read_lock_irqsave); ++EXPORT_SYMBOL(__read_lock_irqsave); + +-void __lockfunc _read_lock_irq(rwlock_t *lock) ++void __lockfunc __read_lock_irq(raw_rwlock_t *lock) + { + local_irq_disable(); + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + } +-EXPORT_SYMBOL(_read_lock_irq); ++EXPORT_SYMBOL(__read_lock_irq); + +-void __lockfunc _read_lock_bh(rwlock_t *lock) ++void __lockfunc __read_lock_bh(raw_rwlock_t *lock) + { + local_bh_disable(); + preempt_disable(); + rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + } +-EXPORT_SYMBOL(_read_lock_bh); ++EXPORT_SYMBOL(__read_lock_bh); + +-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) ++unsigned long __lockfunc __write_lock_irqsave(raw_rwlock_t *lock) + { + unsigned long flags; + +@@ -154,43 +205,43 @@ unsigned long __lockfunc _write_lock_irq + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + return flags; + } +-EXPORT_SYMBOL(_write_lock_irqsave); ++EXPORT_SYMBOL(__write_lock_irqsave); + +-void __lockfunc _write_lock_irq(rwlock_t *lock) ++void __lockfunc __write_lock_irq(raw_rwlock_t *lock) + { + local_irq_disable(); + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + } +-EXPORT_SYMBOL(_write_lock_irq); ++EXPORT_SYMBOL(__write_lock_irq); + +-void __lockfunc _write_lock_bh(rwlock_t *lock) ++void __lockfunc __write_lock_bh(raw_rwlock_t *lock) + { + local_bh_disable(); + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + } +-EXPORT_SYMBOL(_write_lock_bh); ++EXPORT_SYMBOL(__write_lock_bh); + +-void __lockfunc _spin_lock(spinlock_t *lock) ++void __lockfunc __spin_lock(raw_spinlock_t *lock) + { + preempt_disable(); + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + } + +-EXPORT_SYMBOL(_spin_lock); ++EXPORT_SYMBOL(__spin_lock); + +-void __lockfunc _write_lock(rwlock_t *lock) ++void __lockfunc __write_lock(raw_rwlock_t *lock) + { + preempt_disable(); + rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); + } + +-EXPORT_SYMBOL(_write_lock); ++EXPORT_SYMBOL(__write_lock); + + #else /* CONFIG_PREEMPT: */ + +@@ -203,7 +254,7 @@ EXPORT_SYMBOL(_write_lock); + */ + + #define BUILD_LOCK_OPS(op, locktype) \ +-void __lockfunc _##op##_lock(locktype##_t *lock) \ ++void __lockfunc __##op##_lock(locktype##_t *lock) \ + { \ + for (;;) { \ + preempt_disable(); \ +@@ -213,15 +264,16 @@ void __lockfunc _##op##_lock(locktype##_ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ +- while (!op##_can_lock(lock) && (lock)->break_lock) \ +- _raw_##op##_relax(&lock->raw_lock); \ ++ while (!__raw_##op##_can_lock(&(lock)->raw_lock) && \ ++ (lock)->break_lock) \ ++ __raw_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ + } \ + \ +-EXPORT_SYMBOL(_##op##_lock); \ ++EXPORT_SYMBOL(__##op##_lock); \ + \ +-unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ ++unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ + { \ + unsigned long flags; \ + \ +@@ -235,23 +287,24 @@ unsigned long __lockfunc _##op##_lock_ir + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ +- while (!op##_can_lock(lock) && (lock)->break_lock) \ +- 
_raw_##op##_relax(&lock->raw_lock); \ ++ while (!__raw_##op##_can_lock(&(lock)->raw_lock) && \ ++ (lock)->break_lock) \ ++ __raw_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ + return flags; \ + } \ + \ +-EXPORT_SYMBOL(_##op##_lock_irqsave); \ ++EXPORT_SYMBOL(__##op##_lock_irqsave); \ + \ +-void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ ++void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ + { \ +- _##op##_lock_irqsave(lock); \ ++ __##op##_lock_irqsave(lock); \ + } \ + \ +-EXPORT_SYMBOL(_##op##_lock_irq); \ ++EXPORT_SYMBOL(__##op##_lock_irq); \ + \ +-void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ ++void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ + { \ + unsigned long flags; \ + \ +@@ -260,39 +313,48 @@ void __lockfunc _##op##_lock_bh(locktype + /* irq-disabling. We use the generic preemption-aware */ \ + /* function: */ \ + /**/ \ +- flags = _##op##_lock_irqsave(lock); \ ++ flags = __##op##_lock_irqsave(lock); \ + local_bh_disable(); \ + local_irq_restore(flags); \ + } \ + \ +-EXPORT_SYMBOL(_##op##_lock_bh) ++EXPORT_SYMBOL(__##op##_lock_bh) + + /* + * Build preemption-friendly versions of the following + * lock-spinning functions: + * +- * _[spin|read|write]_lock() +- * _[spin|read|write]_lock_irq() +- * _[spin|read|write]_lock_irqsave() +- * _[spin|read|write]_lock_bh() ++ * __[spin|read|write]_lock() ++ * __[spin|read|write]_lock_irq() ++ * __[spin|read|write]_lock_irqsave() ++ * __[spin|read|write]_lock_bh() + */ +-BUILD_LOCK_OPS(spin, spinlock); +-BUILD_LOCK_OPS(read, rwlock); +-BUILD_LOCK_OPS(write, rwlock); ++BUILD_LOCK_OPS(spin, raw_spinlock); ++BUILD_LOCK_OPS(read, raw_rwlock); ++BUILD_LOCK_OPS(write, raw_rwlock); + + #endif /* CONFIG_PREEMPT */ + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + +-void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) ++void __lockfunc __spin_lock_nested(raw_spinlock_t *lock, int subclass) + { + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + } +-EXPORT_SYMBOL(_spin_lock_nested); ++EXPORT_SYMBOL(__spin_lock_nested); ++ ++void __lockfunc __spin_lock_nest_lock(raw_spinlock_t *lock, ++ struct lockdep_map *nest_lock) ++{ ++ preempt_disable(); ++ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); ++ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); ++} ++EXPORT_SYMBOL(__spin_lock_nest_lock); + +-unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) ++unsigned long __lockfunc __spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) + { + unsigned long flags; + +@@ -311,125 +373,130 @@ unsigned long __lockfunc _spin_lock_irqs + #endif + return flags; + } +-EXPORT_SYMBOL(_spin_lock_irqsave_nested); +- +-void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, +- struct lockdep_map *nest_lock) +-{ +- preempt_disable(); +- spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); +- LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); +-} +-EXPORT_SYMBOL(_spin_lock_nest_lock); ++EXPORT_SYMBOL(__spin_lock_irqsave_nested); + + #endif + +-void __lockfunc _spin_unlock(spinlock_t *lock) ++void __lockfunc __spin_unlock(raw_spinlock_t *lock) + { + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); + preempt_enable(); + } +-EXPORT_SYMBOL(_spin_unlock); ++EXPORT_SYMBOL(__spin_unlock); + +-void __lockfunc _write_unlock(rwlock_t *lock) ++void __lockfunc __spin_unlock_no_resched(raw_spinlock_t *lock) ++{ ++ spin_release(&lock->dep_map, 1, _RET_IP_); ++ 
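BUILD_LOCK_OPS above uses preprocessor token pasting to stamp out one lock-spinning implementation per operation (spin, read, write) and per lock type. A minimal standalone illustration of that technique, with invented names, is:

/* generate a family of functions from one template via ## pasting */
#include <stdio.h>

#define BUILD_REPORT_OP(op)					\
static void report_##op(const char *name)			\
{								\
	printf("%s: generated %s operation\n", name, #op);	\
}

BUILD_REPORT_OP(read)
BUILD_REPORT_OP(write)

int main(void)
{
	report_read("inode");	/* expanded from BUILD_REPORT_OP(read) */
	report_write("inode");
	return 0;
}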
_raw_spin_unlock(lock); ++ __preempt_enable_no_resched(); ++} ++/* not exported */ ++ ++void __lockfunc __write_unlock(raw_rwlock_t *lock) + { + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); + preempt_enable(); + } +-EXPORT_SYMBOL(_write_unlock); ++EXPORT_SYMBOL(__write_unlock); + +-void __lockfunc _read_unlock(rwlock_t *lock) ++void __lockfunc __read_unlock(raw_rwlock_t *lock) + { + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); + preempt_enable(); + } +-EXPORT_SYMBOL(_read_unlock); ++EXPORT_SYMBOL(__read_unlock); + +-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) ++void __lockfunc __spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) + { + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); ++ __preempt_enable_no_resched(); + local_irq_restore(flags); +- preempt_enable(); ++ preempt_check_resched(); + } +-EXPORT_SYMBOL(_spin_unlock_irqrestore); ++EXPORT_SYMBOL(__spin_unlock_irqrestore); + +-void __lockfunc _spin_unlock_irq(spinlock_t *lock) ++void __lockfunc __spin_unlock_irq(raw_spinlock_t *lock) + { + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); ++ __preempt_enable_no_resched(); + local_irq_enable(); +- preempt_enable(); ++ preempt_check_resched(); + } +-EXPORT_SYMBOL(_spin_unlock_irq); ++EXPORT_SYMBOL(__spin_unlock_irq); + +-void __lockfunc _spin_unlock_bh(spinlock_t *lock) ++void __lockfunc __spin_unlock_bh(raw_spinlock_t *lock) + { + spin_release(&lock->dep_map, 1, _RET_IP_); + _raw_spin_unlock(lock); +- preempt_enable_no_resched(); ++ __preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + } +-EXPORT_SYMBOL(_spin_unlock_bh); ++EXPORT_SYMBOL(__spin_unlock_bh); + +-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) ++void __lockfunc __read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) + { + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); ++ __preempt_enable_no_resched(); + local_irq_restore(flags); +- preempt_enable(); ++ preempt_check_resched(); + } +-EXPORT_SYMBOL(_read_unlock_irqrestore); ++EXPORT_SYMBOL(__read_unlock_irqrestore); + +-void __lockfunc _read_unlock_irq(rwlock_t *lock) ++void __lockfunc __read_unlock_irq(raw_rwlock_t *lock) + { + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); ++ __preempt_enable_no_resched(); + local_irq_enable(); +- preempt_enable(); ++ preempt_check_resched(); + } +-EXPORT_SYMBOL(_read_unlock_irq); ++EXPORT_SYMBOL(__read_unlock_irq); + +-void __lockfunc _read_unlock_bh(rwlock_t *lock) ++void __lockfunc __read_unlock_bh(raw_rwlock_t *lock) + { + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_read_unlock(lock); +- preempt_enable_no_resched(); ++ __preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + } +-EXPORT_SYMBOL(_read_unlock_bh); ++EXPORT_SYMBOL(__read_unlock_bh); + +-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) ++void __lockfunc __write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) + { + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); ++ __preempt_enable_no_resched(); + local_irq_restore(flags); +- preempt_enable(); ++ preempt_check_resched(); + } +-EXPORT_SYMBOL(_write_unlock_irqrestore); ++EXPORT_SYMBOL(__write_unlock_irqrestore); + +-void __lockfunc _write_unlock_irq(rwlock_t *lock) ++void __lockfunc __write_unlock_irq(raw_rwlock_t *lock) + { + 
rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); ++ __preempt_enable_no_resched(); + local_irq_enable(); +- preempt_enable(); ++ preempt_check_resched(); + } +-EXPORT_SYMBOL(_write_unlock_irq); ++EXPORT_SYMBOL(__write_unlock_irq); + +-void __lockfunc _write_unlock_bh(rwlock_t *lock) ++void __lockfunc __write_unlock_bh(raw_rwlock_t *lock) + { + rwlock_release(&lock->dep_map, 1, _RET_IP_); + _raw_write_unlock(lock); +- preempt_enable_no_resched(); ++ __preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + } +-EXPORT_SYMBOL(_write_unlock_bh); ++EXPORT_SYMBOL(__write_unlock_bh); + +-int __lockfunc _spin_trylock_bh(spinlock_t *lock) ++int __lockfunc __spin_trylock_bh(raw_spinlock_t *lock) + { + local_bh_disable(); + preempt_disable(); +@@ -438,11 +505,11 @@ int __lockfunc _spin_trylock_bh(spinlock + return 1; + } + +- preempt_enable_no_resched(); ++ __preempt_enable_no_resched(); + local_bh_enable_ip((unsigned long)__builtin_return_address(0)); + return 0; + } +-EXPORT_SYMBOL(_spin_trylock_bh); ++EXPORT_SYMBOL(__spin_trylock_bh); + + notrace int in_lock_functions(unsigned long addr) + { +@@ -450,6 +517,17 @@ notrace int in_lock_functions(unsigned l + extern char __lock_text_start[], __lock_text_end[]; + + return addr >= (unsigned long)__lock_text_start +- && addr < (unsigned long)__lock_text_end; ++ && addr < (unsigned long)__lock_text_end; + } + EXPORT_SYMBOL(in_lock_functions); ++ ++void notrace __debug_atomic_dec_and_test(atomic_t *v) ++{ ++ static int warn_once = 1; ++ ++ if (!atomic_read(v) && warn_once) { ++ warn_once = 0; ++ printk("BUG: atomic counter underflow!\n"); ++ WARN_ON(1); ++ } ++} +Index: linux-2.6-tip/lib/dec_and_lock.c +=================================================================== +--- linux-2.6-tip.orig/lib/dec_and_lock.c ++++ linux-2.6-tip/lib/dec_and_lock.c +@@ -17,7 +17,7 @@ + * because the spin-lock and the decrement must be + * "atomic". + */ +-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) ++int __atomic_dec_and_spin_lock(raw_spinlock_t *lock, atomic_t *atomic) + { + #ifdef CONFIG_SMP + /* Subtract 1 from counter unless that drops it to 0 (ie. 
it was 1) */ +@@ -32,4 +32,4 @@ int _atomic_dec_and_lock(atomic_t *atomi + return 0; + } + +-EXPORT_SYMBOL(_atomic_dec_and_lock); ++EXPORT_SYMBOL(__atomic_dec_and_spin_lock); +Index: linux-2.6-tip/lib/plist.c +=================================================================== +--- linux-2.6-tip.orig/lib/plist.c ++++ linux-2.6-tip/lib/plist.c +@@ -54,7 +54,9 @@ static void plist_check_list(struct list + + static void plist_check_head(struct plist_head *head) + { ++#ifndef CONFIG_PREEMPT_RT + WARN_ON(!head->lock); ++#endif + if (head->lock) + WARN_ON_SMP(!spin_is_locked(head->lock)); + plist_check_list(&head->prio_list); +Index: linux-2.6-tip/lib/rwsem-spinlock.c +=================================================================== +--- linux-2.6-tip.orig/lib/rwsem-spinlock.c ++++ linux-2.6-tip/lib/rwsem-spinlock.c +@@ -20,7 +20,7 @@ struct rwsem_waiter { + /* + * initialise the semaphore + */ +-void __init_rwsem(struct rw_semaphore *sem, const char *name, ++void __compat_init_rwsem(struct compat_rw_semaphore *sem, const char *name, + struct lock_class_key *key) + { + #ifdef CONFIG_DEBUG_LOCK_ALLOC +@@ -44,8 +44,8 @@ void __init_rwsem(struct rw_semaphore *s + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if wakewrite is non-zero + */ +-static inline struct rw_semaphore * +-__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) ++static inline struct compat_rw_semaphore * ++__rwsem_do_wake(struct compat_rw_semaphore *sem, int wakewrite) + { + struct rwsem_waiter *waiter; + struct task_struct *tsk; +@@ -103,8 +103,8 @@ __rwsem_do_wake(struct rw_semaphore *sem + /* + * wake a single writer + */ +-static inline struct rw_semaphore * +-__rwsem_wake_one_writer(struct rw_semaphore *sem) ++static inline struct compat_rw_semaphore * ++__rwsem_wake_one_writer(struct compat_rw_semaphore *sem) + { + struct rwsem_waiter *waiter; + struct task_struct *tsk; +@@ -125,7 +125,7 @@ __rwsem_wake_one_writer(struct rw_semaph + /* + * get a read lock on the semaphore + */ +-void __sched __down_read(struct rw_semaphore *sem) ++void __sched __down_read(struct compat_rw_semaphore *sem) + { + struct rwsem_waiter waiter; + struct task_struct *tsk; +@@ -168,7 +168,7 @@ void __sched __down_read(struct rw_semap + /* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +-int __down_read_trylock(struct rw_semaphore *sem) ++int __down_read_trylock(struct compat_rw_semaphore *sem) + { + unsigned long flags; + int ret = 0; +@@ -191,7 +191,8 @@ int __down_read_trylock(struct rw_semaph + * get a write lock on the semaphore + * - we increment the waiting count anyway to indicate an exclusive lock + */ +-void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) ++void __sched ++__down_write_nested(struct compat_rw_semaphore *sem, int subclass) + { + struct rwsem_waiter waiter; + struct task_struct *tsk; +@@ -231,7 +232,7 @@ void __sched __down_write_nested(struct + ; + } + +-void __sched __down_write(struct rw_semaphore *sem) ++void __sched __down_write(struct compat_rw_semaphore *sem) + { + __down_write_nested(sem, 0); + } +@@ -239,7 +240,7 @@ void __sched __down_write(struct rw_sema + /* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +-int __down_write_trylock(struct rw_semaphore *sem) ++int __down_write_trylock(struct compat_rw_semaphore *sem) + { + unsigned long flags; + int ret = 0; +@@ -260,7 +261,7 @@ int __down_write_trylock(struct rw_semap + /* + * release a read lock on the semaphore + */ +-void 
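The lib/dec_and_lock.c hunk above only renames the function; the dec-and-lock idea itself is unchanged: decrement without the lock whenever the counter cannot reach zero, and take the lock only to serialise the final 1 -> 0 drop. An illustrative user-space sketch of that pattern (invented names, not part of the patch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

/* decrement *count unless it currently equals "unless"; true if decremented */
static bool dec_unless(atomic_int *count, int unless)
{
	int old = atomic_load(count);

	while (old != unless) {
		/* on failure, old is refreshed and we retry */
		if (atomic_compare_exchange_weak(count, &old, old - 1))
			return true;
	}
	return false;
}

/* returns true with the lock held iff the count dropped to zero */
static bool dec_and_lock(atomic_int *count, pthread_mutex_t *lock)
{
	if (dec_unless(count, 1))	/* fast path: zero is not reachable */
		return false;

	pthread_mutex_lock(lock);	/* slow path: serialise the 1 -> 0 drop */
	if (atomic_fetch_sub(count, 1) == 1)
		return true;
	pthread_mutex_unlock(lock);
	return false;
}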
__up_read(struct rw_semaphore *sem) ++void __up_read(struct compat_rw_semaphore *sem) + { + unsigned long flags; + +@@ -275,7 +276,7 @@ void __up_read(struct rw_semaphore *sem) + /* + * release a write lock on the semaphore + */ +-void __up_write(struct rw_semaphore *sem) ++void __up_write(struct compat_rw_semaphore *sem) + { + unsigned long flags; + +@@ -292,7 +293,7 @@ void __up_write(struct rw_semaphore *sem + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +-void __downgrade_write(struct rw_semaphore *sem) ++void __downgrade_write(struct compat_rw_semaphore *sem) + { + unsigned long flags; + +@@ -305,7 +306,7 @@ void __downgrade_write(struct rw_semapho + spin_unlock_irqrestore(&sem->wait_lock, flags); + } + +-EXPORT_SYMBOL(__init_rwsem); ++EXPORT_SYMBOL(__compat_init_rwsem); + EXPORT_SYMBOL(__down_read); + EXPORT_SYMBOL(__down_read_trylock); + EXPORT_SYMBOL(__down_write_nested); +Index: linux-2.6-tip/lib/rwsem.c +=================================================================== +--- linux-2.6-tip.orig/lib/rwsem.c ++++ linux-2.6-tip/lib/rwsem.c +@@ -11,8 +11,8 @@ + /* + * Initialize an rwsem: + */ +-void __init_rwsem(struct rw_semaphore *sem, const char *name, +- struct lock_class_key *key) ++void __compat_init_rwsem(struct rw_semaphore *sem, const char *name, ++ struct lock_class_key *key) + { + #ifdef CONFIG_DEBUG_LOCK_ALLOC + /* +@@ -26,7 +26,7 @@ void __init_rwsem(struct rw_semaphore *s + INIT_LIST_HEAD(&sem->wait_list); + } + +-EXPORT_SYMBOL(__init_rwsem); ++EXPORT_SYMBOL(__compat_init_rwsem); + + struct rwsem_waiter { + struct list_head list; +Index: linux-2.6-tip/lib/spinlock_debug.c +=================================================================== +--- linux-2.6-tip.orig/lib/spinlock_debug.c ++++ linux-2.6-tip/lib/spinlock_debug.c +@@ -13,8 +13,8 @@ + #include + #include + +-void __spin_lock_init(spinlock_t *lock, const char *name, +- struct lock_class_key *key) ++void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, ++ struct lock_class_key *key) + { + #ifdef CONFIG_DEBUG_LOCK_ALLOC + /* +@@ -23,16 +23,16 @@ void __spin_lock_init(spinlock_t *lock, + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); + #endif +- lock->raw_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; ++ lock->raw_lock = (__raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + lock->magic = SPINLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; + } + +-EXPORT_SYMBOL(__spin_lock_init); ++EXPORT_SYMBOL(__raw_spin_lock_init); + +-void __rwlock_init(rwlock_t *lock, const char *name, +- struct lock_class_key *key) ++void __raw_rwlock_init(raw_rwlock_t *lock, const char *name, ++ struct lock_class_key *key) + { + #ifdef CONFIG_DEBUG_LOCK_ALLOC + /* +@@ -41,15 +41,15 @@ void __rwlock_init(rwlock_t *lock, const + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); + #endif +- lock->raw_lock = (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; ++ lock->raw_lock = (__raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED; + lock->magic = RWLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; + } + +-EXPORT_SYMBOL(__rwlock_init); ++EXPORT_SYMBOL(__raw_rwlock_init); + +-static void spin_bug(spinlock_t *lock, const char *msg) ++static void spin_bug(raw_spinlock_t *lock, const char *msg) + { + struct task_struct *owner = NULL; + +@@ -73,7 +73,7 @@ static void spin_bug(spinlock_t *lock, c + #define SPIN_BUG_ON(cond, lock, msg) if 
(unlikely(cond)) spin_bug(lock, msg) + + static inline void +-debug_spin_lock_before(spinlock_t *lock) ++debug_spin_lock_before(raw_spinlock_t *lock) + { + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(lock->owner == current, lock, "recursion"); +@@ -81,13 +81,13 @@ debug_spin_lock_before(spinlock_t *lock) + lock, "cpu recursion"); + } + +-static inline void debug_spin_lock_after(spinlock_t *lock) ++static inline void debug_spin_lock_after(raw_spinlock_t *lock) + { + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; + } + +-static inline void debug_spin_unlock(spinlock_t *lock) ++static inline void debug_spin_unlock(raw_spinlock_t *lock) + { + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked"); +@@ -98,7 +98,7 @@ static inline void debug_spin_unlock(spi + lock->owner_cpu = -1; + } + +-static void __spin_lock_debug(spinlock_t *lock) ++static void __spin_lock_debug(raw_spinlock_t *lock) + { + u64 i; + u64 loops = loops_per_jiffy * HZ; +@@ -125,7 +125,7 @@ static void __spin_lock_debug(spinlock_t + } + } + +-void _raw_spin_lock(spinlock_t *lock) ++void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) + { + debug_spin_lock_before(lock); + if (unlikely(!__raw_spin_trylock(&lock->raw_lock))) +@@ -133,7 +133,7 @@ void _raw_spin_lock(spinlock_t *lock) + debug_spin_lock_after(lock); + } + +-int _raw_spin_trylock(spinlock_t *lock) ++int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) + { + int ret = __raw_spin_trylock(&lock->raw_lock); + +@@ -148,13 +148,13 @@ int _raw_spin_trylock(spinlock_t *lock) + return ret; + } + +-void _raw_spin_unlock(spinlock_t *lock) ++void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) + { + debug_spin_unlock(lock); + __raw_spin_unlock(&lock->raw_lock); + } + +-static void rwlock_bug(rwlock_t *lock, const char *msg) ++static void rwlock_bug(raw_rwlock_t *lock, const char *msg) + { + if (!debug_locks_off()) + return; +@@ -167,8 +167,8 @@ static void rwlock_bug(rwlock_t *lock, c + + #define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) + +-#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ +-static void __read_lock_debug(rwlock_t *lock) ++#if 1 /* __write_lock_debug() can lock up - maybe this can too? 
*/ ++static void __raw_read_lock_debug(raw_rwlock_t *lock) + { + u64 i; + u64 loops = loops_per_jiffy * HZ; +@@ -193,13 +193,13 @@ static void __read_lock_debug(rwlock_t * + } + #endif + +-void _raw_read_lock(rwlock_t *lock) ++void __lockfunc _raw_read_lock(raw_rwlock_t *lock) + { + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +- __raw_read_lock(&lock->raw_lock); ++ __raw_read_lock_debug(lock); + } + +-int _raw_read_trylock(rwlock_t *lock) ++int __lockfunc _raw_read_trylock(raw_rwlock_t *lock) + { + int ret = __raw_read_trylock(&lock->raw_lock); + +@@ -212,13 +212,13 @@ int _raw_read_trylock(rwlock_t *lock) + return ret; + } + +-void _raw_read_unlock(rwlock_t *lock) ++void __lockfunc _raw_read_unlock(raw_rwlock_t *lock) + { + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + __raw_read_unlock(&lock->raw_lock); + } + +-static inline void debug_write_lock_before(rwlock_t *lock) ++static inline void debug_write_lock_before(raw_rwlock_t *lock) + { + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); +@@ -226,13 +226,13 @@ static inline void debug_write_lock_befo + lock, "cpu recursion"); + } + +-static inline void debug_write_lock_after(rwlock_t *lock) ++static inline void debug_write_lock_after(raw_rwlock_t *lock) + { + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; + } + +-static inline void debug_write_unlock(rwlock_t *lock) ++static inline void debug_write_unlock(raw_rwlock_t *lock) + { + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); +@@ -242,8 +242,8 @@ static inline void debug_write_unlock(rw + lock->owner_cpu = -1; + } + +-#if 0 /* This can cause lockups */ +-static void __write_lock_debug(rwlock_t *lock) ++#if 1 /* This can cause lockups */ ++static void __raw_write_lock_debug(raw_rwlock_t *lock) + { + u64 i; + u64 loops = loops_per_jiffy * HZ; +@@ -268,14 +268,14 @@ static void __write_lock_debug(rwlock_t + } + #endif + +-void _raw_write_lock(rwlock_t *lock) ++void __lockfunc _raw_write_lock(raw_rwlock_t *lock) + { + debug_write_lock_before(lock); +- __raw_write_lock(&lock->raw_lock); ++ __raw_write_lock_debug(lock); + debug_write_lock_after(lock); + } + +-int _raw_write_trylock(rwlock_t *lock) ++int __lockfunc _raw_write_trylock(raw_rwlock_t *lock) + { + int ret = __raw_write_trylock(&lock->raw_lock); + +@@ -290,7 +290,7 @@ int _raw_write_trylock(rwlock_t *lock) + return ret; + } + +-void _raw_write_unlock(rwlock_t *lock) ++void __lockfunc _raw_write_unlock(raw_rwlock_t *lock) + { + debug_write_unlock(lock); + __raw_write_unlock(&lock->raw_lock); +Index: linux-2.6-tip/drivers/media/dvb/dvb-core/dvb_frontend.c +=================================================================== +--- linux-2.6-tip.orig/drivers/media/dvb/dvb-core/dvb_frontend.c ++++ linux-2.6-tip/drivers/media/dvb/dvb-core/dvb_frontend.c +@@ -101,7 +101,7 @@ struct dvb_frontend_private { + struct dvb_device *dvbdev; + struct dvb_frontend_parameters parameters; + struct dvb_fe_events events; +- struct semaphore sem; ++ struct compat_semaphore sem; + struct list_head list_head; + wait_queue_head_t wait_queue; + struct task_struct *thread; +Index: linux-2.6-tip/drivers/net/3c527.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/3c527.c ++++ linux-2.6-tip/drivers/net/3c527.c +@@ -181,7 +181,7 @@ struct mc32_local + + u16 rx_ring_tail; /* index to rx de-queue end */ + 
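The debug lock paths above (now compiled in via #if 1) spin on the raw trylock and complain when the lock does not come free within a bounded number of iterations. A rough user-space rendition of that loop, with an arbitrary bound standing in for loops_per_jiffy * HZ and invented names:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define MODEL_LOCKUP_LOOPS (1UL << 24)	/* arbitrary; not loops_per_jiffy * HZ */

static void debug_lock_with_warning(pthread_mutex_t *lock)
{
	unsigned long i;
	int printed = 0;

	for (;;) {
		for (i = 0; i < MODEL_LOCKUP_LOOPS; i++) {
			if (pthread_mutex_trylock(lock) == 0)
				return;		/* got it */
			sched_yield();		/* rough cpu_relax() stand-in */
		}
		if (!printed) {
			fprintf(stderr, "BUG: lock held for too long, still waiting\n");
			printed = 1;
		}
	}
}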
+- struct semaphore cmd_mutex; /* Serialises issuing of execute commands */ ++ struct compat_semaphore cmd_mutex; /* Serialises issuing of execute commands */ + struct completion execution_cmd; /* Card has completed an execute command */ + struct completion xceiver_cmd; /* Card has completed a tx or rx command */ + }; +Index: linux-2.6-tip/drivers/net/hamradio/6pack.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/hamradio/6pack.c ++++ linux-2.6-tip/drivers/net/hamradio/6pack.c +@@ -120,7 +120,7 @@ struct sixpack { + struct timer_list tx_t; + struct timer_list resync_t; + atomic_t refcnt; +- struct semaphore dead_sem; ++ struct compat_semaphore dead_sem; + spinlock_t lock; + }; + +Index: linux-2.6-tip/drivers/net/hamradio/mkiss.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/hamradio/mkiss.c ++++ linux-2.6-tip/drivers/net/hamradio/mkiss.c +@@ -84,7 +84,7 @@ struct mkiss { + #define CRC_MODE_SMACK_TEST 4 + + atomic_t refcnt; +- struct semaphore dead_sem; ++ struct compat_semaphore dead_sem; + }; + + /*---------------------------------------------------------------------------*/ +Index: linux-2.6-tip/drivers/net/ppp_async.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/ppp_async.c ++++ linux-2.6-tip/drivers/net/ppp_async.c +@@ -67,7 +67,7 @@ struct asyncppp { + struct tasklet_struct tsk; + + atomic_t refcnt; +- struct semaphore dead_sem; ++ struct compat_semaphore dead_sem; + struct ppp_channel chan; /* interface to generic ppp layer */ + unsigned char obuf[OBUFSIZE]; + }; +Index: linux-2.6-tip/drivers/pci/hotplug/ibmphp_hpc.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/hotplug/ibmphp_hpc.c ++++ linux-2.6-tip/drivers/pci/hotplug/ibmphp_hpc.c +@@ -104,7 +104,7 @@ static int to_debug = 0; + static struct mutex sem_hpcaccess; // lock access to HPC + static struct semaphore semOperations; // lock all operations and + // access to data structures +-static struct semaphore sem_exit; // make sure polling thread goes away ++static struct compat_semaphore sem_exit; // make sure polling thread goes away + static struct task_struct *ibmphp_poll_thread; + //---------------------------------------------------------------------------- + // local function prototypes +Index: linux-2.6-tip/drivers/scsi/aacraid/aacraid.h +=================================================================== +--- linux-2.6-tip.orig/drivers/scsi/aacraid/aacraid.h ++++ linux-2.6-tip/drivers/scsi/aacraid/aacraid.h +@@ -719,7 +719,7 @@ struct aac_fib_context { + u32 unique; // unique value representing this context + ulong jiffies; // used for cleanup - dmb changed to ulong + struct list_head next; // used to link context's into a linked list +- struct semaphore wait_sem; // this is used to wait for the next fib to arrive. ++ struct compat_semaphore wait_sem; // this is used to wait for the next fib to arrive. + int wait; // Set to true when thread is in WaitForSingleObject + unsigned long count; // total number of FIBs on FibList + struct list_head fib_list; // this holds fibs and their attachd hw_fibs +@@ -789,7 +789,7 @@ struct fib { + * This is the event the sendfib routine will wait on if the + * caller did not pass one and this is synch io. 
+ */ +- struct semaphore event_wait; ++ struct compat_semaphore event_wait; + spinlock_t event_lock; + + u32 done; /* gets set to 1 when fib is complete */ +Index: linux-2.6-tip/include/linux/parport.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/parport.h ++++ linux-2.6-tip/include/linux/parport.h +@@ -264,7 +264,7 @@ enum ieee1284_phase { + struct ieee1284_info { + int mode; + volatile enum ieee1284_phase phase; +- struct semaphore irq; ++ struct compat_semaphore irq; + }; + + /* A parallel port */ +Index: linux-2.6-tip/include/asm-generic/tlb.h +=================================================================== +--- linux-2.6-tip.orig/include/asm-generic/tlb.h ++++ linux-2.6-tip/include/asm-generic/tlb.h +@@ -22,14 +22,8 @@ + * and page free order so much.. + */ + #ifdef CONFIG_SMP +- #ifdef ARCH_FREE_PTR_NR +- #define FREE_PTR_NR ARCH_FREE_PTR_NR +- #else +- #define FREE_PTE_NR 506 +- #endif + #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U) + #else +- #define FREE_PTE_NR 1 + #define tlb_fast_mode(tlb) 1 + #endif + +@@ -39,30 +33,48 @@ + struct mmu_gather { + struct mm_struct *mm; + unsigned int nr; /* set to ~0U means fast mode */ ++ unsigned int max; /* nr < max */ + unsigned int need_flush;/* Really unmapped some ptes? */ + unsigned int fullmm; /* non-zero means full mm flush */ +- struct page * pages[FREE_PTE_NR]; ++#ifdef HAVE_ARCH_MMU_GATHER ++ struct arch_mmu_gather arch; ++#endif ++ struct page ** pages; ++ struct page * local[8]; + }; + +-/* Users of the generic TLB shootdown code must declare this storage space. */ +-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); ++static inline void __tlb_alloc_pages(struct mmu_gather *tlb) ++{ ++ unsigned long addr = __get_free_pages(GFP_ATOMIC, 0); ++ ++ if (addr) { ++ tlb->pages = (void *)addr; ++ tlb->max = PAGE_SIZE / sizeof(struct page *); ++ } ++} + + /* tlb_gather_mmu + * Return a pointer to an initialized struct mmu_gather. + */ +-static inline struct mmu_gather * +-tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) ++static inline void ++tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) + { +- struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); +- + tlb->mm = mm; + +- /* Use fast mode if only one CPU is online */ +- tlb->nr = num_online_cpus() > 1 ? 
0U : ~0U; ++ tlb->max = ARRAY_SIZE(tlb->local); ++ tlb->pages = tlb->local; ++ ++ if (num_online_cpus() > 1) { ++ tlb->nr = 0; ++ __tlb_alloc_pages(tlb); ++ } else /* Use fast mode if only one CPU is online */ ++ tlb->nr = ~0U; + + tlb->fullmm = full_mm_flush; + +- return tlb; ++#ifdef HAVE_ARCH_MMU_GATHER ++ tlb->arch = ARCH_MMU_GATHER_INIT; ++#endif + } + + static inline void +@@ -75,6 +87,8 @@ tlb_flush_mmu(struct mmu_gather *tlb, un + if (!tlb_fast_mode(tlb)) { + free_pages_and_swap_cache(tlb->pages, tlb->nr); + tlb->nr = 0; ++ if (tlb->pages == tlb->local) ++ __tlb_alloc_pages(tlb); + } + } + +@@ -90,7 +104,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, u + /* keep the page table cache within bounds */ + check_pgt_cache(); + +- put_cpu_var(mmu_gathers); ++ if (tlb->pages != tlb->local) ++ free_pages((unsigned long)tlb->pages, 0); + } + + /* tlb_remove_page +@@ -106,7 +121,7 @@ static inline void tlb_remove_page(struc + return; + } + tlb->pages[tlb->nr++] = page; +- if (tlb->nr >= FREE_PTE_NR) ++ if (tlb->nr >= tlb->max) + tlb_flush_mmu(tlb, 0, 0); + } + +Index: linux-2.6-tip/mm/swap.c +=================================================================== +--- linux-2.6-tip.orig/mm/swap.c ++++ linux-2.6-tip/mm/swap.c +@@ -30,14 +30,49 @@ + #include + #include + #include ++#include + + #include "internal.h" + + /* How many pages do we try to swap or page in/out together? */ + int page_cluster; + +-static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +-static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); ++/* ++ * On PREEMPT_RT we don't want to disable preemption for cpu variables. ++ * We grab a cpu and then use that cpu to lock the variables accordingly. ++ * ++ * (On !PREEMPT_RT this turns into normal preempt-off sections, as before.) 
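The mmu_gather rework above drops the fixed FREE_PTE_NR per-CPU array in favour of a descriptor that tries to allocate a page's worth of page pointers and falls back to a small local[] array when that allocation fails. The same buffering idea, reduced to an illustrative user-space sketch with invented names:

#include <stdlib.h>

struct gather {
	void **items;		/* points at local[] or at a heap block */
	unsigned int nr;	/* filled slots */
	unsigned int max;	/* capacity of items[] */
	void *local[8];		/* always-available fallback */
};

static void gather_init(struct gather *g)
{
	g->nr = 0;
	g->items = g->local;
	g->max = sizeof(g->local) / sizeof(g->local[0]);

	/* best effort: a bigger buffer only means fewer flushes */
	void **big = malloc(4096);
	if (big) {
		g->items = big;
		g->max = 4096 / sizeof(void *);
	}
}

/* returns non-zero when the caller must flush and reset nr */
static int gather_add(struct gather *g, void *item)
{
	g->items[g->nr++] = item;
	return g->nr == g->max;
}

static void gather_finish(struct gather *g)
{
	if (g->items != g->local)
		free(g->items);
}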
++ */ ++static DEFINE_PER_CPU_LOCKED(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); ++static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_rotate_pvecs); ++ ++#define swap_get_cpu_var_irq_save(var, flags, cpu) \ ++ ({ \ ++ (void)flags; \ ++ &get_cpu_var_locked(var, &cpu); \ ++ }) ++ ++#define swap_put_cpu_var_irq_restore(var, flags, cpu) \ ++ put_cpu_var_locked(var, cpu) ++ ++#define swap_get_cpu_var(var, cpu) \ ++ &get_cpu_var_locked(var, &cpu) ++ ++#define swap_put_cpu_var(var, cpu) \ ++ put_cpu_var_locked(var, cpu) ++ ++#define swap_per_cpu_lock(var, cpu) \ ++ ({ \ ++ spin_lock(&__get_cpu_lock(var, cpu)); \ ++ &__get_cpu_var_locked(var, cpu); \ ++ }) ++ ++#define swap_per_cpu_unlock(var, cpu) \ ++ spin_unlock(&__get_cpu_lock(var, cpu)); ++ ++#define swap_get_cpu() raw_smp_processor_id() ++ ++#define swap_put_cpu() + + /* + * This path almost never happens for VM activity - pages are normally +@@ -141,13 +176,13 @@ void rotate_reclaimable_page(struct pag + !PageUnevictable(page) && PageLRU(page)) { + struct pagevec *pvec; + unsigned long flags; ++ int cpu; + + page_cache_get(page); +- local_irq_save(flags); +- pvec = &__get_cpu_var(lru_rotate_pvecs); ++ pvec = swap_get_cpu_var_irq_save(lru_rotate_pvecs, flags, cpu); + if (!pagevec_add(pvec, page)) + pagevec_move_tail(pvec); +- local_irq_restore(flags); ++ swap_put_cpu_var_irq_restore(lru_rotate_pvecs, flags, cpu); + } + } + +@@ -216,12 +251,14 @@ EXPORT_SYMBOL(mark_page_accessed); + + void __lru_cache_add(struct page *page, enum lru_list lru) + { +- struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; ++ struct pagevec *pvec; ++ int cpu; + ++ pvec = swap_get_cpu_var(lru_add_pvecs, cpu)[lru]; + page_cache_get(page); + if (!pagevec_add(pvec, page)) + ____pagevec_lru_add(pvec, lru); +- put_cpu_var(lru_add_pvecs); ++ swap_put_cpu_var(lru_add_pvecs, cpu); + } + + /** +@@ -271,31 +308,36 @@ void add_page_to_unevictable_list(struct + */ + static void drain_cpu_pagevecs(int cpu) + { +- struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); +- struct pagevec *pvec; ++ struct pagevec *pvecs, *pvec; + int lru; + ++ pvecs = swap_per_cpu_lock(lru_add_pvecs, cpu)[0]; + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + if (pagevec_count(pvec)) + ____pagevec_lru_add(pvec, lru); + } ++ swap_per_cpu_unlock(lru_add_pvecs, cpu); + +- pvec = &per_cpu(lru_rotate_pvecs, cpu); ++ pvec = swap_per_cpu_lock(lru_rotate_pvecs, cpu); + if (pagevec_count(pvec)) { + unsigned long flags; + + /* No harm done if a racing interrupt already did this */ +- local_irq_save(flags); ++ local_irq_save_nort(flags); + pagevec_move_tail(pvec); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } ++ swap_per_cpu_unlock(lru_rotate_pvecs, cpu); + } + + void lru_add_drain(void) + { +- drain_cpu_pagevecs(get_cpu()); +- put_cpu(); ++ int cpu; ++ ++ cpu = swap_get_cpu(); ++ drain_cpu_pagevecs(cpu); ++ swap_put_cpu(); + } + + static void lru_add_drain_per_cpu(struct work_struct *dummy) +@@ -369,7 +411,7 @@ void release_pages(struct page **pages, + } + __pagevec_free(&pages_to_free); + pagevec_reinit(&pages_to_free); +- } ++ } + } + if (zone) + spin_unlock_irqrestore(&zone->lru_lock, flags); +Index: linux-2.6-tip/net/ipv4/netfilter/arp_tables.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv4/netfilter/arp_tables.c ++++ linux-2.6-tip/net/ipv4/netfilter/arp_tables.c +@@ -239,7 +239,7 @@ unsigned int arpt_do_table(struct sk_buf + + read_lock_bh(&table->lock); + private = table->private; +- table_base = (void 
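The mm/swap.c conversion above swaps preempt-disable protection of the per-CPU pagevecs for per-CPU locks (the get_cpu_var_locked()/put_cpu_var_locked() wrappers), so a holder that gets preempted cannot corrupt the data and another CPU can drain a remote slot safely. An illustrative user-space analogue of that locked-slot idea, with an invented slot count and names:

#include <pthread.h>

#define NR_SLOTS 4

struct pagevec_slot {
	pthread_mutex_t lock;	/* one lock per slot, instead of preempt-disable */
	int nr_pages;		/* stands in for the real pagevec */
};

static struct pagevec_slot slots[NR_SLOTS];

static void slots_init(void)
{
	for (int i = 0; i < NR_SLOTS; i++)
		pthread_mutex_init(&slots[i].lock, NULL);
}

/* get_cpu_var_locked() analogue: lock slot "cpu" and hand it out */
static struct pagevec_slot *slot_get(int cpu)
{
	pthread_mutex_lock(&slots[cpu].lock);
	return &slots[cpu];
}

/* put_cpu_var_locked() analogue */
static void slot_put(int cpu)
{
	pthread_mutex_unlock(&slots[cpu].lock);
}

static void add_page(int cpu)
{
	struct pagevec_slot *s = slot_get(cpu);
	s->nr_pages++;		/* safe even if "preempted" while holding the lock */
	slot_put(cpu);
}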
*)private->entries[smp_processor_id()]; ++ table_base = (void *)private->entries[raw_smp_processor_id()]; + e = get_entry(table_base, private->hook_entry[hook]); + back = get_entry(table_base, private->underflow[hook]); + +@@ -1159,7 +1159,7 @@ static int do_add_counters(struct net *n + + i = 0; + /* Choose the copy that is on our node */ +- loc_cpu_entry = private->entries[smp_processor_id()]; ++ loc_cpu_entry = private->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, + private->size, + add_counter_to_entry, +Index: linux-2.6-tip/net/ipv4/netfilter/ip_tables.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv4/netfilter/ip_tables.c ++++ linux-2.6-tip/net/ipv4/netfilter/ip_tables.c +@@ -350,7 +350,7 @@ ipt_do_table(struct sk_buff *skb, + read_lock_bh(&table->lock); + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + private = table->private; +- table_base = (void *)private->entries[smp_processor_id()]; ++ table_base = (void *)private->entries[raw_smp_processor_id()]; + e = get_entry(table_base, private->hook_entry[hook]); + + /* For return from builtin chain */ +Index: linux-2.6-tip/net/core/dev.c +=================================================================== +--- linux-2.6-tip.orig/net/core/dev.c ++++ linux-2.6-tip/net/core/dev.c +@@ -1878,42 +1878,52 @@ gso: + Check this and shot the lock. It is not prone from deadlocks. + Either shot noqueue qdisc, it is even simpler 8) + */ +- if (dev->flags & IFF_UP) { +- int cpu = smp_processor_id(); /* ok because BHs are off */ ++ if (!(dev->flags & IFF_UP)) ++ goto err; + +- if (txq->xmit_lock_owner != cpu) { ++ /* Recursion is detected! It is possible, unfortunately: */ ++ if (netif_tx_lock_recursion(txq)) ++ goto err_recursion; + +- HARD_TX_LOCK(dev, txq, cpu); ++ HARD_TX_LOCK(dev, txq); + +- if (!netif_tx_queue_stopped(txq)) { +- rc = 0; +- if (!dev_hard_start_xmit(skb, dev, txq)) { +- HARD_TX_UNLOCK(dev, txq); +- goto out; +- } +- } +- HARD_TX_UNLOCK(dev, txq); +- if (net_ratelimit()) +- printk(KERN_CRIT "Virtual device %s asks to " +- "queue packet!\n", dev->name); +- } else { +- /* Recursion is detected! 
It is possible, +- * unfortunately */ +- if (net_ratelimit()) +- printk(KERN_CRIT "Dead loop on virtual device " +- "%s, fix it urgently!\n", dev->name); +- } ++ if (netif_tx_queue_stopped(txq)) ++ goto err_tx_unlock; ++ ++ if (dev_hard_start_xmit(skb, dev, txq)) ++ goto err_tx_unlock; ++ ++ rc = 0; ++ HARD_TX_UNLOCK(dev, txq); ++ ++out: ++ rcu_read_unlock_bh(); ++ return rc; ++ ++err_recursion: ++ if (net_ratelimit()) { ++ printk(KERN_CRIT ++ "Dead loop on virtual device %s, fix it urgently!\n", ++ dev->name); ++ } ++ goto err; ++ ++err_tx_unlock: ++ HARD_TX_UNLOCK(dev, txq); ++ ++ if (net_ratelimit()) { ++ printk(KERN_CRIT "Virtual device %s asks to queue packet!\n", ++ dev->name); + } ++ /* Fall through: */ + ++err: + rc = -ENETDOWN; + rcu_read_unlock_bh(); + + out_kfree_skb: + kfree_skb(skb); + return rc; +-out: +- rcu_read_unlock_bh(); +- return rc; + } + + +@@ -1986,8 +1996,8 @@ int netif_rx_ni(struct sk_buff *skb) + { + int err; + +- preempt_disable(); + err = netif_rx(skb); ++ preempt_disable(); + if (local_softirq_pending()) + do_softirq(); + preempt_enable(); +@@ -1999,7 +2009,8 @@ EXPORT_SYMBOL(netif_rx_ni); + + static void net_tx_action(struct softirq_action *h) + { +- struct softnet_data *sd = &__get_cpu_var(softnet_data); ++ struct softnet_data *sd = &per_cpu(softnet_data, ++ raw_smp_processor_id()); + + if (sd->completion_queue) { + struct sk_buff *clist; +@@ -2015,6 +2026,11 @@ static void net_tx_action(struct softirq + + WARN_ON(atomic_read(&skb->users)); + __kfree_skb(skb); ++ /* ++ * Safe to reschedule - the list is private ++ * at this point. ++ */ ++ cond_resched_softirq_context(); + } + } + +@@ -2033,6 +2049,22 @@ static void net_tx_action(struct softirq + head = head->next_sched; + + root_lock = qdisc_lock(q); ++ /* ++ * We are executing in softirq context here, and ++ * if softirqs are preemptible, we must avoid ++ * infinite reactivation of the softirq by ++ * either the tx handler, or by netif_schedule(). ++ * (it would result in an infinitely looping ++ * softirq context) ++ * So we take the spinlock unconditionally. 
++ */ ++#ifdef CONFIG_PREEMPT_SOFTIRQS ++ spin_lock(root_lock); ++ smp_mb__before_clear_bit(); ++ clear_bit(__QDISC_STATE_SCHED, &q->state); ++ qdisc_run(q); ++ spin_unlock(root_lock); ++#else + if (spin_trylock(root_lock)) { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, +@@ -2049,6 +2081,7 @@ static void net_tx_action(struct softirq + &q->state); + } + } ++#endif + } + } + } +@@ -2257,7 +2290,7 @@ int netif_receive_skb(struct sk_buff *sk + skb->dev = orig_dev->master; + } + +- __get_cpu_var(netdev_rx_stat).total++; ++ per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); +@@ -2578,9 +2611,10 @@ EXPORT_SYMBOL(napi_gro_frags); + static int process_backlog(struct napi_struct *napi, int quota) + { + int work = 0; +- struct softnet_data *queue = &__get_cpu_var(softnet_data); ++ struct softnet_data *queue; + unsigned long start_time = jiffies; + ++ queue = &per_cpu(softnet_data, raw_smp_processor_id()); + napi->weight = weight_p; + do { + struct sk_buff *skb; +@@ -2612,7 +2646,7 @@ void __napi_schedule(struct napi_struct + + local_irq_save(flags); + list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); +- __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ raise_softirq_irqoff(NET_RX_SOFTIRQ); + local_irq_restore(flags); + } + EXPORT_SYMBOL(__napi_schedule); +@@ -2760,7 +2794,7 @@ out: + + softnet_break: + __get_cpu_var(netdev_rx_stat).time_squeeze++; +- __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ raise_softirq_irqoff(NET_RX_SOFTIRQ); + goto out; + } + +@@ -4231,7 +4265,7 @@ static void __netdev_init_queue_locks_on + { + spin_lock_init(&dev_queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); +- dev_queue->xmit_lock_owner = -1; ++ dev_queue->xmit_lock_owner = (void *)-1; + } + + static void netdev_init_queue_locks(struct net_device *dev) +Index: linux-2.6-tip/fs/buffer.c +=================================================================== +--- linux-2.6-tip.orig/fs/buffer.c ++++ linux-2.6-tip/fs/buffer.c +@@ -40,7 +40,6 @@ + #include + #include + #include +-#include + + static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); + +@@ -469,8 +468,7 @@ static void end_buffer_async_read(struct + * decide that the page is now completely done. 
+ */ + first = page_buffers(page); +- local_irq_save(flags); +- bit_spin_lock(BH_Uptodate_Lock, &first->b_state); ++ spin_lock_irqsave(&first->b_uptodate_lock, flags); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; +@@ -483,8 +481,7 @@ static void end_buffer_async_read(struct + } + tmp = tmp->b_this_page; + } while (tmp != bh); +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + + /* + * If none of the buffers had errors and they are all +@@ -496,8 +493,7 @@ static void end_buffer_async_read(struct + return; + + still_busy: +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + return; + } + +@@ -532,8 +528,7 @@ static void end_buffer_async_write(struc + } + + first = page_buffers(page); +- local_irq_save(flags); +- bit_spin_lock(BH_Uptodate_Lock, &first->b_state); ++ spin_lock_irqsave(&first->b_uptodate_lock, flags); + + clear_buffer_async_write(bh); + unlock_buffer(bh); +@@ -545,14 +540,12 @@ static void end_buffer_async_write(struc + } + tmp = tmp->b_this_page; + } +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + end_page_writeback(page); + return; + + still_busy: +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + return; + } + +@@ -3311,6 +3304,8 @@ struct buffer_head *alloc_buffer_head(gf + struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); + if (ret) { + INIT_LIST_HEAD(&ret->b_assoc_buffers); ++ spin_lock_init(&ret->b_uptodate_lock); ++ spin_lock_init(&ret->b_state_lock); + get_cpu_var(bh_accounting).nr++; + recalc_bh_state(); + put_cpu_var(bh_accounting); +@@ -3322,6 +3317,8 @@ EXPORT_SYMBOL(alloc_buffer_head); + void free_buffer_head(struct buffer_head *bh) + { + BUG_ON(!list_empty(&bh->b_assoc_buffers)); ++ BUG_ON(spin_is_locked(&bh->b_uptodate_lock)); ++ BUG_ON(spin_is_locked(&bh->b_state_lock)); + kmem_cache_free(bh_cachep, bh); + get_cpu_var(bh_accounting).nr--; + recalc_bh_state(); +Index: linux-2.6-tip/fs/ntfs/aops.c +=================================================================== +--- linux-2.6-tip.orig/fs/ntfs/aops.c ++++ linux-2.6-tip/fs/ntfs/aops.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + #include "aops.h" + #include "attrib.h" +@@ -107,8 +108,7 @@ static void ntfs_end_buffer_async_read(s + "0x%llx.", (unsigned long long)bh->b_blocknr); + } + first = page_buffers(page); +- local_irq_save(flags); +- bit_spin_lock(BH_Uptodate_Lock, &first->b_state); ++ spin_lock_irqsave(&first->b_uptodate_lock, flags); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; +@@ -123,8 +123,7 @@ static void ntfs_end_buffer_async_read(s + } + tmp = tmp->b_this_page; + } while (tmp != bh); +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + /* + * If none of the buffers had errors then we can set the page uptodate, + * but we first have to perform the post read mst fixups, if the +@@ -145,13 +144,13 @@ static void ntfs_end_buffer_async_read(s + recs = PAGE_CACHE_SIZE / rec_size; + /* Should have been verified before we got here... 
*/ + BUG_ON(!recs); +- local_irq_save(flags); ++ local_irq_save_nort(flags); + kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); + for (i = 0; i < recs; i++) + post_read_mst_fixup((NTFS_RECORD*)(kaddr + + i * rec_size), rec_size); + kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + flush_dcache_page(page); + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); +@@ -159,8 +158,7 @@ static void ntfs_end_buffer_async_read(s + unlock_page(page); + return; + still_busy: +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + return; + } + +Index: linux-2.6-tip/include/linux/buffer_head.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/buffer_head.h ++++ linux-2.6-tip/include/linux/buffer_head.h +@@ -21,10 +21,6 @@ enum bh_state_bits { + BH_Dirty, /* Is dirty */ + BH_Lock, /* Is locked */ + BH_Req, /* Has been submitted for I/O */ +- BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise +- * IO completion of other buffers in the page +- */ +- + BH_Mapped, /* Has a disk mapping */ + BH_New, /* Disk mapping was newly created by get_block */ + BH_Async_Read, /* Is under end_buffer_async_read I/O */ +@@ -74,6 +70,8 @@ struct buffer_head { + struct address_space *b_assoc_map; /* mapping this buffer is + associated with */ + atomic_t b_count; /* users using this buffer_head */ ++ spinlock_t b_uptodate_lock; ++ spinlock_t b_state_lock; + }; + + /* +Index: linux-2.6-tip/include/linux/jbd.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/jbd.h ++++ linux-2.6-tip/include/linux/jbd.h +@@ -260,6 +260,15 @@ void buffer_assertion_failure(struct buf + #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) + #endif + ++/* ++ * For assertions that are only valid on SMP (e.g. spin_is_locked()): ++ */ ++#ifdef CONFIG_SMP ++# define J_ASSERT_JH_SMP(jh, expr) J_ASSERT_JH(jh, expr) ++#else ++# define J_ASSERT_JH_SMP(jh, assert) do { } while (0) ++#endif ++ + #if defined(JBD_PARANOID_IOFAIL) + #define J_EXPECT(expr, why...) J_ASSERT(expr) + #define J_EXPECT_BH(bh, expr, why...) 
J_ASSERT_BH(bh, expr) +@@ -315,32 +324,32 @@ static inline struct journal_head *bh2jh + + static inline void jbd_lock_bh_state(struct buffer_head *bh) + { +- bit_spin_lock(BH_State, &bh->b_state); ++ spin_lock(&bh->b_state_lock); + } + + static inline int jbd_trylock_bh_state(struct buffer_head *bh) + { +- return bit_spin_trylock(BH_State, &bh->b_state); ++ return spin_trylock(&bh->b_state_lock); + } + + static inline int jbd_is_locked_bh_state(struct buffer_head *bh) + { +- return bit_spin_is_locked(BH_State, &bh->b_state); ++ return spin_is_locked(&bh->b_state_lock); + } + + static inline void jbd_unlock_bh_state(struct buffer_head *bh) + { +- bit_spin_unlock(BH_State, &bh->b_state); ++ spin_unlock(&bh->b_state_lock); + } + + static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) + { +- bit_spin_lock(BH_JournalHead, &bh->b_state); ++ spin_lock_irq(&bh->b_uptodate_lock); + } + + static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) + { +- bit_spin_unlock(BH_JournalHead, &bh->b_state); ++ spin_unlock_irq(&bh->b_uptodate_lock); + } + + struct jbd_revoke_table_s; +Index: linux-2.6-tip/fs/jbd/transaction.c +=================================================================== +--- linux-2.6-tip.orig/fs/jbd/transaction.c ++++ linux-2.6-tip/fs/jbd/transaction.c +@@ -1582,7 +1582,7 @@ static void __journal_temp_unlink_buffer + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + +- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); ++ J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); +@@ -2077,7 +2077,7 @@ void __journal_file_buffer(struct journa + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + +- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); ++ J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); +@@ -2166,7 +2166,7 @@ void __journal_refile_buffer(struct jour + int was_dirty; + struct buffer_head *bh = jh2bh(jh); + +- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); ++ J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh)); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + +Index: linux-2.6-tip/fs/proc/stat.c +=================================================================== +--- linux-2.6-tip.orig/fs/proc/stat.c ++++ linux-2.6-tip/fs/proc/stat.c +@@ -23,13 +23,14 @@ static int show_stat(struct seq_file *p, + { + int i, j; + unsigned long jif; +- cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; ++ cputime64_t user_rt, user, nice, system_rt, system, idle, ++ iowait, irq, softirq, steal; + cputime64_t guest; + u64 sum = 0; + struct timespec boottime; + unsigned int per_irq_sum; + +- user = nice = system = idle = iowait = ++ user_rt = user = nice = system_rt = system = idle = iowait = + irq = softirq = steal = cputime64_zero; + guest = cputime64_zero; + getboottime(&boottime); +@@ -44,6 +45,8 @@ static int show_stat(struct seq_file *p, + irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); + softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); + steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); ++ user_rt = cputime64_add(user_rt, kstat_cpu(i).cpustat.user_rt); ++ system_rt = cputime64_add(system_rt, kstat_cpu(i).cpustat.system_rt); + guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + for_each_irq_nr(j) { + sum += kstat_irqs_cpu(j, i); +@@ -52,7 +55,10 @@ static 
int show_stat(struct seq_file *p, + } + sum += arch_irq_stat(); + +- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", ++ user = cputime64_add(user_rt, user); ++ system = cputime64_add(system_rt, system); ++ ++ seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), +@@ -61,13 +67,17 @@ static int show_stat(struct seq_file *p, + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), + (unsigned long long)cputime64_to_clock_t(steal), ++ (unsigned long long)cputime64_to_clock_t(user_rt), ++ (unsigned long long)cputime64_to_clock_t(system_rt), + (unsigned long long)cputime64_to_clock_t(guest)); + for_each_online_cpu(i) { + + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ +- user = kstat_cpu(i).cpustat.user; ++ user_rt = kstat_cpu(i).cpustat.user_rt; ++ system_rt = kstat_cpu(i).cpustat.system_rt; ++ user = cputime64_add(user_rt, kstat_cpu(i).cpustat.user); + nice = kstat_cpu(i).cpustat.nice; +- system = kstat_cpu(i).cpustat.system; ++ system = cputime64_add(system_rt, kstat_cpu(i).cpustat.system); + idle = kstat_cpu(i).cpustat.idle; + iowait = kstat_cpu(i).cpustat.iowait; + irq = kstat_cpu(i).cpustat.irq; +@@ -75,7 +85,7 @@ static int show_stat(struct seq_file *p, + steal = kstat_cpu(i).cpustat.steal; + guest = kstat_cpu(i).cpustat.guest; + seq_printf(p, +- "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", ++ "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), +@@ -85,6 +95,8 @@ static int show_stat(struct seq_file *p, + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), + (unsigned long long)cputime64_to_clock_t(steal), ++ (unsigned long long)cputime64_to_clock_t(user_rt), ++ (unsigned long long)cputime64_to_clock_t(system_rt), + (unsigned long long)cputime64_to_clock_t(guest)); + } + seq_printf(p, "intr %llu", (unsigned long long)sum); +Index: linux-2.6-tip/drivers/net/3c59x.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/3c59x.c ++++ linux-2.6-tip/drivers/net/3c59x.c +@@ -791,9 +791,9 @@ static void poll_vortex(struct net_devic + { + struct vortex_private *vp = netdev_priv(dev); + unsigned long flags; +- local_irq_save(flags); ++ local_irq_save_nort(flags); + (vp->full_bus_master_rx ? 
boomerang_interrupt:vortex_interrupt)(dev->irq,dev); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + #endif + +@@ -1739,6 +1739,7 @@ vortex_timer(unsigned long data) + int next_tick = 60*HZ; + int ok = 0; + int media_status, old_window; ++ unsigned long flags; + + if (vortex_debug > 2) { + printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n", +@@ -1746,7 +1747,7 @@ vortex_timer(unsigned long data) + printk(KERN_DEBUG "dev->watchdog_timeo=%d\n", dev->watchdog_timeo); + } + +- disable_irq_lockdep(dev->irq); ++ spin_lock_irqsave(&vp->lock, flags); + old_window = ioread16(ioaddr + EL3_CMD) >> 13; + EL3WINDOW(4); + media_status = ioread16(ioaddr + Wn4_Media); +@@ -1769,10 +1770,7 @@ vortex_timer(unsigned long data) + case XCVR_MII: case XCVR_NWAY: + { + ok = 1; +- /* Interrupts are already disabled */ +- spin_lock(&vp->lock); + vortex_check_media(dev, 0); +- spin_unlock(&vp->lock); + } + break; + default: /* Other media types handled by Tx timeouts. */ +@@ -1828,7 +1826,7 @@ leave_media_alone: + dev->name, media_tbl[dev->if_port].name); + + EL3WINDOW(old_window); +- enable_irq_lockdep(dev->irq); ++ spin_unlock_irqrestore(&vp->lock, flags); + mod_timer(&vp->timer, RUN_AT(next_tick)); + if (vp->deferred) + iowrite16(FakeIntr, ioaddr + EL3_CMD); +@@ -1862,12 +1860,12 @@ static void vortex_tx_timeout(struct net + * Block interrupts because vortex_interrupt does a bare spin_lock() + */ + unsigned long flags; +- local_irq_save(flags); ++ local_irq_save_nort(flags); + if (vp->full_bus_master_tx) + boomerang_interrupt(dev->irq, dev); + else + vortex_interrupt(dev->irq, dev); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + } + +Index: linux-2.6-tip/drivers/serial/8250.c +=================================================================== +--- linux-2.6-tip.orig/drivers/serial/8250.c ++++ linux-2.6-tip/drivers/serial/8250.c +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1546,7 +1547,10 @@ static irqreturn_t serial8250_interrupt( + { + struct irq_info *i = dev_id; + struct list_head *l, *end = NULL; +- int pass_counter = 0, handled = 0; ++#ifndef CONFIG_PREEMPT_RT ++ int pass_counter = 0; ++#endif ++ int handled = 0; + + DEBUG_INTR("serial8250_interrupt(%d)...", irq); + +@@ -1584,12 +1588,18 @@ static irqreturn_t serial8250_interrupt( + + l = l->next; + ++ /* ++ * On preempt-rt we can be preempted and run in our ++ * own thread. ++ */ ++#ifndef CONFIG_PREEMPT_RT + if (l == i->head && pass_counter++ > PASS_LIMIT) { + /* If we hit this, we're dead. 
*/ + printk(KERN_ERR "serial8250: too much work for " + "irq%d\n", irq); + break; + } ++#endif + } while (l != end); + + spin_unlock(&i->lock); +@@ -2707,14 +2717,10 @@ serial8250_console_write(struct console + + touch_nmi_watchdog(); + +- local_irq_save(flags); +- if (up->port.sysrq) { +- /* serial8250_handle_port() already took the lock */ +- locked = 0; +- } else if (oops_in_progress) { +- locked = spin_trylock(&up->port.lock); +- } else +- spin_lock(&up->port.lock); ++ if (up->port.sysrq || oops_in_progress || preempt_rt) ++ locked = spin_trylock_irqsave(&up->port.lock, flags); ++ else ++ spin_lock_irqsave(&up->port.lock, flags); + + /* + * First save the IER then disable the interrupts +@@ -2746,8 +2752,7 @@ serial8250_console_write(struct console + check_modem_status(up); + + if (locked) +- spin_unlock(&up->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&up->port.lock, flags); + } + + static int __init serial8250_console_setup(struct console *co, char *options) +Index: linux-2.6-tip/drivers/char/tty_buffer.c +=================================================================== +--- linux-2.6-tip.orig/drivers/char/tty_buffer.c ++++ linux-2.6-tip/drivers/char/tty_buffer.c +@@ -482,10 +482,14 @@ void tty_flip_buffer_push(struct tty_str + tty->buf.tail->commit = tty->buf.tail->used; + spin_unlock_irqrestore(&tty->buf.lock, flags); + ++#ifndef CONFIG_PREEMPT_RT + if (tty->low_latency) + flush_to_ldisc(&tty->buf.work.work); + else + schedule_delayed_work(&tty->buf.work, 1); ++#else ++ flush_to_ldisc(&tty->buf.work.work); ++#endif + } + EXPORT_SYMBOL(tty_flip_buffer_push); + +Index: linux-2.6-tip/arch/x86/include/asm/vgtod.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/vgtod.h ++++ linux-2.6-tip/arch/x86/include/asm/vgtod.h +@@ -5,7 +5,7 @@ + #include + + struct vsyscall_gtod_data { +- seqlock_t lock; ++ raw_seqlock_t lock; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; +Index: linux-2.6-tip/arch/x86/include/asm/i8253.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/i8253.h ++++ linux-2.6-tip/arch/x86/include/asm/i8253.h +@@ -6,7 +6,7 @@ + #define PIT_CH0 0x40 + #define PIT_CH2 0x42 + +-extern spinlock_t i8253_lock; ++extern raw_spinlock_t i8253_lock; + + extern struct clock_event_device *global_clock_event; + +Index: linux-2.6-tip/arch/x86/include/asm/pci_x86.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/pci_x86.h ++++ linux-2.6-tip/arch/x86/include/asm/pci_x86.h +@@ -83,7 +83,7 @@ struct irq_routing_table { + extern unsigned int pcibios_irq_mask; + + extern int pcibios_scanned; +-extern spinlock_t pci_config_lock; ++extern raw_spinlock_t pci_config_lock; + + extern int (*pcibios_enable_irq)(struct pci_dev *dev); + extern void (*pcibios_disable_irq)(struct pci_dev *dev); +Index: linux-2.6-tip/arch/x86/include/asm/xor_32.h +=================================================================== +--- linux-2.6-tip.orig/arch/x86/include/asm/xor_32.h ++++ linux-2.6-tip/arch/x86/include/asm/xor_32.h +@@ -865,7 +865,21 @@ static struct xor_block_template xor_blo + #include + + #undef XOR_TRY_TEMPLATES +-#define XOR_TRY_TEMPLATES \ ++/* ++ * MMX/SSE ops disable preemption for long periods of time, ++ * so on PREEMPT_RT use the register-based ops only: ++ */ ++#ifdef CONFIG_PREEMPT_RT ++# define XOR_TRY_TEMPLATES \ ++ do { \ ++ xor_speed(&xor_block_8regs); \ ++ 
xor_speed(&xor_block_8regs_p); \ ++ xor_speed(&xor_block_32regs); \ ++ xor_speed(&xor_block_32regs_p); \ ++ } while (0) ++# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST) ++#else ++# define XOR_TRY_TEMPLATES \ + do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_8regs_p); \ +@@ -882,7 +896,8 @@ do { \ + /* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. */ +-#define XOR_SELECT_TEMPLATE(FASTEST) \ ++# define XOR_SELECT_TEMPLATE(FASTEST) \ + (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) ++#endif /* CONFIG_PREEMPT_RT */ + + #endif /* _ASM_X86_XOR_32_H */ +Index: linux-2.6-tip/arch/x86/kernel/dumpstack_32.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/kernel/dumpstack_32.c ++++ linux-2.6-tip/arch/x86/kernel/dumpstack_32.c +@@ -93,6 +93,12 @@ show_stack_log_lvl(struct task_struct *t + } + + ++#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE) ++extern unsigned long worst_stack_left; ++#else ++# define worst_stack_left -1L ++#endif ++ + void show_registers(struct pt_regs *regs) + { + int i; +Index: linux-2.6-tip/arch/x86/pci/direct.c +=================================================================== +--- linux-2.6-tip.orig/arch/x86/pci/direct.c ++++ linux-2.6-tip/arch/x86/pci/direct.c +@@ -223,16 +223,23 @@ static int __init pci_check_type1(void) + unsigned int tmp; + int works = 0; + +- local_irq_save(flags); ++ spin_lock_irqsave(&pci_config_lock, flags); + + outb(0x01, 0xCFB); + tmp = inl(0xCF8); + outl(0x80000000, 0xCF8); +- if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { +- works = 1; ++ ++ if (inl(0xCF8) == 0x80000000) { ++ spin_unlock_irqrestore(&pci_config_lock, flags); ++ ++ if (pci_sanity_check(&pci_direct_conf1)) ++ works = 1; ++ ++ spin_lock_irqsave(&pci_config_lock, flags); + } + outl(tmp, 0xCF8); +- local_irq_restore(flags); ++ ++ spin_unlock_irqrestore(&pci_config_lock, flags); + + return works; + } +@@ -242,17 +249,19 @@ static int __init pci_check_type2(void) + unsigned long flags; + int works = 0; + +- local_irq_save(flags); ++ spin_lock_irqsave(&pci_config_lock, flags); + + outb(0x00, 0xCFB); + outb(0x00, 0xCF8); + outb(0x00, 0xCFA); +- if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && +- pci_sanity_check(&pci_direct_conf2)) { +- works = 1; +- } + +- local_irq_restore(flags); ++ if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { ++ spin_unlock_irqrestore(&pci_config_lock, flags); ++ ++ if (pci_sanity_check(&pci_direct_conf2)) ++ works = 1; ++ } else ++ spin_unlock_irqrestore(&pci_config_lock, flags); + + return works; + } +Index: linux-2.6-tip/kernel/sched_cpupri.h +=================================================================== +--- linux-2.6-tip.orig/kernel/sched_cpupri.h ++++ linux-2.6-tip/kernel/sched_cpupri.h +@@ -12,7 +12,7 @@ + /* values 2-101 are RT priorities 0-99 */ + + struct cpupri_vec { +- spinlock_t lock; ++ raw_spinlock_t lock; + int count; + cpumask_var_t mask; + }; +Index: linux-2.6-tip/include/linux/profile.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/profile.h ++++ linux-2.6-tip/include/linux/profile.h +@@ -4,14 +4,16 @@ + #include + #include + #include ++#include + #include + + #include + +-#define CPU_PROFILING 1 +-#define SCHED_PROFILING 2 +-#define SLEEP_PROFILING 3 +-#define KVM_PROFILING 4 ++#define CPU_PROFILING 1 ++#define 
SCHED_PROFILING 2 ++#define SLEEP_PROFILING 3 ++#define KVM_PROFILING 4 ++#define PREEMPT_PROFILING 5 + + struct proc_dir_entry; + struct pt_regs; +@@ -36,6 +38,8 @@ enum profile_type { + PROFILE_MUNMAP + }; + ++extern int prof_pid; ++ + #ifdef CONFIG_PROFILING + + extern int prof_on __read_mostly; +Index: linux-2.6-tip/include/linux/radix-tree.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/radix-tree.h ++++ linux-2.6-tip/include/linux/radix-tree.h +@@ -167,7 +167,18 @@ radix_tree_gang_lookup_slot(struct radix + unsigned long first_index, unsigned int max_items); + unsigned long radix_tree_next_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan); ++/* ++ * On a mutex based kernel we can freely schedule within the radix code: ++ */ ++#ifdef CONFIG_PREEMPT_RT ++static inline int radix_tree_preload(gfp_t gfp_mask) ++{ ++ return 0; ++} ++#else + int radix_tree_preload(gfp_t gfp_mask); ++#endif ++ + void radix_tree_init(void); + void *radix_tree_tag_set(struct radix_tree_root *root, + unsigned long index, unsigned int tag); +@@ -187,7 +198,9 @@ int radix_tree_tagged(struct radix_tree_ + + static inline void radix_tree_preload_end(void) + { ++#ifndef CONFIG_PREEMPT_RT + preempt_enable(); ++#endif + } + + #endif /* _LINUX_RADIX_TREE_H */ +Index: linux-2.6-tip/include/linux/smp_lock.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/smp_lock.h ++++ linux-2.6-tip/include/linux/smp_lock.h +@@ -45,7 +45,7 @@ static inline void cycle_kernel_lock(voi + #define unlock_kernel() do { } while(0) + #define release_kernel_lock(task) do { } while(0) + #define cycle_kernel_lock() do { } while(0) +-#define reacquire_kernel_lock(task) 0 ++#define reacquire_kernel_lock(task) do { } while(0) + #define kernel_locked() 1 + + #endif /* CONFIG_LOCK_KERNEL */ +Index: linux-2.6-tip/include/linux/workqueue.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/workqueue.h ++++ linux-2.6-tip/include/linux/workqueue.h +@@ -190,6 +190,9 @@ __create_workqueue_key(const char *name, + #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0) + #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0) + ++extern void set_workqueue_prio(struct workqueue_struct *wq, int policy, ++ int rt_priority, int nice); ++ + extern void destroy_workqueue(struct workqueue_struct *wq); + + extern int queue_work(struct workqueue_struct *wq, struct work_struct *work); +Index: linux-2.6-tip/kernel/notifier.c +=================================================================== +--- linux-2.6-tip.orig/kernel/notifier.c ++++ linux-2.6-tip/kernel/notifier.c +@@ -71,7 +71,7 @@ static int notifier_chain_unregister(str + * @returns: notifier_call_chain returns the value returned by the + * last notifier function called. + */ +-static int __kprobes notifier_call_chain(struct notifier_block **nl, ++static int __kprobes notrace notifier_call_chain(struct notifier_block **nl, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) + { +@@ -217,7 +217,7 @@ int blocking_notifier_chain_register(str + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). 
+ */ +- if (unlikely(system_state == SYSTEM_BOOTING)) ++ if (unlikely(system_state < SYSTEM_RUNNING)) + return notifier_chain_register(&nh->head, n); + + down_write(&nh->rwsem); +Index: linux-2.6-tip/kernel/user.c +=================================================================== +--- linux-2.6-tip.orig/kernel/user.c ++++ linux-2.6-tip/kernel/user.c +@@ -297,14 +297,14 @@ static void cleanup_user_struct(struct w + */ + uids_mutex_lock(); + +- local_irq_save(flags); ++ local_irq_save_nort(flags); + + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { + uid_hash_remove(up); + remove_user = 1; + spin_unlock_irqrestore(&uidhash_lock, flags); + } else { +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + + if (!remove_user) +@@ -405,11 +405,11 @@ void free_uid(struct user_struct *up) + if (!up) + return; + +- local_irq_save(flags); ++ local_irq_save_nort(flags); + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + free_user(up, flags); + else +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + + struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) +Index: linux-2.6-tip/lib/radix-tree.c +=================================================================== +--- linux-2.6-tip.orig/lib/radix-tree.c ++++ linux-2.6-tip/lib/radix-tree.c +@@ -157,12 +157,14 @@ radix_tree_node_alloc(struct radix_tree_ + * succeed in getting a node here (and never reach + * kmem_cache_alloc) + */ ++ rtp = &get_cpu_var(radix_tree_preloads); + rtp = &__get_cpu_var(radix_tree_preloads); + if (rtp->nr) { + ret = rtp->nodes[rtp->nr - 1]; + rtp->nodes[rtp->nr - 1] = NULL; + rtp->nr--; + } ++ put_cpu_var(radix_tree_preloads); + } + if (ret == NULL) + ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); +@@ -195,6 +197,8 @@ radix_tree_node_free(struct radix_tree_n + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); + } + ++#ifndef CONFIG_PREEMPT_RT ++ + /* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On +@@ -227,6 +231,8 @@ out: + } + EXPORT_SYMBOL(radix_tree_preload); + ++#endif ++ + /* + * Return the maximum key which can be store into a + * radix tree with height HEIGHT. 
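/*
 * A minimal caller-side sketch (illustrative only, not part of the patch
 * above) of the pattern the radix_tree_preload() changes assume. On a
 * stock kernel, radix_tree_preload() fills a per-CPU node buffer and, on
 * success, returns with preemption disabled until radix_tree_preload_end();
 * the hunks above turn both calls into no-ops under CONFIG_PREEMPT_RT,
 * since (as the patch comment notes) a mutex-based kernel may freely
 * schedule inside the radix-tree code. The function, lock and tree names
 * below are placeholders and the error handling is simplified.
 */
#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static int example_radix_insert(struct radix_tree_root *tree,
				spinlock_t *tree_lock,
				unsigned long index, void *item)
{
	int err;

	/* Preallocate nodes; returns 0 immediately on PREEMPT_RT. */
	err = radix_tree_preload(GFP_KERNEL);
	if (err)
		return err;

	spin_lock(tree_lock);		/* a sleeping lock on PREEMPT_RT */
	err = radix_tree_insert(tree, index, item);
	spin_unlock(tree_lock);

	/* Re-enables preemption on !PREEMPT_RT; a no-op on PREEMPT_RT. */
	radix_tree_preload_end();
	return err;
}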
+Index: linux-2.6-tip/net/ipv4/proc.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv4/proc.c ++++ linux-2.6-tip/net/ipv4/proc.c +@@ -54,8 +54,8 @@ static int sockstat_seq_show(struct seq_ + int orphans, sockets; + + local_bh_disable(); +- orphans = percpu_counter_sum_positive(&tcp_orphan_count), +- sockets = percpu_counter_sum_positive(&tcp_sockets_allocated), ++ orphans = percpu_counter_sum_positive(&tcp_orphan_count); ++ sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); + local_bh_enable(); + + socket_seq_show(seq); +Index: linux-2.6-tip/include/linux/netdevice.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/netdevice.h ++++ linux-2.6-tip/include/linux/netdevice.h +@@ -439,7 +439,7 @@ struct netdev_queue { + struct Qdisc *qdisc; + unsigned long state; + spinlock_t _xmit_lock; +- int xmit_lock_owner; ++ void *xmit_lock_owner; + struct Qdisc *qdisc_sleeping; + } ____cacheline_aligned_in_smp; + +@@ -1625,35 +1625,43 @@ static inline void netif_rx_complete(str + napi_complete(napi); + } + +-static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) ++static inline void __netif_tx_lock(struct netdev_queue *txq) + { + spin_lock(&txq->_xmit_lock); +- txq->xmit_lock_owner = cpu; ++ txq->xmit_lock_owner = (void *)current; ++} ++ ++/* ++ * Do we hold the xmit_lock already? ++ */ ++static inline int netif_tx_lock_recursion(struct netdev_queue *txq) ++{ ++ return txq->xmit_lock_owner == (void *)current; + } + + static inline void __netif_tx_lock_bh(struct netdev_queue *txq) + { + spin_lock_bh(&txq->_xmit_lock); +- txq->xmit_lock_owner = smp_processor_id(); ++ txq->xmit_lock_owner = (void *)current; + } + + static inline int __netif_tx_trylock(struct netdev_queue *txq) + { + int ok = spin_trylock(&txq->_xmit_lock); + if (likely(ok)) +- txq->xmit_lock_owner = smp_processor_id(); ++ txq->xmit_lock_owner = (void *)current; + return ok; + } + + static inline void __netif_tx_unlock(struct netdev_queue *txq) + { +- txq->xmit_lock_owner = -1; ++ txq->xmit_lock_owner = (void *)-1; + spin_unlock(&txq->_xmit_lock); + } + + static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) + { +- txq->xmit_lock_owner = -1; ++ txq->xmit_lock_owner = (void *)-1; + spin_unlock_bh(&txq->_xmit_lock); + } + +@@ -1666,10 +1674,8 @@ static inline void __netif_tx_unlock_bh( + static inline void netif_tx_lock(struct net_device *dev) + { + unsigned int i; +- int cpu; + + spin_lock(&dev->tx_global_lock); +- cpu = smp_processor_id(); + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + +@@ -1679,7 +1685,7 @@ static inline void netif_tx_lock(struct + * the ->hard_start_xmit() handler and already + * checked the frozen bit. 
+ */ +- __netif_tx_lock(txq, cpu); ++ __netif_tx_lock(txq); + set_bit(__QUEUE_STATE_FROZEN, &txq->state); + __netif_tx_unlock(txq); + } +@@ -1715,9 +1721,9 @@ static inline void netif_tx_unlock_bh(st + local_bh_enable(); + } + +-#define HARD_TX_LOCK(dev, txq, cpu) { \ ++#define HARD_TX_LOCK(dev, txq) { \ + if ((dev->features & NETIF_F_LLTX) == 0) { \ +- __netif_tx_lock(txq, cpu); \ ++ __netif_tx_lock(txq); \ + } \ + } + +@@ -1730,14 +1736,12 @@ static inline void netif_tx_unlock_bh(st + static inline void netif_tx_disable(struct net_device *dev) + { + unsigned int i; +- int cpu; + + local_bh_disable(); +- cpu = smp_processor_id(); + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + +- __netif_tx_lock(txq, cpu); ++ __netif_tx_lock(txq); + netif_tx_stop_queue(txq); + __netif_tx_unlock(txq); + } +Index: linux-2.6-tip/include/net/dn_dev.h +=================================================================== +--- linux-2.6-tip.orig/include/net/dn_dev.h ++++ linux-2.6-tip/include/net/dn_dev.h +@@ -76,9 +76,9 @@ struct dn_dev_parms { + int priority; /* Priority to be a router */ + char *name; /* Name for sysctl */ + int ctl_name; /* Index for sysctl */ +- int (*up)(struct net_device *); +- void (*down)(struct net_device *); +- void (*timer3)(struct net_device *, struct dn_ifaddr *ifa); ++ int (*dn_up)(struct net_device *); ++ void (*dn_down)(struct net_device *); ++ void (*dn_timer3)(struct net_device *, struct dn_ifaddr *ifa); + void *sysctl; + }; + +Index: linux-2.6-tip/net/core/netpoll.c +=================================================================== +--- linux-2.6-tip.orig/net/core/netpoll.c ++++ linux-2.6-tip/net/core/netpoll.c +@@ -68,20 +68,20 @@ static void queue_process(struct work_st + + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + +- local_irq_save(flags); +- __netif_tx_lock(txq, smp_processor_id()); ++ local_irq_save_nort(flags); ++ __netif_tx_lock(txq); + if (netif_tx_queue_stopped(txq) || + netif_tx_queue_frozen(txq) || + ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { + skb_queue_head(&npinfo->txq, skb); + __netif_tx_unlock(txq); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + + schedule_delayed_work(&npinfo->tx_work, HZ/10); + return; + } + __netif_tx_unlock(txq); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + } + +@@ -151,7 +151,7 @@ static void poll_napi(struct net_device + int budget = 16; + + list_for_each_entry(napi, &dev->napi_list, dev_list) { +- if (napi->poll_owner != smp_processor_id() && ++ if (napi->poll_owner != raw_smp_processor_id() && + spin_trylock(&napi->poll_lock)) { + budget = poll_one_napi(dev->npinfo, napi, budget); + spin_unlock(&napi->poll_lock); +@@ -208,30 +208,35 @@ static void refill_skbs(void) + + static void zap_completion_queue(void) + { +- unsigned long flags; + struct softnet_data *sd = &get_cpu_var(softnet_data); ++ struct sk_buff *clist = NULL; ++ unsigned long flags; + + if (sd->completion_queue) { +- struct sk_buff *clist; + + local_irq_save(flags); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_restore(flags); +- +- while (clist != NULL) { +- struct sk_buff *skb = clist; +- clist = clist->next; +- if (skb->destructor) { +- atomic_inc(&skb->users); +- dev_kfree_skb_any(skb); /* put this one back */ +- } else { +- __kfree_skb(skb); +- } +- } + } + ++ ++ /* ++ * Took the list private, can drop our softnet ++ * reference: ++ */ + put_cpu_var(softnet_data); ++ ++ while (clist != NULL) { ++ struct sk_buff *skb 
= clist; ++ clist = clist->next; ++ if (skb->destructor) { ++ atomic_inc(&skb->users); ++ dev_kfree_skb_any(skb); /* put this one back */ ++ } else { ++ __kfree_skb(skb); ++ } ++ } + } + + static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) +@@ -239,13 +244,26 @@ static struct sk_buff *find_skb(struct n + int count = 0; + struct sk_buff *skb; + ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * On -rt skb_pool.lock is schedulable, so if we are ++ * in an atomic context we just try to dequeue from the ++ * pool and fail if we cannot get one. ++ */ ++ if (in_atomic() || irqs_disabled()) ++ goto pick_atomic; ++#endif + zap_completion_queue(); + refill_skbs(); + repeat: + + skb = alloc_skb(len, GFP_ATOMIC); +- if (!skb) ++ if (!skb) { ++#ifdef CONFIG_PREEMPT_RT ++pick_atomic: ++#endif + skb = skb_dequeue(&skb_pool); ++ } + + if (!skb) { + if (++count < 10) { +@@ -265,7 +283,7 @@ static int netpoll_owner_active(struct n + struct napi_struct *napi; + + list_for_each_entry(napi, &dev->napi_list, dev_list) { +- if (napi->poll_owner == smp_processor_id()) ++ if (napi->poll_owner == raw_smp_processor_id()) + return 1; + } + return 0; +@@ -291,7 +309,7 @@ static void netpoll_send_skb(struct netp + + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + +- local_irq_save(flags); ++ local_irq_save_nort(flags); + /* try until next clock tick */ + for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; + tries > 0; --tries) { +@@ -310,7 +328,7 @@ static void netpoll_send_skb(struct netp + + udelay(USEC_PER_POLL); + } +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + + if (status != NETDEV_TX_OK) { +@@ -731,7 +749,7 @@ int netpoll_setup(struct netpoll *np) + np->name); + break; + } +- cond_resched(); ++ schedule_timeout_uninterruptible(1); + } + + /* If carrier appears to come up instantly, we don't +Index: linux-2.6-tip/net/decnet/dn_dev.c +=================================================================== +--- linux-2.6-tip.orig/net/decnet/dn_dev.c ++++ linux-2.6-tip/net/decnet/dn_dev.c +@@ -90,9 +90,9 @@ static struct dn_dev_parms dn_dev_list[] + .t3 = 10, + .name = "ethernet", + .ctl_name = NET_DECNET_CONF_ETHER, +- .up = dn_eth_up, +- .down = dn_eth_down, +- .timer3 = dn_send_brd_hello, ++ .dn_up = dn_eth_up, ++ .dn_down = dn_eth_down, ++ .dn_timer3 = dn_send_brd_hello, + }, + { + .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */ +@@ -102,7 +102,7 @@ static struct dn_dev_parms dn_dev_list[] + .t3 = 10, + .name = "ipgre", + .ctl_name = NET_DECNET_CONF_GRE, +- .timer3 = dn_send_brd_hello, ++ .dn_timer3 = dn_send_brd_hello, + }, + #if 0 + { +@@ -113,7 +113,7 @@ static struct dn_dev_parms dn_dev_list[] + .t3 = 120, + .name = "x25", + .ctl_name = NET_DECNET_CONF_X25, +- .timer3 = dn_send_ptp_hello, ++ .dn_timer3 = dn_send_ptp_hello, + }, + #endif + #if 0 +@@ -125,7 +125,7 @@ static struct dn_dev_parms dn_dev_list[] + .t3 = 10, + .name = "ppp", + .ctl_name = NET_DECNET_CONF_PPP, +- .timer3 = dn_send_brd_hello, ++ .dn_timer3 = dn_send_brd_hello, + }, + #endif + { +@@ -136,7 +136,7 @@ static struct dn_dev_parms dn_dev_list[] + .t3 = 120, + .name = "ddcmp", + .ctl_name = NET_DECNET_CONF_DDCMP, +- .timer3 = dn_send_ptp_hello, ++ .dn_timer3 = dn_send_ptp_hello, + }, + { + .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */ +@@ -146,7 +146,7 @@ static struct dn_dev_parms dn_dev_list[] + .t3 = 10, + .name = "loopback", + .ctl_name = NET_DECNET_CONF_LOOPBACK, +- .timer3 = dn_send_brd_hello, ++ .dn_timer3 = dn_send_brd_hello, + } + }; + +@@ -305,11 +305,11 @@ static 
int dn_forwarding_proc(ctl_table + */ + tmp = dn_db->parms.forwarding; + dn_db->parms.forwarding = old; +- if (dn_db->parms.down) +- dn_db->parms.down(dev); ++ if (dn_db->parms.dn_down) ++ dn_db->parms.dn_down(dev); + dn_db->parms.forwarding = tmp; +- if (dn_db->parms.up) +- dn_db->parms.up(dev); ++ if (dn_db->parms.dn_up) ++ dn_db->parms.dn_up(dev); + } + + return err; +@@ -343,11 +343,11 @@ static int dn_forwarding_sysctl(ctl_tabl + if (value > 2) + return -EINVAL; + +- if (dn_db->parms.down) +- dn_db->parms.down(dev); ++ if (dn_db->parms.dn_down) ++ dn_db->parms.dn_down(dev); + dn_db->parms.forwarding = value; +- if (dn_db->parms.up) +- dn_db->parms.up(dev); ++ if (dn_db->parms.dn_up) ++ dn_db->parms.dn_up(dev); + } + + return 0; +@@ -1078,10 +1078,10 @@ static void dn_dev_timer_func(unsigned l + struct dn_ifaddr *ifa; + + if (dn_db->t3 <= dn_db->parms.t2) { +- if (dn_db->parms.timer3) { ++ if (dn_db->parms.dn_timer3) { + for(ifa = dn_db->ifa_list; ifa; ifa = ifa->ifa_next) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) +- dn_db->parms.timer3(dev, ifa); ++ dn_db->parms.dn_timer3(dev, ifa); + } + } + dn_db->t3 = dn_db->parms.t3; +@@ -1140,8 +1140,8 @@ static struct dn_dev *dn_dev_create(stru + return NULL; + } + +- if (dn_db->parms.up) { +- if (dn_db->parms.up(dev) < 0) { ++ if (dn_db->parms.dn_up) { ++ if (dn_db->parms.dn_up(dev) < 0) { + neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); + dev->dn_ptr = NULL; + kfree(dn_db); +@@ -1235,8 +1235,8 @@ static void dn_dev_delete(struct net_dev + dn_dev_check_default(dev); + neigh_ifdown(&dn_neigh_table, dev); + +- if (dn_db->parms.down) +- dn_db->parms.down(dev); ++ if (dn_db->parms.dn_down) ++ dn_db->parms.dn_down(dev); + + dev->dn_ptr = NULL; + +Index: linux-2.6-tip/net/ipv4/icmp.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv4/icmp.c ++++ linux-2.6-tip/net/ipv4/icmp.c +@@ -201,7 +201,10 @@ static const struct icmp_control icmp_po + */ + static struct sock *icmp_sk(struct net *net) + { +- return net->ipv4.icmp_sk[smp_processor_id()]; ++ /* ++ * Should be safe on PREEMPT_SOFTIRQS/HARDIRQS to use raw-smp-processor-id: ++ */ ++ return net->ipv4.icmp_sk[raw_smp_processor_id()]; + } + + static inline struct sock *icmp_xmit_lock(struct net *net) +Index: linux-2.6-tip/net/ipv6/netfilter/ip6_tables.c +=================================================================== +--- linux-2.6-tip.orig/net/ipv6/netfilter/ip6_tables.c ++++ linux-2.6-tip/net/ipv6/netfilter/ip6_tables.c +@@ -376,7 +376,7 @@ ip6t_do_table(struct sk_buff *skb, + read_lock_bh(&table->lock); + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + private = table->private; +- table_base = (void *)private->entries[smp_processor_id()]; ++ table_base = (void *)private->entries[raw_smp_processor_id()]; + e = get_entry(table_base, private->hook_entry[hook]); + + /* For return from builtin chain */ +Index: linux-2.6-tip/net/sched/sch_generic.c +=================================================================== +--- linux-2.6-tip.orig/net/sched/sch_generic.c ++++ linux-2.6-tip/net/sched/sch_generic.c +@@ -12,6 +12,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -24,6 +25,7 @@ + #include + #include + #include ++#include + #include + + /* Main transmission queue. */ +@@ -78,7 +80,7 @@ static inline int handle_dev_cpu_collisi + { + int ret; + +- if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { ++ if (unlikely(netif_tx_lock_recursion(dev_queue))) { + /* + * Same CPU holding the lock. 
It may be a transient + * configuration error, when hard_start_xmit() recurses. We +@@ -95,7 +97,9 @@ static inline int handle_dev_cpu_collisi + * Another cpu is holding lock, requeue & delay xmits for + * some time. + */ ++ preempt_disable(); /* FIXME: we need an _rt version of this */ + __get_cpu_var(netdev_rx_stat).cpu_collision++; ++ preempt_enable(); + ret = dev_requeue_skb(skb, q); + } + +@@ -141,7 +145,7 @@ static inline int qdisc_restart(struct Q + dev = qdisc_dev(q); + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + +- HARD_TX_LOCK(dev, txq, smp_processor_id()); ++ HARD_TX_LOCK(dev, txq); + if (!netif_tx_queue_stopped(txq) && + !netif_tx_queue_frozen(txq)) + ret = dev_hard_start_xmit(skb, dev, txq); +@@ -691,9 +695,12 @@ void dev_deactivate(struct net_device *d + /* Wait for outstanding qdisc-less dev_queue_xmit calls. */ + synchronize_rcu(); + +- /* Wait for outstanding qdisc_run calls. */ ++ /* ++ * Wait for outstanding qdisc_run calls. ++ * TODO: shouldnt this be wakeup-based, instead of polling it? ++ */ + while (some_qdisc_is_busy(dev)) +- yield(); ++ msleep(1); + } + + static void dev_init_scheduler_queue(struct net_device *dev, +Index: linux-2.6-tip/drivers/net/bnx2.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/bnx2.c ++++ linux-2.6-tip/drivers/net/bnx2.c +@@ -2662,7 +2662,7 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2 + + if (unlikely(netif_tx_queue_stopped(txq)) && + (bnx2_tx_avail(bp, txr) > bp->tx_wake_thresh)) { +- __netif_tx_lock(txq, smp_processor_id()); ++ __netif_tx_lock(txq); + if ((netif_tx_queue_stopped(txq)) && + (bnx2_tx_avail(bp, txr) > bp->tx_wake_thresh)) + netif_tx_wake_queue(txq); +Index: linux-2.6-tip/drivers/net/mv643xx_eth.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/mv643xx_eth.c ++++ linux-2.6-tip/drivers/net/mv643xx_eth.c +@@ -484,7 +484,7 @@ static void txq_maybe_wake(struct tx_que + struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index); + + if (netif_tx_queue_stopped(nq)) { +- __netif_tx_lock(nq, smp_processor_id()); ++ __netif_tx_lock(nq); + if (txq->tx_ring_size - txq->tx_desc_count >= MAX_SKB_FRAGS + 1) + netif_tx_wake_queue(nq); + __netif_tx_unlock(nq); +@@ -838,7 +838,7 @@ static void txq_kick(struct tx_queue *tx + u32 hw_desc_ptr; + u32 expected_ptr; + +- __netif_tx_lock(nq, smp_processor_id()); ++ __netif_tx_lock(nq); + + if (rdlp(mp, TXQ_COMMAND) & (1 << txq->index)) + goto out; +@@ -862,7 +862,7 @@ static int txq_reclaim(struct tx_queue * + struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index); + int reclaimed; + +- __netif_tx_lock(nq, smp_processor_id()); ++ __netif_tx_lock(nq); + + reclaimed = 0; + while (reclaimed < budget && txq->tx_desc_count > 0) { +Index: linux-2.6-tip/drivers/net/niu.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/niu.c ++++ linux-2.6-tip/drivers/net/niu.c +@@ -3519,7 +3519,7 @@ static void niu_tx_work(struct niu *np, + out: + if (unlikely(netif_tx_queue_stopped(txq) && + (niu_tx_avail(rp) > NIU_TX_WAKEUP_THRESH(rp)))) { +- __netif_tx_lock(txq, smp_processor_id()); ++ __netif_tx_lock(txq); + if (netif_tx_queue_stopped(txq) && + (niu_tx_avail(rp) > NIU_TX_WAKEUP_THRESH(rp))) + netif_tx_wake_queue(txq); +Index: linux-2.6-tip/block/blk-core.c +=================================================================== +--- linux-2.6-tip.orig/block/blk-core.c ++++ linux-2.6-tip/block/blk-core.c +@@ -212,7 
+212,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); + */ + void blk_plug_device(struct request_queue *q) + { +- WARN_ON(!irqs_disabled()); ++ WARN_ON_NONRT(!irqs_disabled()); + + /* + * don't plug a stopped queue, it must be paired with blk_start_queue() +@@ -252,7 +252,7 @@ EXPORT_SYMBOL(blk_plug_device_unlocked); + */ + int blk_remove_plug(struct request_queue *q) + { +- WARN_ON(!irqs_disabled()); ++ WARN_ON_NONRT(!irqs_disabled()); + + if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) + return 0; +@@ -362,7 +362,7 @@ static void blk_invoke_request_fn(struct + **/ + void blk_start_queue(struct request_queue *q) + { +- WARN_ON(!irqs_disabled()); ++ WARN_ON_NONRT(!irqs_disabled()); + + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + blk_invoke_request_fn(q); +Index: linux-2.6-tip/fs/aio.c +=================================================================== +--- linux-2.6-tip.orig/fs/aio.c ++++ linux-2.6-tip/fs/aio.c +@@ -622,9 +622,11 @@ static void use_mm(struct mm_struct *mm) + task_lock(tsk); + active_mm = tsk->active_mm; + atomic_inc(&mm->mm_count); ++ local_irq_disable(); // FIXME ++ switch_mm(active_mm, mm, tsk); + tsk->mm = mm; + tsk->active_mm = mm; +- switch_mm(active_mm, mm, tsk); ++ local_irq_enable(); + task_unlock(tsk); + + mmdrop(active_mm); +Index: linux-2.6-tip/fs/file.c +=================================================================== +--- linux-2.6-tip.orig/fs/file.c ++++ linux-2.6-tip/fs/file.c +@@ -102,14 +102,15 @@ void free_fdtable_rcu(struct rcu_head *r + kfree(fdt->open_fds); + kfree(fdt); + } else { +- fddef = &get_cpu_var(fdtable_defer_list); ++ ++ fddef = &per_cpu(fdtable_defer_list, raw_smp_processor_id()); ++ + spin_lock(&fddef->lock); + fdt->next = fddef->next; + fddef->next = fdt; + /* vmallocs are handled from the workqueue context */ + schedule_work(&fddef->wq); + spin_unlock(&fddef->lock); +- put_cpu_var(fdtable_defer_list); + } + } + +Index: linux-2.6-tip/fs/notify/dnotify/dnotify.c +=================================================================== +--- linux-2.6-tip.orig/fs/notify/dnotify/dnotify.c ++++ linux-2.6-tip/fs/notify/dnotify/dnotify.c +@@ -170,7 +170,7 @@ void dnotify_parent(struct dentry *dentr + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent; +- if (parent->d_inode->i_dnotify_mask & event) { ++ if (unlikely(parent->d_inode->i_dnotify_mask & event)) { + dget(parent); + spin_unlock(&dentry->d_lock); + __inode_dir_notify(parent->d_inode, event); +Index: linux-2.6-tip/fs/pipe.c +=================================================================== +--- linux-2.6-tip.orig/fs/pipe.c ++++ linux-2.6-tip/fs/pipe.c +@@ -386,8 +386,14 @@ redo: + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } ++ /* ++ * Hack: we turn off atime updates for -RT kernels. ++ * Who uses them on pipes anyway? ++ */ ++#ifndef CONFIG_PREEMPT_RT + if (ret > 0) + file_accessed(filp); ++#endif + return ret; + } + +@@ -559,8 +565,14 @@ out: + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } ++ /* ++ * Hack: we turn off atime updates for -RT kernels. ++ * Who uses them on pipes anyway? 
++ */ ++#ifndef CONFIG_PREEMPT_RT + if (ret > 0) + file_update_time(filp); ++#endif + return ret; + } + +Index: linux-2.6-tip/fs/proc/task_mmu.c +=================================================================== +--- linux-2.6-tip.orig/fs/proc/task_mmu.c ++++ linux-2.6-tip/fs/proc/task_mmu.c +@@ -137,8 +137,10 @@ static void *m_start(struct seq_file *m, + vma = NULL; + if ((unsigned long)l < mm->map_count) { + vma = mm->mmap; +- while (l-- && vma) ++ while (l-- && vma) { + vma = vma->vm_next; ++ cond_resched(); ++ } + goto out; + } + +Index: linux-2.6-tip/fs/xfs/linux-2.6/mrlock.h +=================================================================== +--- linux-2.6-tip.orig/fs/xfs/linux-2.6/mrlock.h ++++ linux-2.6-tip/fs/xfs/linux-2.6/mrlock.h +@@ -21,7 +21,7 @@ + #include + + typedef struct { +- struct rw_semaphore mr_lock; ++ struct compat_rw_semaphore mr_lock; + #ifdef DEBUG + int mr_writer; + #endif +Index: linux-2.6-tip/fs/xfs/xfs_mount.h +=================================================================== +--- linux-2.6-tip.orig/fs/xfs/xfs_mount.h ++++ linux-2.6-tip/fs/xfs/xfs_mount.h +@@ -275,7 +275,7 @@ typedef struct xfs_mount { + uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ + uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ + struct xfs_perag *m_perag; /* per-ag accounting info */ +- struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ ++ struct compat_rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ + struct mutex m_growlock; /* growfs mutex */ + int m_fixedfsid[2]; /* unchanged for life of FS */ + uint m_dmevmask; /* DMI events for this FS */ +Index: linux-2.6-tip/drivers/acpi/acpica/acglobal.h +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/acpica/acglobal.h ++++ linux-2.6-tip/drivers/acpi/acpica/acglobal.h +@@ -190,7 +190,12 @@ ACPI_EXTERN u8 acpi_gbl_global_lock_pres + * interrupt level + */ + ACPI_EXTERN spinlock_t _acpi_gbl_gpe_lock; /* For GPE data structs and registers */ +-ACPI_EXTERN spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ ++ ++/* ++ * Need to be raw because it might be used in acpi_processor_idle(): ++ */ ++ACPI_EXTERN raw_spinlock_t _acpi_gbl_hardware_lock; /* For ACPI H/W except GPE registers */ ++ + #define acpi_gbl_gpe_lock &_acpi_gbl_gpe_lock + #define acpi_gbl_hardware_lock &_acpi_gbl_hardware_lock + +Index: linux-2.6-tip/drivers/acpi/acpica/hwregs.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/acpica/hwregs.c ++++ linux-2.6-tip/drivers/acpi/acpica/hwregs.c +@@ -74,7 +74,7 @@ acpi_status acpi_hw_clear_acpi_status(vo + ACPI_BITMASK_ALL_FIXED_STATUS, + (u16) acpi_gbl_FADT.xpm1a_event_block.address)); + +- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); ++ spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); + + status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, + ACPI_BITMASK_ALL_FIXED_STATUS); +@@ -97,7 +97,7 @@ acpi_status acpi_hw_clear_acpi_status(vo + status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL); + + unlock_and_exit: +- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); ++ spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); + return_ACPI_STATUS(status); + } + +Index: linux-2.6-tip/drivers/acpi/acpica/hwxface.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/acpica/hwxface.c ++++ linux-2.6-tip/drivers/acpi/acpica/hwxface.c +@@ -313,9 +313,9 @@ acpi_status acpi_get_register(u32 
regist + acpi_status status; + acpi_cpu_flags flags; + +- flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); ++ spin_lock_irqsave(acpi_gbl_hardware_lock, flags); + status = acpi_get_register_unlocked(register_id, return_value); +- acpi_os_release_lock(acpi_gbl_hardware_lock, flags); ++ spin_unlock_irqrestore(acpi_gbl_hardware_lock, flags); + + return (status); + } +@@ -353,7 +353,7 @@ acpi_status acpi_set_register(u32 regist + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + +- lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); ++ spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); + + /* Always do a register read first so we can insert the new bits */ + +@@ -458,7 +458,7 @@ acpi_status acpi_set_register(u32 regist + + unlock_and_exit: + +- acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); ++ spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); + + /* Normalize the value that was read */ + +Index: linux-2.6-tip/drivers/acpi/acpica/utmutex.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/acpica/utmutex.c ++++ linux-2.6-tip/drivers/acpi/acpica/utmutex.c +@@ -117,7 +117,7 @@ void acpi_ut_mutex_terminate(void) + /* Delete the spinlocks */ + + acpi_os_delete_lock(acpi_gbl_gpe_lock); +- acpi_os_delete_lock(acpi_gbl_hardware_lock); ++// acpi_os_delete_lock(acpi_gbl_hardware_lock); + return_VOID; + } + +Index: linux-2.6-tip/drivers/acpi/ec.c +=================================================================== +--- linux-2.6-tip.orig/drivers/acpi/ec.c ++++ linux-2.6-tip/drivers/acpi/ec.c +@@ -563,8 +563,21 @@ static u32 acpi_ec_gpe_handler(void *dat + if (test_bit(EC_FLAGS_GPE_MODE, &ec->flags)) { + gpe_transaction(ec, status); + if (ec_transaction_done(ec) && +- (status & ACPI_EC_FLAG_IBF) == 0) ++ (status & ACPI_EC_FLAG_IBF) == 0) { ++#if 0 + wake_up(&ec->wait); ++#else ++ // hack ... ++ if (waitqueue_active(&ec->wait)) { ++ struct task_struct *task; ++ ++ task = list_entry(ec->wait.task_list.next, ++ wait_queue_t, task_list)->private; ++ if (task) ++ wake_up_process(task); ++ } ++#endif ++ } + } + + ec_check_sci(ec, status); +Index: linux-2.6-tip/ipc/mqueue.c +=================================================================== +--- linux-2.6-tip.orig/ipc/mqueue.c ++++ linux-2.6-tip/ipc/mqueue.c +@@ -787,12 +787,17 @@ static inline void pipelined_send(struct + struct msg_msg *message, + struct ext_wait_queue *receiver) + { ++ /* ++ * Keep them in one critical section for PREEMPT_RT: ++ */ ++ preempt_disable(); + receiver->msg = message; + list_del(&receiver->list); + receiver->state = STATE_PENDING; + wake_up_process(receiver->task); + smp_wmb(); + receiver->state = STATE_READY; ++ preempt_enable(); + } + + /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() +Index: linux-2.6-tip/ipc/msg.c +=================================================================== +--- linux-2.6-tip.orig/ipc/msg.c ++++ linux-2.6-tip/ipc/msg.c +@@ -259,12 +259,19 @@ static void expunge_all(struct msg_queue + while (tmp != &msq->q_receivers) { + struct msg_receiver *msr; + ++ /* ++ * Make sure that the wakeup doesnt preempt ++ * this CPU prematurely. 
(on PREEMPT_RT) ++ */ ++ preempt_disable(); ++ + msr = list_entry(tmp, struct msg_receiver, r_list); + tmp = tmp->next; + msr->r_msg = NULL; +- wake_up_process(msr->r_tsk); +- smp_mb(); ++ wake_up_process(msr->r_tsk); /* serializes */ + msr->r_msg = ERR_PTR(res); ++ ++ preempt_enable(); + } + } + +@@ -611,22 +618,28 @@ static inline int pipelined_send(struct + !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, + msr->r_msgtype, msr->r_mode)) { + ++ /* ++ * Make sure that the wakeup doesnt preempt ++ * this CPU prematurely. (on PREEMPT_RT) ++ */ ++ preempt_disable(); ++ + list_del(&msr->r_list); + if (msr->r_maxsize < msg->m_ts) { + msr->r_msg = NULL; +- wake_up_process(msr->r_tsk); +- smp_mb(); ++ wake_up_process(msr->r_tsk); /* serializes */ + msr->r_msg = ERR_PTR(-E2BIG); + } else { + msr->r_msg = NULL; + msq->q_lrpid = task_pid_vnr(msr->r_tsk); + msq->q_rtime = get_seconds(); +- wake_up_process(msr->r_tsk); +- smp_mb(); ++ wake_up_process(msr->r_tsk); /* serializes */ + msr->r_msg = msg; ++ preempt_enable(); + + return 1; + } ++ preempt_enable(); + } + } + return 0; +Index: linux-2.6-tip/ipc/sem.c +=================================================================== +--- linux-2.6-tip.orig/ipc/sem.c ++++ linux-2.6-tip/ipc/sem.c +@@ -415,6 +415,11 @@ static void update_queue (struct sem_arr + struct sem_queue *n; + + /* ++ * make sure that the wakeup doesnt preempt ++ * _this_ cpu prematurely. (on preempt_rt) ++ */ ++ preempt_disable(); ++ /* + * Continue scanning. The next operation + * that must be checked depends on the type of the + * completed operation: +@@ -450,6 +455,7 @@ static void update_queue (struct sem_arr + */ + smp_wmb(); + q->status = error; ++ preempt_enable(); + q = n; + } else { + q = list_entry(q->list.next, struct sem_queue, list); +Index: linux-2.6-tip/include/linux/pagevec.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/pagevec.h ++++ linux-2.6-tip/include/linux/pagevec.h +@@ -9,7 +9,7 @@ + #define _LINUX_PAGEVEC_H + + /* 14 pointers + two long's align the pagevec structure to a power of two */ +-#define PAGEVEC_SIZE 14 ++#define PAGEVEC_SIZE 8 + + struct page; + struct address_space; +Index: linux-2.6-tip/include/linux/vmstat.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/vmstat.h ++++ linux-2.6-tip/include/linux/vmstat.h +@@ -75,7 +75,12 @@ DECLARE_PER_CPU(struct vm_event_state, v + + static inline void __count_vm_event(enum vm_event_item item) + { ++#ifdef CONFIG_PREEMPT_RT ++ get_cpu_var(vm_event_states).event[item]++; ++ put_cpu(); ++#else + __get_cpu_var(vm_event_states).event[item]++; ++#endif + } + + static inline void count_vm_event(enum vm_event_item item) +@@ -86,7 +91,12 @@ static inline void count_vm_event(enum v + + static inline void __count_vm_events(enum vm_event_item item, long delta) + { ++#ifdef CONFIG_PREEMPT_RT ++ get_cpu_var(vm_event_states).event[item] += delta; ++ put_cpu(); ++#else + __get_cpu_var(vm_event_states).event[item] += delta; ++#endif + } + + static inline void count_vm_events(enum vm_event_item item, long delta) +Index: linux-2.6-tip/mm/bounce.c +=================================================================== +--- linux-2.6-tip.orig/mm/bounce.c ++++ linux-2.6-tip/mm/bounce.c +@@ -51,11 +51,11 @@ static void bounce_copy_vec(struct bio_v + unsigned long flags; + unsigned char *vto; + +- local_irq_save(flags); ++ local_irq_save_nort(flags); + vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); + memcpy(vto + 
to->bv_offset, vfrom, to->bv_len); + kunmap_atomic(vto, KM_BOUNCE_READ); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + + #else /* CONFIG_HIGHMEM */ +Index: linux-2.6-tip/mm/mmap.c +=================================================================== +--- linux-2.6-tip.orig/mm/mmap.c ++++ linux-2.6-tip/mm/mmap.c +@@ -1765,17 +1765,17 @@ static void unmap_region(struct mm_struc + unsigned long start, unsigned long end) + { + struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; +- struct mmu_gather *tlb; ++ struct mmu_gather tlb; + unsigned long nr_accounted = 0; + + lru_add_drain(); +- tlb = tlb_gather_mmu(mm, 0); ++ tlb_gather_mmu(&tlb, mm, 0); + update_hiwater_rss(mm); + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); +- free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, ++ free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + next? next->vm_start: 0); +- tlb_finish_mmu(tlb, start, end); ++ tlb_finish_mmu(&tlb, start, end); + } + + /* +@@ -1957,10 +1957,16 @@ SYSCALL_DEFINE2(munmap, unsigned long, a + static inline void verify_mm_writelocked(struct mm_struct *mm) + { + #ifdef CONFIG_DEBUG_VM +- if (unlikely(down_read_trylock(&mm->mmap_sem))) { ++# ifdef CONFIG_PREEMPT_RT ++ if (unlikely(!rt_rwsem_is_locked(&mm->mmap_sem))) { + WARN_ON(1); +- up_read(&mm->mmap_sem); + } ++# else ++ if (unlikely(down_read_trylock(&mm->mmap_sem))) { ++ WARN_ON(1); ++ up_read(&mm->mmap_sem); ++ } ++# endif + #endif + } + +@@ -2074,7 +2080,7 @@ EXPORT_SYMBOL(do_brk); + /* Release all mmaps. */ + void exit_mmap(struct mm_struct *mm) + { +- struct mmu_gather *tlb; ++ struct mmu_gather tlb; + struct vm_area_struct *vma; + unsigned long nr_accounted = 0; + unsigned long end; +@@ -2099,13 +2105,13 @@ void exit_mmap(struct mm_struct *mm) + + lru_add_drain(); + flush_cache_mm(mm); +- tlb = tlb_gather_mmu(mm, 1); ++ tlb_gather_mmu(&tlb, mm, 1); + /* update_hiwater_rss(mm) here? 
but nobody should be looking */ + /* Use -1 here to ensure all VMAs in the mm are unmapped */ + end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); + vm_unacct_memory(nr_accounted); +- free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); +- tlb_finish_mmu(tlb, 0, end); ++ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); ++ tlb_finish_mmu(&tlb, 0, end); + + /* + * Walk the list again, actually closing and freeing it, +Index: linux-2.6-tip/mm/vmstat.c +=================================================================== +--- linux-2.6-tip.orig/mm/vmstat.c ++++ linux-2.6-tip/mm/vmstat.c +@@ -153,10 +153,14 @@ static void refresh_zone_stat_thresholds + void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) + { +- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); +- s8 *p = pcp->vm_stat_diff + item; ++ struct per_cpu_pageset *pcp; ++ int cpu; + long x; ++ s8 *p; + ++ cpu = get_cpu(); ++ pcp = zone_pcp(zone, cpu); ++ p = pcp->vm_stat_diff + item; + x = delta + *p; + + if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { +@@ -164,6 +168,7 @@ void __mod_zone_page_state(struct zone * + x = 0; + } + *p = x; ++ put_cpu(); + } + EXPORT_SYMBOL(__mod_zone_page_state); + +@@ -206,9 +211,13 @@ EXPORT_SYMBOL(mod_zone_page_state); + */ + void __inc_zone_state(struct zone *zone, enum zone_stat_item item) + { +- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); +- s8 *p = pcp->vm_stat_diff + item; ++ struct per_cpu_pageset *pcp; ++ int cpu; ++ s8 *p; + ++ cpu = get_cpu(); ++ pcp = zone_pcp(zone, cpu); ++ p = pcp->vm_stat_diff + item; + (*p)++; + + if (unlikely(*p > pcp->stat_threshold)) { +@@ -217,18 +226,34 @@ void __inc_zone_state(struct zone *zone, + zone_page_state_add(*p + overstep, zone, item); + *p = -overstep; + } ++ put_cpu(); + } + + void __inc_zone_page_state(struct page *page, enum zone_stat_item item) + { ++#ifdef CONFIG_PREEMPT_RT ++ unsigned long flags; ++ struct zone *zone; ++ ++ zone = page_zone(page); ++ local_irq_save(flags); ++ __inc_zone_state(zone, item); ++ local_irq_restore(flags); ++#else + __inc_zone_state(page_zone(page), item); ++#endif + } + EXPORT_SYMBOL(__inc_zone_page_state); + + void __dec_zone_state(struct zone *zone, enum zone_stat_item item) + { +- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); +- s8 *p = pcp->vm_stat_diff + item; ++ struct per_cpu_pageset *pcp; ++ int cpu; ++ s8 *p; ++ ++ cpu = get_cpu(); ++ pcp = zone_pcp(zone, cpu); ++ p = pcp->vm_stat_diff + item; + + (*p)--; + +@@ -238,6 +263,7 @@ void __dec_zone_state(struct zone *zone, + zone_page_state_add(*p - overstep, zone, item); + *p = overstep; + } ++ put_cpu(); + } + + void __dec_zone_page_state(struct page *page, enum zone_stat_item item) +Index: linux-2.6-tip/drivers/block/paride/pseudo.h +=================================================================== +--- linux-2.6-tip.orig/drivers/block/paride/pseudo.h ++++ linux-2.6-tip/drivers/block/paride/pseudo.h +@@ -43,7 +43,7 @@ static unsigned long ps_timeout; + static int ps_tq_active = 0; + static int ps_nice = 0; + +-static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); ++static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); + + static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int); + +Index: linux-2.6-tip/drivers/video/console/fbcon.c +=================================================================== +--- linux-2.6-tip.orig/drivers/video/console/fbcon.c ++++ linux-2.6-tip/drivers/video/console/fbcon.c +@@ -1203,7 +1203,6 @@ static void 
fbcon_clear(struct vc_data * + { + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; +- + struct display *p = &fb_display[vc->vc_num]; + u_int y_break; + +@@ -1235,10 +1234,11 @@ static void fbcon_putcs(struct vc_data * + struct display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + +- if (!fbcon_is_inactive(vc, info)) ++ if (!fbcon_is_inactive(vc, info)) { + ops->putcs(vc, info, s, count, real_y(p, ypos), xpos, + get_color(vc, info, scr_readw(s), 1), + get_color(vc, info, scr_readw(s), 0)); ++ } + } + + static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos) +@@ -3225,6 +3225,7 @@ static const struct consw fb_con = { + .con_screen_pos = fbcon_screen_pos, + .con_getxy = fbcon_getxy, + .con_resize = fbcon_resize, ++ .con_preemptible = 1, + }; + + static struct notifier_block fbcon_event_notifier = { +Index: linux-2.6-tip/include/linux/console.h +=================================================================== +--- linux-2.6-tip.orig/include/linux/console.h ++++ linux-2.6-tip/include/linux/console.h +@@ -55,6 +55,7 @@ struct consw { + void (*con_invert_region)(struct vc_data *, u16 *, int); + u16 *(*con_screen_pos)(struct vc_data *, int); + unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *); ++ int con_preemptible; // can it reschedule from within printk? + }; + + extern const struct consw *conswitchp; +@@ -92,6 +93,17 @@ void give_up_console(const struct consw + #define CON_BOOT (8) + #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ + #define CON_BRL (32) /* Used for a braille device */ ++#define CON_ATOMIC (64) /* Safe to call in PREEMPT_RT atomic */ ++ ++#ifdef CONFIG_PREEMPT_RT ++# define console_atomic_safe(con) \ ++ (((con)->flags & CON_ATOMIC) || \ ++ (!in_atomic() && !irqs_disabled()) || \ ++ (system_state != SYSTEM_RUNNING) || \ ++ oops_in_progress) ++#else ++# define console_atomic_safe(con) (1) ++#endif + + struct console { + char name[16]; +Index: linux-2.6-tip/drivers/ide/alim15x3.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ide/alim15x3.c ++++ linux-2.6-tip/drivers/ide/alim15x3.c +@@ -90,7 +90,7 @@ static void ali_set_pio_mode(ide_drive_t + if (r_clc >= 16) + r_clc = 0; + } +- local_irq_save(flags); ++ local_irq_save_nort(flags); + + /* + * PIO mode => ATA FIFO on, ATAPI FIFO off +@@ -112,7 +112,7 @@ static void ali_set_pio_mode(ide_drive_t + + pci_write_config_byte(dev, port, s_clc); + pci_write_config_byte(dev, port + unit + 2, (a_clc << 4) | r_clc); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + + /** +@@ -222,7 +222,7 @@ static unsigned int init_chipset_ali15x3 + + isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL); + +- local_irq_save(flags); ++ local_irq_save_nort(flags); + + if (m5229_revision < 0xC2) { + /* +@@ -313,7 +313,7 @@ out: + } + pci_dev_put(north); + pci_dev_put(isa_dev); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + return 0; + } + +@@ -375,7 +375,7 @@ static u8 ali_cable_detect(ide_hwif_t *h + unsigned long flags; + u8 cbl = ATA_CBL_PATA40, tmpbyte; + +- local_irq_save(flags); ++ local_irq_save_nort(flags); + + if (m5229_revision >= 0xC2) { + /* +@@ -396,7 +396,7 @@ static u8 ali_cable_detect(ide_hwif_t *h + } + } + +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + + return cbl; + } +Index: linux-2.6-tip/drivers/ide/hpt366.c 
+=================================================================== +--- linux-2.6-tip.orig/drivers/ide/hpt366.c ++++ linux-2.6-tip/drivers/ide/hpt366.c +@@ -1330,7 +1330,7 @@ static int __devinit init_dma_hpt366(ide + + dma_old = inb(base + 2); + +- local_irq_save(flags); ++ local_irq_save_nort(flags); + + dma_new = dma_old; + pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma); +@@ -1341,7 +1341,7 @@ static int __devinit init_dma_hpt366(ide + if (dma_new != dma_old) + outb(dma_new, base + 2); + +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + + printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", + hwif->name, base, base + 7); +Index: linux-2.6-tip/drivers/ide/ide-io.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ide/ide-io.c ++++ linux-2.6-tip/drivers/ide/ide-io.c +@@ -948,7 +948,7 @@ void ide_timer_expiry (unsigned long dat + /* disable_irq_nosync ?? */ + disable_irq(hwif->irq); + /* local CPU only, as if we were handling an interrupt */ +- local_irq_disable(); ++ local_irq_disable_nort(); + if (hwif->polling) { + startstop = handler(drive); + } else if (drive_is_ready(drive)) { +Index: linux-2.6-tip/drivers/ide/ide-iops.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ide/ide-iops.c ++++ linux-2.6-tip/drivers/ide/ide-iops.c +@@ -275,7 +275,7 @@ void ide_input_data(ide_drive_t *drive, + unsigned long uninitialized_var(flags); + + if ((io_32bit & 2) && !mmio) { +- local_irq_save(flags); ++ local_irq_save_nort(flags); + ata_vlb_sync(io_ports->nsect_addr); + } + +@@ -285,7 +285,7 @@ void ide_input_data(ide_drive_t *drive, + insl(data_addr, buf, len / 4); + + if ((io_32bit & 2) && !mmio) +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + + if ((len & 3) >= 2) { + if (mmio) +@@ -321,7 +321,7 @@ void ide_output_data(ide_drive_t *drive, + unsigned long uninitialized_var(flags); + + if ((io_32bit & 2) && !mmio) { +- local_irq_save(flags); ++ local_irq_save_nort(flags); + ata_vlb_sync(io_ports->nsect_addr); + } + +@@ -331,7 +331,7 @@ void ide_output_data(ide_drive_t *drive, + outsl(data_addr, buf, len / 4); + + if ((io_32bit & 2) && !mmio) +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + + if ((len & 3) >= 2) { + if (mmio) +@@ -509,12 +509,12 @@ static int __ide_wait_stat(ide_drive_t * + if ((stat & ATA_BUSY) == 0) + break; + +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + *rstat = stat; + return -EBUSY; + } + } +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + } + /* + * Allow status to settle, then read it again. 
+@@ -694,17 +694,17 @@ int ide_driveid_update(ide_drive_t *driv + printk("%s: CHECK for good STATUS\n", drive->name); + return 0; + } +- local_irq_save(flags); ++ local_irq_save_nort(flags); + SELECT_MASK(drive, 0); + id = kmalloc(SECTOR_SIZE, GFP_ATOMIC); + if (!id) { +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + return 0; + } + tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); + (void)tp_ops->read_status(hwif); /* clear drive IRQ */ +- local_irq_enable(); +- local_irq_restore(flags); ++ local_irq_enable_nort(); ++ local_irq_restore_nort(flags); + ide_fix_driveid(id); + + drive->id[ATA_ID_UDMA_MODES] = id[ATA_ID_UDMA_MODES]; +Index: linux-2.6-tip/drivers/ide/ide-probe.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ide/ide-probe.c ++++ linux-2.6-tip/drivers/ide/ide-probe.c +@@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *dri + int bswap = 1; + + /* local CPU only; some systems need this */ +- local_irq_save(flags); ++ local_irq_save_nort(flags); + /* read 512 bytes of id info */ + hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + + drive->dev_flags |= IDE_DFLAG_ID_READ; + #ifdef DEBUG +@@ -813,7 +813,7 @@ static int ide_probe_port(ide_hwif_t *hw + rc = 0; + } + +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + + /* + * Use cached IRQ number. It might be (and is...) changed by probe +Index: linux-2.6-tip/drivers/ide/ide-taskfile.c +=================================================================== +--- linux-2.6-tip.orig/drivers/ide/ide-taskfile.c ++++ linux-2.6-tip/drivers/ide/ide-taskfile.c +@@ -219,7 +219,7 @@ static void ide_pio_sector(ide_drive_t * + offset %= PAGE_SIZE; + + #ifdef CONFIG_HIGHMEM +- local_irq_save(flags); ++ local_irq_save_nort(flags); + #endif + buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; + +@@ -239,7 +239,7 @@ static void ide_pio_sector(ide_drive_t * + + kunmap_atomic(buf, KM_BIO_SRC_IRQ); + #ifdef CONFIG_HIGHMEM +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + #endif + } + +@@ -430,7 +430,7 @@ static ide_startstop_t pre_task_out_intr + } + + if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0) +- local_irq_disable(); ++ local_irq_disable_nort(); + + ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); + ide_pio_datablock(drive, rq, 1); +Index: linux-2.6-tip/drivers/input/gameport/gameport.c +=================================================================== +--- linux-2.6-tip.orig/drivers/input/gameport/gameport.c ++++ linux-2.6-tip/drivers/input/gameport/gameport.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include /* HZ */ + #include + #include +@@ -97,12 +98,12 @@ static int gameport_measure_speed(struct + tx = 1 << 30; + + for(i = 0; i < 50; i++) { +- local_irq_save(flags); ++ local_irq_save_nort(flags); + GET_TIME(t1); + for (t = 0; t < 50; t++) gameport_read(gameport); + GET_TIME(t2); + GET_TIME(t3); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + udelay(i * 10); + if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; + } +@@ -121,11 +122,11 @@ static int gameport_measure_speed(struct + tx = 1 << 30; + + for(i = 0; i < 50; i++) { +- local_irq_save(flags); ++ local_irq_save_nort(flags); + rdtscl(t1); + for (t = 0; t < 50; t++) gameport_read(gameport); + rdtscl(t2); +- local_irq_restore(flags); ++ local_irq_restore_nort(flags); + udelay(i * 10); + if (t2 - t1 < tx) tx = t2 - t1; + } +Index: 
linux-2.6-tip/drivers/net/tulip/tulip_core.c +=================================================================== +--- linux-2.6-tip.orig/drivers/net/tulip/tulip_core.c ++++ linux-2.6-tip/drivers/net/tulip/tulip_core.c +@@ -1814,6 +1814,7 @@ static void __devexit tulip_remove_one ( + pci_iounmap(pdev, tp->base_addr); + free_netdev (dev); + pci_release_regions (pdev); ++ pci_disable_device (pdev); + pci_set_drvdata (pdev, NULL); + + /* pci_power_off (pdev, -1); */ +Index: linux-2.6-tip/lib/ratelimit.c +=================================================================== +--- linux-2.6-tip.orig/lib/ratelimit.c ++++ linux-2.6-tip/lib/ratelimit.c +@@ -14,7 +14,7 @@ + #include + #include + +-static DEFINE_SPINLOCK(ratelimit_lock); ++static DEFINE_RAW_SPINLOCK(ratelimit_lock); + + /* + * __ratelimit - rate limiting +Index: linux-2.6-tip/drivers/oprofile/oprofilefs.c +=================================================================== +--- linux-2.6-tip.orig/drivers/oprofile/oprofilefs.c ++++ linux-2.6-tip/drivers/oprofile/oprofilefs.c +@@ -21,7 +21,7 @@ + + #define OPROFILEFS_MAGIC 0x6f70726f + +-DEFINE_SPINLOCK(oprofilefs_lock); ++DEFINE_RAW_SPINLOCK(oprofilefs_lock); + + static struct inode *oprofilefs_get_inode(struct super_block *sb, int mode) + { +Index: linux-2.6-tip/drivers/pci/access.c +=================================================================== +--- linux-2.6-tip.orig/drivers/pci/access.c ++++ linux-2.6-tip/drivers/pci/access.c +@@ -12,7 +12,7 @@ + * configuration space. + */ + +-static DEFINE_SPINLOCK(pci_lock); ++static DEFINE_RAW_SPINLOCK(pci_lock); + + /* + * Wrappers for all PCI configuration access functions. They just check +Index: linux-2.6-tip/drivers/video/console/vgacon.c +=================================================================== +--- linux-2.6-tip.orig/drivers/video/console/vgacon.c ++++ linux-2.6-tip/drivers/video/console/vgacon.c +@@ -51,7 +51,7 @@ + #include