diff -Nru xen-4.6.5/debian/changelog xen-4.6.5/debian/changelog --- xen-4.6.5/debian/changelog 2017-07-04 09:35:50.000000000 +0000 +++ xen-4.6.5/debian/changelog 2017-10-11 14:35:28.000000000 +0000 @@ -1,3 +1,63 @@ +xen (4.6.5-0ubuntu1.4) xenial-security; urgency=medium + + * Applying Xen Security Advisories: + - CVE-2017-14316 / XSA-231 + - xen/mm: make sure node is less than MAX_NUMNODES + - CVE-2017-14318 / XSA-232 + - grant_table: fix GNTTABOP_cache_flush handling + - CVE-2017-14317 / XSA-233 + - tools/xenstore: dont unlink connection object twice + - CVE-2017-14319 / XSA-234 + - gnttab: also validate PTE permissions upon destroy/replace + - XSA-235 + - arm/mm: release grant lock on xenmem_add_to_physmap_one() error paths + - XSA-237 + - x86: don't allow MSI pIRQ mapping on unowned device + - x86: enforce proper privilege when (un)mapping pIRQ-s + - x86/MSI: disallow redundant enabling + - x86/IRQ: conditionally preserve irq <-> pirq mapping on map error + paths + - x86/FLASK: fix unmap-domain-IRQ XSM hook + - XSA-238 + - x86/ioreq server: correctly handle bogus + XEN_DMOP_{,un}map_io_range_to_ioreq_server arguments + - XSA-239 + - x86/HVM: prefill partially used variable on emulation paths + - XSA-240 + - x86: limit linear page table use to a single level + - x86/mm: Disable PV linear pagetables by default + - XSA-241 + - x86: don't store possibly stale TLB flush time stamp + - XSA-242 + - x86: don't allow page_unlock() to drop the last type reference + - XSA-243 + - x86: Disable the use of auto-translated PV guestsx86: Disable the use + of auto-translated PV guests + - x86/shadow: Don't create self-linear shadow mappings for 4-level + translated guests + - XSA-244 + - x86/cpu: Fix IST handling during PCPU bringup + - XSA-245 + - xen/page_alloc: Cover memory unreserved after boot in first_valid_mfn + - xen/arm: Correctly report the memory region in the dummy NUMA helpers + + -- Stefan Bader Wed, 11 Oct 2017 15:41:03 +0200 + +xen (4.6.5-0ubuntu1.3) xenial-security; urgency=medium + + * Applying Xen Security Advisories: + - XSA-226 / CVE-2017-12135 + - gnttab: don't use possibly unbounded tail calls + - gnttab: fix transitive grant handling + - XSA-227 / CVE-2017-12137 + - x86/grant: Disallow misaligned PTEs + - XSA-228 / CVE-2017-12136 + - gnttab: split maptrack lock to make it fulfill its purpose again + - XSA-230 / CVE-2017-12855 + - gnttab: correct pin status fixup for copy + + -- Stefan Bader Mon, 21 Aug 2017 16:07:38 +0200 + xen (4.6.5-0ubuntu1.2) xenial-security; urgency=low * Applying Xen Security Advisories: diff -Nru xen-4.6.5/debian/patches/series xen-4.6.5/debian/patches/series --- xen-4.6.5/debian/patches/series 2017-07-04 09:32:26.000000000 +0000 +++ xen-4.6.5/debian/patches/series 2017-10-11 13:59:37.000000000 +0000 @@ -78,3 +78,29 @@ xsa224-4.6-0003-gnttab-correct-logic-to-get-page-references-during-m.patch xsa224-4.6-0004-gnttab-__gnttab_unmap_common_complete-is-all-or-noth.patch xsa225.patch +xsa226-4.6-0001-gnttab-dont-use-possibly-unbounded-tail-calls.patch +xsa226-4.6-0002-gnttab-fix-transitive-grant-handling.patch +xsa227-4.6.patch +xsa228-4.8.patch +xsa230.patch +xsa231-4.7.patch +xsa232.patch +xsa233.patch +xsa234-4.6.patch +xsa235-4.6.patch +xsa237-4.6-0001-x86-dont-allow-MSI-pIRQ-mapping-on-unowned-device.patch +xsa237-4.6-0002-x86-enforce-proper-privilege-when-mapping-pIRQ-s.patch +xsa237-4.6-0003-x86-MSI-disallow-redundant-enabling.patch +xsa237-4.6-0004-x86-IRQ-conditionally-preserve-irq-pirq-mapping-on-error.patch 
+xsa237-4.6-0005-x86-FLASK-fix-unmap-domain-IRQ-XSM-hook.patch +xsa238-4.6.patch +xsa239.patch +xsa240-4.6-0001-x86-limit-linear-page-table-use-to-a-single-level.patch +xsa240-4.6-0002-x86-mm-Disable-PV-linear-pagetables-by-default.patch +xsa241-4.9.patch +xsa242-4.9.patch +xsa243-4.6-1.patch +xsa243-4.6-2.patch +xsa244-4.6.patch +xsa245-0001-xen-page_alloc-Cover-memory-unreserved-after-boot-in.patch +xsa245-0002-xen-arm-Correctly-report-the-memory-region-in-the-du.patch diff -Nru xen-4.6.5/debian/patches/xsa226-4.6-0001-gnttab-dont-use-possibly-unbounded-tail-calls.patch xen-4.6.5/debian/patches/xsa226-4.6-0001-gnttab-dont-use-possibly-unbounded-tail-calls.patch --- xen-4.6.5/debian/patches/xsa226-4.6-0001-gnttab-dont-use-possibly-unbounded-tail-calls.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa226-4.6-0001-gnttab-dont-use-possibly-unbounded-tail-calls.patch 2017-08-21 14:08:13.000000000 +0000 @@ -0,0 +1,134 @@ +From: Jan Beulich +Subject: gnttab: don't use possibly unbounded tail calls + +There is no guarantee that the compiler would actually translate them +to branches instead of calls, so only ones with a known recursion limit +are okay: +- __release_grant_for_copy() can call itself only once, as + __acquire_grant_for_copy() won't permit use of multi-level transitive + grants, +- __acquire_grant_for_copy() is fine to call itself with the last + argument false, as that prevents further recursion, +- __acquire_grant_for_copy() must not call itself to recover from an + observed change to the active entry's pin count + +This is part of CVE-2017-12135 / XSA-226. + +Signed-off-by: Jan Beulich + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -2089,8 +2089,10 @@ __release_grant_for_copy( + + if ( td != rd ) + { +- /* Recursive calls, but they're tail calls, so it's +- okay. */ ++ /* ++ * Recursive calls, but they're bounded (acquire permits only a single ++ * level of transitivity), so it's okay. ++ */ + if ( released_write ) + __release_grant_for_copy(td, trans_gref, 0); + else if ( released_read ) +@@ -2241,10 +2243,11 @@ __acquire_grant_for_copy( + return rc; + } + +- /* We dropped the lock, so we have to check that nobody +- else tried to pin (or, for that matter, unpin) the +- reference in *this* domain. If they did, just give up +- and try again. */ ++ /* ++ * We dropped the lock, so we have to check that nobody else tried ++ * to pin (or, for that matter, unpin) the reference in *this* ++ * domain. If they did, just give up and tell the caller to retry. 
++ */ + if ( act->pin != old_pin ) + { + __fixup_status_for_copy_pin(act, status); +@@ -2252,9 +2255,8 @@ __acquire_grant_for_copy( + active_entry_release(act); + read_unlock(&rgt->lock); + put_page(*page); +- return __acquire_grant_for_copy(rd, gref, ldom, readonly, +- frame, page, page_off, length, +- allow_transitive); ++ *page = NULL; ++ return ERESTART; + } + + /* The actual remote remote grant may or may not be a +@@ -2560,7 +2562,7 @@ static int gnttab_copy_one(const struct + { + gnttab_copy_release_buf(src); + rc = gnttab_copy_claim_buf(op, &op->source, src, GNTCOPY_source_gref); +- if ( rc < 0 ) ++ if ( rc ) + goto out; + } + +@@ -2570,7 +2572,7 @@ static int gnttab_copy_one(const struct + { + gnttab_copy_release_buf(dest); + rc = gnttab_copy_claim_buf(op, &op->dest, dest, GNTCOPY_dest_gref); +- if ( rc < 0 ) ++ if ( rc ) + goto out; + } + +@@ -2579,6 +2581,14 @@ static int gnttab_copy_one(const struct + return rc; + } + ++/* ++ * gnttab_copy(), other than the various other helpers of ++ * do_grant_table_op(), returns (besides possible error indicators) ++ * "count - i" rather than "i" to ensure that even if no progress ++ * was made at all (perhaps due to gnttab_copy_one() returning a ++ * positive value) a non-zero value is being handed back (zero needs ++ * to be avoided, as that means "success, all done"). ++ */ + static long gnttab_copy( + XEN_GUEST_HANDLE_PARAM(gnttab_copy_t) uop, unsigned int count) + { +@@ -2592,7 +2602,7 @@ static long gnttab_copy( + { + if ( i && hypercall_preempt_check() ) + { +- rc = i; ++ rc = count - i; + break; + } + +@@ -2602,13 +2612,20 @@ static long gnttab_copy( + break; + } + +- op.status = gnttab_copy_one(&op, &dest, &src); +- if ( op.status != GNTST_okay ) ++ rc = gnttab_copy_one(&op, &dest, &src); ++ if ( rc > 0 ) ++ { ++ rc = count - i; ++ break; ++ } ++ if ( rc != GNTST_okay ) + { + gnttab_copy_release_buf(&src); + gnttab_copy_release_buf(&dest); + } + ++ op.status = rc; ++ rc = 0; + if ( unlikely(__copy_field_to_guest(uop, &op, status)) ) + { + rc = -EFAULT; +@@ -3146,6 +3163,7 @@ do_grant_table_op( + rc = gnttab_copy(copy, count); + if ( rc > 0 ) + { ++ rc = count - rc; + guest_handle_add_offset(copy, rc); + uop = guest_handle_cast(copy, void); + } diff -Nru xen-4.6.5/debian/patches/xsa226-4.6-0002-gnttab-fix-transitive-grant-handling.patch xen-4.6.5/debian/patches/xsa226-4.6-0002-gnttab-fix-transitive-grant-handling.patch --- xen-4.6.5/debian/patches/xsa226-4.6-0002-gnttab-fix-transitive-grant-handling.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa226-4.6-0002-gnttab-fix-transitive-grant-handling.patch 2017-08-21 14:08:24.000000000 +0000 @@ -0,0 +1,280 @@ +From: Jan Beulich +Subject: gnttab: fix transitive grant handling + +Processing of transitive grants must not use the fast path, or else +reference counting breaks due to the skipped recursive call to +__acquire_grant_for_copy() (its __release_grant_for_copy() +counterpart occurs independent of original pin count). Furthermore +after re-acquiring temporarily dropped locks we need to verify no grant +properties changed if the original pin count was non-zero; checking +just the pin counts is sufficient only for well-behaved guests. As a +result, __release_grant_for_copy() needs to mirror that new behavior. + +Furthermore a __release_grant_for_copy() invocation was missing on the +retry path of __acquire_grant_for_copy(), and gnttab_set_version() also +needs to bail out upon encountering a transitive grant. + +This is part of CVE-2017-12135 / XSA-226. 
+ +Reported-by: Andrew Cooper +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -2036,13 +2036,8 @@ __release_grant_for_copy( + unsigned long r_frame; + uint16_t *status; + grant_ref_t trans_gref; +- int released_read; +- int released_write; + struct domain *td; + +- released_read = 0; +- released_write = 0; +- + read_lock(&rgt->lock); + + act = active_entry_acquire(rgt, gref); +@@ -2072,17 +2067,11 @@ __release_grant_for_copy( + + act->pin -= GNTPIN_hstw_inc; + if ( !(act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) ) +- { +- released_write = 1; + gnttab_clear_flag(_GTF_writing, status); +- } + } + + if ( !act->pin ) +- { + gnttab_clear_flag(_GTF_reading, status); +- released_read = 1; +- } + + active_entry_release(act); + read_unlock(&rgt->lock); +@@ -2090,13 +2079,10 @@ __release_grant_for_copy( + if ( td != rd ) + { + /* +- * Recursive calls, but they're bounded (acquire permits only a single ++ * Recursive call, but it is bounded (acquire permits only a single + * level of transitivity), so it's okay. + */ +- if ( released_write ) +- __release_grant_for_copy(td, trans_gref, 0); +- else if ( released_read ) +- __release_grant_for_copy(td, trans_gref, 1); ++ __release_grant_for_copy(td, trans_gref, readonly); + + rcu_unlock_domain(td); + } +@@ -2170,8 +2156,108 @@ __acquire_grant_for_copy( + act->domid, ldom, act->pin); + + old_pin = act->pin; +- if ( !act->pin || +- (!readonly && !(act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask))) ) ++ if ( sha2 && (shah->flags & GTF_type_mask) == GTF_transitive ) ++ { ++ if ( (!old_pin || (!readonly && ++ !(old_pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)))) && ++ (rc = _set_status_v2(ldom, readonly, 0, shah, act, ++ status)) != GNTST_okay ) ++ goto unlock_out; ++ ++ if ( !allow_transitive ) ++ PIN_FAIL(unlock_out_clear, GNTST_general_error, ++ "transitive grant when transitivity not allowed\n"); ++ ++ trans_domid = sha2->transitive.trans_domid; ++ trans_gref = sha2->transitive.gref; ++ barrier(); /* Stop the compiler from re-loading ++ trans_domid from shared memory */ ++ if ( trans_domid == rd->domain_id ) ++ PIN_FAIL(unlock_out_clear, GNTST_general_error, ++ "transitive grants cannot be self-referential\n"); ++ ++ /* ++ * We allow the trans_domid == ldom case, which corresponds to a ++ * grant being issued by one domain, sent to another one, and then ++ * transitively granted back to the original domain. Allowing it ++ * is easy, and means that you don't need to go out of your way to ++ * avoid it in the guest. ++ */ ++ ++ /* We need to leave the rrd locked during the grant copy. */ ++ td = rcu_lock_domain_by_id(trans_domid); ++ if ( td == NULL ) ++ PIN_FAIL(unlock_out_clear, GNTST_general_error, ++ "transitive grant referenced bad domain %d\n", ++ trans_domid); ++ ++ /* ++ * __acquire_grant_for_copy() could take the lock on the ++ * remote table (if rd == td), so we have to drop the lock ++ * here and reacquire. 
++ */ ++ active_entry_release(act); ++ read_unlock(&rgt->lock); ++ ++ rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id, ++ readonly, &grant_frame, page, ++ &trans_page_off, &trans_length, 0); ++ ++ read_lock(&rgt->lock); ++ act = active_entry_acquire(rgt, gref); ++ ++ if ( rc != GNTST_okay ) ++ { ++ __fixup_status_for_copy_pin(act, status); ++ rcu_unlock_domain(td); ++ active_entry_release(act); ++ read_unlock(&rgt->lock); ++ return rc; ++ } ++ ++ /* ++ * We dropped the lock, so we have to check that the grant didn't ++ * change, and that nobody else tried to pin/unpin it. If anything ++ * changed, just give up and tell the caller to retry. ++ */ ++ if ( rgt->gt_version != 2 || ++ act->pin != old_pin || ++ (old_pin && (act->domid != ldom || act->frame != grant_frame || ++ act->start != trans_page_off || ++ act->length != trans_length || ++ act->trans_domain != td || ++ act->trans_gref != trans_gref || ++ !act->is_sub_page)) ) ++ { ++ __release_grant_for_copy(td, trans_gref, readonly); ++ __fixup_status_for_copy_pin(act, status); ++ rcu_unlock_domain(td); ++ active_entry_release(act); ++ read_unlock(&rgt->lock); ++ put_page(*page); ++ *page = NULL; ++ return ERESTART; ++ } ++ ++ if ( !old_pin ) ++ { ++ act->domid = ldom; ++ act->start = trans_page_off; ++ act->length = trans_length; ++ act->trans_domain = td; ++ act->trans_gref = trans_gref; ++ act->frame = grant_frame; ++ act->gfn = -1ul; ++ /* ++ * The actual remote remote grant may or may not be a sub-page, ++ * but we always treat it as one because that blocks mappings of ++ * transitive grants. ++ */ ++ act->is_sub_page = 1; ++ } ++ } ++ else if ( !old_pin || ++ (!readonly && !(old_pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask))) ) + { + if ( (rc = _set_status(rgt->gt_version, ldom, + readonly, 0, shah, act, +@@ -2192,79 +2278,6 @@ __acquire_grant_for_copy( + trans_page_off = 0; + trans_length = PAGE_SIZE; + } +- else if ( (shah->flags & GTF_type_mask) == GTF_transitive ) +- { +- if ( !allow_transitive ) +- PIN_FAIL(unlock_out_clear, GNTST_general_error, +- "transitive grant when transitivity not allowed\n"); +- +- trans_domid = sha2->transitive.trans_domid; +- trans_gref = sha2->transitive.gref; +- barrier(); /* Stop the compiler from re-loading +- trans_domid from shared memory */ +- if ( trans_domid == rd->domain_id ) +- PIN_FAIL(unlock_out_clear, GNTST_general_error, +- "transitive grants cannot be self-referential\n"); +- +- /* We allow the trans_domid == ldom case, which +- corresponds to a grant being issued by one domain, sent +- to another one, and then transitively granted back to +- the original domain. Allowing it is easy, and means +- that you don't need to go out of your way to avoid it +- in the guest. 
*/ +- +- /* We need to leave the rrd locked during the grant copy */ +- td = rcu_lock_domain_by_id(trans_domid); +- if ( td == NULL ) +- PIN_FAIL(unlock_out_clear, GNTST_general_error, +- "transitive grant referenced bad domain %d\n", +- trans_domid); +- +- /* +- * __acquire_grant_for_copy() could take the lock on the +- * remote table (if rd == td), so we have to drop the lock +- * here and reacquire +- */ +- active_entry_release(act); +- read_unlock(&rgt->lock); +- +- rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id, +- readonly, &grant_frame, page, +- &trans_page_off, &trans_length, 0); +- +- read_lock(&rgt->lock); +- act = active_entry_acquire(rgt, gref); +- +- if ( rc != GNTST_okay ) { +- __fixup_status_for_copy_pin(act, status); +- rcu_unlock_domain(td); +- active_entry_release(act); +- read_unlock(&rgt->lock); +- return rc; +- } +- +- /* +- * We dropped the lock, so we have to check that nobody else tried +- * to pin (or, for that matter, unpin) the reference in *this* +- * domain. If they did, just give up and tell the caller to retry. +- */ +- if ( act->pin != old_pin ) +- { +- __fixup_status_for_copy_pin(act, status); +- rcu_unlock_domain(td); +- active_entry_release(act); +- read_unlock(&rgt->lock); +- put_page(*page); +- *page = NULL; +- return ERESTART; +- } +- +- /* The actual remote remote grant may or may not be a +- sub-page, but we always treat it as one because that +- blocks mappings of transitive grants. */ +- is_sub_page = 1; +- act->gfn = -1ul; +- } + else if ( !(sha2->hdr.flags & GTF_sub_page) ) + { + rc = __get_paged_frame(sha2->full_page.frame, &grant_frame, page, readonly, rd); +@@ -2696,10 +2709,13 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA + case 2: + for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ ) + { +- if ( ((shared_entry_v2(gt, i).hdr.flags & GTF_type_mask) == +- GTF_permit_access) && +- (shared_entry_v2(gt, i).full_page.frame >> 32) ) ++ switch ( shared_entry_v2(gt, i).hdr.flags & GTF_type_mask ) + { ++ case GTF_permit_access: ++ if ( !(shared_entry_v2(gt, i).full_page.frame >> 32) ) ++ break; ++ /* fall through */ ++ case GTF_transitive: + gdprintk(XENLOG_WARNING, + "tried to change grant table version to 1 with non-representable entries\n"); + res = -ERANGE; diff -Nru xen-4.6.5/debian/patches/xsa227-4.6.patch xen-4.6.5/debian/patches/xsa227-4.6.patch --- xen-4.6.5/debian/patches/xsa227-4.6.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa227-4.6.patch 2017-08-21 14:08:40.000000000 +0000 @@ -0,0 +1,66 @@ +From 697edc414352e89f29ca3de744a76c1625c0466c Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 20 Jun 2017 19:18:54 +0100 +Subject: [PATCH] x86/grant: Disallow misaligned PTEs + +Pagetable entries must be aligned to function correctly. Disallow attempts +from the guest to have a grant PTE created at a misaligned address, which +would result in corruption of the L1 table with largely-guest-controlled +values. 
+ +This is XSA-227 + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +--- + xen/arch/x86/mm.c | 13 +++++++++++++ + xen/include/xen/config.h | 2 ++ + 2 files changed, 15 insertions(+) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 213b52a..3bf728b 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -3878,6 +3878,9 @@ static int create_grant_pte_mapping( + l1_pgentry_t ol1e; + struct domain *d = v->domain; + ++ if ( !IS_ALIGNED(pte_addr, sizeof(nl1e)) ) ++ return GNTST_general_error; ++ + adjust_guest_l1e(nl1e, d); + + gmfn = pte_addr >> PAGE_SHIFT; +@@ -3935,6 +3938,16 @@ static int destroy_grant_pte_mapping( + struct page_info *page; + l1_pgentry_t ol1e; + ++ /* ++ * addr comes from Xen's active_entry tracking so isn't guest controlled, ++ * but it had still better be PTE-aligned. ++ */ ++ if ( !IS_ALIGNED(addr, sizeof(ol1e)) ) ++ { ++ ASSERT_UNREACHABLE(); ++ return GNTST_general_error; ++ } ++ + gmfn = addr >> PAGE_SHIFT; + page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC); + +diff --git a/xen/include/xen/config.h b/xen/include/xen/config.h +index f7258c7..ded8156 100644 +--- a/xen/include/xen/config.h ++++ b/xen/include/xen/config.h +@@ -72,6 +72,8 @@ + #define MB(_mb) (_AC(_mb, ULL) << 20) + #define GB(_gb) (_AC(_gb, ULL) << 30) + ++#define IS_ALIGNED(val, align) (((val) & ((align) - 1)) == 0) ++ + #define __STR(...) #__VA_ARGS__ + #define STR(...) __STR(__VA_ARGS__) + +-- +2.1.4 + diff -Nru xen-4.6.5/debian/patches/xsa228-4.8.patch xen-4.6.5/debian/patches/xsa228-4.8.patch --- xen-4.6.5/debian/patches/xsa228-4.8.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa228-4.8.patch 2017-08-21 14:08:57.000000000 +0000 @@ -0,0 +1,198 @@ +From cb91f4c43bd4158daa6561c73921a6455176f278 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Mon, 31 Jul 2017 15:17:56 +0100 +Subject: [PATCH] gnttab: split maptrack lock to make it fulfill its purpose + again + +The way the lock is currently being used in get_maptrack_handle(), it +protects only the maptrack limit: The function acts on current's list +only, so races on list accesses are impossible even without the lock. + +Otoh list access races are possible between __get_maptrack_handle() and +put_maptrack_handle(), due to the invocation of the former for other +than current from steal_maptrack_handle(). Introduce a per-vCPU lock +for list accesses to become race free again. This lock will be +uncontended except when it becomes necessary to take the steal path, +i.e. in the common case there should be no meaningful performance +impact. + +When in get_maptrack_handle adds a stolen entry to a fresh, empty, +freelist, we think that there is probably no concurrency. However, +this is not a fast path and adding the locking there makes the code +clearly correct. + +Also, while we are here: the stolen maptrack_entry's tail pointer was +not properly set. Set it. + +This is XSA-228. + +Reported-by: Ian Jackson +Signed-off-by: Jan Beulich +Signed-off-by: Ian Jackson +--- + docs/misc/grant-tables.txt | 7 ++++++- + xen/common/grant_table.c | 30 ++++++++++++++++++++++++------ + xen/include/xen/grant_table.h | 2 +- + xen/include/xen/sched.h | 1 + + 4 files changed, 32 insertions(+), 8 deletions(-) + +diff --git a/docs/misc/grant-tables.txt b/docs/misc/grant-tables.txt +index 417ce2d..64da5cf 100644 +--- a/docs/misc/grant-tables.txt ++++ b/docs/misc/grant-tables.txt +@@ -87,7 +87,8 @@ is complete. + inconsistent grant table state such as current + version, partially initialized active table pages, + etc. 
+- grant_table->maptrack_lock : spinlock used to protect the maptrack free list ++ grant_table->maptrack_lock : spinlock used to protect the maptrack limit ++ v->maptrack_freelist_lock : spinlock used to protect the maptrack free list + active_grant_entry->lock : spinlock used to serialize modifications to + active entries + +@@ -102,6 +103,10 @@ is complete. + The maptrack free list is protected by its own spinlock. The maptrack + lock may be locked while holding the grant table lock. + ++ The maptrack_freelist_lock is an innermost lock. It may be locked ++ while holding other locks, but no other locks may be acquired within ++ it. ++ + Active entries are obtained by calling active_entry_acquire(gt, ref). + This function returns a pointer to the active entry after locking its + spinlock. The caller must hold the grant table read lock before +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index f9654f1..593121c 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -304,11 +304,16 @@ __get_maptrack_handle( + { + unsigned int head, next, prev_head; + ++ spin_lock(&v->maptrack_freelist_lock); ++ + do { + /* No maptrack pages allocated for this VCPU yet? */ + head = read_atomic(&v->maptrack_head); + if ( unlikely(head == MAPTRACK_TAIL) ) ++ { ++ spin_unlock(&v->maptrack_freelist_lock); + return -1; ++ } + + /* + * Always keep one entry in the free list to make it easier to +@@ -316,12 +321,17 @@ __get_maptrack_handle( + */ + next = read_atomic(&maptrack_entry(t, head).ref); + if ( unlikely(next == MAPTRACK_TAIL) ) ++ { ++ spin_unlock(&v->maptrack_freelist_lock); + return -1; ++ } + + prev_head = head; + head = cmpxchg(&v->maptrack_head, prev_head, next); + } while ( head != prev_head ); + ++ spin_unlock(&v->maptrack_freelist_lock); ++ + return head; + } + +@@ -380,6 +390,8 @@ put_maptrack_handle( + /* 2. Add entry to the tail of the list on the original VCPU. */ + v = currd->vcpu[maptrack_entry(t, handle).vcpu]; + ++ spin_lock(&v->maptrack_freelist_lock); ++ + cur_tail = read_atomic(&v->maptrack_tail); + do { + prev_tail = cur_tail; +@@ -388,6 +400,8 @@ put_maptrack_handle( + + /* 3. Update the old tail entry to point to the new entry. */ + write_atomic(&maptrack_entry(t, prev_tail).ref, handle); ++ ++ spin_unlock(&v->maptrack_freelist_lock); + } + + static inline int +@@ -411,10 +425,6 @@ get_maptrack_handle( + */ + if ( nr_maptrack_frames(lgt) >= max_maptrack_frames ) + { +- /* +- * Can drop the lock since no other VCPU can be adding a new +- * frame once they've run out. 
+- */ + spin_unlock(&lgt->maptrack_lock); + + /* +@@ -426,8 +436,12 @@ get_maptrack_handle( + handle = steal_maptrack_handle(lgt, curr); + if ( handle == -1 ) + return -1; ++ spin_lock(&curr->maptrack_freelist_lock); ++ maptrack_entry(lgt, handle).ref = MAPTRACK_TAIL; + curr->maptrack_tail = handle; +- write_atomic(&curr->maptrack_head, handle); ++ if ( curr->maptrack_head == MAPTRACK_TAIL ) ++ write_atomic(&curr->maptrack_head, handle); ++ spin_unlock(&curr->maptrack_freelist_lock); + } + return steal_maptrack_handle(lgt, curr); + } +@@ -460,12 +474,15 @@ get_maptrack_handle( + smp_wmb(); + lgt->maptrack_limit += MAPTRACK_PER_PAGE; + ++ spin_unlock(&lgt->maptrack_lock); ++ spin_lock(&curr->maptrack_freelist_lock); ++ + do { + new_mt[i - 1].ref = read_atomic(&curr->maptrack_head); + head = cmpxchg(&curr->maptrack_head, new_mt[i - 1].ref, handle + 1); + } while ( head != new_mt[i - 1].ref ); + +- spin_unlock(&lgt->maptrack_lock); ++ spin_unlock(&curr->maptrack_freelist_lock); + + return handle; + } +@@ -3474,6 +3491,7 @@ grant_table_destroy( + + void grant_table_init_vcpu(struct vcpu *v) + { ++ spin_lock_init(&v->maptrack_freelist_lock); + v->maptrack_head = MAPTRACK_TAIL; + v->maptrack_tail = MAPTRACK_TAIL; + } +diff --git a/xen/include/xen/grant_table.h b/xen/include/xen/grant_table.h +index 4e77899..100f2b3 100644 +--- a/xen/include/xen/grant_table.h ++++ b/xen/include/xen/grant_table.h +@@ -78,7 +78,7 @@ struct grant_table { + /* Mapping tracking table per vcpu. */ + struct grant_mapping **maptrack; + unsigned int maptrack_limit; +- /* Lock protecting the maptrack page list, head, and limit */ ++ /* Lock protecting the maptrack limit */ + spinlock_t maptrack_lock; + /* The defined versions are 1 and 2. Set to 0 if we don't know + what version to use yet. */ +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 1fbda87..ff0f38f 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -223,6 +223,7 @@ struct vcpu + int controller_pause_count; + + /* Maptrack */ ++ spinlock_t maptrack_freelist_lock; + unsigned int maptrack_head; + unsigned int maptrack_tail; + +-- +2.1.4 + diff -Nru xen-4.6.5/debian/patches/xsa230.patch xen-4.6.5/debian/patches/xsa230.patch --- xen-4.6.5/debian/patches/xsa230.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa230.patch 2017-08-21 14:09:13.000000000 +0000 @@ -0,0 +1,38 @@ +From: Jan Beulich +Subject: gnttab: correct pin status fixup for copy + +Regardless of copy operations only setting GNTPIN_hst*, GNTPIN_dev* +also need to be taken into account when deciding whether to clear +_GTF_{read,writ}ing. At least for consistency with code elsewhere the +read part better doesn't use any mask at all. + +This is XSA-230. 
+ +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index ae34547..9c9d33c 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -2107,10 +2107,10 @@ __release_grant_for_copy( + static void __fixup_status_for_copy_pin(const struct active_grant_entry *act, + uint16_t *status) + { +- if ( !(act->pin & GNTPIN_hstw_mask) ) ++ if ( !(act->pin & (GNTPIN_hstw_mask | GNTPIN_devw_mask)) ) + gnttab_clear_flag(_GTF_writing, status); + +- if ( !(act->pin & GNTPIN_hstr_mask) ) ++ if ( !act->pin ) + gnttab_clear_flag(_GTF_reading, status); + } + +@@ -2318,7 +2318,7 @@ __acquire_grant_for_copy( + + unlock_out_clear: + if ( !(readonly) && +- !(act->pin & GNTPIN_hstw_mask) ) ++ !(act->pin & (GNTPIN_hstw_mask | GNTPIN_devw_mask)) ) + gnttab_clear_flag(_GTF_writing, status); + + if ( !act->pin ) diff -Nru xen-4.6.5/debian/patches/xsa231-4.7.patch xen-4.6.5/debian/patches/xsa231-4.7.patch --- xen-4.6.5/debian/patches/xsa231-4.7.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa231-4.7.patch 2017-10-11 13:51:23.000000000 +0000 @@ -0,0 +1,108 @@ +From: George Dunlap +Subject: xen/mm: make sure node is less than MAX_NUMNODES + +The output of MEMF_get_node(memflags) can be as large as nodeid_t can +hold (currently 255). This is then used as an index to arrays of size +MAX_NUMNODE, which is 64 on x86 and 1 on ARM, can be passed in by an +untrusted guest (via memory_exchange and increase_reservation) and is +not currently bounds-checked. + +Check the value in page_alloc.c before using it, and also check the +value in the hypercall call sites and return -EINVAL if appropriate. +Don't permit domains other than the hardware or control domain to +allocate node-constrained memory. + +This is XSA-231. + +Reported-by: Matthew Daley +Signed-off-by: George Dunlap +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -390,6 +390,31 @@ static void decrease_reservation(struct + a->nr_done = i; + } + ++static bool_t propagate_node(unsigned int xmf, unsigned int *memflags) ++{ ++ const struct domain *currd = current->domain; ++ ++ BUILD_BUG_ON(XENMEMF_get_node(0) != NUMA_NO_NODE); ++ BUILD_BUG_ON(MEMF_get_node(0) != NUMA_NO_NODE); ++ ++ if ( XENMEMF_get_node(xmf) == NUMA_NO_NODE ) ++ return 1; ++ ++ if ( is_hardware_domain(currd) || is_control_domain(currd) ) ++ { ++ if ( XENMEMF_get_node(xmf) >= MAX_NUMNODES ) ++ return 0; ++ ++ *memflags |= MEMF_node(XENMEMF_get_node(xmf)); ++ if ( xmf & XENMEMF_exact_node_request ) ++ *memflags |= MEMF_exact_node; ++ } ++ else if ( xmf & XENMEMF_exact_node_request ) ++ return 0; ++ ++ return 1; ++} ++ + static long memory_exchange(XEN_GUEST_HANDLE_PARAM(xen_memory_exchange_t) arg) + { + struct xen_memory_exchange exch; +@@ -462,6 +487,12 @@ static long memory_exchange(XEN_GUEST_HA + } + } + ++ if ( unlikely(!propagate_node(exch.out.mem_flags, &memflags)) ) ++ { ++ rc = -EINVAL; ++ goto fail_early; ++ } ++ + d = rcu_lock_domain_by_any_id(exch.in.domid); + if ( d == NULL ) + { +@@ -480,7 +511,6 @@ static long memory_exchange(XEN_GUEST_HA + d, + XENMEMF_get_address_bits(exch.out.mem_flags) ? 
: + (BITS_PER_LONG+PAGE_SHIFT))); +- memflags |= MEMF_node(XENMEMF_get_node(exch.out.mem_flags)); + + for ( i = (exch.nr_exchanged >> in_chunk_order); + i < (exch.in.nr_extents >> in_chunk_order); +@@ -834,12 +864,8 @@ static int construct_memop_from_reservat + } + read_unlock(&d->vnuma_rwlock); + } +- else +- { +- a->memflags |= MEMF_node(XENMEMF_get_node(r->mem_flags)); +- if ( r->mem_flags & XENMEMF_exact_node_request ) +- a->memflags |= MEMF_exact_node; +- } ++ else if ( unlikely(!propagate_node(r->mem_flags, &a->memflags)) ) ++ return -EINVAL; + + return 0; + } +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -711,9 +711,13 @@ static struct page_info *alloc_heap_page + if ( node >= MAX_NUMNODES ) + node = cpu_to_node(smp_processor_id()); + } ++ else if ( unlikely(node >= MAX_NUMNODES) ) ++ { ++ ASSERT_UNREACHABLE(); ++ return NULL; ++ } + first_node = node; + +- ASSERT(node < MAX_NUMNODES); + ASSERT(zone_lo <= zone_hi); + ASSERT(zone_hi < NR_ZONES); + diff -Nru xen-4.6.5/debian/patches/xsa232.patch xen-4.6.5/debian/patches/xsa232.patch --- xen-4.6.5/debian/patches/xsa232.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa232.patch 2017-10-11 13:52:08.000000000 +0000 @@ -0,0 +1,23 @@ +From: Andrew Cooper +Subject: grant_table: fix GNTTABOP_cache_flush handling + +Don't fall over a NULL grant_table pointer when the owner of the domain +is a system domain (DOMID_{XEN,IO} etc). + +This is XSA-232. + +Reported-by: Matthew Daley +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -3053,7 +3053,7 @@ static int cache_flush(gnttab_cache_flus + + page = mfn_to_page(mfn); + owner = page_get_owner_and_reference(page); +- if ( !owner ) ++ if ( !owner || !owner->grant_table ) + { + rcu_unlock_domain(d); + return -EPERM; diff -Nru xen-4.6.5/debian/patches/xsa233.patch xen-4.6.5/debian/patches/xsa233.patch --- xen-4.6.5/debian/patches/xsa233.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa233.patch 2017-10-11 13:52:19.000000000 +0000 @@ -0,0 +1,52 @@ +From: Juergen Gross +Subject: tools/xenstore: dont unlink connection object twice + +A connection object of a domain with associated stubdom has two +parents: the domain and the stubdom. When cleaning up the list of +active domains in domain_cleanup() make sure not to unlink the +connection twice from the same domain. This could happen when the +domain and its stubdom are being destroyed at the same time leading +to the domain loop being entered twice. + +Additionally don't use talloc_free() in this case as it will remove +a random parent link, leading eventually to a memory leak. Use +talloc_unlink() instead specifying the context from which the +connection object should be removed. + +This is XSA-233. 
+ +Reported-by: Eric Chanudet +Signed-off-by: Juergen Gross +Reviewed-by: Ian Jackson + +--- a/tools/xenstore/xenstored_domain.c ++++ b/tools/xenstore/xenstored_domain.c +@@ -221,10 +221,11 @@ static int destroy_domain(void *_domain) + static void domain_cleanup(void) + { + xc_dominfo_t dominfo; +- struct domain *domain, *tmp; ++ struct domain *domain; + int notify = 0; + +- list_for_each_entry_safe(domain, tmp, &domains, list) { ++ again: ++ list_for_each_entry(domain, &domains, list) { + if (xc_domain_getinfo(*xc_handle, domain->domid, 1, + &dominfo) == 1 && + dominfo.domid == domain->domid) { +@@ -236,8 +237,12 @@ static void domain_cleanup(void) + if (!dominfo.dying) + continue; + } +- talloc_free(domain->conn); +- notify = 0; /* destroy_domain() fires the watch */ ++ if (domain->conn) { ++ talloc_unlink(talloc_autofree_context(), domain->conn); ++ domain->conn = NULL; ++ notify = 0; /* destroy_domain() fires the watch */ ++ goto again; ++ } + } + + if (notify) diff -Nru xen-4.6.5/debian/patches/xsa234-4.6.patch xen-4.6.5/debian/patches/xsa234-4.6.patch --- xen-4.6.5/debian/patches/xsa234-4.6.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa234-4.6.patch 2017-10-11 13:52:30.000000000 +0000 @@ -0,0 +1,185 @@ +From: Jan Beulich +Subject: gnttab: also validate PTE permissions upon destroy/replace + +In order for PTE handling to match up with the reference counting done +by common code, presence and writability of grant mapping PTEs must +also be taken into account; validating just the frame number is not +enough. This is in particular relevant if a guest fiddles with grant +PTEs via non-grant hypercalls. + +Note that the flags being passed to replace_grant_host_mapping() +already happen to be those of the existing mapping, so no new function +parameter is needed. + +This is XSA-234. + +Reported-by: Andrew Cooper +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -3930,7 +3930,8 @@ static int create_grant_pte_mapping( + } + + static int destroy_grant_pte_mapping( +- uint64_t addr, unsigned long frame, struct domain *d) ++ uint64_t addr, unsigned long frame, unsigned int grant_pte_flags, ++ struct domain *d) + { + int rc = GNTST_okay; + void *va; +@@ -3976,16 +3977,27 @@ static int destroy_grant_pte_mapping( + + ol1e = *(l1_pgentry_t *)va; + +- /* Check that the virtual address supplied is actually mapped to frame. */ +- if ( unlikely(l1e_get_pfn(ol1e) != frame) ) ++ /* ++ * Check that the PTE supplied actually maps frame (with appropriate ++ * permissions). ++ */ ++ if ( unlikely(l1e_get_pfn(ol1e) != frame) || ++ unlikely((l1e_get_flags(ol1e) ^ grant_pte_flags) & ++ (_PAGE_PRESENT | _PAGE_RW)) ) + { + page_unlock(page); +- MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx", +- (unsigned long)l1e_get_intpte(ol1e), addr, frame); ++ MEM_LOG("PTE %"PRIpte" at %"PRIx64" doesn't match grant (%"PRIpte")", ++ l1e_get_intpte(ol1e), addr, ++ l1e_get_intpte(l1e_from_pfn(frame, grant_pte_flags))); + rc = GNTST_general_error; + goto failed; + } + ++ if ( unlikely((l1e_get_flags(ol1e) ^ grant_pte_flags) & ++ ~(_PAGE_AVAIL | PAGE_CACHE_ATTRS)) ) ++ MEM_LOG("PTE flags %x at %"PRIx64" don't match grant (%x)\n", ++ l1e_get_flags(ol1e), addr, grant_pte_flags); ++ + /* Delete pagetable entry. 
*/ + if ( unlikely(!UPDATE_ENTRY + (l1, +@@ -3994,7 +4006,7 @@ static int destroy_grant_pte_mapping( + 0)) ) + { + page_unlock(page); +- MEM_LOG("Cannot delete PTE entry at %p", va); ++ MEM_LOG("Cannot delete PTE entry at %"PRIx64, addr); + rc = GNTST_general_error; + goto failed; + } +@@ -4062,7 +4074,8 @@ static int create_grant_va_mapping( + } + + static int replace_grant_va_mapping( +- unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v) ++ unsigned long addr, unsigned long frame, unsigned int grant_pte_flags, ++ l1_pgentry_t nl1e, struct vcpu *v) + { + l1_pgentry_t *pl1e, ol1e; + unsigned long gl1mfn; +@@ -4098,19 +4111,30 @@ static int replace_grant_va_mapping( + + ol1e = *pl1e; + +- /* Check that the virtual address supplied is actually mapped to frame. */ +- if ( unlikely(l1e_get_pfn(ol1e) != frame) ) +- { +- MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx", +- l1e_get_pfn(ol1e), addr, frame); ++ /* ++ * Check that the virtual address supplied is actually mapped to frame ++ * (with appropriate permissions). ++ */ ++ if ( unlikely(l1e_get_pfn(ol1e) != frame) || ++ unlikely((l1e_get_flags(ol1e) ^ grant_pte_flags) & ++ (_PAGE_PRESENT | _PAGE_RW)) ) ++ { ++ MEM_LOG("PTE %"PRIpte" for %lx doesn't match grant (%"PRIpte")", ++ l1e_get_intpte(ol1e), addr, ++ l1e_get_intpte(l1e_from_pfn(frame, grant_pte_flags))); + rc = GNTST_general_error; + goto unlock_and_out; + } + ++ if ( unlikely((l1e_get_flags(ol1e) ^ grant_pte_flags) & ++ ~(_PAGE_AVAIL | PAGE_CACHE_ATTRS)) ) ++ MEM_LOG("PTE flags %x for %"PRIx64" don't match grant (%x)", ++ l1e_get_flags(ol1e), addr, grant_pte_flags); ++ + /* Delete pagetable entry. */ + if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) ) + { +- MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e); ++ MEM_LOG("Cannot delete PTE entry for %"PRIx64, addr); + rc = GNTST_general_error; + goto unlock_and_out; + } +@@ -4124,9 +4148,11 @@ static int replace_grant_va_mapping( + } + + static int destroy_grant_va_mapping( +- unsigned long addr, unsigned long frame, struct vcpu *v) ++ unsigned long addr, unsigned long frame, unsigned int grant_pte_flags, ++ struct vcpu *v) + { +- return replace_grant_va_mapping(addr, frame, l1e_empty(), v); ++ return replace_grant_va_mapping(addr, frame, grant_pte_flags, ++ l1e_empty(), v); + } + + static int create_grant_p2m_mapping(uint64_t addr, unsigned long frame, +@@ -4219,21 +4245,40 @@ int replace_grant_host_mapping( + unsigned long gl1mfn; + struct page_info *l1pg; + int rc; ++ unsigned int grant_pte_flags; + + if ( paging_mode_external(current->domain) ) + return replace_grant_p2m_mapping(addr, frame, new_addr, flags); + ++ grant_pte_flags = ++ _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_GNTTAB | _PAGE_NX; ++ ++ if ( flags & GNTMAP_application_map ) ++ grant_pte_flags |= _PAGE_USER; ++ if ( !(flags & GNTMAP_readonly) ) ++ grant_pte_flags |= _PAGE_RW; ++ /* ++ * On top of the explicit settings done by create_grant_host_mapping() ++ * also open-code relevant parts of adjust_guest_l1e(). Don't mirror ++ * available and cachability flags, though. ++ */ ++ if ( !is_pv_32bit_domain(curr->domain) ) ++ grant_pte_flags |= (grant_pte_flags & _PAGE_USER) ++ ? 
_PAGE_GLOBAL ++ : _PAGE_GUEST_KERNEL | _PAGE_USER; ++ + if ( flags & GNTMAP_contains_pte ) + { + if ( !new_addr ) +- return destroy_grant_pte_mapping(addr, frame, curr->domain); ++ return destroy_grant_pte_mapping(addr, frame, grant_pte_flags, ++ curr->domain); + + MEM_LOG("Unsupported grant table operation"); + return GNTST_general_error; + } + + if ( !new_addr ) +- return destroy_grant_va_mapping(addr, frame, curr); ++ return destroy_grant_va_mapping(addr, frame, grant_pte_flags, curr); + + pl1e = guest_map_l1e(curr, new_addr, &gl1mfn); + if ( !pl1e ) +@@ -4281,7 +4326,7 @@ int replace_grant_host_mapping( + put_page(l1pg); + guest_unmap_l1e(curr, pl1e); + +- rc = replace_grant_va_mapping(addr, frame, ol1e, curr); ++ rc = replace_grant_va_mapping(addr, frame, grant_pte_flags, ol1e, curr); + if ( rc && !paging_mode_refcounts(curr->domain) ) + put_page_from_l1e(ol1e, curr->domain); + diff -Nru xen-4.6.5/debian/patches/xsa235-4.6.patch xen-4.6.5/debian/patches/xsa235-4.6.patch --- xen-4.6.5/debian/patches/xsa235-4.6.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa235-4.6.patch 2017-10-11 13:54:38.000000000 +0000 @@ -0,0 +1,49 @@ +From: Jan Beulich +Subject: arm/mm: release grant lock on xenmem_add_to_physmap_one() error paths + +Commit 55021ff9ab ("xen/arm: add_to_physmap_one: Avoid to map mfn 0 if +an error occurs") introduced error paths not releasing the grant table +lock. Replace them by a suitable check after the lock was dropped. + +This is XSA-235. + +Reported-by: Wei Liu +Signed-off-by: Jan Beulich +Reviewed-by: Julien Grall + +--- a/xen/arch/arm/mm.c ++++ b/xen/arch/arm/mm.c +@@ -1073,7 +1073,7 @@ int xenmem_add_to_physmap_one( + if ( idx < nr_status_frames(d->grant_table) ) + mfn = virt_to_mfn(d->grant_table->status[idx]); + else +- return -EINVAL; ++ mfn = INVALID_MFN; + } + else + { +@@ -1084,14 +1084,21 @@ int xenmem_add_to_physmap_one( + if ( idx < nr_grant_frames(d->grant_table) ) + mfn = virt_to_mfn(d->grant_table->shared_raw[idx]); + else +- return -EINVAL; ++ mfn = INVALID_MFN; + } + +- d->arch.grant_table_gpfn[idx] = gpfn; ++ if ( mfn != INVALID_MFN ) ++ { ++ d->arch.grant_table_gpfn[idx] = gpfn; + +- t = p2m_ram_rw; ++ t = p2m_ram_rw; ++ } + + write_unlock(&d->grant_table->lock); ++ ++ if ( mfn == INVALID_MFN ) ++ return -EINVAL; ++ + break; + case XENMAPSPACE_shared_info: + if ( idx != 0 ) diff -Nru xen-4.6.5/debian/patches/xsa237-4.6-0001-x86-dont-allow-MSI-pIRQ-mapping-on-unowned-device.patch xen-4.6.5/debian/patches/xsa237-4.6-0001-x86-dont-allow-MSI-pIRQ-mapping-on-unowned-device.patch --- xen-4.6.5/debian/patches/xsa237-4.6-0001-x86-dont-allow-MSI-pIRQ-mapping-on-unowned-device.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa237-4.6-0001-x86-dont-allow-MSI-pIRQ-mapping-on-unowned-device.patch 2017-10-11 13:55:08.000000000 +0000 @@ -0,0 +1,26 @@ +From: Jan Beulich +Subject: x86: don't allow MSI pIRQ mapping on unowned device + +MSI setup should be permitted only for existing devices owned by the +respective guest (the operation may still be carried out by the domain +controlling that guest). + +This is part of XSA-237. 
+ +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/irq.c ++++ b/xen/arch/x86/irq.c +@@ -1961,7 +1961,10 @@ int map_domain_pirq( + if ( !cpu_has_apic ) + goto done; + +- pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn); ++ pdev = pci_get_pdev_by_domain(d, msi->seg, msi->bus, msi->devfn); ++ if ( !pdev ) ++ goto done; ++ + ret = pci_enable_msi(msi, &msi_desc); + if ( ret ) + { diff -Nru xen-4.6.5/debian/patches/xsa237-4.6-0002-x86-enforce-proper-privilege-when-mapping-pIRQ-s.patch xen-4.6.5/debian/patches/xsa237-4.6-0002-x86-enforce-proper-privilege-when-mapping-pIRQ-s.patch --- xen-4.6.5/debian/patches/xsa237-4.6-0002-x86-enforce-proper-privilege-when-mapping-pIRQ-s.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa237-4.6-0002-x86-enforce-proper-privilege-when-mapping-pIRQ-s.patch 2017-10-11 13:55:25.000000000 +0000 @@ -0,0 +1,65 @@ +From: Jan Beulich +Subject: x86: enforce proper privilege when (un)mapping pIRQ-s + +(Un)mapping of IRQs, just like other RESOURCE__ADD* / RESOURCE__REMOVE* +actions (in FLASK terms) should be XSM_DM_PRIV rather than XSM_TARGET. +This in turn requires bypassing the XSM check in physdev_unmap_pirq() +for the HVM emuirq case just like is being done in physdev_map_pirq(). +The primary goal security wise, however, is to no longer allow HVM +guests, by specifying their own domain ID instead of DOMID_SELF, to +enter code paths intended for PV guest and the control domains of HVM +guests only. + +This is part of XSA-237. + +Signed-off-by: Jan Beulich +Reviewed-by: George Dunlap + +--- a/xen/arch/x86/physdev.c ++++ b/xen/arch/x86/physdev.c +@@ -110,7 +110,7 @@ int physdev_map_pirq(domid_t domid, int + if ( d == NULL ) + return -ESRCH; + +- ret = xsm_map_domain_pirq(XSM_TARGET, d); ++ ret = xsm_map_domain_pirq(XSM_DM_PRIV, d); + if ( ret ) + goto free_domain; + +@@ -255,13 +255,14 @@ int physdev_map_pirq(domid_t domid, int + int physdev_unmap_pirq(domid_t domid, int pirq) + { + struct domain *d; +- int ret; ++ int ret = 0; + + d = rcu_lock_domain_by_any_id(domid); + if ( d == NULL ) + return -ESRCH; + +- ret = xsm_unmap_domain_pirq(XSM_TARGET, d); ++ if ( domid != DOMID_SELF || !is_hvm_domain(d) ) ++ ret = xsm_unmap_domain_pirq(XSM_DM_PRIV, d); + if ( ret ) + goto free_domain; + +--- a/xen/include/xsm/dummy.h ++++ b/xen/include/xsm/dummy.h +@@ -446,7 +446,7 @@ static XSM_INLINE char *xsm_show_irq_sid + + static XSM_INLINE int xsm_map_domain_pirq(XSM_DEFAULT_ARG struct domain *d) + { +- XSM_ASSERT_ACTION(XSM_TARGET); ++ XSM_ASSERT_ACTION(XSM_DM_PRIV); + return xsm_default_action(action, current->domain, d); + } + +@@ -458,7 +458,7 @@ static XSM_INLINE int xsm_map_domain_irq + + static XSM_INLINE int xsm_unmap_domain_pirq(XSM_DEFAULT_ARG struct domain *d) + { +- XSM_ASSERT_ACTION(XSM_TARGET); ++ XSM_ASSERT_ACTION(XSM_DM_PRIV); + return xsm_default_action(action, current->domain, d); + } + diff -Nru xen-4.6.5/debian/patches/xsa237-4.6-0003-x86-MSI-disallow-redundant-enabling.patch xen-4.6.5/debian/patches/xsa237-4.6-0003-x86-MSI-disallow-redundant-enabling.patch --- xen-4.6.5/debian/patches/xsa237-4.6-0003-x86-MSI-disallow-redundant-enabling.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa237-4.6-0003-x86-MSI-disallow-redundant-enabling.patch 2017-10-11 13:55:36.000000000 +0000 @@ -0,0 +1,54 @@ +From: Jan Beulich +Subject: x86/MSI: disallow redundant enabling + +At the moment, Xen attempts to allow redundant enabling of MSI by +having pci_enable_msi() return 0, and point to the existing MSI 
+descriptor, when the msi already exists. + +Unfortunately, if subsequent errors are encountered, the cleanup +paths assume pci_enable_msi() had done full initialization, and +hence undo everything that was assumed to be done by that +function without also undoing other setup that would normally +occur only after that function was called (in map_domain_pirq() +itself). + +Rather than try to make the redundant enabling case work properly, just +forbid it entirely by having pci_enable_msi() return -EEXIST when MSI +is already set up. + +This is part of XSA-237. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +Reviewed-by: George Dunlap + +--- a/xen/arch/x86/msi.c ++++ b/xen/arch/x86/msi.c +@@ -1050,11 +1050,10 @@ static int __pci_enable_msi(struct msi_i + old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI); + if ( old_desc ) + { +- printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n", ++ printk(XENLOG_ERR "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n", + msi->irq, msi->seg, msi->bus, + PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); +- *desc = old_desc; +- return 0; ++ return -EEXIST; + } + + old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX); +@@ -1118,11 +1117,10 @@ static int __pci_enable_msix(struct msi_ + old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX); + if ( old_desc ) + { +- printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n", ++ printk(XENLOG_ERR "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n", + msi->irq, msi->seg, msi->bus, + PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); +- *desc = old_desc; +- return 0; ++ return -EEXIST; + } + + old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI); diff -Nru xen-4.6.5/debian/patches/xsa237-4.6-0004-x86-IRQ-conditionally-preserve-irq-pirq-mapping-on-error.patch xen-4.6.5/debian/patches/xsa237-4.6-0004-x86-IRQ-conditionally-preserve-irq-pirq-mapping-on-error.patch --- xen-4.6.5/debian/patches/xsa237-4.6-0004-x86-IRQ-conditionally-preserve-irq-pirq-mapping-on-error.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa237-4.6-0004-x86-IRQ-conditionally-preserve-irq-pirq-mapping-on-error.patch 2017-10-11 13:55:44.000000000 +0000 @@ -0,0 +1,123 @@ +From: Jan Beulich +Subject: x86/IRQ: conditionally preserve irq <-> pirq mapping on map error paths + +Mappings that had been set up before should not be torn down when +handling unrelated errors. + +This is part of XSA-237. + +Signed-off-by: Jan Beulich +Reviewed-by: George Dunlap + +--- a/xen/arch/x86/irq.c ++++ b/xen/arch/x86/irq.c +@@ -1249,7 +1249,8 @@ static int prepare_domain_irq_pirq(struc + return -ENOMEM; + } + *pinfo = info; +- return 0; ++ ++ return !!err; + } + + static void set_domain_irq_pirq(struct domain *d, int irq, struct pirq *pirq) +@@ -1292,7 +1293,10 @@ int init_domain_irq_mapping(struct domai + continue; + err = prepare_domain_irq_pirq(d, i, i, &info); + if ( err ) ++ { ++ ASSERT(err < 0); + break; ++ } + set_domain_irq_pirq(d, i, info); + } + +@@ -1900,6 +1904,7 @@ int map_domain_pirq( + struct pirq *info; + struct irq_desc *desc; + unsigned long flags; ++ DECLARE_BITMAP(prepared, MAX_MSI_IRQS) = {}; + + ASSERT(spin_is_locked(&d->event_lock)); + +@@ -1943,8 +1948,10 @@ int map_domain_pirq( + } + + ret = prepare_domain_irq_pirq(d, irq, pirq, &info); +- if ( ret ) ++ if ( ret < 0 ) + goto revoke; ++ if ( !ret ) ++ __set_bit(0, prepared); + + desc = irq_to_desc(irq); + +@@ -2016,8 +2023,10 @@ int map_domain_pirq( + irq = create_irq(NUMA_NO_NODE); + ret = irq >= 0 ? 
prepare_domain_irq_pirq(d, irq, pirq + nr, &info) + : irq; +- if ( ret ) ++ if ( ret < 0 ) + break; ++ if ( !ret ) ++ __set_bit(nr, prepared); + msi_desc[nr].irq = irq; + + if ( irq_permit_access(d, irq) != 0 ) +@@ -2050,15 +2059,15 @@ int map_domain_pirq( + desc->msi_desc = NULL; + spin_unlock_irqrestore(&desc->lock, flags); + } +- while ( nr-- ) ++ while ( nr ) + { + if ( irq >= 0 && irq_deny_access(d, irq) ) + printk(XENLOG_G_ERR + "dom%d: could not revoke access to IRQ%d (pirq %d)\n", + d->domain_id, irq, pirq); +- if ( info ) ++ if ( info && test_bit(nr, prepared) ) + cleanup_domain_irq_pirq(d, irq, info); +- info = pirq_info(d, pirq + nr); ++ info = pirq_info(d, pirq + --nr); + irq = info->arch.irq; + } + msi_desc->irq = -1; +@@ -2074,12 +2083,14 @@ int map_domain_pirq( + spin_lock_irqsave(&desc->lock, flags); + set_domain_irq_pirq(d, irq, info); + spin_unlock_irqrestore(&desc->lock, flags); ++ ret = 0; + } + + done: + if ( ret ) + { +- cleanup_domain_irq_pirq(d, irq, info); ++ if ( test_bit(0, prepared) ) ++ cleanup_domain_irq_pirq(d, irq, info); + revoke: + if ( irq_deny_access(d, irq) ) + printk(XENLOG_G_ERR +--- a/xen/arch/x86/physdev.c ++++ b/xen/arch/x86/physdev.c +@@ -185,7 +185,7 @@ int physdev_map_pirq(domid_t domid, int + } + else if ( type == MAP_PIRQ_TYPE_MULTI_MSI ) + { +- if ( msi->entry_nr <= 0 || msi->entry_nr > 32 ) ++ if ( msi->entry_nr <= 0 || msi->entry_nr > MAX_MSI_IRQS ) + ret = -EDOM; + else if ( msi->entry_nr != 1 && !iommu_intremap ) + ret = -EOPNOTSUPP; +--- a/xen/include/asm-x86/msi.h ++++ b/xen/include/asm-x86/msi.h +@@ -55,6 +55,8 @@ + /* MAX fixed pages reserved for mapping MSIX tables. */ + #define FIX_MSIX_MAX_PAGES 512 + ++#define MAX_MSI_IRQS 32 /* limited by MSI capability struct properties */ ++ + struct msi_info { + u16 seg; + u8 bus; diff -Nru xen-4.6.5/debian/patches/xsa237-4.6-0005-x86-FLASK-fix-unmap-domain-IRQ-XSM-hook.patch xen-4.6.5/debian/patches/xsa237-4.6-0005-x86-FLASK-fix-unmap-domain-IRQ-XSM-hook.patch --- xen-4.6.5/debian/patches/xsa237-4.6-0005-x86-FLASK-fix-unmap-domain-IRQ-XSM-hook.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa237-4.6-0005-x86-FLASK-fix-unmap-domain-IRQ-XSM-hook.patch 2017-10-11 13:55:55.000000000 +0000 @@ -0,0 +1,37 @@ +From: Jan Beulich +Subject: x86/FLASK: fix unmap-domain-IRQ XSM hook + +The caller and the FLASK implementation of xsm_unmap_domain_irq() +disagreed about what the "data" argument points to in the MSI case: +Change both sides to pass/take a PCI device. + +This is part of XSA-237. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/irq.c ++++ b/xen/arch/x86/irq.c +@@ -2141,7 +2141,8 @@ int unmap_domain_pirq(struct domain *d, + nr = msi_desc->msi.nvec; + } + +- ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc); ++ ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, ++ msi_desc ? 
msi_desc->dev : NULL); + if ( ret ) + goto done; + +--- a/xen/xsm/flask/hooks.c ++++ b/xen/xsm/flask/hooks.c +@@ -897,8 +897,8 @@ static int flask_unmap_domain_msi (struc + u32 *sid, struct avc_audit_data *ad) + { + #ifdef HAS_PCI +- struct msi_info *msi = data; +- u32 machine_bdf = (msi->seg << 16) | (msi->bus << 8) | msi->devfn; ++ const struct pci_dev *pdev = data; ++ u32 machine_bdf = (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn; + + AVC_AUDIT_DATA_INIT(ad, DEV); + ad->device = machine_bdf; diff -Nru xen-4.6.5/debian/patches/xsa238-4.6.patch xen-4.6.5/debian/patches/xsa238-4.6.patch --- xen-4.6.5/debian/patches/xsa238-4.6.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa238-4.6.patch 2017-10-11 13:56:45.000000000 +0000 @@ -0,0 +1,42 @@ +From: XenProject Security Team +Subject: x86/ioreq server: correctly handle bogus + XEN_DMOP_{,un}map_io_range_to_ioreq_server arguments + +Misbehaving device model can pass incorrect XEN_DMOP_map/ +unmap_io_range_to_ioreq_server arguments, namely end < start when +specifying address range. When this happens we hit ASSERT(s <= e) in +rangeset_contains_range()/rangeset_overlaps_range() with debug builds. +Production builds will not trap right away but may misbehave later +while handling such bogus ranges. + +This is XSA-238. + +Reviewed-by: Jan Beulich +--- + xen/arch/x86/hvm/hvm.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c +index b2a8b0e986..8c8bf1f0ec 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -1271,6 +1271,9 @@ int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id, + struct hvm_ioreq_server *s; + int rc; + ++ if ( start > end ) ++ return -EINVAL; ++ + spin_lock_recursive(&d->arch.hvm_domain.ioreq_server.lock); + + rc = -ENOENT; +@@ -1322,6 +1325,9 @@ int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id, + struct hvm_ioreq_server *s; + int rc; + ++ if ( start > end ) ++ return -EINVAL; ++ + spin_lock_recursive(&d->arch.hvm_domain.ioreq_server.lock); + + rc = -ENOENT; diff -Nru xen-4.6.5/debian/patches/xsa239.patch xen-4.6.5/debian/patches/xsa239.patch --- xen-4.6.5/debian/patches/xsa239.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa239.patch 2017-10-11 13:56:58.000000000 +0000 @@ -0,0 +1,44 @@ +From: Jan Beulich +Subject: x86/HVM: prefill partially used variable on emulation paths + +Certain handlers ignore the access size (vioapic_write() being the +example this was found with), perhaps leading to subsequent reads +seeing data that wasn't actually written by the guest. For +consistency and extra safety also do this on the read path of +hvm_process_io_intercept(), even if this doesn't directly affect what +guests get to see, as we've supposedly already dealt with read handlers +leaving data completely unitialized. + +This is XSA-239. + +Signed-off-by: Jan Beulich + +--- a/xen/arch/x86/hvm/emulate.c ++++ b/xen/arch/x86/hvm/emulate.c +@@ -129,7 +129,7 @@ static int hvmemul_do_io( + .count = *reps, + .dir = dir, + .df = df, +- .data = data, ++ .data = data_is_addr ? data : 0, + .data_is_ptr = data_is_addr, /* ioreq_t field name is misleading */ + .state = STATE_IOREQ_READY, + }; +--- a/xen/arch/x86/hvm/intercept.c ++++ b/xen/arch/x86/hvm/intercept.c +@@ -127,6 +127,7 @@ int hvm_process_io_intercept(const struc + addr = (p->type == IOREQ_TYPE_COPY) ? 
+ p->addr + step * i : + p->addr; ++ data = 0; + rc = ops->read(handler, addr, p->size, &data); + if ( rc != X86EMUL_OKAY ) + break; +@@ -161,6 +162,7 @@ int hvm_process_io_intercept(const struc + { + if ( p->data_is_ptr ) + { ++ data = 0; + switch ( hvm_copy_from_guest_phys(&data, p->data + step * i, + p->size) ) + { diff -Nru xen-4.6.5/debian/patches/xsa240-4.6-0001-x86-limit-linear-page-table-use-to-a-single-level.patch xen-4.6.5/debian/patches/xsa240-4.6-0001-x86-limit-linear-page-table-use-to-a-single-level.patch --- xen-4.6.5/debian/patches/xsa240-4.6-0001-x86-limit-linear-page-table-use-to-a-single-level.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa240-4.6-0001-x86-limit-linear-page-table-use-to-a-single-level.patch 2017-10-11 13:57:15.000000000 +0000 @@ -0,0 +1,493 @@ +From ce31198dd811479da34dfb66315f399dc4b98055 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Thu, 28 Sep 2017 15:17:28 +0100 +Subject: [PATCH 1/2] x86: limit linear page table use to a single level + +That's the only way that they're meant to be used. Without such a +restriction arbitrarily long chains of same-level page tables can be +built, tearing down of which may then cause arbitrarily deep recursion, +causing a stack overflow. To facilitate this restriction, a counter is +being introduced to track both the number of same-level entries in a +page table as well as the number of uses of a page table in another +same-level one (counting into positive and negative direction +respectively, utilizing the fact that both counts can't be non-zero at +the same time). + +Note that the added accounting introduces a restriction on the number +of times a page can be used in other same-level page tables - more than +32k of such uses are no longer possible. + +Note also that some put_page_and_type[_preemptible]() calls are +replaced with open-coded equivalents. This seemed preferrable to +adding "parent_table" to the matrix of functions. + +Note further that cross-domain same-level page table references are no +longer permitted (they probably never should have been). + +This is XSA-240. + +Signed-off-by: Jan Beulich +Signed-off-by: George Dunlap +--- + xen/arch/x86/domain.c | 1 + + xen/arch/x86/mm.c | 171 ++++++++++++++++++++++++++++++++++++++----- + xen/include/asm-x86/domain.h | 2 + + xen/include/asm-x86/mm.h | 25 +++++-- + 4 files changed, 175 insertions(+), 24 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index bfe614d7b7..23d034fa8d 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1131,6 +1131,7 @@ int arch_set_info_guest( + case -EINTR: + rc = -ERESTART; + case -ERESTART: ++ v->arch.old_guest_ptpg = NULL; + v->arch.old_guest_table = + pagetable_get_page(v->arch.guest_table); + v->arch.guest_table = pagetable_null(); +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 20d963cb22..81074aa473 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -660,6 +660,61 @@ static void put_data_page( + put_page(page); + } + ++static bool_t inc_linear_entries(struct page_info *pg) ++{ ++ typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc; ++ ++ do { ++ /* ++ * The check below checks for the "linear use" count being non-zero ++ * as well as overflow. Signed integer overflow is undefined behavior ++ * according to the C spec. However, as long as linear_pt_count is ++ * smaller in size than 'int', the arithmetic operation of the ++ * increment below won't overflow; rather the result will be truncated ++ * when stored. 
Ensure that this is always true. ++ */ ++ BUILD_BUG_ON(sizeof(nc) >= sizeof(int)); ++ oc = nc++; ++ if ( nc <= 0 ) ++ return 0; ++ nc = cmpxchg(&pg->linear_pt_count, oc, nc); ++ } while ( oc != nc ); ++ ++ return 1; ++} ++ ++static void dec_linear_entries(struct page_info *pg) ++{ ++ typeof(pg->linear_pt_count) oc; ++ ++ oc = arch_fetch_and_add(&pg->linear_pt_count, -1); ++ ASSERT(oc > 0); ++} ++ ++static bool_t inc_linear_uses(struct page_info *pg) ++{ ++ typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc; ++ ++ do { ++ /* See the respective comment in inc_linear_entries(). */ ++ BUILD_BUG_ON(sizeof(nc) >= sizeof(int)); ++ oc = nc--; ++ if ( nc >= 0 ) ++ return 0; ++ nc = cmpxchg(&pg->linear_pt_count, oc, nc); ++ } while ( oc != nc ); ++ ++ return 1; ++} ++ ++static void dec_linear_uses(struct page_info *pg) ++{ ++ typeof(pg->linear_pt_count) oc; ++ ++ oc = arch_fetch_and_add(&pg->linear_pt_count, 1); ++ ASSERT(oc < 0); ++} ++ + /* + * We allow root tables to map each other (a.k.a. linear page tables). It + * needs some special care with reference counts and access permissions: +@@ -689,15 +744,35 @@ get_##level##_linear_pagetable( \ + \ + if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \ + { \ ++ struct page_info *ptpg = mfn_to_page(pde_pfn); \ ++ \ ++ /* Make sure the page table belongs to the correct domain. */ \ ++ if ( unlikely(page_get_owner(ptpg) != d) ) \ ++ return 0; \ ++ \ + /* Make sure the mapped frame belongs to the correct domain. */ \ + if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \ + return 0; \ + \ + /* \ +- * Ensure that the mapped frame is an already-validated page table. \ ++ * Ensure that the mapped frame is an already-validated page table \ ++ * and is not itself having linear entries, as well as that the \ ++ * containing page table is not iself in use as a linear page table \ ++ * elsewhere. \ + * If so, atomically increment the count (checking for overflow). 
\ + */ \ + page = mfn_to_page(pfn); \ ++ if ( !inc_linear_entries(ptpg) ) \ ++ { \ ++ put_page(page); \ ++ return 0; \ ++ } \ ++ if ( !inc_linear_uses(page) ) \ ++ { \ ++ dec_linear_entries(ptpg); \ ++ put_page(page); \ ++ return 0; \ ++ } \ + y = page->u.inuse.type_info; \ + do { \ + x = y; \ +@@ -705,6 +780,8 @@ get_##level##_linear_pagetable( \ + unlikely((x & (PGT_type_mask|PGT_validated)) != \ + (PGT_##level##_page_table|PGT_validated)) ) \ + { \ ++ dec_linear_uses(page); \ ++ dec_linear_entries(ptpg); \ + put_page(page); \ + return 0; \ + } \ +@@ -1129,6 +1206,9 @@ get_page_from_l4e( + l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \ + } while ( 0 ) + ++static int _put_page_type(struct page_info *page, bool_t preemptible, ++ struct page_info *ptpg); ++ + void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) + { + unsigned long pfn = l1e_get_pfn(l1e); +@@ -1198,17 +1278,22 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) + if ( l2e_get_flags(l2e) & _PAGE_PSE ) + put_superpage(l2e_get_pfn(l2e)); + else +- put_page_and_type(l2e_get_page(l2e)); ++ { ++ struct page_info *pg = l2e_get_page(l2e); ++ int rc = _put_page_type(pg, 0, mfn_to_page(pfn)); ++ ++ ASSERT(!rc); ++ put_page(pg); ++ } + + return 0; + } + +-static int __put_page_type(struct page_info *, int preemptible); +- + static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + int partial, bool_t defer) + { + struct page_info *pg; ++ int rc; + + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) ) + return 1; +@@ -1231,21 +1316,28 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + if ( unlikely(partial > 0) ) + { + ASSERT(!defer); +- return __put_page_type(pg, 1); ++ return _put_page_type(pg, 1, mfn_to_page(pfn)); + } + + if ( defer ) + { ++ current->arch.old_guest_ptpg = mfn_to_page(pfn); + current->arch.old_guest_table = pg; + return 0; + } + +- return put_page_and_type_preemptible(pg); ++ rc = _put_page_type(pg, 1, mfn_to_page(pfn)); ++ if ( likely(!rc) ) ++ put_page(pg); ++ ++ return rc; + } + + static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + int partial, bool_t defer) + { ++ int rc = 1; ++ + if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && + (l4e_get_pfn(l4e) != pfn) ) + { +@@ -1254,18 +1346,22 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + if ( unlikely(partial > 0) ) + { + ASSERT(!defer); +- return __put_page_type(pg, 1); ++ return _put_page_type(pg, 1, mfn_to_page(pfn)); + } + + if ( defer ) + { ++ current->arch.old_guest_ptpg = mfn_to_page(pfn); + current->arch.old_guest_table = pg; + return 0; + } + +- return put_page_and_type_preemptible(pg); ++ rc = _put_page_type(pg, 1, mfn_to_page(pfn)); ++ if ( likely(!rc) ) ++ put_page(pg); + } +- return 1; ++ ++ return rc; + } + + static int alloc_l1_table(struct page_info *page) +@@ -1463,6 +1559,7 @@ static int alloc_l3_table(struct page_info *page) + { + page->nr_validated_ptes = i; + page->partial_pte = 0; ++ current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } + while ( i-- > 0 ) +@@ -1555,6 +1652,7 @@ static int alloc_l4_table(struct page_info *page) + { + if ( current->arch.old_guest_table ) + page->nr_validated_ptes++; ++ current->arch.old_guest_ptpg = NULL; + current->arch.old_guest_table = page; + } + } +@@ -2303,14 +2401,20 @@ int free_page_type(struct page_info *pag + } + + +-static int __put_final_page_type( +- struct page_info *page, unsigned long type, int preemptible) ++static int 
_put_final_page_type(struct page_info *page, unsigned long type, ++ bool_t preemptible, struct page_info *ptpg) + { + int rc = free_page_type(page, type, preemptible); + + /* No need for atomic update of type_info here: noone else updates it. */ + if ( rc == 0 ) + { ++ if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) ) ++ { ++ dec_linear_uses(page); ++ dec_linear_entries(ptpg); ++ } ++ ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying); + /* + * Record TLB information for flush later. We do not stamp page tables + * when running in shadow mode: +@@ -2346,8 +2450,8 @@ static int __put_final_page_type( + } + + +-static int __put_page_type(struct page_info *page, +- int preemptible) ++static int _put_page_type(struct page_info *page, bool_t preemptible, ++ struct page_info *ptpg) + { + unsigned long nx, x, y = page->u.inuse.type_info; + int rc = 0; +@@ -2374,12 +2478,28 @@ static int __put_page_type(struct page_info *page, + x, nx)) != x) ) + continue; + /* We cleared the 'valid bit' so we do the clean up. */ +- rc = __put_final_page_type(page, x, preemptible); ++ rc = _put_final_page_type(page, x, preemptible, ptpg); ++ ptpg = NULL; + if ( x & PGT_partial ) + put_page(page); + break; + } + ++ if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) ) ++ { ++ /* ++ * page_set_tlbflush_timestamp() accesses the same union ++ * linear_pt_count lives in. Unvalidated page table pages, ++ * however, should occur during domain destruction only ++ * anyway. Updating of linear_pt_count luckily is not ++ * necessary anymore for a dying domain. ++ */ ++ ASSERT(page_get_owner(page)->is_dying); ++ ASSERT(page->linear_pt_count < 0); ++ ASSERT(ptpg->linear_pt_count > 0); ++ ptpg = NULL; ++ } ++ + /* + * Record TLB information for flush later. We do not stamp page + * tables when running in shadow mode: +@@ -2399,6 +2519,13 @@ static int __put_page_type(struct page_info *page, + return -EINTR; + } + ++ if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) ) ++ { ++ ASSERT(!rc); ++ dec_linear_uses(page); ++ dec_linear_entries(ptpg); ++ } ++ + return rc; + } + +@@ -2533,6 +2660,7 @@ static int __get_page_type(struct page_info *page, unsigned long type, + page->nr_validated_ptes = 0; + page->partial_pte = 0; + } ++ page->linear_pt_count = 0; + rc = alloc_page_type(page, type, preemptible); + } + +@@ -2544,7 +2672,7 @@ static int __get_page_type(struct page_info *page, unsigned long type, + + void put_page_type(struct page_info *page) + { +- int rc = __put_page_type(page, 0); ++ int rc = _put_page_type(page, 0, NULL); + ASSERT(rc == 0); + (void)rc; + } +@@ -2560,7 +2688,7 @@ int get_page_type(struct page_info *page, unsigned long type) + + int put_page_type_preemptible(struct page_info *page) + { +- return __put_page_type(page, 1); ++ return _put_page_type(page, 1, NULL); + } + + int get_page_type_preemptible(struct page_info *page, unsigned long type) +@@ -2766,11 +2894,14 @@ int put_old_guest_table(struct vcpu *v) + if ( !v->arch.old_guest_table ) + return 0; + +- switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) ) ++ switch ( rc = _put_page_type(v->arch.old_guest_table, 1, ++ v->arch.old_guest_ptpg) ) + { + case -EINTR: + case -ERESTART: + return -ERESTART; ++ case 0: ++ put_page(v->arch.old_guest_table); + } + + v->arch.old_guest_table = NULL; +@@ -2927,6 +3058,7 @@ int new_guest_cr3(unsigned long mfn) + rc = -ERESTART; + /* fallthrough */ + case -ERESTART: ++ curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; + break; + default: +@@ -3172,7 
+3304,10 @@ long do_mmuext_op( + if ( type == PGT_l1_page_table ) + put_page_and_type(page); + else ++ { ++ curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; ++ } + } + } + +@@ -3205,6 +3340,7 @@ long do_mmuext_op( + { + case -EINTR: + case -ERESTART: ++ curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; + rc = 0; + break; +@@ -3285,6 +3421,7 @@ long do_mmuext_op( + rc = -ERESTART; + /* fallthrough */ + case -ERESTART: ++ curr->arch.old_guest_ptpg = NULL; + curr->arch.old_guest_table = page; + okay = 0; + break; +diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h +index a2d4f74696..4279549c28 100644 +--- a/xen/include/asm-x86/domain.h ++++ b/xen/include/asm-x86/domain.h +@@ -512,6 +512,8 @@ struct arch_vcpu + pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */ + pagetable_t guest_table; /* (MFN) guest notion of cr3 */ + struct page_info *old_guest_table; /* partially destructed pagetable */ ++ struct page_info *old_guest_ptpg; /* containing page table of the */ ++ /* former, if any */ + /* guest_table holds a ref to the page, and also a type-count unless + * shadow refcounts are in use */ + pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ +diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h +index b4299fddea..4191e13112 100644 +--- a/xen/include/asm-x86/mm.h ++++ b/xen/include/asm-x86/mm.h +@@ -124,11 +124,11 @@ struct page_info + u32 tlbflush_timestamp; + + /* +- * When PGT_partial is true then this field is valid and indicates +- * that PTEs in the range [0, @nr_validated_ptes) have been validated. +- * An extra page reference must be acquired (or not dropped) whenever +- * PGT_partial gets set, and it must be dropped when the flag gets +- * cleared. This is so that a get() leaving a page in partially ++ * When PGT_partial is true then the first two fields are valid and ++ * indicate that PTEs in the range [0, @nr_validated_ptes) have been ++ * validated. An extra page reference must be acquired (or not dropped) ++ * whenever PGT_partial gets set, and it must be dropped when the flag ++ * gets cleared. This is so that a get() leaving a page in partially + * validated state (where the caller would drop the reference acquired + * due to the getting of the type [apparently] failing [-ERESTART]) + * would not accidentally result in a page left with zero general +@@ -152,10 +152,18 @@ struct page_info + * put_page_from_lNe() (due to the apparent failure), and hence it + * must be dropped when the put operation is resumed (and completes), + * but it must not be acquired if picking up the page for validation. ++ * ++ * The 3rd field, @linear_pt_count, indicates ++ * - by a positive value, how many same-level page table entries a page ++ * table has, ++ * - by a negative value, in how many same-level page tables a page is ++ * in use. + */ + struct { +- u16 nr_validated_ptes; +- s8 partial_pte; ++ u16 nr_validated_ptes:PAGETABLE_ORDER + 1; ++ u16 :16 - PAGETABLE_ORDER - 1 - 2; ++ s16 partial_pte:2; ++ s16 linear_pt_count; + }; + + /* +@@ -206,6 +214,9 @@ struct page_info + #define PGT_count_width PG_shift(9) + #define PGT_count_mask ((1UL< +Date: Fri, 22 Sep 2017 11:46:55 +0100 +Subject: [PATCH 2/2] x86/mm: Disable PV linear pagetables by default + +Allowing pagetables to point to other pagetables of the same level +(often called 'linear pagetables') has been included in Xen since its +inception. 
But it is not used by the most common PV guests (Linux, +NetBSD, minios), and has been the source of a number of subtle +reference-counting bugs. + +Add a command-line option to control whether PV linear pagetables are +allowed (disabled by default). + +Signed-off-by: George Dunlap +Reviewed-by: Andrew Cooper +--- +Changes since v2: +- s/_/-/; in command-line option +- Added __read_mostly +--- + docs/misc/xen-command-line.markdown | 15 +++++++++++++++ + xen/arch/x86/mm.c | 9 +++++++++ + 2 files changed, 24 insertions(+) + +diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown +index d99a20a44b..c91acaa464 100644 +--- a/docs/misc/xen-command-line.markdown ++++ b/docs/misc/xen-command-line.markdown +@@ -1231,6 +1231,21 @@ The following resources are available: + the cache allocation. + * `cat` instructs Xen to enable/disable Cache Allocation Technology. + * `cos_max` indicates the max value for COS ID. ++ ++### pv-linear-pt ++> `= ` ++ ++> Default: `false` ++ ++Allow PV guests to have pagetable entries pointing to other pagetables ++of the same level (i.e., allowing L2 PTEs to point to other L2 pages). ++This technique is often called "linear pagetables", and is sometimes ++used to allow operating systems a simple way to consistently map the ++current process's pagetables into its own virtual address space. ++ ++None of the most common PV operating systems (Linux, NetBSD, MiniOS) ++use this technique, but there may be custom operating systems which ++do. + + ### reboot + > `= t[riple] | k[bd] | a[cpi] | p[ci] | P[ower] | e[fi] | n[o] [, [w]arm | [c]old]` +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 81074aa473..75dd077046 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -727,6 +727,9 @@ static void dec_linear_uses(struct page_info *pg) + * frame if it is mapped by a different root table. This is sufficient and + * also necessary to allow validation of a root table mapping itself. + */ ++static bool_t __read_mostly pv_linear_pt_enable = 0; ++boolean_param("pv-linear-pt", pv_linear_pt_enable); ++ + #define define_get_linear_pagetable(level) \ + static int \ + get_##level##_linear_pagetable( \ +@@ -736,6 +739,12 @@ get_##level##_linear_pagetable( \ + struct page_info *page; \ + unsigned long pfn; \ + \ ++ if ( !pv_linear_pt_enable ) \ ++ { \ ++ MEM_LOG("Attempt to create linear p.t. (feature disabled)"); \ ++ return 0; \ ++ } \ ++ \ + if ( (level##e_get_flags(pde) & _PAGE_RW) ) \ + { \ + MEM_LOG("Attempt to create linear p.t. with write perms"); \ +-- +2.14.1 + diff -Nru xen-4.6.5/debian/patches/xsa241-4.9.patch xen-4.6.5/debian/patches/xsa241-4.9.patch --- xen-4.6.5/debian/patches/xsa241-4.9.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa241-4.9.patch 2017-10-11 16:32:13.000000000 +0000 @@ -0,0 +1,130 @@ +From: Jan Beulich +Subject: x86: don't store possibly stale TLB flush time stamp + +While the timing window is extremely narrow, it is theoretically +possible for an update to the TLB flush clock and a subsequent flush +IPI to happen between the read and write parts of the update of the +per-page stamp. Exclude this possibility by disabling interrupts +across the update, preventing the IPI to be serviced in the middle. + +This is XSA-241. 
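As an aside to the patch text, the following is a minimal sketch of the pattern described above, not part of the patch itself; it assumes Xen's ASSERT(), local_irq_is_enabled(), local_irq_disable()/local_irq_enable() and tlbflush_current_time(), and the real helper, page_set_tlbflush_timestamp(), is the one introduced in the asm-x86/flushtlb.h hunk further down.

    /*
     * Sketch: with interrupts masked, the TLB flush IPI cannot be serviced
     * between reading the global flush clock and storing it into the
     * per-page stamp, so a stale timestamp can never be recorded.
     */
    static inline void set_page_flush_stamp(struct page_info *page)
    {
        ASSERT(local_irq_is_enabled());
        local_irq_disable();
        page->tlbflush_timestamp = tlbflush_current_time();
        local_irq_enable();
    }

Re-enabling interrupts unconditionally afterwards is safe because the helper asserts it is only entered with interrupts enabled.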
+ +Suggested-by: George Dunlap +Signed-off-by: Jan Beulich +Reviewed-by: George Dunlap + +Index: xen-4.6.5/xen/arch/x86/mm.c +=================================================================== +--- xen-4.6.5.orig/xen/arch/x86/mm.c ++++ xen-4.6.5/xen/arch/x86/mm.c +@@ -2433,7 +2433,7 @@ static int _put_final_page_type(struct p + */ + if ( !(shadow_mode_enabled(page_get_owner(page)) && + (page->count_info & PGC_page_table)) ) +- page->tlbflush_timestamp = tlbflush_current_time(); ++ page_set_tlbflush_timestamp(page); + wmb(); + page->u.inuse.type_info--; + } +@@ -2443,7 +2443,7 @@ static int _put_final_page_type(struct p + (PGT_count_mask|PGT_validated|PGT_partial)) == 1); + if ( !(shadow_mode_enabled(page_get_owner(page)) && + (page->count_info & PGC_page_table)) ) +- page->tlbflush_timestamp = tlbflush_current_time(); ++ page_set_tlbflush_timestamp(page); + wmb(); + page->u.inuse.type_info |= PGT_validated; + } +@@ -2497,7 +2497,7 @@ static int _put_page_type(struct page_in + if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) ) + { + /* +- * page_set_tlbflush_timestamp() accesses the same union ++ * set_tlbflush_timestamp() accesses the same union + * linear_pt_count lives in. Unvalidated page table pages, + * however, should occur during domain destruction only + * anyway. Updating of linear_pt_count luckily is not +@@ -2518,7 +2518,7 @@ static int _put_page_type(struct page_in + */ + if ( !(shadow_mode_enabled(page_get_owner(page)) && + (page->count_info & PGC_page_table)) ) +- page->tlbflush_timestamp = tlbflush_current_time(); ++ page_set_tlbflush_timestamp(page); + } + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) +Index: xen-4.6.5/xen/arch/x86/mm/shadow/common.c +=================================================================== +--- xen-4.6.5.orig/xen/arch/x86/mm/shadow/common.c ++++ xen-4.6.5/xen/arch/x86/mm/shadow/common.c +@@ -1580,7 +1580,7 @@ void shadow_free(struct domain *d, mfn_t + * TLBs when we reuse the page. Because the destructors leave the + * contents of the pages in place, we can delay TLB flushes until + * just before the allocator hands the page out again. */ +- sp->tlbflush_timestamp = tlbflush_current_time(); ++ page_set_tlbflush_timestamp(sp); + perfc_decr(shadow_alloc_count); + page_list_add_tail(sp, &d->arch.paging.shadow.freelist); + sp = next; +Index: xen-4.6.5/xen/common/page_alloc.c +=================================================================== +--- xen-4.6.5.orig/xen/common/page_alloc.c ++++ xen-4.6.5/xen/common/page_alloc.c +@@ -878,7 +878,7 @@ static void free_heap_pages( + /* If a page has no owner it will need no safety TLB flush. */ + pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL); + if ( pg[i].u.free.need_tlbflush ) +- pg[i].tlbflush_timestamp = tlbflush_current_time(); ++ page_set_tlbflush_timestamp(&pg[i]); + + /* This page is not a guest frame any more. 
*/ + page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */ +Index: xen-4.6.5/xen/include/asm-arm/flushtlb.h +=================================================================== +--- xen-4.6.5.orig/xen/include/asm-arm/flushtlb.h ++++ xen-4.6.5/xen/include/asm-arm/flushtlb.h +@@ -2,6 +2,7 @@ + #define __ASM_ARM_FLUSHTLB_H__ + + #include ++#include + + /* + * Filter the given set of CPUs, removing those that definitely flushed their +@@ -14,6 +15,11 @@ do { + + #define tlbflush_current_time() (0) + ++static inline void page_set_tlbflush_timestamp(struct page_info *page) ++{ ++ page->tlbflush_timestamp = tlbflush_current_time(); ++} ++ + #if defined(CONFIG_ARM_32) + # include + #elif defined(CONFIG_ARM_64) +Index: xen-4.6.5/xen/include/asm-x86/flushtlb.h +=================================================================== +--- xen-4.6.5.orig/xen/include/asm-x86/flushtlb.h ++++ xen-4.6.5/xen/include/asm-x86/flushtlb.h +@@ -24,6 +24,20 @@ DECLARE_PER_CPU(u32, tlbflush_time); + + #define tlbflush_current_time() tlbflush_clock + ++static inline void page_set_tlbflush_timestamp(struct page_info *page) ++{ ++ /* ++ * Prevent storing a stale time stamp, which could happen if an update ++ * to tlbflush_clock plus a subsequent flush IPI happen between the ++ * reading of tlbflush_clock and the writing of the struct page_info ++ * field. ++ */ ++ ASSERT(local_irq_is_enabled()); ++ local_irq_disable(); ++ page->tlbflush_timestamp = tlbflush_current_time(); ++ local_irq_enable(); ++} ++ + /* + * @cpu_stamp is the timestamp at last TLB flush for the CPU we are testing. + * @lastuse_stamp is a timestamp taken when the PFN we are testing was last diff -Nru xen-4.6.5/debian/patches/xsa242-4.9.patch xen-4.6.5/debian/patches/xsa242-4.9.patch --- xen-4.6.5/debian/patches/xsa242-4.9.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa242-4.9.patch 2017-10-11 13:57:47.000000000 +0000 @@ -0,0 +1,43 @@ +From: Jan Beulich +Subject: x86: don't allow page_unlock() to drop the last type reference + +Only _put_page_type() does the necessary cleanup, and hence not all +domain pages can be released during guest cleanup (leaving around +zombie domains) if we get this wrong. + +This is XSA-242. + +Signed-off-by: Jan Beulich + +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1923,7 +1923,11 @@ void page_unlock(struct page_info *page) + + do { + x = y; ++ ASSERT((x & PGT_count_mask) && (x & PGT_locked)); ++ + nx = x - (1 | PGT_locked); ++ /* We must not drop the last reference here. */ ++ ASSERT(nx & PGT_count_mask); + } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); + } + +@@ -2611,6 +2615,17 @@ static int _put_page_type(struct page_in + (page->count_info & PGC_page_table)) ) + page_set_tlbflush_timestamp(page); + } ++ else if ( unlikely((nx & (PGT_locked | PGT_count_mask)) == ++ (PGT_locked | 1)) ) ++ { ++ /* ++ * We must not drop the second to last reference when the page is ++ * locked, as page_unlock() doesn't do any cleanup of the type. 
++ */ ++ cpu_relax(); ++ y = page->u.inuse.type_info; ++ continue; ++ } + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) + break; diff -Nru xen-4.6.5/debian/patches/xsa243-4.6-1.patch xen-4.6.5/debian/patches/xsa243-4.6-1.patch --- xen-4.6.5/debian/patches/xsa243-4.6-1.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa243-4.6-1.patch 2017-10-11 13:58:09.000000000 +0000 @@ -0,0 +1,36 @@ +From: Andrew Cooper +Subject: x86: Disable the use of auto-translated PV guests + +This is a minimal backport of c/s 92942fd3d469 "x86/mm: drop +guest_{map,get_eff}_l1e() hooks" from Xen 4.7, which stated: + + Disallow the unmaintained and presumed broken translated-but-not-external + paging mode combination ... + +It turns out that this mode is insecure to run with, as opposed to just simply +broken. + +This is part of XSA-243. + +Signed-off-by: Andrew Cooper + +diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c +index dcff4fb..945bb61 100644 +--- a/xen/arch/x86/mm/paging.c ++++ b/xen/arch/x86/mm/paging.c +@@ -835,6 +835,15 @@ void paging_final_teardown(struct domain *d) + * creation. */ + int paging_enable(struct domain *d, u32 mode) + { ++ switch ( mode & (PG_external | PG_translate) ) ++ { ++ case 0: ++ case PG_external | PG_translate: ++ break; ++ default: ++ return -EINVAL; ++ } ++ + if ( hap_enabled(d) ) + return hap_enable(d, mode | PG_HAP_enable); + else diff -Nru xen-4.6.5/debian/patches/xsa243-4.6-2.patch xen-4.6.5/debian/patches/xsa243-4.6-2.patch --- xen-4.6.5/debian/patches/xsa243-4.6-2.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa243-4.6-2.patch 2017-10-11 13:58:54.000000000 +0000 @@ -0,0 +1,94 @@ +From: Andrew Cooper +Subject: x86/shadow: Don't create self-linear shadow mappings for 4-level translated guests + +When initially creating a monitor table for 4-level translated guests, don't +install a shadow-linear mapping. This mapping is actually self-linear, and +trips up the writeable heuristic logic into following Xen's mappings, not the +guests' shadows it was expecting to follow. + +A consequence of this is that sh_guess_wrmap() needs to cope with there being +no shadow-linear mapping present, which in practice occurs once each time a +vcpu switches to 4-level paging from a different paging mode. + +An appropriate shadow-linear slot will be inserted into the monitor table +either while constructing lower level monitor tables, or by sh_update_cr3(). + +While fixing this, clarify the safety of the other mappings. Despite +appearing unsafe, it is correct to create a guest-linear mapping for +translated domains; this is self-linear and doesn't point into the translated +domain. Drop a dead clause for translate != external guests. + +This is part of XSA-243. + +Signed-off-by: Andrew Cooper +Acked-by: Tim Deegan + +diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c +index c34ebe0..cb8ddde 100644 +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -1456,26 +1456,38 @@ void sh_install_xen_entries_in_l4(struct domain *d, mfn_t gl4mfn, mfn_t sl4mfn) + sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = shadow_l4e_empty(); + } + +- /* Shadow linear mapping for 4-level shadows. N.B. for 3-level +- * shadows on 64-bit xen, this linear mapping is later replaced by the +- * monitor pagetable structure, which is built in make_monitor_table +- * and maintained by sh_update_linear_entries. 
*/ +- sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = +- shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR); +- +- /* Self linear mapping. */ +- if ( shadow_mode_translate(d) && !shadow_mode_external(d) ) +- { +- // linear tables may not be used with translated PV guests +- sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = ++ /* ++ * Linear mapping slots: ++ * ++ * Calling this function with gl4mfn == sl4mfn is used to construct a ++ * monitor table for translated domains. In this case, gl4mfn forms the ++ * self-linear mapping (i.e. not pointing into the translated domain), and ++ * the shadow-linear slot is skipped. The shadow-linear slot is either ++ * filled when constructing lower level monitor tables, or via ++ * sh_update_cr3() for 4-level guests. ++ * ++ * Calling this function with gl4mfn != sl4mfn is used for non-translated ++ * guests, where the shadow-linear slot is actually self-linear, and the ++ * guest-linear slot points into the guests view of its pagetables. ++ */ ++ if ( shadow_mode_translate(d) ) ++ { ++ ASSERT(mfn_x(gl4mfn) == mfn_x(sl4mfn)); ++ ++ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = + shadow_l4e_empty(); + } + else + { +- sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = +- shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); ++ ASSERT(mfn_x(gl4mfn) != mfn_x(sl4mfn)); ++ ++ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = ++ shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR); + } + ++ sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = ++ shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); ++ + unmap_domain_page(sl4e); + } + #endif +@@ -4270,6 +4282,11 @@ static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn) + + /* Carefully look in the shadow linear map for the l1e we expect */ + #if SHADOW_PAGING_LEVELS >= 4 ++ /* Is a shadow linear map is installed in the first place? */ ++ sl4p = v->arch.paging.shadow.guest_vtable; ++ sl4p += shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START); ++ if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) ++ return 0; + sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr); + if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) + return 0; diff -Nru xen-4.6.5/debian/patches/xsa244-4.6.patch xen-4.6.5/debian/patches/xsa244-4.6.patch --- xen-4.6.5/debian/patches/xsa244-4.6.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa244-4.6.patch 2017-10-11 13:59:18.000000000 +0000 @@ -0,0 +1,51 @@ +From: Andrew Cooper +Subject: x86/cpu: fix IST handling during PCPU bringup + +Clear IST references in newly allocated IDTs. Nothing good will come of +having them set before the TSS is suitably constructed (although the chances +of the CPU surviving such an IST interrupt/exception is extremely slim). + +Uniformly set the IST references after the TSS is in place. This fixes an +issue on AMD hardware, where onlining a PCPU while PCPU0 is in HVM context +will cause IST_NONE to be copied into the new IDT, making that PCPU vulnerable +to privilege escalation from PV guests until it subsequently schedules an HVM +guest. + +This is XSA-244. 
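As an aside, a condensed sketch of the ordering this fix enforces, not part of the patch and simply restating the two hunks below with their own identifiers: a newly copied per-CPU IDT carries no IST references until the TSS for that PCPU has been loaded.

    /* cpu_smpboot_alloc(): copy the boot IDT, then neutralise its ISTs. */
    memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t));
    set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
    set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);

    /* load_system_tables(): only after ltr has installed the TSS are the
     * IST entries pointed at their dedicated stacks. */
    set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_DF);
    set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NMI);
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);

Until the second step runs, #DF, NMI and #MC on the new PCPU use the regular stack mechanism rather than an IST stack that does not yet exist.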
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich + +--- a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c +@@ -558,6 +558,7 @@ void __init early_cpu_init(void) + * - Sets up TSS with stack pointers, including ISTs + * - Inserts TSS selector into regular and compat GDTs + * - Loads GDT, IDT, TR then null LDT ++ * - Sets up IST references in the IDT + */ + void __cpuinit load_system_tables(void) + { +@@ -604,6 +605,10 @@ void __cpuinit load_system_tables(void) + asm volatile ("lidt %0" : : "m" (idtr) ); + asm volatile ("ltr %w0" : : "rm" (TSS_ENTRY << 3) ); + asm volatile ("lldt %w0" : : "rm" (0) ); ++ ++ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); ++ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); ++ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); + } + + /* +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -749,6 +749,9 @@ static int cpu_smpboot_alloc(unsigned in + if ( idt_tables[cpu] == NULL ) + goto oom; + memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t)); ++ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); ++ set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); ++ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); + + for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1); + i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i ) diff -Nru xen-4.6.5/debian/patches/xsa245-0001-xen-page_alloc-Cover-memory-unreserved-after-boot-in.patch xen-4.6.5/debian/patches/xsa245-0001-xen-page_alloc-Cover-memory-unreserved-after-boot-in.patch --- xen-4.6.5/debian/patches/xsa245-0001-xen-page_alloc-Cover-memory-unreserved-after-boot-in.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa245-0001-xen-page_alloc-Cover-memory-unreserved-after-boot-in.patch 2017-10-11 13:59:31.000000000 +0000 @@ -0,0 +1,48 @@ +From a48d47febc1340f27d6c716545692641a09b414c Mon Sep 17 00:00:00 2001 +From: Julien Grall +Date: Thu, 21 Sep 2017 14:13:08 +0100 +Subject: [PATCH 1/2] xen/page_alloc: Cover memory unreserved after boot in + first_valid_mfn + +On Arm, some regions (e.g Initramfs, Dom0 Kernel...) are marked as +reserved until the hardware domain is built and they are copied into its +memory. Therefore, they will not be added in the boot allocator via +init_boot_pages. + +Instead, init_xenheap_pages will be called once the region are not used +anymore. + +Update first_valid_mfn in both init_heap_pages and init_boot_pages +(already exist) to cover all the cases. + +Signed-off-by: Julien Grall +[Adjust comment, added locking around first_valid_mfn update] +Signed-off-by: Boris Ostrovsky +--- + xen/common/page_alloc.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c +index 0b9f6cc6df..fbe5a8af39 100644 +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -1700,6 +1700,16 @@ static void init_heap_pages( + { + unsigned long i; + ++ /* ++ * Some pages may not go through the boot allocator (e.g reserved ++ * memory at boot but released just after --- kernel, initramfs, ++ * etc.). ++ * Update first_valid_mfn to ensure those regions are covered. 
++ */ ++ spin_lock(&heap_lock); ++ first_valid_mfn = min_t(unsigned long, page_to_mfn(pg), first_valid_mfn); ++ spin_unlock(&heap_lock); ++ + for ( i = 0; i < nr_pages; i++ ) + { + unsigned int nid = phys_to_nid(page_to_maddr(pg+i)); +-- +2.11.0 + diff -Nru xen-4.6.5/debian/patches/xsa245-0002-xen-arm-Correctly-report-the-memory-region-in-the-du.patch xen-4.6.5/debian/patches/xsa245-0002-xen-arm-Correctly-report-the-memory-region-in-the-du.patch --- xen-4.6.5/debian/patches/xsa245-0002-xen-arm-Correctly-report-the-memory-region-in-the-du.patch 1970-01-01 00:00:00.000000000 +0000 +++ xen-4.6.5/debian/patches/xsa245-0002-xen-arm-Correctly-report-the-memory-region-in-the-du.patch 2017-10-11 13:59:37.000000000 +0000 @@ -0,0 +1,73 @@ +From cbfcf039d0e0b6f4c4cb3de612f7bf788a0c47cd Mon Sep 17 00:00:00 2001 +From: Julien Grall +Date: Mon, 18 Sep 2017 14:24:08 +0100 +Subject: [PATCH 2/2] xen/arm: Correctly report the memory region in the dummy + NUMA helpers + +NUMA is currently not supported on Arm. Because common code is +NUMA-aware, dummy helpers are instead provided to expose a single node. + +Those helpers are for instance used to know the region to scrub. + +However the memory region is not reported correctly. Indeed, the +frametable may not be at the beginning of the memory and there might be +multiple memory banks. This will lead to not scrub some part of the +memory. + +The memory information can be found using: + * first_valid_mfn as the start of the memory + * max_page - first_valid_mfn as the spanned pages + +Note that first_valid_mfn is now been exported. The prototype has been +added in asm-arm/numa.h and not in a common header because I would +expect the variable to become static once NUMA is fully supported on +Arm. + +Signed-off-by: Julien Grall +--- + xen/common/page_alloc.c | 6 +++++- + xen/include/asm-arm/numa.h | 10 ++++++++-- + 2 files changed, 13 insertions(+), 3 deletions(-) + +diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c +index fbe5a8af39..472c6fe329 100644 +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -192,7 +192,11 @@ PAGE_LIST_HEAD(page_broken_list); + * BOOT-TIME ALLOCATOR + */ + +-static unsigned long __initdata first_valid_mfn = ~0UL; ++/* ++ * first_valid_mfn is exported because it is use in ARM specific NUMA ++ * helpers. See comment in asm-arm/numa.h. ++ */ ++unsigned long first_valid_mfn = ~0UL; + + static struct bootmem_region { + unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */ +diff --git a/xen/include/asm-arm/numa.h b/xen/include/asm-arm/numa.h +index a2c1a3476d..3e7384da9e 100644 +--- a/xen/include/asm-arm/numa.h ++++ b/xen/include/asm-arm/numa.h +@@ -12,9 +12,15 @@ static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr) + return 0; + } + ++/* ++ * TODO: make first_valid_mfn static when NUMA is supported on Arm, this ++ * is required because the dummy helpers is using it. ++ */ ++extern unsigned long first_valid_mfn; ++ + /* XXX: implement NUMA support */ +-#define node_spanned_pages(nid) (total_pages) +-#define node_start_pfn(nid) (pdx_to_pfn(frametable_base_pdx)) ++#define node_spanned_pages(nid) (max_page - first_valid_mfn) ++#define node_start_pfn(nid) (first_valid_mfn) + #define __node_distance(a, b) (20) + + static inline unsigned int arch_get_dma_bitsize(void) +-- +2.11.0 +
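For readers unfamiliar with the reference-counting scheme that xsa240-4.6-0001 above introduces, here is a self-contained sketch of the same idea in plain C11 atomics, using hypothetical names rather than Xen's page_info machinery: one signed per-page counter is used positively to count the same-level entries a page table holds and negatively to count the same-level tables that reference it, so the two roles exclude each other and either use is refused while the other is active or when the count would overflow.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the relevant part of struct page_info. */
    struct pt_page {
        _Atomic int16_t linear_pt_count;  /* >0: entries held; <0: linear uses */
    };

    /* Record one more same-level entry held by @pg; refused if @pg is itself
     * in use as a linear page table, or if the count would overflow. */
    static bool inc_linear_entries(struct pt_page *pg)
    {
        int16_t oc = atomic_load(&pg->linear_pt_count), nc;

        do {
            nc = (int16_t)(oc + 1);
            if ( nc <= 0 )          /* other role active, or wrapped */
                return false;
        } while ( !atomic_compare_exchange_weak(&pg->linear_pt_count, &oc, nc) );

        return true;
    }

    /* Record that @pg is referenced from another same-level page table;
     * refused if @pg itself holds same-level entries, or on overflow. */
    static bool inc_linear_uses(struct pt_page *pg)
    {
        int16_t oc = atomic_load(&pg->linear_pt_count), nc;

        do {
            nc = (int16_t)(oc - 1);
            if ( nc >= 0 )
                return false;
        } while ( !atomic_compare_exchange_weak(&pg->linear_pt_count, &oc, nc) );

        return true;
    }

The matching decrement helpers simply add the opposite value and assert the previous sign, as the dec_linear_entries()/dec_linear_uses() hunks in the patch do.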