--- dragonegg-2.8.orig/Makefile +++ dragonegg-2.8/Makefile @@ -91,8 +91,7 @@ @echo Linking $@ $(QUIET)$(CXX) -o $@ $(LOADABLE_MODULE_OPTIONS) $(CXXFLAGS) \ $(LD_OPTIONS) $(PLUGIN_OBJECTS) $(TARGET_OBJECT) \ - $(shell $(LLVM_CONFIG) --libs analysis core ipo scalaropts target \ - $(shell $(TARGET_UTIL) -p)) + -lLLVM-2.8 clean:: $(QUIET)rm -f *.o *.d $(PLUGIN) $(TARGET_UTIL) --- dragonegg-2.8.orig/debian/gcc-wrap.in +++ dragonegg-2.8/debian/gcc-wrap.in @@ -0,0 +1,2 @@ +#!/bin/sh +exec ${0#/usr/bin/llvm-}-4.5 -fplugin=@plugindir@/dragonegg.so "$@" --- dragonegg-2.8.orig/debian/rules +++ dragonegg-2.8/debian/rules @@ -0,0 +1,42 @@ +#!/usr/bin/make -f + +deb_version := $(shell dpkg-parsechangelog | sed -ne "s/^Version: \(.*\)/\1/p") +upstream_version := $(shell echo $(deb_version) | sed -e "s/-[^-]*$$//") +major := $(shell echo $(upstream_version) | sed -e "s/\([0-9]\+\.[0-9]\+\)[\.-].*/\1/g") + +plugindir := $(shell gcc-4.5 -print-file-name=plugin | sed 's/4\.\([5-9]\)\../4.\1/') + +include /usr/share/cdbs/1/rules/debhelper.mk +include /usr/share/cdbs/1/rules/simple-patchsys.mk + +stamps/build: apply-patches + # link with llvm-snapshot + LDFLAGS=-L/usr/lib/llvm-2.8/lib $(MAKE) \ + VERBOSE=1 \ + LLVM_CONFIG=/usr/lib/llvm-2.8/bin/llvm-config \ + -C $(DEB_SRCDIR) + mkdir -p stamps + touch $@ + +stamps/install: + mkdir -p debian/dragonegg/$(plugindir) + install -m755 $(DEB_SRCDIR)/dragonegg.so debian/dragonegg/$(plugindir)/ + mkdir -p stamps + touch $@ + +stamps/install-llvm-gcc-4.5: + mkdir -p debian/llvm-gcc-4.5/usr/bin + sed 's,@plugindir@,$(plugindir),' debian/gcc-wrap.in > debian/gcc-wrap + for i in c++ cpp g++ gcc gccbug gcov ; do \ + install -m755 debian/gcc-wrap debian/llvm-gcc-4.5/usr/bin/llvm-$$i ; \ + done + +build/dragonegg:: stamps/build +install/dragonegg:: stamps/install + +install/llvm-gcc-4.5:: stamps/install-llvm-gcc-4.5 + +clean:: + $(MAKE) clean VERBOSE=1 + rm -f debian/gcc-wrap + rm -rf stamps --- dragonegg-2.8.orig/debian/copyright +++ dragonegg-2.8/debian/copyright @@ -0,0 +1,50 @@ +Name: DragonEgg +Source: http://dragonegg.llvm.org/ +Maintainer: Duncan Sands + +Files: * +Copyright: 2009 Duncan Sands +License: GPL-2+ + +Files: llvm-backend.cpp + llvm-types.cpp + llvm-internal.h + linux/llvm-os.h + llvm-convert.cpp + llvm-cache.h + darwin/llvm-os.h + gt-llvm-cache.h + llvm-debug.cpp + llvm-cache.c + llvm-debug.h + x86/llvm-target.h + x86/llvm-target.cpp + llvm-abi.h +Copyright: 2004, 2005, 2006, 2007, 2009 Free Software Foundation, Inc. +License: GPL-2+ + +Files: debian/* +Copyright: 2009, Robert Millan +License: GPL-2+ + +Files: debian/patches/02_missing_gcc_headers.diff +Copyright: 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, + 2007, 2008, 2009 Free Software Foundation, Inc. +License: GPL-3+ + +Files: debian/patches/05_gcc_i386.diff +Copyright: 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, + 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + Free Software Foundation, Inc. +License: GPL-3+ + + +License: GPL-2+ + On Debian systems the full text of the GNU General Public + License can be found in the `/usr/share/common-licenses/GPL-2' + file. + +License: GPL-3+ + On Debian systems the full text of the GNU General Public + License can be found in the `/usr/share/common-licenses/GPL' + file. --- dragonegg-2.8.orig/debian/compat +++ dragonegg-2.8/debian/compat @@ -0,0 +1 @@ +7 --- dragonegg-2.8.orig/debian/changelog +++ dragonegg-2.8/debian/changelog @@ -0,0 +1,65 @@ +dragonegg (2.8-0ubuntu2) natty; urgency=low + + * Rebuild for GCC multiarch locations. 
+ + -- Matthias Klose Sun, 03 Apr 2011 11:53:59 +0200 + +dragonegg (2.8-0ubuntu1) maverick-proposed; urgency=low + + * Final 2.8 release. LP: #632727. + + -- Matthias Klose Thu, 14 Oct 2010 18:50:58 +0200 + +dragonegg (2.8~20101006-0ubuntu1) maverick; urgency=low + + * New upstream version, taken from the trunk. + + -- Matthias Klose Wed, 06 Oct 2010 16:05:42 +0200 + +dragonegg (2.8~20100921-0ubuntu1) maverick; urgency=low + + * New upstream version, taken from the trunk. + + -- Matthias Klose Tue, 21 Sep 2010 22:39:41 +0200 + +dragonegg (2.8~20100911-0ubuntu1) maverick; urgency=low + + * New upstream version, taken from the trunk. + + -- Matthias Klose Mon, 13 Sep 2010 10:37:04 +0200 + +dragonegg (2.8~20100907-0ubuntu1) maverick; urgency=low + + * New upstream version, taken from the trunk. + + -- Matthias Klose Tue, 07 Sep 2010 17:24:30 +0200 + +dragonegg (2.7-0ubuntu2) maverick; urgency=low + + * Build-depend on llvm-2.7-dev. + + -- Matthias Klose Wed, 02 Jun 2010 00:22:09 +0200 + +dragonegg (2.7-0ubuntu1) maverick; urgency=low + + * Upload to maverick. + + -- Matthias Klose Fri, 21 May 2010 13:22:45 +0200 + +dragonegg (2.7-0ubuntu1~ppa3) lucid; urgency=low + + * New upstream version. + + -- Matthias Klose Tue, 27 Apr 2010 15:09:49 +0200 + +dragonegg (0~20091229-2) experimental; urgency=low + + * control (llvm-gcc-4.5): Provide and conflict with llvm-gcc. + + -- Robert Millan Mon, 04 Jan 2010 13:15:08 +0100 + +dragonegg (0~20091229-1) experimental; urgency=low + + * Initial release. (Closes: #563315) + + -- Robert Millan Fri, 01 Jan 2010 22:55:02 +0100 --- dragonegg-2.8.orig/debian/control +++ dragonegg-2.8/debian/control @@ -0,0 +1,42 @@ +Source: dragonegg +Section: devel +Priority: optional +Maintainer: Robert Millan +Uploaders: Arthur Loiret +Build-Depends: + cdbs, + debhelper (>= 7), + gcc-4.5-plugin-dev, + llvm-2.8-dev (>= 2.8), + libffi-dev, + libmpfr-dev, + libmpc-dev, +Standards-Version: 3.7.3 +Homepage: http://dragonegg.llvm.org/ + +Package: dragonegg +Architecture: i386 kfreebsd-i386 hurd-i386 kopensolaris-i386 amd64 kfreebsd-amd64 lpia +Depends: ${shlibs:Depends}, ${misc:Depends} +Recommends: gcc-4.5 | g++-4.5 +Description: GCC plugin that uses LLVM for optimization and code generation + DragonEgg is a GCC plugin (dragonegg.so) that replaces GCC's optimizers + and code generators with those from the LLVM project. + . + It is a reimplementation of llvm-gcc that works with gcc-4.5 or later. + . + DragonEgg is under heavy development and is not mature - it may crash or + produce wrong code. + +Package: llvm-gcc-4.5 +Architecture: i386 kfreebsd-i386 hurd-i386 kopensolaris-i386 amd64 kfreebsd-amd64 lpia +Depends: ${shlibs:Depends}, ${misc:Depends}, dragonegg, gcc-4.5, g++-4.5 +Provides: llvm-gcc +Conflicts: llvm-gcc +Description: C front end for LLVM C/C++ compiler + The Low-Level Virtual Machine (LLVM) is a collection of libraries and + tools that make it easy to build compilers, optimizers, Just-In-Time + code generators, and many other compiler-related programs. + . + This is the DragonEgg-based version of llvm-gcc. Note that DragonEgg + is under heavy development and is not mature - it may crash or + produce wrong code. 
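
Illustrative sketch (an editorial annotation; everything below restates debian/gcc-wrap.in and debian/rules shown earlier): the llvm-gcc-4.5 wrappers installed by debian/rules are all copies of the same two-line script; each copy strips the /usr/bin/llvm- prefix from the name it was invoked under and re-executes the matching GCC 4.5 tool with the dragonegg plugin loaded. An annotated version of debian/gcc-wrap.in:

    #!/bin/sh
    # @plugindir@ is filled in by debian/rules with the directory printed by
    # `gcc-4.5 -print-file-name=plugin` (with the micro version stripped by sed).
    # When this copy is installed as /usr/bin/llvm-gcc, ${0#/usr/bin/llvm-}
    # expands to "gcc", so the exec below becomes:
    #   gcc-4.5 -fplugin=@plugindir@/dragonegg.so "$@"
    exec ${0#/usr/bin/llvm-}-4.5 -fplugin=@plugindir@/dragonegg.so "$@"

So running, for example, llvm-gcc -O2 hello.c invokes the gcc-4.5 front end while LLVM performs optimization and code generation, as described in the dragonegg package description above.
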
--- dragonegg-2.8.orig/debian/patches/06_no_llvm-os_header.diff +++ dragonegg-2.8/debian/patches/06_no_llvm-os_header.diff @@ -0,0 +1,11 @@ +--- ./llvm-backend.cpp~ 2010-09-07 17:20:53.000000000 +0200 ++++ ./llvm-backend.cpp 2010-09-07 17:30:35.470942297 +0200 +@@ -25,7 +25,7 @@ + } + #include "llvm-debug.h" + #include "llvm-internal.h" +-#include "llvm-os.h" ++// #include "llvm-os.h" + #include "llvm-target.h" + + // LLVM headers --- dragonegg-2.8.orig/debian/patches/04_hasnestattr.diff +++ dragonegg-2.8/debian/patches/04_hasnestattr.diff @@ -0,0 +1,11 @@ +--- dragonegg-0~20091229/llvm-backend.cpp~ 2009-12-29 20:32:56.000000000 +0100 ++++ dragonegg-0~20091229/llvm-backend.cpp 2009-12-29 21:08:06.000000000 +0100 +@@ -1644,7 +1644,7 @@ + // additional artificial arguments for doing struct return or passing a + // nested function static chain. Look for 'this' while passing through + // all arguments except for 'this' unchanged. +- if (FoundThis || AI->hasStructRetAttr() || AI->hasNestAttr()) { ++ if (FoundThis || AI->hasStructRetAttr()) { + Arguments.push_back(AI); + continue; + } --- dragonegg-2.8.orig/debian/patches/05_gcc_i386.diff +++ dragonegg-2.8/debian/patches/05_gcc_i386.diff @@ -0,0 +1,2351 @@ + +Temporary replacement for [dragonegg]/gcc-patches/i386_static.diff. Upstream +(Duncan) said he has plans for a proper solution other than merging +i386_static.diff. This patch should be removed as soon as this solution is +available. + +--- Makefile~ 2010-04-20 09:25:47.000000000 +0200 ++++ Makefile 2010-04-27 15:55:19.464836030 +0200 +@@ -34,7 +34,7 @@ + + PLUGIN=dragonegg.so + PLUGIN_OBJECTS=llvm-cache.o llvm-convert.o llvm-backend.o llvm-debug.o \ +- llvm-types.o bits_and_bobs.o llvm-abi-default.o ++ llvm-types.o bits_and_bobs.o llvm-abi-default.o gcc-i386.o + + TARGET_OBJECT=llvm-target.o + TARGET_SOURCE=$(SRC_DIR)/$(shell $(TARGET_UTIL) -p)/llvm-target.cpp +--- gcc-i386.c 1970-01-01 00:00:00 +0000 ++++ gcc-i386.c 2009-12-29 12:28:35 +0000 +@@ -0,0 +1,2330 @@ ++ ++/* ++ * Derived from [gcc]/gcc/config/i386/i386.c ++ * (pre-4.5 snapshot taken on 20091223) ++ */ ++ ++ ++/* Subroutines used for code generation on IA-32. ++ Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, ++ 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 ++ Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "tm.h" ++#include "rtl.h" ++#include "tree.h" ++#include "tm_p.h" ++#include "hard-reg-set.h" ++#include "real.h" ++#include "output.h" ++#include "flags.h" ++#include "except.h" ++#include "function.h" ++#include "toplev.h" ++#include "basic-block.h" ++#include "ggc.h" ++#include "target.h" ++#include "langhooks.h" ++#include "cgraph.h" ++#include "gimple.h" ++#include "params.h" ++ ++#ifndef CHECK_STACK_LIMIT ++#define CHECK_STACK_LIMIT (-1) ++#endif ++ ++/* Return index of given mode in mult and division cost tables. 
*/ ++#define MODE_INDEX(mode) \ ++ ((mode) == QImode ? 0 \ ++ : (mode) == HImode ? 1 \ ++ : (mode) == SImode ? 2 \ ++ : (mode) == DImode ? 3 \ ++ : 4) ++ ++/* Processor costs (relative to an add) */ ++/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ ++#define COSTS_N_BYTES(N) ((N) * 2) ++ ++#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}} ++ ++const ++struct processor_costs ix86_size_cost = {/* costs for tuning for size */ ++ COSTS_N_BYTES (2), /* cost of an add instruction */ ++ COSTS_N_BYTES (3), /* cost of a lea instruction */ ++ COSTS_N_BYTES (2), /* variable shift costs */ ++ COSTS_N_BYTES (3), /* constant shift costs */ ++ {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ ++ COSTS_N_BYTES (3), /* HI */ ++ COSTS_N_BYTES (3), /* SI */ ++ COSTS_N_BYTES (3), /* DI */ ++ COSTS_N_BYTES (5)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ ++ COSTS_N_BYTES (3), /* HI */ ++ COSTS_N_BYTES (3), /* SI */ ++ COSTS_N_BYTES (3), /* DI */ ++ COSTS_N_BYTES (5)}, /* other */ ++ COSTS_N_BYTES (3), /* cost of movsx */ ++ COSTS_N_BYTES (3), /* cost of movzx */ ++ 0, /* "large" insn */ ++ 2, /* MOVE_RATIO */ ++ 2, /* cost for loading QImode using movzbl */ ++ {2, 2, 2}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {2, 2, 2}, /* cost of storing integer registers */ ++ 2, /* cost of reg,reg fld/fst */ ++ {2, 2, 2}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {2, 2, 2}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 3, /* cost of moving MMX register */ ++ {3, 3}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {3, 3}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 3, /* cost of moving SSE register */ ++ {3, 3, 3}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {3, 3, 3}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 3, /* MMX or SSE register to integer */ ++ 0, /* size of l1 cache */ ++ 0, /* size of l2 cache */ ++ 0, /* size of prefetch block */ ++ 0, /* number of parallel prefetches */ ++ 2, /* Branch cost */ ++ COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ ++ COSTS_N_BYTES (2), /* cost of FMUL instruction. */ ++ COSTS_N_BYTES (2), /* cost of FDIV instruction. */ ++ COSTS_N_BYTES (2), /* cost of FABS instruction. */ ++ COSTS_N_BYTES (2), /* cost of FCHS instruction. */ ++ COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ ++ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, ++ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}, ++ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, ++ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 1, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 1, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++/* Processor costs (relative to an add) */ ++static const ++struct processor_costs i386_cost = { /* 386 specific costs */ ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1), /* cost of a lea instruction */ ++ COSTS_N_INSNS (3), /* variable shift costs */ ++ COSTS_N_INSNS (2), /* constant shift costs */ ++ {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (6), /* HI */ ++ COSTS_N_INSNS (6), /* SI */ ++ COSTS_N_INSNS (6), /* DI */ ++ COSTS_N_INSNS (6)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (23), /* HI */ ++ COSTS_N_INSNS (23), /* SI */ ++ COSTS_N_INSNS (23), /* DI */ ++ COSTS_N_INSNS (23)}, /* other */ ++ COSTS_N_INSNS (3), /* cost of movsx */ ++ COSTS_N_INSNS (2), /* cost of movzx */ ++ 15, /* "large" insn */ ++ 3, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {2, 4, 2}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {2, 4, 2}, /* cost of storing integer registers */ ++ 2, /* cost of reg,reg fld/fst */ ++ {8, 8, 8}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {8, 8, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {4, 8}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {4, 8}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {4, 8, 16}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {4, 8, 16}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 3, /* MMX or SSE register to integer */ ++ 0, /* size of l1 cache */ ++ 0, /* size of l2 cache */ ++ 0, /* size of prefetch block */ ++ 0, /* number of parallel prefetches */ ++ 1, /* Branch cost */ ++ COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (27), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (88), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (22), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (24), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ ++ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs i486_cost = { /* 486 specific costs */ ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1), /* cost of a lea instruction */ ++ COSTS_N_INSNS (3), /* variable shift costs */ ++ COSTS_N_INSNS (2), /* constant shift costs */ ++ {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (12), /* HI */ ++ COSTS_N_INSNS (12), /* SI */ ++ COSTS_N_INSNS (12), /* DI */ ++ COSTS_N_INSNS (12)}, /* other */ ++ 1, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (40), /* HI */ ++ COSTS_N_INSNS (40), /* SI */ ++ COSTS_N_INSNS (40), /* DI */ ++ COSTS_N_INSNS (40)}, /* other */ ++ COSTS_N_INSNS (3), /* cost of movsx */ ++ COSTS_N_INSNS (2), /* cost of movzx */ ++ 15, /* "large" insn */ ++ 3, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {2, 4, 2}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {2, 4, 2}, /* cost of storing integer registers */ ++ 2, /* cost of reg,reg fld/fst */ ++ {8, 8, 8}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {8, 8, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {4, 8}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {4, 8}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {4, 8, 16}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {4, 8, 16}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 3, /* MMX or SSE register to integer */ ++ 4, /* size of l1 cache. 486 has 8kB cache ++ shared for code and data, so 4kB is ++ not really precise. */ ++ 4, /* size of l2 cache */ ++ 0, /* size of prefetch block */ ++ 0, /* number of parallel prefetches */ ++ 1, /* Branch cost */ ++ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (16), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (73), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (3), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ ++ {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs pentium_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1), /* cost of a lea instruction */ ++ COSTS_N_INSNS (4), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (11), /* HI */ ++ COSTS_N_INSNS (11), /* SI */ ++ COSTS_N_INSNS (11), /* DI */ ++ COSTS_N_INSNS (11)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (25), /* HI */ ++ COSTS_N_INSNS (25), /* SI */ ++ COSTS_N_INSNS (25), /* DI */ ++ COSTS_N_INSNS (25)}, /* other */ ++ COSTS_N_INSNS (3), /* cost of movsx */ ++ COSTS_N_INSNS (2), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 6, /* MOVE_RATIO */ ++ 6, /* cost for loading QImode using movzbl */ ++ {2, 4, 2}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {2, 4, 2}, /* cost of storing integer registers */ ++ 2, /* cost of reg,reg fld/fst */ ++ {2, 2, 6}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {4, 4, 6}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 8, /* cost of moving MMX register */ ++ {8, 8}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {8, 8}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {4, 8, 16}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {4, 8, 16}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 3, /* MMX or SSE register to integer */ ++ 8, /* size of l1 cache. */ ++ 8, /* size of l2 cache */ ++ 0, /* size of prefetch block */ ++ 0, /* number of parallel prefetches */ ++ 2, /* Branch cost */ ++ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (3), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (39), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (1), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ ++ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{libcall, {{-1, rep_prefix_4_byte}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs pentiumpro_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1), /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (4), /* HI */ ++ COSTS_N_INSNS (4), /* SI */ ++ COSTS_N_INSNS (4), /* DI */ ++ COSTS_N_INSNS (4)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (17), /* HI */ ++ COSTS_N_INSNS (17), /* SI */ ++ COSTS_N_INSNS (17), /* DI */ ++ COSTS_N_INSNS (17)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 6, /* MOVE_RATIO */ ++ 2, /* cost for loading QImode using movzbl */ ++ {4, 4, 4}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {2, 2, 2}, /* cost of storing integer registers */ ++ 2, /* cost of reg,reg fld/fst */ ++ {2, 2, 6}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {4, 4, 6}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {2, 2}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {2, 2}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {2, 2, 8}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {2, 2, 8}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 3, /* MMX or SSE register to integer */ ++ 8, /* size of l1 cache. */ ++ 256, /* size of l2 cache */ ++ 32, /* size of prefetch block */ ++ 6, /* number of parallel prefetches */ ++ 2, /* Branch cost */ ++ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (5), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (56), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (2), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ ++ /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we ensure ++ the alignment). For small blocks inline loop is still a noticeable win, for bigger ++ blocks either rep movsl or rep movsb is way to go. Rep movsb has apparently ++ more expensive startup time in CPU, but after 4K the difference is down in the noise. ++ */ ++ {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop}, ++ {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{rep_prefix_4_byte, {{1024, unrolled_loop}, ++ {8192, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs geode_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1), /* cost of a lea instruction */ ++ COSTS_N_INSNS (2), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (4), /* HI */ ++ COSTS_N_INSNS (7), /* SI */ ++ COSTS_N_INSNS (7), /* DI */ ++ COSTS_N_INSNS (7)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (23), /* HI */ ++ COSTS_N_INSNS (39), /* SI */ ++ COSTS_N_INSNS (39), /* DI */ ++ COSTS_N_INSNS (39)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 4, /* MOVE_RATIO */ ++ 1, /* cost for loading QImode using movzbl */ ++ {1, 1, 1}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {1, 1, 1}, /* cost of storing integer registers */ ++ 1, /* cost of reg,reg fld/fst */ ++ {1, 1, 1}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {4, 6, 6}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ ++ 1, /* cost of moving MMX register */ ++ {1, 1}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {1, 1}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 1, /* cost of moving SSE register */ ++ {1, 1, 1}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {1, 1, 1}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 1, /* MMX or SSE register to integer */ ++ 64, /* size of l1 cache. */ ++ 128, /* size of l2 cache. */ ++ 32, /* size of prefetch block */ ++ 1, /* number of parallel prefetches */ ++ 1, /* Branch cost */ ++ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (11), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (47), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (1), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ ++ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs k6_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (2), /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (3), /* HI */ ++ COSTS_N_INSNS (3), /* SI */ ++ COSTS_N_INSNS (3), /* DI */ ++ COSTS_N_INSNS (3)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (18), /* HI */ ++ COSTS_N_INSNS (18), /* SI */ ++ COSTS_N_INSNS (18), /* DI */ ++ COSTS_N_INSNS (18)}, /* other */ ++ COSTS_N_INSNS (2), /* cost of movsx */ ++ COSTS_N_INSNS (2), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 4, /* MOVE_RATIO */ ++ 3, /* cost for loading QImode using movzbl */ ++ {4, 5, 4}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {2, 3, 2}, /* cost of storing integer registers */ ++ 4, /* cost of reg,reg fld/fst */ ++ {6, 6, 6}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {4, 4, 4}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {2, 2}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {2, 2}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {2, 2, 8}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {2, 2, 8}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 6, /* MMX or SSE register to integer */ ++ 32, /* size of l1 cache. */ ++ 32, /* size of l2 cache. Some models ++ have integrated l2 cache, but ++ optimizing for k6 is not important ++ enough to worry about that. */ ++ 32, /* size of prefetch block */ ++ 1, /* number of parallel prefetches */ ++ 1, /* Branch cost */ ++ COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (2), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (56), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (2), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ ++ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs athlon_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (2), /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (5), /* HI */ ++ COSTS_N_INSNS (5), /* SI */ ++ COSTS_N_INSNS (5), /* DI */ ++ COSTS_N_INSNS (5)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (26), /* HI */ ++ COSTS_N_INSNS (42), /* SI */ ++ COSTS_N_INSNS (74), /* DI */ ++ COSTS_N_INSNS (74)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 9, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {3, 4, 3}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {3, 4, 3}, /* cost of storing integer registers */ ++ 4, /* cost of reg,reg fld/fst */ ++ {4, 4, 12}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {6, 6, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {4, 4}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {4, 4}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {4, 4, 6}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {4, 4, 5}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 5, /* MMX or SSE register to integer */ ++ 64, /* size of l1 cache. */ ++ 256, /* size of l2 cache. */ ++ 64, /* size of prefetch block */ ++ 6, /* number of parallel prefetches */ ++ 5, /* Branch cost */ ++ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (24), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (2), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ ++ /* For some reason, Athlon deals better with REP prefix (relative to loops) ++ compared to K8. Alignment becomes important after 8 bytes for memcpy and ++ 128 bytes for memset. */ ++ {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs k8_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (2), /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (4), /* HI */ ++ COSTS_N_INSNS (3), /* SI */ ++ COSTS_N_INSNS (4), /* DI */ ++ COSTS_N_INSNS (5)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (26), /* HI */ ++ COSTS_N_INSNS (42), /* SI */ ++ COSTS_N_INSNS (74), /* DI */ ++ COSTS_N_INSNS (74)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 9, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {3, 4, 3}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {3, 4, 3}, /* cost of storing integer registers */ ++ 4, /* cost of reg,reg fld/fst */ ++ {4, 4, 12}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {6, 6, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {3, 3}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {4, 4}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {4, 3, 6}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {4, 4, 5}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 5, /* MMX or SSE register to integer */ ++ 64, /* size of l1 cache. */ ++ 512, /* size of l2 cache. */ ++ 64, /* size of prefetch block */ ++ /* New AMD processors never drop prefetches; if they cannot be performed ++ immediately, they are queued. We set number of simultaneous prefetches ++ to a large constant to reflect this (it probably is not a good idea not ++ to limit number of prefetches at all, as their execution also takes some ++ time). */ ++ 100, /* number of parallel prefetches */ ++ 3, /* Branch cost */ ++ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (19), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (2), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ ++ /* K8 has optimized REP instruction for medium sized blocks, but for very small ++ blocks it is better to use loop. For large blocks, libcall can do ++ nontemporary accesses and beat inline considerably. */ ++ {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, ++ {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ {{libcall, {{8, loop}, {24, unrolled_loop}, ++ {2048, rep_prefix_4_byte}, {-1, libcall}}}, ++ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ 4, /* scalar_stmt_cost. */ ++ 2, /* scalar load_cost. */ ++ 2, /* scalar_store_cost. */ ++ 5, /* vec_stmt_cost. */ ++ 0, /* vec_to_scalar_cost. */ ++ 2, /* scalar_to_vec_cost. */ ++ 2, /* vec_align_load_cost. */ ++ 3, /* vec_unalign_load_cost. */ ++ 3, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 2, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++struct processor_costs amdfam10_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (2), /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (4), /* HI */ ++ COSTS_N_INSNS (3), /* SI */ ++ COSTS_N_INSNS (4), /* DI */ ++ COSTS_N_INSNS (5)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (35), /* HI */ ++ COSTS_N_INSNS (51), /* SI */ ++ COSTS_N_INSNS (83), /* DI */ ++ COSTS_N_INSNS (83)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 9, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {3, 4, 3}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {3, 4, 3}, /* cost of storing integer registers */ ++ 4, /* cost of reg,reg fld/fst */ ++ {4, 4, 12}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {6, 6, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {3, 3}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {4, 4}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {4, 4, 3}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {4, 4, 5}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 3, /* MMX or SSE register to integer */ ++ /* On K8 ++ MOVD reg64, xmmreg Double FSTORE 4 ++ MOVD reg32, xmmreg Double FSTORE 4 ++ On AMDFAM10 ++ MOVD reg64, xmmreg Double FADD 3 ++ 1/1 1/1 ++ MOVD reg32, xmmreg Double FADD 3 ++ 1/1 1/1 */ ++ 64, /* size of l1 cache. */ ++ 512, /* size of l2 cache. */ ++ 64, /* size of prefetch block */ ++ /* New AMD processors never drop prefetches; if they cannot be performed ++ immediately, they are queued. We set number of simultaneous prefetches ++ to a large constant to reflect this (it probably is not a good idea not ++ to limit number of prefetches at all, as their execution also takes some ++ time). */ ++ 100, /* number of parallel prefetches */ ++ 2, /* Branch cost */ ++ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (19), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (2), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ ++ ++ /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for ++ very small blocks it is better to use loop. For large blocks, libcall can ++ do nontemporary accesses and beat inline considerably. */ ++ {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, ++ {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ {{libcall, {{8, loop}, {24, unrolled_loop}, ++ {2048, rep_prefix_4_byte}, {-1, libcall}}}, ++ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ 4, /* scalar_stmt_cost. */ ++ 2, /* scalar load_cost. */ ++ 2, /* scalar_store_cost. */ ++ 6, /* vec_stmt_cost. */ ++ 0, /* vec_to_scalar_cost. */ ++ 2, /* scalar_to_vec_cost. */ ++ 2, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 2, /* vec_store_cost. */ ++ 2, /* cond_taken_branch_cost. 
*/ ++ 1, /* cond_not_taken_branch_cost. */ ++}; ++ ++static const ++struct processor_costs pentium4_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (3), /* cost of a lea instruction */ ++ COSTS_N_INSNS (4), /* variable shift costs */ ++ COSTS_N_INSNS (4), /* constant shift costs */ ++ {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (15), /* HI */ ++ COSTS_N_INSNS (15), /* SI */ ++ COSTS_N_INSNS (15), /* DI */ ++ COSTS_N_INSNS (15)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (56), /* HI */ ++ COSTS_N_INSNS (56), /* SI */ ++ COSTS_N_INSNS (56), /* DI */ ++ COSTS_N_INSNS (56)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 16, /* "large" insn */ ++ 6, /* MOVE_RATIO */ ++ 2, /* cost for loading QImode using movzbl */ ++ {4, 5, 4}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {2, 3, 2}, /* cost of storing integer registers */ ++ 2, /* cost of reg,reg fld/fst */ ++ {2, 2, 6}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {4, 4, 6}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {2, 2}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {2, 2}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 12, /* cost of moving SSE register */ ++ {12, 12, 12}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {2, 2, 8}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 10, /* MMX or SSE register to integer */ ++ 8, /* size of l1 cache. */ ++ 256, /* size of l2 cache. */ ++ 64, /* size of prefetch block */ ++ 6, /* number of parallel prefetches */ ++ 2, /* Branch cost */ ++ COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (7), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (43), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (2), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ ++ {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, ++ {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs nocona_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1), /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (10), /* HI */ ++ COSTS_N_INSNS (10), /* SI */ ++ COSTS_N_INSNS (10), /* DI */ ++ COSTS_N_INSNS (10)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (66), /* HI */ ++ COSTS_N_INSNS (66), /* SI */ ++ COSTS_N_INSNS (66), /* DI */ ++ COSTS_N_INSNS (66)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 16, /* "large" insn */ ++ 17, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {4, 4, 4}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {4, 4, 4}, /* cost of storing integer registers */ ++ 3, /* cost of reg,reg fld/fst */ ++ {12, 12, 12}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {4, 4, 4}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 6, /* cost of moving MMX register */ ++ {12, 12}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {12, 12}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 6, /* cost of moving SSE register */ ++ {12, 12, 12}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {12, 12, 12}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 8, /* MMX or SSE register to integer */ ++ 8, /* size of l1 cache. */ ++ 1024, /* size of l2 cache. */ ++ 128, /* size of prefetch block */ ++ 8, /* number of parallel prefetches */ ++ 1, /* Branch cost */ ++ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (8), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (40), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (3), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ ++ {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, ++ {libcall, {{32, loop}, {20000, rep_prefix_8_byte}, ++ {100000, unrolled_loop}, {-1, libcall}}}}, ++ {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, ++ {-1, libcall}}}, ++ {libcall, {{24, loop}, {64, unrolled_loop}, ++ {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs core2_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (3), /* HI */ ++ COSTS_N_INSNS (3), /* SI */ ++ COSTS_N_INSNS (3), /* DI */ ++ COSTS_N_INSNS (3)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (22), /* HI */ ++ COSTS_N_INSNS (22), /* SI */ ++ COSTS_N_INSNS (22), /* DI */ ++ COSTS_N_INSNS (22)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 16, /* MOVE_RATIO */ ++ 2, /* cost for loading QImode using movzbl */ ++ {6, 6, 6}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {4, 4, 4}, /* cost of storing integer registers */ ++ 2, /* cost of reg,reg fld/fst */ ++ {6, 6, 6}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {4, 4, 4}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {6, 6}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {4, 4}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {6, 6, 6}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {4, 4, 4}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 2, /* MMX or SSE register to integer */ ++ 32, /* size of l1 cache. */ ++ 2048, /* size of l2 cache. */ ++ 128, /* size of prefetch block */ ++ 8, /* number of parallel prefetches */ ++ 3, /* Branch cost */ ++ COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (5), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (32), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (1), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (58), /* cost of FSQRT instruction. */ ++ {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}}, ++ {libcall, {{32, loop}, {64, rep_prefix_4_byte}, ++ {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ {{libcall, {{8, loop}, {15, unrolled_loop}, ++ {2048, rep_prefix_4_byte}, {-1, libcall}}}, ++ {libcall, {{24, loop}, {32, unrolled_loop}, ++ {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. 
*/ ++}; ++ ++static const ++struct processor_costs atom_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (4), /* HI */ ++ COSTS_N_INSNS (3), /* SI */ ++ COSTS_N_INSNS (4), /* DI */ ++ COSTS_N_INSNS (2)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (26), /* HI */ ++ COSTS_N_INSNS (42), /* SI */ ++ COSTS_N_INSNS (74), /* DI */ ++ COSTS_N_INSNS (74)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 17, /* MOVE_RATIO */ ++ 2, /* cost for loading QImode using movzbl */ ++ {4, 4, 4}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {4, 4, 4}, /* cost of storing integer registers */ ++ 4, /* cost of reg,reg fld/fst */ ++ {12, 12, 12}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {6, 6, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {8, 8}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {8, 8}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {8, 8, 8}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {8, 8, 8}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 5, /* MMX or SSE register to integer */ ++ 32, /* size of l1 cache. */ ++ 256, /* size of l2 cache. */ ++ 64, /* size of prefetch block */ ++ 6, /* number of parallel prefetches */ ++ 3, /* Branch cost */ ++ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (8), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (20), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (8), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ ++ {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}}, ++ {libcall, {{32, loop}, {64, rep_prefix_4_byte}, ++ {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ {{libcall, {{8, loop}, {15, unrolled_loop}, ++ {2048, rep_prefix_4_byte}, {-1, libcall}}}, ++ {libcall, {{24, loop}, {32, unrolled_loop}, ++ {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. */ ++}; ++ ++/* Generic64 should produce code tuned for Nocona and K8. */ ++static const ++struct processor_costs generic64_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ /* On all chips taken into consideration lea is 2 cycles and more. With ++ this cost however our current implementation of synth_mult results in ++ use of unnecessary temporary registers causing regression on several ++ SPECfp benchmarks. 
*/ ++ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (4), /* HI */ ++ COSTS_N_INSNS (3), /* SI */ ++ COSTS_N_INSNS (4), /* DI */ ++ COSTS_N_INSNS (2)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (26), /* HI */ ++ COSTS_N_INSNS (42), /* SI */ ++ COSTS_N_INSNS (74), /* DI */ ++ COSTS_N_INSNS (74)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 17, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {4, 4, 4}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {4, 4, 4}, /* cost of storing integer registers */ ++ 4, /* cost of reg,reg fld/fst */ ++ {12, 12, 12}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {6, 6, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {8, 8}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {8, 8}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {8, 8, 8}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {8, 8, 8}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 5, /* MMX or SSE register to integer */ ++ 32, /* size of l1 cache. */ ++ 512, /* size of l2 cache. */ ++ 64, /* size of prefetch block */ ++ 6, /* number of parallel prefetches */ ++ /* Benchmarks shows large regressions on K8 sixtrack benchmark when this value ++ is increased to perhaps more appropriate value of 5. */ ++ 3, /* Branch cost */ ++ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (8), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (20), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (8), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ ++ {DUMMY_STRINGOP_ALGS, ++ {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ {DUMMY_STRINGOP_ALGS, ++ {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. */ ++}; ++ ++/* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. 
*/ ++static const ++struct processor_costs generic32_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (4), /* HI */ ++ COSTS_N_INSNS (3), /* SI */ ++ COSTS_N_INSNS (4), /* DI */ ++ COSTS_N_INSNS (2)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (26), /* HI */ ++ COSTS_N_INSNS (42), /* SI */ ++ COSTS_N_INSNS (74), /* DI */ ++ COSTS_N_INSNS (74)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 17, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {4, 4, 4}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {4, 4, 4}, /* cost of storing integer registers */ ++ 4, /* cost of reg,reg fld/fst */ ++ {12, 12, 12}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {6, 6, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {8, 8}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {8, 8}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {8, 8, 8}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {8, 8, 8}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 5, /* MMX or SSE register to integer */ ++ 32, /* size of l1 cache. */ ++ 256, /* size of l2 cache. */ ++ 64, /* size of prefetch block */ ++ 6, /* number of parallel prefetches */ ++ 3, /* Branch cost */ ++ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (8), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (20), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (8), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ ++ {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, ++ DUMMY_STRINGOP_ALGS}, ++ 1, /* scalar_stmt_cost. */ ++ 1, /* scalar load_cost. */ ++ 1, /* scalar_store_cost. */ ++ 1, /* vec_stmt_cost. */ ++ 1, /* vec_to_scalar_cost. */ ++ 1, /* scalar_to_vec_cost. */ ++ 1, /* vec_align_load_cost. */ ++ 2, /* vec_unalign_load_cost. */ ++ 1, /* vec_store_cost. */ ++ 3, /* cond_taken_branch_cost. */ ++ 1, /* cond_not_taken_branch_cost. */ ++}; ++ ++const struct processor_costs *ix86_cost = &pentium_cost; ++ ++/* Processor feature/optimization bitmasks. */ ++#define m_386 (1< to_allocate <- FRAME_POINTER ++ [frame] ( ++ ) ++ [padding2] / ++ */ ++struct ix86_frame ++{ ++ int padding0; ++ int nsseregs; ++ int nregs; ++ int padding1; ++ int va_arg_size; ++ HOST_WIDE_INT frame; ++ int padding2; ++ int outgoing_arguments_size; ++ int red_zone_size; ++ ++ HOST_WIDE_INT to_allocate; ++ /* The offsets relative to ARG_POINTER. */ ++ HOST_WIDE_INT frame_pointer_offset; ++ HOST_WIDE_INT hard_frame_pointer_offset; ++ HOST_WIDE_INT stack_pointer_offset; ++ ++ /* When save_regs_using_mov is set, emit prologue using ++ move instead of push instructions. */ ++ bool save_regs_using_mov; ++}; ++ ++/* Code model option. */ ++enum cmodel ix86_cmodel; ++/* Asm dialect. 
*/ ++enum asm_dialect ix86_asm_dialect = ASM_ATT; ++/* TLS dialects. */ ++enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU; ++ ++/* Which unit we are generating floating point math for. */ ++enum fpmath_unit ix86_fpmath; ++ ++/* Which cpu are we optimizing for. */ ++enum processor_type ix86_tune; ++ ++/* Which instruction set architecture to use. */ ++enum processor_type ix86_arch; ++ ++/* true if sse prefetch instruction is not NOOP. */ ++int x86_prefetch_sse; ++ ++/* -mstackrealign option */ ++extern int ix86_force_align_arg_pointer; ++static const char ix86_force_align_arg_pointer_string[] ++ = "force_align_arg_pointer"; ++ ++/* Preferred alignment for stack boundary in bits. */ ++unsigned int ix86_preferred_stack_boundary; ++ ++/* The abi used by target. */ ++enum calling_abi ix86_abi; ++ ++/* Values 1-5: see jump.c */ ++int ix86_branch_cost; ++ ++/* Variables which are this size or smaller are put in the data/bss ++ or ldata/lbss sections. */ ++ ++int ix86_section_threshold = 65536; ++ ++/* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */ ++char internal_label_prefix[16]; ++int internal_label_prefix_len; ++ ++/* Fence to use after loop using movnt. */ ++tree x86_mfence; ++ ++/* Register class used for passing given 64bit part of the argument. ++ These represent classes as documented by the PS ABI, with the exception ++ of SSESF, SSEDF classes, that are basically SSE class, just gcc will ++ use SF or DFmode move instead of DImode to avoid reformatting penalties. ++ ++ Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves ++ whenever possible (upper half does contain padding). */ ++enum x86_64_reg_class ++ { ++ X86_64_NO_CLASS, ++ X86_64_INTEGER_CLASS, ++ X86_64_INTEGERSI_CLASS, ++ X86_64_SSE_CLASS, ++ X86_64_SSESF_CLASS, ++ X86_64_SSEDF_CLASS, ++ X86_64_SSEUP_CLASS, ++ X86_64_X87_CLASS, ++ X86_64_X87UP_CLASS, ++ X86_64_COMPLEX_X87_CLASS, ++ X86_64_MEMORY_CLASS ++ }; ++ ++#define MAX_CLASSES 4 ++ ++ ++ ++enum ix86_function_specific_strings ++{ ++ IX86_FUNCTION_SPECIFIC_ARCH, ++ IX86_FUNCTION_SPECIFIC_TUNE, ++ IX86_FUNCTION_SPECIFIC_FPMATH, ++ IX86_FUNCTION_SPECIFIC_MAX ++}; ++ ++ ++/* The svr4 ABI for the i386 says that records and unions are returned ++ in memory. */ ++#ifndef DEFAULT_PCC_STRUCT_RETURN ++#define DEFAULT_PCC_STRUCT_RETURN 1 ++#endif ++ ++/* Bit flags that specify the ISA we are compiling for. */ ++int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT; ++ ++/* Define a set of ISAs which are available when a given ISA is ++ enabled. MMX and SSE ISAs are handled separately. */ ++ ++#define OPTION_MASK_ISA_MMX_SET OPTION_MASK_ISA_MMX ++#define OPTION_MASK_ISA_3DNOW_SET \ ++ (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_MMX_SET) ++ ++#define OPTION_MASK_ISA_SSE_SET OPTION_MASK_ISA_SSE ++#define OPTION_MASK_ISA_SSE2_SET \ ++ (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE_SET) ++#define OPTION_MASK_ISA_SSE3_SET \ ++ (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE2_SET) ++#define OPTION_MASK_ISA_SSSE3_SET \ ++ (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE3_SET) ++#define OPTION_MASK_ISA_SSE4_1_SET \ ++ (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSSE3_SET) ++#define OPTION_MASK_ISA_SSE4_2_SET \ ++ (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_1_SET) ++#define OPTION_MASK_ISA_AVX_SET \ ++ (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_SSE4_2_SET) ++#define OPTION_MASK_ISA_FMA_SET \ ++ (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_AVX_SET) ++ ++/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same ++ as -msse4.2. 
*/ ++#define OPTION_MASK_ISA_SSE4_SET OPTION_MASK_ISA_SSE4_2_SET ++ ++#define OPTION_MASK_ISA_SSE4A_SET \ ++ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET) ++#define OPTION_MASK_ISA_FMA4_SET \ ++ (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_SSE4A_SET \ ++ | OPTION_MASK_ISA_AVX_SET) ++#define OPTION_MASK_ISA_XOP_SET \ ++ (OPTION_MASK_ISA_XOP | OPTION_MASK_ISA_FMA4_SET) ++#define OPTION_MASK_ISA_LWP_SET \ ++ OPTION_MASK_ISA_LWP ++ ++/* AES and PCLMUL need SSE2 because they use xmm registers */ ++#define OPTION_MASK_ISA_AES_SET \ ++ (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2_SET) ++#define OPTION_MASK_ISA_PCLMUL_SET \ ++ (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2_SET) ++ ++#define OPTION_MASK_ISA_ABM_SET \ ++ (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT) ++ ++#define OPTION_MASK_ISA_POPCNT_SET OPTION_MASK_ISA_POPCNT ++#define OPTION_MASK_ISA_CX16_SET OPTION_MASK_ISA_CX16 ++#define OPTION_MASK_ISA_SAHF_SET OPTION_MASK_ISA_SAHF ++#define OPTION_MASK_ISA_MOVBE_SET OPTION_MASK_ISA_MOVBE ++#define OPTION_MASK_ISA_CRC32_SET OPTION_MASK_ISA_CRC32 ++ ++/* Define a set of ISAs which aren't available when a given ISA is ++ disabled. MMX and SSE ISAs are handled separately. */ ++ ++#define OPTION_MASK_ISA_MMX_UNSET \ ++ (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_3DNOW_UNSET) ++#define OPTION_MASK_ISA_3DNOW_UNSET \ ++ (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_A_UNSET) ++#define OPTION_MASK_ISA_3DNOW_A_UNSET OPTION_MASK_ISA_3DNOW_A ++ ++#define OPTION_MASK_ISA_SSE_UNSET \ ++ (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2_UNSET) ++#define OPTION_MASK_ISA_SSE2_UNSET \ ++ (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE3_UNSET) ++#define OPTION_MASK_ISA_SSE3_UNSET \ ++ (OPTION_MASK_ISA_SSE3 \ ++ | OPTION_MASK_ISA_SSSE3_UNSET \ ++ | OPTION_MASK_ISA_SSE4A_UNSET ) ++#define OPTION_MASK_ISA_SSSE3_UNSET \ ++ (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE4_1_UNSET) ++#define OPTION_MASK_ISA_SSE4_1_UNSET \ ++ (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2_UNSET) ++#define OPTION_MASK_ISA_SSE4_2_UNSET \ ++ (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_AVX_UNSET ) ++#define OPTION_MASK_ISA_AVX_UNSET \ ++ (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET \ ++ | OPTION_MASK_ISA_FMA4_UNSET) ++#define OPTION_MASK_ISA_FMA_UNSET OPTION_MASK_ISA_FMA ++ ++/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same ++ as -mno-sse4.1. */ ++#define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET ++ ++#define OPTION_MASK_ISA_SSE4A_UNSET \ ++ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_FMA4_UNSET) ++ ++#define OPTION_MASK_ISA_FMA4_UNSET \ ++ (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_XOP_UNSET) ++#define OPTION_MASK_ISA_XOP_UNSET OPTION_MASK_ISA_XOP ++#define OPTION_MASK_ISA_LWP_UNSET OPTION_MASK_ISA_LWP ++ ++#define OPTION_MASK_ISA_AES_UNSET OPTION_MASK_ISA_AES ++#define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL ++#define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM ++#define OPTION_MASK_ISA_POPCNT_UNSET OPTION_MASK_ISA_POPCNT ++#define OPTION_MASK_ISA_CX16_UNSET OPTION_MASK_ISA_CX16 ++#define OPTION_MASK_ISA_SAHF_UNSET OPTION_MASK_ISA_SAHF ++#define OPTION_MASK_ISA_MOVBE_UNSET OPTION_MASK_ISA_MOVBE ++#define OPTION_MASK_ISA_CRC32_UNSET OPTION_MASK_ISA_CRC32 ++ ++#if 0 ++/* Vectorization library interface and handlers. 
*/ ++tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL; ++static tree ix86_veclibabi_svml (enum built_in_function, tree, tree); ++static tree ix86_veclibabi_acml (enum built_in_function, tree, tree); ++#endif ++ ++/* Processor target table, indexed by processor number */ ++struct ptt ++{ ++ const struct processor_costs *cost; /* Processor costs */ ++ const int align_loop; /* Default alignments. */ ++ const int align_loop_max_skip; ++ const int align_jump; ++ const int align_jump_max_skip; ++ const int align_func; ++}; ++ ++static const struct ptt processor_target_table[PROCESSOR_max] = ++{ ++ {&i386_cost, 4, 3, 4, 3, 4}, ++ {&i486_cost, 16, 15, 16, 15, 16}, ++ {&pentium_cost, 16, 7, 16, 7, 16}, ++ {&pentiumpro_cost, 16, 15, 16, 10, 16}, ++ {&geode_cost, 0, 0, 0, 0, 0}, ++ {&k6_cost, 32, 7, 32, 7, 32}, ++ {&athlon_cost, 16, 7, 16, 7, 16}, ++ {&pentium4_cost, 0, 0, 0, 0, 0}, ++ {&k8_cost, 16, 7, 16, 7, 16}, ++ {&nocona_cost, 0, 0, 0, 0, 0}, ++ {&core2_cost, 16, 10, 16, 10, 16}, ++ {&generic32_cost, 16, 7, 16, 7, 16}, ++ {&generic64_cost, 16, 10, 16, 10, 16}, ++ {&amdfam10_cost, 32, 24, 32, 7, 32}, ++ {&atom_cost, 16, 7, 16, 7, 16} ++}; ++ ++static const char *const cpu_names[TARGET_CPU_DEFAULT_max] = ++{ ++ "generic", ++ "i386", ++ "i486", ++ "pentium", ++ "pentium-mmx", ++ "pentiumpro", ++ "pentium2", ++ "pentium3", ++ "pentium4", ++ "pentium-m", ++ "prescott", ++ "nocona", ++ "core2", ++ "atom", ++ "geode", ++ "k6", ++ "k6-2", ++ "k6-3", ++ "athlon", ++ "athlon-4", ++ "k8", ++ "amdfam10" ++}; ++ ++ ++/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. ++ But in the case of vector types, it is some vector mode. ++ ++ When we have only some of our vector isa extensions enabled, then there ++ are some modes for which vector_mode_supported_p is false. For these ++ modes, the generic vector support in gcc will choose some non-vector mode ++ in order to implement the type. By computing the natural mode, we'll ++ select the proper ABI location for the operand and not depend on whatever ++ the middle-end decides to do with these vector types. ++ ++ The midde-end can't deal with the vector types > 16 bytes. In this ++ case, we return the original mode and warn ABI change if CUM isn't ++ NULL. */ ++ ++enum machine_mode ++type_natural_mode (const_tree type, CUMULATIVE_ARGS *cum) ++{ ++ enum machine_mode mode = TYPE_MODE (type); ++ ++ if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode)) ++ { ++ HOST_WIDE_INT size = int_size_in_bytes (type); ++ if ((size == 8 || size == 16 || size == 32) ++ /* ??? Generic code allows us to create width 1 vectors. Ignore. */ ++ && TYPE_VECTOR_SUBPARTS (type) > 1) ++ { ++ enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); ++ ++ if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) ++ mode = MIN_MODE_VECTOR_FLOAT; ++ else ++ mode = MIN_MODE_VECTOR_INT; ++ ++ /* Get the mode which has this inner mode and number of units. */ ++ for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode)) ++ if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) ++ && GET_MODE_INNER (mode) == innermode) ++ { ++ if (size == 32 && !TARGET_AVX) ++ { ++ static bool warnedavx; ++ ++ if (cum ++ && !warnedavx ++ && cum->warn_avx) ++ { ++ warnedavx = true; ++ warning (0, "AVX vector argument without AVX " ++ "enabled changes the ABI"); ++ } ++ return TYPE_MODE (type); ++ } ++ else ++ return mode; ++ } ++ ++ gcc_unreachable (); ++ } ++ } ++ ++ return mode; ++} ++ ++/* x86-64 register passing implementation. See x86-64 ABI for details. 
Goal ++ of this code is to classify each 8bytes of incoming argument by the register ++ class and assign registers accordingly. */ ++ ++/* Return the union class of CLASS1 and CLASS2. ++ See the x86-64 PS ABI for details. */ ++ ++static enum x86_64_reg_class ++merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) ++{ ++ /* Rule #1: If both classes are equal, this is the resulting class. */ ++ if (class1 == class2) ++ return class1; ++ ++ /* Rule #2: If one of the classes is NO_CLASS, the resulting class is ++ the other class. */ ++ if (class1 == X86_64_NO_CLASS) ++ return class2; ++ if (class2 == X86_64_NO_CLASS) ++ return class1; ++ ++ /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */ ++ if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS) ++ return X86_64_MEMORY_CLASS; ++ ++ /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ ++ if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) ++ || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) ++ return X86_64_INTEGERSI_CLASS; ++ if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS ++ || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) ++ return X86_64_INTEGER_CLASS; ++ ++ /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, ++ MEMORY is used. */ ++ if (class1 == X86_64_X87_CLASS ++ || class1 == X86_64_X87UP_CLASS ++ || class1 == X86_64_COMPLEX_X87_CLASS ++ || class2 == X86_64_X87_CLASS ++ || class2 == X86_64_X87UP_CLASS ++ || class2 == X86_64_COMPLEX_X87_CLASS) ++ return X86_64_MEMORY_CLASS; ++ ++ /* Rule #6: Otherwise class SSE is used. */ ++ return X86_64_SSE_CLASS; ++} ++ ++/* Classify the argument of type TYPE and mode MODE. ++ CLASSES will be filled by the register class used to pass each word ++ of the operand. The number of words is returned. In case the parameter ++ should be passed in memory, 0 is returned. As a special case for zero ++ sized containers, classes[0] will be NO_CLASS and 1 is returned. ++ ++ BIT_OFFSET is used internally for handling records and specifies offset ++ of the offset in bits modulo 256 to avoid overflow cases. ++ ++ See the x86-64 PS ABI for details. ++*/ ++ ++int ++classify_argument (enum machine_mode mode, const_tree type, ++ enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) ++{ ++ HOST_WIDE_INT bytes = ++ (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); ++ int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD; ++ ++ /* Variable sized entities are always passed/returned in memory. */ ++ if (bytes < 0) ++ return 0; ++ ++ if (mode != VOIDmode ++ && targetm.calls.must_pass_in_stack (mode, type)) ++ return 0; ++ ++ if (type && AGGREGATE_TYPE_P (type)) ++ { ++ int i; ++ tree field; ++ enum x86_64_reg_class subclasses[MAX_CLASSES]; ++ ++ /* On x86-64 we pass structures larger than 32 bytes on the stack. */ ++ if (bytes > 32) ++ return 0; ++ ++ for (i = 0; i < words; i++) ++ classes[i] = X86_64_NO_CLASS; ++ ++ /* Zero sized arrays or structures are NO_CLASS. We return 0 to ++ signalize memory class, so handle it as special case. */ ++ if (!words) ++ { ++ classes[0] = X86_64_NO_CLASS; ++ return 1; ++ } ++ ++ /* Classify each field of record and merge classes. */ ++ switch (TREE_CODE (type)) ++ { ++ case RECORD_TYPE: ++ /* And now merge the fields of structure. 
*/ ++ for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL) ++ { ++ int num; ++ ++ if (TREE_TYPE (field) == error_mark_node) ++ continue; ++ ++ /* Bitfields are always classified as integer. Handle them ++ early, since later code would consider them to be ++ misaligned integers. */ ++ if (DECL_BIT_FIELD (field)) ++ { ++ for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8; ++ i < ((int_bit_position (field) + (bit_offset % 64)) ++ + tree_low_cst (DECL_SIZE (field), 0) ++ + 63) / 8 / 8; i++) ++ classes[i] = ++ merge_classes (X86_64_INTEGER_CLASS, ++ classes[i]); ++ } ++ else ++ { ++ int pos; ++ ++ type = TREE_TYPE (field); ++ ++ /* Flexible array member is ignored. */ ++ if (TYPE_MODE (type) == BLKmode ++ && TREE_CODE (type) == ARRAY_TYPE ++ && TYPE_SIZE (type) == NULL_TREE ++ && TYPE_DOMAIN (type) != NULL_TREE ++ && (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) ++ == NULL_TREE)) ++ { ++ static bool warned; ++ ++ if (!warned && warn_psabi) ++ { ++ warned = true; ++ inform (input_location, ++ "The ABI of passing struct with" ++ " a flexible array member has" ++ " changed in GCC 4.4"); ++ } ++ continue; ++ } ++ num = classify_argument (TYPE_MODE (type), type, ++ subclasses, ++ (int_bit_position (field) ++ + bit_offset) % 256); ++ if (!num) ++ return 0; ++ pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8; ++ for (i = 0; i < num && (i + pos) < words; i++) ++ classes[i + pos] = ++ merge_classes (subclasses[i], classes[i + pos]); ++ } ++ } ++ } ++ break; ++ ++ case ARRAY_TYPE: ++ /* Arrays are handled as small records. */ ++ { ++ int num; ++ num = classify_argument (TYPE_MODE (TREE_TYPE (type)), ++ TREE_TYPE (type), subclasses, bit_offset); ++ if (!num) ++ return 0; ++ ++ /* The partial classes are now full classes. */ ++ if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) ++ subclasses[0] = X86_64_SSE_CLASS; ++ if (subclasses[0] == X86_64_INTEGERSI_CLASS ++ && !((bit_offset % 64) == 0 && bytes == 4)) ++ subclasses[0] = X86_64_INTEGER_CLASS; ++ ++ for (i = 0; i < words; i++) ++ classes[i] = subclasses[i % num]; ++ ++ break; ++ } ++ case UNION_TYPE: ++ case QUAL_UNION_TYPE: ++ /* Unions are similar to RECORD_TYPE but offset is always 0. ++ */ ++ for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL) ++ { ++ int num; ++ ++ if (TREE_TYPE (field) == error_mark_node) ++ continue; ++ ++ num = classify_argument (TYPE_MODE (TREE_TYPE (field)), ++ TREE_TYPE (field), subclasses, ++ bit_offset); ++ if (!num) ++ return 0; ++ for (i = 0; i < num; i++) ++ classes[i] = merge_classes (subclasses[i], classes[i]); ++ } ++ } ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (words > 2) ++ { ++ /* When size > 16 bytes, if the first one isn't ++ X86_64_SSE_CLASS or any other ones aren't ++ X86_64_SSEUP_CLASS, everything should be passed in ++ memory. */ ++ if (classes[0] != X86_64_SSE_CLASS) ++ return 0; ++ ++ for (i = 1; i < words; i++) ++ if (classes[i] != X86_64_SSEUP_CLASS) ++ return 0; ++ } ++ ++ /* Final merger cleanup. */ ++ for (i = 0; i < words; i++) ++ { ++ /* If one class is MEMORY, everything should be passed in ++ memory. */ ++ if (classes[i] == X86_64_MEMORY_CLASS) ++ return 0; ++ ++ /* The X86_64_SSEUP_CLASS should be always preceded by ++ X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. 
*/ ++ if (classes[i] == X86_64_SSEUP_CLASS ++ && classes[i - 1] != X86_64_SSE_CLASS ++ && classes[i - 1] != X86_64_SSEUP_CLASS) ++ { ++ /* The first one should never be X86_64_SSEUP_CLASS. */ ++ gcc_assert (i != 0); ++ classes[i] = X86_64_SSE_CLASS; ++ } ++ ++ /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS, ++ everything should be passed in memory. */ ++ if (classes[i] == X86_64_X87UP_CLASS ++ && (classes[i - 1] != X86_64_X87_CLASS)) ++ { ++ static bool warned; ++ ++ /* The first one should never be X86_64_X87UP_CLASS. */ ++ gcc_assert (i != 0); ++ if (!warned && warn_psabi) ++ { ++ warned = true; ++ inform (input_location, ++ "The ABI of passing union with long double" ++ " has changed in GCC 4.4"); ++ } ++ return 0; ++ } ++ } ++ return words; ++ } ++ ++ /* Compute alignment needed. We align all types to natural boundaries with ++ exception of XFmode that is aligned to 64bits. */ ++ if (mode != VOIDmode && mode != BLKmode) ++ { ++ int mode_alignment = GET_MODE_BITSIZE (mode); ++ ++ if (mode == XFmode) ++ mode_alignment = 128; ++ else if (mode == XCmode) ++ mode_alignment = 256; ++ if (COMPLEX_MODE_P (mode)) ++ mode_alignment /= 2; ++ /* Misaligned fields are always returned in memory. */ ++ if (bit_offset % mode_alignment) ++ return 0; ++ } ++ ++ /* for V1xx modes, just use the base mode */ ++ if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode ++ && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes) ++ mode = GET_MODE_INNER (mode); ++ ++ /* Classification of atomic types. */ ++ switch (mode) ++ { ++ case SDmode: ++ case DDmode: ++ classes[0] = X86_64_SSE_CLASS; ++ return 1; ++ case TDmode: ++ classes[0] = X86_64_SSE_CLASS; ++ classes[1] = X86_64_SSEUP_CLASS; ++ return 2; ++ case DImode: ++ case SImode: ++ case HImode: ++ case QImode: ++ case CSImode: ++ case CHImode: ++ case CQImode: ++ { ++ int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode); ++ ++ if (size <= 32) ++ { ++ classes[0] = X86_64_INTEGERSI_CLASS; ++ return 1; ++ } ++ else if (size <= 64) ++ { ++ classes[0] = X86_64_INTEGER_CLASS; ++ return 1; ++ } ++ else if (size <= 64+32) ++ { ++ classes[0] = X86_64_INTEGER_CLASS; ++ classes[1] = X86_64_INTEGERSI_CLASS; ++ return 2; ++ } ++ else if (size <= 64+64) ++ { ++ classes[0] = classes[1] = X86_64_INTEGER_CLASS; ++ return 2; ++ } ++ else ++ gcc_unreachable (); ++ } ++ case CDImode: ++ case TImode: ++ classes[0] = classes[1] = X86_64_INTEGER_CLASS; ++ return 2; ++ case COImode: ++ case OImode: ++ /* OImode shouldn't be used directly. 
*/
++      gcc_unreachable ();
++    case CTImode:
++      return 0;
++    case SFmode:
++      if (!(bit_offset % 64))
++        classes[0] = X86_64_SSESF_CLASS;
++      else
++        classes[0] = X86_64_SSE_CLASS;
++      return 1;
++    case DFmode:
++      classes[0] = X86_64_SSEDF_CLASS;
++      return 1;
++    case XFmode:
++      classes[0] = X86_64_X87_CLASS;
++      classes[1] = X86_64_X87UP_CLASS;
++      return 2;
++    case TFmode:
++      classes[0] = X86_64_SSE_CLASS;
++      classes[1] = X86_64_SSEUP_CLASS;
++      return 2;
++    case SCmode:
++      classes[0] = X86_64_SSE_CLASS;
++      if (!(bit_offset % 64))
++        return 1;
++      else
++        {
++          static bool warned;
++
++          if (!warned && warn_psabi)
++            {
++              warned = true;
++              inform (input_location,
++                      "The ABI of passing structure with complex float"
++                      " member has changed in GCC 4.4");
++            }
++          classes[1] = X86_64_SSESF_CLASS;
++          return 2;
++        }
++    case DCmode:
++      classes[0] = X86_64_SSEDF_CLASS;
++      classes[1] = X86_64_SSEDF_CLASS;
++      return 2;
++    case XCmode:
++      classes[0] = X86_64_COMPLEX_X87_CLASS;
++      return 1;
++    case TCmode:
++      /* This mode is larger than 16 bytes.  */
++      return 0;
++    case V8SFmode:
++    case V8SImode:
++    case V32QImode:
++    case V16HImode:
++    case V4DFmode:
++    case V4DImode:
++      classes[0] = X86_64_SSE_CLASS;
++      classes[1] = X86_64_SSEUP_CLASS;
++      classes[2] = X86_64_SSEUP_CLASS;
++      classes[3] = X86_64_SSEUP_CLASS;
++      return 4;
++    case V4SFmode:
++    case V4SImode:
++    case V16QImode:
++    case V8HImode:
++    case V2DFmode:
++    case V2DImode:
++      classes[0] = X86_64_SSE_CLASS;
++      classes[1] = X86_64_SSEUP_CLASS;
++      return 2;
++    case V1TImode:
++    case V1DImode:
++    case V2SFmode:
++    case V2SImode:
++    case V4HImode:
++    case V8QImode:
++      classes[0] = X86_64_SSE_CLASS;
++      return 1;
++    case BLKmode:
++    case VOIDmode:
++      return 0;
++    default:
++      gcc_assert (VECTOR_MODE_P (mode));
++
++      if (bytes > 16)
++        return 0;
++
++      gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
++
++      if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
++        classes[0] = X86_64_INTEGERSI_CLASS;
++      else
++        classes[0] = X86_64_INTEGER_CLASS;
++      classes[1] = X86_64_INTEGER_CLASS;
++      return 1 + (bytes > 8);
++    }
++}
++
++/* Examine the argument and return the number of registers required in each
++   class.  Return 0 iff the parameter should be passed in memory.  */
++int
++examine_argument (enum machine_mode mode, const_tree type, int in_return,
++                  int *int_nregs, int *sse_nregs)
++{
++  enum x86_64_reg_class regclass[MAX_CLASSES];
++  int n = classify_argument (mode, type, regclass, 0);
++
++  *int_nregs = 0;
++  *sse_nregs = 0;
++  if (!n)
++    return 0;
++  for (n--; n >= 0; n--)
++    switch (regclass[n])
++      {
++      case X86_64_INTEGER_CLASS:
++      case X86_64_INTEGERSI_CLASS:
++        (*int_nregs)++;
++        break;
++      case X86_64_SSE_CLASS:
++      case X86_64_SSESF_CLASS:
++      case X86_64_SSEDF_CLASS:
++        (*sse_nregs)++;
++        break;
++      case X86_64_NO_CLASS:
++      case X86_64_SSEUP_CLASS:
++        break;
++      case X86_64_X87_CLASS:
++      case X86_64_X87UP_CLASS:
++        if (!in_return)
++          return 0;
++        break;
++      case X86_64_COMPLEX_X87_CLASS:
++        return in_return ? 2 : 0;
++      case X86_64_MEMORY_CLASS:
++        gcc_unreachable ();
++      }
++  return 1;
++}
++
++/* Return true when TYPE should be 128bit aligned for 32bit argument passing
++   ABI.
*/
++bool
++contains_aligned_value_p (tree type)
++{
++  enum machine_mode mode = TYPE_MODE (type);
++  if (((TARGET_SSE && SSE_REG_MODE_P (mode))
++       || mode == TDmode
++       || mode == TFmode
++       || mode == TCmode)
++      && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
++    return true;
++  if (TYPE_ALIGN (type) < 128)
++    return false;
++
++  if (AGGREGATE_TYPE_P (type))
++    {
++      /* Walk the aggregates recursively.  */
++      switch (TREE_CODE (type))
++        {
++        case RECORD_TYPE:
++        case UNION_TYPE:
++        case QUAL_UNION_TYPE:
++          {
++            tree field;
++
++            /* Walk all the structure fields.  */
++            for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
++              {
++                if (TREE_CODE (field) == FIELD_DECL
++                    && contains_aligned_value_p (TREE_TYPE (field)))
++                  return true;
++              }
++            break;
++          }
++
++        case ARRAY_TYPE:
++          /* Just for use if some languages pass arrays by value.  */
++          if (contains_aligned_value_p (TREE_TYPE (type)))
++            return true;
++          break;
++
++        default:
++          gcc_unreachable ();
++        }
++    }
++  return false;
++}
+
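A standalone illustration of the eightbyte merging performed by merge_classes() above (not part of the patch itself): the sketch below models only the simplified rule order spelled out in those comments, deliberately omits the INTEGERSI/SSESF refinement and the X87UP/COMPLEX_X87 variants, uses hypothetical enum names rather than GCC's x86_64_reg_class, and builds with any ISO C compiler.

    /* Minimal, hypothetical model of the merging rules in merge_classes().
       Plain ISO C, no GCC internals.  */
    #include <stdio.h>

    enum reg_class { NO_CLASS, INTEGER, SSE, X87, MEMORY };

    /* Rule order follows the numbered comments above: equal classes win,
       NO_CLASS yields to the other class, MEMORY forces memory, INTEGER
       wins next, any X87 flavour otherwise forces memory, SSE is the
       fallback.  */
    static enum reg_class
    merge (enum reg_class a, enum reg_class b)
    {
      if (a == b) return a;
      if (a == NO_CLASS) return b;
      if (b == NO_CLASS) return a;
      if (a == MEMORY || b == MEMORY) return MEMORY;
      if (a == INTEGER || b == INTEGER) return INTEGER;
      if (a == X87 || b == X87) return MEMORY;
      return SSE;
    }

    int main (void)
    {
      /* union { double d; long l; } occupies a single eightbyte: the double
         member classifies as SSE, the long as INTEGER, and rule #4 makes
         the merged class INTEGER.  */
      enum reg_class eightbyte = merge (SSE, INTEGER);
      printf ("union eightbyte -> %s\n",
              eightbyte == INTEGER ? "INTEGER (general-purpose register)"
                                   : "something else");
      return 0;
    }

Run under any C compiler, the sketch reports INTEGER for that union, which is what the psABI mandates: a union of double and long travels in a general-purpose register rather than an SSE register.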