diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/math/ldexp.cl libclc-0.2.0+git20170213/amdgcn/lib/math/ldexp.cl
--- libclc-0.2.0+git20150813/amdgcn/lib/math/ldexp.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/math/ldexp.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "../../../generic/lib/clcmacro.h"
+
+#ifdef __HAS_LDEXPF__ // Float path: prefer the AMDGCN compiler builtin when __HAS_LDEXPF__ is defined.
+#define BUILTINF __builtin_amdgcn_ldexpf
+#else // Otherwise use __clc_ldexp, which math/clc_ldexp.h is included for.
+#include "math/clc_ldexp.h"
+#define BUILTINF __clc_ldexp
+#endif
+
+// This defines all the ldexp(floatN, intN) variants on top of whichever BUILTINF was selected above.
+_CLC_DEFINE_BINARY_BUILTIN(float, ldexp, BUILTINF, float, int);
+
+#ifdef cl_khr_fp64
+  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+  // This defines all the ldexp(doubleN, intN) variants; the double path always uses the compiler builtin.
+  _CLC_DEFINE_BINARY_BUILTIN(double, ldexp, __builtin_amdgcn_ldexp, double, int);
+#endif
+
+// This defines the ldexp(GENTYPE, int) variants from the shared generic ldexp.inc body.
+#define __CLC_BODY <../../../generic/lib/math/ldexp.inc>
+#include <clc/math/gentype.inc>
+
+#undef BUILTINF // Do not leak the helper macro past this point.
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/SOURCES libclc-0.2.0+git20170213/amdgcn/lib/SOURCES
--- libclc-0.2.0+git20150813/amdgcn/lib/SOURCES	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/SOURCES	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+math/ldexp.cl
+synchronization/barrier_impl.ll
+workitem/get_global_offset.cl
+workitem/get_group_id.cl
+workitem/get_global_size.ll
+workitem/get_local_id.cl
+workitem/get_local_size.ll
+workitem/get_num_groups.ll
+workitem/get_work_dim.cl
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/synchronization/barrier_impl.ll libclc-0.2.0+git20170213/amdgcn/lib/synchronization/barrier_impl.ll
--- libclc-0.2.0+git20150813/amdgcn/lib/synchronization/barrier_impl.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/synchronization/barrier_impl.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,32 @@
+declare i32 @__clc_clk_local_mem_fence() #1
+declare i32 @__clc_clk_global_mem_fence() #1
+declare void @llvm.amdgcn.s.barrier() #0
+
+define void @barrier(i32 %flags) #2 { ; One s.barrier per requested fence bit (local, then global); none if neither bit is set.
+check_local:
+  %local_mask = call i32 @__clc_clk_local_mem_fence()
+  %local_bits = and i32 %flags, %local_mask
+  %want_local = icmp ne i32 %local_bits, 0
+  br i1 %want_local, label %sync_local, label %check_global
+
+sync_local:
+  call void @llvm.amdgcn.s.barrier()
+  br label %check_global
+
+check_global:
+  %global_mask = call i32 @__clc_clk_global_mem_fence()
+  %global_bits = and i32 %flags, %global_mask
+  %want_global = icmp ne i32 %global_bits, 0
+  br i1 %want_global, label %sync_global, label %exit
+
+sync_global:
+  call void @llvm.amdgcn.s.barrier()
+  br label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { nounwind convergent }
+attributes #1 = { nounwind alwaysinline }
+attributes #2 = { nounwind convergent alwaysinline }
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_global_offset.cl libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_global_offset.cl
--- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_global_offset.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_global_offset.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_global_offset(uint dim)
+{
+	/* View the implicit kernel-argument block as uints; the offset for
+	 * dimension d is read from element d + 1. Out-of-range dims give 0. */
+	__attribute__((address_space(2))) uint * implicit_args =
+		(__attribute__((address_space(2))) uint *)
+		__builtin_amdgcn_implicitarg_ptr();
+	return (dim < 3) ? implicit_args[dim + 1] : 0;
+}
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_global_size.ll libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_global_size.ll
--- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_global_size.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_global_size.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,21 @@
+declare i32 @llvm.r600.read.global.size.x() nounwind readnone
+declare i32 @llvm.r600.read.global.size.y() nounwind readnone
+declare i32 @llvm.r600.read.global.size.z() nounwind readnone
+
+define i64 @get_global_size(i32 %dim) nounwind readnone alwaysinline { ; yields 1 when %dim is not 0, 1 or 2
+  switch i32 %dim, label %other [i32 0, label %dim0 i32 1, label %dim1 i32 2, label %dim2]
+dim0:                                 ; X size, zero-extended from i32 to i64
+  %gx = call i32 @llvm.r600.read.global.size.x()
+  %gx.wide = zext i32 %gx to i64
+  ret i64 %gx.wide
+dim1:                                 ; Y size
+  %gy = call i32 @llvm.r600.read.global.size.y()
+  %gy.wide = zext i32 %gy to i64
+  ret i64 %gy.wide
+dim2:                                 ; Z size
+  %gz = call i32 @llvm.r600.read.global.size.z()
+  %gz.wide = zext i32 %gz to i64
+  ret i64 %gz.wide
+other:
+  ret i64 1
+}
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_group_id.cl libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_group_id.cl
--- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_group_id.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_group_id.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_group_id(uint dim) {
+	/* Work-group index along dim; 0 for an out-of-range dim, as the OpenCL C spec requires. */
+	switch(dim) {
+	case 0: return __builtin_amdgcn_workgroup_id_x();
+	case 1: return __builtin_amdgcn_workgroup_id_y();
+	case 2: return __builtin_amdgcn_workgroup_id_z();
+	default: return 0;
+	}
+}
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_local_id.cl libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_local_id.cl
--- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_local_id.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_local_id.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_local_id(uint dim) {
+	/* Work-item index inside its work-group along dim; 0 for an out-of-range dim, as the OpenCL C spec requires. */
+	switch(dim) {
+	case 0: return __builtin_amdgcn_workitem_id_x();
+	case 1: return __builtin_amdgcn_workitem_id_y();
+	case 2: return __builtin_amdgcn_workitem_id_z();
+	default: return 0;
+	}
+}
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_local_size.ll libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_local_size.ll
--- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_local_size.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_local_size.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,21 @@
+declare i32 @llvm.r600.read.local.size.x() nounwind readnone
+declare i32 @llvm.r600.read.local.size.y() nounwind readnone
+declare i32 @llvm.r600.read.local.size.z() nounwind readnone
+
+define i64 @get_local_size(i32 %dim) nounwind readnone alwaysinline { ; yields 1 when %dim is not 0, 1 or 2
+  switch i32 %dim, label %other [i32 0, label %dim0 i32 1, label %dim1 i32 2, label %dim2]
+dim0:                                 ; X size, zero-extended from i32 to i64
+  %lx = call i32 @llvm.r600.read.local.size.x()
+  %lx.wide = zext i32 %lx to i64
+  ret i64 %lx.wide
+dim1:                                 ; Y size
+  %ly = call i32 @llvm.r600.read.local.size.y()
+  %ly.wide = zext i32 %ly to i64
+  ret i64 %ly.wide
+dim2:                                 ; Z size
+  %lz = call i32 @llvm.r600.read.local.size.z()
+  %lz.wide = zext i32 %lz to i64
+  ret i64 %lz.wide
+other:
+  ret i64 1
+}
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_num_groups.ll libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_num_groups.ll
--- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_num_groups.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_num_groups.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,21 @@
+declare i32 @llvm.r600.read.ngroups.x() nounwind readnone
+declare i32 @llvm.r600.read.ngroups.y() nounwind readnone
+declare i32 @llvm.r600.read.ngroups.z() nounwind readnone
+
+define i64 @get_num_groups(i32 %dim) nounwind readnone alwaysinline { ; yields 1 when %dim is not 0, 1 or 2
+  switch i32 %dim, label %other [i32 0, label %dim0 i32 1, label %dim1 i32 2, label %dim2]
+dim0:                                 ; X group count, zero-extended from i32 to i64
+  %nx = call i32 @llvm.r600.read.ngroups.x()
+  %nx.wide = zext i32 %nx to i64
+  ret i64 %nx.wide
+dim1:                                 ; Y group count
+  %ny = call i32 @llvm.r600.read.ngroups.y()
+  %ny.wide = zext i32 %ny to i64
+  ret i64 %ny.wide
+dim2:                                 ; Z group count
+  %nz = call i32 @llvm.r600.read.ngroups.z()
+  %nz.wide = zext i32 %nz to i64
+  ret i64 %nz.wide
+other:
+  ret i64 1
+}
diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_work_dim.cl libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_work_dim.cl
--- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_work_dim.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_work_dim.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_DEF uint get_work_dim()
+{
+	/* The first uint of the implicit kernel-argument block is returned as-is. */
+	__attribute__((address_space(2))) uint * implicit_args =
+		(__attribute__((address_space(2))) uint *)__builtin_amdgcn_implicitarg_ptr();
+	return *implicit_args;
+}
diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/OVERRIDES libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/OVERRIDES
--- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/OVERRIDES	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/OVERRIDES	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1 @@
+workitem/get_num_groups.ll
diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/SOURCES libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/SOURCES
--- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/SOURCES	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/SOURCES	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,3 @@
+workitem/get_global_size.ll
+workitem/get_local_size.ll
+workitem/get_num_groups.cl
diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_global_size.ll libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_global_size.ll
--- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_global_size.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_global_size.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,39 @@
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+define i64 @get_global_size(i32 %dim) #1 { ; Loads a 32-bit size from the dispatch pointer at byte offset 12/16/20 for dim 0/1/2; 1 otherwise.
+  %packet = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  switch i32 %dim, label %other [
+    i32 0, label %dim_x
+    i32 1, label %dim_y
+    i32 2, label %dim_z
+  ]
+
+dim_x:
+  %x.addr8 = getelementptr inbounds i8, i8 addrspace(2)* %packet, i64 12
+  %x.addr = bitcast i8 addrspace(2)* %x.addr8 to i32 addrspace(2)*
+  %x.val = load i32, i32 addrspace(2)* %x.addr, align 4, !invariant.load !0
+  %x.wide = zext i32 %x.val to i64
+  ret i64 %x.wide
+
+dim_y:
+  %y.addr8 = getelementptr inbounds i8, i8 addrspace(2)* %packet, i64 16
+  %y.addr = bitcast i8 addrspace(2)* %y.addr8 to i32 addrspace(2)*
+  %y.val = load i32, i32 addrspace(2)* %y.addr, align 4, !invariant.load !0
+  %y.wide = zext i32 %y.val to i64
+  ret i64 %y.wide
+
+dim_z:
+  %z.addr8 = getelementptr inbounds i8, i8 addrspace(2)* %packet, i64 20
+  %z.addr = bitcast i8 addrspace(2)* %z.addr8 to i32 addrspace(2)*
+  %z.val = load i32, i32 addrspace(2)* %z.addr, align 4, !invariant.load !0
+  %z.wide = zext i32 %z.val to i64
+  ret i64 %z.wide
+
+other:
+  ret i64 1
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { alwaysinline norecurse nounwind readonly }
+
+!0 = !{}
diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_local_size.ll libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_local_size.ll
--- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_local_size.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_local_size.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,38 @@
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+define i64 @get_local_size(i32 %dim) #1 { ; Decodes the size for %dim from words read through the dispatch pointer; 1 for any other %dim.
+  %dispatch_ptr = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %dispatch_ptr_i32 = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
+  %xy_size_ptr = getelementptr inbounds i32, i32 addrspace(2)* %dispatch_ptr_i32, i64 1 ; i32 index 1, i.e. byte offset 4
+  %xy_size = load i32, i32 addrspace(2)* %xy_size_ptr, align 4, !invariant.load !0 ; loaded before the switch, whatever %dim is
+  switch i32 %dim, label %default [
+    i32 0, label %x_dim
+    i32 1, label %y_dim
+    i32 2, label %z_dim
+  ]
+
+x_dim: ; X is taken from the low 16 bits of the shared word.
+  %x_size = and i32 %xy_size, 65535
+  %x_size.ext = zext i32 %x_size to i64
+  ret i64 %x_size.ext
+
+y_dim: ; Y is taken from the high 16 bits of the shared word.
+  %y_size = lshr i32 %xy_size, 16
+  %y_size.ext = zext i32 %y_size to i64
+  ret i64 %y_size.ext
+
+z_dim: ; Z is a full 32-bit load from i32 index 2; !range !1 tells the optimizer it lies in [0, 257).
+  %z_size_ptr = getelementptr inbounds i32, i32 addrspace(2)* %dispatch_ptr_i32, i64 2
+  %z_size = load i32, i32 addrspace(2)* %z_size_ptr, align 4, !invariant.load !0, !range !1
+  %z_size.ext = zext i32 %z_size to i64
+  ret i64 %z_size.ext
+
+default:
+  ret i64 1
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { alwaysinline norecurse nounwind readonly }
+
+!0 = !{}
+!1 = !{ i32 0, i32 257 }
diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_num_groups.cl libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_num_groups.cl
--- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_num_groups.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_num_groups.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,12 @@
+
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_num_groups(uint dim) {
+  /* Ceiling division of the global size by the work-group size, so a
+   * trailing partial work-group is still counted. */
+  const size_t total = get_global_size(dim);
+  const size_t per_group = get_local_size(dim);
+  const size_t whole = total / per_group;
+  const size_t partial = (total % per_group != 0) ? 1 : 0;
+  return whole + partial;
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/atomic/atomic.cl libclc-0.2.0+git20170213/amdgpu/lib/atomic/atomic.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/atomic/atomic.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/atomic/atomic.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,65 @@
+#include <clc/clc.h>
+
+#define ATOMIC_FUNC_DEFINE(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) /* Defines the CL_FUNCTION(p, val) overload; it casts its arguments to ARG_SIGN TYPE, calls __clc_<CLC_FUNCTION>_addr<LLVM_ADDRSPACE>, and casts the result back to RET_SIGN TYPE. */ \
+_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE val) { \
+	return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)val); \
+}
+
+/* For atomic functions that don't need different bitcode depending on argument signedness: a single signed helper is declared, and both the signed and the unsigned overload forward to it. */
+#define ATOMIC_FUNC_SIGN(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+	_CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE); \
+	ATOMIC_FUNC_DEFINE(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+	ATOMIC_FUNC_DEFINE(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE)
+
+#define ATOMIC_FUNC_ADDRSPACE(TYPE, FUNCTION) /* global pointers use the _addr1 helper, local pointers the _addr3 helper */ \
+	ATOMIC_FUNC_SIGN(TYPE, FUNCTION, global, 1) \
+	ATOMIC_FUNC_SIGN(TYPE, FUNCTION, local, 3)
+
+#define ATOMIC_FUNC(FUNCTION) /* entry point for two-argument atomics; only int (signed and unsigned) is generated */ \
+	ATOMIC_FUNC_ADDRSPACE(int, FUNCTION)
+
+#define ATOMIC_FUNC_DEFINE_3_ARG(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) /* Same shape as ATOMIC_FUNC_DEFINE, for helpers that take (p, cmp, val). */ \
+_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE cmp, RET_SIGN TYPE val) { \
+	return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)cmp, (ARG_SIGN TYPE)val); \
+}
+
+/* For three-argument atomic functions that don't need different bitcode depending on argument signedness: one signed helper serves both the signed and the unsigned overload. */
+#define ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+	_CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE, signed TYPE); \
+	ATOMIC_FUNC_DEFINE_3_ARG(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
+	ATOMIC_FUNC_DEFINE_3_ARG(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE)
+
+#define ATOMIC_FUNC_ADDRSPACE_3_ARG(TYPE, FUNCTION) /* global pointers use the _addr1 helper, local pointers the _addr3 helper */ \
+	ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, global, 1) \
+	ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, local, 3)
+
+#define ATOMIC_FUNC_3_ARG(FUNCTION) /* entry point for three-argument atomics; only int (signed and unsigned) is generated */ \
+	ATOMIC_FUNC_ADDRSPACE_3_ARG(int, FUNCTION)
+
+ATOMIC_FUNC(atomic_add) /* each line here expands to int and unsigned int overloads for global and local pointers */
+ATOMIC_FUNC(atomic_and)
+ATOMIC_FUNC(atomic_or)
+ATOMIC_FUNC(atomic_sub)
+ATOMIC_FUNC(atomic_xchg)
+ATOMIC_FUNC(atomic_xor)
+ATOMIC_FUNC_3_ARG(atomic_cmpxchg) /* the only three-argument (p, cmp, val) atomic in this file */
+
+_CLC_DECL signed int __clc_atomic_max_addr1(volatile global signed int*, signed int);
+_CLC_DECL signed int __clc_atomic_max_addr3(volatile local signed int*, signed int);
+_CLC_DECL uint __clc_atomic_umax_addr1(volatile global uint*, uint);
+_CLC_DECL uint __clc_atomic_umax_addr3(volatile local uint*, uint);
+/* atomic_max uses a separate helper per signedness: signed overloads call __clc_atomic_max_*, unsigned overloads call __clc_atomic_umax_*. */
+ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, global, 1)
+ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, local, 3)
+ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, global, 1)
+ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, local, 3)
+
+_CLC_DECL signed int __clc_atomic_min_addr1(volatile global signed int*, signed int);
+_CLC_DECL signed int __clc_atomic_min_addr3(volatile local signed int*, signed int);
+_CLC_DECL uint __clc_atomic_umin_addr1(volatile global uint*, uint);
+_CLC_DECL uint __clc_atomic_umin_addr3(volatile local uint*, uint);
+/* atomic_min uses a separate helper per signedness: signed overloads call __clc_atomic_min_*, unsigned overloads call __clc_atomic_umin_*. */
+ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, global, 1)
+ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, local, 3)
+ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, global, 1)
+ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, local, 3)
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_attributes_impl.ll libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_attributes_impl.ll
--- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_attributes_impl.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_attributes_impl.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,87 @@
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+
+declare i32 @llvm.OpenCL.image.get.resource.id.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare i32 @llvm.OpenCL.image.get.resource.id.3d(
+  %opencl.image3d_t addrspace(1)*) nounwind readnone
+
+declare [3 x i32] @llvm.OpenCL.image.get.size.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare [3 x i32] @llvm.OpenCL.image.get.size.3d(
+  %opencl.image3d_t addrspace(1)*) nounwind readnone
+
+declare [2 x i32] @llvm.OpenCL.image.get.format.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare [2 x i32] @llvm.OpenCL.image.get.format.3d(
+  %opencl.image3d_t addrspace(1)*) nounwind readnone
+
+define i32 @__clc_get_image_width_2d(
+    %opencl.image2d_t addrspace(1)* nocapture %img) #0 {
+  ; Width is element 0 of the [3 x i32] size triple.
+  %dims = tail call [3 x i32] @llvm.OpenCL.image.get.size.2d(%opencl.image2d_t addrspace(1)* %img)
+  %width = extractvalue [3 x i32] %dims, 0
+  ret i32 %width
+}
+define i32 @__clc_get_image_width_3d(
+    %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  ; Width is element 0 of the [3 x i32] size triple.
+  %dims = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)* %img)
+  %width = extractvalue [3 x i32] %dims, 0
+  ret i32 %width
+}
+
+define i32 @__clc_get_image_height_2d(
+    %opencl.image2d_t addrspace(1)* nocapture %img) #0 {
+  ; Height is element 1 of the [3 x i32] size triple.
+  %dims = tail call [3 x i32] @llvm.OpenCL.image.get.size.2d(%opencl.image2d_t addrspace(1)* %img)
+  %height = extractvalue [3 x i32] %dims, 1
+  ret i32 %height
+}
+define i32 @__clc_get_image_height_3d(
+    %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  ; Height is element 1 of the [3 x i32] size triple.
+  %dims = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)* %img)
+  %height = extractvalue [3 x i32] %dims, 1
+  ret i32 %height
+}
+
+define i32 @__clc_get_image_depth_3d(
+    %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  ; Depth is element 2 of the [3 x i32] size triple.
+  %dims = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d(%opencl.image3d_t addrspace(1)* %img)
+  %depth = extractvalue [3 x i32] %dims, 2
+  ret i32 %depth
+}
+
+define i32 @__clc_get_image_channel_data_type_2d(
+    %opencl.image2d_t addrspace(1)* nocapture %img) #0 {
+  ; The channel data type is element 0 of the [2 x i32] format pair.
+  %format = tail call [2 x i32] @llvm.OpenCL.image.get.format.2d(%opencl.image2d_t addrspace(1)* %img)
+  %data_type = extractvalue [2 x i32] %format, 0
+  ret i32 %data_type
+}
+define i32 @__clc_get_image_channel_data_type_3d(
+    %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  ; The channel data type is element 0 of the [2 x i32] format pair.
+  %format = tail call [2 x i32] @llvm.OpenCL.image.get.format.3d(%opencl.image3d_t addrspace(1)* %img)
+  %data_type = extractvalue [2 x i32] %format, 0
+  ret i32 %data_type
+}
+
+define i32 @__clc_get_image_channel_order_2d(
+    %opencl.image2d_t addrspace(1)* nocapture %img) #0 {
+  ; The channel order is element 1 of the [2 x i32] format pair.
+  %format = tail call [2 x i32] @llvm.OpenCL.image.get.format.2d(%opencl.image2d_t addrspace(1)* %img)
+  %order = extractvalue [2 x i32] %format, 1
+  ret i32 %order
+}
+define i32 @__clc_get_image_channel_order_3d(
+    %opencl.image3d_t addrspace(1)* nocapture %img) #0 {
+  ; The channel order is element 1 of the [2 x i32] format pair.
+  %format = tail call [2 x i32] @llvm.OpenCL.image.get.format.3d(%opencl.image3d_t addrspace(1)* %img)
+  %order = extractvalue [2 x i32] %format, 1
+  ret i32 %order
+}
+
+attributes #0 = { nounwind readnone alwaysinline }
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_channel_data_type.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_channel_data_type.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_channel_data_type.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_channel_data_type.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_channel_data_type_2d(image2d_t);
+_CLC_DECL int __clc_get_image_channel_data_type_3d(image3d_t);
+/* Public overloads: each forwards to the helper declared above for its image type and returns the result unchanged. */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_channel_data_type(image2d_t image) {
+  return __clc_get_image_channel_data_type_2d(image);
+}
+_CLC_OVERLOAD _CLC_DEF int
+get_image_channel_data_type(image3d_t image) {
+  return __clc_get_image_channel_data_type_3d(image);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_channel_order.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_channel_order.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_channel_order.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_channel_order.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_channel_order_2d(image2d_t);
+_CLC_DECL int __clc_get_image_channel_order_3d(image3d_t);
+/* Public overloads: each forwards to the helper declared above for its image type and returns the result unchanged. */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_channel_order(image2d_t image) {
+  return __clc_get_image_channel_order_2d(image);
+}
+_CLC_OVERLOAD _CLC_DEF int
+get_image_channel_order(image3d_t image) {
+  return __clc_get_image_channel_order_3d(image);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_depth.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_depth.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_depth.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_depth.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,8 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_depth_3d(image3d_t);
+/* Only a 3D overload is defined here; it forwards to the helper declared above. */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_depth(image3d_t image) {
+	return __clc_get_image_depth_3d(image);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_height.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_height.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_height.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_height.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_height_2d(image2d_t);
+_CLC_DECL int __clc_get_image_height_3d(image3d_t);
+/* Public overloads: each forwards to the helper declared above for its image type and returns the result unchanged. */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_height(image2d_t image) {
+  return __clc_get_image_height_2d(image);
+}
+_CLC_OVERLOAD _CLC_DEF int
+get_image_height(image3d_t image) {
+  return __clc_get_image_height_3d(image);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_width.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_width.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_width.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_width.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+_CLC_DECL int __clc_get_image_width_2d(image2d_t);
+_CLC_DECL int __clc_get_image_width_3d(image3d_t);
+/* Public overloads: each forwards to the helper declared above for its image type and returns the result unchanged. */
+_CLC_OVERLOAD _CLC_DEF int
+get_image_width(image2d_t image) {
+  return __clc_get_image_width_2d(image);
+}
+_CLC_OVERLOAD _CLC_DEF int
+get_image_width(image3d_t image) {
+  return __clc_get_image_width_3d(image);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/read_imagef.cl libclc-0.2.0+git20170213/amdgpu/lib/image/read_imagef.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/read_imagef.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/read_imagef.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,14 @@
+#include <clc/clc.h>
+
+_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2);
+/* Both overloads call the float2-coordinate helper above; the int2 overload converts its coordinates to float first. */
+_CLC_OVERLOAD _CLC_DEF float4 read_imagef(image2d_t image, sampler_t sampler,
+                                          int2 coord) {
+  float2 coord_float = (float2)(coord.x, coord.y);
+  return __clc_read_imagef_tex(image, sampler, coord_float);
+}
+
+_CLC_OVERLOAD _CLC_DEF float4 read_imagef(image2d_t image, sampler_t sampler,
+                                          float2 coord) {
+  return __clc_read_imagef_tex(image, sampler, coord);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/read_imagei.cl libclc-0.2.0+git20170213/amdgpu/lib/image/read_imagei.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/read_imagei.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/read_imagei.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,23 @@
+#include <clc/clc.h>
+
+_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2);
+
+int4 __clc_reinterpret_v4f_to_v4i(float4 v) {
+  /* Hand back the bits of v unchanged, viewed as four ints. This is a
+   * reinterpretation through a union, not a float-to-int conversion. */
+  union { float4 as_float; int4 as_int; } pun;
+  pun.as_float = v;
+  return pun.as_int;
+}
+
+_CLC_OVERLOAD _CLC_DEF int4 read_imagei(image2d_t image, sampler_t sampler,
+                                        int2 coord) {
+  float2 coord_float = (float2)(coord.x, coord.y); /* the tex helper is declared with float2 coordinates */
+  return __clc_reinterpret_v4f_to_v4i( /* reinterpret the float4 result's bits; no numeric conversion */
+    __clc_read_imagef_tex(image, sampler, coord_float));
+}
+_CLC_OVERLOAD _CLC_DEF int4 read_imagei(image2d_t image, sampler_t sampler,
+                                        float2 coord) {
+  return __clc_reinterpret_v4f_to_v4i( /* float coordinates are passed straight through */
+    __clc_read_imagef_tex(image, sampler, coord));
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/read_image_impl.ll libclc-0.2.0+git20170213/amdgpu/lib/image/read_image_impl.ll
--- libclc-0.2.0+git20150813/amdgpu/lib/image/read_image_impl.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/read_image_impl.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,46 @@
+%opencl.image2d_t = type opaque
+
+declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32,
+                                   i32, i32, i32) readnone
+declare i32 @llvm.OpenCL.image.get.resource.id.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare i32 @llvm.OpenCL.sampler.get.resource.id(i32) readnone
+
+define <4 x float> @__clc_v4f_from_v2f(<2 x float> %v) alwaysinline { ; (x, y) -> (x, y, 0.0, 0.0)
+  %x = extractelement <2 x float> %v, i32 0
+  %y = extractelement <2 x float> %v, i32 1
+  %with.x = insertelement <4 x float> undef, float %x, i32 0
+  %with.xy = insertelement <4 x float> %with.x, float %y, i32 1
+  %with.xyz = insertelement <4 x float> %with.xy, float 0.0, i32 2
+  %padded = insertelement <4 x float> %with.xyz, float 0.0, i32 3
+  ret <4 x float> %padded
+}
+
+define <4 x float> @__clc_read_imagef_tex(
+    %opencl.image2d_t addrspace(1)* nocapture %img,
+    i32 %sampler, <2 x float> %coord) alwaysinline { ; Samples %img at %coord and returns the raw <4 x float> texel.
+entry:
+  %coord_v4 = call <4 x float> @__clc_v4f_from_v2f(<2 x float> %coord)
+  %smp_id = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %sampler)
+  %img_id = call i32 @llvm.OpenCL.image.get.resource.id.2d(
+      %opencl.image2d_t addrspace(1)* %img)
+  %tex_id = add i32 %img_id, 2    ; First 2 IDs are reserved.
+  ; Bit 0 of the sampler value selects normalized vs. unnormalized coordinates.
+  %coord_norm = and i32 %sampler, 1
+  %is_norm = icmp eq i32 %coord_norm, 1
+  br i1 %is_norm, label %NormCoord, label %UnnormCoord
+NormCoord:
+  %data.norm = call <4 x float> @llvm.R600.tex(
+      <4 x float> %coord_v4,
+      i32 0, i32 0, i32 0,        ; Offset.
+      i32 %tex_id, i32 %smp_id,   ; Was the constant 2, which only matched the image whose resource ID is 0.
+      i32 1, i32 1, i32 1, i32 1) ; Normalized coords.
+  ret <4 x float> %data.norm
+UnnormCoord:
+  %data.unnorm = call <4 x float> @llvm.R600.tex(
+      <4 x float> %coord_v4,
+      i32 0, i32 0, i32 0,        ; Offset.
+      i32 %tex_id, i32 %smp_id,
+      i32 0, i32 0, i32 0, i32 0) ; Unnormalized coords.
+  ret <4 x float> %data.unnorm
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/read_imageui.cl libclc-0.2.0+git20170213/amdgpu/lib/image/read_imageui.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/read_imageui.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/read_imageui.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,23 @@
+#include <clc/clc.h>
+
+_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2);
+
+uint4 __clc_reinterpret_v4f_to_v4ui(float4 v) {
+  /* Hand back the bits of v unchanged, viewed as four uints. This is a
+   * reinterpretation through a union, not a float-to-uint conversion. */
+  union { float4 as_float; uint4 as_uint; } pun;
+  pun.as_float = v;
+  return pun.as_uint;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint4 read_imageui(image2d_t image, sampler_t sampler,
+                                          int2 coord) {
+  float2 coord_float = (float2)(coord.x, coord.y); /* the tex helper is declared with float2 coordinates */
+  return __clc_reinterpret_v4f_to_v4ui( /* reinterpret the float4 result's bits; no numeric conversion */
+    __clc_read_imagef_tex(image, sampler, coord_float));
+}
+_CLC_OVERLOAD _CLC_DEF uint4 read_imageui(image2d_t image, sampler_t sampler,
+                                          float2 coord) {
+  return __clc_reinterpret_v4f_to_v4ui( /* float coordinates are passed straight through */
+    __clc_read_imagef_tex(image, sampler, coord));
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/write_imagef.cl libclc-0.2.0+git20170213/amdgpu/lib/image/write_imagef.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/write_imagef.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/write_imagef.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_DECL void __clc_write_imagef_2d(image2d_t image, int2 coord, float4 color);
+/* Public 2D overload: passes all three arguments unchanged to the helper declared above. */
+_CLC_OVERLOAD _CLC_DEF void
+write_imagef(image2d_t image, int2 coord, float4 color)
+{
+  __clc_write_imagef_2d(image, coord, color);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/write_imagei.cl libclc-0.2.0+git20170213/amdgpu/lib/image/write_imagei.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/write_imagei.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/write_imagei.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_DECL void __clc_write_imagei_2d(image2d_t image, int2 coord, int4 color);
+/* Public 2D overload: passes all three arguments unchanged to the helper declared above. */
+_CLC_OVERLOAD _CLC_DEF void
+write_imagei(image2d_t image, int2 coord, int4 color)
+{
+  __clc_write_imagei_2d(image, coord, color);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/write_image_impl.ll libclc-0.2.0+git20170213/amdgpu/lib/image/write_image_impl.ll
--- libclc-0.2.0+git20150813/amdgpu/lib/image/write_image_impl.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/write_image_impl.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,52 @@
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+
+declare i32 @llvm.OpenCL.image.get.resource.id.2d(
+  %opencl.image2d_t addrspace(1)*) nounwind readnone
+declare i32 @llvm.OpenCL.image.get.resource.id.3d(
+  %opencl.image3d_t addrspace(1)*) nounwind readnone
+
+declare void @llvm.r600.rat.store.typed(<4 x i32> %color, <4 x i32> %coord, i32 %rat_id)
+
+define void @__clc_write_imageui_2d(
+    %opencl.image2d_t addrspace(1)* nocapture %img,
+    <2 x i32> %coord, <4 x i32> %color) #0 {
+  ; Passes %color and the widened %coord to llvm.r600.rat.store.typed.
+  ; Coordinate int2 -> int4: lanes 0/1 come from %coord, lanes 2/3 are 0.
+  %e0 = extractelement <2 x i32> %coord, i32 0
+  %e1 = extractelement <2 x i32> %coord, i32 1
+  %coord.0 = insertelement <4 x i32> undef,    i32 %e0, i32 0
+  %coord.1 = insertelement <4 x i32> %coord.0, i32 %e1, i32 1
+  %coord.2 = insertelement <4 x i32> %coord.1, i32 0,  i32 2
+  %coord.3 = insertelement <4 x i32> %coord.2, i32 0,  i32 3
+  ; The RAT id handed to the store is the image's resource id (queried
+  ; through the get.resource.id.2d intrinsic) plus one.
+  %img_id = call i32 @llvm.OpenCL.image.get.resource.id.2d(
+      %opencl.image2d_t addrspace(1)* %img)
+  %rat_id = add i32 %img_id, 1
+  ; Call the store intrinsic with the color, the int4 coordinate and the
+  ; RAT id computed above.
+  call void @llvm.r600.rat.store.typed(<4 x i32> %color, <4 x i32> %coord.3, i32 %rat_id)
+  ret void
+}
+
+define void @__clc_write_imagei_2d(  ; int4-color entry point
+    %opencl.image2d_t addrspace(1)* nocapture %img,
+    <2 x i32> %coord, <4 x i32> %color) #0 {
+  call void @__clc_write_imageui_2d(  ; same <4 x i32> payload: forward all arguments unchanged
+      %opencl.image2d_t addrspace(1)* nocapture %img,
+      <2 x i32> %coord, <4 x i32> %color)
+  ret void
+}
+
+define void @__clc_write_imagef_2d(  ; float4-color entry point
+    %opencl.image2d_t addrspace(1)* nocapture %img,
+    <2 x i32> %coord, <4 x float> %color) #0 {
+  %color.i32 = bitcast <4 x float> %color to <4 x i32>  ; reuse the float bits as i32 lanes, no value conversion
+  call void @__clc_write_imageui_2d(  ; forward with the bitcast color
+      %opencl.image2d_t addrspace(1)* nocapture %img,
+      <2 x i32> %coord, <4 x i32> %color.i32)
+  ret void
+}
+
+attributes #0 = { alwaysinline }
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/write_imageui.cl libclc-0.2.0+git20170213/amdgpu/lib/image/write_imageui.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/image/write_imageui.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/image/write_imageui.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_DECL void __clc_write_imageui_2d(image2d_t image, int2 coord, uint4 color);
+
+/* Thin wrapper: forwards every argument to __clc_write_imageui_2d. */
+_CLC_OVERLOAD _CLC_DEF void write_imageui(image2d_t image, int2 coord,
+                                          uint4 color) {
+  __clc_write_imageui_2d(image, coord, color);
+}
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/math/nextafter.cl libclc-0.2.0+git20170213/amdgpu/lib/math/nextafter.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/math/nextafter.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/math/nextafter.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+#include "../lib/clcmacro.h"
+
+_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+_CLC_DEFINE_BINARY_BUILTIN(double, nextafter, __clc_nextafter, double, double)
+#endif
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/math/sqrt.cl libclc-0.2.0+git20170213/amdgpu/lib/math/sqrt.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/math/sqrt.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/math/sqrt.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "../../../generic/lib/clcmacro.h"
+#include "math/clc_sqrt.h"
+
+_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#ifdef __AMDGCN__
+  #define __clc_builtin_rsq __builtin_amdgcn_rsq
+#else
+  #define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee
+#endif
+
+_CLC_OVERLOAD _CLC_DEF double sqrt(double x) {
+  // Double sqrt: seed from __clc_builtin_rsq, then refine with fma steps.
+  uint vcc = x < 0x1p-767;    // inputs below 2^-767 are pre-scaled
+  int exp0 = vcc ? 256 : 0;   // pre-scale x by 2^256 ...
+  int exp1 = vcc ? -128 : 0;  // ... and undo it on the root: sqrt(2^256) == 2^128
+  // Seed: v23 is the builtin's result for v01 (presumably ~1/sqrt), v45 = v01 * v23.
+  double v01 = ldexp(x, exp0);
+  double v23 = __clc_builtin_rsq(v01);
+  double v45 = v01 * v23;
+  v23 = v23 * 0.5;
+  // Refinement: each step is a single fma; the statement order is significant.
+  double v67 = fma(-v23, v45, 0.5);
+  v45 = fma(v45, v67, v45);
+  double v89 = fma(-v45, v45, v01);
+  v23 = fma(v23, v67, v23);
+  v45 = fma(v89, v23, v45);
+  v67 = fma(-v45, v45, v01);
+  v23 = fma(v67, v23, v45);
+  // Undo the pre-scaling; +infinity and (signed) zero return the scaled input.
+  v23 = ldexp(v23, exp1);
+  return ((x == __builtin_inf()) || (x == 0.0)) ? v01 : v23;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double);
+
+#endif
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/OVERRIDES libclc-0.2.0+git20170213/amdgpu/lib/OVERRIDES
--- libclc-0.2.0+git20150813/amdgpu/lib/OVERRIDES	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/OVERRIDES	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,2 @@
+workitem/get_group_id.cl
+workitem/get_global_size.cl
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/SOURCES libclc-0.2.0+git20170213/amdgpu/lib/SOURCES
--- libclc-0.2.0+git20150813/amdgpu/lib/SOURCES	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/SOURCES	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,18 @@
+atomic/atomic.cl
+math/nextafter.cl
+math/sqrt.cl
+synchronization/barrier.cl
+image/get_image_width.cl
+image/get_image_height.cl
+image/get_image_depth.cl
+image/get_image_channel_data_type.cl
+image/get_image_channel_order.cl
+image/get_image_attributes_impl.ll
+image/read_imagef.cl
+image/read_imagei.cl
+image/read_imageui.cl
+image/read_image_impl.ll
+image/write_imagef.cl
+image/write_imagei.cl
+image/write_imageui.cl
+image/write_image_impl.ll
diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/synchronization/barrier.cl libclc-0.2.0+git20170213/amdgpu/lib/synchronization/barrier.cl
--- libclc-0.2.0+git20150813/amdgpu/lib/synchronization/barrier.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/amdgpu/lib/synchronization/barrier.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,10 @@
+
+#include <clc/clc.h>
+
+/* Function-call access to the CLK_LOCAL_MEM_FENCE flag:
+ * returns the macro's value as an int. */
+_CLC_DEF int __clc_clk_local_mem_fence() { return CLK_LOCAL_MEM_FENCE; }
+
+/* Function-call access to the CLK_GLOBAL_MEM_FENCE flag:
+ * returns the macro's value as an int. */
+_CLC_DEF int __clc_clk_global_mem_fence() { return CLK_GLOBAL_MEM_FENCE; }
diff -Nru libclc-0.2.0+git20150813/configure.py libclc-0.2.0+git20170213/configure.py
--- libclc-0.2.0+git20150813/configure.py	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/configure.py	2017-02-12 21:33:49.000000000 +0000
@@ -69,8 +69,8 @@
 llvm_int_version = int(llvm_version[0]) * 100 + int(llvm_version[1]) * 10
 llvm_string_version = 'LLVM' + llvm_version[0] + '.' + llvm_version[1]
 
-if llvm_int_version < 370:
-    print "libclc requires LLVM >= 3.7"
+if llvm_int_version < 400:
+    print "libclc requires LLVM >= 4.0"
     sys.exit(1)
 
 llvm_system_libs = llvm_config(['--system-libs'])
@@ -92,18 +92,22 @@
 available_targets = {
   'r600--' : { 'devices' :
                [{'gpu' : 'cedar',   'aliases' : ['palm', 'sumo', 'sumo2', 'redwood', 'juniper']},
-                {'gpu' : 'cypress', 'aliases' : ['hemlock']},
-                {'gpu' : 'barts',   'aliases' : ['turks', 'caicos']},
-                {'gpu' : 'cayman',  'aliases' : ['aruba']}]},
+                {'gpu' : 'cypress', 'aliases' : ['hemlock'] },
+                {'gpu' : 'barts',   'aliases' : ['turks', 'caicos'] },
+                {'gpu' : 'cayman',  'aliases' : ['aruba']} ]},
   'amdgcn--': { 'devices' :
-                [{'gpu' : 'tahiti',  'aliases' : ['pitcairn', 'verde', 'oland', 'hainan', 'bonaire', 'kabini', 'kaveri', 'hawaii','mullins']}]},
-  'nvptx--'   : { 'devices' : [{'gpu' : '', 'aliases' : []}]},
-  'nvptx64--'   : { 'devices' : [{'gpu' : '', 'aliases' : []}] },
-  'nvptx--nvidiacl'   : { 'devices' : [{'gpu' : '', 'aliases' : []}] },
-  'nvptx64--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []}] }
+                [{'gpu' : 'tahiti', 'aliases' : ['pitcairn', 'verde', 'oland', 'hainan', 'bonaire', 'kabini', 'kaveri', 'hawaii','mullins','tonga','carrizo','iceland','fiji','stoney','polaris10','polaris11']} ]},
+  'amdgcn--amdhsa': { 'devices' :
+                      [{'gpu' : '', 'aliases' : ['bonaire', 'hawaii', 'kabini', 'kaveri', 'mullins', 'carrizo', 'stoney', 'fiji', 'iceland', 'tonga','polaris10','polaris11']} ]},
+  'nvptx--'   : { 'devices' : [{'gpu' : '', 'aliases' : []} ]},
+  'nvptx64--' : { 'devices' : [{'gpu' : '', 'aliases' : []} ]},
+  'nvptx--nvidiacl'   : { 'devices' : [{'gpu' : '', 'aliases' : []} ]},
+  'nvptx64--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []} ]},
 }
 
-default_targets = ['nvptx--nvidiacl', 'nvptx64--nvidiacl', 'r600--', 'amdgcn--']
+available_targets['amdgcn-mesa-mesa3d'] = available_targets['amdgcn--']
+
+default_targets = ['nvptx--nvidiacl', 'nvptx64--nvidiacl', 'r600--', 'amdgcn--', 'amdgcn--amdhsa', 'amdgcn-mesa-mesa3d']
 
 targets = args
 if not targets:
@@ -165,9 +169,11 @@
   for arch in archs:
     subdirs.append("%s-%s-%s" % (arch, t_vendor, t_os))
     subdirs.append("%s-%s" % (arch, t_os))
+    if t_os == 'mesa3d':
+        subdirs.append('amdgcn-amdhsa')
     subdirs.append(arch)
-    if arch == 'amdgcn':
-        subdirs.append('r600')
+    if arch == 'amdgcn' or arch == 'r600':
+        subdirs.append('amdgpu')
 
   incdirs = filter(os.path.isdir,
                [os.path.join(srcdir, subdir, 'include') for subdir in subdirs])
@@ -180,9 +186,6 @@
     # The rule for building a .bc file for the specified architecture using clang.
     clang_bc_flags = "-target %s -I`dirname $in` %s " \
                      "-fno-builtin " \
-                     "-Dcl_clang_storage_class_specifiers " \
-                     "-Dcl_khr_fp64 " \
-                     "-Dcles_khr_int64 " \
                      "-D__CLC_INTERNAL " \
                      "-emit-llvm" % (target, clang_cl_includes)
     if device['gpu'] != '':
diff -Nru libclc-0.2.0+git20150813/debian/changelog libclc-0.2.0+git20170213/debian/changelog
--- libclc-0.2.0+git20150813/debian/changelog	2015-09-27 19:55:56.000000000 +0000
+++ libclc-0.2.0+git20170213/debian/changelog	2017-06-08 11:11:13.000000000 +0000
@@ -1,3 +1,55 @@
+libclc (0.2.0+git20170213-1~16.04.1) xenial; urgency=medium
+
+  * Backport to xenial. (LP: #1687981)
+  * Don't use debhelper 10.
+
+ -- Timo Aaltonen <tjaalton@debian.org>  Fri, 24 Mar 2017 10:11:06 +0200
+
+libclc (0.2.0+git20170213-1) experimental; urgency=medium
+
+  [ Andreas Boll ]
+  * Simplify clang version updates even more.
+
+  [ Timo Aaltonen ]
+  * New upstream snapshot.
+  * clang: Bump clang version to 4.0.
+
+ -- Timo Aaltonen <tjaalton@debian.org>  Mon, 13 Feb 2017 15:08:23 +0200
+
+libclc (0.2.0+git20160907-3) unstable; urgency=medium
+
+  * Simplify clang version updates.
+  * Drop de-duplication of files that aren't duplicate any more.
+
+ -- Michael Gilbert <mgilbert@debian.org>  Sat, 26 Nov 2016 03:35:48 +0000
+
+libclc (0.2.0+git20160907-2) unstable; urgency=medium
+
+  [ Andreas Boll ]
+  * Declare Multi-Arch: foreign for all packages (closes: #845314).
+
+  [ Michael Gilbert ]
+  * Update to debhelper 10.
+
+ -- Michael Gilbert <mgilbert@debian.org>  Sat, 26 Nov 2016 02:35:37 +0000
+
+libclc (0.2.0+git20160907-1) experimental; urgency=medium
+
+  * New upstream snapshot (closes: #836960).
+  * Build with clang 3.9.
+  * Drop devices.patch, upstream.
+  * Use https for Vcs-Git field.
+
+ -- Timo Aaltonen <tjaalton@debian.org>  Fri, 16 Sep 2016 09:20:06 +0300
+
+libclc (0.2.0+git20150813-3) unstable; urgency=medium
+
+  * Bump standards version.
+  * Build with clang 3.8 (closes: #832014).
+  * Add support for additional GPU devices (closes: #823677).
+
+ -- Michael Gilbert <mgilbert@debian.org>  Sat, 30 Jul 2016 22:47:05 +0000
+
 libclc (0.2.0+git20150813-2) unstable; urgency=medium
 
   * Enable build hardening flags.
diff -Nru libclc-0.2.0+git20150813/debian/clang libclc-0.2.0+git20170213/debian/clang
--- libclc-0.2.0+git20150813/debian/clang	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/debian/clang	2017-06-08 10:49:21.000000000 +0000
@@ -0,0 +1 @@
+4.0
diff -Nru libclc-0.2.0+git20150813/debian/clean libclc-0.2.0+git20170213/debian/clean
--- libclc-0.2.0+git20150813/debian/clean	2015-09-27 18:36:38.000000000 +0000
+++ libclc-0.2.0+git20170213/debian/clean	2017-02-13 13:00:28.000000000 +0000
@@ -1,3 +1,4 @@
 Makefile
 libclc.pc
 build/*.pyc
+utils/prepare-builtins.dwo
diff -Nru libclc-0.2.0+git20150813/debian/control libclc-0.2.0+git20170213/debian/control
--- libclc-0.2.0+git20150813/debian/control	2015-09-27 18:38:18.000000000 +0000
+++ libclc-0.2.0+git20170213/debian/control	2017-06-08 11:11:33.000000000 +0000
@@ -7,21 +7,22 @@
 Build-Depends:
  debhelper (>= 9),
  python,
- clang-3.7,
- llvm-3.7-dev (>= 3.7),
+ clang-4.0,
+ llvm-4.0-dev,
  zlib1g-dev,
  libedit-dev,
-Standards-Version: 3.9.6
+Standards-Version: 3.9.8
 Homepage: http://libclc.llvm.org
-Vcs-Git: git://anonscm.debian.org/pkg-opencl/libclc.git
+Vcs-Git: https://anonscm.debian.org/git/pkg-opencl/libclc.git
 Vcs-Browser: https://anonscm.debian.org/cgit/pkg-opencl/libclc.git
 
 Package: libclc-ptx
 Architecture: all
+Multi-Arch: foreign
 Depends:
  ${misc:Depends},
  libclc-dev (= ${binary:Version}),
- libclang-common-3.7-dev,
+ libclang-common-4.0-dev,
 Description: OpenCL C language implementation - ptx support
  libclc is an open implementation of the OpenCL C programming language,
  as specified by the OpenCL 1.1 Specification.
@@ -30,10 +31,11 @@
 
 Package: libclc-amdgcn
 Architecture: all
+Multi-Arch: foreign
 Depends:
  ${misc:Depends},
  libclc-dev (= ${binary:Version}),
- libclang-common-3.7-dev,
+ libclang-common-4.0-dev,
 Description: OpenCL C language implementation - amdgcn support
  libclc is an open implementation of the OpenCL C programming language,
  as specified by the OpenCL 1.1 Specification.
@@ -43,10 +45,11 @@
 
 Package: libclc-r600
 Architecture: all
+Multi-Arch: foreign
 Depends:
  ${misc:Depends},
  libclc-dev (= ${binary:Version}),
- libclang-common-3.7-dev,
+ libclang-common-4.0-dev,
 Description: OpenCL C language implementation - r600 support
  libclc is an open implementation of the OpenCL C programming language,
  as specified by the OpenCL 1.1 Specification.
@@ -57,6 +60,7 @@
 Package: libclc-dev
 Section: libdevel
 Architecture: all
+Multi-Arch: foreign
 Depends:
  ${misc:Depends},
 Description: OpenCL C language implementation - development files
diff -Nru libclc-0.2.0+git20150813/debian/control.in libclc-0.2.0+git20170213/debian/control.in
--- libclc-0.2.0+git20150813/debian/control.in	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/debian/control.in	2017-06-08 10:49:21.000000000 +0000
@@ -0,0 +1,70 @@
+Source: libclc
+Section: libs
+Priority: extra
+Maintainer: Debian OpenCL team <pkg-opencl-devel@lists.alioth.debian.org>
+Uploaders:
+ Michael Gilbert <mgilbert@debian.org>,
+Build-Depends:
+ debhelper (>= 9),
+ python,
+ clang-CLANG_VERSION,
+ llvm-CLANG_VERSION-dev,
+ zlib1g-dev,
+ libedit-dev,
+Standards-Version: 3.9.8
+Homepage: http://libclc.llvm.org
+Vcs-Git: https://anonscm.debian.org/git/pkg-opencl/libclc.git
+Vcs-Browser: https://anonscm.debian.org/cgit/pkg-opencl/libclc.git
+
+Package: libclc-ptx
+Architecture: all
+Multi-Arch: foreign
+Depends:
+ ${misc:Depends},
+ libclc-dev (= ${binary:Version}),
+ libclang-common-CLANG_VERSION-dev,
+Description: OpenCL C language implementation - ptx support
+ libclc is an open implementation of the OpenCL C programming language,
+ as specified by the OpenCL 1.1 Specification.
+ .
+ This package contains support for the PTX platform.
+
+Package: libclc-amdgcn
+Architecture: all
+Multi-Arch: foreign
+Depends:
+ ${misc:Depends},
+ libclc-dev (= ${binary:Version}),
+ libclang-common-CLANG_VERSION-dev,
+Description: OpenCL C language implementation - amdgcn support
+ libclc is an open implementation of the OpenCL C programming language,
+ as specified by the OpenCL 1.1 Specification.
+ .
+ This package contains support for the amdgcn (AMD GPU) platform.
+ Supported GPU families: Southern Islands and newer.
+
+Package: libclc-r600
+Architecture: all
+Multi-Arch: foreign
+Depends:
+ ${misc:Depends},
+ libclc-dev (= ${binary:Version}),
+ libclang-common-CLANG_VERSION-dev,
+Description: OpenCL C language implementation - r600 support
+ libclc is an open implementation of the OpenCL C programming language,
+ as specified by the OpenCL 1.1 Specification.
+ .
+ This package contains support for the r600 (AMD GPU) platform.
+ Supported GPU families: Evergreen and Northern Islands.
+
+Package: libclc-dev
+Section: libdevel
+Architecture: all
+Multi-Arch: foreign
+Depends:
+ ${misc:Depends},
+Description: OpenCL C language implementation - development files
+ libclc is an open implementation of the OpenCL C programming language,
+ as specified by the OpenCL 1.1 Specification.
+ .
+ This package contains development header files.
diff -Nru libclc-0.2.0+git20150813/debian/copyright libclc-0.2.0+git20170213/debian/copyright
--- libclc-0.2.0+git20150813/debian/copyright	2015-09-27 18:36:38.000000000 +0000
+++ libclc-0.2.0+git20170213/debian/copyright	2017-02-13 13:00:28.000000000 +0000
@@ -9,7 +9,7 @@
 
 Files: debian/*
 Copyright:
- 2013-2015 Michael Gilbert <mgilbert@debian.org>
+ 2013-2016 Michael Gilbert <mgilbert@debian.org>
  2013-2014 Julian Wollrath <jwollrath@web.de>
 License: NCSA or MIT
 
diff -Nru libclc-0.2.0+git20150813/debian/README.source libclc-0.2.0+git20170213/debian/README.source
--- libclc-0.2.0+git20150813/debian/README.source	2015-09-27 18:36:38.000000000 +0000
+++ libclc-0.2.0+git20170213/debian/README.source	2017-02-13 13:00:28.000000000 +0000
@@ -1,2 +1,12 @@
+Watch File
+==========
 There is no watch file in debian/ because upstream uses git but has made
 no tags, so there is currently no way to watch upstream changes.
+
+Clang Updates
+=============
+To build with a different version of clang, just alter the clang version
+number contained in the debian/clang file, then update the control file
+with:
+
+$ ./debian/rules debian/control
diff -Nru libclc-0.2.0+git20150813/debian/rules libclc-0.2.0+git20170213/debian/rules
--- libclc-0.2.0+git20150813/debian/rules	2015-09-27 20:01:01.000000000 +0000
+++ libclc-0.2.0+git20170213/debian/rules	2017-06-08 10:49:21.000000000 +0000
@@ -6,26 +6,13 @@
 
 export DEB_BUILD_MAINT_OPTIONS=hardening=+all
 
-confflags=--prefix=/usr \
-          --with-llvm-config=/usr/bin/llvm-config-3.7 \
+LLVM_CONFIG=/usr/bin/llvm-config-$(shell cat debian/clang)
 
-path=debian/tmp/usr/lib/clc
+debian/control: debian/control.in
+	sed "s/CLANG_VERSION/$(shell cat debian/clang)/g" < $< > $@
 
-%:
+%: debian/control
 	dh $@ --parallel
 
 override_dh_auto_configure:
-	./configure.py $(confflags)
-
-override_dh_install:
-	test $(shell sha512sum $(path)/cypress-r600--.bc) != $(shell sha512sum $(path)/cayman-r600--.bc) || \
-	    rm -f $(path)/cypress-r600--.bc && \
-	    ln -s cayman-r600--.bc $(path)/cypress-r600--.bc
-	test $(shell sha512sum $(path)/cedar-r600--.bc) != $(shell sha512sum $(path)/barts-r600--.bc) || \
-	    rm -f $(path)/cedar-r600--.bc && \
-	    ln -s barts-r600--.bc $(path)/cedar-r600--.bc
-	dh_install
-
-override_dh_clean:
-	dh_clean
-	find -name '*.d' -execdir rm -f {} \;
+	./configure.py --prefix=/usr --with-llvm-config=$(LLVM_CONFIG)
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/clc.h libclc-0.2.0+git20170213/generic/include/clc/clc.h
--- libclc-0.2.0+git20150813/generic/include/clc/clc.h	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/clc.h	2017-02-12 21:33:49.000000000 +0000
@@ -30,6 +30,7 @@
 #include <clc/workitem/get_local_id.h>
 #include <clc/workitem/get_num_groups.h>
 #include <clc/workitem/get_group_id.h>
+#include <clc/workitem/get_global_offset.h>
 
 /* 6.11.2 Math Functions */
 #include <clc/math/acos.h>
@@ -43,30 +44,41 @@
 #include <clc/math/atan2pi.h>
 #include <clc/math/atanh.h>
 #include <clc/math/atanpi.h>
+#include <clc/math/cbrt.h>
 #include <clc/math/copysign.h>
 #include <clc/math/cos.h>
+#include <clc/math/cosh.h>
 #include <clc/math/cospi.h>
 #include <clc/math/ceil.h>
+#include <clc/math/erf.h>
 #include <clc/math/erfc.h>
 #include <clc/math/exp.h>
+#include <clc/math/expm1.h>
 #include <clc/math/exp10.h>
 #include <clc/math/exp2.h>
 #include <clc/math/fabs.h>
+#include <clc/math/fdim.h>
 #include <clc/math/floor.h>
 #include <clc/math/fma.h>
 #include <clc/math/fmax.h>
 #include <clc/math/fmin.h>
 #include <clc/math/fmod.h>
 #include <clc/math/fract.h>
+#include <clc/math/frexp.h>
 #include <clc/math/half_rsqrt.h>
 #include <clc/math/half_sqrt.h>
 #include <clc/math/hypot.h>
+#include <clc/math/ilogb.h>
 #include <clc/math/ldexp.h>
+#include <clc/math/lgamma.h>
+#include <clc/math/lgamma_r.h>
 #include <clc/math/log.h>
 #include <clc/math/log10.h>
 #include <clc/math/log1p.h>
 #include <clc/math/log2.h>
+#include <clc/math/logb.h>
 #include <clc/math/mad.h>
+#include <clc/math/modf.h>
 #include <clc/math/nextafter.h>
 #include <clc/math/pow.h>
 #include <clc/math/pown.h>
@@ -77,6 +89,8 @@
 #include <clc/math/sinpi.h>
 #include <clc/math/sqrt.h>
 #include <clc/math/tan.h>
+#include <clc/math/tanh.h>
+#include <clc/math/tgamma.h>
 #include <clc/math/trunc.h>
 #include <clc/math/native_cos.h>
 #include <clc/math/native_divide.h>
@@ -88,6 +102,7 @@
 #include <clc/math/native_powr.h>
 #include <clc/math/native_sin.h>
 #include <clc/math/native_sqrt.h>
+#include <clc/math/native_rsqrt.h>
 #include <clc/math/rsqrt.h>
 
 /* 6.11.2.1 Floating-point macros */
@@ -210,6 +225,11 @@
 #include <clc/cl_khr_local_int32_extended_atomics/atom_or.h>
 #include <clc/cl_khr_local_int32_extended_atomics/atom_xor.h>
 
+/* 6.11.13 Image Read and Write Functions */
+
+#include <clc/image/image_defines.h>
+#include <clc/image/image.h>
+
 /* libclc internal defintions */
 #ifdef __CLC_INTERNAL
 #include <math/clc_nextafter.h>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/float/definitions.h libclc-0.2.0+git20170213/generic/include/clc/float/definitions.h
--- libclc-0.2.0+git20150813/generic/include/clc/float/definitions.h	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/float/definitions.h	2017-02-12 21:33:49.000000000 +0000
@@ -14,6 +14,9 @@
 #define FLT_MIN         0x1.0p-126f
 #define FLT_EPSILON     0x1.0p-23f
 
+#define FP_ILOGB0 (-2147483647 - 1)
+#define FP_ILOGBNAN (-2147483647 - 1)
+
 #define M_E_F           0x1.5bf0a8p+1f
 #define M_LOG2E_F       0x1.715476p+0f
 #define M_LOG10E_F      0x1.bcb7b2p-2f
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/geometric/floatn.inc libclc-0.2.0+git20170213/generic/include/clc/geometric/floatn.inc
--- libclc-0.2.0+git20150813/generic/include/clc/geometric/floatn.inc	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/geometric/floatn.inc	2017-02-12 21:33:49.000000000 +0000
@@ -19,6 +19,14 @@
 #include __CLC_BODY
 #undef __CLC_FLOATN
 
+#define __CLC_FLOATN float8
+#include __CLC_BODY
+#undef __CLC_FLOATN
+
+#define __CLC_FLOATN float16
+#include __CLC_BODY
+#undef __CLC_FLOATN
+
 #undef __CLC_FLOAT
 #undef __CLC_FPSIZE
 
@@ -46,6 +54,14 @@
 #include __CLC_BODY
 #undef __CLC_FLOATN
 
+#define __CLC_FLOATN double8
+#include __CLC_BODY
+#undef __CLC_FLOATN
+
+#define __CLC_FLOATN double16
+#include __CLC_BODY
+#undef __CLC_FLOATN
+
 #undef __CLC_FLOAT
 #undef __CLC_FPSIZE
 
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/image/image_defines.h libclc-0.2.0+git20170213/generic/include/clc/image/image_defines.h
--- libclc-0.2.0+git20150813/generic/include/clc/image/image_defines.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/image/image_defines.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,49 @@
+/* get_image_channel_data_type flags */
+#define CLK_SNORM_INT8               0x10D0
+#define CLK_SNORM_INT16              0x10D1
+#define CLK_UNORM_INT8               0x10D2
+#define CLK_UNORM_INT16              0x10D3
+#define CLK_UNORM_SHORT_565          0x10D4
+#define CLK_UNORM_SHORT_555          0x10D5
+#define CLK_UNORM_SHORT_101010       0x10D6
+#define CLK_SIGNED_INT8              0x10D7
+#define CLK_SIGNED_INT16             0x10D8
+#define CLK_SIGNED_INT32             0x10D9
+#define CLK_UNSIGNED_INT8            0x10DA
+#define CLK_UNSIGNED_INT16           0x10DB
+#define CLK_UNSIGNED_INT32           0x10DC
+#define CLK_HALF_FLOAT               0x10DD
+#define CLK_FLOAT                    0x10DE
+
+/* get_image_channel_order flags */
+#define CLK_R                        0x10B0
+#define CLK_A                        0x10B1
+#define CLK_RG                       0x10B2
+#define CLK_RA                       0x10B3
+#define CLK_RGB                      0x10B4
+#define CLK_RGBA                     0x10B5
+#define CLK_BGRA                     0x10B6
+#define CLK_ARGB                     0x10B7
+#define CLK_INTENSITY                0x10B8
+#define CLK_LUMINANCE                0x10B9
+#define CLK_Rx                       0x10BA
+#define CLK_RGx                      0x10BB
+#define CLK_RGBx                     0x10BC
+
+/* sampler normalized coords */
+#define CLK_NORMALIZED_COORDS_FALSE  0x0000
+#define CLK_NORMALIZED_COORDS_TRUE   0x0001
+#define __CLC_NORMALIZED_COORDS_MASK 0x0001
+
+/* sampler addressing mode */
+#define CLK_ADDRESS_NONE             0x0000
+#define CLK_ADDRESS_CLAMP_TO_EDGE    0x0002
+#define CLK_ADDRESS_CLAMP            0x0004
+#define CLK_ADDRESS_REPEAT           0x0006
+#define CLK_ADDRESS_MIRRORED_REPEAT  0x0008
+#define __CLC_ADDRESS_MASK           0x000E
+
+/* sampler filter mode */
+#define CLK_FILTER_NEAREST           0x0000
+#define CLK_FILTER_LINEAR            0x0010
+#define __CLC_FILTER_MASK            0x0010
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/image/image.h libclc-0.2.0+git20170213/generic/include/clc/image/image.h
--- libclc-0.2.0+git20150813/generic/include/clc/image/image.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/image/image.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,36 @@
+_CLC_OVERLOAD _CLC_DECL int get_image_width (image2d_t image);
+_CLC_OVERLOAD _CLC_DECL int get_image_width (image3d_t image);
+
+_CLC_OVERLOAD _CLC_DECL int get_image_height (image2d_t image);
+_CLC_OVERLOAD _CLC_DECL int get_image_height (image3d_t image);
+
+_CLC_OVERLOAD _CLC_DECL int get_image_depth (image3d_t image);
+
+_CLC_OVERLOAD _CLC_DECL int get_image_channel_data_type (image2d_t image);
+_CLC_OVERLOAD _CLC_DECL int get_image_channel_data_type (image3d_t image);
+
+_CLC_OVERLOAD _CLC_DECL int get_image_channel_order (image2d_t image);
+_CLC_OVERLOAD _CLC_DECL int get_image_channel_order (image3d_t image);
+
+_CLC_OVERLOAD _CLC_DECL int2 get_image_dim (image2d_t image);
+_CLC_OVERLOAD _CLC_DECL int4 get_image_dim (image3d_t image);
+
+_CLC_OVERLOAD _CLC_DECL void
+write_imagef(image2d_t image, int2 coord, float4 color);
+_CLC_OVERLOAD _CLC_DECL void
+write_imagei(image2d_t image, int2 coord, int4 color);
+_CLC_OVERLOAD _CLC_DECL void
+write_imageui(image2d_t image, int2 coord, uint4 color);
+
+_CLC_OVERLOAD _CLC_DECL float4
+read_imagef(image2d_t image, sampler_t sampler, int2 coord);
+_CLC_OVERLOAD _CLC_DECL float4
+read_imagef(image2d_t image, sampler_t sampler, float2 coord);
+_CLC_OVERLOAD _CLC_DECL int4
+read_imagei(image2d_t image, sampler_t sampler, int2 coord);
+_CLC_OVERLOAD _CLC_DECL int4
+read_imagei(image2d_t image, sampler_t sampler, float2 coord);
+_CLC_OVERLOAD _CLC_DECL uint4
+read_imageui(image2d_t image, sampler_t sampler, int2 coord);
+_CLC_OVERLOAD _CLC_DECL uint4
+read_imageui(image2d_t image, sampler_t sampler, float2 coord);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/integer/definitions.h libclc-0.2.0+git20170213/generic/include/clc/integer/definitions.h
--- libclc-0.2.0+git20150813/generic/include/clc/integer/definitions.h	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/integer/definitions.h	2017-02-12 21:33:49.000000000 +0000
@@ -1,14 +1,14 @@
 #define CHAR_BIT 8
 #define INT_MAX 2147483647
-#define INT_MIN -2147483648
+#define INT_MIN (-2147483647 - 1)
 #define LONG_MAX  0x7fffffffffffffffL
-#define LONG_MIN -0x8000000000000000L
+#define LONG_MIN (-0x7fffffffffffffffL - 1)
+#define CHAR_MAX SCHAR_MAX
+#define CHAR_MIN SCHAR_MIN
 #define SCHAR_MAX 127
-#define SCHAR_MIN -128
-#define CHAR_MAX 127
-#define CHAR_MIN -128
+#define SCHAR_MIN (-127 - 1)
 #define SHRT_MAX 32767
-#define SHRT_MIN -32768
+#define SHRT_MIN (-32767 - 1)
 #define UCHAR_MAX 255
 #define USHRT_MAX 65535
 #define UINT_MAX 0xffffffff
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/cbrt.h libclc-0.2.0+git20170213/generic/include/clc/math/cbrt.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/cbrt.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/cbrt.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define __CLC_BODY <clc/math/cbrt.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/cbrt.inc libclc-0.2.0+git20170213/generic/include/clc/math/cbrt.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/cbrt.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/cbrt.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE cbrt(__CLC_GENTYPE x);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/cosh.h libclc-0.2.0+git20170213/generic/include/clc/math/cosh.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/cosh.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/cosh.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define __CLC_BODY <clc/math/cosh.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/cosh.inc libclc-0.2.0+git20170213/generic/include/clc/math/cosh.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/cosh.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/cosh.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE cosh(__CLC_GENTYPE x);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/erf.h libclc-0.2.0+git20170213/generic/include/clc/math/erf.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/erf.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/erf.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+#undef erf
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION erf
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/expm1.h libclc-0.2.0+git20170213/generic/include/clc/math/expm1.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/expm1.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/expm1.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+#undef expm1
+
+#define __CLC_BODY <clc/math/unary_decl.inc>
+#define __CLC_FUNCTION expm1
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/fdim.h libclc-0.2.0+git20170213/generic/include/clc/math/fdim.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/fdim.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/fdim.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,2 @@
+#define __CLC_BODY <clc/math/fdim.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/fdim.inc libclc-0.2.0+git20170213/generic/include/clc/math/fdim.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/fdim.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/fdim.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE fdim(__CLC_GENTYPE a, __CLC_GENTYPE b);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/frexp.h libclc-0.2.0+git20170213/generic/include/clc/math/frexp.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/frexp.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/frexp.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,2 @@
+#define __CLC_BODY <clc/math/frexp.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/frexp.inc libclc-0.2.0+git20170213/generic/include/clc/math/frexp.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/frexp.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/frexp.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE frexp(__CLC_GENTYPE x, global __CLC_INTN *iptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE frexp(__CLC_GENTYPE x, local __CLC_INTN *iptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE frexp(__CLC_GENTYPE x, private __CLC_INTN *iptr);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/gentype.inc libclc-0.2.0+git20170213/generic/include/clc/math/gentype.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/gentype.inc	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/gentype.inc	2017-02-12 21:33:49.000000000 +0000
@@ -2,38 +2,50 @@
 #define __CLC_FPSIZE 32
 
 #define __CLC_GENTYPE float
+#define __CLC_INTN int
 #define __CLC_SCALAR
 #include __CLC_BODY
 #undef __CLC_GENTYPE
+#undef __CLC_INTN
 #undef __CLC_SCALAR
 
 #define __CLC_GENTYPE float2
 #define __CLC_INTN int2
+#define __CLC_VECSIZE 2
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
 #define __CLC_GENTYPE float3
 #define __CLC_INTN int3
+#define __CLC_VECSIZE 3
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
 #define __CLC_GENTYPE float4
 #define __CLC_INTN int4
+#define __CLC_VECSIZE 4
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
 #define __CLC_GENTYPE float8
 #define __CLC_INTN int8
+#define __CLC_VECSIZE 8
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
 #define __CLC_GENTYPE float16
 #define __CLC_INTN int16
+#define __CLC_VECSIZE 16
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
@@ -47,37 +59,49 @@
 
 #define __CLC_SCALAR
 #define __CLC_GENTYPE double
+#define __CLC_INTN int
 #include __CLC_BODY
 #undef __CLC_GENTYPE
+#undef __CLC_INTN
 #undef __CLC_SCALAR
 
 #define __CLC_GENTYPE double2
 #define __CLC_INTN int2
+#define __CLC_VECSIZE 2
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
 #define __CLC_GENTYPE double3
 #define __CLC_INTN int3
+#define __CLC_VECSIZE 3
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
 #define __CLC_GENTYPE double4
 #define __CLC_INTN int4
+#define __CLC_VECSIZE 4
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
 #define __CLC_GENTYPE double8
 #define __CLC_INTN int8
+#define __CLC_VECSIZE 8
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
 #define __CLC_GENTYPE double16
 #define __CLC_INTN int16
+#define __CLC_VECSIZE 16
 #include __CLC_BODY
+#undef __CLC_VECSIZE
 #undef __CLC_GENTYPE
 #undef __CLC_INTN
 
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/ilogb.h libclc-0.2.0+git20170213/generic/include/clc/math/ilogb.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/ilogb.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/ilogb.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,5 @@
+#define __CLC_BODY <clc/math/ilogb.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_BODY
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/ilogb.inc libclc-0.2.0+git20170213/generic/include/clc/math/ilogb.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/ilogb.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/ilogb.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_INTN ilogb(__CLC_GENTYPE x);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/lgamma.h libclc-0.2.0+git20170213/generic/include/clc/math/lgamma.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/lgamma.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/lgamma.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,2 @@
+#define __CLC_BODY <clc/math/lgamma.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/lgamma.inc libclc-0.2.0+git20170213/generic/include/clc/math/lgamma.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/lgamma.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/lgamma.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE lgamma(__CLC_GENTYPE a);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/lgamma_r.h libclc-0.2.0+git20170213/generic/include/clc/math/lgamma_r.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/lgamma_r.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/lgamma_r.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,2 @@
+#define __CLC_BODY <clc/math/lgamma_r.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/lgamma_r.inc libclc-0.2.0+git20170213/generic/include/clc/math/lgamma_r.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/lgamma_r.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/lgamma_r.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,3 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, global __CLC_INTN *iptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, local __CLC_INTN *iptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, private __CLC_INTN *iptr);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/logb.h libclc-0.2.0+git20170213/generic/include/clc/math/logb.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/logb.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/logb.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,2 @@
+#define __CLC_BODY <clc/math/logb.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/logb.inc libclc-0.2.0+git20170213/generic/include/clc/math/logb.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/logb.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/logb.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE logb(__CLC_GENTYPE a);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/modf.h libclc-0.2.0+git20170213/generic/include/clc/math/modf.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/modf.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/modf.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define __CLC_BODY <clc/math/modf.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/modf.inc libclc-0.2.0+git20170213/generic/include/clc/math/modf.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/modf.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/modf.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, global __CLC_GENTYPE *iptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, local __CLC_GENTYPE *iptr);
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, private __CLC_GENTYPE *iptr);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/native_rsqrt.h libclc-0.2.0+git20170213/generic/include/clc/math/native_rsqrt.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/native_rsqrt.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/native_rsqrt.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1 @@
+#define native_rsqrt rsqrt
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/tanh.h libclc-0.2.0+git20170213/generic/include/clc/math/tanh.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/tanh.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/tanh.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define __CLC_BODY <clc/math/tanh.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/tanh.inc libclc-0.2.0+git20170213/generic/include/clc/math/tanh.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/tanh.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/tanh.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE tanh(__CLC_GENTYPE a);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/tgamma.h libclc-0.2.0+git20170213/generic/include/clc/math/tgamma.h
--- libclc-0.2.0+git20150813/generic/include/clc/math/tgamma.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/tgamma.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,3 @@
+#define __CLC_BODY <clc/math/tgamma.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_BODY
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/tgamma.inc libclc-0.2.0+git20170213/generic/include/clc/math/tgamma.inc
--- libclc-0.2.0+git20150813/generic/include/clc/math/tgamma.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/math/tgamma.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE tgamma(__CLC_GENTYPE a);
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/shared/vstore.h libclc-0.2.0+git20170213/generic/include/clc/shared/vstore.h
--- libclc-0.2.0+git20150813/generic/include/clc/shared/vstore.h	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/shared/vstore.h	2017-02-12 21:33:49.000000000 +0000
@@ -1,17 +1,20 @@
-#define _CLC_VSTORE_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
-  _CLC_OVERLOAD _CLC_DECL void vstore##WIDTH(VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out);
+#define _CLC_VSTORE_DECL(SUFFIX, PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
+  _CLC_OVERLOAD _CLC_DECL void vstore##SUFFIX##WIDTH(VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out);
 
-#define _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, ADDR_SPACE) \
-  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
-  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
-  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
-  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
-  _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
+#define _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
+  _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
+
+#define _CLC_VECTOR_VSTORE_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \
+  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
+  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
+  _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) \
 
 #define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE) \
-  _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __private) \
-  _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __local) \
-  _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __global) \
+  _CLC_VECTOR_VSTORE_PRIM3(,PRIM_TYPE, PRIM_TYPE) \
 
 #define _CLC_VECTOR_VSTORE_PRIM() \
     _CLC_VECTOR_VSTORE_PRIM1(char) \
@@ -23,14 +26,18 @@
     _CLC_VECTOR_VSTORE_PRIM1(long) \
     _CLC_VECTOR_VSTORE_PRIM1(ulong) \
     _CLC_VECTOR_VSTORE_PRIM1(float) \
-        
+    _CLC_VECTOR_VSTORE_PRIM3(_half, half, float)
+
 #ifdef cl_khr_fp64
-#define _CLC_VECTOR_VSTORE() \
-  _CLC_VECTOR_VSTORE_PRIM1(double) \
-  _CLC_VECTOR_VSTORE_PRIM()
-#else
-#define _CLC_VECTOR_VSTORE() \
-  _CLC_VECTOR_VSTORE_PRIM()
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+  _CLC_VECTOR_VSTORE_PRIM1(double)
+  _CLC_VECTOR_VSTORE_PRIM3(_half, half, double)
+  _CLC_VSTORE_DECL(_half, half, double, , __private)
+  _CLC_VSTORE_DECL(_half, half, double, , __local)
+  _CLC_VSTORE_DECL(_half, half, double, , __global)
 #endif
 
-_CLC_VECTOR_VSTORE()
+_CLC_VECTOR_VSTORE_PRIM()
+_CLC_VSTORE_DECL(_half, half, float, , __private)
+_CLC_VSTORE_DECL(_half, half, float, , __local)
+_CLC_VSTORE_DECL(_half, half, float, , __global)
diff -Nru libclc-0.2.0+git20150813/generic/include/clc/workitem/get_global_offset.h libclc-0.2.0+git20170213/generic/include/clc/workitem/get_global_offset.h
--- libclc-0.2.0+git20150813/generic/include/clc/workitem/get_global_offset.h	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/include/clc/workitem/get_global_offset.h	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1 @@
+_CLC_DECL size_t get_global_offset(uint dim);
diff -Nru libclc-0.2.0+git20150813/generic/lib/clcmacro.h libclc-0.2.0+git20170213/generic/lib/clcmacro.h
--- libclc-0.2.0+git20150813/generic/lib/clcmacro.h	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/clcmacro.h	2017-02-12 21:33:49.000000000 +0000
@@ -109,6 +109,43 @@
   } \
 \
 
+#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ADDR_SPACE, ARG2_TYPE) \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ADDR_SPACE ARG2_TYPE##2 *y) { \
+    return (RET_TYPE##2)( /* every cast of y must keep ADDR_SPACE */ \
+        FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE*)y), \
+        FUNCTION(x.y, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+1)) \
+    ); \
+  } \
+\
+  DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ADDR_SPACE ARG2_TYPE##3 *y) { \
+    return (RET_TYPE##3)( \
+        FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE*)y), \
+        FUNCTION(x.y, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+1)), \
+        FUNCTION(x.z, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+2)) \
+    ); \
+  } \
+\
+  DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ADDR_SPACE ARG2_TYPE##4 *y) { \
+    return (RET_TYPE##4)( \
+        FUNCTION(x.lo, (ADDR_SPACE ARG2_TYPE##2*)y), \
+        FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##2*)((ADDR_SPACE ARG2_TYPE*)y+2)) \
+    ); \
+  } \
+\
+  DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ADDR_SPACE ARG2_TYPE##8 *y) { \
+    return (RET_TYPE##8)( \
+        FUNCTION(x.lo, (ADDR_SPACE ARG2_TYPE##4*)y), \
+        FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##4*)((ADDR_SPACE ARG2_TYPE*)y+4)) \
+    ); \
+  } \
+\
+  DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ADDR_SPACE ARG2_TYPE##16 *y) { \
+    return (RET_TYPE##16)( \
+        FUNCTION(x.lo, (ADDR_SPACE ARG2_TYPE##8*)y), \
+        FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##8*)((ADDR_SPACE ARG2_TYPE*)y+8)) \
+    ); \
+  }
+
 #define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, ARG2_TYPE) \
 _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \
   return BUILTIN(x, y); \
diff -Nru libclc-0.2.0+git20150813/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl libclc-0.2.0+git20170213/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl
--- libclc-0.2.0+git20150813/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,7 +2,7 @@
 
 #define IMPL(TYPE) \
 _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(global TYPE *p) { \
-  return atom_sub(p, 1); \
+  return atom_sub(p, (TYPE)1); \
 }
 
 IMPL(int)
diff -Nru libclc-0.2.0+git20150813/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl libclc-0.2.0+git20170213/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl
--- libclc-0.2.0+git20150813/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,7 +2,7 @@
 
 #define IMPL(TYPE) \
 _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(global TYPE *p) { \
-  return atom_add(p, 1); \
+  return atom_add(p, (TYPE)1); \
 }
 
 IMPL(int)
diff -Nru libclc-0.2.0+git20150813/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl libclc-0.2.0+git20170213/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl
--- libclc-0.2.0+git20150813/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,7 +2,7 @@
 
 #define IMPL(TYPE) \
 _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(local TYPE *p) { \
-  return atom_sub(p, 1); \
+  return atom_sub(p, (TYPE)1); \
 }
 
 IMPL(int)
diff -Nru libclc-0.2.0+git20150813/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl libclc-0.2.0+git20170213/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl
--- libclc-0.2.0+git20150813/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,7 +2,7 @@
 
 #define IMPL(TYPE) \
 _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(local TYPE *p) { \
-  return atom_add(p, 1); \
+  return atom_add(p, (TYPE)1); \
 }
 
 IMPL(int)
diff -Nru libclc-0.2.0+git20150813/generic/lib/gen_convert.py libclc-0.2.0+git20170213/generic/lib/gen_convert.py
--- libclc-0.2.0+git20150813/generic/lib/gen_convert.py	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/gen_convert.py	2017-02-12 21:33:49.000000000 +0000
@@ -97,14 +97,12 @@
     int64_count = int64_count +1
   elif dst in float64_types:
     float64_count = float64_count + 1
-  if float64_count > 0 and int64_count > 0:
-    print("#if defined(cl_khr_fp64) && defined(cles_khr_int64)")
-    return True
-  elif float64_count > 0:
+  if float64_count > 0:
+    # In embedded profile, if cl_khr_fp64 is supported, cles_khr_int64 has to be supported as well
     print("#ifdef cl_khr_fp64")
     return True
   elif int64_count > 0:
-    print("#ifdef cles_khr_int64")
+    print("#if defined cles_khr_int64 || !defined(__EMBEDDED_PROFILE__)")
     return True
   return False
 
@@ -142,6 +140,15 @@
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+#if defined(__EMBEDDED_PROFILE__) && !defined(cles_khr_int64)
+#error Embedded profile that supports cl_khr_fp64 also has to support cles_khr_int64
+#endif
+
+#endif
+
+#ifdef cles_khr_int64
+#pragma OPENCL EXTENSION cles_khr_int64 : enable
 #endif
 
 """)
diff -Nru libclc-0.2.0+git20150813/generic/lib/image/get_image_dim.cl libclc-0.2.0+git20170213/generic/lib/image/get_image_dim.cl
--- libclc-0.2.0+git20150813/generic/lib/image/get_image_dim.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/image/get_image_dim.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,9 @@
+#include <clc/clc.h>
+
+_CLC_OVERLOAD _CLC_DEF int2 get_image_dim (image2d_t image) {
+  return (int2)(get_image_width(image), get_image_height(image));
+}
+_CLC_OVERLOAD _CLC_DEF int4 get_image_dim (image3d_t image) {
+  return (int4)(get_image_width(image), get_image_height(image),
+                get_image_depth(image), 0);
+}
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/cbrt.cl libclc-0.2.0+git20170213/generic/lib/math/cbrt.cl
--- libclc-0.2.0+git20150813/generic/lib/math/cbrt.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/cbrt.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float cbrt(float x) {
+    // Single-precision cube root: works on the bit pattern of x and uses table lookups.
+    uint xi = as_uint(x);
+    uint axi = xi & EXSIGNBIT_SP32;
+    uint xsign = axi ^ xi; // bits cleared by the mask; OR-ed back into the result below
+    xi = axi;
+
+    int m = (xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+    // Treat subnormals: when m == -127, use the rescaled bits (xisub) and exponent (msub)
+    uint xisub = as_uint(as_float(xi | 0x3f800000) - 1.0f);
+    int msub = (xisub >> EXPSHIFTBITS_SP32) - 253;
+    int c = m == -127;
+    xi = c ? xisub : xi;
+    m = c ? msub : m;
+
+    int m3 = m / 3;
+    int rem = m - m3*3; // remainder of m / 3; selects the remH/remT pair below
+    float mf = as_float((m3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+
+    uint indx = (xi & 0x007f0000) + ((xi & 0x00008000) << 1);
+    float f = as_float((xi & MANTBITS_SP32) | 0x3f000000) - as_float(indx | 0x3f000000);
+
+    indx >>= 16;
+    float r = f * USE_TABLE(log_inv_tbl, indx);
+    float poly = mad(mad(r, 0x1.f9add4p-5f, -0x1.c71c72p-4f), r*r, r * 0x1.555556p-2f);
+
+    // This could also be done with a 5-element table
+    float remH = 0x1.428000p-1f; // initial pair is kept when rem matches none of the cases below
+    float remT = 0x1.45f31ap-14f;
+
+    remH = rem == -1 ? 0x1.964000p-1f : remH;
+    remT = rem == -1 ? 0x1.fea53ep-13f : remT;
+
+    remH = rem ==  0 ? 0x1.000000p+0f : remH;
+    remT = rem ==  0 ? 0x0.000000p+0f  : remT;
+
+    remH = rem ==  1 ? 0x1.428000p+0f : remH;
+    remT = rem ==  1 ? 0x1.45f31ap-13f : remT;
+
+    remH = rem ==  2 ? 0x1.964000p+0f : remH;
+    remT = rem ==  2 ? 0x1.fea53ep-12f : remT;
+
+    float2 tv = USE_TABLE(cbrt_tbl, indx);
+    float cbrtH = tv.s0;
+    float cbrtT = tv.s1;
+
+    float bH = cbrtH * remH;
+    float bT = mad(cbrtH, remT, mad(cbrtT, remH, cbrtT*remT));
+
+    float z = mad(poly, bH, mad(poly, bT, bT)) + bH;
+    z *= mf;
+    z = as_float(as_uint(z) | xsign);
+    c = axi >= EXPBITS_SP32 | axi == 0;
+    z = c ? x : z; // when c is set, x itself is returned unchanged
+    return z;
+
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cbrt, float);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double cbrt(double x) {
+    // Double-precision cube root; infinities, NaNs and zeros are returned unchanged.
+    int return_x = isinf(x) | isnan(x) | x == 0.0;
+    ulong ux = as_ulong(fabs(x));
+    int m = (as_int2(ux).hi >> 20) - 1023;
+
+    // Treat subnormals: when m == -1023, use the rescaled bits (uxs) and exponent (ms)
+    ulong uxs = as_ulong(as_double(0x3ff0000000000000UL | ux) - 1.0);
+    int ms = m + (as_int2(uxs).hi >> 20) - 1022;
+
+    int c = m == -1023;
+    ux = c ? uxs : ux;
+    m = c ? ms : m;
+
+    int mby3 = m / 3;
+    int rem = m - 3*mby3; // remainder of m / 3; rem+2 indexes cbrt_rem_tbl below
+
+    double mf = as_double((ulong)(mby3 + 1023) << 52);
+
+    ux &= 0x000fffffffffffffUL;
+    double Y = as_double(0x3fe0000000000000UL | ux);
+
+    // nearest integer: round the leading mantissa bits into an index in 256..512
+    int index = as_int2(ux).hi >> 11;
+    index = (0x100 | (index >> 1)) + (index & 1);
+    double F = (double)index * 0x1.0p-9;
+
+    double f = Y - F;
+    double r = f * USE_TABLE(cbrt_inv_tbl, index-256);
+
+    double z = r * fma(r,
+                       fma(r,
+                           fma(r,
+                               fma(r,
+                                   fma(r, -0x1.8090d6221a247p-6, 0x1.ee7113506ac13p-6),
+                                   -0x1.511e8d2b3183bp-5),
+                               0x1.f9add3c0ca458p-5),
+                           -0x1.c71c71c71c71cp-4),
+                       0x1.5555555555555p-2);
+
+    double2 tv = USE_TABLE(cbrt_rem_tbl, rem+2);
+    double Rem_h = tv.s0;
+    double Rem_t = tv.s1;
+
+    tv = USE_TABLE(cbrt_dbl_tbl, index-256);
+    double F_h = tv.s0;
+    double F_t = tv.s1;
+
+    double b_h = F_h * Rem_h; 
+    double b_t = fma(Rem_t, F_h, fma(F_t, Rem_h, F_t*Rem_t));
+
+    double ans = fma(z, b_h, fma(z, b_t, b_t)) + b_h;
+    ans = copysign(ans*mf, x); // apply the exponent scale and the sign of x
+    return return_x ? x : ans;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cbrt, double)
+
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/clc_nextafter.cl libclc-0.2.0+git20170213/generic/lib/math/clc_nextafter.cl
--- libclc-0.2.0+git20150813/generic/lib/math/clc_nextafter.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/clc_nextafter.cl	2017-02-12 21:33:49.000000000 +0000
@@ -1,43 +1,39 @@
 #include <clc/clc.h>
 #include "../clcmacro.h"
 
-// This file provides OpenCL C implementations of nextafter for targets that
-// don't support the clang builtin.
+// This file provides OpenCL C implementations of nextafter for
+// targets that don't support the clang builtin.
 
-#define FLT_NAN 0.0f/0.0f
+#define AS_TYPE(x) as_##x
 
-#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, NAN, ZERO, NEXTAFTER_ZERO) \
+#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, INT_TYPE)                      \
 _CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x, FLOAT_TYPE y) { \
-  union {                     \
-    FLOAT_TYPE f;             \
-    UINT_TYPE i;              \
-  } next;                     \
-  if (isnan(x) || isnan(y)) { \
-    return NAN;               \
-  }                           \
-  if (x == y) {               \
-    return y;                 \
-  }                           \
-  next.f = x;                 \
-  if (x < y) {                \
-    next.i++;                 \
-  } else {                    \
-    if (next.f == ZERO) {     \
-    next.i = NEXTAFTER_ZERO;  \
-    } else {                  \
-      next.i--;               \
-    }                         \
-  }                           \
-  return next.f;              \
+  const UINT_TYPE sign_bit                                        \
+   = (UINT_TYPE)1 << (sizeof(INT_TYPE) * 8 - 1);                  \
+  const UINT_TYPE sign_bit_mask = sign_bit - 1; /* every bit below the sign bit */ \
+  INT_TYPE ix = AS_TYPE(INT_TYPE)(x); /* raw bit pattern of x */   \
+  INT_TYPE ax = ix & sign_bit_mask; /* bits of x without the sign bit */ \
+  INT_TYPE mx = sign_bit - ix;                                    \
+  mx = ix < 0 ? mx : ix; /* negative ix is remapped to sign_bit - ix */ \
+  INT_TYPE iy = AS_TYPE(INT_TYPE)(y); /* raw bit pattern of y */   \
+  INT_TYPE ay = iy & sign_bit_mask; /* bits of y without the sign bit */ \
+  INT_TYPE my = sign_bit - iy;                                    \
+  my = iy < 0 ? my : iy; /* same remapping as for mx */            \
+  INT_TYPE t = mx + (mx < my ? 1 : -1); /* step mx by one toward my */ \
+  INT_TYPE r = sign_bit - t;                                      \
+  r = t < 0 ? r : t; /* undo the remapping applied to mx */        \
+  r = isnan(x) ? ix : r; /* x is NaN: result is x's bits */        \
+  r = isnan(y) ? iy : r; /* y is NaN: result is y's bits (checked last, so it wins) */ \
+  r = ((ax | ay) == 0 | ix == iy) ? iy : r; /* both zero or identical bits: result is y */ \
+  return AS_TYPE(FLOAT_TYPE)(r);                                  \
 }
 
-NEXTAFTER(float, uint, FLT_NAN, 0.0f, 0x80000001)
+NEXTAFTER(float, uint, int)
 _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_nextafter, float, float)
 
 #ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#define DBL_NAN 0.0/0.0
 
-NEXTAFTER(double, ulong, DBL_NAN, 0.0, 0x8000000000000001)
+NEXTAFTER(double, ulong, long)
 _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_nextafter, double, double)
 #endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/cosh.cl libclc-0.2.0+git20170213/generic/lib/math/cosh.cl
--- libclc-0.2.0+git20150813/generic/lib/math/cosh.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/cosh.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float cosh(float x) {
+    // Single-precision hyperbolic cosine; only abs(x) is used, the sign of x is discarded.
+    // After dealing with special cases the computation is split into regions as follows.
+    // abs(x) >= max_cosh_arg:
+    // cosh(x) = +Inf
+    // abs(x) >= small_threshold:
+    // cosh(x) = exp(abs(x))/2, computed as exp(abs(x) - c) times a
+    // correction factor (1 + eps), with the constants given near the end.
+    // abs(x) < small_threshold:
+    // cosh(x) comes from the tabulated addition formula described below,
+    // using sinhcosh_tbl and two polynomials in dy.
+
+    const float max_cosh_arg = 0x1.65a9fap+6f;
+    const float small_threshold = 0x1.0a2b24p+3f;
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    float y = as_float(aux);
+
+    // Find the integer part y0 of y and the increment dy = y - y0. We then compute
+    // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
+    // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are read from sinhcosh_tbl (only the cosh line is used here).
+
+    int ind = (int)y;
+    ind = (uint)ind > 36U ? 0 : ind; // out-of-range index falls back to entry 0
+
+    float dy = y - ind;
+    float dy2 = dy * dy;
+
+    float sdy = mad(dy2,
+                    mad(dy2,
+                        mad(dy2,
+                            mad(dy2,
+                                mad(dy2,
+                                    mad(dy2, 0.7746188980094184251527126e-12f, 0.160576793121939886190847e-9f),
+                                    0.250521176994133472333666e-7f),
+                                0.275573191913636406057211e-5f),
+                            0.198412698413242405162014e-3f),
+                        0.833333333333329931873097e-2f),
+                    0.166666666666666667013899e0f);
+    sdy = mad(sdy, dy*dy2, dy); // sdy now approximates sinh(dy)
+
+    float cdy = mad(dy2,
+                    mad(dy2,
+                        mad(dy2,
+                            mad(dy2,
+                                mad(dy2,
+                                    mad(dy2, 0.1163921388172173692062032e-10f, 0.208744349831471353536305e-8f),
+                                    0.275573350756016588011357e-6f),
+                                0.248015872460622433115785e-4f),
+                            0.138888888889814854814536e-2f),
+                        0.416666666666660876512776e-1f),
+                    0.500000000000000005911074e0f);
+    cdy = mad(cdy, dy2, 1.0f); // cdy now approximates cosh(dy)
+
+    float2 tv = USE_TABLE(sinhcosh_tbl, ind);
+    float z = mad(tv.s0, sdy, tv.s1 * cdy);
+
+    // When exp(-x) is insignificant compared to exp(x), return exp(x)/2
+    float t = exp(y - 0x1.62e500p-1f);
+    float zsmall = mad(0x1.a0210ep-18f, t, t);
+    z = y >= small_threshold ? zsmall : z;
+
+    // Corner cases
+    z = y >= max_cosh_arg ? as_float(PINFBITPATT_SP32) : z;
+    z = aux > PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : z;
+    z = aux < 0x38800000 ? 1.0f : z; // 0x38800000 is the bit pattern of 2^-14
+
+    return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cosh, float);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double cosh(double x) {
+    // Double-precision hyperbolic cosine; works on y = fabs(x), the sign of x is discarded.
+    // After dealing with special cases the computation is split into
+    // regions as follows:
+    //
+    // abs(x) >= max_cosh_arg:
+    // cosh(x) = +Inf
+    //
+    // abs(x) >= small_threshold:
+    // cosh(x) = exp(abs(x))/2, computed as exp(abs(x) - c) times a
+    // correction factor (1 + eps), with the constants given near the end.
+    //
+    // abs(x) < small_threshold:
+    // cosh(x) comes from the tabulated addition formula described below,
+    // using cosh_tbl, sinh_tbl and two polynomials in dy.
+
+    // This is ln(2^1025)
+    const double max_cosh_arg = 7.10475860073943977113e+02;      // 0x408633ce8fb9f87e
+
+    // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27)
+    const double small_threshold = 0x1.2b708872320e2p+4;
+
+    double y = fabs(x);
+
+    // In this range we find the integer part y0 of y
+    // and the increment dy = y - y0. We then compute
+    // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are read from sinh_tbl and cosh_tbl.
+
+    int ind = min((int)y, 36);
+    double dy = y - ind;
+    double dy2 = dy * dy;
+
+    double sdy = dy * dy2 *
+	         fma(dy2,
+		     fma(dy2,
+			 fma(dy2,
+			     fma(dy2,
+				 fma(dy2,
+				     fma(dy2, 0.7746188980094184251527126e-12, 0.160576793121939886190847e-9),
+				     0.250521176994133472333666e-7),
+				 0.275573191913636406057211e-5),
+			     0.198412698413242405162014e-3),
+			 0.833333333333329931873097e-2),
+		     0.166666666666666667013899e0);
+
+    double cdy = dy2 * fma(dy2,
+	                   fma(dy2,
+			       fma(dy2,
+				   fma(dy2,
+				       fma(dy2,
+					   fma(dy2, 0.1163921388172173692062032e-10, 0.208744349831471353536305e-8),
+					   0.275573350756016588011357e-6),
+				       0.248015872460622433115785e-4),
+				   0.138888888889814854814536e-2),
+			       0.416666666666660876512776e-1),
+			   0.500000000000000005911074e0);
+
+    // At this point sinh(dy) is approximated by dy + sdy,
+    // and cosh(dy) is approximated by 1 + cdy.
+    double2 tv = USE_TABLE(cosh_tbl, ind);
+    double cl = tv.s0;
+    double ct = tv.s1;
+    tv = USE_TABLE(sinh_tbl, ind);
+    double sl = tv.s0;
+    double st = tv.s1;
+
+    double z = fma(sl, dy, fma(sl, sdy, fma(cl, cdy, fma(st, dy, fma(st, sdy, ct*cdy)) + ct))) + cl;
+
+    // Other cases
+    z = y < 0x1.0p-28 ? 1.0 : z;
+
+    double t = exp(y - 0x1.62e42fefa3800p-1);
+    t =  fma(t, -0x1.ef35793c76641p-45, t);
+    z = y >= small_threshold ? t : z;
+
+    z = y >= max_cosh_arg ? as_double(PINFBITPATT_DP64) : z;
+
+    z = isinf(x) | isnan(x) ? y : z; // Inf/NaN input: return fabs(x)
+
+    return z;
+
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cosh, double)
+
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/erf.cl libclc-0.2.0+git20170213/generic/lib/math/erf.cl
--- libclc-0.2.0+git20150813/generic/lib/math/erf.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/erf.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+*/
+
+#define erx   8.4506291151e-01f        /* 0x3f58560b */
+
+// Coefficients for approximation to  erf on [0, 0.84375]
+
+#define efx   1.2837916613e-01f        /* 0x3e0375d4 */
+#define efx8  1.0270333290e+00f        /* 0x3f8375d4 */
+
+#define pp0   1.2837916613e-01f        /* 0x3e0375d4 */
+#define pp1  -3.2504209876e-01f        /* 0xbea66beb */
+#define pp2  -2.8481749818e-02f        /* 0xbce9528f */
+#define pp3  -5.7702702470e-03f        /* 0xbbbd1489 */
+#define pp4  -2.3763017452e-05f        /* 0xb7c756b1 */
+#define qq1   3.9791721106e-01f        /* 0x3ecbbbce */
+#define qq2   6.5022252500e-02f        /* 0x3d852a63 */
+#define qq3   5.0813062117e-03f        /* 0x3ba68116 */
+#define qq4   1.3249473704e-04f        /* 0x390aee49 */
+#define qq5  -3.9602282413e-06f        /* 0xb684e21a */
+
+// Coefficients for approximation to  erf  in [0.84375, 1.25]
+
+#define pa0  -2.3621185683e-03f        /* 0xbb1acdc6 */
+#define pa1   4.1485610604e-01f        /* 0x3ed46805 */
+#define pa2  -3.7220788002e-01f        /* 0xbebe9208 */
+#define pa3   3.1834661961e-01f        /* 0x3ea2fe54 */
+#define pa4  -1.1089469492e-01f        /* 0xbde31cc2 */
+#define pa5   3.5478305072e-02f        /* 0x3d1151b3 */
+#define pa6  -2.1663755178e-03f        /* 0xbb0df9c0 */
+#define qa1   1.0642088205e-01f        /* 0x3dd9f331 */
+#define qa2   5.4039794207e-01f        /* 0x3f0a5785 */
+#define qa3   7.1828655899e-02f        /* 0x3d931ae7 */
+#define qa4   1.2617121637e-01f        /* 0x3e013307 */
+#define qa5   1.3637083583e-02f        /* 0x3c5f6e13 */
+#define qa6   1.1984500103e-02f        /* 0x3c445aa3 */
+
+// Coefficients for approximation to  erfc in [1.25, 1/0.35]
+
+#define ra0  -9.8649440333e-03f        /* 0xbc21a093 */
+#define ra1  -6.9385856390e-01f        /* 0xbf31a0b7 */
+#define ra2  -1.0558626175e+01f        /* 0xc128f022 */
+#define ra3  -6.2375331879e+01f        /* 0xc2798057 */
+#define ra4  -1.6239666748e+02f        /* 0xc322658c */
+#define ra5  -1.8460508728e+02f        /* 0xc3389ae7 */
+#define ra6  -8.1287437439e+01f        /* 0xc2a2932b */
+#define ra7  -9.8143291473e+00f        /* 0xc11d077e */
+#define sa1   1.9651271820e+01f        /* 0x419d35ce */
+#define sa2   1.3765776062e+02f        /* 0x4309a863 */
+#define sa3   4.3456588745e+02f        /* 0x43d9486f */
+#define sa4   6.4538726807e+02f        /* 0x442158c9 */
+#define sa5   4.2900814819e+02f        /* 0x43d6810b */
+#define sa6   1.0863500214e+02f        /* 0x42d9451f */
+#define sa7   6.5702495575e+00f        /* 0x40d23f7c */
+#define sa8  -6.0424413532e-02f        /* 0xbd777f97 */
+
+// Coefficients for approximation to  erfc in [1/0.35, 28]
+
+#define rb0  -9.8649431020e-03f        /* 0xbc21a092 */
+#define rb1  -7.9928326607e-01f        /* 0xbf4c9dd4 */
+#define rb2  -1.7757955551e+01f        /* 0xc18e104b */
+#define rb3  -1.6063638306e+02f        /* 0xc320a2ea */
+#define rb4  -6.3756646729e+02f        /* 0xc41f6441 */
+#define rb5  -1.0250950928e+03f        /* 0xc480230b */
+#define rb6  -4.8351919556e+02f        /* 0xc3f1c275 */
+#define sb1   3.0338060379e+01f        /* 0x41f2b459 */
+#define sb2   3.2579251099e+02f        /* 0x43a2e571 */
+#define sb3   1.5367296143e+03f        /* 0x44c01759 */
+#define sb4   3.1998581543e+03f        /* 0x4547fdbb */
+#define sb5   2.5530502930e+03f        /* 0x451f90ce */
+#define sb6   4.7452853394e+02f        /* 0x43ed43a7 */
+#define sb7  -2.2440952301e+01f        /* 0xc1b38712 */
+
+_CLC_OVERLOAD _CLC_DEF float erf(float x) {
+    int hx = as_uint(x);
+    int ix = hx & 0x7fffffff;
+    float absx = as_float(ix);
+    // t is the polynomial argument: x^2, |x| - 1, or 1/x^2, depending on the range of |x|
+    float x2 = absx * absx;
+    float t = 1.0f / x2;
+    float tt = absx - 1.0f;
+    t = absx < 1.25f ? tt : t;
+    t = absx < 0.84375f ? x2 : t;
+
+    float u, v, tu, tv;
+
+    // |x| < 6: the rb/sb pair is the default; smaller ranges override u and v below
+    u = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+    v = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sb7, sb6), sb5), sb4), sb3), sb2), sb1);
+
+    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+    tv = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1);
+    u = absx < 0x1.6db6dcp+1f ? tu : u; // threshold is approximately 1/0.35
+    v = absx < 0x1.6db6dcp+1f ? tv : v;
+
+    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+    tv = mad(t, mad(t, mad(t, mad(t, mad(t, qa6, qa5), qa4), qa3), qa2), qa1);
+    u = absx < 1.25f ? tu : u;
+    v = absx < 1.25f ? tv : v;
+
+    tu = mad(t, mad(t, mad(t, mad(t, pp4, pp3), pp2), pp1), pp0);
+    tv = mad(t, mad(t, mad(t, mad(t, qq5, qq4), qq3), qq2), qq1);
+    u = absx < 0.84375f ? tu : u;
+    v = absx < 0.84375f ? tv : v;
+
+    v = mad(t, v, 1.0f);
+    float q = MATH_DIVIDE(u, v); // rational approximation u/v for the selected range
+
+    float ret = 1.0f; // magnitude used when |x| >= 6
+
+    // |x| < 6
+    float z = as_float(ix & 0xfffff000); // absx with the low 12 bits cleared
+    float r = exp(mad(-z, z, -0.5625f)) * exp(mad(z-absx, z+absx, q));
+    r = 1.0f - MATH_DIVIDE(r,  absx);
+    ret = absx < 6.0f ? r : ret;
+
+    r = erx + q;
+    ret = absx < 1.25f ? r : ret;
+
+    ret = as_float((hx & 0x80000000) | as_int(ret)); // copy the sign bit of x onto the result
+
+    r = mad(x, q, x); // uses signed x, so no separate sign fix-up is needed
+    ret = absx < 0.84375f ? r : ret;
+
+    // Prevent underflow
+    r = 0.125f * mad(8.0f, x, efx8 * x);
+    ret = absx < 0x1.0p-28f ? r : ret;
+
+    ret = isnan(x) ? x : ret;
+
+    return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, erf, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* double erf(double x)
+ * double erfc(double x)
+ *                             x
+ *                      2      |\
+ *     erf(x)  =  ---------  | exp(-t*t)dt
+ *                    sqrt(pi) \|
+ *                             0
+ *
+ *     erfc(x) =  1-erf(x)
+ *  Note that
+ *                erf(-x) = -erf(x)
+ *                erfc(-x) = 2 - erfc(x)
+ *
+ * Method:
+ *        1. For |x| in [0, 0.84375]
+ *            erf(x)  = x + x*R(x^2)
+ *          erfc(x) = 1 - erf(x)           if x in [-.84375,0.25]
+ *                  = 0.5 + ((0.5-x)-x*R)  if x in [0.25,0.84375]
+ *           where R = P/Q where P is an odd poly of degree 8 and
+ *           Q is an odd poly of degree 10.
+ *                                                 -57.90
+ *                        | R - (erf(x)-x)/x | <= 2
+ *
+ *
+ *           Remark. The formula is derived by noting
+ *          erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....)
+ *           and that
+ *          2/sqrt(pi) = 1.128379167095512573896158903121545171688
+ *           is close to one. The interval is chosen because the fix
+ *           point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is
+ *           near 0.6174), and by some experiment, 0.84375 is chosen to
+ *            guarantee the error is less than one ulp for erf.
+ *
+ *      2. For |x| in [0.84375,1.25], let s = |x| - 1, and
+ *         c = 0.84506291151 rounded to single (24 bits)
+ *                 erf(x)  = sign(x) * (c  + P1(s)/Q1(s))
+ *                 erfc(x) = (1-c)  - P1(s)/Q1(s) if x > 0
+ *                          1+(c+P1(s)/Q1(s))    if x < 0
+ *                 |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06
+ *           Remark: here we use the taylor series expansion at x=1.
+ *                erf(1+s) = erf(1) + s*Poly(s)
+ *                         = 0.845.. + P1(s)/Q1(s)
+ *           That is, we use rational approximation to approximate
+ *                        erf(1+s) - (c = (single)0.84506291151)
+ *           Note that |P1/Q1|< 0.078 for x in [0.84375,1.25]
+ *           where
+ *                P1(s) = degree 6 poly in s
+ *                Q1(s) = degree 6 poly in s
+ *
+ *      3. For x in [1.25,1/0.35(~2.857143)],
+ *                 erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1)
+ *                 erf(x)  = 1 - erfc(x)
+ *           where
+ *                R1(z) = degree 7 poly in z, (z=1/x^2)
+ *                S1(z) = degree 8 poly in z
+ *
+ *      4. For x in [1/0.35,28]
+ *                 erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0
+ *                        = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0
+ *                        = 2.0 - tiny                (if x <= -6)
+ *                 erf(x)  = sign(x)*(1.0 - erfc(x)) if x < 6, else
+ *                 erf(x)  = sign(x)*(1.0 - tiny)
+ *           where
+ *                R2(z) = degree 6 poly in z, (z=1/x^2)
+ *                S2(z) = degree 7 poly in z
+ *
+ *      Note1:
+ *           To compute exp(-x*x-0.5625+R/S), let s be a single
+ *           precision number and s := x; then
+ *                -x*x = -s*s + (s-x)*(s+x)
+ *                exp(-x*x-0.5625+R/S) =
+ *                        exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S);
+ *      Note2:
+ *           Here 3 and 4 make use of the asymptotic series
+ *                          exp(-x*x)
+ *                erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) )
+ *                          x*sqrt(pi)
+ *           We use rational approximation to approximate
+ *              g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625
+ *           Here is the error bound for R1/S1 and R2/S2
+ *              |R1/S1 - f(x)|  < 2**(-62.57)
+ *              |R2/S2 - f(x)|  < 2**(-61.52)
+ *
+ *      5. For inf > x >= 28
+ *                 erf(x)  = sign(x) *(1 - tiny)  (raise inexact)
+ *                 erfc(x) = tiny*tiny (raise underflow) if x > 0
+ *                        = 2 - tiny if x<0
+ *
+ *      7. Special case:
+ *                 erf(0)  = 0, erf(inf)  = 1, erf(-inf) = -1,
+ *                 erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2,
+ *                   erfc/erf(NaN) is NaN
+ */
+
+#define AU0 -9.86494292470009928597e-03
+#define AU1 -7.99283237680523006574e-01
+#define AU2 -1.77579549177547519889e+01
+#define AU3 -1.60636384855821916062e+02
+#define AU4 -6.37566443368389627722e+02
+#define AU5 -1.02509513161107724954e+03
+#define AU6 -4.83519191608651397019e+02
+
+#define AV1  3.03380607434824582924e+01
+#define AV2  3.25792512996573918826e+02
+#define AV3  1.53672958608443695994e+03
+#define AV4  3.19985821950859553908e+03
+#define AV5  2.55305040643316442583e+03
+#define AV6  4.74528541206955367215e+02
+#define AV7 -2.24409524465858183362e+01
+
+#define BU0 -9.86494403484714822705e-03
+#define BU1 -6.93858572707181764372e-01
+#define BU2 -1.05586262253232909814e+01
+#define BU3 -6.23753324503260060396e+01
+#define BU4 -1.62396669462573470355e+02
+#define BU5 -1.84605092906711035994e+02
+#define BU6 -8.12874355063065934246e+01
+#define BU7 -9.81432934416914548592e+00
+
+#define BV1  1.96512716674392571292e+01
+#define BV2  1.37657754143519042600e+02
+#define BV3  4.34565877475229228821e+02
+#define BV4  6.45387271733267880336e+02
+#define BV5  4.29008140027567833386e+02
+#define BV6  1.08635005541779435134e+02
+#define BV7  6.57024977031928170135e+00
+#define BV8 -6.04244152148580987438e-02
+
+#define CU0 -2.36211856075265944077e-03
+#define CU1  4.14856118683748331666e-01
+#define CU2 -3.72207876035701323847e-01
+#define CU3  3.18346619901161753674e-01
+#define CU4 -1.10894694282396677476e-01
+#define CU5  3.54783043256182359371e-02
+#define CU6 -2.16637559486879084300e-03
+
+#define CV1  1.06420880400844228286e-01
+#define CV2  5.40397917702171048937e-01
+#define CV3  7.18286544141962662868e-02
+#define CV4  1.26171219808761642112e-01
+#define CV5  1.36370839120290507362e-02
+#define CV6  1.19844998467991074170e-02
+
+#define DU0  1.28379167095512558561e-01
+#define DU1 -3.25042107247001499370e-01
+#define DU2 -2.84817495755985104766e-02
+#define DU3 -5.77027029648944159157e-03
+#define DU4 -2.37630166566501626084e-05
+
+#define DV1  3.97917223959155352819e-01
+#define DV2  6.50222499887672944485e-02
+#define DV3  5.08130628187576562776e-03
+#define DV4  1.32494738004321644526e-04
+#define DV5 -3.96022827877536812320e-06
+
+_CLC_OVERLOAD _CLC_DEF double erf(double y) { // y is the argument; x below is |y|
+    double x = fabs(y);
+    double x2 = x * x;
+    double xm1 = x - 1.0;
+
+    // Poly variable: x^2, x - 1, or 1/x^2, depending on the range of x
+    double t = 1.0 / x2;
+    t = x < 1.25 ? xm1 : t;
+    t = x < 0.84375 ? x2 : t;
+
+    double u, ut, v, vt;
+
+    // Evaluate rational poly
+    // XXX We need to see if we can grab 16 coefficients from a table
+    // faster than evaluating 3 of the poly pairs
+    // if (x < 6.0)
+    u = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AU6, AU5), AU4), AU3), AU2), AU1), AU0);
+    v = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AV7, AV6), AV5), AV4), AV3), AV2), AV1);
+
+    ut = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BU7, BU6), BU5), BU4), BU3), BU2), BU1), BU0);
+    vt = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BV8, BV7), BV6), BV5), BV4), BV3), BV2), BV1);
+    u = x < 0x1.6db6ep+1 ? ut : u; // threshold is approximately 1/0.35
+    v = x < 0x1.6db6ep+1 ? vt : v;
+
+    ut = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, CU6, CU5), CU4), CU3), CU2), CU1), CU0);
+    vt = fma(t, fma(t, fma(t, fma(t, fma(t, CV6, CV5), CV4), CV3), CV2), CV1);
+    u = x < 1.25 ? ut : u;
+    v = x < 1.25 ? vt : v;
+
+    ut = fma(t, fma(t, fma(t, fma(t, DU4, DU3), DU2), DU1), DU0);
+    vt = fma(t, fma(t, fma(t, fma(t, DV5, DV4), DV3), DV2), DV1);
+    u = x < 0.84375 ? ut : u;
+    v = x < 0.84375 ? vt : v;
+
+    v = fma(t, v, 1.0);
+
+    // Compute rational approximation
+    double q = u / v;
+
+    // Compute results
+    double z = as_double(as_long(x) & 0xffffffff00000000L); // x with the low 32 bits cleared
+    double r = exp(-z * z - 0.5625) * exp((z - x) * (z + x) + q);
+    r = 1.0 - r / x;
+
+    double ret = x < 6.0 ? r : 1.0;
+
+    r = 8.45062911510467529297e-01 + q;
+    ret = x < 1.25 ? r : ret;
+
+    q = x < 0x1.0p-28 ? 1.28379167095512586316e-01 : q; // tiny x: use a constant q
+
+    r = fma(x, q, x);
+    ret = x < 0.84375 ? r : ret;
+
+    ret = isnan(x) ? x : ret;
+
+    return y < 0.0 ? -ret : ret; // negate the result for negative arguments
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, erf, double);
+
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/expm1.cl libclc-0.2.0+git20170213/generic/lib/math/expm1.cl
--- libclc-0.2.0+git20150813/generic/lib/math/expm1.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/expm1.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,142 @@
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+/* Refer to the exp routine for the underlying algorithm */
+
+_CLC_OVERLOAD _CLC_DEF float expm1(float x) { // scalar float expm1(x); special and out-of-range inputs are fixed up by the selects at the end
+    const float X_MAX = 0x1.62e42ep+6f; // 128*log2 : 88.722839111673
+    const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184
+
+    const float R_64_BY_LOG2 = 0x1.715476p+6f;     // 64/log2 : 92.332482616893657
+    const float R_LOG2_BY_64_LD = 0x1.620000p-7f;  // log2/64 lead: 0.0108032227
+    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
+    // n is x * 64/log2 converted to int; fn is n converted back to float
+    uint xi = as_uint(x); // NOTE(review): xi is never read in this function
+    int n = (int)(x * R_64_BY_LOG2);
+    float fn = (float)n;
+    // Split n: j is its low 6 bits (table index below), m is n shifted right by 6
+    int j = n & 0x3f;
+    int m = n >> 6;
+    // r = x - fn * (log2/64), with log2/64 applied as separate lead and tail terms
+    float r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x));
+
+    // Truncated Taylor series: r + r^2/2 + r^3/6 + r^4/24
+    float z2 = mad(r*r, mad(r, mad(r, 0x1.555556p-5f,  0x1.555556p-3f), 0.5f), r);
+    // m2 is the bit pattern (m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32 read as a float (presumably 2^m)
+    float m2 = as_float((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+    float2 tv = USE_TABLE(exp_tbl_ep, j);
+    // Scale both components of the table entry by m2
+    float two_to_jby64_h = tv.s0 * m2;
+    float two_to_jby64_t = tv.s1 * m2;
+    float two_to_jby64 = two_to_jby64_h + two_to_jby64_t;
+    // Combine as z2 * (h + t) + t + (h - 1)
+    z2 = mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f);
+    // Make subnormals work
+    z2 = x == 0.f ? x : z2; // zero input is returned as-is, so -0.0f stays -0.0f
+    z2 = x < X_MIN | m < -24 ? -1.0f : z2; // result is -1 below X_MIN or when m < -24
+    z2 = x > X_MAX ? as_float(PINFBITPATT_SP32) : z2; // above X_MAX: PINFBITPATT_SP32 (presumably +Inf)
+    z2 = isnan(x) ? x : z2; // NaN input is returned unchanged
+
+    return z2;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, expm1, float)
+
+#ifdef cl_khr_fp64
+
+#include "exp_helper.h"
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double expm1(double x) { // scalar double expm1(x): several candidate results are computed and one is selected at the end
+    const double max_expm1_arg = 709.8;
+    const double min_expm1_arg = -37.42994775023704;
+    const double log_OnePlus_OneByFour = 0.22314355131420976;   //0x3FCC8FF7C79A9A22 = log(1+1/4)
+    const double log_OneMinus_OneByFour = -0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4)
+    const double sixtyfour_by_lnof2 = 92.33248261689366;        //0x40571547652b82fe
+    const double lnof2_by_64_head = 0.010830424696223417;       //0x3f862e42fefa0000
+    const double lnof2_by_64_tail = 2.5728046223276688e-14;     //0x3d1cf79abc9e3b39
+
+    // First, assume log(1-1/4) < x < log(1+1/4) i.e  -0.28768 < x < 0.22314
+    double u = as_double(as_ulong(x) & 0xffffffffff000000UL); // x with its low 24 bits cleared
+    double v = x - u; // the part of x that the mask removed
+    double y = u * u * 0.5;
+    double z = v * (x + u) * 0.5;
+    // q = x^3 * P(x), where P is the polynomial evaluated by the nested fma chain (Horner form)
+    double q = fma(x,
+	           fma(x,
+		       fma(x,
+			   fma(x,
+			       fma(x,
+				   fma(x,
+				       fma(x,
+					   fma(x,2.4360682937111612e-8, 2.7582184028154370e-7),
+					   2.7558212415361945e-6),
+				       2.4801576918453420e-5),
+				   1.9841269447671544e-4),
+			       1.3888888890687830e-3),
+			   8.3333333334012270e-3),
+		       4.1666666666665560e-2),
+		   1.6666666666666632e-1);
+    q *= x * x * x;
+    // Two summation orders for the same terms (u + v + y + z + q); z1g is chosen once y >= 2^-7
+    double z1g = (u + y) + (q + (v + z));
+    double z1 = x + (y + (q + z));
+    z1 = y >= 0x1.0p-7 ? z1g : z1;
+
+    // Now assume outside interval around 0
+    int n = (int)(x * sixtyfour_by_lnof2);
+    int j = n & 0x3f; // low 6 bits of n: table index
+    int m = n >> 6;   // n shifted right by 6: used to build the scale factors below
+
+    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+    double f1 = tv.s0;
+    double f2 = tv.s1;
+    double f = f1 + f2;
+
+    double dn = -n; // negated so the fma chain below subtracts n * (ln2/64) from x, head then tail
+    double r = fma(dn, lnof2_by_64_tail, fma(dn, lnof2_by_64_head, x));
+
+    q = fma(r,
+	    fma(r,
+		fma(r,
+		    fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+		    4.16666666662260795726e-02),
+		1.66666666665260878863e-01),
+	     5.00000000000000008883e-01);
+    q = fma(r*r, q, r);
+    // twopm / twopmm: bit patterns with (EXPBIAS_DP64 + m) / (EXPBIAS_DP64 - m) shifted by EXPSHIFTBITS_DP64 (presumably 2^m and 2^-m)
+    double twopm = as_double((long)(m + EXPBIAS_DP64) << EXPSHIFTBITS_DP64);
+    double twopmm = as_double((long)(EXPBIAS_DP64 - m) << EXPSHIFTBITS_DP64);
+
+    // Computations for m > 52, including where result is close to Inf
+    ulong uval = as_ulong(0x1.0p+1023 * (f1 + (f * q + (f2))));
+    int e = (int)(uval >> EXPSHIFTBITS_DP64) + 1; // shifted-down bits of uval, plus one
+
+    double zme1024 = as_double(((long)e << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64));
+    zme1024 = e == 2047 ? as_double(PINFBITPATT_DP64) : zme1024;
+
+    double zmg52 = twopm * (f1 + fma(f, q, f2 - twopmm));
+    zmg52 = m == 1024 ? zme1024 : zmg52; // m == 1024 uses the separately rebuilt value above
+
+    // For m < 53
+    double zml53 = twopm * ((f1 - twopmm) + fma(f1, q, f2*(1.0 + q)));
+
+    // For m < -7
+    double zmln7 = fma(twopm,  f1 + fma(f, q, f2), -1.0);
+    // Pick the candidate for the range x falls in; later assignments override earlier ones
+    z = m < 53 ? zml53 : zmg52;
+    z = m < -7 ? zmln7 : z;
+    z = x > log_OneMinus_OneByFour & x < log_OnePlus_OneByFour ? z1 : z;
+    z = x > max_expm1_arg ? as_double(PINFBITPATT_DP64) : z;
+    z = x < min_expm1_arg ? -1.0 : z;
+
+    return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, expm1, double)
+
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/fdim.cl libclc-0.2.0+git20170213/generic/lib/math/fdim.cl
--- libclc-0.2.0+git20150813/generic/lib/math/fdim.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/fdim.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <fdim.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/fdim.inc libclc-0.2.0+git20170213/generic/lib/math/fdim.inc
--- libclc-0.2.0+git20150813/generic/lib/math/fdim.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/fdim.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __CLC_FPSIZE == 32
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fdim(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+    /* A NaN in either operand gives the QNANBITPATT_SP32 pattern; otherwise x - y clamped below at zero. */
+    const int any_nan = __builtin_isnan(x) || __builtin_isnan(y);
+    return any_nan ? as_float(QNANBITPATT_SP32) : fmax(x - y, 0.0f);
+}
+#define __CLC_FDIM_VEC(width) \
+_CLC_OVERLOAD _CLC_DEF float##width fdim(float##width x, float##width y) { \
+    /* Determine if x or y is NaN. */ \
+    /* Vector true is -1, i.e. all-bits-set, and NaN==NaN is false. */ \
+    /* If neither is NaN, ~((x==x) & (y==y)) is 0 (i.e. ~(-1)) and so is n; otherwise n is QNANBITPATT_SP32. */ \
+    int##width n = ~((x == x) & (y == y)) & QNANBITPATT_SP32; \
+    /* Calculate x-y if x>y, otherwise positive 0, again taking */ \
+    /* advantage of vector true being all-bits-set. */ \
+    int##width r = (x > y) & as_int##width(x - y); \
+    return as_float##width(n | r); \
+}
+__CLC_FDIM_VEC(2)
+__CLC_FDIM_VEC(3)
+__CLC_FDIM_VEC(4)
+__CLC_FDIM_VEC(8)
+__CLC_FDIM_VEC(16)
+#undef __CLC_FDIM_VEC
+#endif
+#endif
+
+#if __CLC_FPSIZE == 64
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fdim(__CLC_GENTYPE x, private __CLC_GENTYPE y) {
+    long diff_bits = -(x > y) & as_long(x - y);                /* bits of x - y when x > y, else 0 */
+    long nan_bits = -(isnan(x) | isnan(y)) & QNANBITPATT_DP64; /* QNANBITPATT_DP64 when either input is NaN, else 0 */
+    return as_double(diff_bits | nan_bits);
+}
+#define __CLC_FDIM_VEC(width) \
+_CLC_OVERLOAD _CLC_DEF double##width fdim(double##width x, double##width y) { \
+    /* Vector comparisons give -1 (all bits set) or 0 per lane, so they are used directly as bit masks. */ \
+    long##width diff_bits = (x > y) & as_long##width(x - y); \
+    long##width nan_bits = ~((x == x) & (y == y)) & QNANBITPATT_DP64; \
+    return as_double##width(diff_bits | nan_bits); \
+}
+__CLC_FDIM_VEC(2)
+__CLC_FDIM_VEC(3)
+__CLC_FDIM_VEC(4)
+__CLC_FDIM_VEC(8)
+__CLC_FDIM_VEC(16)
+#undef __CLC_FDIM_VEC
+#endif
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/frexp.cl libclc-0.2.0+git20170213/generic/lib/math/frexp.cl
--- libclc-0.2.0+git20150813/generic/lib/math/frexp.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/frexp.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <frexp.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/frexp.inc libclc-0.2.0+git20170213/generic/lib/math/frexp.inc
--- libclc-0.2.0+git20150813/generic/lib/math/frexp.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/frexp.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __CLC_FPSIZE == 32
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(float x, private int *ep) {
+    const int bits = as_int(x);
+    int mag = bits & 0x7fffffff;
+    const int is_subnormal = mag > 0 & mag < 0x00800000;
+    // Subnormals: OR in an exponent field and subtract 2^-100, which scales by 2^26 without multiplying
+    const float scaled = as_float(mag | 0x0d800000) - 0x1.0p-100F;
+    mag = is_subnormal ? as_int(scaled) : mag;
+    const int exponent = (mag >> 23) - 126 - (is_subnormal ? 26 : 0);
+    const int passthrough = mag == 0 | exponent == 129; // zero, or exponent field 255 (129 + 126)
+    const int frac = (bits & 0x80000000) | 0x3f000000 | (mag & 0x007fffff);
+    *ep = passthrough ? 0 : exponent;
+    return passthrough ? x : as_float(frac);
+}
+#define __CLC_FREXP_VEC(width) \
+_CLC_OVERLOAD _CLC_DEF float##width frexp(float##width x, private int##width *ep) { \
+    int##width bits = as_int##width(x); \
+    int##width mag = bits & 0x7fffffff; \
+    int##width subn = mag > 0 & mag < 0x00800000; \
+    /* subnormal lanes: OR in an exponent field and subtract 2^-100, scaling by 2^26 without a multiply */ \
+    float##width scaled = as_float##width(mag | 0x0d800000) - 0x1.0p-100F; \
+    mag = bitselect(mag, as_int##width(scaled), subn); \
+    int##width expo = (mag >> 23) - 126 - bitselect((int##width)0, (int##width)26, subn); \
+    int##width keep = mag == (int##width)0 | expo == (int##width)129; /* lanes returned unchanged with exponent 0 */ \
+    bits = (bits & (int##width)0x80000000) | (int##width)0x3f000000 | (mag & 0x007fffff); \
+    *ep = bitselect(expo, (int##width)0, keep); \
+    return bitselect(as_float##width(bits), x, as_float##width(keep)); \
+}
+__CLC_FREXP_VEC(2)
+__CLC_FREXP_VEC(3)
+__CLC_FREXP_VEC(4)
+__CLC_FREXP_VEC(8)
+__CLC_FREXP_VEC(16)
+#undef __CLC_FREXP_VEC
+#endif
+#endif
+
+#if __CLC_FPSIZE == 64
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, private __CLC_INTN *ep) {
+    const long bits = as_long(x);
+    long mag = bits & 0x7fffffffffffffffL;
+    const int is_subnormal = mag > 0 & mag < 0x0010000000000000L;
+    // Subnormals: OR in an exponent field and subtract 2^-968, which scales by 2^54 without multiplying
+    const double scaled = as_double(mag | 0x0370000000000000L) - 0x1.0p-968;
+    mag = is_subnormal ? as_long(scaled) : mag;
+    const int exponent = (int)(mag >> 52) - 1022 - (is_subnormal ? 54 : 0);
+    const int passthrough = mag == 0 | exponent == 1025; // zero, or exponent field 2047 (1025 + 1022)
+    const long frac = (bits & 0x8000000000000000L) | 0x3fe0000000000000L | (mag & 0x000fffffffffffffL);
+    *ep = passthrough ? 0 : exponent;
+    return passthrough ? x : as_double(frac);
+}
+#define __CLC_FREXP_VEC(width) \
+_CLC_OVERLOAD _CLC_DEF double##width frexp(double##width x, private int##width *ep) { \
+    long##width i = as_long##width(x); \
+    long##width ai = i & 0x7fffffffffffffffL; /* magnitude bits (sign cleared) */ \
+    long##width d = ai > 0 & ai < 0x0010000000000000L; /* -1 in subnormal lanes, 0 elsewhere */ \
+    /* scale subnormal by 2^54 without multiplying */ \
+    double##width s = as_double##width(ai | 0x0370000000000000L) - 0x1.0p-968; \
+    ai = bitselect(ai, as_long##width(s), d); \
+    int##width e = convert_int##width(ai >> 52) - 1022 - bitselect((int##width)0, (int##width)54, convert_int##width(d)); \
+    int##width t = convert_int##width(ai == (long##width)0) | (e == (int##width)1025); /* zero, or exponent field 2047 (1025 + 1022), not the float value 129 */ \
+    i = (i & (long##width)0x8000000000000000L) | (long##width)0x3fe0000000000000L | (ai & 0x000fffffffffffffL); \
+    *ep = bitselect(e, (int##width)0, t); /* exponent 0 for the lanes returned unchanged */ \
+    return bitselect(as_double##width(i), x, as_double##width(convert_long##width(t))); \
+}
+__CLC_FREXP_VEC(2)
+__CLC_FREXP_VEC(3)
+__CLC_FREXP_VEC(4)
+__CLC_FREXP_VEC(8)
+__CLC_FREXP_VEC(16)
+#undef __CLC_FREXP_VEC
+#endif
+#endif
+
+#define __CLC_FREXP_DEF(addrspace) \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, addrspace __CLC_INTN *iptr) { \
+    __CLC_INTN exponent; /* local temporary whose address is handed to the nested frexp call */ \
+    const __CLC_GENTYPE fraction = frexp(x, &exponent); \
+    *iptr = exponent; /* copy the exponent out through the caller's pointer */ \
+    return fraction; \
+}
+
+__CLC_FREXP_DEF(local);
+__CLC_FREXP_DEF(global);
+
+#undef __CLC_FREXP_DEF
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/ilogb.cl libclc-0.2.0+git20170213/generic/lib/math/ilogb.cl
--- libclc-0.2.0+git20150813/generic/lib/math/ilogb.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/ilogb.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "../clcmacro.h"
+#include "math.h"
+
+_CLC_OVERLOAD _CLC_DEF int ilogb(float x) {
+    const uint bits = as_uint(x);
+    const uint mag = bits & EXSIGNBIT_SP32;
+    const int normal_exp = (int) (mag >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    const int subnormal_exp = -118 - (int) clz(bits & MANTBITS_SP32); // from the leading-zero count of the masked bits
+    int result = mag < 0x00800000U ? subnormal_exp : normal_exp;
+    result = mag > EXPBITS_SP32 | mag == 0 ? 0x80000000 : result;     // above EXPBITS_SP32, or exactly zero
+    result = mag == EXPBITS_SP32 ? 0x7fffffff : result;               // exactly EXPBITS_SP32
+    return result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ilogb, float);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF int ilogb(double x) {
+    const ulong bits = as_ulong(x);
+    const ulong mag = bits & ~SIGNBIT_DP64;
+    const int subnormal_exp = -1011 - (int) clz(mag & MANTBITS_DP64); // from the leading-zero count of the masked bits
+    const int normal_exp = (int) (mag >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+    int result = mag < 0x0010000000000000UL ? subnormal_exp : normal_exp;
+    result = mag > 0x7ff0000000000000UL | mag == 0UL ? 0x80000000 : result; // NaN bit patterns, or zero
+    result = mag == 0x7ff0000000000000UL ? 0x7fffffff : result;             // infinity
+    return result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ilogb, double);
+
+#endif // cl_khr_fp64
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/lgamma.cl libclc-0.2.0+git20170213/generic/lib/math/lgamma.cl
--- libclc-0.2.0+git20150813/generic/lib/math/lgamma.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/lgamma.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016 Aaron Watry <awatry@gmail.com>
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float lgamma(float x) {
+    int ignored_sign; // lgamma_r writes a sign value here; lgamma discards it
+    return lgamma_r(x, &ignored_sign);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, lgamma, float)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double lgamma(double x) {
+    int ignored_sign; // lgamma_r writes a sign value here; lgamma discards it
+    return lgamma_r(x, &ignored_sign);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, lgamma, double)
+
+#endif
\ No newline at end of file
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/lgamma_r.cl libclc-0.2.0+git20170213/generic/lib/math/lgamma_r.cl
--- libclc-0.2.0+git20150813/generic/lib/math/lgamma_r.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/lgamma_r.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,11 @@
+#include <clc/clc.h>
+
+#include "../clcmacro.h"
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <lgamma_r.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/lgamma_r.inc libclc-0.2.0+git20170213/generic/lib/math/lgamma_r.inc
--- libclc-0.2.0+git20150813/generic/lib/math/lgamma_r.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/lgamma_r.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry <awatry@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __CLC_FPSIZE == 32
+#ifdef __CLC_SCALAR
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+#define pi_f   3.1415927410e+00f        /* 0x40490fdb */
+
+#define a0_f   7.7215664089e-02f        /* 0x3d9e233f */
+#define a1_f   3.2246702909e-01f        /* 0x3ea51a66 */
+#define a2_f   6.7352302372e-02f        /* 0x3d89f001 */
+#define a3_f   2.0580807701e-02f        /* 0x3ca89915 */
+#define a4_f   7.3855509982e-03f        /* 0x3bf2027e */
+#define a5_f   2.8905137442e-03f        /* 0x3b3d6ec6 */
+#define a6_f   1.1927076848e-03f        /* 0x3a9c54a1 */
+#define a7_f   5.1006977446e-04f        /* 0x3a05b634 */
+#define a8_f   2.2086278477e-04f        /* 0x39679767 */
+#define a9_f   1.0801156895e-04f        /* 0x38e28445 */
+#define a10_f  2.5214456400e-05f        /* 0x37d383a2 */
+#define a11_f  4.4864096708e-05f        /* 0x383c2c75 */
+
+#define tc_f   1.4616321325e+00f        /* 0x3fbb16c3 */
+
+#define tf_f  -1.2148628384e-01f        /* 0xbdf8cdcd */
+/* tt -(tail of tf) */
+#define tt_f   6.6971006518e-09f        /* 0x31e61c52 */
+
+#define t0_f   4.8383611441e-01f        /* 0x3ef7b95e */
+#define t1_f  -1.4758771658e-01f        /* 0xbe17213c */
+#define t2_f   6.4624942839e-02f        /* 0x3d845a15 */
+#define t3_f  -3.2788541168e-02f        /* 0xbd064d47 */
+#define t4_f   1.7970675603e-02f        /* 0x3c93373d */
+#define t5_f  -1.0314224288e-02f        /* 0xbc28fcfe */
+#define t6_f   6.1005386524e-03f        /* 0x3bc7e707 */
+#define t7_f  -3.6845202558e-03f        /* 0xbb7177fe */
+#define t8_f   2.2596477065e-03f        /* 0x3b141699 */
+#define t9_f  -1.4034647029e-03f        /* 0xbab7f476 */
+#define t10_f  8.8108185446e-04f        /* 0x3a66f867 */
+#define t11_f -5.3859531181e-04f        /* 0xba0d3085 */
+#define t12_f  3.1563205994e-04f        /* 0x39a57b6b */
+#define t13_f -3.1275415677e-04f        /* 0xb9a3f927 */
+#define t14_f  3.3552918467e-04f        /* 0x39afe9f7 */
+
+#define u0_f  -7.7215664089e-02f        /* 0xbd9e233f */
+#define u1_f   6.3282704353e-01f        /* 0x3f2200f4 */
+#define u2_f   1.4549225569e+00f        /* 0x3fba3ae7 */
+#define u3_f   9.7771751881e-01f        /* 0x3f7a4bb2 */
+#define u4_f   2.2896373272e-01f        /* 0x3e6a7578 */
+#define u5_f   1.3381091878e-02f        /* 0x3c5b3c5e */
+
+#define v1_f   2.4559779167e+00f        /* 0x401d2ebe */
+#define v2_f   2.1284897327e+00f        /* 0x4008392d */
+#define v3_f   7.6928514242e-01f        /* 0x3f44efdf */
+#define v4_f   1.0422264785e-01f        /* 0x3dd572af */
+#define v5_f   3.2170924824e-03f        /* 0x3b52d5db */
+
+#define s0_f  -7.7215664089e-02f        /* 0xbd9e233f */
+#define s1_f   2.1498242021e-01f        /* 0x3e5c245a */
+#define s2_f   3.2577878237e-01f        /* 0x3ea6cc7a */
+#define s3_f   1.4635047317e-01f        /* 0x3e15dce6 */
+#define s4_f   2.6642270386e-02f        /* 0x3cda40e4 */
+#define s5_f   1.8402845599e-03f        /* 0x3af135b4 */
+#define s6_f   3.1947532989e-05f        /* 0x3805ff67 */
+
+#define r1_f   1.3920053244e+00f        /* 0x3fb22d3b */
+#define r2_f   7.2193557024e-01f        /* 0x3f38d0c5 */
+#define r3_f   1.7193385959e-01f        /* 0x3e300f6e */
+#define r4_f   1.8645919859e-02f        /* 0x3c98bf54 */
+#define r5_f   7.7794247773e-04f        /* 0x3a4beed6 */
+#define r6_f   7.3266842264e-06f        /* 0x36f5d7bd */
+
+#define w0_f   4.1893854737e-01f        /* 0x3ed67f1d */
+#define w1_f   8.3333335817e-02f        /* 0x3daaaaab */
+#define w2_f  -2.7777778450e-03f        /* 0xbb360b61 */
+#define w3_f   7.9365057172e-04f        /* 0x3a500cfd */
+#define w4_f  -5.9518753551e-04f        /* 0xba1c065c */
+#define w5_f   8.3633989561e-04f        /* 0x3a5b3dd2 */
+#define w6_f  -1.6309292987e-03f        /* 0xbad5c4e8 */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE lgamma_r(float x, private int *signp) {
+    int hx = as_int(x);
+    int ix = hx & 0x7fffffff;
+    float absx = as_float(ix);
+    // Inf or NaN (exponent field all ones): sign is reported as 1 and |x| is returned, so -Inf gives +Inf and NaN stays NaN
+    if (ix >= 0x7f800000) {
+        *signp = 1;
+        return absx;
+    }
+    // Tiny |x| (below 2^-70): result is -log|x|, and the reported sign follows the sign bit of x
+    if (absx < 0x1.0p-70f) {
+        *signp = hx < 0 ? -1 : 1;
+        return -log(absx);
+    }
+
+    float r;
+
+    if (absx == 1.0f | absx == 2.0f)
+        r = 0.0f;
+
+    else if (absx < 2.0f) {
+        float y = 2.0f - absx;
+        int i = 0;
+        // Choose the reduced argument y and the approximation i (0, 1 or 2) by a chain of selects; later ones override earlier ones
+        int c = absx < 0x1.bb4c30p+0f;
+        float yt = absx - tc_f;
+        y = c ? yt : y;
+        i = c ? 1 : i;
+
+        c = absx < 0x1.3b4c40p+0f;
+        yt = absx - 1.0f;
+        y = c ? yt : y;
+        i = c ? 2 : i;
+
+        r = -log(absx);
+        yt = 1.0f - absx;
+        c = absx <= 0x1.ccccccp-1f;
+        r = c ? r : 0.0f; // the -log|x| term is kept only for |x| <= 0x1.ccccccp-1f
+        y = c ? yt : y;
+        i = c ? 0 : i;
+
+        c = absx < 0x1.769440p-1f;
+        yt = absx - (tc_f - 1.0f);
+        y = c ? yt : y;
+        i = c ? 1 : i;
+
+        c = absx < 0x1.da6610p-3f;
+        y = c ? absx : y;
+        i = c ? 2 : i;
+
+        float z, w, p1, p2, p3, p;
+        switch (i) {
+            case 0: // polynomial in y using the a*_f coefficients
+                z = y * y;
+                p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10_f, a8_f), a6_f), a4_f), a2_f), a0_f);
+                p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11_f, a9_f), a7_f), a5_f), a3_f), a1_f);
+                p = mad(y, p1, p2);
+                r += mad(y, -0.5f, p);
+                break;
+            case 1: // polynomial in y using the t*_f coefficients, offset by tf_f with tail tt_f
+                z = y * y;
+                w = z * y;
+                p1 = mad(w, mad(w, mad(w, mad(w, t12_f, t9_f), t6_f), t3_f), t0_f);
+                p2 = mad(w, mad(w, mad(w, mad(w, t13_f, t10_f), t7_f), t4_f), t1_f);
+                p3 = mad(w, mad(w, mad(w, mad(w, t14_f, t11_f), t8_f), t5_f), t2_f);
+                p = mad(z, p1, -mad(w, -mad(y, p3, p2), tt_f));
+                r += tf_f + p;
+                break;
+            case 2: // rational approximation: u*_f numerator over v*_f denominator
+                p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5_f, u4_f), u3_f), u2_f), u1_f), u0_f);
+                p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5_f, v4_f), v3_f), v2_f), v1_f), 1.0f);
+                r += mad(y, -0.5f, MATH_DIVIDE(p1, p2));
+                break;
+        }
+    } else if (absx < 8.0f) {
+        int i = (int) absx;
+        float y = absx - (float) i; // fractional part of |x|
+        float p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6_f, s5_f), s4_f), s3_f), s2_f), s1_f), s0_f);
+        float q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6_f, r5_f), r4_f), r3_f), r2_f), r1_f), 1.0f);
+        r = mad(y, 0.5f, MATH_DIVIDE(p, q));
+
+        float y6 = y + 6.0f;
+        float y5 = y + 5.0f;
+        float y4 = y + 4.0f;
+        float y3 = y + 3.0f;
+        float y2 = y + 2.0f;
+        // z = product of (y + k) for k = 2 .. i-1; its log is added to r below
+        float z = 1.0f;
+        z *= i > 6 ? y6 : 1.0f;
+        z *= i > 5 ? y5 : 1.0f;
+        z *= i > 4 ? y4 : 1.0f;
+        z *= i > 3 ? y3 : 1.0f;
+        z *= i > 2 ? y2 : 1.0f;
+
+        r += log(z);
+    } else if (absx < 0x1.0p+58f) {
+        float z = 1.0f / absx;
+        float y = z * z;
+        float w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6_f, w5_f), w4_f), w3_f), w2_f), w1_f), w0_f);
+        r = mad(absx - 0.5f, log(absx) - 1.0f, w);
+    } else
+        // 2**58 <= x <= Inf
+        r = absx * (log(absx) - 1.0f);
+
+    int s = 1;
+    // Negative x: r becomes log(pi / |t*x|) - r with t = sinpi(x); t == 0 selects PINFBITPATT_SP32 and t < 0 makes the sign -1
+    if (x < 0.0f) {
+        float t = sinpi(x);
+        r = log(pi_f / fabs(t * x)) - r;
+        r = t == 0.0f ? as_float(PINFBITPATT_SP32) : r;
+        s = t < 0.0f ? -1 : s;
+    }
+
+    *signp = s;
+    return r;
+}
+
+_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, lgamma_r, float, private, int)
+
+#endif
+#endif
+
+#if __CLC_FPSIZE == 64
+#ifdef __CLC_SCALAR
+// ====================================================
+// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+//
+// Developed at SunPro, a Sun Microsystems, Inc. business.
+// Permission to use, copy, modify, and distribute this
+// software is freely granted, provided that this notice
+// is preserved.
+// ====================================================
+
+// lgamma_r(x, i)
+// Reentrant version of the logarithm of the Gamma function
+// with a user-provided pointer for the sign of Gamma(x).
+//
+// Method:
+//   1. Argument Reduction for 0 < x <= 8
+//      Since gamma(1+s)=s*gamma(s), for x in [0,8], we may
+//      reduce x to a number in [1.5,2.5] by
+//              lgamma(1+s) = log(s) + lgamma(s)
+//      for example,
+//              lgamma(7.3) = log(6.3) + lgamma(6.3)
+//                          = log(6.3*5.3) + lgamma(5.3)
+//                          = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3)
+//   2. Polynomial approximation of lgamma around its
+//      minimum ymin=1.461632144968362245 to maintain monotonicity.
+//      On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use
+//              Let z = x-ymin;
+//              lgamma(x) = -1.214862905358496078218 + z^2*poly(z)
+//      where
+//              poly(z) is a 14 degree polynomial.
+//   2. Rational approximation in the primary interval [2,3]
+//      We use the following approximation:
+//              s = x-2.0;
+//              lgamma(x) = 0.5*s + s*P(s)/Q(s)
+//      with accuracy
+//              |P/Q - (lgamma(x)-0.5s)| < 2**-61.71
+//      Our algorithms are based on the following observation
+//
+//                             zeta(2)-1    2    zeta(3)-1    3
+// lgamma(2+s) = s*(1-Euler) + --------- * s  -  --------- * s  + ...
+//                                 2                 3
+//
+//      where Euler = 0.5771... is the Euler constant, which is very
+//      close to 0.5.
+//
+//   3. For x>=8, we have
+//      lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+....
+//      (better formula:
+//         lgamma(x)~(x-0.5)*(log(x)-1)-.5*(log(2pi)-1) + ...)
+//      Let z = 1/x, then we approximate
+//              f(z) = lgamma(x) - (x-0.5)(log(x)-1)
+//      by
+//                                  3       5             11
+//              w = w0 + w1*z + w2*z  + w3*z  + ... + w6*z
+//      where
+//              |w - f(z)| < 2**-58.74
+//
+//   4. For negative x, since (G is gamma function)
+//              -x*G(-x)*G(x) = pi/sin(pi*x),
+//      we have
+//              G(x) = pi/(sin(pi*x)*(-x)*G(-x))
+//      since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0
+//      Hence, for x<0, signgam = sign(sin(pi*x)) and
+//              lgamma(x) = log(|Gamma(x)|)
+//                        = log(pi/(|x*sin(pi*x)|)) - lgamma(-x);
+//      Note: one should avoid compute pi*(-x) directly in the
+//            computation of sin(pi*(-x)).
+//
+//   5. Special Cases
+//              lgamma(2+s) ~ s*(1-Euler) for tiny s
+//              lgamma(1)=lgamma(2)=0
+//              lgamma(x) ~ -log(x) for tiny x
+//              lgamma(0) = lgamma(inf) = inf
+//              lgamma(-integer) = +-inf
+//
+#define pi 3.14159265358979311600e+00	/* 0x400921FB, 0x54442D18; used by the x < 0 reflection step in lgamma_r */
+// a0..a11: polynomial coefficients for case 0 of the absx < 2 branch in lgamma_r.
+#define a0 7.72156649015328655494e-02	/* 0x3FB3C467, 0xE37DB0C8 */
+#define a1 3.22467033424113591611e-01	/* 0x3FD4A34C, 0xC4A60FAD */
+#define a2 6.73523010531292681824e-02	/* 0x3FB13E00, 0x1A5562A7 */
+#define a3 2.05808084325167332806e-02	/* 0x3F951322, 0xAC92547B */
+#define a4 7.38555086081402883957e-03	/* 0x3F7E404F, 0xB68FEFE8 */
+#define a5 2.89051383673415629091e-03	/* 0x3F67ADD8, 0xCCB7926B */
+#define a6 1.19270763183362067845e-03	/* 0x3F538A94, 0x116F3F5D */
+#define a7 5.10069792153511336608e-04	/* 0x3F40B6C6, 0x89B99C00 */
+#define a8 2.20862790713908385557e-04	/* 0x3F2CF2EC, 0xED10E54D */
+#define a9 1.08011567247583939954e-04	/* 0x3F1C5088, 0x987DFB07 */
+#define a10 2.52144565451257326939e-05	/* 0x3EFA7074, 0x428CFA52 */
+#define a11 4.48640949618915160150e-05	/* 0x3F07858E, 0x90A45837 */
+// tc is subtracted from absx to form y for case 1; tf is added and tt subtracted in the case 1 result.
+#define tc 1.46163214496836224576e+00	/* 0x3FF762D8, 0x6356BE3F */
+#define tf -1.21486290535849611461e-01	/* 0xBFBF19B9, 0xBCC38A42 */
+#define tt -3.63867699703950536541e-18	/* 0xBC50C7CA, 0xA48A971F */
+// t0..t14: polynomial coefficients for case 1 of the absx < 2 branch.
+#define t0 4.83836122723810047042e-01	/* 0x3FDEF72B, 0xC8EE38A2 */
+#define t1 -1.47587722994593911752e-01	/* 0xBFC2E427, 0x8DC6C509 */
+#define t2 6.46249402391333854778e-02	/* 0x3FB08B42, 0x94D5419B */
+#define t3 -3.27885410759859649565e-02	/* 0xBFA0C9A8, 0xDF35B713 */
+#define t4 1.79706750811820387126e-02	/* 0x3F9266E7, 0x970AF9EC */
+#define t5 -1.03142241298341437450e-02	/* 0xBF851F9F, 0xBA91EC6A */
+#define t6 6.10053870246291332635e-03	/* 0x3F78FCE0, 0xE370E344 */
+#define t7 -3.68452016781138256760e-03	/* 0xBF6E2EFF, 0xB3E914D7 */
+#define t8 2.25964780900612472250e-03	/* 0x3F6282D3, 0x2E15C915 */
+#define t9 -1.40346469989232843813e-03	/* 0xBF56FE8E, 0xBF2D1AF1 */
+#define t10 8.81081882437654011382e-04	/* 0x3F4CDF0C, 0xEF61A8E9 */
+#define t11 -5.38595305356740546715e-04	/* 0xBF41A610, 0x9C73E0EC */
+#define t12 3.15632070903625950361e-04	/* 0x3F34AF6D, 0x6C0EBBF7 */
+#define t13 -3.12754168375120860518e-04	/* 0xBF347F24, 0xECC38C38 */
+#define t14 3.35529192635519073543e-04	/* 0x3F35FD3E, 0xE8C2D3F4 */
+// u0..u5: numerator coefficients of the case 2 rational approximation (absx < 2 branch).
+#define u0 -7.72156649015328655494e-02	/* 0xBFB3C467, 0xE37DB0C8 */
+#define u1 6.32827064025093366517e-01	/* 0x3FE4401E, 0x8B005DFF */
+#define u2 1.45492250137234768737e+00	/* 0x3FF7475C, 0xD119BD6F */
+#define u3 9.77717527963372745603e-01	/* 0x3FEF4976, 0x44EA8450 */
+#define u4 2.28963728064692451092e-01	/* 0x3FCD4EAE, 0xF6010924 */
+#define u5 1.33810918536787660377e-02	/* 0x3F8B678B, 0xBF2BAB09 */
+// v1..v5: denominator coefficients of the case 2 rational approximation; its constant term is 1.
+#define v1 2.45597793713041134822e+00	/* 0x4003A5D7, 0xC2BD619C */
+#define v2 2.12848976379893395361e+00	/* 0x40010725, 0xA42B18F5 */
+#define v3 7.69285150456672783825e-01	/* 0x3FE89DFB, 0xE45050AF */
+#define v4 1.04222645593369134254e-01	/* 0x3FBAAE55, 0xD6537C88 */
+#define v5 3.21709242282423911810e-03	/* 0x3F6A5ABB, 0x57D0CF61 */
+// s0..s6: numerator coefficients of the rational approximation used for 2 <= absx < 8.
+#define s0 -7.72156649015328655494e-02	/* 0xBFB3C467, 0xE37DB0C8 */
+#define s1 2.14982415960608852501e-01	/* 0x3FCB848B, 0x36E20878 */
+#define s2 3.25778796408930981787e-01	/* 0x3FD4D98F, 0x4F139F59 */
+#define s3 1.46350472652464452805e-01	/* 0x3FC2BB9C, 0xBEE5F2F7 */
+#define s4 2.66422703033638609560e-02	/* 0x3F9B481C, 0x7E939961 */
+#define s5 1.84028451407337715652e-03	/* 0x3F5E26B6, 0x7368F239 */
+#define s6 3.19475326584100867617e-05	/* 0x3F00BFEC, 0xDD17E945 */
+// r1..r6: denominator coefficients of the rational approximation for 2 <= absx < 8; its constant term is 1.
+#define r1 1.39200533467621045958e+00	/* 0x3FF645A7, 0x62C4AB74 */
+#define r2 7.21935547567138069525e-01	/* 0x3FE71A18, 0x93D3DCDC */
+#define r3 1.71933865632803078993e-01	/* 0x3FC601ED, 0xCCFBDF27 */
+#define r4 1.86459191715652901344e-02	/* 0x3F9317EA, 0x742ED475 */
+#define r5 7.77942496381893596434e-04	/* 0x3F497DDA, 0xCA41A95B */
+#define r6 7.32668430744625636189e-06	/* 0x3EDEBAF7, 0xA5B38140 */
+// w0..w6: coefficients of the series in z = 1/absx used for absx >= 8.
+#define w0 4.18938533204672725052e-01	/* 0x3FDACFE3, 0x90C97D69 */
+#define w1 8.33333333333329678849e-02	/* 0x3FB55555, 0x5555553B */
+#define w2 -2.77777777728775536470e-03	/* 0xBF66C16C, 0x16B02E5C */
+#define w3 7.93650558643019558500e-04	/* 0x3F4A019F, 0x98CF38B6 */
+#define w4 -5.95187557450339963135e-04	/* 0xBF4380CB, 0x8C0FE741 */
+#define w5 8.36339918996282139126e-04	/* 0x3F4B67BA, 0x4CDAD5D1 */
+#define w6 -1.63092934096575273989e-03	/* 0xBF5AB89D, 0x0B9E43E4 */
+// Returns log(|Gamma(x)|), evaluated in double precision, and stores the sign of Gamma(x) (+1 or -1) in *ip.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, private __CLC_INTN *ip) {
+    ulong ux = as_ulong(x);
+    ulong ax = ux & EXSIGNBIT_DP64;  // presumably the bits of |x| (mask is defined elsewhere)
+    double absx = as_double(ax);
+
+    if (ax >= 0x7ff0000000000000UL) {
+        // +-Inf, NaN: report a positive sign and return the input with the masked-off bits cleared
+        *ip = 1;
+        return absx;
+    }
+
+    if (absx < 0x1.0p-70) {  // tiny |x|, zero included: lgamma(x) ~ -log(|x|)
+        *ip = ax == ux ? 1 : -1;  // +1 when masking left the bits unchanged, otherwise -1
+        return -log(absx);
+    }
+
+    // Handle rest of range: compute r for absx first, then correct it for x < 0 at the end
+    double r;
+
+    if (absx < 2.0) {
+        int i = 0;  // selects the approximation evaluated in the switch below
+        double y = 2.0 - absx;  // defaults (i = 0, y = 2 - absx) survive when no test below matches
+        // Each test below overrides i and y when it is true, so the last matching test wins.
+        int c = absx < 0x1.bb4c3p+0;  // below ~1.73: i = 1, y = absx - tc
+        double t = absx - tc;
+        i = c ? 1 : i;
+        y = c ? t : y;
+
+        c = absx < 0x1.3b4c4p+0;  // below ~1.23: i = 2, y = absx - 1
+        t = absx - 1.0;
+        i = c ? 2 : i;
+        y = c ? t : y;
+
+        c = absx <= 0x1.cccccp-1;  // at or below ~0.9: r starts at -log(absx), i = 0, y = 1 - absx
+        t = -log(absx);
+        r = c ? t : 0.0;  // r is 0 for the upper part of the range
+        t = 1.0 - absx;
+        i = c ? 0 : i;
+        y = c ? t : y;
+
+        c = absx < 0x1.76944p-1;  // below ~0.73: i = 1, y = absx - (tc - 1)
+        t = absx - (tc - 1.0);
+        i = c ? 1 : i;
+        y = c ? t : y;
+
+        c = absx < 0x1.da661p-3;  // below ~0.23: i = 2, y = absx
+        i = c ? 2 : i;
+        y = c ? absx : y;
+
+        double p, q;
+
+        switch (i) {  // add the selected approximation to r
+            case 0:  // polynomial in y with coefficients a0..a11 (nested fma evaluation)
+                p = fma(y, fma(y, fma(y, fma(y, a11, a10), a9), a8), a7);
+                p = fma(y, fma(y, fma(y, fma(y, p, a6), a5), a4), a3);
+                p = fma(y, fma(y, fma(y, p, a2), a1), a0);
+                r = fma(y, p - 0.5, r);  // r += y*p - 0.5*y
+                break;
+            case 1:  // polynomial in y with coefficients t0..t14, scaled by y*y
+                p = fma(y, fma(y, fma(y, fma(y, t14, t13), t12), t11), t10);
+                p = fma(y, fma(y, fma(y, fma(y, fma(y, p, t9), t8), t7), t6), t5);
+                p = fma(y, fma(y, fma(y, fma(y, fma(y, p, t4), t3), t2), t1), t0);
+                p = fma(y*y, p, -tt);
+                r += (tf + p);
+                break;
+            case 2:  // rational approximation p/q with coefficients u0..u5 and v1..v5
+                p = y * fma(y, fma(y, fma(y, fma(y, fma(y, u5, u4), u3), u2), u1), u0);
+                q = fma(y, fma(y, fma(y, fma(y, fma(y, v5, v4), v3), v2), v1), 1.0);
+                r += fma(-0.5, y, p / q);  // r += p/q - 0.5*y
+        }
+    } else if (absx < 8.0) {  // 2 <= absx < 8
+        int i = absx;  // integer part of absx, so 2..7 in this branch
+        double y = absx - (double) i;  // fractional part of absx
+        double p = y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, s6, s5), s4), s3), s2), s1), s0);
+        double q = fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, r6, r5), r4), r3), r2), r1), 1.0);
+        r = fma(0.5, y, p / q);  // 0.5*y + p/q, evaluated at the fractional part
+        double z = 1.0;
+        // lgamma(1+s) = log(s) + lgamma(s): collect the factors (y+2)..(y+i-1) in z, then add log(z)
+        double y6 = y + 6.0;
+        double y5 = y + 5.0;
+        double y4 = y + 4.0;
+        double y3 = y + 3.0;
+        double y2 = y + 2.0;
+        z *= i > 6 ? y6 : 1.0;
+        z *= i > 5 ? y5 : 1.0;
+        z *= i > 4 ? y4 : 1.0;
+        z *= i > 3 ? y3 : 1.0;
+        z *= i > 2 ? y2 : 1.0;
+        r += log(z);
+    } else {  // absx >= 8: series in z = 1/absx with coefficients w0..w6
+        double z = 1.0 / absx;
+        double z2 = z * z;
+        double w = fma(z, fma(z2, fma(z2, fma(z2, fma(z2, fma(z2, w6, w5), w4), w3), w2), w1), w0);
+        r = (absx - 0.5) * (log(absx) - 1.0) + w;
+    }
+
+    if (x < 0.0) {  // reflection: lgamma(x) = log(pi / |x * sin(pi*x)|) - lgamma(-x)
+        double t = sinpi(x);
+        r = log(pi / fabs(t * x)) - r;
+        r = t == 0.0 ? as_double(PINFBITPATT_DP64) : r;  // sinpi(x) == 0: use PINFBITPATT_DP64 (presumably +Inf)
+        *ip = t < 0.0 ? -1 : 1;  // sign follows sinpi(x); +1 when sinpi(x) is zero
+    } else
+        *ip = 1;
+
+    return r;
+}
+// Presumably generates the vector overloads from the scalar version (macro is defined elsewhere).
+_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, lgamma_r, double, private, int)
+#endif
+#endif
+
+#define __CLC_LGAMMA_R_DEF(addrspace) /* lgamma_r overload that writes the sign through an addrspace pointer */ \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, addrspace __CLC_INTN *iptr) { \
+    __CLC_INTN sign_tmp; /* private scratch; its address is passed to another lgamma_r overload */ \
+    const __CLC_GENTYPE log_gamma = lgamma_r(x, &sign_tmp); \
+    *iptr = sign_tmp; /* copy the sign out to the caller's pointer */ \
+    return log_gamma; \
+}
+__CLC_LGAMMA_R_DEF(local);
+__CLC_LGAMMA_R_DEF(global);
+// The helper macro is only needed for the two instantiations above.
+#undef __CLC_LGAMMA_R_DEF
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/log2.cl libclc-0.2.0+git20170213/generic/lib/math/log2.cl
--- libclc-0.2.0+git20150813/generic/lib/math/log2.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/log2.cl	2017-02-12 21:33:49.000000000 +0000
@@ -34,4 +34,6 @@
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log2, float);
 
+#ifdef cl_khr_fp64
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log2, double);
+#endif // cl_khr_fp64
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/logb.cl libclc-0.2.0+git20170213/generic/lib/math/logb.cl
--- libclc-0.2.0+git20150813/generic/lib/math/logb.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/logb.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,31 @@
+#include <clc/clc.h>
+#include "math.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float logb(float x) {
+    const int mag = as_int(x) & EXSIGNBIT_SP32;  // bits of x masked with EXSIGNBIT_SP32
+    const float normal_exp = (mag >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    const float subnormal_exp = -118 - clz(mag);  // exponent derived from the leading-zero count
+    // Branch-free selection; each later test overrides the earlier choice.
+    float result = mag >= PINFBITPATT_SP32 ? as_float(mag) : normal_exp;
+    result = mag < 0x00800000 ? subnormal_exp : result;
+    return mag == 0 ? as_float(NINFBITPATT_SP32) : result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, logb, float);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double logb(double x) {
+    const long mag = as_long(x) & EXSIGNBIT_DP64;  // bits of x masked with EXSIGNBIT_DP64
+    const double normal_exp = (int) (mag >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+    const double subnormal_exp = -1011L - clz(mag);  // exponent derived from the leading-zero count
+    // Branch-free selection; each later test overrides the earlier choice.
+    double result = mag >= PINFBITPATT_DP64 ? as_double(mag) : normal_exp;
+    result = mag < 0x0010000000000000L ? subnormal_exp : result;
+    return mag == 0L ? as_double(NINFBITPATT_DP64) : result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, logb, double)
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/modf.cl libclc-0.2.0+git20170213/generic/lib/math/modf.cl
--- libclc-0.2.0+git20150813/generic/lib/math/modf.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/modf.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <modf.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/modf.inc libclc-0.2.0+git20170213/generic/lib/math/modf.inc
--- libclc-0.2.0+git20150813/generic/lib/math/modf.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/modf.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, __CLC_GENTYPE *iptr) {
+  const __CLC_GENTYPE whole = *iptr = trunc(x);  // integral part goes to *iptr
+  return copysign(isinf(x) ? 0.0f : x - whole, x);  // fraction carries the sign of x; zero when x is infinite
+}
+
+#define MODF_DEF(addrspace) /* modf overload that writes the integral part through an addrspace pointer */ \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \
+    __CLC_GENTYPE whole; /* scratch without an address-space qualifier, matching the overload above */ \
+    const __CLC_GENTYPE frac = modf(x, &whole); \
+    *iptr = whole; /* copy the integral part out to the caller's pointer */ \
+    return frac; \
+}
+
+MODF_DEF(local);
+MODF_DEF(global);
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/tables.cl libclc-0.2.0+git20170213/generic/lib/math/tables.cl
--- libclc-0.2.0+git20150813/generic/lib/math/tables.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/tables.cl	2017-02-12 21:33:49.000000000 +0000
@@ -435,6 +435,315 @@
     230, 139, 2, 0, 0, 0, 0, 0, 0, 0
 };
 
+// Tabulated (sinh(i), cosh(i)) pairs for i = 0,...,36: .x holds sinh(i), .y holds cosh(i).
+DECLARE_TABLE(float2, SINHCOSH_TBL, 37) = {
+    (float2)(0x0.000000p+0f, 0x1.000000p+0f),
+    (float2)(0x1.2cd9fcp+0f, 0x1.8b0756p+0f),
+    (float2)(0x1.d03cf6p+1f, 0x1.e18fa0p+1f),
+    (float2)(0x1.40926ep+3f, 0x1.422a4ap+3f),
+    (float2)(0x1.b4a380p+4f, 0x1.b4ee86p+4f),
+    (float2)(0x1.28d016p+6f, 0x1.28d6fcp+6f),
+    (float2)(0x1.936d22p+7f, 0x1.936e68p+7f),
+    (float2)(0x1.122876p+9f, 0x1.122894p+9f),
+    (float2)(0x1.749ea6p+10f, 0x1.749eaap+10f),
+    (float2)(0x1.fa7158p+11f, 0x1.fa7158p+11f),
+    (float2)(0x1.5829dcp+13f, 0x1.5829dep+13f),
+    (float2)(0x1.d3c448p+14f, 0x1.d3c448p+14f),
+    (float2)(0x1.3de166p+16f, 0x1.3de166p+16f),
+    (float2)(0x1.b00b5ap+17f, 0x1.b00b5ap+17f),
+    (float2)(0x1.259ac4p+19f, 0x1.259ac4p+19f),
+    (float2)(0x1.8f0ccap+20f, 0x1.8f0ccap+20f),
+    (float2)(0x1.0f2ebep+22f, 0x1.0f2ebep+22f),
+    (float2)(0x1.709348p+23f, 0x1.709348p+23f),
+    (float2)(0x1.f4f220p+24f, 0x1.f4f220p+24f),
+    (float2)(0x1.546d90p+26f, 0x1.546d90p+26f),
+    (float2)(0x1.ceb088p+27f, 0x1.ceb088p+27f),
+    (float2)(0x1.3a6e20p+29f, 0x1.3a6e20p+29f),
+    (float2)(0x1.ab5adcp+30f, 0x1.ab5adcp+30f),
+    (float2)(0x1.226af4p+32f, 0x1.226af4p+32f),
+    (float2)(0x1.8ab7fcp+33f, 0x1.8ab7fcp+33f),
+    (float2)(0x1.0c3d3ap+35f, 0x1.0c3d3ap+35f),
+    (float2)(0x1.6c9326p+36f, 0x1.6c9326p+36f),
+    (float2)(0x1.ef8230p+37f, 0x1.ef8230p+37f),
+    (float2)(0x1.50bba4p+39f, 0x1.50bba4p+39f),
+    (float2)(0x1.c9aae4p+40f, 0x1.c9aae4p+40f),
+    (float2)(0x1.370470p+42f, 0x1.370470p+42f),
+    (float2)(0x1.a6b766p+43f, 0x1.a6b766p+43f),
+    (float2)(0x1.1f43fcp+45f, 0x1.1f43fcp+45f),
+    (float2)(0x1.866f34p+46f, 0x1.866f34p+46f),
+    (float2)(0x1.0953e2p+48f, 0x1.0953e2p+48f),
+    (float2)(0x1.689e22p+49f, 0x1.689e22p+49f),
+    (float2)(0x1.ea215ap+50f, 0x1.ea215ap+50f)
+};
+
+DECLARE_TABLE(float2, CBRT_TBL, 129) = {
+    (float2)(0x1.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.008000p+0f, 0x1.51cb0ap-11f),
+    (float2)(0x1.014000p+0f, 0x1.39221ep-12f),
+    (float2)(0x1.01c000p+0f, 0x1.e06908p-11f),
+    (float2)(0x1.028000p+0f, 0x1.1d6978p-11f),
+    (float2)(0x1.034000p+0f, 0x1.4ea1bep-13f),
+    (float2)(0x1.03c000p+0f, 0x1.833b8ep-11f),
+    (float2)(0x1.048000p+0f, 0x1.587002p-12f),
+    (float2)(0x1.050000p+0f, 0x1.ceb290p-11f),
+    (float2)(0x1.05c000p+0f, 0x1.d57f34p-12f),
+    (float2)(0x1.068000p+0f, 0x1.cc53acp-21f),
+    (float2)(0x1.070000p+0f, 0x1.0fe098p-11f),
+    (float2)(0x1.07c000p+0f, 0x1.91b586p-15f),
+    (float2)(0x1.084000p+0f, 0x1.1c362ep-11f),
+    (float2)(0x1.090000p+0f, 0x1.94398ep-15f),
+    (float2)(0x1.098000p+0f, 0x1.1055bcp-11f),
+    (float2)(0x1.0a4000p+0f, 0x1.7e63cap-19f),
+    (float2)(0x1.0ac000p+0f, 0x1.d99e1ap-12f),
+    (float2)(0x1.0b4000p+0f, 0x1.d258dep-11f),
+    (float2)(0x1.0c0000p+0f, 0x1.645962p-12f),
+    (float2)(0x1.0c8000p+0f, 0x1.8c5b0ep-11f),
+    (float2)(0x1.0d4000p+0f, 0x1.83d0c8p-13f),
+    (float2)(0x1.0dc000p+0f, 0x1.300812p-11f),
+    (float2)(0x1.0e4000p+0f, 0x1.f9a65ap-11f),
+    (float2)(0x1.0f0000p+0f, 0x1.7bbcd8p-12f),
+    (float2)(0x1.0f8000p+0f, 0x1.7cbf68p-11f),
+    (float2)(0x1.104000p+0f, 0x1.b2c166p-14f),
+    (float2)(0x1.10c000p+0f, 0x1.d56ea4p-12f),
+    (float2)(0x1.114000p+0f, 0x1.99eb32p-11f),
+    (float2)(0x1.120000p+0f, 0x1.1007a2p-13f),
+    (float2)(0x1.128000p+0f, 0x1.d212aap-12f),
+    (float2)(0x1.130000p+0f, 0x1.890f18p-11f),
+    (float2)(0x1.13c000p+0f, 0x1.2104e2p-14f),
+    (float2)(0x1.144000p+0f, 0x1.74961ep-12f),
+    (float2)(0x1.14c000p+0f, 0x1.4b9b66p-11f),
+    (float2)(0x1.154000p+0f, 0x1.d81e66p-11f),
+    (float2)(0x1.160000p+0f, 0x1.7f825cp-13f),
+    (float2)(0x1.168000p+0f, 0x1.c5dca2p-12f),
+    (float2)(0x1.170000p+0f, 0x1.6153bap-11f),
+    (float2)(0x1.178000p+0f, 0x1.db1cc2p-11f),
+    (float2)(0x1.184000p+0f, 0x1.4154b0p-13f),
+    (float2)(0x1.18c000p+0f, 0x1.821114p-12f),
+    (float2)(0x1.194000p+0f, 0x1.2d4240p-11f),
+    (float2)(0x1.19c000p+0f, 0x1.950d82p-11f),
+    (float2)(0x1.1a4000p+0f, 0x1.f8755cp-11f),
+    (float2)(0x1.1b0000p+0f, 0x1.5e12a4p-13f),
+    (float2)(0x1.1b8000p+0f, 0x1.648c38p-12f),
+    (float2)(0x1.1c0000p+0f, 0x1.08c43ep-11f),
+    (float2)(0x1.1c8000p+0f, 0x1.5b0970p-11f),
+    (float2)(0x1.1d0000p+0f, 0x1.a91fe8p-11f),
+    (float2)(0x1.1d8000p+0f, 0x1.f311b6p-11f),
+    (float2)(0x1.1e4000p+0f, 0x1.c74618p-14f),
+    (float2)(0x1.1ec000p+0f, 0x1.eabb54p-13f),
+    (float2)(0x1.1f4000p+0f, 0x1.70db14p-12f),
+    (float2)(0x1.1fc000p+0f, 0x1.e45cbcp-12f),
+    (float2)(0x1.204000p+0f, 0x1.27faa6p-11f),
+    (float2)(0x1.20c000p+0f, 0x1.59db98p-11f),
+    (float2)(0x1.214000p+0f, 0x1.87da46p-11f),
+    (float2)(0x1.21c000p+0f, 0x1.b1ffa0p-11f),
+    (float2)(0x1.224000p+0f, 0x1.d85478p-11f),
+    (float2)(0x1.22c000p+0f, 0x1.fae17ep-11f),
+    (float2)(0x1.238000p+0f, 0x1.9af40cp-15f),
+    (float2)(0x1.240000p+0f, 0x1.a6319ep-14f),
+    (float2)(0x1.248000p+0f, 0x1.30baa6p-13f),
+    (float2)(0x1.250000p+0f, 0x1.7fc362p-13f),
+    (float2)(0x1.258000p+0f, 0x1.c05362p-13f),
+    (float2)(0x1.260000p+0f, 0x1.f28a98p-13f),
+    (float2)(0x1.268000p+0f, 0x1.0b4442p-12f),
+    (float2)(0x1.270000p+0f, 0x1.16361ap-12f),
+    (float2)(0x1.278000p+0f, 0x1.1a2a2ap-12f),
+    (float2)(0x1.280000p+0f, 0x1.172f8ep-12f),
+    (float2)(0x1.288000p+0f, 0x1.0d5530p-12f),
+    (float2)(0x1.290000p+0f, 0x1.f9538ep-13f),
+    (float2)(0x1.298000p+0f, 0x1.ca77b0p-13f),
+    (float2)(0x1.2a0000p+0f, 0x1.8e336ap-13f),
+    (float2)(0x1.2a8000p+0f, 0x1.44a304p-13f),
+    (float2)(0x1.2b0000p+0f, 0x1.dbc4c8p-14f),
+    (float2)(0x1.2b8000p+0f, 0x1.141a2ap-14f),
+    (float2)(0x1.2c0000p+0f, 0x1.93e44cp-17f),
+    (float2)(0x1.2c4000p+0f, 0x1.e6e432p-11f),
+    (float2)(0x1.2cc000p+0f, 0x1.c447c6p-11f),
+    (float2)(0x1.2d4000p+0f, 0x1.9e80d8p-11f),
+    (float2)(0x1.2dc000p+0f, 0x1.7595dcp-11f),
+    (float2)(0x1.2e4000p+0f, 0x1.498d30p-11f),
+    (float2)(0x1.2ec000p+0f, 0x1.1a6d1ep-11f),
+    (float2)(0x1.2f4000p+0f, 0x1.d077bap-12f),
+    (float2)(0x1.2fc000p+0f, 0x1.65ff1ep-12f),
+    (float2)(0x1.304000p+0f, 0x1.eaf912p-13f),
+    (float2)(0x1.30c000p+0f, 0x1.fbefb8p-14f),
+    (float2)(0x1.314000p+0f, 0x1.44905ap-19f),
+    (float2)(0x1.318000p+0f, 0x1.c017e6p-11f),
+    (float2)(0x1.320000p+0f, 0x1.7bfdbep-11f),
+    (float2)(0x1.328000p+0f, 0x1.34fbc6p-11f),
+    (float2)(0x1.330000p+0f, 0x1.d62f48p-12f),
+    (float2)(0x1.338000p+0f, 0x1.3cadc6p-12f),
+    (float2)(0x1.340000p+0f, 0x1.3afc06p-13f),
+    (float2)(0x1.344000p+0f, 0x1.fc556ep-11f),
+    (float2)(0x1.34c000p+0f, 0x1.a71f84p-11f),
+    (float2)(0x1.354000p+0f, 0x1.4f2290p-11f),
+    (float2)(0x1.35c000p+0f, 0x1.e8c79cp-12f),
+    (float2)(0x1.364000p+0f, 0x1.2dd0d8p-12f),
+    (float2)(0x1.36c000p+0f, 0x1.b5ac2ep-14f),
+    (float2)(0x1.370000p+0f, 0x1.d3d02ap-11f),
+    (float2)(0x1.378000p+0f, 0x1.6e3d58p-11f),
+    (float2)(0x1.380000p+0f, 0x1.060200p-11f),
+    (float2)(0x1.388000p+0f, 0x1.364608p-12f),
+    (float2)(0x1.390000p+0f, 0x1.6d29b6p-14f),
+    (float2)(0x1.394000p+0f, 0x1.bd8d5ep-11f),
+    (float2)(0x1.39c000p+0f, 0x1.4ae030p-11f),
+    (float2)(0x1.3a4000p+0f, 0x1.ab44b2p-12f),
+    (float2)(0x1.3ac000p+0f, 0x1.7761cep-13f),
+    (float2)(0x1.3b0000p+0f, 0x1.e38710p-11f),
+    (float2)(0x1.3b8000p+0f, 0x1.66b2b0p-11f),
+    (float2)(0x1.3c0000p+0f, 0x1.cebf96p-12f),
+    (float2)(0x1.3c8000p+0f, 0x1.964b20p-13f),
+    (float2)(0x1.3cc000p+0f, 0x1.e15004p-11f),
+    (float2)(0x1.3d4000p+0f, 0x1.5a9bcep-11f),
+    (float2)(0x1.3dc000p+0f, 0x1.a2f4d8p-12f),
+    (float2)(0x1.3e4000p+0f, 0x1.17c056p-13f),
+    (float2)(0x1.3e8000p+0f, 0x1.b800f8p-11f),
+    (float2)(0x1.3f0000p+0f, 0x1.27b132p-11f),
+    (float2)(0x1.3f8000p+0f, 0x1.2a09b8p-12f),
+    (float2)(0x1.400000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.404000p+0f, 0x1.68a69cp-11f),
+    (float2)(0x1.40c000p+0f, 0x1.9df950p-12f),
+    (float2)(0x1.414000p+0f, 0x1.983050p-14f),
+    (float2)(0x1.418000p+0f, 0x1.94c6a4p-11f),
+    (float2)(0x1.420000p+0f, 0x1.e88494p-12f),
+    (float2)(0x1.428000p+0f, 0x1.45f31ap-13f)
+};
+
+DECLARE_TABLE(float, EXP_TBL, 65) = {
+    0x1.000000p+0f,
+    0x1.02c9a4p+0f,
+    0x1.059b0ep+0f,
+    0x1.087452p+0f,
+    0x1.0b5586p+0f,
+    0x1.0e3ec4p+0f,
+    0x1.11301ep+0f,
+    0x1.1429aap+0f,
+    0x1.172b84p+0f,
+    0x1.1a35bep+0f,
+    0x1.1d4874p+0f,
+    0x1.2063b8p+0f,
+    0x1.2387a6p+0f,
+    0x1.26b456p+0f,
+    0x1.29e9e0p+0f,
+    0x1.2d285ap+0f,
+    0x1.306fe0p+0f,
+    0x1.33c08cp+0f,
+    0x1.371a74p+0f,
+    0x1.3a7db4p+0f,
+    0x1.3dea64p+0f,
+    0x1.4160a2p+0f,
+    0x1.44e086p+0f,
+    0x1.486a2cp+0f,
+    0x1.4bfdaep+0f,
+    0x1.4f9b28p+0f,
+    0x1.5342b6p+0f,
+    0x1.56f474p+0f,
+    0x1.5ab07ep+0f,
+    0x1.5e76f2p+0f,
+    0x1.6247ecp+0f,
+    0x1.662388p+0f,
+    0x1.6a09e6p+0f,
+    0x1.6dfb24p+0f,
+    0x1.71f75ep+0f,
+    0x1.75feb6p+0f,
+    0x1.7a1148p+0f,
+    0x1.7e2f34p+0f,
+    0x1.82589ap+0f,
+    0x1.868d9ap+0f,
+    0x1.8ace54p+0f,
+    0x1.8f1aeap+0f,
+    0x1.93737cp+0f,
+    0x1.97d82ap+0f,
+    0x1.9c4918p+0f,
+    0x1.a0c668p+0f,
+    0x1.a5503cp+0f,
+    0x1.a9e6b6p+0f,
+    0x1.ae89fap+0f,
+    0x1.b33a2cp+0f,
+    0x1.b7f770p+0f,
+    0x1.bcc1eap+0f,
+    0x1.c199bep+0f,
+    0x1.c67f12p+0f,
+    0x1.cb720ep+0f,
+    0x1.d072d4p+0f,
+    0x1.d5818ep+0f,
+    0x1.da9e60p+0f,
+    0x1.dfc974p+0f,
+    0x1.e502eep+0f,
+    0x1.ea4afap+0f,
+    0x1.efa1bep+0f,
+    0x1.f50766p+0f,
+    0x1.fa7c18p+0f,
+    0x1.000000p+1f,
+};
+
+DECLARE_TABLE(float2, EXP_TBL_EP, 65) = {
+    (float2) (0x1.000000p+0f, 0x0.000000p+0f),
+    (float2) (0x1.02c000p+0f, 0x1.347ceep-13f),
+    (float2) (0x1.058000p+0f, 0x1.b0d314p-12f),
+    (float2) (0x1.084000p+0f, 0x1.a28c3ap-11f),
+    (float2) (0x1.0b4000p+0f, 0x1.586cf8p-12f),
+    (float2) (0x1.0e0000p+0f, 0x1.f61968p-11f),
+    (float2) (0x1.110000p+0f, 0x1.80e808p-11f),
+    (float2) (0x1.140000p+0f, 0x1.4d5754p-11f),
+    (float2) (0x1.170000p+0f, 0x1.5c1e3ep-11f),
+    (float2) (0x1.1a0000p+0f, 0x1.adf5b6p-11f),
+    (float2) (0x1.1d4000p+0f, 0x1.0e62d0p-13f),
+    (float2) (0x1.204000p+0f, 0x1.1dc430p-11f),
+    (float2) (0x1.238000p+0f, 0x1.e9b9d4p-14f),
+    (float2) (0x1.268000p+0f, 0x1.a2b2f0p-11f),
+    (float2) (0x1.29c000p+0f, 0x1.4efa8ep-11f),
+    (float2) (0x1.2d0000p+0f, 0x1.42d372p-11f),
+    (float2) (0x1.304000p+0f, 0x1.7f0518p-11f),
+    (float2) (0x1.33c000p+0f, 0x1.164c82p-17f),
+    (float2) (0x1.370000p+0f, 0x1.a7373ap-12f),
+    (float2) (0x1.3a4000p+0f, 0x1.ed9a72p-11f),
+    (float2) (0x1.3dc000p+0f, 0x1.532608p-11f),
+    (float2) (0x1.414000p+0f, 0x1.0510fap-11f),
+    (float2) (0x1.44c000p+0f, 0x1.043030p-11f),
+    (float2) (0x1.484000p+0f, 0x1.515ae0p-11f),
+    (float2) (0x1.4bc000p+0f, 0x1.ed6a9ap-11f),
+    (float2) (0x1.4f8000p+0f, 0x1.b2769cp-12f),
+    (float2) (0x1.534000p+0f, 0x1.5ab4eap-15f),
+    (float2) (0x1.56c000p+0f, 0x1.a39b5ap-11f),
+    (float2) (0x1.5a8000p+0f, 0x1.83eea4p-11f),
+    (float2) (0x1.5e4000p+0f, 0x1.b78ad6p-11f),
+    (float2) (0x1.624000p+0f, 0x1.fac0e8p-14f),
+    (float2) (0x1.660000p+0f, 0x1.1c412ap-11f),
+    (float2) (0x1.6a0000p+0f, 0x1.3cccfep-13f),
+    (float2) (0x1.6dc000p+0f, 0x1.d91e32p-11f),
+    (float2) (0x1.71c000p+0f, 0x1.baf476p-11f),
+    (float2) (0x1.75c000p+0f, 0x1.f5ab20p-11f),
+    (float2) (0x1.7a0000p+0f, 0x1.1473eap-12f),
+    (float2) (0x1.7e0000p+0f, 0x1.799b66p-11f),
+    (float2) (0x1.824000p+0f, 0x1.89994cp-12f),
+    (float2) (0x1.868000p+0f, 0x1.b33688p-13f),
+    (float2) (0x1.8ac000p+0f, 0x1.ca8454p-13f),
+    (float2) (0x1.8f0000p+0f, 0x1.ae9914p-12f),
+    (float2) (0x1.934000p+0f, 0x1.9bd866p-11f),
+    (float2) (0x1.97c000p+0f, 0x1.829fdep-12f),
+    (float2) (0x1.9c4000p+0f, 0x1.230546p-13f),
+    (float2) (0x1.a0c000p+0f, 0x1.99ed76p-14f),
+    (float2) (0x1.a54000p+0f, 0x1.03b23ep-12f),
+    (float2) (0x1.a9c000p+0f, 0x1.35aabcp-11f),
+    (float2) (0x1.ae8000p+0f, 0x1.3f32b4p-13f),
+    (float2) (0x1.b30000p+0f, 0x1.d15c26p-11f),
+    (float2) (0x1.b7c000p+0f, 0x1.bb797cp-11f),
+    (float2) (0x1.bcc000p+0f, 0x1.e904bcp-16f),
+    (float2) (0x1.c18000p+0f, 0x1.9bdd84p-12f),
+    (float2) (0x1.c64000p+0f, 0x1.f8972ap-11f),
+    (float2) (0x1.cb4000p+0f, 0x1.906e76p-11f),
+    (float2) (0x1.d04000p+0f, 0x1.96a502p-11f),
+    (float2) (0x1.d58000p+0f, 0x1.8dcfbap-16f),
+    (float2) (0x1.da8000p+0f, 0x1.e603dap-12f),
+    (float2) (0x1.dfc000p+0f, 0x1.2e66f6p-13f),
+    (float2) (0x1.e50000p+0f, 0x1.773c58p-15f),
+    (float2) (0x1.ea4000p+0f, 0x1.5f4548p-13f),
+    (float2) (0x1.ef8000p+0f, 0x1.0df730p-11f),
+    (float2) (0x1.f50000p+0f, 0x1.d96db8p-14f),
+    (float2) (0x1.fa4000p+0f, 0x1.e0c0cep-11f),
+    (float2) (0x1.000000p+1f, 0x0.000000p+0f),
+};
+
 TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl);
 TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl);
 TABLE_FUNCTION(float2, LOG2_TBL, log2_tbl);
@@ -443,6 +752,11 @@
     return *(__constant uint4 *)(PIBITS_TBL + idx);
 }
 
+TABLE_FUNCTION(float2, SINHCOSH_TBL, sinhcosh_tbl);
+TABLE_FUNCTION(float2, CBRT_TBL, cbrt_tbl);
+TABLE_FUNCTION(float, EXP_TBL, exp_tbl);
+TABLE_FUNCTION(float2, EXP_TBL_EP, exp_tbl_ep);
+
 #ifdef cl_khr_fp64
 
 DECLARE_TABLE(double2, LN_TBL, 65) = {
@@ -835,7 +1149,621 @@
 };
 
 
+DECLARE_TABLE(double2, SINH_TBL, 37) = { // sinh(i) for integer i = 0..36, one (head, tail) pair per entry; head + tail gives the value.
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0), // i = 0: sinh(0) = 0
+    (double2)(0x1.2cd9fc0000000p+0, 0x1.13ae6096a0092p-26), // i = 1: head ~= 1.1752011776, tail ~= 1.6e-8, sum ~= sinh(1)
+    (double2)(0x1.d03cf60000000p+1, 0x1.db70cfb79a640p-26), // i = 2: ~= 3.62686; heads keep only leading mantissa bits (low hex digits are 0)
+    (double2)(0x1.40926e0000000p+3, 0x1.c2526b66dc067p-23),
+    (double2)(0x1.b4a3800000000p+4, 0x1.b81b18647f380p-23),
+    (double2)(0x1.28d0160000000p+6, 0x1.bc1cdd1e1eb08p-20),
+    (double2)(0x1.936d228000000p+7, 0x1.d9f201534fb09p-19),
+    (double2)(0x1.1228768000000p+9, 0x1.d1c064a4e9954p-18),
+    (double2)(0x1.749ea50000000p+10, 0x1.4eca65d06ea74p-18),
+    (double2)(0x1.fa71570000000p+11, 0x1.0c259bcc0ecc5p-15),
+    (double2)(0x1.5829dc8000000p+13, 0x1.b5a6647cf9016p-13),
+    (double2)(0x1.d3c4488000000p+14, 0x1.9691adefb0870p-15),
+    (double2)(0x1.3de1650000000p+16, 0x1.3410fc29cde38p-10),
+    (double2)(0x1.b00b590000000p+17, 0x1.6a31a50b6fb3cp-11),
+    (double2)(0x1.259ac48000000p+19, 0x1.7defc71805c40p-10),
+    (double2)(0x1.8f0cca8000000p+20, 0x1.eb49fd80e0babp-6),
+    (double2)(0x1.0f2ebd0000000p+22, 0x1.4fffc7bcd5920p-7),
+    (double2)(0x1.7093488000000p+23, 0x1.03a93b6c63435p-3),
+    (double2)(0x1.f4f2208000000p+24, 0x1.1940bb255fd1cp-4),
+    (double2)(0x1.546d8f8000000p+26, 0x1.ed26e14260b50p-2),
+    (double2)(0x1.ceb0888000000p+27, 0x1.b47401fc9f2a2p+0),
+    (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55634f1p+3),
+    (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff8194ddcp+2),
+    (double2)(0x1.226af30000000p+32, 0x1.d8fee052ba63ap+5),
+    (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde3f6p+7),
+    (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557d1ap+8),
+    (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9dc4p+8),
+    (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abap+11),
+    (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13),
+    (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6363p+14),
+    (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15), // i = 30: from here on the entries are bit-identical to COSH_TBL
+    (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17),
+    (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19),
+    (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19),
+    (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22),
+    (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21),
+    (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22), // i = 36 (last of the 37 entries)
+};
+
+DECLARE_TABLE(double2, COSH_TBL, 37) = { // cosh(i) for integer i = 0..36, one (head, tail) pair per entry; head + tail gives the value.
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), // i = 0: cosh(0) = 1
+    (double2)(0x1.8b07550000000p+0, 0x1.d9f5504c2bd28p-28), // i = 1: ~= 1.54308 = cosh(1)
+    (double2)(0x1.e18fa08000000p+1, 0x1.7cb66f0a4c9fdp-25), // i = 2: ~= 3.76220; heads keep only leading mantissa bits (low hex digits are 0)
+    (double2)(0x1.422a490000000p+3, 0x1.f58617928e588p-23),
+    (double2)(0x1.b4ee858000000p+4, 0x1.bc7d000c38d48p-25),
+    (double2)(0x1.28d6fc8000000p+6, 0x1.f7f9d4e329998p-21),
+    (double2)(0x1.936e678000000p+7, 0x1.6e6e464885269p-19),
+    (double2)(0x1.1228948000000p+9, 0x1.ba3a8b946c154p-19),
+    (double2)(0x1.749eaa8000000p+10, 0x1.3f4e76110d5a4p-18),
+    (double2)(0x1.fa71580000000p+11, 0x1.17622515a3e2bp-15),
+    (double2)(0x1.5829dd0000000p+13, 0x1.4dc4b528af3d0p-17),
+    (double2)(0x1.d3c4488000000p+14, 0x1.1156278615e10p-14),
+    (double2)(0x1.3de1650000000p+16, 0x1.35ad50ed821f5p-10),
+    (double2)(0x1.b00b590000000p+17, 0x1.6b61055f2935cp-11),
+    (double2)(0x1.259ac48000000p+19, 0x1.7e2794a601240p-10),
+    (double2)(0x1.8f0cca8000000p+20, 0x1.eb4b45f6aadd3p-6),
+    (double2)(0x1.0f2ebd0000000p+22, 0x1.5000b967b3698p-7),
+    (double2)(0x1.7093488000000p+23, 0x1.03a940fadc092p-3),
+    (double2)(0x1.f4f2208000000p+24, 0x1.1940bf3bf874cp-4),
+    (double2)(0x1.546d8f8000000p+26, 0x1.ed26e1a2a2110p-2),
+    (double2)(0x1.ceb0888000000p+27, 0x1.b4740205796d6p+0),
+    (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55cb85dp+3),
+    (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff81e18acp+2),
+    (double2)(0x1.226af30000000p+32, 0x1.d8fee052bdea4p+5),
+    (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde926p+7),
+    (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557e0ep+8),
+    (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9e1cp+8),
+    (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abep+11),
+    (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13),
+    (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6364p+14),
+    (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15), // i = 30: from here on the entries are bit-identical to SINH_TBL
+    (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17),
+    (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19),
+    (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19),
+    (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22),
+    (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21),
+    (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22) // i = 36 (last of the 37 entries)
+};
+
+DECLARE_TABLE(double, CBRT_INV_TBL, 257) = { // Reciprocal table: entry i = 512/(256 + i) = 2/(1 + i/256) for i = 0..256, falling from 2.0 to 1.0.
+    0x1.0000000000000p+1, // i = 0: 2.0
+    0x1.fe01fe01fe020p+0, // i = 1: 512/257
+    0x1.fc07f01fc07f0p+0,
+    0x1.fa11caa01fa12p+0,
+    0x1.f81f81f81f820p+0,
+    0x1.f6310aca0dbb5p+0,
+    0x1.f44659e4a4271p+0,
+    0x1.f25f644230ab5p+0,
+    0x1.f07c1f07c1f08p+0,
+    0x1.ee9c7f8458e02p+0,
+    0x1.ecc07b301ecc0p+0,
+    0x1.eae807aba01ebp+0,
+    0x1.e9131abf0b767p+0,
+    0x1.e741aa59750e4p+0,
+    0x1.e573ac901e574p+0,
+    0x1.e3a9179dc1a73p+0,
+    0x1.e1e1e1e1e1e1ep+0,
+    0x1.e01e01e01e01ep+0,
+    0x1.de5d6e3f8868ap+0,
+    0x1.dca01dca01dcap+0,
+    0x1.dae6076b981dbp+0,
+    0x1.d92f2231e7f8ap+0,
+    0x1.d77b654b82c34p+0,
+    0x1.d5cac807572b2p+0,
+    0x1.d41d41d41d41dp+0,
+    0x1.d272ca3fc5b1ap+0,
+    0x1.d0cb58f6ec074p+0,
+    0x1.cf26e5c44bfc6p+0,
+    0x1.cd85689039b0bp+0,
+    0x1.cbe6d9601cbe7p+0,
+    0x1.ca4b3055ee191p+0,
+    0x1.c8b265afb8a42p+0,
+    0x1.c71c71c71c71cp+0,
+    0x1.c5894d10d4986p+0,
+    0x1.c3f8f01c3f8f0p+0,
+    0x1.c26b5392ea01cp+0,
+    0x1.c0e070381c0e0p+0,
+    0x1.bf583ee868d8bp+0,
+    0x1.bdd2b899406f7p+0,
+    0x1.bc4fd65883e7bp+0,
+    0x1.bacf914c1bad0p+0,
+    0x1.b951e2b18ff23p+0,
+    0x1.b7d6c3dda338bp+0,
+    0x1.b65e2e3beee05p+0,
+    0x1.b4e81b4e81b4fp+0,
+    0x1.b37484ad806cep+0,
+    0x1.b2036406c80d9p+0,
+    0x1.b094b31d922a4p+0,
+    0x1.af286bca1af28p+0,
+    0x1.adbe87f94905ep+0,
+    0x1.ac5701ac5701bp+0,
+    0x1.aaf1d2f87ebfdp+0,
+    0x1.a98ef606a63bep+0,
+    0x1.a82e65130e159p+0,
+    0x1.a6d01a6d01a6dp+0,
+    0x1.a574107688a4ap+0,
+    0x1.a41a41a41a41ap+0,
+    0x1.a2c2a87c51ca0p+0,
+    0x1.a16d3f97a4b02p+0,
+    0x1.a01a01a01a01ap+0,
+    0x1.9ec8e951033d9p+0,
+    0x1.9d79f176b682dp+0,
+    0x1.9c2d14ee4a102p+0,
+    0x1.9ae24ea5510dap+0,
+    0x1.999999999999ap+0, // i = 64: 512/320 = 1.6
+    0x1.9852f0d8ec0ffp+0,
+    0x1.970e4f80cb872p+0,
+    0x1.95cbb0be377aep+0,
+    0x1.948b0fcd6e9e0p+0,
+    0x1.934c67f9b2ce6p+0,
+    0x1.920fb49d0e229p+0,
+    0x1.90d4f120190d5p+0,
+    0x1.8f9c18f9c18fap+0,
+    0x1.8e6527af1373fp+0,
+    0x1.8d3018d3018d3p+0,
+    0x1.8bfce8062ff3ap+0,
+    0x1.8acb90f6bf3aap+0,
+    0x1.899c0f601899cp+0,
+    0x1.886e5f0abb04ap+0,
+    0x1.87427bcc092b9p+0,
+    0x1.8618618618618p+0,
+    0x1.84f00c2780614p+0,
+    0x1.83c977ab2beddp+0,
+    0x1.82a4a0182a4a0p+0,
+    0x1.8181818181818p+0,
+    0x1.8060180601806p+0,
+    0x1.7f405fd017f40p+0,
+    0x1.7e225515a4f1dp+0,
+    0x1.7d05f417d05f4p+0,
+    0x1.7beb3922e017cp+0,
+    0x1.7ad2208e0ecc3p+0,
+    0x1.79baa6bb6398bp+0,
+    0x1.78a4c8178a4c8p+0,
+    0x1.77908119ac60dp+0,
+    0x1.767dce434a9b1p+0,
+    0x1.756cac201756dp+0,
+    0x1.745d1745d1746p+0,
+    0x1.734f0c541fe8dp+0,
+    0x1.724287f46debcp+0,
+    0x1.713786d9c7c09p+0,
+    0x1.702e05c0b8170p+0,
+    0x1.6f26016f26017p+0,
+    0x1.6e1f76b4337c7p+0,
+    0x1.6d1a62681c861p+0,
+    0x1.6c16c16c16c17p+0,
+    0x1.6b1490aa31a3dp+0,
+    0x1.6a13cd1537290p+0,
+    0x1.691473a88d0c0p+0,
+    0x1.6816816816817p+0,
+    0x1.6719f3601671ap+0,
+    0x1.661ec6a5122f9p+0,
+    0x1.6524f853b4aa3p+0,
+    0x1.642c8590b2164p+0,
+    0x1.63356b88ac0dep+0,
+    0x1.623fa77016240p+0,
+    0x1.614b36831ae94p+0,
+    0x1.6058160581606p+0,
+    0x1.5f66434292dfcp+0,
+    0x1.5e75bb8d015e7p+0,
+    0x1.5d867c3ece2a5p+0,
+    0x1.5c9882b931057p+0,
+    0x1.5babcc647fa91p+0,
+    0x1.5ac056b015ac0p+0,
+    0x1.59d61f123ccaap+0,
+    0x1.58ed2308158edp+0,
+    0x1.5805601580560p+0,
+    0x1.571ed3c506b3ap+0,
+    0x1.56397ba7c52e2p+0,
+    0x1.5555555555555p+0, // i = 128: 512/384 = 4/3
+    0x1.54725e6bb82fep+0,
+    0x1.5390948f40febp+0,
+    0x1.52aff56a8054bp+0,
+    0x1.51d07eae2f815p+0,
+    0x1.50f22e111c4c5p+0,
+    0x1.5015015015015p+0,
+    0x1.4f38f62dd4c9bp+0,
+    0x1.4e5e0a72f0539p+0,
+    0x1.4d843bedc2c4cp+0,
+    0x1.4cab88725af6ep+0,
+    0x1.4bd3edda68fe1p+0,
+    0x1.4afd6a052bf5bp+0,
+    0x1.4a27fad76014ap+0,
+    0x1.49539e3b2d067p+0,
+    0x1.4880522014880p+0,
+    0x1.47ae147ae147bp+0,
+    0x1.46dce34596066p+0,
+    0x1.460cbc7f5cf9ap+0,
+    0x1.453d9e2c776cap+0,
+    0x1.446f86562d9fbp+0,
+    0x1.43a2730abee4dp+0,
+    0x1.42d6625d51f87p+0,
+    0x1.420b5265e5951p+0,
+    0x1.4141414141414p+0,
+    0x1.40782d10e6566p+0,
+    0x1.3fb013fb013fbp+0,
+    0x1.3ee8f42a5af07p+0,
+    0x1.3e22cbce4a902p+0,
+    0x1.3d5d991aa75c6p+0,
+    0x1.3c995a47babe7p+0,
+    0x1.3bd60d9232955p+0,
+    0x1.3b13b13b13b14p+0,
+    0x1.3a524387ac822p+0,
+    0x1.3991c2c187f63p+0,
+    0x1.38d22d366088ep+0,
+    0x1.3813813813814p+0,
+    0x1.3755bd1c945eep+0,
+    0x1.3698df3de0748p+0,
+    0x1.35dce5f9f2af8p+0,
+    0x1.3521cfb2b78c1p+0,
+    0x1.34679ace01346p+0,
+    0x1.33ae45b57bcb2p+0,
+    0x1.32f5ced6a1dfap+0,
+    0x1.323e34a2b10bfp+0,
+    0x1.3187758e9ebb6p+0,
+    0x1.30d190130d190p+0,
+    0x1.301c82ac40260p+0,
+    0x1.2f684bda12f68p+0,
+    0x1.2eb4ea1fed14bp+0,
+    0x1.2e025c04b8097p+0,
+    0x1.2d50a012d50a0p+0,
+    0x1.2c9fb4d812ca0p+0,
+    0x1.2bef98e5a3711p+0,
+    0x1.2b404ad012b40p+0,
+    0x1.2a91c92f3c105p+0,
+    0x1.29e4129e4129ep+0,
+    0x1.293725bb804a5p+0,
+    0x1.288b01288b013p+0,
+    0x1.27dfa38a1ce4dp+0,
+    0x1.27350b8812735p+0,
+    0x1.268b37cd60127p+0,
+    0x1.25e22708092f1p+0,
+    0x1.2539d7e9177b2p+0,
+    0x1.2492492492492p+0, // i = 192: 512/448 = 8/7
+    0x1.23eb79717605bp+0,
+    0x1.23456789abcdfp+0,
+    0x1.22a0122a0122ap+0,
+    0x1.21fb78121fb78p+0,
+    0x1.21579804855e6p+0,
+    0x1.20b470c67c0d9p+0,
+    0x1.2012012012012p+0,
+    0x1.1f7047dc11f70p+0,
+    0x1.1ecf43c7fb84cp+0,
+    0x1.1e2ef3b3fb874p+0,
+    0x1.1d8f5672e4abdp+0,
+    0x1.1cf06ada2811dp+0,
+    0x1.1c522fc1ce059p+0,
+    0x1.1bb4a4046ed29p+0,
+    0x1.1b17c67f2bae3p+0,
+    0x1.1a7b9611a7b96p+0,
+    0x1.19e0119e0119ep+0,
+    0x1.19453808ca29cp+0,
+    0x1.18ab083902bdbp+0,
+    0x1.1811811811812p+0,
+    0x1.1778a191bd684p+0,
+    0x1.16e0689427379p+0,
+    0x1.1648d50fc3201p+0,
+    0x1.15b1e5f75270dp+0,
+    0x1.151b9a3fdd5c9p+0,
+    0x1.1485f0e0acd3bp+0,
+    0x1.13f0e8d344724p+0,
+    0x1.135c81135c811p+0,
+    0x1.12c8b89edc0acp+0,
+    0x1.12358e75d3033p+0,
+    0x1.11a3019a74826p+0,
+    0x1.1111111111111p+0,
+    0x1.107fbbe011080p+0,
+    0x1.0fef010fef011p+0,
+    0x1.0f5edfab325a2p+0,
+    0x1.0ecf56be69c90p+0,
+    0x1.0e40655826011p+0,
+    0x1.0db20a88f4696p+0,
+    0x1.0d24456359e3ap+0,
+    0x1.0c9714fbcda3bp+0,
+    0x1.0c0a7868b4171p+0,
+    0x1.0b7e6ec259dc8p+0,
+    0x1.0af2f722eecb5p+0,
+    0x1.0a6810a6810a7p+0,
+    0x1.09ddba6af8360p+0,
+    0x1.0953f39010954p+0,
+    0x1.08cabb37565e2p+0,
+    0x1.0842108421084p+0,
+    0x1.07b9f29b8eae2p+0,
+    0x1.073260a47f7c6p+0,
+    0x1.06ab59c7912fbp+0,
+    0x1.0624dd2f1a9fcp+0,
+    0x1.059eea0727586p+0,
+    0x1.05197f7d73404p+0,
+    0x1.04949cc1664c5p+0,
+    0x1.0410410410410p+0,
+    0x1.038c6b78247fcp+0,
+    0x1.03091b51f5e1ap+0,
+    0x1.02864fc7729e9p+0,
+    0x1.0204081020408p+0,
+    0x1.0182436517a37p+0,
+    0x1.0101010101010p+0,
+    0x1.0080402010080p+0,
+    0x1.0000000000000p+0 // i = 256: 1.0
+};
+
+DECLARE_TABLE(double2, CBRT_DBL_TBL, 257) = { // Entry i = cbrt(1 + i/256) for i = 0..256 as a (head, tail) pair; head + tail gives the value.
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), // i = 0: cbrt(1) = 1
+    (double2)(0x1.0055380000000p+0, 0x1.e6a24c81e4294p-25), // i = 1: cbrt(257/256) ~= 1.0013004; heads keep only leading mantissa bits
+    (double2)(0x1.00aa390000000p+0, 0x1.8548511e3a785p-26),
+    (double2)(0x1.00ff010000000p+0, 0x1.4eb9336ec07f6p-25),
+    (double2)(0x1.0153920000000p+0, 0x1.0ea64b8b750e1p-27),
+    (double2)(0x1.01a7eb0000000p+0, 0x1.61637cff8a53cp-27),
+    (double2)(0x1.01fc0d0000000p+0, 0x1.0733bf7bd1943p-27),
+    (double2)(0x1.024ff80000000p+0, 0x1.666911345ccedp-26),
+    (double2)(0x1.02a3ad0000000p+0, 0x1.77b7a3f592f14p-27),
+    (double2)(0x1.02f72b0000000p+0, 0x1.f18d3dd1a5402p-25),
+    (double2)(0x1.034a750000000p+0, 0x1.be2f5a58ee9a4p-29),
+    (double2)(0x1.039d880000000p+0, 0x1.8901f8f085fa7p-25),
+    (double2)(0x1.03f0670000000p+0, 0x1.c68b8cd5b5d69p-26),
+    (double2)(0x1.0443110000000p+0, 0x1.a6b0e8624be42p-26),
+    (double2)(0x1.0495870000000p+0, 0x1.c4b22b06f68e7p-36),
+    (double2)(0x1.04e7c80000000p+0, 0x1.0f3f0afcabe9bp-25),
+    (double2)(0x1.0539d60000000p+0, 0x1.48495bca4e1b7p-26),
+    (double2)(0x1.058bb00000000p+0, 0x1.6107f1abdfdc3p-25),
+    (double2)(0x1.05dd570000000p+0, 0x1.e67261878288ap-25),
+    (double2)(0x1.062ecc0000000p+0, 0x1.a6bc155286f1ep-26),
+    (double2)(0x1.06800e0000000p+0, 0x1.8a759c64a85f2p-26),
+    (double2)(0x1.06d11e0000000p+0, 0x1.5fce70a4a8d09p-27),
+    (double2)(0x1.0721fc0000000p+0, 0x1.2f9cbf373fe1dp-28),
+    (double2)(0x1.0772a80000000p+0, 0x1.90564ce4ac359p-26),
+    (double2)(0x1.07c3230000000p+0, 0x1.ac29ce761b02fp-26),
+    (double2)(0x1.08136d0000000p+0, 0x1.cb752f497381cp-26),
+    (double2)(0x1.0863860000000p+0, 0x1.8bb9e1cfb35e0p-25),
+    (double2)(0x1.08b36f0000000p+0, 0x1.5b4917099de90p-25),
+    (double2)(0x1.0903280000000p+0, 0x1.cc77ac9c65ef2p-26),
+    (double2)(0x1.0952b10000000p+0, 0x1.7a0f3e7be3dbap-26),
+    (double2)(0x1.09a20a0000000p+0, 0x1.6ec851ee0c16fp-25),
+    (double2)(0x1.09f1340000000p+0, 0x1.89449bf2946dap-25),
+    (double2)(0x1.0a402f0000000p+0, 0x1.98f25301ba223p-25),
+    (double2)(0x1.0a8efc0000000p+0, 0x1.47d5ec651f549p-28),
+    (double2)(0x1.0add990000000p+0, 0x1.c33ec9a86007ap-25),
+    (double2)(0x1.0b2c090000000p+0, 0x1.e0b6653e92649p-26),
+    (double2)(0x1.0b7a4b0000000p+0, 0x1.bd64ac09d755fp-28),
+    (double2)(0x1.0bc85f0000000p+0, 0x1.f537506f78167p-29),
+    (double2)(0x1.0c16450000000p+0, 0x1.2c382d1b3735ep-25),
+    (double2)(0x1.0c63fe0000000p+0, 0x1.e20ed659f99e1p-25),
+    (double2)(0x1.0cb18b0000000p+0, 0x1.86b633a9c182ap-26),
+    (double2)(0x1.0cfeeb0000000p+0, 0x1.45cfd5a65e777p-27),
+    (double2)(0x1.0d4c1e0000000p+0, 0x1.0c8770f58bca4p-25),
+    (double2)(0x1.0d99250000000p+0, 0x1.739e44b0933c5p-25),
+    (double2)(0x1.0de6010000000p+0, 0x1.27dc3d9ce7bd8p-31),
+    (double2)(0x1.0e32b00000000p+0, 0x1.3c53c7c5a7b64p-25),
+    (double2)(0x1.0e7f340000000p+0, 0x1.9669683830cecp-25),
+    (double2)(0x1.0ecb8d0000000p+0, 0x1.8d772c39bdcc4p-25),
+    (double2)(0x1.0f17bb0000000p+0, 0x1.9b0008bcf6d7bp-25),
+    (double2)(0x1.0f63bf0000000p+0, 0x1.bbb305825ce4fp-28),
+    (double2)(0x1.0faf970000000p+0, 0x1.da3f4af13a406p-25),
+    (double2)(0x1.0ffb460000000p+0, 0x1.f36b96f74ce86p-26),
+    (double2)(0x1.1046cb0000000p+0, 0x1.65c002303f790p-30),
+    (double2)(0x1.1092250000000p+0, 0x1.82f84095ba7d5p-25),
+    (double2)(0x1.10dd560000000p+0, 0x1.d46433541b2c6p-25),
+    (double2)(0x1.11285e0000000p+0, 0x1.71c3d56e93a89p-25),
+    (double2)(0x1.11733d0000000p+0, 0x1.98dcef4e40012p-26),
+    (double2)(0x1.11bdf30000000p+0, 0x1.530ebef17fe03p-27),
+    (double2)(0x1.1208800000000p+0, 0x1.e8b8fa3715066p-27),
+    (double2)(0x1.1252e40000000p+0, 0x1.ab26eb3b211dcp-25),
+    (double2)(0x1.129d210000000p+0, 0x1.54dd4dc906307p-27),
+    (double2)(0x1.12e7350000000p+0, 0x1.c9f962387984ep-26),
+    (double2)(0x1.1331210000000p+0, 0x1.c62a959afec09p-25),
+    (double2)(0x1.137ae60000000p+0, 0x1.638d9ac6a866ap-25),
+    (double2)(0x1.13c4840000000p+0, 0x1.38704eca8a22dp-28),
+    (double2)(0x1.140dfa0000000p+0, 0x1.e6c9e1db14f8fp-27),
+    (double2)(0x1.1457490000000p+0, 0x1.8744b7f9c9eaap-26),
+    (double2)(0x1.14a0710000000p+0, 0x1.6c2893486373bp-25),
+    (double2)(0x1.14e9730000000p+0, 0x1.b36bce31699b7p-26),
+    (double2)(0x1.15324e0000000p+0, 0x1.71e3813d200c7p-25),
+    (double2)(0x1.157b030000000p+0, 0x1.99755ab40aa88p-25),
+    (double2)(0x1.15c3920000000p+0, 0x1.b45ca0e4bcfc0p-25),
+    (double2)(0x1.160bfc0000000p+0, 0x1.2dd090d869c5dp-28),
+    (double2)(0x1.16543f0000000p+0, 0x1.4fe0516b917dap-25),
+    (double2)(0x1.169c5d0000000p+0, 0x1.94563226317a2p-25),
+    (double2)(0x1.16e4560000000p+0, 0x1.53d8fafc2c851p-25),
+    (double2)(0x1.172c2a0000000p+0, 0x1.dcbd41fbd41a3p-26),
+    (double2)(0x1.1773d90000000p+0, 0x1.862ff5285f59cp-26),
+    (double2)(0x1.17bb630000000p+0, 0x1.3072ea97a1e1cp-25),
+    (double2)(0x1.1802c90000000p+0, 0x1.2839075184805p-26),
+    (double2)(0x1.184a0a0000000p+0, 0x1.4b0323e9eff42p-25),
+    (double2)(0x1.1891270000000p+0, 0x1.b158893c45484p-25),
+    (double2)(0x1.18d8210000000p+0, 0x1.149ef0fc35826p-28),
+    (double2)(0x1.191ef60000000p+0, 0x1.f2e77ea96acaap-26),
+    (double2)(0x1.1965a80000000p+0, 0x1.200074c471a95p-26),
+    (double2)(0x1.19ac360000000p+0, 0x1.3f8cc517f6f04p-25),
+    (double2)(0x1.19f2a10000000p+0, 0x1.60ba2e311bb55p-25),
+    (double2)(0x1.1a38e90000000p+0, 0x1.4b788730bbec3p-25),
+    (double2)(0x1.1a7f0e0000000p+0, 0x1.57090795ee20cp-25),
+    (double2)(0x1.1ac5100000000p+0, 0x1.d9ffe983670b1p-25),
+    (double2)(0x1.1b0af00000000p+0, 0x1.2a463ff61bfdap-25),
+    (double2)(0x1.1b50ad0000000p+0, 0x1.9d1bc6a5e65cfp-25),
+    (double2)(0x1.1b96480000000p+0, 0x1.8718abaa9e922p-25),
+    (double2)(0x1.1bdbc10000000p+0, 0x1.3c2f52ffa342ep-25),
+    (double2)(0x1.1c21180000000p+0, 0x1.0fae13ff42c80p-25),
+    (double2)(0x1.1c664d0000000p+0, 0x1.5440f0ef00d57p-25),
+    (double2)(0x1.1cab610000000p+0, 0x1.6fcd22d4e3c1ep-27),
+    (double2)(0x1.1cf0530000000p+0, 0x1.e0c60b409e863p-27),
+    (double2)(0x1.1d35230000000p+0, 0x1.f9cab5a5f0333p-25),
+    (double2)(0x1.1d79d30000000p+0, 0x1.30f24744c333dp-25),
+    (double2)(0x1.1dbe620000000p+0, 0x1.b50622a76b2fep-27),
+    (double2)(0x1.1e02cf0000000p+0, 0x1.fdb94ba595375p-25),
+    (double2)(0x1.1e471d0000000p+0, 0x1.861b9b945a171p-28),
+    (double2)(0x1.1e8b490000000p+0, 0x1.54348015188c4p-25),
+    (double2)(0x1.1ecf550000000p+0, 0x1.b54d149865523p-25),
+    (double2)(0x1.1f13410000000p+0, 0x1.a0bb783d9de33p-25),
+    (double2)(0x1.1f570d0000000p+0, 0x1.629d12b1a2157p-25),
+    (double2)(0x1.1f9ab90000000p+0, 0x1.467fe35d179dfp-25),
+    (double2)(0x1.1fde450000000p+0, 0x1.9763f3e26c8f7p-25),
+    (double2)(0x1.2021b20000000p+0, 0x1.3f798bb9f7679p-26),
+    (double2)(0x1.2064ff0000000p+0, 0x1.52e577e855898p-26),
+    (double2)(0x1.20a82c0000000p+0, 0x1.fde47e5502c3ap-25),
+    (double2)(0x1.20eb3b0000000p+0, 0x1.cbd0b548d96a0p-26),
+    (double2)(0x1.212e2a0000000p+0, 0x1.a9cd9f7be8de8p-25),
+    (double2)(0x1.2170fb0000000p+0, 0x1.22bbe704886dep-26),
+    (double2)(0x1.21b3ac0000000p+0, 0x1.e3dea8317f020p-25),
+    (double2)(0x1.21f63f0000000p+0, 0x1.e812085ac8855p-25),
+    (double2)(0x1.2238b40000000p+0, 0x1.c87144f24cb07p-26),
+    (double2)(0x1.227b0a0000000p+0, 0x1.1e128ee311fa2p-25),
+    (double2)(0x1.22bd420000000p+0, 0x1.b5c163d61a2d3p-26),
+    (double2)(0x1.22ff5c0000000p+0, 0x1.7d97e7fb90633p-27),
+    (double2)(0x1.2341570000000p+0, 0x1.efe899d50f6a7p-25),
+    (double2)(0x1.2383350000000p+0, 0x1.d0333eb75de5ap-25),
+    (double2)(0x1.23c4f60000000p+0, 0x1.0e590be73a573p-27),
+    (double2)(0x1.2406980000000p+0, 0x1.8ce8dcac3cdd2p-25),
+    (double2)(0x1.24481d0000000p+0, 0x1.ee8a48954064bp-25),
+    (double2)(0x1.2489850000000p+0, 0x1.aa62f18461e09p-25),
+    (double2)(0x1.24cad00000000p+0, 0x1.01e5940986a15p-25),
+    (double2)(0x1.250bfe0000000p+0, 0x1.b082f4f9b8d4cp-28),
+    (double2)(0x1.254d0e0000000p+0, 0x1.876e0e5527f5ap-25),
+    (double2)(0x1.258e020000000p+0, 0x1.3617080831e6bp-25),
+    (double2)(0x1.25ced90000000p+0, 0x1.81b26e34aa4a2p-25),
+    (double2)(0x1.260f940000000p+0, 0x1.52ee66dfab0c1p-26),
+    (double2)(0x1.2650320000000p+0, 0x1.d85a5329e8819p-26),
+    (double2)(0x1.2690b40000000p+0, 0x1.105c1b646b5d1p-26),
+    (double2)(0x1.26d1190000000p+0, 0x1.bb6690c1a379cp-25),
+    (double2)(0x1.2711630000000p+0, 0x1.86aeba73ce3a9p-26),
+    (double2)(0x1.2751900000000p+0, 0x1.dd16198294dd4p-25),
+    (double2)(0x1.2791a20000000p+0, 0x1.454e675775e83p-25),
+    (double2)(0x1.27d1980000000p+0, 0x1.3842e026197eap-25),
+    (double2)(0x1.2811720000000p+0, 0x1.f1ce0e70c44d2p-25),
+    (double2)(0x1.2851310000000p+0, 0x1.ad636441a5627p-25),
+    (double2)(0x1.2890d50000000p+0, 0x1.4c205d7212abbp-26),
+    (double2)(0x1.28d05d0000000p+0, 0x1.167c86c116419p-25),
+    (double2)(0x1.290fca0000000p+0, 0x1.38ec3ef16e294p-25),
+    (double2)(0x1.294f1c0000000p+0, 0x1.473fceace9321p-25),
+    (double2)(0x1.298e530000000p+0, 0x1.7af53a836dba7p-25),
+    (double2)(0x1.29cd700000000p+0, 0x1.a51f3c383b652p-30),
+    (double2)(0x1.2a0c710000000p+0, 0x1.3696da190822dp-25),
+    (double2)(0x1.2a4b580000000p+0, 0x1.2f9adec77074bp-25),
+    (double2)(0x1.2a8a250000000p+0, 0x1.8190fd5bee55fp-28),
+    (double2)(0x1.2ac8d70000000p+0, 0x1.bfee8fac68e55p-27),
+    (double2)(0x1.2b076f0000000p+0, 0x1.31c9d6bc5f68ap-28),
+    (double2)(0x1.2b45ec0000000p+0, 0x1.89d0523737edfp-25),
+    (double2)(0x1.2b84500000000p+0, 0x1.a295943bf47bbp-26),
+    (double2)(0x1.2bc29a0000000p+0, 0x1.96be32e5b3207p-28),
+    (double2)(0x1.2c00c90000000p+0, 0x1.e44c7d909fa0ep-25),
+    (double2)(0x1.2c3ee00000000p+0, 0x1.b2505da94d9eap-29),
+    (double2)(0x1.2c7cdc0000000p+0, 0x1.0c851f46c9c98p-25),
+    (double2)(0x1.2cbabf0000000p+0, 0x1.da71f7d9aa3b7p-26),
+    (double2)(0x1.2cf8880000000p+0, 0x1.f1b605d019ef1p-25),
+    (double2)(0x1.2d36390000000p+0, 0x1.386e8a2189563p-27),
+    (double2)(0x1.2d73d00000000p+0, 0x1.b19fa5d306ba7p-28),
+    (double2)(0x1.2db14d0000000p+0, 0x1.dd749b67aef76p-25),
+    (double2)(0x1.2deeb20000000p+0, 0x1.76ff6f1dc04b0p-25),
+    (double2)(0x1.2e2bfe0000000p+0, 0x1.35a33d0b232a6p-25),
+    (double2)(0x1.2e69310000000p+0, 0x1.4bdc80024a4e1p-25),
+    (double2)(0x1.2ea64b0000000p+0, 0x1.ebd61770fd723p-25),
+    (double2)(0x1.2ee34d0000000p+0, 0x1.4769fc537264dp-25),
+    (double2)(0x1.2f20360000000p+0, 0x1.9021f429f3b98p-25),
+    (double2)(0x1.2f5d070000000p+0, 0x1.ee7083efbd606p-26),
+    (double2)(0x1.2f99bf0000000p+0, 0x1.ad985552a6b1ap-25),
+    (double2)(0x1.2fd65f0000000p+0, 0x1.e3df778772160p-25),
+    (double2)(0x1.3012e70000000p+0, 0x1.ca5d76ddc9b34p-25),
+    (double2)(0x1.304f570000000p+0, 0x1.91154ffdbaf74p-25),
+    (double2)(0x1.308baf0000000p+0, 0x1.67bdd57fb306ap-25),
+    (double2)(0x1.30c7ef0000000p+0, 0x1.7dc255ac40886p-25),
+    (double2)(0x1.3104180000000p+0, 0x1.219f38e8afafep-32),
+    (double2)(0x1.3140280000000p+0, 0x1.2416bf9669a04p-25),
+    (double2)(0x1.317c210000000p+0, 0x1.11c96b2b3987fp-25),
+    (double2)(0x1.31b8020000000p+0, 0x1.f99ed447e1177p-25),
+    (double2)(0x1.31f3cd0000000p+0, 0x1.3245826328a11p-30),
+    (double2)(0x1.322f7f0000000p+0, 0x1.6f56dd1e645f8p-25),
+    (double2)(0x1.326b1b0000000p+0, 0x1.6164946945535p-27),
+    (double2)(0x1.32a69f0000000p+0, 0x1.e37d59d190028p-26),
+    (double2)(0x1.32e20c0000000p+0, 0x1.68671f12bf828p-25),
+    (double2)(0x1.331d620000000p+0, 0x1.e8ecbca6aabbdp-25),
+    (double2)(0x1.3358a20000000p+0, 0x1.3f49e109a5912p-26),
+    (double2)(0x1.3393ca0000000p+0, 0x1.b8a0e11ec3043p-25),
+    (double2)(0x1.33cedc0000000p+0, 0x1.5fae00aed691ap-25),
+    (double2)(0x1.3409d70000000p+0, 0x1.c0569bece3e4ap-25),
+    (double2)(0x1.3444bc0000000p+0, 0x1.05e26744efbfep-25),
+    (double2)(0x1.347f8a0000000p+0, 0x1.5b570a94be5c5p-25),
+    (double2)(0x1.34ba420000000p+0, 0x1.d6f156ea0e063p-26),
+    (double2)(0x1.34f4e30000000p+0, 0x1.e0ca7612fc484p-25),
+    (double2)(0x1.352f6f0000000p+0, 0x1.963c927b25258p-27),
+    (double2)(0x1.3569e40000000p+0, 0x1.47930aa725a5cp-26),
+    (double2)(0x1.35a4430000000p+0, 0x1.8a79fe3af43b3p-26),
+    (double2)(0x1.35de8c0000000p+0, 0x1.e6dc29c41bdafp-26),
+    (double2)(0x1.3618bf0000000p+0, 0x1.57a2e76f863a5p-25),
+    (double2)(0x1.3652dd0000000p+0, 0x1.ae3b61716354dp-29),
+    (double2)(0x1.368ce40000000p+0, 0x1.65fb5df6906b1p-25),
+    (double2)(0x1.36c6d60000000p+0, 0x1.6177d7f588f7bp-25),
+    (double2)(0x1.3700b30000000p+0, 0x1.ad55abd091b67p-28),
+    (double2)(0x1.373a7a0000000p+0, 0x1.55337b2422d76p-30),
+    (double2)(0x1.37742b0000000p+0, 0x1.084ebe86972d5p-25),
+    (double2)(0x1.37adc70000000p+0, 0x1.56395808e1ea3p-25),
+    (double2)(0x1.37e74e0000000p+0, 0x1.1bce21b40fba7p-25),
+    (double2)(0x1.3820c00000000p+0, 0x1.006f94605b515p-26),
+    (double2)(0x1.385a1c0000000p+0, 0x1.aa676aceb1f7dp-25),
+    (double2)(0x1.3893640000000p+0, 0x1.8229f76554ce6p-26),
+    (double2)(0x1.38cc960000000p+0, 0x1.eabfc6cf57330p-25),
+    (double2)(0x1.3905b40000000p+0, 0x1.4daed9c0ce8bcp-25),
+    (double2)(0x1.393ebd0000000p+0, 0x1.0ff1768237141p-25),
+    (double2)(0x1.3977b10000000p+0, 0x1.575f83051b085p-25),
+    (double2)(0x1.39b0910000000p+0, 0x1.2667deb523e29p-27),
+    (double2)(0x1.39e95c0000000p+0, 0x1.816996954f4fdp-30),
+    (double2)(0x1.3a22120000000p+0, 0x1.87cfccf4d9cd4p-26),
+    (double2)(0x1.3a5ab40000000p+0, 0x1.2c5d018198353p-26),
+    (double2)(0x1.3a93410000000p+0, 0x1.a7a898dcc34aap-25),
+    (double2)(0x1.3acbbb0000000p+0, 0x1.cead6dadc36d1p-29),
+    (double2)(0x1.3b04200000000p+0, 0x1.a55759c498bdfp-29),
+    (double2)(0x1.3b3c700000000p+0, 0x1.c414a9ef6de04p-25),
+    (double2)(0x1.3b74ad0000000p+0, 0x1.3e2108a6e58fap-25),
+    (double2)(0x1.3bacd60000000p+0, 0x1.587fd7643d77cp-26),
+    (double2)(0x1.3be4eb0000000p+0, 0x1.901eb1d3ff3dfp-28),
+    (double2)(0x1.3c1ceb0000000p+0, 0x1.f2ccd7c812fc6p-25),
+    (double2)(0x1.3c54d90000000p+0, 0x1.1c8ee70a01049p-29),
+    (double2)(0x1.3c8cb20000000p+0, 0x1.63e8d02831eecp-26),
+    (double2)(0x1.3cc4770000000p+0, 0x1.f61a42a92c7ffp-25),
+    (double2)(0x1.3cfc2a0000000p+0, 0x1.a917399c84d24p-34),
+    (double2)(0x1.3d33c80000000p+0, 0x1.e9197c8eec2f0p-26),
+    (double2)(0x1.3d6b530000000p+0, 0x1.e6f842f5a1378p-26),
+    (double2)(0x1.3da2cb0000000p+0, 0x1.fac242a90a0fcp-29),
+    (double2)(0x1.3dda2f0000000p+0, 0x1.35ed726610227p-26),
+    (double2)(0x1.3e11800000000p+0, 0x1.0e0d64804b15bp-26),
+    (double2)(0x1.3e48be0000000p+0, 0x1.560675daba814p-31),
+    (double2)(0x1.3e7fe80000000p+0, 0x1.37388c8768032p-25),
+    (double2)(0x1.3eb7000000000p+0, 0x1.ee3c89f9e01f5p-28),
+    (double2)(0x1.3eee040000000p+0, 0x1.39f6f0d09747cp-25),
+    (double2)(0x1.3f24f60000000p+0, 0x1.322c327abb8f0p-27),
+    (double2)(0x1.3f5bd40000000p+0, 0x1.961b347c8ac80p-25),
+    (double2)(0x1.3f92a00000000p+0, 0x1.3711fbbd0f118p-25),
+    (double2)(0x1.3fc9590000000p+0, 0x1.4fad8d7718ffbp-25),
+    (double2)(0x1.3fffff0000000p+0, 0x1.fffffffffffffp-25), // i = 244: 1 + 244/256 = 1.25^3, so the exact value is 1.25. NOTE(review): stored as (1.25 - 2^-24) + (just under 2^-24) rather than (0x1.4p+0, 0) -- confirm intentional.
+    (double2)(0x1.4036930000000p+0, 0x1.67efa79ec35b4p-25),
+    (double2)(0x1.406d140000000p+0, 0x1.a737687a254a8p-25),
+    (double2)(0x1.40a3830000000p+0, 0x1.bace0f87d924dp-26),
+    (double2)(0x1.40d9df0000000p+0, 0x1.29e37c237e392p-25),
+    (double2)(0x1.4110290000000p+0, 0x1.57ce7ac3f3012p-26),
+    (double2)(0x1.4146600000000p+0, 0x1.82829359f8fbdp-25),
+    (double2)(0x1.417c850000000p+0, 0x1.cc9be42d14676p-25),
+    (double2)(0x1.41b2980000000p+0, 0x1.a8f001c137d0bp-25),
+    (double2)(0x1.41e8990000000p+0, 0x1.36127687dda05p-25),
+    (double2)(0x1.421e880000000p+0, 0x1.24dba322646f0p-26),
+    (double2)(0x1.4254640000000p+0, 0x1.dc43f1ed210b4p-25),
+    (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25) // i = 256: cbrt(2) ~= 1.259921; same pair as CBRT_REM_TBL[3]
+};
+
+
+DECLARE_TABLE(double2, CBRT_REM_TBL, 5) = { // 2^(j/3) for j = -2..2 as (head, tail) pairs; head + tail gives the value.
+    (double2)(0x1.428a2f0000000p-1, 0x1.31ae515c447bbp-26), // j = -2: 2^(-2/3) ~= 0.629961 (= cbrt(2) / 2)
+    (double2)(0x1.965fea0000000p-1, 0x1.4f5b8f20ac166p-27), // j = -1: 2^(-1/3) ~= 0.793701 (= cbrt(4) / 2)
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), // j =  0: 1
+    (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25), // j =  1: cbrt(2) ~= 1.259921
+    (double2)(0x1.965fea0000000p+0, 0x1.4f5b8f20ac166p-26), // j =  2: cbrt(4) ~= 1.587401
+};
+
 TABLE_FUNCTION(double2, ATAN_JBY256_TBL, atan_jby256_tbl);
 TABLE_FUNCTION(double2, TWO_TO_JBY64_EP, two_to_jby64_ep_tbl);
+TABLE_FUNCTION(double2, SINH_TBL, sinh_tbl);         // NOTE(review): TABLE_FUNCTION is defined outside this hunk; presumably emits the accessor named by its 3rd argument -- confirm.
+TABLE_FUNCTION(double2, COSH_TBL, cosh_tbl);         // Each name here has a matching TABLE_FUNCTION_DECL in tables.h.
+TABLE_FUNCTION(double, CBRT_INV_TBL, cbrt_inv_tbl);  // Scalar double element type, matching the table's DECLARE_TABLE(double, ...).
+TABLE_FUNCTION(double2, CBRT_DBL_TBL, cbrt_dbl_tbl);
+TABLE_FUNCTION(double2, CBRT_REM_TBL, cbrt_rem_tbl);
 
 #endif // cl_khr_fp64
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/tables.h libclc-0.2.0+git20170213/generic/lib/math/tables.h
--- libclc-0.2.0+git20150813/generic/lib/math/tables.h	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/tables.h	2017-02-12 21:33:49.000000000 +0000
@@ -42,6 +42,10 @@
 TABLE_FUNCTION_DECL(float, log_inv_tbl);
 TABLE_FUNCTION_DECL(float2, log2_tbl);
 TABLE_FUNCTION_DECL(uint4,  pibits_tbl);
+TABLE_FUNCTION_DECL(float2, sinhcosh_tbl); // Declarations paired with the TABLE_FUNCTION(...) lines added to tables.cl (same type and name).
+TABLE_FUNCTION_DECL(float2, cbrt_tbl);
+TABLE_FUNCTION_DECL(float, exp_tbl);       // Scalar float element type; the other three use float2.
+TABLE_FUNCTION_DECL(float2, exp_tbl_ep);
 
 #ifdef cl_khr_fp64
 
@@ -50,4 +54,10 @@
 TABLE_FUNCTION_DECL(double2, ln_tbl);
 TABLE_FUNCTION_DECL(double2, atan_jby256_tbl);
 TABLE_FUNCTION_DECL(double2, two_to_jby64_ep_tbl);
+TABLE_FUNCTION_DECL(double2, sinh_tbl);     // Double-precision declarations; this group sits inside the cl_khr_fp64 guard.
+TABLE_FUNCTION_DECL(double2, cosh_tbl);
+TABLE_FUNCTION_DECL(double, cbrt_inv_tbl);  // Scalar double element type; the other four use double2.
+TABLE_FUNCTION_DECL(double2, cbrt_dbl_tbl);
+TABLE_FUNCTION_DECL(double2, cbrt_rem_tbl);
+
 #endif // cl_khr_fp64
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/tanh.cl libclc-0.2.0+git20170213/generic/lib/math/tanh.cl
--- libclc-0.2.0+git20150813/generic/lib/math/tanh.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/tanh.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float tanh(float x)
+{
+    // Scalar single-precision tanh. tanh(x) is sinh(x)/cosh(x), equivalently:
+    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
+    // 2.  (1 - (2/(exp(2*x) + 1 )))
+    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
+    // y <= 1 uses a ratio of polynomials in y^2 and y > 1 uses formula 2;
+    // both candidates are always evaluated and one is chosen afterwards.
+
+    const float large_threshold = 0x1.0a2b24p+3f; // ~8.318; see the edge cases below
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32; // presumably the bits of |x| (mask is defined elsewhere -- confirm)
+    uint xs = ux ^ aux; // the bits of x that the mask removed; OR-ed back in below
+
+    float y = as_float(aux);
+    float y2 = y*y;
+    // Numerator (a1) and denominator (b1) polynomials in y2, selected when y < 0.9
+    float a1 = mad(y2,
+                   mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F),
+                   -0.28192806108402678e0F);
+    float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F);
+    // Same shape, coefficients selected when y >= 0.9 (result used up to y == 1)
+    float a2 = mad(y2,
+                   mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F),
+                   -0.24069858695196524e0F);
+    float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F);
+
+    int c = y < 0.9f;
+    float a = c ? a1 : a2;
+    float b = c ? b1 : b2;
+    float zlo = mad(MATH_DIVIDE(a, b), y*y2, y); // y + y^3 * MATH_DIVIDE(a, b)
+
+    float p = exp(2.0f * y) + 1.0f;
+    float zhi = 1.0F - MATH_DIVIDE(2.0F, p); // formula 2 evaluated at y
+
+    float z = y <= 1.0f ? zlo : zhi; // y == 1 still takes the polynomial ratio
+    z = as_float(xs | as_uint(z)); // merge the bits held in xs into the result
+
+    // Edge cases; the last assignment wins when both conditions hold
+    float sone = as_float(0x3f800000U | xs); // 0x3f800000 is the bit pattern of 1.0f
+    z = y > large_threshold ? sone : z;
+    z = aux < 0x39000000 | aux > 0x7f800000 ? x : z; // below the bits of 2^-13, or above the +inf bits (NaN): return x itself
+
+    return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, tanh, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double tanh(double x)
+{
+    // Scalar double-precision tanh. tanh(x) is sinh(x)/cosh(x), equivalently:
+    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
+    // 2.  (1 - (2/(exp(2*x) + 1 )))
+    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
+    // y <= 1 uses a ratio of polynomials in y^2 and y > 1 uses formula 2;
+    // both candidates are always evaluated and one is chosen afterwards.
+
+    // ln(2^27) ~= 18.71: the point at which e^-x is insignificant compared to e^x
+    const double large_threshold = 0x1.2b708872320e2p+4;
+
+    ulong ux = as_ulong(x);
+    ulong ax = ux & ~SIGNBIT_DP64; // presumably the bits of |x| (SIGNBIT_DP64 is defined elsewhere -- confirm)
+    ulong sx = ux ^ ax; // the bits of x that the mask removed; OR-ed into the return value
+    double y = as_double(ax);
+    double y2 = y * y;
+
+    // y < 0.9: numerator (znl) and denominator (zdl) polynomials in y2
+    double znl = fma(y2,
+                     fma(y2,
+                         fma(y2, -0.142077926378834722618091e-7, -0.200047621071909498730453e-3),
+                         -0.176016349003044679402273e-1),
+                     -0.274030424656179760118928e0);
+
+    double zdl = fma(y2,
+                     fma(y2,
+                         fma(y2, 0.2091140262529164482568557e-3, 0.201562166026937652780575e-1),
+                         0.381641414288328849317962e0),
+                     0.822091273968539282568011e0);
+
+    // 0.9 <= y <= 1: numerator (znm) and denominator (zdm) polynomials in y2
+    double znm = fma(y2,
+                     fma(y2,
+                         fma(y2, -0.115475878996143396378318e-7, -0.165597043903549960486816e-3),
+                         -0.146173047288731678404066e-1),
+                     -0.227793870659088295252442e0);
+
+    double zdm = fma(y2,
+                     fma(y2,
+                         fma(y2, 0.173076050126225961768710e-3, 0.167358775461896562588695e-1),
+                         0.317204558977294374244770e0),
+                     0.683381611977295894959554e0);
+
+    int c = y < 0.9;
+    double zn = c ? znl : znm;
+    double zd = c ? zdl : zdm;
+    double z = y + y*y2 * MATH_DIVIDE(zn, zd); // y + y^3 * MATH_DIVIDE(zn, zd)
+
+    // y > 1: formula 2 evaluated at y
+    double p = exp(2.0 * y) + 1.0;
+    double zg = 1.0 - 2.0 / p;
+
+    z = y > 1.0 ? zg : z;
+
+    // y below 2^-28, or ax above PINFBITPATT_DP64 (presumably the +inf bits, i.e. NaN): use x itself
+    z = y < 0x1.0p-28 | ax > PINFBITPATT_DP64 ? x : z;
+
+    z = y > large_threshold ? 1.0 : z; // past the threshold the magnitude is exactly 1.0
+
+    return as_double(sx | as_ulong(z)); // merge the bits held in sx into the result
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tanh, double);
+
+#endif // cl_khr_fp64
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/tgamma.cl libclc-0.2.0+git20170213/generic/lib/math/tgamma.cl
--- libclc-0.2.0+git20150813/generic/lib/math/tgamma.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/tgamma.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016 Aaron Watry
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float tgamma(float x) {
+    // exp(lgamma(|x|)) is returned unchanged unless x < 0.
+    const float abs_x = fabs(x);
+    float result = exp(lgamma(abs_x));
+
+    if (x < 0.0f) {
+        // Negative x: pi / (exp(lgamma(|x|)) * |x| * sinpi(x)).
+        const float pi = 3.1415926535897932384626433832795f;
+        const float sin_pi_x = sinpi(x);
+        result = pi / (result * abs_x * sin_pi_x);
+        // A zero quotient is replaced first; sinpi(x) == 0 then overrides it.
+        result = (result == 0) ? as_float(PINFBITPATT_SP32) : result;
+        result = (sin_pi_x == 0) ? as_float(QNANBITPATT_SP32) : result;
+    }
+    return result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, tgamma, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double tgamma(double x) {
+    // exp(lgamma(|x|)) is returned unchanged unless x < 0.
+    const double abs_x = fabs(x);
+    double result = exp(lgamma(abs_x));
+
+    if (x < 0.0) {
+        // Negative x: pi / (exp(lgamma(|x|)) * |x| * sinpi(x)).
+        const double pi = 3.1415926535897932384626433832795;
+        const double sin_pi_x = sinpi(x);
+        result = pi / (result * abs_x * sin_pi_x);
+        // A zero quotient is replaced first; sinpi(x) == 0 then overrides it.
+        result = (result == 0) ? as_double(PINFBITPATT_DP64) : result;
+        result = (sin_pi_x == 0) ? as_double(QNANBITPATT_DP64) : result;
+    }
+    return result;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tgamma, double);
+
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/shared/min.inc libclc-0.2.0+git20170213/generic/lib/shared/min.inc
--- libclc-0.2.0+git20150813/generic/lib/shared/min.inc	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/shared/min.inc	2017-02-12 21:33:49.000000000 +0000
@@ -1,9 +1,9 @@
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) {
-  return (a < b ? a : b);
+  return (b < a ? b : a); // yields b only where b < a holds, otherwise a
 }
 
 #ifndef __CLC_SCALAR
 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) {
-  return (a < (__CLC_GENTYPE)b ? a : (__CLC_GENTYPE)b);
+  return (b < (__CLC_GENTYPE)a ? (__CLC_GENTYPE)b : a); // scalar b is compared against a; b is chosen only where b < a
 }
 #endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/shared/vstore.cl libclc-0.2.0+git20170213/generic/lib/shared/vstore.cl
--- libclc-0.2.0+git20150813/generic/lib/shared/vstore.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/shared/vstore.cl	2017-02-12 21:33:49.000000000 +0000
@@ -50,3 +50,48 @@
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
     VSTORE_ADDR_SPACES(double)
 #endif
+
+/* The vstore_half functions are legal even without cl_khr_fp16, so nothing here is guarded by it. */
+#define DECLARE_HELPER(STYPE, AS) void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
+/* Prototypes of the helpers defined in vstore_half_helpers.ll: one per source type and address space. */
+DECLARE_HELPER(float, __private);
+DECLARE_HELPER(float, __global);
+DECLARE_HELPER(float, __local);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+DECLARE_HELPER(double, __private);
+DECLARE_HELPER(double, __global);
+DECLARE_HELPER(double, __local);
+#endif
+
+/* VEC_STOREn(STYPE, AS, val): hand each of the n components of val to the helper in turn; reads mem and post-increments offset from the expanding scope. */
+#define VEC_STORE1(STYPE, AS, val) __clc_vstore_half_##STYPE##_helper##AS (val, &mem[offset++]);
+#define VEC_STORE2(STYPE, AS, val) \
+	VEC_STORE1(STYPE, AS, val.lo) \
+	VEC_STORE1(STYPE, AS, val.hi)
+#define VEC_STORE3(STYPE, AS, val) \
+	VEC_STORE1(STYPE, AS, val.s0) \
+	VEC_STORE1(STYPE, AS, val.s1) \
+	VEC_STORE1(STYPE, AS, val.s2)
+#define VEC_STORE4(STYPE, AS, val) \
+	VEC_STORE2(STYPE, AS, val.lo) \
+	VEC_STORE2(STYPE, AS, val.hi)
+#define VEC_STORE8(STYPE, AS, val) \
+	VEC_STORE4(STYPE, AS, val.lo) \
+	VEC_STORE4(STYPE, AS, val.hi)
+#define VEC_STORE16(STYPE, AS, val) \
+	VEC_STORE8(STYPE, AS, val.lo) \
+	VEC_STORE8(STYPE, AS, val.hi)
+/* __FUNC defines vstore_half<SUFFIX> for one address space; offset is scaled by VEC_SIZE before the per-component stores. */
+#define __FUNC(SUFFIX, VEC_SIZE, TYPE, STYPE, AS) \
+  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
+    offset *= VEC_SIZE; /* vector index -> element index into mem */ \
+    VEC_STORE##VEC_SIZE(STYPE, AS, vec) \
+  }
+/* Extra expansion level: arguments that are themselves macros get expanded before __FUNC pastes them with ##. */
+#define FUNC(SUFFIX, VEC_SIZE, TYPE, STYPE, AS) __FUNC(SUFFIX, VEC_SIZE, TYPE, STYPE, AS)
+
+#define __CLC_BODY "vstore_half.inc"
+#include <clc/math/gentype.inc>
+
diff -Nru libclc-0.2.0+git20150813/generic/lib/shared/vstore_half_helpers.ll libclc-0.2.0+git20170213/generic/lib/shared/vstore_half_helpers.ll
--- libclc-0.2.0+git20150813/generic/lib/shared/vstore_half_helpers.ll	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/shared/vstore_half_helpers.ll	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,35 @@
+define void @__clc_vstore_half_float_helper__private(float %data, half addrspace(0)* nocapture %ptr) nounwind alwaysinline { ; float -> half store; addrspace(0) backs the __private prototype in vstore.cl
+  %res = fptrunc float %data to half ; narrow the value to half
+  store half %res, half addrspace(0)* %ptr ; write it through %ptr
+  ret void
+}
+
+define void @__clc_vstore_half_float_helper__global(float %data, half addrspace(1)* nocapture %ptr) nounwind alwaysinline { ; float -> half store; addrspace(1) backs the __global prototype in vstore.cl
+  %res = fptrunc float %data to half ; narrow the value to half
+  store half %res, half addrspace(1)* %ptr ; write it through %ptr
+  ret void
+}
+
+define void @__clc_vstore_half_float_helper__local(float %data, half addrspace(3)* nocapture %ptr) nounwind alwaysinline { ; float -> half store; addrspace(3) backs the __local prototype in vstore.cl
+  %res = fptrunc float %data to half ; narrow the value to half
+  store half %res, half addrspace(3)* %ptr ; write it through %ptr
+  ret void
+}
+
+define void @__clc_vstore_half_double_helper__private(double %data, half addrspace(0)* nocapture %ptr) nounwind alwaysinline { ; double -> half store; addrspace(0) backs the __private prototype in vstore.cl
+  %res = fptrunc double %data to half ; narrow the value to half in a single fptrunc
+  store half %res, half addrspace(0)* %ptr ; write it through %ptr
+  ret void
+}
+
+define void @__clc_vstore_half_double_helper__global(double %data, half addrspace(1)* nocapture %ptr) nounwind alwaysinline { ; double -> half store; addrspace(1) backs the __global prototype in vstore.cl
+  %res = fptrunc double %data to half ; narrow the value to half in a single fptrunc
+  store half %res, half addrspace(1)* %ptr ; write it through %ptr
+  ret void
+}
+
+define void @__clc_vstore_half_double_helper__local(double %data, half addrspace(3)* nocapture %ptr) nounwind alwaysinline { ; double -> half store; addrspace(3) backs the __local prototype in vstore.cl
+  %res = fptrunc double %data to half ; narrow the value to half in a single fptrunc
+  store half %res, half addrspace(3)* %ptr ; write it through %ptr
+  ret void
+}
diff -Nru libclc-0.2.0+git20150813/generic/lib/shared/vstore_half.inc libclc-0.2.0+git20170213/generic/lib/shared/vstore_half.inc
--- libclc-0.2.0+git20150813/generic/lib/shared/vstore_half.inc	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/shared/vstore_half.inc	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,10 @@
+
+#ifdef __CLC_VECSIZE /* vector gentype: the width is both the name suffix and the component count */
+  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private);
+  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local);
+  FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global);
+#else /* scalar gentype: empty suffix (plain vstore_half), one component */
+  FUNC(, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private);
+  FUNC(, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local);
+  FUNC(, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global);
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/SOURCES libclc-0.2.0+git20170213/generic/lib/SOURCES
--- libclc-0.2.0+git20150813/generic/lib/SOURCES	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/SOURCES	2017-02-12 21:33:49.000000000 +0000
@@ -73,29 +73,40 @@
 math/atan2pi.cl
 math/atanh.cl
 math/atanpi.cl
+math/cbrt.cl
 math/copysign.cl
 math/cos.cl
+math/cosh.cl
 math/cospi.cl
 math/ep_log.cl
+math/erf.cl
 math/erfc.cl
 math/exp.cl
 math/exp_helper.cl
+math/expm1.cl
 math/exp2.cl
 math/exp10.cl
+math/fdim.cl
 math/fmax.cl
 math/fmin.cl
 math/fmod.cl
 math/fract.cl
+math/frexp.cl
 math/half_rsqrt.cl
 math/half_sqrt.cl
 math/hypot.cl
+math/ilogb.cl
 math/clc_ldexp.cl
 math/ldexp.cl
+math/lgamma.cl
+math/lgamma_r.cl
 math/log.cl
 math/log10.cl
 math/log1p.cl
 math/log2.cl
+math/logb.cl
 math/mad.cl
+math/modf.cl
 math/native_log.cl
 math/native_log2.cl
 math/tables.cl
@@ -109,6 +120,8 @@
 math/clc_sqrt.cl
 math/sqrt.cl
 math/tan.cl
+math/tanh.cl
+math/tgamma.cl
 relational/all.cl
 relational/any.cl
 relational/bitselect.cl
@@ -131,5 +144,7 @@
 shared/min.cl
 shared/vload.cl
 shared/vstore.cl
+shared/vstore_half_helpers.ll
 workitem/get_global_id.cl
 workitem/get_global_size.cl
+image/get_image_dim.cl
diff -Nru libclc-0.2.0+git20150813/generic/lib/workitem/get_global_id.cl libclc-0.2.0+git20170213/generic/lib/workitem/get_global_id.cl
--- libclc-0.2.0+git20150813/generic/lib/workitem/get_global_id.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/workitem/get_global_id.cl	2017-02-12 21:33:49.000000000 +0000
@@ -1,5 +1,5 @@
 #include <clc/clc.h>
 
 _CLC_DEF size_t get_global_id(uint dim) {
-  return get_group_id(dim)*get_local_size(dim) + get_local_id(dim);
+  return get_group_id(dim) * get_local_size(dim) + get_local_id(dim) + get_global_offset(dim); // group base + local id, shifted by the global offset
 }
diff -Nru libclc-0.2.0+git20150813/.gitignore libclc-0.2.0+git20170213/.gitignore
--- libclc-0.2.0+git20150813/.gitignore	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/.gitignore	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,14 @@
+Makefile
+amdgcn--
+amdgcn--amdhsa
+build/*.pyc
+built_libs/
+generic--
+generic/lib/convert.cl
+libclc.pc
+nvptx--nvidiacl
+nvptx64--nvidiacl
+r600--
+utils/prepare-builtins
+utils/prepare-builtins.o
+utils/prepare-builtins.o.d
diff -Nru libclc-0.2.0+git20150813/LICENSE.TXT libclc-0.2.0+git20170213/LICENSE.TXT
--- libclc-0.2.0+git20150813/LICENSE.TXT	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/LICENSE.TXT	2017-02-12 21:33:49.000000000 +0000
@@ -11,7 +11,7 @@
 
 ==============================================================================
 
-Copyright (c) 2011-2014 by the contributors listed in CREDITS.TXT
+Copyright (c) 2011-2016 by the contributors listed in CREDITS.TXT
 
 All rights reserved.
 
diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/SOURCES libclc-0.2.0+git20170213/ptx-nvidiacl/lib/SOURCES
--- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/SOURCES	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/SOURCES	2017-02-12 21:33:49.000000000 +0000
@@ -1,4 +1,5 @@
 synchronization/barrier.cl
+workitem/get_global_id.cl
 workitem/get_group_id.cl
 workitem/get_local_id.cl
 workitem/get_local_size.cl
diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/synchronization/barrier.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/synchronization/barrier.cl
--- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/synchronization/barrier.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/synchronization/barrier.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,7 +2,7 @@
 
 _CLC_DEF void barrier(cl_mem_fence_flags flags) {
   if (flags & CLK_LOCAL_MEM_FENCE) {
-    __builtin_ptx_bar_sync(0);
+    __syncthreads(); // NOTE(review): reached only when CLK_LOCAL_MEM_FENCE is set; any other flags value does nothing here -- confirm intended
   }
 }
 
diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_global_id.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_global_id.cl
--- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_global_id.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_global_id.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,5 @@
+#include <clc/clc.h>
+
+_CLC_DEF size_t get_global_id(uint dim) {
+  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim); // local id + group base; no global-offset term here
+}
diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_group_id.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_group_id.cl
--- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_group_id.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_group_id.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,9 +2,9 @@
 
 _CLC_DEF size_t get_group_id(uint dim) {
   switch (dim) {
-  case 0:  return __builtin_ptx_read_ctaid_x();
-  case 1:  return __builtin_ptx_read_ctaid_y();
-  case 2:  return __builtin_ptx_read_ctaid_z();
+  case 0:  return __nvvm_read_ptx_sreg_ctaid_x(); // PTX special register %ctaid.x
+  case 1:  return __nvvm_read_ptx_sreg_ctaid_y(); // %ctaid.y
+  case 2:  return __nvvm_read_ptx_sreg_ctaid_z(); // %ctaid.z
   default: return 0;
   }
 }
diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_local_id.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_local_id.cl
--- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_local_id.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_local_id.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,9 +2,9 @@
 
 _CLC_DEF size_t get_local_id(uint dim) {
   switch (dim) {
-  case 0:  return __builtin_ptx_read_tid_x();
-  case 1:  return __builtin_ptx_read_tid_y();
-  case 2:  return __builtin_ptx_read_tid_z();
+  case 0:  return __nvvm_read_ptx_sreg_tid_x(); // PTX special register %tid.x
+  case 1:  return __nvvm_read_ptx_sreg_tid_y(); // %tid.y
+  case 2:  return __nvvm_read_ptx_sreg_tid_z(); // %tid.z
   default: return 0;
   }
 }
diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_local_size.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_local_size.cl
--- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_local_size.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_local_size.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,9 +2,9 @@
 
 _CLC_DEF size_t get_local_size(uint dim) {
   switch (dim) {
-  case 0:  return __builtin_ptx_read_ntid_x();
-  case 1:  return __builtin_ptx_read_ntid_y();
-  case 2:  return __builtin_ptx_read_ntid_z();
+  case 0:  return __nvvm_read_ptx_sreg_ntid_x(); // PTX special register %ntid.x
+  case 1:  return __nvvm_read_ptx_sreg_ntid_y(); // %ntid.y
+  case 2:  return __nvvm_read_ptx_sreg_ntid_z(); // %ntid.z
   default: return 0;
   }
 }
diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_num_groups.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_num_groups.cl
--- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_num_groups.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_num_groups.cl	2017-02-12 21:33:49.000000000 +0000
@@ -2,9 +2,9 @@
 
 _CLC_DEF size_t get_num_groups(uint dim) {
   switch (dim) {
-  case 0:  return __builtin_ptx_read_nctaid_x();
-  case 1:  return __builtin_ptx_read_nctaid_y();
-  case 2:  return __builtin_ptx_read_nctaid_z();
+  case 0:  return __nvvm_read_ptx_sreg_nctaid_x(); // PTX special register %nctaid.x
+  case 1:  return __nvvm_read_ptx_sreg_nctaid_y(); // %nctaid.y
+  case 2:  return __nvvm_read_ptx_sreg_nctaid_z(); // %nctaid.z
   default: return 0;
   }
 }
diff -Nru libclc-0.2.0+git20150813/r600/lib/atomic/atomic.cl libclc-0.2.0+git20170213/r600/lib/atomic/atomic.cl
--- libclc-0.2.0+git20150813/r600/lib/atomic/atomic.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/atomic/atomic.cl	1970-01-01 00:00:00.000000000 +0000
@@ -1,65 +0,0 @@
-#include <clc/clc.h>
-
-#define ATOMIC_FUNC_DEFINE(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
-_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE val) { \
-	return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)val); \
-}
-
-/* For atomic functions that don't need different bitcode dependending on argument signedness */
-#define ATOMIC_FUNC_SIGN(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
-	_CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE); \
-	ATOMIC_FUNC_DEFINE(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
-	ATOMIC_FUNC_DEFINE(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE)
-
-#define ATOMIC_FUNC_ADDRSPACE(TYPE, FUNCTION) \
-	ATOMIC_FUNC_SIGN(TYPE, FUNCTION, global, 1) \
-	ATOMIC_FUNC_SIGN(TYPE, FUNCTION, local, 3)
-
-#define ATOMIC_FUNC(FUNCTION) \
-	ATOMIC_FUNC_ADDRSPACE(int, FUNCTION)
-
-#define ATOMIC_FUNC_DEFINE_3_ARG(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
-_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE cmp, RET_SIGN TYPE val) { \
-	return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)cmp, (ARG_SIGN TYPE)val); \
-}
-
-/* For atomic functions that don't need different bitcode dependending on argument signedness */
-#define ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
-	_CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE, signed TYPE); \
-	ATOMIC_FUNC_DEFINE_3_ARG(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \
-	ATOMIC_FUNC_DEFINE_3_ARG(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE)
-
-#define ATOMIC_FUNC_ADDRSPACE_3_ARG(TYPE, FUNCTION) \
-	ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, global, 1) \
-	ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, local, 3)
-
-#define ATOMIC_FUNC_3_ARG(FUNCTION) \
-	ATOMIC_FUNC_ADDRSPACE_3_ARG(int, FUNCTION)
-
-ATOMIC_FUNC(atomic_add)
-ATOMIC_FUNC(atomic_and)
-ATOMIC_FUNC(atomic_or)
-ATOMIC_FUNC(atomic_sub)
-ATOMIC_FUNC(atomic_xchg)
-ATOMIC_FUNC(atomic_xor)
-ATOMIC_FUNC_3_ARG(atomic_cmpxchg)
-
-_CLC_DECL signed int __clc_atomic_max_addr1(volatile global signed int*, signed int);
-_CLC_DECL signed int __clc_atomic_max_addr3(volatile local signed int*, signed int);
-_CLC_DECL uint __clc_atomic_umax_addr1(volatile global uint*, uint);
-_CLC_DECL uint __clc_atomic_umax_addr3(volatile local uint*, uint);
-
-ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, global, 1)
-ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, local, 3)
-ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, global, 1)
-ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, local, 3)
-
-_CLC_DECL signed int __clc_atomic_min_addr1(volatile global signed int*, signed int);
-_CLC_DECL signed int __clc_atomic_min_addr3(volatile local signed int*, signed int);
-_CLC_DECL uint __clc_atomic_umin_addr1(volatile global uint*, uint);
-_CLC_DECL uint __clc_atomic_umin_addr3(volatile local uint*, uint);
-
-ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, global, 1)
-ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, local, 3)
-ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, global, 1)
-ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, local, 3)
diff -Nru libclc-0.2.0+git20150813/r600/lib/math/ldexp.cl libclc-0.2.0+git20170213/r600/lib/math/ldexp.cl
--- libclc-0.2.0+git20150813/r600/lib/math/ldexp.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/math/ldexp.cl	1970-01-01 00:00:00.000000000 +0000
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2014 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <clc/clc.h>
-
-#include "../../../generic/lib/clcmacro.h"
-
-#ifdef __HAS_LDEXPF__
-#define BUILTINF __builtin_amdgpu_ldexpf
-#else
-#include "math/clc_ldexp.h"
-#define BUILTINF __clc_ldexp
-#endif
-
-// This defines all the ldexp(floatN, intN) variants.
-_CLC_DEFINE_BINARY_BUILTIN(float, ldexp, BUILTINF, float, int);
-
-#ifdef cl_khr_fp64
-  #pragma OPENCL EXTENSION cl_khr_fp64 : enable
-    // This defines all the ldexp(doubleN, intN) variants.
-  _CLC_DEFINE_BINARY_BUILTIN(double, ldexp, __builtin_amdgpu_ldexp, double, int);
-#endif
-
-// This defines all the ldexp(GENTYPE, int);
-#define __CLC_BODY <../../../generic/lib/math/ldexp.inc>
-#include <clc/math/gentype.inc>
-
-#undef BUILTINF
diff -Nru libclc-0.2.0+git20150813/r600/lib/math/nextafter.cl libclc-0.2.0+git20170213/r600/lib/math/nextafter.cl
--- libclc-0.2.0+git20150813/r600/lib/math/nextafter.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/math/nextafter.cl	1970-01-01 00:00:00.000000000 +0000
@@ -1,4 +0,0 @@
-#include <clc/clc.h>
-#include "../lib/clcmacro.h"
-
-_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float)
diff -Nru libclc-0.2.0+git20150813/r600/lib/math/sqrt.cl libclc-0.2.0+git20170213/r600/lib/math/sqrt.cl
--- libclc-0.2.0+git20150813/r600/lib/math/sqrt.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/math/sqrt.cl	1970-01-01 00:00:00.000000000 +0000
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2015 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#include <clc/clc.h>
-#include "../../../generic/lib/clcmacro.h"
-#include "math/clc_sqrt.h"
-
-_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float)
-
-#ifdef cl_khr_fp64
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-
-_CLC_OVERLOAD _CLC_DEF double sqrt(double x) {
-
-  uint vcc = x < 0x1p-767;
-  uint exp0 = vcc ? 0x100 : 0;
-  unsigned exp1 = vcc ? 0xffffff80 : 0;
-
-  double v01 = ldexp(x, exp0);
-  double v23 = __builtin_amdgpu_rsq(v01);
-  double v45 = v01 * v23;
-  v23 = v23 * 0.5;
-
-  double v67 = fma(-v23, v45, 0.5);
-  v45 = fma(v45, v67, v45);
-  double v89 = fma(-v45, v45, v01);
-  v23 = fma(v23, v67, v23);
-  v45 = fma(v89, v23, v45);
-  v67 = fma(-v45, v45, v01);
-  v23 = fma(v67, v23, v45);
-
-  v23 = ldexp(v23, exp1);
-  return ((x == __builtin_inf()) || (x == 0.0)) ? v01 : v23;
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double);
-
-#endif
diff -Nru libclc-0.2.0+git20150813/r600/lib/OVERRIDES libclc-0.2.0+git20170213/r600/lib/OVERRIDES
--- libclc-0.2.0+git20150813/r600/lib/OVERRIDES	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/OVERRIDES	2017-02-12 21:33:49.000000000 +0000
@@ -1,2 +0,0 @@
-workitem/get_group_id.cl
-workitem/get_global_size.cl
diff -Nru libclc-0.2.0+git20150813/r600/lib/SOURCES libclc-0.2.0+git20170213/r600/lib/SOURCES
--- libclc-0.2.0+git20150813/r600/lib/SOURCES	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/SOURCES	2017-02-12 21:33:49.000000000 +0000
@@ -1,12 +1,8 @@
-atomic/atomic.cl
-math/ldexp.cl
-math/nextafter.cl
-math/sqrt.cl
-workitem/get_num_groups.ll
-workitem/get_group_id.ll
-workitem/get_local_size.ll
-workitem/get_local_id.ll
-workitem/get_global_size.ll
-workitem/get_work_dim.ll
-synchronization/barrier.cl
 synchronization/barrier_impl.ll
+workitem/get_global_offset.cl
+workitem/get_group_id.cl
+workitem/get_global_size.ll
+workitem/get_local_id.cl
+workitem/get_local_size.ll
+workitem/get_num_groups.ll
+workitem/get_work_dim.cl
diff -Nru libclc-0.2.0+git20150813/r600/lib/synchronization/barrier.cl libclc-0.2.0+git20170213/r600/lib/synchronization/barrier.cl
--- libclc-0.2.0+git20150813/r600/lib/synchronization/barrier.cl	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/synchronization/barrier.cl	1970-01-01 00:00:00.000000000 +0000
@@ -1,10 +0,0 @@
-
-#include <clc/clc.h>
-
-_CLC_DEF int __clc_clk_local_mem_fence() {
-  return CLK_LOCAL_MEM_FENCE;
-}
-
-_CLC_DEF int __clc_clk_global_mem_fence() {
-  return CLK_GLOBAL_MEM_FENCE;
-}
diff -Nru libclc-0.2.0+git20150813/r600/lib/synchronization/barrier_impl.ll libclc-0.2.0+git20170213/r600/lib/synchronization/barrier_impl.ll
--- libclc-0.2.0+git20150813/r600/lib/synchronization/barrier_impl.ll	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/synchronization/barrier_impl.ll	2017-02-12 21:33:49.000000000 +0000
@@ -1,9 +1,8 @@
-declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
-declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
-declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
-declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
+declare i32 @__clc_clk_local_mem_fence() #1
+declare i32 @__clc_clk_global_mem_fence() #1
+declare void @llvm.r600.group.barrier() #0
 
-define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
+define void @barrier(i32 %flags) #2 {
 barrier_local_test:
   %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence()
   %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE
@@ -11,7 +10,7 @@
   br i1 %1, label %barrier_local, label %barrier_global_test
 
 barrier_local:
-  call void @llvm.AMDGPU.barrier.local() noduplicate
+  call void @llvm.r600.group.barrier()
   br label %barrier_global_test
 
 barrier_global_test:
@@ -21,9 +20,13 @@
   br i1 %3, label %barrier_global, label %done
 
 barrier_global:
-  call void @llvm.AMDGPU.barrier.global() noduplicate
+  call void @llvm.r600.group.barrier()
   br label %done
 
 done:
   ret void
 }
+
+attributes #0 = { nounwind convergent }
+attributes #1 = { nounwind alwaysinline }
+attributes #2 = { nounwind convergent alwaysinline }
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_global_offset.cl libclc-0.2.0+git20170213/r600/lib/workitem/get_global_offset.cl
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_global_offset.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_global_offset.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,19 @@
+#include <clc/clc.h>
+
+/*
+ * Implements get_global_offset(): for dim 0, 1 or 2 this returns element
+ * `dim + 1` of the uint buffer obtained from
+ * __builtin_r600_implicitarg_ptr(); any other dim yields 0.
+ *
+ * Element 0 of the same buffer is what get_work_dim() reads.
+ */
+_CLC_DEF uint get_global_offset(uint dim)
+{
+	if (dim >= 3)
+		return 0;
+
+	__attribute__((address_space(7))) uint *implicit_args =
+		(__attribute__((address_space(7))) uint *)
+		__builtin_r600_implicitarg_ptr();
+	return implicit_args[dim + 1];
+}
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_global_size.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_global_size.ll
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_global_size.ll	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_global_size.ll	2017-02-12 21:33:49.000000000 +0000
@@ -14,5 +14,5 @@
   %z = call i32 @llvm.r600.read.global.size.z() nounwind readnone
   ret i32 %z
 default:
-  ret i32 0
+  ret i32 1
 }
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_group_id.cl libclc-0.2.0+git20170213/r600/lib/workitem/get_group_id.cl
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_group_id.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_group_id.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,19 @@
+#include <clc/clc.h>
+
+/*
+ * Returns the work-group ID for dimension `dim`.
+ *
+ * Dimensions 0, 1 and 2 are read through the r600 tgid x/y/z builtins. Any
+ * other index is out of range: the OpenCL C specification defines the
+ * result of get_group_id() as 0 in that case, which is also what the
+ * get_group_id.ll implementation replaced by this file returned.
+ */
+_CLC_DEF uint get_group_id(uint dim)
+{
+	switch(dim) {
+	case 0: return __builtin_r600_read_tgid_x();
+	case 1: return __builtin_r600_read_tgid_y();
+	case 2: return __builtin_r600_read_tgid_z();
+	default: return 0;
+	}
+}
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_group_id.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_group_id.ll
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_group_id.ll	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_group_id.ll	1970-01-01 00:00:00.000000000 +0000
@@ -1,18 +0,0 @@
-declare i32 @llvm.r600.read.tgid.x() nounwind readnone
-declare i32 @llvm.r600.read.tgid.y() nounwind readnone
-declare i32 @llvm.r600.read.tgid.z() nounwind readnone
-
-define i32 @get_group_id(i32 %dim) nounwind readnone alwaysinline {
-  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
-x_dim:
-  %x = call i32 @llvm.r600.read.tgid.x() nounwind readnone
-  ret i32 %x
-y_dim:
-  %y = call i32 @llvm.r600.read.tgid.y() nounwind readnone
-  ret i32 %y
-z_dim:
-  %z = call i32 @llvm.r600.read.tgid.z() nounwind readnone
-  ret i32 %z
-default:
-  ret i32 0
-}
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_local_id.cl libclc-0.2.0+git20170213/r600/lib/workitem/get_local_id.cl
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_local_id.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_local_id.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,19 @@
+#include <clc/clc.h>
+
+/*
+ * Returns the local work-item ID for dimension `dim`.
+ *
+ * Dimensions 0, 1 and 2 are read through the r600 tidig x/y/z builtins. Any
+ * other index is out of range: the OpenCL C specification defines the
+ * result of get_local_id() as 0 in that case, which is also what the
+ * get_local_id.ll implementation replaced by this file returned.
+ */
+_CLC_DEF uint get_local_id(uint dim)
+{
+	switch(dim) {
+	case 0: return __builtin_r600_read_tidig_x();
+	case 1: return __builtin_r600_read_tidig_y();
+	case 2: return __builtin_r600_read_tidig_z();
+	default: return 0;
+	}
+}
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_local_id.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_local_id.ll
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_local_id.ll	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_local_id.ll	1970-01-01 00:00:00.000000000 +0000
@@ -1,18 +0,0 @@
-declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-declare i32 @llvm.r600.read.tidig.y() nounwind readnone
-declare i32 @llvm.r600.read.tidig.z() nounwind readnone
-
-define i32 @get_local_id(i32 %dim) nounwind readnone alwaysinline {
-  switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
-x_dim:
-  %x = call i32 @llvm.r600.read.tidig.x() nounwind readnone
-  ret i32 %x
-y_dim:
-  %y = call i32 @llvm.r600.read.tidig.y() nounwind readnone
-  ret i32 %y
-z_dim:
-  %z = call i32 @llvm.r600.read.tidig.z() nounwind readnone
-  ret i32 %z
-default:
-  ret i32 0
-}
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_local_size.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_local_size.ll
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_local_size.ll	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_local_size.ll	2017-02-12 21:33:49.000000000 +0000
@@ -5,14 +5,14 @@
 define i32 @get_local_size(i32 %dim) nounwind readnone alwaysinline {
   switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim]
 x_dim:
-  %x = call i32 @llvm.r600.read.local.size.x() nounwind readnone
+  %x = call i32 @llvm.r600.read.local.size.x()
   ret i32 %x
 y_dim:
-  %y = call i32 @llvm.r600.read.local.size.y() nounwind readnone
+  %y = call i32 @llvm.r600.read.local.size.y()
   ret i32 %y
 z_dim:
-  %z = call i32 @llvm.r600.read.local.size.z() nounwind readnone
+  %z = call i32 @llvm.r600.read.local.size.z()
   ret i32 %z
 default:
-  ret i32 0
+  ret i32 1
 }
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_num_groups.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_num_groups.ll
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_num_groups.ll	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_num_groups.ll	2017-02-12 21:33:49.000000000 +0000
@@ -14,5 +14,5 @@
   %z = call i32 @llvm.r600.read.ngroups.z() nounwind readnone
   ret i32 %z
 default:
-  ret i32 0
+  ret i32 1
 }
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_work_dim.cl libclc-0.2.0+git20170213/r600/lib/workitem/get_work_dim.cl
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_work_dim.cl	1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_work_dim.cl	2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,13 @@
+#include <clc/clc.h>
+
+/*
+ * Implements get_work_dim(): returns the first uint of the buffer obtained
+ * from __builtin_r600_implicitarg_ptr().
+ */
+_CLC_DEF uint get_work_dim()
+{
+	__attribute__((address_space(7))) uint *implicit_args =
+		(__attribute__((address_space(7))) uint *)
+		__builtin_r600_implicitarg_ptr();
+	return *implicit_args;
+}
diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_work_dim.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_work_dim.ll
--- libclc-0.2.0+git20150813/r600/lib/workitem/get_work_dim.ll	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/r600/lib/workitem/get_work_dim.ll	1970-01-01 00:00:00.000000000 +0000
@@ -1,8 +0,0 @@
-declare i32 @llvm.AMDGPU.read.workdim() nounwind readnone
-
-define i32 @get_work_dim() nounwind readnone alwaysinline {
-  %x = call i32 @llvm.AMDGPU.read.workdim() nounwind readnone , !range !0
-  ret i32 %x
-}
-
-!0 = !{ i32 1, i32 4 }
diff -Nru libclc-0.2.0+git20150813/utils/prepare-builtins.cpp libclc-0.2.0+git20170213/utils/prepare-builtins.cpp
--- libclc-0.2.0+git20150813/utils/prepare-builtins.cpp	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/utils/prepare-builtins.cpp	2017-02-12 21:33:49.000000000 +0000
@@ -1,4 +1,5 @@
-#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
@@ -24,7 +25,7 @@
                cl::value_desc("filename"));
 
 int main(int argc, char **argv) {
-  LLVMContext &Context = getGlobalContext();
+  LLVMContext Context;
   llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
 
   cl::ParseCommandLineOptions(argc, argv, "libclc builtin preparation tool\n");
@@ -35,12 +36,13 @@
   {
     ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
       MemoryBuffer::getFile(InputFilename);
-    std::unique_ptr<MemoryBuffer> &BufferPtr = BufferOrErr.get();
-    if (std::error_code  ec = BufferOrErr.getError())
+    if (std::error_code  ec = BufferOrErr.getError()) {
       ErrorMessage = ec.message();
-    else {
+    } else {
+      std::unique_ptr<MemoryBuffer> &BufferPtr = BufferOrErr.get();
       ErrorOr<std::unique_ptr<Module>> ModuleOrErr =
-          parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context);
+          expectedToErrorOrAndEmitErrors(Context,
+          parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context));
       if (std::error_code ec = ModuleOrErr.getError())
         ErrorMessage = ec.message();
 
@@ -57,6 +59,13 @@
     return 1;
   }
 
+  // Strip the OpenCL version metadata. There are a lot of linked
+  // modules in the library build, each spamming the same
+  // version. This may also report a different version than the user
+  // program is using. This should probably be uniqued when linking.
+  if (NamedMDNode *OCLVersion = M->getNamedMetadata("opencl.ocl.version"))
+      M->eraseNamedMetadata(OCLVersion);
+
   // Set linkage of every external definition to linkonce_odr.
   for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) {
     if (!i->isDeclaration() && i->getLinkage() == GlobalValue::ExternalLinkage)
diff -Nru libclc-0.2.0+git20150813/www/index.html libclc-0.2.0+git20170213/www/index.html
--- libclc-0.2.0+git20150813/www/index.html	2015-08-13 23:43:12.000000000 +0000
+++ libclc-0.2.0+git20170213/www/index.html	2017-02-12 21:33:49.000000000 +0000
@@ -37,8 +37,8 @@
 </p>
 
 <p>
-libclc currently only supports the PTX target, but support for more
-targets is welcome.
+libclc currently supports the AMDGCN, R600, and NVPTX targets, but
+support for more targets is welcome.
 </p>
 
 <h2>Download</h2>
@@ -49,7 +49,7 @@
 
 <h2>Mailing List</h2>
 
-libclc-dev@pcc.me.uk (<a href="http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev">subscribe/unsubscribe</a>, <a href="http://www.pcc.me.uk/pipermail/libclc-dev/">archives</a>)
+libclc-dev@lists.llvm.org (<a href="http://lists.llvm.org/cgi-bin/mailman/listinfo/libclc-dev">subscribe/unsubscribe</a>, <a href="http://lists.llvm.org/pipermail/libclc-dev/">archives</a>)
 
 </body>
 </html>