diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/math/ldexp.cl libclc-0.2.0+git20170213/amdgcn/lib/math/ldexp.cl --- libclc-0.2.0+git20150813/amdgcn/lib/math/ldexp.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/math/ldexp.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include + +#include "../../../generic/lib/clcmacro.h" + +#ifdef __HAS_LDEXPF__ +#define BUILTINF __builtin_amdgcn_ldexpf +#else +#include "math/clc_ldexp.h" +#define BUILTINF __clc_ldexp +#endif + +// This defines all the ldexp(floatN, intN) variants. +_CLC_DEFINE_BINARY_BUILTIN(float, ldexp, BUILTINF, float, int); + +#ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64 : enable + // This defines all the ldexp(doubleN, intN) variants. 
+ _CLC_DEFINE_BINARY_BUILTIN(double, ldexp, __builtin_amdgcn_ldexp, double, int); +#endif + +// This defines all the ldexp(GENTYPE, int); +#define __CLC_BODY <../../../generic/lib/math/ldexp.inc> +#include + +#undef BUILTINF diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/SOURCES libclc-0.2.0+git20170213/amdgcn/lib/SOURCES --- libclc-0.2.0+git20150813/amdgcn/lib/SOURCES 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/SOURCES 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +math/ldexp.cl +synchronization/barrier_impl.ll +workitem/get_global_offset.cl +workitem/get_group_id.cl +workitem/get_global_size.ll +workitem/get_local_id.cl +workitem/get_local_size.ll +workitem/get_num_groups.ll +workitem/get_work_dim.cl diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/synchronization/barrier_impl.ll libclc-0.2.0+git20170213/amdgcn/lib/synchronization/barrier_impl.ll --- libclc-0.2.0+git20150813/amdgcn/lib/synchronization/barrier_impl.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/synchronization/barrier_impl.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,32 @@ +declare i32 @__clc_clk_local_mem_fence() #1 +declare i32 @__clc_clk_global_mem_fence() #1 +declare void @llvm.amdgcn.s.barrier() #0 + +define void @barrier(i32 %flags) #2 { +barrier_local_test: + %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence() + %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE + %1 = icmp ne i32 %0, 0 + br i1 %1, label %barrier_local, label %barrier_global_test + +barrier_local: + call void @llvm.amdgcn.s.barrier() + br label %barrier_global_test + +barrier_global_test: + %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence() + %2 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE + %3 = icmp ne i32 %2, 0 + br i1 %3, label %barrier_global, label %done + +barrier_global: + call void @llvm.amdgcn.s.barrier() + br label %done + +done: + ret void +} + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind alwaysinline } 
+attributes #2 = { nounwind convergent alwaysinline } diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_global_offset.cl libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_global_offset.cl --- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_global_offset.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_global_offset.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +_CLC_DEF size_t get_global_offset(uint dim) +{ + __attribute__((address_space(2))) uint * ptr = + (__attribute__((address_space(2))) uint *) + __builtin_amdgcn_implicitarg_ptr(); + if (dim < 3) + return ptr[dim + 1]; + return 0; +} diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_global_size.ll libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_global_size.ll --- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_global_size.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_global_size.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,21 @@ +declare i32 @llvm.r600.read.global.size.x() nounwind readnone +declare i32 @llvm.r600.read.global.size.y() nounwind readnone +declare i32 @llvm.r600.read.global.size.z() nounwind readnone + +define i64 @get_global_size(i32 %dim) nounwind readnone alwaysinline { + switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim] +x_dim: + %x = call i32 @llvm.r600.read.global.size.x() + %x.ext = zext i32 %x to i64 + ret i64 %x.ext +y_dim: + %y = call i32 @llvm.r600.read.global.size.y() + %y.ext = zext i32 %y to i64 + ret i64 %y.ext +z_dim: + %z = call i32 @llvm.r600.read.global.size.z() + %z.ext = zext i32 %z to i64 + ret i64 %z.ext +default: + ret i64 1 +} diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_group_id.cl libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_group_id.cl --- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_group_id.cl 1970-01-01 00:00:00.000000000 +0000 +++ 
libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_group_id.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +_CLC_DEF size_t get_group_id(uint dim) +{ + switch(dim) { + case 0: return __builtin_amdgcn_workgroup_id_x(); + case 1: return __builtin_amdgcn_workgroup_id_y(); + case 2: return __builtin_amdgcn_workgroup_id_z(); + default: return 0; /* OpenCL: get_group_id() yields 0 for an out-of-range dimension */ + } +} diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_local_id.cl libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_local_id.cl --- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_local_id.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_local_id.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +_CLC_DEF size_t get_local_id(uint dim) +{ + switch(dim) { + case 0: return __builtin_amdgcn_workitem_id_x(); + case 1: return __builtin_amdgcn_workitem_id_y(); + case 2: return __builtin_amdgcn_workitem_id_z(); + default: return 0; /* OpenCL: get_local_id() yields 0 for an out-of-range dimension */ + } +} diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_local_size.ll libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_local_size.ll --- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_local_size.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_local_size.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,21 @@ +declare i32 @llvm.r600.read.local.size.x() nounwind readnone +declare i32 @llvm.r600.read.local.size.y() nounwind readnone +declare i32 @llvm.r600.read.local.size.z() nounwind readnone + +define i64 @get_local_size(i32 %dim) nounwind readnone alwaysinline { + switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim] +x_dim: + %x = call i32 @llvm.r600.read.local.size.x() + %x.ext = zext i32 %x to i64 + ret i64 %x.ext +y_dim: + %y = call i32 @llvm.r600.read.local.size.y() + %y.ext = zext i32 %y to i64 + ret i64 %y.ext +z_dim: + %z = call i32 @llvm.r600.read.local.size.z() + %z.ext = zext i32 %z to i64 + ret i64 %z.ext 
+default: + ret i64 1 +} diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_num_groups.ll libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_num_groups.ll --- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_num_groups.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_num_groups.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,21 @@ +declare i32 @llvm.r600.read.ngroups.x() nounwind readnone +declare i32 @llvm.r600.read.ngroups.y() nounwind readnone +declare i32 @llvm.r600.read.ngroups.z() nounwind readnone + +define i64 @get_num_groups(i32 %dim) nounwind readnone alwaysinline { + switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim] +x_dim: + %x = call i32 @llvm.r600.read.ngroups.x() + %x.ext = zext i32 %x to i64 + ret i64 %x.ext +y_dim: + %y = call i32 @llvm.r600.read.ngroups.y() + %y.ext = zext i32 %y to i64 + ret i64 %y.ext +z_dim: + %z = call i32 @llvm.r600.read.ngroups.z() + %z.ext = zext i32 %z to i64 + ret i64 %z.ext +default: + ret i64 1 +} diff -Nru libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_work_dim.cl libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_work_dim.cl --- libclc-0.2.0+git20150813/amdgcn/lib/workitem/get_work_dim.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn/lib/workitem/get_work_dim.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#include + +_CLC_DEF uint get_work_dim() +{ + __attribute__((address_space(2))) uint * ptr = + (__attribute__((address_space(2))) uint *) + __builtin_amdgcn_implicitarg_ptr(); + return ptr[0]; +} diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/OVERRIDES libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/OVERRIDES --- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/OVERRIDES 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/OVERRIDES 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1 @@ +workitem/get_num_groups.ll diff -Nru 
libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/SOURCES libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/SOURCES --- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/SOURCES 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/SOURCES 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,3 @@ +workitem/get_global_size.ll +workitem/get_local_size.ll +workitem/get_num_groups.cl diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_global_size.ll libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_global_size.ll --- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_global_size.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_global_size.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,39 @@ +declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + +define i64 @get_global_size(i32 %dim) #1 { + %dispatch_ptr = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + switch i32 %dim, label %default [ + i32 0, label %x + i32 1, label %y + i32 2, label %z + ] + +x: + %ptr_x = getelementptr inbounds i8, i8 addrspace(2)* %dispatch_ptr, i64 12 + %ptr_x32 = bitcast i8 addrspace(2)* %ptr_x to i32 addrspace(2)* + %x32 = load i32, i32 addrspace(2)* %ptr_x32, align 4, !invariant.load !0 + %size_x = zext i32 %x32 to i64 + ret i64 %size_x + +y: + %ptr_y = getelementptr inbounds i8, i8 addrspace(2)* %dispatch_ptr, i64 16 + %ptr_y32 = bitcast i8 addrspace(2)* %ptr_y to i32 addrspace(2)* + %y32 = load i32, i32 addrspace(2)* %ptr_y32, align 4, !invariant.load !0 + %size_y = zext i32 %y32 to i64 + ret i64 %size_y + +z: + %ptr_z = getelementptr inbounds i8, i8 addrspace(2)* %dispatch_ptr, i64 20 + %ptr_z32 = bitcast i8 addrspace(2)* %ptr_z to i32 addrspace(2)* + %z32 = load i32, i32 addrspace(2)* %ptr_z32, align 4, !invariant.load !0 + %size_z = zext i32 %z32 to i64 + ret i64 %size_z + +default: + ret i64 1 +} + +attributes #0 = { nounwind readnone } +attributes #1 = { 
alwaysinline norecurse nounwind readonly } + +!0 = !{} diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_local_size.ll libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_local_size.ll --- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_local_size.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_local_size.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,38 @@ +declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + +define i64 @get_local_size(i32 %dim) #1 { + %dispatch_ptr = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() + %dispatch_ptr_i32 = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %xy_size_ptr = getelementptr inbounds i32, i32 addrspace(2)* %dispatch_ptr_i32, i64 1 + %xy_size = load i32, i32 addrspace(2)* %xy_size_ptr, align 4, !invariant.load !0 + switch i32 %dim, label %default [ + i32 0, label %x_dim + i32 1, label %y_dim + i32 2, label %z_dim + ] + +x_dim: + %x_size = and i32 %xy_size, 65535 + %x_size.ext = zext i32 %x_size to i64 + ret i64 %x_size.ext + +y_dim: + %y_size = lshr i32 %xy_size, 16 + %y_size.ext = zext i32 %y_size to i64 + ret i64 %y_size.ext + +z_dim: + %z_size_ptr = getelementptr inbounds i32, i32 addrspace(2)* %dispatch_ptr_i32, i64 2 + %z_size = load i32, i32 addrspace(2)* %z_size_ptr, align 4, !invariant.load !0, !range !1 + %z_size.ext = zext i32 %z_size to i64 + ret i64 %z_size.ext + +default: + ret i64 1 +} + +attributes #0 = { nounwind readnone } +attributes #1 = { alwaysinline norecurse nounwind readonly } + +!0 = !{} +!1 = !{ i32 0, i32 257 } diff -Nru libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_num_groups.cl libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_num_groups.cl --- libclc-0.2.0+git20150813/amdgcn-amdhsa/lib/workitem/get_num_groups.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgcn-amdhsa/lib/workitem/get_num_groups.cl 2017-02-12 
21:33:49.000000000 +0000 @@ -0,0 +1,12 @@ + +#include + +_CLC_DEF size_t get_num_groups(uint dim) { + size_t global_size = get_global_size(dim); + size_t local_size = get_local_size(dim); + size_t num_groups = global_size / local_size; + if (global_size % local_size != 0) { + num_groups++; + } + return num_groups; +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/atomic/atomic.cl libclc-0.2.0+git20170213/amdgpu/lib/atomic/atomic.cl --- libclc-0.2.0+git20150813/amdgpu/lib/atomic/atomic.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/atomic/atomic.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,65 @@ +#include + +#define ATOMIC_FUNC_DEFINE(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ +_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE val) { \ + return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)val); \ +} + +/* For atomic functions that don't need different bitcode depending on argument signedness */ +#define ATOMIC_FUNC_SIGN(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ + _CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE); \ + ATOMIC_FUNC_DEFINE(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ + ATOMIC_FUNC_DEFINE(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) + +#define ATOMIC_FUNC_ADDRSPACE(TYPE, FUNCTION) \ + ATOMIC_FUNC_SIGN(TYPE, FUNCTION, global, 1) \ + ATOMIC_FUNC_SIGN(TYPE, FUNCTION, local, 3) + +#define ATOMIC_FUNC(FUNCTION) \ + ATOMIC_FUNC_ADDRSPACE(int, FUNCTION) + +#define ATOMIC_FUNC_DEFINE_3_ARG(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ +_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE cmp, RET_SIGN TYPE val) { \ + return (RET_SIGN 
TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)cmp, (ARG_SIGN TYPE)val); \ +} + +/* For atomic functions that don't need different bitcode depending on argument signedness */ +#define ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ + _CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE, signed TYPE); \ + ATOMIC_FUNC_DEFINE_3_ARG(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ + ATOMIC_FUNC_DEFINE_3_ARG(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) + +#define ATOMIC_FUNC_ADDRSPACE_3_ARG(TYPE, FUNCTION) \ + ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, global, 1) \ + ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, local, 3) + +#define ATOMIC_FUNC_3_ARG(FUNCTION) \ + ATOMIC_FUNC_ADDRSPACE_3_ARG(int, FUNCTION) + +ATOMIC_FUNC(atomic_add) +ATOMIC_FUNC(atomic_and) +ATOMIC_FUNC(atomic_or) +ATOMIC_FUNC(atomic_sub) +ATOMIC_FUNC(atomic_xchg) +ATOMIC_FUNC(atomic_xor) +ATOMIC_FUNC_3_ARG(atomic_cmpxchg) + +_CLC_DECL signed int __clc_atomic_max_addr1(volatile global signed int*, signed int); +_CLC_DECL signed int __clc_atomic_max_addr3(volatile local signed int*, signed int); +_CLC_DECL uint __clc_atomic_umax_addr1(volatile global uint*, uint); +_CLC_DECL uint __clc_atomic_umax_addr3(volatile local uint*, uint); + +ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, global, 1) +ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, local, 3) +ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, global, 1) +ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, local, 3) + +_CLC_DECL signed int __clc_atomic_min_addr1(volatile global signed int*, signed int); +_CLC_DECL signed int __clc_atomic_min_addr3(volatile local signed int*, signed int); +_CLC_DECL uint __clc_atomic_umin_addr1(volatile global uint*, uint); +_CLC_DECL uint 
__clc_atomic_umin_addr3(volatile local uint*, uint); + +ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, global, 1) +ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, local, 3) +ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, global, 1) +ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, local, 3) diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_attributes_impl.ll libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_attributes_impl.ll --- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_attributes_impl.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_attributes_impl.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,87 @@ +%opencl.image2d_t = type opaque +%opencl.image3d_t = type opaque + +declare i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)*) nounwind readnone +declare i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)*) nounwind readnone + +declare [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)*) nounwind readnone +declare [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)*) nounwind readnone + +declare [2 x i32] @llvm.OpenCL.image.get.format.2d( + %opencl.image2d_t addrspace(1)*) nounwind readnone +declare [2 x i32] @llvm.OpenCL.image.get.format.3d( + %opencl.image3d_t addrspace(1)*) nounwind readnone + +define i32 @__clc_get_image_width_2d( + %opencl.image2d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)* %img) + %2 = extractvalue [3 x i32] %1, 0 + ret i32 %2 +} +define i32 @__clc_get_image_width_3d( + %opencl.image3d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %img) + %2 = extractvalue [3 x i32] %1, 0 + ret i32 %2 +} + +define i32 @__clc_get_image_height_2d( + 
%opencl.image2d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [3 x i32] @llvm.OpenCL.image.get.size.2d( + %opencl.image2d_t addrspace(1)* %img) + %2 = extractvalue [3 x i32] %1, 1 + ret i32 %2 +} +define i32 @__clc_get_image_height_3d( + %opencl.image3d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %img) + %2 = extractvalue [3 x i32] %1, 1 + ret i32 %2 +} + +define i32 @__clc_get_image_depth_3d( + %opencl.image3d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [3 x i32] @llvm.OpenCL.image.get.size.3d( + %opencl.image3d_t addrspace(1)* %img) + %2 = extractvalue [3 x i32] %1, 2 + ret i32 %2 +} + +define i32 @__clc_get_image_channel_data_type_2d( + %opencl.image2d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [2 x i32] @llvm.OpenCL.image.get.format.2d( + %opencl.image2d_t addrspace(1)* %img) + %2 = extractvalue [2 x i32] %1, 0 + ret i32 %2 +} +define i32 @__clc_get_image_channel_data_type_3d( + %opencl.image3d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [2 x i32] @llvm.OpenCL.image.get.format.3d( + %opencl.image3d_t addrspace(1)* %img) + %2 = extractvalue [2 x i32] %1, 0 + ret i32 %2 +} + +define i32 @__clc_get_image_channel_order_2d( + %opencl.image2d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [2 x i32] @llvm.OpenCL.image.get.format.2d( + %opencl.image2d_t addrspace(1)* %img) + %2 = extractvalue [2 x i32] %1, 1 + ret i32 %2 +} +define i32 @__clc_get_image_channel_order_3d( + %opencl.image3d_t addrspace(1)* nocapture %img) #0 { + %1 = tail call [2 x i32] @llvm.OpenCL.image.get.format.3d( + %opencl.image3d_t addrspace(1)* %img) + %2 = extractvalue [2 x i32] %1, 1 + ret i32 %2 +} + +attributes #0 = { nounwind readnone alwaysinline } diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_channel_data_type.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_channel_data_type.cl --- 
libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_channel_data_type.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_channel_data_type.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,13 @@ +#include + +_CLC_DECL int __clc_get_image_channel_data_type_2d(image2d_t); +_CLC_DECL int __clc_get_image_channel_data_type_3d(image3d_t); + +_CLC_OVERLOAD _CLC_DEF int +get_image_channel_data_type(image2d_t image) { + return __clc_get_image_channel_data_type_2d(image); +} +_CLC_OVERLOAD _CLC_DEF int +get_image_channel_data_type(image3d_t image) { + return __clc_get_image_channel_data_type_3d(image); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_channel_order.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_channel_order.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_channel_order.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_channel_order.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,13 @@ +#include + +_CLC_DECL int __clc_get_image_channel_order_2d(image2d_t); +_CLC_DECL int __clc_get_image_channel_order_3d(image3d_t); + +_CLC_OVERLOAD _CLC_DEF int +get_image_channel_order(image2d_t image) { + return __clc_get_image_channel_order_2d(image); +} +_CLC_OVERLOAD _CLC_DEF int +get_image_channel_order(image3d_t image) { + return __clc_get_image_channel_order_3d(image); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_depth.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_depth.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_depth.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_depth.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,8 @@ +#include + +_CLC_DECL int __clc_get_image_depth_3d(image3d_t); + +_CLC_OVERLOAD _CLC_DEF int +get_image_depth(image3d_t image) { + return __clc_get_image_depth_3d(image); +} diff -Nru 
libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_height.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_height.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_height.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_height.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,13 @@ +#include + +_CLC_DECL int __clc_get_image_height_2d(image2d_t); +_CLC_DECL int __clc_get_image_height_3d(image3d_t); + +_CLC_OVERLOAD _CLC_DEF int +get_image_height(image2d_t image) { + return __clc_get_image_height_2d(image); +} +_CLC_OVERLOAD _CLC_DEF int +get_image_height(image3d_t image) { + return __clc_get_image_height_3d(image); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_width.cl libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_width.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/get_image_width.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/get_image_width.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,13 @@ +#include + +_CLC_DECL int __clc_get_image_width_2d(image2d_t); +_CLC_DECL int __clc_get_image_width_3d(image3d_t); + +_CLC_OVERLOAD _CLC_DEF int +get_image_width(image2d_t image) { + return __clc_get_image_width_2d(image); +} +_CLC_OVERLOAD _CLC_DEF int +get_image_width(image3d_t image) { + return __clc_get_image_width_3d(image); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/read_imagef.cl libclc-0.2.0+git20170213/amdgpu/lib/image/read_imagef.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/read_imagef.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/read_imagef.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,14 @@ +#include + +_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2); + +_CLC_OVERLOAD _CLC_DEF float4 read_imagef(image2d_t image, sampler_t sampler, + int2 coord) { + float2 coord_float = (float2)(coord.x, coord.y); + return __clc_read_imagef_tex(image, sampler, 
coord_float); +} + +_CLC_OVERLOAD _CLC_DEF float4 read_imagef(image2d_t image, sampler_t sampler, + float2 coord) { + return __clc_read_imagef_tex(image, sampler, coord); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/read_imagei.cl libclc-0.2.0+git20170213/amdgpu/lib/image/read_imagei.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/read_imagei.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/read_imagei.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,23 @@ +#include + +_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2); + +int4 __clc_reinterpret_v4f_to_v4i(float4 v) { + union { + int4 v4i; + float4 v4f; + } res = { .v4f = v}; + return res.v4i; +} + +_CLC_OVERLOAD _CLC_DEF int4 read_imagei(image2d_t image, sampler_t sampler, + int2 coord) { + float2 coord_float = (float2)(coord.x, coord.y); + return __clc_reinterpret_v4f_to_v4i( + __clc_read_imagef_tex(image, sampler, coord_float)); +} +_CLC_OVERLOAD _CLC_DEF int4 read_imagei(image2d_t image, sampler_t sampler, + float2 coord) { + return __clc_reinterpret_v4f_to_v4i( + __clc_read_imagef_tex(image, sampler, coord)); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/read_image_impl.ll libclc-0.2.0+git20170213/amdgpu/lib/image/read_image_impl.ll --- libclc-0.2.0+git20150813/amdgpu/lib/image/read_image_impl.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/read_image_impl.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,46 @@ +%opencl.image2d_t = type opaque + +declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, + i32, i32, i32) readnone +declare i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)*) nounwind readnone +declare i32 @llvm.OpenCL.sampler.get.resource.id(i32) readnone + +define <4 x float> @__clc_v4f_from_v2f(<2 x float> %v) alwaysinline { + %e0 = extractelement <2 x float> %v, i32 0 + %e1 = extractelement <2 x float> %v, i32 1 + %res.0 = insertelement 
<4 x float> undef, float %e0, i32 0 + %res.1 = insertelement <4 x float> %res.0, float %e1, i32 1 + %res.2 = insertelement <4 x float> %res.1, float 0.0, i32 2 + %res.3 = insertelement <4 x float> %res.2, float 0.0, i32 3 + ret <4 x float> %res.3 +} + +define <4 x float> @__clc_read_imagef_tex( + %opencl.image2d_t addrspace(1)* nocapture %img, + i32 %sampler, <2 x float> %coord) alwaysinline { +entry: + %coord_v4 = call <4 x float> @__clc_v4f_from_v2f(<2 x float> %coord) + %smp_id = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %sampler) + %img_id = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %img) + %tex_id = add i32 %img_id, 2 ; First 2 IDs are reserved. + + %coord_norm = and i32 %sampler, 1 + %is_norm = icmp eq i32 %coord_norm, 1 + br i1 %is_norm, label %NormCoord, label %UnnormCoord +NormCoord: + %data.norm = call <4 x float> @llvm.R600.tex( + <4 x float> %coord_v4, + i32 0, i32 0, i32 0, ; Offset. + i32 2, i32 %smp_id, + i32 1, i32 1, i32 1, i32 1) ; Normalized coords. + ret <4 x float> %data.norm +UnnormCoord: + %data.unnorm = call <4 x float> @llvm.R600.tex( + <4 x float> %coord_v4, + i32 0, i32 0, i32 0, ; Offset. + i32 %tex_id, i32 %smp_id, + i32 0, i32 0, i32 0, i32 0) ; Unnormalized coords. 
+ ret <4 x float> %data.unnorm +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/read_imageui.cl libclc-0.2.0+git20170213/amdgpu/lib/image/read_imageui.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/read_imageui.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/read_imageui.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,23 @@ +#include + +_CLC_DECL float4 __clc_read_imagef_tex(image2d_t, sampler_t, float2); + +uint4 __clc_reinterpret_v4f_to_v4ui(float4 v) { + union { + uint4 v4ui; + float4 v4f; + } res = { .v4f = v}; + return res.v4ui; +} + +_CLC_OVERLOAD _CLC_DEF uint4 read_imageui(image2d_t image, sampler_t sampler, + int2 coord) { + float2 coord_float = (float2)(coord.x, coord.y); + return __clc_reinterpret_v4f_to_v4ui( + __clc_read_imagef_tex(image, sampler, coord_float)); +} +_CLC_OVERLOAD _CLC_DEF uint4 read_imageui(image2d_t image, sampler_t sampler, + float2 coord) { + return __clc_reinterpret_v4f_to_v4ui( + __clc_read_imagef_tex(image, sampler, coord)); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/write_imagef.cl libclc-0.2.0+git20170213/amdgpu/lib/image/write_imagef.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/write_imagef.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/write_imagef.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#include + +_CLC_DECL void __clc_write_imagef_2d(image2d_t image, int2 coord, float4 color); + +_CLC_OVERLOAD _CLC_DEF void +write_imagef(image2d_t image, int2 coord, float4 color) +{ + __clc_write_imagef_2d(image, coord, color); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/write_imagei.cl libclc-0.2.0+git20170213/amdgpu/lib/image/write_imagei.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/write_imagei.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/write_imagei.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#include + +_CLC_DECL void __clc_write_imagei_2d(image2d_t 
image, int2 coord, int4 color); + +_CLC_OVERLOAD _CLC_DEF void +write_imagei(image2d_t image, int2 coord, int4 color) +{ + __clc_write_imagei_2d(image, coord, color); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/write_image_impl.ll libclc-0.2.0+git20170213/amdgpu/lib/image/write_image_impl.ll --- libclc-0.2.0+git20150813/amdgpu/lib/image/write_image_impl.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/write_image_impl.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,52 @@ +%opencl.image2d_t = type opaque +%opencl.image3d_t = type opaque + +declare i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)*) nounwind readnone +declare i32 @llvm.OpenCL.image.get.resource.id.3d( + %opencl.image3d_t addrspace(1)*) nounwind readnone + +declare void @llvm.r600.rat.store.typed(<4 x i32> %color, <4 x i32> %coord, i32 %rat_id) + +define void @__clc_write_imageui_2d( + %opencl.image2d_t addrspace(1)* nocapture %img, + <2 x i32> %coord, <4 x i32> %color) #0 { + + ; Coordinate int2 -> int4. + %e0 = extractelement <2 x i32> %coord, i32 0 + %e1 = extractelement <2 x i32> %coord, i32 1 + %coord.0 = insertelement <4 x i32> undef, i32 %e0, i32 0 + %coord.1 = insertelement <4 x i32> %coord.0, i32 %e1, i32 1 + %coord.2 = insertelement <4 x i32> %coord.1, i32 0, i32 2 + %coord.3 = insertelement <4 x i32> %coord.2, i32 0, i32 3 + + ; Get RAT ID. + %img_id = call i32 @llvm.OpenCL.image.get.resource.id.2d( + %opencl.image2d_t addrspace(1)* %img) + %rat_id = add i32 %img_id, 1 + + ; Call store intrinsic. 
+ call void @llvm.r600.rat.store.typed(<4 x i32> %color, <4 x i32> %coord.3, i32 %rat_id) + ret void +} + +define void @__clc_write_imagei_2d( + %opencl.image2d_t addrspace(1)* nocapture %img, + <2 x i32> %coord, <4 x i32> %color) #0 { + call void @__clc_write_imageui_2d( + %opencl.image2d_t addrspace(1)* nocapture %img, + <2 x i32> %coord, <4 x i32> %color) + ret void +} + +define void @__clc_write_imagef_2d( + %opencl.image2d_t addrspace(1)* nocapture %img, + <2 x i32> %coord, <4 x float> %color) #0 { + %color.i32 = bitcast <4 x float> %color to <4 x i32> + call void @__clc_write_imageui_2d( + %opencl.image2d_t addrspace(1)* nocapture %img, + <2 x i32> %coord, <4 x i32> %color.i32) + ret void +} + +attributes #0 = { alwaysinline } diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/image/write_imageui.cl libclc-0.2.0+git20170213/amdgpu/lib/image/write_imageui.cl --- libclc-0.2.0+git20150813/amdgpu/lib/image/write_imageui.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/image/write_imageui.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#include + +_CLC_DECL void __clc_write_imageui_2d(image2d_t image, int2 coord, uint4 color); + +_CLC_OVERLOAD _CLC_DEF void +write_imageui(image2d_t image, int2 coord, uint4 color) +{ + __clc_write_imageui_2d(image, coord, color); +} diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/math/nextafter.cl libclc-0.2.0+git20170213/amdgpu/lib/math/nextafter.cl --- libclc-0.2.0+git20150813/amdgpu/lib/math/nextafter.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/math/nextafter.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#include +#include "../lib/clcmacro.h" + +_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +_CLC_DEFINE_BINARY_BUILTIN(double, nextafter, __clc_nextafter, double, double) +#endif diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/math/sqrt.cl 
libclc-0.2.0+git20170213/amdgpu/lib/math/sqrt.cl --- libclc-0.2.0+git20150813/amdgpu/lib/math/sqrt.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/math/sqrt.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include +#include "../../../generic/lib/clcmacro.h" +#include "math/clc_sqrt.h" + +_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float) + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#ifdef __AMDGCN__ + #define __clc_builtin_rsq __builtin_amdgcn_rsq +#else + #define __clc_builtin_rsq __builtin_r600_recipsqrt_ieee +#endif + +_CLC_OVERLOAD _CLC_DEF double sqrt(double x) { + + uint vcc = x < 0x1p-767; + uint exp0 = vcc ? 0x100 : 0; + unsigned exp1 = vcc ? 
0xffffff80 : 0; + + double v01 = ldexp(x, exp0); + double v23 = __clc_builtin_rsq(v01); + double v45 = v01 * v23; + v23 = v23 * 0.5; + + double v67 = fma(-v23, v45, 0.5); + v45 = fma(v45, v67, v45); + double v89 = fma(-v45, v45, v01); + v23 = fma(v23, v67, v23); + v45 = fma(v89, v23, v45); + v67 = fma(-v45, v45, v01); + v23 = fma(v67, v23, v45); + + v23 = ldexp(v23, exp1); + return ((x == __builtin_inf()) || (x == 0.0)) ? v01 : v23; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double); + +#endif diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/OVERRIDES libclc-0.2.0+git20170213/amdgpu/lib/OVERRIDES --- libclc-0.2.0+git20150813/amdgpu/lib/OVERRIDES 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/OVERRIDES 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,2 @@ +workitem/get_group_id.cl +workitem/get_global_size.cl diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/SOURCES libclc-0.2.0+git20170213/amdgpu/lib/SOURCES --- libclc-0.2.0+git20150813/amdgpu/lib/SOURCES 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/SOURCES 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,18 @@ +atomic/atomic.cl +math/nextafter.cl +math/sqrt.cl +synchronization/barrier.cl +image/get_image_width.cl +image/get_image_height.cl +image/get_image_depth.cl +image/get_image_channel_data_type.cl +image/get_image_channel_order.cl +image/get_image_attributes_impl.ll +image/read_imagef.cl +image/read_imagei.cl +image/read_imageui.cl +image/read_image_impl.ll +image/write_imagef.cl +image/write_imagei.cl +image/write_imageui.cl +image/write_image_impl.ll diff -Nru libclc-0.2.0+git20150813/amdgpu/lib/synchronization/barrier.cl libclc-0.2.0+git20170213/amdgpu/lib/synchronization/barrier.cl --- libclc-0.2.0+git20150813/amdgpu/lib/synchronization/barrier.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/amdgpu/lib/synchronization/barrier.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,10 @@ + +#include + +_CLC_DEF int 
__clc_clk_local_mem_fence() { + return CLK_LOCAL_MEM_FENCE; +} + +_CLC_DEF int __clc_clk_global_mem_fence() { + return CLK_GLOBAL_MEM_FENCE; +} diff -Nru libclc-0.2.0+git20150813/configure.py libclc-0.2.0+git20170213/configure.py --- libclc-0.2.0+git20150813/configure.py 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/configure.py 2017-02-12 21:33:49.000000000 +0000 @@ -69,8 +69,8 @@ llvm_int_version = int(llvm_version[0]) * 100 + int(llvm_version[1]) * 10 llvm_string_version = 'LLVM' + llvm_version[0] + '.' + llvm_version[1] -if llvm_int_version < 370: - print "libclc requires LLVM >= 3.7" +if llvm_int_version < 400: + print "libclc requires LLVM >= 4.0" sys.exit(1) llvm_system_libs = llvm_config(['--system-libs']) @@ -92,18 +92,22 @@ available_targets = { 'r600--' : { 'devices' : [{'gpu' : 'cedar', 'aliases' : ['palm', 'sumo', 'sumo2', 'redwood', 'juniper']}, - {'gpu' : 'cypress', 'aliases' : ['hemlock']}, - {'gpu' : 'barts', 'aliases' : ['turks', 'caicos']}, - {'gpu' : 'cayman', 'aliases' : ['aruba']}]}, + {'gpu' : 'cypress', 'aliases' : ['hemlock'] }, + {'gpu' : 'barts', 'aliases' : ['turks', 'caicos'] }, + {'gpu' : 'cayman', 'aliases' : ['aruba']} ]}, 'amdgcn--': { 'devices' : - [{'gpu' : 'tahiti', 'aliases' : ['pitcairn', 'verde', 'oland', 'hainan', 'bonaire', 'kabini', 'kaveri', 'hawaii','mullins']}]}, - 'nvptx--' : { 'devices' : [{'gpu' : '', 'aliases' : []}]}, - 'nvptx64--' : { 'devices' : [{'gpu' : '', 'aliases' : []}] }, - 'nvptx--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []}] }, - 'nvptx64--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []}] } + [{'gpu' : 'tahiti', 'aliases' : ['pitcairn', 'verde', 'oland', 'hainan', 'bonaire', 'kabini', 'kaveri', 'hawaii','mullins','tonga','carrizo','iceland','fiji','stoney','polaris10','polaris11']} ]}, + 'amdgcn--amdhsa': { 'devices' : + [{'gpu' : '', 'aliases' : ['bonaire', 'hawaii', 'kabini', 'kaveri', 'mullins', 'carrizo', 'stoney', 'fiji', 'iceland', 
'tonga','polaris10','polaris11']} ]}, + 'nvptx--' : { 'devices' : [{'gpu' : '', 'aliases' : []} ]}, + 'nvptx64--' : { 'devices' : [{'gpu' : '', 'aliases' : []} ]}, + 'nvptx--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []} ]}, + 'nvptx64--nvidiacl' : { 'devices' : [{'gpu' : '', 'aliases' : []} ]}, } -default_targets = ['nvptx--nvidiacl', 'nvptx64--nvidiacl', 'r600--', 'amdgcn--'] +available_targets['amdgcn-mesa-mesa3d'] = available_targets['amdgcn--'] + +default_targets = ['nvptx--nvidiacl', 'nvptx64--nvidiacl', 'r600--', 'amdgcn--', 'amdgcn--amdhsa', 'amdgcn-mesa-mesa3d'] targets = args if not targets: @@ -165,9 +169,11 @@ for arch in archs: subdirs.append("%s-%s-%s" % (arch, t_vendor, t_os)) subdirs.append("%s-%s" % (arch, t_os)) + if t_os == 'mesa3d': + subdirs.append('amdgcn-amdhsa') subdirs.append(arch) - if arch == 'amdgcn': - subdirs.append('r600') + if arch == 'amdgcn' or arch == 'r600': + subdirs.append('amdgpu') incdirs = filter(os.path.isdir, [os.path.join(srcdir, subdir, 'include') for subdir in subdirs]) @@ -180,9 +186,6 @@ # The rule for building a .bc file for the specified architecture using clang. clang_bc_flags = "-target %s -I`dirname $in` %s " \ "-fno-builtin " \ - "-Dcl_clang_storage_class_specifiers " \ - "-Dcl_khr_fp64 " \ - "-Dcles_khr_int64 " \ "-D__CLC_INTERNAL " \ "-emit-llvm" % (target, clang_cl_includes) if device['gpu'] != '': diff -Nru libclc-0.2.0+git20150813/debian/changelog libclc-0.2.0+git20170213/debian/changelog --- libclc-0.2.0+git20150813/debian/changelog 2015-09-27 19:55:56.000000000 +0000 +++ libclc-0.2.0+git20170213/debian/changelog 2017-06-08 11:11:13.000000000 +0000 @@ -1,3 +1,55 @@ +libclc (0.2.0+git20170213-1~16.04.1) xenial; urgency=medium + + * Backport to xenial. (LP: #1687981) + * Don't use debhelper 10. + + -- Timo Aaltonen Fri, 24 Mar 2017 10:11:06 +0200 + +libclc (0.2.0+git20170213-1) experimental; urgency=medium + + [ Andreas Boll ] + * Simplify clang version updates even more. 
+ + [ Timo Aaltonen ] + * New upstream snapshot. + * clang: Bump clang version to 4.0. + + -- Timo Aaltonen Mon, 13 Feb 2017 15:08:23 +0200 + +libclc (0.2.0+git20160907-3) unstable; urgency=medium + + * Simplify clang version updates. + * Drop de-duplication of files that aren't duplicate any more. + + -- Michael Gilbert Sat, 26 Nov 2016 03:35:48 +0000 + +libclc (0.2.0+git20160907-2) unstable; urgency=medium + + [ Andreas Boll ] + * Declare Multi-Arch: foreign for all packages (closes: #845314). + + [ Michael Gilbert ] + * Update to debhelper 10. + + -- Michael Gilbert Sat, 26 Nov 2016 02:35:37 +0000 + +libclc (0.2.0+git20160907-1) experimental; urgency=medium + + * New upstream snapshot (closes: #836960). + * Build with clang 3.9. + * Drop devices.patch, upstream. + * Use https for Vcs-Git field. + + -- Timo Aaltonen Fri, 16 Sep 2016 09:20:06 +0300 + +libclc (0.2.0+git20150813-3) unstable; urgency=medium + + * Bump standards version. + * Build with clang 3.8 (closes: #832014). + * Add support for additional GPU devices (closes: #823677). + + -- Michael Gilbert Sat, 30 Jul 2016 22:47:05 +0000 + libclc (0.2.0+git20150813-2) unstable; urgency=medium * Enable build hardening flags. 
diff -Nru libclc-0.2.0+git20150813/debian/clang libclc-0.2.0+git20170213/debian/clang --- libclc-0.2.0+git20150813/debian/clang 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/debian/clang 2017-06-08 10:49:21.000000000 +0000 @@ -0,0 +1 @@ +4.0 diff -Nru libclc-0.2.0+git20150813/debian/clean libclc-0.2.0+git20170213/debian/clean --- libclc-0.2.0+git20150813/debian/clean 2015-09-27 18:36:38.000000000 +0000 +++ libclc-0.2.0+git20170213/debian/clean 2017-02-13 13:00:28.000000000 +0000 @@ -1,3 +1,4 @@ Makefile libclc.pc build/*.pyc +utils/prepare-builtins.dwo diff -Nru libclc-0.2.0+git20150813/debian/control libclc-0.2.0+git20170213/debian/control --- libclc-0.2.0+git20150813/debian/control 2015-09-27 18:38:18.000000000 +0000 +++ libclc-0.2.0+git20170213/debian/control 2017-06-08 11:11:33.000000000 +0000 @@ -7,21 +7,22 @@ Build-Depends: debhelper (>= 9), python, - clang-3.7, - llvm-3.7-dev (>= 3.7), + clang-4.0, + llvm-4.0-dev, zlib1g-dev, libedit-dev, -Standards-Version: 3.9.6 +Standards-Version: 3.9.8 Homepage: http://libclc.llvm.org -Vcs-Git: git://anonscm.debian.org/pkg-opencl/libclc.git +Vcs-Git: https://anonscm.debian.org/git/pkg-opencl/libclc.git Vcs-Browser: https://anonscm.debian.org/cgit/pkg-opencl/libclc.git Package: libclc-ptx Architecture: all +Multi-Arch: foreign Depends: ${misc:Depends}, libclc-dev (= ${binary:Version}), - libclang-common-3.7-dev, + libclang-common-4.0-dev, Description: OpenCL C language implementation - ptx support libclc is an open implementation of the OpenCL C programming language, as specified by the OpenCL 1.1 Specification. @@ -30,10 +31,11 @@ Package: libclc-amdgcn Architecture: all +Multi-Arch: foreign Depends: ${misc:Depends}, libclc-dev (= ${binary:Version}), - libclang-common-3.7-dev, + libclang-common-4.0-dev, Description: OpenCL C language implementation - amdgcn support libclc is an open implementation of the OpenCL C programming language, as specified by the OpenCL 1.1 Specification. 
@@ -43,10 +45,11 @@ Package: libclc-r600 Architecture: all +Multi-Arch: foreign Depends: ${misc:Depends}, libclc-dev (= ${binary:Version}), - libclang-common-3.7-dev, + libclang-common-4.0-dev, Description: OpenCL C language implementation - r600 support libclc is an open implementation of the OpenCL C programming language, as specified by the OpenCL 1.1 Specification. @@ -57,6 +60,7 @@ Package: libclc-dev Section: libdevel Architecture: all +Multi-Arch: foreign Depends: ${misc:Depends}, Description: OpenCL C language implementation - development files diff -Nru libclc-0.2.0+git20150813/debian/control.in libclc-0.2.0+git20170213/debian/control.in --- libclc-0.2.0+git20150813/debian/control.in 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/debian/control.in 2017-06-08 10:49:21.000000000 +0000 @@ -0,0 +1,70 @@ +Source: libclc +Section: libs +Priority: extra +Maintainer: Debian OpenCL team +Uploaders: + Michael Gilbert , +Build-Depends: + debhelper (>= 9), + python, + clang-CLANG_VERSION, + llvm-CLANG_VERSION-dev, + zlib1g-dev, + libedit-dev, +Standards-Version: 3.9.8 +Homepage: http://libclc.llvm.org +Vcs-Git: https://anonscm.debian.org/git/pkg-opencl/libclc.git +Vcs-Browser: https://anonscm.debian.org/cgit/pkg-opencl/libclc.git + +Package: libclc-ptx +Architecture: all +Multi-Arch: foreign +Depends: + ${misc:Depends}, + libclc-dev (= ${binary:Version}), + libclang-common-CLANG_VERSION-dev, +Description: OpenCL C language implementation - ptx support + libclc is an open implementation of the OpenCL C programming language, + as specified by the OpenCL 1.1 Specification. + . + This package contains support for the PTX platform. 
+ +Package: libclc-amdgcn +Architecture: all +Multi-Arch: foreign +Depends: + ${misc:Depends}, + libclc-dev (= ${binary:Version}), + libclang-common-CLANG_VERSION-dev, +Description: OpenCL C language implementation - amdgcn support + libclc is an open implementation of the OpenCL C programming language, + as specified by the OpenCL 1.1 Specification. + . + This package contains support for the amdgcn (AMD GPU) platform. + Supported GPU families: Southern Islands and newer. + +Package: libclc-r600 +Architecture: all +Multi-Arch: foreign +Depends: + ${misc:Depends}, + libclc-dev (= ${binary:Version}), + libclang-common-CLANG_VERSION-dev, +Description: OpenCL C language implementation - r600 support + libclc is an open implementation of the OpenCL C programming language, + as specified by the OpenCL 1.1 Specification. + . + This package contains support for the r600 (AMD GPU) platform. + Supported GPU families: Evergreen and Northern Islands. + +Package: libclc-dev +Section: libdevel +Architecture: all +Multi-Arch: foreign +Depends: + ${misc:Depends}, +Description: OpenCL C language implementation - development files + libclc is an open implementation of the OpenCL C programming language, + as specified by the OpenCL 1.1 Specification. + . + This package contains development header files. 
diff -Nru libclc-0.2.0+git20150813/debian/copyright libclc-0.2.0+git20170213/debian/copyright --- libclc-0.2.0+git20150813/debian/copyright 2015-09-27 18:36:38.000000000 +0000 +++ libclc-0.2.0+git20170213/debian/copyright 2017-02-13 13:00:28.000000000 +0000 @@ -9,7 +9,7 @@ Files: debian/* Copyright: - 2013-2015 Michael Gilbert + 2013-2016 Michael Gilbert 2013-2014 Julian Wollrath License: NCSA or MIT diff -Nru libclc-0.2.0+git20150813/debian/README.source libclc-0.2.0+git20170213/debian/README.source --- libclc-0.2.0+git20150813/debian/README.source 2015-09-27 18:36:38.000000000 +0000 +++ libclc-0.2.0+git20170213/debian/README.source 2017-02-13 13:00:28.000000000 +0000 @@ -1,2 +1,12 @@ +Watch File +========== There is no watch file in debian/ because upstream uses git but has made no tags, so there is currently no way to watch upstream changes. + +Clang Updates +============= +To build with a different version of clang, just alter the clang version +number contained in the debian/clang file, then update the control file +with: + +$ ./debian/rules debian/control diff -Nru libclc-0.2.0+git20150813/debian/rules libclc-0.2.0+git20170213/debian/rules --- libclc-0.2.0+git20150813/debian/rules 2015-09-27 20:01:01.000000000 +0000 +++ libclc-0.2.0+git20170213/debian/rules 2017-06-08 10:49:21.000000000 +0000 @@ -6,26 +6,13 @@ export DEB_BUILD_MAINT_OPTIONS=hardening=+all -confflags=--prefix=/usr \ - --with-llvm-config=/usr/bin/llvm-config-3.7 \ +LLVM_CONFIG=/usr/bin/llvm-config-$(shell cat debian/clang) -path=debian/tmp/usr/lib/clc +debian/control: debian/control.in + sed "s/CLANG_VERSION/$(shell cat debian/clang)/g" < $< > $@ -%: +%: debian/control dh $@ --parallel override_dh_auto_configure: - ./configure.py $(confflags) - -override_dh_install: - test $(shell sha512sum $(path)/cypress-r600--.bc) != $(shell sha512sum $(path)/cayman-r600--.bc) || \ - rm -f $(path)/cypress-r600--.bc && \ - ln -s cayman-r600--.bc $(path)/cypress-r600--.bc - test $(shell sha512sum 
$(path)/cedar-r600--.bc) != $(shell sha512sum $(path)/barts-r600--.bc) || \ - rm -f $(path)/cedar-r600--.bc && \ - ln -s barts-r600--.bc $(path)/cedar-r600--.bc - dh_install - -override_dh_clean: - dh_clean - find -name '*.d' -execdir rm -f {} \; + ./configure.py --prefix=/usr --with-llvm-config=$(LLVM_CONFIG) diff -Nru libclc-0.2.0+git20150813/generic/include/clc/clc.h libclc-0.2.0+git20170213/generic/include/clc/clc.h --- libclc-0.2.0+git20150813/generic/include/clc/clc.h 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/clc.h 2017-02-12 21:33:49.000000000 +0000 @@ -30,6 +30,7 @@ #include #include #include +#include /* 6.11.2 Math Functions */ #include @@ -43,30 +44,41 @@ #include #include #include +#include #include #include +#include #include #include +#include #include #include +#include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include +#include #include +#include +#include #include #include #include #include +#include #include +#include #include #include #include @@ -77,6 +89,8 @@ #include #include #include +#include +#include #include #include #include @@ -88,6 +102,7 @@ #include #include #include +#include #include /* 6.11.2.1 Floating-point macros */ @@ -210,6 +225,11 @@ #include #include +/* 6.11.13 Image Read and Write Functions */ + +#include +#include + /* libclc internal defintions */ #ifdef __CLC_INTERNAL #include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/float/definitions.h libclc-0.2.0+git20170213/generic/include/clc/float/definitions.h --- libclc-0.2.0+git20150813/generic/include/clc/float/definitions.h 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/float/definitions.h 2017-02-12 21:33:49.000000000 +0000 @@ -14,6 +14,9 @@ #define FLT_MIN 0x1.0p-126f #define FLT_EPSILON 0x1.0p-23f +#define FP_ILOGB0 (-2147483647 - 1) +#define FP_ILOGBNAN (-2147483647 - 1) + #define M_E_F 0x1.5bf0a8p+1f 
#define M_LOG2E_F 0x1.715476p+0f #define M_LOG10E_F 0x1.bcb7b2p-2f diff -Nru libclc-0.2.0+git20150813/generic/include/clc/geometric/floatn.inc libclc-0.2.0+git20170213/generic/include/clc/geometric/floatn.inc --- libclc-0.2.0+git20150813/generic/include/clc/geometric/floatn.inc 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/geometric/floatn.inc 2017-02-12 21:33:49.000000000 +0000 @@ -19,6 +19,14 @@ #include __CLC_BODY #undef __CLC_FLOATN +#define __CLC_FLOATN float8 +#include __CLC_BODY +#undef __CLC_FLOATN + +#define __CLC_FLOATN float16 +#include __CLC_BODY +#undef __CLC_FLOATN + #undef __CLC_FLOAT #undef __CLC_FPSIZE @@ -46,6 +54,14 @@ #include __CLC_BODY #undef __CLC_FLOATN +#define __CLC_FLOATN double8 +#include __CLC_BODY +#undef __CLC_FLOATN + +#define __CLC_FLOATN double16 +#include __CLC_BODY +#undef __CLC_FLOATN + #undef __CLC_FLOAT #undef __CLC_FPSIZE diff -Nru libclc-0.2.0+git20150813/generic/include/clc/image/image_defines.h libclc-0.2.0+git20170213/generic/include/clc/image/image_defines.h --- libclc-0.2.0+git20150813/generic/include/clc/image/image_defines.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/image/image_defines.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,49 @@ +/* get_image_channel_data_type flags */ +#define CLK_SNORM_INT8 0x10D0 +#define CLK_SNORM_INT16 0x10D1 +#define CLK_UNORM_INT8 0x10D2 +#define CLK_UNORM_INT16 0x10D3 +#define CLK_UNORM_SHORT_565 0x10D4 +#define CLK_UNORM_SHORT_555 0x10D5 +#define CLK_UNORM_SHORT_101010 0x10D6 +#define CLK_SIGNED_INT8 0x10D7 +#define CLK_SIGNED_INT16 0x10D8 +#define CLK_SIGNED_INT32 0x10D9 +#define CLK_UNSIGNED_INT8 0x10DA +#define CLK_UNSIGNED_INT16 0x10DB +#define CLK_UNSIGNED_INT32 0x10DC +#define CLK_HALF_FLOAT 0x10DD +#define CLK_FLOAT 0x10DE + +/* get_image_channel_order flags */ +#define CLK_R 0x10B0 +#define CLK_A 0x10B1 +#define CLK_RG 0x10B2 +#define CLK_RA 0x10B3 +#define CLK_RGB 0x10B4 +#define CLK_RGBA 
0x10B5 +#define CLK_BGRA 0x10B6 +#define CLK_ARGB 0x10B7 +#define CLK_INTENSITY 0x10B8 +#define CLK_LUMINANCE 0x10B9 +#define CLK_Rx 0x10BA +#define CLK_RGx 0x10BB +#define CLK_RGBx 0x10BC + +/* sampler normalized coords */ +#define CLK_NORMALIZED_COORDS_FALSE 0x0000 +#define CLK_NORMALIZED_COORDS_TRUE 0x0001 +#define __CLC_NORMALIZED_COORDS_MASK 0x0001 + +/* sampler addressing mode */ +#define CLK_ADDRESS_NONE 0x0000 +#define CLK_ADDRESS_CLAMP_TO_EDGE 0x0002 +#define CLK_ADDRESS_CLAMP 0x0004 +#define CLK_ADDRESS_REPEAT 0x0006 +#define CLK_ADDRESS_MIRRORED_REPEAT 0x0008 +#define __CLC_ADDRESS_MASK 0x000E + +/* sampler filter mode */ +#define CLK_FILTER_NEAREST 0x0000 +#define CLK_FILTER_LINEAR 0x0010 +#define __CLC_FILTER_MASK 0x0010 diff -Nru libclc-0.2.0+git20150813/generic/include/clc/image/image.h libclc-0.2.0+git20170213/generic/include/clc/image/image.h --- libclc-0.2.0+git20150813/generic/include/clc/image/image.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/image/image.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,36 @@ +_CLC_OVERLOAD _CLC_DECL int get_image_width (image2d_t image); +_CLC_OVERLOAD _CLC_DECL int get_image_width (image3d_t image); + +_CLC_OVERLOAD _CLC_DECL int get_image_height (image2d_t image); +_CLC_OVERLOAD _CLC_DECL int get_image_height (image3d_t image); + +_CLC_OVERLOAD _CLC_DECL int get_image_depth (image3d_t image); + +_CLC_OVERLOAD _CLC_DECL int get_image_channel_data_type (image2d_t image); +_CLC_OVERLOAD _CLC_DECL int get_image_channel_data_type (image3d_t image); + +_CLC_OVERLOAD _CLC_DECL int get_image_channel_order (image2d_t image); +_CLC_OVERLOAD _CLC_DECL int get_image_channel_order (image3d_t image); + +_CLC_OVERLOAD _CLC_DECL int2 get_image_dim (image2d_t image); +_CLC_OVERLOAD _CLC_DECL int4 get_image_dim (image3d_t image); + +_CLC_OVERLOAD _CLC_DECL void +write_imagef(image2d_t image, int2 coord, float4 color); +_CLC_OVERLOAD _CLC_DECL void +write_imagei(image2d_t image, int2 
coord, int4 color); +_CLC_OVERLOAD _CLC_DECL void +write_imageui(image2d_t image, int2 coord, uint4 color); + +_CLC_OVERLOAD _CLC_DECL float4 +read_imagef(image2d_t image, sampler_t sampler, int2 coord); +_CLC_OVERLOAD _CLC_DECL float4 +read_imagef(image2d_t image, sampler_t sampler, float2 coord); +_CLC_OVERLOAD _CLC_DECL int4 +read_imagei(image2d_t image, sampler_t sampler, int2 coord); +_CLC_OVERLOAD _CLC_DECL int4 +read_imagei(image2d_t image, sampler_t sampler, float2 coord); +_CLC_OVERLOAD _CLC_DECL uint4 +read_imageui(image2d_t image, sampler_t sampler, int2 coord); +_CLC_OVERLOAD _CLC_DECL uint4 +read_imageui(image2d_t image, sampler_t sampler, float2 coord); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/integer/definitions.h libclc-0.2.0+git20170213/generic/include/clc/integer/definitions.h --- libclc-0.2.0+git20150813/generic/include/clc/integer/definitions.h 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/integer/definitions.h 2017-02-12 21:33:49.000000000 +0000 @@ -1,14 +1,14 @@ #define CHAR_BIT 8 #define INT_MAX 2147483647 -#define INT_MIN -2147483648 +#define INT_MIN (-2147483647 - 1) #define LONG_MAX 0x7fffffffffffffffL -#define LONG_MIN -0x8000000000000000L +#define LONG_MIN (-0x7fffffffffffffffL - 1) +#define CHAR_MAX SCHAR_MAX +#define CHAR_MIN SCHAR_MIN #define SCHAR_MAX 127 -#define SCHAR_MIN -128 -#define CHAR_MAX 127 -#define CHAR_MIN -128 +#define SCHAR_MIN (-127 - 1) #define SHRT_MAX 32767 -#define SHRT_MIN -32768 +#define SHRT_MIN (-32767 - 1) #define UCHAR_MAX 255 #define USHRT_MAX 65535 #define UINT_MAX 0xffffffff diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/cbrt.h libclc-0.2.0+git20170213/generic/include/clc/math/cbrt.h --- libclc-0.2.0+git20150813/generic/include/clc/math/cbrt.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/cbrt.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2014, 2015 Advanced Micro 
Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/cbrt.inc libclc-0.2.0+git20170213/generic/include/clc/math/cbrt.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/cbrt.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/cbrt.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE cbrt(__CLC_GENTYPE x); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/cosh.h libclc-0.2.0+git20170213/generic/include/clc/math/cosh.h --- libclc-0.2.0+git20150813/generic/include/clc/math/cosh.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/cosh.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/cosh.inc libclc-0.2.0+git20170213/generic/include/clc/math/cosh.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/cosh.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/cosh.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE cosh(__CLC_GENTYPE x); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/erf.h libclc-0.2.0+git20170213/generic/include/clc/math/erf.h --- libclc-0.2.0+git20150813/generic/include/clc/math/erf.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/erf.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#undef erfc + +#define __CLC_BODY +#define __CLC_FUNCTION erf + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/expm1.h libclc-0.2.0+git20170213/generic/include/clc/math/expm1.h --- libclc-0.2.0+git20150813/generic/include/clc/math/expm1.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/expm1.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#undef exp + +#define __CLC_BODY +#define __CLC_FUNCTION expm1 + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/fdim.h libclc-0.2.0+git20170213/generic/include/clc/math/fdim.h --- libclc-0.2.0+git20150813/generic/include/clc/math/fdim.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/fdim.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,2 @@ +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/fdim.inc libclc-0.2.0+git20170213/generic/include/clc/math/fdim.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/fdim.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/fdim.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE fdim(__CLC_GENTYPE a, __CLC_GENTYPE b); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/frexp.h libclc-0.2.0+git20170213/generic/include/clc/math/frexp.h --- libclc-0.2.0+git20150813/generic/include/clc/math/frexp.h 1970-01-01 00:00:00.000000000 +0000 +++ 
libclc-0.2.0+git20170213/generic/include/clc/math/frexp.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,2 @@ +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/frexp.inc libclc-0.2.0+git20170213/generic/include/clc/math/frexp.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/frexp.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/frexp.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,3 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE frexp(__CLC_GENTYPE x, global __CLC_INTN *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE frexp(__CLC_GENTYPE x, local __CLC_INTN *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE frexp(__CLC_GENTYPE x, private __CLC_INTN *iptr); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/gentype.inc libclc-0.2.0+git20170213/generic/include/clc/math/gentype.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/gentype.inc 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/gentype.inc 2017-02-12 21:33:49.000000000 +0000 @@ -2,38 +2,50 @@ #define __CLC_FPSIZE 32 #define __CLC_GENTYPE float +#define __CLC_INTN int #define __CLC_SCALAR #include __CLC_BODY #undef __CLC_GENTYPE +#undef __CLC_INTN #undef __CLC_SCALAR #define __CLC_GENTYPE float2 #define __CLC_INTN int2 +#define __CLC_VECSIZE 2 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN #define __CLC_GENTYPE float3 #define __CLC_INTN int3 +#define __CLC_VECSIZE 3 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN #define __CLC_GENTYPE float4 #define __CLC_INTN int4 +#define __CLC_VECSIZE 4 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN #define __CLC_GENTYPE float8 #define __CLC_INTN int8 +#define __CLC_VECSIZE 8 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN #define __CLC_GENTYPE float16 #define __CLC_INTN int16 +#define __CLC_VECSIZE 16 
#include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN @@ -47,37 +59,49 @@ #define __CLC_SCALAR #define __CLC_GENTYPE double +#define __CLC_INTN int #include __CLC_BODY #undef __CLC_GENTYPE +#undef __CLC_INTN #undef __CLC_SCALAR #define __CLC_GENTYPE double2 #define __CLC_INTN int2 +#define __CLC_VECSIZE 2 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN #define __CLC_GENTYPE double3 #define __CLC_INTN int3 +#define __CLC_VECSIZE 3 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN #define __CLC_GENTYPE double4 #define __CLC_INTN int4 +#define __CLC_VECSIZE 4 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN #define __CLC_GENTYPE double8 #define __CLC_INTN int8 +#define __CLC_VECSIZE 8 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN #define __CLC_GENTYPE double16 #define __CLC_INTN int16 +#define __CLC_VECSIZE 16 #include __CLC_BODY +#undef __CLC_VECSIZE #undef __CLC_GENTYPE #undef __CLC_INTN diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/ilogb.h libclc-0.2.0+git20170213/generic/include/clc/math/ilogb.h --- libclc-0.2.0+git20150813/generic/include/clc/math/ilogb.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/ilogb.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,5 @@ +#define __CLC_BODY + +#include + +#undef __CLC_BODY diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/ilogb.inc libclc-0.2.0+git20170213/generic/include/clc/math/ilogb.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/ilogb.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/ilogb.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_INTN ilogb(__CLC_GENTYPE x); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/lgamma.h libclc-0.2.0+git20170213/generic/include/clc/math/lgamma.h --- 
libclc-0.2.0+git20150813/generic/include/clc/math/lgamma.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/lgamma.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,2 @@ +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/lgamma.inc libclc-0.2.0+git20170213/generic/include/clc/math/lgamma.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/lgamma.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/lgamma.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE lgamma(__CLC_GENTYPE a); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/lgamma_r.h libclc-0.2.0+git20170213/generic/include/clc/math/lgamma_r.h --- libclc-0.2.0+git20150813/generic/include/clc/math/lgamma_r.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/lgamma_r.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,2 @@ +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/lgamma_r.inc libclc-0.2.0+git20170213/generic/include/clc/math/lgamma_r.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/lgamma_r.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/lgamma_r.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,3 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, global __CLC_INTN *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, local __CLC_INTN *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, private __CLC_INTN *iptr); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/logb.h libclc-0.2.0+git20170213/generic/include/clc/math/logb.h --- libclc-0.2.0+git20150813/generic/include/clc/math/logb.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/logb.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,2 @@ 
+#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/logb.inc libclc-0.2.0+git20170213/generic/include/clc/math/logb.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/logb.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/logb.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE logb(__CLC_GENTYPE a); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/modf.h libclc-0.2.0+git20170213/generic/include/clc/math/modf.h --- libclc-0.2.0+git20150813/generic/include/clc/math/modf.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/modf.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/modf.inc libclc-0.2.0+git20170213/generic/include/clc/math/modf.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/modf.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/modf.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014, 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, global __CLC_GENTYPE *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, local __CLC_GENTYPE *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE modf(__CLC_GENTYPE x, private __CLC_GENTYPE *iptr); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/native_rsqrt.h libclc-0.2.0+git20170213/generic/include/clc/math/native_rsqrt.h --- libclc-0.2.0+git20150813/generic/include/clc/math/native_rsqrt.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/native_rsqrt.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1 @@ +#define native_rsqrt rsqrt diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/tanh.h libclc-0.2.0+git20170213/generic/include/clc/math/tanh.h --- libclc-0.2.0+git20150813/generic/include/clc/math/tanh.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/tanh.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/tanh.inc libclc-0.2.0+git20170213/generic/include/clc/math/tanh.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/tanh.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/tanh.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE tanh(__CLC_GENTYPE a); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/tgamma.h libclc-0.2.0+git20170213/generic/include/clc/math/tgamma.h --- libclc-0.2.0+git20150813/generic/include/clc/math/tgamma.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/tgamma.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,3 @@ +#define __CLC_BODY +#include +#undef __CLC_BODY diff -Nru libclc-0.2.0+git20150813/generic/include/clc/math/tgamma.inc libclc-0.2.0+git20170213/generic/include/clc/math/tgamma.inc --- libclc-0.2.0+git20150813/generic/include/clc/math/tgamma.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/math/tgamma.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE tgamma(__CLC_GENTYPE a); diff -Nru libclc-0.2.0+git20150813/generic/include/clc/shared/vstore.h libclc-0.2.0+git20170213/generic/include/clc/shared/vstore.h --- libclc-0.2.0+git20150813/generic/include/clc/shared/vstore.h 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/shared/vstore.h 2017-02-12 21:33:49.000000000 +0000 @@ -1,17 +1,20 @@ -#define _CLC_VSTORE_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \ - _CLC_OVERLOAD _CLC_DECL void vstore##WIDTH(VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out); +#define _CLC_VSTORE_DECL(SUFFIX, PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \ + _CLC_OVERLOAD _CLC_DECL void vstore##SUFFIX##WIDTH(VEC_TYPE vec, size_t offset, ADDR_SPACE PRIM_TYPE *out); -#define _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, ADDR_SPACE) \ - _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \ - _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \ - _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \ - _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \ - _CLC_VSTORE_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE) +#define _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, 
PRIM_TYPE, ADDR_SPACE) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \ + _CLC_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE) + +#define _CLC_VECTOR_VSTORE_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \ + _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \ + _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \ + _CLC_VECTOR_VSTORE_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) \ #define _CLC_VECTOR_VSTORE_PRIM1(PRIM_TYPE) \ - _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __private) \ - _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __local) \ - _CLC_VECTOR_VSTORE_DECL(PRIM_TYPE, __global) \ + _CLC_VECTOR_VSTORE_PRIM3(,PRIM_TYPE, PRIM_TYPE) \ #define _CLC_VECTOR_VSTORE_PRIM() \ _CLC_VECTOR_VSTORE_PRIM1(char) \ @@ -23,14 +26,18 @@ _CLC_VECTOR_VSTORE_PRIM1(long) \ _CLC_VECTOR_VSTORE_PRIM1(ulong) \ _CLC_VECTOR_VSTORE_PRIM1(float) \ - + _CLC_VECTOR_VSTORE_PRIM3(_half, half, float) + #ifdef cl_khr_fp64 -#define _CLC_VECTOR_VSTORE() \ - _CLC_VECTOR_VSTORE_PRIM1(double) \ - _CLC_VECTOR_VSTORE_PRIM() -#else -#define _CLC_VECTOR_VSTORE() \ - _CLC_VECTOR_VSTORE_PRIM() +#pragma OPENCL EXTENSION cl_khr_fp64: enable + _CLC_VECTOR_VSTORE_PRIM1(double) + _CLC_VECTOR_VSTORE_PRIM3(_half, half, double) + _CLC_VSTORE_DECL(_half, half, double, , __private) + _CLC_VSTORE_DECL(_half, half, double, , __local) + _CLC_VSTORE_DECL(_half, half, double, , __global) #endif -_CLC_VECTOR_VSTORE() +_CLC_VECTOR_VSTORE_PRIM() +_CLC_VSTORE_DECL(_half, half, float, , __private) +_CLC_VSTORE_DECL(_half, half, float, , __local) +_CLC_VSTORE_DECL(_half, half, float, , __global) diff -Nru libclc-0.2.0+git20150813/generic/include/clc/workitem/get_global_offset.h libclc-0.2.0+git20170213/generic/include/clc/workitem/get_global_offset.h --- 
libclc-0.2.0+git20150813/generic/include/clc/workitem/get_global_offset.h 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/include/clc/workitem/get_global_offset.h 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1 @@ +_CLC_DECL size_t get_global_offset(uint dim); diff -Nru libclc-0.2.0+git20150813/generic/lib/clcmacro.h libclc-0.2.0+git20170213/generic/lib/clcmacro.h --- libclc-0.2.0+git20150813/generic/lib/clcmacro.h 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/clcmacro.h 2017-02-12 21:33:49.000000000 +0000 @@ -109,6 +109,43 @@ } \ \ +#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, ADDR_SPACE, ARG2_TYPE) \ + DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ADDR_SPACE ARG2_TYPE##2 *y) { \ + return (RET_TYPE##2)( \ + FUNCTION(x.x, (ARG2_TYPE*)y), \ + FUNCTION(x.y, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+1)) \ + ); \ + } \ +\ + DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ADDR_SPACE ARG2_TYPE##3 *y) { \ + return (RET_TYPE##3)( \ + FUNCTION(x.x, (ARG2_TYPE*)y), \ + FUNCTION(x.y, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+1)), \ + FUNCTION(x.z, (ADDR_SPACE ARG2_TYPE*)((ADDR_SPACE ARG2_TYPE*)y+2)) \ + ); \ + } \ +\ + DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ADDR_SPACE ARG2_TYPE##4 *y) { \ + return (RET_TYPE##4)( \ + FUNCTION(x.lo, (ARG2_TYPE##2*)y), \ + FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##2*)((ADDR_SPACE ARG2_TYPE*)y+2)) \ + ); \ + } \ +\ + DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ADDR_SPACE ARG2_TYPE##8 *y) { \ + return (RET_TYPE##8)( \ + FUNCTION(x.lo, (ARG2_TYPE##4*)y), \ + FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##4*)((ADDR_SPACE ARG2_TYPE*)y+4)) \ + ); \ + } \ +\ + DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ADDR_SPACE ARG2_TYPE##16 *y) { \ + return (RET_TYPE##16)( \ + FUNCTION(x.lo, (ARG2_TYPE##8*)y), \ + FUNCTION(x.hi, (ADDR_SPACE ARG2_TYPE##8*)((ADDR_SPACE ARG2_TYPE*)y+8)) \ + ); \ + } + #define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, 
ARG2_TYPE) \ _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ return BUILTIN(x, y); \ diff -Nru libclc-0.2.0+git20150813/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl libclc-0.2.0+git20170213/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl --- libclc-0.2.0+git20150813/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/cl_khr_global_int32_base_atomics/atom_dec.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,7 +2,7 @@ #define IMPL(TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(global TYPE *p) { \ - return atom_sub(p, 1); \ + return atom_sub(p, (TYPE)1); \ } IMPL(int) diff -Nru libclc-0.2.0+git20150813/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl libclc-0.2.0+git20170213/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl --- libclc-0.2.0+git20150813/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/cl_khr_global_int32_base_atomics/atom_inc.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,7 +2,7 @@ #define IMPL(TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(global TYPE *p) { \ - return atom_add(p, 1); \ + return atom_add(p, (TYPE)1); \ } IMPL(int) diff -Nru libclc-0.2.0+git20150813/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl libclc-0.2.0+git20170213/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl --- libclc-0.2.0+git20150813/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/cl_khr_local_int32_base_atomics/atom_dec.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,7 +2,7 @@ #define IMPL(TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(local TYPE *p) { \ - return atom_sub(p, 1); \ + return atom_sub(p, (TYPE)1); \ } IMPL(int) diff -Nru libclc-0.2.0+git20150813/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl 
libclc-0.2.0+git20170213/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl --- libclc-0.2.0+git20150813/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/cl_khr_local_int32_base_atomics/atom_inc.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,7 +2,7 @@ #define IMPL(TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(local TYPE *p) { \ - return atom_add(p, 1); \ + return atom_add(p, (TYPE)1); \ } IMPL(int) diff -Nru libclc-0.2.0+git20150813/generic/lib/gen_convert.py libclc-0.2.0+git20170213/generic/lib/gen_convert.py --- libclc-0.2.0+git20150813/generic/lib/gen_convert.py 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/gen_convert.py 2017-02-12 21:33:49.000000000 +0000 @@ -97,14 +97,12 @@ int64_count = int64_count +1 elif dst in float64_types: float64_count = float64_count + 1 - if float64_count > 0 and int64_count > 0: - print("#if defined(cl_khr_fp64) && defined(cles_khr_int64)") - return True - elif float64_count > 0: + if float64_count > 0: + #In embedded profile, if cl_khr_fp64 is supported cles_khr_int64 has to be print("#ifdef cl_khr_fp64") return True elif int64_count > 0: - print("#ifdef cles_khr_int64") + print("#if defined cles_khr_int64 || !defined(__EMBEDDED_PROFILE__)") return True return False @@ -142,6 +140,15 @@ #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#if defined(__EMBEDDED_PROFILE__) && !defined(cles_khr_int64) +#error Embedded profile that supports cl_khr_fp64 also has to support cles_khr_int64 +#endif + +#endif + +#ifdef cles_khr_int64 +#pragma OPENCL EXTENSION cles_khr_int64 : enable #endif """) diff -Nru libclc-0.2.0+git20150813/generic/lib/image/get_image_dim.cl libclc-0.2.0+git20170213/generic/lib/image/get_image_dim.cl --- libclc-0.2.0+git20150813/generic/lib/image/get_image_dim.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/image/get_image_dim.cl 2017-02-12 
21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#include + +_CLC_OVERLOAD _CLC_DEF int2 get_image_dim (image2d_t image) { + return (int2)(get_image_width(image), get_image_height(image)); +} +_CLC_OVERLOAD _CLC_DEF int4 get_image_dim (image3d_t image) { + return (int4)(get_image_width(image), get_image_height(image), + get_image_depth(image), 0); +} diff -Nru libclc-0.2.0+git20150813/generic/lib/math/cbrt.cl libclc-0.2.0+git20170213/generic/lib/math/cbrt.cl --- libclc-0.2.0+git20150813/generic/lib/math/cbrt.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/cbrt.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include + +#include "math.h" +#include "tables.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float cbrt(float x) { + + uint xi = as_uint(x); + uint axi = xi & EXSIGNBIT_SP32; + uint xsign = axi ^ xi; + xi = axi; + + int m = (xi >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + // Treat subnormals + uint xisub = as_uint(as_float(xi | 0x3f800000) - 1.0f); + int msub = (xisub >> EXPSHIFTBITS_SP32) - 253; + int c = m == -127; + xi = c ? xisub : xi; + m = c ? msub : m; + + int m3 = m / 3; + int rem = m - m3*3; + float mf = as_float((m3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + + uint indx = (xi & 0x007f0000) + ((xi & 0x00008000) << 1); + float f = as_float((xi & MANTBITS_SP32) | 0x3f000000) - as_float(indx | 0x3f000000); + + indx >>= 16; + float r = f * USE_TABLE(log_inv_tbl, indx); + float poly = mad(mad(r, 0x1.f9add4p-5f, -0x1.c71c72p-4f), r*r, r * 0x1.555556p-2f); + + // This could also be done with a 5-element table + float remH = 0x1.428000p-1f; + float remT = 0x1.45f31ap-14f; + + remH = rem == -1 ? 0x1.964000p-1f : remH; + remT = rem == -1 ? 0x1.fea53ep-13f : remT; + + remH = rem == 0 ? 0x1.000000p+0f : remH; + remT = rem == 0 ? 0x0.000000p+0f : remT; + + remH = rem == 1 ? 0x1.428000p+0f : remH; + remT = rem == 1 ? 0x1.45f31ap-13f : remT; + + remH = rem == 2 ? 0x1.964000p+0f : remH; + remT = rem == 2 ? 0x1.fea53ep-12f : remT; + + float2 tv = USE_TABLE(cbrt_tbl, indx); + float cbrtH = tv.s0; + float cbrtT = tv.s1; + + float bH = cbrtH * remH; + float bT = mad(cbrtH, remT, mad(cbrtT, remH, cbrtT*remT)); + + float z = mad(poly, bH, mad(poly, bT, bT)) + bH; + z *= mf; + z = as_float(as_uint(z) | xsign); + c = axi >= EXPBITS_SP32 | axi == 0; + z = c ? 
x : z; + return z; + +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cbrt, float); + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double cbrt(double x) { + + int return_x = isinf(x) | isnan(x) | x == 0.0; + ulong ux = as_ulong(fabs(x)); + int m = (as_int2(ux).hi >> 20) - 1023; + + // Treat subnormals + ulong uxs = as_ulong(as_double(0x3ff0000000000000UL | ux) - 1.0); + int ms = m + (as_int2(uxs).hi >> 20) - 1022; + + int c = m == -1023; + ux = c ? uxs : ux; + m = c ? ms : m; + + int mby3 = m / 3; + int rem = m - 3*mby3; + + double mf = as_double((ulong)(mby3 + 1023) << 52); + + ux &= 0x000fffffffffffffUL; + double Y = as_double(0x3fe0000000000000UL | ux); + + // nearest integer + int index = as_int2(ux).hi >> 11; + index = (0x100 | (index >> 1)) + (index & 1); + double F = (double)index * 0x1.0p-9; + + double f = Y - F; + double r = f * USE_TABLE(cbrt_inv_tbl, index-256); + + double z = r * fma(r, + fma(r, + fma(r, + fma(r, + fma(r, -0x1.8090d6221a247p-6, 0x1.ee7113506ac13p-6), + -0x1.511e8d2b3183bp-5), + 0x1.f9add3c0ca458p-5), + -0x1.c71c71c71c71cp-4), + 0x1.5555555555555p-2); + + double2 tv = USE_TABLE(cbrt_rem_tbl, rem+2); + double Rem_h = tv.s0; + double Rem_t = tv.s1; + + tv = USE_TABLE(cbrt_dbl_tbl, index-256); + double F_h = tv.s0; + double F_t = tv.s1; + + double b_h = F_h * Rem_h; + double b_t = fma(Rem_t, F_h, fma(F_t, Rem_h, F_t*Rem_t)); + + double ans = fma(z, b_h, fma(z, b_t, b_t)) + b_h; + ans = copysign(ans*mf, x); + return return_x ? 
x : ans; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cbrt, double) + +#endif diff -Nru libclc-0.2.0+git20150813/generic/lib/math/clc_nextafter.cl libclc-0.2.0+git20170213/generic/lib/math/clc_nextafter.cl --- libclc-0.2.0+git20150813/generic/lib/math/clc_nextafter.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/clc_nextafter.cl 2017-02-12 21:33:49.000000000 +0000 @@ -1,43 +1,39 @@ #include #include "../clcmacro.h" -// This file provides OpenCL C implementations of nextafter for targets that -// don't support the clang builtin. +// This file provides OpenCL C implementations of nextafter for +// targets that don't support the clang builtin. -#define FLT_NAN 0.0f/0.0f +#define AS_TYPE(x) as_##x -#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, NAN, ZERO, NEXTAFTER_ZERO) \ +#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, INT_TYPE) \ _CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x, FLOAT_TYPE y) { \ - union { \ - FLOAT_TYPE f; \ - UINT_TYPE i; \ - } next; \ - if (isnan(x) || isnan(y)) { \ - return NAN; \ - } \ - if (x == y) { \ - return y; \ - } \ - next.f = x; \ - if (x < y) { \ - next.i++; \ - } else { \ - if (next.f == ZERO) { \ - next.i = NEXTAFTER_ZERO; \ - } else { \ - next.i--; \ - } \ - } \ - return next.f; \ + const UINT_TYPE sign_bit \ + = (UINT_TYPE)1 << (sizeof(INT_TYPE) * 8 - 1); \ + const UINT_TYPE sign_bit_mask = sign_bit - 1; \ + INT_TYPE ix = AS_TYPE(INT_TYPE)(x); \ + INT_TYPE ax = ix & sign_bit_mask; \ + INT_TYPE mx = sign_bit - ix; \ + mx = ix < 0 ? mx : ix; \ + INT_TYPE iy = AS_TYPE(INT_TYPE)(y); \ + INT_TYPE ay = iy & sign_bit_mask; \ + INT_TYPE my = sign_bit - iy; \ + my = iy < 0 ? my : iy; \ + INT_TYPE t = mx + (mx < my ? 1 : -1); \ + INT_TYPE r = sign_bit - t; \ + r = t < 0 ? r : t; \ + r = isnan(x) ? ix : r; \ + r = isnan(y) ? iy : r; \ + r = ((ax | ay) == 0 | ix == iy) ? 
iy : r; \ + return AS_TYPE(FLOAT_TYPE)(r); \ } -NEXTAFTER(float, uint, FLT_NAN, 0.0f, 0x80000001) +NEXTAFTER(float, uint, int) _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_nextafter, float, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -#define DBL_NAN 0.0/0.0 -NEXTAFTER(double, ulong, DBL_NAN, 0.0, 0x8000000000000001) +NEXTAFTER(double, ulong, long) _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __clc_nextafter, double, double) #endif diff -Nru libclc-0.2.0+git20150813/generic/lib/math/cosh.cl libclc-0.2.0+git20170213/generic/lib/math/cosh.cl --- libclc-0.2.0+git20150813/generic/lib/math/cosh.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/cosh.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+_CLC_OVERLOAD _CLC_DEF float cosh(float x) {
+
+    // After dealing with special cases the computation is split into regions as follows.
+    // abs(x) >= max_cosh_arg:
+    // cosh(x) = +Inf (only abs(x) is used, so the sign of x is discarded)
+    // abs(x) >= small_threshold:
+    // cosh(x) = exp(abs(x))/2, evaluated below as t + 0x1.a0210ep-18f*t
+    // with t = exp(abs(x) - 0x1.62e500p-1f).
+    // abs(x) < small_threshold:
+    // split abs(x) into an integer part and a remainder and combine the
+    // tabulated values for the integer part with polynomials in the remainder.
+
+    const float max_cosh_arg = 0x1.65a9fap+6f;
+    const float small_threshold = 0x1.0a2b24p+3f;
+
+    uint ux = as_uint(x);
+    uint aux = ux & EXSIGNBIT_SP32;
+    float y = as_float(aux);
+
+    // Find the integer part y0 of y and the increment dy = y - y0. We then compute
+    // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+    // with sinh(dy) ~ sdy and cosh(dy) ~ cdy from the polynomials below; the y0
+    // factors are read from sinhcosh_tbl (tv.s0 scales sdy, tv.s1 scales cdy).
+
+    int ind = (int)y;
+    ind = (uint)ind > 36U ? 0 : ind; // out-of-range index: z is overridden by the selects below
+
+    float dy = y - ind;
+    float dy2 = dy * dy;
+
+    float sdy = mad(dy2,
+        mad(dy2,
+            mad(dy2,
+                mad(dy2,
+                    mad(dy2,
+                        mad(dy2, 0.7746188980094184251527126e-12f, 0.160576793121939886190847e-9f),
+                        0.250521176994133472333666e-7f),
+                    0.275573191913636406057211e-5f),
+                0.198412698413242405162014e-3f),
+            0.833333333333329931873097e-2f),
+        0.166666666666666667013899e0f);
+    sdy = mad(sdy, dy*dy2, dy);
+
+    float cdy = mad(dy2,
+        mad(dy2,
+            mad(dy2,
+                mad(dy2,
+                    mad(dy2,
+                        mad(dy2, 0.1163921388172173692062032e-10f, 0.208744349831471353536305e-8f),
+                        0.275573350756016588011357e-6f),
+                    0.248015872460622433115785e-4f),
+                0.138888888889814854814536e-2f),
+            0.416666666666660876512776e-1f),
+        0.500000000000000005911074e0f);
+    cdy = mad(cdy, dy2, 1.0f);
+
+    float2 tv = USE_TABLE(sinhcosh_tbl, ind);
+    float z = mad(tv.s0, sdy, tv.s1 * cdy);
+
+    // When exp(-x) is insignificant compared to exp(x), return exp(x)/2
+    float t = exp(y - 0x1.62e500p-1f);
+    float zsmall = mad(0x1.a0210ep-18f, t, t);
+    z = y >= small_threshold ? zsmall : z;
+
+    // Corner cases
+    z = y >= max_cosh_arg ? as_float(PINFBITPATT_SP32) : z;
+    z = aux > PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : z;
+    z = aux < 0x38800000 ? 1.0f : z;
+
+    return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cosh, float);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double cosh(double x) {
+
+    // After dealing with special cases the computation is split into
+    // regions as follows:
+    //
+    // abs(x) >= max_cosh_arg:
+    // cosh(x) = +Inf
+    //
+    // abs(x) >= small_threshold:
+    // cosh(x) = exp(abs(x))/2, evaluated below as t - 0x1.ef35793c76641p-45*t
+    // with t = exp(abs(x) - 0x1.62e42fefa3800p-1).
+    //
+    // abs(x) < small_threshold:
+    // combine tabulated values for the integer part of abs(x) with
+    // polynomial approximations for the remainder (see below).
+
+    // This is ln(2^1025)
+    const double max_cosh_arg = 7.10475860073943977113e+02; // 0x408633ce8fb9f87e
+
+    // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27)
+    const double small_threshold = 0x1.2b708872320e2p+4;
+
+    double y = fabs(x);
+
+    // In this range we find the integer part y0 of y
+    // and the increment dy = y - y0. We then compute
+    // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are read from sinh_tbl and cosh_tbl.
+
+    int ind = min((int)y, 36);
+    double dy = y - ind;
+    double dy2 = dy * dy;
+
+    double sdy = dy * dy2 *
+        fma(dy2,
+            fma(dy2,
+                fma(dy2,
+                    fma(dy2,
+                        fma(dy2,
+                            fma(dy2, 0.7746188980094184251527126e-12, 0.160576793121939886190847e-9),
+                            0.250521176994133472333666e-7),
+                        0.275573191913636406057211e-5),
+                    0.198412698413242405162014e-3),
+                0.833333333333329931873097e-2),
+            0.166666666666666667013899e0);
+
+    double cdy = dy2 * fma(dy2,
+        fma(dy2,
+            fma(dy2,
+                fma(dy2,
+                    fma(dy2,
+                        fma(dy2, 0.1163921388172173692062032e-10, 0.208744349831471353536305e-8),
+                        0.275573350756016588011357e-6),
+                    0.248015872460622433115785e-4),
+                0.138888888889814854814536e-2),
+            0.416666666666660876512776e-1),
+        0.500000000000000005911074e0);
+
+    // At this point sinh(dy) is approximated by dy + sdy,
+    // and cosh(dy) is approximated by 1 + cdy.
+    double2 tv = USE_TABLE(cosh_tbl, ind);
+    double cl = tv.s0;
+    double ct = tv.s1;
+    tv = USE_TABLE(sinh_tbl, ind);
+    double sl = tv.s0;
+    double st = tv.s1;
+
+    double z = fma(sl, dy, fma(sl, sdy, fma(cl, cdy, fma(st, dy, fma(st, sdy, ct*cdy)) + ct))) + cl;
+
+    // Other cases
+    z = y < 0x1.0p-28 ? 1.0 : z;
+
+    double t = exp(y - 0x1.62e42fefa3800p-1);
+    t = fma(t, -0x1.ef35793c76641p-45, t);
+    z = y >= small_threshold ? t : z;
+
+    z = y >= max_cosh_arg ? as_double(PINFBITPATT_DP64) : z;
+
+    z = isinf(x) | isnan(x) ? y : z; // y = fabs(x): +Inf for either infinity, NaN for NaN
+
+    return z;
+
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cosh, double)
+
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/erf.cl libclc-0.2.0+git20170213/generic/lib/math/erf.cl
--- libclc-0.2.0+git20150813/generic/lib/math/erf.cl 1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/erf.cl 2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+
+#include "math.h"
+#include "../clcmacro.h"
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+*/
+
+#define erx 8.4506291151e-01f /* 0x3f58560b */
+
+// Coefficients for approximation to erf on [0, 0.84375]
+
+#define efx 1.2837916613e-01f /* 0x3e0375d4 */
+#define efx8 1.0270333290e+00f /* 0x3f8375d4 */
+
+#define pp0 1.2837916613e-01f /* 0x3e0375d4 */
+#define pp1 -3.2504209876e-01f /* 0xbea66beb */
+#define pp2 -2.8481749818e-02f /* 0xbce9528f */
+#define pp3 -5.7702702470e-03f /* 0xbbbd1489 */
+#define pp4 -2.3763017452e-05f /* 0xb7c756b1 */
+#define qq1 3.9791721106e-01f /* 0x3ecbbbce */
+#define qq2 6.5022252500e-02f /* 0x3d852a63 */
+#define qq3 5.0813062117e-03f /* 0x3ba68116 */
+#define qq4 1.3249473704e-04f /* 0x390aee49 */
+#define qq5 -3.9602282413e-06f /* 0xb684e21a */
+
+// Coefficients for approximation to erf in [0.84375, 1.25]
+
+#define pa0 -2.3621185683e-03f /* 0xbb1acdc6 */
+#define pa1 4.1485610604e-01f /* 0x3ed46805 */
+#define pa2 -3.7220788002e-01f /* 0xbebe9208 */
+#define pa3 3.1834661961e-01f /* 0x3ea2fe54 */
+#define pa4 -1.1089469492e-01f /* 0xbde31cc2 */
+#define pa5 3.5478305072e-02f /* 0x3d1151b3 */
+#define pa6 -2.1663755178e-03f /* 0xbb0df9c0 */
+#define qa1 1.0642088205e-01f /* 0x3dd9f331 */
+#define qa2 5.4039794207e-01f /* 0x3f0a5785 */
+#define qa3 7.1828655899e-02f /* 0x3d931ae7 */
+#define qa4 1.2617121637e-01f /* 0x3e013307 */
+#define qa5 1.3637083583e-02f /* 0x3c5f6e13 */
+#define qa6 1.1984500103e-02f /* 0x3c445aa3 */
+
+// Coefficients for approximation to erfc in [1.25, 1/0.35]
+
+#define ra0 -9.8649440333e-03f /* 0xbc21a093 */
+#define ra1 -6.9385856390e-01f /* 0xbf31a0b7 */
+#define ra2 -1.0558626175e+01f /* 0xc128f022 */
+#define ra3 -6.2375331879e+01f /* 0xc2798057 */
+#define ra4 -1.6239666748e+02f /* 0xc322658c */
+#define ra5 -1.8460508728e+02f /* 0xc3389ae7 */
+#define ra6 -8.1287437439e+01f /* 0xc2a2932b */
+#define ra7 -9.8143291473e+00f /* 0xc11d077e */
+#define sa1 1.9651271820e+01f /* 0x419d35ce */
+#define sa2 1.3765776062e+02f /* 0x4309a863 */
+#define sa3 4.3456588745e+02f /* 0x43d9486f */
+#define sa4 6.4538726807e+02f /* 0x442158c9 */
+#define sa5 4.2900814819e+02f /* 0x43d6810b */
+#define sa6 1.0863500214e+02f /* 0x42d9451f */
+#define sa7 6.5702495575e+00f /* 0x40d23f7c */
+#define sa8 -6.0424413532e-02f /* 0xbd777f97 */
+
+// Coefficients for approximation to erfc in [1/0.35, 28]
+
+#define rb0 -9.8649431020e-03f /* 0xbc21a092 */
+#define rb1 -7.9928326607e-01f /* 0xbf4c9dd4 */
+#define rb2 -1.7757955551e+01f /* 0xc18e104b */
+#define rb3 -1.6063638306e+02f /* 0xc320a2ea */
+#define rb4 -6.3756646729e+02f /* 0xc41f6441 */
+#define rb5 -1.0250950928e+03f /* 0xc480230b */
+#define rb6 -4.8351919556e+02f /* 0xc3f1c275 */
+#define sb1 3.0338060379e+01f /* 0x41f2b459 */
+#define sb2 3.2579251099e+02f /* 0x43a2e571 */
+#define sb3 1.5367296143e+03f /* 0x44c01759 */
+#define sb4 3.1998581543e+03f /* 0x4547fdbb */
+#define sb5 2.5530502930e+03f /* 0x451f90ce */
+#define sb6 4.7452853394e+02f /* 0x43ed43a7 */
+#define sb7 -2.2440952301e+01f /* 0xc1b38712 */
+
+_CLC_OVERLOAD _CLC_DEF float erf(float x) { // branch-free: every range is evaluated, selects pick one
+    int hx = as_uint(x);
+    int ix = hx & 0x7fffffff;
+    float absx = as_float(ix);
+
+    float x2 = absx * absx;
+    float t = 1.0f / x2; // polynomial variable: 1/x^2, |x|-1 or x^2 depending on the range
+    float tt = absx - 1.0f;
+    t = absx < 1.25f ? tt : t;
+    t = absx < 0.84375f ? x2 : t;
+
+    float u, v, tu, tv;
+
+    // |x| < 6
+    u = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+    v = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sb7, sb6), sb5), sb4), sb3), sb2), sb1);
+
+    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+    tv = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1);
+    u = absx < 0x1.6db6dcp+1f ? tu : u; // 0x1.6db6dcp+1f is the 1/0.35 boundary
+    v = absx < 0x1.6db6dcp+1f ? tv : v;
+
+    tu = mad(t, mad(t, mad(t, mad(t, mad(t, mad(t, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+    tv = mad(t, mad(t, mad(t, mad(t, mad(t, qa6, qa5), qa4), qa3), qa2), qa1);
+    u = absx < 1.25f ? tu : u;
+    v = absx < 1.25f ? tv : v;
+
+    tu = mad(t, mad(t, mad(t, mad(t, pp4, pp3), pp2), pp1), pp0);
+    tv = mad(t, mad(t, mad(t, mad(t, qq5, qq4), qq3), qq2), qq1);
+    u = absx < 0.84375f ? tu : u;
+    v = absx < 0.84375f ? tv : v;
+
+    v = mad(t, v, 1.0f); // the denominator polynomials have an implicit constant term of 1
+    float q = MATH_DIVIDE(u, v);
+
+    float ret = 1.0f;
+
+    // |x| < 6
+    float z = as_float(ix & 0xfffff000); // |x| with the low 12 mantissa bits cleared
+    float r = exp(mad(-z, z, -0.5625f)) * exp(mad(z-absx, z+absx, q));
+    r = 1.0f - MATH_DIVIDE(r, absx);
+    ret = absx < 6.0f ? r : ret;
+
+    r = erx + q;
+    ret = absx < 1.25f ? r : ret;
+
+    ret = as_float((hx & 0x80000000) | as_int(ret)); // copy the sign of x onto the magnitude
+
+    r = mad(x, q, x); // uses the signed x, so it is applied after the sign copy
+    ret = absx < 0.84375f ? r : ret;
+
+    // Prevent underflow
+    r = 0.125f * mad(8.0f, x, efx8 * x);
+    ret = absx < 0x1.0p-28f ? r : ret;
+
+    ret = isnan(x) ? x : ret;
+
+    return ret;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, erf, float);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* double erf(double x)
+ * double erfc(double x)
+ *                           x
+ *                    2      |\
+ *     erf(x)  =  ---------  | exp(-t*t)dt
+ *                 sqrt(pi) \|
+ *                           0
+ *
+ *     erfc(x) =  1-erf(x)
+ *  Note that
+ *              erf(-x) = -erf(x)
+ *              erfc(-x) = 2 - erfc(x)
+ *
+ * Method:
+ *   1. For |x| in [0, 0.84375]
+ *          erf(x)  = x + x*R(x^2)
+ *          erfc(x) = 1 - erf(x)           if x in [-.84375,0.25]
+ *                  = 0.5 + ((0.5-x)-x*R)  if x in [0.25,0.84375]
+ *      where R = P/Q where P is an odd poly of degree 8 and
+ *      Q is an odd poly of degree 10.
+ *                                           -57.90
+ *                  | R - (erf(x)-x)/x | <= 2
+ *
+ *
+ *      Remark. The formula is derived by noting
+ *          erf(x) = (2/sqrt(pi))*(x - x^3/3 + x^5/10 - x^7/42 + ....)
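+ *      and that
+ *          2/sqrt(pi) = 1.128379167095512573896158903121545171688
+ *      is close to one. The interval is chosen because the fix
+ *      point of erf(x) is near 0.6174 (i.e., erf(x)=x when x is
+ *      near 0.6174), and by some experiment, 0.84375 is chosen to
+ *      guarantee the error is less than one ulp for erf.
+ *
+ *   2. For |x| in [0.84375,1.25], let s = |x| - 1, and
+ *      c = 0.84506291151 rounded to single (24 bits)
+ *          erf(x)  = sign(x) * (c + P1(s)/Q1(s))
+ *          erfc(x) = (1-c) - P1(s)/Q1(s) if x > 0
+ *                    1+(c+P1(s)/Q1(s))   if x < 0
+ *          |P1/Q1 - (erf(|x|)-c)| <= 2**-59.06
+ *      Remark: here we use the taylor series expansion at x=1.
+ *          erf(1+s) = erf(1) + s*Poly(s)
+ *                   = 0.845.. + P1(s)/Q1(s)
+ *      That is, we use rational approximation to approximate
+ *          erf(1+s) - (c = (single)0.84506291151)
+ *      Note that |P1/Q1|< 0.078 for x in [0.84375,1.25]
+ *      where
+ *          P1(s) = degree 6 poly in s
+ *          Q1(s) = degree 6 poly in s
+ *
+ *   3. For x in [1.25,1/0.35(~2.857143)],
+ *          erfc(x) = (1/x)*exp(-x*x-0.5625+R1/S1)
+ *          erf(x)  = 1 - erfc(x)
+ *      where
+ *          R1(z) = degree 7 poly in z, (z=1/x^2)
+ *          S1(z) = degree 8 poly in z
+ *
+ *   4. For x in [1/0.35,28]
+ *          erfc(x) = (1/x)*exp(-x*x-0.5625+R2/S2) if x > 0
+ *                  = 2.0 - (1/x)*exp(-x*x-0.5625+R2/S2) if -6<x<0
+ *                  = 2.0 - tiny            (if x <= -6)
+ *          erf(x)  = sign(x)*(1.0 - erfc(x)) if x < 6, else
+ *          erf(x)  = sign(x)*(1.0 - tiny)
+ *      where
+ *          R2(z) = degree 6 poly in z, (z=1/x^2)
+ *          S2(z) = degree 7 poly in z
+ *
+ *      Note1:
+ *         To compute exp(-x*x-0.5625+R/S), let s be a single
+ *         precision number and s := x; then
+ *              -x*x = -s*s + (s-x)*(s+x)
+ *              exp(-x*x-0.5625+R/S) =
+ *                      exp(-s*s-0.5625)*exp((s-x)*(s+x)+R/S);
+ *      Note2:
+ *         Here 4 and 5 make use of the asymptotic series
+ *                        exp(-x*x)
+ *              erfc(x) ~ ---------- * ( 1 + Poly(1/x^2) )
+ *                        x*sqrt(pi)
+ *         We use rational approximation to approximate
+ *              g(s)=f(1/x^2) = log(erfc(x)*x) - x*x + 0.5625
+ *         Here is the error bound for R1/S1 and R2/S2
+ *              |R1/S1 - f(x)|  < 2**(-62.57)
+ *              |R2/S2 - f(x)|  < 2**(-61.52)
+ *
+ *   5. For inf > x >= 28
+ *          erf(x)  = sign(x) *(1 - tiny)  (raise inexact)
+ *          erfc(x) = tiny*tiny (raise underflow) if x > 0
+ *                  = 2 - tiny if x<0
+ *
+ *   7.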
Special case:
+ *          erf(0)  = 0, erf(inf)  = 1, erf(-inf) = -1,
+ *          erfc(0) = 1, erfc(inf) = 0, erfc(-inf) = 2,
+ *          erfc/erf(NaN) is NaN
+ */
+
+#define AU0 -9.86494292470009928597e-03
+#define AU1 -7.99283237680523006574e-01
+#define AU2 -1.77579549177547519889e+01
+#define AU3 -1.60636384855821916062e+02
+#define AU4 -6.37566443368389627722e+02
+#define AU5 -1.02509513161107724954e+03
+#define AU6 -4.83519191608651397019e+02
+
+#define AV1 3.03380607434824582924e+01
+#define AV2 3.25792512996573918826e+02
+#define AV3 1.53672958608443695994e+03
+#define AV4 3.19985821950859553908e+03
+#define AV5 2.55305040643316442583e+03
+#define AV6 4.74528541206955367215e+02
+#define AV7 -2.24409524465858183362e+01
+
+#define BU0 -9.86494403484714822705e-03
+#define BU1 -6.93858572707181764372e-01
+#define BU2 -1.05586262253232909814e+01
+#define BU3 -6.23753324503260060396e+01
+#define BU4 -1.62396669462573470355e+02
+#define BU5 -1.84605092906711035994e+02
+#define BU6 -8.12874355063065934246e+01
+#define BU7 -9.81432934416914548592e+00
+
+#define BV1 1.96512716674392571292e+01
+#define BV2 1.37657754143519042600e+02
+#define BV3 4.34565877475229228821e+02
+#define BV4 6.45387271733267880336e+02
+#define BV5 4.29008140027567833386e+02
+#define BV6 1.08635005541779435134e+02
+#define BV7 6.57024977031928170135e+00
+#define BV8 -6.04244152148580987438e-02
+
+#define CU0 -2.36211856075265944077e-03
+#define CU1 4.14856118683748331666e-01
+#define CU2 -3.72207876035701323847e-01
+#define CU3 3.18346619901161753674e-01
+#define CU4 -1.10894694282396677476e-01
+#define CU5 3.54783043256182359371e-02
+#define CU6 -2.16637559486879084300e-03
+
+#define CV1 1.06420880400844228286e-01
+#define CV2 5.40397917702171048937e-01
+#define CV3 7.18286544141962662868e-02
+#define CV4 1.26171219808761642112e-01
+#define CV5 1.36370839120290507362e-02
+#define CV6 1.19844998467991074170e-02
+
+#define DU0 1.28379167095512558561e-01
+#define DU1 -3.25042107247001499370e-01
+#define DU2 -2.84817495755985104766e-02
+#define DU3 -5.77027029648944159157e-03
+#define DU4 -2.37630166566501626084e-05
+
+#define DV1 3.97917223959155352819e-01
+#define DV2 6.50222499887672944485e-02
+#define DV3 5.08130628187576562776e-03
+#define DV4 1.32494738004321644526e-04
+#define DV5 -3.96022827877536812320e-06
+
+_CLC_OVERLOAD _CLC_DEF double erf(double y) { // branch-free: every range is evaluated, selects pick one
+    double x = fabs(y);
+    double x2 = x * x;
+    double xm1 = x - 1.0;
+
+    // Poly variable
+    double t = 1.0 / x2;
+    t = x < 1.25 ? xm1 : t;
+    t = x < 0.84375 ? x2 : t;
+
+    double u, ut, v, vt;
+
+    // Evaluate rational poly
+    // XXX We need to see if we can grab 16 coefficients from a table
+    // faster than evaluating 3 of the poly pairs
+    // if (x < 6.0)
+    u = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AU6, AU5), AU4), AU3), AU2), AU1), AU0);
+    v = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, AV7, AV6), AV5), AV4), AV3), AV2), AV1);
+
+    ut = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BU7, BU6), BU5), BU4), BU3), BU2), BU1), BU0);
+    vt = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, BV8, BV7), BV6), BV5), BV4), BV3), BV2), BV1);
+    u = x < 0x1.6db6ep+1 ? ut : u;
+    v = x < 0x1.6db6ep+1 ? vt : v;
+
+    ut = fma(t, fma(t, fma(t, fma(t, fma(t, fma(t, CU6, CU5), CU4), CU3), CU2), CU1), CU0);
+    vt = fma(t, fma(t, fma(t, fma(t, fma(t, CV6, CV5), CV4), CV3), CV2), CV1);
+    u = x < 1.25 ? ut : u;
+    v = x < 1.25 ? vt : v;
+
+    ut = fma(t, fma(t, fma(t, fma(t, DU4, DU3), DU2), DU1), DU0);
+    vt = fma(t, fma(t, fma(t, fma(t, DV5, DV4), DV3), DV2), DV1);
+    u = x < 0.84375 ? ut : u;
+    v = x < 0.84375 ? vt : v;
+
+    v = fma(t, v, 1.0);
+
+    // Compute rational approximation
+    double q = u / v;
+
+    // Compute results
+    double z = as_double(as_long(x) & 0xffffffff00000000L);
+    double r = exp(-z * z - 0.5625) * exp((z - x) * (z + x) + q);
+    r = 1.0 - r / x;
+
+    double ret = x < 6.0 ? r : 1.0;
+
+    r = 8.45062911510467529297e-01 + q;
+    ret = x < 1.25 ? r : ret;
+
+    q = x < 0x1.0p-28 ? 1.28379167095512586316e-01 : q;
+
+    r = fma(x, q, x);
+    ret = x < 0.84375 ? r : ret;
+
+    ret = isnan(x) ? x : ret;
+
+    return copysign(ret, y); // ret is a magnitude; copysign also keeps erf(-0.0) == -0.0
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, erf, double);
+
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/expm1.cl libclc-0.2.0+git20170213/generic/lib/math/expm1.cl
--- libclc-0.2.0+git20150813/generic/lib/math/expm1.cl 1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/expm1.cl 2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,142 @@
+#include <clc/clc.h>
+
+#include "math.h"
+#include "tables.h"
+#include "../clcmacro.h"
+
+/* Refer to the exp routine for the underlying algorithm */
+
+_CLC_OVERLOAD _CLC_DEF float expm1(float x) {
+    const float X_MAX = 0x1.62e42ep+6f; // 128*log2 : 88.722839111673
+    const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184
+
+    const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657
+    const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227
+    const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388
+
+    uint xi = as_uint(x);
+    int n = (int)(x * R_64_BY_LOG2);
+    float fn = (float)n;
+
+    int j = n & 0x3f;
+    int m = n >> 6;
+
+    float r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x));
+
+    // Truncated Taylor series
+    float z2 = mad(r*r, mad(r, mad(r, 0x1.555556p-5f, 0x1.555556p-3f), 0.5f), r);
+
+    float m2 = as_float((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
+    float2 tv = USE_TABLE(exp_tbl_ep, j);
+
+    float two_to_jby64_h = tv.s0 * m2;
+    float two_to_jby64_t = tv.s1 * m2;
+    float two_to_jby64 = two_to_jby64_h + two_to_jby64_t;
+
+    z2 = mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f);
+    //Make subnormals work
+    z2 = x == 0.f ? x : z2;
+    z2 = x < X_MIN | m < -24 ? -1.0f : z2;
+    z2 = x > X_MAX ? as_float(PINFBITPATT_SP32) : z2;
+    z2 = isnan(x) ? x : z2;
+
+    return z2;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, expm1, float)
+
+#ifdef cl_khr_fp64
+
+#include "exp_helper.h"
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF double expm1(double x) {
+    const double max_expm1_arg = 709.8;
+    const double min_expm1_arg = -37.42994775023704;
+    const double log_OnePlus_OneByFour = 0.22314355131420976; //0x3FCC8FF7C79A9A22 = log(1+1/4)
+    const double log_OneMinus_OneByFour = -0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4)
+    const double sixtyfour_by_lnof2 = 92.33248261689366; //0x40571547652b82fe
+    const double lnof2_by_64_head = 0.010830424696223417; //0x3f862e42fefa0000
+    const double lnof2_by_64_tail = 2.5728046223276688e-14; //0x3d1cf79abc9e3b39
+
+    // First, assume log(1-1/4) < x < log(1+1/4) i.e -0.28768 < x < 0.22314
+    double u = as_double(as_ulong(x) & 0xffffffffff000000UL);
+    double v = x - u;
+    double y = u * u * 0.5;
+    double z = v * (x + u) * 0.5;
+
+    double q = fma(x,
+        fma(x,
+            fma(x,
+                fma(x,
+                    fma(x,
+                        fma(x,
+                            fma(x,
+                                fma(x,2.4360682937111612e-8, 2.7582184028154370e-7),
+                                2.7558212415361945e-6),
+                            2.4801576918453420e-5),
+                        1.9841269447671544e-4),
+                    1.3888888890687830e-3),
+                8.3333333334012270e-3),
+            4.1666666666665560e-2),
+        1.6666666666666632e-1);
+    q *= x * x * x;
+
+    double z1g = (u + y) + (q + (v + z));
+    double z1 = x + (y + (q + z));
+    z1 = y >= 0x1.0p-7 ? z1g : z1;
+
+    // Now assume outside interval around 0
+    int n = (int)(x * sixtyfour_by_lnof2);
+    int j = n & 0x3f;
+    int m = n >> 6;
+
+    double2 tv = USE_TABLE(two_to_jby64_ep_tbl, j);
+    double f1 = tv.s0;
+    double f2 = tv.s1;
+    double f = f1 + f2;
+
+    double dn = -n;
+    double r = fma(dn, lnof2_by_64_tail, fma(dn, lnof2_by_64_head, x));
+
+    q = fma(r,
+        fma(r,
+            fma(r,
+                fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03),
+                4.16666666662260795726e-02),
+            1.66666666665260878863e-01),
+        5.00000000000000008883e-01);
+    q = fma(r*r, q, r);
+
+    double twopm = as_double((long)(m + EXPBIAS_DP64) << EXPSHIFTBITS_DP64);
+    double twopmm = as_double((long)(EXPBIAS_DP64 - m) << EXPSHIFTBITS_DP64);
+
+    // Computations for m > 52, including where result is close to Inf
+    ulong uval = as_ulong(0x1.0p+1023 * (f1 + (f * q + (f2))));
+    int e = (int)(uval >> EXPSHIFTBITS_DP64) + 1;
+
+    double zme1024 = as_double(((long)e << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64));
+    zme1024 = e == 2047 ? as_double(PINFBITPATT_DP64) : zme1024;
+
+    double zmg52 = twopm * (f1 + fma(f, q, f2 - twopmm));
+    zmg52 = m == 1024 ? zme1024 : zmg52;
+
+    // For m < 53
+    double zml53 = twopm * ((f1 - twopmm) + fma(f1, q, f2*(1.0 + q)));
+
+    // For m < -7
+    double zmln7 = fma(twopm, f1 + fma(f, q, f2), -1.0);
+
+    z = m < 53 ? zml53 : zmg52;
+    z = m < -7 ? zmln7 : z;
+    z = x > log_OneMinus_OneByFour & x < log_OnePlus_OneByFour ? z1 : z;
+    z = x > max_expm1_arg ? as_double(PINFBITPATT_DP64) : z;
+    z = x < min_expm1_arg ? -1.0 : z;
+
+    return z;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, expm1, double)
+
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/fdim.cl libclc-0.2.0+git20170213/generic/lib/math/fdim.cl
--- libclc-0.2.0+git20150813/generic/lib/math/fdim.cl 1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/fdim.cl 2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <fdim.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/fdim.inc libclc-0.2.0+git20170213/generic/lib/math/fdim.inc
--- libclc-0.2.0+git20150813/generic/lib/math/fdim.inc 1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/fdim.inc 2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __CLC_FPSIZE == 32
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fdim(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+    if (__builtin_isnan(x) || __builtin_isnan(y))
+        return as_float(QNANBITPATT_SP32);
+    return fmax(x - y, 0.0f);
+}
+#define __CLC_FDIM_VEC(width) \
+_CLC_OVERLOAD _CLC_DEF float##width fdim(float##width x, float##width y) { \
+    /* Determine if x or y is NaN. */ \
+    /* Vector true is -1, i.e. all-bits-set, and NaN==NaN is false. */ \
+    /* If neither is NaN, ~((x==x) & (y==y)) is 0 (i.e. ~(-1)), as is n; otherwise n is the quiet-NaN pattern. */ \
+    int##width n = ~((x == x) & (y == y)) & QNANBITPATT_SP32; \
+    /* Calculate x-y if x>y, otherwise positive 0, again taking */ \
+    /* advantage of vector true being all-bits-set. */ \
+    int##width r = (x > y) & as_int##width(x - y); \
+    return as_float##width(n | r); \
+}
+__CLC_FDIM_VEC(2)
+__CLC_FDIM_VEC(3)
+__CLC_FDIM_VEC(4)
+__CLC_FDIM_VEC(8)
+__CLC_FDIM_VEC(16)
+#undef __CLC_FDIM_VEC
+#endif
+#endif
+
+#if __CLC_FPSIZE == 64
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE fdim(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+    long n = -(isnan(x) | isnan(y)) & QNANBITPATT_DP64; /* scalar true is 1, so negate to get an all-ones mask */
+    long r = -(x > y) & as_long(x - y);
+    return as_double(n | r);
+}
+#define __CLC_FDIM_VEC(width) \
+_CLC_OVERLOAD _CLC_DEF double##width fdim(double##width x, double##width y) { \
+    /* See comment in float implementation for explanation. */ \
+    long##width n = ~((x == x) & (y == y)) & QNANBITPATT_DP64; \
+    long##width r = (x > y) & as_long##width(x - y); \
+    return as_double##width(n | r); \
+}
+__CLC_FDIM_VEC(2)
+__CLC_FDIM_VEC(3)
+__CLC_FDIM_VEC(4)
+__CLC_FDIM_VEC(8)
+__CLC_FDIM_VEC(16)
+#undef __CLC_FDIM_VEC
+#endif
+#endif
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/frexp.cl libclc-0.2.0+git20170213/generic/lib/math/frexp.cl
--- libclc-0.2.0+git20150813/generic/lib/math/frexp.cl 1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/frexp.cl 2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,10 @@
+#include <clc/clc.h>
+
+#include "math.h"
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+#define __CLC_BODY <frexp.inc>
+#include <clc/math/gentype.inc>
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/frexp.inc libclc-0.2.0+git20170213/generic/lib/math/frexp.inc
--- libclc-0.2.0+git20150813/generic/lib/math/frexp.inc 1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/frexp.inc 2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __CLC_FPSIZE == 32
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(float x, private int *ep) {
+    int i = as_int(x);
+    int ai = i & 0x7fffffff;
+    int d = ai > 0 & ai < 0x00800000; // nonzero subnormal
+    // scale subnormal by 2^26 without multiplying
+    float s = as_float(ai | 0x0d800000) - 0x1.0p-100F;
+    ai = d ? as_int(s) : ai;
+    int e = (ai >> 23) - 126 - (d ? 26 : 0);
+    int t = ai == 0 | e == 129; // zero, or all-ones exponent field (255 - 126): Inf/NaN
+    i = (i & 0x80000000) | 0x3f000000 | (ai & 0x007fffff);
+    *ep = t ? 0 : e;
+    return t ? x : as_float(i);
+}
+#define __CLC_FREXP_VEC(width) \
+_CLC_OVERLOAD _CLC_DEF float##width frexp(float##width x, private int##width *ep) { \
+    int##width i = as_int##width(x); \
+    int##width ai = i & 0x7fffffff; \
+    int##width d = ai > 0 & ai < 0x00800000; \
+    /* scale subnormal by 2^26 without multiplying */ \
+    float##width s = as_float##width(ai | 0x0d800000) - 0x1.0p-100F; \
+    ai = bitselect(ai, as_int##width(s), d); \
+    int##width e = (ai >> 23) - 126 - bitselect((int##width)0, (int##width)26, d); \
+    int##width t = ai == (int##width)0 | e == (int##width)129; \
+    i = (i & (int##width)0x80000000) | (int##width)0x3f000000 | (ai & 0x007fffff); \
+    *ep = bitselect(e, (int##width)0, t); \
+    return bitselect(as_float##width(i), x, as_float##width(t)); \
+}
+__CLC_FREXP_VEC(2)
+__CLC_FREXP_VEC(3)
+__CLC_FREXP_VEC(4)
+__CLC_FREXP_VEC(8)
+__CLC_FREXP_VEC(16)
+#undef __CLC_FREXP_VEC
+#endif
+#endif
+
+#if __CLC_FPSIZE == 64
+#ifdef __CLC_SCALAR
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, private __CLC_INTN *ep) {
+    long i = as_long(x);
+    long ai = i & 0x7fffffffffffffffL;
+    int d = ai > 0 & ai < 0x0010000000000000L; // nonzero subnormal
+    // scale subnormal by 2^54 without multiplying
+    double s = as_double(ai | 0x0370000000000000L) - 0x1.0p-968;
+    ai = d ? as_long(s) : ai;
+    int e = (int)(ai >> 52) - 1022 - (d ? 54 : 0);
+    int t = ai == 0 | e == 1025; // zero, or all-ones exponent field (2047 - 1022): Inf/NaN
+    i = (i & 0x8000000000000000L) | 0x3fe0000000000000L | (ai & 0x000fffffffffffffL);
+    *ep = t ? 0 : e;
+    return t ? x : as_double(i);
+}
+#define __CLC_FREXP_VEC(width) \
+_CLC_OVERLOAD _CLC_DEF double##width frexp(double##width x, private int##width *ep) { \
+    long##width i = as_long##width(x); \
+    long##width ai = i & 0x7fffffffffffffffL; \
+    long##width d = ai > 0 & ai < 0x0010000000000000L; \
+    /* scale subnormal by 2^54 without multiplying */ \
+    double##width s = as_double##width(ai | 0x0370000000000000L) - 0x1.0p-968; \
+    ai = bitselect(ai, as_long##width(s), d); \
+    int##width e = convert_int##width(ai >> 52) - 1022 - bitselect((int##width)0, (int##width)54, convert_int##width(d)); \
+    /* 1025 = 2047 - 1022 marks Inf/NaN, as in the scalar double path (129 is the float value) */ \
+    int##width t = convert_int##width(ai == (long##width)0) | (e == (int##width)1025); \
+    i = (i & (long##width)0x8000000000000000L) | (long##width)0x3fe0000000000000L | (ai & 0x000fffffffffffffL); \
+    *ep = bitselect(e, (int##width)0, t); \
+    return bitselect(as_double##width(i), x, as_double##width(convert_long##width(t))); \
+}
+__CLC_FREXP_VEC(2)
+__CLC_FREXP_VEC(3)
+__CLC_FREXP_VEC(4)
+__CLC_FREXP_VEC(8)
+__CLC_FREXP_VEC(16)
+#undef __CLC_FREXP_VEC
+#endif
+#endif
+
+#define __CLC_FREXP_DEF(addrspace) \
+  _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE frexp(__CLC_GENTYPE x, addrspace __CLC_INTN *iptr) { \
+    __CLC_INTN private_iptr; \
+    __CLC_GENTYPE ret = frexp(x, &private_iptr); \
+    *iptr = private_iptr; \
+    return ret; \
+}
+
+__CLC_FREXP_DEF(local);
+__CLC_FREXP_DEF(global);
+
+#undef __CLC_FREXP_DEF
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/ilogb.cl libclc-0.2.0+git20170213/generic/lib/math/ilogb.cl
--- libclc-0.2.0+git20150813/generic/lib/math/ilogb.cl 1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/ilogb.cl 2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2015 Advanced
Micro Devices, Inc.
+ * Copyright (c) 2016 Aaron Watry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <clc/clc.h>
+#include "../clcmacro.h"
+#include "math.h"
+
+_CLC_OVERLOAD _CLC_DEF int ilogb(float x) {
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+    int rs = -118 - (int) clz(ux & MANTBITS_SP32); // subnormal exponent from the leading mantissa bit
+    int r = (int) (ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+    r = ax < 0x00800000U ? rs : r; // below the smallest normal: use the subnormal result
+    r = ax > EXPBITS_SP32 | ax == 0 ? 0x80000000 : r; // NaN or zero
+    r = ax == EXPBITS_SP32 ? 0x7fffffff : r; // infinity
+    return r;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ilogb, float);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_OVERLOAD _CLC_DEF int ilogb(double x) {
+    ulong ux = as_ulong(x);
+    ulong ax = ux & ~SIGNBIT_DP64;
+    int r = (int) (ax >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+    int rs = -1011 - (int) clz(ax & MANTBITS_DP64); // subnormal exponent from the leading mantissa bit
+    r = ax < 0x0010000000000000UL ? rs : r; // below the smallest normal: use the subnormal result
+    r = ax > 0x7ff0000000000000UL | ax == 0UL ? 0x80000000 : r; // NaN or zero
+    r = ax == 0x7ff0000000000000UL ? 0x7fffffff : r; // infinity
+    return r;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, ilogb, double);
+
+#endif // cl_khr_fp64
diff -Nru libclc-0.2.0+git20150813/generic/lib/math/lgamma.cl libclc-0.2.0+git20170213/generic/lib/math/lgamma.cl
--- libclc-0.2.0+git20150813/generic/lib/math/lgamma.cl 1970-01-01 00:00:00.000000000 +0000
+++ libclc-0.2.0+git20170213/generic/lib/math/lgamma.cl 2017-02-12 21:33:49.000000000 +0000
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016 Aaron Watry
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */ + +#include +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float lgamma(float x) { + int s; + return lgamma_r(x, &s); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, lgamma, float) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double lgamma(double x) { + int s; + return lgamma_r(x, &s); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, lgamma, double) + +#endif \ No newline at end of file diff -Nru libclc-0.2.0+git20150813/generic/lib/math/lgamma_r.cl libclc-0.2.0+git20170213/generic/lib/math/lgamma_r.cl --- libclc-0.2.0+git20150813/generic/lib/math/lgamma_r.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/lgamma_r.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +#include "../clcmacro.h" +#include "math.h" + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/lib/math/lgamma_r.inc libclc-0.2.0+git20170213/generic/lib/math/lgamma_r.inc --- libclc-0.2.0+git20150813/generic/lib/math/lgamma_r.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/lgamma_r.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * Copyright (c) 2016 Aaron Watry + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#if __CLC_FPSIZE == 32 +#ifdef __CLC_SCALAR +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +#define pi_f 3.1415927410e+00f /* 0x40490fdb */ + +#define a0_f 7.7215664089e-02f /* 0x3d9e233f */ +#define a1_f 3.2246702909e-01f /* 0x3ea51a66 */ +#define a2_f 6.7352302372e-02f /* 0x3d89f001 */ +#define a3_f 2.0580807701e-02f /* 0x3ca89915 */ +#define a4_f 7.3855509982e-03f /* 0x3bf2027e */ +#define a5_f 2.8905137442e-03f /* 0x3b3d6ec6 */ +#define a6_f 1.1927076848e-03f /* 0x3a9c54a1 */ +#define a7_f 5.1006977446e-04f /* 0x3a05b634 */ +#define a8_f 2.2086278477e-04f /* 0x39679767 */ +#define a9_f 1.0801156895e-04f /* 0x38e28445 */ +#define a10_f 2.5214456400e-05f /* 0x37d383a2 */ +#define a11_f 4.4864096708e-05f /* 0x383c2c75 */ + +#define tc_f 1.4616321325e+00f /* 0x3fbb16c3 */ + +#define tf_f -1.2148628384e-01f /* 0xbdf8cdcd */ +/* tt -(tail of tf) */ +#define tt_f 6.6971006518e-09f /* 0x31e61c52 */ + +#define t0_f 4.8383611441e-01f /* 0x3ef7b95e */ +#define t1_f -1.4758771658e-01f /* 0xbe17213c */ +#define t2_f 6.4624942839e-02f /* 0x3d845a15 */ +#define t3_f -3.2788541168e-02f /* 0xbd064d47 */ +#define t4_f 1.7970675603e-02f /* 0x3c93373d 
*/ +#define t5_f -1.0314224288e-02f /* 0xbc28fcfe */ +#define t6_f 6.1005386524e-03f /* 0x3bc7e707 */ +#define t7_f -3.6845202558e-03f /* 0xbb7177fe */ +#define t8_f 2.2596477065e-03f /* 0x3b141699 */ +#define t9_f -1.4034647029e-03f /* 0xbab7f476 */ +#define t10_f 8.8108185446e-04f /* 0x3a66f867 */ +#define t11_f -5.3859531181e-04f /* 0xba0d3085 */ +#define t12_f 3.1563205994e-04f /* 0x39a57b6b */ +#define t13_f -3.1275415677e-04f /* 0xb9a3f927 */ +#define t14_f 3.3552918467e-04f /* 0x39afe9f7 */ + +#define u0_f -7.7215664089e-02f /* 0xbd9e233f */ +#define u1_f 6.3282704353e-01f /* 0x3f2200f4 */ +#define u2_f 1.4549225569e+00f /* 0x3fba3ae7 */ +#define u3_f 9.7771751881e-01f /* 0x3f7a4bb2 */ +#define u4_f 2.2896373272e-01f /* 0x3e6a7578 */ +#define u5_f 1.3381091878e-02f /* 0x3c5b3c5e */ + +#define v1_f 2.4559779167e+00f /* 0x401d2ebe */ +#define v2_f 2.1284897327e+00f /* 0x4008392d */ +#define v3_f 7.6928514242e-01f /* 0x3f44efdf */ +#define v4_f 1.0422264785e-01f /* 0x3dd572af */ +#define v5_f 3.2170924824e-03f /* 0x3b52d5db */ + +#define s0_f -7.7215664089e-02f /* 0xbd9e233f */ +#define s1_f 2.1498242021e-01f /* 0x3e5c245a */ +#define s2_f 3.2577878237e-01f /* 0x3ea6cc7a */ +#define s3_f 1.4635047317e-01f /* 0x3e15dce6 */ +#define s4_f 2.6642270386e-02f /* 0x3cda40e4 */ +#define s5_f 1.8402845599e-03f /* 0x3af135b4 */ +#define s6_f 3.1947532989e-05f /* 0x3805ff67 */ + +#define r1_f 1.3920053244e+00f /* 0x3fb22d3b */ +#define r2_f 7.2193557024e-01f /* 0x3f38d0c5 */ +#define r3_f 1.7193385959e-01f /* 0x3e300f6e */ +#define r4_f 1.8645919859e-02f /* 0x3c98bf54 */ +#define r5_f 7.7794247773e-04f /* 0x3a4beed6 */ +#define r6_f 7.3266842264e-06f /* 0x36f5d7bd */ + +#define w0_f 4.1893854737e-01f /* 0x3ed67f1d */ +#define w1_f 8.3333335817e-02f /* 0x3daaaaab */ +#define w2_f -2.7777778450e-03f /* 0xbb360b61 */ +#define w3_f 7.9365057172e-04f /* 0x3a500cfd */ +#define w4_f -5.9518753551e-04f /* 0xba1c065c */ +#define w5_f 8.3633989561e-04f /* 0x3a5b3dd2 */ +#define 
w6_f -1.6309292987e-03f /* 0xbad5c4e8 */ + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE lgamma_r(float x, private int *signp) { + int hx = as_int(x); + int ix = hx & 0x7fffffff; + float absx = as_float(ix); + + if (ix >= 0x7f800000) { + *signp = 1; + return x; + } + + if (absx < 0x1.0p-70f) { + *signp = hx < 0 ? -1 : 1; + return -log(absx); + } + + float r; + + if (absx == 1.0f | absx == 2.0f) + r = 0.0f; + + else if (absx < 2.0f) { + float y = 2.0f - absx; + int i = 0; + + int c = absx < 0x1.bb4c30p+0f; + float yt = absx - tc_f; + y = c ? yt : y; + i = c ? 1 : i; + + c = absx < 0x1.3b4c40p+0f; + yt = absx - 1.0f; + y = c ? yt : y; + i = c ? 2 : i; + + r = -log(absx); + yt = 1.0f - absx; + c = absx <= 0x1.ccccccp-1f; + r = c ? r : 0.0f; + y = c ? yt : y; + i = c ? 0 : i; + + c = absx < 0x1.769440p-1f; + yt = absx - (tc_f - 1.0f); + y = c ? yt : y; + i = c ? 1 : i; + + c = absx < 0x1.da6610p-3f; + y = c ? absx : y; + i = c ? 2 : i; + + float z, w, p1, p2, p3, p; + switch (i) { + case 0: + z = y * y; + p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10_f, a8_f), a6_f), a4_f), a2_f), a0_f); + p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11_f, a9_f), a7_f), a5_f), a3_f), a1_f); + p = mad(y, p1, p2); + r += mad(y, -0.5f, p); + break; + case 1: + z = y * y; + w = z * y; + p1 = mad(w, mad(w, mad(w, mad(w, t12_f, t9_f), t6_f), t3_f), t0_f); + p2 = mad(w, mad(w, mad(w, mad(w, t13_f, t10_f), t7_f), t4_f), t1_f); + p3 = mad(w, mad(w, mad(w, mad(w, t14_f, t11_f), t8_f), t5_f), t2_f); + p = mad(z, p1, -mad(w, -mad(y, p3, p2), tt_f)); + r += tf_f + p; + break; + case 2: + p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5_f, u4_f), u3_f), u2_f), u1_f), u0_f); + p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5_f, v4_f), v3_f), v2_f), v1_f), 1.0f); + r += mad(y, -0.5f, MATH_DIVIDE(p1, p2)); + break; + } + } else if (absx < 8.0f) { + int i = (int) absx; + float y = absx - (float) i; + float p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6_f, s5_f), s4_f), s3_f), s2_f), s1_f), s0_f); + float q = mad(y, 
mad(y, mad(y, mad(y, mad(y, mad(y, r6_f, r5_f), r4_f), r3_f), r2_f), r1_f), 1.0f); + r = mad(y, 0.5f, MATH_DIVIDE(p, q)); + + float y6 = y + 6.0f; + float y5 = y + 5.0f; + float y4 = y + 4.0f; + float y3 = y + 3.0f; + float y2 = y + 2.0f; + + float z = 1.0f; + z *= i > 6 ? y6 : 1.0f; + z *= i > 5 ? y5 : 1.0f; + z *= i > 4 ? y4 : 1.0f; + z *= i > 3 ? y3 : 1.0f; + z *= i > 2 ? y2 : 1.0f; + + r += log(z); + } else if (absx < 0x1.0p+58f) { + float z = 1.0f / absx; + float y = z * z; + float w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6_f, w5_f), w4_f), w3_f), w2_f), w1_f), w0_f); + r = mad(absx - 0.5f, log(absx) - 1.0f, w); + } else + // 2**58 <= x <= Inf + r = absx * (log(absx) - 1.0f); + + int s = 1; + + if (x < 0.0f) { + float t = sinpi(x); + r = log(pi_f / fabs(t * x)) - r; + r = t == 0.0f ? as_float(PINFBITPATT_SP32) : r; + s = t < 0.0f ? -1 : s; + } + + *signp = s; + return r; +} + +_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, lgamma_r, float, private, int) + +#endif +#endif + +#if __CLC_FPSIZE == 64 +#ifdef __CLC_SCALAR +// ==================================================== +// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. +// +// Developed at SunPro, a Sun Microsystems, Inc. business. +// Permission to use, copy, modify, and distribute this +// software is freely granted, provided that this notice +// is preserved. +// ==================================================== + +// lgamma_r(x, i) +// Reentrant version of the logarithm of the Gamma function +// with user provide pointer for the sign of Gamma(x). +// +// Method: +// 1. Argument Reduction for 0 < x <= 8 +// Since gamma(1+s)=s*gamma(s), for x in [0,8], we may +// reduce x to a number in [1.5,2.5] by +// lgamma(1+s) = log(s) + lgamma(s) +// for example, +// lgamma(7.3) = log(6.3) + lgamma(6.3) +// = log(6.3*5.3) + lgamma(5.3) +// = log(6.3*5.3*4.3*3.3*2.3) + lgamma(2.3) +// 2. 
Polynomial approximation of lgamma around its +// minimum ymin=1.461632144968362245 to maintain monotonicity. +// On [ymin-0.23, ymin+0.27] (i.e., [1.23164,1.73163]), use +// z = x-ymin; +// lgamma(x) = -0.1214862905358496078218 + z^2*poly(z) +// where +// poly(z) is a 14 degree polynomial. +// 3. Rational approximation in the primary interval [2,3] +// We use the following approximation: +// s = x-2.0; +// lgamma(x) = 0.5*s + s*P(s)/Q(s) +// with accuracy +// |P/Q - (lgamma(x)-0.5s)| < 2**-61.71 +// Our algorithms are based on the following observation: +// +// lgamma(2+s) = s*(1-Euler) + (zeta(2)-1)/2 * s^2 +// - (zeta(3)-1)/3 * s^3 +// + ... +// +// where Euler = 0.5772... is the Euler constant, which is very +// close to 0.5. +// +// 4. For x>=8, we have +// lgamma(x)~(x-0.5)log(x)-x+0.5*log(2pi)+1/(12x)-1/(360x**3)+.... +// (better formula: +// lgamma(x)~(x-0.5)*(log(x)-1)+.5*(log(2pi)-1) + ...) +// Let z = 1/x, then we approximate +// f(z) = lgamma(x) - (x-0.5)(log(x)-1) +// by +// w = w0 + w1*z + w2*z^3 + w3*z^5 +// + ... + w6*z^11 +// where +// |w - f(z)| < 2**-58.74 +// +// 5. For negative x, since (G is gamma function) +// -x*G(-x)*G(x) = pi/sin(pi*x), +// we have +// G(x) = pi/(sin(pi*x)*(-x)*G(-x)) +// since G(-x) is positive, sign(G(x)) = sign(sin(pi*x)) for x<0 +// Hence, for x<0, signgam = sign(sin(pi*x)) and +// lgamma(x) = log(|Gamma(x)|) +// = log(pi/(|x*sin(pi*x)|)) - lgamma(-x); +// Note: one should avoid computing pi*(-x) directly in the +// computation of sin(pi*(-x)). +// +// 6. 
Special Cases +// lgamma(2+s) ~ s*(1-Euler) for tiny s +// lgamma(1)=lgamma(2)=0 +// lgamma(x) ~ -log(x) for tiny x +// lgamma(0) = lgamma(inf) = inf +// lgamma(-integer) = +-inf +// +#define pi 3.14159265358979311600e+00 /* 0x400921FB, 0x54442D18 */ + +#define a0 7.72156649015328655494e-02 /* 0x3FB3C467, 0xE37DB0C8 */ +#define a1 3.22467033424113591611e-01 /* 0x3FD4A34C, 0xC4A60FAD */ +#define a2 6.73523010531292681824e-02 /* 0x3FB13E00, 0x1A5562A7 */ +#define a3 2.05808084325167332806e-02 /* 0x3F951322, 0xAC92547B */ +#define a4 7.38555086081402883957e-03 /* 0x3F7E404F, 0xB68FEFE8 */ +#define a5 2.89051383673415629091e-03 /* 0x3F67ADD8, 0xCCB7926B */ +#define a6 1.19270763183362067845e-03 /* 0x3F538A94, 0x116F3F5D */ +#define a7 5.10069792153511336608e-04 /* 0x3F40B6C6, 0x89B99C00 */ +#define a8 2.20862790713908385557e-04 /* 0x3F2CF2EC, 0xED10E54D */ +#define a9 1.08011567247583939954e-04 /* 0x3F1C5088, 0x987DFB07 */ +#define a10 2.52144565451257326939e-05 /* 0x3EFA7074, 0x428CFA52 */ +#define a11 4.48640949618915160150e-05 /* 0x3F07858E, 0x90A45837 */ + +#define tc 1.46163214496836224576e+00 /* 0x3FF762D8, 0x6356BE3F */ +#define tf -1.21486290535849611461e-01 /* 0xBFBF19B9, 0xBCC38A42 */ +#define tt -3.63867699703950536541e-18 /* 0xBC50C7CA, 0xA48A971F */ + +#define t0 4.83836122723810047042e-01 /* 0x3FDEF72B, 0xC8EE38A2 */ +#define t1 -1.47587722994593911752e-01 /* 0xBFC2E427, 0x8DC6C509 */ +#define t2 6.46249402391333854778e-02 /* 0x3FB08B42, 0x94D5419B */ +#define t3 -3.27885410759859649565e-02 /* 0xBFA0C9A8, 0xDF35B713 */ +#define t4 1.79706750811820387126e-02 /* 0x3F9266E7, 0x970AF9EC */ +#define t5 -1.03142241298341437450e-02 /* 0xBF851F9F, 0xBA91EC6A */ +#define t6 6.10053870246291332635e-03 /* 0x3F78FCE0, 0xE370E344 */ +#define t7 -3.68452016781138256760e-03 /* 0xBF6E2EFF, 0xB3E914D7 */ +#define t8 2.25964780900612472250e-03 /* 0x3F6282D3, 0x2E15C915 */ +#define t9 -1.40346469989232843813e-03 /* 0xBF56FE8E, 0xBF2D1AF1 */ +#define t10 
8.81081882437654011382e-04 /* 0x3F4CDF0C, 0xEF61A8E9 */ +#define t11 -5.38595305356740546715e-04 /* 0xBF41A610, 0x9C73E0EC */ +#define t12 3.15632070903625950361e-04 /* 0x3F34AF6D, 0x6C0EBBF7 */ +#define t13 -3.12754168375120860518e-04 /* 0xBF347F24, 0xECC38C38 */ +#define t14 3.35529192635519073543e-04 /* 0x3F35FD3E, 0xE8C2D3F4 */ + +#define u0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */ +#define u1 6.32827064025093366517e-01 /* 0x3FE4401E, 0x8B005DFF */ +#define u2 1.45492250137234768737e+00 /* 0x3FF7475C, 0xD119BD6F */ +#define u3 9.77717527963372745603e-01 /* 0x3FEF4976, 0x44EA8450 */ +#define u4 2.28963728064692451092e-01 /* 0x3FCD4EAE, 0xF6010924 */ +#define u5 1.33810918536787660377e-02 /* 0x3F8B678B, 0xBF2BAB09 */ + +#define v1 2.45597793713041134822e+00 /* 0x4003A5D7, 0xC2BD619C */ +#define v2 2.12848976379893395361e+00 /* 0x40010725, 0xA42B18F5 */ +#define v3 7.69285150456672783825e-01 /* 0x3FE89DFB, 0xE45050AF */ +#define v4 1.04222645593369134254e-01 /* 0x3FBAAE55, 0xD6537C88 */ +#define v5 3.21709242282423911810e-03 /* 0x3F6A5ABB, 0x57D0CF61 */ + +#define s0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */ +#define s1 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */ +#define s2 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */ +#define s3 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */ +#define s4 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */ +#define s5 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */ +#define s6 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */ + +#define r1 1.39200533467621045958e+00 /* 0x3FF645A7, 0x62C4AB74 */ +#define r2 7.21935547567138069525e-01 /* 0x3FE71A18, 0x93D3DCDC */ +#define r3 1.71933865632803078993e-01 /* 0x3FC601ED, 0xCCFBDF27 */ +#define r4 1.86459191715652901344e-02 /* 0x3F9317EA, 0x742ED475 */ +#define r5 7.77942496381893596434e-04 /* 0x3F497DDA, 0xCA41A95B */ +#define r6 7.32668430744625636189e-06 /* 0x3EDEBAF7, 0xA5B38140 */ + +#define w0 
4.18938533204672725052e-01 /* 0x3FDACFE3, 0x90C97D69 */ +#define w1 8.33333333333329678849e-02 /* 0x3FB55555, 0x5555553B */ +#define w2 -2.77777777728775536470e-03 /* 0xBF66C16C, 0x16B02E5C */ +#define w3 7.93650558643019558500e-04 /* 0x3F4A019F, 0x98CF38B6 */ +#define w4 -5.95187557450339963135e-04 /* 0xBF4380CB, 0x8C0FE741 */ +#define w5 8.36339918996282139126e-04 /* 0x3F4B67BA, 0x4CDAD5D1 */ +#define w6 -1.63092934096575273989e-03 /* 0xBF5AB89D, 0x0B9E43E4 */ + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, private __CLC_INTN *ip) { + ulong ux = as_ulong(x); + ulong ax = ux & EXSIGNBIT_DP64; + double absx = as_double(ax); + + if (ax >= 0x7ff0000000000000UL) { + // +-Inf, NaN + *ip = 1; + return absx; + } + + if (absx < 0x1.0p-70) { + *ip = ax == ux ? 1 : -1; + return -log(absx); + } + + // Handle rest of range + double r; + + if (absx < 2.0) { + int i = 0; + double y = 2.0 - absx; + + int c = absx < 0x1.bb4c3p+0; + double t = absx - tc; + i = c ? 1 : i; + y = c ? t : y; + + c = absx < 0x1.3b4c4p+0; + t = absx - 1.0; + i = c ? 2 : i; + y = c ? t : y; + + c = absx <= 0x1.cccccp-1; + t = -log(absx); + r = c ? t : 0.0; + t = 1.0 - absx; + i = c ? 0 : i; + y = c ? t : y; + + c = absx < 0x1.76944p-1; + t = absx - (tc - 1.0); + i = c ? 1 : i; + y = c ? t : y; + + c = absx < 0x1.da661p-3; + i = c ? 2 : i; + y = c ? 
absx : y; + + double p, q; + + switch (i) { + case 0: + p = fma(y, fma(y, fma(y, fma(y, a11, a10), a9), a8), a7); + p = fma(y, fma(y, fma(y, fma(y, p, a6), a5), a4), a3); + p = fma(y, fma(y, fma(y, p, a2), a1), a0); + r = fma(y, p - 0.5, r); + break; + case 1: + p = fma(y, fma(y, fma(y, fma(y, t14, t13), t12), t11), t10); + p = fma(y, fma(y, fma(y, fma(y, fma(y, p, t9), t8), t7), t6), t5); + p = fma(y, fma(y, fma(y, fma(y, fma(y, p, t4), t3), t2), t1), t0); + p = fma(y*y, p, -tt); + r += (tf + p); + break; + case 2: + p = y * fma(y, fma(y, fma(y, fma(y, fma(y, u5, u4), u3), u2), u1), u0); + q = fma(y, fma(y, fma(y, fma(y, fma(y, v5, v4), v3), v2), v1), 1.0); + r += fma(-0.5, y, p / q); + } + } else if (absx < 8.0) { + int i = absx; + double y = absx - (double) i; + double p = y * fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, s6, s5), s4), s3), s2), s1), s0); + double q = fma(y, fma(y, fma(y, fma(y, fma(y, fma(y, r6, r5), r4), r3), r2), r1), 1.0); + r = fma(0.5, y, p / q); + double z = 1.0; + // lgamma(1+s) = log(s) + lgamma(s) + double y6 = y + 6.0; + double y5 = y + 5.0; + double y4 = y + 4.0; + double y3 = y + 3.0; + double y2 = y + 2.0; + z *= i > 6 ? y6 : 1.0; + z *= i > 5 ? y5 : 1.0; + z *= i > 4 ? y4 : 1.0; + z *= i > 3 ? y3 : 1.0; + z *= i > 2 ? y2 : 1.0; + r += log(z); + } else { + double z = 1.0 / absx; + double z2 = z * z; + double w = fma(z, fma(z2, fma(z2, fma(z2, fma(z2, fma(z2, w6, w5), w4), w3), w2), w1), w0); + r = (absx - 0.5) * (log(absx) - 1.0) + w; + } + + if (x < 0.0) { + double t = sinpi(x); + r = log(pi / fabs(t * x)) - r; + r = t == 0.0 ? as_double(PINFBITPATT_DP64) : r; + *ip = t < 0.0 ? 
-1 : 1; + } else + *ip = 1; + + return r; +} + +_CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, lgamma_r, double, private, int) +#endif +#endif + +#define __CLC_LGAMMA_R_DEF(addrspace) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE lgamma_r(__CLC_GENTYPE x, addrspace __CLC_INTN *iptr) { \ + __CLC_INTN private_iptr; \ + __CLC_GENTYPE ret = lgamma_r(x, &private_iptr); \ + *iptr = private_iptr; \ + return ret; \ +} +__CLC_LGAMMA_R_DEF(local); +__CLC_LGAMMA_R_DEF(global); + +#undef __CLC_LGAMMA_R_DEF diff -Nru libclc-0.2.0+git20150813/generic/lib/math/log2.cl libclc-0.2.0+git20170213/generic/lib/math/log2.cl --- libclc-0.2.0+git20150813/generic/lib/math/log2.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/log2.cl 2017-02-12 21:33:49.000000000 +0000 @@ -34,4 +34,6 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, log2, float); +#ifdef cl_khr_fp64 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, log2, double); +#endif // cl_khr_fp64 diff -Nru libclc-0.2.0+git20150813/generic/lib/math/logb.cl libclc-0.2.0+git20170213/generic/lib/math/logb.cl --- libclc-0.2.0+git20150813/generic/lib/math/logb.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/logb.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,31 @@ +#include +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float logb(float x) { + int ax = as_int(x) & EXSIGNBIT_SP32; + float s = -118 - clz(ax); + float r = (ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + r = ax >= PINFBITPATT_SP32 ? as_float(ax) : r; + r = ax < 0x00800000 ? s : r; + r = ax == 0 ? 
as_float(NINFBITPATT_SP32) : r; + return r; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, logb, float); + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double logb(double x) { + long ax = as_long(x) & EXSIGNBIT_DP64; + double s = -1011L - clz(ax); + double r = (int) (ax >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + r = ax >= PINFBITPATT_DP64 ? as_double(ax) : r; + r = ax < 0x0010000000000000L ? s : r; + r = ax == 0L ? as_double(NINFBITPATT_DP64) : r; + return r; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, logb, double) +#endif diff -Nru libclc-0.2.0+git20150813/generic/lib/math/modf.cl libclc-0.2.0+git20170213/generic/lib/math/modf.cl --- libclc-0.2.0+git20150813/generic/lib/math/modf.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/modf.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include + +#include "math.h" + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#endif + +#define __CLC_BODY +#include diff -Nru libclc-0.2.0+git20150813/generic/lib/math/modf.inc libclc-0.2.0+git20170213/generic/lib/math/modf.inc --- libclc-0.2.0+git20150813/generic/lib/math/modf.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/modf.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, __CLC_GENTYPE *iptr) { + *iptr = trunc(x); + return copysign(isinf(x) ? 0.0f : x - *iptr, x); +} + +#define MODF_DEF(addrspace) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \ + __CLC_GENTYPE private_iptr; \ + __CLC_GENTYPE ret = modf(x, &private_iptr); \ + *iptr = private_iptr; \ + return ret; \ +} + +MODF_DEF(local); +MODF_DEF(global); diff -Nru libclc-0.2.0+git20150813/generic/lib/math/tables.cl libclc-0.2.0+git20170213/generic/lib/math/tables.cl --- libclc-0.2.0+git20150813/generic/lib/math/tables.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/tables.cl 2017-02-12 21:33:49.000000000 +0000 @@ -435,6 +435,315 @@ 230, 139, 2, 0, 0, 0, 0, 0, 0, 0 }; +// Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. +DECLARE_TABLE(float2, SINHCOSH_TBL, 37) = { + (float2)(0x0.000000p+0f, 0x1.000000p+0f), + (float2)(0x1.2cd9fcp+0f, 0x1.8b0756p+0f), + (float2)(0x1.d03cf6p+1f, 0x1.e18fa0p+1f), + (float2)(0x1.40926ep+3f, 0x1.422a4ap+3f), + (float2)(0x1.b4a380p+4f, 0x1.b4ee86p+4f), + (float2)(0x1.28d016p+6f, 0x1.28d6fcp+6f), + (float2)(0x1.936d22p+7f, 0x1.936e68p+7f), + (float2)(0x1.122876p+9f, 0x1.122894p+9f), + (float2)(0x1.749ea6p+10f, 0x1.749eaap+10f), + (float2)(0x1.fa7158p+11f, 0x1.fa7158p+11f), + (float2)(0x1.5829dcp+13f, 0x1.5829dep+13f), + (float2)(0x1.d3c448p+14f, 0x1.d3c448p+14f), + (float2)(0x1.3de166p+16f, 0x1.3de166p+16f), + (float2)(0x1.b00b5ap+17f, 0x1.b00b5ap+17f), + (float2)(0x1.259ac4p+19f, 0x1.259ac4p+19f), + (float2)(0x1.8f0ccap+20f, 0x1.8f0ccap+20f), + (float2)(0x1.0f2ebep+22f, 0x1.0f2ebep+22f), + (float2)(0x1.709348p+23f, 0x1.709348p+23f), + (float2)(0x1.f4f220p+24f, 0x1.f4f220p+24f), + (float2)(0x1.546d90p+26f, 0x1.546d90p+26f), + (float2)(0x1.ceb088p+27f, 0x1.ceb088p+27f), + (float2)(0x1.3a6e20p+29f, 0x1.3a6e20p+29f), + (float2)(0x1.ab5adcp+30f, 0x1.ab5adcp+30f), + (float2)(0x1.226af4p+32f, 
0x1.226af4p+32f), + (float2)(0x1.8ab7fcp+33f, 0x1.8ab7fcp+33f), + (float2)(0x1.0c3d3ap+35f, 0x1.0c3d3ap+35f), + (float2)(0x1.6c9326p+36f, 0x1.6c9326p+36f), + (float2)(0x1.ef8230p+37f, 0x1.ef8230p+37f), + (float2)(0x1.50bba4p+39f, 0x1.50bba4p+39f), + (float2)(0x1.c9aae4p+40f, 0x1.c9aae4p+40f), + (float2)(0x1.370470p+42f, 0x1.370470p+42f), + (float2)(0x1.a6b766p+43f, 0x1.a6b766p+43f), + (float2)(0x1.1f43fcp+45f, 0x1.1f43fcp+45f), + (float2)(0x1.866f34p+46f, 0x1.866f34p+46f), + (float2)(0x1.0953e2p+48f, 0x1.0953e2p+48f), + (float2)(0x1.689e22p+49f, 0x1.689e22p+49f), + (float2)(0x1.ea215ap+50f, 0x1.ea215ap+50f) +}; + +DECLARE_TABLE(float2, CBRT_TBL, 129) = { + (float2)(0x1.000000p+0f, 0x0.000000p+0f), + (float2)(0x1.008000p+0f, 0x1.51cb0ap-11f), + (float2)(0x1.014000p+0f, 0x1.39221ep-12f), + (float2)(0x1.01c000p+0f, 0x1.e06908p-11f), + (float2)(0x1.028000p+0f, 0x1.1d6978p-11f), + (float2)(0x1.034000p+0f, 0x1.4ea1bep-13f), + (float2)(0x1.03c000p+0f, 0x1.833b8ep-11f), + (float2)(0x1.048000p+0f, 0x1.587002p-12f), + (float2)(0x1.050000p+0f, 0x1.ceb290p-11f), + (float2)(0x1.05c000p+0f, 0x1.d57f34p-12f), + (float2)(0x1.068000p+0f, 0x1.cc53acp-21f), + (float2)(0x1.070000p+0f, 0x1.0fe098p-11f), + (float2)(0x1.07c000p+0f, 0x1.91b586p-15f), + (float2)(0x1.084000p+0f, 0x1.1c362ep-11f), + (float2)(0x1.090000p+0f, 0x1.94398ep-15f), + (float2)(0x1.098000p+0f, 0x1.1055bcp-11f), + (float2)(0x1.0a4000p+0f, 0x1.7e63cap-19f), + (float2)(0x1.0ac000p+0f, 0x1.d99e1ap-12f), + (float2)(0x1.0b4000p+0f, 0x1.d258dep-11f), + (float2)(0x1.0c0000p+0f, 0x1.645962p-12f), + (float2)(0x1.0c8000p+0f, 0x1.8c5b0ep-11f), + (float2)(0x1.0d4000p+0f, 0x1.83d0c8p-13f), + (float2)(0x1.0dc000p+0f, 0x1.300812p-11f), + (float2)(0x1.0e4000p+0f, 0x1.f9a65ap-11f), + (float2)(0x1.0f0000p+0f, 0x1.7bbcd8p-12f), + (float2)(0x1.0f8000p+0f, 0x1.7cbf68p-11f), + (float2)(0x1.104000p+0f, 0x1.b2c166p-14f), + (float2)(0x1.10c000p+0f, 0x1.d56ea4p-12f), + (float2)(0x1.114000p+0f, 0x1.99eb32p-11f), + (float2)(0x1.120000p+0f, 
0x1.1007a2p-13f), + (float2)(0x1.128000p+0f, 0x1.d212aap-12f), + (float2)(0x1.130000p+0f, 0x1.890f18p-11f), + (float2)(0x1.13c000p+0f, 0x1.2104e2p-14f), + (float2)(0x1.144000p+0f, 0x1.74961ep-12f), + (float2)(0x1.14c000p+0f, 0x1.4b9b66p-11f), + (float2)(0x1.154000p+0f, 0x1.d81e66p-11f), + (float2)(0x1.160000p+0f, 0x1.7f825cp-13f), + (float2)(0x1.168000p+0f, 0x1.c5dca2p-12f), + (float2)(0x1.170000p+0f, 0x1.6153bap-11f), + (float2)(0x1.178000p+0f, 0x1.db1cc2p-11f), + (float2)(0x1.184000p+0f, 0x1.4154b0p-13f), + (float2)(0x1.18c000p+0f, 0x1.821114p-12f), + (float2)(0x1.194000p+0f, 0x1.2d4240p-11f), + (float2)(0x1.19c000p+0f, 0x1.950d82p-11f), + (float2)(0x1.1a4000p+0f, 0x1.f8755cp-11f), + (float2)(0x1.1b0000p+0f, 0x1.5e12a4p-13f), + (float2)(0x1.1b8000p+0f, 0x1.648c38p-12f), + (float2)(0x1.1c0000p+0f, 0x1.08c43ep-11f), + (float2)(0x1.1c8000p+0f, 0x1.5b0970p-11f), + (float2)(0x1.1d0000p+0f, 0x1.a91fe8p-11f), + (float2)(0x1.1d8000p+0f, 0x1.f311b6p-11f), + (float2)(0x1.1e4000p+0f, 0x1.c74618p-14f), + (float2)(0x1.1ec000p+0f, 0x1.eabb54p-13f), + (float2)(0x1.1f4000p+0f, 0x1.70db14p-12f), + (float2)(0x1.1fc000p+0f, 0x1.e45cbcp-12f), + (float2)(0x1.204000p+0f, 0x1.27faa6p-11f), + (float2)(0x1.20c000p+0f, 0x1.59db98p-11f), + (float2)(0x1.214000p+0f, 0x1.87da46p-11f), + (float2)(0x1.21c000p+0f, 0x1.b1ffa0p-11f), + (float2)(0x1.224000p+0f, 0x1.d85478p-11f), + (float2)(0x1.22c000p+0f, 0x1.fae17ep-11f), + (float2)(0x1.238000p+0f, 0x1.9af40cp-15f), + (float2)(0x1.240000p+0f, 0x1.a6319ep-14f), + (float2)(0x1.248000p+0f, 0x1.30baa6p-13f), + (float2)(0x1.250000p+0f, 0x1.7fc362p-13f), + (float2)(0x1.258000p+0f, 0x1.c05362p-13f), + (float2)(0x1.260000p+0f, 0x1.f28a98p-13f), + (float2)(0x1.268000p+0f, 0x1.0b4442p-12f), + (float2)(0x1.270000p+0f, 0x1.16361ap-12f), + (float2)(0x1.278000p+0f, 0x1.1a2a2ap-12f), + (float2)(0x1.280000p+0f, 0x1.172f8ep-12f), + (float2)(0x1.288000p+0f, 0x1.0d5530p-12f), + (float2)(0x1.290000p+0f, 0x1.f9538ep-13f), + (float2)(0x1.298000p+0f, 0x1.ca77b0p-13f), + 
(float2)(0x1.2a0000p+0f, 0x1.8e336ap-13f), + (float2)(0x1.2a8000p+0f, 0x1.44a304p-13f), + (float2)(0x1.2b0000p+0f, 0x1.dbc4c8p-14f), + (float2)(0x1.2b8000p+0f, 0x1.141a2ap-14f), + (float2)(0x1.2c0000p+0f, 0x1.93e44cp-17f), + (float2)(0x1.2c4000p+0f, 0x1.e6e432p-11f), + (float2)(0x1.2cc000p+0f, 0x1.c447c6p-11f), + (float2)(0x1.2d4000p+0f, 0x1.9e80d8p-11f), + (float2)(0x1.2dc000p+0f, 0x1.7595dcp-11f), + (float2)(0x1.2e4000p+0f, 0x1.498d30p-11f), + (float2)(0x1.2ec000p+0f, 0x1.1a6d1ep-11f), + (float2)(0x1.2f4000p+0f, 0x1.d077bap-12f), + (float2)(0x1.2fc000p+0f, 0x1.65ff1ep-12f), + (float2)(0x1.304000p+0f, 0x1.eaf912p-13f), + (float2)(0x1.30c000p+0f, 0x1.fbefb8p-14f), + (float2)(0x1.314000p+0f, 0x1.44905ap-19f), + (float2)(0x1.318000p+0f, 0x1.c017e6p-11f), + (float2)(0x1.320000p+0f, 0x1.7bfdbep-11f), + (float2)(0x1.328000p+0f, 0x1.34fbc6p-11f), + (float2)(0x1.330000p+0f, 0x1.d62f48p-12f), + (float2)(0x1.338000p+0f, 0x1.3cadc6p-12f), + (float2)(0x1.340000p+0f, 0x1.3afc06p-13f), + (float2)(0x1.344000p+0f, 0x1.fc556ep-11f), + (float2)(0x1.34c000p+0f, 0x1.a71f84p-11f), + (float2)(0x1.354000p+0f, 0x1.4f2290p-11f), + (float2)(0x1.35c000p+0f, 0x1.e8c79cp-12f), + (float2)(0x1.364000p+0f, 0x1.2dd0d8p-12f), + (float2)(0x1.36c000p+0f, 0x1.b5ac2ep-14f), + (float2)(0x1.370000p+0f, 0x1.d3d02ap-11f), + (float2)(0x1.378000p+0f, 0x1.6e3d58p-11f), + (float2)(0x1.380000p+0f, 0x1.060200p-11f), + (float2)(0x1.388000p+0f, 0x1.364608p-12f), + (float2)(0x1.390000p+0f, 0x1.6d29b6p-14f), + (float2)(0x1.394000p+0f, 0x1.bd8d5ep-11f), + (float2)(0x1.39c000p+0f, 0x1.4ae030p-11f), + (float2)(0x1.3a4000p+0f, 0x1.ab44b2p-12f), + (float2)(0x1.3ac000p+0f, 0x1.7761cep-13f), + (float2)(0x1.3b0000p+0f, 0x1.e38710p-11f), + (float2)(0x1.3b8000p+0f, 0x1.66b2b0p-11f), + (float2)(0x1.3c0000p+0f, 0x1.cebf96p-12f), + (float2)(0x1.3c8000p+0f, 0x1.964b20p-13f), + (float2)(0x1.3cc000p+0f, 0x1.e15004p-11f), + (float2)(0x1.3d4000p+0f, 0x1.5a9bcep-11f), + (float2)(0x1.3dc000p+0f, 0x1.a2f4d8p-12f), + 
(float2)(0x1.3e4000p+0f, 0x1.17c056p-13f), + (float2)(0x1.3e8000p+0f, 0x1.b800f8p-11f), + (float2)(0x1.3f0000p+0f, 0x1.27b132p-11f), + (float2)(0x1.3f8000p+0f, 0x1.2a09b8p-12f), + (float2)(0x1.400000p+0f, 0x0.000000p+0f), + (float2)(0x1.404000p+0f, 0x1.68a69cp-11f), + (float2)(0x1.40c000p+0f, 0x1.9df950p-12f), + (float2)(0x1.414000p+0f, 0x1.983050p-14f), + (float2)(0x1.418000p+0f, 0x1.94c6a4p-11f), + (float2)(0x1.420000p+0f, 0x1.e88494p-12f), + (float2)(0x1.428000p+0f, 0x1.45f31ap-13f) +}; + +DECLARE_TABLE(float, EXP_TBL, 65) = { + 0x1.000000p+0f, + 0x1.02c9a4p+0f, + 0x1.059b0ep+0f, + 0x1.087452p+0f, + 0x1.0b5586p+0f, + 0x1.0e3ec4p+0f, + 0x1.11301ep+0f, + 0x1.1429aap+0f, + 0x1.172b84p+0f, + 0x1.1a35bep+0f, + 0x1.1d4874p+0f, + 0x1.2063b8p+0f, + 0x1.2387a6p+0f, + 0x1.26b456p+0f, + 0x1.29e9e0p+0f, + 0x1.2d285ap+0f, + 0x1.306fe0p+0f, + 0x1.33c08cp+0f, + 0x1.371a74p+0f, + 0x1.3a7db4p+0f, + 0x1.3dea64p+0f, + 0x1.4160a2p+0f, + 0x1.44e086p+0f, + 0x1.486a2cp+0f, + 0x1.4bfdaep+0f, + 0x1.4f9b28p+0f, + 0x1.5342b6p+0f, + 0x1.56f474p+0f, + 0x1.5ab07ep+0f, + 0x1.5e76f2p+0f, + 0x1.6247ecp+0f, + 0x1.662388p+0f, + 0x1.6a09e6p+0f, + 0x1.6dfb24p+0f, + 0x1.71f75ep+0f, + 0x1.75feb6p+0f, + 0x1.7a1148p+0f, + 0x1.7e2f34p+0f, + 0x1.82589ap+0f, + 0x1.868d9ap+0f, + 0x1.8ace54p+0f, + 0x1.8f1aeap+0f, + 0x1.93737cp+0f, + 0x1.97d82ap+0f, + 0x1.9c4918p+0f, + 0x1.a0c668p+0f, + 0x1.a5503cp+0f, + 0x1.a9e6b6p+0f, + 0x1.ae89fap+0f, + 0x1.b33a2cp+0f, + 0x1.b7f770p+0f, + 0x1.bcc1eap+0f, + 0x1.c199bep+0f, + 0x1.c67f12p+0f, + 0x1.cb720ep+0f, + 0x1.d072d4p+0f, + 0x1.d5818ep+0f, + 0x1.da9e60p+0f, + 0x1.dfc974p+0f, + 0x1.e502eep+0f, + 0x1.ea4afap+0f, + 0x1.efa1bep+0f, + 0x1.f50766p+0f, + 0x1.fa7c18p+0f, + 0x1.000000p+1f, +}; + +DECLARE_TABLE(float2, EXP_TBL_EP, 65) = { + (float2) (0x1.000000p+0f, 0x0.000000p+0f), + (float2) (0x1.02c000p+0f, 0x1.347ceep-13f), + (float2) (0x1.058000p+0f, 0x1.b0d314p-12f), + (float2) (0x1.084000p+0f, 0x1.a28c3ap-11f), + (float2) (0x1.0b4000p+0f, 0x1.586cf8p-12f), + (float2) 
(0x1.0e0000p+0f, 0x1.f61968p-11f), + (float2) (0x1.110000p+0f, 0x1.80e808p-11f), + (float2) (0x1.140000p+0f, 0x1.4d5754p-11f), + (float2) (0x1.170000p+0f, 0x1.5c1e3ep-11f), + (float2) (0x1.1a0000p+0f, 0x1.adf5b6p-11f), + (float2) (0x1.1d4000p+0f, 0x1.0e62d0p-13f), + (float2) (0x1.204000p+0f, 0x1.1dc430p-11f), + (float2) (0x1.238000p+0f, 0x1.e9b9d4p-14f), + (float2) (0x1.268000p+0f, 0x1.a2b2f0p-11f), + (float2) (0x1.29c000p+0f, 0x1.4efa8ep-11f), + (float2) (0x1.2d0000p+0f, 0x1.42d372p-11f), + (float2) (0x1.304000p+0f, 0x1.7f0518p-11f), + (float2) (0x1.33c000p+0f, 0x1.164c82p-17f), + (float2) (0x1.370000p+0f, 0x1.a7373ap-12f), + (float2) (0x1.3a4000p+0f, 0x1.ed9a72p-11f), + (float2) (0x1.3dc000p+0f, 0x1.532608p-11f), + (float2) (0x1.414000p+0f, 0x1.0510fap-11f), + (float2) (0x1.44c000p+0f, 0x1.043030p-11f), + (float2) (0x1.484000p+0f, 0x1.515ae0p-11f), + (float2) (0x1.4bc000p+0f, 0x1.ed6a9ap-11f), + (float2) (0x1.4f8000p+0f, 0x1.b2769cp-12f), + (float2) (0x1.534000p+0f, 0x1.5ab4eap-15f), + (float2) (0x1.56c000p+0f, 0x1.a39b5ap-11f), + (float2) (0x1.5a8000p+0f, 0x1.83eea4p-11f), + (float2) (0x1.5e4000p+0f, 0x1.b78ad6p-11f), + (float2) (0x1.624000p+0f, 0x1.fac0e8p-14f), + (float2) (0x1.660000p+0f, 0x1.1c412ap-11f), + (float2) (0x1.6a0000p+0f, 0x1.3cccfep-13f), + (float2) (0x1.6dc000p+0f, 0x1.d91e32p-11f), + (float2) (0x1.71c000p+0f, 0x1.baf476p-11f), + (float2) (0x1.75c000p+0f, 0x1.f5ab20p-11f), + (float2) (0x1.7a0000p+0f, 0x1.1473eap-12f), + (float2) (0x1.7e0000p+0f, 0x1.799b66p-11f), + (float2) (0x1.824000p+0f, 0x1.89994cp-12f), + (float2) (0x1.868000p+0f, 0x1.b33688p-13f), + (float2) (0x1.8ac000p+0f, 0x1.ca8454p-13f), + (float2) (0x1.8f0000p+0f, 0x1.ae9914p-12f), + (float2) (0x1.934000p+0f, 0x1.9bd866p-11f), + (float2) (0x1.97c000p+0f, 0x1.829fdep-12f), + (float2) (0x1.9c4000p+0f, 0x1.230546p-13f), + (float2) (0x1.a0c000p+0f, 0x1.99ed76p-14f), + (float2) (0x1.a54000p+0f, 0x1.03b23ep-12f), + (float2) (0x1.a9c000p+0f, 0x1.35aabcp-11f), + (float2) (0x1.ae8000p+0f, 
0x1.3f32b4p-13f), + (float2) (0x1.b30000p+0f, 0x1.d15c26p-11f), + (float2) (0x1.b7c000p+0f, 0x1.bb797cp-11f), + (float2) (0x1.bcc000p+0f, 0x1.e904bcp-16f), + (float2) (0x1.c18000p+0f, 0x1.9bdd84p-12f), + (float2) (0x1.c64000p+0f, 0x1.f8972ap-11f), + (float2) (0x1.cb4000p+0f, 0x1.906e76p-11f), + (float2) (0x1.d04000p+0f, 0x1.96a502p-11f), + (float2) (0x1.d58000p+0f, 0x1.8dcfbap-16f), + (float2) (0x1.da8000p+0f, 0x1.e603dap-12f), + (float2) (0x1.dfc000p+0f, 0x1.2e66f6p-13f), + (float2) (0x1.e50000p+0f, 0x1.773c58p-15f), + (float2) (0x1.ea4000p+0f, 0x1.5f4548p-13f), + (float2) (0x1.ef8000p+0f, 0x1.0df730p-11f), + (float2) (0x1.f50000p+0f, 0x1.d96db8p-14f), + (float2) (0x1.fa4000p+0f, 0x1.e0c0cep-11f), + (float2) (0x1.000000p+1f, 0x0.000000p+0f), +}; + TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl); TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl); TABLE_FUNCTION(float2, LOG2_TBL, log2_tbl); @@ -443,6 +752,11 @@ return *(__constant uint4 *)(PIBITS_TBL + idx); } +TABLE_FUNCTION(float2, SINHCOSH_TBL, sinhcosh_tbl); +TABLE_FUNCTION(float2, CBRT_TBL, cbrt_tbl); +TABLE_FUNCTION(float, EXP_TBL, exp_tbl); +TABLE_FUNCTION(float2, EXP_TBL_EP, exp_tbl_ep); + #ifdef cl_khr_fp64 DECLARE_TABLE(double2, LN_TBL, 65) = { @@ -835,7 +1149,621 @@ }; +DECLARE_TABLE(double2, SINH_TBL, 37) = { + (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.2cd9fc0000000p+0, 0x1.13ae6096a0092p-26), + (double2)(0x1.d03cf60000000p+1, 0x1.db70cfb79a640p-26), + (double2)(0x1.40926e0000000p+3, 0x1.c2526b66dc067p-23), + (double2)(0x1.b4a3800000000p+4, 0x1.b81b18647f380p-23), + (double2)(0x1.28d0160000000p+6, 0x1.bc1cdd1e1eb08p-20), + (double2)(0x1.936d228000000p+7, 0x1.d9f201534fb09p-19), + (double2)(0x1.1228768000000p+9, 0x1.d1c064a4e9954p-18), + (double2)(0x1.749ea50000000p+10, 0x1.4eca65d06ea74p-18), + (double2)(0x1.fa71570000000p+11, 0x1.0c259bcc0ecc5p-15), + (double2)(0x1.5829dc8000000p+13, 0x1.b5a6647cf9016p-13), + (double2)(0x1.d3c4488000000p+14, 0x1.9691adefb0870p-15), + 
(double2)(0x1.3de1650000000p+16, 0x1.3410fc29cde38p-10), + (double2)(0x1.b00b590000000p+17, 0x1.6a31a50b6fb3cp-11), + (double2)(0x1.259ac48000000p+19, 0x1.7defc71805c40p-10), + (double2)(0x1.8f0cca8000000p+20, 0x1.eb49fd80e0babp-6), + (double2)(0x1.0f2ebd0000000p+22, 0x1.4fffc7bcd5920p-7), + (double2)(0x1.7093488000000p+23, 0x1.03a93b6c63435p-3), + (double2)(0x1.f4f2208000000p+24, 0x1.1940bb255fd1cp-4), + (double2)(0x1.546d8f8000000p+26, 0x1.ed26e14260b50p-2), + (double2)(0x1.ceb0888000000p+27, 0x1.b47401fc9f2a2p+0), + (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55634f1p+3), + (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff8194ddcp+2), + (double2)(0x1.226af30000000p+32, 0x1.d8fee052ba63ap+5), + (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde3f6p+7), + (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557d1ap+8), + (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9dc4p+8), + (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abap+11), + (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13), + (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6363p+14), + (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15), + (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17), + (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19), + (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19), + (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22), + (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21), + (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22), +}; + +DECLARE_TABLE(double2, COSH_TBL, 37) = { + (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.8b07550000000p+0, 0x1.d9f5504c2bd28p-28), + (double2)(0x1.e18fa08000000p+1, 0x1.7cb66f0a4c9fdp-25), + (double2)(0x1.422a490000000p+3, 0x1.f58617928e588p-23), + (double2)(0x1.b4ee858000000p+4, 0x1.bc7d000c38d48p-25), + (double2)(0x1.28d6fc8000000p+6, 0x1.f7f9d4e329998p-21), + (double2)(0x1.936e678000000p+7, 0x1.6e6e464885269p-19), + (double2)(0x1.1228948000000p+9, 0x1.ba3a8b946c154p-19), + 
(double2)(0x1.749eaa8000000p+10, 0x1.3f4e76110d5a4p-18), + (double2)(0x1.fa71580000000p+11, 0x1.17622515a3e2bp-15), + (double2)(0x1.5829dd0000000p+13, 0x1.4dc4b528af3d0p-17), + (double2)(0x1.d3c4488000000p+14, 0x1.1156278615e10p-14), + (double2)(0x1.3de1650000000p+16, 0x1.35ad50ed821f5p-10), + (double2)(0x1.b00b590000000p+17, 0x1.6b61055f2935cp-11), + (double2)(0x1.259ac48000000p+19, 0x1.7e2794a601240p-10), + (double2)(0x1.8f0cca8000000p+20, 0x1.eb4b45f6aadd3p-6), + (double2)(0x1.0f2ebd0000000p+22, 0x1.5000b967b3698p-7), + (double2)(0x1.7093488000000p+23, 0x1.03a940fadc092p-3), + (double2)(0x1.f4f2208000000p+24, 0x1.1940bf3bf874cp-4), + (double2)(0x1.546d8f8000000p+26, 0x1.ed26e1a2a2110p-2), + (double2)(0x1.ceb0888000000p+27, 0x1.b4740205796d6p+0), + (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55cb85dp+3), + (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff81e18acp+2), + (double2)(0x1.226af30000000p+32, 0x1.d8fee052bdea4p+5), + (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde926p+7), + (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557e0ep+8), + (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9e1cp+8), + (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abep+11), + (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13), + (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6364p+14), + (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15), + (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17), + (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19), + (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19), + (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22), + (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21), + (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22) +}; + +DECLARE_TABLE(double, CBRT_INV_TBL, 257) = { + 0x1.0000000000000p+1, + 0x1.fe01fe01fe020p+0, + 0x1.fc07f01fc07f0p+0, + 0x1.fa11caa01fa12p+0, + 0x1.f81f81f81f820p+0, + 0x1.f6310aca0dbb5p+0, + 0x1.f44659e4a4271p+0, + 0x1.f25f644230ab5p+0, + 0x1.f07c1f07c1f08p+0, + 0x1.ee9c7f8458e02p+0, + 
0x1.ecc07b301ecc0p+0, + 0x1.eae807aba01ebp+0, + 0x1.e9131abf0b767p+0, + 0x1.e741aa59750e4p+0, + 0x1.e573ac901e574p+0, + 0x1.e3a9179dc1a73p+0, + 0x1.e1e1e1e1e1e1ep+0, + 0x1.e01e01e01e01ep+0, + 0x1.de5d6e3f8868ap+0, + 0x1.dca01dca01dcap+0, + 0x1.dae6076b981dbp+0, + 0x1.d92f2231e7f8ap+0, + 0x1.d77b654b82c34p+0, + 0x1.d5cac807572b2p+0, + 0x1.d41d41d41d41dp+0, + 0x1.d272ca3fc5b1ap+0, + 0x1.d0cb58f6ec074p+0, + 0x1.cf26e5c44bfc6p+0, + 0x1.cd85689039b0bp+0, + 0x1.cbe6d9601cbe7p+0, + 0x1.ca4b3055ee191p+0, + 0x1.c8b265afb8a42p+0, + 0x1.c71c71c71c71cp+0, + 0x1.c5894d10d4986p+0, + 0x1.c3f8f01c3f8f0p+0, + 0x1.c26b5392ea01cp+0, + 0x1.c0e070381c0e0p+0, + 0x1.bf583ee868d8bp+0, + 0x1.bdd2b899406f7p+0, + 0x1.bc4fd65883e7bp+0, + 0x1.bacf914c1bad0p+0, + 0x1.b951e2b18ff23p+0, + 0x1.b7d6c3dda338bp+0, + 0x1.b65e2e3beee05p+0, + 0x1.b4e81b4e81b4fp+0, + 0x1.b37484ad806cep+0, + 0x1.b2036406c80d9p+0, + 0x1.b094b31d922a4p+0, + 0x1.af286bca1af28p+0, + 0x1.adbe87f94905ep+0, + 0x1.ac5701ac5701bp+0, + 0x1.aaf1d2f87ebfdp+0, + 0x1.a98ef606a63bep+0, + 0x1.a82e65130e159p+0, + 0x1.a6d01a6d01a6dp+0, + 0x1.a574107688a4ap+0, + 0x1.a41a41a41a41ap+0, + 0x1.a2c2a87c51ca0p+0, + 0x1.a16d3f97a4b02p+0, + 0x1.a01a01a01a01ap+0, + 0x1.9ec8e951033d9p+0, + 0x1.9d79f176b682dp+0, + 0x1.9c2d14ee4a102p+0, + 0x1.9ae24ea5510dap+0, + 0x1.999999999999ap+0, + 0x1.9852f0d8ec0ffp+0, + 0x1.970e4f80cb872p+0, + 0x1.95cbb0be377aep+0, + 0x1.948b0fcd6e9e0p+0, + 0x1.934c67f9b2ce6p+0, + 0x1.920fb49d0e229p+0, + 0x1.90d4f120190d5p+0, + 0x1.8f9c18f9c18fap+0, + 0x1.8e6527af1373fp+0, + 0x1.8d3018d3018d3p+0, + 0x1.8bfce8062ff3ap+0, + 0x1.8acb90f6bf3aap+0, + 0x1.899c0f601899cp+0, + 0x1.886e5f0abb04ap+0, + 0x1.87427bcc092b9p+0, + 0x1.8618618618618p+0, + 0x1.84f00c2780614p+0, + 0x1.83c977ab2beddp+0, + 0x1.82a4a0182a4a0p+0, + 0x1.8181818181818p+0, + 0x1.8060180601806p+0, + 0x1.7f405fd017f40p+0, + 0x1.7e225515a4f1dp+0, + 0x1.7d05f417d05f4p+0, + 0x1.7beb3922e017cp+0, + 0x1.7ad2208e0ecc3p+0, + 0x1.79baa6bb6398bp+0, + 0x1.78a4c8178a4c8p+0, + 
0x1.77908119ac60dp+0, + 0x1.767dce434a9b1p+0, + 0x1.756cac201756dp+0, + 0x1.745d1745d1746p+0, + 0x1.734f0c541fe8dp+0, + 0x1.724287f46debcp+0, + 0x1.713786d9c7c09p+0, + 0x1.702e05c0b8170p+0, + 0x1.6f26016f26017p+0, + 0x1.6e1f76b4337c7p+0, + 0x1.6d1a62681c861p+0, + 0x1.6c16c16c16c17p+0, + 0x1.6b1490aa31a3dp+0, + 0x1.6a13cd1537290p+0, + 0x1.691473a88d0c0p+0, + 0x1.6816816816817p+0, + 0x1.6719f3601671ap+0, + 0x1.661ec6a5122f9p+0, + 0x1.6524f853b4aa3p+0, + 0x1.642c8590b2164p+0, + 0x1.63356b88ac0dep+0, + 0x1.623fa77016240p+0, + 0x1.614b36831ae94p+0, + 0x1.6058160581606p+0, + 0x1.5f66434292dfcp+0, + 0x1.5e75bb8d015e7p+0, + 0x1.5d867c3ece2a5p+0, + 0x1.5c9882b931057p+0, + 0x1.5babcc647fa91p+0, + 0x1.5ac056b015ac0p+0, + 0x1.59d61f123ccaap+0, + 0x1.58ed2308158edp+0, + 0x1.5805601580560p+0, + 0x1.571ed3c506b3ap+0, + 0x1.56397ba7c52e2p+0, + 0x1.5555555555555p+0, + 0x1.54725e6bb82fep+0, + 0x1.5390948f40febp+0, + 0x1.52aff56a8054bp+0, + 0x1.51d07eae2f815p+0, + 0x1.50f22e111c4c5p+0, + 0x1.5015015015015p+0, + 0x1.4f38f62dd4c9bp+0, + 0x1.4e5e0a72f0539p+0, + 0x1.4d843bedc2c4cp+0, + 0x1.4cab88725af6ep+0, + 0x1.4bd3edda68fe1p+0, + 0x1.4afd6a052bf5bp+0, + 0x1.4a27fad76014ap+0, + 0x1.49539e3b2d067p+0, + 0x1.4880522014880p+0, + 0x1.47ae147ae147bp+0, + 0x1.46dce34596066p+0, + 0x1.460cbc7f5cf9ap+0, + 0x1.453d9e2c776cap+0, + 0x1.446f86562d9fbp+0, + 0x1.43a2730abee4dp+0, + 0x1.42d6625d51f87p+0, + 0x1.420b5265e5951p+0, + 0x1.4141414141414p+0, + 0x1.40782d10e6566p+0, + 0x1.3fb013fb013fbp+0, + 0x1.3ee8f42a5af07p+0, + 0x1.3e22cbce4a902p+0, + 0x1.3d5d991aa75c6p+0, + 0x1.3c995a47babe7p+0, + 0x1.3bd60d9232955p+0, + 0x1.3b13b13b13b14p+0, + 0x1.3a524387ac822p+0, + 0x1.3991c2c187f63p+0, + 0x1.38d22d366088ep+0, + 0x1.3813813813814p+0, + 0x1.3755bd1c945eep+0, + 0x1.3698df3de0748p+0, + 0x1.35dce5f9f2af8p+0, + 0x1.3521cfb2b78c1p+0, + 0x1.34679ace01346p+0, + 0x1.33ae45b57bcb2p+0, + 0x1.32f5ced6a1dfap+0, + 0x1.323e34a2b10bfp+0, + 0x1.3187758e9ebb6p+0, + 0x1.30d190130d190p+0, + 0x1.301c82ac40260p+0, + 
0x1.2f684bda12f68p+0, + 0x1.2eb4ea1fed14bp+0, + 0x1.2e025c04b8097p+0, + 0x1.2d50a012d50a0p+0, + 0x1.2c9fb4d812ca0p+0, + 0x1.2bef98e5a3711p+0, + 0x1.2b404ad012b40p+0, + 0x1.2a91c92f3c105p+0, + 0x1.29e4129e4129ep+0, + 0x1.293725bb804a5p+0, + 0x1.288b01288b013p+0, + 0x1.27dfa38a1ce4dp+0, + 0x1.27350b8812735p+0, + 0x1.268b37cd60127p+0, + 0x1.25e22708092f1p+0, + 0x1.2539d7e9177b2p+0, + 0x1.2492492492492p+0, + 0x1.23eb79717605bp+0, + 0x1.23456789abcdfp+0, + 0x1.22a0122a0122ap+0, + 0x1.21fb78121fb78p+0, + 0x1.21579804855e6p+0, + 0x1.20b470c67c0d9p+0, + 0x1.2012012012012p+0, + 0x1.1f7047dc11f70p+0, + 0x1.1ecf43c7fb84cp+0, + 0x1.1e2ef3b3fb874p+0, + 0x1.1d8f5672e4abdp+0, + 0x1.1cf06ada2811dp+0, + 0x1.1c522fc1ce059p+0, + 0x1.1bb4a4046ed29p+0, + 0x1.1b17c67f2bae3p+0, + 0x1.1a7b9611a7b96p+0, + 0x1.19e0119e0119ep+0, + 0x1.19453808ca29cp+0, + 0x1.18ab083902bdbp+0, + 0x1.1811811811812p+0, + 0x1.1778a191bd684p+0, + 0x1.16e0689427379p+0, + 0x1.1648d50fc3201p+0, + 0x1.15b1e5f75270dp+0, + 0x1.151b9a3fdd5c9p+0, + 0x1.1485f0e0acd3bp+0, + 0x1.13f0e8d344724p+0, + 0x1.135c81135c811p+0, + 0x1.12c8b89edc0acp+0, + 0x1.12358e75d3033p+0, + 0x1.11a3019a74826p+0, + 0x1.1111111111111p+0, + 0x1.107fbbe011080p+0, + 0x1.0fef010fef011p+0, + 0x1.0f5edfab325a2p+0, + 0x1.0ecf56be69c90p+0, + 0x1.0e40655826011p+0, + 0x1.0db20a88f4696p+0, + 0x1.0d24456359e3ap+0, + 0x1.0c9714fbcda3bp+0, + 0x1.0c0a7868b4171p+0, + 0x1.0b7e6ec259dc8p+0, + 0x1.0af2f722eecb5p+0, + 0x1.0a6810a6810a7p+0, + 0x1.09ddba6af8360p+0, + 0x1.0953f39010954p+0, + 0x1.08cabb37565e2p+0, + 0x1.0842108421084p+0, + 0x1.07b9f29b8eae2p+0, + 0x1.073260a47f7c6p+0, + 0x1.06ab59c7912fbp+0, + 0x1.0624dd2f1a9fcp+0, + 0x1.059eea0727586p+0, + 0x1.05197f7d73404p+0, + 0x1.04949cc1664c5p+0, + 0x1.0410410410410p+0, + 0x1.038c6b78247fcp+0, + 0x1.03091b51f5e1ap+0, + 0x1.02864fc7729e9p+0, + 0x1.0204081020408p+0, + 0x1.0182436517a37p+0, + 0x1.0101010101010p+0, + 0x1.0080402010080p+0, + 0x1.0000000000000p+0 +}; + +DECLARE_TABLE(double2, CBRT_DBL_TBL, 257) = { + 
(double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.0055380000000p+0, 0x1.e6a24c81e4294p-25), + (double2)(0x1.00aa390000000p+0, 0x1.8548511e3a785p-26), + (double2)(0x1.00ff010000000p+0, 0x1.4eb9336ec07f6p-25), + (double2)(0x1.0153920000000p+0, 0x1.0ea64b8b750e1p-27), + (double2)(0x1.01a7eb0000000p+0, 0x1.61637cff8a53cp-27), + (double2)(0x1.01fc0d0000000p+0, 0x1.0733bf7bd1943p-27), + (double2)(0x1.024ff80000000p+0, 0x1.666911345ccedp-26), + (double2)(0x1.02a3ad0000000p+0, 0x1.77b7a3f592f14p-27), + (double2)(0x1.02f72b0000000p+0, 0x1.f18d3dd1a5402p-25), + (double2)(0x1.034a750000000p+0, 0x1.be2f5a58ee9a4p-29), + (double2)(0x1.039d880000000p+0, 0x1.8901f8f085fa7p-25), + (double2)(0x1.03f0670000000p+0, 0x1.c68b8cd5b5d69p-26), + (double2)(0x1.0443110000000p+0, 0x1.a6b0e8624be42p-26), + (double2)(0x1.0495870000000p+0, 0x1.c4b22b06f68e7p-36), + (double2)(0x1.04e7c80000000p+0, 0x1.0f3f0afcabe9bp-25), + (double2)(0x1.0539d60000000p+0, 0x1.48495bca4e1b7p-26), + (double2)(0x1.058bb00000000p+0, 0x1.6107f1abdfdc3p-25), + (double2)(0x1.05dd570000000p+0, 0x1.e67261878288ap-25), + (double2)(0x1.062ecc0000000p+0, 0x1.a6bc155286f1ep-26), + (double2)(0x1.06800e0000000p+0, 0x1.8a759c64a85f2p-26), + (double2)(0x1.06d11e0000000p+0, 0x1.5fce70a4a8d09p-27), + (double2)(0x1.0721fc0000000p+0, 0x1.2f9cbf373fe1dp-28), + (double2)(0x1.0772a80000000p+0, 0x1.90564ce4ac359p-26), + (double2)(0x1.07c3230000000p+0, 0x1.ac29ce761b02fp-26), + (double2)(0x1.08136d0000000p+0, 0x1.cb752f497381cp-26), + (double2)(0x1.0863860000000p+0, 0x1.8bb9e1cfb35e0p-25), + (double2)(0x1.08b36f0000000p+0, 0x1.5b4917099de90p-25), + (double2)(0x1.0903280000000p+0, 0x1.cc77ac9c65ef2p-26), + (double2)(0x1.0952b10000000p+0, 0x1.7a0f3e7be3dbap-26), + (double2)(0x1.09a20a0000000p+0, 0x1.6ec851ee0c16fp-25), + (double2)(0x1.09f1340000000p+0, 0x1.89449bf2946dap-25), + (double2)(0x1.0a402f0000000p+0, 0x1.98f25301ba223p-25), + (double2)(0x1.0a8efc0000000p+0, 0x1.47d5ec651f549p-28), + 
(double2)(0x1.0add990000000p+0, 0x1.c33ec9a86007ap-25), + (double2)(0x1.0b2c090000000p+0, 0x1.e0b6653e92649p-26), + (double2)(0x1.0b7a4b0000000p+0, 0x1.bd64ac09d755fp-28), + (double2)(0x1.0bc85f0000000p+0, 0x1.f537506f78167p-29), + (double2)(0x1.0c16450000000p+0, 0x1.2c382d1b3735ep-25), + (double2)(0x1.0c63fe0000000p+0, 0x1.e20ed659f99e1p-25), + (double2)(0x1.0cb18b0000000p+0, 0x1.86b633a9c182ap-26), + (double2)(0x1.0cfeeb0000000p+0, 0x1.45cfd5a65e777p-27), + (double2)(0x1.0d4c1e0000000p+0, 0x1.0c8770f58bca4p-25), + (double2)(0x1.0d99250000000p+0, 0x1.739e44b0933c5p-25), + (double2)(0x1.0de6010000000p+0, 0x1.27dc3d9ce7bd8p-31), + (double2)(0x1.0e32b00000000p+0, 0x1.3c53c7c5a7b64p-25), + (double2)(0x1.0e7f340000000p+0, 0x1.9669683830cecp-25), + (double2)(0x1.0ecb8d0000000p+0, 0x1.8d772c39bdcc4p-25), + (double2)(0x1.0f17bb0000000p+0, 0x1.9b0008bcf6d7bp-25), + (double2)(0x1.0f63bf0000000p+0, 0x1.bbb305825ce4fp-28), + (double2)(0x1.0faf970000000p+0, 0x1.da3f4af13a406p-25), + (double2)(0x1.0ffb460000000p+0, 0x1.f36b96f74ce86p-26), + (double2)(0x1.1046cb0000000p+0, 0x1.65c002303f790p-30), + (double2)(0x1.1092250000000p+0, 0x1.82f84095ba7d5p-25), + (double2)(0x1.10dd560000000p+0, 0x1.d46433541b2c6p-25), + (double2)(0x1.11285e0000000p+0, 0x1.71c3d56e93a89p-25), + (double2)(0x1.11733d0000000p+0, 0x1.98dcef4e40012p-26), + (double2)(0x1.11bdf30000000p+0, 0x1.530ebef17fe03p-27), + (double2)(0x1.1208800000000p+0, 0x1.e8b8fa3715066p-27), + (double2)(0x1.1252e40000000p+0, 0x1.ab26eb3b211dcp-25), + (double2)(0x1.129d210000000p+0, 0x1.54dd4dc906307p-27), + (double2)(0x1.12e7350000000p+0, 0x1.c9f962387984ep-26), + (double2)(0x1.1331210000000p+0, 0x1.c62a959afec09p-25), + (double2)(0x1.137ae60000000p+0, 0x1.638d9ac6a866ap-25), + (double2)(0x1.13c4840000000p+0, 0x1.38704eca8a22dp-28), + (double2)(0x1.140dfa0000000p+0, 0x1.e6c9e1db14f8fp-27), + (double2)(0x1.1457490000000p+0, 0x1.8744b7f9c9eaap-26), + (double2)(0x1.14a0710000000p+0, 0x1.6c2893486373bp-25), + 
(double2)(0x1.14e9730000000p+0, 0x1.b36bce31699b7p-26), + (double2)(0x1.15324e0000000p+0, 0x1.71e3813d200c7p-25), + (double2)(0x1.157b030000000p+0, 0x1.99755ab40aa88p-25), + (double2)(0x1.15c3920000000p+0, 0x1.b45ca0e4bcfc0p-25), + (double2)(0x1.160bfc0000000p+0, 0x1.2dd090d869c5dp-28), + (double2)(0x1.16543f0000000p+0, 0x1.4fe0516b917dap-25), + (double2)(0x1.169c5d0000000p+0, 0x1.94563226317a2p-25), + (double2)(0x1.16e4560000000p+0, 0x1.53d8fafc2c851p-25), + (double2)(0x1.172c2a0000000p+0, 0x1.dcbd41fbd41a3p-26), + (double2)(0x1.1773d90000000p+0, 0x1.862ff5285f59cp-26), + (double2)(0x1.17bb630000000p+0, 0x1.3072ea97a1e1cp-25), + (double2)(0x1.1802c90000000p+0, 0x1.2839075184805p-26), + (double2)(0x1.184a0a0000000p+0, 0x1.4b0323e9eff42p-25), + (double2)(0x1.1891270000000p+0, 0x1.b158893c45484p-25), + (double2)(0x1.18d8210000000p+0, 0x1.149ef0fc35826p-28), + (double2)(0x1.191ef60000000p+0, 0x1.f2e77ea96acaap-26), + (double2)(0x1.1965a80000000p+0, 0x1.200074c471a95p-26), + (double2)(0x1.19ac360000000p+0, 0x1.3f8cc517f6f04p-25), + (double2)(0x1.19f2a10000000p+0, 0x1.60ba2e311bb55p-25), + (double2)(0x1.1a38e90000000p+0, 0x1.4b788730bbec3p-25), + (double2)(0x1.1a7f0e0000000p+0, 0x1.57090795ee20cp-25), + (double2)(0x1.1ac5100000000p+0, 0x1.d9ffe983670b1p-25), + (double2)(0x1.1b0af00000000p+0, 0x1.2a463ff61bfdap-25), + (double2)(0x1.1b50ad0000000p+0, 0x1.9d1bc6a5e65cfp-25), + (double2)(0x1.1b96480000000p+0, 0x1.8718abaa9e922p-25), + (double2)(0x1.1bdbc10000000p+0, 0x1.3c2f52ffa342ep-25), + (double2)(0x1.1c21180000000p+0, 0x1.0fae13ff42c80p-25), + (double2)(0x1.1c664d0000000p+0, 0x1.5440f0ef00d57p-25), + (double2)(0x1.1cab610000000p+0, 0x1.6fcd22d4e3c1ep-27), + (double2)(0x1.1cf0530000000p+0, 0x1.e0c60b409e863p-27), + (double2)(0x1.1d35230000000p+0, 0x1.f9cab5a5f0333p-25), + (double2)(0x1.1d79d30000000p+0, 0x1.30f24744c333dp-25), + (double2)(0x1.1dbe620000000p+0, 0x1.b50622a76b2fep-27), + (double2)(0x1.1e02cf0000000p+0, 0x1.fdb94ba595375p-25), + 
(double2)(0x1.1e471d0000000p+0, 0x1.861b9b945a171p-28), + (double2)(0x1.1e8b490000000p+0, 0x1.54348015188c4p-25), + (double2)(0x1.1ecf550000000p+0, 0x1.b54d149865523p-25), + (double2)(0x1.1f13410000000p+0, 0x1.a0bb783d9de33p-25), + (double2)(0x1.1f570d0000000p+0, 0x1.629d12b1a2157p-25), + (double2)(0x1.1f9ab90000000p+0, 0x1.467fe35d179dfp-25), + (double2)(0x1.1fde450000000p+0, 0x1.9763f3e26c8f7p-25), + (double2)(0x1.2021b20000000p+0, 0x1.3f798bb9f7679p-26), + (double2)(0x1.2064ff0000000p+0, 0x1.52e577e855898p-26), + (double2)(0x1.20a82c0000000p+0, 0x1.fde47e5502c3ap-25), + (double2)(0x1.20eb3b0000000p+0, 0x1.cbd0b548d96a0p-26), + (double2)(0x1.212e2a0000000p+0, 0x1.a9cd9f7be8de8p-25), + (double2)(0x1.2170fb0000000p+0, 0x1.22bbe704886dep-26), + (double2)(0x1.21b3ac0000000p+0, 0x1.e3dea8317f020p-25), + (double2)(0x1.21f63f0000000p+0, 0x1.e812085ac8855p-25), + (double2)(0x1.2238b40000000p+0, 0x1.c87144f24cb07p-26), + (double2)(0x1.227b0a0000000p+0, 0x1.1e128ee311fa2p-25), + (double2)(0x1.22bd420000000p+0, 0x1.b5c163d61a2d3p-26), + (double2)(0x1.22ff5c0000000p+0, 0x1.7d97e7fb90633p-27), + (double2)(0x1.2341570000000p+0, 0x1.efe899d50f6a7p-25), + (double2)(0x1.2383350000000p+0, 0x1.d0333eb75de5ap-25), + (double2)(0x1.23c4f60000000p+0, 0x1.0e590be73a573p-27), + (double2)(0x1.2406980000000p+0, 0x1.8ce8dcac3cdd2p-25), + (double2)(0x1.24481d0000000p+0, 0x1.ee8a48954064bp-25), + (double2)(0x1.2489850000000p+0, 0x1.aa62f18461e09p-25), + (double2)(0x1.24cad00000000p+0, 0x1.01e5940986a15p-25), + (double2)(0x1.250bfe0000000p+0, 0x1.b082f4f9b8d4cp-28), + (double2)(0x1.254d0e0000000p+0, 0x1.876e0e5527f5ap-25), + (double2)(0x1.258e020000000p+0, 0x1.3617080831e6bp-25), + (double2)(0x1.25ced90000000p+0, 0x1.81b26e34aa4a2p-25), + (double2)(0x1.260f940000000p+0, 0x1.52ee66dfab0c1p-26), + (double2)(0x1.2650320000000p+0, 0x1.d85a5329e8819p-26), + (double2)(0x1.2690b40000000p+0, 0x1.105c1b646b5d1p-26), + (double2)(0x1.26d1190000000p+0, 0x1.bb6690c1a379cp-25), + 
(double2)(0x1.2711630000000p+0, 0x1.86aeba73ce3a9p-26), + (double2)(0x1.2751900000000p+0, 0x1.dd16198294dd4p-25), + (double2)(0x1.2791a20000000p+0, 0x1.454e675775e83p-25), + (double2)(0x1.27d1980000000p+0, 0x1.3842e026197eap-25), + (double2)(0x1.2811720000000p+0, 0x1.f1ce0e70c44d2p-25), + (double2)(0x1.2851310000000p+0, 0x1.ad636441a5627p-25), + (double2)(0x1.2890d50000000p+0, 0x1.4c205d7212abbp-26), + (double2)(0x1.28d05d0000000p+0, 0x1.167c86c116419p-25), + (double2)(0x1.290fca0000000p+0, 0x1.38ec3ef16e294p-25), + (double2)(0x1.294f1c0000000p+0, 0x1.473fceace9321p-25), + (double2)(0x1.298e530000000p+0, 0x1.7af53a836dba7p-25), + (double2)(0x1.29cd700000000p+0, 0x1.a51f3c383b652p-30), + (double2)(0x1.2a0c710000000p+0, 0x1.3696da190822dp-25), + (double2)(0x1.2a4b580000000p+0, 0x1.2f9adec77074bp-25), + (double2)(0x1.2a8a250000000p+0, 0x1.8190fd5bee55fp-28), + (double2)(0x1.2ac8d70000000p+0, 0x1.bfee8fac68e55p-27), + (double2)(0x1.2b076f0000000p+0, 0x1.31c9d6bc5f68ap-28), + (double2)(0x1.2b45ec0000000p+0, 0x1.89d0523737edfp-25), + (double2)(0x1.2b84500000000p+0, 0x1.a295943bf47bbp-26), + (double2)(0x1.2bc29a0000000p+0, 0x1.96be32e5b3207p-28), + (double2)(0x1.2c00c90000000p+0, 0x1.e44c7d909fa0ep-25), + (double2)(0x1.2c3ee00000000p+0, 0x1.b2505da94d9eap-29), + (double2)(0x1.2c7cdc0000000p+0, 0x1.0c851f46c9c98p-25), + (double2)(0x1.2cbabf0000000p+0, 0x1.da71f7d9aa3b7p-26), + (double2)(0x1.2cf8880000000p+0, 0x1.f1b605d019ef1p-25), + (double2)(0x1.2d36390000000p+0, 0x1.386e8a2189563p-27), + (double2)(0x1.2d73d00000000p+0, 0x1.b19fa5d306ba7p-28), + (double2)(0x1.2db14d0000000p+0, 0x1.dd749b67aef76p-25), + (double2)(0x1.2deeb20000000p+0, 0x1.76ff6f1dc04b0p-25), + (double2)(0x1.2e2bfe0000000p+0, 0x1.35a33d0b232a6p-25), + (double2)(0x1.2e69310000000p+0, 0x1.4bdc80024a4e1p-25), + (double2)(0x1.2ea64b0000000p+0, 0x1.ebd61770fd723p-25), + (double2)(0x1.2ee34d0000000p+0, 0x1.4769fc537264dp-25), + (double2)(0x1.2f20360000000p+0, 0x1.9021f429f3b98p-25), + 
(double2)(0x1.2f5d070000000p+0, 0x1.ee7083efbd606p-26), + (double2)(0x1.2f99bf0000000p+0, 0x1.ad985552a6b1ap-25), + (double2)(0x1.2fd65f0000000p+0, 0x1.e3df778772160p-25), + (double2)(0x1.3012e70000000p+0, 0x1.ca5d76ddc9b34p-25), + (double2)(0x1.304f570000000p+0, 0x1.91154ffdbaf74p-25), + (double2)(0x1.308baf0000000p+0, 0x1.67bdd57fb306ap-25), + (double2)(0x1.30c7ef0000000p+0, 0x1.7dc255ac40886p-25), + (double2)(0x1.3104180000000p+0, 0x1.219f38e8afafep-32), + (double2)(0x1.3140280000000p+0, 0x1.2416bf9669a04p-25), + (double2)(0x1.317c210000000p+0, 0x1.11c96b2b3987fp-25), + (double2)(0x1.31b8020000000p+0, 0x1.f99ed447e1177p-25), + (double2)(0x1.31f3cd0000000p+0, 0x1.3245826328a11p-30), + (double2)(0x1.322f7f0000000p+0, 0x1.6f56dd1e645f8p-25), + (double2)(0x1.326b1b0000000p+0, 0x1.6164946945535p-27), + (double2)(0x1.32a69f0000000p+0, 0x1.e37d59d190028p-26), + (double2)(0x1.32e20c0000000p+0, 0x1.68671f12bf828p-25), + (double2)(0x1.331d620000000p+0, 0x1.e8ecbca6aabbdp-25), + (double2)(0x1.3358a20000000p+0, 0x1.3f49e109a5912p-26), + (double2)(0x1.3393ca0000000p+0, 0x1.b8a0e11ec3043p-25), + (double2)(0x1.33cedc0000000p+0, 0x1.5fae00aed691ap-25), + (double2)(0x1.3409d70000000p+0, 0x1.c0569bece3e4ap-25), + (double2)(0x1.3444bc0000000p+0, 0x1.05e26744efbfep-25), + (double2)(0x1.347f8a0000000p+0, 0x1.5b570a94be5c5p-25), + (double2)(0x1.34ba420000000p+0, 0x1.d6f156ea0e063p-26), + (double2)(0x1.34f4e30000000p+0, 0x1.e0ca7612fc484p-25), + (double2)(0x1.352f6f0000000p+0, 0x1.963c927b25258p-27), + (double2)(0x1.3569e40000000p+0, 0x1.47930aa725a5cp-26), + (double2)(0x1.35a4430000000p+0, 0x1.8a79fe3af43b3p-26), + (double2)(0x1.35de8c0000000p+0, 0x1.e6dc29c41bdafp-26), + (double2)(0x1.3618bf0000000p+0, 0x1.57a2e76f863a5p-25), + (double2)(0x1.3652dd0000000p+0, 0x1.ae3b61716354dp-29), + (double2)(0x1.368ce40000000p+0, 0x1.65fb5df6906b1p-25), + (double2)(0x1.36c6d60000000p+0, 0x1.6177d7f588f7bp-25), + (double2)(0x1.3700b30000000p+0, 0x1.ad55abd091b67p-28), + 
(double2)(0x1.373a7a0000000p+0, 0x1.55337b2422d76p-30), + (double2)(0x1.37742b0000000p+0, 0x1.084ebe86972d5p-25), + (double2)(0x1.37adc70000000p+0, 0x1.56395808e1ea3p-25), + (double2)(0x1.37e74e0000000p+0, 0x1.1bce21b40fba7p-25), + (double2)(0x1.3820c00000000p+0, 0x1.006f94605b515p-26), + (double2)(0x1.385a1c0000000p+0, 0x1.aa676aceb1f7dp-25), + (double2)(0x1.3893640000000p+0, 0x1.8229f76554ce6p-26), + (double2)(0x1.38cc960000000p+0, 0x1.eabfc6cf57330p-25), + (double2)(0x1.3905b40000000p+0, 0x1.4daed9c0ce8bcp-25), + (double2)(0x1.393ebd0000000p+0, 0x1.0ff1768237141p-25), + (double2)(0x1.3977b10000000p+0, 0x1.575f83051b085p-25), + (double2)(0x1.39b0910000000p+0, 0x1.2667deb523e29p-27), + (double2)(0x1.39e95c0000000p+0, 0x1.816996954f4fdp-30), + (double2)(0x1.3a22120000000p+0, 0x1.87cfccf4d9cd4p-26), + (double2)(0x1.3a5ab40000000p+0, 0x1.2c5d018198353p-26), + (double2)(0x1.3a93410000000p+0, 0x1.a7a898dcc34aap-25), + (double2)(0x1.3acbbb0000000p+0, 0x1.cead6dadc36d1p-29), + (double2)(0x1.3b04200000000p+0, 0x1.a55759c498bdfp-29), + (double2)(0x1.3b3c700000000p+0, 0x1.c414a9ef6de04p-25), + (double2)(0x1.3b74ad0000000p+0, 0x1.3e2108a6e58fap-25), + (double2)(0x1.3bacd60000000p+0, 0x1.587fd7643d77cp-26), + (double2)(0x1.3be4eb0000000p+0, 0x1.901eb1d3ff3dfp-28), + (double2)(0x1.3c1ceb0000000p+0, 0x1.f2ccd7c812fc6p-25), + (double2)(0x1.3c54d90000000p+0, 0x1.1c8ee70a01049p-29), + (double2)(0x1.3c8cb20000000p+0, 0x1.63e8d02831eecp-26), + (double2)(0x1.3cc4770000000p+0, 0x1.f61a42a92c7ffp-25), + (double2)(0x1.3cfc2a0000000p+0, 0x1.a917399c84d24p-34), + (double2)(0x1.3d33c80000000p+0, 0x1.e9197c8eec2f0p-26), + (double2)(0x1.3d6b530000000p+0, 0x1.e6f842f5a1378p-26), + (double2)(0x1.3da2cb0000000p+0, 0x1.fac242a90a0fcp-29), + (double2)(0x1.3dda2f0000000p+0, 0x1.35ed726610227p-26), + (double2)(0x1.3e11800000000p+0, 0x1.0e0d64804b15bp-26), + (double2)(0x1.3e48be0000000p+0, 0x1.560675daba814p-31), + (double2)(0x1.3e7fe80000000p+0, 0x1.37388c8768032p-25), + 
(double2)(0x1.3eb7000000000p+0, 0x1.ee3c89f9e01f5p-28), + (double2)(0x1.3eee040000000p+0, 0x1.39f6f0d09747cp-25), + (double2)(0x1.3f24f60000000p+0, 0x1.322c327abb8f0p-27), + (double2)(0x1.3f5bd40000000p+0, 0x1.961b347c8ac80p-25), + (double2)(0x1.3f92a00000000p+0, 0x1.3711fbbd0f118p-25), + (double2)(0x1.3fc9590000000p+0, 0x1.4fad8d7718ffbp-25), + (double2)(0x1.3fffff0000000p+0, 0x1.fffffffffffffp-25), + (double2)(0x1.4036930000000p+0, 0x1.67efa79ec35b4p-25), + (double2)(0x1.406d140000000p+0, 0x1.a737687a254a8p-25), + (double2)(0x1.40a3830000000p+0, 0x1.bace0f87d924dp-26), + (double2)(0x1.40d9df0000000p+0, 0x1.29e37c237e392p-25), + (double2)(0x1.4110290000000p+0, 0x1.57ce7ac3f3012p-26), + (double2)(0x1.4146600000000p+0, 0x1.82829359f8fbdp-25), + (double2)(0x1.417c850000000p+0, 0x1.cc9be42d14676p-25), + (double2)(0x1.41b2980000000p+0, 0x1.a8f001c137d0bp-25), + (double2)(0x1.41e8990000000p+0, 0x1.36127687dda05p-25), + (double2)(0x1.421e880000000p+0, 0x1.24dba322646f0p-26), + (double2)(0x1.4254640000000p+0, 0x1.dc43f1ed210b4p-25), + (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25) +}; + + +DECLARE_TABLE(double2, CBRT_REM_TBL, 5) = { + (double2)(0x1.428a2f0000000p-1, 0x1.31ae515c447bbp-26), + (double2)(0x1.965fea0000000p-1, 0x1.4f5b8f20ac166p-27), + (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), + (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25), + (double2)(0x1.965fea0000000p+0, 0x1.4f5b8f20ac166p-26), +}; + TABLE_FUNCTION(double2, ATAN_JBY256_TBL, atan_jby256_tbl); TABLE_FUNCTION(double2, TWO_TO_JBY64_EP, two_to_jby64_ep_tbl); +TABLE_FUNCTION(double2, SINH_TBL, sinh_tbl); +TABLE_FUNCTION(double2, COSH_TBL, cosh_tbl); +TABLE_FUNCTION(double, CBRT_INV_TBL, cbrt_inv_tbl); +TABLE_FUNCTION(double2, CBRT_DBL_TBL, cbrt_dbl_tbl); +TABLE_FUNCTION(double2, CBRT_REM_TBL, cbrt_rem_tbl); #endif // cl_khr_fp64 diff -Nru libclc-0.2.0+git20150813/generic/lib/math/tables.h libclc-0.2.0+git20170213/generic/lib/math/tables.h --- 
libclc-0.2.0+git20150813/generic/lib/math/tables.h 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/tables.h 2017-02-12 21:33:49.000000000 +0000 @@ -42,6 +42,10 @@ TABLE_FUNCTION_DECL(float, log_inv_tbl); TABLE_FUNCTION_DECL(float2, log2_tbl); TABLE_FUNCTION_DECL(uint4, pibits_tbl); +TABLE_FUNCTION_DECL(float2, sinhcosh_tbl); +TABLE_FUNCTION_DECL(float2, cbrt_tbl); +TABLE_FUNCTION_DECL(float, exp_tbl); +TABLE_FUNCTION_DECL(float2, exp_tbl_ep); #ifdef cl_khr_fp64 @@ -50,4 +54,10 @@ TABLE_FUNCTION_DECL(double2, ln_tbl); TABLE_FUNCTION_DECL(double2, atan_jby256_tbl); TABLE_FUNCTION_DECL(double2, two_to_jby64_ep_tbl); +TABLE_FUNCTION_DECL(double2, sinh_tbl); +TABLE_FUNCTION_DECL(double2, cosh_tbl); +TABLE_FUNCTION_DECL(double, cbrt_inv_tbl); +TABLE_FUNCTION_DECL(double2, cbrt_dbl_tbl); +TABLE_FUNCTION_DECL(double2, cbrt_rem_tbl); + #endif // cl_khr_fp64 diff -Nru libclc-0.2.0+git20150813/generic/lib/math/tanh.cl libclc-0.2.0+git20170213/generic/lib/math/tanh.cl --- libclc-0.2.0+git20150813/generic/lib/math/tanh.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/tanh.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float tanh(float x) +{ + // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent + // to the following three formulae: + // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) + // 2. (1 - (2/(exp(2*x) + 1 ))) + // 3. (exp(2*x) - 1)/(exp(2*x) + 1) + // but computationally, some formulae are better on some ranges. + + const float large_threshold = 0x1.0a2b24p+3f; + + uint ux = as_uint(x); + uint aux = ux & EXSIGNBIT_SP32; + uint xs = ux ^ aux; + + float y = as_float(aux); + float y2 = y*y; + + float a1 = mad(y2, + mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F), + -0.28192806108402678e0F); + float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F); + + float a2 = mad(y2, + mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F), + -0.24069858695196524e0F); + float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F); + + int c = y < 0.9f; + float a = c ? a1 : a2; + float b = c ? b1 : b2; + float zlo = mad(MATH_DIVIDE(a, b), y*y2, y); + + float p = exp(2.0f * y) + 1.0f; + float zhi = 1.0F - MATH_DIVIDE(2.0F, p); + + float z = y <= 1.0f ? zlo : zhi; + z = as_float(xs | as_uint(z)); + + // Edge cases + float sone = as_float(0x3f800000U | xs); + z = y > large_threshold ? sone : z; + z = aux < 0x39000000 | aux > 0x7f800000 ? 
x : z; + + return z; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, tanh, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double tanh(double x) +{ + // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent + // to the following three formulae: + // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) + // 2. (1 - (2/(exp(2*x) + 1 ))) + // 3. (exp(2*x) - 1)/(exp(2*x) + 1) + // but computationally, some formulae are better on some ranges. + + // The point at which e^-x is insignificant compared to e^x = ln(2^27) + const double large_threshold = 0x1.2b708872320e2p+4; + + ulong ux = as_ulong(x); + ulong ax = ux & ~SIGNBIT_DP64; + ulong sx = ux ^ ax; + double y = as_double(ax); + double y2 = y * y; + + // y < 0.9 + double znl = fma(y2, + fma(y2, + fma(y2, -0.142077926378834722618091e-7, -0.200047621071909498730453e-3), + -0.176016349003044679402273e-1), + -0.274030424656179760118928e0); + + double zdl = fma(y2, + fma(y2, + fma(y2, 0.2091140262529164482568557e-3, 0.201562166026937652780575e-1), + 0.381641414288328849317962e0), + 0.822091273968539282568011e0); + + // 0.9 <= y <= 1 + double znm = fma(y2, + fma(y2, + fma(y2, -0.115475878996143396378318e-7, -0.165597043903549960486816e-3), + -0.146173047288731678404066e-1), + -0.227793870659088295252442e0); + + double zdm = fma(y2, + fma(y2, + fma(y2, 0.173076050126225961768710e-3, 0.167358775461896562588695e-1), + 0.317204558977294374244770e0), + 0.683381611977295894959554e0); + + int c = y < 0.9; + double zn = c ? znl : znm; + double zd = c ? zdl : zdm; + double z = y + y*y2 * MATH_DIVIDE(zn, zd); + + // y > 1 + double p = exp(2.0 * y) + 1.0; + double zg = 1.0 - 2.0 / p; + + z = y > 1.0 ? zg : z; + + // Other cases + z = y < 0x1.0p-28 | ax > PINFBITPATT_DP64 ? x : z; + + z = y > large_threshold ? 
1.0 : z; + + return as_double(sx | as_ulong(z)); +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tanh, double); + +#endif // cl_khr_fp64 diff -Nru libclc-0.2.0+git20150813/generic/lib/math/tgamma.cl libclc-0.2.0+git20170213/generic/lib/math/tgamma.cl --- libclc-0.2.0+git20150813/generic/lib/math/tgamma.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/math/tgamma.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016 Aaron Watry + * Copyright (c) 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include + +#include "math.h" +#include "../clcmacro.h" + +_CLC_OVERLOAD _CLC_DEF float tgamma(float x) { + const float pi = 3.1415926535897932384626433832795f; + float ax = fabs(x); + float lg = lgamma(ax); + float g = exp(lg); + + if (x < 0.0f) { + float z = sinpi(x); + g = g * ax * z; + g = pi / g; + g = g == 0 ? 
as_float(PINFBITPATT_SP32) : g; + g = z == 0 ? as_float(QNANBITPATT_SP32) : g; + } + + return g; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, tgamma, float); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_OVERLOAD _CLC_DEF double tgamma(double x) { + const double pi = 3.1415926535897932384626433832795; + double ax = fabs(x); + double lg = lgamma(ax); + double g = exp(lg); + + if (x < 0.0) { + double z = sinpi(x); + g = g * ax * z; + g = pi / g; + g = g == 0 ? as_double(PINFBITPATT_DP64) : g; + g = z == 0 ? as_double(QNANBITPATT_DP64) : g; + } + + return g; +} + +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tgamma, double); + +#endif diff -Nru libclc-0.2.0+git20150813/generic/lib/shared/min.inc libclc-0.2.0+git20170213/generic/lib/shared/min.inc --- libclc-0.2.0+git20150813/generic/lib/shared/min.inc 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/shared/min.inc 2017-02-12 21:33:49.000000000 +0000 @@ -1,9 +1,9 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_GENTYPE b) { - return (a < b ? a : b); + return (b < a ? b : a); } #ifndef __CLC_SCALAR _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE min(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b) { - return (a < (__CLC_GENTYPE)b ? a : (__CLC_GENTYPE)b); + return (b < (__CLC_GENTYPE)a ? 
(__CLC_GENTYPE)b : a); } #endif diff -Nru libclc-0.2.0+git20150813/generic/lib/shared/vstore.cl libclc-0.2.0+git20170213/generic/lib/shared/vstore.cl --- libclc-0.2.0+git20150813/generic/lib/shared/vstore.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/shared/vstore.cl 2017-02-12 21:33:49.000000000 +0000 @@ -50,3 +50,48 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable VSTORE_ADDR_SPACES(double) #endif + +/* vstore_half are legal even without cl_khr_fp16 */ +#define DECLARE_HELPER(STYPE, AS) void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *); + +DECLARE_HELPER(float, __private); +DECLARE_HELPER(float, __global); +DECLARE_HELPER(float, __local); + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +DECLARE_HELPER(double, __private); +DECLARE_HELPER(double, __global); +DECLARE_HELPER(double, __local); +#endif + + +#define VEC_STORE1(STYPE, AS, val) __clc_vstore_half_##STYPE##_helper##AS (val, &mem[offset++]); +#define VEC_STORE2(STYPE, AS, val) \ + VEC_STORE1(STYPE, AS, val.lo) \ + VEC_STORE1(STYPE, AS, val.hi) +#define VEC_STORE3(STYPE, AS, val) \ + VEC_STORE1(STYPE, AS, val.s0) \ + VEC_STORE1(STYPE, AS, val.s1) \ + VEC_STORE1(STYPE, AS, val.s2) +#define VEC_STORE4(STYPE, AS, val) \ + VEC_STORE2(STYPE, AS, val.lo) \ + VEC_STORE2(STYPE, AS, val.hi) +#define VEC_STORE8(STYPE, AS, val) \ + VEC_STORE4(STYPE, AS, val.lo) \ + VEC_STORE4(STYPE, AS, val.hi) +#define VEC_STORE16(STYPE, AS, val) \ + VEC_STORE8(STYPE, AS, val.lo) \ + VEC_STORE8(STYPE, AS, val.hi) + +#define __FUNC(SUFFIX, VEC_SIZE, TYPE, STYPE, AS) \ + _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \ + offset *= VEC_SIZE; \ + VEC_STORE##VEC_SIZE(STYPE, AS, vec) \ + } + +#define FUNC(SUFFIX, VEC_SIZE, TYPE, STYPE, AS) __FUNC(SUFFIX, VEC_SIZE, TYPE, STYPE, AS) + +#define __CLC_BODY "vstore_half.inc" +#include + diff -Nru libclc-0.2.0+git20150813/generic/lib/shared/vstore_half_helpers.ll 
libclc-0.2.0+git20170213/generic/lib/shared/vstore_half_helpers.ll --- libclc-0.2.0+git20150813/generic/lib/shared/vstore_half_helpers.ll 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/shared/vstore_half_helpers.ll 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,35 @@ +define void @__clc_vstore_half_float_helper__private(float %data, half addrspace(0)* nocapture %ptr) nounwind alwaysinline { + %res = fptrunc float %data to half + store half %res, half addrspace(0)* %ptr + ret void +} + +define void @__clc_vstore_half_float_helper__global(float %data, half addrspace(1)* nocapture %ptr) nounwind alwaysinline { + %res = fptrunc float %data to half + store half %res, half addrspace(1)* %ptr + ret void +} + +define void @__clc_vstore_half_float_helper__local(float %data, half addrspace(3)* nocapture %ptr) nounwind alwaysinline { + %res = fptrunc float %data to half + store half %res, half addrspace(3)* %ptr + ret void +} + +define void @__clc_vstore_half_double_helper__private(double %data, half addrspace(0)* nocapture %ptr) nounwind alwaysinline { + %res = fptrunc double %data to half + store half %res, half addrspace(0)* %ptr + ret void +} + +define void @__clc_vstore_half_double_helper__global(double %data, half addrspace(1)* nocapture %ptr) nounwind alwaysinline { + %res = fptrunc double %data to half + store half %res, half addrspace(1)* %ptr + ret void +} + +define void @__clc_vstore_half_double_helper__local(double %data, half addrspace(3)* nocapture %ptr) nounwind alwaysinline { + %res = fptrunc double %data to half + store half %res, half addrspace(3)* %ptr + ret void +} diff -Nru libclc-0.2.0+git20150813/generic/lib/shared/vstore_half.inc libclc-0.2.0+git20170213/generic/lib/shared/vstore_half.inc --- libclc-0.2.0+git20150813/generic/lib/shared/vstore_half.inc 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/shared/vstore_half.inc 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,10 @@ + +#ifdef 
__CLC_VECSIZE + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private); + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local); + FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global); +#else + FUNC(, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __private); + FUNC(, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __local); + FUNC(, 1, __CLC_GENTYPE, __CLC_SCALAR_GENTYPE, __global); +#endif diff -Nru libclc-0.2.0+git20150813/generic/lib/SOURCES libclc-0.2.0+git20170213/generic/lib/SOURCES --- libclc-0.2.0+git20150813/generic/lib/SOURCES 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/SOURCES 2017-02-12 21:33:49.000000000 +0000 @@ -73,29 +73,40 @@ math/atan2pi.cl math/atanh.cl math/atanpi.cl +math/cbrt.cl math/copysign.cl math/cos.cl +math/cosh.cl math/cospi.cl math/ep_log.cl +math/erf.cl math/erfc.cl math/exp.cl math/exp_helper.cl +math/expm1.cl math/exp2.cl math/exp10.cl +math/fdim.cl math/fmax.cl math/fmin.cl math/fmod.cl math/fract.cl +math/frexp.cl math/half_rsqrt.cl math/half_sqrt.cl math/hypot.cl +math/ilogb.cl math/clc_ldexp.cl math/ldexp.cl +math/lgamma.cl +math/lgamma_r.cl math/log.cl math/log10.cl math/log1p.cl math/log2.cl +math/logb.cl math/mad.cl +math/modf.cl math/native_log.cl math/native_log2.cl math/tables.cl @@ -109,6 +120,8 @@ math/clc_sqrt.cl math/sqrt.cl math/tan.cl +math/tanh.cl +math/tgamma.cl relational/all.cl relational/any.cl relational/bitselect.cl @@ -131,5 +144,7 @@ shared/min.cl shared/vload.cl shared/vstore.cl +shared/vstore_half_helpers.ll workitem/get_global_id.cl workitem/get_global_size.cl +image/get_image_dim.cl diff -Nru libclc-0.2.0+git20150813/generic/lib/workitem/get_global_id.cl libclc-0.2.0+git20170213/generic/lib/workitem/get_global_id.cl --- libclc-0.2.0+git20150813/generic/lib/workitem/get_global_id.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/generic/lib/workitem/get_global_id.cl 2017-02-12 
21:33:49.000000000 +0000 @@ -1,5 +1,5 @@ #include _CLC_DEF size_t get_global_id(uint dim) { - return get_group_id(dim)*get_local_size(dim) + get_local_id(dim); + return get_group_id(dim) * get_local_size(dim) + get_local_id(dim) + get_global_offset(dim); } diff -Nru libclc-0.2.0+git20150813/.gitignore libclc-0.2.0+git20170213/.gitignore --- libclc-0.2.0+git20150813/.gitignore 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/.gitignore 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,14 @@ +Makefile +amdgcn-- +amdgcn--amdhsa +build/*.pyc +built_libs/ +generic-- +generic/lib/convert.cl +libclc.pc +nvptx--nvidiacl +nvptx64--nvidiacl +r600-- +utils/prepare-builtins +utils/prepare-builtins.o +utils/prepare-builtins.o.d diff -Nru libclc-0.2.0+git20150813/LICENSE.TXT libclc-0.2.0+git20170213/LICENSE.TXT --- libclc-0.2.0+git20150813/LICENSE.TXT 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/LICENSE.TXT 2017-02-12 21:33:49.000000000 +0000 @@ -11,7 +11,7 @@ ============================================================================== -Copyright (c) 2011-2014 by the contributors listed in CREDITS.TXT +Copyright (c) 2011-2016 by the contributors listed in CREDITS.TXT All rights reserved. 
diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/SOURCES libclc-0.2.0+git20170213/ptx-nvidiacl/lib/SOURCES --- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/SOURCES 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/SOURCES 2017-02-12 21:33:49.000000000 +0000 @@ -1,4 +1,5 @@ synchronization/barrier.cl +workitem/get_global_id.cl workitem/get_group_id.cl workitem/get_local_id.cl workitem/get_local_size.cl diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/synchronization/barrier.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/synchronization/barrier.cl --- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/synchronization/barrier.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/synchronization/barrier.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,7 +2,7 @@ _CLC_DEF void barrier(cl_mem_fence_flags flags) { if (flags & CLK_LOCAL_MEM_FENCE) { - __builtin_ptx_bar_sync(0); + __syncthreads(); } } diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_global_id.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_global_id.cl --- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_global_id.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_global_id.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,5 @@ +#include + +_CLC_DEF size_t get_global_id(uint dim) { + return get_group_id(dim) * get_local_size(dim) + get_local_id(dim); +} diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_group_id.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_group_id.cl --- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_group_id.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_group_id.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,9 +2,9 @@ _CLC_DEF size_t get_group_id(uint dim) { switch (dim) { - case 0: return __builtin_ptx_read_ctaid_x(); - case 1: return __builtin_ptx_read_ctaid_y(); - case 2: 
return __builtin_ptx_read_ctaid_z(); + case 0: return __nvvm_read_ptx_sreg_ctaid_x(); + case 1: return __nvvm_read_ptx_sreg_ctaid_y(); + case 2: return __nvvm_read_ptx_sreg_ctaid_z(); default: return 0; } } diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_local_id.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_local_id.cl --- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_local_id.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_local_id.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,9 +2,9 @@ _CLC_DEF size_t get_local_id(uint dim) { switch (dim) { - case 0: return __builtin_ptx_read_tid_x(); - case 1: return __builtin_ptx_read_tid_y(); - case 2: return __builtin_ptx_read_tid_z(); + case 0: return __nvvm_read_ptx_sreg_tid_x(); + case 1: return __nvvm_read_ptx_sreg_tid_y(); + case 2: return __nvvm_read_ptx_sreg_tid_z(); default: return 0; } } diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_local_size.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_local_size.cl --- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_local_size.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_local_size.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,9 +2,9 @@ _CLC_DEF size_t get_local_size(uint dim) { switch (dim) { - case 0: return __builtin_ptx_read_ntid_x(); - case 1: return __builtin_ptx_read_ntid_y(); - case 2: return __builtin_ptx_read_ntid_z(); + case 0: return __nvvm_read_ptx_sreg_ntid_x(); + case 1: return __nvvm_read_ptx_sreg_ntid_y(); + case 2: return __nvvm_read_ptx_sreg_ntid_z(); default: return 0; } } diff -Nru libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_num_groups.cl libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_num_groups.cl --- libclc-0.2.0+git20150813/ptx-nvidiacl/lib/workitem/get_num_groups.cl 2015-08-13 23:43:12.000000000 +0000 +++ 
libclc-0.2.0+git20170213/ptx-nvidiacl/lib/workitem/get_num_groups.cl 2017-02-12 21:33:49.000000000 +0000 @@ -2,9 +2,9 @@ _CLC_DEF size_t get_num_groups(uint dim) { switch (dim) { - case 0: return __builtin_ptx_read_nctaid_x(); - case 1: return __builtin_ptx_read_nctaid_y(); - case 2: return __builtin_ptx_read_nctaid_z(); + case 0: return __nvvm_read_ptx_sreg_nctaid_x(); + case 1: return __nvvm_read_ptx_sreg_nctaid_y(); + case 2: return __nvvm_read_ptx_sreg_nctaid_z(); default: return 0; } } diff -Nru libclc-0.2.0+git20150813/r600/lib/atomic/atomic.cl libclc-0.2.0+git20170213/r600/lib/atomic/atomic.cl --- libclc-0.2.0+git20150813/r600/lib/atomic/atomic.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/atomic/atomic.cl 1970-01-01 00:00:00.000000000 +0000 @@ -1,65 +0,0 @@ -#include - -#define ATOMIC_FUNC_DEFINE(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ -_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE val) { \ - return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)val); \ -} - -/* For atomic functions that don't need different bitcode dependending on argument signedness */ -#define ATOMIC_FUNC_SIGN(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ - _CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE); \ - ATOMIC_FUNC_DEFINE(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ - ATOMIC_FUNC_DEFINE(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) - -#define ATOMIC_FUNC_ADDRSPACE(TYPE, FUNCTION) \ - ATOMIC_FUNC_SIGN(TYPE, FUNCTION, global, 1) \ - ATOMIC_FUNC_SIGN(TYPE, FUNCTION, local, 3) - -#define ATOMIC_FUNC(FUNCTION) \ - ATOMIC_FUNC_ADDRSPACE(int, FUNCTION) - -#define ATOMIC_FUNC_DEFINE_3_ARG(RET_SIGN, ARG_SIGN, TYPE, CL_FUNCTION, CLC_FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ 
-_CLC_OVERLOAD _CLC_DEF RET_SIGN TYPE CL_FUNCTION (volatile CL_ADDRSPACE RET_SIGN TYPE *p, RET_SIGN TYPE cmp, RET_SIGN TYPE val) { \ - return (RET_SIGN TYPE)__clc_##CLC_FUNCTION##_addr##LLVM_ADDRSPACE((volatile CL_ADDRSPACE ARG_SIGN TYPE*)p, (ARG_SIGN TYPE)cmp, (ARG_SIGN TYPE)val); \ -} - -/* For atomic functions that don't need different bitcode dependending on argument signedness */ -#define ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ - _CLC_DECL signed TYPE __clc_##FUNCTION##_addr##LLVM_ADDRSPACE(volatile CL_ADDRSPACE signed TYPE*, signed TYPE, signed TYPE); \ - ATOMIC_FUNC_DEFINE_3_ARG(signed, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) \ - ATOMIC_FUNC_DEFINE_3_ARG(unsigned, signed, TYPE, FUNCTION, FUNCTION, CL_ADDRSPACE, LLVM_ADDRSPACE) - -#define ATOMIC_FUNC_ADDRSPACE_3_ARG(TYPE, FUNCTION) \ - ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, global, 1) \ - ATOMIC_FUNC_SIGN_3_ARG(TYPE, FUNCTION, local, 3) - -#define ATOMIC_FUNC_3_ARG(FUNCTION) \ - ATOMIC_FUNC_ADDRSPACE_3_ARG(int, FUNCTION) - -ATOMIC_FUNC(atomic_add) -ATOMIC_FUNC(atomic_and) -ATOMIC_FUNC(atomic_or) -ATOMIC_FUNC(atomic_sub) -ATOMIC_FUNC(atomic_xchg) -ATOMIC_FUNC(atomic_xor) -ATOMIC_FUNC_3_ARG(atomic_cmpxchg) - -_CLC_DECL signed int __clc_atomic_max_addr1(volatile global signed int*, signed int); -_CLC_DECL signed int __clc_atomic_max_addr3(volatile local signed int*, signed int); -_CLC_DECL uint __clc_atomic_umax_addr1(volatile global uint*, uint); -_CLC_DECL uint __clc_atomic_umax_addr3(volatile local uint*, uint); - -ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, global, 1) -ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_max, atomic_max, local, 3) -ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, global, 1) -ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_max, atomic_umax, local, 3) - -_CLC_DECL signed int __clc_atomic_min_addr1(volatile global signed int*, signed int); -_CLC_DECL signed int 
__clc_atomic_min_addr3(volatile local signed int*, signed int); -_CLC_DECL uint __clc_atomic_umin_addr1(volatile global uint*, uint); -_CLC_DECL uint __clc_atomic_umin_addr3(volatile local uint*, uint); - -ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, global, 1) -ATOMIC_FUNC_DEFINE(signed, signed, int, atomic_min, atomic_min, local, 3) -ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, global, 1) -ATOMIC_FUNC_DEFINE(unsigned, unsigned, int, atomic_min, atomic_umin, local, 3) diff -Nru libclc-0.2.0+git20150813/r600/lib/math/ldexp.cl libclc-0.2.0+git20170213/r600/lib/math/ldexp.cl --- libclc-0.2.0+git20150813/r600/lib/math/ldexp.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/math/ldexp.cl 1970-01-01 00:00:00.000000000 +0000 @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include - -#include "../../../generic/lib/clcmacro.h" - -#ifdef __HAS_LDEXPF__ -#define BUILTINF __builtin_amdgpu_ldexpf -#else -#include "math/clc_ldexp.h" -#define BUILTINF __clc_ldexp -#endif - -// This defines all the ldexp(floatN, intN) variants. -_CLC_DEFINE_BINARY_BUILTIN(float, ldexp, BUILTINF, float, int); - -#ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - // This defines all the ldexp(doubleN, intN) variants. - _CLC_DEFINE_BINARY_BUILTIN(double, ldexp, __builtin_amdgpu_ldexp, double, int); -#endif - -// This defines all the ldexp(GENTYPE, int); -#define __CLC_BODY <../../../generic/lib/math/ldexp.inc> -#include - -#undef BUILTINF diff -Nru libclc-0.2.0+git20150813/r600/lib/math/nextafter.cl libclc-0.2.0+git20170213/r600/lib/math/nextafter.cl --- libclc-0.2.0+git20150813/r600/lib/math/nextafter.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/math/nextafter.cl 1970-01-01 00:00:00.000000000 +0000 @@ -1,4 +0,0 @@ -#include -#include "../lib/clcmacro.h" - -_CLC_DEFINE_BINARY_BUILTIN(float, nextafter, __clc_nextafter, float, float) diff -Nru libclc-0.2.0+git20150813/r600/lib/math/sqrt.cl libclc-0.2.0+git20170213/r600/lib/math/sqrt.cl --- libclc-0.2.0+git20150813/r600/lib/math/sqrt.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/math/sqrt.cl 1970-01-01 00:00:00.000000000 +0000 @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2015 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include -#include "../../../generic/lib/clcmacro.h" -#include "math/clc_sqrt.h" - -_CLC_DEFINE_UNARY_BUILTIN(float, sqrt, __clc_sqrt, float) - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - - -_CLC_OVERLOAD _CLC_DEF double sqrt(double x) { - - uint vcc = x < 0x1p-767; - uint exp0 = vcc ? 0x100 : 0; - unsigned exp1 = vcc ? 0xffffff80 : 0; - - double v01 = ldexp(x, exp0); - double v23 = __builtin_amdgpu_rsq(v01); - double v45 = v01 * v23; - v23 = v23 * 0.5; - - double v67 = fma(-v23, v45, 0.5); - v45 = fma(v45, v67, v45); - double v89 = fma(-v45, v45, v01); - v23 = fma(v23, v67, v23); - v45 = fma(v89, v23, v45); - v67 = fma(-v45, v45, v01); - v23 = fma(v67, v23, v45); - - v23 = ldexp(v23, exp1); - return ((x == __builtin_inf()) || (x == 0.0)) ? 
v01 : v23; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sqrt, double); - -#endif diff -Nru libclc-0.2.0+git20150813/r600/lib/OVERRIDES libclc-0.2.0+git20170213/r600/lib/OVERRIDES --- libclc-0.2.0+git20150813/r600/lib/OVERRIDES 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/OVERRIDES 2017-02-12 21:33:49.000000000 +0000 @@ -1,2 +0,0 @@ -workitem/get_group_id.cl -workitem/get_global_size.cl diff -Nru libclc-0.2.0+git20150813/r600/lib/SOURCES libclc-0.2.0+git20170213/r600/lib/SOURCES --- libclc-0.2.0+git20150813/r600/lib/SOURCES 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/SOURCES 2017-02-12 21:33:49.000000000 +0000 @@ -1,12 +1,8 @@ -atomic/atomic.cl -math/ldexp.cl -math/nextafter.cl -math/sqrt.cl -workitem/get_num_groups.ll -workitem/get_group_id.ll -workitem/get_local_size.ll -workitem/get_local_id.ll -workitem/get_global_size.ll -workitem/get_work_dim.ll -synchronization/barrier.cl synchronization/barrier_impl.ll +workitem/get_global_offset.cl +workitem/get_group_id.cl +workitem/get_global_size.ll +workitem/get_local_id.cl +workitem/get_local_size.ll +workitem/get_num_groups.ll +workitem/get_work_dim.cl diff -Nru libclc-0.2.0+git20150813/r600/lib/synchronization/barrier.cl libclc-0.2.0+git20170213/r600/lib/synchronization/barrier.cl --- libclc-0.2.0+git20150813/r600/lib/synchronization/barrier.cl 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/synchronization/barrier.cl 1970-01-01 00:00:00.000000000 +0000 @@ -1,10 +0,0 @@ - -#include - -_CLC_DEF int __clc_clk_local_mem_fence() { - return CLK_LOCAL_MEM_FENCE; -} - -_CLC_DEF int __clc_clk_global_mem_fence() { - return CLK_GLOBAL_MEM_FENCE; -} diff -Nru libclc-0.2.0+git20150813/r600/lib/synchronization/barrier_impl.ll libclc-0.2.0+git20170213/r600/lib/synchronization/barrier_impl.ll --- libclc-0.2.0+git20150813/r600/lib/synchronization/barrier_impl.ll 2015-08-13 23:43:12.000000000 +0000 +++ 
libclc-0.2.0+git20170213/r600/lib/synchronization/barrier_impl.ll 2017-02-12 21:33:49.000000000 +0000 @@ -1,9 +1,8 @@ -declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline -declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate +declare i32 @__clc_clk_local_mem_fence() #1 +declare i32 @__clc_clk_global_mem_fence() #1 +declare void @llvm.r600.group.barrier() #0 -define void @barrier(i32 %flags) nounwind noduplicate alwaysinline { +define void @barrier(i32 %flags) #2 { barrier_local_test: %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence() %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE @@ -11,7 +10,7 @@ br i1 %1, label %barrier_local, label %barrier_global_test barrier_local: - call void @llvm.AMDGPU.barrier.local() noduplicate + call void @llvm.r600.group.barrier() br label %barrier_global_test barrier_global_test: @@ -21,9 +20,13 @@ br i1 %3, label %barrier_global, label %done barrier_global: - call void @llvm.AMDGPU.barrier.global() noduplicate + call void @llvm.r600.group.barrier() br label %done done: ret void } + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind alwaysinline } +attributes #2 = { nounwind convergent alwaysinline } diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_global_offset.cl libclc-0.2.0+git20170213/r600/lib/workitem/get_global_offset.cl --- libclc-0.2.0+git20150813/r600/lib/workitem/get_global_offset.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_global_offset.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +_CLC_DEF uint get_global_offset(uint dim) +{ + __attribute__((address_space(7))) uint * ptr = + (__attribute__((address_space(7))) uint *) + __builtin_r600_implicitarg_ptr(); + if (dim < 3) + return ptr[dim + 1]; + return 0; +} diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_global_size.ll 
libclc-0.2.0+git20170213/r600/lib/workitem/get_global_size.ll --- libclc-0.2.0+git20150813/r600/lib/workitem/get_global_size.ll 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_global_size.ll 2017-02-12 21:33:49.000000000 +0000 @@ -14,5 +14,5 @@ %z = call i32 @llvm.r600.read.global.size.z() nounwind readnone ret i32 %z default: - ret i32 0 + ret i32 1 } diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_group_id.cl libclc-0.2.0+git20170213/r600/lib/workitem/get_group_id.cl --- libclc-0.2.0+git20150813/r600/lib/workitem/get_group_id.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_group_id.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +_CLC_DEF uint get_group_id(uint dim) +{ + switch(dim) { + case 0: return __builtin_r600_read_tgid_x(); + case 1: return __builtin_r600_read_tgid_y(); + case 2: return __builtin_r600_read_tgid_z(); + default: return 1; + } +} diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_group_id.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_group_id.ll --- libclc-0.2.0+git20150813/r600/lib/workitem/get_group_id.ll 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_group_id.ll 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -declare i32 @llvm.r600.read.tgid.x() nounwind readnone -declare i32 @llvm.r600.read.tgid.y() nounwind readnone -declare i32 @llvm.r600.read.tgid.z() nounwind readnone - -define i32 @get_group_id(i32 %dim) nounwind readnone alwaysinline { - switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim] -x_dim: - %x = call i32 @llvm.r600.read.tgid.x() nounwind readnone - ret i32 %x -y_dim: - %y = call i32 @llvm.r600.read.tgid.y() nounwind readnone - ret i32 %y -z_dim: - %z = call i32 @llvm.r600.read.tgid.z() nounwind readnone - ret i32 %z -default: - ret i32 0 -} diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_local_id.cl 
libclc-0.2.0+git20170213/r600/lib/workitem/get_local_id.cl --- libclc-0.2.0+git20150813/r600/lib/workitem/get_local_id.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_local_id.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,11 @@ +#include + +_CLC_DEF uint get_local_id(uint dim) +{ + switch(dim) { + case 0: return __builtin_r600_read_tidig_x(); + case 1: return __builtin_r600_read_tidig_y(); + case 2: return __builtin_r600_read_tidig_z(); + default: return 1; + } +} diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_local_id.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_local_id.ll --- libclc-0.2.0+git20150813/r600/lib/workitem/get_local_id.ll 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_local_id.ll 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare i32 @llvm.r600.read.tidig.y() nounwind readnone -declare i32 @llvm.r600.read.tidig.z() nounwind readnone - -define i32 @get_local_id(i32 %dim) nounwind readnone alwaysinline { - switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim] -x_dim: - %x = call i32 @llvm.r600.read.tidig.x() nounwind readnone - ret i32 %x -y_dim: - %y = call i32 @llvm.r600.read.tidig.y() nounwind readnone - ret i32 %y -z_dim: - %z = call i32 @llvm.r600.read.tidig.z() nounwind readnone - ret i32 %z -default: - ret i32 0 -} diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_local_size.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_local_size.ll --- libclc-0.2.0+git20150813/r600/lib/workitem/get_local_size.ll 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_local_size.ll 2017-02-12 21:33:49.000000000 +0000 @@ -5,14 +5,14 @@ define i32 @get_local_size(i32 %dim) nounwind readnone alwaysinline { switch i32 %dim, label %default [i32 0, label %x_dim i32 1, label %y_dim i32 2, label %z_dim] x_dim: - %x = call 
i32 @llvm.r600.read.local.size.x() nounwind readnone + %x = call i32 @llvm.r600.read.local.size.x() ret i32 %x y_dim: - %y = call i32 @llvm.r600.read.local.size.y() nounwind readnone + %y = call i32 @llvm.r600.read.local.size.y() ret i32 %y z_dim: - %z = call i32 @llvm.r600.read.local.size.z() nounwind readnone + %z = call i32 @llvm.r600.read.local.size.z() ret i32 %z default: - ret i32 0 + ret i32 1 } diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_num_groups.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_num_groups.ll --- libclc-0.2.0+git20150813/r600/lib/workitem/get_num_groups.ll 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_num_groups.ll 2017-02-12 21:33:49.000000000 +0000 @@ -14,5 +14,5 @@ %z = call i32 @llvm.r600.read.ngroups.z() nounwind readnone ret i32 %z default: - ret i32 0 + ret i32 1 } diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_work_dim.cl libclc-0.2.0+git20170213/r600/lib/workitem/get_work_dim.cl --- libclc-0.2.0+git20150813/r600/lib/workitem/get_work_dim.cl 1970-01-01 00:00:00.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_work_dim.cl 2017-02-12 21:33:49.000000000 +0000 @@ -0,0 +1,9 @@ +#include + +_CLC_DEF uint get_work_dim() +{ + __attribute__((address_space(7))) uint * ptr = + (__attribute__((address_space(7))) uint *) + __builtin_r600_implicitarg_ptr(); + return ptr[0]; +} diff -Nru libclc-0.2.0+git20150813/r600/lib/workitem/get_work_dim.ll libclc-0.2.0+git20170213/r600/lib/workitem/get_work_dim.ll --- libclc-0.2.0+git20150813/r600/lib/workitem/get_work_dim.ll 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/r600/lib/workitem/get_work_dim.ll 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -declare i32 @llvm.AMDGPU.read.workdim() nounwind readnone - -define i32 @get_work_dim() nounwind readnone alwaysinline { - %x = call i32 @llvm.AMDGPU.read.workdim() nounwind readnone , !range !0 - ret i32 %x -} - -!0 = !{ i32 1, i32 4 } diff -Nru 
libclc-0.2.0+git20150813/utils/prepare-builtins.cpp libclc-0.2.0+git20170213/utils/prepare-builtins.cpp --- libclc-0.2.0+git20150813/utils/prepare-builtins.cpp 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/utils/prepare-builtins.cpp 2017-02-12 21:33:49.000000000 +0000 @@ -1,4 +1,5 @@ -#include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" @@ -24,7 +25,7 @@ cl::value_desc("filename")); int main(int argc, char **argv) { - LLVMContext &Context = getGlobalContext(); + LLVMContext Context; llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. cl::ParseCommandLineOptions(argc, argv, "libclc builtin preparation tool\n"); @@ -35,12 +36,13 @@ { ErrorOr> BufferOrErr = MemoryBuffer::getFile(InputFilename); - std::unique_ptr &BufferPtr = BufferOrErr.get(); - if (std::error_code ec = BufferOrErr.getError()) + if (std::error_code ec = BufferOrErr.getError()) { ErrorMessage = ec.message(); - else { + } else { + std::unique_ptr &BufferPtr = BufferOrErr.get(); ErrorOr> ModuleOrErr = - parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context); + expectedToErrorOrAndEmitErrors(Context, + parseBitcodeFile(BufferPtr.get()->getMemBufferRef(), Context)); if (std::error_code ec = ModuleOrErr.getError()) ErrorMessage = ec.message(); @@ -57,6 +59,13 @@ return 1; } + // Strip the OpenCL version metadata. There are a lot of linked + // modules in the library build, each spamming the same + // version. This may also report a different version than the user + // program is using. This should probably be uniqued when linking. + if (NamedMDNode *OCLVersion = M->getNamedMetadata("opencl.ocl.version")) + M->eraseNamedMetadata(OCLVersion); + // Set linkage of every external definition to linkonce_odr. 
for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { if (!i->isDeclaration() && i->getLinkage() == GlobalValue::ExternalLinkage) diff -Nru libclc-0.2.0+git20150813/www/index.html libclc-0.2.0+git20170213/www/index.html --- libclc-0.2.0+git20150813/www/index.html 2015-08-13 23:43:12.000000000 +0000 +++ libclc-0.2.0+git20170213/www/index.html 2017-02-12 21:33:49.000000000 +0000 @@ -37,8 +37,8 @@

-libclc currently only supports the PTX target, but support for more -targets is welcome. +libclc currently supports the AMDGCN, and R600 and NVPTX targets, but +support for more targets is welcome.

Download

@@ -49,7 +49,7 @@

Mailing List

-libclc-dev@pcc.me.uk (subscribe/unsubscribe, archives) +libclc-dev@lists.llvm.org (subscribe/unsubscribe, archives)