diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/appveyor.yml compyle-0.6/appveyor.yml --- compyle-0.6~dev0~20190922.gitaa5a50d/appveyor.yml 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/appveyor.yml 2020-06-14 18:54:39.000000000 +0000 @@ -1,9 +1,10 @@ environment: matrix: - - PYTHON: "C:\\Python27" - - PYTHON: "C:\\Python36-x64" - PYTHON: "C:\\Python37-x64" + - PYTHON: "C:\\Python38-x64" + +skip_branch_with_pr: true install: - "%PYTHON%\\python.exe -m pip install -r requirements.txt" diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/CHANGES.rst compyle-0.6/CHANGES.rst --- compyle-0.6~dev0~20190922.gitaa5a50d/CHANGES.rst 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/CHANGES.rst 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,23 @@ +0.6 +~~~~ + +* Release date: 15th June, 2020. +* Add some non-trivial examples showcasing the package. +* Document how one can use clang + OpenMP. +* Add sorting, align, and other functions to array module. +* Support for mapping structs on a GPU with CUDA. +* Add address, cast, and address low-level functions. +* Support for mako-templates for reducing repetitive code. +* Bitwise operator support. +* Attempt to auto-declare variables when possible. +* Fix several bugs and issues. + + + +0.5 +~~~~ + +* Release date: 3rd, December 2018 +* First public release. +* Support for elementwise, scan, and reduction operations on CPU and GPU using + Cython, OpenCL and CUDA. diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/array.py compyle-0.6/compyle/array.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/array.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/array.py 2020-06-14 18:54:39.000000000 +0000 @@ -6,6 +6,7 @@ from .types import (annotate, dtype_to_ctype, dtype_to_knowntype, knowntype_to_ctype) from .template import Template +from .sort import radix_sort try: @@ -118,7 +119,8 @@ @memoize(key=minmax_collector_key) -def make_collector_dtype(device, dtype, props, name, only_min, only_max, backend): +def make_collector_dtype(device, dtype, props, name, + only_min, only_max, backend): fields = [("pad", np.int32)] for prop in props: @@ -143,7 +145,7 @@ @memoize(key=lambda *args: (args[-3], args[-2], args[-1])) def get_minmax_kernel(ctx, dtype, inf, mmc_dtype, prop_names, - only_min, only_max, name, mmc_c_decl, backend): + only_min, only_max, name, mmc_c_decl, backend): tpl_args = ", ".join( ["%(dtype)s %(prop)s" % {'dtype': dtype, 'prop': prop} for prop in prop_names] @@ -157,7 +159,7 @@ ) mmc_c_decl_lines = mmc_c_decl.splitlines() mmc_c_decl_lines = mmc_c_decl_lines[:-2] + \ - mmc_overload.splitlines() + mmc_c_decl_lines[-2:] + mmc_overload.splitlines() + mmc_c_decl_lines[-2:] mmc_c_decl = '\n'.join(mmc_c_decl_lines) mmc_preamble = mmc_c_decl + minmax_tpl @@ -454,7 +456,10 @@ def sort_by_keys(ary_list, out_list=None, key_bits=None, - backend=None): + backend=None, use_radix_sort=False): + # FIXME: Need to use returned values, cuda backend uses + # thrust that will internally allocate a new array for storing + # the sorted data so out_list will not have the sorted arrays # first arg of ary_list is the key if backend is None: backend = ary_list[0].backend @@ -462,6 +467,12 @@ from .jit import get_ctype_from_arg from compyle.opencl import get_queue + if not out_list: + out_list = [ + Array(ary.dtype, allocate=False, backend=backend) + for ary in ary_list + ] + arg_types = [get_ctype_from_arg(arg) for arg in ary_list] sort_knl = get_cl_sort_kernel(arg_types, ary_list) @@ -469,12 +480,26 @@ arg_list = [ary.dev for ary in ary_list] - out_list, event = sort_knl(*arg_list, key_bits=key_bits, - allocator=allocator) + out_arrays, event = sort_knl(*arg_list, key_bits=key_bits, + allocator=allocator) + for i, out in enumerate(out_list): + out.set_data(out_arrays[i]) + return out_list + elif backend == 'cython' and use_radix_sort: + out_list, order = radix_sort(ary_list, out_list=out_list, + max_key_bits=key_bits, backend=backend) + return out_list + elif backend == 'cython': + order = wrap(np.argsort(ary_list[0].dev), backend=backend) + out_list = align(ary_list, order, out_list=out_list, + backend=backend) return out_list else: order = argsort(ary_list[0], backend=backend) - out_list = align(ary_list[1:], order, out_list=out_list, + modified_out_list = None + if out_list: + modified_out_list = out_list[1:] + out_list = align(ary_list[1:], order, out_list=modified_out_list, backend=backend) return [ary_list[0]] + out_list @@ -611,6 +636,7 @@ from .jit import get_ctype_from_arg key = [get_ctype_from_arg(ary) for ary in ary_list] key.append(backend) + key.append(get_config().use_openmp) return tuple(key) @@ -676,7 +702,7 @@ elif isinstance(key, Array): return self.align(key) # NOTE: Not sure about this, done for PyCUDA compatibility - if self.backend is not 'cython': + if self.backend != 'cython': return self.dev[key].get() else: return self.dev[key] @@ -816,7 +842,6 @@ else: update_minmax_gpu([self]) - def fill(self, value): self.dev.fill(value) diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/cython_generator.py compyle-0.6/compyle/cython_generator.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/cython_generator.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/cython_generator.py 2020-06-14 18:54:39.000000000 +0000 @@ -27,16 +27,20 @@ logger = logging.getLogger(__name__) -def get_parallel_range(start, stop=None, step=1): +def get_parallel_range(start, stop=None, step=1, **kwargs): config = get_config() if stop is None: stop = start start = 0 - args = "{start},{stop},{step}" + args = "{start}, {stop}, {step}" if config.use_openmp: schedule = config.omp_schedule[0] chunksize = config.omp_schedule[1] + if 'schedule' in kwargs: + schedule = kwargs.pop('schedule') + if 'chunksize' in kwargs: + chunksize = kwargs.pop('chunksize') if schedule is not None: args = args + ", schedule='{schedule}'" @@ -44,6 +48,9 @@ if chunksize is not None: args = args + ", chunksize={chunksize}" + for k, v in kwargs.items(): + args = args + ", %s=%r" % (k, v) + args = args.format(start=start, stop=stop, step=step, schedule=schedule, chunksize=chunksize) return "prange({})".format(args) @@ -195,10 +202,11 @@ def get_code(self): return self.code - def parse(self, obj, declarations=None): + def parse(self, obj, declarations=None, is_serial=False): obj_type = type(obj) if isinstance(obj, types.FunctionType): - self._parse_function(obj, declarations=declarations) + self._parse_function(obj, declarations=declarations, + is_serial=is_serial) elif hasattr(obj, '__class__'): self._parse_instance(obj) else: @@ -347,16 +355,22 @@ return methods - def _get_method_body(self, meth, lines, indent=' ' * 8, declarations=None): + def _get_method_body(self, meth, lines, indent=' ' * 8, declarations=None, + is_serial=False): getfullargspec = getattr( inspect, 'getfullargspec', inspect.getargspec ) args = set(getfullargspec(meth).args) - src = [self._process_body_line(line) for line in lines] + src = [self._process_body_line(line, is_serial=is_serial) + for line in lines] if declarations: cy_decls = [] for var, decl in declarations.items(): - cy_decls.append((var, indent + 'cdef %s\n' % decl[:-1])) + dtype, name = decl[:-1].split(' ') + if dtype[0] == 'u': + dtype = 'unsigned %s' % dtype[1:] + modified_decl = '%s %s' % (dtype, name) + cy_decls.append((var, indent + 'cdef %s\n' % modified_decl)) src = cy_decls + src declared = [] if not declarations else list(declarations.keys()) for names, defn in src: @@ -371,13 +385,15 @@ code = ''.join(declare) + cython_body return code - def _get_method_wrapper(self, meth, indent=' ' * 8, declarations=None): + def _get_method_wrapper(self, meth, indent=' ' * 8, declarations=None, + is_serial=False): sourcelines = getsourcelines(meth)[0] defn, lines = get_func_definition(sourcelines) m_name, returns, args = self._analyze_method(meth, lines) c_defn = self._get_c_method_spec(m_name, returns, args) c_body = self._get_method_body(meth, lines, indent=indent, - declarations=declarations) + declarations=declarations, + is_serial=is_serial) self.code = '{defn}\n{body}'.format(defn=c_defn, body=c_body) if self.python_methods: defn, body = self._get_py_method_spec(m_name, returns, args, @@ -451,9 +467,28 @@ defn = 'cdef {type} {name}'.format(type=ctype, name=name) return defn - def _parse_function(self, obj, declarations=None): + def _handle_cast_statement(self, name, call): + # FIXME: This won't handle casting to pointers + # using something like 'intp' + call_args = call[5:-1].split(',') + expr = call_args[0].strip() + ctype = call_args[1].strip()[1:-1] + return '%s = <%s> (%s)' % (name, ctype, expr) + + def _handle_atomic_statement(self, name, call, is_serial): + # FIXME: This won't handle casting to pointers + # using something like 'intp' + call_arg = call[11:-1].strip() + if self._config.use_openmp and not is_serial: + return['openmp.omp_set_lock(&cy_lock)', '%s = %s' % (name, call_arg), + '%s += 1' % call_arg, 'openmp.omp_unset_lock(&cy_lock)'] + else: + return ['%s = %s' % (name, call_arg), '%s += 1' % call_arg] + + def _parse_function(self, obj, declarations=None, is_serial=False): c_code, py_code = self._get_method_wrapper(obj, indent=' ' * 4, - declarations=declarations) + declarations=declarations, + is_serial=is_serial) code = '{defn}\n{body}'.format(defn=c_code[0], body=c_code[1]) if self.python_methods: code += '\n' @@ -469,7 +504,7 @@ methods=methods) self.code = helper.generate() - def _process_body_line(self, line): + def _process_body_line(self, line, is_serial=False): """Returns the name defined and the processed line itself. This hack primarily lets us declare variables from Python and inject @@ -484,6 +519,23 @@ defn = self._handle_declare_statement(name, declare) indent = line[:line.index(name)] return name, indent + defn + '\n' + elif words[1].startswith('cast') and \ + not line.strip().startswith('#'): + name = words[0] + call = words[1] + stmt = self._handle_cast_statement(name, call) + indent = line[:line.index(name)] + return '', indent + stmt + '\n' + elif words[1].startswith('atomic_inc') and \ + not line.strip().startswith('#'): + name = words[0] + call = words[1] + indent = line[:line.index(name)] + stmts = self._handle_atomic_statement(name, call, is_serial) + result = '' + for stmt in stmts: + result += indent + stmt + '\n' + return '', result + '\n' else: return '', line else: diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/ext_module.py compyle-0.6/compyle/ext_module.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/ext_module.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/ext_module.py 2020-06-14 18:54:39.000000000 +0000 @@ -4,13 +4,13 @@ from distutils.util import get_platform from distutils.errors import CompileError, LinkError import hashlib -import imp import importlib import io import logging import numpy import os -from os.path import dirname, exists, expanduser, isdir, join +from os.path import exists, expanduser, isdir, join +import platform from pyximport import pyxbuild import shutil import sys @@ -57,6 +57,23 @@ return unicode(s) +def get_openmp_flags(): + """Return the OpenMP flags for the platform. + + This returns two lists, [extra_compile_args], [extra_link_args] + """ + if sys.platform == 'win32': + return ['/openmp'], [] + elif sys.platform == 'darwin': + if (os.environ.get('CC') is not None and + os.environ.get('CXX') is not None): + return ['-fopenmp'], ['-fopenmp'] + else: + return ['-Xpreprocessor', '-fopenmp'], ['-lomp'] + else: + return ['-fopenmp'], ['-fopenmp'] + + class ExtModule(object): """Encapsulates the generated code, extension module etc. """ @@ -106,7 +123,7 @@ self.extra_link_args = extra_link_args if extra_link_args else [] def _add_local_include(self): - if sys.platform != 'win32': + if 'bsd' in platform.system().lower(): local = '/usr/local/include' if local not in self.extra_inc_dirs: self.extra_inc_dirs.append(local) @@ -262,16 +279,16 @@ Returns """ self.write_and_build() - file, path, desc = imp.find_module(self.name, [dirname(self.ext_path)]) - return imp.load_module(self.name, file, path, desc) + spec = importlib.util.spec_from_file_location(self.name, self.ext_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module def _get_extra_args(self): ec, el = self.extra_compile_args, self.extra_link_args if get_config().use_openmp: - if sys.platform == 'win32': - return ['/openmp'] + ec, [] + el - else: - return ['-fopenmp'] + ec, ['-fopenmp'] + el + _ec, _el = get_openmp_flags() + return _ec + ec, _el + el else: return ec, el diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/__init__.py compyle-0.6/compyle/__init__.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/__init__.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/__init__.py 2020-06-14 18:54:39.000000000 +0000 @@ -1,2 +1,2 @@ # See PEP 440 for more on suitable version numbers. -__version__ = '0.6.dev0' +__version__ = '0.6' diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/jit.py compyle-0.6/compyle/jit.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/jit.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/jit.py 2020-06-14 18:54:39.000000000 +0000 @@ -18,9 +18,24 @@ from . import parallel +def memoize_kernel(key=lambda *args: args): + def memoize_deco(method): + def wrapper(*args): + f = args[0].func + key_val = key(*args) + if not hasattr(f, 'cached_kernel'): + setattr(f, 'cached_kernel', {key_val: method(*args)}) + elif key_val not in f.cached_kernel: + f.cached_kernel[key_val] = method(*args) + return f.cached_kernel[key_val] + return wrapper + return memoize_deco + + def kernel_cache_key_args(obj, *args): key = [get_ctype_from_arg(arg) for arg in args] key.append(obj.func) + key.append(obj.name) key.append(obj.backend) key.append(obj._config.use_openmp) return tuple(key) @@ -205,6 +220,16 @@ names = [x.id for x in left.elts] for name in names: self.var_types[name] = self.get_type(type) + elif right.func.id == 'cast': + if not isinstance(right.args[1], ast.Str): + self.error("Cast type should be a string.", node) + type = right.args[1].s + if isinstance(left, ast.Name): + self.undecl_var_types[left.id] = self.get_type(type) + elif right.func.id == 'atomic_inc': + if left.id not in self.var_types and \ + left.id not in self.undecl_var_types: + self.undecl_var_types[left.id] = self.visit(right.args[0]) elif isinstance(left, ast.Name): if left.id not in self.var_types and \ left.id not in self.undecl_var_types: @@ -268,7 +293,7 @@ backend = array.get_backend(backend) self.tp = Transpiler(backend=backend) self.backend = backend - self.name = func.__name__ + self.name = 'elwise_%s' % func.__name__ self.func = func self._config = get_config() self.cython_gen = CythonGenerator() @@ -289,7 +314,7 @@ type_info[name] = arg_type return type_info - @memoize(key=kernel_cache_key_args) + @memoize_kernel(key=kernel_cache_key_args) def _generate_kernel(self, *args): if self.func is not None: arg_types = self.get_type_info_from_args(*args) @@ -364,7 +389,7 @@ type_info[name] = arg_type return type_info - @memoize(key=kernel_cache_key_args) + @memoize_kernel(key=kernel_cache_key_args) def _generate_kernel(self, *args): if self.func is not None: arg_types = self.get_type_info_from_args(*args) @@ -491,17 +516,18 @@ def __call__(self, **kwargs): c_func = self._generate_kernel(**kwargs) c_args_dict = {k: self._massage_arg(x) for k, x in kwargs.items()} + output_arg_keys = self.output_func.arg_keys[self._get_backend_key()] if self.backend == 'cython': - size = len(c_args_dict[self.output_func.arg_keys[1]]) + size = len(c_args_dict[output_arg_keys[1]]) c_args_dict['SIZE'] = size - c_func(*[c_args_dict[k] for k in self.output_func.arg_keys]) + c_func(*[c_args_dict[k] for k in output_arg_keys]) elif self.backend == 'opencl': - c_func(*[c_args_dict[k] for k in self.output_func.arg_keys]) + c_func(*[c_args_dict[k] for k in output_arg_keys]) self.queue.finish() elif self.backend == 'cuda': import pycuda.driver as drv event = drv.Event() - c_func(*[c_args_dict[k] for k in self.output_func.arg_keys]) + c_func(*[c_args_dict[k] for k in output_arg_keys]) event.record() event.synchronize() diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/low_level.py compyle-0.6/compyle/low_level.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/low_level.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/low_level.py 2020-06-14 18:54:39.000000000 +0000 @@ -319,11 +319,20 @@ pass +class _cast(Extern): + def code(self, backend): + return '' + + def __call__(self, *args, **kw): + pass + + prange = _prange() parallel = _parallel() nogil = _nogil() address = _address() atomic_inc = _atomic_inc() +cast = _cast() class Cython(object): diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/parallel.py compyle-0.6/compyle/parallel.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/parallel.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/parallel.py 2020-06-14 18:54:39.000000000 +0000 @@ -62,8 +62,8 @@ a, b = b, a%b return a -cdef int get_stride(sz, itemsize): - return sz/gcd(sz, itemsize) +cdef int get_stride(int sz, int itemsize): + return sz//gcd(sz, itemsize) cdef ${type} c_${name}(${c_arg_sig}): @@ -130,7 +130,7 @@ return a cdef int get_stride(int sz, int itemsize): - return sz / gcd(sz, itemsize) + return sz // gcd(sz, itemsize) cdef void c_${name}(${c_arg_sig}): @@ -175,7 +175,7 @@ # between threads % if not calc_last_item: # A chunk of 1 MB per thread - cdef int chunksize = 1048576 / sz + cdef int chunksize = 1048576 // sz % else: # Process all data together. Only then can we get # the last item immediately @@ -194,7 +194,7 @@ buffer_idx = tid * scan_stride start = offset + tid * chunksize - end = offset + min((tid + 1) * chunksize, SIZE) + end = min(offset + (tid + 1) * chunksize, SIZE) has_segment = 0 temp = ${neutral} @@ -281,7 +281,7 @@ carry = buffer[buffer_idx] start = offset + tid * chunksize - end = offset + min((tid + 1) * chunksize, SIZE) + end = min(offset + (tid + 1) * chunksize, SIZE) for i in range(start, end): # Output @@ -326,7 +326,6 @@ return c_${name}(${py_args}) ''' -# No support for last_item in single thread scan_cy_single_thread_template = ''' from cython.parallel import parallel, prange, threadid from libc.stdlib cimport abort, malloc, free @@ -338,6 +337,14 @@ cdef ${type} a, b, item N = SIZE + % if calc_last_item: + a = ${neutral} + for i in range(N): + b = ${input_expr} + a = ${scan_expr} + last_item = a + % endif + a = ${neutral} for i in range(N): @@ -375,12 +382,26 @@ return result +def serial(func=None, **kw): + """Decorator to specify serial execution of a cython + function + """ + def wrapper(func): + func.is_serial = True + return func + + if func is None: + return wrapper + else: + return wrapper(func) + + class ElementwiseBase(object): def __init__(self, func, backend=None): backend = array.get_backend(backend) self.tp = Transpiler(backend=backend) self.backend = backend - self.name = func.__name__ + self.name = 'elwise_%s' % func.__name__ self.func = func self._config = get_config() self.cython_gen = CythonGenerator() @@ -390,23 +411,25 @@ def _generate(self, declarations=None): self.tp.add(self.func, declarations=declarations) if self.backend == 'cython': + # FIXME: Handle the name of the kernel correctly py_data, c_data = self.cython_gen.get_func_signature(self.func) py_defn = ['long SIZE'] + py_data[0][1:] c_defn = ['long SIZE'] + c_data[0][1:] py_args = ['SIZE'] + py_data[1][1:] template = Template(text=elementwise_cy_template) src = template.render( - name=self.name, + name=self.name[7:], c_arg_sig=', '.join(c_defn), c_args=', '.join(c_data[1]), py_arg_sig=', '.join(py_defn), py_args=', '.join(py_args), - openmp=self._config.use_openmp, + openmp=self._config.use_openmp and not getattr( + self.func, 'is_serial', False), get_parallel_range=get_parallel_range ) self.tp.add_code(src) self.tp.compile() - return getattr(self.tp.mod, 'py_' + self.name) + return getattr(self.tp.mod, 'py_' + self.name[7:]) elif self.backend == 'opencl': py_data, c_data = self.cython_gen.get_func_signature(self.func) self._correct_opencl_address_space(c_data) @@ -428,7 +451,7 @@ ) knl = ElementwiseKernel( ctx, - name='elwise_%s' % self.name, + name=self.name, arguments=arguments, operation=expr, preamble="\n".join([cluda_preamble, preamble]) @@ -453,7 +476,7 @@ double_support=True ) knl = ElementwiseKernel( - name='elwise_%s' % self.name, + name=self.name, arguments=arguments, operation=expr, preamble="\n".join([cluda_preamble, preamble]) @@ -785,6 +808,9 @@ self.queue = None self.c_func = self._generate() + def _get_backend_key(self): + return (self.backend, self._config.use_openmp, self._config.use_double) + def _correct_return_type(self, c_data, modifier): code = self.tp.blocks[-1].code.splitlines() if self._config.use_openmp: @@ -839,10 +865,11 @@ input_expr = 'input[i]' return py_data, c_data, input_expr - def _wrap_cython_code(self, func, func_type=None): + def _wrap_cython_code(self, func, func_type=None, + declarations=None): name = self.name if func is not None: - self.tp.add(func) + self.tp.add(func, declarations=declarations) py_data, c_data = self.cython_gen.get_func_signature(func) self._correct_return_type(c_data, func_type) @@ -872,15 +899,18 @@ all_c_data = [[], []] # Process input function - py_data, c_data, input_expr = self._wrap_cython_code(self.input_func, - func_type='input') + py_data, c_data, input_expr = self._wrap_cython_code( + self.input_func, + func_type='input', + declarations=declarations + ) self._append_cython_arg_data(all_py_data, all_c_data, py_data, c_data) # Process segment function use_segment = True if self.is_segment_func is not None else False py_data, c_data, segment_expr = self._wrap_cython_code( - self.is_segment_func, func_type='segment') + self.is_segment_func, func_type='segment', declarations=declarations) self._append_cython_arg_data(all_py_data, all_c_data, py_data, c_data) # Process output expression @@ -888,7 +918,8 @@ calc_prev_item = False py_data, c_data, output_expr = self._wrap_cython_code( - self.output_func, func_type='output') + self.output_func, func_type='output', + declarations=declarations) if self.output_func is not None: calc_last_item = self._include_last_item() calc_prev_item = self._include_prev_item() @@ -906,7 +937,9 @@ py_args = drop_duplicates(py_args) c_args = drop_duplicates(c_args) - self.output_func.arg_keys = c_args + if not hasattr(self.output_func, 'arg_keys'): + self.output_func.arg_keys = {} + self.output_func.arg_keys[self._get_backend_key()] = c_args if self._config.use_openmp: template = Template(text=scan_cy_template) @@ -991,7 +1024,9 @@ c_args = input_c_args + segment_c_args + output_c_args c_args = drop_duplicates(c_args) - self.output_func.arg_keys = c_args + if not hasattr(self.output_func, 'arg_keys'): + self.output_func.arg_keys = {} + self.output_func.arg_keys[self._get_backend_key()] = c_args return scan_expr, arg_defn, input_expr, output_expr, \ segment_expr, preamble @@ -1075,18 +1110,19 @@ def __call__(self, **kwargs): c_args_dict = {k: self._massage_arg(x) for k, x in kwargs.items()} + output_arg_keys = self.output_func.arg_keys[self._get_backend_key()] if self.backend == 'cython': - size = len(c_args_dict[self.output_func.arg_keys[1]]) + size = len(c_args_dict[output_arg_keys[1]]) c_args_dict['SIZE'] = size - self.c_func(*[c_args_dict[k] for k in self.output_func.arg_keys]) + self.c_func(*[c_args_dict[k] for k in output_arg_keys]) elif self.backend == 'opencl': - self.c_func(*[c_args_dict[k] for k in self.output_func.arg_keys]) + self.c_func(*[c_args_dict[k] for k in output_arg_keys]) self.queue.finish() elif self.backend == 'cuda': import pycuda.driver as drv event = drv.Event() - self.c_func(*[c_args_dict[k] for k in self.output_func.arg_keys]) + self.c_func(*[c_args_dict[k] for k in output_arg_keys]) event.record() event.synchronize() @@ -1095,6 +1131,7 @@ def __init__(self, input=None, output=None, scan_expr="a+b", is_segment=None, dtype=np.float64, neutral='0', complex_map=False, backend=None): + # FIXME: Revisit these conditions input_base = input is None or \ getattr(input, '__annotations__', None) and \ not hasattr(input, 'is_jit') diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/sort.py compyle-0.6/compyle/sort.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/sort.py 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/compyle/sort.py 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,88 @@ +import numpy as np + +from .config import get_config +from .cython_generator import get_parallel_range, CythonGenerator +from .transpiler import Transpiler, convert_to_float_if_needed +from .types import dtype_to_ctype, annotate +from .parallel import Scan +from .template import Template + +from . import array + + +class OutputSortBit(Template): + def __init__(self, name, num_arys): + super(OutputSortBit, self).__init__(name=name) + self.num_arys = num_arys + + def extra_args(self): + args = ['inp_%s' % num for num in range(self.num_arys)] + args += ['out_%s' % num for num in range(self.num_arys)] + return args, {} + + def template(self, i, item, prev_item, last_item, bit_number, indices, + sorted_indices): + ''' + key_bit = (inp_0[i] >> bit_number) & 1 + t = last_item + i - prev_item + idx = t if key_bit else prev_item + + sorted_indices[idx] = indices[i] + % for num in range(obj.num_arys): + out_${num}[idx] = inp_${num}[i] + % endfor + ''' + + +@annotate +def input_sort_bit(i, inp_0, bit_number): + return 1 if (inp_0[i] >> bit_number) & 1 == 0 else 0 + + +def radix_sort(ary_list, out_list=None, max_key_bits=None, backend=None): + keys = ary_list[0] + backend = array.get_backend(backend) + if not np.issubdtype(keys.dtype, np.integer): + raise ValueError("RadixSort can only sort integer types") + if max_key_bits is None: + max_key_bits = 8 * keys.dtype.itemsize + + # temp arrays + sorted_indices = array.zeros(keys.length, np.int32, backend=backend) + temp_indices = array.zeros_like(sorted_indices) + + indices = array.arange(0, keys.length, 1, backend=backend) + + # allocate temp arrays + if out_list: + temp_ary_list = out_list + else: + temp_ary_list = [array.zeros_like(ary) for ary in ary_list] + sorted_ary_list = [array.zeros_like(ary) for ary in ary_list] + + # kernel + output_sort_bit = OutputSortBit('output_sort_bit', len(ary_list)) + + sort_bit_knl = Scan(input_sort_bit, output_sort_bit.function, + 'a+b', dtype=keys.dtype, backend=backend) + + for bit_number in range(max_key_bits): + if bit_number == 0: + inp_indices = indices + inp_ary_list = ary_list + else: + inp_indices = temp_indices + inp_ary_list = temp_ary_list + + args = {'bit_number': bit_number, 'indices': indices, + 'sorted_indices': sorted_indices} + args.update({'inp_%i' % i: ary for i, ary in enumerate(inp_ary_list)}) + args.update({'out_%i' % + i: ary for i, ary in enumerate(sorted_ary_list)}) + + sort_bit_knl(**args) + + temp_indices, sorted_indices = sorted_indices, temp_indices + temp_ary_list, sorted_ary_list = sorted_ary_list, temp_ary_list + + return temp_ary_list, temp_indices diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_array.py compyle-0.6/compyle/tests/test_array.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_array.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/tests/test_array.py 2020-06-14 18:54:39.000000000 +0000 @@ -2,11 +2,12 @@ import numpy as np from ..array import Array +from ..config import get_config import compyle.array as array -test_all_backends = pytest.mark.parametrize('backend', - ['cython', 'opencl', 'cuda']) +check_all_backends = pytest.mark.parametrize('backend', + ['cython', 'opencl', 'cuda']) def make_dev_array(backend, n=16): @@ -23,7 +24,7 @@ pytest.importorskip('pycuda') -@test_all_backends +@check_all_backends def test_reserve(backend): check_import(backend) @@ -39,7 +40,7 @@ assert dev_array[0] == 1 -@test_all_backends +@check_all_backends def test_resize_with_reallocation(backend): check_import(backend) @@ -55,7 +56,7 @@ assert dev_array[0] == 1 -@test_all_backends +@check_all_backends def test_resize_without_reallocation(backend): check_import(backend) @@ -71,7 +72,7 @@ assert dev_array[0] == 1 -@test_all_backends +@check_all_backends def test_copy(backend): check_import(backend) @@ -89,7 +90,7 @@ assert dev_array[0] != dev_array_copy[0] -@test_all_backends +@check_all_backends def test_append_with_reallocation(backend): check_import(backend) @@ -104,7 +105,7 @@ assert len(dev_array.get_data()) == 32 -@test_all_backends +@check_all_backends def test_append_without_reallocation(backend): check_import(backend) @@ -120,7 +121,7 @@ assert len(dev_array.get_data()) == 20 -@test_all_backends +@check_all_backends def test_extend(backend): check_import(backend) @@ -137,7 +138,7 @@ assert np.all(old_nparr[-len(new_array)] == new_nparr) -@test_all_backends +@check_all_backends def test_remove(backend): check_import(backend) @@ -155,7 +156,7 @@ assert np.all(dev_array.get() == (8 + indices).get()) -@test_all_backends +@check_all_backends def test_align(backend): check_import(backend) @@ -172,7 +173,7 @@ assert np.all(dev_array.get() == indices.get()) -@test_all_backends +@check_all_backends def test_align_multiple(backend): check_import(backend) @@ -196,7 +197,7 @@ assert np.all(dev_array_b.get() - 1024 == indices.get()) -@test_all_backends +@check_all_backends def test_squeeze(backend): check_import(backend) @@ -213,7 +214,7 @@ assert dev_array.alloc == 16 -@test_all_backends +@check_all_backends def test_copy_values(backend): check_import(backend) @@ -231,7 +232,7 @@ assert np.all(dev_array[:len(indices)].get() == dest.get()) -@test_all_backends +@check_all_backends def test_min_max(backend): check_import(backend) @@ -248,21 +249,75 @@ assert dev_array.maximum == 10 -@test_all_backends +@check_all_backends def test_sort_by_keys(backend): check_import(backend) # Given - dev_array = make_dev_array(backend) + nparr1 = np.random.randint(0, 100, 16, dtype=np.int32) + nparr2 = np.random.randint(0, 100, 16, dtype=np.int32) + dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend) + + # When + out_array1, out_array2 = array.sort_by_keys([dev_array1, dev_array2]) + + # Then + order = np.argsort(nparr1) + act_result1 = np.take(nparr1, order) + act_result2 = np.take(nparr2, order) + assert np.all(out_array1.get() == act_result1) + assert np.all(out_array2.get() == act_result2) + + +def test_radix_sort_by_keys(): + backend = 'cython' + for use_openmp in [True, False]: + get_config().use_openmp = use_openmp + # Given + nparr1 = np.random.randint(0, 100, 16, dtype=np.int32) + nparr2 = np.random.randint(0, 100, 16, dtype=np.int32) + dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend) + + # When + out_array1, out_array2 = array.sort_by_keys([dev_array1, dev_array2], + use_radix_sort=True) + + # Then + order = np.argsort(nparr1) + act_result1 = np.take(nparr1, order) + act_result2 = np.take(nparr2, order) + assert np.all(out_array1.get() == act_result1) + assert np.all(out_array2.get() == act_result2) + get_config().use_openmp = False + + +@pytest.mark.parametrize( + 'backend', ['cython', 'opencl', + pytest.param('cuda', marks=pytest.mark.xfail)]) +def test_sort_by_keys_with_output(backend): + check_import(backend) + + # Given + nparr1 = np.random.randint(0, 100, 16, dtype=np.int32) + nparr2 = np.random.randint(0, 100, 16, dtype=np.int32) + dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend) + out_arrays = [ + array.zeros_like(dev_array1), + array.zeros_like(dev_array2)] # When - out_array = array.sort_by_keys([dev_array])[0] + array.sort_by_keys([dev_array1, dev_array2], + out_list=out_arrays, use_radix_sort=False) # Then - assert np.all(out_array.get() == np.sort(dev_array.get())) + order = np.argsort(nparr1) + act_result1 = np.take(nparr1, order) + act_result2 = np.take(nparr2, order) + assert np.all(out_arrays[0].get() == act_result1) + assert np.all(out_arrays[1].get() == act_result2) -@test_all_backends +@check_all_backends def test_dot(backend): check_import(backend) diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_cython_generator.py compyle-0.6/compyle/tests/test_cython_generator.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_cython_generator.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/tests/test_cython_generator.py 2020-06-14 18:54:39.000000000 +0000 @@ -6,10 +6,10 @@ import sys -from ..config import get_config, set_config +from ..config import get_config, set_config, use_config from ..types import declare, KnownType, annotate from ..cython_generator import (CythonGenerator, CythonClassHelper, - all_numeric) + all_numeric, get_parallel_range) class BasicEq: @@ -123,6 +123,81 @@ """) self.assert_code_equal(c.generate().strip(), expect.strip()) + def test_get_parallel_range_without_openmp(self): + with use_config(use_openmp=False): + # Given/When + res = get_parallel_range('NP') + # Then + self.assertEqual(res, 'range(0, NP, 1)') + + # Given/When + res = get_parallel_range('START', 'NP') + # Then + self.assertEqual(res, 'range(START, NP, 1)') + + # Given/When + res = get_parallel_range('NP', step=2) + # Then + self.assertEqual(res, 'range(0, NP, 2)') + + # Given/When + res = get_parallel_range(1, 'NP+1', 2) + # Then + self.assertEqual(res, 'range(1, NP+1, 2)') + + def test_get_parallel_range_with_openmp(self): + with use_config(use_openmp=True): + cfg = get_config() + sched, chunk = cfg.omp_schedule + # Given/When + res = get_parallel_range('NP') + # Then + expect = "prange(0, NP, 1, schedule='{}', chunksize={})".format( + sched, chunk + ) + self.assertEqual(res, expect) + + # Given/When + res = get_parallel_range('START', 'NP', 2) + # Then + expect = ( + "prange(START, NP, 2, schedule='{}', chunksize={})".format( + sched, chunk + ) + ) + self.assertEqual(res, expect) + + # Given/When + res = get_parallel_range('NP', nogil=True) + # Then + expect = ( + "prange(0, NP, 1, schedule='{}', chunksize={}, " + "nogil=True)".format( + sched, chunk + ) + ) + self.assertEqual(res, expect) + + # Given/When + res = get_parallel_range('NP', nogil=True, num_threads=4) + # Then + expect = ( + "prange(0, NP, 1, schedule='{}', chunksize={}, " + "nogil=True, num_threads=4)".format( + sched, chunk + ) + ) + self.assertEqual(res, expect) + + with use_config(use_openmp=True, omp_schedule=('static', 32)): + # Given/When + res = get_parallel_range('NP') + # Then + expect = "prange(0, NP, 1, schedule='{}', chunksize={})".format( + 'static', 32 + ) + self.assertEqual(res, expect) + class TestCythonCodeGenerator(TestBase): def setUp(self): diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_ext_module.py compyle-0.6/compyle/tests/test_ext_module.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_ext_module.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/tests/test_ext_module.py 2020-06-14 18:54:39.000000000 +0000 @@ -61,6 +61,7 @@ def setUp(self): self.root = tempfile.mkdtemp() self.data = dedent('''\ + # cython: language_level=3 def f(): return "hello world" ''') @@ -112,7 +113,7 @@ self.assertTrue(exists(s.ext_path)) def _create_dummy_module(self): - code = "def hello(): return 'hello'" + code = "# cython: language_level=3\ndef hello(): return 'hello'" modname = 'test_rebuild.py' f = join(self.root, modname) with open(f, 'w') as fp: diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_low_level.py compyle-0.6/compyle/tests/test_low_level.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_low_level.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/tests/test_low_level.py 2020-06-14 18:54:39.000000000 +0000 @@ -147,6 +147,14 @@ y[i] = x[i] * a +@annotate(int='num, return_') +def _factorial(num): + if num == 0: + return 1 + else: + return num*_factorial(num - 1) + + class TestCython(unittest.TestCase): def test_cython_code_with_return_and_nested_call(self): # Given @@ -177,3 +185,12 @@ # Then self.assertTrue(np.allclose(y, x * a)) + + def test_recursive_function(self): + # Given/when + fac = Cython(_factorial) + + # Then + self.assertEqual(fac(0), 1) + self.assertEqual(fac(1), 1) + self.assertEqual(fac(3), 6) diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_parallel.py compyle-0.6/compyle/tests/test_parallel.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_parallel.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/tests/test_parallel.py 2020-06-14 18:54:39.000000000 +0000 @@ -5,9 +5,10 @@ from pytest import importorskip from ..config import get_config, use_config -from ..array import wrap +from ..array import wrap, zeros from ..types import annotate from ..parallel import Elementwise, Reduction, Scan +from ..low_level import atomic_inc from .test_jit import g MY_CONST = 42 @@ -94,6 +95,14 @@ with use_config(use_openmp=True): self._test_scan(backend='cython') + def test_large_scan_works_cython_parallel(self): + with use_config(use_openmp=True): + self._test_large_scan(backend='cython') + + def test_large_scan_works_cython_parallel(self): + with use_config(use_openmp=True): + self._test_large_scan(backend='cython') + def test_scan_works_opencl(self): importorskip('pyopencl') self._test_scan(backend='opencl') @@ -160,6 +169,9 @@ with use_config(use_openmp=True): self._test_scan_last_item(backend='cython') + def test_scan_last_item_cython_serial(self): + self._test_scan_last_item(backend='cython') + def test_scan_last_item_opencl(self): importorskip('pyopencl') self._test_scan_last_item(backend='opencl') @@ -168,6 +180,21 @@ importorskip('pycuda') self._test_scan_last_item(backend='cuda') + def test_atomic_inc_cython(self): + self._test_atomic_inc(backend='cython') + + def test_atomic_inc_cython_parallel(self): + with use_config(use_openmp=True): + self._test_atomic_inc(backend='cython') + + def test_atomic_inc_opencl(self): + importorskip('pyopencl') + self._test_atomic_inc(backend='opencl') + + def test_atomic_inc_cuda(self): + importorskip('pycuda') + self._test_atomic_inc(backend='cuda') + class TestParallelUtils(ParallelUtilsBase, unittest.TestCase): def setUp(self): @@ -298,6 +325,33 @@ # Then np.testing.assert_equal(expect, result) + def _test_large_scan(self, backend): + # Given + a = np.ones(3000000, dtype=np.int32) + data = a.copy() + expect = np.cumsum(data) + + a = wrap(a, backend=backend) + + @annotate(i='int', ary='intp', return_='int') + def input_f(i, ary): + return ary[i] + + @annotate(int='i, item', ary='intp') + def output_f(i, item, ary): + ary[i] = item + + # When + scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, + backend=backend) + scan(ary=a) + + a.pull() + result = a.data + + # Then + np.testing.assert_equal(expect, result) + def _test_scan_with_external_func(self, backend): # Given a = np.arange(10000, dtype=np.int32) @@ -426,6 +480,25 @@ # Then np.testing.assert_equal(expect, a.data) + def _test_atomic_inc(self, backend): + # Given + a = np.random.randint(0, 100, 50000, dtype=np.int32) + result = zeros(1, dtype=np.int32, backend=backend) + + a = wrap(a, backend=backend) + + @annotate(gintp='x, result', i='int') + def reduce_knl(i, x, result): + b = declare('int') + b = atomic_inc(result[0]) + + # When + knl = Elementwise(reduce_knl, backend=backend) + knl(a, result) + + # Then + self.assertTrue(result[0] == 50000) + class TestParallelUtilsJIT(ParallelUtilsBase, unittest.TestCase): def setUp(self): @@ -558,6 +631,33 @@ # Then np.testing.assert_equal(expect, result) + def _test_large_scan(self, backend): + # Given + a = np.ones(3000000, dtype=np.int32) + data = a.copy() + expect = np.cumsum(data) + + a = wrap(a, backend=backend) + + @annotate + def input_f(i, ary): + return ary[i] + + @annotate + def output_f(i, item, ary): + ary[i] = item + + # When + scan = Scan(input_f, output_f, 'a+b', dtype=np.int32, + backend=backend) + scan(ary=a) + + a.pull() + result = a.data + + # Then + np.testing.assert_equal(expect, result) + def _test_scan_with_external_func(self, backend): # Given a = np.arange(10000, dtype=np.int32) @@ -684,3 +784,22 @@ # Then np.testing.assert_equal(expect, a.data) + + def _test_atomic_inc(self, backend): + # Given + a = np.random.randint(0, 100, 50000, dtype=np.int32) + result = zeros(1, dtype=np.int32, backend=backend) + + a = wrap(a, backend=backend) + + @annotate + def reduce_knl(i, x, result): + b = declare('int') + b = atomic_inc(result[0]) + + # When + knl = Elementwise(reduce_knl, backend=backend) + knl(a, result) + + # Then + self.assertTrue(result[0] == 50000) diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_translator.py compyle-0.6/compyle/tests/test_translator.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_translator.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/tests/test_translator.py 2020-06-14 18:54:39.000000000 +0000 @@ -1367,3 +1367,36 @@ ''') assert code.strip() == expect.strip() + +def test_cast_works(): + # Given + def f(x=1.0): + return cast(x, "float") + + # When + t = OpenCLConverter() + code = t.parse_function(f) + + # Then + expect = dedent(''' + WITHIN_KERNEL double f(double x) + { + return (float) (x); + } + ''') + + assert code.strip() == expect.strip() + + # When + t = CUDAConverter() + code = t.parse_function(f) + + # Then + expect = dedent(''' + WITHIN_KERNEL double f(double x) + { + return (float) (x); + } + ''') + + assert code.strip() == expect.strip() diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_transpiler.py compyle-0.6/compyle/tests/test_transpiler.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_transpiler.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/tests/test_transpiler.py 2020-06-14 18:54:39.000000000 +0000 @@ -37,6 +37,13 @@ foo(x) +def _factorial(num): + if num == 0: + return 1 + else: + return num*_factorial(num - 1) + + class TestTranspiler(unittest.TestCase): def test_get_external_symbols_and_calls(self): # Given/When @@ -65,6 +72,18 @@ self.assertRaises(NameError, get_external_symbols_and_calls, undefined_call, 'cython') + def test_get_external_symbols_and_calls_handles_recursion(self): + # Given/When + syms, implicit, calls, ext = get_external_symbols_and_calls( + _factorial, 'cython' + ) + + # Then + self.assertEqual(syms, {}) + self.assertEqual(calls, []) + self.assertEqual(implicit, set()) + self.assertEqual(ext, []) + def test_transpiler(self): # Given t = Transpiler(backend='cython') diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/translator.py compyle-0.6/compyle/translator.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/translator.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/translator.py 2020-06-14 18:54:39.000000000 +0000 @@ -384,6 +384,8 @@ return '(&%s)' % self.visit(node.args[0]) elif 'atomic' in node.func.id: return self.render_atomic(node.func.id, node.args[0]) + elif node.func.id == 'cast': + return '(%s) (%s)' % (node.args[1].s, self.visit(node.args[0])) else: return '{func}({args})'.format( func=node.func.id, diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/transpiler.py compyle-0.6/compyle/transpiler.py --- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/transpiler.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/compyle/transpiler.py 2020-06-14 18:54:39.000000000 +0000 @@ -69,6 +69,8 @@ names, calls = get_unknown_names_and_calls(src) names -= ignore calls = filter_calls(calls) + if func.__name__ in calls: + calls.remove(func.__name__) mod = importlib.import_module(func.__module__) symbols = {} implicit = set() @@ -140,11 +142,21 @@ if backend == 'cython': self._cgen = CythonGenerator() self.header = dedent(''' + # cython: language_level=3 from libc.stdio cimport printf from libc.math cimport * from libc.math cimport fabs as abs from cython.parallel import parallel, prange ''') + + if get_config().use_openmp: + self.header += dedent(''' + cimport openmp + + cdef openmp.omp_lock_t cy_lock + openmp.omp_init_lock(&cy_lock) + ''') + elif backend == 'opencl': from pyopencl._cluda import CLUDA_PREAMBLE self._cgen = OpenCLConverter() @@ -257,9 +269,10 @@ self._handle_external(obj, declarations=declarations) if self.backend == 'cython': + is_serial = getattr(obj, 'is_serial', False) self._cgen.parse( obj, declarations=declarations.get(obj.__name__) - if declarations else None) + if declarations else None, is_serial=is_serial) code = self._cgen.get_code() elif self.backend == 'opencl' or self.backend == 'cuda': code = self._cgen.parse( diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/debian/changelog compyle-0.6/debian/changelog --- compyle-0.6~dev0~20190922.gitaa5a50d/debian/changelog 2020-01-01 19:54:59.000000000 +0000 +++ compyle-0.6/debian/changelog 2020-06-16 06:31:52.000000000 +0000 @@ -1,3 +1,12 @@ +compyle (0.6-1) unstable; urgency=medium + + * New upstream version. + * Standards version bumped to 4.5.0 (no changes). + * debian/patches: + - refresh all patches + + -- Antonio Valentino Tue, 16 Jun 2020 06:31:52 +0000 + compyle (0.6~dev0~20190922.gitaa5a50d-2) unstable; urgency=medium * Team upload. diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/debian/control compyle-0.6/debian/control --- compyle-0.6~dev0~20190922.gitaa5a50d/debian/control 2019-09-26 06:52:01.000000000 +0000 +++ compyle-0.6/debian/control 2020-06-16 06:31:52.000000000 +0000 @@ -15,7 +15,7 @@ python3-pytest, python3-pytools, python3-setuptools, -Standards-Version: 4.4.0 +Standards-Version: 4.5.0 Vcs-Browser: https://salsa.debian.org/science-team/compyle Vcs-Git: https://salsa.debian.org/science-team/compyle.git Homepage: https://github.com/pypr/compyle diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/debian/.gitlab-ci.yml compyle-0.6/debian/.gitlab-ci.yml --- compyle-0.6~dev0~20190922.gitaa5a50d/debian/.gitlab-ci.yml 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/debian/.gitlab-ci.yml 2020-06-16 06:31:52.000000000 +0000 @@ -0,0 +1,3 @@ +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/debian/patches/0001-Disable-cython-extension.patch compyle-0.6/debian/patches/0001-Disable-cython-extension.patch --- compyle-0.6~dev0~20190922.gitaa5a50d/debian/patches/0001-Disable-cython-extension.patch 2019-09-26 06:52:01.000000000 +0000 +++ compyle-0.6/debian/patches/0001-Disable-cython-extension.patch 2020-06-16 06:31:52.000000000 +0000 @@ -4,8 +4,8 @@ --- compyle/cuda.py | 2 +- - setup.py | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) + setup.py | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/compyle/cuda.py b/compyle/cuda.py index 9986646..c3decc6 100644 @@ -21,9 +21,18 @@ from pycuda.compiler import SourceModule as _SourceModule from pycuda.tools import dtype_to_ctype diff --git a/setup.py b/setup.py -index 73f58a2..3ae91c4 100644 +index 97e48f1..c9ea834 100644 --- a/setup.py +++ b/setup.py +@@ -24,7 +24,7 @@ tests_require = ['pytest'] + if sys.version_info[0] < 3: + tests_require += ['mock>=1.0'] + docs_require = ['sphinx'] +-cuda_require = ['pycuda', 'cupy'] ++cuda_require = ['pycuda'] # , 'cupy'] + opencl_require = ['pyopencl'] + + classes = ''' @@ -67,7 +67,7 @@ setup( url='https://github.com/pypr/compyle', classifiers=classifiers, diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/docs/source/installation.rst compyle-0.6/docs/source/installation.rst --- compyle-0.6~dev0~20190922.gitaa5a50d/docs/source/installation.rst 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/docs/source/installation.rst 2020-06-14 18:54:39.000000000 +0000 @@ -23,6 +23,7 @@ .. _numpy: http://www.numpy.org .. _PyCUDA: https://documen.tician.de/pycuda .. _OpenMP: http://openmp.org/ +.. _CuPy: https://cupy.chainer.org/ Setting up on GNU/Linux @@ -42,25 +43,59 @@ `_ if you installed XCode but can't find clang or gcc. +If you are getting strange errors of the form:: + + lang: warning: libstdc++ is deprecated; move to libc++ with a minimum deployment target of OS X 10.9 [-Wdeprecated] + ld: library not found for -lstdc++ + clang: error: linker command failed with exit code 1 (use -v to see invocation) + +Then try this (on a bash shell):: + + $ export MACOSX_DEPLOYMENT_TARGET=10.9 + +And run your command again (replace the above with a suitable line on other +shells). This is necessary because your Python was compiled with an older +deployment target and the current version of XCode that you have installed is +not compatible with that. By setting the environment variable you allow +compyle to use a newer version. If this works, it is a good idea to set this +in your default environment (``.bashrc`` for bash shells) so you do not have +to do this every time. + OpenMP on MacOS ~~~~~~~~~~~~~~~~ -The default "gcc" available on OSX uses an LLVM backend and does not support -OpenMP_. To use OpenMP_ on OSX, you can install the GCC available on brew_ using -:: +The default clang compiler available on MacOS uses an LLVM backend and does +not support OpenMP_. There are two ways to support OpenMP. The first involves +installing the OpenMP support for clang. This can be done with brew_ using:: + + $ brew install libomp + +Once that is done, it should "just work". If you get strange errors, try +setting the ``MACOSX_DEPLOYMENT_TARGET`` as shown above. + +Another option is to install GCC for MacOS available on brew_ using :: $ brew install gcc Once this is done, you need to use this as your default compiler. The ``gcc`` -formula on brew currently ships with gcc version 8. Therefore, you can +formula on brew currently ships with gcc version 9. Therefore, you can tell Python to use the GCC installed by brew by setting:: - $ export CC=gcc-8 - $ export CXX=g++-8 + $ export CC=gcc-9 + $ export CXX=g++-9 -Note that you still do need to have the command-line-tools for XCode installed, -otherwise the important header files are not available. +Note that you still do need to have the command-line-tools for XCode +installed, otherwise the important header files are not available. See +`how-to-install-xcode-command-line-tools +`_ +for more details. You may also want to set these environment variables in your +``.bashrc`` so you don't have to do this every time. + +Once you do this, compyle will automatically use this version of GCC and will +also work with OpenMP. Note that on some preliminary benchmarks, GCC's OpenMP +implementation seems about 10% or so faster than the LLVM version. Your +mileage may vary. .. _brew: http://brew.sh/ @@ -73,13 +108,16 @@ https://wiki.python.org/moin/WindowsCompilers -OpenMP will work if you have this installed. +OpenMP will work if you have this installed. For recent Python versions +(>=3.5), install the `Microsoft Build Tools for Visual Studio 2019 +`_ Setting up OpenCL/CUDA ----------------------- -This is too involved a topic for this instead look at the appropriate -documentation for PyOpenCL_ and PyCUDA_. Once those packages work correctly, you -should be all set. Note that if you are only using OpenCL/CUDA you do not need -to have Cython or a C/C++ compiler. +This is too involved a topic to discuss here, instead look at the appropriate +documentation for PyOpenCL_ and PyCUDA_. Once those packages work correctly, +you should be all set. Note that if you are only using OpenCL/CUDA you do not +need to have Cython or a C/C++ compiler. Some features on CUDA require the use +of the CuPy_ library. diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/laplace.py compyle-0.6/examples/laplace.py --- compyle-0.6~dev0~20190922.gitaa5a50d/examples/laplace.py 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/examples/laplace.py 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,136 @@ +import numpy as np +from math import pi +import time + +from compyle.config import get_config +from compyle.api import declare, annotate +from compyle.parallel import Elementwise +from compyle.array import get_backend, wrap + +import compyle.array as carr + + +def bc(x, y): + return np.sin(np.pi * (x + y)) + + +@annotate +def laplace_step(i, u, res, err, nx, ny, dx2, dy2, dnr_inv): + xid = i % nx + yid = i / nx + + if xid == 0 or xid == nx - 1 or yid == 0 or yid == ny - 1: + return + + res[i] = ((u[i - 1] + u[i + 1]) * dx2 + + (u[i - nx] + u[i + nx]) * dy2) * dnr_inv + + diff = res[i] - u[i] + + err[i] = diff * diff + + +class Grid(object): + def __init__(self, nx=10, ny=10, xmin=0., xmax=1., + ymin=0., ymax=1., bc=lambda x: 0, backend=None): + self.backend = get_backend(backend) + self.xmin, self.xmax, self.ymin, self.ymax = xmin, xmax, ymin, ymax + self.nx, self.ny = nx, ny + self.dx = (xmax - xmin) / (nx - 1) + self.dy = (ymax - ymin) / (ny - 1) + self.x = np.arange(self.xmin, self.xmax + self.dx * 0.5, self.dx) + self.y = np.arange(self.ymin, self.ymax + self.dy * 0.5, self.dy) + self.bc = bc + self.setup() + + def setup(self): + u_host = np.zeros((self.nx, self.ny)).astype(np.float32) + + u_host[0, :] = self.bc(self.xmin, self.y) + u_host[-1, :] = self.bc(self.xmax, self.y) + u_host[:, 0] = self.bc(self.x, self.ymin) + u_host[:, -1] = self.bc(self.x, self.ymax) + + self.u = wrap(u_host.flatten(), backend=self.backend) + self.err = carr.zeros_like(self.u) + + def get(self): + u_host = self.u.get() + return np.resize(u_host, (self.nx, self.ny)) + + def compute_err(self): + return np.sqrt(carr.dot(self.err, self.err)) + + def plot(self): + import matplotlib.pyplot as plt + plt.imshow(self.get()) + plt.show() + + +class LaplaceSolver(object): + def __init__(self, grid, backend=None): + self.grid = grid + self.backend = get_backend(backend) + self.step_method = Elementwise(laplace_step, backend=self.backend) + self.res = self.grid.u.copy() + + def solve(self, max_iter=None, eps=1.0e-8): + err = np.inf + + g = self.grid + + dx2 = g.dx ** 2 + dy2 = g.dy ** 2 + dnr_inv = 0.5 / (dx2 + dy2) + + count = 0 + + while err > eps: + if max_iter and count >= max_iter: + return err, count + self.step_method(g.u, self.res, g.err, g.nx, g.ny, + dx2, dy2, dnr_inv) + err = g.compute_err() + + tmp = g.u + g.u = self.res + self.res = tmp + + count += 1 + + return err, count + + +if __name__ == '__main__': + from argparse import ArgumentParser + p = ArgumentParser() + p.add_argument( + '-b', '--backend', action='store', dest='backend', default='cython', + help='Choose the backend.' + ) + p.add_argument( + '--openmp', action='store_true', dest='openmp', default=False, + help='Use OpenMP.' + ) + p.add_argument( + '--use-double', action='store_true', dest='use_double', + default=False, help='Use double precision on the GPU.' + ) + p.add_argument('--nx', action='store', type=int, dest='nx', + default=100, help='Number of grid points in x.') + p.add_argument('--ny', action='store', type=int, dest='ny', + default=100, help='Number of grid points in y.') + o = p.parse_args() + get_config().use_openmp = o.openmp + get_config().use_double = o.use_double + + grid = Grid(nx=o.nx, ny=o.ny, bc=bc, backend=o.backend) + + solver = LaplaceSolver(grid, backend=o.backend) + + start = time.time() + err, count = solver.solve(eps=1e-6) + end = time.time() + + print("Number of iterations = %s" % count) + print("Time taken = %g secs" % (end - start)) diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/md_nnps.py compyle-0.6/examples/molecular_dynamics/md_nnps.py --- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/md_nnps.py 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/examples/molecular_dynamics/md_nnps.py 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,233 @@ +import numpy as np +from math import pi +import time + +from compyle.config import get_config +from compyle.api import declare, annotate +from compyle.parallel import Elementwise, Reduction +from compyle.array import get_backend, wrap +import compyle.array as carr + +from nnps import NNPSCountingSort, NNPSRadixSort + + +@annotate +def calculate_energy(i, vx, vy, pe, num_particles): + ke = 0.5 * (vx[i] * vx[i] + vy[i] * vy[i]) + return pe[i] + ke + + +@annotate +def calculate_force(i, x, y, fx, fy, pe, nbr_starts, nbr_lengths, nbrs): + start_idx = nbr_starts[i] + length = nbr_lengths[i] + for k in range(start_idx, start_idx + length): + j = nbrs[k] + if i == j: + continue + xij = x[i] - x[j] + yij = y[i] - y[j] + rij2 = xij * xij + yij * yij + irij2 = 1.0 / rij2 + irij6 = irij2 * irij2 * irij2 + irij12 = irij6 * irij6 + pe[i] += (4 * (irij12 - irij6)) + f_base = 24 * irij2 * (2 * irij12 - irij6) + + fx[i] += f_base * xij + fy[i] += f_base * yij + + +@annotate +def step_method1(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, + ymin, ymax, m, dt, nbr_starts, nbr_lengths, + nbrs): + integrate_step1(i, m, dt, x, y, vx, vy, fx, fy) + boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, + ymin, ymax) + + +@annotate +def step_method2(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, + ymin, ymax, m, dt, nbr_starts, nbr_lengths, + nbrs): + calculate_force(i, x, y, fx, fy, pe, nbr_starts, nbr_lengths, nbrs) + integrate_step2(i, m, dt, x, y, vx, vy, fx, fy) + + +@annotate +def integrate_step1(i, m, dt, x, y, vx, vy, fx, fy): + axi = fx[i] + ayi = fy[i] + x[i] += vx[i] * dt + 0.5 * axi * dt * dt + y[i] += vy[i] * dt + 0.5 * ayi * dt * dt + vx[i] += 0.5 * axi * dt + vy[i] += 0.5 * ayi * dt + + +@annotate +def integrate_step2(i, m, dt, x, y, vx, vy, fx, fy): + axi = fx[i] + ayi = fy[i] + vx[i] += 0.5 * axi * dt + vy[i] += 0.5 * ayi * dt + + +@annotate +def boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax): + xwidth = xmax - xmin + ywidth = ymax - ymin + stiffness = 50. + pe[i] = 0. + if x[i] < 0.5: + fx[i] = stiffness * (0.5 - x[i]) + pe[i] += 0.5 * stiffness * (0.5 - x[i]) * (0.5 - x[i]) + elif x[i] > xwidth - 0.5: + fx[i] = stiffness * (xwidth - 0.5 - x[i]) + pe[i] += 0.5 * stiffness * (xwidth - 0.5 - x[i]) * (xwidth - 0.5 - x[i]) + else: + fx[i] = 0. + + if y[i] < 0.5: + fy[i] = stiffness * (0.5 - y[i]) + pe[i] += 0.5 * stiffness * (0.5 - y[i]) * (0.5 - y[i]) + elif y[i] > ywidth - 0.5: + fy[i] = stiffness * (ywidth - 0.5 - y[i]) + pe[i] += 0.5 * stiffness * (ywidth - 0.5 - y[i]) * (ywidth - 0.5 - y[i]) + else: + fy[i] = 0. + + +class MDSolver(object): + def __init__(self, num_particles, x=None, y=None, vx=None, vy=None, + xmax=100., ymax=100., dx=1.5, init_T=0., + backend=None, use_count_sort=False): + self.nnps_algorithm = NNPSCountingSort \ + if use_count_sort else NNPSRadixSort + self.backend = get_backend(backend) + self.num_particles = num_particles + self.xmin, self.xmax = 0., xmax + self.ymin, self.ymax = 0., ymax + self.m = 1. + if x is None and y is None: + self.x, self.y = self.setup_positions(num_particles, dx) + if vx is None and vy is None: + self.vx, self.vy = self.setup_velocities(init_T, + num_particles) + self.fx = carr.zeros_like(self.x, backend=self.backend) + self.fy = carr.zeros_like(self.x, backend=self.backend) + self.pe = carr.zeros_like(self.x, backend=self.backend) + self.nnps = self.nnps_algorithm(self.x, self.y, 3., self.xmax, + self.ymax, backend=self.backend) + self.init_forces = Elementwise(calculate_force, backend=self.backend) + self.step1 = Elementwise(step_method1, backend=self.backend) + self.step2 = Elementwise(step_method2, backend=self.backend) + self.energy_calc = Reduction("a+b", map_func=calculate_energy, + backend=self.backend) + + def setup_velocities(self, T, num_particles): + np.random.seed(123) + vx = np.random.uniform(0, 1., size=num_particles).astype(np.float64) + vy = np.random.uniform(0, 1., size=num_particles).astype(np.float64) + T_current = np.average(vx ** 2 + vy ** 2) + scaling_factor = (T / T_current) ** 0.5 + vx = vx * scaling_factor + vy = vy * scaling_factor + return wrap(vx, vy, backend=self.backend) + + def setup_positions(self, num_particles, dx): + ndim = np.ceil(num_particles ** 0.5) + dim_length = ndim * dx + + self.xmax = dim_length * 2 + self.ymax = dim_length * 2 + + xmin_eff = ((self.xmax - self.xmin) - dim_length) / 2. + xmax_eff = ((self.xmax - self.xmin) + dim_length) / 2. + + x, y = np.mgrid[xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx] + x = x.ravel().astype(np.float64)[:num_particles] + y = y.ravel().astype(np.float64)[:num_particles] + return wrap(x, y, backend=self.backend) + + def post_step(self, t): + energy = self.energy_calc(self.vx, self.vy, self.pe, + self.num_particles) + print("Energy at time =", t, "is", energy) + + def solve(self, t, dt): + num_steps = int(t // dt) + curr_t = 0. + self.nnps.build() + self.nnps.get_neighbors() + self.init_forces(self.x, self.y, self.fx, self.fy, + self.pe, self.nnps.nbr_starts, + self.nnps.nbr_lengths, self.nnps.nbrs) + for i in range(num_steps): + self.step1(self.x, self.y, self.vx, self.vy, self.fx, + self.fy, self.pe, self.xmin, self.xmax, self.ymin, + self.ymax, self.m, dt, self.nnps.nbr_starts, + self.nnps.nbr_lengths, self.nnps.nbrs) + self.nnps.build() + self.nnps.get_neighbors() + self.step2(self.x, self.y, self.vx, self.vy, self.fx, + self.fy, self.pe, self.xmin, self.xmax, self.ymin, + self.ymax, self.m, dt, self.nnps.nbr_starts, + self.nnps.nbr_lengths, self.nnps.nbrs) + curr_t += dt + if i % 100 == 0: + self.post_step(curr_t) + + def pull(self): + self.x.pull() + self.y.pull() + + def plot(self): + import matplotlib.pyplot as plt + plt.xlim(self.xmin, self.xmax) + plt.ylim(self.ymin, self.ymax) + plt.scatter(self.x.data, self.y.data, 4.2) + plt.savefig("sim.png", dpi=300) + + +if __name__ == '__main__': + from argparse import ArgumentParser + p = ArgumentParser() + p.add_argument( + '-b', '--backend', action='store', dest='backend', default='cython', + help='Choose the backend.' + ) + p.add_argument( + '--openmp', action='store_true', dest='openmp', default=False, + help='Use OpenMP.' + ) + p.add_argument( + '--use-double', action='store_true', dest='use_double', + default=False, help='Use double precision on the GPU.' + ) + p.add_argument( + '--use-count-sort', action='store_true', dest='use_count_sort', + default=False, help='Use count sort instead of radix sort' + ) + + p.add_argument('-n', action='store', type=int, dest='n', + default=100, help='Number of particles') + + p.add_argument('--tf', action='store', type=float, dest='t', + default=40., help='Final time') + + p.add_argument('--dt', action='store', type=float, dest='dt', + default=0.02, help='Time step') + + o = p.parse_args() + get_config().use_openmp = o.openmp + get_config().use_double = o.use_double + + solver = MDSolver(o.n, backend=o.backend, use_count_sort=o.use_count_sort) + + start = time.time() + solver.solve(o.t, o.dt) + end = time.time() + print("Time taken for N = %i is %g secs" % (o.n, (end - start))) + solver.pull() + solver.plot() diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/md_simple.py compyle-0.6/examples/molecular_dynamics/md_simple.py --- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/md_simple.py 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/examples/molecular_dynamics/md_simple.py 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,215 @@ +import numpy as np +from math import pi +import time + +from compyle.config import get_config +from compyle.api import declare, annotate +from compyle.parallel import Elementwise, Reduction +from compyle.array import get_backend, wrap + +import compyle.array as carr + + +@annotate +def calculate_energy(i, vx, vy, pe, num_particles): + ke = 0.5 * (vx[i] * vx[i] + vy[i] * vy[i]) + return pe[i] + ke + + +@annotate +def calculate_force(i, x, y, fx, fy, pe, num_particles): + force_cutoff = 3. + force_cutoff2 = force_cutoff * force_cutoff + for j in range(num_particles): + if i == j: + continue + xij = x[i] - x[j] + yij = y[i] - y[j] + rij2 = xij * xij + yij * yij + if rij2 > force_cutoff2: + continue + irij2 = 1.0 / rij2 + irij6 = irij2 * irij2 * irij2 + irij12 = irij6 * irij6 + pe[i] += (4 * (irij12 - irij6)) + f_base = 24 * irij2 * (2 * irij12 - irij6) + + fx[i] += f_base * xij + fy[i] += f_base * yij + + +@annotate +def step_method1(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, + ymin, ymax, m, dt, num_particles): + integrate_step1(i, m, dt, x, y, vx, vy, fx, fy) + boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, + ymin, ymax) + + +@annotate +def step_method2(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, + ymin, ymax, m, dt, num_particles): + calculate_force(i, x, y, fx, fy, pe, num_particles) + integrate_step2(i, m, dt, x, y, vx, vy, fx, fy) + + +@annotate +def integrate_step1(i, m, dt, x, y, vx, vy, fx, fy): + axi = fx[i] + ayi = fy[i] + x[i] += vx[i] * dt + 0.5 * axi * dt * dt + y[i] += vy[i] * dt + 0.5 * ayi * dt * dt + vx[i] += 0.5 * axi * dt + vy[i] += 0.5 * ayi * dt + + +@annotate +def integrate_step2(i, m, dt, x, y, vx, vy, fx, fy): + axi = fx[i] + ayi = fy[i] + vx[i] += 0.5 * axi * dt + vy[i] += 0.5 * ayi * dt + + +@annotate +def boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax): + xwidth = xmax - xmin + ywidth = ymax - ymin + stiffness = 50. + pe[i] = 0. + if x[i] < 0.5: + fx[i] = stiffness * (0.5 - x[i]) + pe[i] += 0.5 * stiffness * (0.5 - x[i]) * (0.5 - x[i]) + elif x[i] > xwidth - 0.5: + fx[i] = stiffness * (xwidth - 0.5 - x[i]) + pe[i] += 0.5 * stiffness * (xwidth - 0.5 - x[i]) * (xwidth - 0.5 - x[i]) + else: + fx[i] = 0. + + if y[i] < 0.5: + fy[i] = stiffness * (0.5 - y[i]) + pe[i] += 0.5 * stiffness * (0.5 - y[i]) * (0.5 - y[i]) + elif y[i] > ywidth - 0.5: + fy[i] = stiffness * (ywidth - 0.5 - y[i]) + pe[i] += 0.5 * stiffness * (ywidth - 0.5 - y[i]) * (ywidth - 0.5 - y[i]) + else: + fy[i] = 0. + + +class MDSolver(object): + def __init__(self, num_particles, x=None, y=None, vx=None, vy=None, + xmax=100., ymax=100., dx=1.5, init_T=0., + backend=None): + self.backend = backend + self.num_particles = num_particles + self.xmin, self.xmax = 0., xmax + self.ymin, self.ymax = 0., ymax + self.m = 1. + if x is None and y is None: + self.x, self.y = self.setup_positions(num_particles, dx) + if vx is None and vy is None: + self.vx, self.vy = self.setup_velocities(init_T, num_particles) + self.fx = carr.zeros_like(self.x, backend=self.backend) + self.fy = carr.zeros_like(self.x, backend=self.backend) + self.pe = carr.zeros_like(self.x, backend=self.backend) + self.init_forces = Elementwise(calculate_force, backend=self.backend) + self.step1 = Elementwise(step_method1, backend=self.backend) + self.step2 = Elementwise(step_method2, backend=self.backend) + self.energy_calc = Reduction("a+b", map_func=calculate_energy, + backend=self.backend) + + def setup_velocities(self, T, num_particles): + np.random.seed(123) + vx = np.random.uniform(0, 1., size=num_particles).astype(np.float64) + vy = np.random.uniform(0, 1., size=num_particles).astype(np.float64) + T_current = np.average(vx ** 2 + vy ** 2) + scaling_factor = (T / T_current) ** 0.5 + vx = vx * scaling_factor + vy = vy * scaling_factor + return wrap(vx, vy, backend=self.backend) + + def setup_positions(self, num_particles, dx): + ndim = np.ceil(num_particles ** 0.5) + dim_length = ndim * dx + + self.xmax = dim_length * 3 + self.ymax = dim_length * 3 + + xmin_eff = ((self.xmax - self.xmin) - dim_length) / 2. + xmax_eff = ((self.xmax - self.xmin) + dim_length) / 2. + + x, y = np.mgrid[xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx] + x = x.ravel().astype(np.float64)[:num_particles] + y = y.ravel().astype(np.float64)[:num_particles] + return wrap(x, y, backend=self.backend) + + def post_step(self, t): + energy = self.energy_calc(self.vx, self.vy, self.pe, + self.num_particles) + print("Energy at time =", t, "is", energy) + + def solve(self, t, dt): + num_steps = int(t // dt) + curr_t = 0. + self.init_forces(self.x, self.y, self.fx, self.fy, self.pe, + self.num_particles) + for i in range(num_steps): + self.step1(self.x, self.y, self.vx, self.vy, self.fx, self.fy, + self.pe, self.xmin, self.xmax, self.ymin, self.ymax, + self.m, dt, self.num_particles) + self.step2(self.x, self.y, self.vx, self.vy, self.fx, self.fy, + self.pe, self.xmin, self.xmax, self.ymin, self.ymax, + self.m, dt, self.num_particles) + curr_t += dt + if i % 100 == 0: + self.post_step(curr_t) + + def pull(self): + self.x.pull() + self.y.pull() + + def plot(self): + import matplotlib.pyplot as plt + plt.xlim(self.xmin, self.xmax) + plt.ylim(self.ymin, self.ymax) + plt.scatter(self.x.data, self.y.data, 4.2) + plt.savefig("sim.png", dpi=300) + + +if __name__ == '__main__': + from argparse import ArgumentParser + p = ArgumentParser() + p.add_argument( + '-b', '--backend', action='store', dest='backend', default='cython', + help='Choose the backend.' + ) + p.add_argument( + '--openmp', action='store_true', dest='openmp', default=False, + help='Use OpenMP.' + ) + p.add_argument( + '--use-double', action='store_true', dest='use_double', + default=False, help='Use double precision on the GPU.' + ) + + p.add_argument('-n', action='store', type=int, dest='n', + default=100, help='Number of particles') + + p.add_argument('--tf', action='store', type=float, dest='t', + default=40., help='Final time') + + p.add_argument('--dt', action='store', type=float, dest='dt', + default=0.02, help='Time step') + + o = p.parse_args() + get_config().use_openmp = o.openmp + get_config().use_double = o.use_double + + solver = MDSolver(o.n, backend=o.backend) + + start = time.time() + solver.solve(o.t, o.dt) + end = time.time() + print("Time taken for N = %i is %g secs" % (o.n, (end - start))) + solver.pull() + solver.plot() diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/nnps_kernels.py compyle-0.6/examples/molecular_dynamics/nnps_kernels.py --- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/nnps_kernels.py 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/examples/molecular_dynamics/nnps_kernels.py 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,142 @@ +from compyle.api import declare, annotate +from compyle.parallel import serial +from compyle.low_level import atomic_inc, cast +from math import floor +import numpy as np + + +@annotate +def find_cell_id(x, y, h, c): + c[0] = cast(floor((x) / h), "int") + c[1] = cast(floor((y) / h), "int") + + +@annotate +def flatten(p, q, qmax): + return p * qmax + q + + +@serial +@annotate +def count_bins(i, x, y, h, cmax, keys, bin_counts, + sort_offsets): + c = declare('matrix(2, "int")') + find_cell_id(x[i], y[i], h, c) + key = flatten(c[0], c[1], cmax) + keys[i] = key + idx = atomic_inc(bin_counts[key]) + sort_offsets[i] = idx + + +@annotate +def sort_indices(i, keys, sort_offsets, start_indices, sorted_indices): + key = keys[i] + offset = sort_offsets[i] + start_idx = start_indices[key] + sorted_indices[start_idx + offset] = i + + +@annotate +def input_start_indices(i, counts): + return 0 if i == 0 else counts[i - 1] + + +@annotate +def output_start_indices(i, item, indices): + indices[i] = item + + +@annotate +def fill_keys(i, x, y, h, cmax, indices, keys): + c = declare('matrix(2, "int")') + find_cell_id(x[i], y[i], h, c) + key = flatten(c[0], c[1], cmax) + keys[i] = key + indices[i] = i + + +@annotate +def input_scan_keys(i, keys): + return 1 if i == 0 or keys[i] != keys[i - 1] else 0 + + +@annotate +def output_scan_keys(i, item, prev_item, keys, start_indices): + key = keys[i] + if item != prev_item: + start_indices[key] = i + + +@annotate +def fill_bin_counts(i, keys, start_indices, bin_counts, num_particles): + if i == num_particles - 1: + last_key = keys[num_particles - 1] + bin_counts[last_key] = num_particles - start_indices[last_key] + if i == 0 or keys[i] == keys[i - 1]: + return + key = keys[i] + prev_key = keys[i - 1] + bin_counts[prev_key] = start_indices[key] - start_indices[prev_key] + + +@annotate +def find_neighbor_lengths_knl(i, x, y, h, cmax, start_indices, sorted_indices, + bin_counts, nbr_lengths, max_key): + d = h * h + q_c = declare('matrix(2, "int")') + find_cell_id(x[i], y[i], h, q_c) + + for p in range(-1, 2): + for q in range(-1, 2): + cx = q_c[0] + p + cy = q_c[1] + q + + key = flatten(cx, cy, cmax) + + if key >= max_key or key < 0: + continue + + start_idx = start_indices[key] + np = bin_counts[key] + + for k in range(np): + j = sorted_indices[start_idx + k] + xij = x[i] - x[j] + yij = y[i] - y[j] + rij2 = xij * xij + yij * yij + + if rij2 < d: + nbr_lengths[i] += 1 + + +@annotate +def find_neighbors_knl(i, x, y, h, cmax, start_indices, sorted_indices, + bin_counts, nbr_starts, nbrs, max_key): + d = h * h + q_c = declare('matrix(2, "int")') + find_cell_id(x[i], y[i], h, q_c) + length = 0 + nbr_start_idx = nbr_starts[i] + + for p in range(-1, 2): + for q in range(-1, 2): + cx = q_c[0] + p + cy = q_c[1] + q + + key = flatten(cx, cy, cmax) + + if key >= max_key or key < 0: + continue + + start_idx = start_indices[key] + np = bin_counts[key] + + for k in range(np): + j = sorted_indices[start_idx + k] + xij = x[i] - x[j] + yij = y[i] - y[j] + rij2 = xij * xij + yij * yij + + if rij2 < d: + nbrs[nbr_start_idx + length] = j + length += 1 diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/nnps.py compyle-0.6/examples/molecular_dynamics/nnps.py --- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/nnps.py 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/examples/molecular_dynamics/nnps.py 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,159 @@ +from nnps_kernels import * +from compyle.config import get_config +from compyle.api import declare, annotate +from compyle.parallel import serial, Elementwise, Reduction, Scan +from compyle.array import get_backend, wrap +from compyle.low_level import atomic_inc, cast +from math import floor +from time import time + +import numpy as np +import compyle.array as carr + + +class NNPS(object): + def __init__(self, x, y, h, xmax, ymax, backend=None): + self.backend = backend + self.num_particles = x.length + self.x, self.y = x, y + self.h = h + + cmax = np.array([floor(xmax / h), floor(ymax / h)], dtype=np.int32) + self.max_key = 1 + flatten(cmax[0], cmax[1], 1 + cmax[1]) + self.qmax = 1 + cmax[1] + + # neighbor kernels + self.find_neighbor_lengths = Elementwise(find_neighbor_lengths_knl, + backend=self.backend) + self.find_neighbors = Elementwise(find_neighbors_knl, + backend=self.backend) + self.scan_start_indices = Scan(input=input_start_indices, + output=output_start_indices, + scan_expr="a+b", dtype=np.int32, + backend=self.backend) + self.init_arrays() + + def init_arrays(self): + # sort arrays + self.bin_counts = carr.zeros(self.max_key, dtype=np.int32, + backend=self.backend) + self.start_indices = carr.zeros(self.max_key, dtype=np.int32, + backend=self.backend) + self.keys = carr.zeros(self.num_particles, dtype=np.int32, + backend=self.backend) + self.sorted_indices = carr.zeros(self.num_particles, dtype=np.int32, + backend=self.backend) + + # neighbor arrays + self.nbr_lengths = carr.zeros(self.num_particles, dtype=np.int32, + backend=self.backend) + self.nbr_starts = carr.zeros(self.num_particles, dtype=np.int32, + backend=self.backend) + self.nbrs = carr.zeros(2 * self.num_particles, dtype=np.int32, + backend=self.backend) + + def reset_arrays(self): + # sort arrays + self.bin_counts.fill(0) + self.start_indices.fill(0) + self.sorted_indices.fill(0) + + # neighbors array + self.nbr_lengths.fill(0) + self.nbr_starts.fill(0) + + def get_neighbors(self): + self.find_neighbor_lengths(self.x, self.y, self.h, self.qmax, + self.start_indices, self.sorted_indices, + self.bin_counts, self.nbr_lengths, + self.max_key) + self.scan_start_indices(counts=self.nbr_lengths, + indices=self.nbr_starts) + self.total_neighbors = int(self.nbr_lengths[-1] + self.nbr_starts[-1]) + self.nbrs.resize(self.total_neighbors) + self.find_neighbors(self.x, self.y, self.h, self.qmax, + self.start_indices, self.sorted_indices, + self.bin_counts, self.nbr_starts, + self.nbrs, self.max_key) + + +class NNPSCountingSort(NNPS): + def __init__(self, x, y, h, xmax, ymax, backend=None): + super().__init__(x, y, h, xmax, ymax, backend=backend) + # sort kernels + self.count_bins = Elementwise(count_bins, backend=self.backend) + self.sort_indices = Elementwise(sort_indices, backend=self.backend) + + def init_arrays(self): + super().init_arrays() + self.sort_offsets = carr.zeros(self.num_particles, dtype=np.int32, + backend=self.backend) + + def reset_arrays(self): + super().reset_arrays() + # sort arrays + self.sort_offsets.fill(0) + + def build(self): + self.reset_arrays() + self.count_bins(self.x, self.y, self.h, self.qmax, self.keys, + self.bin_counts, self.sort_offsets) + self.scan_start_indices(counts=self.bin_counts, + indices=self.start_indices) + self.sort_indices(self.keys, self.sort_offsets, self.start_indices, + self.sorted_indices) + + +class NNPSRadixSort(NNPS): + def __init__(self, x, y, h, xmax, ymax, backend=None): + super().__init__(x, y, h, xmax, ymax, backend=backend) + self.max_bits = np.ceil(np.log2(self.max_key)) + + # sort kernels + self.fill_keys = Elementwise(fill_keys, backend=self.backend) + self.fill_bin_counts = Elementwise(fill_bin_counts, + backend=self.backend) + self.scan_keys = Scan(input=input_scan_keys, + output=output_scan_keys, + scan_expr="a+b", dtype=np.int32, + backend=self.backend) + + def init_arrays(self): + super().init_arrays() + # sort arrays + self.sorted_keys = carr.zeros(self.num_particles, dtype=np.int32, + backend=self.backend) + self.indices = carr.zeros(self.num_particles, dtype=np.int32, + backend=self.backend) + + def reset_arrays(self): + super().reset_arrays() + self.sorted_keys.fill(0) + + def build(self): + self.reset_arrays() + self.fill_keys(self.x, self.y, self.h, self.qmax, self.indices, + self.keys) + self.sorted_keys, self.sorted_indices = carr.sort_by_keys( + [self.keys, self.indices], + key_bits=self.max_bits, backend=self.backend) + self.scan_keys(keys=self.sorted_keys, + start_indices=self.start_indices) + self.fill_bin_counts(self.sorted_keys, self.start_indices, + self.bin_counts, self.num_particles) + + +if __name__ == "__main__": + import sys + backend = sys.argv[1] if len(sys.argv) > 1 else 'cython' + np.random.seed(123) + num_particles = 20 + x = np.random.uniform(0, 10., size=num_particles).astype(np.float32) + y = np.random.uniform(0, 10., size=num_particles).astype(np.float32) + x, y = wrap(x, y, backend=backend) + nnps = NNPSRadixSort(x, y, 3., 10., 10., backend=backend) + nnps.build() + nnps.get_neighbors() + print(nnps.start_indices) + print(nnps.bin_counts) + print(nnps.nbr_lengths) diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/performance_comparison.py compyle-0.6/examples/molecular_dynamics/performance_comparison.py --- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/performance_comparison.py 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/examples/molecular_dynamics/performance_comparison.py 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,144 @@ +import numpy as np +import time +import md_simple +import md_nnps + +from compyle.config import get_config + + +def solve(n, backend, solver_algo, tf=0.5, dt=0.02, use_count_sort=False): + solver = solver_algo(n, backend=backend.replace("_omp", "")) + start = time.time() + solver.solve(tf, dt) + end = time.time() + print("Time taken for backend = %s, N = %i is %g secs" % + (backend, n, (end - start))) + return end - start + + +def compare(backends, n_list, solver_algo, niter=3): + t_list = {b: [] for b in backends} + speedups = {b: [] for b in backends} + for n in n_list: + print("Running for N = %i" % n) + for backend in backends: + if "omp" in backend: + get_config().use_openmp = True + t = 1e9 + for it in range(niter): + t = min(t, solve(n, backend, solver_algo)) + t_list[backend].append(t) + if "omp" in backend: + get_config().use_openmp = False + + if 'cython' in backends: + for backend in backends: + for i, n in enumerate(n_list): + speedups[backend].append( + t_list["cython"][i] / t_list[backend][i]) + else: + speedups = None + + return speedups, t_list + + +def compare_implementations(backend, n_list, niter=3): + import matplotlib.pyplot as plt + sp, nnps_tlist = compare([backend], n_list, + md_nnps.MDSolver, niter=niter) + sp, simple_tlist = compare([backend], n_list, + md_simple.MDSolver, niter=niter) + + speedup = [simple_tlist[backend][i] / nnps_tlist[backend][i] + for i in range(len(n_list))] + + plt.loglog(n_list, nnps_tlist[backend], 'x-', label="Linear") + plt.loglog(n_list, simple_tlist[backend], 'x-', label="Simple") + + plt.xlabel("Number of particles") + plt.ylabel("Time (secs)") + plt.legend() + plt.grid(True) + plt.savefig("time_comp_impl.png", dpi=300) + + plt.clf() + + plt.loglog(n_list, speedup, 'x-') + + plt.xlabel("Number of particles") + plt.ylabel("Speedup") + plt.grid(True) + plt.savefig("speedup_comp_impl.png", dpi=300) + + +def plot(n_list, speedups, t_list, label): + backend_label_map = {'cython': 'Cython', 'cython_omp': 'OpenMP', + 'opencl': 'OpenCL', 'cuda': 'CUDA'} + import matplotlib.pyplot as plt + plt.figure() + + if speedups: + for backend, arr in speedups.items(): + if backend == "cython": + continue + plt.semilogx(n_list, arr, 'x-', label=backend_label_map[backend]) + + plt.xlabel("Number of particles") + plt.ylabel("Speedup") + plt.legend() + plt.grid(True) + plt.savefig("%s_speedup_%s.png" % + (label, "_".join(speedups.keys())), dpi=300) + + plt.clf() + + for backend, arr in t_list.items(): + plt.loglog(n_list, arr, 'x-', label=backend_label_map[backend]) + + plt.xlabel("Number of particles") + plt.ylabel("Time (secs)") + plt.legend() + plt.grid(True) + plt.savefig("%s_time_%s.png" % (label, "_".join(t_list.keys())), dpi=300) + + +if __name__ == "__main__": + from argparse import ArgumentParser + p = ArgumentParser() + p.add_argument( + '-c', '--comparison', action='store', dest='comp', default='gpu_comp', + choices=['gpu_comp', 'omp_comp', 'comp_algo'], + help='Choose the comparison.' + ) + p.add_argument( + '--nnps', action='store', dest='nnps', default='linear', + choices=['linear', 'simple'], + help='Choose algorithm.' + ) + p.add_argument( + '--use-double', action='store_true', dest='use_double', + default=False, help='Use double precision on the GPU.' + ) + + o = p.parse_args() + get_config().use_double = o.use_double + solver_algo = (md_nnps.MDSolver if o.nnps == 'linear' + else md_simple.MDSolver) + n_list = [10000 * (2 ** i) for i in range(10)] if o.nnps == 'linear' else \ + [500 * (2 ** i) for i in range(8)] + + if o.comp == "gpu_comp": + backends = ["opencl", "cuda", "cython"] + print("Running for", n_list) + speedups, t_list = compare(backends, n_list, solver_algo) + plot(n_list, speedups, t_list, o.nnps) + elif o.comp == "omp_comp": + backends = ["cython_omp", "cython"] + print("Running for", n_list) + speedups, t_list = compare(backends, n_list, solver_algo) + plot(n_list, speedups, t_list, o.nnps) + elif o.comp == "comp_algo": + backend = "cython" + n_list = [500, 1000, 2000, 4000, 8000, 16000, 32000] + print("Running for", n_list) + compare_implementations(backend, n_list) diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/README.rst compyle-0.6/examples/molecular_dynamics/README.rst --- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/README.rst 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/examples/molecular_dynamics/README.rst 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,21 @@ +Molecular Dynamics Example +-------------------------- + +We have 3 implementations of a simple molecular dynamics simulation +of an N body problem in Lennard Jones potential. The first implementation +is a simple :math:`O(N^2)` implementation that can be found in +:code:`md_simple.py`. The second implementation is using nearest neighbor +searching to reduce the complexity to :math:`O(N)` and can be +found in :code:`md_nnps.py`. + +We also have two different implementations of nearest neighbor search +algorithms, one using a radix sort on GPU and numpy sort on CPU +and the other using a native counting sort implementation. The counting +sort version is about 30% faster. Both these implementations can be +found in :code:`nnps.py`. + +We have also provided a script to run performance comparison between +these implementations in `performance_comparison.py`. Users can use +the google colab notebook +`here `_. + diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/pyproject.toml compyle-0.6/pyproject.toml --- compyle-0.6~dev0~20190922.gitaa5a50d/pyproject.toml 1970-01-01 00:00:00.000000000 +0000 +++ compyle-0.6/pyproject.toml 2020-06-14 18:54:39.000000000 +0000 @@ -0,0 +1,9 @@ +[build-system] +requires = [ + "wheel>=0.29.0", + "setuptools>=42.0.0", + "oldest-supported-numpy", + "Cython>=0.20", + "mako", + "pytools" +] \ No newline at end of file diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/setup.py compyle-0.6/setup.py --- compyle-0.6~dev0~20190922.gitaa5a50d/setup.py 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/setup.py 2020-06-14 18:54:39.000000000 +0000 @@ -24,7 +24,7 @@ if sys.version_info[0] < 3: tests_require += ['mock>=1.0'] docs_require = ['sphinx'] -cuda_require = ['pycuda'] +cuda_require = ['pycuda', 'cupy'] opencl_require = ['pyopencl'] classes = ''' diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/.travis.yml compyle-0.6/.travis.yml --- compyle-0.6~dev0~20190922.gitaa5a50d/.travis.yml 2019-06-05 08:48:46.000000000 +0000 +++ compyle-0.6/.travis.yml 2020-06-14 18:54:39.000000000 +0000 @@ -6,25 +6,24 @@ language: python python: - - 2.7 - 3.7 + - 3.8 install: - sudo apt-get update - - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; - else - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; - fi + - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - - export PATH="$HOME/miniconda/bin:$PATH" + - source "$HOME/miniconda/etc/profile.d/conda.sh" - hash -r - conda config --set always_yes yes --set changeps1 no - conda update -q conda - conda info -a - conda config --add channels conda-forge - conda config --add channels defaults - - conda install -c conda-forge pocl==1.2 pyopencl cython numpy + + - conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION + - conda activate testenv + - conda install -c conda-forge pocl pyopencl numpy cython - python -c 'import pyopencl as cl' - pip install -r requirements.txt - pip install coverage codecov