diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/appveyor.yml compyle-0.6/appveyor.yml
--- compyle-0.6~dev0~20190922.gitaa5a50d/appveyor.yml	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/appveyor.yml	2020-06-14 18:54:39.000000000 +0000
@@ -1,9 +1,10 @@
 environment:
 
   matrix:
-    - PYTHON: "C:\\Python27"
-    - PYTHON: "C:\\Python36-x64"
     - PYTHON: "C:\\Python37-x64"
+    - PYTHON: "C:\\Python38-x64"
+
+skip_branch_with_pr: true
 
 install:
   - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/CHANGES.rst compyle-0.6/CHANGES.rst
--- compyle-0.6~dev0~20190922.gitaa5a50d/CHANGES.rst	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/CHANGES.rst	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,23 @@
+0.6
+~~~~
+
+* Release date: 15th June, 2020.
+* Add some non-trivial examples showcasing the package.
+* Document how one can use clang + OpenMP.
+* Add sorting, align, and other functions to array module.
+* Support for mapping structs on a GPU with CUDA.
+* Add address, cast, and address low-level functions.
+* Support for mako-templates for reducing repetitive code.
+* Bitwise operator support.
+* Attempt to auto-declare variables when possible.
+* Fix several bugs and issues.
+
+
+
+0.5
+~~~~
+
+* Release date: 3rd, December 2018
+* First public release.
+* Support for elementwise, scan, and reduction operations on CPU and GPU using
+  Cython, OpenCL and CUDA.
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/array.py compyle-0.6/compyle/array.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/array.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/array.py	2020-06-14 18:54:39.000000000 +0000
@@ -6,6 +6,7 @@
 from .types import (annotate, dtype_to_ctype,
                     dtype_to_knowntype, knowntype_to_ctype)
 from .template import Template
+from .sort import radix_sort
 
 
 try:
@@ -118,7 +119,8 @@
 
 
 @memoize(key=minmax_collector_key)
-def make_collector_dtype(device, dtype, props, name, only_min, only_max, backend):
+def make_collector_dtype(device, dtype, props, name,
+                         only_min, only_max, backend):
     fields = [("pad", np.int32)]
 
     for prop in props:
@@ -143,7 +145,7 @@
 
 @memoize(key=lambda *args: (args[-3], args[-2], args[-1]))
 def get_minmax_kernel(ctx, dtype, inf, mmc_dtype, prop_names,
-                       only_min, only_max, name, mmc_c_decl, backend):
+                      only_min, only_max, name, mmc_c_decl, backend):
     tpl_args = ", ".join(
         ["%(dtype)s %(prop)s" % {'dtype': dtype, 'prop': prop}
             for prop in prop_names]
@@ -157,7 +159,7 @@
         )
         mmc_c_decl_lines = mmc_c_decl.splitlines()
         mmc_c_decl_lines = mmc_c_decl_lines[:-2] + \
-                mmc_overload.splitlines() + mmc_c_decl_lines[-2:]
+            mmc_overload.splitlines() + mmc_c_decl_lines[-2:]
         mmc_c_decl = '\n'.join(mmc_c_decl_lines)
 
     mmc_preamble = mmc_c_decl + minmax_tpl
@@ -454,7 +456,10 @@
 
 
 def sort_by_keys(ary_list, out_list=None, key_bits=None,
-                 backend=None):
+                 backend=None, use_radix_sort=False):
+    # FIXME: Need to use returned values, cuda backend uses
+    # thrust that will internally allocate a new array for storing
+    # the sorted data so out_list will not have the sorted arrays
     # first arg of ary_list is the key
     if backend is None:
         backend = ary_list[0].backend
@@ -462,6 +467,12 @@
         from .jit import get_ctype_from_arg
         from compyle.opencl import get_queue
 
+        if not out_list:
+            out_list = [
+                Array(ary.dtype, allocate=False, backend=backend)
+                for ary in ary_list
+            ]
+
         arg_types = [get_ctype_from_arg(arg) for arg in ary_list]
 
         sort_knl = get_cl_sort_kernel(arg_types, ary_list)
@@ -469,12 +480,26 @@
 
         arg_list = [ary.dev for ary in ary_list]
 
-        out_list, event = sort_knl(*arg_list, key_bits=key_bits,
-                                   allocator=allocator)
+        out_arrays, event = sort_knl(*arg_list, key_bits=key_bits,
+                                     allocator=allocator)
+        for i, out in enumerate(out_list):
+            out.set_data(out_arrays[i])
+        return out_list
+    elif backend == 'cython' and use_radix_sort:
+        out_list, order = radix_sort(ary_list, out_list=out_list,
+                                     max_key_bits=key_bits, backend=backend)
+        return out_list
+    elif backend == 'cython':
+        order = wrap(np.argsort(ary_list[0].dev), backend=backend)
+        out_list = align(ary_list, order, out_list=out_list,
+                         backend=backend)
         return out_list
     else:
         order = argsort(ary_list[0], backend=backend)
-        out_list = align(ary_list[1:], order, out_list=out_list,
+        modified_out_list = None
+        if out_list:
+            modified_out_list = out_list[1:]
+        out_list = align(ary_list[1:], order, out_list=modified_out_list,
                          backend=backend)
         return [ary_list[0]] + out_list
 
@@ -611,6 +636,7 @@
     from .jit import get_ctype_from_arg
     key = [get_ctype_from_arg(ary) for ary in ary_list]
     key.append(backend)
+    key.append(get_config().use_openmp)
     return tuple(key)
 
 
@@ -676,7 +702,7 @@
         elif isinstance(key, Array):
             return self.align(key)
         # NOTE: Not sure about this, done for PyCUDA compatibility
-        if self.backend is not 'cython':
+        if self.backend != 'cython':
             return self.dev[key].get()
         else:
             return self.dev[key]
@@ -816,7 +842,6 @@
         else:
             update_minmax_gpu([self])
 
-
     def fill(self, value):
         self.dev.fill(value)
 
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/cython_generator.py compyle-0.6/compyle/cython_generator.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/cython_generator.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/cython_generator.py	2020-06-14 18:54:39.000000000 +0000
@@ -27,16 +27,20 @@
 logger = logging.getLogger(__name__)
 
 
-def get_parallel_range(start, stop=None, step=1):
+def get_parallel_range(start, stop=None, step=1, **kwargs):
     config = get_config()
     if stop is None:
         stop = start
         start = 0
 
-    args = "{start},{stop},{step}"
+    args = "{start}, {stop}, {step}"
     if config.use_openmp:
         schedule = config.omp_schedule[0]
         chunksize = config.omp_schedule[1]
+        if 'schedule' in kwargs:
+            schedule = kwargs.pop('schedule')
+        if 'chunksize' in kwargs:
+            chunksize = kwargs.pop('chunksize')
 
         if schedule is not None:
             args = args + ", schedule='{schedule}'"
@@ -44,6 +48,9 @@
         if chunksize is not None:
             args = args + ", chunksize={chunksize}"
 
+        for k, v in kwargs.items():
+            args = args + ", %s=%r" % (k, v)
+
         args = args.format(start=start, stop=stop, step=step,
                            schedule=schedule, chunksize=chunksize)
         return "prange({})".format(args)
@@ -195,10 +202,11 @@
     def get_code(self):
         return self.code
 
-    def parse(self, obj, declarations=None):
+    def parse(self, obj, declarations=None, is_serial=False):
         obj_type = type(obj)
         if isinstance(obj, types.FunctionType):
-            self._parse_function(obj, declarations=declarations)
+            self._parse_function(obj, declarations=declarations,
+                                 is_serial=is_serial)
         elif hasattr(obj, '__class__'):
             self._parse_instance(obj)
         else:
@@ -347,16 +355,22 @@
 
         return methods
 
-    def _get_method_body(self, meth, lines, indent=' ' * 8, declarations=None):
+    def _get_method_body(self, meth, lines, indent=' ' * 8, declarations=None,
+                         is_serial=False):
         getfullargspec = getattr(
             inspect, 'getfullargspec', inspect.getargspec
         )
         args = set(getfullargspec(meth).args)
-        src = [self._process_body_line(line) for line in lines]
+        src = [self._process_body_line(line, is_serial=is_serial)
+               for line in lines]
         if declarations:
             cy_decls = []
             for var, decl in declarations.items():
-                cy_decls.append((var, indent + 'cdef %s\n' % decl[:-1]))
+                dtype, name = decl[:-1].split(' ')
+                if dtype[0] == 'u':
+                    dtype = 'unsigned %s' % dtype[1:]
+                modified_decl = '%s %s' % (dtype, name)
+                cy_decls.append((var, indent + 'cdef %s\n' % modified_decl))
             src = cy_decls + src
         declared = [] if not declarations else list(declarations.keys())
         for names, defn in src:
@@ -371,13 +385,15 @@
         code = ''.join(declare) + cython_body
         return code
 
-    def _get_method_wrapper(self, meth, indent=' ' * 8, declarations=None):
+    def _get_method_wrapper(self, meth, indent=' ' * 8, declarations=None,
+                            is_serial=False):
         sourcelines = getsourcelines(meth)[0]
         defn, lines = get_func_definition(sourcelines)
         m_name, returns, args = self._analyze_method(meth, lines)
         c_defn = self._get_c_method_spec(m_name, returns, args)
         c_body = self._get_method_body(meth, lines, indent=indent,
-                                       declarations=declarations)
+                                       declarations=declarations,
+                                       is_serial=is_serial)
         self.code = '{defn}\n{body}'.format(defn=c_defn, body=c_body)
         if self.python_methods:
             defn, body = self._get_py_method_spec(m_name, returns, args,
@@ -451,9 +467,28 @@
             defn = 'cdef {type} {name}'.format(type=ctype, name=name)
             return defn
 
-    def _parse_function(self, obj, declarations=None):
+    def _handle_cast_statement(self, name, call):
+        # FIXME: This won't handle casting to pointers
+        # using something like 'intp'
+        call_args = call[5:-1].split(',')
+        expr = call_args[0].strip()
+        ctype = call_args[1].strip()[1:-1]
+        return '%s = <%s> (%s)' % (name, ctype, expr)
+
+    def _handle_atomic_statement(self, name, call, is_serial):
+        # FIXME: This won't handle casting to pointers
+        # using something like 'intp'
+        call_arg = call[11:-1].strip()
+        if self._config.use_openmp and not is_serial:
+            return['openmp.omp_set_lock(&cy_lock)', '%s = %s' % (name, call_arg),
+                   '%s += 1' % call_arg, 'openmp.omp_unset_lock(&cy_lock)']
+        else:
+            return ['%s = %s' % (name, call_arg), '%s += 1' % call_arg]
+
+    def _parse_function(self, obj, declarations=None, is_serial=False):
         c_code, py_code = self._get_method_wrapper(obj, indent=' ' * 4,
-                                                   declarations=declarations)
+                                                   declarations=declarations,
+                                                   is_serial=is_serial)
         code = '{defn}\n{body}'.format(defn=c_code[0], body=c_code[1])
         if self.python_methods:
             code += '\n'
@@ -469,7 +504,7 @@
                                    methods=methods)
         self.code = helper.generate()
 
-    def _process_body_line(self, line):
+    def _process_body_line(self, line, is_serial=False):
         """Returns the name defined and the processed line itself.
 
         This hack primarily lets us declare variables from Python and inject
@@ -484,6 +519,23 @@
                 defn = self._handle_declare_statement(name, declare)
                 indent = line[:line.index(name)]
                 return name, indent + defn + '\n'
+            elif words[1].startswith('cast') and \
+                    not line.strip().startswith('#'):
+                name = words[0]
+                call = words[1]
+                stmt = self._handle_cast_statement(name, call)
+                indent = line[:line.index(name)]
+                return '', indent + stmt + '\n'
+            elif words[1].startswith('atomic_inc') and \
+                    not line.strip().startswith('#'):
+                name = words[0]
+                call = words[1]
+                indent = line[:line.index(name)]
+                stmts = self._handle_atomic_statement(name, call, is_serial)
+                result = ''
+                for stmt in stmts:
+                    result += indent + stmt + '\n'
+                return '', result + '\n'
             else:
                 return '', line
         else:
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/ext_module.py compyle-0.6/compyle/ext_module.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/ext_module.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/ext_module.py	2020-06-14 18:54:39.000000000 +0000
@@ -4,13 +4,13 @@
 from distutils.util import get_platform
 from distutils.errors import CompileError, LinkError
 import hashlib
-import imp
 import importlib
 import io
 import logging
 import numpy
 import os
-from os.path import dirname, exists, expanduser, isdir, join
+from os.path import exists, expanduser, isdir, join
+import platform
 from pyximport import pyxbuild
 import shutil
 import sys
@@ -57,6 +57,23 @@
         return unicode(s)
 
 
+def get_openmp_flags():
+    """Return the OpenMP flags for the platform.
+
+    This returns two lists, [extra_compile_args], [extra_link_args]
+    """
+    if sys.platform == 'win32':
+        return ['/openmp'], []
+    elif sys.platform == 'darwin':
+        if (os.environ.get('CC') is not None and
+           os.environ.get('CXX') is not None):
+            return ['-fopenmp'], ['-fopenmp']
+        else:
+            return ['-Xpreprocessor', '-fopenmp'], ['-lomp']
+    else:
+        return ['-fopenmp'], ['-fopenmp']
+
+
 class ExtModule(object):
     """Encapsulates the generated code, extension module etc.
     """
@@ -106,7 +123,7 @@
         self.extra_link_args = extra_link_args if extra_link_args else []
 
     def _add_local_include(self):
-        if sys.platform != 'win32':
+        if 'bsd' in platform.system().lower():
             local = '/usr/local/include'
             if local not in self.extra_inc_dirs:
                 self.extra_inc_dirs.append(local)
@@ -262,16 +279,16 @@
         Returns
         """
         self.write_and_build()
-        file, path, desc = imp.find_module(self.name, [dirname(self.ext_path)])
-        return imp.load_module(self.name, file, path, desc)
+        spec = importlib.util.spec_from_file_location(self.name, self.ext_path)
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        return module
 
     def _get_extra_args(self):
         ec, el = self.extra_compile_args, self.extra_link_args
         if get_config().use_openmp:
-            if sys.platform == 'win32':
-                return ['/openmp'] + ec, [] + el
-            else:
-                return ['-fopenmp'] + ec, ['-fopenmp'] + el
+            _ec, _el = get_openmp_flags()
+            return _ec + ec, _el + el
         else:
             return ec, el
 
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/__init__.py compyle-0.6/compyle/__init__.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/__init__.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/__init__.py	2020-06-14 18:54:39.000000000 +0000
@@ -1,2 +1,2 @@
 # See PEP 440 for more on suitable version numbers.
-__version__ = '0.6.dev0'
+__version__ = '0.6'
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/jit.py compyle-0.6/compyle/jit.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/jit.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/jit.py	2020-06-14 18:54:39.000000000 +0000
@@ -18,9 +18,24 @@
 from . import parallel
 
 
+def memoize_kernel(key=lambda *args: args):
+    def memoize_deco(method):
+        def wrapper(*args):
+            f = args[0].func
+            key_val = key(*args)
+            if not hasattr(f, 'cached_kernel'):
+                setattr(f, 'cached_kernel', {key_val: method(*args)})
+            elif key_val not in f.cached_kernel:
+                f.cached_kernel[key_val] = method(*args)
+            return f.cached_kernel[key_val]
+        return wrapper
+    return memoize_deco
+
+
 def kernel_cache_key_args(obj, *args):
     key = [get_ctype_from_arg(arg) for arg in args]
     key.append(obj.func)
+    key.append(obj.name)
     key.append(obj.backend)
     key.append(obj._config.use_openmp)
     return tuple(key)
@@ -205,6 +220,16 @@
                     names = [x.id for x in left.elts]
                     for name in names:
                         self.var_types[name] = self.get_type(type)
+            elif right.func.id == 'cast':
+                if not isinstance(right.args[1], ast.Str):
+                    self.error("Cast type should be a string.", node)
+                type = right.args[1].s
+                if isinstance(left, ast.Name):
+                    self.undecl_var_types[left.id] = self.get_type(type)
+            elif right.func.id == 'atomic_inc':
+                if left.id not in self.var_types and \
+                        left.id not in self.undecl_var_types:
+                    self.undecl_var_types[left.id] = self.visit(right.args[0])
             elif isinstance(left, ast.Name):
                 if left.id not in self.var_types and \
                         left.id not in self.undecl_var_types:
@@ -268,7 +293,7 @@
         backend = array.get_backend(backend)
         self.tp = Transpiler(backend=backend)
         self.backend = backend
-        self.name = func.__name__
+        self.name = 'elwise_%s' % func.__name__
         self.func = func
         self._config = get_config()
         self.cython_gen = CythonGenerator()
@@ -289,7 +314,7 @@
             type_info[name] = arg_type
         return type_info
 
-    @memoize(key=kernel_cache_key_args)
+    @memoize_kernel(key=kernel_cache_key_args)
     def _generate_kernel(self, *args):
         if self.func is not None:
             arg_types = self.get_type_info_from_args(*args)
@@ -364,7 +389,7 @@
             type_info[name] = arg_type
         return type_info
 
-    @memoize(key=kernel_cache_key_args)
+    @memoize_kernel(key=kernel_cache_key_args)
     def _generate_kernel(self, *args):
         if self.func is not None:
             arg_types = self.get_type_info_from_args(*args)
@@ -491,17 +516,18 @@
     def __call__(self, **kwargs):
         c_func = self._generate_kernel(**kwargs)
         c_args_dict = {k: self._massage_arg(x) for k, x in kwargs.items()}
+        output_arg_keys = self.output_func.arg_keys[self._get_backend_key()]
 
         if self.backend == 'cython':
-            size = len(c_args_dict[self.output_func.arg_keys[1]])
+            size = len(c_args_dict[output_arg_keys[1]])
             c_args_dict['SIZE'] = size
-            c_func(*[c_args_dict[k] for k in self.output_func.arg_keys])
+            c_func(*[c_args_dict[k] for k in output_arg_keys])
         elif self.backend == 'opencl':
-            c_func(*[c_args_dict[k] for k in self.output_func.arg_keys])
+            c_func(*[c_args_dict[k] for k in output_arg_keys])
             self.queue.finish()
         elif self.backend == 'cuda':
             import pycuda.driver as drv
             event = drv.Event()
-            c_func(*[c_args_dict[k] for k in self.output_func.arg_keys])
+            c_func(*[c_args_dict[k] for k in output_arg_keys])
             event.record()
             event.synchronize()
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/low_level.py compyle-0.6/compyle/low_level.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/low_level.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/low_level.py	2020-06-14 18:54:39.000000000 +0000
@@ -319,11 +319,20 @@
         pass
 
 
+class _cast(Extern):
+    def code(self, backend):
+        return ''
+
+    def __call__(self, *args, **kw):
+        pass
+
+
 prange = _prange()
 parallel = _parallel()
 nogil = _nogil()
 address = _address()
 atomic_inc = _atomic_inc()
+cast = _cast()
 
 
 class Cython(object):
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/parallel.py compyle-0.6/compyle/parallel.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/parallel.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/parallel.py	2020-06-14 18:54:39.000000000 +0000
@@ -62,8 +62,8 @@
         a, b = b, a%b
     return a
 
-cdef int get_stride(sz, itemsize):
-    return sz/gcd(sz, itemsize)
+cdef int get_stride(int sz, int itemsize):
+    return sz//gcd(sz, itemsize)
 
 
 cdef ${type} c_${name}(${c_arg_sig}):
@@ -130,7 +130,7 @@
     return a
 
 cdef int get_stride(int sz, int itemsize):
-    return sz / gcd(sz, itemsize)
+    return sz // gcd(sz, itemsize)
 
 
 cdef void c_${name}(${c_arg_sig}):
@@ -175,7 +175,7 @@
     # between threads
     % if not calc_last_item:
     # A chunk of 1 MB per thread
-    cdef int chunksize = 1048576 / sz
+    cdef int chunksize = 1048576 // sz
     % else:
     # Process all data together. Only then can we get
     # the last item immediately
@@ -194,7 +194,7 @@
             buffer_idx = tid * scan_stride
 
             start = offset + tid * chunksize
-            end = offset + min((tid + 1) * chunksize, SIZE)
+            end = min(offset + (tid + 1) * chunksize, SIZE)
             has_segment = 0
 
             temp = ${neutral}
@@ -281,7 +281,7 @@
             carry = buffer[buffer_idx]
 
             start = offset + tid * chunksize
-            end = offset + min((tid + 1) * chunksize, SIZE)
+            end = min(offset + (tid + 1) * chunksize, SIZE)
 
             for i in range(start, end):
                 # Output
@@ -326,7 +326,6 @@
     return c_${name}(${py_args})
 '''
 
-# No support for last_item in single thread
 scan_cy_single_thread_template = '''
 from cython.parallel import parallel, prange, threadid
 from libc.stdlib cimport abort, malloc, free
@@ -338,6 +337,14 @@
     cdef ${type} a, b, item
     N = SIZE
 
+    % if calc_last_item:
+    a = ${neutral}
+    for i in range(N):
+        b = ${input_expr}
+        a = ${scan_expr}
+    last_item = a
+    % endif
+
     a = ${neutral}
 
     for i in range(N):
@@ -375,12 +382,26 @@
     return result
 
 
+def serial(func=None, **kw):
+    """Decorator to specify serial execution of a cython
+    function
+    """
+    def wrapper(func):
+        func.is_serial = True
+        return func
+
+    if func is None:
+        return wrapper
+    else:
+        return wrapper(func)
+
+
 class ElementwiseBase(object):
     def __init__(self, func, backend=None):
         backend = array.get_backend(backend)
         self.tp = Transpiler(backend=backend)
         self.backend = backend
-        self.name = func.__name__
+        self.name = 'elwise_%s' % func.__name__
         self.func = func
         self._config = get_config()
         self.cython_gen = CythonGenerator()
@@ -390,23 +411,25 @@
     def _generate(self, declarations=None):
         self.tp.add(self.func, declarations=declarations)
         if self.backend == 'cython':
+            # FIXME: Handle the name of the kernel correctly
             py_data, c_data = self.cython_gen.get_func_signature(self.func)
             py_defn = ['long SIZE'] + py_data[0][1:]
             c_defn = ['long SIZE'] + c_data[0][1:]
             py_args = ['SIZE'] + py_data[1][1:]
             template = Template(text=elementwise_cy_template)
             src = template.render(
-                name=self.name,
+                name=self.name[7:],
                 c_arg_sig=', '.join(c_defn),
                 c_args=', '.join(c_data[1]),
                 py_arg_sig=', '.join(py_defn),
                 py_args=', '.join(py_args),
-                openmp=self._config.use_openmp,
+                openmp=self._config.use_openmp and not getattr(
+                    self.func, 'is_serial', False),
                 get_parallel_range=get_parallel_range
             )
             self.tp.add_code(src)
             self.tp.compile()
-            return getattr(self.tp.mod, 'py_' + self.name)
+            return getattr(self.tp.mod, 'py_' + self.name[7:])
         elif self.backend == 'opencl':
             py_data, c_data = self.cython_gen.get_func_signature(self.func)
             self._correct_opencl_address_space(c_data)
@@ -428,7 +451,7 @@
             )
             knl = ElementwiseKernel(
                 ctx,
-                name='elwise_%s' % self.name,
+                name=self.name,
                 arguments=arguments,
                 operation=expr,
                 preamble="\n".join([cluda_preamble, preamble])
@@ -453,7 +476,7 @@
                 double_support=True
             )
             knl = ElementwiseKernel(
-                name='elwise_%s' % self.name,
+                name=self.name,
                 arguments=arguments,
                 operation=expr,
                 preamble="\n".join([cluda_preamble, preamble])
@@ -785,6 +808,9 @@
         self.queue = None
         self.c_func = self._generate()
 
+    def _get_backend_key(self):
+        return (self.backend, self._config.use_openmp, self._config.use_double)
+
     def _correct_return_type(self, c_data, modifier):
         code = self.tp.blocks[-1].code.splitlines()
         if self._config.use_openmp:
@@ -839,10 +865,11 @@
         input_expr = 'input[i]'
         return py_data, c_data, input_expr
 
-    def _wrap_cython_code(self, func, func_type=None):
+    def _wrap_cython_code(self, func, func_type=None,
+                          declarations=None):
         name = self.name
         if func is not None:
-            self.tp.add(func)
+            self.tp.add(func, declarations=declarations)
             py_data, c_data = self.cython_gen.get_func_signature(func)
             self._correct_return_type(c_data, func_type)
 
@@ -872,15 +899,18 @@
         all_c_data = [[], []]
 
         # Process input function
-        py_data, c_data, input_expr = self._wrap_cython_code(self.input_func,
-                                                             func_type='input')
+        py_data, c_data, input_expr = self._wrap_cython_code(
+            self.input_func,
+            func_type='input',
+            declarations=declarations
+        )
         self._append_cython_arg_data(all_py_data, all_c_data,
                                      py_data, c_data)
 
         # Process segment function
         use_segment = True if self.is_segment_func is not None else False
         py_data, c_data, segment_expr = self._wrap_cython_code(
-            self.is_segment_func, func_type='segment')
+            self.is_segment_func, func_type='segment', declarations=declarations)
         self._append_cython_arg_data(all_py_data, all_c_data, py_data, c_data)
 
         # Process output expression
@@ -888,7 +918,8 @@
         calc_prev_item = False
 
         py_data, c_data, output_expr = self._wrap_cython_code(
-            self.output_func, func_type='output')
+            self.output_func, func_type='output',
+            declarations=declarations)
         if self.output_func is not None:
             calc_last_item = self._include_last_item()
             calc_prev_item = self._include_prev_item()
@@ -906,7 +937,9 @@
         py_args = drop_duplicates(py_args)
         c_args = drop_duplicates(c_args)
 
-        self.output_func.arg_keys = c_args
+        if not hasattr(self.output_func, 'arg_keys'):
+            self.output_func.arg_keys = {}
+        self.output_func.arg_keys[self._get_backend_key()] = c_args
 
         if self._config.use_openmp:
             template = Template(text=scan_cy_template)
@@ -991,7 +1024,9 @@
 
         c_args = input_c_args + segment_c_args + output_c_args
         c_args = drop_duplicates(c_args)
-        self.output_func.arg_keys = c_args
+        if not hasattr(self.output_func, 'arg_keys'):
+            self.output_func.arg_keys = {}
+        self.output_func.arg_keys[self._get_backend_key()] = c_args
 
         return scan_expr, arg_defn, input_expr, output_expr, \
             segment_expr, preamble
@@ -1075,18 +1110,19 @@
 
     def __call__(self, **kwargs):
         c_args_dict = {k: self._massage_arg(x) for k, x in kwargs.items()}
+        output_arg_keys = self.output_func.arg_keys[self._get_backend_key()]
 
         if self.backend == 'cython':
-            size = len(c_args_dict[self.output_func.arg_keys[1]])
+            size = len(c_args_dict[output_arg_keys[1]])
             c_args_dict['SIZE'] = size
-            self.c_func(*[c_args_dict[k] for k in self.output_func.arg_keys])
+            self.c_func(*[c_args_dict[k] for k in output_arg_keys])
         elif self.backend == 'opencl':
-            self.c_func(*[c_args_dict[k] for k in self.output_func.arg_keys])
+            self.c_func(*[c_args_dict[k] for k in output_arg_keys])
             self.queue.finish()
         elif self.backend == 'cuda':
             import pycuda.driver as drv
             event = drv.Event()
-            self.c_func(*[c_args_dict[k] for k in self.output_func.arg_keys])
+            self.c_func(*[c_args_dict[k] for k in output_arg_keys])
             event.record()
             event.synchronize()
 
@@ -1095,6 +1131,7 @@
     def __init__(self, input=None, output=None, scan_expr="a+b",
                  is_segment=None, dtype=np.float64, neutral='0',
                  complex_map=False, backend=None):
+        # FIXME: Revisit these conditions
         input_base = input is None or \
             getattr(input, '__annotations__', None) and \
             not hasattr(input, 'is_jit')
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/sort.py compyle-0.6/compyle/sort.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/sort.py	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/compyle/sort.py	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,88 @@
+import numpy as np
+
+from .config import get_config
+from .cython_generator import get_parallel_range, CythonGenerator
+from .transpiler import Transpiler, convert_to_float_if_needed
+from .types import dtype_to_ctype, annotate
+from .parallel import Scan
+from .template import Template
+
+from . import array
+
+
+class OutputSortBit(Template):
+    def __init__(self, name, num_arys):
+        super(OutputSortBit, self).__init__(name=name)
+        self.num_arys = num_arys
+
+    def extra_args(self):
+        args = ['inp_%s' % num for num in range(self.num_arys)]
+        args += ['out_%s' % num for num in range(self.num_arys)]
+        return args, {}
+
+    def template(self, i, item, prev_item, last_item, bit_number, indices,
+                 sorted_indices):
+        '''
+        key_bit = (inp_0[i] >> bit_number) & 1
+        t = last_item + i - prev_item
+        idx = t if key_bit else prev_item
+
+        sorted_indices[idx] = indices[i]
+        % for num in range(obj.num_arys):
+        out_${num}[idx] = inp_${num}[i]
+        % endfor
+        '''
+
+
+@annotate
+def input_sort_bit(i, inp_0, bit_number):
+    return 1 if (inp_0[i] >> bit_number) & 1 == 0 else 0
+
+
+def radix_sort(ary_list, out_list=None, max_key_bits=None, backend=None):
+    keys = ary_list[0]
+    backend = array.get_backend(backend)
+    if not np.issubdtype(keys.dtype, np.integer):
+        raise ValueError("RadixSort can only sort integer types")
+    if max_key_bits is None:
+        max_key_bits = 8 * keys.dtype.itemsize
+
+    # temp arrays
+    sorted_indices = array.zeros(keys.length, np.int32, backend=backend)
+    temp_indices = array.zeros_like(sorted_indices)
+
+    indices = array.arange(0, keys.length, 1, backend=backend)
+
+    # allocate temp arrays
+    if out_list:
+        temp_ary_list = out_list
+    else:
+        temp_ary_list = [array.zeros_like(ary) for ary in ary_list]
+    sorted_ary_list = [array.zeros_like(ary) for ary in ary_list]
+
+    # kernel
+    output_sort_bit = OutputSortBit('output_sort_bit', len(ary_list))
+
+    sort_bit_knl = Scan(input_sort_bit, output_sort_bit.function,
+                        'a+b', dtype=keys.dtype, backend=backend)
+
+    for bit_number in range(max_key_bits):
+        if bit_number == 0:
+            inp_indices = indices
+            inp_ary_list = ary_list
+        else:
+            inp_indices = temp_indices
+            inp_ary_list = temp_ary_list
+
+        args = {'bit_number': bit_number, 'indices': indices,
+                'sorted_indices': sorted_indices}
+        args.update({'inp_%i' % i: ary for i, ary in enumerate(inp_ary_list)})
+        args.update({'out_%i' %
+                     i: ary for i, ary in enumerate(sorted_ary_list)})
+
+        sort_bit_knl(**args)
+
+        temp_indices, sorted_indices = sorted_indices, temp_indices
+        temp_ary_list, sorted_ary_list = sorted_ary_list, temp_ary_list
+
+    return temp_ary_list, temp_indices
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_array.py compyle-0.6/compyle/tests/test_array.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_array.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/tests/test_array.py	2020-06-14 18:54:39.000000000 +0000
@@ -2,11 +2,12 @@
 import numpy as np
 
 from ..array import Array
+from ..config import get_config
 import compyle.array as array
 
 
-test_all_backends = pytest.mark.parametrize('backend',
-                                            ['cython', 'opencl', 'cuda'])
+check_all_backends = pytest.mark.parametrize('backend',
+                                             ['cython', 'opencl', 'cuda'])
 
 
 def make_dev_array(backend, n=16):
@@ -23,7 +24,7 @@
         pytest.importorskip('pycuda')
 
 
-@test_all_backends
+@check_all_backends
 def test_reserve(backend):
     check_import(backend)
 
@@ -39,7 +40,7 @@
     assert dev_array[0] == 1
 
 
-@test_all_backends
+@check_all_backends
 def test_resize_with_reallocation(backend):
     check_import(backend)
 
@@ -55,7 +56,7 @@
     assert dev_array[0] == 1
 
 
-@test_all_backends
+@check_all_backends
 def test_resize_without_reallocation(backend):
     check_import(backend)
 
@@ -71,7 +72,7 @@
     assert dev_array[0] == 1
 
 
-@test_all_backends
+@check_all_backends
 def test_copy(backend):
     check_import(backend)
 
@@ -89,7 +90,7 @@
     assert dev_array[0] != dev_array_copy[0]
 
 
-@test_all_backends
+@check_all_backends
 def test_append_with_reallocation(backend):
     check_import(backend)
 
@@ -104,7 +105,7 @@
     assert len(dev_array.get_data()) == 32
 
 
-@test_all_backends
+@check_all_backends
 def test_append_without_reallocation(backend):
     check_import(backend)
 
@@ -120,7 +121,7 @@
     assert len(dev_array.get_data()) == 20
 
 
-@test_all_backends
+@check_all_backends
 def test_extend(backend):
     check_import(backend)
 
@@ -137,7 +138,7 @@
     assert np.all(old_nparr[-len(new_array)] == new_nparr)
 
 
-@test_all_backends
+@check_all_backends
 def test_remove(backend):
     check_import(backend)
 
@@ -155,7 +156,7 @@
     assert np.all(dev_array.get() == (8 + indices).get())
 
 
-@test_all_backends
+@check_all_backends
 def test_align(backend):
     check_import(backend)
 
@@ -172,7 +173,7 @@
     assert np.all(dev_array.get() == indices.get())
 
 
-@test_all_backends
+@check_all_backends
 def test_align_multiple(backend):
     check_import(backend)
 
@@ -196,7 +197,7 @@
     assert np.all(dev_array_b.get() - 1024 == indices.get())
 
 
-@test_all_backends
+@check_all_backends
 def test_squeeze(backend):
     check_import(backend)
 
@@ -213,7 +214,7 @@
     assert dev_array.alloc == 16
 
 
-@test_all_backends
+@check_all_backends
 def test_copy_values(backend):
     check_import(backend)
 
@@ -231,7 +232,7 @@
     assert np.all(dev_array[:len(indices)].get() == dest.get())
 
 
-@test_all_backends
+@check_all_backends
 def test_min_max(backend):
     check_import(backend)
 
@@ -248,21 +249,75 @@
     assert dev_array.maximum == 10
 
 
-@test_all_backends
+@check_all_backends
 def test_sort_by_keys(backend):
     check_import(backend)
 
     # Given
-    dev_array = make_dev_array(backend)
+    nparr1 = np.random.randint(0, 100, 16, dtype=np.int32)
+    nparr2 = np.random.randint(0, 100, 16, dtype=np.int32)
+    dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend)
+
+    # When
+    out_array1, out_array2 = array.sort_by_keys([dev_array1, dev_array2])
+
+    # Then
+    order = np.argsort(nparr1)
+    act_result1 = np.take(nparr1, order)
+    act_result2 = np.take(nparr2, order)
+    assert np.all(out_array1.get() == act_result1)
+    assert np.all(out_array2.get() == act_result2)
+
+
+def test_radix_sort_by_keys():
+    backend = 'cython'
+    for use_openmp in [True, False]:
+        get_config().use_openmp = use_openmp
+        # Given
+        nparr1 = np.random.randint(0, 100, 16, dtype=np.int32)
+        nparr2 = np.random.randint(0, 100, 16, dtype=np.int32)
+        dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend)
+
+        # When
+        out_array1, out_array2 = array.sort_by_keys([dev_array1, dev_array2],
+                                                    use_radix_sort=True)
+
+        # Then
+        order = np.argsort(nparr1)
+        act_result1 = np.take(nparr1, order)
+        act_result2 = np.take(nparr2, order)
+        assert np.all(out_array1.get() == act_result1)
+        assert np.all(out_array2.get() == act_result2)
+    get_config().use_openmp = False
+
+
+@pytest.mark.parametrize(
+    'backend', ['cython', 'opencl',
+                pytest.param('cuda', marks=pytest.mark.xfail)])
+def test_sort_by_keys_with_output(backend):
+    check_import(backend)
+
+    # Given
+    nparr1 = np.random.randint(0, 100, 16, dtype=np.int32)
+    nparr2 = np.random.randint(0, 100, 16, dtype=np.int32)
+    dev_array1, dev_array2 = array.wrap(nparr1, nparr2, backend=backend)
+    out_arrays = [
+        array.zeros_like(dev_array1),
+        array.zeros_like(dev_array2)]
 
     # When
-    out_array = array.sort_by_keys([dev_array])[0]
+    array.sort_by_keys([dev_array1, dev_array2],
+                       out_list=out_arrays, use_radix_sort=False)
 
     # Then
-    assert np.all(out_array.get() == np.sort(dev_array.get()))
+    order = np.argsort(nparr1)
+    act_result1 = np.take(nparr1, order)
+    act_result2 = np.take(nparr2, order)
+    assert np.all(out_arrays[0].get() == act_result1)
+    assert np.all(out_arrays[1].get() == act_result2)
 
 
-@test_all_backends
+@check_all_backends
 def test_dot(backend):
     check_import(backend)
 
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_cython_generator.py compyle-0.6/compyle/tests/test_cython_generator.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_cython_generator.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/tests/test_cython_generator.py	2020-06-14 18:54:39.000000000 +0000
@@ -6,10 +6,10 @@
 import sys
 
 
-from ..config import get_config, set_config
+from ..config import get_config, set_config, use_config
 from ..types import declare, KnownType, annotate
 from ..cython_generator import (CythonGenerator, CythonClassHelper,
-                                all_numeric)
+                                all_numeric, get_parallel_range)
 
 
 class BasicEq:
@@ -123,6 +123,81 @@
         """)
         self.assert_code_equal(c.generate().strip(), expect.strip())
 
+    def test_get_parallel_range_without_openmp(self):
+        with use_config(use_openmp=False):
+            # Given/When
+            res = get_parallel_range('NP')
+            # Then
+            self.assertEqual(res, 'range(0, NP, 1)')
+
+            # Given/When
+            res = get_parallel_range('START', 'NP')
+            # Then
+            self.assertEqual(res, 'range(START, NP, 1)')
+
+            # Given/When
+            res = get_parallel_range('NP', step=2)
+            # Then
+            self.assertEqual(res, 'range(0, NP, 2)')
+
+            # Given/When
+            res = get_parallel_range(1, 'NP+1', 2)
+            # Then
+            self.assertEqual(res, 'range(1, NP+1, 2)')
+
+    def test_get_parallel_range_with_openmp(self):
+        with use_config(use_openmp=True):
+            cfg = get_config()
+            sched, chunk = cfg.omp_schedule
+            # Given/When
+            res = get_parallel_range('NP')
+            # Then
+            expect = "prange(0, NP, 1, schedule='{}', chunksize={})".format(
+                sched, chunk
+            )
+            self.assertEqual(res, expect)
+
+            # Given/When
+            res = get_parallel_range('START', 'NP', 2)
+            # Then
+            expect = (
+                "prange(START, NP, 2, schedule='{}', chunksize={})".format(
+                    sched, chunk
+                )
+            )
+            self.assertEqual(res, expect)
+
+            # Given/When
+            res = get_parallel_range('NP', nogil=True)
+            # Then
+            expect = (
+                "prange(0, NP, 1, schedule='{}', chunksize={}, "
+                "nogil=True)".format(
+                    sched, chunk
+                )
+            )
+            self.assertEqual(res, expect)
+
+            # Given/When
+            res = get_parallel_range('NP', nogil=True, num_threads=4)
+            # Then
+            expect = (
+                "prange(0, NP, 1, schedule='{}', chunksize={}, "
+                "nogil=True, num_threads=4)".format(
+                    sched, chunk
+                )
+            )
+            self.assertEqual(res, expect)
+
+        with use_config(use_openmp=True, omp_schedule=('static', 32)):
+            # Given/When
+            res = get_parallel_range('NP')
+            # Then
+            expect = "prange(0, NP, 1, schedule='{}', chunksize={})".format(
+                'static', 32
+            )
+            self.assertEqual(res, expect)
+
 
 class TestCythonCodeGenerator(TestBase):
     def setUp(self):
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_ext_module.py compyle-0.6/compyle/tests/test_ext_module.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_ext_module.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/tests/test_ext_module.py	2020-06-14 18:54:39.000000000 +0000
@@ -61,6 +61,7 @@
     def setUp(self):
         self.root = tempfile.mkdtemp()
         self.data = dedent('''\
+        # cython: language_level=3
         def f():
             return "hello world"
         ''')
@@ -112,7 +113,7 @@
         self.assertTrue(exists(s.ext_path))
 
     def _create_dummy_module(self):
-        code = "def hello(): return 'hello'"
+        code = "# cython: language_level=3\ndef hello(): return 'hello'"
         modname = 'test_rebuild.py'
         f = join(self.root, modname)
         with open(f, 'w') as fp:
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_low_level.py compyle-0.6/compyle/tests/test_low_level.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_low_level.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/tests/test_low_level.py	2020-06-14 18:54:39.000000000 +0000
@@ -147,6 +147,14 @@
             y[i] = x[i] * a
 
 
+@annotate(int='num, return_')
+def _factorial(num):
+    if num == 0:
+        return 1
+    else:
+        return num*_factorial(num - 1)
+
+
 class TestCython(unittest.TestCase):
     def test_cython_code_with_return_and_nested_call(self):
         # Given
@@ -177,3 +185,12 @@
 
         # Then
         self.assertTrue(np.allclose(y, x * a))
+
+    def test_recursive_function(self):
+        # Given/when
+        fac = Cython(_factorial)
+
+        # Then
+        self.assertEqual(fac(0), 1)
+        self.assertEqual(fac(1), 1)
+        self.assertEqual(fac(3), 6)
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_parallel.py compyle-0.6/compyle/tests/test_parallel.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_parallel.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/tests/test_parallel.py	2020-06-14 18:54:39.000000000 +0000
@@ -5,9 +5,10 @@
 from pytest import importorskip
 
 from ..config import get_config, use_config
-from ..array import wrap
+from ..array import wrap, zeros
 from ..types import annotate
 from ..parallel import Elementwise, Reduction, Scan
+from ..low_level import atomic_inc
 from .test_jit import g
 
 MY_CONST = 42
@@ -94,6 +95,14 @@
         with use_config(use_openmp=True):
             self._test_scan(backend='cython')
 
+    def test_large_scan_works_cython_parallel(self):
+        with use_config(use_openmp=True):
+            self._test_large_scan(backend='cython')
+
+    def test_large_scan_works_cython_parallel(self):
+        with use_config(use_openmp=True):
+            self._test_large_scan(backend='cython')
+
     def test_scan_works_opencl(self):
         importorskip('pyopencl')
         self._test_scan(backend='opencl')
@@ -160,6 +169,9 @@
         with use_config(use_openmp=True):
             self._test_scan_last_item(backend='cython')
 
+    def test_scan_last_item_cython_serial(self):
+        self._test_scan_last_item(backend='cython')
+
     def test_scan_last_item_opencl(self):
         importorskip('pyopencl')
         self._test_scan_last_item(backend='opencl')
@@ -168,6 +180,21 @@
         importorskip('pycuda')
         self._test_scan_last_item(backend='cuda')
 
+    def test_atomic_inc_cython(self):
+        self._test_atomic_inc(backend='cython')
+
+    def test_atomic_inc_cython_parallel(self):
+        with use_config(use_openmp=True):
+            self._test_atomic_inc(backend='cython')
+
+    def test_atomic_inc_opencl(self):
+        importorskip('pyopencl')
+        self._test_atomic_inc(backend='opencl')
+
+    def test_atomic_inc_cuda(self):
+        importorskip('pycuda')
+        self._test_atomic_inc(backend='cuda')
+
 
 class TestParallelUtils(ParallelUtilsBase, unittest.TestCase):
     def setUp(self):
@@ -298,6 +325,33 @@
         # Then
         np.testing.assert_equal(expect, result)
 
+    def _test_large_scan(self, backend):
+        # Given
+        a = np.ones(3000000, dtype=np.int32)
+        data = a.copy()
+        expect = np.cumsum(data)
+
+        a = wrap(a, backend=backend)
+
+        @annotate(i='int', ary='intp', return_='int')
+        def input_f(i, ary):
+            return ary[i]
+
+        @annotate(int='i, item', ary='intp')
+        def output_f(i, item, ary):
+            ary[i] = item
+
+        # When
+        scan = Scan(input_f, output_f, 'a+b', dtype=np.int32,
+                    backend=backend)
+        scan(ary=a)
+
+        a.pull()
+        result = a.data
+
+        # Then
+        np.testing.assert_equal(expect, result)
+
     def _test_scan_with_external_func(self, backend):
         # Given
         a = np.arange(10000, dtype=np.int32)
@@ -426,6 +480,25 @@
         # Then
         np.testing.assert_equal(expect, a.data)
 
+    def _test_atomic_inc(self, backend):
+        # Given
+        a = np.random.randint(0, 100, 50000, dtype=np.int32)
+        result = zeros(1, dtype=np.int32, backend=backend)
+
+        a = wrap(a, backend=backend)
+
+        @annotate(gintp='x, result', i='int')
+        def reduce_knl(i, x, result):
+            b = declare('int')
+            b = atomic_inc(result[0])
+
+        # When
+        knl = Elementwise(reduce_knl, backend=backend)
+        knl(a, result)
+
+        # Then
+        self.assertTrue(result[0] == 50000)
+
 
 class TestParallelUtilsJIT(ParallelUtilsBase, unittest.TestCase):
     def setUp(self):
@@ -558,6 +631,33 @@
         # Then
         np.testing.assert_equal(expect, result)
 
+    def _test_large_scan(self, backend):
+        # Given
+        a = np.ones(3000000, dtype=np.int32)
+        data = a.copy()
+        expect = np.cumsum(data)
+
+        a = wrap(a, backend=backend)
+
+        @annotate
+        def input_f(i, ary):
+            return ary[i]
+
+        @annotate
+        def output_f(i, item, ary):
+            ary[i] = item
+
+        # When
+        scan = Scan(input_f, output_f, 'a+b', dtype=np.int32,
+                    backend=backend)
+        scan(ary=a)
+
+        a.pull()
+        result = a.data
+
+        # Then
+        np.testing.assert_equal(expect, result)
+
     def _test_scan_with_external_func(self, backend):
         # Given
         a = np.arange(10000, dtype=np.int32)
@@ -684,3 +784,22 @@
 
         # Then
         np.testing.assert_equal(expect, a.data)
+
+    def _test_atomic_inc(self, backend):
+        # Given
+        a = np.random.randint(0, 100, 50000, dtype=np.int32)
+        result = zeros(1, dtype=np.int32, backend=backend)
+
+        a = wrap(a, backend=backend)
+
+        @annotate
+        def reduce_knl(i, x, result):
+            b = declare('int')
+            b = atomic_inc(result[0])
+
+        # When
+        knl = Elementwise(reduce_knl, backend=backend)
+        knl(a, result)
+
+        # Then
+        self.assertTrue(result[0] == 50000)
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_translator.py compyle-0.6/compyle/tests/test_translator.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_translator.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/tests/test_translator.py	2020-06-14 18:54:39.000000000 +0000
@@ -1367,3 +1367,36 @@
     ''')
 
     assert code.strip() == expect.strip()
+
+def test_cast_works():
+    # Given
+    def f(x=1.0):
+        return cast(x, "float")
+
+    # When
+    t = OpenCLConverter()
+    code = t.parse_function(f)
+
+    # Then
+    expect = dedent('''
+    WITHIN_KERNEL double f(double x)
+    {
+        return (float) (x);
+    }
+    ''')
+
+    assert code.strip() == expect.strip()
+
+    # When
+    t = CUDAConverter()
+    code = t.parse_function(f)
+
+    # Then
+    expect = dedent('''
+    WITHIN_KERNEL double f(double x)
+    {
+        return (float) (x);
+    }
+    ''')
+
+    assert code.strip() == expect.strip()
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_transpiler.py compyle-0.6/compyle/tests/test_transpiler.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/tests/test_transpiler.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/tests/test_transpiler.py	2020-06-14 18:54:39.000000000 +0000
@@ -37,6 +37,13 @@
     foo(x)
 
 
+def _factorial(num):
+    if num == 0:
+        return 1
+    else:
+        return num*_factorial(num - 1)
+
+
 class TestTranspiler(unittest.TestCase):
     def test_get_external_symbols_and_calls(self):
         # Given/When
@@ -65,6 +72,18 @@
         self.assertRaises(NameError, get_external_symbols_and_calls,
                           undefined_call, 'cython')
 
+    def test_get_external_symbols_and_calls_handles_recursion(self):
+        # Given/When
+        syms, implicit, calls, ext = get_external_symbols_and_calls(
+            _factorial, 'cython'
+        )
+
+        # Then
+        self.assertEqual(syms, {})
+        self.assertEqual(calls, [])
+        self.assertEqual(implicit, set())
+        self.assertEqual(ext, [])
+
     def test_transpiler(self):
         # Given
         t = Transpiler(backend='cython')
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/translator.py compyle-0.6/compyle/translator.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/translator.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/translator.py	2020-06-14 18:54:39.000000000 +0000
@@ -384,6 +384,8 @@
                 return '(&%s)' % self.visit(node.args[0])
             elif 'atomic' in node.func.id:
                 return self.render_atomic(node.func.id, node.args[0])
+            elif node.func.id == 'cast':
+                return '(%s) (%s)' % (node.args[1].s, self.visit(node.args[0]))
             else:
                 return '{func}({args})'.format(
                     func=node.func.id,
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/compyle/transpiler.py compyle-0.6/compyle/transpiler.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/compyle/transpiler.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/compyle/transpiler.py	2020-06-14 18:54:39.000000000 +0000
@@ -69,6 +69,8 @@
     names, calls = get_unknown_names_and_calls(src)
     names -= ignore
     calls = filter_calls(calls)
+    if func.__name__ in calls:
+        calls.remove(func.__name__)
     mod = importlib.import_module(func.__module__)
     symbols = {}
     implicit = set()
@@ -140,11 +142,21 @@
         if backend == 'cython':
             self._cgen = CythonGenerator()
             self.header = dedent('''
+            # cython: language_level=3
             from libc.stdio cimport printf
             from libc.math cimport *
             from libc.math cimport fabs as abs
             from cython.parallel import parallel, prange
             ''')
+
+            if get_config().use_openmp:
+                self.header += dedent('''
+                cimport openmp
+
+                cdef openmp.omp_lock_t cy_lock
+                openmp.omp_init_lock(&cy_lock)
+                ''')
+
         elif backend == 'opencl':
             from pyopencl._cluda import CLUDA_PREAMBLE
             self._cgen = OpenCLConverter()
@@ -257,9 +269,10 @@
         self._handle_external(obj, declarations=declarations)
 
         if self.backend == 'cython':
+            is_serial = getattr(obj, 'is_serial', False)
             self._cgen.parse(
                 obj, declarations=declarations.get(obj.__name__)
-                if declarations else None)
+                if declarations else None, is_serial=is_serial)
             code = self._cgen.get_code()
         elif self.backend == 'opencl' or self.backend == 'cuda':
             code = self._cgen.parse(
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/debian/changelog compyle-0.6/debian/changelog
--- compyle-0.6~dev0~20190922.gitaa5a50d/debian/changelog	2020-01-01 19:54:59.000000000 +0000
+++ compyle-0.6/debian/changelog	2020-06-16 06:31:52.000000000 +0000
@@ -1,3 +1,12 @@
+compyle (0.6-1) unstable; urgency=medium
+
+  * New upstream version.
+  * Standards version bumped to 4.5.0 (no changes).
+  * debian/patches:
+    - refresh all patches
+
+ -- Antonio Valentino <antonio.valentino@tiscali.it>  Tue, 16 Jun 2020 06:31:52 +0000
+
 compyle (0.6~dev0~20190922.gitaa5a50d-2) unstable; urgency=medium
 
   * Team upload.
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/debian/control compyle-0.6/debian/control
--- compyle-0.6~dev0~20190922.gitaa5a50d/debian/control	2019-09-26 06:52:01.000000000 +0000
+++ compyle-0.6/debian/control	2020-06-16 06:31:52.000000000 +0000
@@ -15,7 +15,7 @@
                python3-pytest,
                python3-pytools,
                python3-setuptools,
-Standards-Version: 4.4.0
+Standards-Version: 4.5.0
 Vcs-Browser: https://salsa.debian.org/science-team/compyle
 Vcs-Git: https://salsa.debian.org/science-team/compyle.git
 Homepage: https://github.com/pypr/compyle
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/debian/.gitlab-ci.yml compyle-0.6/debian/.gitlab-ci.yml
--- compyle-0.6~dev0~20190922.gitaa5a50d/debian/.gitlab-ci.yml	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/debian/.gitlab-ci.yml	2020-06-16 06:31:52.000000000 +0000
@@ -0,0 +1,3 @@
+include:
+ - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml
+ - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/debian/patches/0001-Disable-cython-extension.patch compyle-0.6/debian/patches/0001-Disable-cython-extension.patch
--- compyle-0.6~dev0~20190922.gitaa5a50d/debian/patches/0001-Disable-cython-extension.patch	2019-09-26 06:52:01.000000000 +0000
+++ compyle-0.6/debian/patches/0001-Disable-cython-extension.patch	2020-06-16 06:31:52.000000000 +0000
@@ -4,8 +4,8 @@
 
 ---
  compyle/cuda.py | 2 +-
- setup.py        | 2 +-
- 2 files changed, 2 insertions(+), 2 deletions(-)
+ setup.py        | 4 ++--
+ 2 files changed, 3 insertions(+), 3 deletions(-)
 
 diff --git a/compyle/cuda.py b/compyle/cuda.py
 index 9986646..c3decc6 100644
@@ -21,9 +21,18 @@
  from pycuda.compiler import SourceModule as _SourceModule
  from pycuda.tools import dtype_to_ctype
 diff --git a/setup.py b/setup.py
-index 73f58a2..3ae91c4 100644
+index 97e48f1..c9ea834 100644
 --- a/setup.py
 +++ b/setup.py
+@@ -24,7 +24,7 @@ tests_require = ['pytest']
+ if sys.version_info[0] < 3:
+     tests_require += ['mock>=1.0']
+ docs_require = ['sphinx']
+-cuda_require = ['pycuda', 'cupy']
++cuda_require = ['pycuda']  # , 'cupy']
+ opencl_require = ['pyopencl']
+ 
+ classes = '''
 @@ -67,7 +67,7 @@ setup(
      url='https://github.com/pypr/compyle',
      classifiers=classifiers,
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/docs/source/installation.rst compyle-0.6/docs/source/installation.rst
--- compyle-0.6~dev0~20190922.gitaa5a50d/docs/source/installation.rst	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/docs/source/installation.rst	2020-06-14 18:54:39.000000000 +0000
@@ -23,6 +23,7 @@
 .. _numpy: http://www.numpy.org
 .. _PyCUDA: https://documen.tician.de/pycuda
 .. _OpenMP: http://openmp.org/
+.. _CuPy: https://cupy.chainer.org/
 
 
 Setting up on GNU/Linux
@@ -42,25 +43,59 @@
 <http://stackoverflow.com/questions/12228382/after-install-xcode-where-is-clang>`_
 if you installed XCode but can't find clang or gcc.
 
+If you are getting strange errors of the form::
+
+  lang: warning: libstdc++ is deprecated; move to libc++ with a minimum deployment target of OS X 10.9 [-Wdeprecated]
+  ld: library not found for -lstdc++
+  clang: error: linker command failed with exit code 1 (use -v to see invocation)
+
+Then try this (on a bash shell)::
+
+  $ export MACOSX_DEPLOYMENT_TARGET=10.9
+
+And run your command again (replace the above with a suitable line on other
+shells). This is necessary because your Python was compiled with an older
+deployment target and the current version of XCode that you have installed is
+not compatible with that. By setting the environment variable you allow
+compyle to use a newer version. If this works, it is a good idea to set this
+in your default environment (``.bashrc`` for bash shells) so you do not have
+to do this every time.
+
 
 OpenMP on MacOS
 ~~~~~~~~~~~~~~~~
 
-The default "gcc" available on OSX uses an LLVM backend and does not support
-OpenMP_. To use OpenMP_ on OSX, you can install the GCC available on brew_ using
-::
+The default clang compiler available on MacOS uses an LLVM backend and does
+not support OpenMP_. There are two ways to support OpenMP. The first involves
+installing the OpenMP support for clang. This can be done with brew_ using::
+
+  $ brew install libomp
+
+Once that is done, it should "just work". If you get strange errors, try
+setting the ``MACOSX_DEPLOYMENT_TARGET`` as shown above.
+
+Another option is to install GCC for MacOS available on brew_ using ::
 
     $ brew install gcc
 
 Once this is done, you need to use this as your default compiler. The ``gcc``
-formula on brew currently ships with gcc version 8. Therefore, you can
+formula on brew currently ships with gcc version 9. Therefore, you can
 tell Python to use the GCC installed by brew by setting::
 
-    $ export CC=gcc-8
-    $ export CXX=g++-8
+    $ export CC=gcc-9
+    $ export CXX=g++-9
 
-Note that you still do need to have the command-line-tools for XCode installed,
-otherwise the important header files are not available.
+Note that you still do need to have the command-line-tools for XCode
+installed, otherwise the important header files are not available. See
+`how-to-install-xcode-command-line-tools
+<https://stackoverflow.com/questions/9329243/how-to-install-xcode-command-line-tools>`_
+for more details. You may also want to set these environment variables in your
+``.bashrc`` so you don't have to do this every time.
+
+Once you do this, compyle will automatically use this version of GCC and will
+also work with OpenMP. Note that on some preliminary benchmarks, GCC's OpenMP
+implementation seems about 10% or so faster than the LLVM version. Your
+mileage may vary.
 
 .. _brew: http://brew.sh/
 
@@ -73,13 +108,16 @@
 
 https://wiki.python.org/moin/WindowsCompilers
 
-OpenMP will work if you have this installed.
+OpenMP will work if you have this installed. For recent Python versions
+(>=3.5), install the `Microsoft Build Tools for Visual Studio 2019
+<https://www.visualstudio.com/downloads/#build-tools-for-visual-studio-2019>`_
 
 
 Setting up OpenCL/CUDA
 -----------------------
 
-This is too involved a topic for this instead look at the appropriate
-documentation for PyOpenCL_ and PyCUDA_. Once those packages work correctly, you
-should be all set. Note that if you are only using OpenCL/CUDA you do not need
-to have Cython or a C/C++ compiler.
+This is too involved a topic to discuss here, instead look at the appropriate
+documentation for PyOpenCL_ and PyCUDA_. Once those packages work correctly,
+you should be all set. Note that if you are only using OpenCL/CUDA you do not
+need to have Cython or a C/C++ compiler. Some features on CUDA require the use
+of the CuPy_ library.
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/laplace.py compyle-0.6/examples/laplace.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/examples/laplace.py	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/examples/laplace.py	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,136 @@
+import numpy as np
+from math import pi
+import time
+
+from compyle.config import get_config
+from compyle.api import declare, annotate
+from compyle.parallel import Elementwise
+from compyle.array import get_backend, wrap
+
+import compyle.array as carr
+
+
+def bc(x, y):
+    return np.sin(np.pi * (x + y))
+
+
+@annotate
+def laplace_step(i, u, res, err, nx, ny, dx2, dy2, dnr_inv):
+    xid = i % nx
+    yid = i / nx
+
+    if xid == 0 or xid == nx - 1 or yid == 0 or yid == ny - 1:
+        return
+
+    res[i] = ((u[i - 1] + u[i + 1]) * dx2 +
+              (u[i - nx] + u[i + nx]) * dy2) * dnr_inv
+
+    diff = res[i] - u[i]
+
+    err[i] = diff * diff
+
+
+class Grid(object):
+    def __init__(self, nx=10, ny=10, xmin=0., xmax=1.,
+                 ymin=0., ymax=1., bc=lambda x: 0, backend=None):
+        self.backend = get_backend(backend)
+        self.xmin, self.xmax, self.ymin, self.ymax = xmin, xmax, ymin, ymax
+        self.nx, self.ny = nx, ny
+        self.dx = (xmax - xmin) / (nx - 1)
+        self.dy = (ymax - ymin) / (ny - 1)
+        self.x = np.arange(self.xmin, self.xmax + self.dx * 0.5, self.dx)
+        self.y = np.arange(self.ymin, self.ymax + self.dy * 0.5, self.dy)
+        self.bc = bc
+        self.setup()
+
+    def setup(self):
+        u_host = np.zeros((self.nx, self.ny)).astype(np.float32)
+
+        u_host[0, :] = self.bc(self.xmin, self.y)
+        u_host[-1, :] = self.bc(self.xmax, self.y)
+        u_host[:, 0] = self.bc(self.x, self.ymin)
+        u_host[:, -1] = self.bc(self.x, self.ymax)
+
+        self.u = wrap(u_host.flatten(), backend=self.backend)
+        self.err = carr.zeros_like(self.u)
+
+    def get(self):
+        u_host = self.u.get()
+        return np.resize(u_host, (self.nx, self.ny))
+
+    def compute_err(self):
+        return np.sqrt(carr.dot(self.err, self.err))
+
+    def plot(self):
+        import matplotlib.pyplot as plt
+        plt.imshow(self.get())
+        plt.show()
+
+
+class LaplaceSolver(object):
+    def __init__(self, grid, backend=None):
+        self.grid = grid
+        self.backend = get_backend(backend)
+        self.step_method = Elementwise(laplace_step, backend=self.backend)
+        self.res = self.grid.u.copy()
+
+    def solve(self, max_iter=None, eps=1.0e-8):
+        err = np.inf
+
+        g = self.grid
+
+        dx2 = g.dx ** 2
+        dy2 = g.dy ** 2
+        dnr_inv = 0.5 / (dx2 + dy2)
+
+        count = 0
+
+        while err > eps:
+            if max_iter and count >= max_iter:
+                return err, count
+            self.step_method(g.u, self.res, g.err, g.nx, g.ny,
+                             dx2, dy2, dnr_inv)
+            err = g.compute_err()
+
+            tmp = g.u
+            g.u = self.res
+            self.res = tmp
+
+            count += 1
+
+        return err, count
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    p = ArgumentParser()
+    p.add_argument(
+        '-b', '--backend', action='store', dest='backend', default='cython',
+        help='Choose the backend.'
+    )
+    p.add_argument(
+        '--openmp', action='store_true', dest='openmp', default=False,
+        help='Use OpenMP.'
+    )
+    p.add_argument(
+        '--use-double', action='store_true', dest='use_double',
+        default=False, help='Use double precision on the GPU.'
+    )
+    p.add_argument('--nx', action='store', type=int, dest='nx',
+                   default=100, help='Number of grid points in x.')
+    p.add_argument('--ny', action='store', type=int, dest='ny',
+                   default=100, help='Number of grid points in y.')
+    o = p.parse_args()
+    get_config().use_openmp = o.openmp
+    get_config().use_double = o.use_double
+
+    grid = Grid(nx=o.nx, ny=o.ny, bc=bc, backend=o.backend)
+
+    solver = LaplaceSolver(grid, backend=o.backend)
+
+    start = time.time()
+    err, count = solver.solve(eps=1e-6)
+    end = time.time()
+
+    print("Number of iterations = %s" % count)
+    print("Time taken = %g secs" % (end - start))
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/md_nnps.py compyle-0.6/examples/molecular_dynamics/md_nnps.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/md_nnps.py	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/examples/molecular_dynamics/md_nnps.py	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,233 @@
+import numpy as np
+from math import pi
+import time
+
+from compyle.config import get_config
+from compyle.api import declare, annotate
+from compyle.parallel import Elementwise, Reduction
+from compyle.array import get_backend, wrap
+import compyle.array as carr
+
+from nnps import NNPSCountingSort, NNPSRadixSort
+
+
+@annotate
+def calculate_energy(i, vx, vy, pe, num_particles):
+    ke = 0.5 * (vx[i] * vx[i] + vy[i] * vy[i])
+    return pe[i] + ke
+
+
+@annotate
+def calculate_force(i, x, y, fx, fy, pe, nbr_starts, nbr_lengths, nbrs):
+    start_idx = nbr_starts[i]
+    length = nbr_lengths[i]
+    for k in range(start_idx, start_idx + length):
+        j = nbrs[k]
+        if i == j:
+            continue
+        xij = x[i] - x[j]
+        yij = y[i] - y[j]
+        rij2 = xij * xij + yij * yij
+        irij2 = 1.0 / rij2
+        irij6 = irij2 * irij2 * irij2
+        irij12 = irij6 * irij6
+        pe[i] += (4 * (irij12 - irij6))
+        f_base = 24 * irij2 * (2 * irij12 - irij6)
+
+        fx[i] += f_base * xij
+        fy[i] += f_base * yij
+
+
+@annotate
+def step_method1(i, x, y, vx, vy, fx, fy, pe, xmin, xmax,
+                 ymin, ymax, m, dt, nbr_starts, nbr_lengths,
+                 nbrs):
+    integrate_step1(i, m, dt, x, y, vx, vy, fx, fy)
+    boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax,
+                       ymin, ymax)
+
+
+@annotate
+def step_method2(i, x, y, vx, vy, fx, fy, pe, xmin, xmax,
+                 ymin, ymax, m, dt, nbr_starts, nbr_lengths,
+                 nbrs):
+    calculate_force(i, x, y, fx, fy, pe, nbr_starts, nbr_lengths, nbrs)
+    integrate_step2(i, m, dt, x, y, vx, vy, fx, fy)
+
+
+@annotate
+def integrate_step1(i, m, dt, x, y, vx, vy, fx, fy):
+    axi = fx[i]
+    ayi = fy[i]
+    x[i] += vx[i] * dt + 0.5 * axi * dt * dt
+    y[i] += vy[i] * dt + 0.5 * ayi * dt * dt
+    vx[i] += 0.5 * axi * dt
+    vy[i] += 0.5 * ayi * dt
+
+
+@annotate
+def integrate_step2(i, m, dt, x, y, vx, vy, fx, fy):
+    axi = fx[i]
+    ayi = fy[i]
+    vx[i] += 0.5 * axi * dt
+    vy[i] += 0.5 * ayi * dt
+
+
+@annotate
+def boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax):
+    xwidth = xmax - xmin
+    ywidth = ymax - ymin
+    stiffness = 50.
+    pe[i] = 0.
+    if x[i] < 0.5:
+        fx[i] = stiffness * (0.5 - x[i])
+        pe[i] += 0.5 * stiffness * (0.5 - x[i]) * (0.5 - x[i])
+    elif x[i] > xwidth - 0.5:
+        fx[i] = stiffness * (xwidth - 0.5 - x[i])
+        pe[i] += 0.5 * stiffness * (xwidth - 0.5 - x[i]) * (xwidth - 0.5 - x[i])
+    else:
+        fx[i] = 0.
+
+    if y[i] < 0.5:
+        fy[i] = stiffness * (0.5 - y[i])
+        pe[i] += 0.5 * stiffness * (0.5 - y[i]) * (0.5 - y[i])
+    elif y[i] > ywidth - 0.5:
+        fy[i] = stiffness * (ywidth - 0.5 - y[i])
+        pe[i] += 0.5 * stiffness * (ywidth - 0.5 - y[i]) * (ywidth - 0.5 - y[i])
+    else:
+        fy[i] = 0.
+
+
+class MDSolver(object):
+    def __init__(self, num_particles, x=None, y=None, vx=None, vy=None,
+                 xmax=100., ymax=100., dx=1.5, init_T=0.,
+                 backend=None, use_count_sort=False):
+        self.nnps_algorithm = NNPSCountingSort \
+                if use_count_sort else NNPSRadixSort
+        self.backend = get_backend(backend)
+        self.num_particles = num_particles
+        self.xmin, self.xmax = 0., xmax
+        self.ymin, self.ymax = 0., ymax
+        self.m = 1.
+        if x is None and y is None:
+            self.x, self.y = self.setup_positions(num_particles, dx)
+        if vx is None and vy is None:
+            self.vx, self.vy = self.setup_velocities(init_T,
+                                                     num_particles)
+        self.fx = carr.zeros_like(self.x, backend=self.backend)
+        self.fy = carr.zeros_like(self.x, backend=self.backend)
+        self.pe = carr.zeros_like(self.x, backend=self.backend)
+        self.nnps = self.nnps_algorithm(self.x, self.y, 3., self.xmax,
+                                        self.ymax, backend=self.backend)
+        self.init_forces = Elementwise(calculate_force, backend=self.backend)
+        self.step1 = Elementwise(step_method1, backend=self.backend)
+        self.step2 = Elementwise(step_method2, backend=self.backend)
+        self.energy_calc = Reduction("a+b", map_func=calculate_energy,
+                                     backend=self.backend)
+
+    def setup_velocities(self, T, num_particles):
+        np.random.seed(123)
+        vx = np.random.uniform(0, 1., size=num_particles).astype(np.float64)
+        vy = np.random.uniform(0, 1., size=num_particles).astype(np.float64)
+        T_current = np.average(vx ** 2 + vy ** 2)
+        scaling_factor = (T / T_current) ** 0.5
+        vx = vx * scaling_factor
+        vy = vy * scaling_factor
+        return wrap(vx, vy, backend=self.backend)
+
+    def setup_positions(self, num_particles, dx):
+        ndim = np.ceil(num_particles ** 0.5)
+        dim_length = ndim * dx
+
+        self.xmax = dim_length * 2
+        self.ymax = dim_length * 2
+
+        xmin_eff = ((self.xmax - self.xmin) - dim_length) / 2.
+        xmax_eff = ((self.xmax - self.xmin) + dim_length) / 2.
+
+        x, y = np.mgrid[xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx]
+        x = x.ravel().astype(np.float64)[:num_particles]
+        y = y.ravel().astype(np.float64)[:num_particles]
+        return wrap(x, y, backend=self.backend)
+
+    def post_step(self, t):
+        energy = self.energy_calc(self.vx, self.vy, self.pe,
+                                  self.num_particles)
+        print("Energy at time =", t, "is", energy)
+
+    def solve(self, t, dt):
+        num_steps = int(t // dt)
+        curr_t = 0.
+        self.nnps.build()
+        self.nnps.get_neighbors()
+        self.init_forces(self.x, self.y, self.fx, self.fy,
+                         self.pe, self.nnps.nbr_starts,
+                         self.nnps.nbr_lengths, self.nnps.nbrs)
+        for i in range(num_steps):
+            self.step1(self.x, self.y, self.vx, self.vy, self.fx,
+                       self.fy, self.pe, self.xmin, self.xmax, self.ymin,
+                       self.ymax, self.m, dt, self.nnps.nbr_starts,
+                       self.nnps.nbr_lengths, self.nnps.nbrs)
+            self.nnps.build()
+            self.nnps.get_neighbors()
+            self.step2(self.x, self.y, self.vx, self.vy, self.fx,
+                       self.fy, self.pe, self.xmin, self.xmax, self.ymin,
+                       self.ymax, self.m, dt, self.nnps.nbr_starts,
+                       self.nnps.nbr_lengths, self.nnps.nbrs)
+            curr_t += dt
+            if i % 100 == 0:
+                self.post_step(curr_t)
+
+    def pull(self):
+        self.x.pull()
+        self.y.pull()
+
+    def plot(self):
+        import matplotlib.pyplot as plt
+        plt.xlim(self.xmin, self.xmax)
+        plt.ylim(self.ymin, self.ymax)
+        plt.scatter(self.x.data, self.y.data, 4.2)
+        plt.savefig("sim.png", dpi=300)
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    p = ArgumentParser()
+    p.add_argument(
+        '-b', '--backend', action='store', dest='backend', default='cython',
+        help='Choose the backend.'
+    )
+    p.add_argument(
+        '--openmp', action='store_true', dest='openmp', default=False,
+        help='Use OpenMP.'
+    )
+    p.add_argument(
+        '--use-double', action='store_true', dest='use_double',
+        default=False, help='Use double precision on the GPU.'
+    )
+    p.add_argument(
+        '--use-count-sort', action='store_true', dest='use_count_sort',
+        default=False, help='Use count sort instead of radix sort'
+    )
+
+    p.add_argument('-n', action='store', type=int, dest='n',
+                   default=100, help='Number of particles')
+
+    p.add_argument('--tf', action='store', type=float, dest='t',
+                   default=40., help='Final time')
+
+    p.add_argument('--dt', action='store', type=float, dest='dt',
+                   default=0.02, help='Time step')
+
+    o = p.parse_args()
+    get_config().use_openmp = o.openmp
+    get_config().use_double = o.use_double
+
+    solver = MDSolver(o.n, backend=o.backend, use_count_sort=o.use_count_sort)
+
+    start = time.time()
+    solver.solve(o.t, o.dt)
+    end = time.time()
+    print("Time taken for N = %i is %g secs" % (o.n, (end - start)))
+    solver.pull()
+    solver.plot()
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/md_simple.py compyle-0.6/examples/molecular_dynamics/md_simple.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/md_simple.py	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/examples/molecular_dynamics/md_simple.py	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,215 @@
+import numpy as np
+from math import pi
+import time
+
+from compyle.config import get_config
+from compyle.api import declare, annotate
+from compyle.parallel import Elementwise, Reduction
+from compyle.array import get_backend, wrap
+
+import compyle.array as carr
+
+
+@annotate
+def calculate_energy(i, vx, vy, pe, num_particles):
+    ke = 0.5 * (vx[i] * vx[i] + vy[i] * vy[i])
+    return pe[i] + ke
+
+
+@annotate
+def calculate_force(i, x, y, fx, fy, pe, num_particles):
+    force_cutoff = 3.
+    force_cutoff2 = force_cutoff * force_cutoff
+    for j in range(num_particles):
+        if i == j:
+            continue
+        xij = x[i] - x[j]
+        yij = y[i] - y[j]
+        rij2 = xij * xij + yij * yij
+        if rij2 > force_cutoff2:
+            continue
+        irij2 = 1.0 / rij2
+        irij6 = irij2 * irij2 * irij2
+        irij12 = irij6 * irij6
+        pe[i] += (4 * (irij12 - irij6))
+        f_base = 24 * irij2 * (2 * irij12 - irij6)
+
+        fx[i] += f_base * xij
+        fy[i] += f_base * yij
+
+
+@annotate
+def step_method1(i, x, y, vx, vy, fx, fy, pe, xmin, xmax,
+                 ymin, ymax, m, dt, num_particles):
+    integrate_step1(i, m, dt, x, y, vx, vy, fx, fy)
+    boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax,
+                       ymin, ymax)
+
+
+@annotate
+def step_method2(i, x, y, vx, vy, fx, fy, pe, xmin, xmax,
+                 ymin, ymax, m, dt, num_particles):
+    calculate_force(i, x, y, fx, fy, pe, num_particles)
+    integrate_step2(i, m, dt, x, y, vx, vy, fx, fy)
+
+
+@annotate
+def integrate_step1(i, m, dt, x, y, vx, vy, fx, fy):
+    axi = fx[i]
+    ayi = fy[i]
+    x[i] += vx[i] * dt + 0.5 * axi * dt * dt
+    y[i] += vy[i] * dt + 0.5 * ayi * dt * dt
+    vx[i] += 0.5 * axi * dt
+    vy[i] += 0.5 * ayi * dt
+
+
+@annotate
+def integrate_step2(i, m, dt, x, y, vx, vy, fx, fy):
+    axi = fx[i]
+    ayi = fy[i]
+    vx[i] += 0.5 * axi * dt
+    vy[i] += 0.5 * ayi * dt
+
+
+@annotate
+def boundary_condition(i, x, y, vx, vy, fx, fy, pe, xmin, xmax, ymin, ymax):
+    xwidth = xmax - xmin
+    ywidth = ymax - ymin
+    stiffness = 50.
+    pe[i] = 0.
+    if x[i] < 0.5:
+        fx[i] = stiffness * (0.5 - x[i])
+        pe[i] += 0.5 * stiffness * (0.5 - x[i]) * (0.5 - x[i])
+    elif x[i] > xwidth - 0.5:
+        fx[i] = stiffness * (xwidth - 0.5 - x[i])
+        pe[i] += 0.5 * stiffness * (xwidth - 0.5 - x[i]) * (xwidth - 0.5 - x[i])
+    else:
+        fx[i] = 0.
+
+    if y[i] < 0.5:
+        fy[i] = stiffness * (0.5 - y[i])
+        pe[i] += 0.5 * stiffness * (0.5 - y[i]) * (0.5 - y[i])
+    elif y[i] > ywidth - 0.5:
+        fy[i] = stiffness * (ywidth - 0.5 - y[i])
+        pe[i] += 0.5 * stiffness * (ywidth - 0.5 - y[i]) * (ywidth - 0.5 - y[i])
+    else:
+        fy[i] = 0.
+
+
+class MDSolver(object):
+    def __init__(self, num_particles, x=None, y=None, vx=None, vy=None,
+                 xmax=100., ymax=100., dx=1.5, init_T=0.,
+                 backend=None):
+        self.backend = backend
+        self.num_particles = num_particles
+        self.xmin, self.xmax = 0., xmax
+        self.ymin, self.ymax = 0., ymax
+        self.m = 1.
+        if x is None and y is None:
+            self.x, self.y = self.setup_positions(num_particles, dx)
+        if vx is None and vy is None:
+            self.vx, self.vy = self.setup_velocities(init_T, num_particles)
+        self.fx = carr.zeros_like(self.x, backend=self.backend)
+        self.fy = carr.zeros_like(self.x, backend=self.backend)
+        self.pe = carr.zeros_like(self.x, backend=self.backend)
+        self.init_forces = Elementwise(calculate_force, backend=self.backend)
+        self.step1 = Elementwise(step_method1, backend=self.backend)
+        self.step2 = Elementwise(step_method2, backend=self.backend)
+        self.energy_calc = Reduction("a+b", map_func=calculate_energy,
+                                     backend=self.backend)
+
+    def setup_velocities(self, T, num_particles):
+        np.random.seed(123)
+        vx = np.random.uniform(0, 1., size=num_particles).astype(np.float64)
+        vy = np.random.uniform(0, 1., size=num_particles).astype(np.float64)
+        T_current = np.average(vx ** 2 + vy ** 2)
+        scaling_factor = (T / T_current) ** 0.5
+        vx = vx * scaling_factor
+        vy = vy * scaling_factor
+        return wrap(vx, vy, backend=self.backend)
+
+    def setup_positions(self, num_particles, dx):
+        ndim = np.ceil(num_particles ** 0.5)
+        dim_length = ndim * dx
+
+        self.xmax = dim_length * 3
+        self.ymax = dim_length * 3
+
+        xmin_eff = ((self.xmax - self.xmin) - dim_length) / 2.
+        xmax_eff = ((self.xmax - self.xmin) + dim_length) / 2.
+
+        x, y = np.mgrid[xmin_eff:xmax_eff:dx, xmin_eff:xmax_eff:dx]
+        x = x.ravel().astype(np.float64)[:num_particles]
+        y = y.ravel().astype(np.float64)[:num_particles]
+        return wrap(x, y, backend=self.backend)
+
+    def post_step(self, t):
+        energy = self.energy_calc(self.vx, self.vy, self.pe,
+                                  self.num_particles)
+        print("Energy at time =", t, "is", energy)
+
+    def solve(self, t, dt):
+        num_steps = int(t // dt)
+        curr_t = 0.
+        self.init_forces(self.x, self.y, self.fx, self.fy, self.pe,
+                         self.num_particles)
+        for i in range(num_steps):
+            self.step1(self.x, self.y, self.vx, self.vy, self.fx, self.fy,
+                       self.pe, self.xmin, self.xmax, self.ymin, self.ymax,
+                       self.m, dt, self.num_particles)
+            self.step2(self.x, self.y, self.vx, self.vy, self.fx, self.fy,
+                       self.pe, self.xmin, self.xmax, self.ymin, self.ymax,
+                       self.m, dt, self.num_particles)
+            curr_t += dt
+            if i % 100 == 0:
+                self.post_step(curr_t)
+
+    def pull(self):
+        self.x.pull()
+        self.y.pull()
+
+    def plot(self):
+        import matplotlib.pyplot as plt
+        plt.xlim(self.xmin, self.xmax)
+        plt.ylim(self.ymin, self.ymax)
+        plt.scatter(self.x.data, self.y.data, 4.2)
+        plt.savefig("sim.png", dpi=300)
+
+
+if __name__ == '__main__':
+    from argparse import ArgumentParser
+    p = ArgumentParser()
+    p.add_argument(
+        '-b', '--backend', action='store', dest='backend', default='cython',
+        help='Choose the backend.'
+    )
+    p.add_argument(
+        '--openmp', action='store_true', dest='openmp', default=False,
+        help='Use OpenMP.'
+    )
+    p.add_argument(
+        '--use-double', action='store_true', dest='use_double',
+        default=False, help='Use double precision on the GPU.'
+    )
+
+    p.add_argument('-n', action='store', type=int, dest='n',
+                   default=100, help='Number of particles')
+
+    p.add_argument('--tf', action='store', type=float, dest='t',
+                   default=40., help='Final time')
+
+    p.add_argument('--dt', action='store', type=float, dest='dt',
+                   default=0.02, help='Time step')
+
+    o = p.parse_args()
+    get_config().use_openmp = o.openmp
+    get_config().use_double = o.use_double
+
+    solver = MDSolver(o.n, backend=o.backend)
+
+    start = time.time()
+    solver.solve(o.t, o.dt)
+    end = time.time()
+    print("Time taken for N = %i is %g secs" % (o.n, (end - start)))
+    solver.pull()
+    solver.plot()
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/nnps_kernels.py compyle-0.6/examples/molecular_dynamics/nnps_kernels.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/nnps_kernels.py	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/examples/molecular_dynamics/nnps_kernels.py	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,142 @@
+from compyle.api import declare, annotate
+from compyle.parallel import serial
+from compyle.low_level import atomic_inc, cast
+from math import floor
+import numpy as np
+
+
+@annotate
+def find_cell_id(x, y, h, c):
+    c[0] = cast(floor((x) / h), "int")
+    c[1] = cast(floor((y) / h), "int")
+
+
+@annotate
+def flatten(p, q, qmax):
+    return p * qmax + q
+
+
+@serial
+@annotate
+def count_bins(i, x, y, h, cmax, keys, bin_counts,
+               sort_offsets):
+    c = declare('matrix(2, "int")')
+    find_cell_id(x[i], y[i], h, c)
+    key = flatten(c[0], c[1], cmax)
+    keys[i] = key
+    idx = atomic_inc(bin_counts[key])
+    sort_offsets[i] = idx
+
+
+@annotate
+def sort_indices(i, keys, sort_offsets, start_indices, sorted_indices):
+    key = keys[i]
+    offset = sort_offsets[i]
+    start_idx = start_indices[key]
+    sorted_indices[start_idx + offset] = i
+
+
+@annotate
+def input_start_indices(i, counts):
+    return 0 if i == 0 else counts[i - 1]
+
+
+@annotate
+def output_start_indices(i, item, indices):
+    indices[i] = item
+
+
+@annotate
+def fill_keys(i, x, y, h, cmax, indices, keys):
+    c = declare('matrix(2, "int")')
+    find_cell_id(x[i], y[i], h, c)
+    key = flatten(c[0], c[1], cmax)
+    keys[i] = key
+    indices[i] = i
+
+
+@annotate
+def input_scan_keys(i, keys):
+    return 1 if i == 0 or keys[i] != keys[i - 1] else 0
+
+
+@annotate
+def output_scan_keys(i, item, prev_item, keys, start_indices):
+    key = keys[i]
+    if item != prev_item:
+        start_indices[key] = i
+
+
+@annotate
+def fill_bin_counts(i, keys, start_indices, bin_counts, num_particles):
+    if i == num_particles - 1:
+        last_key = keys[num_particles - 1]
+        bin_counts[last_key] = num_particles - start_indices[last_key]
+    if i == 0 or keys[i] == keys[i - 1]:
+        return
+    key = keys[i]
+    prev_key = keys[i - 1]
+    bin_counts[prev_key] = start_indices[key] - start_indices[prev_key]
+
+
+@annotate
+def find_neighbor_lengths_knl(i, x, y, h, cmax, start_indices, sorted_indices,
+                              bin_counts, nbr_lengths, max_key):
+    d = h * h
+    q_c = declare('matrix(2, "int")')
+    find_cell_id(x[i], y[i], h, q_c)
+
+    for p in range(-1, 2):
+        for q in range(-1, 2):
+            cx = q_c[0] + p
+            cy = q_c[1] + q
+
+            key = flatten(cx, cy, cmax)
+
+            if key >= max_key or key < 0:
+                continue
+
+            start_idx = start_indices[key]
+            np = bin_counts[key]
+
+            for k in range(np):
+                j = sorted_indices[start_idx + k]
+                xij = x[i] - x[j]
+                yij = y[i] - y[j]
+                rij2 = xij * xij + yij * yij
+
+                if rij2 < d:
+                    nbr_lengths[i] += 1
+
+
+@annotate
+def find_neighbors_knl(i, x, y, h, cmax, start_indices, sorted_indices,
+                       bin_counts, nbr_starts, nbrs, max_key):
+    d = h * h
+    q_c = declare('matrix(2, "int")')
+    find_cell_id(x[i], y[i], h, q_c)
+    length = 0
+    nbr_start_idx = nbr_starts[i]
+
+    for p in range(-1, 2):
+        for q in range(-1, 2):
+            cx = q_c[0] + p
+            cy = q_c[1] + q
+
+            key = flatten(cx, cy, cmax)
+
+            if key >= max_key or key < 0:
+                continue
+
+            start_idx = start_indices[key]
+            np = bin_counts[key]
+
+            for k in range(np):
+                j = sorted_indices[start_idx + k]
+                xij = x[i] - x[j]
+                yij = y[i] - y[j]
+                rij2 = xij * xij + yij * yij
+
+                if rij2 < d:
+                    nbrs[nbr_start_idx + length] = j
+                    length += 1
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/nnps.py compyle-0.6/examples/molecular_dynamics/nnps.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/nnps.py	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/examples/molecular_dynamics/nnps.py	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,159 @@
+from nnps_kernels import *
+from compyle.config import get_config
+from compyle.api import declare, annotate
+from compyle.parallel import serial, Elementwise, Reduction, Scan
+from compyle.array import get_backend, wrap
+from compyle.low_level import atomic_inc, cast
+from math import floor
+from time import time
+
+import numpy as np
+import compyle.array as carr
+
+
+class NNPS(object):
+    def __init__(self, x, y, h, xmax, ymax, backend=None):
+        self.backend = backend
+        self.num_particles = x.length
+        self.x, self.y = x, y
+        self.h = h
+
+        cmax = np.array([floor(xmax / h), floor(ymax / h)], dtype=np.int32)
+        self.max_key = 1 + flatten(cmax[0], cmax[1], 1 + cmax[1])
+        self.qmax = 1 + cmax[1]
+
+        # neighbor kernels
+        self.find_neighbor_lengths = Elementwise(find_neighbor_lengths_knl,
+                                                 backend=self.backend)
+        self.find_neighbors = Elementwise(find_neighbors_knl,
+                                          backend=self.backend)
+        self.scan_start_indices = Scan(input=input_start_indices,
+                                       output=output_start_indices,
+                                       scan_expr="a+b", dtype=np.int32,
+                                       backend=self.backend)
+        self.init_arrays()
+
+    def init_arrays(self):
+        # sort arrays
+        self.bin_counts = carr.zeros(self.max_key, dtype=np.int32,
+                                     backend=self.backend)
+        self.start_indices = carr.zeros(self.max_key, dtype=np.int32,
+                                        backend=self.backend)
+        self.keys = carr.zeros(self.num_particles, dtype=np.int32,
+                               backend=self.backend)
+        self.sorted_indices = carr.zeros(self.num_particles, dtype=np.int32,
+                                         backend=self.backend)
+
+        # neighbor arrays
+        self.nbr_lengths = carr.zeros(self.num_particles, dtype=np.int32,
+                                      backend=self.backend)
+        self.nbr_starts = carr.zeros(self.num_particles, dtype=np.int32,
+                                     backend=self.backend)
+        self.nbrs = carr.zeros(2 * self.num_particles, dtype=np.int32,
+                               backend=self.backend)
+
+    def reset_arrays(self):
+        # sort arrays
+        self.bin_counts.fill(0)
+        self.start_indices.fill(0)
+        self.sorted_indices.fill(0)
+
+        # neighbors array
+        self.nbr_lengths.fill(0)
+        self.nbr_starts.fill(0)
+
+    def get_neighbors(self):
+        self.find_neighbor_lengths(self.x, self.y, self.h, self.qmax,
+                                   self.start_indices, self.sorted_indices,
+                                   self.bin_counts, self.nbr_lengths,
+                                   self.max_key)
+        self.scan_start_indices(counts=self.nbr_lengths,
+                                indices=self.nbr_starts)
+        self.total_neighbors = int(self.nbr_lengths[-1] + self.nbr_starts[-1])
+        self.nbrs.resize(self.total_neighbors)
+        self.find_neighbors(self.x, self.y, self.h, self.qmax,
+                            self.start_indices, self.sorted_indices,
+                            self.bin_counts, self.nbr_starts,
+                            self.nbrs, self.max_key)
+
+
+class NNPSCountingSort(NNPS):
+    def __init__(self, x, y, h, xmax, ymax, backend=None):
+        super().__init__(x, y, h, xmax, ymax, backend=backend)
+        # sort kernels
+        self.count_bins = Elementwise(count_bins, backend=self.backend)
+        self.sort_indices = Elementwise(sort_indices, backend=self.backend)
+
+    def init_arrays(self):
+        super().init_arrays()
+        self.sort_offsets = carr.zeros(self.num_particles, dtype=np.int32,
+                                       backend=self.backend)
+
+    def reset_arrays(self):
+        super().reset_arrays()
+        # sort arrays
+        self.sort_offsets.fill(0)
+
+    def build(self):
+        self.reset_arrays()
+        self.count_bins(self.x, self.y, self.h, self.qmax, self.keys,
+                        self.bin_counts, self.sort_offsets)
+        self.scan_start_indices(counts=self.bin_counts,
+                                indices=self.start_indices)
+        self.sort_indices(self.keys, self.sort_offsets, self.start_indices,
+                          self.sorted_indices)
+
+
+class NNPSRadixSort(NNPS):
+    def __init__(self, x, y, h, xmax, ymax, backend=None):
+        super().__init__(x, y, h, xmax, ymax, backend=backend)
+        self.max_bits = np.ceil(np.log2(self.max_key))
+
+        # sort kernels
+        self.fill_keys = Elementwise(fill_keys, backend=self.backend)
+        self.fill_bin_counts = Elementwise(fill_bin_counts,
+                                           backend=self.backend)
+        self.scan_keys = Scan(input=input_scan_keys,
+                              output=output_scan_keys,
+                              scan_expr="a+b", dtype=np.int32,
+                              backend=self.backend)
+
+    def init_arrays(self):
+        super().init_arrays()
+        # sort arrays
+        self.sorted_keys = carr.zeros(self.num_particles, dtype=np.int32,
+                                      backend=self.backend)
+        self.indices = carr.zeros(self.num_particles, dtype=np.int32,
+                                  backend=self.backend)
+
+    def reset_arrays(self):
+        super().reset_arrays()
+        self.sorted_keys.fill(0)
+
+    def build(self):
+        self.reset_arrays()
+        self.fill_keys(self.x, self.y, self.h, self.qmax, self.indices,
+                       self.keys)
+        self.sorted_keys, self.sorted_indices = carr.sort_by_keys(
+            [self.keys, self.indices],
+            key_bits=self.max_bits, backend=self.backend)
+        self.scan_keys(keys=self.sorted_keys,
+                       start_indices=self.start_indices)
+        self.fill_bin_counts(self.sorted_keys, self.start_indices,
+                             self.bin_counts, self.num_particles)
+
+
+if __name__ == "__main__":
+    import sys
+    backend = sys.argv[1] if len(sys.argv) > 1 else 'cython'
+    np.random.seed(123)
+    num_particles = 20
+    x = np.random.uniform(0, 10., size=num_particles).astype(np.float32)
+    y = np.random.uniform(0, 10., size=num_particles).astype(np.float32)
+    x, y = wrap(x, y, backend=backend)
+    nnps = NNPSRadixSort(x, y, 3., 10., 10., backend=backend)
+    nnps.build()
+    nnps.get_neighbors()
+    print(nnps.start_indices)
+    print(nnps.bin_counts)
+    print(nnps.nbr_lengths)
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/performance_comparison.py compyle-0.6/examples/molecular_dynamics/performance_comparison.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/performance_comparison.py	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/examples/molecular_dynamics/performance_comparison.py	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,144 @@
+import numpy as np
+import time
+import md_simple
+import md_nnps
+
+from compyle.config import get_config
+
+
+def solve(n, backend, solver_algo, tf=0.5, dt=0.02, use_count_sort=False):
+    solver = solver_algo(n, backend=backend.replace("_omp", ""))
+    start = time.time()
+    solver.solve(tf, dt)
+    end = time.time()
+    print("Time taken for backend = %s, N = %i is %g secs" %
+          (backend, n, (end - start)))
+    return end - start
+
+
+def compare(backends, n_list, solver_algo, niter=3):
+    t_list = {b: [] for b in backends}
+    speedups = {b: [] for b in backends}
+    for n in n_list:
+        print("Running for N = %i" % n)
+        for backend in backends:
+            if "omp" in backend:
+                get_config().use_openmp = True
+            t = 1e9
+            for it in range(niter):
+                t = min(t, solve(n, backend, solver_algo))
+            t_list[backend].append(t)
+            if "omp" in backend:
+                get_config().use_openmp = False
+
+    if 'cython' in backends:
+        for backend in backends:
+            for i, n in enumerate(n_list):
+                speedups[backend].append(
+                    t_list["cython"][i] / t_list[backend][i])
+    else:
+        speedups = None
+
+    return speedups, t_list
+
+
+def compare_implementations(backend, n_list, niter=3):
+    import matplotlib.pyplot as plt
+    sp, nnps_tlist = compare([backend], n_list,
+                             md_nnps.MDSolver, niter=niter)
+    sp, simple_tlist = compare([backend], n_list,
+                               md_simple.MDSolver, niter=niter)
+
+    speedup = [simple_tlist[backend][i] / nnps_tlist[backend][i]
+               for i in range(len(n_list))]
+
+    plt.loglog(n_list, nnps_tlist[backend], 'x-', label="Linear")
+    plt.loglog(n_list, simple_tlist[backend], 'x-', label="Simple")
+
+    plt.xlabel("Number of particles")
+    plt.ylabel("Time (secs)")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig("time_comp_impl.png", dpi=300)
+
+    plt.clf()
+
+    plt.loglog(n_list, speedup, 'x-')
+
+    plt.xlabel("Number of particles")
+    plt.ylabel("Speedup")
+    plt.grid(True)
+    plt.savefig("speedup_comp_impl.png", dpi=300)
+
+
+def plot(n_list, speedups, t_list, label):
+    backend_label_map = {'cython': 'Cython', 'cython_omp': 'OpenMP',
+                         'opencl': 'OpenCL', 'cuda': 'CUDA'}
+    import matplotlib.pyplot as plt
+    plt.figure()
+
+    if speedups:
+        for backend, arr in speedups.items():
+            if backend == "cython":
+                continue
+            plt.semilogx(n_list, arr, 'x-', label=backend_label_map[backend])
+
+        plt.xlabel("Number of particles")
+        plt.ylabel("Speedup")
+        plt.legend()
+        plt.grid(True)
+        plt.savefig("%s_speedup_%s.png" %
+                    (label, "_".join(speedups.keys())), dpi=300)
+
+    plt.clf()
+
+    for backend, arr in t_list.items():
+        plt.loglog(n_list, arr, 'x-', label=backend_label_map[backend])
+
+    plt.xlabel("Number of particles")
+    plt.ylabel("Time (secs)")
+    plt.legend()
+    plt.grid(True)
+    plt.savefig("%s_time_%s.png" % (label, "_".join(t_list.keys())), dpi=300)
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    p = ArgumentParser()
+    p.add_argument(
+        '-c', '--comparison', action='store', dest='comp', default='gpu_comp',
+        choices=['gpu_comp', 'omp_comp', 'comp_algo'],
+        help='Choose the comparison.'
+    )
+    p.add_argument(
+        '--nnps', action='store', dest='nnps', default='linear',
+        choices=['linear', 'simple'],
+        help='Choose algorithm.'
+    )
+    p.add_argument(
+        '--use-double', action='store_true', dest='use_double',
+        default=False,  help='Use double precision on the GPU.'
+    )
+
+    o = p.parse_args()
+    get_config().use_double = o.use_double
+    solver_algo = (md_nnps.MDSolver if o.nnps == 'linear'
+                   else md_simple.MDSolver)
+    n_list = [10000 * (2 ** i) for i in range(10)] if o.nnps == 'linear' else \
+        [500 * (2 ** i) for i in range(8)]
+
+    if o.comp == "gpu_comp":
+        backends = ["opencl", "cuda", "cython"]
+        print("Running for", n_list)
+        speedups, t_list = compare(backends, n_list, solver_algo)
+        plot(n_list, speedups, t_list, o.nnps)
+    elif o.comp == "omp_comp":
+        backends = ["cython_omp", "cython"]
+        print("Running for", n_list)
+        speedups, t_list = compare(backends, n_list, solver_algo)
+        plot(n_list, speedups, t_list, o.nnps)
+    elif o.comp == "comp_algo":
+        backend = "cython"
+        n_list = [500, 1000, 2000, 4000, 8000, 16000, 32000]
+        print("Running for", n_list)
+        compare_implementations(backend, n_list)
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/README.rst compyle-0.6/examples/molecular_dynamics/README.rst
--- compyle-0.6~dev0~20190922.gitaa5a50d/examples/molecular_dynamics/README.rst	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/examples/molecular_dynamics/README.rst	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,21 @@
+Molecular Dynamics Example
+--------------------------
+
+We have 3 implementations of a simple molecular dynamics simulation
+of an N body problem in Lennard Jones potential. The first implementation
+is a simple :math:`O(N^2)` implementation that can be found in
+:code:`md_simple.py`. The second implementation is using nearest neighbor
+searching to reduce the complexity to :math:`O(N)` and can be
+found in :code:`md_nnps.py`.
+
+We also have two different implementations of nearest neighbor search
+algorithms, one using a radix sort on GPU and numpy sort on CPU
+and the other using a native counting sort implementation. The counting
+sort version is about 30% faster. Both these implementations can be
+found in :code:`nnps.py`.
+
+We have also provided a script to run performance comparison between
+these implementations in `performance_comparison.py`. Users can use
+the google colab notebook 
+`here <https://colab.research.google.com/drive/1SGRiArYXV1LEkZtUeg9j0qQ21MDqQR2U?usp=sharing>`_.
+
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/pyproject.toml compyle-0.6/pyproject.toml
--- compyle-0.6~dev0~20190922.gitaa5a50d/pyproject.toml	1970-01-01 00:00:00.000000000 +0000
+++ compyle-0.6/pyproject.toml	2020-06-14 18:54:39.000000000 +0000
@@ -0,0 +1,9 @@
+[build-system]
+requires = [
+    "wheel>=0.29.0",
+    "setuptools>=42.0.0",
+    "oldest-supported-numpy",
+    "Cython>=0.20",
+    "mako",
+    "pytools"
+]
\ No newline at end of file
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/setup.py compyle-0.6/setup.py
--- compyle-0.6~dev0~20190922.gitaa5a50d/setup.py	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/setup.py	2020-06-14 18:54:39.000000000 +0000
@@ -24,7 +24,7 @@
 if sys.version_info[0] < 3:
     tests_require += ['mock>=1.0']
 docs_require = ['sphinx']
-cuda_require = ['pycuda']
+cuda_require = ['pycuda', 'cupy']
 opencl_require = ['pyopencl']
 
 classes = '''
diff -Nru compyle-0.6~dev0~20190922.gitaa5a50d/.travis.yml compyle-0.6/.travis.yml
--- compyle-0.6~dev0~20190922.gitaa5a50d/.travis.yml	2019-06-05 08:48:46.000000000 +0000
+++ compyle-0.6/.travis.yml	2020-06-14 18:54:39.000000000 +0000
@@ -6,25 +6,24 @@
 
 language: python
 python:
-    - 2.7
     - 3.7
+    - 3.8
 
 install:
     - sudo apt-get update
-    - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
-        wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
-      else
-        wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
-      fi
+    - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
     - bash miniconda.sh -b -p $HOME/miniconda
-    - export PATH="$HOME/miniconda/bin:$PATH"
+    - source "$HOME/miniconda/etc/profile.d/conda.sh"
     - hash -r
     - conda config --set always_yes yes --set changeps1 no
     - conda update -q conda
     - conda info -a
     - conda config --add channels conda-forge
     - conda config --add channels defaults
-    - conda install -c conda-forge pocl==1.2 pyopencl cython numpy
+
+    - conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION
+    - conda activate testenv
+    - conda install -c conda-forge pocl pyopencl numpy cython
     - python -c 'import pyopencl as cl'
     - pip install -r requirements.txt
     - pip install coverage codecov