diff -Nru pycuda-2017.1.1/aksetup_helper.py pycuda-2018.1.1/aksetup_helper.py --- pycuda-2017.1.1/aksetup_helper.py 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/aksetup_helper.py 2018-10-31 18:05:29.000000000 +0000 @@ -285,7 +285,7 @@ if value is not None: filevars[key] = value - keys = filevars.keys() + keys = list(filevars.keys()) keys.sort() outf = open(filename, "w") diff -Nru pycuda-2017.1.1/debian/changelog pycuda-2018.1.1/debian/changelog --- pycuda-2017.1.1/debian/changelog 2018-11-03 16:34:53.000000000 +0000 +++ pycuda-2018.1.1/debian/changelog 2018-11-09 14:13:18.000000000 +0000 @@ -1,3 +1,30 @@ +pycuda (2018.1.1-1ubuntu1) disco; urgency=low + + * Merge from Debian unstable. Remaining changes: + - Fix ftbfs due to boost-python soname change. + + -- Gianfranco Costamagna Fri, 09 Nov 2018 15:13:18 +0100 + +pycuda (2018.1.1-1) unstable; urgency=medium + + [ Andreas Beckmann ] + * Put package under maintenance by the Debian NVIDIA Maintainers team, move + Tomasz to Uploaders. + * Switch Vcs-* URLs to salsa.debian.org. + + [ Tomasz Rybak ] + * New upstream release (Closes: #903826). + * Add Rules-Requires-Root to d/control. + * Update d/copyright links to use https protocol. + * Add disclaimer to d/copyright describing why PyCUDA is in contrib. + * Reorder d/control putting Python 3 packages first. + * Remove unnecessary X-Python{,3}-Version fields. + * Update Standards-Version to 4.2.1; no changes necessary. + * Set compatibility level to 11 + * Point python-pycuda-doc.doc-base to main package's doc directory. + + -- Tomasz Rybak Wed, 31 Oct 2018 18:59:56 +0100 + pycuda (2017.1.1-2ubuntu2) disco; urgency=medium * No-change rebuild to build without python3.6 support. 
@@ -231,3 +258,4 @@ as package is using Debian ones -- Tomasz Rybak Sat, 20 Aug 2011 23:02:55 +0200 + diff -Nru pycuda-2017.1.1/debian/compat pycuda-2018.1.1/debian/compat --- pycuda-2017.1.1/debian/compat 2018-03-09 18:07:39.000000000 +0000 +++ pycuda-2018.1.1/debian/compat 2018-11-03 19:49:14.000000000 +0000 @@ -1 +1 @@ -10 +11 diff -Nru pycuda-2017.1.1/debian/control pycuda-2018.1.1/debian/control --- pycuda-2017.1.1/debian/control 2018-08-10 01:38:42.000000000 +0000 +++ pycuda-2018.1.1/debian/control 2018-11-09 14:13:14.000000000 +0000 @@ -1,9 +1,10 @@ Source: pycuda Section: contrib/python Priority: optional -Maintainer: Ubuntu Developers -XSBC-Original-Maintainer: Tomasz Rybak -Build-Depends: debhelper (>= 10), +Maintainer: Debian NVIDIA Maintainers +Uploaders: + Tomasz Rybak , +Build-Depends: debhelper (>= 11), dh-python, python-all-dev, python-all-dbg, @@ -24,30 +25,29 @@ python3-sphinx (>= 1.0.7+dfsg) , python-pytools (>= 2011.5), python3-pytools -Standards-Version: 4.1.3 -X-Python-Version: >= 2.6 -X-Python3-Version: >= 3.3 +Standards-Version: 4.2.1 +Rules-Requires-Root: no Homepage: http://mathema.tician.de/software/pycuda -Vcs-Git: https://anonscm.debian.org/git/collab-maint/python-pycuda.git -Vcs-Browser: https://anonscm.debian.org/gitweb/?p=collab-maint/python-pycuda.git +Vcs-Browser: https://salsa.debian.org/nvidia-team/python-pycuda +Vcs-Git: https://salsa.debian.org/nvidia-team/python-pycuda.git -Package: python-pycuda +Package: python3-pycuda Architecture: amd64 Multi-Arch: no -Depends: ${shlibs:Depends}, ${misc:Depends}, ${python:Depends}, +Depends: ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends}, nvidia-cuda-toolkit, - python-appdirs (>= 1.4.0), - python-numpy, - python-decorator (>= 3.2.0), - python-pytools (>= 2011.5) + python3-appdirs (>= 1.4.0), + python3-numpy, + python3-decorator (>= 3.2.0), + python3-pytools Recommends: python-pycuda-doc , - python-mako -Suggests: python-pytest, - python-opengl, - python-matplotlib, - 
python-pycuda-dbg + python3-mako +Suggests: python3-pytest, + python3-opengl, + python3-matplotlib, + python3-pycuda-dbg Replaces: python-pycuda-headers -Description: Python module to access Nvidia‘s CUDA parallel computation API +Description: Python 3 module to access Nvidia‘s CUDA parallel computation API PyCUDA lets you access Nvidia‘s CUDA parallel computation API from Python. Several wrappers of the CUDA API already exist–so what’s so special about PyCUDA? @@ -65,14 +65,16 @@ * Speed. PyCUDA’s base layer is written in C++, so all the niceties above are virtually free. * Helpful Documentation. + . + This package contains Python 3 modules. -Package: python-pycuda-dbg +Package: python3-pycuda-dbg Section: contrib/debug Architecture: amd64 Multi-Arch: no -Depends: python-pycuda (= ${binary:Version}), python-dbg, - ${shlibs:Depends}, ${misc:Depends}, ${python:Depends}, -Description: Python module to access Nvidia‘s CUDA API (debug extensions) +Depends: python3-pycuda (= ${binary:Version}), python3-dbg, + ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends}, +Description: Python 3 module to access Nvidia‘s CUDA API (debug extensions) PyCUDA lets you access Nvidia‘s CUDA parallel computation API from Python. Several wrappers of the CUDA API already exist–so what’s so special about PyCUDA? @@ -91,25 +93,25 @@ above are virtually free. * Helpful Documentation. . - This package contains debug extensions build for the Python debug interpreter. + This package contains debug extensions for the Python 3 debug interpreter. 
-Package: python3-pycuda +Package: python-pycuda Architecture: amd64 Multi-Arch: no -Depends: ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends}, +Depends: ${shlibs:Depends}, ${misc:Depends}, ${python:Depends}, nvidia-cuda-toolkit, - python3-appdirs (>= 1.4.0), - python3-numpy, - python3-decorator (>= 3.2.0), - python3-pytools + python-appdirs (>= 1.4.0), + python-numpy, + python-decorator (>= 3.2.0), + python-pytools (>= 2011.5) Recommends: python-pycuda-doc , - python3-mako -Suggests: python3-pytest, - python3-opengl, - python3-matplotlib, - python3-pycuda-dbg + python-mako +Suggests: python-pytest, + python-opengl, + python-matplotlib, + python-pycuda-dbg Replaces: python-pycuda-headers -Description: Python 3 module to access Nvidia‘s CUDA parallel computation API +Description: Python module to access Nvidia‘s CUDA parallel computation API PyCUDA lets you access Nvidia‘s CUDA parallel computation API from Python. Several wrappers of the CUDA API already exist–so what’s so special about PyCUDA? @@ -127,16 +129,14 @@ * Speed. PyCUDA’s base layer is written in C++, so all the niceties above are virtually free. * Helpful Documentation. - . - This package contains Python 3 modules. -Package: python3-pycuda-dbg +Package: python-pycuda-dbg Section: contrib/debug Architecture: amd64 Multi-Arch: no -Depends: python3-pycuda (= ${binary:Version}), python3-dbg, - ${shlibs:Depends}, ${misc:Depends}, ${python3:Depends}, -Description: Python 3 module to access Nvidia‘s CUDA API (debug extensions) +Depends: python-pycuda (= ${binary:Version}), python-dbg, + ${shlibs:Depends}, ${misc:Depends}, ${python:Depends}, +Description: Python module to access Nvidia‘s CUDA API (debug extensions) PyCUDA lets you access Nvidia‘s CUDA parallel computation API from Python. Several wrappers of the CUDA API already exist–so what’s so special about PyCUDA? @@ -155,7 +155,7 @@ above are virtually free. * Helpful Documentation. . 
- This package contains debug extensions for the Python 3 debug interpreter. + This package contains debug extensions built for the Python debug interpreter. Package: python-pycuda-doc Section: contrib/doc diff -Nru pycuda-2017.1.1/debian/copyright pycuda-2018.1.1/debian/copyright --- pycuda-2017.1.1/debian/copyright 2018-03-09 18:07:39.000000000 +0000 +++ pycuda-2018.1.1/debian/copyright 2018-11-03 19:49:14.000000000 +0000 @@ -1,9 +1,13 @@ Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: PyCUDA Upstream-Contact: Andreas Klöckner -Source: http://git.tiker.net/trees/pycuda.git - http://pypi.python.org/pypi/pycuda/ - http://pypi.python.org/packages/source/p/pycuda/ +Source: https://git.tiker.net/pycuda.git + https://pypi.python.org/pypi/pycuda/ + https://github.com/inducer/pycuda +Disclaimer: Although PyCUDA itself is free software, it depends on CUDA + which is proprietary NVIDIA software. Package nvidia-cuda-toolkit, + dependency of PyCUDA packages, is in non-free which means that, + according to Policy 2.2.2, PyCUDA needs to be in contrib. 
Files: * Copyright: 2009-2011 Andreas Klöckner diff -Nru pycuda-2017.1.1/debian/python-pycuda-doc.doc-base pycuda-2018.1.1/debian/python-pycuda-doc.doc-base --- pycuda-2017.1.1/debian/python-pycuda-doc.doc-base 2018-03-09 18:07:39.000000000 +0000 +++ pycuda-2018.1.1/debian/python-pycuda-doc.doc-base 2018-11-03 19:49:14.000000000 +0000 @@ -6,5 +6,5 @@ Section: Programming/Python Format: HTML -Index: /usr/share/doc/python-pycuda-doc/html/index.html -Files: /usr/share/doc/python-pycuda-doc/html/*.html +Index: /usr/share/doc/python-pycuda/html/index.html +Files: /usr/share/doc/python-pycuda/html/*.html diff -Nru pycuda-2017.1.1/debian/rules pycuda-2018.1.1/debian/rules --- pycuda-2017.1.1/debian/rules 2018-08-10 01:38:42.000000000 +0000 +++ pycuda-2018.1.1/debian/rules 2018-11-03 19:49:14.000000000 +0000 @@ -57,7 +57,7 @@ endif -GIT_REVISION=v2017.1.1 +GIT_REVISION=v2018.1.1 GIT_SUBMODULES=pycuda/compyte GIT_URL=https://git.tiker.net/trees/pycuda.git diff -Nru pycuda-2017.1.1/doc/source/driver.rst pycuda-2018.1.1/doc/source/driver.rst --- pycuda-2017.1.1/doc/source/driver.rst 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/doc/source/driver.rst 2018-10-31 18:05:29.000000000 +0000 @@ -1986,7 +1986,7 @@ .. versionadded:: 2011.1 -.. function:: stop() +.. function:: stop_profiler() .. versionadded:: 2011.1 diff -Nru pycuda-2017.1.1/doc/source/index.rst pycuda-2018.1.1/doc/source/index.rst --- pycuda-2017.1.1/doc/source/index.rst 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/doc/source/index.rst 2018-10-31 18:05:29.000000000 +0000 @@ -28,7 +28,7 @@ * Helpful Documentation. You're looking at it. 
;) -Here's an example, to given you an impression:: +Here's an example, to give you an impression:: import pycuda.autoinit import pycuda.driver as drv diff -Nru pycuda-2017.1.1/MANIFEST.in pycuda-2018.1.1/MANIFEST.in --- pycuda-2017.1.1/MANIFEST.in 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/MANIFEST.in 2018-10-31 18:05:29.000000000 +0000 @@ -21,5 +21,7 @@ include README.rst include LICENSE +recursive-include pycuda *.pyx + recursive-include bpl-subset/bpl_subset/boost *.h *.hpp *.cpp *.html *.inl *.ipp *.pl *.txt recursive-include bpl-subset/bpl_subset/libs *.h *.hpp *.cpp *.html *.inl *.ipp *.pl *.txt diff -Nru pycuda-2017.1.1/pycuda/characterize.py pycuda-2018.1.1/pycuda/characterize.py --- pycuda-2017.1.1/pycuda/characterize.py 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/pycuda/characterize.py 2018-10-31 18:05:29.000000000 +0000 @@ -6,7 +6,11 @@ def platform_bits(): - return tuple.__itemsize__ * 8 + import sys + if sys.maxsize > 2**32: + return 64 + else: + return 32 def has_stack(): diff -Nru pycuda-2017.1.1/pycuda/compiler.py pycuda-2018.1.1/pycuda/compiler.py --- pycuda-2017.1.1/pycuda/compiler.py 2017-12-27 21:34:52.000000000 +0000 +++ pycuda-2018.1.1/pycuda/compiler.py 2018-10-31 18:05:29.000000000 +0000 @@ -95,7 +95,7 @@ finally: cache_file.close() - except: + except Exception: pass from tempfile import mkdtemp @@ -221,7 +221,6 @@ cache_dir = False if cache_dir is None: - from os.path import join import appdirs cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"), "compiler-cache-v1") @@ -254,6 +253,7 @@ return compile_plain(source, options, keep, nvcc, cache_dir, target) + class CudaModule(object): def _check_arch(self, arch): if arch is None: @@ -265,7 +265,7 @@ from warnings import warn warn("trying to compile for a compute capability " "higher than selected GPU") - except: + except Exception: pass def _bind_module(self): @@ -393,13 +393,14 @@ libdir = ld_path break + if libdir is None and 
isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): + libdir = '/usr/lib/x86_64-linux-gnu' + if libdir is None: nvcc_path = _find_nvcc_on_path() if nvcc_path is not None: libdir = join(os.path.dirname(nvcc_path), "..", "lib64") - if libdir is None and isfile('/usr/lib/x86_64-linux-gnu/libcudadevrt.a'): - libdir = '/usr/lib/x86_64-linux-gnu' libptn = 'lib%s.a' if libdir is None: raise RuntimeError('Unable to locate the CUDA SDK installation ' @@ -429,7 +430,7 @@ from os.path import isfile, join libpath = join(self.libdir, self.libptn % libname) if not isfile(libpath): - raise FileNotFoundError('CUDA SDK library file "%s" not found' % libpath) + raise OSError('CUDA SDK library file "%s" not found' % libpath) from pycuda.driver import jit_input_type self.linker.add_file(libpath, jit_input_type.LIBRARY) return self @@ -450,13 +451,15 @@ - source is linked against the CUDA device runtime library cudadevrt - library cudadevrt is statically linked into the generated Module ''' - def __init__(self, source, nvcc="nvcc", options=[], keep=False, + def __init__(self, source, nvcc="nvcc", options=None, keep=False, no_extern_c=False, arch=None, code=None, cache_dir=None, include_dirs=[], cuda_libdir=None): super(DynamicSourceModule, self).__init__(nvcc=nvcc, link_options=None, keep=keep, no_extern_c=no_extern_c, arch=arch, code=code, cache_dir=cache_dir, include_dirs=include_dirs, cuda_libdir=cuda_libdir) + if options is None: + options = DEFAULT_NVCC_FLAGS options = options[:] if '-rdc=true' not in options: options.append('-rdc=true') diff -Nru pycuda-2017.1.1/pycuda/compyte/dtypes.py pycuda-2018.1.1/pycuda/compyte/dtypes.py --- pycuda-2017.1.1/pycuda/compyte/dtypes.py 2017-12-27 21:34:52.000000000 +0000 +++ pycuda-2018.1.1/pycuda/compyte/dtypes.py 2018-10-31 18:05:31.000000000 +0000 @@ -210,7 +210,10 @@ elif name_to_dtype is None: name_to_dtype = NAME_TO_DTYPE.__getitem__ - c_arg = c_arg.replace("const", "").replace("volatile", "") + c_arg = (c_arg + .replace("const", "") + 
.replace("volatile", "") + .replace("__restrict__", "")) # process and remove declarator import re diff -Nru pycuda-2017.1.1/pycuda/elementwise.py pycuda-2018.1.1/pycuda/elementwise.py --- pycuda-2017.1.1/pycuda/elementwise.py 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/pycuda/elementwise.py 2018-10-31 18:05:29.000000000 +0000 @@ -149,7 +149,7 @@ func = mod.get_function(name) func.prepare("".join(arg.struct_char for arg in arguments)) - return func, arguments + return mod, func, arguments def get_elwise_kernel(arguments, operation, @@ -157,7 +157,7 @@ """Return a L{pycuda.driver.Function} that performs the same scalar operation on one or several vectors. """ - func, arguments = get_elwise_kernel_and_types( + mod, func, arguments = get_elwise_kernel_and_types( arguments, operation, name, keep, options, **kwargs) return func @@ -171,9 +171,13 @@ self.gen_kwargs.update(dict(keep=keep, options=options, name=name, operation=operation, arguments=arguments)) + def get_texref(self, name, use_range=False): + mod, knl, arguments = self.generate_stride_kernel_and_types(use_range=use_range) + return mod.get_texref(name) + @memoize_method def generate_stride_kernel_and_types(self, use_range): - knl, arguments = get_elwise_kernel_and_types(use_range=use_range, + mod, knl, arguments = get_elwise_kernel_and_types(use_range=use_range, **self.gen_kwargs) assert [i for i, arg in enumerate(arguments) @@ -181,7 +185,7 @@ "ElementwiseKernel can only be used with functions that " \ "have at least one vector argument" - return knl, arguments + return mod, knl, arguments def __call__(self, *args, **kwargs): vectors = [] @@ -195,7 +199,7 @@ + ", ".join(six.iterkeys(kwargs))) invocation_args = [] - func, arguments = self.generate_stride_kernel_and_types( + mod, func, arguments = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None) for arg, arg_descr in zip(args, arguments): diff -Nru pycuda-2017.1.1/pycuda/gpuarray.py pycuda-2018.1.1/pycuda/gpuarray.py 
--- pycuda-2017.1.1/pycuda/gpuarray.py 2017-12-27 21:34:52.000000000 +0000 +++ pycuda-2018.1.1/pycuda/gpuarray.py 2018-10-31 18:05:29.000000000 +0000 @@ -227,7 +227,24 @@ def flags(self): return _ArrayFlags(self) - def set(self, ary, async=False, stream=None): + def set(self, ary, async_=False, stream=None, **kwargs): + # {{{ handle 'async' deprecation + + async_arg = kwargs.pop("async", None) + if async_arg is not None: + if async_ is not None: + raise TypeError("may not specify both 'async' and 'async_'") + async_ = async_arg + + if async_ is None: + async_ = False + + if kwargs: + raise TypeError("extra keyword arguments specified: %s" + % ", ".join(kwargs)) + + # }}} + if ary.size != self.size: raise ValueError("ary and self must be the same size") if ary.shape != self.shape: @@ -240,12 +257,29 @@ raise ValueError("ary and self must have the same dtype") if self.size: - _memcpy_discontig(self, ary, async=async, stream=stream) + _memcpy_discontig(self, ary, async_=async_, stream=stream) def set_async(self, ary, stream=None): - return self.set(ary, async=True, stream=stream) + return self.set(ary, async_=True, stream=stream) + + def get(self, ary=None, pagelocked=False, async_=False, stream=None, **kwargs): + # {{{ handle 'async' deprecation + + async_arg = kwargs.pop("async", None) + if async_arg is not None: + if async_ is not None: + raise TypeError("may not specify both 'async' and 'async_'") + async_ = async_arg + + if async_ is None: + async_ = False + + if kwargs: + raise TypeError("extra keyword arguments specified: %s" + % ", ".join(kwargs)) + + # }}} - def get(self, ary=None, pagelocked=False, async=False, stream=None): if ary is None: if pagelocked: ary = drv.pagelocked_empty(self.shape, self.dtype) @@ -268,11 +302,11 @@ raise TypeError("self and ary must have the same dtype") if self.size: - _memcpy_discontig(ary, self, async=async, stream=stream) + _memcpy_discontig(ary, self, async_=async_, stream=stream) return ary def get_async(self, stream=None, 
ary=None): - return self.get(ary=ary, async=True, stream=stream) + return self.get(ary=ary, async_=True, stream=stream) def copy(self): new = GPUArray(self.shape, self.dtype, self.allocator) @@ -379,9 +413,8 @@ strides = None if dtype is None: dtype = self.dtype - else: - if dtype == self.dtype: - strides = self.strides + if dtype == self.dtype: + strides = self.strides return self.__class__(self.shape, dtype, allocator=self.allocator, strides=strides, order=order) @@ -605,14 +638,10 @@ return result - def __pow__(self, other): - """pow function:: - - example: - array = pow(array) - array = pow(array,4) - array = pow(array,array) - + def _pow(self, other, new): + """ + Do the pow operator. + with new, the user can choose between ipow or just pow """ if isinstance(other, GPUArray): @@ -622,7 +651,10 @@ assert self.shape == other.shape - result = self._new_like_me(_get_common_dtype(self, other)) + if new: + result = self._new_like_me(_get_common_dtype(self, other)) + else: + result = self func = elementwise.get_pow_array_kernel( self.dtype, other.dtype, result.dtype) @@ -637,7 +669,10 @@ raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") - result = self._new_like_me() + if new: + result = self._new_like_me() + else: + result = self func = elementwise.get_pow_kernel(self.dtype) func.prepared_async_call(self._grid, self._block, None, other, self.gpudata, result.gpudata, @@ -645,6 +680,28 @@ return result + def __pow__(self, other): + """pow function:: + + example: + array = pow(array) + array = pow(array,4) + array = pow(array,array) + + """ + return self._pow(other,new=True) + + def __ipow__(self, other): + """ipow function:: + + example: + array **= 4 + array **= array + + """ + return self._pow(other,new=False) + + def reverse(self, stream=None): """Return this array in reversed order. The array is treated as one-dimensional. 
@@ -834,7 +891,7 @@ array_stride = self.strides[array_axis] - new_shape.append((stop-start-1)//idx_stride+1) + new_shape.append((abs(stop-start)-1)//abs(idx_stride)+1) new_strides.append(idx_stride*array_stride) new_offset += array_stride*start @@ -1171,7 +1228,7 @@ return strides -def _memcpy_discontig(dst, src, async=False, stream=None): +def _memcpy_discontig(dst, src, async_=False, stream=None): """Copy the contents of src into dst. The two arrays should have the same dtype, shape, and order, but @@ -1208,7 +1265,9 @@ dst_strides = [dst.strides[axis] for axis in axes] # copy functions require contiguity in minor axis, so add new axis if needed - if len(shape) == 0 or src_strides[0] != src.dtype.itemsize or dst_strides[0] != dst.dtype.itemsize: + if (len(shape) == 0 + or src_strides[0] != src.dtype.itemsize + or dst_strides[0] != dst.dtype.itemsize): shape[0:0] = [1] src_strides[0:0] = [0] dst_strides[0:0] = [0] @@ -1221,7 +1280,7 @@ if dst_strides[i] < dst_strides[i-1]: raise ValueError("src and dst must have same order") if (src_strides[i-1] * shape[i-1] == src_strides[i] and - dst_strides[i-1] * shape[i-1] == dst_strides[i]): + dst_strides[i-1] * shape[i-1] == dst_strides[i]): shape[i-1:i+1] = [shape[i-1] * shape[i]] del src_strides[i] del dst_strides[i] @@ -1232,8 +1291,9 @@ if len(shape) <= 1: if isinstance(src, GPUArray): if isinstance(dst, GPUArray): - if async: - drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream) + if async_: + drv.memcpy_dtod_async( + dst.gpudata, src.gpudata, src.nbytes, stream=stream) else: drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes) else: @@ -1241,14 +1301,15 @@ # having no gaps, but the axes could be transposed # so that the order is neither Fortran or C. # So, we attempt to get a contiguous view of dst. 
- dst = _as_strided(dst, shape=(dst.size,), strides=(dst.dtype.itemsize,)) - if async: + dst = _as_strided( + dst, shape=(dst.size,), strides=(dst.dtype.itemsize,)) + if async_: drv.memcpy_dtoh_async(dst, src.gpudata, stream=stream) else: drv.memcpy_dtoh(dst, src.gpudata) else: src = _as_strided(src, shape=(src.size,), strides=(src.dtype.itemsize,)) - if async: + if async_: drv.memcpy_htod_async(dst.gpudata, src, stream=stream) else: drv.memcpy_htod(dst.gpudata, src) @@ -1259,7 +1320,9 @@ elif len(shape) == 3: copy = drv.Memcpy3D() else: - raise ValueError("more than 2 discontiguous axes not supported %s" % (tuple(sorted(axes)),)) + raise ValueError( + "more than 2 discontiguous axes not supported %s" + % (tuple(sorted(axes)),)) if isinstance(src, GPUArray): copy.set_src_device(src.gpudata) @@ -1278,22 +1341,24 @@ copy.height = shape[1] if len(shape) == 2: - if async: + if async_: copy(stream) else: copy(aligned=True) - else: # len(shape) == 3 + else: # len(shape) == 3 if src_strides[2] % src_strides[1] != 0: - raise RuntimeError("src's major stride must be a multiple of middle stride") + raise RuntimeError( + "src's major stride must be a multiple of middle stride") copy.src_height = src_strides[2] // src_strides[1] if dst_strides[2] % dst_strides[1] != 0: - raise RuntimeError("dst's major stride must be a multiple of middle stride") + raise RuntimeError( + "dst's major stride must be a multiple of middle stride") copy.dst_height = dst_strides[2] // dst_strides[1] copy.depth = shape[2] - if async: + if async_: copy(stream) else: copy() @@ -1500,13 +1565,13 @@ return a.transpose(axes) -def reshape(a, shape): +def reshape(a, *shape, **kwargs): """Gives a new shape to an array without changing its data. .. 
versionadded:: 2015.2 """ - return a.reshape(shape) + return a.reshape(*shape, **kwargs) # }}} diff -Nru pycuda-2017.1.1/pycuda/__init__.py pycuda-2018.1.1/pycuda/__init__.py --- pycuda-2017.1.1/pycuda/__init__.py 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/pycuda/__init__.py 2018-10-31 18:05:29.000000000 +0000 @@ -1,3 +1,3 @@ -VERSION = (2017, 1, 1) +VERSION = (2018, 1, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff -Nru pycuda-2017.1.1/pycuda/scan.py pycuda-2018.1.1/pycuda/scan.py --- pycuda-2017.1.1/pycuda/scan.py 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/pycuda/scan.py 2018-10-31 18:05:29.000000000 +0000 @@ -345,7 +345,7 @@ class _ScanKernelBase(object): def __init__(self, dtype, scan_expr, neutral=None, - name_prefix="scan", options=[], preamble="", devices=None): + name_prefix="scan", options=None, preamble="", devices=None): if isinstance(self, ExclusiveScanKernel) and neutral is None: raise ValueError("neutral element is required for exclusive scan") diff -Nru pycuda-2017.1.1/pycuda/sparse/pkt_build.py pycuda-2018.1.1/pycuda/sparse/pkt_build.py --- pycuda-2017.1.1/pycuda/sparse/pkt_build.py 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/pycuda/sparse/pkt_build.py 2018-10-31 18:05:29.000000000 +0000 @@ -12,6 +12,7 @@ packet_start = 0 base_dof_nr = 0 + max_thread_costs = int(max_thread_costs) index_array = np.zeros( max_thread_costs*thread_count, dtype=spmv.packed_index_dtype) data_array = np.zeros( diff -Nru pycuda-2017.1.1/src/cpp/cuda.hpp pycuda-2018.1.1/src/cpp/cuda.hpp --- pycuda-2017.1.1/src/cpp/cuda.hpp 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/src/cpp/cuda.hpp 2018-10-31 18:05:29.000000000 +0000 @@ -1556,7 +1556,7 @@ py::handle<>( #if PY_VERSION_HEX >= 0x03030000 PyMemoryView_FromMemory((char *) (m_devptr + offset), size, - PyBUF_READ | PyBUF_WRITE) + PyBUF_WRITE) #else /* Py2 */ PyBuffer_FromReadWriteMemory((void *) (m_devptr + offset), size) #endif diff 
-Nru pycuda-2017.1.1/src/wrapper/wrap_cudadrv.cpp pycuda-2018.1.1/src/wrapper/wrap_cudadrv.cpp --- pycuda-2017.1.1/src/wrapper/wrap_cudadrv.cpp 2017-12-27 21:34:52.000000000 +0000 +++ pycuda-2018.1.1/src/wrapper/wrap_cudadrv.cpp 2018-10-31 18:05:29.000000000 +0000 @@ -1019,7 +1019,7 @@ #if CUDAPP_CUDA_VERSION >= 3020 .value("MALLOC_HEAP_SIZE", CU_LIMIT_MALLOC_HEAP_SIZE) #endif -#if CUDAPP_CUDA_VERSION >= 3050 +#if CUDAPP_CUDA_VERSION >= 4010 .value("DEV_RUNTIME_SYNC_DEPTH", CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH) .value("DEV_RUNTIME_PENDING_LAUNCH_COUNT", CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT) #endif diff -Nru pycuda-2017.1.1/test/test_cumath.py pycuda-2018.1.1/test/test_cumath.py --- pycuda-2017.1.1/test/test_cumath.py 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/test/test_cumath.py 2018-10-31 18:05:29.000000000 +0000 @@ -78,9 +78,9 @@ test_sqrt = make_unary_function_test("sqrt", 1e-5, 1, 2e-7) test_sin = make_unary_function_test("sin", -10, 10, 1e-7) - test_sin_c = make_unary_function_test("sin", -3, 3, 2e-6, complex=True) + test_sin_c = make_unary_function_test("sin", -3, 3, 2.1e-6, complex=True) test_cos = make_unary_function_test("cos", -10, 10, 1e-7) - test_cos_c = make_unary_function_test("cos", -3, 3, 2e-6, complex=True) + test_cos_c = make_unary_function_test("cos", -3, 3, 2.1e-6, complex=True) test_asin = make_unary_function_test("asin", -0.9, 0.9, 5e-7) #test_sin_c = make_unary_function_test("sin", -0.9, 0.9, 2e-6, complex=True) test_acos = make_unary_function_test("acos", -0.9, 0.9, 5e-7) @@ -242,5 +242,5 @@ if len(sys.argv) > 1: exec (sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) diff -Nru pycuda-2017.1.1/test/test_driver.py pycuda-2018.1.1/test/test_driver.py --- pycuda-2017.1.1/test/test_driver.py 2017-12-27 21:34:50.000000000 +0000 +++ pycuda-2018.1.1/test/test_driver.py 2018-10-31 18:05:29.000000000 +0000 @@ -1,10 +1,8 @@ -from __future__ import division -from __future__ import 
absolute_import -from __future__ import print_function +from __future__ import division, absolute_import, print_function import numpy as np import numpy.linalg as la from pycuda.tools import mark_cuda_test, dtype_to_ctype -import pytest +import pytest # noqa from six.moves import range @@ -12,7 +10,7 @@ try: import pycuda # noqa return True - except: + except Exception: return False @@ -98,7 +96,7 @@ a = gpuarray.vec.make_float3(1, 2, 3) dest = np.empty((400), gpuarray.vec.float3) - set_them(drv.Out(dest), a, block=(400,1,1)) + set_them(drv.Out(dest), a, block=(400, 1, 1)) assert (dest == a).all() @mark_cuda_test @@ -905,7 +903,7 @@ drv.memcpy_dtoh(e, e_gpu) drv.memcpy_dtoh(f, f_gpu) - #print(c,d,e,f) + # print(c,d,e,f) a = np.random.randint(10, size=100) b = np.random.randint(10, size=100) @@ -918,6 +916,7 @@ @mark_cuda_test def test_jit_link_module(self): + from pycuda.compiler import DEFAULT_NVCC_FLAGS if drv.Context.get_device().compute_capability() < (3, 5): from pytest import skip skip("need compute capability 3.5 or higher for dynamic parallelism") @@ -936,13 +935,17 @@ from pycuda.compiler import DynamicModule mod = DynamicModule() - mod.add_source(test_outer_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) - mod.add_source(test_inner_cu, nvcc_options=['-rdc=true', '-lcudadevrt']) + mod.add_source( + test_outer_cu, nvcc_options=( + ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)) + mod.add_source( + test_inner_cu, nvcc_options=( + ['-rdc=true', '-lcudadevrt']+DEFAULT_NVCC_FLAGS)) mod.add_stdlib('cudadevrt') mod.link() test_kernel = mod.get_function('test_kernel') - test_kernel(grid=(2,1), block=(1,1,1)) + test_kernel(grid=(2, 1), block=(1, 1, 1)) def test_import_pyopencl_before_pycuda(): @@ -959,7 +962,7 @@ import sys if len(sys.argv) > 1: - exec (sys.argv[1]) + exec(sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__]) diff -Nru pycuda-2017.1.1/test/test_gpuarray.py pycuda-2018.1.1/test/test_gpuarray.py --- 
pycuda-2017.1.1/test/test_gpuarray.py 2017-12-27 21:34:52.000000000 +0000 +++ pycuda-2018.1.1/test/test_gpuarray.py 2018-10-31 18:05:29.000000000 +0000 @@ -36,6 +36,10 @@ result = (a_gpu**a_gpu).get() assert (np.abs(pow(a, a) - result) < 1e-3).all() + a_gpu **= a_gpu + a_gpu = a_gpu.get() + assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all() + @mark_cuda_test def test_pow_number(self): a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) @@ -44,6 +48,10 @@ result = pow(a_gpu, 2).get() assert (np.abs(a**2 - result) < 1e-3).all() + a_gpu **= 2 + a_gpu = a_gpu.get() + assert (np.abs(a**2 - a_gpu) < 1e-3).all() + @mark_cuda_test def test_numpy_integer_shape(self): gpuarray.empty(np.int32(17), np.float32) @@ -944,6 +952,9 @@ @mark_cuda_test def test_dot_allocator(self): + from pytest import skip + skip("https://github.com/inducer/pycuda/issues/163") + import pycuda.tools pool = pycuda.tools.DeviceMemoryPool() @@ -1142,5 +1153,5 @@ if len(sys.argv) > 1: exec (sys.argv[1]) else: - from py.test.cmdline import main + from pytest import main main([__file__])