diff -Nru pyopencl-2016.1+git20161003/debian/changelog pyopencl-2016.1+git20161130/debian/changelog
--- pyopencl-2016.1+git20161003/debian/changelog	2016-11-22 21:48:00.000000000 +0000
+++ pyopencl-2016.1+git20161130/debian/changelog	2016-12-05 17:43:23.000000000 +0000
@@ -1,3 +1,13 @@
+pyopencl (2016.1+git20161130-1) unstable; urgency=medium
+
+  * New upstream release.
+  * Add DEB_BUILD_PROFILES support for disabling documentation building.
+  * Patch Sphinx documentation to allow for reproducible builds.
+  * Do not try to fetch external Sphinx objects, use ones from Debian
+    packages.
+
+ -- Tomasz Rybak <tomasz.rybak@post.pl>  Mon, 05 Dec 2016 18:43:23 +0100
+
 pyopencl (2016.1+git20161003-2) unstable; urgency=medium
 
   * Drop mips64el architecture as there is no OpenCL ICD for it.
diff -Nru pyopencl-2016.1+git20161003/debian/control pyopencl-2016.1+git20161130/debian/control
--- pyopencl-2016.1+git20161003/debian/control	2016-11-22 21:48:00.000000000 +0000
+++ pyopencl-2016.1+git20161130/debian/control	2016-12-05 17:43:23.000000000 +0000
@@ -8,6 +8,9 @@
  python-all-dbg,
  python3-all-dev,
  python3-all-dbg,
+ python3-doc <!nodoc>,
+ python-numpy-doc <!nodoc>,
+ python-mako-doc <!nodoc>,
  python-cffi (>= 1.1.0),
  python-cffi-backend-dbg,
  python3-cffi (>= 1.1.0),
@@ -19,8 +22,8 @@
  mesa-common-dev,
  python-numpy (>= 1:1.4.1-4~),
  python3-numpy,
- python-sphinx (>= 1.0.7+dfsg),
- python3-sphinx (>= 1.0.7+dfsg),
+ python-sphinx (>= 1.0.7+dfsg) <!nodoc>,
+ python3-sphinx (>= 1.0.7+dfsg) <!nodoc>,
  python3-mako,
  python-pytools (>= 2015.1.2),
  python3-pytools (>= 2015.1.2)
@@ -41,7 +44,7 @@
  python-numpy,
  python-pytools (>= 2015.1.2),
  python-six
-Recommends: python-pyopencl-doc,
+Recommends: python-pyopencl-doc <!nodoc>,
  python-opengl,
  python-mako
 Suggests: python-imaging-tk,
@@ -104,7 +107,7 @@
  python3-numpy,
  python3-pytools (>= 2015.1.2),
  python3-six
-Recommends: python-pyopencl-doc,
+Recommends: python-pyopencl-doc <!nodoc>,
  python3-mako
 Suggests: python3-imaging-tk,
  python3-opengl,
@@ -163,8 +166,12 @@
 Section: doc
 Architecture: all
 Multi-Arch: foreign
+Build-Profiles: <!nodoc>
 Depends: ${sphinxdoc:Depends},
  ${misc:Depends}
+Recommends: python3-doc,
+ python-numpy-doc,
+ python-mako-doc
 Suggests: python-pyopencl,
  python3-pyopencl
 Description: module to access OpenCL parallel computation API (documentation)
diff -Nru pyopencl-2016.1+git20161003/debian/patches/reproducible-documentation.patch pyopencl-2016.1+git20161130/debian/patches/reproducible-documentation.patch
--- pyopencl-2016.1+git20161003/debian/patches/reproducible-documentation.patch	1970-01-01 00:00:00.000000000 +0000
+++ pyopencl-2016.1+git20161130/debian/patches/reproducible-documentation.patch	2016-12-05 17:43:23.000000000 +0000
@@ -0,0 +1,49 @@
+Description: Ensure reproducible documentation
+ Do not fetch external Sphinx inventory but use one from Debian packages.
+ Use OrderedDict to ensure the same order of added external links.
+Forwarded: not-needed
+Author: Tomasz Rybak <tomasz.rybak@post.pl>
+Last-Update: 2016-11-28
+Index: pyopencl-2016.1+git20161003/doc/conf.py
+===================================================================
+--- pyopencl-2016.1+git20161003.orig/doc/conf.py
++++ pyopencl-2016.1+git20161003/doc/conf.py
+@@ -1,6 +1,7 @@
+ # -*- coding: utf-8 -*-
+ 
+ from __future__ import absolute_import
++import collections
+ 
+ # PyOpenCL documentation build configuration file, created by
+ # sphinx-quickstart on Fri Jun 13 00:51:19 2008.
+@@ -92,10 +93,10 @@ pygments_style = 'sphinx'
+ html_theme = "alabaster"
+ 
+ html_theme_options = {
+-        "extra_nav_links": {
+-            "🚀 Github": "https://github.com/pyopencl/pyopencl",
+-            "💾 Download Releases": "https://pypi.python.org/pypi/pyopencl",
+-            }
++        "extra_nav_links": collections.OrderedDict((
++            ("🚀 Github", "https://github.com/pyopencl/pyopencl"),
++            ("💾 Download Releases", "https://pypi.python.org/pypi/pyopencl"),
++            ))
+         }
+ 
+ html_sidebars = {
+@@ -192,9 +193,12 @@ latex_documents = [
+ #latex_use_modindex = True
+ 
+ intersphinx_mapping = {
+-        'http://docs.python.org/dev': None,
+-        'http://docs.scipy.org/doc/numpy/': None,
+-        'http://docs.makotemplates.org/en/latest/': None,
++        'file:///usr/share/doc/python3-doc/html/':
++	    '/usr/share/doc/python3-doc/html/objects.inv',
++        'file:///usr/share/doc/python-numpy-doc/html/':
++            '/usr/share/doc/python-numpy-doc/html/objects.inv',
++        'file:///usr/share/doc/python-mako-doc/html/':
++            '/usr/share/doc/python-mako-doc/html/objects.inv',
+         }
+ 
+ autoclass_content = "both"
diff -Nru pyopencl-2016.1+git20161003/debian/patches/series pyopencl-2016.1+git20161130/debian/patches/series
--- pyopencl-2016.1+git20161003/debian/patches/series	2016-11-22 21:48:00.000000000 +0000
+++ pyopencl-2016.1+git20161130/debian/patches/series	2016-12-05 17:43:23.000000000 +0000
@@ -1,2 +1,3 @@
 hardening.patch
 fix-setup.patch
+reproducible-documentation.patch
diff -Nru pyopencl-2016.1+git20161003/debian/rules pyopencl-2016.1+git20161130/debian/rules
--- pyopencl-2016.1+git20161003/debian/rules	2016-11-22 21:48:00.000000000 +0000
+++ pyopencl-2016.1+git20161130/debian/rules	2016-12-05 17:43:23.000000000 +0000
@@ -2,10 +2,6 @@
 
 #export DH_VERBOSE = 1
 #export DH_BUILD_DDEBS = 1
-#PACKAGE_DIR=$(CURDIR)/debian/python$(if $(patsubst 3.%,,$(1)),,3)-pyopencl
-#INSTALL_DIR=$(CURDIR)/debian/tmp/
-#PYTHON2=$(shell pyversions -vr)
-#PYTHON3=$(shell py3versions -vr)
 
 export DEB_BUILD_MAINT_OPTIONS = hardening=+bindnow
 DPKG_EXPORT_BUILDFLAGS = 1
@@ -38,8 +34,10 @@
 	dh_numpy3
 
 override_dh_installdocs:
+ifeq ($(filter nodoc,$(DEB_BUILD_PROFILES)),)
 	pybuild --build -i pythonX.Y -s custom --build-args 'make -C {dir}/doc html'
 	dh_installdocs
+endif
 
 # Sphinx documentation is architecture-independent
 override_dh_sphinxdoc-arch:
@@ -62,7 +60,7 @@
 MODULE_NAME=pyopencl
 DEB_UPSTREAM_VERSION=$(shell dpkg-parsechangelog \
 	| sed -rne 's/^Version: ([^-]+).*/\1/p')
-GIT_REVISION=3118256c2aa939969a0a7e91be8e98a333e94cc5
+GIT_REVISION=19015994653dffe2ee407271e19a46e1d6a62796
 GIT_SUBMODULES=pyopencl/compyte
 GIT_URL=http://git.tiker.net/trees/pyopencl.git 
 
diff -Nru pyopencl-2016.1+git20161003/doc/misc.rst pyopencl-2016.1+git20161130/doc/misc.rst
--- pyopencl-2016.1+git20161003/doc/misc.rst	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/doc/misc.rst	2016-12-05 17:46:14.000000000 +0000
@@ -103,7 +103,7 @@
 User-visible Changes
 ====================
 
-Version 2016.2
+Version 2016.3
 --------------
 
 .. note::
@@ -111,6 +111,9 @@
     This version is currently under development. You can get snapshots from
     PyOpenCL's `git repository <https://github.com/pyopencl/pyopencl>`_
 
+Version 2016.2
+--------------
+
 * Deprecate RANLUXCL. It will be removed in the 2018.x series of PyOpenCL.
 * Introduce Random123 random number generators. See :mod:`pyopencl.clrandom`
   for more information.
diff -Nru pyopencl-2016.1+git20161003/examples/median-filter.py pyopencl-2016.1+git20161130/examples/median-filter.py
--- pyopencl-2016.1+git20161003/examples/median-filter.py	1970-01-01 00:00:00.000000000 +0000
+++ pyopencl-2016.1+git20161130/examples/median-filter.py	2016-12-05 17:46:14.000000000 +0000
@@ -0,0 +1,100 @@
+import pyopencl as cl
+import numpy as np
+from scipy.misc import imread, imsave
+
+#Read in image
+img = imread('noisyImage.jpg', flatten=True).astype(np.float32)
+
+# Get platforms, both CPU and GPU
+plat = cl.get_platforms()
+CPU = plat[0].get_devices()
+try:
+    GPU = plat[1].get_devices()
+except IndexError:
+    GPU = "none"
+
+#Create context for GPU/CPU
+if GPU!= "none":
+    ctx = cl.Context(GPU)
+else:
+    ctx = cl.Context(CPU)
+
+# Create queue for each kernel execution
+queue = cl.CommandQueue(ctx)
+
+mf = cl.mem_flags
+
+# Kernel function
+src = '''
+void sort(float *a, float *b, float *c) {
+   float swap;
+   if(*a > *b) {
+      swap = *a;
+      *a = *b;
+      *b = swap;
+   }
+   if(*a > *c) {
+      swap = *a;
+      *a = *c;
+      *c = swap;
+   }
+   if(*b > *c) {
+      swap = *b;
+      *b = *c;
+      *c = swap;
+   }
+}
+__kernel void medianFilter(__global float *img, __global float *result, __global int *width, __global int *height)
+{
+    int w = *width;
+    int h = *height;
+    int posx = get_global_id(1);
+    int posy = get_global_id(0);
+    int i = w*posy + posx;
+    // Keeping the edge pixels the same
+    if( posx == 0 || posy == 0 || posx == w-1 || posy == h-1 )
+    {
+        result[i] = img[i];
+    }
+    else
+    {
+        float pixel00, pixel01, pixel02, pixel10, pixel11, pixel12, pixel20, pixel21, pixel22;
+        pixel00 = img[i - 1 - w];
+        pixel01 = img[i- w];
+        pixel02 = img[i + 1 - w];
+        pixel10 = img[i - 1];
+        pixel11 = img[i];
+        pixel12 = img[i + 1];
+        pixel20 = img[i - 1 + w];
+        pixel21 = img[i + w];
+        pixel22 = img[i + 1 + w];
+        //sort the rows
+        sort( &(pixel00), &(pixel01), &(pixel02) );
+        sort( &(pixel10), &(pixel11), &(pixel12) );
+        sort( &(pixel20), &(pixel21), &(pixel22) );
+        //sort the columns
+        sort( &(pixel00), &(pixel10), &(pixel20) );
+        sort( &(pixel01), &(pixel11), &(pixel21) );
+        sort( &(pixel02), &(pixel12), &(pixel22) );
+        //sort the anti-diagonal (the main diagonal is already ordered here)
+        sort( &(pixel02), &(pixel11), &(pixel20) );
+        // median is the middle value of the anti-diagonal
+        result[i] = pixel11;
+    }
+}
+'''
+
+#Kernel function instantiation
+prg = cl.Program(ctx, src).build()
+#Allocate memory for variables on the device
+img_g =  cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=img)
+result_g = cl.Buffer(ctx, mf.WRITE_ONLY, img.nbytes)
+width_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[1]))
+height_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(img.shape[0]))
+# Call Kernel. Automatically takes care of block/grid distribution
+prg.medianFilter(queue, img.shape, None , img_g, result_g, width_g, height_g)
+result = np.empty_like(img)
+cl.enqueue_copy(queue, result, result_g)
+
+# Save the median-filtered image
+imsave('medianFilter-OpenCL.jpg',result)
\ No newline at end of file
diff -Nru pyopencl-2016.1+git20161003/.gitlab-ci.yml pyopencl-2016.1+git20161130/.gitlab-ci.yml
--- pyopencl-2016.1+git20161003/.gitlab-ci.yml	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/.gitlab-ci.yml	2016-12-05 17:46:14.000000000 +0000
@@ -12,6 +12,7 @@
   - opengl
   except:
   - tags
+
 Python 3.5 Intel CPU:
   script:
   - export PY_EXE=python3.5
@@ -48,6 +49,7 @@
   - amd-cl-cpu
   except:
   - tags
+
 Python 3.5 Titan X:
   script:
   - export PY_EXE=python3.5
@@ -60,6 +62,7 @@
   - nvidia-titan-x
   except:
   - tags
+
 Python 3.5 K40:
   script:
   - export PY_EXE=python3.5
@@ -72,6 +75,7 @@
   - nvidia-k40
   except:
   - tags
+
 Python 3.5 AMD GPU:
   script:
   - export PY_EXE=python3.5
@@ -84,6 +88,7 @@
   - amd-fiji
   except:
   - tags
+
 Python 3.5 POCL CL 1.1:
   script:
   - export PY_EXE=python3.5
@@ -97,6 +102,7 @@
   - pocl
   except:
   - tags
+
 Python 2.7 POCL:
   script:
   - export PY_EXE=python2.7
@@ -109,6 +115,7 @@
   - pocl
   except:
   - tags
+
 Python 2.7 Apple:
   script:
   - export PY_EXE=python2.7
@@ -121,6 +128,7 @@
   - apple
   except:
   - tags
+
 PyPy POCL:
   script:
   - export PY_EXE=pypy
@@ -133,6 +141,7 @@
   - pocl
   except:
   - tags
+
 Documentation:
   script:
   - EXTRA_INSTALL="numpy mako"
@@ -142,3 +151,12 @@
   - python3.5
   only:
   - master
+
+Flake8:
+  script:
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
+  - ". ./prepare-and-run-flake8.sh pyopencl test"
+  tags:
+  - python3.5
+  except:
+  - tags
diff -Nru pyopencl-2016.1+git20161003/pyopencl/algorithm.py pyopencl-2016.1+git20161130/pyopencl/algorithm.py
--- pyopencl-2016.1+git20161003/pyopencl/algorithm.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/algorithm.py	2016-12-05 17:46:14.000000000 +0000
@@ -300,6 +300,7 @@
     dtype = get_or_register_dtype(name, dtype)
     return name, dtype, c_decl
 
+
 # {{{ types, helpers preamble
 
 RADIX_SORT_PREAMBLE_TPL = Template(r"""//CL//
diff -Nru pyopencl-2016.1+git20161003/pyopencl/array.py pyopencl-2016.1+git20161130/pyopencl/array.py
--- pyopencl-2016.1+git20161003/pyopencl/array.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/array.py	2016-12-05 17:46:14.000000000 +0000
@@ -48,6 +48,7 @@
     return _get_common_dtype_base(obj1, obj2,
             has_double_support(queue.device))
 
+
 # Work around PyPy not currently supporting the object dtype.
 # (Yes, it doesn't even support checking!)
 # (as of May 27, 2014 on PyPy 2.3)
@@ -150,6 +151,7 @@
             vec.types[np.dtype(base_type), count] = dtype
             vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count
 
+
 _create_vector_types()
 
 # }}}
@@ -393,7 +395,15 @@
     .. automethod :: __rdiv__
     .. automethod :: __pow__
 
+    .. automethod :: __and__
+    .. automethod :: __xor__
+    .. automethod :: __or__
+    .. automethod :: __iand__
+    .. automethod :: __ixor__
+    .. automethod :: __ior__
+
     .. automethod :: __abs__
+    .. automethod :: __invert__
 
     .. UNDOC reverse()
 
@@ -862,6 +872,29 @@
             return self.__class__(self.context, self.shape, dtype,
                     strides=strides, allocator=self.allocator)
 
+    @staticmethod
+    @elwise_kernel_runner
+    def _scalar_binop(out, a, b, queue=None, op=None):
+        return elementwise.get_array_scalar_binop_kernel(
+                out.context, op, out.dtype, a.dtype,
+                np.array(b).dtype)
+
+    @staticmethod
+    @elwise_kernel_runner
+    def _array_binop(out, a, b, queue=None, op=None):
+        if a.shape != b.shape:
+            raise ValueError("shapes of binop arguments do not match")
+        return elementwise.get_array_binop_kernel(
+                out.context, op, out.dtype, a.dtype, b.dtype)
+
+    @staticmethod
+    @elwise_kernel_runner
+    def _unop(out, a, queue=None, op=None):
+        if out.shape != a.shape:
+            raise ValueError("shapes of arguments do not match")
+        return elementwise.get_unop_kernel(
+                out.context, op, out.dtype, a.dtype)
+
     # }}}
 
     # {{{ operators
@@ -1041,6 +1074,99 @@
 
     __rtruediv__ = __rdiv__
 
+    def __and__(self, other):
+        common_dtype = _get_common_dtype(self, other, self.queue)
+
+        if not np.issubdtype(common_dtype, np.integer):
+            raise TypeError("Integral types only")
+
+        if isinstance(other, Array):
+            result = self._new_like_me(common_dtype)
+            result.add_event(self._array_binop(result, self, other, op="&"))
+        else:
+            # create a new array for the result
+            result = self._new_like_me(common_dtype)
+            result.add_event(
+                    self._scalar_binop(result, self, other, op="&"))
+
+        return result
+
+    __rand__ = __and__  # commutes
+
+    def __or__(self, other):
+        common_dtype = _get_common_dtype(self, other, self.queue)
+
+        if not np.issubdtype(common_dtype, np.integer):
+            raise TypeError("Integral types only")
+
+        if isinstance(other, Array):
+            result = self._new_like_me(common_dtype)
+            result.add_event(self._array_binop(result, self, other, op="|"))
+        else:
+            # create a new array for the result
+            result = self._new_like_me(common_dtype)
+            result.add_event(
+                    self._scalar_binop(result, self, other, op="|"))
+
+        return result
+
+    __ror__ = __or__  # commutes
+
+    def __xor__(self, other):
+        common_dtype = _get_common_dtype(self, other, self.queue)
+
+        if not np.issubdtype(common_dtype, np.integer):
+            raise TypeError("Integral types only")
+
+        if isinstance(other, Array):
+            result = self._new_like_me(common_dtype)
+            result.add_event(self._array_binop(result, self, other, op="^"))
+        else:
+            # create a new array for the result
+            result = self._new_like_me(common_dtype)
+            result.add_event(
+                    self._scalar_binop(result, self, other, op="^"))
+
+        return result
+
+    __rxor__ = __xor__  # commutes
+
+    def __iand__(self, other):
+        """Update in place with ``self & other`` (integers only); return *self*."""
+        common_dtype = _get_common_dtype(self, other, self.queue)
+        if not np.issubdtype(common_dtype, np.integer):
+            raise TypeError("Integral types only")
+
+        if isinstance(other, Array):
+            self.add_event(self._array_binop(self, self, other, op="&"))
+        else:
+            self.add_event(self._scalar_binop(self, self, other, op="&"))
+        return self
+
+    def __ior__(self, other):
+        """Update in place with ``self | other`` (integers only); return *self*."""
+        common_dtype = _get_common_dtype(self, other, self.queue)
+        if not np.issubdtype(common_dtype, np.integer):
+            raise TypeError("Integral types only")
+
+        if isinstance(other, Array):
+            self.add_event(self._array_binop(self, self, other, op="|"))
+        else:
+            self.add_event(self._scalar_binop(self, self, other, op="|"))
+        return self
+
+    def __ixor__(self, other):
+        """Update in place with ``self ^ other`` (integers only); return *self*."""
+        common_dtype = _get_common_dtype(self, other, self.queue)
+        if not np.issubdtype(common_dtype, np.integer):
+            raise TypeError("Integral types only")
+
+        if isinstance(other, Array):
+            self.add_event(self._array_binop(self, self, other, op="^"))
+        else:
+            self.add_event(self._scalar_binop(self, self, other, op="^"))
+        return self
+
     def _zero_fill(self, queue=None, wait_for=None):
         queue = queue or self.queue
 
@@ -1109,6 +1235,15 @@
                 self._rpow_scalar(result, common_dtype.type(other), self))
         return result
 
+    def __invert__(self):
+        if not np.issubdtype(self.dtype, np.integer):
+            raise TypeError("Integral types only")
+
+        result = self._new_like_me()
+        result.add_event(self._unop(result, self, op="~"))
+
+        return result
+
     # }}}
 
     def reverse(self, queue=None):
@@ -1650,6 +1785,8 @@
             Added *wait_for*.
         """
 
+        queue = queue or self.queue or value.queue
+
         if isinstance(subscript, Array):
             if subscript.dtype.kind != "i":
                 raise TypeError(
@@ -1659,14 +1796,12 @@
                         "multidimensional fancy indexing is not supported")
             if len(self.shape) != 1:
                 raise NotImplementedError(
-                        "fancy indexing into a multi-d array is supported")
+                        "fancy indexing into a multi-d array is not supported")
 
-            multi_put([value], subscript, out=[self], queue=self.queue,
+            multi_put([value], subscript, out=[self], queue=queue,
                     wait_for=wait_for)
             return
 
-        queue = queue or self.queue or value.queue
-
         subarray = self[subscript]
 
         if isinstance(value, np.ndarray):
@@ -1738,7 +1873,7 @@
 
 # {{{ creation helpers
 
-class _same_as_transfer(object):
+class _same_as_transfer(object):  # noqa
     pass
 
 
@@ -2360,6 +2495,7 @@
 
     return f
 
+
 min = _make_minmax_kernel("min")
 min.__doc__ = """
     .. versionadded:: 2011.1
@@ -2379,6 +2515,7 @@
 
     return f
 
+
 subset_min = _make_subset_minmax_kernel("min")
 subset_min.__doc__ = """.. versionadded:: 2011.1"""
 subset_max = _make_subset_minmax_kernel("max")
diff -Nru pyopencl-2016.1+git20161003/pyopencl/bitonic_sort_templates.py pyopencl-2016.1+git20161130/pyopencl/bitonic_sort_templates.py
--- pyopencl-2016.1+git20161003/pyopencl/bitonic_sort_templates.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/bitonic_sort_templates.py	2016-12-05 17:46:14.000000000 +0000
@@ -36,7 +36,7 @@
 
 # {{{ defines
 
-defines = """//CL//
+defines = """//CL//  # noqa
 
 % if dtype == "double":
     #if __OPENCL_C_VERSION__ < 120
@@ -78,7 +78,7 @@
                         x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb;${NS}
                         y[a] = (swap)?auyb:auya; y[b] = (swap)?auya:auyb;}
 #define B2V(x,y,a)  { ORDERV(x,y,a,a+1) }
-#define B4V(x,y,a)  { for (int i4=0;i4<2;i4++) { ORDERV(x,y,a+i4,a+i4+2) } B2V(x,y,a) B2V(x,y,a+2) }
+#define B4V(x,y,a)  { for (int i4=0;i4<2;i4++) { ORDERV(x,y,a+i4,a+i4+2) } B2V(x,y,a) B2V(x,y,a+2) } 
 #define B8V(x,y,a)  { for (int i8=0;i8<4;i8++) { ORDERV(x,y,a+i8,a+i8+4) } B4V(x,y,a) B4V(x,y,a+4) }
 #define B16V(x,y,a) { for (int i16=0;i16<8;i16++) { ORDERV(x,y,a+i16,a+i16+8) } B8V(x,y,a) B8V(x,y,a+8) }
 % else:
@@ -328,7 +328,7 @@
 
 # IF YOU REENABLE THIS, YOU NEED TO ADJUST LOCAL_MEM_FACTOR TO 4
 
-ParallelBitonic_C4 = """//CL//
+ParallelBitonic_C4 = """//CL//  # noqa
 //ParallelBitonic_C4
 __kernel void run\\
 % if argsort:
@@ -402,7 +402,7 @@
 
 # {{{ local merge
 
-ParallelMerge_Local = """//CL//
+ParallelMerge_Local = """//CL//  # noqa
 // N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
 __kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
 {
@@ -450,7 +450,7 @@
 
 # {{{
 
-ParallelBitonic_Local = """//CL//
+ParallelBitonic_Local = """//CL//  # noqa
 // N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
 __kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
 {
@@ -521,7 +521,7 @@
 
 # {{{ local optim
 
-ParallelBitonic_Local_Optim = """//CL//
+ParallelBitonic_Local_Optim = """//CL//  # noqa
 __kernel void run\\
 % if argsort:
 (__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
diff -Nru pyopencl-2016.1+git20161003/pyopencl/_buffers.py pyopencl-2016.1+git20161130/pyopencl/_buffers.py
--- pyopencl-2016.1+git20161003/pyopencl/_buffers.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/_buffers.py	2016-12-05 17:46:14.000000000 +0000
@@ -108,7 +108,8 @@
     CheckBuffer.restype = ctypes.c_int
 except AttributeError as err:
     # Python 2.6 doesn't appear to have CheckBuffer support...
-    CheckBuffer = lambda x: True
+    def CheckBuffer(x):  # noqa
+        return True
 
 IncRef = ctypes.pythonapi.Py_IncRef
 IncRef.argtypes = [ctypes.py_object]
diff -Nru pyopencl-2016.1+git20161003/pyopencl/capture_call.py pyopencl-2016.1+git20161130/pyopencl/capture_call.py
--- pyopencl-2016.1+git20161003/pyopencl/capture_call.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/capture_call.py	2016-12-05 17:46:14.000000000 +0000
@@ -93,7 +93,7 @@
                     arg.dtype.type.__name__, repr(complex(arg))))
             else:
                 try:
-                    arg_buf = buffer(arg)
+                    arg_buf = memoryview(arg)
                 except:
                     raise RuntimeError("cannot capture: "
                             "unsupported arg nr %d (0-based)" % i)
@@ -150,7 +150,7 @@
     for name, val in arg_data:
         cg("%s = (" % name)
         with Indentation(cg):
-            val = str(b64encode(compress(buffer(val))))
+            val = str(b64encode(compress(memoryview(val))))
             i = 0
             while i < len(val):
                 cg(repr(val[i:i+line_len]))
diff -Nru pyopencl-2016.1+git20161003/pyopencl/cffi_cl.py pyopencl-2016.1+git20161130/pyopencl/cffi_cl.py
--- pyopencl-2016.1+git20161003/pyopencl/cffi_cl.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/cffi_cl.py	2016-12-05 17:46:14.000000000 +0000
@@ -498,6 +498,7 @@
 
 # }}}
 
+
 _locals = locals()
 
 
@@ -2071,6 +2072,7 @@
         else:
             return self.event.get_profiling_info(inf_attr)
 
+
 Event.profile = property(ProfilingInfoGetter)
 
 
@@ -2609,6 +2611,7 @@
         return Event._create(ptr_event[0])
     return enqueue_gl_objects
 
+
 if _lib.have_gl():
     enqueue_acquire_gl_objects = _create_gl_enqueue(
         _lib.enqueue_acquire_gl_objects)
@@ -3003,6 +3006,7 @@
         else:
             return self.event.get_image_info(inf_attr)
 
+
 Image.image = property(_ImageInfoGetter)
 
 # }}}
@@ -3124,6 +3128,7 @@
             setattr(cls, info_lower, make_getinfo(
                     info_method, info_name, info_constant))
 
+
 add_get_info_attrs(Platform, Platform.get_info, platform_info),
 add_get_info_attrs(Device, Device.get_info, device_info,
                 ["PLATFORM", "MAX_WORK_GROUP_SIZE", "MAX_COMPUTE_UNITS"])
diff -Nru pyopencl-2016.1+git20161003/pyopencl/clmath.py pyopencl-2016.1+git20161130/pyopencl/clmath.py
--- pyopencl-2016.1+git20161003/pyopencl/clmath.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/clmath.py	2016-12-05 17:46:14.000000000 +0000
@@ -46,6 +46,7 @@
 
     return f
 
+
 # See table 6.8 in the CL 1.1 spec
 acos = _make_unary_array_func("acos")
 acosh = _make_unary_array_func("acosh")
@@ -171,6 +172,7 @@
     _ldexp(result, significand, exponent)
     return result
 
+
 lgamma = _make_unary_array_func("lgamma")
 # TODO: lgamma_r
 
@@ -200,6 +202,7 @@
     _modf(intpart, fracpart, arg, queue=queue)
     return fracpart, intpart
 
+
 nan = _make_unary_array_func("nan")
 
 # TODO: nextafter
diff -Nru pyopencl-2016.1+git20161003/pyopencl/elementwise.py pyopencl-2016.1+git20161130/pyopencl/elementwise.py
--- pyopencl-2016.1+git20161003/pyopencl/elementwise.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/elementwise.py	2016-12-05 17:46:14.000000000 +0000
@@ -835,6 +835,38 @@
 
 
 @context_dependent_memoize
+def get_unop_kernel(context, operator, res_dtype, in_dtype):
+    return get_elwise_kernel(context, [
+        VectorArg(res_dtype, "z", with_offset=True),
+        VectorArg(in_dtype, "y", with_offset=True),
+        ],
+        "z[i] = %s y[i]" % operator,
+        name="unary_op_kernel")
+
+
+@context_dependent_memoize
+def get_array_scalar_binop_kernel(context, operator, dtype_res, dtype_a, dtype_b):
+    return get_elwise_kernel(context, [
+        VectorArg(dtype_res, "out", with_offset=True),
+        VectorArg(dtype_a, "a", with_offset=True),
+        ScalarArg(dtype_b, "b"),
+        ],
+        "out[i] = a[i] %s b" % operator,
+        name="scalar_binop_kernel")
+
+
+@context_dependent_memoize
+def get_array_binop_kernel(context, operator, dtype_res, dtype_a, dtype_b):
+    return get_elwise_kernel(context, [
+        VectorArg(dtype_res, "out", with_offset=True),
+        VectorArg(dtype_a, "a", with_offset=True),
+        VectorArg(dtype_b, "b", with_offset=True),
+        ],
+        "out[i] = a[i] %s b[i]" % operator,
+        name="binop_kernel")
+
+
+@context_dependent_memoize
 def get_array_scalar_comparison_kernel(context, operator, dtype_a):
     return get_elwise_kernel(context, [
         VectorArg(np.int8, "out", with_offset=True),
diff -Nru pyopencl-2016.1+git20161003/pyopencl/__init__.py pyopencl-2016.1+git20161130/pyopencl/__init__.py
--- pyopencl-2016.1+git20161003/pyopencl/__init__.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/__init__.py	2016-12-05 17:46:14.000000000 +0000
@@ -194,7 +194,7 @@
     try:
         # Try to find the resource with pkg_resources (the recommended
         # setuptools approach)
-        return resource_filename(Requirement.parse("pyopencl2"), "pyopencl/cl")
+        return resource_filename(Requirement.parse("pyopencl"), "pyopencl/cl")
     except DistributionNotFound:
         # If pkg_resources can't find it (e.g. if the module is part of a
         # frozen application), try to find the include path in the same
@@ -488,6 +488,7 @@
     def __hash__(self):
         return hash(self._get_prg())
 
+
 _add_get_info_attrs(Program, Program.get_info, program_info)
 
 
@@ -636,6 +637,7 @@
 
     return Context(devices, cache_dir=cache_dir)
 
+
 _csc = create_some_context
 
 # }}}
diff -Nru pyopencl-2016.1+git20161003/pyopencl/_mymako.py pyopencl-2016.1+git20161130/pyopencl/_mymako.py
--- pyopencl-2016.1+git20161003/pyopencl/_mymako.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/_mymako.py	2016-12-05 17:46:14.000000000 +0000
@@ -1,6 +1,6 @@
 from __future__ import absolute_import
 try:
-    import mako.template
+    import mako.template  # noqa
 except ImportError:
     raise ImportError(
             "Some of PyOpenCL's facilities require the Mako templating engine.\n"
@@ -12,4 +12,4 @@
             "- aptitude install python-mako\n"
             "\nor whatever else is appropriate for your system.")
 
-from mako import *
+from mako import *  # noqa
diff -Nru pyopencl-2016.1+git20161003/pyopencl/scan.py pyopencl-2016.1+git20161130/pyopencl/scan.py
--- pyopencl-2016.1+git20161003/pyopencl/scan.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/scan.py	2016-12-05 17:46:14.000000000 +0000
@@ -742,6 +742,7 @@
     assert result <= val
     return result
 
+
 _PREFIX_WORDS = set("""
         ldata partial_scan_buffer global scan_offset
         segment_start_in_k_group carry
@@ -850,6 +851,7 @@
 
     return mako.template.Template(s, strict_undefined=True)
 
+
 from pytools import Record
 
 
@@ -1412,6 +1414,7 @@
 
 # }}}
 
+
 # {{{ debug kernel
 
 DEBUG_SCAN_TEMPLATE = SHARED_PREAMBLE + r"""//CL//
diff -Nru pyopencl-2016.1+git20161003/pyopencl/tools.py pyopencl-2016.1+git20161130/pyopencl/tools.py
--- pyopencl-2016.1+git20161003/pyopencl/tools.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/pyopencl/tools.py	2016-12-05 17:46:14.000000000 +0000
@@ -53,6 +53,7 @@
     get_or_register_dtype("cfloat_t", np.complex64)
     get_or_register_dtype("cdouble_t", np.complex128)
 
+
 _register_types()
 
 
@@ -97,6 +98,7 @@
         arg_dict[args] = result
         return result
 
+
 context_dependent_memoize = first_arg_dependent_memoize
 
 
@@ -159,6 +161,7 @@
     for cache in _first_arg_dependent_caches:
         cache.clear()
 
+
 import atexit
 atexit.register(clear_first_arg_caches)
 
diff -Nru pyopencl-2016.1+git20161003/setup.cfg pyopencl-2016.1+git20161130/setup.cfg
--- pyopencl-2016.1+git20161003/setup.cfg	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/setup.cfg	2016-12-05 17:46:14.000000000 +0000
@@ -1,3 +1,4 @@
 [flake8]
 ignore = E126,E127,E128,E123,E226,E241,E242,E265,W503,E402
 max-line-length=85
+exclude=pyopencl/compyte/ndarray,pyopencl/compyte/array.py
diff -Nru pyopencl-2016.1+git20161003/test/test_algorithm.py pyopencl-2016.1+git20161130/test/test_algorithm.py
--- pyopencl-2016.1+git20161003/test/test_algorithm.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/test/test_algorithm.py	2016-12-05 17:46:14.000000000 +0000
@@ -511,6 +511,7 @@
 
     return " ".join(entries)
 
+
 scan_test_counts = [
     10,
     2 ** 8 - 1,
diff -Nru pyopencl-2016.1+git20161003/test/test_array.py pyopencl-2016.1+git20161130/test/test_array.py
--- pyopencl-2016.1+git20161003/test/test_array.py	2016-10-28 20:03:03.000000000 +0000
+++ pyopencl-2016.1+git20161130/test/test_array.py	2016-12-05 17:46:14.000000000 +0000
@@ -497,6 +497,76 @@
     a_divide = (b_gpu / a_gpu).get()
     assert (np.abs(b / a - a_divide) < 1e-3).all()
 
+
+def test_bitwise(ctx_factory):
+    if _PYPY:
+        pytest.xfail("numpypy: missing bitwise ops")
+
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from itertools import product
+
+    dtypes = [np.dtype(t) for t in (np.int64, np.int32, np.int16, np.int8)]
+
+    from pyopencl.clrandom import rand as clrand
+
+    for a_dtype, b_dtype in product(dtypes, dtypes):
+        l = 16
+
+        np.random.seed(10)
+
+        int32_min = np.iinfo(np.int32).min
+        int32_max = np.iinfo(np.int32).max
+
+        a_dev = clrand(
+            queue, (l,), a=int32_min, b=1+int32_max, dtype=np.int64).astype(a_dtype)
+        b_dev = clrand(
+            queue, (l,), a=int32_min, b=1+int32_max, dtype=np.int64).astype(b_dtype)
+
+        a = a_dev.get()
+        b = b_dev.get()
+        s = int((clrand(queue, (), a=int32_min, b=1+int32_max, dtype=np.int64)
+                 .astype(b_dtype).get()))
+
+        import operator as o
+
+        for op in [o.and_, o.or_, o.xor]:
+            res_dev = op(a_dev, b_dev)
+            res = op(a, b)
+
+            assert (res_dev.get() == res).all()
+
+            res_dev = op(a_dev, s)
+            res = op(a, s)
+
+            assert (res_dev.get() == res).all()
+
+            res_dev = op(s, b_dev)
+            res = op(s, b)
+
+            assert (res_dev.get() == res).all()
+
+        for op in [o.iand, o.ior, o.ixor]:
+            res_dev = a_dev.copy()
+            op(res_dev, b_dev)
+            res = a.copy()
+            op(res, b)
+
+            assert (res_dev.get() == res).all()
+
+            res_dev = a_dev.copy()
+            op(res_dev, s)
+            res = a.copy()
+            op(res, s)
+
+            assert (res_dev.get() == res).all()
+
+        # Test unary ~
+        res_dev = ~a_dev
+        res = ~a
+        assert (res_dev.get() == res).all()
+
 # }}}