diff -Nru python-scrapy-1.4.0/artwork/README.rst python-scrapy-1.5.0/artwork/README.rst --- python-scrapy-1.4.0/artwork/README.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/artwork/README.rst 2017-12-29 21:09:52.000000000 +0000 @@ -10,10 +10,10 @@ Main Scrapy logo, in JPEG format. -qlassik.zip +qlassik.zip ----------- -Font used for Scrapy logo. Homepage: http://www.dafont.com/qlassik.font +Font used for Scrapy logo. Homepage: https://www.dafont.com/qlassik.font scrapy-blog.logo.xcf -------------------- diff -Nru python-scrapy-1.4.0/.bumpversion.cfg python-scrapy-1.5.0/.bumpversion.cfg --- python-scrapy-1.4.0/.bumpversion.cfg 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/.bumpversion.cfg 2017-12-29 21:09:52.000000000 +0000 @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.4.0 +current_version = 1.5.0 commit = True tag = True tag_name = {new_version} diff -Nru python-scrapy-1.4.0/CONTRIBUTING.md python-scrapy-1.5.0/CONTRIBUTING.md --- python-scrapy-1.4.0/CONTRIBUTING.md 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/CONTRIBUTING.md 2017-12-29 21:09:52.000000000 +0000 @@ -1,6 +1,6 @@ The guidelines for contributing are available here: -http://doc.scrapy.org/en/master/contributing.html +https://doc.scrapy.org/en/master/contributing.html Please do not abuse the issue tracker for support questions. If your issue topic can be rephrased to "How to ...?", please use the -support channels to get it answered: http://scrapy.org/community/ +support channels to get it answered: https://scrapy.org/community/ diff -Nru python-scrapy-1.4.0/debian/changelog python-scrapy-1.5.0/debian/changelog --- python-scrapy-1.4.0/debian/changelog 2017-08-29 20:06:51.000000000 +0000 +++ python-scrapy-1.5.0/debian/changelog 2018-01-21 20:30:28.000000000 +0000 @@ -1,3 +1,19 @@ +python-scrapy (1.5.0-1) unstable; urgency=low + + * Team upload. + * New upstream release. + * Always use pristine-tar. + * Bump debhelper compatibility and version to 11. + * Bump Standards-Version to 4.1.3. + * Run wrap-and-sort -bast to reduce diff size of future changes. + * Enable autopkgtest-pkg-python testsuite. + * Remove debian/python-scrapy.examples as upstream no longer provides + the related files. + * Build documentation in override_dh_sphinxdoc. + * Remove trailing whitespaces in previous changelog entries. + + -- Michael Fladischer Sun, 21 Jan 2018 21:30:28 +0100 + python-scrapy (1.4.0-1) unstable; urgency=low * Upload to unstable (Closes: #871385). @@ -263,7 +279,7 @@ - Removed references to scrapy-ctl.1 and scrapy-sqs.1. - Added scrapy.1 manpage. * debian/watch: - - Updated the file to point to the new PyPI repository. Thank you Janos + - Updated the file to point to the new PyPI repository. Thank you Janos Guljas for the patch. * debian/python-scrapy.bash-completion: - New file pointing to the bash-completion file distributed by upstream. @@ -291,7 +307,7 @@ - Bumped Standards-Version to 3.9.1. - Moved python-boto from Recommends to Depends as it is now needed by the scrapy-sqs script. - - Removed dependencies on python-beautifulsoup and python-clientform as + - Removed dependencies on python-beautifulsoup and python-clientform as the embedded modules in scrapy/xlib/ will be used instead. * debian/copyright: - Removed reference to deprecated /usr/share/common-licenses/BSD. 
diff -Nru python-scrapy-1.4.0/debian/compat python-scrapy-1.5.0/debian/compat --- python-scrapy-1.4.0/debian/compat 2017-08-29 20:06:51.000000000 +0000 +++ python-scrapy-1.5.0/debian/compat 2018-01-21 20:30:28.000000000 +0000 @@ -1 +1 @@ -9 +11 diff -Nru python-scrapy-1.4.0/debian/control python-scrapy-1.5.0/debian/control --- python-scrapy-1.4.0/debian/control 2017-08-29 20:06:51.000000000 +0000 +++ python-scrapy-1.5.0/debian/control 2018-01-21 20:30:28.000000000 +0000 @@ -1,71 +1,78 @@ Source: python-scrapy Maintainer: Debian Python Modules Team -Uploaders: Ignace Mouzannar +Uploaders: + Ignace Mouzannar , Section: python Priority: optional -Build-Depends: bash-completion, - debhelper (>= 9), - dh-python, - python-all, - python3-all -Build-Depends-Indep: libjs-jquery, - python-botocore, - python-jmespath, - python-lxml, - python-mock, - python-parsel (>= 1.1.0-1~), - python-pil, - python-pydispatch, - python-pytest, - python-queuelib, - python-setuptools, - python-testfixtures, - python-twisted-conch (>= 1:17.1.0), - python-twisted-core (>= 17.1.0), - python-twisted-web (>= 17.1.0), - python-w3lib (>= 1.16.0-1~), - python3-botocore, - python3-jmespath, - python3-lxml, - python3-mock, - python3-parsel (>= 1.1.0-1~), - python3-pil, - python3-pydispatch, - python3-pytest, - python3-queuelib, - python3-setuptools, - python3-sphinx, - python3-sphinx-rtd-theme, - python3-testfixtures, - python3-twisted (>= 17.1.0), - python3-w3lib (>= 1.16.0-1~) -Standards-Version: 4.1.0 +Build-Depends: + bash-completion, + debhelper (>= 11), + dh-python, + python-all, + python3-all, +Build-Depends-Indep: + libjs-jquery, + python-botocore, + python-jmespath, + python-lxml, + python-mock, + python-parsel (>= 1.1.0-1~), + python-pil, + python-pydispatch, + python-pytest, + python-queuelib, + python-setuptools, + python-testfixtures, + python-twisted-conch (>= 1:17.1.0), + python-twisted-core (>= 17.1.0), + python-twisted-web (>= 17.1.0), + python-w3lib (>= 1.16.0-1~), + python3-botocore, + python3-jmespath, + python3-lxml, + python3-mock, + python3-parsel (>= 1.1.0-1~), + python3-pil, + python3-pydispatch, + python3-pytest, + python3-queuelib, + python3-setuptools, + python3-sphinx, + python3-sphinx-rtd-theme, + python3-testfixtures, + python3-twisted (>= 17.1.0), + python3-w3lib (>= 1.16.0-1~), +Standards-Version: 4.1.3 Vcs-Browser: https://anonscm.debian.org/cgit/python-modules/packages/python-scrapy.git Vcs-Git: https://anonscm.debian.org/git/python-modules/packages/python-scrapy.git Homepage: http://scrapy.org/ X-Python-Version: >= 2.7 X-Python3-Version: >= 3.3 +Testsuite: autopkgtest-pkg-python Package: python-scrapy Architecture: all -Depends: python-boto, - python-cssselect, - python-libxml2, - python-lxml, - python-queuelib, - python-twisted-conch, - python-twisted-core, - python-twisted-mail, - python-twisted-web, - python-w3lib (>= 1.8), - ${misc:Depends}, - ${python:Depends} -Recommends: ipython, - python-mysqldb, - python-pil, - python-pygments -Suggests: python-openssl, - python-scrapy-doc (= ${binary:Version}) +Depends: + python-boto, + python-cssselect, + python-libxml2, + python-lxml, + python-queuelib, + python-twisted-conch, + python-twisted-core, + python-twisted-mail, + python-twisted-web, + python-w3lib (>= 1.8), + ${misc:Depends}, + ${python:Depends}, +Recommends: + ipython, + python-mysqldb, + python-pil, + python-pygments, +Suggests: + python-openssl, + python-scrapy-doc (= ${binary:Version}), Description: Python web scraping and crawling framework (Python 2) Scrapy is a fast high-level 
screen scraping and web crawling framework, used to crawl websites and extract structured data from their pages. @@ -74,40 +81,44 @@ . This package provides the scrapy module for Python 2. -Package: python3-scrapy +Package: python-scrapy-doc Architecture: all -Depends: python3-boto, - python3-cssselect, - python3-libxml2, - python3-lxml, - python3-queuelib, - python3-twisted, - python3-w3lib (>= 1.8), - ${misc:Depends}, - ${python3:Depends} -Recommends: ipython3, - python3-mysqldb, - python3-pil, - python3-pygments -Suggests: python-scrapy-doc (= ${binary:Version}), - python3-openssl -Description: Python web scraping and crawling framework (Python 3) +Section: doc +Depends: + ${misc:Depends}, + ${sphinxdoc:Depends}, +Description: Python web scraping and crawling framework documentation Scrapy is a fast high-level screen scraping and web crawling framework, used to crawl websites and extract structured data from their pages. It can be used for a wide range of purposes, from data mining to monitoring and automated testing. . - This package provides the scrapy module for Python 3. + This package provides the python-scrapy documentation in HTML format. -Package: python-scrapy-doc +Package: python3-scrapy Architecture: all -Section: doc -Depends: ${misc:Depends}, - ${sphinxdoc:Depends} -Description: Python web scraping and crawling framework documentation +Depends: + python3-boto, + python3-cssselect, + python3-libxml2, + python3-lxml, + python3-queuelib, + python3-twisted, + python3-w3lib (>= 1.8), + ${misc:Depends}, + ${python3:Depends}, +Recommends: + ipython3, + python3-mysqldb, + python3-pil, + python3-pygments, +Suggests: + python-scrapy-doc (= ${binary:Version}), + python3-openssl, +Description: Python web scraping and crawling framework (Python 3) Scrapy is a fast high-level screen scraping and web crawling framework, used to crawl websites and extract structured data from their pages. It can be used for a wide range of purposes, from data mining to monitoring and automated testing. . - This package provides the python-scrapy documentation in HTML format. + This package provides the scrapy module for Python 3. 
diff -Nru python-scrapy-1.4.0/debian/gbp.conf python-scrapy-1.5.0/debian/gbp.conf --- python-scrapy-1.4.0/debian/gbp.conf 2017-08-29 20:06:51.000000000 +0000 +++ python-scrapy-1.5.0/debian/gbp.conf 2018-01-21 20:30:28.000000000 +0000 @@ -1,2 +1,3 @@ [DEFAULT] debian-branch=debian/master +pristine-tar=True diff -Nru python-scrapy-1.4.0/debian/python-scrapy-doc.docs python-scrapy-1.5.0/debian/python-scrapy-doc.docs --- python-scrapy-1.4.0/debian/python-scrapy-doc.docs 2017-08-29 20:06:51.000000000 +0000 +++ python-scrapy-1.5.0/debian/python-scrapy-doc.docs 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -docs/build/html diff -Nru python-scrapy-1.4.0/debian/python-scrapy.examples python-scrapy-1.5.0/debian/python-scrapy.examples --- python-scrapy-1.4.0/debian/python-scrapy.examples 2017-08-29 20:06:51.000000000 +0000 +++ python-scrapy-1.5.0/debian/python-scrapy.examples 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -examples/* diff -Nru python-scrapy-1.4.0/debian/rules python-scrapy-1.5.0/debian/rules --- python-scrapy-1.4.0/debian/rules 2017-08-29 20:06:51.000000000 +0000 +++ python-scrapy-1.5.0/debian/rules 2018-01-21 20:30:28.000000000 +0000 @@ -5,10 +5,6 @@ export PYBUILD_BEFORE_TEST=cd {dir}/tests/keys; cat example-com.key.pem example-com.cert.pem >cert.pem export DH_ALWAYS_EXCLUDE=license.txt:_sources/:.buildinfo -LAST_CHANGE = $(shell dpkg-parsechangelog -S Date) -BUILD_DATE = $(shell LC_ALL=C date -u "+%B %d, %Y" -d "$(LAST_CHANGE)") -SPHINXOPTS := -D html_last_updated_fmt=\"$(BUILD_DATE)\" - %: dh $@ --with python2,python3,bash_completion,sphinxdoc --buildsystem=pybuild @@ -16,14 +12,19 @@ dh_auto_install mv debian/python-scrapy/usr/bin/scrapy debian/python-scrapy/usr/bin/python2-scrapy mv debian/python3-scrapy/usr/bin/scrapy debian/python3-scrapy/usr/bin/python3-scrapy - cd docs/ && $(MAKE) html SPHINXOPTS="$(SPHINXOPTS)" + +override_dh_sphinxdoc: +ifeq (,$(findstring nodoc, $(DEB_BUILD_OPTIONS))) + PYTHONPATH=. sphinx-build -b html -N docs/ $(CURDIR)/debian/python-scrapy-doc/usr/share/doc/python-scrapy-doc/html + dh_sphinxdoc +endif override_dh_install: dh_install -Xjquery.js -override_dh_auto_clean: - dh_auto_clean - cd docs/ && $(MAKE) clean +override_dh_clean: + rm -rf docs/.build + dh_clean override_dh_compress: dh_compress -X.js -Xobjects.inv diff -Nru python-scrapy-1.4.0/docs/conf.py python-scrapy-1.5.0/docs/conf.py --- python-scrapy-1.4.0/docs/conf.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/conf.py 2017-12-29 21:09:52.000000000 +0000 @@ -191,8 +191,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, document class [howto/manual]). latex_documents = [ - ('index', 'Scrapy.tex', ur'Scrapy Documentation', - ur'Scrapy developers', 'manual'), + ('index', 'Scrapy.tex', u'Scrapy Documentation', + u'Scrapy developers', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of diff -Nru python-scrapy-1.4.0/docs/contributing.rst python-scrapy-1.5.0/docs/contributing.rst --- python-scrapy-1.4.0/docs/contributing.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/contributing.rst 2017-12-29 21:09:52.000000000 +0000 @@ -7,7 +7,7 @@ .. important:: Double check you are reading the most recent version of this document at - http://doc.scrapy.org/en/master/contributing.html + https://doc.scrapy.org/en/master/contributing.html There are many ways to contribute to Scrapy. 
Here are some of them: @@ -19,12 +19,16 @@ the guidelines detailed in `Reporting bugs`_ below. * Submit patches for new functionality and/or bug fixes. Please read - `Writing patches`_ and `Submitting patches`_ below for details on how to + :ref:`writing-patches` and `Submitting patches`_ below for details on how to write and submit a patch. -* Join the `scrapy-users`_ mailing list and share your ideas on how to +* Join the `Scrapy subreddit`_ and share your ideas on how to improve Scrapy. We're always open to suggestions. +* Answer Scrapy questions at + `Stack Overflow `__. + + Reporting bugs ============== @@ -40,13 +44,18 @@ * check the :ref:`FAQ ` first to see if your issue is addressed in a well-known question -* check the `open issues`_ to see if it has already been reported. If it has, - don't dismiss the report but check the ticket history and comments, you may - find additional useful information to contribute. +* if you have a general question about scrapy usage, please ask it at + `Stack Overflow `__ + (use "scrapy" tag). -* search the `scrapy-users`_ list to see if it has been discussed there, or - if you're not sure if what you're seeing is a bug. You can also ask in the - `#scrapy` IRC channel. +* check the `open issues`_ to see if it has already been reported. If it has, + don't dismiss the report, but check the ticket history and comments. If you + have additional useful information, please leave a comment, or consider + :ref:`sending a pull request ` with a fix. + +* search the `scrapy-users`_ list and `Scrapy subreddit`_ to see if it has + been discussed there, or if you're not sure if what you're seeing is a bug. + You can also ask in the `#scrapy` IRC channel. * write **complete, reproducible, specific bug reports**. The smaller the test case, the better. Remember that other developers won't have your project to @@ -54,12 +63,20 @@ it. See for example StackOverflow's guide on creating a `Minimal, Complete, and Verifiable example`_ exhibiting the issue. +* the most awesome way to provide a complete reproducible example is to + send a pull request which adds a failing test case to the + Scrapy testing suite (see :ref:`submitting-patches`). + This is helpful even if you don't have an intention to + fix the issue yourselves. + * include the output of ``scrapy version -v`` so developers working on your bug know exactly which version and platform it occurred on, which is often very helpful for reproducing it, or knowing if it was already fixed. .. _Minimal, Complete, and Verifiable example: https://stackoverflow.com/help/mcve +.. _writing-patches: + Writing patches =============== @@ -83,6 +100,8 @@ the documentation changes in the same patch. See `Documentation policies`_ below. +.. _submitting-patches: + Submitting patches ================== @@ -98,13 +117,31 @@ and show that you have put some additional thought into the subject. A good starting point is to send a pull request on GitHub. It can be simple enough to illustrate your idea, and leave documentation/tests for later, after the idea -has been validated and proven useful. Alternatively, you can send an email to -`scrapy-users`_ to discuss your idea first. +has been validated and proven useful. Alternatively, you can start a +conversation in the `Scrapy subreddit`_ to discuss your idea first. + +Sometimes there is an existing pull request for the problem you'd like to +solve, which is stalled for some reason. 
Often the pull request is in a +right direction, but changes are requested by Scrapy maintainers, and the +original pull request author haven't had time to address them. +In this case consider picking up this pull request: open +a new pull request with all commits from the original pull request, as well as +additional changes to address the raised issues. Doing so helps a lot; it is +not considered rude as soon as the original author is acknowledged by keeping +his/her commits. + +You can pull an existing pull request to a local branch +by running ``git fetch upstream pull/$PR_NUMBER/head:$BRANCH_NAME_TO_CREATE`` +(replace 'upstream' with a remote name for scrapy repository, +``$PR_NUMBER`` with an ID of the pull request, and ``$BRANCH_NAME_TO_CREATE`` +with a name of the branch you want to create locally). +See also: https://help.github.com/articles/checking-out-pull-requests-locally/#modifying-an-inactive-pull-request-locally. + When writing GitHub pull requests, try to keep titles short but descriptive. E.g. For bug #411: "Scrapy hangs if an exception raises in start_requests" prefer "Fix hanging when exception occurs in start_requests (#411)" -instead of "Fix for #411". -Complete titles make it easy to skim through the issue tracker. +instead of "Fix for #411". Complete titles make it easy to skim through +the issue tracker. Finally, try to keep aesthetic changes (:pep:`8` compliance, unused imports removal, etc) in separate commits than functional changes. This will make pull @@ -121,21 +158,29 @@ * It's OK to use lines longer than 80 chars if it improves the code readability. -* Don't put your name in the code you contribute. Our policy is to keep - the contributor's name in the `AUTHORS`_ file distributed with Scrapy. +* Don't put your name in the code you contribute; git provides enough + metadata to identify author of the code. + See https://help.github.com/articles/setting-your-username-in-git/ for + setup instructions. Documentation policies ====================== * **Don't** use docstrings for documenting classes, or methods which are - already documented in the official (sphinx) documentation. For example, the - :meth:`ItemLoader.add_value` method should be documented in the sphinx - documentation, not its docstring. + already documented in the official (sphinx) documentation. Alternatively, + **do** provide a docstring, but make sure sphinx documentation uses + autodoc_ extension to pull the docstring. For example, the + :meth:`ItemLoader.add_value` method should be either + documented only in the sphinx documentation (not it a docstring), or + it should have a docstring which is pulled to sphinx documentation using + autodoc_ extension. * **Do** use docstrings for documenting functions not present in the official (sphinx) documentation, such as functions from ``scrapy.utils`` package and its sub-modules. +.. _autodoc: http://www.sphinx-doc.org/en/stable/ext/autodoc.html + Tests ===== @@ -188,6 +233,7 @@ .. _issue tracker: https://github.com/scrapy/scrapy/issues .. _scrapy-users: https://groups.google.com/forum/#!forum/scrapy-users +.. _Scrapy subreddit: https://reddit.com/r/scrapy .. _Twisted unit-testing framework: https://twistedmatrix.com/documents/current/core/development/policy/test-standard.html .. _AUTHORS: https://github.com/scrapy/scrapy/blob/master/AUTHORS .. 
_tests/: https://github.com/scrapy/scrapy/tree/master/tests diff -Nru python-scrapy-1.4.0/docs/faq.rst python-scrapy-1.5.0/docs/faq.rst --- python-scrapy-1.4.0/docs/faq.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/faq.rst 2017-12-29 21:09:52.000000000 +0000 @@ -21,7 +21,7 @@ In other words, comparing `BeautifulSoup`_ (or `lxml`_) to Scrapy is like comparing `jinja2`_ to `Django`_. -.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ +.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ .. _lxml: http://lxml.de/ .. _jinja2: http://jinja.pocoo.org/ .. _Django: https://www.djangoproject.com/ @@ -69,9 +69,11 @@ What Python versions does Scrapy support? ----------------------------------------- -Scrapy is supported under Python 2.7 and Python 3.3+. +Scrapy is supported under Python 2.7 and Python 3.4+ +under CPython (default Python implementation) and PyPy (starting with PyPy 5.9). Python 2.6 support was dropped starting at Scrapy 0.20. Python 3 support was added in Scrapy 1.1. +PyPy support was added in Scrapy 1.4, PyPy3 support was added in Scrapy 1.5. .. note:: For Python 3 support on Windows, it is recommended to use diff -Nru python-scrapy-1.4.0/docs/index.rst python-scrapy-1.5.0/docs/index.rst --- python-scrapy-1.4.0/docs/index.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/index.rst 2017-12-29 21:09:52.000000000 +0000 @@ -13,14 +13,14 @@ * Try the :doc:`FAQ ` -- it's got answers to some common questions. * Looking for specific information? Try the :ref:`genindex` or :ref:`modindex`. -* Ask or search questions in `StackOverflow using the scrapy tag`_, -* Search for information in the `archives of the scrapy-users mailing list`_, or - `post a question`_. +* Ask or search questions in `StackOverflow using the scrapy tag`_. +* Ask or search questions in the `Scrapy subreddit`_. +* Search for questions on the archives of the `scrapy-users mailing list`_. * Ask a question in the `#scrapy IRC channel`_, * Report bugs with Scrapy in our `issue tracker`_. -.. _archives of the scrapy-users mailing list: https://groups.google.com/forum/#!forum/scrapy-users -.. _post a question: https://groups.google.com/forum/#!forum/scrapy-users +.. _scrapy-users mailing list: https://groups.google.com/forum/#!forum/scrapy-users +.. _Scrapy subreddit: https://www.reddit.com/r/scrapy/ .. _StackOverflow using the scrapy tag: https://stackoverflow.com/tags/scrapy .. _#scrapy IRC channel: irc://irc.freenode.net/scrapy .. _issue tracker: https://github.com/scrapy/scrapy/issues diff -Nru python-scrapy-1.4.0/docs/intro/install.rst python-scrapy-1.5.0/docs/intro/install.rst --- python-scrapy-1.4.0/docs/intro/install.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/intro/install.rst 2017-12-29 21:09:52.000000000 +0000 @@ -7,7 +7,8 @@ Installing Scrapy ================= -Scrapy runs on Python 2.7 and Python 3.3 or above. +Scrapy runs on Python 2.7 and Python 3.4 or above +under CPython (default Python implementation) and PyPy (starting with PyPy 5.9). If you're using `Anaconda`_ or `Miniconda`_, you can install the package from the `conda-forge`_ channel, which has up-to-date packages for Linux, Windows @@ -107,7 +108,7 @@ .. _virtualenv: https://virtualenv.pypa.io .. _virtualenv installation instructions: https://virtualenv.pypa.io/en/stable/installation/ -.. _virtualenvwrapper: http://virtualenvwrapper.readthedocs.io/en/latest/install.html +.. _virtualenvwrapper: https://virtualenvwrapper.readthedocs.io/en/latest/install.html .. 
_user guide: https://virtualenv.pypa.io/en/stable/userguide/ @@ -132,12 +133,12 @@ .. _intro-install-ubuntu: -Ubuntu 12.04 or above +Ubuntu 14.04 or above --------------------- Scrapy is currently tested with recent-enough versions of lxml, twisted and pyOpenSSL, and is compatible with recent Ubuntu distributions. -But it should support older versions of Ubuntu too, like Ubuntu 12.04, +But it should support older versions of Ubuntu too, like Ubuntu 14.04, albeit with potential issues with TLS connections. **Don't** use the ``python-scrapy`` package provided by Ubuntu, they are @@ -163,8 +164,8 @@ pip install scrapy .. note:: - The same non-python dependencies can be used to install Scrapy in Debian - Wheezy (7.0) and above. + The same non-Python dependencies can be used to install Scrapy in Debian + Jessie (8.0) and above. .. _intro-install-macos: @@ -188,7 +189,7 @@ that doesn't conflict with the rest of your system. Here's how to do it using the `homebrew`_ package manager: - * Install `homebrew`_ following the instructions in http://brew.sh/ + * Install `homebrew`_ following the instructions in https://brew.sh/ * Update your ``PATH`` variable to state that homebrew packages should be used before system packages (Change ``.bashrc`` to ``.zshrc`` accordantly @@ -223,6 +224,29 @@ pip install Scrapy +PyPy +---- + +We recommend using the latest PyPy version. The version tested is 5.9.0. +For PyPy3, only Linux installation was tested. + +Most scrapy dependencides now have binary wheels for CPython, but not for PyPy. +This means that these dependecies will be built during installation. +On OS X, you are likely to face an issue with building Cryptography dependency, +solution to this problem is described +`here `_, +that is to ``brew install openssl`` and then export the flags that this command +recommends (only needed when installing scrapy). Installing on Linux has no special +issues besides installing build dependencies. +Installing scrapy with PyPy on Windows is not tested. + +You can check that scrapy is installed correctly by running ``scrapy bench``. +If this command gives errors such as +``TypeError: ... got 2 unexpected keyword arguments``, this means +that setuptools was unable to pick up one PyPy-specific dependency. +To fix this issue, run ``pip install 'PyPyDispatcher>=2.1.0'``. + + .. _Python: https://www.python.org/ .. _pip: https://pip.pypa.io/en/latest/installing/ .. _lxml: http://lxml.de/ @@ -233,9 +257,9 @@ .. _pyOpenSSL: https://pypi.python.org/pypi/pyOpenSSL .. _setuptools: https://pypi.python.org/pypi/setuptools .. _AUR Scrapy package: https://aur.archlinux.org/packages/scrapy/ -.. _homebrew: http://brew.sh/ -.. _zsh: http://www.zsh.org/ -.. _Scrapinghub: http://scrapinghub.com -.. _Anaconda: http://docs.continuum.io/anaconda/index -.. _Miniconda: http://conda.pydata.org/docs/install/quick.html -.. _conda-forge: https://conda-forge.github.io/ +.. _homebrew: https://brew.sh/ +.. _zsh: https://www.zsh.org/ +.. _Scrapinghub: https://scrapinghub.com +.. _Anaconda: https://docs.anaconda.com/anaconda/ +.. _Miniconda: https://conda.io/docs/user-guide/install/index.html +.. _conda-forge: https://conda-forge.org/ diff -Nru python-scrapy-1.4.0/docs/intro/overview.rst python-scrapy-1.5.0/docs/intro/overview.rst --- python-scrapy-1.4.0/docs/intro/overview.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/intro/overview.rst 2017-12-29 21:09:52.000000000 +0000 @@ -160,8 +160,8 @@ a full-blown Scrapy project and `join the community`_. Thanks for your interest! -.. 
_join the community: http://scrapy.org/community/ +.. _join the community: https://scrapy.org/community/ .. _web scraping: https://en.wikipedia.org/wiki/Web_scraping .. _Amazon Associates Web Services: https://affiliate-program.amazon.com/gp/advertising/api/detail/main.html .. _Amazon S3: https://aws.amazon.com/s3/ -.. _Sitemaps: http://www.sitemaps.org +.. _Sitemaps: https://www.sitemaps.org/index.html diff -Nru python-scrapy-1.4.0/docs/intro/tutorial.rst python-scrapy-1.5.0/docs/intro/tutorial.rst --- python-scrapy-1.4.0/docs/intro/tutorial.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/intro/tutorial.rst 2017-12-29 21:09:52.000000000 +0000 @@ -34,7 +34,7 @@ .. _this list of Python resources for non-programmers: https://wiki.python.org/moin/BeginnersGuide/NonProgrammers .. _Dive Into Python 3: http://www.diveintopython3.net .. _Python Tutorial: https://docs.python.org/3/tutorial -.. _Learn Python The Hard Way: http://learnpythonthehardway.org/book/ +.. _Learn Python The Hard Way: https://learnpythonthehardway.org/book/ Creating a project @@ -54,6 +54,8 @@ __init__.py items.py # project items definition file + + middlewares.py # project middlewares file pipelines.py # project pipelines file @@ -452,7 +454,7 @@ its contents. If you run this command twice without removing the file before the second time, you'll end up with a broken JSON file. -You can also used other formats, like `JSON Lines`_:: +You can also use other formats, like `JSON Lines`_:: scrapy crawl quotes -o quotes.jl diff -Nru python-scrapy-1.4.0/docs/news.rst python-scrapy-1.5.0/docs/news.rst --- python-scrapy-1.4.0/docs/news.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/news.rst 2017-12-29 21:09:52.000000000 +0000 @@ -3,6 +3,118 @@ Release notes ============= +Scrapy 1.5.0 (2017-12-29) +------------------------- + +This release brings small new features and improvements across the codebase. +Some highlights: + +* Google Cloud Storage is supported in FilesPipeline and ImagesPipeline. +* Crawling with proxy servers becomes more efficient, as connections + to proxies can be reused now. +* Warnings, exception and logging messages are improved to make debugging + easier. +* ``scrapy parse`` command now allows to set custom request meta via + ``--meta`` argument. +* Compatibility with Python 3.6, PyPy and PyPy3 is improved; + PyPy and PyPy3 are now supported officially, by running tests on CI. +* Better default handling of HTTP 308, 522 and 524 status codes. +* Documentation is improved, as usual. + +Backwards Incompatible Changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Scrapy 1.5 drops support for Python 3.3. +* Default Scrapy User-Agent now uses https link to scrapy.org (:issue:`2983`). + **This is technically backwards-incompatible**; override + :setting:`USER_AGENT` if you relied on old value. +* Logging of settings overridden by ``custom_settings`` is fixed; + **this is technically backwards-incompatible** because the logger + changes from ``[scrapy.utils.log]`` to ``[scrapy.crawler]``. If you're + parsing Scrapy logs, please update your log parsers (:issue:`1343`). +* LinkExtractor now ignores ``m4v`` extension by default, this is change + in behavior. +* 522 and 524 status codes are added to ``RETRY_HTTP_CODES`` (:issue:`2851`) + +New features +~~~~~~~~~~~~ + +- Support ```` tags in ``Response.follow`` (:issue:`2785`) +- Support for ``ptpython`` REPL (:issue:`2654`) +- Google Cloud Storage support for FilesPipeline and ImagesPipeline + (:issue:`2923`). 
+- New ``--meta`` option of the "scrapy parse" command allows to pass additional + request.meta (:issue:`2883`) +- Populate spider variable when using ``shell.inspect_response`` (:issue:`2812`) +- Handle HTTP 308 Permanent Redirect (:issue:`2844`) +- Add 522 and 524 to ``RETRY_HTTP_CODES`` (:issue:`2851`) +- Log versions information at startup (:issue:`2857`) +- ``scrapy.mail.MailSender`` now works in Python 3 (it requires Twisted 17.9.0) +- Connections to proxy servers are reused (:issue:`2743`) +- Add template for a downloader middleware (:issue:`2755`) +- Explicit message for NotImplementedError when parse callback not defined + (:issue:`2831`) +- CrawlerProcess got an option to disable installation of root log handler + (:issue:`2921`) +- LinkExtractor now ignores ``m4v`` extension by default +- Better log messages for responses over :setting:`DOWNLOAD_WARNSIZE` and + :setting:`DOWNLOAD_MAXSIZE` limits (:issue:`2927`) +- Show warning when a URL is put to ``Spider.allowed_domains`` instead of + a domain (:issue:`2250`). + +Bug fixes +~~~~~~~~~ + +- Fix logging of settings overridden by ``custom_settings``; + **this is technically backwards-incompatible** because the logger + changes from ``[scrapy.utils.log]`` to ``[scrapy.crawler]``, so please + update your log parsers if needed (:issue:`1343`) +- Default Scrapy User-Agent now uses https link to scrapy.org (:issue:`2983`). + **This is technically backwards-incompatible**; override + :setting:`USER_AGENT` if you relied on old value. +- Fix PyPy and PyPy3 test failures, support them officially + (:issue:`2793`, :issue:`2935`, :issue:`2990`, :issue:`3050`, :issue:`2213`, + :issue:`3048`) +- Fix DNS resolver when ``DNSCACHE_ENABLED=False`` (:issue:`2811`) +- Add ``cryptography`` for Debian Jessie tox test env (:issue:`2848`) +- Add verification to check if Request callback is callable (:issue:`2766`) +- Port ``extras/qpsclient.py`` to Python 3 (:issue:`2849`) +- Use getfullargspec under the scenes for Python 3 to stop DeprecationWarning + (:issue:`2862`) +- Update deprecated test aliases (:issue:`2876`) +- Fix ``SitemapSpider`` support for alternate links (:issue:`2853`) + +Docs +~~~~ + +- Added missing bullet point for the ``AUTOTHROTTLE_TARGET_CONCURRENCY`` + setting. 
(:issue:`2756`) +- Update Contributing docs, document new support channels + (:issue:`2762`, issue:`3038`) +- Include references to Scrapy subreddit in the docs +- Fix broken links; use https:// for external links + (:issue:`2978`, :issue:`2982`, :issue:`2958`) +- Document CloseSpider extension better (:issue:`2759`) +- Use ``pymongo.collection.Collection.insert_one()`` in MongoDB example + (:issue:`2781`) +- Spelling mistake and typos + (:issue:`2828`, :issue:`2837`, :issue:`#2884`, :issue:`2924`) +- Clarify ``CSVFeedSpider.headers`` documentation (:issue:`2826`) +- Document ``DontCloseSpider`` exception and clarify ``spider_idle`` + (:issue:`2791`) +- Update "Releases" section in README (:issue:`2764`) +- Fix rst syntax in ``DOWNLOAD_FAIL_ON_DATALOSS`` docs (:issue:`2763`) +- Small fix in description of startproject arguments (:issue:`2866`) +- Clarify data types in Response.body docs (:issue:`2922`) +- Add a note about ``request.meta['depth']`` to DepthMiddleware docs (:issue:`2374`) +- Add a note about ``request.meta['dont_merge_cookies']`` to CookiesMiddleware + docs (:issue:`2999`) +- Up-to-date example of project structure (:issue:`2964`, :issue:`2976`) +- A better example of ItemExporters usage (:issue:`2989`) +- Document ``from_crawler`` methods for spider and downloader middlewares + (:issue:`3019`) + + Scrapy 1.4.0 (2017-05-18) ------------------------- @@ -12,7 +124,7 @@ Scrapy now supports anonymous FTP sessions with customizable user and password via the new :setting:`FTP_USER` and :setting:`FTP_PASSWORD` settings. And if you're using Twisted version 17.1.0 or above, FTP is now available -with Python 3. +with Python 3. There's a new :meth:`response.follow ` method for creating requests; **it is now a recommended way to create Requests @@ -407,7 +519,7 @@ - ``canonicalize_url`` has been moved to `w3lib.url`_ (:issue:`2168`). -.. _w3lib.url: http://w3lib.readthedocs.io/en/latest/w3lib.html#w3lib.url.canonicalize_url +.. _w3lib.url: https://w3lib.readthedocs.io/en/latest/w3lib.html#w3lib.url.canonicalize_url Tests & Requirements ~~~~~~~~~~~~~~~~~~~~ diff -Nru python-scrapy-1.4.0/docs/topics/autothrottle.rst python-scrapy-1.5.0/docs/topics/autothrottle.rst --- python-scrapy-1.4.0/docs/topics/autothrottle.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/autothrottle.rst 2017-12-29 21:09:52.000000000 +0000 @@ -88,6 +88,7 @@ * :setting:`AUTOTHROTTLE_ENABLED` * :setting:`AUTOTHROTTLE_START_DELAY` * :setting:`AUTOTHROTTLE_MAX_DELAY` +* :setting:`AUTOTHROTTLE_TARGET_CONCURRENCY` * :setting:`AUTOTHROTTLE_DEBUG` * :setting:`CONCURRENT_REQUESTS_PER_DOMAIN` * :setting:`CONCURRENT_REQUESTS_PER_IP` diff -Nru python-scrapy-1.4.0/docs/topics/broad-crawls.rst python-scrapy-1.5.0/docs/topics/broad-crawls.rst --- python-scrapy-1.4.0/docs/topics/broad-crawls.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/broad-crawls.rst 2017-12-29 21:09:52.000000000 +0000 @@ -20,7 +20,7 @@ * they crawl many domains (often, unbounded) instead of a specific set of sites -* they don't necessarily crawl domains to completion, because it would +* they don't necessarily crawl domains to completion, because it would be impractical (or impossible) to do so, and instead limit the crawl by time or number of pages crawled @@ -85,8 +85,8 @@ get and any errors found. These stats are reported by Scrapy when using the ``INFO`` log level. 
In order to save CPU (and log storage requirements) you should not use ``DEBUG`` log level when preforming large broad crawls in -production. Using ``DEBUG`` level when developing your (broad) crawler may fine -though. +production. Using ``DEBUG`` level when developing your (broad) crawler may be +fine though. To set the log level use:: diff -Nru python-scrapy-1.4.0/docs/topics/commands.rst python-scrapy-1.5.0/docs/topics/commands.rst --- python-scrapy-1.4.0/docs/topics/commands.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/commands.rst 2017-12-29 21:09:52.000000000 +0000 @@ -55,6 +55,7 @@ myproject/ __init__.py items.py + middlewares.py pipelines.py settings.py spiders/ @@ -187,7 +188,7 @@ Creates a new Scrapy project named ``project_name``, under the ``project_dir`` directory. -If ``project_dir`` wasn't specified, ``project_dir`` will be the same as ``myproject``. +If ``project_dir`` wasn't specified, ``project_dir`` will be the same as ``project_name``. Usage example:: @@ -430,6 +431,9 @@ * ``--callback`` or ``-c``: spider method to use as callback for parsing the response +* ``--meta`` or ``-m``: additional request meta that will be passed to the callback + request. This must be a valid json string. Example: --meta='{"foo" : "bar"}' + * ``--pipelines``: process items through pipelines * ``--rules`` or ``-r``: use :class:`~scrapy.spiders.CrawlSpider` @@ -543,7 +547,7 @@ COMMANDS_MODULE = 'mybot.commands' -.. _Deploying your project: http://scrapyd.readthedocs.org/en/latest/deploy.html +.. _Deploying your project: https://scrapyd.readthedocs.io/en/latest/deploy.html Register commands via setup.py entry points ------------------------------------------- diff -Nru python-scrapy-1.4.0/docs/topics/debug.rst python-scrapy-1.5.0/docs/topics/debug.rst --- python-scrapy-1.4.0/docs/topics/debug.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/debug.rst 2017-12-29 21:09:52.000000000 +0000 @@ -142,4 +142,4 @@ For more information, check the :ref:`topics-logging` section. -.. _base tag: http://www.w3schools.com/tags/tag_base.asp +.. _base tag: https://www.w3schools.com/tags/tag_base.asp diff -Nru python-scrapy-1.4.0/docs/topics/deploy.rst python-scrapy-1.5.0/docs/topics/deploy.rst --- python-scrapy-1.4.0/docs/topics/deploy.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/deploy.rst 2017-12-29 21:09:52.000000000 +0000 @@ -50,10 +50,10 @@ just like ``scrapyd-deploy``. .. _Scrapyd: https://github.com/scrapy/scrapyd -.. _Deploying your project: https://scrapyd.readthedocs.org/en/latest/deploy.html -.. _Scrapy Cloud: http://scrapinghub.com/scrapy-cloud/ +.. _Deploying your project: https://scrapyd.readthedocs.io/en/latest/deploy.html +.. _Scrapy Cloud: https://scrapinghub.com/scrapy-cloud .. _scrapyd-client: https://github.com/scrapy/scrapyd-client -.. _shub: http://doc.scrapinghub.com/shub.html -.. _scrapyd-deploy documentation: http://scrapyd.readthedocs.org/en/latest/deploy.html -.. _Scrapy Cloud documentation: http://doc.scrapinghub.com/scrapy-cloud.html -.. _Scrapinghub: http://scrapinghub.com/ +.. _shub: https://doc.scrapinghub.com/shub.html +.. _scrapyd-deploy documentation: https://scrapyd.readthedocs.io/en/latest/deploy.html +.. _Scrapy Cloud documentation: https://doc.scrapinghub.com/scrapy-cloud.html +.. 
_Scrapinghub: https://scrapinghub.com/ diff -Nru python-scrapy-1.4.0/docs/topics/downloader-middleware.rst python-scrapy-1.5.0/docs/topics/downloader-middleware.rst --- python-scrapy-1.4.0/docs/topics/downloader-middleware.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/downloader-middleware.rst 2017-12-29 21:09:52.000000000 +0000 @@ -157,6 +157,17 @@ :param spider: the spider for which this request is intended :type spider: :class:`~scrapy.spiders.Spider` object + .. method:: from_crawler(cls, crawler) + + If present, this classmethod is called to create a middleware instance + from a :class:`~scrapy.crawler.Crawler`. It must return a new instance + of the middleware. Crawler object provides access to all Scrapy core + components like settings and signals; it is a way for middleware to + access them and hook its functionality into Scrapy. + + :param crawler: crawler that uses this middleware + :type crawler: :class:`~scrapy.crawler.Crawler` object + .. _topics-downloader-middleware-ref: Built-in downloader middleware reference @@ -226,6 +237,17 @@ Whether to enable the cookies middleware. If disabled, no cookies will be sent to web servers. +Notice that if the :class:`~scrapy.http.Request` +has ``meta['dont_merge_cookies']`` evaluated to ``True``. +despite the value of :setting:`COOKIES_ENABLED` the cookies will **not** be +sent to web servers and received cookies in +:class:`~scrapy.http.Response` will **not** be merged with the existing +cookies. + +For more detailed information see the ``cookies`` parameter in +:class:`~scrapy.http.Request` + + .. setting:: COOKIES_DEBUG COOKIES_DEBUG diff -Nru python-scrapy-1.4.0/docs/topics/email.rst python-scrapy-1.5.0/docs/topics/email.rst --- python-scrapy-1.4.0/docs/topics/email.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/email.rst 2017-12-29 21:09:52.000000000 +0000 @@ -54,10 +54,10 @@ :param smtpuser: the SMTP user. If omitted, the :setting:`MAIL_USER` setting will be used. If not given, no SMTP authentication will be performed. - :type smtphost: str + :type smtphost: str or bytes :param smtppass: the SMTP pass for authentication. - :type smtppass: str + :type smtppass: str or bytes :param smtpport: the SMTP port to connect to :type smtpport: int diff -Nru python-scrapy-1.4.0/docs/topics/exceptions.rst python-scrapy-1.5.0/docs/topics/exceptions.rst --- python-scrapy-1.4.0/docs/topics/exceptions.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/exceptions.rst 2017-12-29 21:09:52.000000000 +0000 @@ -39,6 +39,14 @@ if 'Bandwidth exceeded' in response.body: raise CloseSpider('bandwidth_exceeded') +DontCloseSpider +--------------- + +.. exception:: DontCloseSpider + +This exception can be raised in a :signal:`spider_idle` signal handler to +prevent the spider from being closed. + IgnoreRequest ------------- diff -Nru python-scrapy-1.4.0/docs/topics/exporters.rst python-scrapy-1.5.0/docs/topics/exporters.rst --- python-scrapy-1.4.0/docs/topics/exporters.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/exporters.rst 2017-12-29 21:09:52.000000000 +0000 @@ -36,38 +36,36 @@ 3. 
and finally call the :meth:`~BaseItemExporter.finish_exporting` to signal the end of the exporting process -Here you can see an :doc:`Item Pipeline ` which uses an Item -Exporter to export scraped items to different files, one per spider:: - - from scrapy import signals - from scrapy.exporters import XmlItemExporter - - class XmlExportPipeline(object): - - def __init__(self): - self.files = {} - - @classmethod - def from_crawler(cls, crawler): - pipeline = cls() - crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) - crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) - return pipeline - - def spider_opened(self, spider): - file = open('%s_products.xml' % spider.name, 'w+b') - self.files[spider] = file - self.exporter = XmlItemExporter(file) - self.exporter.start_exporting() - - def spider_closed(self, spider): - self.exporter.finish_exporting() - file = self.files.pop(spider) - file.close() - - def process_item(self, item, spider): - self.exporter.export_item(item) - return item +Here you can see an :doc:`Item Pipeline ` which uses multiple +Item Exporters to group scraped items to different files according to the +value of one of their fields:: + + from scrapy.exporters import XmlItemExporter + + class PerYearXmlExportPipeline(object): + """Distribute items across multiple XML files according to their 'year' field""" + + def open_spider(self, spider): + self.year_to_exporter = {} + + def close_spider(self, spider): + for exporter in self.year_to_exporter.values(): + exporter.finish_exporting() + exporter.file.close() + + def _exporter_for_item(self, item): + year = item['year'] + if year not in self.year_to_exporter: + f = open('{}.xml'.format(year), 'wb') + exporter = XmlItemExporter(f) + exporter.start_exporting() + self.year_to_exporter[year] = exporter + return self.year_to_exporter[year] + + def process_item(self, item, spider): + exporter = self._exporter_for_item(item) + exporter.export_item(item) + return item .. _topics-exporters-field-serialization: diff -Nru python-scrapy-1.4.0/docs/topics/extensions.rst python-scrapy-1.5.0/docs/topics/extensions.rst --- python-scrapy-1.4.0/docs/topics/extensions.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/extensions.rst 2017-12-29 21:09:52.000000000 +0000 @@ -277,9 +277,11 @@ Default: ``0`` An integer which specifies a number of items. If the spider scrapes more than -that amount if items and those items are passed by the item pipeline, the -spider will be closed with the reason ``closespider_itemcount``. If zero (or -non set), spiders won't be closed by number of passed items. +that amount and those items are passed by the item pipeline, the +spider will be closed with the reason ``closespider_itemcount``. +Requests which are currently in the downloader queue (up to +:setting:`CONCURRENT_REQUESTS` requests) are still processed. +If zero (or non set), spiders won't be closed by number of passed items. .. setting:: CLOSESPIDER_PAGECOUNT @@ -371,4 +373,4 @@ This extension only works on POSIX-compliant platforms (ie. not Windows). .. _Python debugger: https://docs.python.org/2/library/pdb.html -.. _Debugging in Python: http://www.ferg.org/papers/debugging_in_python.html +.. 
_Debugging in Python: https://pythonconquerstheuniverse.wordpress.com/2009/09/10/debugging-in-python/ diff -Nru python-scrapy-1.4.0/docs/topics/firebug.rst python-scrapy-1.5.0/docs/topics/firebug.rst --- python-scrapy-1.4.0/docs/topics/firebug.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/firebug.rst 2017-12-29 21:09:52.000000000 +0000 @@ -23,7 +23,7 @@ Project`_ used in the :ref:`tutorial ` but with a different face. -.. _Firebug: http://getfirebug.com +.. _Firebug: https://getfirebug.com/ .. _Google Directory: http://directory.google.com/ .. _Open Directory Project: http://www.dmoz.org diff -Nru python-scrapy-1.4.0/docs/topics/firefox.rst python-scrapy-1.5.0/docs/topics/firefox.rst --- python-scrapy-1.4.0/docs/topics/firefox.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/firefox.rst 2017-12-29 21:09:52.000000000 +0000 @@ -17,7 +17,7 @@ after applying some browser clean up and executing Javascript code. Firefox, in particular, is known for adding ```` elements to tables. Scrapy, on the other hand, does not modify the original page HTML, so you won't be able to -extract any data if you use ```` in your XPath expressions. +extract any data if you use ```` in your XPath expressions. Therefore, you should keep in mind the following things when working with Firefox and XPath: @@ -71,11 +71,11 @@ `Firecookie`_ makes it easier to view and manage cookies. You can use this extension to create a new cookie, delete existing cookies, see a list of cookies -for the current site, manage cookies permissions and a lot more. +for the current site, manage cookies permissions and a lot more. -.. _Firebug: http://getfirebug.com +.. _Firebug: https://getfirebug.com/ .. _Inspect Element: https://www.youtube.com/watch?v=-pT_pDe54aA -.. _XPather: https://addons.mozilla.org/en-US/firefox/addon/xpather/ +.. _XPather: https://addons.mozilla.org/en-US/firefox/addon/xpather/ .. _XPath Checker: https://addons.mozilla.org/en-US/firefox/addon/xpath-checker/ .. _Tamper Data: https://addons.mozilla.org/en-US/firefox/addon/tamper-data/ .. _Firecookie: https://addons.mozilla.org/en-US/firefox/addon/firecookie/ diff -Nru python-scrapy-1.4.0/docs/topics/item-pipeline.rst python-scrapy-1.5.0/docs/topics/item-pipeline.rst --- python-scrapy-1.4.0/docs/topics/item-pipeline.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/item-pipeline.rst 2017-12-29 21:09:52.000000000 +0000 @@ -156,7 +156,7 @@ self.client.close() def process_item(self, item, spider): - self.db[self.collection_name].insert(dict(item)) + self.db[self.collection_name].insert_one(dict(item)) return item .. _MongoDB: https://www.mongodb.org/ @@ -208,7 +208,7 @@ item["screenshot_filename"] = filename return item -.. _Splash: http://splash.readthedocs.io/en/stable/ +.. _Splash: https://splash.readthedocs.io/en/stable/ .. 
_Deferred: https://twistedmatrix.com/documents/current/core/howto/defer.html Duplicates filter diff -Nru python-scrapy-1.4.0/docs/topics/items.rst python-scrapy-1.5.0/docs/topics/items.rst --- python-scrapy-1.4.0/docs/topics/items.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/items.rst 2017-12-29 21:09:52.000000000 +0000 @@ -21,7 +21,7 @@ Various Scrapy components use extra information provided by Items: exporters look at declared fields to figure out columns to export, serialization can be customized using Item fields metadata, :mod:`trackref` -tracks Item instances to help finding memory leaks +tracks Item instances to help find memory leaks (see :ref:`topics-leaks-trackrefs`), etc. .. _dictionary-like: https://docs.python.org/2/library/stdtypes.html#dict diff -Nru python-scrapy-1.4.0/docs/topics/jobs.rst python-scrapy-1.5.0/docs/topics/jobs.rst --- python-scrapy-1.4.0/docs/topics/jobs.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/jobs.rst 2017-12-29 21:09:52.000000000 +0000 @@ -100,4 +100,4 @@ :setting:`SCHEDULER_DEBUG` setting to ``True`` in the project's settings page. It is ``False`` by default. -.. _pickle: http://docs.python.org/library/pickle.html +.. _pickle: https://docs.python.org/library/pickle.html diff -Nru python-scrapy-1.4.0/docs/topics/loaders.rst python-scrapy-1.5.0/docs/topics/loaders.rst --- python-scrapy-1.4.0/docs/topics/loaders.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/loaders.rst 2017-12-29 21:09:52.000000000 +0000 @@ -518,8 +518,8 @@ Example:: diff -Nru python-scrapy-1.4.0/docs/topics/logging.rst python-scrapy-1.5.0/docs/topics/logging.rst --- python-scrapy-1.4.0/docs/topics/logging.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/logging.rst 2017-12-29 21:09:52.000000000 +0000 @@ -102,7 +102,7 @@ class MySpider(scrapy.Spider): name = 'myspider' - start_urls = ['http://scrapinghub.com'] + start_urls = ['https://scrapinghub.com'] def parse(self, response): self.logger.info('Parse function called on %s', response.url) @@ -118,7 +118,7 @@ class MySpider(scrapy.Spider): name = 'myspider' - start_urls = ['http://scrapinghub.com'] + start_urls = ['https://scrapinghub.com'] def parse(self, response): logger.info('Parse function called on %s', response.url) diff -Nru python-scrapy-1.4.0/docs/topics/media-pipeline.rst python-scrapy-1.5.0/docs/topics/media-pipeline.rst --- python-scrapy-1.4.0/docs/topics/media-pipeline.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/media-pipeline.rst 2017-12-29 21:09:52.000000000 +0000 @@ -15,7 +15,8 @@ Both pipelines implement these features: * Avoid re-downloading media that was downloaded recently -* Specifying where to store the media (filesystem directory, Amazon S3 bucket) +* Specifying where to store the media (filesystem directory, Amazon S3 bucket, + Google Cloud Storage bucket) The Images Pipeline has a few extra functions for processing images: @@ -116,10 +117,11 @@ Supported Storage ================= -File system is currently the only officially supported storage, but there is -also support for storing files in `Amazon S3`_. +File system is currently the only officially supported storage, but there are +also support for storing files in `Amazon S3`_ and `Google Cloud Storage`_. .. _Amazon S3: https://aws.amazon.com/s3/ +.. 
_Google Cloud Storage: https://cloud.google.com/storage/ File system storage ------------------- @@ -169,7 +171,26 @@ For more information, see `canned ACLs`_ in the Amazon S3 Developer Guide. -.. _canned ACLs: http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl +.. _canned ACLs: https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl + +Google Cloud Storage +--------------------- + +.. setting:: GCS_PROJECT_ID + +:setting:`FILES_STORE` and :setting:`IMAGES_STORE` can represent a Google Cloud Storage +bucket. Scrapy will automatically upload the files to the bucket. (requires `google-cloud-storage`_ ) + +.. _google-cloud-storage: https://cloud.google.com/storage/docs/reference/libraries#client-libraries-install-python + +For example, these are valid :setting:`IMAGES_STORE` and :setting:`GCS_PROJECT_ID` settings:: + + IMAGES_STORE = 'gs://bucket/images/' + GCS_PROJECT_ID = 'project_id' + +For information about authentication, see this `documentation`_. + +.. _documentation: https://cloud.google.com/docs/authentication/production Usage example ============= diff -Nru python-scrapy-1.4.0/docs/topics/practices.rst python-scrapy-1.5.0/docs/topics/practices.rst --- python-scrapy-1.4.0/docs/topics/practices.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/practices.rst 2017-12-29 21:09:52.000000000 +0000 @@ -248,10 +248,10 @@ `commercial support`_. .. _Tor project: https://www.torproject.org/ -.. _commercial support: http://scrapy.org/support/ -.. _ProxyMesh: http://proxymesh.com/ +.. _commercial support: https://scrapy.org/support/ +.. _ProxyMesh: https://proxymesh.com/ .. _Google cache: http://www.googleguide.com/cached_pages.html .. _testspiders: https://github.com/scrapinghub/testspiders .. _Twisted Reactor Overview: https://twistedmatrix.com/documents/current/core/howto/reactor-basics.html -.. _Crawlera: http://scrapinghub.com/crawlera -.. _scrapoxy: http://scrapoxy.io/ +.. _Crawlera: https://scrapinghub.com/crawlera +.. _scrapoxy: https://scrapoxy.io/ diff -Nru python-scrapy-1.4.0/docs/topics/request-response.rst python-scrapy-1.5.0/docs/topics/request-response.rst --- python-scrapy-1.4.0/docs/topics/request-response.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/request-response.rst 2017-12-29 21:09:52.000000000 +0000 @@ -525,11 +525,11 @@ (for single valued headers) or lists (for multi-valued headers). :type headers: dict - :param body: the response body. It must be str, not unicode, unless you're - using a encoding-aware :ref:`Response subclass - `, such as - :class:`TextResponse`. - :type body: str + :param body: the response body. To access the decoded text as str (unicode + in Python 2) you can use ``response.text`` from an encoding-aware + :ref:`Response subclass `, + such as :class:`TextResponse`. + :type body: bytes :param flags: is a list containing the initial values for the :attr:`Response.flags` attribute. If given, the list will be shallow @@ -734,7 +734,7 @@ which adds encoding auto-discovering support by looking into the HTML `meta http-equiv`_ attribute. See :attr:`TextResponse.encoding`. -.. _meta http-equiv: http://www.w3schools.com/TAGS/att_meta_http_equiv.asp +.. 
_meta http-equiv: https://www.w3schools.com/TAGS/att_meta_http_equiv.asp XmlResponse objects ------------------- diff -Nru python-scrapy-1.4.0/docs/topics/scrapyd.rst python-scrapy-1.5.0/docs/topics/scrapyd.rst --- python-scrapy-1.4.0/docs/topics/scrapyd.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/scrapyd.rst 2017-12-29 21:09:52.000000000 +0000 @@ -10,4 +10,4 @@ Its documentation is now hosted at: - http://scrapyd.readthedocs.org/en/latest/ + https://scrapyd.readthedocs.io/en/latest/ diff -Nru python-scrapy-1.4.0/docs/topics/selectors.rst python-scrapy-1.5.0/docs/topics/selectors.rst --- python-scrapy-1.4.0/docs/topics/selectors.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/selectors.rst 2017-12-29 21:09:52.000000000 +0000 @@ -36,7 +36,7 @@ For a complete reference of the selectors API see :ref:`Selector reference ` -.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ +.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ .. _lxml: http://lxml.de/ .. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html .. _cssselect: https://pypi.python.org/pypi/cssselect/ @@ -86,7 +86,7 @@ provides interactive testing) and an example page located in the Scrapy documentation server: - http://doc.scrapy.org/en/latest/_static/selectors-sample1.html + https://doc.scrapy.org/en/latest/_static/selectors-sample1.html .. _topics-selectors-htmlcode: @@ -99,7 +99,7 @@ First, let's open the shell:: - scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html + scrapy shell https://doc.scrapy.org/en/latest/_static/selectors-sample1.html Then, after the shell loads, you'll have the response available as ``response`` shell variable, and its attached selector in ``response.selector`` attribute. diff -Nru python-scrapy-1.4.0/docs/topics/settings.rst python-scrapy-1.5.0/docs/topics/settings.rst --- python-scrapy-1.4.0/docs/topics/settings.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/settings.rst 2017-12-29 21:09:52.000000000 +0000 @@ -627,7 +627,7 @@ circumstances, from server misconfiguration to network errors to data corruption. It is up to the user to decide if it makes sense to process broken responses considering they may contain partial or incomplete content. - If setting:`RETRY_ENABLED` is ``True`` and this setting is set to ``True``, + If :setting:`RETRY_ENABLED` is ``True`` and this setting is set to ``True``, the ``ResponseFailed([_DataLoss])`` failure will be retried as usual. .. setting:: DUPEFILTER_CLASS @@ -1002,7 +1002,7 @@ If :setting:`DOWNLOAD_DELAY` is zero (default) this option has no effect. -.. _wget: http://www.gnu.org/software/wget/manual/wget.html +.. _wget: https://www.gnu.org/software/wget/manual/wget.html .. setting:: REACTOR_THREADPOOL_MAXSIZE @@ -1317,14 +1317,14 @@ Scope: ``spidermiddlewares.urllength`` The maximum URL length to allow for crawled URLs. For more information about -the default value for this setting see: http://www.boutell.com/newfaq/misc/urllength.html +the default value for this setting see: https://boutell.com/newfaq/misc/urllength.html .. setting:: USER_AGENT USER_AGENT ---------- -Default: ``"Scrapy/VERSION (+http://scrapy.org)"`` +Default: ``"Scrapy/VERSION (+https://scrapy.org)"`` The default User-Agent to use when crawling, unless overridden. 
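The settings.rst hunk above switches the default ``USER_AGENT`` to the https scheme, and the 1.5.0 release notes mark that change (together with the addition of 522 and 524 to ``RETRY_HTTP_CODES``) as technically backwards-incompatible. A minimal, hypothetical ``settings.py`` sketch for a project that wants to stay independent of those shifting defaults — the project name, agent string and retry list below are illustrative assumptions, not values taken from this diff::

    # settings.py -- illustrative project settings, not part of the upstream diff.

    BOT_NAME = 'myproject'

    # Pin the agent explicitly so the crawl does not depend on the default,
    # which changed from "+http://scrapy.org" to "+https://scrapy.org" in 1.5.
    USER_AGENT = 'myproject (+https://www.example.com)'

    # Spell out the retry codes: 522 and 524 are retried by default since 1.5;
    # listing them here keeps that behaviour even if the defaults move again.
    RETRY_ENABLED = True
    RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408]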
diff -Nru python-scrapy-1.4.0/docs/topics/shell.rst python-scrapy-1.5.0/docs/topics/shell.rst --- python-scrapy-1.4.0/docs/topics/shell.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/shell.rst 2017-12-29 21:09:52.000000000 +0000 @@ -39,9 +39,9 @@ [settings] shell = bpython -.. _IPython: http://ipython.org/ -.. _IPython installation guide: http://ipython.org/install.html -.. _bpython: http://www.bpython-interpreter.org/ +.. _IPython: https://ipython.org/ +.. _IPython installation guide: https://ipython.org/install.html +.. _bpython: https://www.bpython-interpreter.org/ Launch the shell ================ @@ -142,7 +142,7 @@ ======================== Here's an example of a typical shell session where we start by scraping the -http://scrapy.org page, and then proceed to scrape the https://reddit.com +https://scrapy.org page, and then proceed to scrape the https://reddit.com page. Finally, we modify the (Reddit) request method to POST and re-fetch it getting an error. We end the session by typing Ctrl-D (in Unix systems) or Ctrl-Z in Windows. @@ -154,7 +154,7 @@ First, we launch the shell:: - scrapy shell 'http://scrapy.org' --nolog + scrapy shell 'https://scrapy.org' --nolog Then, the shell fetches the URL (using the Scrapy downloader) and prints the list of available objects and useful shortcuts (you'll notice that these lines @@ -164,7 +164,7 @@ [s] scrapy scrapy module (contains scrapy.Request, scrapy.Selector, etc) [s] crawler [s] item {} - [s] request + [s] request [s] response <200 https://scrapy.org/> [s] settings [s] spider @@ -182,7 +182,7 @@ >>> response.xpath('//title/text()').extract_first() 'Scrapy | A Fast and Powerful Scraping and Web Crawling Framework' - >>> fetch("http://reddit.com") + >>> fetch("https://reddit.com") >>> response.xpath('//title/text()').extract() ['reddit: the front page of the internet'] diff -Nru python-scrapy-1.4.0/docs/topics/signals.rst python-scrapy-1.5.0/docs/topics/signals.rst --- python-scrapy-1.4.0/docs/topics/signals.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/signals.rst 2017-12-29 21:09:52.000000000 +0000 @@ -189,14 +189,20 @@ the engine starts closing the spider. After the spider has finished closing, the :signal:`spider_closed` signal is sent. - You can, for example, schedule some requests in your :signal:`spider_idle` - handler to prevent the spider from being closed. + You may raise a :exc:`~scrapy.exceptions.DontCloseSpider` exception to + prevent the spider from being closed. This signal does not support returning deferreds from their handlers. :param spider: the spider which has gone idle :type spider: :class:`~scrapy.spiders.Spider` object +.. note:: Scheduling some requests in your :signal:`spider_idle` handler does + **not** guarantee that it can prevent the spider from being closed, + although it sometimes can. That's because the spider may still remain idle + if all the scheduled requests are rejected by the scheduler (e.g. filtered + due to duplication). + spider_error ------------ diff -Nru python-scrapy-1.4.0/docs/topics/spider-middleware.rst python-scrapy-1.5.0/docs/topics/spider-middleware.rst --- python-scrapy-1.4.0/docs/topics/spider-middleware.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/spider-middleware.rst 2017-12-29 21:09:52.000000000 +0000 @@ -164,6 +164,17 @@ :param spider: the spider to whom the start requests belong :type spider: :class:`~scrapy.spiders.Spider` object + .. 
method:: from_crawler(cls, crawler) + + If present, this classmethod is called to create a middleware instance + from a :class:`~scrapy.crawler.Crawler`. It must return a new instance + of the middleware. Crawler object provides access to all Scrapy core + components like settings and signals; it is a way for middleware to + access them and hook its functionality into Scrapy. + + :param crawler: crawler that uses this middleware + :type crawler: :class:`~scrapy.crawler.Crawler` object + .. _Exception: https://docs.python.org/2/library/exceptions.html#exceptions.Exception @@ -188,9 +199,13 @@ .. class:: DepthMiddleware - DepthMiddleware is a scrape middleware used for tracking the depth of each - Request inside the site being scraped. It can be used to limit the maximum - depth to scrape or things like that. + DepthMiddleware is used for tracking the depth of each Request inside the + site being scraped. It works by setting `request.meta['depth'] = 0` whenever + there is no value previously set (usually just the first Request) and + incrementing it by 1 otherwise. + + It can be used to limit the maximum depth to scrape, control Request + priority based on their depth, and things like that. The :class:`DepthMiddleware` can be configured through the following settings (see the settings documentation for more info): diff -Nru python-scrapy-1.4.0/docs/topics/spiders.rst python-scrapy-1.5.0/docs/topics/spiders.rst --- python-scrapy-1.4.0/docs/topics/spiders.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/spiders.rst 2017-12-29 21:09:52.000000000 +0000 @@ -80,7 +80,7 @@ allowed to crawl. Requests for URLs not belonging to the domain names specified in this list (or their subdomains) won't be followed if :class:`~scrapy.spidermiddlewares.offsite.OffsiteMiddleware` is enabled. - + Let's say your target url is ``https://www.example.com/1.html``, then add ``'example.com'`` to the list. @@ -578,8 +578,7 @@ .. attribute:: headers - A list of the rows contained in the file CSV feed which will be used to - extract fields from it. + A list of the column names in the CSV file. .. method:: parse_row(response, row) @@ -752,8 +751,8 @@ def parse_other(self, response): pass # ... scrape other here ... -.. _Sitemaps: http://www.sitemaps.org -.. _Sitemap index files: http://www.sitemaps.org/protocol.html#index +.. _Sitemaps: https://www.sitemaps.org/index.html +.. _Sitemap index files: https://www.sitemaps.org/protocol.html#index .. _robots.txt: http://www.robotstxt.org/ .. _TLD: https://en.wikipedia.org/wiki/Top-level_domain -.. _Scrapyd documentation: http://scrapyd.readthedocs.org/en/latest/ +.. _Scrapyd documentation: https://scrapyd.readthedocs.io/en/latest/ diff -Nru python-scrapy-1.4.0/docs/topics/ubuntu.rst python-scrapy-1.5.0/docs/topics/ubuntu.rst --- python-scrapy-1.4.0/docs/topics/ubuntu.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/topics/ubuntu.rst 2017-12-29 21:09:52.000000000 +0000 @@ -37,5 +37,5 @@ .. warning:: `python-scrapy` is a different package provided by official debian repositories, it's very outdated and it isn't supported by Scrapy team. -.. _Scrapinghub: http://scrapinghub.com/ +.. _Scrapinghub: https://scrapinghub.com/ .. 
_GitHub repo: https://github.com/scrapy/scrapy diff -Nru python-scrapy-1.4.0/docs/utils/linkfix.py python-scrapy-1.5.0/docs/utils/linkfix.py --- python-scrapy-1.4.0/docs/utils/linkfix.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/docs/utils/linkfix.py 2017-12-29 21:09:52.000000000 +0000 @@ -20,7 +20,7 @@ _contents = None # A regex that matches standard linkcheck output lines -line_re = re.compile(ur'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))') +line_re = re.compile(u'(.*)\:\d+\:\s\[(.*)\]\s(?:(.*)\sto\s(.*)|(.*))') # Read lines from the linkcheck output file try: diff -Nru python-scrapy-1.4.0/extras/coverage-report.sh python-scrapy-1.5.0/extras/coverage-report.sh --- python-scrapy-1.4.0/extras/coverage-report.sh 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/extras/coverage-report.sh 2017-12-29 21:09:52.000000000 +0000 @@ -1,6 +1,6 @@ # Run tests, generate coverage report and open it on a browser # -# Requires: coverage 3.3 or above from http://pypi.python.org/pypi/coverage +# Requires: coverage 3.3 or above from https://pypi.python.org/pypi/coverage coverage run --branch $(which trial) --reporter=text tests coverage html -i diff -Nru python-scrapy-1.4.0/extras/qpsclient.py python-scrapy-1.5.0/extras/qpsclient.py --- python-scrapy-1.4.0/extras/qpsclient.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/extras/qpsclient.py 2017-12-29 21:09:52.000000000 +0000 @@ -41,7 +41,7 @@ slots = int(self.slots) if slots > 1: - urls = [url.replace('localhost', '127.0.0.%d' % (x + 1)) for x in xrange(slots)] + urls = [url.replace('localhost', '127.0.0.%d' % (x + 1)) for x in range(slots)] else: urls = [url] diff -Nru python-scrapy-1.4.0/INSTALL python-scrapy-1.5.0/INSTALL --- python-scrapy-1.4.0/INSTALL 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/INSTALL 2017-12-29 21:09:52.000000000 +0000 @@ -1,4 +1,4 @@ For information about installing Scrapy see: * docs/intro/install.rst (local file) -* http://doc.scrapy.org/en/latest/intro/install.html (online version) +* https://doc.scrapy.org/en/latest/intro/install.html (online version) diff -Nru python-scrapy-1.4.0/README.rst python-scrapy-1.5.0/README.rst --- python-scrapy-1.4.0/README.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/README.rst 2017-12-29 21:09:52.000000000 +0000 @@ -6,8 +6,12 @@ :target: https://pypi.python.org/pypi/Scrapy :alt: PyPI Version +.. image:: https://img.shields.io/pypi/pyversions/Scrapy.svg + :target: https://pypi.python.org/pypi/Scrapy + :alt: Supported Python Versions + .. image:: https://img.shields.io/travis/scrapy/scrapy/master.svg - :target: http://travis-ci.org/scrapy/scrapy + :target: https://travis-ci.org/scrapy/scrapy :alt: Build Status .. image:: https://img.shields.io/badge/wheel-yes-brightgreen.svg @@ -15,7 +19,7 @@ :alt: Wheel Status .. image:: https://img.shields.io/codecov/c/github/scrapy/scrapy/master.svg - :target: http://codecov.io/github/scrapy/scrapy?branch=master + :target: https://codecov.io/github/scrapy/scrapy?branch=master :alt: Coverage report .. image:: https://anaconda.org/conda-forge/scrapy/badges/version.svg @@ -31,12 +35,12 @@ a wide range of purposes, from data mining to monitoring and automated testing. 
For more information including a list of features check the Scrapy homepage at: -http://scrapy.org +https://scrapy.org Requirements ============ -* Python 2.7 or Python 3.3+ +* Python 2.7 or Python 3.4+ * Works on Linux, Windows, Mac OSX, BSD Install @@ -47,29 +51,28 @@ pip install scrapy For more details see the install section in the documentation: -http://doc.scrapy.org/en/latest/intro/install.html - -Releases -======== - -You can download the latest stable and development releases from: -http://scrapy.org/download/ +https://doc.scrapy.org/en/latest/intro/install.html Documentation ============= -Documentation is available online at http://doc.scrapy.org/ and in the ``docs`` +Documentation is available online at https://doc.scrapy.org/ and in the ``docs`` directory. +Releases +======== + +You can find release notes at https://doc.scrapy.org/en/latest/news.html + Community (blog, twitter, mail list, IRC) ========================================= -See http://scrapy.org/community/ +See https://scrapy.org/community/ Contributing ============ -See http://doc.scrapy.org/en/master/contributing.html +See https://doc.scrapy.org/en/master/contributing.html Code of Conduct --------------- @@ -83,9 +86,9 @@ Companies using Scrapy ====================== -See http://scrapy.org/companies/ +See https://scrapy.org/companies/ Commercial Support ================== -See http://scrapy.org/support/ +See https://scrapy.org/support/ diff -Nru python-scrapy-1.4.0/requirements-py3.txt python-scrapy-1.5.0/requirements-py3.txt --- python-scrapy-1.4.0/requirements-py3.txt 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/requirements-py3.txt 2017-12-29 21:09:52.000000000 +0000 @@ -1,4 +1,4 @@ -Twisted >= 15.5.0 +Twisted >= 17.9.0 lxml>=3.2.4 pyOpenSSL>=0.13.1 cssselect>=0.9 diff -Nru python-scrapy-1.4.0/scrapy/cmdline.py python-scrapy-1.5.0/scrapy/cmdline.py --- python-scrapy-1.4.0/scrapy/cmdline.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/cmdline.py 2017-12-29 21:09:52.000000000 +0000 @@ -11,6 +11,7 @@ from scrapy.exceptions import UsageError from scrapy.utils.misc import walk_modules from scrapy.utils.project import inside_project, get_project_settings +from scrapy.utils.python import garbage_collect from scrapy.settings.deprecated import check_deprecated_settings def _iter_command_classes(module_name): @@ -165,4 +166,9 @@ p.dump_stats(opts.profile) if __name__ == '__main__': - execute() + try: + execute() + finally: + # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() + # on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies + garbage_collect() diff -Nru python-scrapy-1.4.0/scrapy/commands/edit.py python-scrapy-1.5.0/scrapy/commands/edit.py --- python-scrapy-1.4.0/scrapy/commands/edit.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/commands/edit.py 2017-12-29 21:09:52.000000000 +0000 @@ -1,4 +1,5 @@ -import sys, os +import sys +import os from scrapy.commands import ScrapyCommand from scrapy.exceptions import UsageError diff -Nru python-scrapy-1.4.0/scrapy/commands/parse.py python-scrapy-1.5.0/scrapy/commands/parse.py --- python-scrapy-1.4.0/scrapy/commands/parse.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/commands/parse.py 2017-12-29 21:09:52.000000000 +0000 @@ -1,4 +1,5 @@ from __future__ import print_function +import json import logging from w3lib.url import is_url @@ -48,6 +49,8 @@ help="use CrawlSpider rules to 
discover the callback") parser.add_option("-c", "--callback", dest="callback", help="use this callback for parsing, instead looking for a callback") + parser.add_option("-m", "--meta", dest="meta", + help="inject extra meta into the Request, it must be a valid raw json string") parser.add_option("-d", "--depth", dest="depth", type="int", default=1, help="maximum depth for parsing requests [default: %default]") parser.add_option("-v", "--verbose", dest="verbose", action="store_true", @@ -142,7 +145,8 @@ logger.error('Unable to find spider for: %(url)s', {'url': url}) - request = Request(url, opts.callback) + # Request requires callback argument as callable or None, not string + request = Request(url, None) _start_requests = lambda s: [self.prepare_request(s, request, opts)] self.spidercls.start_requests = _start_requests @@ -164,7 +168,9 @@ # determine real callback cb = response.meta['_callback'] if not cb: - if opts.rules and self.first_response == response: + if opts.callback: + cb = opts.callback + elif opts.rules and self.first_response == response: cb = self.get_callback_from_rules(spider, response) if not cb: @@ -201,6 +207,10 @@ req.callback = callback return requests + #update request meta if any extra meta was passed through the --meta/-m opts. + if opts.meta: + request.meta.update(opts.meta) + request.meta['_depth'] = 1 request.meta['_callback'] = request.callback request.callback = callback @@ -208,11 +218,27 @@ def process_options(self, args, opts): ScrapyCommand.process_options(self, args, opts) + + self.process_spider_arguments(opts) + self.process_request_meta(opts) + + def process_spider_arguments(self, opts): + try: opts.spargs = arglist_to_dict(opts.spargs) except ValueError: raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) + def process_request_meta(self, opts): + + if opts.meta: + try: + opts.meta = json.loads(opts.meta) + except ValueError: + raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. 
" \ + "Example: --meta='{\"foo\" : \"bar\"}'", print_help=False) + + def run(self, args, opts): # parse arguments if not len(args) == 1 or not is_url(args[0]): diff -Nru python-scrapy-1.4.0/scrapy/commands/version.py python-scrapy-1.5.0/scrapy/commands/version.py --- python-scrapy-1.4.0/scrapy/commands/version.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/commands/version.py 2017-12-29 21:09:52.000000000 +0000 @@ -1,12 +1,8 @@ from __future__ import print_function -import sys -import platform - -import twisted -import OpenSSL import scrapy from scrapy.commands import ScrapyCommand +from scrapy.utils.versions import scrapy_components_versions class Command(ScrapyCommand): @@ -27,38 +23,11 @@ def run(self, args, opts): if opts.verbose: - import cssselect - import parsel - import lxml.etree - import w3lib - - lxml_version = ".".join(map(str, lxml.etree.LXML_VERSION)) - libxml2_version = ".".join(map(str, lxml.etree.LIBXML_VERSION)) - - try: - w3lib_version = w3lib.__version__ - except AttributeError: - w3lib_version = "<1.14.3" - - print("Scrapy : %s" % scrapy.__version__) - print("lxml : %s" % lxml_version) - print("libxml2 : %s" % libxml2_version) - print("cssselect : %s" % cssselect.__version__) - print("parsel : %s" % parsel.__version__) - print("w3lib : %s" % w3lib_version) - print("Twisted : %s" % twisted.version.short()) - print("Python : %s" % sys.version.replace("\n", "- ")) - print("pyOpenSSL : %s" % self._get_openssl_version()) - print("Platform : %s" % platform.platform()) + versions = scrapy_components_versions() + width = max(len(n) for (n, _) in versions) + patt = "%-{}s : %s".format(width) + for name, version in versions: + print(patt % (name, version)) else: print("Scrapy %s" % scrapy.__version__) - def _get_openssl_version(self): - try: - openssl = OpenSSL.SSL.SSLeay_version(OpenSSL.SSL.SSLEAY_VERSION)\ - .decode('ascii', errors='replace') - # pyOpenSSL 0.12 does not expose openssl version - except AttributeError: - openssl = 'Unknown OpenSSL version' - - return '{} ({})'.format(OpenSSL.version.__version__, openssl) diff -Nru python-scrapy-1.4.0/scrapy/core/downloader/contextfactory.py python-scrapy-1.5.0/scrapy/core/downloader/contextfactory.py --- python-scrapy-1.4.0/scrapy/core/downloader/contextfactory.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/core/downloader/contextfactory.py 2017-12-29 21:09:52.000000000 +0000 @@ -64,7 +64,7 @@ """ Twisted-recommended context factory for web clients. - Quoting http://twistedmatrix.com/documents/current/api/twisted.web.client.Agent.html: + Quoting https://twistedmatrix.com/documents/current/api/twisted.web.client.Agent.html: "The default is to use a BrowserLikePolicyForHTTPS, so unless you have special requirements you can leave this as-is." 
@@ -100,6 +100,6 @@ def getContext(self, hostname=None, port=None): ctx = ClientContextFactory.getContext(self) # Enable all workarounds to SSL bugs as documented by - # http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html + # https://www.openssl.org/docs/manmaster/man3/SSL_CTX_set_options.html ctx.set_options(SSL.OP_ALL) return ctx diff -Nru python-scrapy-1.4.0/scrapy/core/downloader/handlers/http11.py python-scrapy-1.5.0/scrapy/core/downloader/handlers/http11.py --- python-scrapy-1.4.0/scrapy/core/downloader/handlers/http11.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/core/downloader/handlers/http11.py 2017-12-29 21:09:52.000000000 +0000 @@ -15,6 +15,10 @@ from twisted.web.http import _DataLoss, PotentialDataLoss from twisted.web.client import Agent, ProxyAgent, ResponseDone, \ HTTPConnectionPool, ResponseFailed +try: + from twisted.web.client import URI +except ImportError: + from twisted.web.client import _URI as URI from twisted.internet.endpoints import TCP4ClientEndpoint from scrapy.http import Headers @@ -228,10 +232,38 @@ headers, bodyProducer, requestPath) +class ScrapyProxyAgent(Agent): + + def __init__(self, reactor, proxyURI, + connectTimeout=None, bindAddress=None, pool=None): + super(ScrapyProxyAgent, self).__init__(reactor, + connectTimeout=connectTimeout, + bindAddress=bindAddress, + pool=pool) + self._proxyURI = URI.fromBytes(proxyURI) + + def request(self, method, uri, headers=None, bodyProducer=None): + """ + Issue a new request via the configured proxy. + """ + # Cache *all* connections under the same key, since we are only + # connecting to a single destination, the proxy: + if twisted_version >= (15, 0, 0): + proxyEndpoint = self._getEndpoint(self._proxyURI) + else: + proxyEndpoint = self._getEndpoint(self._proxyURI.scheme, + self._proxyURI.host, + self._proxyURI.port) + key = ("http-proxy", self._proxyURI.host, self._proxyURI.port) + return self._requestWithEndpoint(key, proxyEndpoint, method, + URI.fromBytes(uri), headers, + bodyProducer, uri) + + class ScrapyAgent(object): _Agent = Agent - _ProxyAgent = ProxyAgent + _ProxyAgent = ScrapyProxyAgent _TunnelingAgent = TunnelingAgent def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None, @@ -260,9 +292,8 @@ contextFactory=self._contextFactory, connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool) else: - endpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort, - timeout=timeout, bindAddress=bindaddress) - return self._ProxyAgent(endpoint) + return self._ProxyAgent(reactor, proxyURI=to_bytes(proxy, encoding='ascii'), + connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool) return self._Agent(reactor, contextFactory=self._contextFactory, connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool) @@ -344,8 +375,8 @@ if warnsize and expected_size > warnsize: logger.warning("Expected response size (%(size)s) larger than " - "download warn size (%(warnsize)s).", - {'size': expected_size, 'warnsize': warnsize}) + "download warn size (%(warnsize)s) in request %(request)s.", + {'size': expected_size, 'warnsize': warnsize, 'request': request}) def _cancel(_): # Abort connection inmediately. 
@@ -412,9 +443,10 @@ if self._maxsize and self._bytes_received > self._maxsize: logger.error("Received (%(bytes)s) bytes larger than download " - "max size (%(maxsize)s).", + "max size (%(maxsize)s) in request %(request)s.", {'bytes': self._bytes_received, - 'maxsize': self._maxsize}) + 'maxsize': self._maxsize, + 'request': self._request}) # Clear buffer earlier to avoid keeping data in memory for a long # time. self._bodybuf.truncate(0) diff -Nru python-scrapy-1.4.0/scrapy/crawler.py python-scrapy-1.5.0/scrapy/crawler.py --- python-scrapy-1.4.0/scrapy/crawler.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/crawler.py 2017-12-29 21:09:52.000000000 +0000 @@ -11,7 +11,7 @@ from scrapy.resolver import CachingThreadedResolver from scrapy.interfaces import ISpiderLoader from scrapy.extension import ExtensionManager -from scrapy.settings import Settings +from scrapy.settings import overridden_settings, Settings from scrapy.signalmanager import SignalManager from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.utils.ossignal import install_shutdown_handlers, signal_names @@ -34,13 +34,16 @@ self.settings = settings.copy() self.spidercls.update_settings(self.settings) + d = dict(overridden_settings(self.settings)) + logger.info("Overridden settings: %(settings)r", {'settings': d}) + self.signals = SignalManager(self) self.stats = load_object(self.settings['STATS_CLASS'])(self) handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL')) logging.root.addHandler(handler) if get_scrapy_root_handler() is not None: - # scrapy root handler alread installed: update it with new settings + # scrapy root handler already installed: update it with new settings install_scrapy_root_handler(self.settings) # lambda is assigned to Crawler attribute because this way it is not # garbage collected after leaving __init__ scope @@ -80,7 +83,7 @@ yield defer.maybeDeferred(self.engine.start) except Exception: # In Python 2 reraising an exception after yield discards - # the original traceback (see http://bugs.python.org/issue7563), + # the original traceback (see https://bugs.python.org/issue7563), # so sys.exc_info() workaround is used. # This workaround also works in Python 3, but it is not needed, # and it is slower, so in Python 3 we use native `raise`. @@ -234,15 +237,18 @@ The CrawlerProcess object must be instantiated with a :class:`~scrapy.settings.Settings` object. + :param install_root_handler: whether to install root logging handler + (default: True) + This class shouldn't be needed (since Scrapy is responsible of using it accordingly) unless writing scripts that manually handle the crawling process. See :ref:`run-from-script` for an example. 
""" - def __init__(self, settings=None): + def __init__(self, settings=None, install_root_handler=True): super(CrawlerProcess, self).__init__(settings) install_shutdown_handlers(self._signal_shutdown) - configure_logging(self.settings) + configure_logging(self.settings, install_root_handler) log_scrapy_info(self.settings) def _signal_shutdown(self, signum, _): diff -Nru python-scrapy-1.4.0/scrapy/downloadermiddlewares/chunked.py python-scrapy-1.5.0/scrapy/downloadermiddlewares/chunked.py --- python-scrapy-1.4.0/scrapy/downloadermiddlewares/chunked.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/downloadermiddlewares/chunked.py 2017-12-29 21:09:52.000000000 +0000 @@ -11,7 +11,7 @@ class ChunkedTransferMiddleware(object): """This middleware adds support for chunked transfer encoding, as - documented in: http://en.wikipedia.org/wiki/Chunked_transfer_encoding + documented in: https://en.wikipedia.org/wiki/Chunked_transfer_encoding """ def process_response(self, request, response, spider): diff -Nru python-scrapy-1.4.0/scrapy/downloadermiddlewares/httpcache.py python-scrapy-1.5.0/scrapy/downloadermiddlewares/httpcache.py --- python-scrapy-1.4.0/scrapy/downloadermiddlewares/httpcache.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/downloadermiddlewares/httpcache.py 2017-12-29 21:09:52.000000000 +0000 @@ -75,7 +75,7 @@ return response # RFC2616 requires origin server to set Date header, - # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18 + # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18 if 'Date' not in response.headers: response.headers['Date'] = formatdate(usegmt=1) diff -Nru python-scrapy-1.4.0/scrapy/downloadermiddlewares/redirect.py python-scrapy-1.5.0/scrapy/downloadermiddlewares/redirect.py --- python-scrapy-1.4.0/scrapy/downloadermiddlewares/redirect.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/downloadermiddlewares/redirect.py 2017-12-29 21:09:52.000000000 +0000 @@ -64,7 +64,7 @@ request.meta.get('handle_httpstatus_all', False)): return response - allowed_status = (301, 302, 303, 307) + allowed_status = (301, 302, 303, 307, 308) if 'Location' not in response.headers or response.status not in allowed_status: return response @@ -72,7 +72,7 @@ redirected_url = urljoin(request.url, location) - if response.status in (301, 307) or request.method == 'HEAD': + if response.status in (301, 307, 308) or request.method == 'HEAD': redirected = request.replace(url=redirected_url) return self._redirect(redirected, request, spider, response.status) diff -Nru python-scrapy-1.4.0/scrapy/exporters.py python-scrapy-1.5.0/scrapy/exporters.py --- python-scrapy-1.4.0/scrapy/exporters.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/exporters.py 2017-12-29 21:09:52.000000000 +0000 @@ -188,7 +188,7 @@ self.xg.endElement(name) self._beautify_newline() - # Workaround for http://bugs.python.org/issue17606 + # Workaround for https://bugs.python.org/issue17606 # Before Python 2.7.4 xml.sax.saxutils required bytes; # since 2.7.4 it requires unicode. 
The bug is likely to be # fixed in 2.7.6, but 2.7.6 will still support unicode, diff -Nru python-scrapy-1.4.0/scrapy/extensions/httpcache.py python-scrapy-1.5.0/scrapy/extensions/httpcache.py --- python-scrapy-1.4.0/scrapy/extensions/httpcache.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/extensions/httpcache.py 2017-12-29 21:09:52.000000000 +0000 @@ -13,7 +13,7 @@ from scrapy.utils.request import request_fingerprint from scrapy.utils.project import data_path from scrapy.utils.httpobj import urlparse_cached -from scrapy.utils.python import to_bytes, to_unicode +from scrapy.utils.python import to_bytes, to_unicode, garbage_collect logger = logging.getLogger(__name__) @@ -70,8 +70,8 @@ return True def should_cache_response(self, response, request): - # What is cacheable - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1 - # Response cacheability - http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4 + # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec14.9.1 + # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4 # Status code 206 is not included because cache can not deal with partial contents cc = self._parse_cachecontrol(response) # obey directive "Cache-Control: no-store" @@ -163,7 +163,7 @@ def _compute_freshness_lifetime(self, response, request, now): # Reference nsHttpResponseHead::ComputeFreshnessLifetime - # http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#410 + # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706 cc = self._parse_cachecontrol(response) maxage = self._get_max_age(cc) if maxage is not None: @@ -194,7 +194,7 @@ def _compute_current_age(self, response, request, now): # Reference nsHttpResponseHead::ComputeCurrentAge - # http://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#366 + # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658 currentage = 0 # If Date header is not set we assume it is a fast connection, and # clock is in sync with the server @@ -362,6 +362,7 @@ # avoid them being removed in storages with timestamp-based autoremoval. self.db.CompactRange() del self.db + garbage_collect() def retrieve_response(self, spider, request): data = self._read_data(spider, request) @@ -413,7 +414,7 @@ def parse_cachecontrol(header): """Parse Cache-Control header - http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9 + https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9 >>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None, ... b'max-age': b'3600'} diff -Nru python-scrapy-1.4.0/scrapy/extensions/telnet.py python-scrapy-1.5.0/scrapy/extensions/telnet.py --- python-scrapy-1.4.0/scrapy/extensions/telnet.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/extensions/telnet.py 2017-12-29 21:09:52.000000000 +0000 @@ -82,7 +82,7 @@ 'prefs': print_live_refs, 'hpy': hpy, 'help': "This is Scrapy telnet console. 
For more info see: " \ - "http://doc.scrapy.org/en/latest/topics/telnetconsole.html", + "https://doc.scrapy.org/en/latest/topics/telnetconsole.html", } self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars) return telnet_vars diff -Nru python-scrapy-1.4.0/scrapy/http/request/__init__.py python-scrapy-1.5.0/scrapy/http/request/__init__.py --- python-scrapy-1.4.0/scrapy/http/request/__init__.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/http/request/__init__.py 2017-12-29 21:09:52.000000000 +0000 @@ -27,6 +27,10 @@ assert isinstance(priority, int), "Request priority not an integer: %r" % priority self.priority = priority + if callback is not None and not callable(callback): + raise TypeError('callback must be a callable, got %s' % type(callback).__name__) + if errback is not None and not callable(errback): + raise TypeError('errback must be a callable, got %s' % type(errback).__name__) assert callback or not errback, "Cannot use errback without a callback" self.callback = callback self.errback = errback diff -Nru python-scrapy-1.4.0/scrapy/http/response/text.py python-scrapy-1.5.0/scrapy/http/response/text.py --- python-scrapy-1.4.0/scrapy/http/response/text.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/http/response/text.py 2017-12-29 21:09:52.000000000 +0000 @@ -135,7 +135,7 @@ * an attribute Selector (not SelectorList) - e.g. ``response.css('a::attr(href)')[0]`` or ``response.xpath('//img/@src')[0]``. - * a Selector for ```` element, e.g. + * a Selector for ```` or ```` element, e.g. ``response.css('a.my_link')[0]``. See :ref:`response-follow-example` for usage examples. @@ -165,10 +165,11 @@ return strip_html5_whitespace(sel.root) if not hasattr(sel.root, 'tag'): raise ValueError("Unsupported selector: %s" % sel) - if sel.root.tag != 'a': - raise ValueError("Only elements are supported; got <%s>" % + if sel.root.tag not in ('a', 'link'): + raise ValueError("Only and elements are supported; got <%s>" % sel.root.tag) href = sel.root.get('href') if href is None: - raise ValueError(" element has no href attribute: %s" % sel) + raise ValueError("<%s> element has no href attribute: %s" % + (sel.root.tag, sel)) return strip_html5_whitespace(href) diff -Nru python-scrapy-1.4.0/scrapy/linkextractors/__init__.py python-scrapy-1.5.0/scrapy/linkextractors/__init__.py --- python-scrapy-1.4.0/scrapy/linkextractors/__init__.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/linkextractors/__init__.py 2017-12-29 21:09:52.000000000 +0000 @@ -28,7 +28,7 @@ # video '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv', - 'm4a', + 'm4a', 'm4v', # office suites 'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg', diff -Nru python-scrapy-1.4.0/scrapy/mail.py python-scrapy-1.5.0/scrapy/mail.py --- python-scrapy-1.4.0/scrapy/mail.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/mail.py 2017-12-29 21:09:52.000000000 +0000 @@ -5,7 +5,10 @@ """ import logging -from six.moves import cStringIO as StringIO +try: + from cStringIO import StringIO as BytesIO +except ImportError: + from io import BytesIO import six from email.utils import COMMASPACE, formatdate @@ -21,19 +24,26 @@ from twisted.internet import defer, reactor, ssl -from .utils.misc import arg_to_iter +from scrapy.utils.misc import arg_to_iter +from scrapy.utils.python import to_bytes logger = logging.getLogger(__name__) +def _to_bytes_or_none(text): + if text is None: + return None + return to_bytes(text) 
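
The stricter ``Request`` validation added above rejects string callbacks and errbacks at construction time. A minimal, hypothetical spider sketch of the intended usage::

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'
        start_urls = ['http://example.com/']

        def parse(self, response):
            # OK: the callback is the bound method object itself
            yield scrapy.Request('http://example.com/item', callback=self.parse_item)
            # Passing callback='parse_item' (a string) now raises
            # TypeError: callback must be a callable, got str

        def parse_item(self, response):
            self.logger.info('visited %s', response.url)
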
+ + class MailSender(object): def __init__(self, smtphost='localhost', mailfrom='scrapy@localhost', smtpuser=None, smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False): self.smtphost = smtphost self.smtpport = smtpport - self.smtpuser = smtpuser - self.smtppass = smtppass + self.smtpuser = _to_bytes_or_none(smtpuser) + self.smtppass = _to_bytes_or_none(smtppass) self.smtptls = smtptls self.smtpssl = smtpssl self.mailfrom = mailfrom @@ -88,7 +98,7 @@ 'mailattachs': len(attachs)}) return - dfd = self._sendmail(rcpts, msg.as_string()) + dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8')) dfd.addCallbacks(self._sent_ok, self._sent_failed, callbackArgs=[to, cc, subject, len(attachs)], errbackArgs=[to, cc, subject, len(attachs)]) @@ -112,7 +122,7 @@ def _sendmail(self, to_addrs, msg): # Import twisted.mail here because it is not available in python3 from twisted.mail.smtp import ESMTPSenderFactory - msg = StringIO(msg) + msg = BytesIO(msg) d = defer.Deferred() factory = ESMTPSenderFactory(self.smtpuser, self.smtppass, self.mailfrom, \ to_addrs, msg, d, heloFallback=True, requireAuthentication=False, \ diff -Nru python-scrapy-1.4.0/scrapy/_monkeypatches.py python-scrapy-1.5.0/scrapy/_monkeypatches.py --- python-scrapy-1.4.0/scrapy/_monkeypatches.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/_monkeypatches.py 2017-12-29 21:09:52.000000000 +0000 @@ -4,12 +4,12 @@ if sys.version_info[0] == 2: from urlparse import urlparse - # workaround for http://bugs.python.org/issue7904 - Python < 2.7 + # workaround for https://bugs.python.org/issue7904 - Python < 2.7 if urlparse('s3://bucket/key').netloc != 'bucket': from urlparse import uses_netloc uses_netloc.append('s3') - # workaround for http://bugs.python.org/issue9374 - Python < 2.7.4 + # workaround for https://bugs.python.org/issue9374 - Python < 2.7.4 if urlparse('s3://bucket/key?key=value').query != 'key=value': from urlparse import uses_query uses_query.append('s3') diff -Nru python-scrapy-1.4.0/scrapy/pipelines/files.py python-scrapy-1.5.0/scrapy/pipelines/files.py --- python-scrapy-1.4.0/scrapy/pipelines/files.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/pipelines/files.py 2017-12-29 21:09:52.000000000 +0000 @@ -120,7 +120,7 @@ def _get_boto_bucket(self): # disable ssl (is_secure=False) because of this python bug: - # http://bugs.python.org/issue5103 + # https://bugs.python.org/issue5103 c = self.S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY, is_secure=False) return c.get_bucket(self.bucket, validate=False) @@ -194,6 +194,47 @@ return extra +class GCSFilesStore(object): + + GCS_PROJECT_ID = None + + CACHE_CONTROL = 'max-age=172800' + + def __init__(self, uri): + from google.cloud import storage + client = storage.Client(project=self.GCS_PROJECT_ID) + bucket, prefix = uri[5:].split('/', 1) + self.bucket = client.bucket(bucket) + self.prefix = prefix + + def stat_file(self, path, info): + def _onsuccess(blob): + if blob: + checksum = blob.md5_hash + last_modified = time.mktime(blob.updated.timetuple()) + return {'checksum': checksum, 'last_modified': last_modified} + else: + return {} + + return threads.deferToThread(self.bucket.get_blob, path).addCallback(_onsuccess) + + def _get_content_type(self, headers): + if headers and 'Content-Type' in headers: + return headers['Content-Type'] + else: + return 'application/octet-stream' + + def persist_file(self, path, buf, info, meta=None, headers=None): + blob = self.bucket.blob(self.prefix + path) + 
blob.cache_control = self.CACHE_CONTROL + blob.metadata = {k: str(v) for k, v in six.iteritems(meta or {})} + return threads.deferToThread( + blob.upload_from_string, + data=buf.getvalue(), + content_type=self._get_content_type(headers) + ) + + class FilesPipeline(MediaPipeline): """Abstract pipeline that implement the file downloading @@ -219,6 +260,7 @@ '': FSFilesStore, 'file': FSFilesStore, 's3': S3FilesStore, + 'gs': GCSFilesStore, } DEFAULT_FILES_URLS_FIELD = 'file_urls' DEFAULT_FILES_RESULT_FIELD = 'files' @@ -226,7 +268,7 @@ def __init__(self, store_uri, download_func=None, settings=None): if not store_uri: raise NotConfigured - + if isinstance(settings, dict) or settings is None: settings = Settings(settings) @@ -258,6 +300,9 @@ s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY'] s3store.POLICY = settings['FILES_STORE_S3_ACL'] + gcs_store = cls.STORE_SCHEMES['gs'] + gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID'] + store_uri = settings['FILES_STORE'] return cls(store_uri, settings=settings) diff -Nru python-scrapy-1.4.0/scrapy/pipelines/images.py python-scrapy-1.5.0/scrapy/pipelines/images.py --- python-scrapy-1.4.0/scrapy/pipelines/images.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/pipelines/images.py 2017-12-29 21:09:52.000000000 +0000 @@ -91,6 +91,9 @@ s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY'] s3store.POLICY = settings['IMAGES_STORE_S3_ACL'] + gcs_store = cls.STORE_SCHEMES['gs'] + gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID'] + store_uri = settings['IMAGES_STORE'] return cls(store_uri, settings=settings) diff -Nru python-scrapy-1.4.0/scrapy/resolver.py python-scrapy-1.5.0/scrapy/resolver.py --- python-scrapy-1.4.0/scrapy/resolver.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/resolver.py 2017-12-29 21:09:52.000000000 +0000 @@ -22,7 +22,8 @@ # to enforce Scrapy's DNS_TIMEOUT setting's value timeout = (self.timeout,) d = super(CachingThreadedResolver, self).getHostByName(name, timeout) - d.addCallback(self._cache_result, name) + if dnscache.limit: + d.addCallback(self._cache_result, name) return d def _cache_result(self, result, name): diff -Nru python-scrapy-1.4.0/scrapy/settings/default_settings.py python-scrapy-1.5.0/scrapy/settings/default_settings.py --- python-scrapy-1.4.0/scrapy/settings/default_settings.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/settings/default_settings.py 2017-12-29 21:09:52.000000000 +0000 @@ -234,7 +234,7 @@ RETRY_ENABLED = True RETRY_TIMES = 2 # initial response + 2 retries = 3 requests -RETRY_HTTP_CODES = [500, 502, 503, 504, 408] +RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408] RETRY_PRIORITY_ADJUST = -1 ROBOTSTXT_OBEY = False @@ -270,7 +270,7 @@ URLLENGTH_LIMIT = 2083 -USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % import_module('scrapy').__version__ +USER_AGENT = 'Scrapy/%s (+https://scrapy.org)' % import_module('scrapy').__version__ TELNETCONSOLE_ENABLED = 1 TELNETCONSOLE_PORT = [6023, 6073] diff -Nru python-scrapy-1.4.0/scrapy/shell.py python-scrapy-1.5.0/scrapy/shell.py --- python-scrapy-1.4.0/scrapy/shell.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/shell.py 2017-12-29 21:09:52.000000000 +0000 @@ -164,7 +164,7 @@ def inspect_response(response, spider): """Open a shell to inspect the given response""" - Shell(spider.crawler).start(response=response) + Shell(spider.crawler).start(response=response, spider=spider) def _request_deferred(request): diff -Nru 
python-scrapy-1.4.0/scrapy/signalmanager.py python-scrapy-1.5.0/scrapy/signalmanager.py --- python-scrapy-1.4.0/scrapy/signalmanager.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/signalmanager.py 2017-12-29 21:09:52.000000000 +0000 @@ -55,7 +55,7 @@ The keyword arguments are passed to the signal handlers (connected through the :meth:`connect` method). - .. _deferreds: http://twistedmatrix.com/documents/current/core/howto/defer.html + .. _deferreds: https://twistedmatrix.com/documents/current/core/howto/defer.html """ kwargs.setdefault('sender', self.sender) return _signal.send_catch_log_deferred(signal, **kwargs) diff -Nru python-scrapy-1.4.0/scrapy/spidermiddlewares/offsite.py python-scrapy-1.5.0/scrapy/spidermiddlewares/offsite.py --- python-scrapy-1.4.0/scrapy/spidermiddlewares/offsite.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/spidermiddlewares/offsite.py 2017-12-29 21:09:52.000000000 +0000 @@ -6,6 +6,7 @@ import re import logging +import warnings from scrapy import signals from scrapy.http import Request @@ -52,9 +53,18 @@ allowed_domains = getattr(spider, 'allowed_domains', None) if not allowed_domains: return re.compile('') # allow all by default + url_pattern = re.compile("^https?://.*$") + for domain in allowed_domains: + if url_pattern.match(domain): + warnings.warn("allowed_domains accepts only domains, not URLs. Ignoring URL entry %s in allowed_domains." % domain, URLWarning) + regex = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in allowed_domains if d is not None) return re.compile(regex) def spider_opened(self, spider): self.host_regex = self.get_host_regex(spider) self.domains_seen = set() + + +class URLWarning(Warning): + pass diff -Nru python-scrapy-1.4.0/scrapy/spiders/__init__.py python-scrapy-1.5.0/scrapy/spiders/__init__.py --- python-scrapy-1.4.0/scrapy/spiders/__init__.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/spiders/__init__.py 2017-12-29 21:09:52.000000000 +0000 @@ -87,7 +87,7 @@ return Request(url, dont_filter=True) def parse(self, response): - raise NotImplementedError + raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__)) @classmethod def update_settings(cls, settings): diff -Nru python-scrapy-1.4.0/scrapy/spiders/sitemap.py python-scrapy-1.5.0/scrapy/spiders/sitemap.py --- python-scrapy-1.4.0/scrapy/spiders/sitemap.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/spiders/sitemap.py 2017-12-29 21:09:52.000000000 +0000 @@ -48,7 +48,7 @@ if any(x.search(loc) for x in self._follow): yield Request(loc, callback=self._parse_sitemap) elif s.type == 'urlset': - for loc in iterloc(s): + for loc in iterloc(s, self.sitemap_alternate_links): for r, c in self._cbs: if r.search(loc): yield Request(loc, callback=c) diff -Nru python-scrapy-1.4.0/scrapy/templates/project/module/items.py.tmpl python-scrapy-1.5.0/scrapy/templates/project/module/items.py.tmpl --- python-scrapy-1.4.0/scrapy/templates/project/module/items.py.tmpl 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/templates/project/module/items.py.tmpl 2017-12-29 21:09:52.000000000 +0000 @@ -3,7 +3,7 @@ # Define here the models for your scraped items # # See documentation in: -# http://doc.scrapy.org/en/latest/topics/items.html +# https://doc.scrapy.org/en/latest/topics/items.html import scrapy diff -Nru python-scrapy-1.4.0/scrapy/templates/project/module/middlewares.py.tmpl python-scrapy-1.5.0/scrapy/templates/project/module/middlewares.py.tmpl --- 
python-scrapy-1.4.0/scrapy/templates/project/module/middlewares.py.tmpl 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/templates/project/module/middlewares.py.tmpl 2017-12-29 21:09:52.000000000 +0000 @@ -3,7 +3,7 @@ # Define here the models for your spider middleware # # See documentation in: -# http://doc.scrapy.org/en/latest/topics/spider-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals @@ -54,3 +54,50 @@ def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) + + +class ${ProjectName}DownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff -Nru python-scrapy-1.4.0/scrapy/templates/project/module/pipelines.py.tmpl python-scrapy-1.5.0/scrapy/templates/project/module/pipelines.py.tmpl --- python-scrapy-1.4.0/scrapy/templates/project/module/pipelines.py.tmpl 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/templates/project/module/pipelines.py.tmpl 2017-12-29 21:09:52.000000000 +0000 @@ -3,7 +3,7 @@ # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html class ${ProjectName}Pipeline(object): diff -Nru python-scrapy-1.4.0/scrapy/templates/project/module/settings.py.tmpl python-scrapy-1.5.0/scrapy/templates/project/module/settings.py.tmpl --- python-scrapy-1.4.0/scrapy/templates/project/module/settings.py.tmpl 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/templates/project/module/settings.py.tmpl 2017-12-29 21:09:52.000000000 +0000 @@ -5,9 +5,9 @@ # For simplicity, this file contains only settings considered important or # commonly used. 
You can find more settings consulting the documentation: # -# http://doc.scrapy.org/en/latest/topics/settings.html -# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html -# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = '$project_name' @@ -25,7 +25,7 @@ #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) -# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: @@ -45,31 +45,31 @@ #} # Enable or disable spider middlewares -# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # '$project_name.middlewares.${ProjectName}SpiderMiddleware': 543, #} # Enable or disable downloader middlewares -# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { -# '$project_name.middlewares.MyCustomDownloaderMiddleware': 543, +# '$project_name.middlewares.${ProjectName}DownloaderMiddleware': 543, #} # Enable or disable extensions -# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html +# See https://doc.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines -# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html #ITEM_PIPELINES = { # '$project_name.pipelines.${ProjectName}Pipeline': 300, #} # Enable and configure the AutoThrottle extension (disabled by default) -# See http://doc.scrapy.org/en/latest/topics/autothrottle.html +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 @@ -82,7 +82,7 @@ #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) -# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' diff -Nru python-scrapy-1.4.0/scrapy/templates/project/scrapy.cfg python-scrapy-1.5.0/scrapy/templates/project/scrapy.cfg --- python-scrapy-1.4.0/scrapy/templates/project/scrapy.cfg 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/templates/project/scrapy.cfg 2017-12-29 21:09:52.000000000 +0000 @@ -1,7 +1,7 @@ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: -# https://scrapyd.readthedocs.org/en/latest/deploy.html +# https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] default = ${project_name}.settings diff -Nru python-scrapy-1.4.0/scrapy/utils/console.py python-scrapy-1.5.0/scrapy/utils/console.py --- python-scrapy-1.4.0/scrapy/utils/console.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/console.py 2017-12-29 
21:09:52.000000000 +0000 @@ -31,6 +31,15 @@ bpython.embed(locals_=namespace, banner=banner) return wrapper +def _embed_ptpython_shell(namespace={}, banner=''): + """Start a ptpython shell""" + import ptpython.repl + @wraps(_embed_ptpython_shell) + def wrapper(namespace=namespace, banner=''): + print(banner) + ptpython.repl.embed(locals=namespace) + return wrapper + def _embed_standard_shell(namespace={}, banner=''): """Start a standard python shell""" import code @@ -47,9 +56,10 @@ return wrapper DEFAULT_PYTHON_SHELLS = OrderedDict([ + ('ptpython', _embed_ptpython_shell), ('ipython', _embed_ipython_shell), ('bpython', _embed_bpython_shell), - ( 'python', _embed_standard_shell), + ('python', _embed_standard_shell), ]) def get_shell_embed_func(shells=None, known_shells=None): diff -Nru python-scrapy-1.4.0/scrapy/utils/defer.py python-scrapy-1.5.0/scrapy/utils/defer.py --- python-scrapy-1.4.0/scrapy/utils/defer.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/defer.py 2017-12-29 21:09:52.000000000 +0000 @@ -57,7 +57,7 @@ """Execute a callable over the objects in the given iterable, in parallel, using no more than ``count`` concurrent calls. - Taken from: http://jcalderone.livejournal.com/24285.html + Taken from: https://jcalderone.livejournal.com/24285.html """ coop = task.Cooperator() work = (callable(elem, *args, **named) for elem in iterable) diff -Nru python-scrapy-1.4.0/scrapy/utils/deprecate.py python-scrapy-1.5.0/scrapy/utils/deprecate.py --- python-scrapy-1.4.0/scrapy/utils/deprecate.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/deprecate.py 2017-12-29 21:09:52.000000000 +0000 @@ -71,8 +71,8 @@ warnings.warn(msg, warn_category, stacklevel=2) super(DeprecatedClass, cls).__init__(name, bases, clsdict_) - # see http://www.python.org/dev/peps/pep-3119/#overloading-isinstance-and-issubclass - # and http://docs.python.org/2/reference/datamodel.html#customizing-instance-and-subclass-checks + # see https://www.python.org/dev/peps/pep-3119/#overloading-isinstance-and-issubclass + # and https://docs.python.org/reference/datamodel.html#customizing-instance-and-subclass-checks # for implementation details def __instancecheck__(cls, inst): return any(cls.__subclasscheck__(c) @@ -159,10 +159,10 @@ def method_is_overridden(subclass, base_class, method_name): - """ - Return True if a method named ``method_name`` of a ``base_class`` - is overridden in a ``subclass``. - + """ + Return True if a method named ``method_name`` of a ``base_class`` + is overridden in a ``subclass``. + >>> class Base(object): ... def foo(self): ... pass diff -Nru python-scrapy-1.4.0/scrapy/utils/http.py python-scrapy-1.5.0/scrapy/utils/http.py --- python-scrapy-1.4.0/scrapy/utils/http.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/http.py 2017-12-29 21:09:52.000000000 +0000 @@ -11,7 +11,7 @@ decoded body. 
For more info see: - http://en.wikipedia.org/wiki/Chunked_transfer_encoding + https://en.wikipedia.org/wiki/Chunked_transfer_encoding """ body, h, t = '', '', chunked_body diff -Nru python-scrapy-1.4.0/scrapy/utils/log.py python-scrapy-1.5.0/scrapy/utils/log.py --- python-scrapy-1.4.0/scrapy/utils/log.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/log.py 2017-12-29 21:09:52.000000000 +0000 @@ -9,8 +9,10 @@ from twisted.python import log as twisted_log import scrapy -from scrapy.settings import overridden_settings, Settings +from scrapy.settings import Settings from scrapy.exceptions import ScrapyDeprecationWarning +from scrapy.utils.versions import scrapy_components_versions + logger = logging.getLogger(__name__) @@ -142,16 +144,17 @@ def log_scrapy_info(settings): logger.info("Scrapy %(version)s started (bot: %(bot)s)", {'version': scrapy.__version__, 'bot': settings['BOT_NAME']}) - - d = dict(overridden_settings(settings)) - logger.info("Overridden settings: %(settings)r", {'settings': d}) + logger.info("Versions: %(versions)s", + {'versions': ", ".join("%s %s" % (name, version) + for name, version in scrapy_components_versions() + if name != "Scrapy")}) class StreamLogger(object): """Fake file-like stream object that redirects writes to a logger instance Taken from: - http://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/ + https://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/ """ def __init__(self, logger, log_level=logging.INFO): self.logger = logger diff -Nru python-scrapy-1.4.0/scrapy/utils/python.py python-scrapy-1.5.0/scrapy/utils/python.py --- python-scrapy-1.4.0/scrapy/utils/python.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/python.py 2017-12-29 21:09:52.000000000 +0000 @@ -1,6 +1,7 @@ """ This module contains essential stuff that should've come with Python itself ;) """ +import gc import os import re import inspect @@ -8,6 +9,7 @@ import errno import six from functools import partial, wraps +import sys from scrapy.utils.decorators import deprecated @@ -195,10 +197,30 @@ return all(c not in _BINARYCHARS for c in data) +def _getargspec_py23(func): + """_getargspec_py23(function) -> named tuple ArgSpec(args, varargs, keywords, + defaults) + + Identical to inspect.getargspec() in python2, but uses + inspect.getfullargspec() for python3 behind the scenes to avoid + DeprecationWarning. + + >>> def f(a, b=2, *ar, **kw): + ... 
pass + + >>> _getargspec_py23(f) + ArgSpec(args=['a', 'b'], varargs='ar', keywords='kw', defaults=(2,)) + """ + if six.PY2: + return inspect.getargspec(func) + + return inspect.ArgSpec(*inspect.getfullargspec(func)[:4]) + + def get_func_args(func, stripself=False): """Return the argument name list of a callable""" if inspect.isfunction(func): - func_args, _, _, _ = inspect.getargspec(func) + func_args, _, _, _ = _getargspec_py23(func) elif inspect.isclass(func): return get_func_args(func.__init__, True) elif inspect.ismethod(func): @@ -245,9 +267,9 @@ """ if inspect.isfunction(func) or inspect.ismethod(func): - spec = inspect.getargspec(func) + spec = _getargspec_py23(func) elif hasattr(func, '__call__'): - spec = inspect.getargspec(func.__call__) + spec = _getargspec_py23(func.__call__) else: raise TypeError('%s is not callable' % type(func)) @@ -355,3 +377,13 @@ 'scrapy.http.request.Request' """ return "%s.%s" % (obj.__module__, obj.__name__) + + +if hasattr(sys, "pypy_version_info"): + def garbage_collect(): + # Collecting weakreferences can take two collections on PyPy. + gc.collect() + gc.collect() +else: + def garbage_collect(): + gc.collect() diff -Nru python-scrapy-1.4.0/scrapy/utils/test.py python-scrapy-1.5.0/scrapy/utils/test.py --- python-scrapy-1.4.0/scrapy/utils/test.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/test.py 2017-12-29 21:09:52.000000000 +0000 @@ -20,6 +20,12 @@ if 'AWS_ACCESS_KEY_ID' not in os.environ: raise SkipTest("AWS keys not found") + +def assert_gcs_environ(): + if 'GCS_PROJECT_ID' not in os.environ: + raise SkipTest("GCS_PROJECT_ID not found") + + def skip_if_no_boto(): try: is_botocore() @@ -45,6 +51,15 @@ bucket.delete_key(path) return (content, key) if with_key else content +def get_gcs_content_and_delete(bucket, path): + from google.cloud import storage + client = storage.Client(project=os.environ.get('GCS_PROJECT_ID')) + bucket = client.get_bucket(bucket) + blob = bucket.get_blob(path) + content = blob.download_as_string() + bucket.delete_blob(path) + return content, blob + def get_crawler(spidercls=None, settings_dict=None): """Return an unconfigured Crawler object. 
If settings_dict is given, it will be used to populate the crawler settings with a project level diff -Nru python-scrapy-1.4.0/scrapy/utils/url.py python-scrapy-1.5.0/scrapy/utils/url.py --- python-scrapy-1.4.0/scrapy/utils/url.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/url.py 2017-12-29 21:09:52.000000000 +0000 @@ -47,7 +47,7 @@ def escape_ajax(url): """ Return the crawleable url according to: - http://code.google.com/web/ajaxcrawling/docs/getting-started.html + https://developers.google.com/webmasters/ajax-crawling/docs/getting-started >>> escape_ajax("www.example.com/ajax.html#!key=value") 'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue' diff -Nru python-scrapy-1.4.0/scrapy/utils/versions.py python-scrapy-1.5.0/scrapy/utils/versions.py --- python-scrapy-1.4.0/scrapy/utils/versions.py 1970-01-01 00:00:00.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/utils/versions.py 2017-12-29 21:09:52.000000000 +0000 @@ -0,0 +1,50 @@ +import platform +import sys + +import cssselect +import lxml.etree +import parsel +import twisted +import w3lib + +import scrapy + + +def scrapy_components_versions(): + lxml_version = ".".join(map(str, lxml.etree.LXML_VERSION)) + libxml2_version = ".".join(map(str, lxml.etree.LIBXML_VERSION)) + try: + w3lib_version = w3lib.__version__ + except AttributeError: + w3lib_version = "<1.14.3" + try: + import cryptography + cryptography_version = cryptography.__version__ + except ImportError: + cryptography_version = "unknown" + + return [ + ("Scrapy", scrapy.__version__), + ("lxml", lxml_version), + ("libxml2", libxml2_version), + ("cssselect", cssselect.__version__), + ("parsel", parsel.__version__), + ("w3lib", w3lib_version), + ("Twisted", twisted.version.short()), + ("Python", sys.version.replace("\n", "- ")), + ("pyOpenSSL", _get_openssl_version()), + ("cryptography", cryptography_version), + ("Platform", platform.platform()), + ] + + +def _get_openssl_version(): + try: + import OpenSSL + openssl = OpenSSL.SSL.SSLeay_version(OpenSSL.SSL.SSLEAY_VERSION)\ + .decode('ascii', errors='replace') + # pyOpenSSL 0.12 does not expose openssl version + except AttributeError: + openssl = 'Unknown OpenSSL version' + + return '{} ({})'.format(OpenSSL.version.__version__, openssl) diff -Nru python-scrapy-1.4.0/scrapy/VERSION python-scrapy-1.5.0/scrapy/VERSION --- python-scrapy-1.4.0/scrapy/VERSION 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/scrapy/VERSION 2017-12-29 21:09:52.000000000 +0000 @@ -1 +1 @@ -1.4.0 +1.5.0 diff -Nru python-scrapy-1.4.0/sep/sep-001.rst python-scrapy-1.5.0/sep/sep-001.rst --- python-scrapy-1.4.0/sep/sep-001.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/sep/sep-001.rst 2017-12-29 21:09:52.000000000 +0000 @@ -61,7 +61,7 @@ -------- Pros: -- same API used for Items (see http://doc.scrapy.org/en/latest/topics/items.html) +- same API used for Items (see https://doc.scrapy.org/en/latest/topics/items.html) - some people consider setitem API more elegant than methods API Cons: diff -Nru python-scrapy-1.4.0/sep/sep-006.rst python-scrapy-1.5.0/sep/sep-006.rst --- python-scrapy-1.4.0/sep/sep-006.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/sep/sep-006.rst 2017-12-29 21:09:52.000000000 +0000 @@ -16,7 +16,7 @@ ========== When you use Selectors in Scrapy, your final goal is to "extract" the data that -you've selected, as the [http://doc.scrapy.org/en/latest/topics/selectors.html +you've selected, as the [https://doc.scrapy.org/en/latest/topics/selectors.html XPath Selectors 
documentation] says (bolding by me): When you’re scraping web pages, the most common task you need to perform is @@ -58,7 +58,7 @@ descriptive nor mnemotechnic enough and clearly clashes with ``extract`` method (x sounds like a short for extract in english), we propose to rename it to `select`, `sel` (is shortness if required), or `xpath` after `lxml's -`_ ``xpath`` method. +`_ ``xpath`` method. Bonus (ItemBuilder) =================== @@ -71,5 +71,5 @@ References ========== - 1. XPath Selectors (http://doc.scrapy.org/topics/selectors.html) - 2. XPath and XSLT with lxml (http://codespeak.net/lxml/xpathxslt.html) + 1. XPath Selectors (https://doc.scrapy.org/topics/selectors.html) + 2. XPath and XSLT with lxml (http://lxml.de/xpathxslt.html) diff -Nru python-scrapy-1.4.0/sep/sep-013.rst python-scrapy-1.5.0/sep/sep-013.rst --- python-scrapy-1.4.0/sep/sep-013.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/sep/sep-013.rst 2017-12-29 21:09:52.000000000 +0000 @@ -44,7 +44,7 @@ Most of the inconsistencies come from the fact that middlewares don't follow the typical -[http://twistedmatrix.com/projects/core/documentation/howto/defer.html +[https://twistedmatrix.com/projects/core/documentation/howto/defer.html deferred] callback/errback chaining logic. Twisted logic is fine and quite intuitive, and also fits middlewares very well. Due to some bad design choices the integration between middleware calls and deferred is far from optional. So diff -Nru python-scrapy-1.4.0/sep/sep-017.rst python-scrapy-1.5.0/sep/sep-017.rst --- python-scrapy-1.4.0/sep/sep-017.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/sep/sep-017.rst 2017-12-29 21:09:52.000000000 +0000 @@ -13,7 +13,7 @@ The motivation for Spider Contracts is to build a lightweight mechanism for testing your spiders, and be able to run the tests quickly without having to wait for all the spider to run. It's partially based on the -[http://en.wikipedia.org/wiki/Design_by_contract Design by contract] approach +[https://en.wikipedia.org/wiki/Design_by_contract Design by contract] approach (hence its name) where you define certain conditions that spider callbacks must met, and you give example testing pages. diff -Nru python-scrapy-1.4.0/sep/sep-020.rst python-scrapy-1.5.0/sep/sep-020.rst --- python-scrapy-1.4.0/sep/sep-020.rst 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/sep/sep-020.rst 2017-12-29 21:09:52.000000000 +0000 @@ -29,7 +29,7 @@ fields. One pattern that is particularly well suited for auto-populating an Item Loader -is the `definition list `_:: +is the `definition list `_::
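The SEP-020 hunk above only updates a link, but the paragraph it touches describes a pattern worth illustrating: auto-populating an Item Loader from a definition list. Scrapy itself does not ship such auto-population; the sketch below merely pairs each <dt> label with the following <dd> value using the existing ItemLoader API. The item class, field names and HTML snippet are hypothetical, assuming any Scrapy 1.x install.

    from scrapy import Field, Item, Selector
    from scrapy.loader import ItemLoader

    class ProductItem(Item):
        # Hypothetical item; SEP-020 does not prescribe any particular fields.
        name = Field()
        price = Field()

    html = u"""
    <dl>
      <dt>name</dt><dd>Widget</dd>
      <dt>price</dt><dd>9.99</dd>
    </dl>
    """

    loader = ItemLoader(item=ProductItem(), selector=Selector(text=html))
    for dt in loader.selector.xpath('//dl/dt'):
        # Use each <dt> text as the field name and the first following <dd>
        # as its value; skip labels that do not match a declared field.
        field = dt.xpath('normalize-space(.)').extract_first()
        value = dt.xpath('normalize-space(following-sibling::dd[1])').extract_first()
        if field in ProductItem.fields:
            loader.add_value(field, value)

    item = loader.load_item()
    # With the default Identity output processors the values come back as
    # lists, e.g. {'name': [u'Widget'], 'price': [u'9.99']}.
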
diff -Nru python-scrapy-1.4.0/setup.py python-scrapy-1.5.0/setup.py --- python-scrapy-1.4.0/setup.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/setup.py 2017-12-29 21:09:52.000000000 +0000 @@ -1,15 +1,35 @@ from os.path import dirname, join -from setuptools import setup, find_packages +from pkg_resources import parse_version +from setuptools import setup, find_packages, __version__ as setuptools_version with open(join(dirname(__file__), 'scrapy/VERSION'), 'rb') as f: version = f.read().decode('ascii').strip() +def has_environment_marker_platform_impl_support(): + """Code extracted from 'pytest/setup.py' + https://github.com/pytest-dev/pytest/blob/7538680c/setup.py#L31 + + The first known release to support environment marker with range operators + it is 18.5, see: + https://setuptools.readthedocs.io/en/latest/history.html#id235 + """ + return parse_version(setuptools_version) >= parse_version('18.5') + + +extras_require = {} + +if has_environment_marker_platform_impl_support(): + extras_require[':platform_python_implementation == "PyPy"'] = [ + 'PyPyDispatcher>=2.1.0', + ] + + setup( name='Scrapy', version=version, - url='http://scrapy.org', + url='https://scrapy.org', description='A high-level Web Crawling and Web Scraping framework', long_description=open('README.rst').read(), author='Scrapy developers', @@ -33,14 +53,16 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Internet :: WWW/HTTP', 'Topic :: Software Development :: Libraries :: Application Frameworks', 'Topic :: Software Development :: Libraries :: Python Modules', ], + python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*', install_requires=[ 'Twisted>=13.1.0', 'w3lib>=1.17.0', @@ -53,4 +75,5 @@ 'PyDispatcher>=2.0.5', 'service_identity', ], + extras_require=extras_require, ) diff -Nru python-scrapy-1.4.0/tests/__init__.py python-scrapy-1.5.0/tests/__init__.py --- python-scrapy-1.4.0/tests/__init__.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/__init__.py 2017-12-29 21:09:52.000000000 +0000 @@ -1,7 +1,7 @@ """ tests: this package contains all Scrapy unittests -see http://doc.scrapy.org/en/latest/contributing.html#running-tests +see https://doc.scrapy.org/en/latest/contributing.html#running-tests """ import os diff -Nru python-scrapy-1.4.0/tests/keys/example-com.conf python-scrapy-1.5.0/tests/keys/example-com.conf --- python-scrapy-1.4.0/tests/keys/example-com.conf 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/keys/example-com.conf 2017-12-29 21:09:52.000000000 +0000 @@ -1,4 +1,4 @@ -# this is copied from http://stackoverflow.com/a/27931596 +# this is copied from https://stackoverflow.com/a/27931596 [ req ] default_bits = 2048 default_keyfile = server-key.pem @@ -24,7 +24,7 @@ # Use a friendly name here because its presented to the user. The server's DNS # names are placed in Subject Alternate Names. Plus, DNS names here is deprecated -# by both IETF and CA/Browser Forums. If you place a DNS name here, then you +# by both IETF and CA/Browser Forums. 
If you place a DNS name here, then you # must include the DNS name in the SAN too (otherwise, Chrome and others that # strictly follow the CA/Browser Baseline Requirements will fail). commonName = Common Name (e.g. server FQDN or YOUR name) diff -Nru python-scrapy-1.4.0/tests/test_cmdline/__init__.py python-scrapy-1.5.0/tests/test_cmdline/__init__.py --- python-scrapy-1.4.0/tests/test_cmdline/__init__.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_cmdline/__init__.py 2017-12-29 21:09:52.000000000 +0000 @@ -68,4 +68,4 @@ settingsstr = settingsstr.replace(char, '"') settingsdict = json.loads(settingsstr) six.assertCountEqual(self, settingsdict.keys(), EXTENSIONS.keys()) - self.assertEquals(200, settingsdict[EXT_PATH]) + self.assertEqual(200, settingsdict[EXT_PATH]) diff -Nru python-scrapy-1.4.0/tests/test_command_parse.py python-scrapy-1.5.0/tests/test_command_parse.py --- python-scrapy-1.4.0/tests/test_command_parse.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_command_parse.py 2017-12-29 21:09:52.000000000 +0000 @@ -29,6 +29,21 @@ self.logger.debug('It Works!') return [scrapy.Item(), dict(foo='bar')] + def parse_request_with_meta(self, response): + foo = response.meta.get('foo', 'bar') + + if foo == 'bar': + self.logger.debug('It Does Not Work :(') + else: + self.logger.debug('It Works!') + + def parse_request_without_meta(self, response): + foo = response.meta.get('foo', 'bar') + + if foo == 'bar': + self.logger.debug('It Works!') + else: + self.logger.debug('It Does Not Work :(') class MyGoodCrawlSpider(CrawlSpider): name = 'goodcrawl{0}' @@ -85,6 +100,30 @@ self.assertIn("DEBUG: It Works!", to_native_str(stderr)) @defer.inlineCallbacks + def test_request_with_meta(self): + raw_json_string = '{"foo" : "baz"}' + _, _, stderr = yield self.execute(['--spider', self.spider_name, + '--meta', raw_json_string, + '-c', 'parse_request_with_meta', + self.url('/html')]) + self.assertIn("DEBUG: It Works!", to_native_str(stderr)) + + _, _, stderr = yield self.execute(['--spider', self.spider_name, + '-m', raw_json_string, + '-c', 'parse_request_with_meta', + self.url('/html')]) + self.assertIn("DEBUG: It Works!", to_native_str(stderr)) + + + @defer.inlineCallbacks + def test_request_without_meta(self): + _, _, stderr = yield self.execute(['--spider', self.spider_name, + '-c', 'parse_request_without_meta', + self.url('/html')]) + self.assertIn("DEBUG: It Works!", to_native_str(stderr)) + + + @defer.inlineCallbacks def test_pipelines(self): _, _, stderr = yield self.execute(['--spider', self.spider_name, '--pipelines', diff -Nru python-scrapy-1.4.0/tests/test_commands.py python-scrapy-1.5.0/tests/test_commands.py --- python-scrapy-1.4.0/tests/test_commands.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_commands.py 2017-12-29 21:09:52.000000000 +0000 @@ -226,6 +226,27 @@ self.assertNotIn("DEBUG: It Works!", log) self.assertIn("INFO: Spider opened", log) + def test_runspider_dnscache_disabled(self): + # see https://github.com/scrapy/scrapy/issues/2811 + # The spider below should not be able to connect to localhost:12345, + # which is intended, + # but this should not be because of DNS lookup error + # assumption: localhost will resolve in all cases (true?) 
+ log = self.get_log(""" +import scrapy + +class MySpider(scrapy.Spider): + name = 'myspider' + start_urls = ['http://localhost:12345'] + + def parse(self, response): + return {'test': 'value'} +""", + args=('-s', 'DNSCACHE_ENABLED=False')) + print(log) + self.assertNotIn("DNSLookupError", log) + self.assertIn("INFO: Spider opened", log) + def test_runspider_log_short_names(self): log1 = self.get_log(self.debug_log_spider, args=('-s', 'LOG_SHORT_NAMES=1')) diff -Nru python-scrapy-1.4.0/tests/test_command_version.py python-scrapy-1.5.0/tests/test_command_version.py --- python-scrapy-1.4.0/tests/test_command_version.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_command_version.py 2017-12-29 21:09:52.000000000 +0000 @@ -28,4 +28,4 @@ self.assertEqual(headers, ['Scrapy', 'lxml', 'libxml2', 'cssselect', 'parsel', 'w3lib', 'Twisted', 'Python', 'pyOpenSSL', - 'Platform']) + 'cryptography', 'Platform']) diff -Nru python-scrapy-1.4.0/tests/test_downloader_handlers.py python-scrapy-1.5.0/tests/test_downloader_handlers.py --- python-scrapy-1.4.0/tests/test_downloader_handlers.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloader_handlers.py 2017-12-29 21:09:52.000000000 +0000 @@ -98,9 +98,9 @@ def test_download(self): def _test(response): - self.assertEquals(response.url, request.url) - self.assertEquals(response.status, 200) - self.assertEquals(response.body, b'0123456789') + self.assertEqual(response.url, request.url) + self.assertEqual(response.status, 200) + self.assertEqual(response.body, b'0123456789') request = Request(path_to_file_uri(self.tmpname + '^')) assert request.url.upper().endswith('%5E') @@ -241,28 +241,28 @@ request = Request(self.getURL('file')) d = self.download_request(request, Spider('foo')) d.addCallback(lambda r: r.body) - d.addCallback(self.assertEquals, b"0123456789") + d.addCallback(self.assertEqual, b"0123456789") return d def test_download_head(self): request = Request(self.getURL('file'), method='HEAD') d = self.download_request(request, Spider('foo')) d.addCallback(lambda r: r.body) - d.addCallback(self.assertEquals, b'') + d.addCallback(self.assertEqual, b'') return d def test_redirect_status(self): request = Request(self.getURL('redirect')) d = self.download_request(request, Spider('foo')) d.addCallback(lambda r: r.status) - d.addCallback(self.assertEquals, 302) + d.addCallback(self.assertEqual, 302) return d def test_redirect_status_head(self): request = Request(self.getURL('redirect'), method='HEAD') d = self.download_request(request, Spider('foo')) d.addCallback(lambda r: r.status) - d.addCallback(self.assertEquals, 302) + d.addCallback(self.assertEqual, 302) return d @defer.inlineCallbacks @@ -285,24 +285,24 @@ def test_host_header_not_in_request_headers(self): def _test(response): - self.assertEquals( + self.assertEqual( response.body, to_bytes('%s:%d' % (self.host, self.portno))) - self.assertEquals(request.headers, {}) + self.assertEqual(request.headers, {}) request = Request(self.getURL('host')) return self.download_request(request, Spider('foo')).addCallback(_test) def test_host_header_seted_in_request_headers(self): def _test(response): - self.assertEquals(response.body, b'example.com') - self.assertEquals(request.headers.get('Host'), b'example.com') + self.assertEqual(response.body, b'example.com') + self.assertEqual(request.headers.get('Host'), b'example.com') request = Request(self.getURL('host'), headers={'Host': 'example.com'}) return self.download_request(request, 
Spider('foo')).addCallback(_test) d = self.download_request(request, Spider('foo')) d.addCallback(lambda r: r.body) - d.addCallback(self.assertEquals, b'example.com') + d.addCallback(self.assertEqual, b'example.com') return d def test_content_length_zero_bodyless_post_request_headers(self): @@ -317,7 +317,7 @@ https://bugs.python.org/issue14721 """ def _test(response): - self.assertEquals(response.body, b'0') + self.assertEqual(response.body, b'0') request = Request(self.getURL('contentlength'), method='POST', headers={'Host': 'example.com'}) return self.download_request(request, Spider('foo')).addCallback(_test) @@ -327,8 +327,8 @@ import json headers = Headers(json.loads(response.text)['headers']) contentlengths = headers.getlist('Content-Length') - self.assertEquals(len(contentlengths), 1) - self.assertEquals(contentlengths, [b"0"]) + self.assertEqual(len(contentlengths), 1) + self.assertEqual(contentlengths, [b"0"]) request = Request(self.getURL('echo'), method='POST') return self.download_request(request, Spider('foo')).addCallback(_test) @@ -338,7 +338,7 @@ request = Request(self.getURL('payload'), method='POST', body=body) d = self.download_request(request, Spider('foo')) d.addCallback(lambda r: r.body) - d.addCallback(self.assertEquals, body) + d.addCallback(self.assertEqual, body) return d @@ -364,7 +364,7 @@ request = Request(self.getURL('file')) d = self.download_request(request, Spider('foo')) d.addCallback(lambda r: r.body) - d.addCallback(self.assertEquals, b"0123456789") + d.addCallback(self.assertEqual, b"0123456789") return d def test_response_class_choosing_request(self): @@ -374,7 +374,7 @@ body = b'Some plain text\ndata with tabs\t and null bytes\0' def _test_type(response): - self.assertEquals(type(response), TextResponse) + self.assertEqual(type(response), TextResponse) request = Request(self.getURL('nocontenttype'), body=body) d = self.download_request(request, Spider('foo')) @@ -389,7 +389,7 @@ # response body. 
(regardless of headers) d = self.download_request(request, Spider('foo', download_maxsize=10)) d.addCallback(lambda r: r.body) - d.addCallback(self.assertEquals, b"0123456789") + d.addCallback(self.assertEqual, b"0123456789") yield d d = self.download_request(request, Spider('foo', download_maxsize=9)) @@ -431,14 +431,14 @@ request = Request(self.getURL('file')) d = self.download_request(request, Spider('foo', download_maxsize=100)) d.addCallback(lambda r: r.body) - d.addCallback(self.assertEquals, b"0123456789") + d.addCallback(self.assertEqual, b"0123456789") return d def test_download_chunked_content(self): request = Request(self.getURL('chunked')) d = self.download_request(request, Spider('foo')) d.addCallback(lambda r: r.body) - d.addCallback(self.assertEquals, b"chunked content\n") + d.addCallback(self.assertEqual, b"chunked content\n") return d def test_download_broken_content_cause_data_loss(self, url='broken'): @@ -597,9 +597,9 @@ def test_download_with_proxy(self): def _test(response): - self.assertEquals(response.status, 200) - self.assertEquals(response.url, request.url) - self.assertEquals(response.body, b'http://example.com') + self.assertEqual(response.status, 200) + self.assertEqual(response.url, request.url) + self.assertEqual(response.body, b'http://example.com') http_proxy = self.getURL('') request = Request('http://example.com', meta={'proxy': http_proxy}) @@ -607,9 +607,9 @@ def test_download_with_proxy_https_noconnect(self): def _test(response): - self.assertEquals(response.status, 200) - self.assertEquals(response.url, request.url) - self.assertEquals(response.body, b'https://example.com') + self.assertEqual(response.status, 200) + self.assertEqual(response.url, request.url) + self.assertEqual(response.body, b'https://example.com') http_proxy = '%s?noconnect' % self.getURL('') request = Request('https://example.com', meta={'proxy': http_proxy}) @@ -617,9 +617,9 @@ def test_download_without_proxy(self): def _test(response): - self.assertEquals(response.status, 200) - self.assertEquals(response.url, request.url) - self.assertEquals(response.body, b'/path/to/resource') + self.assertEqual(response.status, 200) + self.assertEqual(response.url, request.url) + self.assertEqual(response.body, b'/path/to/resource') request = Request(self.getURL('path/to/resource')) return self.download_request(request, Spider('foo')).addCallback(_test) @@ -978,7 +978,7 @@ uri = "data:,A%20brief%20note" def _test(response): - self.assertEquals(response.url, uri) + self.assertEqual(response.url, uri) self.assertFalse(response.headers) request = Request(uri) @@ -986,39 +986,39 @@ def test_default_mediatype_encoding(self): def _test(response): - self.assertEquals(response.text, 'A brief note') - self.assertEquals(type(response), + self.assertEqual(response.text, 'A brief note') + self.assertEqual(type(response), responsetypes.from_mimetype("text/plain")) - self.assertEquals(response.encoding, "US-ASCII") + self.assertEqual(response.encoding, "US-ASCII") request = Request("data:,A%20brief%20note") return self.download_request(request, self.spider).addCallback(_test) def test_default_mediatype(self): def _test(response): - self.assertEquals(response.text, u'\u038e\u03a3\u038e') - self.assertEquals(type(response), + self.assertEqual(response.text, u'\u038e\u03a3\u038e') + self.assertEqual(type(response), responsetypes.from_mimetype("text/plain")) - self.assertEquals(response.encoding, "iso-8859-7") + self.assertEqual(response.encoding, "iso-8859-7") request = 
Request("data:;charset=iso-8859-7,%be%d3%be") return self.download_request(request, self.spider).addCallback(_test) def test_text_charset(self): def _test(response): - self.assertEquals(response.text, u'\u038e\u03a3\u038e') - self.assertEquals(response.body, b'\xbe\xd3\xbe') - self.assertEquals(response.encoding, "iso-8859-7") + self.assertEqual(response.text, u'\u038e\u03a3\u038e') + self.assertEqual(response.body, b'\xbe\xd3\xbe') + self.assertEqual(response.encoding, "iso-8859-7") request = Request("data:text/plain;charset=iso-8859-7,%be%d3%be") return self.download_request(request, self.spider).addCallback(_test) def test_mediatype_parameters(self): def _test(response): - self.assertEquals(response.text, u'\u038e\u03a3\u038e') - self.assertEquals(type(response), + self.assertEqual(response.text, u'\u038e\u03a3\u038e') + self.assertEqual(type(response), responsetypes.from_mimetype("text/plain")) - self.assertEquals(response.encoding, "utf-8") + self.assertEqual(response.encoding, "utf-8") request = Request('data:text/plain;foo=%22foo;bar%5C%22%22;' 'charset=utf-8;bar=%22foo;%5C%22 foo ;/,%22' @@ -1027,7 +1027,7 @@ def test_base64(self): def _test(response): - self.assertEquals(response.text, 'Hello, world.') + self.assertEqual(response.text, 'Hello, world.') request = Request('data:text/plain;base64,SGVsbG8sIHdvcmxkLg%3D%3D') return self.download_request(request, self.spider).addCallback(_test) diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_cookies.py python-scrapy-1.5.0/tests/test_downloadermiddleware_cookies.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_cookies.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_cookies.py 2017-12-29 21:09:52.000000000 +0000 @@ -36,7 +36,7 @@ req2 = Request('http://scrapytest.org/sub1/') assert self.mw.process_request(req2, self.spider) is None - self.assertEquals(req2.headers.get('Cookie'), b"C1=value1") + self.assertEqual(req2.headers.get('Cookie'), b"C1=value1") def test_setting_false_cookies_enabled(self): self.assertRaises( @@ -131,12 +131,12 @@ # check that cookies are merged back req = Request('http://scrapytest.org/mergeme') assert self.mw.process_request(req, self.spider) is None - self.assertEquals(req.headers.get('Cookie'), b'C1=value1') + self.assertEqual(req.headers.get('Cookie'), b'C1=value1') # check that cookies are merged when dont_merge_cookies is passed as 0 req = Request('http://scrapytest.org/mergeme', meta={'dont_merge_cookies': 0}) assert self.mw.process_request(req, self.spider) is None - self.assertEquals(req.headers.get('Cookie'), b'C1=value1') + self.assertEqual(req.headers.get('Cookie'), b'C1=value1') def test_complex_cookies(self): # merge some cookies into jar @@ -157,7 +157,7 @@ # embed C2 for scrapytest.org/bar req = Request('http://scrapytest.org/bar') self.mw.process_request(req, self.spider) - self.assertEquals(req.headers.get('Cookie'), b'C2=value2') + self.assertEqual(req.headers.get('Cookie'), b'C2=value2') # embed nothing for scrapytest.org/baz req = Request('http://scrapytest.org/baz') @@ -167,7 +167,7 @@ def test_merge_request_cookies(self): req = Request('http://scrapytest.org/', cookies={'galleta': 'salada'}) assert self.mw.process_request(req, self.spider) is None - self.assertEquals(req.headers.get('Cookie'), b'galleta=salada') + self.assertEqual(req.headers.get('Cookie'), b'galleta=salada') headers = {'Set-Cookie': 'C1=value1; path=/'} res = Response('http://scrapytest.org/', headers=headers) @@ -181,7 +181,7 @@ def 
test_cookiejar_key(self): req = Request('http://scrapytest.org/', cookies={'galleta': 'salada'}, meta={'cookiejar': "store1"}) assert self.mw.process_request(req, self.spider) is None - self.assertEquals(req.headers.get('Cookie'), b'galleta=salada') + self.assertEqual(req.headers.get('Cookie'), b'galleta=salada') headers = {'Set-Cookie': 'C1=value1; path=/'} res = Response('http://scrapytest.org/', headers=headers, request=req) @@ -193,7 +193,7 @@ req3 = Request('http://scrapytest.org/', cookies={'galleta': 'dulce'}, meta={'cookiejar': "store2"}) assert self.mw.process_request(req3, self.spider) is None - self.assertEquals(req3.headers.get('Cookie'), b'galleta=dulce') + self.assertEqual(req3.headers.get('Cookie'), b'galleta=dulce') headers = {'Set-Cookie': 'C2=value2; path=/'} res2 = Response('http://scrapytest.org/', headers=headers, request=req3) @@ -213,16 +213,16 @@ req5_2 = Request('http://scrapytest.org:1104/some-redirected-path') assert self.mw.process_request(req5_2, self.spider) is None - self.assertEquals(req5_2.headers.get('Cookie'), b'C1=value1') + self.assertEqual(req5_2.headers.get('Cookie'), b'C1=value1') req5_3 = Request('http://scrapytest.org/some-redirected-path') assert self.mw.process_request(req5_3, self.spider) is None - self.assertEquals(req5_3.headers.get('Cookie'), b'C1=value1') + self.assertEqual(req5_3.headers.get('Cookie'), b'C1=value1') #skip cookie retrieval for not http request req6 = Request('file:///scrapy/sometempfile') assert self.mw.process_request(req6, self.spider) is None - self.assertEquals(req6.headers.get('Cookie'), None) + self.assertEqual(req6.headers.get('Cookie'), None) def test_local_domain(self): request = Request("http://example-host/", cookies={'currencyCookie': 'USD'}) diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_defaultheaders.py python-scrapy-1.5.0/tests/test_downloadermiddleware_defaultheaders.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_defaultheaders.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_defaultheaders.py 2017-12-29 21:09:52.000000000 +0000 @@ -22,15 +22,15 @@ defaults, spider, mw = self.get_defaults_spider_mw() req = Request('http://www.scrapytest.org') mw.process_request(req, spider) - self.assertEquals(req.headers, defaults) + self.assertEqual(req.headers, defaults) def test_update_headers(self): defaults, spider, mw = self.get_defaults_spider_mw() headers = {'Accept-Language': ['es'], 'Test-Header': ['test']} bytes_headers = {b'Accept-Language': [b'es'], b'Test-Header': [b'test']} req = Request('http://www.scrapytest.org', headers=headers) - self.assertEquals(req.headers, bytes_headers) + self.assertEqual(req.headers, bytes_headers) mw.process_request(req, spider) defaults.update(bytes_headers) - self.assertEquals(req.headers, defaults) + self.assertEqual(req.headers, defaults) diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_downloadtimeout.py python-scrapy-1.5.0/tests/test_downloadermiddleware_downloadtimeout.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_downloadtimeout.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_downloadtimeout.py 2017-12-29 21:09:52.000000000 +0000 @@ -18,20 +18,20 @@ req, spider, mw = self.get_request_spider_mw() mw.spider_opened(spider) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta.get('download_timeout'), 180) + self.assertEqual(req.meta.get('download_timeout'), 180) def test_string_download_timeout(self): req, 
spider, mw = self.get_request_spider_mw({'DOWNLOAD_TIMEOUT': '20.1'}) mw.spider_opened(spider) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta.get('download_timeout'), 20.1) + self.assertEqual(req.meta.get('download_timeout'), 20.1) def test_spider_has_download_timeout(self): req, spider, mw = self.get_request_spider_mw() spider.download_timeout = 2 mw.spider_opened(spider) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta.get('download_timeout'), 2) + self.assertEqual(req.meta.get('download_timeout'), 2) def test_request_has_download_timeout(self): req, spider, mw = self.get_request_spider_mw() @@ -39,4 +39,4 @@ mw.spider_opened(spider) req.meta['download_timeout'] = 1 assert mw.process_request(req, spider) is None - self.assertEquals(req.meta.get('download_timeout'), 1) + self.assertEqual(req.meta.get('download_timeout'), 1) diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_httpauth.py python-scrapy-1.5.0/tests/test_downloadermiddleware_httpauth.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_httpauth.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_httpauth.py 2017-12-29 21:09:52.000000000 +0000 @@ -23,10 +23,10 @@ def test_auth(self): req = Request('http://scrapytest.org/') assert self.mw.process_request(req, self.spider) is None - self.assertEquals(req.headers['Authorization'], b'Basic Zm9vOmJhcg==') + self.assertEqual(req.headers['Authorization'], b'Basic Zm9vOmJhcg==') def test_auth_already_set(self): req = Request('http://scrapytest.org/', headers=dict(Authorization='Digest 123')) assert self.mw.process_request(req, self.spider) is None - self.assertEquals(req.headers['Authorization'], b'Digest 123') + self.assertEqual(req.headers['Authorization'], b'Digest 123') diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_httpcache.py python-scrapy-1.5.0/tests/test_downloadermiddleware_httpcache.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_httpcache.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_httpcache.py 2017-12-29 21:09:52.000000000 +0000 @@ -322,6 +322,7 @@ (True, 203, {'Last-Modified': self.yesterday}), (True, 300, {'Last-Modified': self.yesterday}), (True, 301, {'Last-Modified': self.yesterday}), + (True, 308, {'Last-Modified': self.yesterday}), (True, 401, {'Last-Modified': self.yesterday}), (True, 404, {'Cache-Control': 'public, max-age=600'}), (True, 302, {'Expires': self.tomorrow}), diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_httpcompression.py python-scrapy-1.5.0/tests/test_downloadermiddleware_httpcompression.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_httpcompression.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_httpcompression.py 2017-12-29 21:09:52.000000000 +0000 @@ -248,4 +248,4 @@ response = response.replace(body = None) newresponse = self.mw.process_response(request, response, self.spider) self.assertIs(newresponse, response) - self.assertEquals(response.body, b'') + self.assertEqual(response.body, b'') diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_httpproxy.py python-scrapy-1.5.0/tests/test_downloadermiddleware_httpproxy.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_httpproxy.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_httpproxy.py 2017-12-29 21:09:52.000000000 +0000 @@ -28,17 +28,17 @@ crawler = Crawler(spider, 
settings) self.assertRaises(NotConfigured, partial(HttpProxyMiddleware.from_crawler, crawler)) - def test_no_enviroment_proxies(self): + def test_no_environment_proxies(self): os.environ = {'dummy_proxy': 'reset_env_and_do_not_raise'} mw = HttpProxyMiddleware() for url in ('http://e.com', 'https://e.com', 'file:///tmp/a'): req = Request(url) assert mw.process_request(req, spider) is None - self.assertEquals(req.url, url) - self.assertEquals(req.meta, {}) + self.assertEqual(req.url, url) + self.assertEqual(req.meta, {}) - def test_enviroment_proxies(self): + def test_environment_proxies(self): os.environ['http_proxy'] = http_proxy = 'https://proxy.for.http:3128' os.environ['https_proxy'] = https_proxy = 'http://proxy.for.https:8080' os.environ.pop('file_proxy', None) @@ -48,41 +48,41 @@ ('https://e.com', https_proxy), ('file://tmp/a', None)]: req = Request(url) assert mw.process_request(req, spider) is None - self.assertEquals(req.url, url) - self.assertEquals(req.meta.get('proxy'), proxy) + self.assertEqual(req.url, url) + self.assertEqual(req.meta.get('proxy'), proxy) def test_proxy_precedence_meta(self): os.environ['http_proxy'] = 'https://proxy.com' mw = HttpProxyMiddleware() req = Request('http://scrapytest.org', meta={'proxy': 'https://new.proxy:3128'}) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://new.proxy:3128'}) + self.assertEqual(req.meta, {'proxy': 'https://new.proxy:3128'}) def test_proxy_auth(self): os.environ['http_proxy'] = 'https://user:pass@proxy:3128' mw = HttpProxyMiddleware() req = Request('http://scrapytest.org') assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'}) - self.assertEquals(req.headers.get('Proxy-Authorization'), b'Basic dXNlcjpwYXNz') + self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'}) + self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic dXNlcjpwYXNz') # proxy from request.meta req = Request('http://scrapytest.org', meta={'proxy': 'https://username:password@proxy:3128'}) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'}) - self.assertEquals(req.headers.get('Proxy-Authorization'), b'Basic dXNlcm5hbWU6cGFzc3dvcmQ=') + self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'}) + self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic dXNlcm5hbWU6cGFzc3dvcmQ=') def test_proxy_auth_empty_passwd(self): os.environ['http_proxy'] = 'https://user:@proxy:3128' mw = HttpProxyMiddleware() req = Request('http://scrapytest.org') assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'}) - self.assertEquals(req.headers.get('Proxy-Authorization'), b'Basic dXNlcjo=') + self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'}) + self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic dXNlcjo=') # proxy from request.meta req = Request('http://scrapytest.org', meta={'proxy': 'https://username:@proxy:3128'}) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'}) - self.assertEquals(req.headers.get('Proxy-Authorization'), b'Basic dXNlcm5hbWU6') + self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'}) + self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic dXNlcm5hbWU6') def test_proxy_auth_encoding(self): # utf-8 encoding @@ -90,27 +90,27 @@ mw = HttpProxyMiddleware(auth_encoding='utf-8') req = Request('http://scrapytest.org') assert 
mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'}) - self.assertEquals(req.headers.get('Proxy-Authorization'), b'Basic bcOhbjpwYXNz') + self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'}) + self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic bcOhbjpwYXNz') # proxy from request.meta req = Request('http://scrapytest.org', meta={'proxy': u'https://\u00FCser:pass@proxy:3128'}) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'}) - self.assertEquals(req.headers.get('Proxy-Authorization'), b'Basic w7xzZXI6cGFzcw==') + self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'}) + self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic w7xzZXI6cGFzcw==') # default latin-1 encoding mw = HttpProxyMiddleware(auth_encoding='latin-1') req = Request('http://scrapytest.org') assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'}) - self.assertEquals(req.headers.get('Proxy-Authorization'), b'Basic beFuOnBhc3M=') + self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'}) + self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic beFuOnBhc3M=') # proxy from request.meta, latin-1 encoding req = Request('http://scrapytest.org', meta={'proxy': u'https://\u00FCser:pass@proxy:3128'}) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'https://proxy:3128'}) - self.assertEquals(req.headers.get('Proxy-Authorization'), b'Basic /HNlcjpwYXNz') + self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'}) + self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic /HNlcjpwYXNz') def test_proxy_already_seted(self): os.environ['http_proxy'] = 'https://proxy.for.http:3128' @@ -142,4 +142,4 @@ os.environ['no_proxy'] = '*' req = Request('http://noproxy.com', meta={'proxy': 'http://proxy.com'}) assert mw.process_request(req, spider) is None - self.assertEquals(req.meta, {'proxy': 'http://proxy.com'}) + self.assertEqual(req.meta, {'proxy': 'http://proxy.com'}) diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_redirect.py python-scrapy-1.5.0/tests/test_downloadermiddleware_redirect.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_redirect.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_redirect.py 2017-12-29 21:09:52.000000000 +0000 @@ -22,12 +22,12 @@ req2 = self.mw.process_response(req, rsp, self.spider) assert req2.priority > req.priority - def test_redirect_301(self): - def _test(method): - url = 'http://www.example.com/301' + def test_redirect_3xx_permanent(self): + def _test(method, status=301): + url = 'http://www.example.com/{}'.format(status) url2 = 'http://www.example.com/redirected' req = Request(url, method=method) - rsp = Response(url, headers={'Location': url2}, status=301) + rsp = Response(url, headers={'Location': url2}, status=status) req2 = self.mw.process_response(req, rsp, self.spider) assert isinstance(req2, Request) @@ -42,6 +42,14 @@ _test('POST') _test('HEAD') + _test('GET', status=307) + _test('POST', status=307) + _test('HEAD', status=307) + + _test('GET', status=308) + _test('POST', status=308) + _test('HEAD', status=308) + def test_dont_redirect(self): url = 'http://www.example.com/301' url2 = 'http://www.example.com/redirected' @@ -158,7 +166,7 @@ resp = Response('http://scrapytest.org/first', headers={'Location': latin1_location}, status=302) req_result = self.mw.process_response(req, 
resp, self.spider) perc_encoded_utf8_url = 'http://scrapytest.org/a%E7%E3o' - self.assertEquals(perc_encoded_utf8_url, req_result.url) + self.assertEqual(perc_encoded_utf8_url, req_result.url) def test_utf8_location(self): req = Request('http://scrapytest.org/first') @@ -166,7 +174,7 @@ resp = Response('http://scrapytest.org/first', headers={'Location': utf8_location}, status=302) req_result = self.mw.process_response(req, resp, self.spider) perc_encoded_utf8_url = 'http://scrapytest.org/a%C3%A7%C3%A3o' - self.assertEquals(perc_encoded_utf8_url, req_result.url) + self.assertEqual(perc_encoded_utf8_url, req_result.url) class MetaRefreshMiddlewareTest(unittest.TestCase): diff -Nru python-scrapy-1.4.0/tests/test_downloadermiddleware_useragent.py python-scrapy-1.5.0/tests/test_downloadermiddleware_useragent.py --- python-scrapy-1.4.0/tests/test_downloadermiddleware_useragent.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_downloadermiddleware_useragent.py 2017-12-29 21:09:52.000000000 +0000 @@ -17,7 +17,7 @@ spider, mw = self.get_spider_and_mw('default_useragent') req = Request('http://scrapytest.org/') assert mw.process_request(req, spider) is None - self.assertEquals(req.headers['User-Agent'], b'default_useragent') + self.assertEqual(req.headers['User-Agent'], b'default_useragent') def test_remove_agent(self): # settings UESR_AGENT to None should remove the user agent @@ -34,7 +34,7 @@ mw.spider_opened(spider) req = Request('http://scrapytest.org/') assert mw.process_request(req, spider) is None - self.assertEquals(req.headers['User-Agent'], b'spider_useragent') + self.assertEqual(req.headers['User-Agent'], b'spider_useragent') def test_header_agent(self): spider, mw = self.get_spider_and_mw('default_useragent') @@ -43,7 +43,7 @@ req = Request('http://scrapytest.org/', headers={'User-Agent': 'header_useragent'}) assert mw.process_request(req, spider) is None - self.assertEquals(req.headers['User-Agent'], b'header_useragent') + self.assertEqual(req.headers['User-Agent'], b'header_useragent') def test_no_agent(self): spider, mw = self.get_spider_and_mw(None) diff -Nru python-scrapy-1.4.0/tests/test_http_cookies.py python-scrapy-1.5.0/tests/test_http_cookies.py --- python-scrapy-1.4.0/tests/test_http_cookies.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_http_cookies.py 2017-12-29 21:09:52.000000000 +0000 @@ -62,7 +62,7 @@ self.wrapped = WrappedResponse(self.response) def test_info(self): - self.assert_(self.wrapped.info() is self.wrapped) + self.assertIs(self.wrapped.info(), self.wrapped) def test_getheaders(self): self.assertEqual(self.wrapped.getheaders('content-type'), ['text/html']) diff -Nru python-scrapy-1.4.0/tests/test_http_request.py python-scrapy-1.5.0/tests/test_http_request.py --- python-scrapy-1.4.0/tests/test_http_request.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_http_request.py 2017-12-29 21:09:52.000000000 +0000 @@ -64,9 +64,9 @@ h = Headers({'key1': u'val1', u'key2': 'val2'}) h[u'newkey'] = u'newval' for k, v in h.iteritems(): - self.assert_(isinstance(k, bytes)) + self.assertIsInstance(k, bytes) for s in v: - self.assert_(isinstance(s, bytes)) + self.assertIsInstance(s, bytes) def test_eq(self): url = 'http://www.scrapy.org' @@ -235,6 +235,26 @@ self.assertRaises(AttributeError, setattr, r, 'url', 'http://example2.com') self.assertRaises(AttributeError, setattr, r, 'body', 'xxx') + def test_callback_is_callable(self): + def a_function(): + pass + r = self.request_class('http://example.com') + 
self.assertIsNone(r.callback) + r = self.request_class('http://example.com', a_function) + self.assertIs(r.callback, a_function) + with self.assertRaises(TypeError): + self.request_class('http://example.com', 'a_function') + + def test_errback_is_callable(self): + def a_function(): + pass + r = self.request_class('http://example.com') + self.assertIsNone(r.errback) + r = self.request_class('http://example.com', a_function, errback=a_function) + self.assertIs(r.errback, a_function) + with self.assertRaises(TypeError): + self.request_class('http://example.com', a_function, errback='a_function') + class FormRequestTest(RequestTest): diff -Nru python-scrapy-1.4.0/tests/test_http_response.py python-scrapy-1.5.0/tests/test_http_response.py --- python-scrapy-1.4.0/tests/test_http_response.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_http_response.py 2017-12-29 21:09:52.000000000 +0000 @@ -162,7 +162,6 @@ def test_follow_whitespace_link(self): self._assert_followed_url(Link('http://example.com/foo '), 'http://example.com/foo%20') - def _assert_followed_url(self, follow_obj, target_url, response=None): if response is None: response = self._links_response() @@ -402,6 +401,13 @@ for sel, url in zip(sellist, urls): self._assert_followed_url(sel, url, response=resp) + # select elements + self._assert_followed_url( + Selector(text='').css('link')[0], + 'http://example.com/foo', + response=resp + ) + # href attributes should work for sellist in [resp.css('a::attr(href)'), resp.xpath('//a/@href')]: for sel, url in zip(sellist, urls): diff -Nru python-scrapy-1.4.0/tests/test_item.py python-scrapy-1.5.0/tests/test_item.py --- python-scrapy-1.4.0/tests/test_item.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_item.py 2017-12-29 21:09:52.000000000 +0000 @@ -270,7 +270,7 @@ def f(self): # For rationale of this see: # https://github.com/python/cpython/blob/ee1a81b77444c6715cbe610e951c655b6adab88b/Lib/test/test_super.py#L222 - return __class__ + return __class__ # noqa https://github.com/scrapy/scrapy/issues/2836 MyItem() diff -Nru python-scrapy-1.4.0/tests/test_loader.py python-scrapy-1.5.0/tests/test_loader.py --- python-scrapy-1.4.0/tests/test_loader.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_loader.py 2017-12-29 21:09:52.000000000 +0000 @@ -290,7 +290,7 @@ il = TestItemLoader() il.add_value('name', [u'$10']) try: - float('$10') + float(u'$10') except Exception as e: expected_exc_str = str(e) @@ -437,7 +437,7 @@ self.assertRaises(TypeError, proc, [None, '', 'hello', 'world']) self.assertEqual(proc(['', 'hello', 'world']), u' hello world') self.assertEqual(proc(['hello', 'world']), u'hello world') - self.assert_(isinstance(proc(['hello', 'world']), six.text_type)) + self.assertIsInstance(proc(['hello', 'world']), six.text_type) def test_compose(self): proc = Compose(lambda v: v[0], str.upper) @@ -482,7 +482,7 @@ def test_constructor_with_selector(self): sel = Selector(text=u"
<html><body><div>marta</div></body></html>
") l = TestItemLoader(selector=sel) - self.assert_(l.selector is sel) + self.assertIs(l.selector, sel) l.add_xpath('name', '//div/text()') self.assertEqual(l.get_output_value('name'), [u'Marta']) @@ -490,21 +490,21 @@ def test_constructor_with_selector_css(self): sel = Selector(text=u"
<html><body><div>marta</div></body></html>
") l = TestItemLoader(selector=sel) - self.assert_(l.selector is sel) + self.assertIs(l.selector, sel) l.add_css('name', 'div::text') self.assertEqual(l.get_output_value('name'), [u'Marta']) def test_constructor_with_response(self): l = TestItemLoader(response=self.response) - self.assert_(l.selector) + self.assertTrue(l.selector) l.add_xpath('name', '//div/text()') self.assertEqual(l.get_output_value('name'), [u'Marta']) def test_constructor_with_response_css(self): l = TestItemLoader(response=self.response) - self.assert_(l.selector) + self.assertTrue(l.selector) l.add_css('name', 'div::text') self.assertEqual(l.get_output_value('name'), [u'Marta']) @@ -526,7 +526,7 @@ def test_replace_xpath(self): l = TestItemLoader(response=self.response) - self.assert_(l.selector) + self.assertTrue(l.selector) l.add_xpath('name', '//div/text()') self.assertEqual(l.get_output_value('name'), [u'Marta']) l.replace_xpath('name', '//p/text()') @@ -552,7 +552,7 @@ def test_replace_xpath_re(self): l = TestItemLoader(response=self.response) - self.assert_(l.selector) + self.assertTrue(l.selector) l.add_xpath('name', '//div/text()') self.assertEqual(l.get_output_value('name'), [u'Marta']) l.replace_xpath('name', '//div/text()', re='ma') @@ -568,7 +568,7 @@ def test_replace_css(self): l = TestItemLoader(response=self.response) - self.assert_(l.selector) + self.assertTrue(l.selector) l.add_css('name', 'div::text') self.assertEqual(l.get_output_value('name'), [u'Marta']) l.replace_css('name', 'p::text') @@ -606,7 +606,7 @@ def test_replace_css_re(self): l = TestItemLoader(response=self.response) - self.assert_(l.selector) + self.assertTrue(l.selector) l.add_css('url', 'a::attr(href)') self.assertEqual(l.get_output_value('url'), [u'http://www.scrapy.org']) l.replace_css('url', 'a::attr(href)', re='http://www\.(.+)') diff -Nru python-scrapy-1.4.0/tests/test_pipeline_files.py python-scrapy-1.5.0/tests/test_pipeline_files.py --- python-scrapy-1.4.0/tests/test_pipeline_files.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_pipeline_files.py 2017-12-29 21:09:52.000000000 +0000 @@ -11,12 +11,13 @@ from twisted.trial import unittest from twisted.internet import defer -from scrapy.pipelines.files import FilesPipeline, FSFilesStore, S3FilesStore +from scrapy.pipelines.files import FilesPipeline, FSFilesStore, S3FilesStore, GCSFilesStore from scrapy.item import Item, Field from scrapy.http import Request, Response from scrapy.settings import Settings from scrapy.utils.python import to_bytes from scrapy.utils.test import assert_aws_environ, get_s3_content_and_delete +from scrapy.utils.test import assert_gcs_environ, get_gcs_content_and_delete from scrapy.utils.boto import is_botocore from tests import mock @@ -375,6 +376,31 @@ self.assertEqual(key.content_type, 'image/png') +class TestGCSFilesStore(unittest.TestCase): + @defer.inlineCallbacks + def test_persist(self): + assert_gcs_environ() + uri = os.environ.get('GCS_TEST_FILE_URI') + if not uri: + raise unittest.SkipTest("No GCS URI available for testing") + data = b"TestGCSFilesStore: \xe2\x98\x83" + buf = BytesIO(data) + meta = {'foo': 'bar'} + path = 'full/filename' + store = GCSFilesStore(uri) + yield store.persist_file(path, buf, info=None, meta=meta, headers=None) + s = yield store.stat_file(path, info=None) + self.assertIn('last_modified', s) + self.assertIn('checksum', s) + self.assertEqual(s['checksum'], 'zc2oVgXkbQr2EQdSdw3OPA==') + u = urlparse(uri) + content, blob = get_gcs_content_and_delete(u.hostname, u.path[1:]+path) + 
self.assertEqual(content, data) + self.assertEqual(blob.metadata, {'foo': 'bar'}) + self.assertEqual(blob.cache_control, GCSFilesStore.CACHE_CONTROL) + self.assertEqual(blob.content_type, 'application/octet-stream') + + class ItemWithFiles(Item): file_urls = Field() files = Field() diff -Nru python-scrapy-1.4.0/tests/test_pipeline_images.py python-scrapy-1.5.0/tests/test_pipeline_images.py --- python-scrapy-1.4.0/tests/test_pipeline_images.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_pipeline_images.py 2017-12-29 21:09:52.000000000 +0000 @@ -81,28 +81,28 @@ COLOUR = (0, 127, 255) im = _create_image('JPEG', 'RGB', SIZE, COLOUR) converted, _ = self.pipeline.convert_image(im) - self.assertEquals(converted.mode, 'RGB') - self.assertEquals(converted.getcolors(), [(10000, COLOUR)]) + self.assertEqual(converted.mode, 'RGB') + self.assertEqual(converted.getcolors(), [(10000, COLOUR)]) # check that thumbnail keep image ratio thumbnail, _ = self.pipeline.convert_image(converted, size=(10, 25)) - self.assertEquals(thumbnail.mode, 'RGB') - self.assertEquals(thumbnail.size, (10, 10)) + self.assertEqual(thumbnail.mode, 'RGB') + self.assertEqual(thumbnail.size, (10, 10)) # transparency case: RGBA and PNG COLOUR = (0, 127, 255, 50) im = _create_image('PNG', 'RGBA', SIZE, COLOUR) converted, _ = self.pipeline.convert_image(im) - self.assertEquals(converted.mode, 'RGB') - self.assertEquals(converted.getcolors(), [(10000, (205, 230, 255))]) + self.assertEqual(converted.mode, 'RGB') + self.assertEqual(converted.getcolors(), [(10000, (205, 230, 255))]) # transparency case with palette: P and PNG COLOUR = (0, 127, 255, 50) im = _create_image('PNG', 'RGBA', SIZE, COLOUR) im = im.convert('P') converted, _ = self.pipeline.convert_image(im) - self.assertEquals(converted.mode, 'RGB') - self.assertEquals(converted.getcolors(), [(10000, (205, 230, 255))]) + self.assertEqual(converted.mode, 'RGB') + self.assertEqual(converted.getcolors(), [(10000, (205, 230, 255))]) class DeprecatedImagesPipeline(ImagesPipeline): diff -Nru python-scrapy-1.4.0/tests/test_selector.py python-scrapy-1.5.0/tests/test_selector.py --- python-scrapy-1.4.0/tests/test_selector.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_selector.py 2017-12-29 21:09:52.000000000 +0000 @@ -84,7 +84,7 @@ headers = {'Content-Type': ['text/html; charset=utf-8']} response = HtmlResponse(url="http://example.com", headers=headers, body=html_utf8) x = Selector(response) - self.assertEquals(x.xpath("//span[@id='blank']/text()").extract(), + self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), [u'\xa3']) def test_badly_encoded_body(self): diff -Nru python-scrapy-1.4.0/tests/test_spidermiddleware_depth.py python-scrapy-1.5.0/tests/test_spidermiddleware_depth.py --- python-scrapy-1.4.0/tests/test_spidermiddleware_depth.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_spidermiddleware_depth.py 2017-12-29 21:09:52.000000000 +0000 @@ -25,18 +25,18 @@ result = [Request('http://scrapytest.org')] out = list(self.mw.process_spider_output(resp, result, self.spider)) - self.assertEquals(out, result) + self.assertEqual(out, result) rdc = self.stats.get_value('request_depth_count/1', spider=self.spider) - self.assertEquals(rdc, 1) + self.assertEqual(rdc, 1) req.meta['depth'] = 1 out2 = list(self.mw.process_spider_output(resp, result, self.spider)) - self.assertEquals(out2, []) + self.assertEqual(out2, []) rdm = self.stats.get_value('request_depth_max', spider=self.spider) - 
self.assertEquals(rdm, 1) + self.assertEqual(rdm, 1) def tearDown(self): self.stats.close_spider(self.spider, '') diff -Nru python-scrapy-1.4.0/tests/test_spidermiddleware_httperror.py python-scrapy-1.5.0/tests/test_spidermiddleware_httperror.py --- python-scrapy-1.4.0/tests/test_spidermiddleware_httperror.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_spidermiddleware_httperror.py 2017-12-29 21:09:52.000000000 +0000 @@ -67,16 +67,16 @@ self.res200, self.res404 = _responses(self.req, [200, 404]) def test_process_spider_input(self): - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(self.res200, self.spider)) self.assertRaises(HttpError, self.mw.process_spider_input, self.res404, self.spider) def test_process_spider_exception(self): - self.assertEquals([], + self.assertEqual([], self.mw.process_spider_exception(self.res404, HttpError(self.res404), self.spider)) - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_exception(self.res404, Exception(), self.spider)) @@ -84,11 +84,11 @@ res = self.res404.copy() res.request = Request('http://scrapytest.org', meta={'handle_httpstatus_list': [404]}) - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(res, self.spider)) self.spider.handle_httpstatus_list = [404] - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(self.res404, self.spider)) @@ -102,11 +102,11 @@ self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402]) def test_process_spider_input(self): - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(self.res200, self.spider)) self.assertRaises(HttpError, self.mw.process_spider_input, self.res404, self.spider) - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(self.res402, self.spider)) def test_meta_overrides_settings(self): @@ -117,14 +117,14 @@ res402 = self.res402.copy() res402.request = request - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(res404, self.spider)) self.assertRaises(HttpError, self.mw.process_spider_input, res402, self.spider) def test_spider_override_settings(self): self.spider.handle_httpstatus_list = [404] - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(self.res404, self.spider)) self.assertRaises(HttpError, self.mw.process_spider_input, self.res402, self.spider) @@ -139,9 +139,9 @@ self.res200, self.res404, self.res402 = _responses(self.req, [200, 404, 402]) def test_process_spider_input(self): - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(self.res200, self.spider)) - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(self.res404, self.spider)) def test_meta_overrides_settings(self): @@ -152,7 +152,7 @@ res402 = self.res402.copy() res402.request = request - self.assertEquals(None, + self.assertEqual(None, self.mw.process_spider_input(res404, self.spider)) self.assertRaises(HttpError, self.mw.process_spider_input, res402, self.spider) diff -Nru python-scrapy-1.4.0/tests/test_spidermiddleware_offsite.py python-scrapy-1.5.0/tests/test_spidermiddleware_offsite.py --- python-scrapy-1.4.0/tests/test_spidermiddleware_offsite.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_spidermiddleware_offsite.py 2017-12-29 21:09:52.000000000 +0000 @@ -5,7 +5,9 @@ from scrapy.http import Response, Request from scrapy.spiders import Spider from scrapy.spidermiddlewares.offsite import 
OffsiteMiddleware +from scrapy.spidermiddlewares.offsite import URLWarning from scrapy.utils.test import get_crawler +import warnings class TestOffsiteMiddleware(TestCase): @@ -37,7 +39,7 @@ reqs = onsite_reqs + offsite_reqs out = list(self.mw.process_spider_output(res, reqs, self.spider)) - self.assertEquals(out, onsite_reqs) + self.assertEqual(out, onsite_reqs) class TestOffsiteMiddleware2(TestOffsiteMiddleware): @@ -49,7 +51,7 @@ res = Response('http://scrapytest.org') reqs = [Request('http://a.com/b.html'), Request('http://b.com/1')] out = list(self.mw.process_spider_output(res, reqs, self.spider)) - self.assertEquals(out, reqs) + self.assertEqual(out, reqs) class TestOffsiteMiddleware3(TestOffsiteMiddleware2): @@ -67,4 +69,14 @@ res = Response('http://scrapytest.org') reqs = [Request('http://scrapytest.org/1')] out = list(self.mw.process_spider_output(res, reqs, self.spider)) - self.assertEquals(out, reqs) + self.assertEqual(out, reqs) + + +class TestOffsiteMiddleware5(TestOffsiteMiddleware4): + + def test_get_host_regex(self): + self.spider.allowed_domains = ['http://scrapytest.org', 'scrapy.org', 'scrapy.test.org'] + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + self.mw.get_host_regex(self.spider) + assert issubclass(w[-1].category, URLWarning) diff -Nru python-scrapy-1.4.0/tests/test_spidermiddleware_referer.py python-scrapy-1.5.0/tests/test_spidermiddleware_referer.py --- python-scrapy-1.4.0/tests/test_spidermiddleware_referer.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_spidermiddleware_referer.py 2017-12-29 21:09:52.000000000 +0000 @@ -45,7 +45,7 @@ response = self.get_response(origin) request = self.get_request(target) out = list(self.mw.process_spider_output(response, [request], self.spider)) - self.assertEquals(out[0].headers.get('Referer'), referrer) + self.assertEqual(out[0].headers.get('Referer'), referrer) class MixinDefault(object): @@ -490,7 +490,7 @@ ]: settings = Settings({'REFERRER_POLICY': s}) mw = RefererMiddleware(settings) - self.assertEquals(mw.default_policy, p) + self.assertEqual(mw.default_policy, p) def test_valid_name_casevariants(self): for s, p in [ @@ -506,7 +506,7 @@ ]: settings = Settings({'REFERRER_POLICY': s.upper()}) mw = RefererMiddleware(settings) - self.assertEquals(mw.default_policy, p) + self.assertEqual(mw.default_policy, p) def test_invalid_name(self): settings = Settings({'REFERRER_POLICY': 'some-custom-unknown-policy'}) @@ -581,7 +581,7 @@ request = self.get_request(target) out = list(self.referrermw.process_spider_output(response, [request], self.spider)) - self.assertEquals(out[0].headers.get('Referer'), init_referrer) + self.assertEqual(out[0].headers.get('Referer'), init_referrer) for status, url in redirections: response = Response(request.url, headers={'Location': url}, status=status) @@ -589,7 +589,7 @@ self.referrermw.request_scheduled(request, self.spider) assert isinstance(request, Request) - self.assertEquals(request.headers.get('Referer'), final_referrer) + self.assertEqual(request.headers.get('Referer'), final_referrer) class TestReferrerOnRedirectNoReferrer(TestReferrerOnRedirect): diff -Nru python-scrapy-1.4.0/tests/test_spidermiddleware_urllength.py python-scrapy-1.5.0/tests/test_spidermiddleware_urllength.py --- python-scrapy-1.4.0/tests/test_spidermiddleware_urllength.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_spidermiddleware_urllength.py 2017-12-29 21:09:52.000000000 +0000 @@ -17,5 +17,5 @@ mw = 
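The new TestOffsiteMiddleware5 above covers the URLWarning emitted when an allowed_domains entry looks like a URL rather than a bare host name. A sketch of the intended configuration (names and domains are examples only):

    import scrapy

    class DomainsDemoSpider(scrapy.Spider):
        name = "domains_demo"                             # placeholder name
        # Correct: bare host names only.  An entry such as
        # "http://scrapytest.org" makes OffsiteMiddleware.get_host_regex
        # emit a URLWarning in Scrapy 1.5.
        allowed_domains = ["scrapytest.org", "scrapy.org"]
        start_urls = ["http://scrapytest.org/"]

        def parse(self, response):
            yield {"url": response.url}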
UrlLengthMiddleware(maxlength=25) spider = Spider('foo') out = list(mw.process_spider_output(res, reqs, spider)) - self.assertEquals(out, [short_url_req]) + self.assertEqual(out, [short_url_req]) diff -Nru python-scrapy-1.4.0/tests/test_spider.py python-scrapy-1.5.0/tests/test_spider.py --- python-scrapy-1.4.0/tests/test_spider.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_spider.py 2017-12-29 21:09:52.000000000 +0000 @@ -207,7 +207,7 @@ output = list(spider._requests_to_follow(response)) self.assertEqual(len(output), 3) self.assertTrue(all(map(lambda r: isinstance(r, Request), output))) - self.assertEquals([r.url for r in output], + self.assertEqual([r.url for r in output], ['http://example.org/somepage/item/12.html', 'http://example.org/about.html', 'http://example.org/nofollow.html']) @@ -234,7 +234,7 @@ output = list(spider._requests_to_follow(response)) self.assertEqual(len(output), 2) self.assertTrue(all(map(lambda r: isinstance(r, Request), output))) - self.assertEquals([r.url for r in output], + self.assertEqual([r.url for r in output], ['http://example.org/somepage/item/12.html', 'http://example.org/about.html']) @@ -258,7 +258,7 @@ output = list(spider._requests_to_follow(response)) self.assertEqual(len(output), 3) self.assertTrue(all(map(lambda r: isinstance(r, Request), output))) - self.assertEquals([r.url for r in output], + self.assertEqual([r.url for r in output], ['http://example.org/somepage/item/12.html', 'http://example.org/about.html', 'http://example.org/nofollow.html']) @@ -348,6 +348,33 @@ 'http://example.com/sitemap-uppercase.xml', 'http://www.example.com/sitemap-relative-url.xml']) + def test_alternate_url_locs(self): + sitemap = b""" + + + http://www.example.com/english/ + + + + + + """ + r = TextResponse(url="http://www.example.com/sitemap.xml", body=sitemap) + spider = self.spider_class("example.com") + self.assertEqual([req.url for req in spider._parse_sitemap(r)], + ['http://www.example.com/english/']) + + spider.sitemap_alternate_links = True + self.assertEqual([req.url for req in spider._parse_sitemap(r)], + ['http://www.example.com/english/', + 'http://www.example.com/deutsch/', + 'http://www.example.com/schweiz-deutsch/', + 'http://www.example.com/italiano/']) + class DeprecationTest(unittest.TestCase): @@ -429,3 +456,17 @@ self.assertEqual(len(requests), 1) self.assertEqual(requests[0].url, 'http://example.com/foo') self.assertEqual(len(w), 1) + + +class NoParseMethodSpiderTest(unittest.TestCase): + + spider_class = Spider + + def test_undefined_parse_method(self): + spider = self.spider_class('example.com') + text = b'Random text' + resp = TextResponse(url="http://www.example.com/random_url", body=text) + + exc_msg = 'Spider.parse callback is not defined' + with self.assertRaisesRegexp(NotImplementedError, exc_msg): + spider.parse(resp) diff -Nru python-scrapy-1.4.0/tests/test_squeues.py python-scrapy-1.5.0/tests/test_squeues.py --- python-scrapy-1.4.0/tests/test_squeues.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_squeues.py 2017-12-29 21:09:52.000000000 +0000 @@ -1,3 +1,5 @@ +import pickle + from queuelib.tests import test_queue as t from scrapy.squeues import MarshalFifoDiskQueue, MarshalLifoDiskQueue, PickleFifoDiskQueue, PickleLifoDiskQueue from scrapy.item import Item, Field @@ -14,6 +16,22 @@ default_item_class = TestItem name_out = staticmethod(_test_procesor) +def nonserializable_object_test(self): + try: + pickle.dumps(lambda x: x) + except Exception: + # Trigger Twisted bug #7989 + 
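test_alternate_url_locs above checks SitemapSpider's handling of xhtml:link alternate entries. A sketch of the corresponding spider configuration (URLs are placeholders):

    from scrapy.spiders import SitemapSpider

    class MultiLanguageSpider(SitemapSpider):
        name = "multilang"                                  # placeholder
        sitemap_urls = ["http://www.example.com/sitemap.xml"]
        # When True, <xhtml:link rel="alternate" href="..."> URLs inside a
        # sitemap <url> entry are requested in addition to the <loc> URL
        # (the default remains False).
        sitemap_alternate_links = True

        def parse(self, response):
            yield {"url": response.url}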
import twisted.persisted.styles # NOQA + q = self.queue() + self.assertRaises(ValueError, q.push, lambda x: x) + else: + # Use a different unpickleable object + class A(object): pass + a = A() + a.__reduce__ = a.__reduce_ex__ = None + q = self.queue() + self.assertRaises(ValueError, q.push, a) + class MarshalFifoDiskQueueTest(t.FifoDiskQueueTest): chunksize = 100000 @@ -30,11 +48,7 @@ self.assertEqual(q.pop(), 123) self.assertEqual(q.pop(), {'a': 'dict'}) - def test_nonserializable_object(self): - # Trigger Twisted bug #7989 - import twisted.persisted.styles # NOQA - q = self.queue() - self.assertRaises(ValueError, q.push, lambda x: x) + test_nonserializable_object = nonserializable_object_test class ChunkSize1MarshalFifoDiskQueueTest(MarshalFifoDiskQueueTest): chunksize = 1 @@ -110,11 +124,7 @@ self.assertEqual(q.pop(), 123) self.assertEqual(q.pop(), 'a') - def test_nonserializable_object(self): - # Trigger Twisted bug #7989 - import twisted.persisted.styles # NOQA - q = self.queue() - self.assertRaises(ValueError, q.push, lambda x: x) + test_nonserializable_object = nonserializable_object_test class PickleLifoDiskQueueTest(MarshalLifoDiskQueueTest): diff -Nru python-scrapy-1.4.0/tests/test_urlparse_monkeypatches.py python-scrapy-1.5.0/tests/test_urlparse_monkeypatches.py --- python-scrapy-1.4.0/tests/test_urlparse_monkeypatches.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_urlparse_monkeypatches.py 2017-12-29 21:09:52.000000000 +0000 @@ -6,7 +6,7 @@ def test_s3_url(self): p = urlparse('s3://bucket/key/name?param=value') - self.assertEquals(p.scheme, 's3') - self.assertEquals(p.hostname, 'bucket') - self.assertEquals(p.path, '/key/name') - self.assertEquals(p.query, 'param=value') + self.assertEqual(p.scheme, 's3') + self.assertEqual(p.hostname, 'bucket') + self.assertEqual(p.path, '/key/name') + self.assertEqual(p.query, 'param=value') diff -Nru python-scrapy-1.4.0/tests/test_utils_datatypes.py python-scrapy-1.5.0/tests/test_utils_datatypes.py --- python-scrapy-1.4.0/tests/test_utils_datatypes.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_utils_datatypes.py 2017-12-29 21:09:52.000000000 +0000 @@ -202,22 +202,22 @@ seq = range(10, 20, 3) d = SequenceExclude(seq) are_not_in = [v for v in range(10, 20, 3) if v in d] - self.assertEquals([], are_not_in) + self.assertEqual([], are_not_in) are_not_in = [v for v in range(10, 20) if v in d] - self.assertEquals([11, 12, 14, 15, 17, 18], are_not_in) + self.assertEqual([11, 12, 14, 15, 17, 18], are_not_in) def test_string_seq(self): seq = "cde" d = SequenceExclude(seq) chars = "".join(v for v in "abcdefg" if v in d) - self.assertEquals("abfg", chars) + self.assertEqual("abfg", chars) def test_stringset_seq(self): seq = set("cde") d = SequenceExclude(seq) chars = "".join(v for v in "abcdefg" if v in d) - self.assertEquals("abfg", chars) + self.assertEqual("abfg", chars) def test_set(self): """Anything that is not in the supplied sequence will evaluate as 'in' the container.""" diff -Nru python-scrapy-1.4.0/tests/test_utils_defer.py python-scrapy-1.5.0/tests/test_utils_defer.py --- python-scrapy-1.4.0/tests/test_utils_defer.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_utils_defer.py 2017-12-29 21:09:52.000000000 +0000 @@ -89,7 +89,7 @@ errors = [] out = list(iter_errback(itergood(), errors.append)) self.assertEqual(out, list(range(10))) - self.failIf(errors) + self.assertFalse(errors) def test_iter_errback_bad(self): def iterbad(): diff -Nru 
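The shared nonserializable_object_test above tolerates interpreters where Twisted bug #7989 makes lambdas picklable. A standalone sketch of the behaviour the disk queues are expected to show; safe_push is a stand-in, not the actual scrapy.squeues code:

    import pickle

    def safe_push(obj):
        # Stand-in for how the disk queues surface serialization failures:
        # any pickling error is converted into ValueError.
        try:
            return pickle.dumps(obj, protocol=2)
        except Exception as exc:
            raise ValueError("unserializable object: {!r}".format(exc))

    try:
        safe_push(lambda x: x)      # plain lambdas are normally not picklable
    except ValueError as err:
        print("rejected:", err)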
python-scrapy-1.4.0/tests/test_utils_iterators.py python-scrapy-1.5.0/tests/test_utils_iterators.py --- python-scrapy-1.4.0/tests/test_utils_iterators.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_utils_iterators.py 2017-12-29 21:09:52.000000000 +0000 @@ -252,8 +252,8 @@ # explicit type check cuz' we no like stinkin' autocasting! yarrr for result_row in result: - self.assert_(all((isinstance(k, six.text_type) for k in result_row.keys()))) - self.assert_(all((isinstance(v, six.text_type) for v in result_row.values()))) + self.assertTrue(all((isinstance(k, six.text_type) for k in result_row.keys()))) + self.assertTrue(all((isinstance(v, six.text_type) for v in result_row.values()))) def test_csviter_delimiter(self): body = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t') diff -Nru python-scrapy-1.4.0/tests/test_utils_misc/__init__.py python-scrapy-1.5.0/tests/test_utils_misc/__init__.py --- python-scrapy-1.4.0/tests/test_utils_misc/__init__.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_utils_misc/__init__.py 2017-12-29 21:09:52.000000000 +0000 @@ -23,20 +23,20 @@ 'tests.test_utils_misc.test_walk_modules.mod.mod0', 'tests.test_utils_misc.test_walk_modules.mod1', ] - self.assertEquals(set([m.__name__ for m in mods]), set(expected)) + self.assertEqual(set([m.__name__ for m in mods]), set(expected)) mods = walk_modules('tests.test_utils_misc.test_walk_modules.mod') expected = [ 'tests.test_utils_misc.test_walk_modules.mod', 'tests.test_utils_misc.test_walk_modules.mod.mod0', ] - self.assertEquals(set([m.__name__ for m in mods]), set(expected)) + self.assertEqual(set([m.__name__ for m in mods]), set(expected)) mods = walk_modules('tests.test_utils_misc.test_walk_modules.mod1') expected = [ 'tests.test_utils_misc.test_walk_modules.mod1', ] - self.assertEquals(set([m.__name__ for m in mods]), set(expected)) + self.assertEqual(set([m.__name__ for m in mods]), set(expected)) self.assertRaises(ImportError, walk_modules, 'nomodule999') @@ -51,7 +51,7 @@ 'testegg.spiders.b', 'testegg' ] - self.assertEquals(set([m.__name__ for m in mods]), set(expected)) + self.assertEqual(set([m.__name__ for m in mods]), set(expected)) finally: sys.path.remove(egg) diff -Nru python-scrapy-1.4.0/tests/test_utils_project.py python-scrapy-1.5.0/tests/test_utils_project.py --- python-scrapy-1.4.0/tests/test_utils_project.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_utils_project.py 2017-12-29 21:09:52.000000000 +0000 @@ -25,14 +25,14 @@ class ProjectUtilsTest(unittest.TestCase): def test_data_path_outside_project(self): - self.assertEquals('.scrapy/somepath', data_path('somepath')) - self.assertEquals('/absolute/path', data_path('/absolute/path')) + self.assertEqual('.scrapy/somepath', data_path('somepath')) + self.assertEqual('/absolute/path', data_path('/absolute/path')) def test_data_path_inside_project(self): with inside_a_project() as proj_path: expected = os.path.join(proj_path, '.scrapy', 'somepath') - self.assertEquals( + self.assertEqual( os.path.realpath(expected), os.path.realpath(data_path('somepath')) ) - self.assertEquals('/absolute/path', data_path('/absolute/path')) + self.assertEqual('/absolute/path', data_path('/absolute/path')) diff -Nru python-scrapy-1.4.0/tests/test_utils_python.py python-scrapy-1.5.0/tests/test_utils_python.py --- python-scrapy-1.4.0/tests/test_utils_python.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_utils_python.py 2017-12-29 21:09:52.000000000 +0000 @@ -1,7 
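walk_modules and data_path, exercised above, are small utilities; roughly, under Scrapy's public helpers:

    from scrapy.utils.misc import walk_modules
    from scrapy.utils.project import data_path

    # walk_modules imports a package and every submodule beneath it and
    # returns the module objects (used e.g. for spider discovery).
    mods = walk_modules("scrapy.utils")
    print(sorted(m.__name__ for m in mods)[:5])

    # data_path resolves relative paths against the .scrapy/ data directory
    # and leaves absolute paths untouched.
    print(data_path("somepath"))
    print(data_path("/absolute/path"))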
+1,9 @@ +import gc import functools import operator import unittest from itertools import count +import platform import six from scrapy.utils.python import ( @@ -95,9 +97,9 @@ a = Obj() b = Obj() # no attributes given return False - self.failIf(equal_attributes(a, b, [])) + self.assertFalse(equal_attributes(a, b, [])) # not existent attributes - self.failIf(equal_attributes(a, b, ['x', 'y'])) + self.assertFalse(equal_attributes(a, b, ['x', 'y'])) a.x = 1 b.x = 1 @@ -106,7 +108,7 @@ b.y = 2 # obj1 has no attribute y - self.failIf(equal_attributes(a, b, ['x', 'y'])) + self.assertFalse(equal_attributes(a, b, ['x', 'y'])) a.y = 2 # equal attributes @@ -114,7 +116,7 @@ a.y = 1 # differente attributes - self.failIf(equal_attributes(a, b, ['x', 'y'])) + self.assertFalse(equal_attributes(a, b, ['x', 'y'])) # test callable a.meta = {} @@ -132,7 +134,7 @@ self.assertTrue(equal_attributes(a, b, [compare_z, 'x'])) # fail z equality a.meta['z'] = 2 - self.failIf(equal_attributes(a, b, [compare_z, 'x'])) + self.assertFalse(equal_attributes(a, b, [compare_z, 'x'])) def test_weakkeycache(self): class _Weakme(object): pass @@ -144,6 +146,9 @@ self.assertNotEqual(v, wk[_Weakme()]) self.assertEqual(v, wk[k]) del k + for _ in range(100): + if wk._weakdict: + gc.collect() self.assertFalse(len(wk._weakdict)) @unittest.skipUnless(six.PY2, "deprecated function") @@ -151,9 +156,9 @@ d = {'a': 123, u'b': b'c', u'd': u'e', object(): u'e'} d2 = stringify_dict(d, keys_only=False) self.assertEqual(d, d2) - self.failIf(d is d2) # shouldn't modify in place - self.failIf(any(isinstance(x, six.text_type) for x in d2.keys())) - self.failIf(any(isinstance(x, six.text_type) for x in d2.values())) + self.assertIsNot(d, d2) # shouldn't modify in place + self.assertFalse(any(isinstance(x, six.text_type) for x in d2.keys())) + self.assertFalse(any(isinstance(x, six.text_type) for x in d2.values())) @unittest.skipUnless(six.PY2, "deprecated function") def test_stringify_dict_tuples(self): @@ -161,17 +166,17 @@ d = dict(tuples) d2 = stringify_dict(tuples, keys_only=False) self.assertEqual(d, d2) - self.failIf(d is d2) # shouldn't modify in place - self.failIf(any(isinstance(x, six.text_type) for x in d2.keys()), d2.keys()) - self.failIf(any(isinstance(x, six.text_type) for x in d2.values())) + self.assertIsNot(d, d2) # shouldn't modify in place + self.assertFalse(any(isinstance(x, six.text_type) for x in d2.keys()), d2.keys()) + self.assertFalse(any(isinstance(x, six.text_type) for x in d2.values())) @unittest.skipUnless(six.PY2, "deprecated function") def test_stringify_dict_keys_only(self): d = {'a': 123, u'b': 'c', u'd': u'e', object(): u'e'} d2 = stringify_dict(d) self.assertEqual(d, d2) - self.failIf(d is d2) # shouldn't modify in place - self.failIf(any(isinstance(x, six.text_type) for x in d2.keys())) + self.assertIsNot(d, d2) # shouldn't modify in place + self.assertFalse(any(isinstance(x, six.text_type) for x in d2.keys())) def test_get_func_args(self): def f1(a, b, c): @@ -208,10 +213,19 @@ self.assertEqual(get_func_args(cal), ['a', 'b', 'c']) self.assertEqual(get_func_args(object), []) - # TODO: how do we fix this to return the actual argument names? - self.assertEqual(get_func_args(six.text_type.split), []) - self.assertEqual(get_func_args(" ".join), []) - self.assertEqual(get_func_args(operator.itemgetter(2)), []) + if platform.python_implementation() == 'CPython': + # TODO: how do we fix this to return the actual argument names? 
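The platform branch above reflects that PyPy can introspect some builtins that CPython cannot. In isolation, get_func_args behaves like this (a sketch with a hypothetical function):

    from scrapy.utils.python import get_func_args

    def parse_item(self, response, extra=None):
        pass

    # Argument names of a callable; stripself=True drops the leading "self",
    # which matters when inspecting methods.
    print(get_func_args(parse_item))                   # ['self', 'response', 'extra']
    print(get_func_args(parse_item, stripself=True))   # ['response', 'extra']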
+ self.assertEqual(get_func_args(six.text_type.split), []) + self.assertEqual(get_func_args(" ".join), []) + self.assertEqual(get_func_args(operator.itemgetter(2)), []) + else: + stripself = not six.PY2 # PyPy3 exposes them as methods + self.assertEqual( + get_func_args(six.text_type.split, stripself), ['sep', 'maxsplit']) + self.assertEqual(get_func_args(" ".join, stripself), ['list']) + self.assertEqual( + get_func_args(operator.itemgetter(2), stripself), ['obj']) + def test_without_none_values(self): self.assertEqual(without_none_values([1, None, 3, 4]), [1, 3, 4]) diff -Nru python-scrapy-1.4.0/tests/test_utils_reqser.py python-scrapy-1.5.0/tests/test_utils_reqser.py --- python-scrapy-1.4.0/tests/test_utils_reqser.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_utils_reqser.py 2017-12-29 21:09:52.000000000 +0000 @@ -17,8 +17,8 @@ def test_all_attributes(self): r = Request("http://www.example.com", - callback='parse_item', - errback='handle_error', + callback=self.spider.parse_item, + errback=self.spider.handle_error, method="POST", body=b"some body", headers={'content-encoding': 'text/html; charset=latin-1'}, @@ -27,7 +27,7 @@ priority=20, meta={'a': 'b'}, flags=['testFlag']) - self._assert_serializes_ok(r) + self._assert_serializes_ok(r, spider=self.spider) def test_latin1_body(self): r = Request("http://www.example.com", body=b"\xa3") diff -Nru python-scrapy-1.4.0/tests/test_utils_signal.py python-scrapy-1.5.0/tests/test_utils_signal.py --- python-scrapy-1.4.0/tests/test_utils_signal.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_utils_signal.py 2017-12-29 21:09:52.000000000 +0000 @@ -29,7 +29,7 @@ self.assertIn('error_handler', record.getMessage()) self.assertEqual(record.levelname, 'ERROR') self.assertEqual(result[0][0], self.error_handler) - self.assert_(isinstance(result[0][1], Failure)) + self.assertIsInstance(result[0][1], Failure) self.assertEqual(result[1], (self.ok_handler, "OK")) dispatcher.disconnect(self.error_handler, signal=test_signal) diff -Nru python-scrapy-1.4.0/tests/test_webclient.py python-scrapy-1.5.0/tests/test_webclient.py --- python-scrapy-1.4.0/tests/test_webclient.py 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tests/test_webclient.py 2017-12-29 21:09:52.000000000 +0000 @@ -71,7 +71,7 @@ for url, test in tests: test = tuple( to_bytes(x) if not isinstance(x, int) else x for x in test) - self.assertEquals(client._parse(url), test, url) + self.assertEqual(client._parse(url), test, url) def test_externalUnicodeInterference(self): """ @@ -258,16 +258,16 @@ def testPayload(self): s = "0123456789" * 10 return getPage(self.getURL("payload"), body=s).addCallback( - self.assertEquals, to_bytes(s)) + self.assertEqual, to_bytes(s)) def testHostHeader(self): # if we pass Host header explicitly, it should be used, otherwise # it should extract from url return defer.gatherResults([ getPage(self.getURL("host")).addCallback( - self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno)), + self.assertEqual, to_bytes("127.0.0.1:%d" % self.portno)), getPage(self.getURL("host"), headers={"Host": "www.example.com"}).addCallback( - self.assertEquals, to_bytes("www.example.com"))]) + self.assertEqual, to_bytes("www.example.com"))]) def test_getPage(self): """ @@ -275,7 +275,7 @@ the body of the response if the default method B{GET} is used. 
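The reqser change above passes real bound methods as callback and errback, because Scrapy serializes callbacks by name and resolves them against a spider. A sketch of the round trip (spider and callback names are placeholders):

    import scrapy
    from scrapy.utils.reqser import request_to_dict, request_from_dict

    class ReqserDemoSpider(scrapy.Spider):
        name = "reqser_demo"                            # placeholder

        def parse_item(self, response):
            pass

    spider = ReqserDemoSpider()
    req = scrapy.Request("http://www.example.com",
                         callback=spider.parse_item, priority=20)

    # Callbacks are stored by method name, so the same spider has to be
    # supplied again when the request is rebuilt from the dict.
    d = request_to_dict(req, spider=spider)
    restored = request_from_dict(d, spider=spider)
    assert restored.callback == spider.parse_item
    assert restored.priority == 20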
""" d = getPage(self.getURL("file")) - d.addCallback(self.assertEquals, b"0123456789") + d.addCallback(self.assertEqual, b"0123456789") return d def test_getPageHead(self): @@ -298,7 +298,7 @@ """ d = getPage(self.getURL("host"), timeout=100) d.addCallback( - self.assertEquals, to_bytes("127.0.0.1:%d" % self.portno)) + self.assertEqual, to_bytes("127.0.0.1:%d" % self.portno)) return d def test_timeoutTriggering(self): @@ -326,7 +326,7 @@ return getPage(self.getURL('notsuchfile')).addCallback(self._cbNoSuchFile) def _cbNoSuchFile(self, pageData): - self.assert_(b'404 - No Such Resource' in pageData) + self.assertIn(b'404 - No Such Resource', pageData) def testFactoryInfo(self): url = self.getURL('file') @@ -336,16 +336,16 @@ return factory.deferred.addCallback(self._cbFactoryInfo, factory) def _cbFactoryInfo(self, ignoredResult, factory): - self.assertEquals(factory.status, b'200') - self.assert_(factory.version.startswith(b'HTTP/')) - self.assertEquals(factory.message, b'OK') - self.assertEquals(factory.response_headers[b'content-length'], b'10') + self.assertEqual(factory.status, b'200') + self.assertTrue(factory.version.startswith(b'HTTP/')) + self.assertEqual(factory.message, b'OK') + self.assertEqual(factory.response_headers[b'content-length'], b'10') def testRedirect(self): return getPage(self.getURL("redirect")).addCallback(self._cbRedirect) def _cbRedirect(self, pageData): - self.assertEquals(pageData, + self.assertEqual(pageData, b'\n\n \n \n' b' \n \n ' b'
click here\n \n\n') @@ -360,6 +360,6 @@ def _check_Encoding(self, response, original_body): content_encoding = to_unicode(response.headers[b'Content-Encoding']) - self.assertEquals(content_encoding, EncodingResource.out_encoding) - self.assertEquals( + self.assertEqual(content_encoding, EncodingResource.out_encoding) + self.assertEqual( response.body.decode(content_encoding), to_unicode(original_body)) diff -Nru python-scrapy-1.4.0/tox.ini python-scrapy-1.5.0/tox.ini --- python-scrapy-1.4.0/tox.ini 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/tox.ini 2017-12-29 21:09:52.000000000 +0000 @@ -1,4 +1,4 @@ -# Tox (http://tox.testrun.org/) is a tool for running tests +# Tox (https://tox.readthedocs.io/) is a tool for running tests # in multiple virtualenvs. This configuration file will run the # test suite on all supported python versions. To use it, "pip install tox" # and then run "tox" from this directory. @@ -11,6 +11,7 @@ -rrequirements.txt # Extras botocore + google-cloud-storage Pillow != 3.0.0 leveldb -rtests/requirements.txt @@ -18,6 +19,8 @@ S3_TEST_FILE_URI AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY + GCS_TEST_FILE_URI + GCS_PROJECT_ID commands = py.test --cov=scrapy --cov-report= {posargs:scrapy tests} @@ -38,6 +41,7 @@ # https://packages.debian.org/en/jessie/zope/ basepython = python2.7 deps = + cryptography==0.6.1 pyOpenSSL==0.14 lxml==3.4.0 Twisted==14.0.2 @@ -59,25 +63,27 @@ commands = py.test {posargs:scrapy tests} -[testenv:py33] -basepython = python3.3 +[testenv:py34] +basepython = python3.4 deps = -rrequirements-py3.txt # Extras Pillow -rtests/requirements-py3.txt -[testenv:py34] -basepython = python3.4 -deps = {[testenv:py33]deps} - [testenv:py35] basepython = python3.5 -deps = {[testenv:py33]deps} +deps = {[testenv:py34]deps} [testenv:py36] basepython = python3.6 -deps = {[testenv:py33]deps} +deps = {[testenv:py34]deps} + +[testenv:pypy3] +basepython = pypy3 +deps = {[testenv:py34]deps} +commands = + py.test {posargs:scrapy tests} [docs] changedir = docs diff -Nru python-scrapy-1.4.0/.travis.yml python-scrapy-1.5.0/.travis.yml --- python-scrapy-1.4.0/.travis.yml 2017-05-18 21:01:05.000000000 +0000 +++ python-scrapy-1.5.0/.travis.yml 2017-12-29 21:09:52.000000000 +0000 @@ -11,32 +11,32 @@ env: TOXENV=py27 - python: 2.7 env: TOXENV=jessie - - python: 3.3 - env: TOXENV=py33 + - python: 2.7 + env: TOXENV=pypy + - python: 2.7 + env: TOXENV=pypy3 + - python: 3.4 + env: TOXENV=py34 - python: 3.5 env: TOXENV=py35 - python: 3.6 env: TOXENV=py36 - - python: 2.7 - env: TOXENV=pypy - python: 3.6 env: TOXENV=docs - allow_failures: - - python: 2.7 - env: TOXENV=pypy install: - | if [ "$TOXENV" = "pypy" ]; then - export PYENV_ROOT="$HOME/.pyenv" - if [ -f "$PYENV_ROOT/bin/pyenv" ]; then - pushd "$PYENV_ROOT" && git pull && popd - else - rm -rf "$PYENV_ROOT" && git clone --depth 1 https://github.com/yyuu/pyenv.git "$PYENV_ROOT" - fi - # get latest portable PyPy from pyenv directly (thanks to natural version sort option -V) - export PYPY_VERSION=`"$PYENV_ROOT/bin/pyenv" install --list |grep -o -E 'pypy-portable-[0-9][\.0-9]*$' |sort -V |tail -1` - "$PYENV_ROOT/bin/pyenv" install --skip-existing "$PYPY_VERSION" - virtualenv --python="$PYENV_ROOT/versions/$PYPY_VERSION/bin/python" "$HOME/virtualenvs/$PYPY_VERSION" + export PYPY_VERSION="pypy-5.9-linux_x86_64-portable" + wget "https://bitbucket.org/squeaky/portable-pypy/downloads/${PYPY_VERSION}.tar.bz2" + tar -jxf ${PYPY_VERSION}.tar.bz2 + virtualenv --python="$PYPY_VERSION/bin/pypy" "$HOME/virtualenvs/$PYPY_VERSION" + source 
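tox now installs google-cloud-storage and passes GCS_TEST_FILE_URI and GCS_PROJECT_ID through, matching the GCSFilesStore assertions at the top of this section. A sketch of enabling the new backend in a project's settings (bucket and project IDs are placeholders, and application-default credentials are assumed):

    # settings.py-style snippet; values are placeholders.
    ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
    FILES_STORE = "gs://my-example-bucket/files/"
    GCS_PROJECT_ID = "my-example-project"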
"$HOME/virtualenvs/$PYPY_VERSION/bin/activate" + fi + if [ "$TOXENV" = "pypy3" ]; then + export PYPY_VERSION="pypy3.5-5.9-beta-linux_x86_64-portable" + wget "https://bitbucket.org/squeaky/portable-pypy/downloads/${PYPY_VERSION}.tar.bz2" + tar -jxf ${PYPY_VERSION}.tar.bz2 + virtualenv --python="$PYPY_VERSION/bin/pypy3" "$HOME/virtualenvs/$PYPY_VERSION" source "$HOME/virtualenvs/$PYPY_VERSION/bin/activate" fi - pip install -U tox twine wheel codecov