diff -Nru zim-tools-2.0.0/ChangeLog zim-tools-2.1.0/ChangeLog --- zim-tools-2.0.0/ChangeLog 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/ChangeLog 2020-11-17 16:00:36.000000000 +0000 @@ -1,3 +1,29 @@ +zim-tools 2.1.0 +=============== + + * Add small description text to help option (@kelson42) + * zimcheck: Better error reporting (@MiguelRocha, @kelson42) + * zimwriterfs: Update font mimetype (@kelson42) + * zimcheck: Fix various crashes (@MiguelRocha) + * ci: Add building of deb package and publication on ppa (@legoktm) + * Add unit tests (@MiguelRocha, @asashnov) + * zimcheck: Handle `data:` urls correctly (@kelson42) + * zimcheck: Add test of empty links (@MiguelRocha) + * readme: Add link to packages links (repology) (@hashworks) + * zimwriterfs & zimrecreate: Add an option to write ZIM files using Zstandard + compression (default lzma) (@kelson42) + * zimcheck: Performance improvement (@MiguelRocha, @veloman-yunkan) + * zimwriterfs: Create zim redirect entry when file is actually a symlink + (@asashnov) + * zimcheck: Add a progress option (@kelson42) + * zimsplit: Use docopt to parse command line option (@MiguelRocha) + * zimwriterfs: Add a metadata `M/Scraper` with the name (zimwriterfs) and + version of the scraper. 
(@asashnov) + * zimwriterfs: Do not allow to create a zim file inside the HTML source + directory (@asashnov) + * zimcheck: Add internal integrity check using libzim `validate` + (@veloman-yunkan) + zim-tools 2.0.0 =============== diff -Nru zim-tools-2.0.0/.codecov.yml zim-tools-2.1.0/.codecov.yml --- zim-tools-2.0.0/.codecov.yml 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/.codecov.yml 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,16 @@ +codecov: + notify: + require_ci_to_pass: yes + +coverage: + status: + project: + default: + threshold: 5% + patch: + default: + target: 80% + threshold: 0% + +ignore: + - "test" diff -Nru zim-tools-2.0.0/debian/changelog zim-tools-2.1.0/debian/changelog --- zim-tools-2.0.0/debian/changelog 2020-07-28 16:21:29.000000000 +0000 +++ zim-tools-2.1.0/debian/changelog 2020-11-17 21:15:46.000000000 +0000 @@ -1,8 +1,9 @@ -zim-tools (2.0.0-2build1) groovy; urgency=medium +zim-tools (2.1.0-1) unstable; urgency=medium - * No-change rebuild against libicu67 + * New upstream version 2.1.0 + * Run tests at build time with gtest - -- Steve Langasek Tue, 28 Jul 2020 16:21:29 +0000 + -- Kunal Mehta Tue, 17 Nov 2020 13:15:46 -0800 zim-tools (2.0.0-2) unstable; urgency=medium diff -Nru zim-tools-2.0.0/debian/control zim-tools-2.1.0/debian/control --- zim-tools-2.0.0/debian/control 2020-07-28 16:21:29.000000000 +0000 +++ zim-tools-2.1.0/debian/control 2020-11-17 20:58:01.000000000 +0000 @@ -1,18 +1,18 @@ Source: zim-tools Section: utils Priority: optional -Maintainer: Ubuntu Developers -XSBC-Original-Maintainer: Kunal Mehta +Maintainer: Kunal Mehta Build-Depends: debhelper-compat (= 13), meson, pkg-config, - libzim-dev (>= 6.1.8), + libzim-dev (>= 6.3.0), libmagic-dev, zlib1g-dev, libgumbo-dev, libicu-dev, libdocopt-dev, - cmake + cmake, + libgtest-dev Standards-Version: 4.5.0 Homepage: https://github.com/openzim/zim-tools Vcs-Browser: https://salsa.debian.org/debian/zim-tools diff -Nru zim-tools-2.0.0/debian/copyright 
zim-tools-2.1.0/debian/copyright --- zim-tools-2.0.0/debian/copyright 2020-07-21 15:56:08.000000000 +0000 +++ zim-tools-2.1.0/debian/copyright 2020-11-17 20:54:17.000000000 +0000 @@ -3,12 +3,16 @@ Source: https://github.com/openzim/zim-tools Files: * -Copyright: 2003, 2004, 2006, 2007, 2009, 2010 Tommi Maekitalo +Copyright: 2006, 2007, 2009 Tommi Maekitalo 2013 Kiran Mathew Koshy 2013-2016 Emmanuel Engelhart 2016, 2017 Matthieu Gautier License: GPL-3.0-or-later +Files: test/data/zimfiles/wikibooks_be_all_nopic_2017-02.zim +Copyright: Wikibooks contributors +License: CC-BY-SA-3.0 + Files: debian/* Copyright: 2020 Kunal Mehta License: GPL-3.0-or-later @@ -29,3 +33,365 @@ . On Debian systems, the complete text of the GNU General Public License version 3 can be found in "/usr/share/common-licenses/GPL-3". + +License: CC-BY-SA-3.0 + Creative Commons Legal Code + . + Attribution-ShareAlike 3.0 Unported + . + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR + DAMAGES RESULTING FROM ITS USE. + . + License + . + THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE + COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY + COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS + AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. + . + BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE + TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY + BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS + CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND + CONDITIONS. + . + 1. Definitions + . + a. 
"Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + b. "Collection" means a collection of literary or artistic works, such as + encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(f) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. A work that constitutes a Collection will not be + considered an Adaptation (as defined below) for the purposes of this + License. + c. 
"Creative Commons Compatible License" means a license that is listed + at https://creativecommons.org/compatiblelicenses that has been + approved by Creative Commons as being essentially equivalent to this + License, including, at a minimum, because that license: (i) contains + terms that have the same purpose, meaning and effect as the License + Elements of this License; and, (ii) explicitly permits the relicensing + of adaptations of works made available under that license under this + License or a Creative Commons jurisdiction license with the same + License Elements as this License. + d. "Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + e. "License Elements" means the following high-level license attributes + as selected by Licensor and indicated in the title of this License: + Attribution, ShareAlike. + f. "Licensor" means the individual, individuals, entity or entities that + offer(s) the Work under the terms of this License. + g. "Original Author" means, in the case of a literary or artistic work, + the individual, individuals, entity or entities who created the Work + or if no individual or entity can be identified, the publisher; and in + addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + h. 
"Work" means the literary and/or artistic work offered under the terms + of this License including without limitation any production in the + literary, scientific and artistic domain, whatever may be the mode or + form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + i. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + j. 
"Publicly Perform" means to perform public recitations of the Work and + to communicate to the public those public recitations, by any means or + process, including by wire or wireless means or public digital + performances; to make available to the public Works in such a way that + members of the public may access these Works from a place and at a + place individually chosen by them; to perform the Work to the public + by any means or process and the communication to the public of the + performances of the Work, including by public digital performance; to + broadcast and rebroadcast the Work by any means including signs, + sounds or images. + k. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + . + 2. Fair Dealing Rights. Nothing in this License is intended to reduce, + limit, or restrict any uses free from copyright or rights arising from + limitations or exceptions that are provided for in connection with the + copyright protection under copyright law or other applicable laws. + . + 3. License Grant. Subject to the terms and conditions of this License, + Licensor hereby grants You a worldwide, royalty-free, non-exclusive, + perpetual (for the duration of the applicable copyright) license to + exercise the rights in the Work as stated below: + . + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + b. to create and Reproduce Adaptations provided that any such Adaptation, + including any translation in any medium, takes reasonable steps to + clearly label, demarcate or otherwise identify that changes were made + to the original Work. 
For example, a translation could be marked "The + original work was translated from English to Spanish," or a + modification could indicate "The original work has been modified."; + c. to Distribute and Publicly Perform the Work including as incorporated + in Collections; and, + d. to Distribute and Publicly Perform Adaptations. + e. For the avoidance of doubt: + . + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor + reserves the exclusive right to collect such royalties for any + exercise by You of the rights granted under this License; + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor waives the + exclusive right to collect such royalties for any exercise by You + of the rights granted under this License; and, + iii. Voluntary License Schemes. The Licensor waives the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise + by You of the rights granted under this License. + . + The above rights may be exercised in all media and formats whether now + known or hereafter devised. The above rights include the right to make + such modifications as are technically necessary to exercise the rights in + other media and formats. Subject to Section 8(f), all rights not expressly + granted by Licensor are hereby reserved. + . + 4. Restrictions. The license granted in Section 3 above is expressly made + subject to and limited by the following restrictions: + . + a. You may Distribute or Publicly Perform the Work only under the terms + of this License. 
You must include a copy of, or the Uniform Resource + Identifier (URI) for, this License with every copy of the Work You + Distribute or Publicly Perform. You may not offer or impose any terms + on the Work that restrict the terms of this License or the ability of + the recipient of the Work to exercise the rights granted to that + recipient under the terms of the License. You may not sublicense the + Work. You must keep intact all notices that refer to this License and + to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(c), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(c), as requested. + b. You may Distribute or Publicly Perform an Adaptation only under the + terms of: (i) this License; (ii) a later version of this License with + the same License Elements as this License; (iii) a Creative Commons + jurisdiction license (either this or a later license version) that + contains the same License Elements as this License (e.g., + Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible + License. If you license the Adaptation under one of the licenses + mentioned in (iv), you must comply with the terms of that license. 
If + you license the Adaptation under the terms of any of the licenses + mentioned in (i), (ii) or (iii) (the "Applicable License"), you must + comply with the terms of the Applicable License generally and the + following provisions: (I) You must include a copy of, or the URI for, + the Applicable License with every copy of each Adaptation You + Distribute or Publicly Perform; (II) You may not offer or impose any + terms on the Adaptation that restrict the terms of the Applicable + License or the ability of the recipient of the Adaptation to exercise + the rights granted to that recipient under the terms of the Applicable + License; (III) You must keep intact all notices that refer to the + Applicable License and to the disclaimer of warranties with every copy + of the Work as included in the Adaptation You Distribute or Publicly + Perform; (IV) when You Distribute or Publicly Perform the Adaptation, + You may not impose any effective technological measures on the + Adaptation that restrict the ability of a recipient of the Adaptation + from You to exercise the rights granted to that recipient under the + terms of the Applicable License. This Section 4(b) applies to the + Adaptation as incorporated in a Collection, but this does not require + the Collection apart from the Adaptation itself to be made subject to + the terms of the Applicable License. + c. 
If You Distribute, or Publicly Perform the Work or any Adaptations or + Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and (iv) , consistent with Ssection 3(b), in the case of an + Adaptation, a credit identifying the use of the Work in the Adaptation + (e.g., "French translation of the Work by Original Author," or + "Screenplay based on original Work by Original Author"). The credit + required by this Section 4(c) may be implemented in any reasonable + manner; provided, however, that in the case of a Adaptation or + Collection, at a minimum such credit will appear, if a credit for all + contributing authors of the Adaptation or Collection appears, then as + part of these credits and in a manner at least as prominent as the + credits for the other contributing authors. 
For the avoidance of + doubt, You may only use the credit required by this Section for the + purpose of attribution in the manner set out above and, by exercising + Your rights under this License, You may not implicitly or explicitly + assert or imply any connection with, sponsorship or endorsement by the + Original Author, Licensor and/or Attribution Parties, as appropriate, + of You or Your use of the Work, without the separate, express prior + written permission of the Original Author, Licensor and/or Attribution + Parties. + d. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + . + 5. Representations, Warranties and Disclaimer + . 
+ UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR + OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY + KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, + INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, + FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF + LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, + WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION + OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. + . + 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE + LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR + ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES + ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS + BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + . + 7. Termination + . + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this License. + Individuals or entities who have received Adaptations or Collections + from You under this License, however, will not have their licenses + terminated provided such individuals or entities remain in full + compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will + survive any termination of this License. + b. Subject to the above terms and conditions, the license granted here is + perpetual (for the duration of the applicable copyright in the Work). + Notwithstanding the above, Licensor reserves the right to release the + Work under different license terms or to stop distributing the Work at + any time; provided, however that any such election will not serve to + withdraw this License (or any other license that has been, or is + required to be, granted under the terms of this License), and this + License will continue in full force and effect unless terminated as + stated above. + . + 8. 
Miscellaneous + . + a. Each time You Distribute or Publicly Perform the Work or a Collection, + the Licensor offers to the recipient a license to the Work on the same + terms and conditions as the license granted to You under this License. + b. Each time You Distribute or Publicly Perform an Adaptation, Licensor + offers to the recipient a license to the original Work on the same + terms and conditions as the license granted to You under this License. + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + e. This License constitutes the entire agreement between the parties with + respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + f. The rights granted under, and the subject matter referenced, in this + License were drafted utilizing the terminology of the Berne Convention + for the Protection of Literary and Artistic Works (as amended on + September 28, 1979), the Rome Convention of 1961, the WIPO Copyright + Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 + and the Universal Copyright Convention (as revised on July 24, 1971). 
+ These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. + . + . + Creative Commons Notice + . + Creative Commons is not a party to this License, and makes no warranty + whatsoever in connection with the Work. Creative Commons will not be + liable to You or any party on any legal theory for any damages + whatsoever, including without limitation any general, special, + incidental or consequential damages arising in connection to this + license. Notwithstanding the foregoing two (2) sentences, if Creative + Commons has expressly identified itself as the Licensor hereunder, it + shall have all rights and obligations of Licensor. + . + Except for the limited purpose of indicating to the public that the + Work is licensed under the CCPL, Creative Commons does not authorize + the use by either party of the trademark "Creative Commons" or any + related trademark or logo of Creative Commons without the prior + written consent of Creative Commons. Any permitted use will be in + compliance with Creative Commons' then-current trademark usage + guidelines, as may be published on its website or otherwise made + available upon request from time to time. For the avoidance of doubt, + this trademark restriction does not form part of the License. + . + Creative Commons may be contacted at https://creativecommons.org/. 
+ diff -Nru zim-tools-2.0.0/debian/rules zim-tools-2.1.0/debian/rules --- zim-tools-2.0.0/debian/rules 2020-07-03 02:52:22.000000000 +0000 +++ zim-tools-2.1.0/debian/rules 2020-07-21 22:35:00.000000000 +0000 @@ -3,4 +3,3 @@ %: dh $@ - diff -Nru zim-tools-2.0.0/docker/Dockerfile zim-tools-2.1.0/docker/Dockerfile --- zim-tools-2.0.0/docker/Dockerfile 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/docker/Dockerfile 2020-11-17 16:00:36.000000000 +0000 @@ -15,7 +15,7 @@ # Install necessary packages RUN apt-get update -y && \ - apt-get install -y --no-install-recommends git pkg-config libtool automake autoconf make g++ liblzma-dev libzstd-dev coreutils meson ninja-build wget zlib1g-dev libicu-dev libgumbo-dev libmagic-dev ca-certificates xz-utils cmake && \ + apt-get install -y --no-install-recommends git pkg-config libtool automake autoconf make g++ liblzma-dev libzstd-dev coreutils meson ninja-build wget zlib1g-dev libicu-dev libgumbo-dev libmagic-dev libdocopt-dev ca-certificates xz-utils cmake && \ apt-get clean -y && \ rm -rf /var/lib/apt/lists/* @@ -23,15 +23,15 @@ RUN update-ca-certificates # Install Xapian (wget zlib1g-dev) -RUN wget https://oligarchy.co.uk/xapian/1.4.16/xapian-core-1.4.16.tar.xz -RUN tar xvf xapian-core-1.4.16.tar.xz -RUN cd xapian-core-1.4.16 && ./configure --prefix=/usr -RUN cd xapian-core-1.4.16 && make all install +RUN wget https://oligarchy.co.uk/xapian/1.4.17/xapian-core-1.4.17.tar.xz +RUN tar xvf xapian-core-1.4.17.tar.xz +RUN cd xapian-core-1.4.17 && ./configure --prefix=/usr +RUN cd xapian-core-1.4.17 && make all install RUN rm -rf xapian # Install zimlib (libicu-dev) RUN git clone https://github.com/openzim/libzim.git -RUN cd libzim && git checkout 6.1.7 +RUN cd libzim && git checkout 6.2.2 RUN cd libzim && meson . 
build --prefix=/usr RUN cd libzim && ninja -C build install RUN rm -rf libzim diff -Nru zim-tools-2.0.0/meson.build zim-tools-2.1.0/meson.build --- zim-tools-2.0.0/meson.build 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/meson.build 2020-11-17 16:00:36.000000000 +0000 @@ -1,5 +1,5 @@ project('zim-tools', ['c', 'cpp'], - version : '2.0.0', + version : '2.1.0', license : 'GPLv3+', default_options : ['c_std=c11', 'cpp_std=c++11', 'werror=true']) @@ -12,7 +12,7 @@ add_global_link_arguments('-static-libstdc++', '--static', language:'cpp') endif -libzim_dep = dependency('libzim', version : '>=6.1.8', static:static_linkage) +libzim_dep = dependency('libzim', version : '>=6.3.0', static:static_linkage) compiler = meson.get_compiler('cpp') find_library_in_compiler = meson.version().version_compare('>=0.31.0') @@ -62,3 +62,4 @@ endif subdir('src') +subdir('test') diff -Nru zim-tools-2.0.0/README.md zim-tools-2.1.0/README.md --- zim-tools-2.0.0/README.md 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/README.md 2020-11-17 16:00:36.000000000 +0000 @@ -7,6 +7,7 @@ [![latest release](https://img.shields.io/github/v/tag/openzim/zim-tools?label=latest%20release&sort=semver)](https://download.openzim.org/release/zim-tools/) [![Build Status](https://github.com/openzim/zim-tools/workflows/CI/badge.svg?query=branch%3Amaster)](https://github.com/openzim/zim-tools/actions?query=branch%3Amaster) [![Docker Build Status](https://img.shields.io/docker/build/openzim/zim-tools)](https://hub.docker.com/r/openzim/zim-tools) +[![codecov](https://codecov.io/gh/openzim/zim-tools/branch/master/graph/badge.svg)](https://codecov.io/gh/openzim/zim-tools) [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zim-tools/badge)](https://www.codefactor.io/repository/github/openzim/zim-tools) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) @@ -18,6 +19,7 @@ can be opened with a ZIM reader; 
[Kiwix](https://kiwix.org) is one example, but there are [others](https://openzim.org/wiki/ZIM_Readers). +[![Packaging status](https://repology.org/badge/vertical-allrepos/zim-tools.svg)](https://repology.org/project/zim-tools/versions) Releases ---------- @@ -38,12 +40,15 @@ Dependencies ------------ -Most of the utilities form zim-tools programs relies on the libzim: +Most of the utilities that form the zim-tools programs rely on libzim: * [ZIM](https://openzim.org) (package `libzim-dev` on Debian/Ubuntu) -`zimwriterfs` relies on many third parts software libraries. They are -prerequisites to the Zimwriterfs compilation. Following libraries +`zimdump` additionally requires: +* [docopt.cpp](https://github.com/docopt/docopt.cpp) (package `libdocopt-dev` on Debian/Ubuntu) + +`zimwriterfs` relies on many third-party software libraries. They are +prerequisites to compiling zimwriterfs. The following libraries need to be available: * [Magic](https://www.darwinsys.com/file/) (package `libmagic-dev` on Debian/Ubuntu) @@ -54,7 +59,7 @@ These dependencies may or may not be packaged by your operating system. They may also be packaged but only in an older version. The compilation script will tell you if one of them is missing or too old. -In the worse case, you will have to download and compile a more recent +In the worst case, you will have to download and compile a more recent version by hand. If you want to install these dependencies locally, then ensure that @@ -92,6 +97,15 @@ Depending of you system, `ninja` may be called `ninja-build`. 
+Testing +------- + +To run the automated tests: +```bash +cd build +meson test +``` + Installation ------------ diff -Nru zim-tools-2.0.0/src/arg.h zim-tools-2.1.0/src/arg.h --- zim-tools-2.0.0/src/arg.h 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/arg.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,532 +0,0 @@ -/* - * Copyright (C) 2003,2004,2010 Tommi Maekitalo - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - * MA 02110-1301, USA. - */ - -#ifndef ZIM_ARG_H -#define ZIM_ARG_H - -#include -#include - -namespace zim -{ - -class ArgBase -{ - protected: - bool m_isset; - - static void removeArg(int& argc, char* argv[], int pos, int n) - { - for ( ; pos < argc - n; ++pos) - argv[pos] = argv[pos + n]; - argc -= n; - argv[argc] = 0; - } - - public: - ArgBase() - : m_isset(false) - { } - - /** - * returns true if the option was found and the default value was not used - */ - bool isSet() const { return m_isset; } -}; - -template -class ArgBaseT : public ArgBase -{ - T m_value; - - protected: - explicit ArgBaseT(const T& def) - : m_value(def) - { } - - bool extract(const char* str, int& argc, char* argv[], int i, int n) - { - std::istringstream s(str); - s >> m_value; - if (!s.fail()) - { - m_isset = true; - removeArg(argc, argv, i, n); - return true; - } - return false; - } - - public: - /** - returns the value. 
- */ - const T& getValue() const { return m_value; } - - /** @brief Read and extract commandline parameters from argc/argv. - - Programs usually need some parameters. Usually they start with a '-' - followed by a single character and optionally a value. - Arg extracts these and other parameters. - - This default class processes paramters with a value, which defines - a input-extractor-operator operator>> (istream&, T&). - - Options are removed from the option-list, so programs can easily check - after all options are extracted, if there are parameters left. - - example: - \code - int main(int argc, char* argv[]) - { - zim::Arg option_n(argc, argv, 'n', 0); - std::cout << "value for -n: " << option_n << endl; - } - \endcode - - */ - operator T() const { return m_value; } -}; - -template <> -class ArgBaseT : public ArgBase -{ - const char* m_value; - - protected: - explicit ArgBaseT(const char* def) - : m_value(def) - { } - - bool extract(const char* str, int& argc, char* argv[], int i, int n) - { - m_value = str; - m_isset = true; - removeArg(argc, argv, i, n); - return true; - } - - public: - /// returns the extracted value. - const char* getValue() const { return m_value; } - - /// class is convertible to "const char*" - operator const char*() const { return m_value; } - - -}; - -template <> -class ArgBaseT : public ArgBase -{ - std::string m_value; - - protected: - explicit ArgBaseT(const std::string& def) - : m_value(def) - { } - - bool extract(const char* str, int& argc, char* argv[], int i, int n) - { - m_value = str; - m_isset = true; - removeArg(argc, argv, i, n); - return true; - } - - public: - /// returns the extracted value. - const std::string& getValue() const { return m_value; } - - /// class is convertible to "const std::string&" - operator const std::string&() const { return m_value; } - -}; - -/** @brief Read and extract commandline parameters from argc/argv. - - Programs usually need some parameters. 
Usually they start with a '-' - followed by a single character and optionally a value. - Arg extracts these and other parameters. - - This default class processes paramters with a value, which defines - a input-extractor-operator operator>> (istream&, T&). - - Options are removed from the option-list, so programs can easily check - after all options are extracted, if there are parameters left. - - example: - \code - int main(int argc, char* argv[]) - { - zim::Arg option_n(argc, argv, 'n', 0); - std::cout << "value for -n: " << option_n << endl; - } - \endcode - - */ -template -class Arg : public ArgBaseT -{ - public: - /** - default constructor. Initializes value. - - \param def initial value - */ - Arg(const T& def = T()) - : ArgBaseT(def) - { } - - /** - extract parameter. - - \param argc 1. parameter of main - \param argv 2. of main - \param ch optioncharacter - \param def default-value - - example: - \code - zim::Arg offset(argc, argv, 'o', 0); - unsigned value = offset.getValue(); - \endcode - */ - Arg(int& argc, char* argv[], char ch, const T& def = T()) - : ArgBaseT(def) - { - set(argc, argv, ch); - } - - /** - GNU defines long options starting with "--". This (and more) is - supported here. Instead of giving a single option-character, you - specify a string. - - example: - \code - zim::Arg option_number(argc, argv, "--number", 0); - std::cout << "number =" << option_number.getValue() << std::endl; - \endcode - */ - Arg(int& argc, char* argv[], const char* str, const T& def = T()) - : ArgBaseT(def) - { - this->m_isset = set(argc, argv, str); - } - - Arg(int& argc, char* argv[]) - : ArgBaseT(T()) - { - this->m_isset = set(argc, argv); - } - - /** - extract parameter. - - \param argc 1. parameter of main - \param argv 2. 
of main - \param ch optioncharacter - - example: - \code - zim::Arg offset; - offset.set(argc, argv, 'o'); - unsigned value = offset.getValue(); - \endcode - */ - bool set(int& argc, char* argv[], char ch) - { - // don't extract value, when already found - if (this->m_isset) - return false; - - for (int i = 1; i < argc; ++i) - { - if (argv[i][0] == '-' && argv[i][1] == ch) - { - if (argv[i][2] == '\0' && i < argc - 1) - { - // -O foo - if (this->extract(argv[i + 1], argc, argv, i, 2)) - return true; - } - - // -Ofoo - if (this->extract(argv[i] + 2, argc, argv, i, 1)) - return true; - } - } - - return false; - } - - /** - GNU defines long options starting with "--". This (and more) is - supported here. Instead of giving a single option-character, you - specify a string. - - example: - \code - zim::Arg option_number; - number.set(argc, argv, "--number"); - std::cout << "number =" << option_number.getValue() << std::endl; - \endcode - */ - bool set(int& argc, char* argv[], const char* str) - { - // don't extract value, when already found - if (this->m_isset) - return false; - - unsigned n = strlen(str); - for (int i = 1; i < argc; ++i) - { - if (strncmp(argv[i], str, n) == 0) - { - if (i < argc - 1 && argv[i][n] == '\0') - { - // --option value - if (this->extract(argv[i + 1], argc, argv, i, 2)) - return true; - } - - if (argv[i][n] == '=') - { - // --option=vlaue - if (this->extract(argv[i] + n + 1, argc, argv, i, 1)) - return true; - } - } - } - - return false; - } - - /** - Reads next parameter and removes it. - */ - bool set(int& argc, char* argv[]) - { - // don't extract value, when already found - if (this->m_isset) - return false; - - if (argc > 1) - this->extract(argv[1], argc, argv, 1, 1); - - return this->m_isset; - } -}; - -//////////////////////////////////////////////////////////////////////// -/** - specialization for bool. - - Often programs need some switches, which are switched on or off. - Users just enter a option without parameter. 
- - example: - \code - zim::Arg debug(argc, argv, 'd'); - if (debug) - std::cout << "debug-mode is set" << std::endl; - \endcode - */ -template <> -class Arg : public ArgBase -{ - public: - /** - default constructor. Initializes value. - - \param def initial value - */ - Arg(bool def = false) - : m_value(def) - { } - - /** - Use this constructor to extract a bool-parameter. - - As a special case options can be grouped. The parameter is - recognized also in a argument, which starts with a '-' and contains - somewhere the given character. - - example: - \code - zim::Arg debug(argc, argv, 'd'); - zim::Arg ignore(argc, argv, 'i'); - \endcode - - Arguments debug and ignore are both set when the program is called - with: - \code - prog -id - - prog -i -d - \endcode - - Options can also switched off with a following '-' like this: - \code - prog -d- - \endcode - - In the program use: - \code - Arg debug(argc, argv, 'd'); - if (debug.isSet()) - { - if (debug) - std::cout << "you entered -d" << std::endl; - else - std::cout << "you entered -d-" << std::endl; - } - else - std::cout << "no -d option given" << std::endl; - \endcode - - This is useful, if a program defaults to some enabled feature, - which can be disabled. 
- */ - Arg(int& argc, char* argv[], char ch, bool def = false) - : m_value(def) - { - m_isset = set(argc, argv, ch); - } - - Arg(int& argc, char* argv[], const char* str, bool def = false) - : m_value(def) - { - m_isset = set(argc, argv, str); - } - - bool set(int& argc, char* argv[], char ch) - { - // don't extract value, when already found - if (m_isset) - return false; - - for (int i = 1; i < argc; ++i) - { - if (argv[i][0] == '-' && argv[i][1] != '-') - { - // starts with a '-', but not with "--" - if (argv[i][1] == ch && argv[i][2] == '\0') - { - // single option found - m_value = true; - m_isset = true; - removeArg(argc, argv, i, 1); - return true; - } - else if (argv[i][1] == ch && argv[i][2] == '-' && argv[i][3] == '\0') - { - // Option was explicitly disabled with -x- - m_value = false; - m_isset = true; - removeArg(argc, argv, i, 1); - return true; - } - else - { - // look, if we find the option in an optiongroup - for (char* p = argv[i] + 1; *p != '\0'; ++p) - if (*p == ch) - { - // here it is - extract it - m_value = true; - m_isset = true; - do - { - *p = *(p + 1); - } while (*p++ != '\0'); - - return true; - } - } - } - } - - return false; - } - - /** - Setter for long-options. - - The option-parameter is defined with a string. This can extract - long-options like: - \code - prog --debug - \endcode - - with - \code - Arg debug(argc, argv, "--debug"); - \endcode - - */ - bool set(int& argc, char* argv[], const char* str) - { - // don't extract value, when already found - if (m_isset) - return false; - - for (int i = 1; i < argc; ++i) - { - if (strcmp(argv[i], str) == 0) - { - m_value = true; - m_isset = true; - removeArg(argc, argv, i, 1); - return true; - } - } - - return false; - } - - /** - returns true, if options is set. - */ - bool isTrue() const { return m_value; } - - /** - returns true, if options is not set. - */ - bool isFalse() const { return !m_value; } - - /** - convertable to bool. 
- */ - operator bool() const { return m_value; } - - private: - bool m_value; -}; - -template -std::ostream& operator<< (std::ostream& out, const ArgBaseT arg) -{ - return out << arg.getValue(); -} - -} - -#endif // ZIM_ARG_H diff -Nru zim-tools-2.0.0/src/meson.build zim-tools-2.1.0/src/meson.build --- zim-tools-2.0.0/src/meson.build 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/meson.build 2020-11-17 16:00:36.000000000 +0000 @@ -23,17 +23,15 @@ install: true) executable('zimsplit', 'zimsplit.cpp', - dependencies: libzim_dep, - install: true) - -executable('zimcheck', 'zimcheck.cpp', - dependencies: libzim_dep, + dependencies: [libzim_dep, docopt_dep], install: true) executable('zimrecreate', 'zimrecreate.cpp', dependencies: libzim_dep, install: true) +subdir('zimcheck') + if with_writer subdir('zimwriterfs') endif diff -Nru zim-tools-2.0.0/src/tools.cpp zim-tools-2.1.0/src/tools.cpp --- zim-tools-2.0.0/src/tools.cpp 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/tools.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -31,9 +31,7 @@ #include #include #include - -#include -#include +#include #ifdef _WIN32 #define SEPARATOR "\\" @@ -42,9 +40,6 @@ #endif -extern bool uniqueNamespace; - - unsigned int getFileSize(const std::string& path) { struct stat filestatus; @@ -64,6 +59,13 @@ return flag; } +bool isDirectory(const std::string &path) +{ + struct stat filestatus; + stat(path.c_str(), &filestatus); + return (filestatus.st_mode & S_IFMT) == S_IFDIR; +} + /* base64 */ static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" @@ -130,13 +132,17 @@ std::string::size_type pos = 0; while ((pos = url.find('%', pos)) != std::string::npos && pos + 2 < url.length()) { + if (!isxdigit(url[pos+1]) || !isxdigit(url[pos+2])) { + ++pos; + continue; + } url.replace(pos, 3, 1, charFromHex(url.substr(pos + 1, 2))); ++pos; } return url; } -std::string removeLastPathElement(const std::string& path, +static std::string removeLastPathElement(const 
std::string& path, const bool removePreSeparator, const bool removePostSeparator) { @@ -154,7 +160,7 @@ } /* Split string in a token array */ -std::vector split(const std::string& str, +static std::vector split(const std::string& str, const std::string& delims = " *-") { std::string::size_type lastPos = str.find_first_not_of(delims, 0); @@ -170,18 +176,13 @@ return tokens; } -std::vector split(const char* lhs, const char* rhs) +static std::vector split(const char* lhs, const char* rhs) { const std::string m1(lhs), m2(rhs); return split(m1, m2); } -std::vector split(const char* lhs, const std::string& rhs) -{ - return split(lhs, rhs.c_str()); -} - -std::vector split(const std::string& lhs, const char* rhs) +static std::vector split(const std::string& lhs, const char* rhs) { return split(lhs.c_str(), rhs); } @@ -216,6 +217,9 @@ std::string computeRelativePath(const std::string path, const std::string absolutePath) { + if (path.empty()) + return ""; + std::vector pathParts = split(path, "/"); std::vector absolutePathParts = split(absolutePath, "/"); @@ -256,6 +260,9 @@ const std::string& search, const std::string& replace) { + if (search.empty()) + return; + size_t pos = 0; while ((pos = subject.find(search, pos)) != std::string::npos) { subject.replace(pos, search.length(), replace); @@ -272,7 +279,7 @@ replaceStringInPlace(str, "\u202C", ""); } -std::string getNamespaceForMimeType(const std::string& mimeType) +std::string getNamespaceForMimeType(const std::string& mimeType, bool uniqueNamespace) { if (uniqueNamespace || mimeType.find("text") == 0 || mimeType.empty()) { if (uniqueNamespace || mimeType.find("text/html") == 0 @@ -296,20 +303,6 @@ } } - -std::string removeAccents(const std::string& text) -{ - ucnv_setDefaultName("UTF-8"); - static UErrorCode status = U_ZERO_ERROR; - static std::unique_ptr removeAccentsTrans(icu::Transliterator::createInstance( - "Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status)); - icu::UnicodeString ustring(text.c_str()); - 
removeAccentsTrans->transliterate(ustring); - std::string unaccentedText; - ustring.toUTF8String(unaccentedText); - return unaccentedText; -} - void remove_all(const std::string& path) { DIR* dir; @@ -332,3 +325,148 @@ remove(path.c_str()); } } + +std::vector generic_getLinks(const std::string& page) +{ + const char* p = page.c_str(); + const char* linkStart; + std::vector links; + std::string attr; + + while (*p) { + if (strncmp(p, " href", 5) == 0) { + attr = "href"; + p += 5; + } else if (strncmp(p, " src", 4) == 0) { + attr = "src"; + p += 4; + } else { + p += 1; + continue; + } + + while (*p == ' ') + p += 1 ; + if (*(p++) != '=') + continue; + while (*p == ' ') + p += 1; + char delimiter = *p++; + if (delimiter != '\'' && delimiter != '"') + continue; + + linkStart = p; + // [TODO] Handle escape char + while(*p != delimiter) + p++; + links.push_back({attr,std::string(linkStart, p)}); + p += 1; + } + return links; +} + +bool isOutofBounds(const std::string& input, std::string base) +{ + if (input.empty()) return false; + + if (base.back() != '/') + base.push_back('/'); + + int nr = 0; + if (base.front() != '/') + nr++; + + //count nr of substrings ../ + int nrsteps = 0; + std::string::size_type pos = 0; + while((pos = input.find("../", pos)) != std::string::npos) { + nrsteps++; + pos += 3; + } + + return nrsteps >= (nr + std::count(base.cbegin(), base.cend(), '/')); +} + +int adler32(const std::string& buf) +{ + unsigned int s1 = 1; + unsigned int s2 = 0; + unsigned int sz=buf.size(); + for (size_t n = 0; n #include +#include +#include +#include + +/* Formatter for std::exception what() message: + * throw std::runtime_error( + * Formatter() << "zimwriterfs: Unable to read" << filename << ": " << strerror(errno)); + */ +class Formatter +{ +public: + Formatter() {} + ~Formatter() {} + + template + Formatter & operator << (const Type & value) + { + stream_ << value; + return *this; + } + + // std::string str() const { return stream_.str(); } + operator 
std::string () const { return stream_.str(); } + +private: + Formatter(const Formatter &); + Formatter & operator = (Formatter &); + + std::stringstream stream_; +}; + + +typedef struct html_link +{ + std::string attribute; + std::string link; +} html_link; -std::string getMimeTypeForFile(const std::string& filename); -std::string getNamespaceForMimeType(const std::string& mimeType); +std::string getMimeTypeForFile(const std::string& basedir, const std::string& filename); +std::string getNamespaceForMimeType(const std::string& mimeType, bool uniqueNamespace); std::string getFileContent(const std::string& path); unsigned int getFileSize(const std::string& path); std::string decodeUrl(const std::string& encodedUrl); std::string computeAbsolutePath(const std::string& path, const std::string& relativePath); bool fileExists(const std::string& path); -std::string removeLastPathElement(const std::string& path, - const bool removePreSeparator, - const bool removePostSeparator); -std::string computeNewUrl(const std::string& aid, const std::string& baseUrl, const std::string& targetUrl); +bool isDirectory(const std::string &path); std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len); @@ -51,8 +85,21 @@ std::string computeRelativePath(const std::string path, const std::string absolutePath); -std::string removeAccents(const std::string& text); - void remove_all(const std::string& path); -#endif // OPENZIM_TOOLS_H +//Returns a vector of the links in a particular page. includes links under 'href' and 'src' +std::vector generic_getLinks(const std::string& page); + +// checks if a relative path is out of bounds (relative to base) +bool isOutofBounds(const std::string& input, std::string base); + +//Adler32 Hash Function. Used to hash the BLOB data obtained from each article, for redundancy checks. +//Please note that the adler32 hash function has a high number of collisions, and that the hash match is not taken as final. 
+int adler32(const std::string& buf); + +//Removes extra spaces from URLs. Usually done by the browser, so web authors sometimes tend to ignore it. +//Converts the %20 to space.Essential for comparing URLs. +std::string normalize_link(const std::string& input, const std::string& baseUrl); + + +#endif // OPENZIM_TOOLS_H diff -Nru zim-tools-2.0.0/src/zimbench.cpp zim-tools-2.1.0/src/zimbench.cpp --- zim-tools-2.0.0/src/zimbench.cpp 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/zimbench.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -100,7 +100,8 @@ if (filename.empty()) { - std::cerr << "usage: " << argv[0] << " [options] zimfile\n" + std::cerr << "\nzimbench benchmarks a ZIM file reading speed.\n\n" + "usage: " << argv[0] << " [options] zimfile\n" "\t-n number\tnumber of linear accessed articles (default 1000)\n" "\t-r number\tnumber of random accessed articles (default: same as -n)\n" "\t-d number\tnumber of distinct articles used for random access (default: same as -r)\n\n" diff -Nru zim-tools-2.0.0/src/zimcheck/checks.cpp zim-tools-2.1.0/src/zimcheck/checks.cpp --- zim-tools-2.0.0/src/zimcheck/checks.cpp 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/src/zimcheck/checks.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,268 @@ +#include "checks.h" +#include "../tools.h" + +#include +#include +#include +#include +#include +#include +#include + +inline bool isDataUrl(const std::string& input_string) +{ + static std::regex data_url_regex = + std::regex("data:.+", std::regex_constants::icase); + return std::regex_match(input_string, data_url_regex); +} + +inline bool isExternalUrl(const std::string& input_string) +{ + // A string starting with "://" or "geo:" or "tel:" or "javascript:" or "mailto:" + static std::regex external_url_regex = + std::regex("([^:/?#]+:\\/\\/|geo:|tel:|mailto:|javascript:).*", + std::regex_constants::icase); + return std::regex_match(input_string, external_url_regex); +} + +// Checks if a URL is an internal URL or not. 
Uses RegExp. +inline bool isInternalUrl(const std::string& input_string) +{ + return !isExternalUrl(input_string) && !isDataUrl(input_string); +} + + +void test_checksum(zim::File& f, ErrorLogger& reporter) { + std::cout << "[INFO] Verifying Internal Checksum..." << std::endl; + bool result = f.verify(); + reporter.setTestResult(TestType::CHECKSUM, result); + if (!result) { + std::cout << " [ERROR] Wrong Checksum in ZIM file" << std::endl; + std::ostringstream ss; + ss << "ZIM File Checksum in file: " << f.getChecksum() << std::endl; + reporter.addReportMsg(TestType::CHECKSUM, ss.str()); + } +} + +void test_integrity(const std::string& filename, ErrorLogger& reporter) { + std::cout << "[INFO] Verifying ZIM-file structure integrity..." << std::endl; + zim::IntegrityCheckList checks; + checks.set(); // enable all checks (including checksum) + bool result = zim::validate(filename, checks); + reporter.setTestResult(TestType::INTEGRITY, result); + if (!result) { + std::cout << " [ERROR] ZIM file's low level structure is invalid" << std::endl; + } +} + + +void test_metadata(const zim::File& f, ErrorLogger& reporter) { + std::cout << "[INFO] Searching for metadata entries..." << std::endl; + static const char* const test_meta[] = { + "Title", + "Creator", + "Publisher", + "Date", + "Description", + "Language"}; + for (auto &meta : test_meta) { + auto article = f.getArticle('M', meta); + if (!article.good()) { + reporter.setTestResult(TestType::METADATA, false); + reporter.addReportMsg(TestType::METADATA, meta); + } + } +} + +void test_favicon(const zim::File& f, ErrorLogger& reporter) { + std::cout << "[INFO] Searching for Favicon..." 
<< std::endl; + static const char* const favicon_paths[] = {"-/favicon.png", "I/favicon.png", "I/favicon", "-/favicon"}; + for (auto &path: favicon_paths) { + auto article = f.getArticleByUrl(path); + if (article.good()) { + return; + } + } + reporter.setTestResult(TestType::FAVICON, false); +} + +void test_mainpage(const zim::File& f, ErrorLogger& reporter) { + std::cout << "[INFO] Searching for main page..." << std::endl; + zim::Fileheader fh=f.getFileheader(); + bool testok = true; + if( !fh.hasMainPage() ) + testok = false; + else if( fh.getMainPage() > fh.getArticleCount() ) + testok = false; + reporter.setTestResult(TestType::MAIN_PAGE, testok); + if (!testok) { + std::ostringstream ss; + ss << "Main Page Index stored in File Header: " << fh.getMainPage(); + reporter.addReportMsg(TestType::MAIN_PAGE, ss.str()); + } +} + +void test_articles(const zim::File& f, ErrorLogger& reporter, ProgressBar progress, + bool redundant_data, bool url_check, bool url_check_external, bool empty_check) { + std::cout << "[INFO] Verifying Articles' content..." << std::endl; + // Article are store in a map>. + // So all article with the same hash will be stored in the same list. 
+ std::map> hash_main; + + int previousIndex = -1; + + progress.reset(f.getFileheader().getArticleCount()); + for (zim::File::const_iterator it = f.begin(); it != f.end(); ++it) + { + progress.report(); + + if (it->getArticleSize() == 0 && + empty_check && + (it->getNamespace() == 'A' || + it->getNamespace() == 'I')) { + std::ostringstream ss; + ss << "Entry " << it->getLongUrl() << " is empty"; + reporter.addReportMsg(TestType::EMPTY, ss.str()); + reporter.setTestResult(TestType::EMPTY, false); + } + + if (it->isRedirect() || + it->isLinktarget() || + it->isDeleted() || + it->getArticleSize() == 0 || + it->getNamespace() == 'M') { + continue; + } + + std::string data; + if (redundant_data || it->getMimeType() == "text/html") + data = it->getData(); + + if(redundant_data) + hash_main[adler32(data)].push_back( it->getIndex() ); + + if (it->getMimeType() != "text/html") + continue; + + std::vector links; + if (url_check || url_check_external) { + links = generic_getLinks(it->getData()); + } + + if(url_check) + { + auto baseUrl = it->getLongUrl(); + auto pos = baseUrl.find_last_of('/'); + baseUrl.resize( pos==baseUrl.npos ? 0 : pos ); + + std::unordered_map> filtered; + int nremptylinks = 0; + for (const auto &l : links) + { + if (l.link.front() == '#' || l.link.front() == '?') continue; + if (isInternalUrl(l.link) == false) continue; + if (l.link.empty()) + { + nremptylinks++; + continue; + } + + + if (isOutofBounds(l.link, baseUrl)) + { + std::ostringstream ss; + ss << l.link << " is out of bounds. 
Article: " << it->getLongUrl(); + reporter.addReportMsg(TestType::URL_INTERNAL, ss.str()); + reporter.setTestResult(TestType::URL_INTERNAL, false); + continue; + } + + auto normalized = normalize_link(l.link, baseUrl); + filtered[normalized].push_back(l.link); + } + + if (nremptylinks) + { + std::ostringstream ss; + ss << "Found " << nremptylinks << " empty links in article: " << it->getLongUrl(); + reporter.addReportMsg(TestType::URL_INTERNAL, ss.str()); + reporter.setTestResult(TestType::URL_INTERNAL, false); + } + + for(const auto &p: filtered) + { + const std::string link = p.first; + auto a = f.getArticleByUrl(link); + if (!a.good()) + { + int index = it->getIndex(); + if (previousIndex != index) + { + std::ostringstream ss; + ss << "The following links:\n"; + for (const auto &olink : p.second) + ss << "- " << olink << '\n'; + ss << "(" << link << ") were not found in article " << it->getLongUrl(); + reporter.addReportMsg(TestType::URL_INTERNAL, ss.str()); + previousIndex = index; + } + reporter.setTestResult(TestType::URL_INTERNAL, false); + } + } + } + + if (url_check_external) + { + if (it->getMimeType() != "text/html") + continue; + + for (auto &l: links) + { + if (l.attribute == "src" && isExternalUrl(l.link)) + { + std::ostringstream ss; + ss << l.link << " is an external dependence in article " << it->getLongUrl(); + reporter.addReportMsg(TestType::URL_EXTERNAL, ss.str()); + reporter.setTestResult(TestType::URL_EXTERNAL, false); + break; + } + } + } + } + + if (redundant_data) + { + std::cout << "[INFO] Searching for redundant articles..." << std::endl; + std::cout << " Verifying Similar Articles for redundancies..." 
<< std::endl; + std::ostringstream output_details; + progress.reset(hash_main.size()); + for(const auto &it: hash_main) + { + progress.report(); + auto l = it.second; + while ( !l.empty() ) { + const auto a1 = f.getArticle(l.front()); + l.pop_front(); + if ( !l.empty() ) { + const std::string s1 = a1.getData(); + decltype(l) articlesDifferentFromA1; + for(auto other : l) { + auto a2 = f.getArticle(other); + std::string s2 = a2.getData(); + if (s1 != s2 ) { + articlesDifferentFromA1.push_back(other); + continue; + } + + reporter.setTestResult(TestType::REDUNDANT, false); + std::ostringstream ss; + ss << a1.getTitle() << " (idx " << a1.getIndex() << ") and " + << a2.getTitle() << " (idx " << a2.getIndex() << ")"; + reporter.addReportMsg(TestType::REDUNDANT, ss.str()); + } + l.swap(articlesDifferentFromA1); + } + } + } + } +} diff -Nru zim-tools-2.0.0/src/zimcheck/checks.h zim-tools-2.1.0/src/zimcheck/checks.h --- zim-tools-2.0.0/src/zimcheck/checks.h 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/src/zimcheck/checks.h 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,116 @@ +#ifndef _ZIM_TOOL_ZIMFILECHECKS_H_ +#define _ZIM_TOOL_ZIMFILECHECKS_H_ + +#include +#include + +#include + +#include "../progress.h" + +enum StatusCode : int { + PASS = 0, + FAIL = 1, + EXCEPTION = 2 +}; + +enum class LogTag { ERROR, WARNING }; + +// Specialization of std::hash needed for our unordered_map. Can be removed in c++14 +namespace std { + template <> struct hash { + size_t operator() (const LogTag &t) const { return size_t(t); } + }; +} + +static std::unordered_map tagToStr{ {LogTag::ERROR, "ERROR"}, + {LogTag::WARNING, "WARNING"}}; + +enum class TestType { + CHECKSUM, + INTEGRITY, + EMPTY, + METADATA, + FAVICON, + MAIN_PAGE, + REDUNDANT, + URL_INTERNAL, + URL_EXTERNAL, + MIME, + OTHER +}; + +// Specialization of std::hash needed for our unordered_map. 
Can be removed in c++14 +namespace std { + template <> struct hash { + size_t operator() (const TestType &t) const { return size_t(t); } + }; +} + +static std::unordered_map> errormapping = { + { TestType::CHECKSUM, {LogTag::ERROR, "Invalid checksum"}}, + { TestType::INTEGRITY, {LogTag::ERROR, "Invalid low-level structure"}}, + { TestType::EMPTY, {LogTag::ERROR, "Empty articles"}}, + { TestType::METADATA, {LogTag::ERROR, "Missing metadata entries"}}, + { TestType::FAVICON, {LogTag::ERROR, "Missing favicon"}}, + { TestType::MAIN_PAGE, {LogTag::ERROR, "Missing mainpage"}}, + { TestType::REDUNDANT, {LogTag::WARNING, "Redundant data found"}}, + { TestType::URL_INTERNAL, {LogTag::ERROR, "Invalid internal links found"}}, + { TestType::URL_EXTERNAL, {LogTag::ERROR, "Invalid external links found"}}, + { TestType::MIME, {LogTag::ERROR, "Incoherent mimeType found"}}, + { TestType::OTHER, {LogTag::ERROR, "Other errors found"}} +}; + +class ErrorLogger { + private: + std::unordered_map> reportMsgs; + std::unordered_map testStatus; + + public: + ErrorLogger() + { + for (const auto &m : errormapping) { + testStatus[m.first] = true; + } + } + + void setTestResult(TestType type, bool status) { + testStatus[type] = status; + } + + void addReportMsg(TestType type, const std::string& message) { + reportMsgs[type].push_back(message); + } + + void report(bool error_details) const { + for (auto testmsg : reportMsgs) { + auto &p = errormapping[testmsg.first]; + std::cout << "[" + tagToStr[p.first] + "] " << p.second << ":" << std::endl; + for (auto& msg: testmsg.second) { + std::cout << " " << msg << std::endl; + } + } + } + + inline bool overalStatus() const { + return std::all_of(testStatus.begin(), testStatus.end(), + [](std::pair e){ + if (errormapping[e.first].first == LogTag::ERROR) + { + return e.second; //return the test status result + } + return true; + }); + } +}; + + +void test_checksum(zim::File& f, ErrorLogger& reporter); +void test_integrity(const std::string& filename, 
ErrorLogger& reporter); +void test_metadata(const zim::File& f, ErrorLogger& reporter); +void test_favicon(const zim::File& f, ErrorLogger& reporter); +void test_mainpage(const zim::File& f, ErrorLogger& reporter); +void test_articles(const zim::File& f, ErrorLogger& reporter, ProgressBar progress, + bool redundant_data, bool url_check, bool url_check_external, bool empty_check); + +#endif diff -Nru zim-tools-2.0.0/src/zimcheck/main.cpp zim-tools-2.1.0/src/zimcheck/main.cpp --- zim-tools-2.0.0/src/zimcheck/main.cpp 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/src/zimcheck/main.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,363 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * Copyright (C) Kiran Mathew Koshy + * Copyright (C) Matthieu Gautier + * Copyright (C) Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../progress.h" +#include "../version.h" +#include "../tools.h" +#include "checks.h" + +void displayHelp() +{ + std::cout<<"\n" + "zimcheck checks the quality of a ZIM file.\n\n" + "Usage: zimcheck [options] zimfile\n" + "options:\n" + "-A , --all run all tests. 
Default if no flags are given.\n" + "-0 , --empty Empty content\n" + "-C , --checksum Internal CheckSum Test\n" + "-I , --integrity Low-level correctness/integrity checks\n" + "-M , --metadata MetaData Entries\n" + "-F , --favicon Favicon\n" + "-P , --main Main page\n" + "-R , --redundant Redundant data check\n" + "-U , --url_internal URL check - Internal URLs\n" + "-X , --url_external URL check - External URLs\n" + "-E , --mime MIME checks\n" + "-D , --details Details of error\n" + "-B , --progress Print progress report\n" + "-H , --help Displays Help\n" + "-V , --version Displays software version\n" + "examples:\n" + "zimcheck -A wikipedia.zim\n" + "zimcheck --checksum --redundant wikipedia.zim\n" + "zimcheck -F -R wikipedia.zim\n" + "zimcheck -M --favicon wikipedia.zim\n"; + return; +} + +int main (int argc, char **argv) +{ + // To calculate the total time taken by the program to run. + time_t startTime,endTime; + double timeDiffference; + time( &startTime); + + // The boolean values which will be used to store the output from + // getopt_long(). These boolean values will be then read by the + // program to execute the different parts of the program. + + bool run_all = false; + bool checksum = false; + bool metadata = false; + bool favicon = false; + bool main_page = false; + bool redundant_data = false; + bool integrity = false; + bool url_check = false; + bool url_check_external = false; + bool empty_check = false; + bool mime_check = false; + bool error_details = false; + bool no_args = true; + bool help = false; + + std::string filename = ""; + ProgressBar progress(1); + ErrorLogger error; + + StatusCode status_code = PASS; + + //Parsing through arguments using getopt_long(). Both long and short arguments are allowed. 
+ while (1) + { + static struct option long_options[] = + { + { "all", no_argument, 0, 'A'}, + { "progress", no_argument, 0, 'B'}, + { "empty", no_argument, 0, '0'}, + { "checksum", no_argument, 0, 'C'}, + { "integrity", no_argument, 0, 'I'}, + { "metadata", no_argument, 0, 'M'}, + { "favicon", no_argument, 0, 'F'}, + { "main", no_argument, 0, 'P'}, + { "redundant", no_argument, 0, 'R'}, + { "url_internal", no_argument, 0, 'U'}, + { "url_external", no_argument, 0, 'X'}, + { "mime", no_argument, 0, 'E'}, + { "details", no_argument, 0, 'D'}, + { "help", no_argument, 0, 'H'}, + { "version", no_argument, 0, 'V'}, + { 0, 0, 0, 0} + }; + int option_index = 0; + int c = getopt_long (argc, argv, "ACIMFPRUXEDHBVacimfpruxedhbv", + long_options, &option_index); + //c = getopt (argc, argv, "ACMFPRUXED"); + if(c == -1) + break; + switch (c) + { + case 'A': + case 'a': + run_all = true; + no_args = false; + break; + case '0': + empty_check = true; + break; + case 'C': + case 'c': + checksum = true; + no_args = false; + break; + case 'I': + case 'i': + integrity = true; + no_args = false; + break; + case 'M': + case 'm': + metadata = true; + no_args = false; + break; + case 'B': + case 'b': + progress.set_progress_report(true); + break; + case 'F': + case 'f': + favicon = true; + no_args = false; + break; + case 'P': + case 'p': + main_page = true; + no_args = false; + break; + case 'R': + case 'r': + redundant_data = true; + no_args = false; + break; + case 'U': + case 'u': + url_check = true; + no_args = false; + break; + case 'X': + case 'x': + url_check_external = true; + no_args = false; + break; + case 'E': + case 'e': + mime_check = true; + no_args = false; + break; + case 'D': + case 'd': + error_details = true; + break; + case 'H': + case 'h': + help=true; + break; + case '?': + if (optopt == 'c') + { + std::cerr<<"Option "<<(char)optopt<<" requires an argument.\n"; + displayHelp(); + } + else if ( isprint (optopt) ) + std::cerr<<"Unknown option `"<<( char 
)optopt<<"'.\n"; + else + { + std::cerr<<"Unknown option\n"; + displayHelp(); + } + return 1; + case 'V': + case 'v': + version(); + return 0; + default: + abort (); + } + } + + //Displaying Help for --help argument + if(help) + { + displayHelp(); + return -1; + } + + //If no arguments are given to the program, all the tests are performed. + if ( run_all || no_args ) + { + checksum = integrity = metadata = favicon = main_page = redundant_data = + url_check = url_check_external = mime_check = empty_check = true; + } + + //Obtaining filename from argument list + filename = ""; + for(int i = 0; i < argc; i++) + { + if( (argv[i][0] != '-') && (i != 0)) + { + filename = argv[i]; + } + } + if(filename == "") + { + std::cerr<<"No file provided as argument\n"; + displayHelp(); + return -1; + } + //Tests. + try + { + std::cout << "[INFO] Checking zim file " << filename << std::endl; + + //Test 0: Low-level ZIM-file structure integrity checks + if(integrity) + test_integrity(filename, error); + + // Does it make sense to do the other checks if the integrity + // check fails? + zim::File f( filename ); + + //Test 1: Internal Checksum + if(checksum) { + if ( integrity ) { + std::cout << "[INFO] Avoiding redundant checksum test" + << " (already performed by the integrity check)." + << std::endl; + } else { + test_checksum(f, error); + } + } + + //Test 2: Metadata Entries: + //The file is searched for the compulsory metadata entries. + if(metadata) + test_metadata(f, error); + + //Test 3: Test for Favicon. + if(favicon) + test_favicon(f, error); + + + //Test 4: Main Page Entry + if(main_page) + test_mainpage(f, error); + + /* Now we want to avoid to loop on the tests but on the article. + * + * If we loop of the tests we will have : + * + * for (test: tests) { + * for(article: articles) { + * data = article->getData(); + * ... + * } + * } + * + * And so we will get several the data of an article (and so decompression and so). 
+ * By looping on the articles first, we have : + * + * for (article: articles) { + * data = article->getData() { + * for (test: tests) { + * ... + * } + * } + */ + + if ( redundant_data || url_check || url_check_external || empty_check ) + test_articles(f, error, progress, redundant_data, url_check, url_check_external, empty_check); + + + //Test 8: Verifying MIME Types + //MIME Checks is intended to verify that all the MIME types of all different articles are listed in the file header. + //As of now, there is no method in the existing zimlib to get the list of MIME types listed in the file header. + //A bug has been reported for the above problem, and once the bug is fixed, it will be used to add MIME checks to the zimcheck tool. + /* + if(mime_check) + { + std::cout<<"\nTest 8: Verifying MIME Types.. \n"< - * Copyright (C) Emmanuel Engelhart - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - * MA 02110-1301, USA. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "progress.h" -#include "version.h" - -enum TestType { - CHECKSUM, - EMPTY, - METADATA, - FAVICON, - MAIN_PAGE, - REDUNDANT, - URL_INTERNAL, - URL_EXTERNAL, - MIME, - OTHER -}; - -enum StatusCode : int { - PASS = 0, - FAIL = 1, - EXCEPTION = 2 -}; - -static const char *errorString[OTHER+1] = { - "Invalid checksum", - "Missing metadata entries", - "Missing favicon", - "Missing mainpage", - "Redundant data found", - "Invalid internal links found", - "Invalid external links found", - "Incoherent mimeType found", - "Other errors found" -}; - - -class ErrorLogger { - private: - std::map> errors; - bool testStatus[OTHER+1]; - - public: - ErrorLogger() { - for (int testType=CHECKSUM; testType<=OTHER; ++testType) { - testStatus[testType] = true; - } - } - - void setTestResult(TestType type, bool status) { - testStatus[type] = status; - } - - void addError(TestType type, const std::string& message) { - errors[type].push_back(message); - } - - void report(bool error_details) { - for (int testType=CHECKSUM; testType<=OTHER; ++testType) { - if (!testStatus[testType]) { - std::cout << "[ERROR] " << errorString[testType] << " :" << std::endl; - for (auto& msg: errors[(TestType)testType]) { - std::cout << " " << msg << std::endl; - } - } - } - } - - bool overalStatus() { - for (int testType=CHECKSUM; testType<=OTHER; ++testType) { - if (!testStatus[testType]) { - return false; - } - } - return true; - } -}; - - -std::vector getLinks(const std::string& page, bool withHref = true) //Returns a vector of the links in a particular page. 
includes links under 'href' and 'src' -{ - const char* p = page.c_str(); - const char* linkStart; - std::vector links; - - while (*p) { - if (withHref && strncmp(p, " href", 5) == 0) { - p += 5; - } else if (strncmp(p, " src", 4) == 0) { - p += 4; - } else { - p += 1; - continue; - } - - while (*p == ' ') - p += 1 ; - if (*(p++) != '=') - continue; - while (*p == ' ') - p += 1; - char delimiter = *p++; - if (delimiter != '\'' && delimiter != '"') - continue; - - linkStart = p; - // [TODO] Handle escape char - while(*p != delimiter) - p++; - links.push_back(std::string(linkStart, p)); - p += 1; - } - return links; -} - -std::vector getDependencies(const std::string& page) //Returns a vector of the links in a particular page. includes links under 'href' and 'src' -{ - return getLinks(page, false); -} - -int adler32(std::string buf) //Adler32 Hash Function. Used to hash the BLOB data obtained from each article, for redundancy checks. -{ //Please note that the adler32 hash function has a high number of collisions, and that the hash match is not taken as final. - unsigned int s1 = 1; - unsigned int s2 = 0; - unsigned int sz=buf.size(); - for (size_t n = 0; n ://" or "geo:" or "tel:" or "javascript:" or "mailto:" - static std::regex external_url_regex = - std::regex("([^:/?#]+:\\/\\/|geo:|tel:|mailto:|javascript:).*", - std::regex_constants::icase); - return std::regex_match(input_string, external_url_regex); -} - -// Checks if a URL is an internal URL or not. Uses RegExp. -inline bool isInternalUrl(const std::string& input_string) -{ - return !isExternalUrl(input_string); -} - -//Removes extra spaces from URLs. Usually done by the browser, so web authors sometimes tend to ignore it. -//Converts the %20 to space.Essential for comparing URLs. 
- -std::string normalize_link(const std::string& input, const std::string& baseUrl) -{ - std::string output; - output.reserve(baseUrl.size() + input.size() + 1); - - bool in_query = false; - bool check_rel = false; - const char* p = input.c_str(); - if ( *(p) == '/') { - // This is an absolute url. - p++; - } else { - //This is a relative url, use base url - output = baseUrl; - if (output.back() != '/') - output += '/'; - check_rel = true; - } - - //URL Decoding. - while (*p) - { - if ( !in_query && check_rel ) { - if (strncmp(p, "../", 3) == 0) { - // We must go "up" - // Remove the '/' at the end of output. - output.resize(output.size()-1); - // Remove the last part. - auto pos = output.find_last_of('/'); - output.resize(pos==output.npos?0:pos); - // Move after the "..". - p += 2; - check_rel = false; - continue; - } - if (strncmp(p, "./", 2) == 0) { - // We must simply skip this part - // Simply move after the ".". - p += 2; - check_rel = false; - continue; - } - } - if ( *p == '#' || *p == '?') - // This is a beginning of the #anchor inside a page. No need to decode more - break; - if ( *p == '%') - { - char ch; - sscanf(p+1, "%2hhx", &ch); - output += ch; - p += 3; - continue; - } - if ( *p == '?' ) { - // We are in the query, so don't try to interprete '/' as path separator - in_query = true; - } - if ( *p == '/') { - check_rel = true; - if (output.empty()) { - // Do not add '/' at beginning of output - p++; - continue; - } - } - output += *(p++); - } - return output; -} - -void displayHelp() -{ - std::cout<<"\n" - "zimcheck\n" - "Written by : Kiran Mathew Koshy\n" - "A tool to check the quality of a ZIM file\n." - "To list the details of the error reported, add a flag -D.\n" - "Usage: zimcheck [options] zimfile\n" - "options:\n" - "-A , --all run all tests. 
Default if no flags are given.\n" - "-0 , --empty Empty content\n" - "-C , --checksum Internal CheckSum Test\n" - "-M , --metadata MetaData Entries\n" - "-F , --favicon Favicon\n" - "-P , --main Main page\n" - "-R , --redundant Redundant data check\n" - "-U , --url_internal URL check - Internal URLs\n" - "-X , --url_external URL check - External URLs\n" - "-E , --mime MIME checks\n" - "-D , --details Details of error\n" - "-B , --progress Print progress report\n" - "-H , --help Displays Help\n" - "-V , --version Displays software version\n" - "examples:\n" - "zimcheck -A wikipedia.zim\n" - "zimcheck --checksum --redundant wikipedia.zim\n" - "zimcheck -F -R wikipedia.zim\n" - "zimcheck -M --favicon wikipedia.zim\n"; - return; -} - - -void test_checksum(zim::File& f, ErrorLogger& reporter) { - std::cout << "[INFO] Verifying Internal Checksum.. " << std::endl; - bool result = f.verify(); - reporter.setTestResult(CHECKSUM, result); - if( result ) - std::cout << " [INFO] Internal checksum found correct" << std::endl; - else - { - std::cout << " [ERROR] Wrong Checksum in ZIM file" << std::endl; - std::ostringstream ss; - ss << "ZIM File Checksum in file: " << f.getChecksum() << std::endl; - reporter.addError(CHECKSUM, ss.str()); - } -} - - -void test_metadata(const zim::File& f, ErrorLogger& reporter) { - std::cout << "[INFO] Searching for metadata entries.." << std::endl; - static const char* const test_meta[] = { - "Title", - "Creator", - "Publisher", - "Date", - "Description", - "Language"}; - for (auto &meta : test_meta) { - auto article = f.getArticle('M', meta); - if (!article.good()) { - reporter.setTestResult(METADATA, false); - reporter.addError(METADATA, meta); - } - } -} - -void test_favicon(const zim::File& f, ErrorLogger& reporter) { - std::cout << "[INFO] Searching for Favicon.." 
<< std::endl; - static const char* const favicon_paths[] = {"-/favicon.png", "I/favicon.png", "I/favicon", "-/favicon"}; - for (auto &path: favicon_paths) { - auto article = f.getArticleByUrl(path); - if (article.good()) { - return; - } - } - reporter.setTestResult(FAVICON, false); -} - -void test_mainpage(const zim::File& f, ErrorLogger& reporter) { - std::cout << "[INFO] Searching for main page.." << std::endl; - zim::Fileheader fh=f.getFileheader(); - bool testok = true; - if( !fh.hasMainPage() ) - testok = false; - else if( fh.getMainPage() > fh.getArticleCount() ) - testok = false; - reporter.setTestResult(MAIN_PAGE, testok); - if (!testok) { - std::ostringstream ss; - ss << "Main Page Index stored in File Header: " << fh.getMainPage(); - reporter.addError(MAIN_PAGE, ss.str()); - } -} - - -void test_articles(const zim::File& f, ErrorLogger& reporter, ProgressBar progress, - bool redundant_data, bool url_check, bool url_check_external, bool empty_check) { - std::cout << "[INFO] Verifying Articles' content.. " << std::endl; - // Article are store in a map>. - // So all article with the same hash will be stored in the same list. 
- std::map> hash_main; - - std::string previousLink; - int previousIndex = -1; - - progress.reset(f.getFileheader().getArticleCount()); - for (zim::File::const_iterator it = f.begin(); it != f.end(); ++it) - { - progress.report(); - - if (it->getArticleSize() == 0 && - empty_check && - (it->getNamespace() == 'A' || - it->getNamespace() == 'I')) { - std::ostringstream ss; - ss << "Entry " << it->getLongUrl() << " is empty"; - reporter.addError(EMPTY, ss.str()); - reporter.setTestResult(EMPTY, false); - } - - if (it->isRedirect() || - it->isLinktarget() || - it->isDeleted() || - it->getArticleSize() == 0 || - it->getNamespace() == 'M') { - continue; - } - - std::string data; - if (redundant_data || it->getMimeType() == "text/html") - data = it->getData(); - - if(redundant_data) - hash_main[adler32(data)].push_back( it->getIndex() ); - - if (it->getMimeType() != "text/html") - continue; - - if(url_check) - { - auto baseUrl = it->getLongUrl(); - auto pos = baseUrl.find_last_of('/'); - baseUrl.resize( pos==baseUrl.npos ? 
0 : pos ); - - auto links = getLinks(it->getData()); - for(auto olink: links) - { - if (olink.front() == '#' || olink.front() == '?') - continue; - if (isInternalUrl(olink)) { - auto link = normalize_link(olink, baseUrl); - char nm = link[0]; - std::string shortUrl(link.substr(2)); - auto a = f.getArticle(nm, shortUrl); - if (!a.good()) - { - int index = it->getIndex(); - if ((previousLink != link) && (previousIndex != index) ) - { - std::ostringstream ss; - ss << link << " (" << olink << ") was not found in article " << it->getLongUrl(); - reporter.addError(URL_INTERNAL, ss.str()); - previousLink = link; - previousIndex = index; - } - reporter.setTestResult(URL_INTERNAL, false); - } - } - } - } - - if (url_check_external) - { - if (it->getMimeType() != "text/html") - continue; - - auto links = getDependencies(it->getPage()); - for (auto &link: links) - { - if (isExternalUrl( link )) - { - std::ostringstream ss; - ss << link << " is an external dependence in article " << it->getLongUrl(); - reporter.addError(URL_EXTERNAL, ss.str()); - reporter.setTestResult(URL_EXTERNAL, false); - break; - } - } - } - } - - if (redundant_data) - { - std::cout << "[INFO] Searching for redundant articles.." << std::endl; - std::cout << " Verifying Similar Articles for redundancies.." << std::endl; - std::ostringstream output_details; - progress.reset(hash_main.size()); - for(auto &it: hash_main) - { - progress.report(); - auto l = it.second; - // If only one article has this hash, no need to test. 
- if(l.size() <= 1) - continue; - for (auto current=l.begin(); current!=l.end(); current++) { - auto a1 = f.getArticle(*current); - std::string s1 = a1.getData(); - for(auto other=std::next(current); other!=l.end(); other++) { - auto a2 = f.getArticle(*other); - std::string s2 = a2.getData(); - if (s1 != s2 ) - continue; - - reporter.setTestResult(REDUNDANT, false); - std::ostringstream ss; - ss << a1.getTitle() << " (idx " << a1.getIndex() << ") and " - << a2.getTitle() << " (idx " << a2.getIndex() << ")"; - reporter.addError(REDUNDANT, ss.str()); - } - } - } - } -} - - -int main (int argc, char **argv) -{ - // To calculate the total time taken by the program to run. - time_t startTime,endTime; - double timeDiffference; - time( &startTime); - - // The boolean values which will be used to store the output from - // getopt_long(). These boolean values will be then read by the - // program to execute the different parts of the program. - - bool run_all = false; - bool checksum = false; - bool metadata = false; - bool favicon = false; - bool main_page = false; - bool redundant_data = false; - bool url_check = false; - bool url_check_external = false; - bool empty_check = false; - bool mime_check = false; - bool error_details = false; - bool no_args = true; - bool help = false; - - std::string filename = ""; - ProgressBar progress(1); - ErrorLogger error; - - StatusCode status_code = PASS; - - //Parsing through arguments using getopt_long(). Both long and short arguments are allowed. 
- while (1) - { - static struct option long_options[] = - { - { "all", no_argument, 0, 'A'}, - { "empty", no_argument, 0, '0'}, - { "checksum", no_argument, 0, 'C'}, - { "metadata", no_argument, 0, 'M'}, - { "favicon", no_argument, 0, 'F'}, - { "main", no_argument, 0, 'P'}, - { "redundant", no_argument, 0, 'R'}, - { "url_internal", no_argument, 0, 'U'}, - { "url_external", no_argument, 0, 'X'}, - { "mime", no_argument, 0, 'E'}, - { "details", no_argument, 0, 'D'}, - { "help", no_argument, 0, 'H'}, - { "version", no_argument, 0, 'V'}, - { 0, 0, 0, 0} - }; - int option_index = 0; - int c = getopt_long (argc, argv, "ACMFPRUXEDHBVacmfpruxedhbv", - long_options, &option_index); - //c = getopt (argc, argv, "ACMFPRUXED"); - if(c == -1) - break; - switch (c) - { - case 'A': - case 'a': - run_all = true; - no_args = false; - break; - case '0': - empty_check = true; - break; - case 'C': - case 'c': - checksum = true; - no_args = false; - break; - case 'M': - case 'm': - metadata = true; - no_args = false; - break; - case 'B': - case 'b': - progress.set_progress_report(true); - break; - case 'F': - case 'f': - favicon = true; - no_args = false; - break; - case 'P': - case 'p': - main_page = true; - no_args = false; - break; - case 'R': - case 'r': - redundant_data = true; - no_args = false; - break; - case 'U': - case 'u': - url_check = true; - no_args = false; - break; - case 'X': - case 'x': - url_check_external = true; - no_args = false; - break; - case 'E': - case 'e': - mime_check = true; - no_args = false; - break; - case 'D': - case 'd': - error_details = true; - break; - case 'H': - case 'h': - help=true; - break; - case '?': - if (optopt == 'c') - { - std::cerr<<"Option "<<(char)optopt<<" requires an argument.\n"; - displayHelp(); - } - else if ( isprint (optopt) ) - std::cerr<<"Unknown option `"<<( char )optopt<<"'.\n"; - else - { - std::cerr<<"Unknown option\n"; - displayHelp(); - } - return 1; - case 'V': - case 'v': - version(); - return 0; - default: - abort (); 
- } - } - - //Displaying Help for --help argument - if(help) - { - displayHelp(); - return -1; - } - - //If no arguments are given to the program, all the tests are performed. - if ( run_all || no_args ) - { - checksum = metadata = favicon = main_page = redundant_data = - url_check = url_check_external = mime_check = empty_check = true; - } - - //Obtaining filename from argument list - filename = ""; - for(int i = 0; i < argc; i++) - { - if( (argv[i][0] != '-') && (i != 0)) - { - filename = argv[i]; - } - } - if(filename == "") - { - std::cerr<<"No file provided as argument\n"; - displayHelp(); - return -1; - } - //Tests. - try - { - std::cout << "[INFO] Checking zim file " << filename << std::endl; - zim::File f( filename ); - - //Test 1: Internal Checksum - if(checksum) - test_checksum(f, error); - - //Test 2: Metadata Entries: - //The file is searched for the compulsory metadata entries. - if(metadata) - test_metadata(f, error); - - //Test 3: Test for Favicon. - if(favicon) - test_favicon(f, error); - - - //Test 4: Main Page Entry - if(main_page) - test_mainpage(f, error); - - /* Now we want to avoid to loop on the tests but on the article. - * - * If we loop of the tests we will have : - * - * for (test: tests) { - * for(article: articles) { - * data = article->getData(); - * ... - * } - * } - * - * And so we will get several the data of an article (and so decompression and so). - * By looping on the articles first, we have : - * - * for (article: articles) { - * data = article->getData() { - * for (test: tests) { - * ... - * } - * } - */ - - if ( redundant_data || url_check || url_check_external || empty_check ) - test_articles(f, error, progress, redundant_data, url_check, url_check_external, empty_check); - - - //Test 8: Verifying MIME Types - //MIME Checks is intended to verify that all the MIME types of all different articles are listed in the file header. 
- //As of now, there is no method in the existing zimlib to get the list of MIME types listed in the file header. - //A bug has been reported for the above problem, and once the bug is fixed, it will be used to add MIME checks to the zimcheck tool. - /* - if(mime_check) - { - std::cout<<"\nTest 8: Verifying MIME Types.. \n"< -#include "arg.h" +#include + #include "version.h" #define BUFFER_SIZE 4096 +#define DEFAULT_PART_SIZE 2147483648 + class ZimSplitter { private: zim::File file; const std::string prefix; - size_t partSize; + zim::size_type partSize; char first_index, second_index; std::ifstream ifile; std::ofstream ofile; std::string part_name; - size_t out_size; + zim::size_type out_size; char* batch_buffer; public: - ZimSplitter(const std::string& fname, const std::string& out_prefix, size_t partSize) + ZimSplitter(const std::string& fname, const std::string& out_prefix, zim::size_type partSize) : file(fname), prefix(out_prefix), partSize(partSize), @@ -94,12 +97,12 @@ copy_out(file.getClusterOffset(0)); } - void copy_out(zim::offset_type size) { + void copy_out(zim::size_type size) { while (size > 0) { - auto size_to_copy = std::min(size, BUFFER_SIZE); + auto size_to_copy = std::min(size, BUFFER_SIZE); ifile.read(batch_buffer, size_to_copy); if (!ifile) { - throw std::runtime_error("Error while reading zim file"); + throw std::runtime_error("Error while reading zim file"); } ofile.write(batch_buffer, size_to_copy); if (!ofile) { @@ -131,7 +134,7 @@ if (out_size+lastPartSize > partSize) { new_file(); } - copy_out(lastPartSize); + copy_out(lastPartSize); } bool check() { @@ -159,39 +162,44 @@ } }; +static const char USAGE[] = R"( + zimsplit splits smartly a ZIM file in smaller parts. + +Usage: + zimsplit [--prefix=PREFIX] [--force] [--size=N] + zimsplit --version + +Options: + --prefix=PREFIX Prefix of output file parts. Default: + --size=N The file size for each part. 
Default: 2GB + --force Create zim parts even if it is impossible to have all part size smaller than requested + -h, --help Show this help message + --version Show zimsplit version. +)"; + int main(int argc, char* argv[]) { try { - zim::Arg out_prefix(argc, argv, 'o'); - zim::Arg part_size(argc, argv, 's'); - zim::Arg force(argc, argv, "--force"); - zim::Arg printVersion(argc, argv, "--version"); - - // version number - if (printVersion) - { - version(); - return 0; - } - - if (argc <= 1) - { - std::cerr << "usage: " << argv[0] << " [options] zimfile\n" - "\n" - "options:\n" - " -o prefix of output file parts\n" - " -s size of each file parts\n" - " --force create zim parts even if it is impossible to have all part size smaller than requested\n" - " --version print the software version\n" - << std::flush; - return -1; - } + + std::string versionstr("zimsplit " + std::string(VERSION)); + std::map args = docopt::docopt(USAGE, + {argv + 1, argv + argc}, + true, + versionstr); + + std::string prefix = args[""].asString(); + if (args["--prefix"]) + prefix = args["--prefix"].asString(); + + zim::size_type size = DEFAULT_PART_SIZE; + if (args["--size"]) + size = args["--size"].asLong(); // initalize app - ZimSplitter app(argv[1], out_prefix, part_size); + ZimSplitter app(args[""].asString(), prefix, size); - if (!force && app.check()) { + if (!args["--force"] && app.check()) { std::cout << "Creation of zim parts canceled because of previous errors." << std::endl; std::cout << "Use --force option to create zim parts anyway." << std::endl; return -1; diff -Nru zim-tools-2.0.0/src/zimwriterfs/article.cpp zim-tools-2.1.0/src/zimwriterfs/article.cpp --- zim-tools-2.0.0/src/zimwriterfs/article.cpp 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/zimwriterfs/article.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -18,15 +18,15 @@ * MA 02110-1301, USA. 
*/ -#include "article.h" -#include "tools.h" -#include "../tools.h" - #include #include #include -extern std::string directoryPath; +#include "article.h" +#include "tools.h" +#include "../tools.h" +#include "zimcreatorfs.h" + zim::writer::Url Article::getUrl() const { @@ -72,18 +72,19 @@ dataRead = true; } -FileArticle::FileArticle(const std::string& path, const bool detectRedirects) - : dataRead(false) +FileArticle::FileArticle(const ZimCreatorFS *_creator, const std::string& full_path, const bool detect_html_redirects) + : creator(_creator) + , dataRead(false) { invalid = false; - url = path.substr(directoryPath.size() + 1); + url = full_path.substr(creator->basedir().size() + 1); /* mime-type */ - mimeType = getMimeTypeForFile(url); + mimeType = getMimeTypeForFile(creator->basedir(), url); /* namespace */ - ns = getNamespaceForMimeType(mimeType)[0]; + ns = getNamespaceForMimeType(mimeType, creator->uniqNamespace())[0]; /* HTML specific code */ if ( mimeType.find("text/html") != std::string::npos @@ -92,7 +93,7 @@ } if ( mimeType.find("text/html") != std::string::npos ) { - parseAndAdaptHtml(detectRedirects); + parseAndAdaptHtml(detect_html_redirects); } else if (mimeType.find("text/css") != std::string::npos) { adaptCss(); } @@ -151,7 +152,7 @@ } if (!targetUrl.empty()) { auto redirectUrl = computeAbsolutePath(url, decodeUrl(targetUrl)); - if (!fileExists(directoryPath + "/" + redirectUrl)) { + if (!fileExists(creator->basedir() + "/" + redirectUrl)) { redirectUrl.clear(); invalid = true; } else { @@ -190,7 +191,7 @@ && target.substr(0, 5) != "data:") { replaceStringInPlace(data, "\"" + target + "\"", - "\"" + computeNewUrl(url, longUrl, target) + "\""); + "\"" + creator->computeNewUrl(url, longUrl, target) + "\""); } } gumbo_destroy_output(&kGumboDefaultOptions, output); @@ -232,7 +233,7 @@ /* Embeded fonts need to be inline because Kiwix is otherwise not able to load same because of the same-origin security */ - std::string mimeType = getMimeTypeForFile(path); 
+ std::string mimeType = getMimeTypeForFile(creator->basedir(), path); if (mimeType == "application/font-ttf" || mimeType == "application/font-woff" || mimeType == "application/font-woff2" @@ -240,7 +241,7 @@ || mimeType == "application/vnd.ms-fontobject") { try { std::string fontContent = getFileContent( - directoryPath + "/" + computeAbsolutePath(this->url, path)); + creator->basedir() + "/" + computeAbsolutePath(this->url, path)); replaceStringInPlaceOnce( data, startDelimiter + url + endDelimiter, @@ -260,7 +261,7 @@ replaceStringInPlaceOnce( data, startDelimiter + url + endDelimiter, - startDelimiter + computeNewUrl(this->url, longUrl, path) + endDelimiter); + startDelimiter + creator->computeNewUrl(this->url, longUrl, path) + endDelimiter); } } } @@ -276,7 +277,7 @@ std::string FileArticle::_getFilename() const { - return directoryPath + "/" + url; + return creator->basedir() + "/" + url; } std::string FileArticle::getFilename() const @@ -298,16 +299,18 @@ return in.tellg(); } -RedirectArticle::RedirectArticle(char ns, +RedirectArticle::RedirectArticle(const ZimCreatorFS *_creator, + char ns, const std::string& url, const std::string& title, const zim::writer::Url& redirectUrl) + : creator(_creator) { this->ns = ns; this->url = url; this->title = title; this->redirectUrl = redirectUrl; - mimeType = getMimeTypeForFile(redirectUrl.getUrl()); + mimeType = getMimeTypeForFile(creator->basedir(), redirectUrl.getUrl()); } SimpleMetadataArticle::SimpleMetadataArticle(const std::string& id, diff -Nru zim-tools-2.0.0/src/zimwriterfs/article.h zim-tools-2.1.0/src/zimwriterfs/article.h --- zim-tools-2.0.0/src/zimwriterfs/article.h 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/zimwriterfs/article.h 2020-11-17 16:00:36.000000000 +0000 @@ -25,7 +25,7 @@ #include #include -extern std::string favicon; +class ZimCreatorFS; class Article : public zim::writer::Article { @@ -85,6 +85,7 @@ } }; +/// This class creates a redirect entry to image/png favicon class 
MetadataFaviconArticle : public MetadataArticle { private: @@ -119,6 +120,7 @@ class FileArticle : public Article { private: + const ZimCreatorFS *creator; mutable std::string data; mutable bool dataRead; bool invalid; @@ -128,20 +130,27 @@ void adaptCss(); public: - explicit FileArticle(const std::string& path, - const bool detectRedirects = true); + //! Must be initialized with full file path + explicit FileArticle(const ZimCreatorFS *creator, + const std::string& full_path, + const bool detect_html_redirects = true); virtual zim::Blob getData() const; virtual bool isLinktarget() const { return false; } virtual bool isDeleted() const { return false; } virtual zim::size_type getSize() const; + + //! Returns full filename; or empty string if content already read from the file virtual std::string getFilename() const; + virtual bool isInvalid() const; }; +/// Redirect entry from user-supplied file class RedirectArticle : public Article { public: - explicit RedirectArticle(char ns, + explicit RedirectArticle(const ZimCreatorFS *creator, + char ns, const std::string& url, const std::string& title, const zim::writer::Url& redirectUrl); @@ -151,6 +160,8 @@ virtual bool isDeleted() const { return false; } virtual zim::size_type getSize() const { return 0; } virtual std::string getFilename() const { return ""; } +private: + const ZimCreatorFS *creator; }; #endif // OPENZIM_ZIMWRITERFS_ARTICLE_H diff -Nru zim-tools-2.0.0/src/zimwriterfs/queue.h zim-tools-2.1.0/src/zimwriterfs/queue.h --- zim-tools-2.0.0/src/zimwriterfs/queue.h 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/zimwriterfs/queue.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,88 +0,0 @@ -/* - * Copyright 2016 Matthieu Gautier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - * MA 02110-1301, USA. - */ - -#ifndef OPENZIM_ZIMWRITERFS_QUEUE_H -#define OPENZIM_ZIMWRITERFS_QUEUE_H - -#define MAX_QUEUE_SIZE 100 - -#include -#include - -template -class Queue { - public: - Queue() {pthread_mutex_init(&m_queueMutex,NULL);}; - virtual ~Queue() {pthread_mutex_destroy(&m_queueMutex);}; - virtual bool isEmpty(); - virtual void pushToQueue(const T& element); - virtual bool popFromQueue(T &filename); - - protected: - std::queue m_realQueue; - pthread_mutex_t m_queueMutex; - - private: - // Make this queue non copyable - Queue(const Queue&); - Queue& operator=(const Queue&); -}; - -template -bool Queue::isEmpty() { - pthread_mutex_lock(&m_queueMutex); - bool retVal = m_realQueue.empty(); - pthread_mutex_unlock(&m_queueMutex); - return retVal; -} - -template -void Queue::pushToQueue(const T &element) { - unsigned int wait = 0; - unsigned int queueSize = 0; - - do { - usleep(wait); - pthread_mutex_lock(&m_queueMutex); - queueSize = m_realQueue.size(); - pthread_mutex_unlock(&m_queueMutex); - wait += 10; - } while (queueSize > MAX_QUEUE_SIZE); - - pthread_mutex_lock(&m_queueMutex); - m_realQueue.push(element); - pthread_mutex_unlock(&m_queueMutex); -} - -template -bool Queue::popFromQueue(T &element) { - pthread_mutex_lock(&m_queueMutex); - if (m_realQueue.empty()) { - pthread_mutex_unlock(&m_queueMutex); - return false; - } - - element = m_realQueue.front(); - m_realQueue.pop(); - pthread_mutex_unlock(&m_queueMutex); - - return true; -} - -#endif // OPENZIM_ZIMWRITERFS_QUEUE_H \ No newline at end of file 
diff -Nru zim-tools-2.0.0/src/zimwriterfs/tools.cpp zim-tools-2.1.0/src/zimwriterfs/tools.cpp --- zim-tools-2.0.0/src/zimwriterfs/tools.cpp 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/zimwriterfs/tools.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -70,16 +70,20 @@ extMimeTypes["json"] = "application/json"; extMimeTypes["CSS"] = "text/css"; extMimeTypes["css"] = "text/css"; - extMimeTypes["otf"] = "application/vnd.ms-opentype"; - extMimeTypes["OTF"] = "application/vnd.ms-opentype"; + extMimeTypes["otf"] = "font/otf"; + extMimeTypes["OTF"] = "font/otf"; + extMimeTypes["sfnt"] = "font/sfnt"; + extMimeTypes["SFNT"] = "font/sfnt"; extMimeTypes["eot"] = "application/vnd.ms-fontobject"; extMimeTypes["EOT"] = "application/vnd.ms-fontobject"; - extMimeTypes["ttf"] = "application/font-ttf"; - extMimeTypes["TTF"] = "application/font-ttf"; - extMimeTypes["woff"] = "application/font-woff"; - extMimeTypes["WOFF"] = "application/font-woff"; - extMimeTypes["woff2"] = "application/font-woff2"; - extMimeTypes["WOFF2"] = "application/font-woff2"; + extMimeTypes["ttf"] = "font/ttf"; + extMimeTypes["TTF"] = "font/ttf"; + extMimeTypes["collection"] = "font/collection"; + extMimeTypes["COLLECTION"] = "font/collection"; + extMimeTypes["woff"] = "font/woff"; + extMimeTypes["WOFF"] = "font/woff"; + extMimeTypes["woff2"] = "font/woff2"; + extMimeTypes["WOFF2"] = "font/woff2"; extMimeTypes["vtt"] = "text/vtt"; extMimeTypes["VTT"] = "text/vtt"; extMimeTypes["webm"] = "video/webm"; @@ -110,7 +114,6 @@ static std::map fileMimeTypes; -extern std::string directoryPath; extern bool inflateHtmlFlag; extern magic_t magic; @@ -265,7 +268,7 @@ } } -std::string getMimeTypeForFile(const std::string& filename) +std::string getMimeTypeForFile(const std::string &directoryPath, const std::string& filename) { std::string mimeType; @@ -298,33 +301,3 @@ return mimeType; } } - -inline std::string removeLocalTagAndParameters(const std::string& url) -{ - std::string retVal = url; - std::size_t found; 
- - /* Remove URL arguments */ - found = retVal.find("?"); - if (found != std::string::npos) { - retVal = retVal.substr(0, found); - } - - /* Remove local tag */ - found = retVal.find("#"); - if (found != std::string::npos) { - retVal = retVal.substr(0, found); - } - - return retVal; -} - -std::string computeNewUrl(const std::string& aid, const std::string& baseUrl, const std::string& targetUrl) -{ - std::string filename = computeAbsolutePath(aid, targetUrl); - std::string targetMimeType - = getMimeTypeForFile(decodeUrl(removeLocalTagAndParameters(filename))); - std::string newUrl - = "/" + getNamespaceForMimeType(targetMimeType) + "/" + filename; - return computeRelativePath(baseUrl, newUrl); -} diff -Nru zim-tools-2.0.0/src/zimwriterfs/zimcreatorfs.cpp zim-tools-2.1.0/src/zimwriterfs/zimcreatorfs.cpp --- zim-tools-2.0.0/src/zimwriterfs/zimcreatorfs.cpp 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/zimwriterfs/zimcreatorfs.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -26,9 +26,30 @@ #include #include #include +#include +#include +#include bool isVerbose(); +ZimCreatorFS::ZimCreatorFS(std::string _directoryPath, std::string mainPage, + bool verbose, bool uniqueNamespace, bool zstd) + : zim::writer::Creator(verbose, zstd ? 
zim::zimcompZstd : zim::zimcompLzma), + directoryPath(_directoryPath), + mainPage(mainPage), + uniqueNamespace(uniqueNamespace) +{ + char buf[PATH_MAX]; + + if (realpath(directoryPath.c_str(), buf) != buf) { + throw std::invalid_argument( + Formatter() << "Unable to canonicalize HTML directory path " + << directoryPath << ": " << strerror(errno)); + } + + canonical_basedir = buf; +} + zim::writer::Url ZimCreatorFS::getMainUrl() const { return zim::writer::Url('A', mainPage); @@ -53,10 +74,11 @@ } auto redirectArticle = std::make_shared( - matches[1].str()[0], - matches[2].str(), - matches[3].str(), - matches[4].str()); + this, + matches[1].str()[0], // ns + matches[2].str(), // URL + matches[3].str(), // title + matches[4].str()); // redirect URL addArticle(redirectArticle); ++line_number; } @@ -90,10 +112,10 @@ switch (entry->d_type) { case DT_REG: + addArticle(fullEntryName); + break; case DT_LNK: - { - addArticle(fullEntryName); - } + processSymlink(path, fullEntryName); break; case DT_DIR: visitDirectory(fullEntryName); @@ -140,15 +162,15 @@ closedir(directory); } -void ZimCreatorFS::addMetadata(const std::string& metadata, const std::string& content) +void ZimCreatorFS::addMetadata(const std::string& title, const std::string& content) { - auto article = std::make_shared(metadata, content); + auto article = std::make_shared(title, content); addArticle(article); } void ZimCreatorFS::addArticle(const std::string& path) { - auto farticle = std::make_shared(path); + auto farticle = std::make_shared(this, path); if (farticle->isInvalid()) { return; } @@ -163,6 +185,49 @@ } } +void ZimCreatorFS::processSymlink(const std::string& curdir, const std::string& symlink_path) +{ + /* #102 Links can be 3 different types: + * - dandling (not pointing to a valid file) + * - pointing to file but outside of 'directoryPath' + * - looped symlinks + */ + char resolved[PATH_MAX]; + if (realpath(symlink_path.c_str(), resolved) != resolved) { + // looping symlinks also fall here: Too 
many levels of symbolic links + // It also handles dangling symlink: No such file or directory + std::cerr << "Unable to resolve symlink " << symlink_path + << ": " << strerror(errno) << std::endl; + return; + } + + if (isDirectory(resolved)) { + std::cerr << "Skip symlink " << symlink_path + << ": points to a directory" << std::endl; + return; + } + + if (strncmp(canonical_basedir.c_str(), resolved, canonical_basedir.size()) != 0 + || resolved[canonical_basedir.size()] != '/') { + std::cerr << "Skip symlink " << symlink_path + << ": points outside of HTML directory" << std::endl; + return; + } + + std::string source_url = symlink_path.substr(directoryPath.size() + 1); + std::string source_mimeType = getMimeTypeForFile(directoryPath, source_url); + const char source_ns = getNamespaceForMimeType(source_mimeType, uniqNamespace())[0]; + + std::string target_url = std::string(resolved).substr(canonical_basedir.size() + 1); + std::string target_mimeType = getMimeTypeForFile(directoryPath, target_url); + const char target_ns = getNamespaceForMimeType(target_mimeType, uniqNamespace())[0]; + + std::shared_ptr redirect_article( + new RedirectArticle(this, source_ns, source_url, "", + zim::writer::Url(target_ns, target_url))); + addArticle(redirect_article); +} + void ZimCreatorFS::finishZimCreation() { for(auto& handler: articleHandlers) { @@ -175,3 +240,33 @@ { articleHandlers.push_back(handler); } + +inline std::string removeLocalTagAndParameters(const std::string& url) +{ + std::string retVal = url; + std::size_t found; + + /* Remove URL arguments */ + found = retVal.find("?"); + if (found != std::string::npos) { + retVal = retVal.substr(0, found); + } + + /* Remove local tag */ + found = retVal.find("#"); + if (found != std::string::npos) { + retVal = retVal.substr(0, found); + } + + return retVal; +} + +std::string ZimCreatorFS::computeNewUrl(const std::string& aid, const std::string& baseUrl, const std::string& targetUrl) const +{ + std::string filename = 
computeAbsolutePath(aid, targetUrl); + std::string targetMimeType + = getMimeTypeForFile(directoryPath, decodeUrl(removeLocalTagAndParameters(filename))); + std::string newUrl + = "/" + getNamespaceForMimeType(targetMimeType, uniqNamespace()) + "/" + filename; + return computeRelativePath(baseUrl, newUrl); +} diff -Nru zim-tools-2.0.0/src/zimwriterfs/zimcreatorfs.h zim-tools-2.1.0/src/zimwriterfs/zimcreatorfs.h --- zim-tools-2.0.0/src/zimwriterfs/zimcreatorfs.h 2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/zimwriterfs/zimcreatorfs.h 2020-11-17 16:00:36.000000000 +0000 @@ -38,10 +38,10 @@ class ZimCreatorFS : public zim::writer::Creator { public: - ZimCreatorFS(std::string mainPage, bool verbose) - : zim::writer::Creator(verbose), - mainPage(mainPage) {} + ZimCreatorFS(std::string _directoryPath, std::string mainPage, bool verbose, + bool uniqueNamespace, bool zstd = false); virtual ~ZimCreatorFS() = default; + virtual zim::writer::Url getMainUrl() const; virtual void add_customHandler(IHandler* handler); virtual void add_redirectArticles_from_file(const std::string& path); @@ -52,9 +52,18 @@ virtual void addArticle(std::shared_ptr article); virtual void finishZimCreation(); + void processSymlink(const std::string& curdir, const std::string& symlink_path); + std::string computeNewUrl(const std::string& aid, const std::string& baseUrl, const std::string& targetUrl) const; + const std::string & basedir() const { return directoryPath; } + bool uniqNamespace() const { return uniqueNamespace; } + const std::string & canonicalBaseDir() const { return canonical_basedir; } + private: std::vector articleHandlers; + std::string directoryPath; ///< html dir without trailing slash std::string mainPage; + std::string canonical_basedir; + bool uniqueNamespace; }; #endif // OPENZIM_ZIMWRITERFS_ARTICLESOURCE_H diff -Nru zim-tools-2.0.0/src/zimwriterfs/zimwriterfs.cpp zim-tools-2.1.0/src/zimwriterfs/zimwriterfs.cpp --- zim-tools-2.0.0/src/zimwriterfs/zimwriterfs.cpp 
2020-07-15 15:19:30.000000000 +0000 +++ zim-tools-2.1.0/src/zimwriterfs/zimwriterfs.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -22,7 +22,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -32,7 +35,6 @@ #include "article.h" #include "zimcreatorfs.h" #include "mimetypecounter.h" -#include "queue.h" #include "../tools.h" /* Check for version number */ @@ -40,28 +42,36 @@ #define VERSION "UNKNOWN" #endif -/* Global access strings */ +namespace { +/* Command line options */ std::string language; std::string creator; std::string publisher; std::string title; std::string tags; std::string flavour; -std::string scraper; +std::string scraper = "zimwriterfs-" VERSION; std::string name; std::string source; std::string description; std::string welcome; std::string favicon; -std::string directoryPath; std::string redirectsPath; std::string zimPath; +std::string directoryPath; + +int minChunkSize = 2048; bool verboseFlag = false; -pthread_mutex_t verboseMutex; -bool inflateHtmlFlag = false; bool uniqueNamespace = false; bool withoutFTIndex = false; +bool zstdFlag = false; +} + +// Global flags +bool inflateHtmlFlag = false; + +pthread_mutex_t verboseMutex; magic_t magic; @@ -149,6 +159,8 @@ << std::endl; std::cout << "\t-s, --scraper\t\tname & version of tool used to produce HTML content" << std::endl; + std::cout << "\t-z, --zstd\t\tuse Zstandard as ZIM compression (lzma otherwise)" + << std::endl; std::cout << std::endl; std::cout << "Example:" << std::endl; @@ -159,17 +171,15 @@ std::cout << std::endl; std::cout << "Documentation:" << std::endl; - std::cout << "\tzimwriterfs source code: https://github.com/openzim/zimwriterfs" + std::cout << "\tzimwriterfs source code: https://github.com/openzim/zim-tools" << std::endl; std::cout << "\tZIM format: https://openzim.org" << std::endl; std::cout << std::endl; } -/* Main program entry point */ -int main(int argc, char** argv) -{ - int minChunkSize = 2048; +void parse_args(int argc, char** 
argv) +{ /* Argument parsing */ static struct option long_options[] = {{"help", no_argument, 0, 'h'}, @@ -191,6 +201,7 @@ {"description", required_argument, 0, 'd'}, {"creator", required_argument, 0, 'c'}, {"publisher", required_argument, 0, 'p'}, + {"zstd", no_argument, 0, 'z'}, {"withoutFTIndex", no_argument, 0, 'j'}, // Only for backward compatibility @@ -202,7 +213,7 @@ do { c = getopt_long( - argc, argv, "hVvijxuw:m:f:t:d:c:l:p:r:e:n:", long_options, &option_index); + argc, argv, "hVvijxuzw:m:f:t:d:c:l:p:r:e:n:", long_options, &option_index); if (c != -1) { switch (c) { @@ -271,6 +282,9 @@ case 'w': welcome = optarg; break; + case 'z': + zstdFlag = true; + break; } } } while (c != -1); @@ -298,6 +312,8 @@ } /* Check arguments */ + + // delete / from the end of filename if (directoryPath[directoryPath.length() - 1] == '/') { directoryPath = directoryPath.substr(0, directoryPath.length() - 1); } @@ -327,8 +343,31 @@ tags += "_ftindex:yes"; tags += ";_ftindex"; // For backward compatibility } +} - ZimCreatorFS zimCreator(welcome, isVerbose()); +void create_zim() +{ + ZimCreatorFS zimCreator(directoryPath, welcome, isVerbose(), uniqueNamespace, zstdFlag); + + if (zimPath.size() >= (MAXPATHLEN-1)) { + throw std::invalid_argument("Target .zim file path is too long"); + } + + char buf[MAXPATHLEN]; + strncpy(buf, zimPath.c_str(), sizeof(buf)-1); + // dirname() can modify its argument, so need to pass a copy + std::string zimdir = dirname(buf); + + if (realpath(zimdir.c_str(), buf) != buf) { + throw std::invalid_argument( + Formatter() << "Unable to canonicalize target directory of .zim " + << zimdir << ": " << strerror(errno)); + } + + // Check that the resulting .zim file isn't located under source HTML directory + if (std::string(buf).find(zimCreator.canonicalBaseDir()) == 0) { + throw std::invalid_argument(".zim file to create cannot be located inside of source HTML directory"); + } zimCreator.setMinChunkSize(minChunkSize); zimCreator.setIndexing(!withoutFTIndex, 
language); @@ -347,10 +386,6 @@ zimCreator.addArticle(std::make_shared()); zimCreator.addArticle(std::make_shared(zim::writer::Url('I', favicon))); - /* Init */ - magic = magic_open(MAGIC_MIME); - magic_load(magic, NULL); - pthread_mutex_init(&verboseMutex, NULL); /* Directory visitor */ MimetypeCounter mimetypeCounter; @@ -373,6 +408,25 @@ } } zimCreator.finishZimCreation(); +} + + +/* Main program entry point */ +int main(int argc, char** argv) +{ + /* Init */ + magic = magic_open(MAGIC_MIME); + magic_load(magic, NULL); + pthread_mutex_init(&verboseMutex, NULL); + + try { + parse_args(argc, argv); + create_zim(); + } + catch(std::exception &e) { + std::cerr << "zimwriterfs: " << e.what() << std::endl; + exit(1); + } magic_close(magic); /* Destroy mutex */ Binary files /tmp/tmp9HHlz0/RMav1Dz_3P/zim-tools-2.0.0/test/data/minimal-content/favicon.png and /tmp/tmp9HHlz0/yrleVeAeRQ/zim-tools-2.1.0/test/data/minimal-content/favicon.png differ diff -Nru zim-tools-2.0.0/test/data/minimal-content/hello.html zim-tools-2.1.0/test/data/minimal-content/hello.html --- zim-tools-2.0.0/test/data/minimal-content/hello.html 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/data/minimal-content/hello.html 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,10 @@ + + + + + HTML title tag content + + +

hello, html

+ + diff -Nru zim-tools-2.0.0/test/data/with-symlink/another.html zim-tools-2.1.0/test/data/with-symlink/another.html --- zim-tools-2.0.0/test/data/with-symlink/another.html 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/data/with-symlink/another.html 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,10 @@ + + + + + Another HTML file + + +

This content should be accessible by symlink

+ + diff -Nru zim-tools-2.0.0/test/data/with-symlink/hello.html zim-tools-2.1.0/test/data/with-symlink/hello.html --- zim-tools-2.0.0/test/data/with-symlink/hello.html 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/data/with-symlink/hello.html 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,11 @@ + + + + + HTML title tag content + + +

hello, html

+ Link to symlink + + diff -Nru zim-tools-2.0.0/test/data/with-symlink/symlink.html zim-tools-2.1.0/test/data/with-symlink/symlink.html --- zim-tools-2.0.0/test/data/with-symlink/symlink.html 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/data/with-symlink/symlink.html 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,10 @@ + + + + + Another HTML file + + +

This content should be accessible by symlink

+ + Binary files /tmp/tmp9HHlz0/RMav1Dz_3P/zim-tools-2.0.0/test/data/zimfiles/wikibooks_be_all_nopic_2017-02.zim and /tmp/tmp9HHlz0/yrleVeAeRQ/zim-tools-2.1.0/test/data/zimfiles/wikibooks_be_all_nopic_2017-02.zim differ diff -Nru zim-tools-2.0.0/test/meson.build zim-tools-2.1.0/test/meson.build --- zim-tools-2.0.0/test/meson.build 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/meson.build 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,32 @@ +gtest_dep = dependency('gtest', main:true, fallback:['gtest', 'gtest_main_dep'], required:false) + +tests = [ + 'tools-test', + 'zimwriterfs-article', + 'zimwriterfs-zimcreatorfs', + 'zimcheck-test' +] + +zimwriter_srcs = [ '../src/zimwriterfs/article.cpp', + '../src/zimwriterfs/tools.cpp', + '../src/zimwriterfs/zimcreatorfs.cpp', + '../src/zimwriterfs/mimetypecounter.cpp', + '../src/tools.cpp'] + +tests_src_map = { 'zimcheck-test' : ['../src/zimcheck/checks.cpp', '../src/tools.cpp'], + 'zimwriterfs-article' : zimwriter_srcs, + 'tools-test' : zimwriter_srcs, + 'zimwriterfs-zimcreatorfs' : zimwriter_srcs } + +if gtest_dep.found() and not meson.is_cross_build() + + foreach test_name : tests + + test_exe = executable(test_name, [test_name+'.cpp'] + tests_src_map[test_name], + dependencies : [gtest_dep, libzim_dep, gumbo_dep, magic_dep, zlib_dep], + build_rpath : '$ORIGIN') + + test(test_name, test_exe, timeout : 60, + workdir: meson.current_source_dir()) + endforeach +endif diff -Nru zim-tools-2.0.0/test/tools-test.cpp zim-tools-2.1.0/test/tools-test.cpp --- zim-tools-2.0.0/test/tools-test.cpp 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/tools-test.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,232 @@ +#include "gtest/gtest.h" + +#include "../src/tools.h" +#include +#include + +magic_t magic; +bool inflateHtmlFlag = false; +bool isVerbose() { return false; } + +TEST(CommonTools, getFileSize) +{ + std::string fn = "data/minimal-content/favicon.png"; + + ASSERT_TRUE(fileExists(fn)); + 
EXPECT_EQ(getFileSize(fn), 2725u); +} + +TEST(CommonTools, isDirectory) +{ + ASSERT_FALSE(isDirectory("data/minimal-content/favicon.png")); + ASSERT_TRUE(isDirectory("data/minimal-content")); +} + +TEST(CommonTools, base64_encode) +{ + unsigned char data[] = { 0xff, 0x00, 0x7a }; + std::string txt = base64_encode(data, sizeof(data)); + EXPECT_EQ(txt, "/wB6"); +} + +TEST(CommonTools, decodeUrl) +{ + std::string src = "%00"; + std::string res = decodeUrl(src); + EXPECT_EQ(res.size(), 1u); + EXPECT_EQ(res[0], '\0'); + + src = "%ff"; + res = decodeUrl(src); + EXPECT_EQ(res.size(), 1u); + EXPECT_EQ(res[0], '\xff'); + + std::unordered_map expectationsMap = { + // test normal use + { "https://www.example.com/cgi-bin/search.cgi?q=example%20search", + "https://www.example.com/cgi-bin/search.cgi?q=example search" }, + { "%2a", "*" }, + // test corner cases + { "%", "%" }, + { "%2", "%2" }, + { "%%","%%" }, + { "%%%", "%%%" }, + { "%at", "%at" }, + { "%%ft", "%%ft" }, + { "%%53", "%S"}, + { "%%5t", "%%5t"} + }; + + for (auto p : expectationsMap) { + std::string res = decodeUrl(p.first); + EXPECT_EQ(res, p.second); + } +} + +TEST(CommonTools, computeAbsolutePath) +{ + std::string str; + + str = computeAbsolutePath("", ""); + EXPECT_EQ(str, ""); + + str = computeAbsolutePath("/home/alex/oz/zim-tools/test/data/", "minimal-content/hello.html"); + EXPECT_EQ(str, "/home/alex/oz/zim-tools/test/data/minimal-content/hello.html"); + + str = computeAbsolutePath("../test/data", "minimal-content/hello.html"); + EXPECT_EQ(str, "../test/minimal-content/hello.html"); + + // without trailing / 'data' component will be stripped from path: + str = computeAbsolutePath("/home/alex/oz/zim-tools/test/data", "minimal-content/hello.html"); + EXPECT_EQ(str, "/home/alex/oz/zim-tools/test/minimal-content/hello.html"); +} + +TEST(CommonTools, computeRelativePath) +{ + std::string str; + + str = computeRelativePath("", ""); + EXPECT_EQ(str, ""); + + str = computeRelativePath("a", "a"); + EXPECT_EQ(str, 
""); + + str = computeRelativePath("aa/b", "aa/c"); + EXPECT_EQ(str, "c"); + + str = computeRelativePath("b", "ab"); + EXPECT_EQ(str, "ab"); + + str = computeRelativePath("a", "a/b"); + EXPECT_EQ(str, "b"); + + str = computeRelativePath("a/b", "a/b"); + EXPECT_EQ(str, ""); + + str = computeRelativePath("old/article1.html", "new/article1.html"); + EXPECT_EQ(str, "../new/article1.html"); +} + +TEST(CommonTools, replaceStringInPlaceOnce) +{ + std::string str; + + str = ""; + replaceStringInPlaceOnce(str, "", ""); + EXPECT_EQ(str, ""); + + str = "abcd"; + replaceStringInPlace(str, "a", ""); + EXPECT_EQ(str, "bcd"); + + str = "abcd"; + replaceStringInPlaceOnce(str, "a", "b"); + EXPECT_EQ(str, "bbcd"); + + str = "aabcd"; + replaceStringInPlaceOnce(str, "a", "b"); + EXPECT_EQ(str, "babcd"); +} + +TEST(CommonTools, replaceStringInPlace) +{ + std::string str; + + str = ""; + replaceStringInPlace(str, "", ""); + EXPECT_EQ(str, ""); + + str = "abcd"; + replaceStringInPlace(str, "a", "b"); + EXPECT_EQ(str, "bbcd"); + + str = "abcd"; + replaceStringInPlace(str, "a", ""); + EXPECT_EQ(str, "bcd"); + + str = "aabcd"; + replaceStringInPlace(str, "a", "b"); + EXPECT_EQ(str, "bbbcd"); +} + +TEST(CommonTools, stripTitleInvalidChars) +{ + std::string str; + + str = "\u202Aheader\u202A"; + stripTitleInvalidChars(str); + EXPECT_EQ(str, "header"); +} + +TEST(CommonTools, getNamespaceForMimeType) +{ + // with uniq namespace + EXPECT_EQ(getNamespaceForMimeType("text/html", true), "A"); + EXPECT_EQ(getNamespaceForMimeType("text/xml", true), "A"); + EXPECT_EQ(getNamespaceForMimeType("image/png", true), "A"); + EXPECT_EQ(getNamespaceForMimeType("application/json", true), "A"); + + // without uniq namespace + EXPECT_EQ(getNamespaceForMimeType("text/html", false), "A"); + EXPECT_EQ(getNamespaceForMimeType("text/xml", false), "-"); + EXPECT_EQ(getNamespaceForMimeType("image/png", false), "I"); + EXPECT_EQ(getNamespaceForMimeType("application/json", false), "-"); +} + +TEST(tools, isOutofBounds) 
+{ + ASSERT_FALSE(isOutofBounds("", "")); + ASSERT_TRUE(isOutofBounds("../../..", "")); + ASSERT_TRUE(isOutofBounds("../", "")); + ASSERT_FALSE(isOutofBounds("../", "/a/b")); + ASSERT_FALSE(isOutofBounds("../", "/a")); + ASSERT_TRUE(isOutofBounds("../../", "/a")); + ASSERT_TRUE(isOutofBounds("../../../-/s/css_modules/ext.cite.ux-enhancements.css", "A/Blood_/")); +} + +TEST(tools, normalize_link) +{ + ASSERT_EQ(normalize_link("/a", "/b"), "a"); + + // not absolute + ASSERT_EQ(normalize_link("a", "/b"), "/b/a"); + ASSERT_EQ(normalize_link("../a", "/b/c"), "/b/a"); + ASSERT_EQ(normalize_link(".././a", "/b/c"), "/b/a"); + ASSERT_EQ(normalize_link("../a/b/aa#localanchor", "/b/c"), "/b/a/b/aa"); + ASSERT_EQ(normalize_link("../a/b/aa?localanchor", "/b/c"), "/b/a/b/aa"); +} + +TEST(tools, addler32) +{ + ASSERT_EQ(adler32("sdfkhewruhwe8"), 640746832); + ASSERT_EQ(adler32("sdifjsdf"), 251593550); + ASSERT_EQ(adler32("q"), 7471218); + ASSERT_EQ(adler32(""), 1); +} + +TEST(tools, getLinks) +{ + auto v = generic_getLinks(""); + + ASSERT_TRUE(v.empty()); + + std::string page1 = ""; + auto v1 = generic_getLinks(page1); + + ASSERT_TRUE(v1.size() == 1); + ASSERT_EQ(v1[0].attribute, "href"); + ASSERT_EQ(v1[0].link, "https://fonts.goos.com/css?family=OpenSans"); + + std::string page2 = ""; + auto v2 = generic_getLinks(page2); + + ASSERT_TRUE(v2.size() == 1); + ASSERT_EQ(v1[0].attribute, "href"); + + std::string page3 = ""; + auto v3 = generic_getLinks(page3); + + ASSERT_TRUE(v3.size() == 1); + ASSERT_EQ(v3[0].attribute, "src"); + ASSERT_EQ(v3[0].link, "https://fonts.goos.com/css?family=OpenSans"); +} diff -Nru zim-tools-2.0.0/test/zimcheck-test.cpp zim-tools-2.1.0/test/zimcheck-test.cpp --- zim-tools-2.0.0/test/zimcheck-test.cpp 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/zimcheck-test.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,68 @@ +#include "gtest/gtest.h" + +#include "zim/zim.h" +#include "zim/file.h" +#include "../src/zimcheck/checks.h" + + 
+TEST(zimfilechecks, test_checksum) +{ + std::string fn = "data/zimfiles/wikibooks_be_all_nopic_2017-02.zim"; + + zim::File file(fn); + ErrorLogger logger; + + test_checksum(file, logger); + + ASSERT_TRUE(logger.overalStatus()); +} + +TEST(zimfilechecks, test_metadata) +{ + std::string fn = "data/zimfiles/wikibooks_be_all_nopic_2017-02.zim"; + + zim::File file(fn); + ErrorLogger logger; + + test_metadata(file, logger); + + ASSERT_TRUE(logger.overalStatus()); +} + +TEST(zimfilechecks, test_favicon) +{ + std::string fn = "data/zimfiles/wikibooks_be_all_nopic_2017-02.zim"; + + zim::File file(fn); + ErrorLogger logger; + + test_favicon(file, logger); + + ASSERT_TRUE(logger.overalStatus()); +} + +TEST(zimfilechecks, test_mainpage) +{ + std::string fn = "data/zimfiles/wikibooks_be_all_nopic_2017-02.zim"; + + zim::File file(fn); + ErrorLogger logger; + + test_mainpage(file, logger); + + ASSERT_TRUE(logger.overalStatus()); +} + +TEST(zimfilechecks, test_articles) +{ + std::string fn = "data/zimfiles/wikibooks_be_all_nopic_2017-02.zim"; + + zim::File file(fn); + ErrorLogger logger; + ProgressBar progress(1); + + + test_articles(file, logger, progress, true, true, true ,true); + + ASSERT_TRUE(logger.overalStatus()); +} diff -Nru zim-tools-2.0.0/test/zimwriterfs-article.cpp zim-tools-2.1.0/test/zimwriterfs-article.cpp --- zim-tools-2.0.0/test/zimwriterfs-article.cpp 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/zimwriterfs-article.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,177 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include + +#include "../src/zimwriterfs/article.h" +#include "../src/zimwriterfs/zimcreatorfs.h" +#include "gtest/gtest.h" +#include "../src/tools.h" + +// stub from zimwriterfs.cpp +bool isVerbose() { return false; } +bool inflateHtmlFlag = false; +magic_t magic; + + +TEST(ArticleTest, SimpleMetadata) +{ + std::string t = "Example content"; + SimpleMetadataArticle article("Title", t); + + // test zim::writer::Article interface + EXPECT_EQ(article.getUrl(), zim::writer::Url('M', "Title")); + EXPECT_EQ(article.getTitle(), ""); + ASSERT_FALSE(article.isRedirect()); + ASSERT_FALSE(article.isLinktarget()); + ASSERT_FALSE(article.isDeleted()); + EXPECT_EQ(article.getMimeType(), "text/plain"); + ASSERT_TRUE(article.shouldCompress()); + ASSERT_FALSE(article.shouldIndex()); + EXPECT_EQ(article.getRedirectUrl(), zim::writer::Url()); + EXPECT_EQ(article.getSize(), t.size()); + ASSERT_EQ(article.getData(), zim::Blob(t.data(), t.size())); + EXPECT_EQ(article.getFilename(), ""); +} + + +TEST(ArticleTest, MetadataFaviconArticle) +{ + std::string fn = "favicon.png"; + + MetadataFaviconArticle article("I/" + fn); + + // test zim::writer::Article interface + EXPECT_EQ(article.getUrl(), zim::writer::Url('-', "favicon")); + EXPECT_EQ(article.getTitle(), ""); + ASSERT_TRUE(article.isRedirect()); + EXPECT_FALSE(article.isLinktarget()); + EXPECT_FALSE(article.isDeleted()); + ASSERT_EQ(article.getMimeType(), "image/png"); + EXPECT_FALSE(article.shouldCompress()); + 
EXPECT_FALSE(article.shouldIndex()); + EXPECT_EQ(article.getRedirectUrl(), zim::writer::Url('I', fn)); + ASSERT_EQ(article.getSize(), 0u); + ASSERT_EQ(article.getData(), zim::Blob()); + EXPECT_EQ(article.getFilename(), ""); +} + +TEST(ArticleTest, MetadataDate) +{ + MetadataDateArticle article; + + // test zim::writer::Article interface + EXPECT_EQ(article.getUrl(), zim::writer::Url('M', "Date")); + EXPECT_EQ(article.getTitle(), ""); + ASSERT_FALSE(article.isRedirect()); + ASSERT_FALSE(article.isLinktarget()); + ASSERT_FALSE(article.isDeleted()); + EXPECT_EQ(article.getMimeType(), "text/plain"); + ASSERT_TRUE(article.shouldCompress()); + ASSERT_FALSE(article.shouldIndex()); + EXPECT_EQ(article.getRedirectUrl(), zim::writer::Url()); + EXPECT_TRUE(article.getSize() > 8 && article.getSize() < 15); // date string is about 10 chars + EXPECT_EQ(article.getFilename(), ""); +} + +TEST(ArticleTest, FileArticlePng) +{ + std::string directoryPath = "data/minimal-content"; + ZimCreatorFS creator(directoryPath, "mainPage", false, false); + + std::string fn = "favicon.png"; + unsigned int size = getFileSize(directoryPath + "/" + fn); + std::string data = getFileContent(directoryPath + "/" + fn); + + FileArticle article(&creator, directoryPath + "/" + fn, false); + + // test zim::writer::Article interface + EXPECT_EQ(article.getUrl(), zim::writer::Url('I', fn)); + EXPECT_EQ(article.getTitle(), ""); + ASSERT_FALSE(article.isRedirect()); + EXPECT_FALSE(article.isLinktarget()); + EXPECT_FALSE(article.isDeleted()); + ASSERT_EQ(article.getMimeType(), "image/png"); + EXPECT_FALSE(article.shouldCompress()); + EXPECT_FALSE(article.shouldIndex()); + EXPECT_EQ(article.getRedirectUrl(), zim::writer::Url()); + ASSERT_EQ(article.getSize(), size); + + // see FileArticle::getFilename() + // after file content is read, getFilename() no more returns the filename + EXPECT_EQ(article.getFilename(), directoryPath + "/" + fn); + ASSERT_EQ(article.getData(), zim::Blob(data.data(), data.size())); + 
EXPECT_EQ(article.getFilename(), ""); + // file size still should be returned: + EXPECT_EQ(article.getSize(), size); +} + +TEST(ArticleTest, FileArticleHTML) +{ + std::string directoryPath = "data/minimal-content"; + ZimCreatorFS creator(directoryPath, "mainPage", false, false); + + std::string fn = "hello.html"; + unsigned int size = getFileSize(directoryPath + "/" + fn); + std::string data = getFileContent(directoryPath + "/" + fn); + + FileArticle article(&creator, directoryPath + "/" + fn, false); + + // see FileArticle::getFilename() and the constructor + // because HTML content are read right away, getFilename() always returns empty string + EXPECT_EQ(article.getFilename(), ""); + + // test zim::writer::Article interface + EXPECT_EQ(article.getUrl(), zim::writer::Url('A', fn)); + EXPECT_EQ(article.getTitle(), "HTML title tag content"); + ASSERT_FALSE(article.isRedirect()); + EXPECT_FALSE(article.isLinktarget()); + EXPECT_FALSE(article.isDeleted()); + ASSERT_EQ(article.getMimeType(), "text/html"); + EXPECT_TRUE(article.shouldCompress()); + EXPECT_TRUE(article.shouldIndex()); + EXPECT_EQ(article.getRedirectUrl(), zim::writer::Url()); + ASSERT_EQ(article.getSize(), size); + ASSERT_EQ(article.getData(), zim::Blob(data.data(), data.size())); + EXPECT_EQ(article.getFilename(), ""); +} + +TEST(ArticleTest, RedirectArticle) +{ + std::string directoryPath = "data/minimal-content"; + ZimCreatorFS creator(directoryPath, "mainPage", false, false); + + RedirectArticle article(&creator, 'A', "index.html", "Start page", zim::writer::Url("A/home.html")); + + // test zim::writer::Article interface + EXPECT_EQ(article.getUrl(), zim::writer::Url('A', "index.html")); + EXPECT_EQ(article.getTitle(), "Start page"); + ASSERT_TRUE(article.isRedirect()); + EXPECT_FALSE(article.isLinktarget()); + EXPECT_FALSE(article.isDeleted()); + ASSERT_EQ(article.getMimeType(), "text/html"); + + // FIXME: maybe this is shouldn't be that way: + EXPECT_TRUE(article.shouldCompress()); + 
EXPECT_TRUE(article.shouldIndex()); + + EXPECT_EQ(article.getRedirectUrl(), zim::writer::Url('A', "home.html")); + ASSERT_EQ(article.getSize(), 0u); + ASSERT_EQ(article.getData(), zim::Blob()); + EXPECT_EQ(article.getFilename(), ""); +} diff -Nru zim-tools-2.0.0/test/zimwriterfs-zimcreatorfs.cpp zim-tools-2.1.0/test/zimwriterfs-zimcreatorfs.cpp --- zim-tools-2.0.0/test/zimwriterfs-zimcreatorfs.cpp 1970-01-01 00:00:00.000000000 +0000 +++ zim-tools-2.1.0/test/zimwriterfs-zimcreatorfs.cpp 2020-11-17 16:00:36.000000000 +0000 @@ -0,0 +1,133 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include + +#include +#include + +#include "gtest/gtest.h" + +#include "../src/zimwriterfs/zimcreatorfs.h" +#include "../src/zimwriterfs/article.h" +#include "../src/tools.h" + + +// stub from zimwriterfs.cpp +bool inflateHtmlFlag = false; +bool isVerbose() { return false; } +magic_t magic; + +class LibMagicInit +{ +public: + LibMagicInit() + { + if (! 
done) { + magic = magic_open(MAGIC_MIME); + magic_load(magic, NULL); + done = true; + } + } +private: + static bool done; +}; + +bool LibMagicInit::done = false; + + +class TempFile +{ +public: + TempFile(const char *name) { _name = "/tmp/"; _name += name; } + ~TempFile() { unlink(_name.c_str()); } + const char *path() { return _name.c_str(); } +private: + std::string _name; +}; + + +TEST(ZimCreatorFSTest, MinimalZim) +{ + LibMagicInit libmagic; + + std::string directoryPath = "data/minimal-content"; + ZimCreatorFS zimCreator(directoryPath, "index.html", false, false); + + TempFile out("minimal.zim"); + + zimCreator.startZimCreation(out.path()); + zimCreator.visitDirectory(directoryPath); + + std::shared_ptr redirect_article(new RedirectArticle(&zimCreator, 'A', "index.html", "Start page", zim::writer::Url("A/hello.html"))); + zimCreator.addArticle(redirect_article); + + zimCreator.finishZimCreation(); + + // verify the created .zim file with 'zimdump' + zim::File zimfile(out.path()); + EXPECT_EQ(zimfile.getCountArticles(), 4u); + + zim::Article a1 = zimfile.getArticle('A', "index.html"); + EXPECT_TRUE(a1.isRedirect()); + + zim::Article a2 = a1.getRedirectArticle(); + EXPECT_EQ(a2.getTitle(), "HTML title tag content"); +} + +TEST(ZimCreatorFSTest, SymlinkShouldCreateRedirectArticleEntry) +{ + LibMagicInit libmagic; + + std::string directoryPath = "data/with-symlink"; + ZimCreatorFS zimCreator(directoryPath, "hello.html", false, false); + + TempFile out("with-symlink.zim"); + + zimCreator.startZimCreation(out.path()); + zimCreator.visitDirectory(directoryPath); + zimCreator.finishZimCreation(); + + + // VERIFY the created .zim file with 'zimdump' + zim::File zimfile(out.path()); + EXPECT_EQ(zimfile.getCountArticles(), 4u); + + zim::Article a1 = zimfile.getArticle('A', "symlink.html"); + EXPECT_TRUE(a1.isRedirect()); + + zim::Article a2 = a1.getRedirectArticle(); + EXPECT_EQ(a2.getTitle(), "Another HTML file"); + + zim::Article a3 = zimfile.getArticle('A', 
"symlink-outside.html"); + EXPECT_FALSE(a3.good()); + + zim::Article a4 = zimfile.getArticle('A', "symlink-not-existing.html"); + EXPECT_FALSE(a4.good()); + + zim::Article a5 = zimfile.getArticle('A', "symlink-self.html"); + EXPECT_FALSE(a5.good()); +} + +TEST(ZimCreatorFSTest, ThrowsErrorIfDirectoryNotExist) +{ + EXPECT_THROW({ + ZimCreatorFS zimCreator("Non-existing-dir", "index.html", false, false); + }, std::invalid_argument ); +}