diff -Nru libencode-zapcp1252-perl-0.33/Build.PL libencode-zapcp1252-perl-0.40/Build.PL --- libencode-zapcp1252-perl-0.33/Build.PL 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/Build.PL 2020-02-04 15:30:19.000000000 +0000 @@ -16,9 +16,9 @@ }, meta_merge => { resources => { - homepage => 'http://search.cpan.org/dist/Encode-CP1252/', - bugtracker => 'http://github.com/theory/encode-cp1252/issues/', - repository => 'http://github.com/theory/encode-cp1252/tree', + homepage => 'https://search.cpan.org/dist/Encode-ZapCP1252/', + bugtracker => 'https://github.com/theory/encode-zapcp1252/issues/', + repository => 'https://github.com/theory/encode-zapcp1252', } }, ); diff -Nru libencode-zapcp1252-perl-0.33/Changes libencode-zapcp1252-perl-0.40/Changes --- libencode-zapcp1252-perl-0.33/Changes 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/Changes 2020-02-04 15:30:19.000000000 +0000 @@ -1,5 +1,13 @@ Revision history for Perl extension Encode::ZapCP1252 +0.40 2020-02-04T15:30:50Z + - Added link to Encoding::FixLatin. Suggested by Jonas Smedegaard. + - Removed Pod tests from the distribution. + - Fixed an issue discovered on Perl 5.31 that incorrectly resulted in + the creation of malformed UTF-8 when fixing unicode strings. Thanks to + Karl Williamson for the continuing improvement of Unicode support in + Perl and for the fix to this module. + 0.33 2011-11-23T05:19:36Z - Require Test::Pod 1.41 to support `L` syntax in the Pod. - Support for decoded strings is supported only in 5.8.8 and higher due @@ -16,25 +24,25 @@ 0.30 2010-06-12T18:05:38 - The conversion functions now ignore `undef` arguments and just return without doing anything. - - Strings are no longer modifed in-place unless the conversion + - Strings are no longer modified in-place unless the conversion subroutines are called in a void context. 
- - The conversion functions may optionally be called with no arugment + - The conversion functions may optionally be called with no argument when run in Perl 5.10 or higher, in which case they will instead act on `$_`. 0.20 2010-06-12T00:39:35 - - Added `local` to examples of changing the maping tables. - - When the Encode module is insatlled, zapping and fixing CP1252 + - Added `local` to examples of changing the mapping tables. + - When the Encode module is installed, zapping and fixing CP1252 gremlins now works in decoded strings, too. - For convenience, the functions now return the strings they've modified. - Shipping with a traditional `Makefile.PL` rather than one that passes through to Module::Build. - - Moved repository to [GitHub](http://github.com/theory/encode-zapcp1252). + - Moved repository to [GitHub](https://github.com/theory/encode-zapcp1252). 0.12 2008-06-23T17:48:04 - Fixed pasto in the "Support" section of the docs. - - Fixed a typo in the "Synopsis" section of the docs, thaks to David + - Fixed a typo in the "Synopsis" section of the docs, thanks to David Beaudet. - Fixed the 5.6.2 requirement to be properly detected in Perl 5.5. Thanks to Slaven Rezic for the report. diff -Nru libencode-zapcp1252-perl-0.33/debian/changelog libencode-zapcp1252-perl-0.40/debian/changelog --- libencode-zapcp1252-perl-0.33/debian/changelog 2014-12-08 14:39:09.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/debian/changelog 2020-05-18 17:16:10.000000000 +0000 @@ -1,3 +1,25 @@ +libencode-zapcp1252-perl (0.40-1) unstable; urgency=medium + + * Team upload. + + [ Salvatore Bonaccorso ] + * Update Vcs-* headers for switch to salsa.debian.org + + [ gregor herrmann ] + * debian/*: update URLs from {search,www}.cpan.org to MetaCPAN. + * debian/*: update GitHub URLs to use HTTPS. + + [ Debian Janitor ] + * Use secure copyright file specification URI. + * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository, + Repository-Browse. 
+ + [ Niko Tyni ] + * Import new upstream release 0.40. + * Remove (now unused) libtest-pod-perl from build dependencies. + + -- Niko Tyni Mon, 18 May 2020 20:16:10 +0300 + libencode-zapcp1252-perl (0.33-3) unstable; urgency=medium * Fix stray "2" in description for license shortstring GPL-1+. diff -Nru libencode-zapcp1252-perl-0.33/debian/control libencode-zapcp1252-perl-0.40/debian/control --- libencode-zapcp1252-perl-0.33/debian/control 2014-12-08 14:30:07.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/debian/control 2020-05-18 17:16:05.000000000 +0000 @@ -7,13 +7,12 @@ libmodule-build-perl, debhelper, dh-buildinfo, - libencode-perl, - libtest-pod-perl + libencode-perl Maintainer: Debian Perl Group Uploaders: Jonas Smedegaard Standards-Version: 3.9.6 -Vcs-Git: git://anonscm.debian.org/pkg-perl/packages/libencode-zapcp1252-perl -Vcs-Browser: https://anonscm.debian.org/cgit/pkg-perl/packages/libencode-zapcp1252-perl.git +Vcs-Git: https://salsa.debian.org/perl-team/modules/packages/libencode-zapcp1252-perl.git +Vcs-Browser: https://salsa.debian.org/perl-team/modules/packages/libencode-zapcp1252-perl Homepage: https://metacpan.org/release/Encode-ZapCP1252 Package: libencode-zapcp1252-perl diff -Nru libencode-zapcp1252-perl-0.33/debian/copyright libencode-zapcp1252-perl-0.40/debian/copyright --- libencode-zapcp1252-perl-0.33/debian/copyright 2014-12-08 14:36:56.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/debian/copyright 2020-05-18 17:05:07.000000000 +0000 @@ -1,8 +1,8 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: Encode::ZapCP1252 Upstream-Contact: http://github.com/theory/encode-zapcp1252/issues/ -Source: http://www.cpan.org/authors/id/D/DW/DWHEELER - git://github.com/theory/encode-zapcp1252 +Source: https://cpan.metacpan.org/authors/id/D/DW/DWHEELER + https://github.com/theory/encode-zapcp1252 Files: * Copyright: 2005-2010, David E. 
Wheeler diff -Nru libencode-zapcp1252-perl-0.33/debian/upstream/metadata libencode-zapcp1252-perl-0.40/debian/upstream/metadata --- libencode-zapcp1252-perl-0.33/debian/upstream/metadata 1970-01-01 00:00:00.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/debian/upstream/metadata 2020-05-18 17:05:07.000000000 +0000 @@ -0,0 +1,4 @@ +Bug-Database: http://github.com/theory/encode-cp1252/issues/ +Bug-Submit: https://github.com/theory/encode-cp1252/issues//new +Repository: https://github.com/theory/encode-cp1252.git +Repository-Browse: https://github.com/theory/encode-cp1252 diff -Nru libencode-zapcp1252-perl-0.33/debian/watch libencode-zapcp1252-perl-0.40/debian/watch --- libencode-zapcp1252-perl-0.33/debian/watch 2014-12-08 11:22:55.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/debian/watch 2020-05-18 17:05:07.000000000 +0000 @@ -1,4 +1,4 @@ # Run the "uscan" command to check for upstream updates and more. version=3 -http://www.cpan.org/authors/id/D/DW/DWHEELER/Encode-ZapCP1252-([\d.]+)\.tar\.gz +https://cpan.metacpan.org/authors/id/D/DW/DWHEELER/Encode-ZapCP1252-([\d.]+)\.tar\.gz https://metacpan.org/release/Encode-ZapCP1252 .*/Encode-ZapCP1252-([\d.]+)\.tar\.gz diff -Nru libencode-zapcp1252-perl-0.33/lib/Encode/ZapCP1252.pm libencode-zapcp1252-perl-0.40/lib/Encode/ZapCP1252.pm --- libencode-zapcp1252-perl-0.33/lib/Encode/ZapCP1252.pm 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/lib/Encode/ZapCP1252.pm 2020-02-04 15:30:19.000000000 +0000 @@ -5,14 +5,14 @@ use vars qw($VERSION @ISA @EXPORT); use 5.006_002; -$VERSION = '0.33'; +$VERSION = '0.40'; @ISA = qw(Exporter); @EXPORT = qw(zap_cp1252 fix_cp1252); use constant PERL588 => $] >= 5.008_008; -require Encode if PERL588; +use Encode (); our %ascii_for = ( - # http://en.wikipedia.org/wiki/Windows-1252 + # https://en.wikipedia.org/wiki/Windows-1252 "\x80" => 'e', # EURO SIGN "\x82" => ',', # SINGLE LOW-9 QUOTATION MARK "\x83" => 'f', # LATIN SMALL LETTER F WITH HOOK @@ -43,7 +43,7 @@ ); our 
%utf8_for = ( - # http://en.wikipedia.org/wiki/Windows-1252 + # https://en.wikipedia.org/wiki/Windows-1252 "\x80" => '€', # EURO SIGN "\x82" => ',', # SINGLE LOW-9 QUOTATION MARK "\x83" => 'ƒ', # LATIN SMALL LETTER F WITH HOOK @@ -73,42 +73,137 @@ "\x9f" => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS ); +my @utf8_skip = ( +# This translates a utf-8-encoded byte into how many bytes the full utf8 +# character occupies. Illegal start bytes have a negative count. + +# UTF-8 is a variable-length encoding. The 128 ASCII characters were very +# deliberately set to be themselves, so UTF-8 would be backwards compatible +# with 7-bit applications. Every other character has 2 - 13 bytes comprising +# it. +# +# If the first bit of the first byte in a character is 0, it is one of those +# 128 ASCII characters with length 1. + +# Otherwise, the first bit is 1, and if the second bit is also one, this byte +# starts the sequence of bytes that represent the character. The bytes C0-FF +# have the characteristic that the first two bits are both one. The number of +# bytes that form a character corresponds to the number of consecutive leading +# bits that are all one in the start byte. In the case of FE, the first 7 +# bits are one, so the number of bytes in the character it represents is 7. +# FF is a special case, and Perl has arbitrarily set it to 13 instead of the +# expected 8. +# +# The remaining bytes begin with '10', from 80..BF. They are called +# continuation bytes, and a UTF-8 character is comprised of a start byte +# indicating 'n' bytes total in it, then 'n-1' of these continuation bytes. +# What the character is that each sequence represents is derived by shifting +# and adding the other bits in the bytes. (C0 and C1 aren't actually legal +# start bytes for security reasons that need not concern us here, hence are +# marked as negative in the table below.)
+ + # 0 1 2 3 4 5 6 7 8 9 A B C D E F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 1 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 7 + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 8 + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 9 + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # A + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # B + -1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13, # F +); + BEGIN { my $proto = $] >= 5.010000 ? '_' : '$'; eval "sub zap_cp1252($proto) { unshift \@_, \\%ascii_for; &_tweakit; }"; eval "sub fix_cp1252($proto) { unshift \@_, \\%utf8_for; &_tweakit; }"; } +# These are the bytes that CP1252 redefines +my $cp1252_re = qr/[\x80\x82-\x8c\x8e\x91-\x9c\x9e\x9f]/; + sub _tweakit { my $table = shift; return unless defined $_[0]; local $_[0] = $_[0] if defined wantarray; - if (PERL588 && Encode::is_utf8($_[0])) { - _tweak_decoded($table, $_[0]); - } else { - $_[0] =~ s{([\x80-\x9f])}{$table->{$1} || $1}emxsg; - } - return $_[0] if defined wantarray; -} + my $is_utf8 = PERL588 && Encode::is_utf8($_[0]); + my $valid_utf8 = $is_utf8 && utf8::valid($_[0]); + if (!$is_utf8) { + + # Here is non-UTF-8. Change the 1252 characters to their UTF-8 + # counterparts. These bytes are very rarely used in real world + # applications, so their presence likely indicates that CP1252 was + # meant. 
+ $_[0] =~ s/($cp1252_re)/$table->{$1}/gems; + } elsif ($valid_utf8) { + + # Here is well-formed Perl extended UTF-8 and has the UTF-8 flag on + # and the string is held as bytes. Change the 1252 characters to their + # Unicode counterparts. + $_[0] =~ s/($cp1252_re)/Encode::decode_utf8($table->{$1})/gems; + } else { # Invalid UTF-8. Look for single-byte CP1252 gremlins -sub _tweak_decoded { - my $table = shift; - local $@; - # First, try to replace in the decoded string. - eval { - $_[0] =~ s{([\x80-\x9f])}{ - $table->{$1} ? Encode::decode('UTF-8', $table->{$1}) : $1 - }emxsg - }; - if (my $err = $@) { - # If we got a "Malformed UTF-8 character" error, then someone - # likely turned on the utf8 flag without decoding. So turn it off. - # and try again. - die if $err !~ /Malformed/; + # Turn off the UTF-8 flag so that we can go through the string + # byte-by-byte. Encode::_utf8_off($_[0]); - $_[0] =~ s/([\x80-\x9f])/$table->{$1} || $1/emxsg; + + my $i = 0; + my $length = length $_[0]; + my $fixed = ""; # The input after being fixed up by this loop + while ($i < $length) { + + # Each time through the loop, we should here be ready to look at a + # new character, and its 0th byte is called a 'start byte' + my $start_byte = substr($_[0], $i, 1); + my $skip = $utf8_skip[ord $start_byte]; + + # The table is set up so that legal UTF-8 start bytes have a + # positive byte length. Simply add all the bytes in the character + # to the output, and go on to handle the next character in the + # next loop iteration. + if ($skip > 0) { + $fixed .= substr($_[0], $i, $skip); + $i += $skip; + next; + } + + # Here we have a byte that isn't a start byte in a position that + # ought to be a start byte. The whole point of this loop is + # to find such bytes that are CP1252 ones and which were + # incorrectly inserted by the upstream process into an otherwise + # valid UTF-8 string. So, if we have such a one, change it into + # its corresponding correct character.
+ if ($start_byte =~ s/($cp1252_re)/$table->{$1}/ems) { + + # The correct character may be UTF-8 bytes. We treat them as + # just a sequence of non-UTF-8 bytes, because that's what + # $fixed has in it so far. After everything is consistently + # added, we turn the UTF-8 flag back on before returning at + # the end. + Encode::_utf8_off($start_byte); + $fixed .= $start_byte; + $i++; + next; + } + + # Here the byte isn't a CP1252 one. + die "Unexpected continuation byte: %02x", ord $start_byte; + } + + # $fixed now has everything properly in it, but set to return it in + # $_[0], marked as UTF-8. + $_[0] = $fixed; Encode::_utf8_on($_[0]); } + return $_[0] if defined wantarray; } 1; @@ -142,7 +237,7 @@ encoding is Latin-1, mostly things will come out right, but a few things--like curly quotes, m-dashes, ellipses, and the like--may not. The differences are well-known; you see a nice chart at documenting the differences on -L. +L. Of course, that won't really help you. What will help you is to quit using Latin-1 and switch to UTF-8. Then you can just convert from CP1252 to UTF-8 @@ -161,9 +256,8 @@ gremlins mixed in with properly encoded characters. I've seen examples of just this sort of thing when processing GMail messages and attempting to insert them into a UTF-8 database, as well as in some feeds processed by, say -L. Doesn't work so well. For such cases, -there's C, which converts those CP1252 gremlins into their UTF-8 -equivalents. +Yahoo! Pipes. Doesn't work so well. For such cases, there's C, +which converts those CP1252 gremlins into their UTF-8 equivalents. =head1 Usage @@ -188,6 +282,14 @@ In this case, even constant values can be processed. Either way, Cs will be ignored. +In Perl 5.10 and higher, the functions may optionally be called with no +arguments, in which case C<$_> will be converted, instead: + + zap_cp1252; # Modify $_ in-place. + fix_cp1252; # Modify $_ in-place. 
+ my $zapped = zap_cp1252; # Copy $_ and return zapped + my $fixed = fix_cp1252; # Copy $_ and return fixed + In Perl 5.8.8 and higher, the conversion will work even when the string is decoded to Perl's internal form (usually via C) or the string is encoded (and thus simply processed by Perl as a series of @@ -197,13 +299,9 @@ removing those CP1252 gremlins no matter what kind of processing has already been executed on the string. -In Perl 5.10 and higher, the functions may optionally be called with no -arguments, in which case C<$_> will be converted, instead: - - zap_cp1252; # Modify $_ in-place. - fix_cp1252; # Modify $_ in-place. - my $zapped = zap_cp1252; # Copy $_ and return zapped - my $fixed = zap_cp1252; # Copy $_ and return fixed +That said, although C<fix_cp1252> takes a conservative approach to replacing +text in Unicode strings, it should be used as a very last option. Really, +avoid that situation if you can. =head1 Conversion Table @@ -251,11 +349,11 @@ local $Encode::ZapCP1252::ascii_for{"\x80"} = 'E'; -Or if, for some bizarre reason, you wanted the UTF-8 equivalent for a bullet -converted by C to really be an asterisk (why would you? Just use -C for that!), you can do this: +Or if, for some reason, you wanted the UTF-8 equivalent for a bullet +converted by C<fix_cp1252> to be a black square, you can assign the +bytes (never a Unicode string) like so: - local $Encode::ZapCP1252::utf8_for{"\x95"} = '*'; + local $Encode::ZapCP1252::utf8_for{"\x95"} = Encode::encode_utf8('■'); Just remember, without C<local> this would be a global change. In that case, be careful if your code zaps CP1252 elsewhere. Of course, it shouldn't really @@ -269,18 +367,20 @@ =item L -=item L +=item L + +=item L =back =head1 Support This module is stored in an open L. Feel free to fork +repository|https://github.com/theory/encode-zapcp1252/>. Feel free to fork and contribute!
Please file bug reports via L or by sending mail to +Issues|https://github.com/theory/encode-zapcp1252/issues/> or by sending mail to L. =head1 Author @@ -290,11 +390,12 @@ =head1 Acknowledgments My thanks to Sean Burke for sending me his original method for converting -CP1252 gremlins to more-or-less appropriate ASCII characters. +CP1252 gremlins to more-or-less appropriate ASCII characters, and to Karl +Williamson for more correct handling of Unicode strings. =head1 Copyright and License -Copyright (c) 2005-2010 David E. Wheeler. Some Rights Reserved. +Copyright (c) 2005-2020 David E. Wheeler. Some Rights Reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff -Nru libencode-zapcp1252-perl-0.33/Makefile.PL libencode-zapcp1252-perl-0.40/Makefile.PL --- libencode-zapcp1252-perl-0.33/Makefile.PL 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/Makefile.PL 2020-02-04 15:30:19.000000000 +0000 @@ -1,16 +1,16 @@ -# Note: this file was auto-generated by Module::Build::Compat version 0.3800 +# Note: this file was auto-generated by Module::Build::Compat version 0.4229 require 5.006002; use ExtUtils::MakeMaker; WriteMakefile ( - 'NAME' => 'Encode::ZapCP1252', + 'PL_FILES' => {}, + 'INSTALLDIRS' => 'site', 'VERSION_FROM' => 'lib/Encode/ZapCP1252.pm', 'PREREQ_PM' => { 'Module::Build' => '0.36', 'Test::More' => '0.17' }, - 'INSTALLDIRS' => 'site', 'EXE_FILES' => [], - 'PL_FILES' => {} + 'NAME' => 'Encode::ZapCP1252' ) ; diff -Nru libencode-zapcp1252-perl-0.33/MANIFEST libencode-zapcp1252-perl-0.40/MANIFEST --- libencode-zapcp1252-perl-0.33/MANIFEST 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/MANIFEST 2020-02-04 15:30:19.000000000 +0000 @@ -1,12 +1,11 @@ Build.PL Changes lib/Encode/ZapCP1252.pm +Makefile.PL MANIFEST This list of files -README +META.json +META.yml +README.md t/base.t t/decoded.t t/perl-510.t -t/pod.t -Makefile.PL -META.yml -META.json diff -Nru 
libencode-zapcp1252-perl-0.33/META.json libencode-zapcp1252-perl-0.40/META.json --- libencode-zapcp1252-perl-0.33/META.json 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/META.json 2020-02-04 15:30:19.000000000 +0000 @@ -4,13 +4,13 @@ "David E. Wheeler " ], "dynamic_config" : 1, - "generated_by" : "Module::Build version 0.38, CPAN::Meta::Converter version 2.112150", + "generated_by" : "Module::Build version 0.4229", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", - "version" : "2" + "version" : 2 }, "name" : "Encode-ZapCP1252", "prereqs" : { @@ -27,7 +27,7 @@ }, "runtime" : { "recommends" : { - "Encode" : 0, + "Encode" : "0", "Test::Pod" : "1.41" }, "requires" : { @@ -38,21 +38,22 @@ "provides" : { "Encode::ZapCP1252" : { "file" : "lib/Encode/ZapCP1252.pm", - "version" : "0.33" + "version" : "0.40" } }, "release_status" : "stable", "resources" : { "bugtracker" : { - "web" : "http://github.com/theory/encode-cp1252/issues/" + "web" : "https://github.com/theory/encode-zapcp1252/issues/" }, - "homepage" : "http://search.cpan.org/dist/Encode-CP1252/", + "homepage" : "https://search.cpan.org/dist/Encode-ZapCP1252/", "license" : [ "http://dev.perl.org/licenses/" ], "repository" : { - "url" : "http://github.com/theory/encode-cp1252/tree" + "url" : "https://github.com/theory/encode-zapcp1252" } }, - "version" : "0.33" + "version" : "0.40", + "x_serialization_backend" : "JSON::PP version 4.02" } diff -Nru libencode-zapcp1252-perl-0.33/META.yml libencode-zapcp1252-perl-0.40/META.yml --- libencode-zapcp1252-perl-0.33/META.yml 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/META.yml 2020-02-04 15:30:19.000000000 +0000 @@ -3,29 +3,30 @@ author: - 'David E. 
Wheeler ' build_requires: - Module::Build: 0.36 - Test::More: 0.17 + Module::Build: '0.36' + Test::More: '0.17' configure_requires: - Module::Build: 0.36 + Module::Build: '0.36' dynamic_config: 1 -generated_by: 'Module::Build version 0.38, CPAN::Meta::Converter version 2.112150' +generated_by: 'Module::Build version 0.4229, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html - version: 1.4 + version: '1.4' name: Encode-ZapCP1252 provides: Encode::ZapCP1252: file: lib/Encode/ZapCP1252.pm - version: 0.33 + version: '0.40' recommends: - Encode: 0 - Test::Pod: 1.41 + Encode: '0' + Test::Pod: '1.41' requires: - perl: 5.006002 + perl: '5.006002' resources: - bugtracker: http://github.com/theory/encode-cp1252/issues/ - homepage: http://search.cpan.org/dist/Encode-CP1252/ + bugtracker: https://github.com/theory/encode-zapcp1252/issues/ + homepage: https://search.cpan.org/dist/Encode-ZapCP1252/ license: http://dev.perl.org/licenses/ - repository: http://github.com/theory/encode-cp1252/tree -version: 0.33 + repository: https://github.com/theory/encode-zapcp1252 +version: '0.40' +x_serialization_backend: 'CPAN::Meta::YAML version 0.018' diff -Nru libencode-zapcp1252-perl-0.33/README libencode-zapcp1252-perl-0.40/README --- libencode-zapcp1252-perl-0.33/README 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/README 1970-01-01 00:00:00.000000000 +0000 @@ -1,44 +0,0 @@ -Encode/CP1252 version 0.33 -========================== - -Have you ever been processing a Web form submit, assuming that the incoming -text was encoded in ISO-8859-1 (Latin-1), only to end up with a bunch of junk -because someone pasted in content from Microsoft Word? Well, this is because -Microsoft uses a superset of the Latin-1 encoding called "Windows Western" or -"CP1252". So mostly things will come out right, but a few things--like curly -quotes, m-dashes, ellipses, and the like--will not. 
The differences are -well-known; you see a nice chart at documenting the differences on -[Wikipedia](http://en.wikipedia.org/wiki/Windows-1252). - -Of course, that won't really help you. So this library's module, -Encode::ZapCP1252, provides subroutines for removing Windows Western Gremlins -from strings, turning them into their appropriate UTF-8 or ASCII -approximations: - - my $clean_latin1 = zap_cp1252 $latin1_text; - my $fixed_utf8 = fix_cp1252 $utf8_text; - -Installation ------------- - -To install this module, type the following: - - perl Build.PL - ./Build - ./Build test - ./Build install - -Or, if you don't have Module::Build installed, type the following: - - perl Makefile.PL - make - make test - make install - -Copyright and Licence ---------------------- - -Copyright (c) 2005-2010 David E. Wheeler. Some Rights Reserved. - -This module is free software; you can redistribute it and/or modify it under -the same terms as Perl itself. diff -Nru libencode-zapcp1252-perl-0.33/README.md libencode-zapcp1252-perl-0.40/README.md --- libencode-zapcp1252-perl-0.33/README.md 1970-01-01 00:00:00.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/README.md 2020-02-04 15:30:19.000000000 +0000 @@ -0,0 +1,47 @@ +Encode/CP1252 version 0.40 +========================== + +[![CPAN version](https://badge.fury.io/pl/Encode-CP1252.svg)](https://badge.fury.io/pl/Encode-CP1252) +[![Build Status](https://github.com/theory/encode-zapcp1252/workflows/CI/badge.svg)](/theory/encode-zapcp1252/actions/) + +Have you ever been processing a Web form submit, assuming that the incoming +text was encoded in ISO-8859-1 (Latin-1), only to end up with a bunch of junk +because someone pasted in content from Microsoft Word? Well, this is because +Microsoft uses a superset of the Latin-1 encoding called "Windows Western" or +"CP1252". So mostly things will come out right, but a few things--like curly +quotes, m-dashes, ellipses, and the like--will not. 
The differences are +well-known; you see a nice chart at documenting the differences on +[Wikipedia](https://en.wikipedia.org/wiki/Windows-1252). + +Of course, that won't really help you. So this library's module, +Encode::ZapCP1252, provides subroutines for removing Windows Western Gremlins +from strings, turning them into their appropriate UTF-8 or ASCII +approximations: + + my $clean_latin1 = zap_cp1252 $latin1_text; + my $fixed_utf8 = fix_cp1252 $utf8_text; + +Installation +------------ + +To install this module, type the following: + + perl Build.PL + ./Build + ./Build test + ./Build install + +Or, if you don't have Module::Build installed, type the following: + + perl Makefile.PL + make + make test + make install + +Copyright and Licence +--------------------- + +Copyright (c) 2005-2020 David E. Wheeler. Some Rights Reserved. + +This module is free software; you can redistribute it and/or modify it under +the same terms as Perl itself. diff -Nru libencode-zapcp1252-perl-0.33/t/decoded.t libencode-zapcp1252-perl-0.40/t/decoded.t --- libencode-zapcp1252-perl-0.33/t/decoded.t 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/t/decoded.t 2020-02-04 15:30:19.000000000 +0000 @@ -6,7 +6,7 @@ BEGIN { plan skip_all => 'These tests require Perl 5.8.8 or higher' unless $] >= 5.008_008; - plan tests => 6; + plan tests => 10; } BEGIN { use_ok 'Encode::ZapCP1252' or die; } @@ -15,7 +15,7 @@ my $ascii = q{e , f ,, ... + ++ ^ % S < OE Z ' ' " " * - -- ~ (tm) s > oe z Y}; my $utf8 = q{€ , ƒ „ … † ‡ ˆ ‰ Š ‹ Œ Ž ‘ ’ “ ” • – — ˜ ™ š › œ ž Ÿ}; -# Test conversion of decoded from ISO-8859-1. +# Test conversion of text decoded from ISO-8859-1. my $fix_me = Encode::decode( 'ISO-8859-1', join ' ', map { chr } 0x80, 0x82 .. 0x8c, 0x8e, 0x91 .. 0x9c, 0x9e, 0x9f @@ -45,6 +45,7 @@ is $fix_me, $ascii, 'Convert utf8-bit-flipped to ascii'; # Test conversion to decoded with modified table. 
+my $euro = $Encode::ZapCP1252::utf8_for{"\x80"}; $Encode::ZapCP1252::utf8_for{"\x80"} = 'E'; $utf8 =~ s/€/E/; @@ -56,4 +57,14 @@ fix_cp1252 $fix_me; is $fix_me, $utf8, 'Convert decoded from Latin-1 with modified table'; - +# Test it with the valid use of one of the gremlins (π is [0xcf,0x80]) in UTF-8. +is fix_cp1252 'π', 'π', 'Should not convert valid use of 0x80'; +is zap_cp1252 'π', 'π', 'Should not zap valid use of 0x80'; + +# But it should convert it if it's not UTF-8. +my $utf8_euro = Encode::encode_utf8($euro); +$Encode::ZapCP1252::utf8_for{"\x80"} = $utf8_euro; +is fix_cp1252 "\xCF\x80", "\xCF" . $utf8_euro, + 'Should convert 0x80 when not parsing UTF-8'; +is zap_cp1252 "\xCF\x80", qq{\xCF$Encode::ZapCP1252::ascii_for{"\x80"}}, +'Should convert 0x80 to ASCII when not parsing UTF-8'; diff -Nru libencode-zapcp1252-perl-0.33/t/pod.t libencode-zapcp1252-perl-0.40/t/pod.t --- libencode-zapcp1252-perl-0.33/t/pod.t 2011-11-23 05:19:12.000000000 +0000 +++ libencode-zapcp1252-perl-0.40/t/pod.t 1970-01-01 00:00:00.000000000 +0000 @@ -1,9 +0,0 @@ -#!perl -w - -use strict; -use Test::More; -eval 'use Test::Pod 1.41'; -plan skip_all => 'Test::Pod 1.41 required for testing POD' if $@; -eval 'use Encode'; -plan skip_all => 'Encode 1.20 required for testing POD because it has UTF-8 characters' if $@; -all_pod_files_ok();