diff -Nru libfile-bom-perl-0.14/Changes libfile-bom-perl-0.15/Changes --- libfile-bom-perl-0.14/Changes 2006-10-04 10:45:57.000000000 +0000 +++ libfile-bom-perl-0.15/Changes 2015-12-14 21:33:22.000000000 +0000 @@ -1,5 +1,8 @@ File::BOM changes document +0.15 - Mon Dec 14 2015 + - Fix spelling errors spotted by debian team. Thanks to Gregor Herrmann. + 0.14 - Wed Oct 4 2006 - More workarounds for platforms with odd read() behaviour. diff -Nru libfile-bom-perl-0.14/debian/changelog libfile-bom-perl-0.15/debian/changelog --- libfile-bom-perl-0.14/debian/changelog 2015-08-12 18:56:24.000000000 +0000 +++ libfile-bom-perl-0.15/debian/changelog 2015-12-15 17:34:12.000000000 +0000 @@ -1,3 +1,11 @@ +libfile-bom-perl (0.15-1) unstable; urgency=medium + + * Team upload. + * New upstream release. + * Drop spelling.patch, merged upstream. + + -- gregor herrmann Tue, 15 Dec 2015 18:33:57 +0100 + libfile-bom-perl (0.14-3) unstable; urgency=medium * Team upload. diff -Nru libfile-bom-perl-0.14/debian/patches/series libfile-bom-perl-0.15/debian/patches/series --- libfile-bom-perl-0.14/debian/patches/series 2015-08-12 18:56:24.000000000 +0000 +++ libfile-bom-perl-0.15/debian/patches/series 2015-12-15 17:34:12.000000000 +0000 @@ -1,2 +1 @@ autopkgtest.patch -spelling.patch diff -Nru libfile-bom-perl-0.14/debian/patches/spelling.patch libfile-bom-perl-0.15/debian/patches/spelling.patch --- libfile-bom-perl-0.14/debian/patches/spelling.patch 2015-08-12 18:56:24.000000000 +0000 +++ libfile-bom-perl-0.15/debian/patches/spelling.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -Description: fix a spelling mistake -Origin: vendor -Author: gregor herrmann -Last-Update: 2015-08-12 -Forwarded: https://rt.cpan.org/Ticket/Display.html?id=106391 -Bug: https://rt.cpan.org/Ticket/Display.html?id=106391 - ---- a/lib/File/BOM.pm -+++ b/lib/File/BOM.pm -@@ -234,7 +234,7 @@ - Opens my_file.txt for reading in an appropriate encoding found from the BOM in - that file, or as a UTF-8 file if none is found. - --In the absense of a $default_mode argument, the following 2 calls should be equivalent: -+In the absence of a $default_mode argument, the following 2 calls should be equivalent: - - open_bom(FH, 'no_bom.txt'); - diff -Nru libfile-bom-perl-0.14/lib/File/BOM.pm libfile-bom-perl-0.15/lib/File/BOM.pm --- libfile-bom-perl-0.14/lib/File/BOM.pm 2006-10-04 10:45:57.000000000 +0000 +++ libfile-bom-perl-0.15/lib/File/BOM.pm 2015-12-14 21:33:35.000000000 +0000 @@ -101,7 +101,7 @@ my @vars = qw( %bom2enc %enc2bom ); -our $VERSION = '0.14'; +our $VERSION = '0.15'; our @EXPORT = (); our @EXPORT_OK = ( @subs, @vars ); @@ -234,7 +234,7 @@ Opens my_file.txt for reading in an appropriate encoding found from the BOM in that file, or as a UTF-8 file if none is found. -In the absense of a $default_mode argument, the following 2 calls should be equivalent: +In the absence of a $default_mode argument, the following 2 calls should be equivalent: open_bom(FH, 'no_bom.txt'); diff -Nru libfile-bom-perl-0.14/Makefile.PL libfile-bom-perl-0.15/Makefile.PL --- libfile-bom-perl-0.14/Makefile.PL 2006-10-04 10:45:57.000000000 +0000 +++ libfile-bom-perl-0.15/Makefile.PL 2015-12-14 21:38:19.000000000 +0000 @@ -1,31 +1,35 @@ -# Note: this file was auto-generated by Module::Build::Compat version 0.03 - - unless (eval "use Module::Build::Compat 0.02; 1" ) { - print "This module requires Module::Build to install itself.\n"; - - require ExtUtils::MakeMaker; - my $yn = ExtUtils::MakeMaker::prompt - (' Install Module::Build now from CPAN?', 'y'); - - unless ($yn =~ /^y/i) { - die " *** Cannot install without Module::Build. Exiting ...\n"; - } - - require Cwd; - require File::Spec; - require CPAN; - - # Save this 'cause CPAN will chdir all over the place. - my $cwd = Cwd::cwd(); - - CPAN::Shell->install('Module::Build::Compat'); - CPAN::Shell->expand("Module", "Module::Build::Compat")->uptodate - or die "Couldn't install Module::Build, giving up.\n"; - - chdir $cwd or die "Cannot chdir() back to $cwd: $!"; - } - eval "use Module::Build::Compat 0.02; 1" or die $@; - - Module::Build::Compat->run_build_pl(args => \@ARGV); - require Module::Build; - Module::Build::Compat->write_makefile(build_class => 'Module::Build'); +# Note: this file was auto-generated by Module::Build::Compat version 0.4212 +require 5.008003; + + unless (eval "use Module::Build::Compat 0.02; 1" ) { + print "This module requires Module::Build to install itself.\n"; + + require ExtUtils::MakeMaker; + my $yn = ExtUtils::MakeMaker::prompt + (' Install Module::Build now from CPAN?', 'y'); + + unless ($yn =~ /^y/i) { + die " *** Cannot install without Module::Build. Exiting ...\n"; + } + + require Cwd; + require File::Spec; + require CPAN; + + # Save this 'cause CPAN will chdir all over the place. + my $cwd = Cwd::cwd(); + + CPAN::Shell->install('Module::Build::Compat'); + CPAN::Shell->expand("Module", "Module::Build::Compat")->uptodate + or die "Couldn't install Module::Build, giving up.\n"; + + chdir $cwd or die "Cannot chdir() back to $cwd: $!"; + } + eval "use Module::Build::Compat 0.02; 1" or die $@; + + Module::Build::Compat->run_build_pl(args => \@ARGV); + my $build_script = 'Build'; + $build_script .= '.com' if $^O eq 'VMS'; + exit(0) unless(-e $build_script); # cpantesters convention + require Module::Build; + Module::Build::Compat->write_makefile(build_class => 'Module::Build'); diff -Nru libfile-bom-perl-0.14/MANIFEST libfile-bom-perl-0.15/MANIFEST --- libfile-bom-perl-0.14/MANIFEST 2006-10-04 10:45:57.000000000 +0000 +++ libfile-bom-perl-0.15/MANIFEST 2015-12-14 21:38:12.000000000 +0000 @@ -15,3 +15,4 @@ t/data/broken_bom.txt t/lib/Test/Framework.pm TODO +META.json diff -Nru libfile-bom-perl-0.14/META.json libfile-bom-perl-0.15/META.json --- libfile-bom-perl-0.14/META.json 1970-01-01 00:00:00.000000000 +0000 +++ libfile-bom-perl-0.15/META.json 2015-12-14 21:38:19.000000000 +0000 @@ -0,0 +1,50 @@ +{ + "abstract" : "Utilities for handling Byte Order Marks", + "author" : [ + "Matt Lawrence Emattlaw@cpan.orgE" + ], + "dynamic_config" : 1, + "generated_by" : "Module::Build version 0.4212", + "license" : [ + "perl_5" + ], + "meta-spec" : { + "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", + "version" : "2" + }, + "name" : "File-BOM", + "prereqs" : { + "build" : { + "requires" : { + "Module::Build" : "0.20", + "Test::Exception" : "0.20", + "Test::More" : "0.10" + } + }, + "configure" : { + "requires" : { + "Module::Build" : "0.42" + } + }, + "runtime" : { + "requires" : { + "Encode" : "1.99", + "Readonly" : "0.06", + "perl" : "v5.8.3" + } + } + }, + "provides" : { + "File::BOM" : { + "file" : "lib/File/BOM.pm", + "version" : "0.15" + } + }, + "release_status" : "stable", + "resources" : { + "license" : [ + "http://dev.perl.org/licenses/" + ] + }, + "version" : "0.15" +} diff -Nru libfile-bom-perl-0.14/META.yml libfile-bom-perl-0.15/META.yml --- libfile-bom-perl-0.14/META.yml 2006-10-04 10:45:57.000000000 +0000 +++ libfile-bom-perl-0.15/META.yml 2015-12-14 21:38:19.000000000 +0000 @@ -1,25 +1,28 @@ ---- -name: File-BOM -version: 0.14 -author: - - 'Matt Lawrence Emattlaw@cpan.orgE' -abstract: Utilities for handling Byte Order Marks -license: perl -resources: - license: http://dev.perl.org/licenses/ -requires: - Encode: 1.99 - Readonly: 0.06 - perl: 5.8.3 -build_requires: - Module::Build: 0.20 - Test::Exception: 0.20 - Test::More: 0.10 -provides: - File::BOM: - file: lib/File/BOM.pm - version: 0.14 -generated_by: Module::Build version 0.28 -meta-spec: - url: http://module-build.sourceforge.net/META-spec-v1.2.html - version: 1.2 +--- +abstract: 'Utilities for handling Byte Order Marks' +author: + - 'Matt Lawrence Emattlaw@cpan.orgE' +build_requires: + Module::Build: '0.20' + Test::Exception: '0.20' + Test::More: '0.10' +configure_requires: + Module::Build: '0.42' +dynamic_config: 1 +generated_by: 'Module::Build version 0.4212, CPAN::Meta::Converter version 2.150001' +license: perl +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: '1.4' +name: File-BOM +provides: + File::BOM: + file: lib/File/BOM.pm + version: '0.15' +requires: + Encode: '1.99' + Readonly: '0.06' + perl: v5.8.3 +resources: + license: http://dev.perl.org/licenses/ +version: '0.15' diff -Nru libfile-bom-perl-0.14/README libfile-bom-perl-0.15/README --- libfile-bom-perl-0.14/README 2006-10-04 10:45:57.000000000 +0000 +++ libfile-bom-perl-0.15/README 2015-12-14 21:38:19.000000000 +0000 @@ -1,363 +1,383 @@ -NAME - File::BOM - Utilities for handling Byte Order Marks - -SYNOPSIS - use File::BOM qw( :all ) - - high-level functions - # read a file with encoding from the BOM: - open_bom(FH, $file) - open_bom(FH, $file, ':utf8') # the same but with a default encoding - - # get encoding too - $encoding = open_bom(FH, $file, ':utf8'); - - # open a potentially unseekable file: - ($encoding, $spillage) = open_bom(FH, $file, ':utf8'); - - # change encoding of an open handle according to BOM - $encoding = defuse(*HANDLE); - ($encoding, $spillage) = defuse(*HANDLE); - - # Decode a string according to leading BOM: - $unicode = decode_from_bom($string_with_bom); - - # Decode a string and get the encoding: - ($unicode, $encoding) = decode_from_bom($string_with_bom) - - PerlIO::via interface - # Read the Right Thing from a unicode file with BOM: - open(HANDLE, '<:via(File::BOM)', $filename) - - # Writing little-endian UTF-16 file with BOM: - open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', $filename) - - lower-level functions - # read BOM encoding from a filehandle: - $encoding = get_encoding_from_filehandle(FH) - - # Get encoding even if FH is unseekable: - ($encoding, $spillage) = get_encoding_from_filehandle(FH); - - # Get encoding from a known unseekable handle: - ($encdoing, $spillage) = get_encoding_from_stream(FH); - - # get encoding and BOM length from BOM at start of string: - ($encoding, $offset) = get_encoding_from_bom($string); - - variables - # print a BOM for a known encoding - print FH $enc2bom{$encoding}; - - # get an encoding from a known BOM - $enc = $bom2enc{$bom} - -DESCRIPTION - This module provides functions for handling unicode byte order marks, - which are to be found at the beginning of some files and streams. - - For details about what a byte order mark is, see - - - The intention of File::BOM is for files with BOMs to be readable as - seamlessly as possible, regardless of the encoding used. To that end, - several different interfaces are available, as shown in the synopsis - above. - -EXPORTS - Nothing by default. - - symbols - * open_bom() - * defuse() - * decode_from_bom() - * get_encoding_from_filehandle() - * get_encoding_from_stream() - * get_encoding_from_bom() - * %bom2enc - * %enc2bom - - tags - * :all - All of the above - - * :subs - subroutines only - - * :vars - just %bom2enc and %enc2bom - -VARIABLES - %bom2enc - Maps Byte Order marks to their encodings. - - The keys of this hash are strings which represent the BOMs, the values - are their encodings, in a format which is understood by Encode - - The encodings represented in this hash are: UTF-8, UTF-16BE, UTF-16LE, - UTF-32BE and UTF-32LE - - %enc2bom - A reverse-lookup hash for bom2enc, with a few aliases used in Encode, - namely utf8, iso-10646-1 and UCS-2. - - Note that UTF-16, UTF-32 and UCS-4 are not included in this hash. Mainly - because Encode::encode automatically puts BOMs on output. See - Encode::Unicode - -FUNCTIONS - open_bom - $encoding = open_bom(HANDLE, $filename, $default_mode) - - ($encoding, $spill) = open_bom(HANDLE, $filename, $default_mode) - - opens HANDLE for reading on $filename, setting the mode to the - appropriate encoding for the BOM stored in the file. - - On failure, a fatal error is raised, see the DIAGNOSTICS section for - details on how to catch these. This is in order to allow the return - value(s) to be used for other purposes. - - If the file doesn't contain a BOM, $default_mode is used instead. Hence: - - open_bom(FH, 'my_file.txt', ':utf8') - - Opens my_file.txt for reading in an appropriate encoding found from the - BOM in that file, or as a UTF-8 file if none is found. - - In the absense of a $default_mode argument, the following 2 calls should - be equivalent: - - open_bom(FH, 'no_bom.txt'); - - open(FH, '<', 'no_bom.txt'); - - If an undefined value is passed as the handle, a symbol will be - generated for it like open() does: - - # create filehandle on the fly - $enc = open_bom(my $fh, $filename, ':utf8'); - $line = <$fh>; - - The filehandle will be cued up to read after the BOM. Unseekable files - (e.g. fifos) will cause croaking, unless called in list context to catch - spillage from the handle. Any spillage will be automatically decoded - from the encoding, if found. - - e.g. - - # croak if my_socket is unseekable - open_bom(FH, 'my_socket'); - - # keep spillage if my_socket is unseekable - ($encoding, $spillage) = open_bom(FH, 'my_socket'); - - # discard any spillage from open_bom - ($encoding) = open_bom(FH, 'my_socket'); - - defuse - $enc = defuse(FH); - - ($enc, $spill) = defuse(FH); - - FH should be a filehandle opened for reading, it will have the relevant - encoding layer pushed onto it be binmode if a BOM is found. Spillage - should be Unicode, not bytes. - - Any uncaptured spillage will be silently lost. If the handle is - unseekable, use list context to avoid data loss. - - If no BOM is found, the mode will be unaffected. - - decode_from_bom - $unicode_string = decode_from_bom($string, $default, $check) - - ($unicode_string, $encoding) = decode_from_bom($string, $default, $check) - - Reads a BOM from the beginning of $string, decodes $string (minus the - BOM) and returns it to you as a perl unicode string. - - if $string doesn't have a BOM, $default is used instead. - - $check, if supplied, is passed to Encode::decode as the third argument. - - If there's no BOM and no default, the original string is returned and - encoding is ''. - - See Encode - - get_encoding_from_filehandle - $encoding = get_encoding_from_filehandle(HANDLE) - - ($encoding, $spillage) = get_encoding_from_filehandle(HANDLE) - - Returns the encoding found in the given filehandle. - - The handle should be opened in a non-unicode way (e.g. mode '<:bytes') - so that the BOM can be read in its natural state. - - After calling, the handle will be set to read at a point after the BOM - (or at the beginning of the file if no BOM was found) - - If called in scalar context, unseekable handles cause a croak(). - - If called in list context, unseekable handles will be read byte-by-byte - and any spillage will be returned. See get_encoding_from_stream() - - get_encoding_from_stream - ($encoding, $spillage) = get_encoding_from_stream(*FH); - - Read a BOM from an unrewindable source. This means reading the stream - one byte at a time until either a BOM is found or every possible BOM is - ruled out. Any non-BOM bytes read from the handle will be returned in - $spillage. - - If a BOM is found and the spillage contains a partial character (judging - by the expected character width for the encoding) more bytes will be - read from the handle to ensure that a complete character is returned. - - Spillage is always in bytes, not characters. - - This function is less efficient than get_encoding_from_filehandle, but - should work just as well on a seekable handle as on an unseekable one. - - get_encoding_from_bom - ($encoding, $offset) = get_encoding_from_bom($string) - - Returns the encoding and length in bytes of the BOM in $string. - - If there is no BOM, an empty string is returned and $offset is zero. - - To get the data from the string, the following should work: - - use Encode; - - my($encoding, $offset) = get_encoding_from_bom($string); - - if ($encoding) { - $string = decode($encoding, substr($string, $offset)) - } - -PerlIO::via interface - File::BOM can be used as a PerlIO::via interface. - - open(HANDLE, '<:via(File::BOM)', 'my_file.txt'); - - open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', 'out_file.txt) - print "foo\n"; # BOM is written to file here - - This method is less prone to errors on non-seekable files as spillage is - incorporated into an internal buffer, but it doesn't give you any - information about the encoding being used, or indeed whether or not a - BOM was present. - - There are a few known problems with this interface, especially - surrounding seek() and tell(), please see the BUGS section for more - details about this. - - Reading - The via(File::BOM) layer must be added before the handle is read from, - otherwise any BOM will be missed. If there is no BOM, no decoding will - be done. - - Because of a limitation in PerlIO::via, read() always works on bytes, - not characters. BOM decoding will still be done but output will be bytes - of UTF-8. - - open(BOM, '<:via(File::BOM)', $file) - $bytes_read = read(BOM, $buffer, $length); - $unicode = decode('UTF-8', $buffer, Encode::FB_QUIET); - - # Now $unicode is valid unicode and $buffer contains any left-over bytes - - Writing - Add the via(File::BOM) layer on top of a unicode encoding layer to print - a BOM at the start of the output file. This needs to be done before any - data is written. The BOM is written as part of the first print command - on the handle, so if you don't print anything to the handle, you won't - get a BOM. - - There is a "Wide character in print" warning generated when the - via(File::BOM) layer doesn't receive utf8 on writing. This glitch was - resolved in perl version 5.8.7, but if your perl version is older than - that, you'll need to make sure that the via(File::BOM) layer receives - utf8 like this: - - # This works OK - open(FH, '>:encoding(UTF-16LE):via(File::BOM):utf8', $filename) - - # This generates warnings with older perls - open(FH, '>:encoding(UTF-16LE):via(File::BOM)', $filename) - - Seeking - Seeking with SEEK_SET results in an offset equal to the length of any - detected BOM being applied to the position parameter. Thus: - - # Seek to end of BOM (not start of file!) - seek(FILE_BOM_HANDLE, 0, SEEK_SET) - - Telling - In order to work correctly with seek(), tell() also returns a postion - adjusted by the length of the BOM. - -SEE ALSO - * Encode - * Encode::Unicode - * - -DIAGNOSTICS - The following exceptions are raised via croak() - - * Couldn't read '': $! - open_bom() couldn't open the given file for reading - - * Couldn't set binmode of handle opened on '' to '': $! - open_bom() couldn't set the binmode of the handle - - * No string - decode_from_bom called on an undefined value - - * Unseekable handle: $! - get_encoding_from_filehandle() or open_bom() called on an unseekable - file or handle in scalar context. - - * Couldn't read from handle: $! - _get_encoding_seekable() couldn't read the handle. This function is - called from get_encoding_from_filehandle(), defuse() and open_bom() - - * Couldn't reset read position: $! - _get_encoding_seekable couldn't seek to the position after the BOM. - - * Couldn't read byte: $! - get_encoding_from_stream couldn't read from the handle. This - function is called from get_encoding_from_filehandle() and - open_bom() when the handle or file is unseekable. - -BUGS - Older versions of PerlIO::via have a few problems with writing, see - above. - - The current version of PerlIO::via has limitations with regard to seek - and tell, currently only line-wise seek and tell are supported by this - module. If read() is used to read partial lines, tell() will still give - the position of the end of the last line read. - - Under windows, tell() seems to return erroneously when reading files - with unix line endings. - - Under windows, warnings may be generated when using the PerlIO::via - interface to read UTF-16LE and UTF-32LE encoded files. This seems to be - a bug in the relevant encoding(...) layers. - -AUTHOR - Matt Lawrence - - With thanks to Mark Fowler and Steve Purkis for additional tests and - advice. - -COPYRIGHT - Copyright 2005 Matt Lawrence, All Rights Reserved. - - This program is free software; you can redistribute it and/or modify it - under the same terms as Perl itself. - +NAME + File::BOM - Utilities for handling Byte Order Marks + +SYNOPSIS + use File::BOM qw( :all ) + + high-level functions + # read a file with encoding from the BOM: + open_bom(FH, $file) + open_bom(FH, $file, ':utf8') # the same but with a default encoding + + # get encoding too + $encoding = open_bom(FH, $file, ':utf8'); + + # open a potentially unseekable file: + ($encoding, $spillage) = open_bom(FH, $file, ':utf8'); + + # change encoding of an open handle according to BOM + $encoding = defuse(*HANDLE); + ($encoding, $spillage) = defuse(*HANDLE); + + # Decode a string according to leading BOM: + $unicode = decode_from_bom($string_with_bom); + + # Decode a string and get the encoding: + ($unicode, $encoding) = decode_from_bom($string_with_bom) + + PerlIO::via interface + # Read the Right Thing from a unicode file with BOM: + open(HANDLE, '<:via(File::BOM)', $filename) + + # Writing little-endian UTF-16 file with BOM: + open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', $filename) + + lower-level functions + # read BOM encoding from a filehandle: + $encoding = get_encoding_from_filehandle(FH) + + # Get encoding even if FH is unseekable: + ($encoding, $spillage) = get_encoding_from_filehandle(FH); + + # Get encoding from a known unseekable handle: + ($encdoing, $spillage) = get_encoding_from_stream(FH); + + # get encoding and BOM length from BOM at start of string: + ($encoding, $offset) = get_encoding_from_bom($string); + + variables + # print a BOM for a known encoding + print FH $enc2bom{$encoding}; + + # get an encoding from a known BOM + $enc = $bom2enc{$bom} + +DESCRIPTION + This module provides functions for handling unicode byte order marks, + which are to be found at the beginning of some files and streams. + + For details about what a byte order mark is, see + + + The intention of File::BOM is for files with BOMs to be readable as + seamlessly as possible, regardless of the encoding used. To that end, + several different interfaces are available, as shown in the synopsis + above. + +EXPORTS + Nothing by default. + + symbols + * open_bom() + + * defuse() + + * decode_from_bom() + + * get_encoding_from_filehandle() + + * get_encoding_from_stream() + + * get_encoding_from_bom() + + * %bom2enc + + * %enc2bom + + tags + * :all + + All of the above + + * :subs + + subroutines only + + * :vars + + just %bom2enc and %enc2bom + +VARIABLES + %bom2enc + Maps Byte Order marks to their encodings. + + The keys of this hash are strings which represent the BOMs, the values + are their encodings, in a format which is understood by Encode + + The encodings represented in this hash are: UTF-8, UTF-16BE, UTF-16LE, + UTF-32BE and UTF-32LE + + %enc2bom + A reverse-lookup hash for bom2enc, with a few aliases used in Encode, + namely utf8, iso-10646-1 and UCS-2. + + Note that UTF-16, UTF-32 and UCS-4 are not included in this hash. Mainly + because Encode::encode automatically puts BOMs on output. See + Encode::Unicode + +FUNCTIONS + open_bom + $encoding = open_bom(HANDLE, $filename, $default_mode) + + ($encoding, $spill) = open_bom(HANDLE, $filename, $default_mode) + + opens HANDLE for reading on $filename, setting the mode to the + appropriate encoding for the BOM stored in the file. + + On failure, a fatal error is raised, see the DIAGNOSTICS section for + details on how to catch these. This is in order to allow the return + value(s) to be used for other purposes. + + If the file doesn't contain a BOM, $default_mode is used instead. Hence: + + open_bom(FH, 'my_file.txt', ':utf8') + + Opens my_file.txt for reading in an appropriate encoding found from the + BOM in that file, or as a UTF-8 file if none is found. + + In the absence of a $default_mode argument, the following 2 calls should + be equivalent: + + open_bom(FH, 'no_bom.txt'); + + open(FH, '<', 'no_bom.txt'); + + If an undefined value is passed as the handle, a symbol will be + generated for it like open() does: + + # create filehandle on the fly + $enc = open_bom(my $fh, $filename, ':utf8'); + $line = <$fh>; + + The filehandle will be cued up to read after the BOM. Unseekable files + (e.g. fifos) will cause croaking, unless called in list context to catch + spillage from the handle. Any spillage will be automatically decoded + from the encoding, if found. + + e.g. + + # croak if my_socket is unseekable + open_bom(FH, 'my_socket'); + + # keep spillage if my_socket is unseekable + ($encoding, $spillage) = open_bom(FH, 'my_socket'); + + # discard any spillage from open_bom + ($encoding) = open_bom(FH, 'my_socket'); + + defuse + $enc = defuse(FH); + + ($enc, $spill) = defuse(FH); + + FH should be a filehandle opened for reading, it will have the relevant + encoding layer pushed onto it be binmode if a BOM is found. Spillage + should be Unicode, not bytes. + + Any uncaptured spillage will be silently lost. If the handle is + unseekable, use list context to avoid data loss. + + If no BOM is found, the mode will be unaffected. + + decode_from_bom + $unicode_string = decode_from_bom($string, $default, $check) + + ($unicode_string, $encoding) = decode_from_bom($string, $default, $check) + + Reads a BOM from the beginning of $string, decodes $string (minus the + BOM) and returns it to you as a perl unicode string. + + if $string doesn't have a BOM, $default is used instead. + + $check, if supplied, is passed to Encode::decode as the third argument. + + If there's no BOM and no default, the original string is returned and + encoding is ''. + + See Encode + + get_encoding_from_filehandle + $encoding = get_encoding_from_filehandle(HANDLE) + + ($encoding, $spillage) = get_encoding_from_filehandle(HANDLE) + + Returns the encoding found in the given filehandle. + + The handle should be opened in a non-unicode way (e.g. mode '<:bytes') + so that the BOM can be read in its natural state. + + After calling, the handle will be set to read at a point after the BOM + (or at the beginning of the file if no BOM was found) + + If called in scalar context, unseekable handles cause a croak(). + + If called in list context, unseekable handles will be read byte-by-byte + and any spillage will be returned. See get_encoding_from_stream() + + get_encoding_from_stream + ($encoding, $spillage) = get_encoding_from_stream(*FH); + + Read a BOM from an unrewindable source. This means reading the stream + one byte at a time until either a BOM is found or every possible BOM is + ruled out. Any non-BOM bytes read from the handle will be returned in + $spillage. + + If a BOM is found and the spillage contains a partial character (judging + by the expected character width for the encoding) more bytes will be + read from the handle to ensure that a complete character is returned. + + Spillage is always in bytes, not characters. + + This function is less efficient than get_encoding_from_filehandle, but + should work just as well on a seekable handle as on an unseekable one. + + get_encoding_from_bom + ($encoding, $offset) = get_encoding_from_bom($string) + + Returns the encoding and length in bytes of the BOM in $string. + + If there is no BOM, an empty string is returned and $offset is zero. + + To get the data from the string, the following should work: + + use Encode; + + my($encoding, $offset) = get_encoding_from_bom($string); + + if ($encoding) { + $string = decode($encoding, substr($string, $offset)) + } + +PerlIO::via interface + File::BOM can be used as a PerlIO::via interface. + + open(HANDLE, '<:via(File::BOM)', 'my_file.txt'); + + open(HANDLE, '>:encoding(UTF-16LE):via(File::BOM)', 'out_file.txt) + print "foo\n"; # BOM is written to file here + + This method is less prone to errors on non-seekable files as spillage is + incorporated into an internal buffer, but it doesn't give you any + information about the encoding being used, or indeed whether or not a + BOM was present. + + There are a few known problems with this interface, especially + surrounding seek() and tell(), please see the BUGS section for more + details about this. + + Reading + The via(File::BOM) layer must be added before the handle is read from, + otherwise any BOM will be missed. If there is no BOM, no decoding will + be done. + + Because of a limitation in PerlIO::via, read() always works on bytes, + not characters. BOM decoding will still be done but output will be bytes + of UTF-8. + + open(BOM, '<:via(File::BOM)', $file) + $bytes_read = read(BOM, $buffer, $length); + $unicode = decode('UTF-8', $buffer, Encode::FB_QUIET); + + # Now $unicode is valid unicode and $buffer contains any left-over bytes + + Writing + Add the via(File::BOM) layer on top of a unicode encoding layer to print + a BOM at the start of the output file. This needs to be done before any + data is written. The BOM is written as part of the first print command + on the handle, so if you don't print anything to the handle, you won't + get a BOM. + + There is a "Wide character in print" warning generated when the + via(File::BOM) layer doesn't receive utf8 on writing. This glitch was + resolved in perl version 5.8.7, but if your perl version is older than + that, you'll need to make sure that the via(File::BOM) layer receives + utf8 like this: + + # This works OK + open(FH, '>:encoding(UTF-16LE):via(File::BOM):utf8', $filename) + + # This generates warnings with older perls + open(FH, '>:encoding(UTF-16LE):via(File::BOM)', $filename) + + Seeking + Seeking with SEEK_SET results in an offset equal to the length of any + detected BOM being applied to the position parameter. Thus: + + # Seek to end of BOM (not start of file!) + seek(FILE_BOM_HANDLE, 0, SEEK_SET) + + Telling + In order to work correctly with seek(), tell() also returns a postion + adjusted by the length of the BOM. + +SEE ALSO + * Encode + + * Encode::Unicode + + * + +DIAGNOSTICS + The following exceptions are raised via croak() + + * Couldn't read '': $! + + open_bom() couldn't open the given file for reading + + * Couldn't set binmode of handle opened on '' to '': + $! + + open_bom() couldn't set the binmode of the handle + + * No string + + decode_from_bom called on an undefined value + + * Unseekable handle: $! + + get_encoding_from_filehandle() or open_bom() called on an unseekable + file or handle in scalar context. + + * Couldn't read from handle: $! + + _get_encoding_seekable() couldn't read the handle. This function is + called from get_encoding_from_filehandle(), defuse() and open_bom() + + * Couldn't reset read position: $! + + _get_encoding_seekable couldn't seek to the position after the BOM. + + * Couldn't read byte: $! + + get_encoding_from_stream couldn't read from the handle. This + function is called from get_encoding_from_filehandle() and + open_bom() when the handle or file is unseekable. + +BUGS + Older versions of PerlIO::via have a few problems with writing, see + above. + + The current version of PerlIO::via has limitations with regard to seek + and tell, currently only line-wise seek and tell are supported by this + module. If read() is used to read partial lines, tell() will still give + the position of the end of the last line read. + + Under windows, tell() seems to return erroneously when reading files + with unix line endings. + + Under windows, warnings may be generated when using the PerlIO::via + interface to read UTF-16LE and UTF-32LE encoded files. This seems to be + a bug in the relevant encoding(...) layers. + +AUTHOR + Matt Lawrence + + With thanks to Mark Fowler and Steve Purkis for additional tests and + advice. + +COPYRIGHT + Copyright 2005 Matt Lawrence, All Rights Reserved. + + This program is free software; you can redistribute it and/or modify it + under the same terms as Perl itself. +