diff -Nru liblingua-en-nameparse-perl-1.27/Changes liblingua-en-nameparse-perl-1.30/Changes --- liblingua-en-nameparse-perl-1.27/Changes 2010-07-04 07:46:44.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/Changes 2011-03-31 00:11:11.000000000 +0000 @@ -1,5 +1,26 @@ Revision history for Perl CPAN module Lingua::En::NameParse +1.30 31 Marr 2011 + Added component ordering for Mr_J_Adam_Smith name type, thanks to John Hansen + Corrected some of the documentation + Added more military titles + +1.29 23 Jan 2011 + Corrected documentation of case_components module, thanks to John Hansen + Removed invalid space after /Pilot Officer/ in extended titles grammar, thanks to John Hansen + Added the 'Mr_J_Adam_Smith' name type, thanks to John Hansen + Added the 'John' name type, thanks to Graham Seamen + Moved NameGrammar.pm to Lingua::EN::NameParse::Grammar name space + +1.28 3 Jan 2011 + Added more extended titles including Pilot Officer, Count, Duke, Dutchess, Marquess (thanks to Hugh Myers) + Allowed T-Bone as a given name (thanks to Hugh Myers) + Added name type Mr_John_Adam_Smith (thanks to Chris Brown) + Allowed joint names to be reverse cased + Fixed bug when printing report on name of unknown type + Moved saluation paramters from 'new' to 'salutation' method + Allowed for two types of salutation, given_name and title_plus_surname + 1.27 4 Jul 2010 Changed my cygwin fstab file to use noacl option for default mount points. Before this, the tar tool was creating directories with no read permision. diff -Nru liblingua-en-nameparse-perl-1.27/debian/changelog liblingua-en-nameparse-perl-1.30/debian/changelog --- liblingua-en-nameparse-perl-1.27/debian/changelog 2010-08-17 19:45:14.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/debian/changelog 2011-03-31 15:02:06.000000000 +0000 @@ -1,3 +1,34 @@ +liblingua-en-nameparse-perl (1.30-1) unstable; urgency=low + + * New upstream release + * Refreshed patch + + -- Nicholas Bamber Thu, 31 Mar 2011 09:57:48 +0100 + +liblingua-en-nameparse-perl (1.29-1) unstable; urgency=low + + * New upstream release + * Refreshed patch + + -- Nicholas Bamber Fri, 21 Jan 2011 23:07:32 +0000 + +liblingua-en-nameparse-perl (1.28-1) unstable; urgency=low + + [ Russ Allbery ] + * Remove myself from Uploaders. + + [ Nicholas Bamber ] + * New upstream release + * Refreshed interpreter path patch + * Refreshed short description + * Refreshed copyright + + [ gregor herrmann ] + * Update years of upstream copyright (there are 3 different years in the + upstream code ...). + + -- Nicholas Bamber Tue, 04 Jan 2011 20:56:57 +0000 + liblingua-en-nameparse-perl (1.27-1) unstable; urgency=low [ gregor herrmann ] diff -Nru liblingua-en-nameparse-perl-1.27/debian/control liblingua-en-nameparse-perl-1.30/debian/control --- liblingua-en-nameparse-perl-1.27/debian/control 2010-08-16 22:02:51.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/debian/control 2011-01-04 21:31:44.000000000 +0000 @@ -2,9 +2,10 @@ Section: perl Priority: optional Build-Depends: debhelper (>= 7) -Build-Depends-Indep: perl, libparse-recdescent-perl, libtest-pod-perl, libtest-pod-coverage-perl +Build-Depends-Indep: perl, libparse-recdescent-perl, libtest-pod-perl, + libtest-pod-coverage-perl Maintainer: Debian Perl Group -Uploaders: Damyan Ivanov , Russ Allbery , +Uploaders: Damyan Ivanov , Nicholas Bamber Homepage: http://search.cpan.org/dist/Lingua-EN-NameParse/ Vcs-Svn: svn://svn.debian.org/pkg-perl/trunk/liblingua-en-nameparse-perl/ @@ -14,7 +15,7 @@ Package: liblingua-en-nameparse-perl Architecture: all Depends: ${misc:Depends}, ${perl:Depends}, libparse-recdescent-perl -Description: Perl routines for manipulating a person's name +Description: module for parsing a person's name in free text Lingua::EN::NameParse takes as input a person or persons name in free format text such as, . diff -Nru liblingua-en-nameparse-perl-1.27/debian/copyright liblingua-en-nameparse-perl-1.30/debian/copyright --- liblingua-en-nameparse-perl-1.27/debian/copyright 2010-08-16 22:02:51.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/debian/copyright 2011-01-04 21:34:01.000000000 +0000 @@ -4,26 +4,25 @@ Name: Lingua-EN-NameParse Files: * -Copyright: 2008, Kim Ryan +Copyright: 2005-2011, Kim Ryan License: Artistic or GPL-1+ Files: debian/* -Copyright: - 2005-2008, Ernesto Hernández-Novich +Copyright: 2005-2008, Ernesto Hernández-Novich 2007, Russ Allbery 2007, Damyan Ivanov 2008, Roberto C. Sanchez 2008-2010, gregor hermann 2010, Nathan Handler - 2010, Nicholas Bamber + 2010-2011, Nicholas Bamber License: Artistic or GPL-1+ License: Artistic This program is free software; you can redistribute it and/or modify it under the terms of the Artistic License, which comes with Perl. . - On Debian GNU/Linux systems, the complete text of the Artistic License - can be found in `/usr/share/common-licenses/Artistic'. + On Debian systems, the complete text of the Artistic License can be + found in `/usr/share/common-licenses/Artistic'. License: GPL-1+ This program is free software; you can redistribute it and/or modify @@ -31,5 +30,5 @@ the Free Software Foundation; either version 1, or (at your option) any later version. . - On Debian GNU/Linux systems, the complete text of version 1 of the - General Public License can be found in `/usr/share/common-licenses/GPL-1'. + On Debian systems, the complete text of version 1 of the GNU General + Public License can be found in `/usr/share/common-licenses/GPL-1'. diff -Nru liblingua-en-nameparse-perl-1.27/debian/patches/fix_interpreter_path.patch liblingua-en-nameparse-perl-1.30/debian/patches/fix_interpreter_path.patch --- liblingua-en-nameparse-perl-1.27/debian/patches/fix_interpreter_path.patch 2010-08-15 17:16:15.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/debian/patches/fix_interpreter_path.patch 2011-01-04 21:31:44.000000000 +0000 @@ -1,7 +1,7 @@ Subject: Debianize interpreter path in example script Author: Nicholas Bamber Forwarded: not-needed -Last-Update: 2010-08-10 +Last-Update: 2011-01-04 --- a/examples/demo.pl +++ b/examples/demo.pl @@ -1,4 +1,4 @@ @@ -10,7 +10,7 @@ # Demo script for Lingua::EN::NameParse.pm -@@ -109,4 +109,4 @@ +@@ -107,4 +107,4 @@ James Graham, Marquess of Montrose Flight Officer John Gillespie Magee Sir Author Conan Doyle diff -Nru liblingua-en-nameparse-perl-1.27/debian/patches/pod.patch liblingua-en-nameparse-perl-1.30/debian/patches/pod.patch --- liblingua-en-nameparse-perl-1.27/debian/patches/pod.patch 1970-01-01 00:00:00.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/debian/patches/pod.patch 2011-03-31 15:02:06.000000000 +0000 @@ -0,0 +1,15 @@ +Author: Nicholas Bamber +Subject: Spelling mistakes +Last-Update: 2011-03-31 +Forwarded: no +--- a/lib/Lingua/EN/NameParse.pm ++++ b/lib/Lingua/EN/NameParse.pm +@@ -796,7 +796,7 @@ + If name type is unknown , returns undef + + If the name type has a joint name, such as 'Mr_A_Smith_&_Ms_B_Jones', return undef, +-as it is ambigious which surname to place at the start of the string ++as it is ambiguous which surname to place at the start of the string + + Else, returns a string of all cased components in correct reversed order + diff -Nru liblingua-en-nameparse-perl-1.27/debian/patches/series liblingua-en-nameparse-perl-1.30/debian/patches/series --- liblingua-en-nameparse-perl-1.27/debian/patches/series 2010-08-15 17:16:15.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/debian/patches/series 2011-01-04 21:31:44.000000000 +0000 @@ -1 +1,2 @@ fix_interpreter_path.patch +pod.patch diff -Nru liblingua-en-nameparse-perl-1.27/examples/demo.pl liblingua-en-nameparse-perl-1.30/examples/demo.pl --- liblingua-en-nameparse-perl-1.27/examples/demo.pl 2010-07-03 00:59:39.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/examples/demo.pl 2011-01-03 05:40:29.000000000 +0000 @@ -13,8 +13,6 @@ my %args = ( - salutation => 'Dear', - sal_default => 'Friend', auto_clean => 1, force_case => 1, lc_prefix => 0, diff -Nru liblingua-en-nameparse-perl-1.27/lib/Lingua/EN/NameGrammar.pm liblingua-en-nameparse-perl-1.30/lib/Lingua/EN/NameGrammar.pm --- liblingua-en-nameparse-perl-1.27/lib/Lingua/EN/NameGrammar.pm 2010-07-03 00:56:13.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/lib/Lingua/EN/NameGrammar.pm 1970-01-01 00:00:00.000000000 +0000 @@ -1,707 +0,0 @@ -=head1 NAME - -Lingua::EN::NameGrammar - grammar tree for Lingua::EN::NameParse - -=head1 SYNOPSIS - -Internal functions called from NameParse.pm module - -=head1 DESCRIPTION - -Grammar tree of personal name syntax for Lingua::EN::NameParse module. - -The grammar defined here is for use with the Parse::RecDescent module. -Note that parsing is done depth first, meaning match the shortest string first. -To avoid premature matches, when one rule is a sub set of another longer rule, -it must appear after the longer rule. See the Parse::RecDescent documentation -for more details. - - -=head1 AUTHOR - -NameGrammar was written by Kim Ryan . - -=head1 COPYRIGHT AND LICENSE - -Copyright (c) 2005 Kim Ryan. All rights reserved. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself, either Perl version 5.8.4 or, -at your option, any later version of Perl 5 you may have available. - - - - -=cut -#------------------------------------------------------------------------------ - -package Lingua::EN::NameGrammar; -use strict; -use warnings; - - -# Rules that define valid orderings of a names components - -my $rules_start = q{ full_name : }; - -my $rules_joint_names = -q{ - - # A (?) refers to an optional component, occurring 0 or more times. - # Optional items are returned as an array, which for our case will - # always consist of one element, when they exist. - - title given_name surname conjunction title given_name surname non_matching(?) - { - # block of code to define actions upon successful completion of a - # 'production' or rule - - # Two separate people - $return = - { - # Parse::RecDescent lets you return a single scalar, which we use as - # an anonymous hash reference - title_1 => $item[1], - given_name_1 => $item[2], - surname_1 => $item[3], - conjunction_1 => $item[4], - title_2 => $item[5], - given_name_2 => $item[6], - surname_2 => $item[7], - non_matching => $item[8][0], - number => 2, - type => 'Mr_John_Smith_&_Ms_Mary_Jones' - } - } - | - - - title initials surname conjunction title initials surname non_matching(?) - { - $return = - { - title_1 => $item[1], - initials_1 => $item[2], - surname_1 => $item[3], - conjunction_1 => $item[4], - title_2 => $item[5], - initials_2 => $item[6], - surname_2 => $item[7], - non_matching => $item[8][0], - number => 2, - type => 'Mr_A_Smith_&_Ms_B_Jones' - } - } - | - - title initials conjunction initials surname non_matching(?) - { - # Two related people, shared title, separate initials, - # shared surname. Example, father and son, sisters - $return = - { - title_1 => $item[1], - initials_1 => $item[2], - conjunction_1 => $item[3], - initials_2 => $item[4], - surname_1 => $item[5], - non_matching => $item[6][0], - number => 2, - type => 'Mr_A_&_B_Smith' - } - } - | - - title conjunction title initials conjunction initials surname non_matching(?) - { - # Two related people, own initials, shared surname - - $return = - { - title_1 => $item[1], - conjunction_1 => $item[2], - title_2 => $item[3], - initials_1 => $item[4], - conjunction_2 => $item[5], - initials_2 => $item[6], - surname_1 => $item[7], - non_matching => $item[8][0], - number => 2, - type => 'Mr_&_Ms_A_&_B_Smith' - } - } - | - - title initials conjunction title initials surname non_matching(?) - { - # Two related people, own initials, shared surname - $return = - { - title_1 => $item[1], - initials_1 => $item[2], - conjunction_1 => $item[3], - title_2 => $item[4], - initials_2 => $item[5], - surname_1 => $item[6], - non_matching => $item[7][0], - number => 2, - type => 'Mr_A_&_Ms_B_Smith' - } - } - | - - title conjunction title initials surname non_matching(?) - { - # Two related people, shared initials, shared surname - $return = - { - title_1 => $item[1], - conjunction_1 => $item[2], - title_2 => $item[3], - initials_1 => $item[4], - surname_1 => $item[5], - non_matching => $item[6][0], - number => 2, - type => 'Mr_&_Ms_A_Smith' - } - } - | - - given_name surname conjunction given_name surname non_matching(?) - { - $return = - { - given_name_1 => $item[1], - surname_1 => $item[2], - conjunction_1 => $item[3], - given_name_2 => $item[4], - surname_2 => $item[5], - non_matching => $item[6][0], - number => 2, - type => 'John_Smith_&_Mary_Jones' - } - } - | - - initials surname conjunction initials surname non_matching(?) - { - $return = - { - initials_1 => $item[1], - surname_1 => $item[2], - conjunction_1 => $item[3], - initials_2 => $item[4], - surname_2 => $item[5], - non_matching => $item[6][0], - number => 2, - type => 'A_Smith_&_B_Jones' - } - } - | - - given_name conjunction given_name surname non_matching(?) - { - $return = - { - given_name_1 => $item[1], - conjunction_1 => $item[2], - given_name_2 => $item[3], - surname_2 => $item[4], - non_matching => $item[5][0], - number => 2, - type => 'John_&_Mary_Smith' - } - } - | - -}; - -my $rules_single_names = -q{ - - - precursor(?) title given_name single_initial surname suffix(?) non_matching(?) - { - $return = - { - precursor => $item[1][0], - title_1 => $item[2], - given_name_1 => $item[3], - initials_1 => $item[4], - surname_1 => $item[5], - suffix => $item[6][0], - non_matching => $item[7][0], - number => 1, - type => 'Mr_John_A_Smith' - } - } - | - - - precursor(?) title given_name surname suffix(?) non_matching(?) - { - $return = - { - precursor => $item[1][0], - title_1 => $item[2], - given_name_1 => $item[3], - surname_1 => $item[4], - suffix => $item[5][0], - non_matching => $item[6][0], - number => 1, - type => 'Mr_John_Smith' - } - } - | - - precursor(?) title initials surname suffix(?) non_matching(?) - { - $return = - { - precursor => $item[1][0], - title_1 => $item[2], - initials_1 => $item[3], - surname_1 => $item[4], - suffix => $item[5][0], - non_matching => $item[6][0], - number => 1, - type => 'Mr_A_Smith' - } - } - | - - precursor(?) given_name_min_2 middle_name surname suffix(?) non_matching(?) - { - $return = - { - precursor => $item[1][0], - given_name_1 => $item[2], - middle_name => $item[3], - surname_1 => $item[4], - suffix => $item[5][0], - non_matching => $item[6][0], - number => 1, - type => 'John_Adam_Smith' - } - } - | - - precursor(?) given_name_min_2 single_initial surname suffix(?) non_matching(?) - { - $return = - { - precursor => $item[1][0], - given_name_1 => $item[2], - initials_1 => $item[3], - surname_1 => $item[4], - suffix => $item[5][0], - non_matching => $item[6][0], - number => 1, - type => 'John_A_Smith' - } - } - | - - precursor(?) single_initial middle_name surname suffix(?) non_matching(?) - { - $return = - { - precursor => $item[1][0], - initials_1 => $item[2], - middle_name => $item[3], - surname_1 => $item[4], - suffix => $item[5][0], - non_matching => $item[6][0], - number => 1, - type => 'J_Adam_Smith' - } - } - | - - precursor(?) given_name surname suffix(?) non_matching(?) - { - $return = - { - precursor => $item[1][0], - given_name_1 => $item[2], - surname_1 => $item[3], - suffix => $item[4][0], - non_matching => $item[5][0], - number => 1, - type => 'John_Smith' - } - } - | - - precursor(?) initials surname suffix(?) non_matching(?) - { - $return = - { - precursor => $item[1][0], - initials_1 => $item[2], - surname_1 => $item[3], - suffix => $item[4][0], - non_matching => $item[5][0], - number => 1, - type => 'A_Smith', - } - } - | - - non_matching(?) - { - $return = - { - non_matching => $item[1][0], - number => 0, - type => 'unknown' - } - } -}; - -#------------------------------------------------------------------------------ -# Individual components that a name can be composed from. Components are -# expressed as literals or Perl regular expressions. - -my $precursors = -q -{ - precursor : - - /Estate Of (The Late )?/i | - /His (Excellency|Honou?r) /i | - /Her (Excellency|Honou?r) /i | - /The Right Honou?rable /i | - /The Honou?rable /i | - /Right Honou?rable /i | - /The Rt\.? Hon\.? /i | - /The Hon\.? /i | - /Rt\.? Hon\.? /i - -}; - -my $titles = -q{ - - title : - - /Mr\.? /i | - /Ms\.? /i | - /M\/s\.? /i | - /Mrs\.? /i | - /Miss\.? /i | - - /Dr\.? /i | - /Sir /i | - /Dame /i - -}; - -my $extended_titles = -q{ - | - /Messrs /i | # plural or Mr - /Mme\.? /i | # Madame - /Mister /i | - /Mast(\.|er)? /i | - /Ms?gr\.? /i | # Monsignor - /Lord /i | - /Lady /i | - - /Madam(e)? /i | - - # Medical - /Doctor /i | - /Sister /i | - /Matron /i | - - # Legal - /Judge /i | - /Justice /i | - - # Police - /Det\.? /i | - /Insp\.? /i | - - # Military - /Brig(adier)? /i | - /Captain /i | - /Capt\.? /i | - /Colonel /i | - /Col\.? /i | - /Commander /i | - /Commodore /i | - /Cdr\.? /i | # Commander, Commodore - /Field Marshall /i | - /Fl\.? Off\.? /i | - /Flight Officer /i | - /Flt Lt /i | - /Flight Lieutenant /i | - /Gen(\.|eral)? /i | - /Gen\. /i | - /Pte\. /i | - /Private /i | - /Sgt\.? /i | - /Sargent /i | - /Air Commander /i | - /Air Commodore /i | - /Air Marshall /i | - /Lieutenant Colonel /i | - /Lt\.? Col\.? /i | - /Lt\.? Gen\.? /i | - /Lt\.? Cdr\.? /i | - /Lieutenant /i | - /(Lt|Leut|Lieut)\.? /i | - /Major General /i | - /Maj\.? Gen\.?/i | - /Major /i | - /Maj\.? /i - - - # Religious - /Rabbi /i | - /Bishop /i | - /Brother /i | - /Chaplain /i | - /Father /i | - /Pastor /i | - /Mother Superior /i | - /Mother /i | - /Most Rever[e|a]nd /i | - /Very Rever[e|a]nd /i | - /Rever[e|a]nd /i | - /Mt\.? Revd\.? /i | - /V\.? Revd?\.? /i | - /Revd?\.? /i | - - - # Other - /Prof(\.|essor)? /i | - /Ald(\.|erman)? /i -}; - -my $conjunction = q{ conjunction : /And |& /i }; - -# Used in the John_A_Smith and J_Adam_Smith name types. Although this -# duplicates $initials_1, it is needed because this type of initial must -# always be one character long, regardless of the length of initials set -# by the user in the 'new' method. -my $single_initial = q{ single_initial: /[A-Z]\.? /i }; - -# Define given name combinations, specifying the minimum number of letters. -# The correct pair of rules is determined by the 'initials' key in the hash -# passed to the 'new' method. - -# Jo, Jo-Anne, D'Artagnan, O'Shaugnessy La'Keishia -my $given_name_min_2 = -q{ - given_name: /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i -}; - -# Joe ... -my $given_name_min_3 = -q{ - given_name: /[A-Z]{3,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i -}; - -my $given_name_min_4 = -q{ - given_name: /[A-Z]{4,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{3,} /i -}; - -# For use with John_Adam_Smith and John_A_Smith name types -my $fixed_length_given_name = -q{ - given_name_min_2 : /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i -}; - - -# Define initials combinations specifying the minimum and maximum letters. -# Order from most complex to simplest, to avoid premature matching. - -# 'A' 'A.' -my $initials_1 = q{ initials: /[A-Z]\.? /i }; - -# 'A. B.' 'A.B.' 'AB' 'A B' - -my $initials_2 = -q{ - initials: /([A-Z]\. ){1,2}/i | /([A-Z]\.){1,2} /i | /([A-Z] ){1,2}/i | /([A-Z]){1,2} /i -}; - -# 'A. B. C. ' 'A.B.C' 'ABC' 'A B C' -my $initials_3 = -q{ - initials: /([A-Z]\. ){1,3}/i | /([A-Z]\.){1,3} /i | /([A-Z] ){1,3}/i | /([A-Z]){1,3} /i -}; - - -# Jo, Jo-Anne, La'Keishia, D'Artagnan, O'Shaugnessy -my $middle_name = -q{ - middle_name: - - # Dont grab surname prefix too early. For example, John Van Dam could be - # interpreted as middle name of Van and Surname of Dam. So exclude prefixs - # from middle names - ...!prefix /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i - { - $return = $item[2]; - } -}; - - -my $full_surname = -q{ - # Use look-ahead to avoid ambiguity between surname and suffix. For example, - # John Smith Snr, would detect Snr as the surname and Smith as the middle name - surname : ...!suffix sub_surname second_name(?) - { - if ( $item[2] and $item[3][0] ) - { - $return = "$item[2]$item[3][0]"; - } - else - { - $return = $item[2]; - } - } - - sub_surname : prefix(?) name - { - # To prevent warnings when compiling with the -w switch, - # do not return uninitialized variables. - if ( $item[1][0] ) - { - $return = "$item[1][0]$item[2]"; - } - else - { - $return = $item[2]; - } - } - - second_name : '-' sub_surname - { - if ( $item[1] and $item[2] ) - { - $return = "$item[1]$item[2]"; - } - } - - # Patronymic, place name and other surname prefixes - prefix: - - /[A|E]l /i | # Arabic, Greek, - /Ap /i | # Welsh - /Ben /i | # Hebrew - - /Dell([a|e])? /i | # ITALIAN - /Dalle /i | - /D[a|e]ll'/i | - /Dela /i | - /Del /i | - /De (La |Los )?/i | - /D[a|i|u] /i | - /L[a|e|o] /i | - - /[D|L|O]'/i | # Italian, Irish or French - /St\.? /i | # abbreviation for Saint - /San /i | # Spanish - - /Den /i | # DUTCH - /Von (Der )?/i | - /Van (De(n|r)? )?/i - - # space needed for any following text - name: /[A-Z]{2,} ?/i - -}; - -my $suffix = -q{ - suffix: - - # word boundaries are used to stop partial matches from surnames such as - # the "VI" in "VINCE" - - /Esq(\.|uire)?\b ?/i | - /Sn?r\.?\b ?/i | # Senior - /Jn?r\.?\b ?/i | # Junior - /PhD\.?\b ?/i | - /MD\.?\b ?/i | - /LLB\.?\b ?/i | - - - /XI{1,3}\b ?/i | # 11th, 12th, 13th - /X\b ?/i | # 10th - /IV\b ?/i | # 4th - /VI{1,3}\b ?/i | # 6th, 7th, 8th - /V\b ?/i | # 5th - /IX\b ?/i | # 9th - /I{1,3}\b ?/i # 1st, 2nd, 3rd -}; - -# Two or more charaters. This is set to 2 as a work around for the problem -# with detecting suffixes like Snr. and Jnr. The dot here gets picked up -# as non matching. - -my $non_matching = q{ non_matching: /.{2,}/ }; - - -#------------------------------------------------------------------------------- -# Assemble correct combination for grammar tree. - -sub _create -{ - my $name = shift; - - my $grammar = $rules_start; - - if ( $name->{joint_names} ) - { - $grammar .= $rules_joint_names; - } - $grammar .= $rules_single_names . $precursors . $titles; - - if ( $name->{extended_titles} ) - { - $grammar .= $extended_titles; - } - - $grammar .= $conjunction; - - $grammar .= $single_initial; - - $name->{initials} > 3 and $name->{initials} = 3; - $name->{initials} < 1 and $name->{initials} = 1; - - # Define limit of when a string is treated as an initial, or - # a given name. For example, if initials are set to 2, MR TO SMITH - # will have initials of T & O and no given name, but MR TOM SMITH will - # have no initials, and a given name of Tom. - - if ( $name->{initials} == 1 ) - { - $grammar .= $given_name_min_2 . $initials_1; - } - elsif ( $name->{initials} == 2 ) - { - $grammar .= $given_name_min_3 . $initials_2; - } - elsif ( $name->{initials} == 3 ) - { - $grammar .= $given_name_min_4 . $initials_3; - } - - $grammar .= $fixed_length_given_name - . $middle_name - . $full_surname - . $suffix - . $non_matching - ; - - return($grammar); -} -#------------------------------------------------------------------------------- -1; diff -Nru liblingua-en-nameparse-perl-1.27/lib/Lingua/EN/NameParse/Grammar.pm liblingua-en-nameparse-perl-1.30/lib/Lingua/EN/NameParse/Grammar.pm --- liblingua-en-nameparse-perl-1.27/lib/Lingua/EN/NameParse/Grammar.pm 1970-01-01 00:00:00.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/lib/Lingua/EN/NameParse/Grammar.pm 2011-03-31 00:12:05.000000000 +0000 @@ -0,0 +1,761 @@ +=head1 NAME + +Lingua::EN::NameGrammar - grammar tree for Lingua::EN::NameParse + +=head1 SYNOPSIS + +Internal functions called from NameParse.pm module + +=head1 DESCRIPTION + +Grammar tree of personal name syntax for Lingua::EN::NameParse module. + +The grammar defined here is for use with the Parse::RecDescent module. +Note that parsing is done depth first, meaning match the shortest string first. +To avoid premature matches, when one rule is a sub set of another longer rule, +it must appear after the longer rule. See the Parse::RecDescent documentation +for more details. + + +=head1 AUTHOR + +NameParse::Grammar was written by Kim Ryan . + +=head1 COPYRIGHT AND LICENSE + +Copyright (c) 2011 Kim Ryan. All rights reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + + +=cut +#------------------------------------------------------------------------------ + +package Lingua::EN::NameParse::Grammar; +use strict; +use warnings; + +our $VERSION = '1.30'; + + +# Rules that define valid orderings of a names components + +my $rules_start = q{ full_name : }; + +my $rules_joint_names = +q{ + + # A (?) refers to an optional component, occurring 0 or more times. + # Optional items are returned as an array, which for our case will + # always consist of one element, when they exist. + + title given_name surname conjunction title given_name surname non_matching(?) + { + # block of code to define actions upon successful completion of a + # 'production' or rule + + # Two separate people + $return = + { + # Parse::RecDescent lets you return a single scalar, which we use as + # an anonymous hash reference + title_1 => $item[1], + given_name_1 => $item[2], + surname_1 => $item[3], + conjunction_1 => $item[4], + title_2 => $item[5], + given_name_2 => $item[6], + surname_2 => $item[7], + non_matching => $item[8][0], + number => 2, + type => 'Mr_John_Smith_&_Ms_Mary_Jones' + } + } + | + + + title initials surname conjunction title initials surname non_matching(?) + { + $return = + { + title_1 => $item[1], + initials_1 => $item[2], + surname_1 => $item[3], + conjunction_1 => $item[4], + title_2 => $item[5], + initials_2 => $item[6], + surname_2 => $item[7], + non_matching => $item[8][0], + number => 2, + type => 'Mr_A_Smith_&_Ms_B_Jones' + } + } + | + + title initials conjunction initials surname non_matching(?) + { + # Two related people, shared title, separate initials, + # shared surname. Example, father and son, sisters + $return = + { + title_1 => $item[1], + initials_1 => $item[2], + conjunction_1 => $item[3], + initials_2 => $item[4], + surname_1 => $item[5], + non_matching => $item[6][0], + number => 2, + type => 'Mr_A_&_B_Smith' + } + } + | + + title conjunction title initials conjunction initials surname non_matching(?) + { + # Two related people, own initials, shared surname + + $return = + { + title_1 => $item[1], + conjunction_1 => $item[2], + title_2 => $item[3], + initials_1 => $item[4], + conjunction_2 => $item[5], + initials_2 => $item[6], + surname_1 => $item[7], + non_matching => $item[8][0], + number => 2, + type => 'Mr_&_Ms_A_&_B_Smith' + } + } + | + + title initials conjunction title initials surname non_matching(?) + { + # Two related people, own initials, shared surname + $return = + { + title_1 => $item[1], + initials_1 => $item[2], + conjunction_1 => $item[3], + title_2 => $item[4], + initials_2 => $item[5], + surname_1 => $item[6], + non_matching => $item[7][0], + number => 2, + type => 'Mr_A_&_Ms_B_Smith' + } + } + | + + title conjunction title initials surname non_matching(?) + { + # Two related people, shared initials, shared surname + $return = + { + title_1 => $item[1], + conjunction_1 => $item[2], + title_2 => $item[3], + initials_1 => $item[4], + surname_1 => $item[5], + non_matching => $item[6][0], + number => 2, + type => 'Mr_&_Ms_A_Smith' + } + } + | + + given_name surname conjunction given_name surname non_matching(?) + { + $return = + { + given_name_1 => $item[1], + surname_1 => $item[2], + conjunction_1 => $item[3], + given_name_2 => $item[4], + surname_2 => $item[5], + non_matching => $item[6][0], + number => 2, + type => 'John_Smith_&_Mary_Jones' + } + } + | + + initials surname conjunction initials surname non_matching(?) + { + $return = + { + initials_1 => $item[1], + surname_1 => $item[2], + conjunction_1 => $item[3], + initials_2 => $item[4], + surname_2 => $item[5], + non_matching => $item[6][0], + number => 2, + type => 'A_Smith_&_B_Jones' + } + } + | + + given_name conjunction given_name surname non_matching(?) + { + $return = + { + given_name_1 => $item[1], + conjunction_1 => $item[2], + given_name_2 => $item[3], + surname_2 => $item[4], + non_matching => $item[5][0], + number => 2, + type => 'John_&_Mary_Smith' + } + } + | + +}; + +my $rules_single_names = +q{ + + precursor(?) title given_name middle_name surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + title_1 => $item[2], + given_name_1 => $item[3], + middle_name => $item[4], + surname_1 => $item[5], + suffix => $item[6][0], + non_matching => $item[7][0], + number => 1, + type => 'Mr_John_Adam_Smith' + } + } + | + + precursor(?) title given_name single_initial surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + title_1 => $item[2], + given_name_1 => $item[3], + initials_1 => $item[4], + surname_1 => $item[5], + suffix => $item[6][0], + non_matching => $item[7][0], + number => 1, + type => 'Mr_John_A_Smith' + } + } + | + + precursor(?) title single_initial middle_name surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + title_1 => $item[2], + initials_1 => $item[3], + middle_name => $item[4], + surname_1 => $item[5], + suffix => $item[6][0], + non_matching => $item[7][0], + number => 1, + type => 'Mr_J_Adam_Smith' + } + } + | + + + + precursor(?) title given_name surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + title_1 => $item[2], + given_name_1 => $item[3], + surname_1 => $item[4], + suffix => $item[5][0], + non_matching => $item[6][0], + number => 1, + type => 'Mr_John_Smith' + } + } + | + + precursor(?) title initials surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + title_1 => $item[2], + initials_1 => $item[3], + surname_1 => $item[4], + suffix => $item[5][0], + non_matching => $item[6][0], + number => 1, + type => 'Mr_A_Smith' + } + } + | + + precursor(?) given_name_min_2 middle_name surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + given_name_1 => $item[2], + middle_name => $item[3], + surname_1 => $item[4], + suffix => $item[5][0], + non_matching => $item[6][0], + number => 1, + type => 'John_Adam_Smith' + } + } + | + + precursor(?) given_name_min_2 single_initial surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + given_name_1 => $item[2], + initials_1 => $item[3], + surname_1 => $item[4], + suffix => $item[5][0], + non_matching => $item[6][0], + number => 1, + type => 'John_A_Smith' + } + } + | + + precursor(?) single_initial middle_name surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + initials_1 => $item[2], + middle_name => $item[3], + surname_1 => $item[4], + suffix => $item[5][0], + non_matching => $item[6][0], + number => 1, + type => 'J_Adam_Smith' + } + } + | + + precursor(?) given_name surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + given_name_1 => $item[2], + surname_1 => $item[3], + suffix => $item[4][0], + non_matching => $item[5][0], + number => 1, + type => 'John_Smith' + } + } + | + + precursor(?) initials surname suffix(?) non_matching(?) + { + $return = + { + precursor => $item[1][0], + initials_1 => $item[2], + surname_1 => $item[3], + suffix => $item[4][0], + non_matching => $item[5][0], + number => 1, + type => 'A_Smith' + } + } + | + + given_name non_matching(?) + { + $return = + { + given_name_1 => $item[1], + non_matching => $item[2][0], + number => 1, + type => 'John' + } + } + | + + non_matching(?) + { + $return = + { + non_matching => $item[1][0], + number => 0, + type => 'unknown' + } + } +}; + +#------------------------------------------------------------------------------ +# Individual components that a name can be composed from. Components are +# expressed as literals or Perl regular expressions. + +my $precursors = +q +{ + precursor : + + /Estate Of (The Late )?/i | + /His (Excellency|Honou?r) /i | + /Her (Excellency|Honou?r) /i | + /The Right Honou?rable /i | + /The Honou?rable /i | + /Right Honou?rable /i | + /The Rt\.? Hon\.? /i | + /The Hon\.? /i | + /Rt\.? Hon\.? /i + +}; + +my $titles = +q{ + + title : + + /Mr\.? /i | + /Ms\.? /i | + /M\/s\.? /i | + /Mrs\.? /i | + /Miss\.? /i | + + /Dr\.? /i | + /Sir /i | + /Dame /i + +}; + +my $extended_titles = +q{ + | # contiues from titles above + /Messrs /i | # plural or Mr + /Madam(e)? /i | + /Mme\.? /i | # Madame + /Mister /i | + /Mast(\.|er)? /i | + /Ms?gr\.? /i | # Monsignor + /Count /i | + /Countess /i | + /Duke /i | + /Duchess /i | + /Lord /i | + /Lady /i | + /Marquess i/ | + + # Medical + /Doctor /i | + /Sister /i | + /Matron /i | + + # Legal + /Judge /i | + /Justice /i | + + # Police + /Det\.? /i | + /Insp\.? /i | + + # Military + /Brig(adier)? /i | + /Captain /i | + /Capt\.? /i | + /Colonel /i | + /Col\.? /i | + /Commander in Chief /i | + /Commander /i | + /Commodore /i | + /Cdr\.? /i | # Commander, Commodore + /Field Marshall /i | + /Fl\.? Off\.? /i | + /Flight Officer /i | + /Flt Lt /i | + /Flight Lieutenant /i | + /General of the Army /i | + /Gen(\.|eral)? /i | + /Gen\. /i | + /Pte\. /i | + /Private /i | + /Sgt\.? /i | + /Sargent /i | + /Air Commander /i | + /Air Commodore /i | + /Air Marshall /i | + /Lieutenant Colonel /i | + /Lt\.? Col\.? /i | + /Lt\.? Gen\.? /i | + /Lt\.? Cdr\.? /i | + /Lieutenant /i | + /(Lt|Leut|Lieut)\.? /i | + /Major General /i | + /Maj\.? Gen\.?/i | + /Major /i | + /Maj\.? /i | + /Pilot Officer /i | + + + # Religious + /Rabbi /i | + /Bishop /i | + /Brother /i | + /Chaplain /i | + /Father /i | + /Pastor /i | + /Mother Superior /i | + /Mother /i | + /Most Rever[e|a]nd /i | + /Very Rever[e|a]nd /i | + /Rever[e|a]nd /i | + /Mt\.? Revd\.? /i | + /V\.? Revd?\.? /i | + /Revd?\.? /i | + + + # Other + /Prof(\.|essor)? /i | + /Ald(\.|erman)? /i +}; + +my $conjunction = q{ conjunction : /And |& /i }; + +# Used in the John_A_Smith and J_Adam_Smith name types. Although this +# duplicates $initials_1, it is needed because this type of initial must +# always be one character long, regardless of the length of initials set +# by the user in the 'new' method. +my $single_initial = q{ single_initial: /[A-Z]\.? /i }; + +# Define given name combinations, specifying the minimum number of letters. +# The correct pair of rules is determined by the 'initials' key in the hash +# passed to the 'new' method. + +# Examples are Jo, Jo-Anne, D'Artagnan, O'Shaugnessy La'Keishia, T-Bone +my $given_name_min_2 = +q{ + given_name: /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i | /T\-Bone /i +}; + +# Joe, Jo-Anne ... +my $given_name_min_3 = +q{ + given_name: /[A-Z]{3,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i | /T\-Bone /i +}; + + +# John ... +my $given_name_min_4 = +q{ + given_name: /[A-Z]{4,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{3,} /i | /T\-Bone /i +}; + +# For use with John_Adam_Smith and John_A_Smith name types +my $fixed_length_given_name = +q{ + given_name_min_2 : /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i +}; + + +# Define initials combinations specifying the minimum and maximum letters. +# Order from most complex to simplest, to avoid premature matching. + +# 'A' 'A.' +my $initials_1 = q{ initials: /[A-Z]\.? /i }; + +# 'A. B.' 'A.B.' 'AB' 'A B' + +my $initials_2 = +q{ + initials: /([A-Z]\. ){1,2}/i | /([A-Z]\.){1,2} /i | /([A-Z] ){1,2}/i | /([A-Z]){1,2} /i +}; + +# 'A. B. C. ' 'A.B.C' 'ABC' 'A B C' +my $initials_3 = +q{ + initials: /([A-Z]\. ){1,3}/i | /([A-Z]\.){1,3} /i | /([A-Z] ){1,3}/i | /([A-Z]){1,3} /i +}; + + +# Jo, Jo-Anne, La'Keishia, D'Artagnan, O'Shaugnessy +my $middle_name = +q{ + middle_name: + + # Dont grab surname prefix too early. For example, John Van Dam could be + # interpreted as middle name of Van and Surname of Dam. So exclude prefixs + # from middle names + ...!prefix /[A-Z]{2,} /i | /[A-Z]{2,}\-[A-Z]{2,} /i | /[A-Z]{1,}\'[A-Z]{2,} /i + { + $return = $item[2]; + } +}; + + +my $full_surname = +q{ + # Use look-ahead to avoid ambiguity between surname and suffix. For example, + # John Smith Snr, would detect Snr as the surname and Smith as the middle name + surname : ...!suffix sub_surname second_name(?) + { + if ( $item[2] and $item[3][0] ) + { + $return = "$item[2]$item[3][0]"; + } + else + { + $return = $item[2]; + } + } + + sub_surname : prefix(?) name + { + # To prevent warnings when compiling with the -w switch, + # do not return uninitialized variables. + if ( $item[1][0] ) + { + $return = "$item[1][0]$item[2]"; + } + else + { + $return = $item[2]; + } + } + + second_name : '-' sub_surname + { + if ( $item[1] and $item[2] ) + { + $return = "$item[1]$item[2]"; + } + } + + # Patronymic, place name and other surname prefixes + prefix: + + /[A|E]l /i | # Arabic, Greek, + /Ap /i | # Welsh + /Ben /i | # Hebrew + + /Dell([a|e])? /i | # ITALIAN + /Dalle /i | + /D[a|e]ll'/i | + /Dela /i | + /Del /i | + /De (La |Los )?/i | + /D[a|i|u] /i | + /L[a|e|o] /i | + + /[D|L|O]'/i | # Italian, Irish or French + /St\.? /i | # abbreviation for Saint + /San /i | # Spanish + + /Den /i | # DUTCH + /Von (Der )?/i | + /Van (De(n|r)? )?/i + + # space needed for any following text + name: /[A-Z]{2,} ?/i + +}; + +my $suffix = +q{ + suffix: + + # word boundaries are used to stop partial matches from surnames such as + # the "VI" in "VINCE" + + /Esq(\.|uire)?\b ?/i | + /Sn?r\.?\b ?/i | # Senior + /Jn?r\.?\b ?/i | # Junior + /PhD\.?\b ?/i | + /MD\.?\b ?/i | + /LLB\.?\b ?/i | + + + /XI{1,3}\b ?/i | # 11th, 12th, 13th + /X\b ?/i | # 10th + /IV\b ?/i | # 4th + /VI{1,3}\b ?/i | # 6th, 7th, 8th + /V\b ?/i | # 5th + /IX\b ?/i | # 9th + /I{1,3}\b ?/i # 1st, 2nd, 3rd +}; + +# Two or more characters. This is set to 2 as a work around for the problem +# with detecting suffixes like Snr. and Jnr. The dot here gets picked up +# as non matching. + +my $non_matching = q{ non_matching: /.{2,}/ }; + + +#------------------------------------------------------------------------------- +# Assemble correct combination for grammar tree. + +sub _create +{ + my $name = shift; + + my $grammar = $rules_start; + + if ( $name->{joint_names} ) + { + $grammar .= $rules_joint_names; + } + $grammar .= $rules_single_names . $precursors . $titles; + + if ( $name->{extended_titles} ) + { + $grammar .= $extended_titles; + } + + $grammar .= $conjunction; + + $grammar .= $single_initial; + + $name->{initials} > 3 and $name->{initials} = 3; + $name->{initials} < 1 and $name->{initials} = 1; + + # Define limit of when a string is treated as an initial, or + # a given name. For example, if initials are set to 2, MR TO SMITH + # will have initials of T & O and no given name, but MR TOM SMITH will + # have no initials, and a given name of Tom. + + if ( $name->{initials} == 1 ) + { + $grammar .= $given_name_min_2 . $initials_1; + } + elsif ( $name->{initials} == 2 ) + { + $grammar .= $given_name_min_3 . $initials_2; + } + elsif ( $name->{initials} == 3 ) + { + $grammar .= $given_name_min_4 . $initials_3; + } + + $grammar .= $fixed_length_given_name + . $middle_name + . $full_surname + . $suffix + . $non_matching + ; + + return($grammar); +} +#------------------------------------------------------------------------------- +1; diff -Nru liblingua-en-nameparse-perl-1.27/lib/Lingua/EN/NameParse.pm liblingua-en-nameparse-perl-1.30/lib/Lingua/EN/NameParse.pm --- liblingua-en-nameparse-perl-1.27/lib/Lingua/EN/NameParse.pm 2010-07-04 07:47:08.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/lib/Lingua/EN/NameParse.pm 2011-03-31 00:11:58.000000000 +0000 @@ -9,8 +9,6 @@ # optional configuration arguments my %args = ( - salutation => 'Dear', - sal_default => 'Friend', auto_clean => 1, force_case => 1, lc_prefix => 1, @@ -32,8 +30,8 @@ $correct_casing = $name->case_all_reversed ; # de Silva, AC $good_name = &clean("Bad Na9me "); # "Bad Name" - - $name->salutation; # Dear Mr de Silva + + $salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend')); # Dear Mr de Silva %my_properties = $name->properties; $number_surnames = $my_properties{number}; # 1 @@ -94,11 +92,16 @@ To describe the formats supported by NameParse, a short hand representation of the name is used. The following formats are currently supported : + Mr_John_Smith_&_Ms_Mary_Jones Mr_A_Smith_&_Ms_B_Jones Mr_&_Ms_A_&_B_Smith Mr_A_&_Ms_B_Smith Mr_&_Ms_A_Smith Mr_A_&_B_Smith + John_Smith_&_Mary_Jones + John_&_Mary_Smith + A_Smith_&_B_Jones + Mr_John_Adam_Smith Mr_John_A_Smith Mr_J_Adam_Smith @@ -109,19 +112,10 @@ J_Adam_Smith John_Smith A_Smith + John +Precursors and suffixes may be applied to single names that include a surname -Precursors and suffixes are only applied to the following formats: - - Mr_John_A_Smith - Mr_John_Smith - Mr_John_Smith - Mr_A_Smith - John_Adam_Smith - John_A_Smith - J_Adam_Smith - John_Smith - A_Smith =head1 METHODS @@ -141,8 +135,6 @@ my %args = ( - salutation => 'Dear', - sal_default => 'Friend', auto_clean => 1, force_case => 1, lc_prefix => 1, @@ -156,20 +148,6 @@ =over 4 -=item salutation - -The option defines the salutation word, such as "Dear" or "Greetings". It -must be defined if you are planning to use the C method. - -=item sal_default - -This option defines the defaulting word to substitute for the title and -surname(s), when parsing fails to identify them. It is also used when a -precursor occurs. Examples are "Friend" or "Member". It must be defined if -you are planning to use the C method. If an '&' or 'and' occurs -in the unmatched section then it is assumed that we are dealing with more than -one person, and an 's' is appended to the defaulting word. - =item force_case This option will force the C method to name case the entire input @@ -210,7 +188,7 @@ Jones, Jim De Silva, Professor A.B. -The program change the order of the name back to the non reversed format, and +The program changes the order of the name back to the non reversed format, and then performs the normal parsing. Note that if the name can be parsed, the fact that it's order was originally reversed, is not recorded as a property of the name object. @@ -242,18 +220,7 @@ Dr Sir Dame - Reverend - Reverand - Father - Captain - Capt - Colonel - Col - General - Gen - Major - Maj - + Note that if this option is not specified, than by default extended titles are ignored. Disabling extended titles speeds up the processing. @@ -302,7 +269,7 @@ =head2 case_components - %my_name = $name->components; + %my_name = $name->case_components; $cased_surname = $my_name{surname_1}; @@ -370,16 +337,33 @@ =head2 salutation + $salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend',sal_type => 'given_name')); + The C method converts a name into a personal greeting, -such as "Dear Mr & Mrs O'Brien". +such as "Dear Mr & Mrs O'Brien" or "Dear Sue and John" + +Optional parameters may be specided in a hash as follows: -If an error is detected during parsing, such as with the name -"AB Smith & Associates", the title (if it occurs) and the surname(s) are -replaced with a default word like "Friend" or "Member". If the input string -contains a conjunction, an 's' is added to the default. -If the name contains a precursor, a default salutation is also produced. + salutation: + + The greeting word such as 'Dear' or 'Greetings'. If not spefied than 'Dear' is used + + sal_default: + + The default word used when a personalised salution cannot be generated. If not + specified, than 'Friend' is used. + + sal_type: + + Can be either 'given_name' such as 'Dear Sue' or 'title_plus_name' such as 'Dear Ms Smith' + If not specified, than 'given_name' is used. + +If an error is detected during parsing, such as with the name "AB Smith & Associates", +then the value of sal_default is used instead of a given name, or a title and surname. +If the input string contains a conjunction, an 's' is added to the value of sal_default. +If the name contains a precursor, a default salutation is produced. =head2 clean @@ -402,21 +386,24 @@ =item type The type of format a name is in, as one of the following strings: - - Mr_A_Smith_&_Ms_B_Jones - Mr_&_Ms_A_&_B_Smith - Mr_A_&_Ms_B_Smith - Mr_&_Ms_A_Smith - Mr_A_&_B_Smith - Mr_John_A_Smith - Mr_John_Smith - Mr_A_Smith - John_Adam_Smith - John_A_Smith - J_Adam_Smith - John_Smith - A_Smith - unknown + + Mr_A_Smith_&_Ms_B_Jones + Mr_&_Ms_A_&_B_Smith + Mr_A_&_Ms_B_Smith + Mr_&_Ms_A_Smith + Mr_A_&_B_Smith + Mr_John_Adam_Smith + Mr_John_A_Smith + Mr_J_Adam_Smith + Mr_John_Smith + Mr_A_Smith + John_Adam_Smith + John_A_Smith + J_Adam_Smith + John_Smith + A_Smith + John + unknown =item non_matching @@ -442,6 +429,7 @@ Macbeth or MacBeth, are both valid spellings Is ED WOOD E.D. Wood or Edward Wood Is 'Mr Rapid Print' a name or a company + Does John Bradfield Smith have a middle name of Bradfield, or a surname of Bradfield-Smith? One approach is to have large lookup files of names and words, statistical rules and fuzzy logic to attempt to derive context. This approach gives high levels of @@ -487,10 +475,6 @@ Add transforming methods to do things like remove dots from initials Try to derive gender (Mr... is male, Ms, Mrs... is female) -Let the user select what level of complexity of grammar they need for -their data. For example, if you know most of your names are in a "John Smith" -format, you can avoid the ambiguity between two letter given names and -initials. Using a limited grammar subset will also be much faster. Define grammar for other languages. Hopefully, all that would be needed is to specify a new module with its own grammar, and inherit all the existing @@ -533,15 +517,12 @@ =head1 COPYRIGHT AND LICENSE -Copyright (c) 2008 Kim Ryan. All rights reserved. +Copyright (c) 2011 Kim Ryan. All rights reserved. This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself, either Perl version 5.8.4 or, -at your option, any later version of Perl 5 you may have available. - +it under the same terms as Perl itself. =cut - #------------------------------------------------------------------------------- package Lingua::EN::NameParse; @@ -549,13 +530,13 @@ use strict; use warnings; -use Lingua::EN::NameGrammar; +use Lingua::EN::NameParse::Grammar; use Parse::RecDescent; use Exporter; use vars qw (@ISA @EXPORT_OK); -our $VERSION = '1.27'; +our $VERSION = '1.30'; @ISA = qw(Exporter); @EXPORT_OK = qw(&clean &case_surname); @@ -565,33 +546,26 @@ sub new { - my $class = shift; - my %args = @_; - - my $name = {}; - bless($name,$class); - - # Default to 2 initials per name. Can be overwritten if user defines - # 'initials' as a key in the hash supplied to new method. - $name->{initials} = 2; - - my $current_key; - foreach my $current_key (keys %args) - { - if ( $current_key eq 'salutation' or $current_key eq 'sal_default' ) - { - $name->{$current_key} = &_case_word($args{$current_key}); - } - else - { - $name->{$current_key} = $args{$current_key}; - } - } - - my $grammar = &Lingua::EN::NameGrammar::_create($name); - $name->{parse} = new Parse::RecDescent($grammar); - - return ($name); + my $class = shift; + my %args = @_; + + my $name = {}; + bless($name,$class); + + # Default to 2 initials per name. Can be overwritten if user defines + # 'initials' as a key in the hash supplied to new method. + $name->{initials} = 2; + + my $current_key; + foreach my $current_key (keys %args) + { + $name->{$current_key} = $args{$current_key}; + } + + my $grammar = &Lingua::EN::NameParse::Grammar::_create($name); + $name->{parse} = new Parse::RecDescent($grammar); + + return ($name); } #------------------------------------------------------------------------------- # Attempt to parse a string and retrieve it's components and properties @@ -600,41 +574,41 @@ sub parse { - my $name = shift; - my ($input_string) = @_; - - chomp($input_string); - - # If reverse ordered names are allowed, swap the surname component, before - # the comma, with the rest of the name. Rejoin the name, replacing comma - # with a space. - - if ( $name->{allow_reversed} and $input_string =~ /,/ ) - { - my ($first,$second) = split(/,/,$input_string); - $input_string = join(' ',$second,$first); - } - - $name->{components} = (); - $name->{properties} = (); - $name->{properties}{type} = 'unknown'; - $name->{error} = 0; - - $name->{input_string} = $input_string; - - $name = &_pre_parse($name); - unless ( $name->{error} ) - { - $name = &_assemble($name); - &_validate($name); - - if ( $name->{error} and $name->{auto_clean} ) - { - $name->{input_string} = &clean($name->{input_string}); - $name = &_assemble($name); - &_validate($name); - } - } + my $name = shift; + my ($input_string) = @_; + + chomp($input_string); + + # If reverse ordered names are allowed, swap the surname component, before + # the comma, with the rest of the name. Rejoin the name, replacing comma + # with a space. + + if ( $name->{allow_reversed} and $input_string =~ /,/ ) + { + my ($first,$second) = split(/,/,$input_string); + $input_string = join(' ',$second,$first); + } + + $name->{components} = (); + $name->{properties} = (); + $name->{properties}{type} = 'unknown'; + $name->{error} = 0; + + $name->{input_string} = $input_string; + + $name = &_pre_parse($name); + unless ( $name->{error} ) + { + $name = &_assemble($name); + &_validate($name); + + if ( $name->{error} and $name->{auto_clean} ) + { + $name->{input_string} = &clean($name->{input_string}); + $name = &_assemble($name); + &_validate($name); + } + } return($name->{error}); } @@ -643,19 +617,19 @@ sub clean { - my ($input_string) = @_; - - # remove illegal characters - $input_string =~ s/[^A-Za-z\-\'\.&\/ ]//go; - - # remove repeating spaces - $input_string =~ s/ +/ /go ; - - # remove any remaining leading or trailing space - $input_string =~ s/^ //; - $input_string =~ s/ $//; - - return($input_string); + my ($input_string) = @_; + + # remove illegal characters + $input_string =~ s/[^A-Za-z\-\'\.&\/ ]//go; + + # remove repeating spaces + $input_string =~ s/ +/ /go ; + + # remove any remaining leading or trailing space + $input_string =~ s/^ //; + $input_string =~ s/ $//; + + return($input_string); } #------------------------------------------------------------------------------- # Given a name object, returns all components in a hash @@ -714,8 +688,8 @@ #------------------------------------------------------------------------------- # Hash of of lists, indicating the order that name components are assembled in. # Each list element is itself the name of the key value in a name object. -# Used by the case_all, case_all_reversed and salutation methods. -# These hashes are created here globally, ais quite a large overhead is +# Used by the case_all and case_all_reversed methods. +# These hashes are created here globally, as quite a large overhead is # imposed if the are created locally, each time the method is invoked my %component_order= @@ -730,21 +704,34 @@ 'John_&_Mary_Smith' => ['given_name_1','conjunction_1','given_name_2','surname_1'], 'A_Smith_&_B_Jones' => ['initials_1','surname_1','conjunction_1','initials_2','surname_2'], - 'Mr_John_A_Smith' => ['precursor','title_1','given_name_1','initials_1','surname_1','suffix'], - 'Mr_John_Smith' => ['precursor','title_1','given_name_1','surname_1','suffix'], - 'Mr_A_Smith' => ['precursor','title_1','initials_1','surname_1','suffix'], - 'John_Adam_Smith' => ['precursor','given_name_1','middle_name','surname_1','suffix'], - 'John_A_Smith' => ['precursor','given_name_1','initials_1','surname_1','suffix'], - 'J_Adam_Smith' => ['precursor','initials_1','middle_name','surname_1','suffix'], - 'John_Smith' => ['precursor','given_name_1','surname_1','suffix'], - 'A_Smith' => ['precursor','initials_1','surname_1','suffix'] + 'Mr_John_Adam_Smith' => ['precursor','title_1','given_name_1','middle_name','surname_1','suffix'], + 'Mr_John_A_Smith' => ['precursor','title_1','given_name_1','initials_1','surname_1','suffix'], + 'Mr_J_Adam_Smith' => ['precursor','title_1','initials_1','middle_name','surname_1','suffix'], + 'Mr_John_Smith' => ['precursor','title_1','given_name_1','surname_1','suffix'], + 'Mr_A_Smith' => ['precursor','title_1','initials_1','surname_1','suffix'], + 'John_Adam_Smith' => ['precursor','given_name_1','middle_name','surname_1','suffix'], + 'John_A_Smith' => ['precursor','given_name_1','initials_1','surname_1','suffix'], + 'J_Adam_Smith' => ['precursor','initials_1','middle_name','surname_1','suffix'], + 'John_Smith' => ['precursor','given_name_1','surname_1','suffix'], + 'A_Smith' => ['precursor','initials_1','surname_1','suffix'], + 'John' => ['given_name_1'] ); + +# only include names with a single surname my %reverse_component_order= ( - 'Mr_John_A_Smith' => ['surname_1','given_name_1','initials_1','suffix'], - 'Mr_John_Smith' => ['surname_1','given_name_1','suffix'], - 'Mr_A_Smith' => ['surname_1','initials_1','suffix'], + 'Mr_&_Ms_A_&_B_Smith' => ['surname_1','title_1','conjunction_1','title_2','initials_1','conjunction_1','initials_2'], + 'Mr_A_&_Ms_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','title_2','initials_2'], + 'Mr_&_Ms_A_Smith' => ['surname_1','title_1','title_1','conjunction_1','title_2','initials_1'], + 'Mr_A_&_B_Smith' => ['surname_1','title_1','initials_1','conjunction_1','initials_2'], + 'John_&_Mary_Smith' => ['surname_1','given_name_1','conjunction_1','given_name_2'], + + 'Mr_John_Adam_Smith' => ['surname_1','title_1','given_name_1','middle_name','suffix'], + 'Mr_John_A_Smith' => ['surname_1','title_1','given_name_1','initials_1','suffix'], + 'Mr_J_Adam_Smith' => ['surname_1','title_1','initials_1','middle_name','suffix'], + 'Mr_John_Smith' => ['surname_1','title_1','given_name_1','suffix'], + 'Mr_A_Smith' => ['surname_1','title_1','initials_1','suffix'], 'John_Adam_Smith' => ['surname_1','given_name_1','middle_name','suffix'], 'John_A_Smith' => ['surname_1','given_name_1','initials_1','suffix'], 'J_Adam_Smith' => ['surname_1','initials_1','middle_name','suffix'], @@ -754,74 +741,101 @@ #------------------------------------------------------------------------------- # Apply correct capitalisation to a person's entire name -# Return a string of all cased components in correct order +# If the name type is unknown, return undef +# Else, return a string of all cased components in correct order sub case_all { - my $name = shift; + my $name = shift; + + my @cased_name; + + if ( $name->{properties}{type} eq 'unknown' ) + { + return undef; + } + + unless ( $component_order{$name->{properties}{type}} ) + { + # component order missing in array defined above + warn "Component order not defined for: $name->{properties}{type}"; + return undef; + } + + my %component_vals = $name->case_components; + my @order = @{ $component_order{$name->{properties}{type}} }; + + foreach my $component_key ( @order ) + { + # As some components such as precursors are optional, they will appear + # in the order array but may or may not have have a value, so only + # process defined values + if ( $component_vals{$component_key} ) + { + push(@cased_name,$component_vals{$component_key}); + } + } + if ( $name->{error} and $name->{force_case} ) + { + # Despite errors, try to name case non-matching section. As the format + # of this section is unknown, surname case will provide the best + # approximation, but still fail on initials of more than 1 letter + push(@cased_name,&case_surname($name->{properties}{non_matching},$name->{lc_prefix})); + } + + return(join(' ',@cased_name)); +} + +#------------------------------------------------------------------------------- +=head1 case_all_reversed - my @cased_name; +Apply correct capitalisation to a person's entire name and reverse the order +so that surname is first, followed by the other components, such as: Smith, Mr John A +Useful for creating a list of names that can be sorted by surname. - unless ( $name->{properties}{type} eq 'unknown' ) - { - my %component_vals = $name->case_components; - my @order = @{ $component_order{$name->{properties}{type} } }; +If name type is unknown , returns undef - foreach my $component_key ( @order ) - { - # As some components such as precursors are optional, they will appear - # in the order array but may or may not have have a value, so only - # process defined values - if ( $component_vals{$component_key} ) - { - push(@cased_name,$component_vals{$component_key}); - } - } - } +If the name type has a joint name, such as 'Mr_A_Smith_&_Ms_B_Jones', return undef, +as it is ambigious which surname to place at the start of the string - if ( $name->{error} and $name->{force_case} ) - { - # Despite errors, try to name case non-matching section. As the format - # of this section is unknown, surname case will provide the best - # approximation, but still fail on initials of more than 1 letter - push(@cased_name,&case_surname($name->{properties}{non_matching},$name->{lc_prefix})); - } +Else, returns a string of all cased components in correct reversed order - return(join(' ',@cased_name)); -} -#------------------------------------------------------------------------------- -# Apply correct capitalisation to a person's entire name -# Return a string of all cased components in correct reversed order +=cut sub case_all_reversed { - my $name = shift; - - my @cased_name_reversed; - - unless ( $name->{properties}{type} eq 'unknown' ) - { - my %component_vals = $name->case_components; - my @reverse_order = @{ $reverse_component_order{$name->{properties}{type} } }; - - foreach my $component_key ( @reverse_order ) - { - # As some components such as precursors are optional, they will appear - # in the order array but may or may not have have a value, so only - # process defined values - - my $component_value = $component_vals{$component_key}; - if ( $component_value ) - { - if ($component_key eq 'surname_1') + my $name = shift; + + my @cased_name_reversed; + + unless ( $name->{properties}{type} eq 'unknown' ) + { + unless ( $reverse_component_order{$name->{properties}{type} } ) + { + # this type of name should not be reversed, such as two surnames + return undef; + } + my %component_vals = $name->case_components; + my @reverse_order = @{ $reverse_component_order{$name->{properties}{type} } }; + + foreach my $component_key ( @reverse_order ) + { + # As some components such as precursors are optional, they will appear + # in the order array but may or may not have have a value, so only + # process defined values + + my $component_value = $component_vals{$component_key}; + if ( $component_value ) { - $component_value .= ','; + if ($component_key eq 'surname_1') + { + $component_value .= ','; + } + push(@cased_name_reversed,$component_value); } - push(@cased_name_reversed,$component_value); - } - } - } - return(join(' ',@cased_name_reversed)); + } + } + return(join(' ',@cased_name_reversed)); } #------------------------------------------------------------------------------- # The user may specify their own preferred spelling for surnames. @@ -913,7 +927,7 @@ # Correct for possessives such as "John's" or "Australia's". Although this # should not occur in a person's name, they are valid for proper names. # As this subroutine may be used to capitalise words other than names, - # we may need to account for this case. Note that the s must be at the + # we may need to account for this case. Note that the 's' must be at the # end of the string $surname =~ s/(\w+)'S(\s+)/$1's$2/; $surname =~ s/(\w+)'S$/$1's/; @@ -930,80 +944,137 @@ } #------------------------------------------------------------------------------- # Create a personalised greeting from one or two person's names -# Returns the salutation as a string, such as "Dear Mr Smith" +# Returns the salutation as a string, such as "Dear Mr Smith", or "Dear Sue" sub salutation { - my $name = shift; - - unless ( $name->{salutation} and $name->{sal_default}) - { - die ("No salutation word or default defined"); - } - - my @salutation; - push(@salutation,$name->{salutation}); + my $name = shift; + my %args = @_; + + my $salutation = 'Dear'; + my $sal_default = 'Friend'; + my $sal_type = 'title_plus_surname'; - # Personalised salutations cannot be created for Estates or people - # without some title, refer to default salutation - if - ( - $name->{error} or - ( $name->{components}{precursor} and $name->{components}{precursor} =~ /Estate/i) or - not $name->{components}{title_1} - ) - { - # create salutation in the form: Dear Friend(s)? - my $default = $name->{sal_default}; + # Check to see if we should override defualts with any user specified preferences + if ( %args ) + { + foreach my $current_key (keys %args) + { + $current_key eq 'salutation' and $salutation = $args{$current_key}; + $current_key eq 'sal_default' and $sal_default = $args{$current_key}; + $current_key eq 'sal_type' and $sal_type = $args{$current_key}; + } + } + - # Despite an error, the presence of a conjunction probably - # means we are dealing with 2 or more people. - # For example Mr AB Smith & John Jones - if ( $name->{input_string} =~ / (And|&) /i ) - { - $default .= 's'; - } - push(@salutation,$default); - } - else - { - # create salutation in the form: Dear - my %component_vals = $name->case_components; - my @order = @{ $component_order{$name->{properties}{type} } }; - my ($component,@cased_components); - foreach my $component ( @order ) - { - unless - ( - # ignore inital_1, initials_2, given_name_1, etc - $component =~ /precursor|initial|given_name|middle_name|suffix/ or - not $component_vals{$component} ) - { - push(@salutation,$component_vals{$component}); - # shared initial and surname (eg brothers), so duplicate title_1 - if ( $name->{properties}{type} eq 'Mr_A_&_B_Smith' and $component eq 'conjunction_1' ) + my @greeting; + push(@greeting,$salutation); + + # Personalised salutations cannot be created for Estates or people + # without some title + if + ( + $name->{error} or + ( $name->{components}{precursor} and $name->{components}{precursor} =~ /Estate/i) + ) + { + # Despite an error, the presence of a conjunction probably + # means we are dealing with 2 or more people. + # For example Mr AB Smith & John Jones + if ( $name->{input_string} =~ / (And|&) /i ) + { + $sal_default .= 's'; + } + push(@greeting,$sal_default); + } + else + { + my %component_vals = $name->case_components; + + if ( $sal_type eq 'given_name') + { + if ( $component_vals{'given_name_1'} ) { - push(@salutation,$component_vals{title_1}); + push(@greeting,$component_vals{'given_name_1'}); + if ( $component_vals{'given_name_2'} ) + { + push(@greeting,$component_vals{'conjunction_1'}); + push(@greeting,$component_vals{'given_name_2'}); + } } - } - } - } - return(join(' ',@salutation)); + else + { + # No given name such as 'A_Smith','J_Adam_Smith','Mr_A_Smith' + # Must use default + push(@greeting,$sal_default); + } + } + elsif ( $sal_type eq 'title_plus_surname' ) + { + if ( $name->{properties}{number} == 1 ) + { + if ( $component_vals{'title_1'} ) + { + push(@greeting,$component_vals{'title_1'}); + push(@greeting,$component_vals{'surname_1'}); + } + else + { + # No title such as 'A_Smith','J_Adam_Smith', so must use default + push(@greeting,$sal_default); + } + } + elsif ( $name->{properties}{number} == 2 ) + { + # a joint name + + my $type = $name->{properties}{type}; + if ( $type eq 'Mr_&_Ms_A_Smith' or $type eq 'Mr_A_&_Ms_B_Smith' or $type eq 'Mr_&_Ms_A_&_B_Smith' ) + { + # common surname + push(@greeting,$component_vals{'title_1'}); + push(@greeting,$component_vals{'conjunction_1'}); + push(@greeting,$component_vals{'title_2'}); + push(@greeting,$component_vals{'surname_1'}); + + } + elsif ( $type eq 'Mr_A_Smith_&_Ms_B_Jones' or $type eq 'Mr_John_Smith_&_Ms_Mary_Jones' ) + { + push(@greeting,$component_vals{'title_1'}); + push(@greeting,$component_vals{'surname_1'}); + push(@greeting,$component_vals{'conjunction_1'}); + push(@greeting,$component_vals{'title_2'}); + push(@greeting,$component_vals{'surname_2'}); + } + else + { + # No title such as A_Smith_&_B_Jones', 'John_Smith_&_Mary_Jones' + # Must use default + push(@greeting,$sal_default); + } + } + } + else + { + warn "Invalid sal_type : ", $sal_type; + push(@greeting,$sal_default); + } + } + return(join(' ',@greeting)); } #------------------------------------------------------------------------------- -# Return all name properties in a hash +# Return all name properties as a hash sub properties { - my $name = shift; - return(%{ $name->{properties} }); + my $name = shift; + return(%{ $name->{properties} }); } - #------------------------------------------------------------------------------- # Create a text report to standard output listing # - the input string, -# - the name of each defined component +# - the name of each defined component, if it exists # - any non matching component sub report @@ -1011,15 +1082,20 @@ my $name = shift; printf("%-17.17s : %-40.40s\n","Input",$name->{input_string}); - my %comps = $name->case_components; - if ( %comps ) + + my %props = $name->properties; + unless ($props{type} eq 'unknown') { - foreach my $comp ( sort keys %comps) + my %comps = $name->case_components; + if ( %comps ) { - printf("%-17.17s : %s\n",$comp,$comps{$comp}); + foreach my $comp ( sort keys %comps) + { + printf("%-17.17s : %s\n",$comp,$comps{$comp}); + } } } - my %props = $name->properties; + if ( $props{type} ) { printf("%-17.17s : %-40.40s\n","Name type",$props{type}); @@ -1036,18 +1112,26 @@ # PRIVATE METHODS #------------------------------------------------------------------------------- -# Check that common reserved word (as found in company names) do not appear + sub _pre_parse { - my $name = shift; - - if ( $name->{input_string} =~ - /\bPty\.? Ltd\.?$|\bLtd\.?$|\bPLC$|Association|Department|National|Society/i ) - { - $name->{error} = 1; - $name->{properties}{non_matching} = $name->{input_string}; - } - return($name); + my $name = shift; + # Check that common reserved word (as found in company names) do not appear + if ( $name->{input_string} =~ + /\bPty\.? Ltd\.?$|\bLtd\.?$|\bPLC$|Association|Department|National|Society/i ) + { + $name->{error} = 1; + $name->{properties}{non_matching} = $name->{input_string}; + } + + # For the case of a single name such as 'Voltaire' we need to add a trailing space + # to the input string. This is because the grammar tree expects a terminator (the space) + # optionally followed by other productions or non matching text + if ( $name->{input_string} =~ /^[A-Z]{2,}(\-)?[A-Z]{0,}$/i ) + { + $name->{input_string} .= ' '; + } + return($name); } #------------------------------------------------------------------------------- @@ -1056,118 +1140,121 @@ # sub _assemble { - my $name = shift; - - my $parsed_name = $name->{parse}->full_name($name->{input_string}); - - # Place components into a separate hash, so they can be easily returned - # for the user to inspect and modify. - - # For correct matching, the grammar of each component must include the - # trailing space that separates it from any following word. This should - # now be removed from the components, and will be restored by the - # case_all and salutation methods, if called. - - $name->{components}{precursor} = q{}; - if ( $parsed_name->{precursor} ) - { - $name->{components}{precursor} = &_trim_space($parsed_name->{precursor}); - } - - $name->{components}{title_1} = q{}; - if ( $parsed_name->{title_1} ) - { - $name->{components}{title_1} = &_trim_space($parsed_name->{title_1}); - } - - $name->{components}{title_2} = q{}; - if ( $parsed_name->{title_2} ) - { - $name->{components}{title_2} = &_trim_space($parsed_name->{title_2}); - } - - $name->{components}{given_name_1} = q{}; - if ( $parsed_name->{given_name_1} ) - { - $name->{components}{given_name_1} = &_trim_space($parsed_name->{given_name_1}); - } - - $name->{components}{given_name_2} = q{}; - if ( $parsed_name->{given_name_2} ) - { - $name->{components}{given_name_2} = &_trim_space($parsed_name->{given_name_2}); - } - - - $name->{components}{middle_name} = q{}; - if ( $parsed_name->{middle_name} ) - { - $name->{components}{middle_name} = &_trim_space($parsed_name->{middle_name}); - } - - $name->{components}{initials_1} = q{}; - if ( $parsed_name->{initials_1} ) - { - $name->{components}{initials_1} = &_trim_space($parsed_name->{initials_1}); - } - - $name->{components}{initials_2} = q{}; - if ( $parsed_name->{initials_2} ) - { - $name->{components}{initials_2} = &_trim_space($parsed_name->{initials_2}); - } - - $name->{components}{conjunction_1} = q{}; - if ( $parsed_name->{conjunction_1} ) - { - $name->{components}{conjunction_1} = &_trim_space($parsed_name->{conjunction_1}); - } - - $name->{components}{conjunction_2} = q{}; - if ( $parsed_name->{conjunction_2} ) - { - $name->{components}{conjunction_2} = &_trim_space($parsed_name->{conjunction_2}); - } - - $name->{components}{surname_1} = q{}; - if ( $parsed_name->{surname_1} ) - { - $name->{components}{surname_1} = &_trim_space($parsed_name->{surname_1}); - } - - $name->{components}{surname_2} = q{}; - if ( $parsed_name->{surname_2} ) - { - $name->{components}{surname_2} = &_trim_space($parsed_name->{surname_2}); - } - - $name->{components}{suffix} = q{}; - if ( $parsed_name->{suffix} ) - { - $name->{components}{suffix} = &_trim_space($parsed_name->{suffix}); - } - - - $name->{properties}{non_matching} = q{}; - if ( $parsed_name->{non_matching} ) - { - $name->{properties}{non_matching} = $parsed_name->{non_matching}; - } - - $name->{properties}{number} = 0; - $name->{properties}{number} = $parsed_name->{number}; - $name->{properties}{type} = $parsed_name->{type}; - - return($name); + my $name = shift; + + # $::RD_TRACE = 1; # for debugging RecDescent output + # Use Parse::RecDescent to do the parsing. 'full_name' is a label for the complete grammar tree + # defined in Lingua::EN::NameParse::Grammar + my $parsed_name = $name->{parse}->full_name($name->{input_string}); + + # Place components into a separate hash, so they can be easily returned + # for the user to inspect and modify. + + # For correct matching, the grammar of each component must include the + # trailing space that separates it from any following word. This should + # now be removed from the components, and will be restored by the + # case_all and salutation methods, if called. + + $name->{components}{precursor} = q{}; + if ( $parsed_name->{precursor} ) + { + $name->{components}{precursor} = &_trim_space($parsed_name->{precursor}); + } + + $name->{components}{title_1} = q{}; + if ( $parsed_name->{title_1} ) + { + $name->{components}{title_1} = &_trim_space($parsed_name->{title_1}); + } + + $name->{components}{title_2} = q{}; + if ( $parsed_name->{title_2} ) + { + $name->{components}{title_2} = &_trim_space($parsed_name->{title_2}); + } + + $name->{components}{given_name_1} = q{}; + if ( $parsed_name->{given_name_1} ) + { + $name->{components}{given_name_1} = &_trim_space($parsed_name->{given_name_1}); + } + + $name->{components}{given_name_2} = q{}; + if ( $parsed_name->{given_name_2} ) + { + $name->{components}{given_name_2} = &_trim_space($parsed_name->{given_name_2}); + } + + + $name->{components}{middle_name} = q{}; + if ( $parsed_name->{middle_name} ) + { + $name->{components}{middle_name} = &_trim_space($parsed_name->{middle_name}); + } + + $name->{components}{initials_1} = q{}; + if ( $parsed_name->{initials_1} ) + { + $name->{components}{initials_1} = &_trim_space($parsed_name->{initials_1}); + } + + $name->{components}{initials_2} = q{}; + if ( $parsed_name->{initials_2} ) + { + $name->{components}{initials_2} = &_trim_space($parsed_name->{initials_2}); + } + + $name->{components}{conjunction_1} = q{}; + if ( $parsed_name->{conjunction_1} ) + { + $name->{components}{conjunction_1} = &_trim_space($parsed_name->{conjunction_1}); + } + + $name->{components}{conjunction_2} = q{}; + if ( $parsed_name->{conjunction_2} ) + { + $name->{components}{conjunction_2} = &_trim_space($parsed_name->{conjunction_2}); + } + + $name->{components}{surname_1} = q{}; + if ( $parsed_name->{surname_1} ) + { + $name->{components}{surname_1} = &_trim_space($parsed_name->{surname_1}); + } + + $name->{components}{surname_2} = q{}; + if ( $parsed_name->{surname_2} ) + { + $name->{components}{surname_2} = &_trim_space($parsed_name->{surname_2}); + } + + $name->{components}{suffix} = q{}; + if ( $parsed_name->{suffix} ) + { + $name->{components}{suffix} = &_trim_space($parsed_name->{suffix}); + } + + + $name->{properties}{non_matching} = q{}; + if ( $parsed_name->{non_matching} ) + { + $name->{properties}{non_matching} = $parsed_name->{non_matching}; + } + + $name->{properties}{number} = 0; + $name->{properties}{number} = $parsed_name->{number}; + $name->{properties}{type} = $parsed_name->{type}; + + return($name); } #------------------------------------------------------------------------------- # Remove any trailing spaces sub _trim_space { - my ($string) = @_; - $string =~ s/ $//; - return($string); + my ($string) = @_; + $string =~ s/ $//; + return($string); } #------------------------------------------------------------------------------- # Check if any name components have illegal characters, or do not have the @@ -1176,38 +1263,38 @@ sub _validate { - my $name = shift; - - if ( $name->{properties}{non_matching} ) - { - $name->{error} = 1; - } - # illegal characters found - elsif ( $name->{input_string} =~ /[^A-Za-z\-\'\.,&\/ ]/ ) - { - $name->{error} = 1; - } - elsif ( not &_valid_name($name->{components}{given_name_1}) ) - { - $name->{error} = 1; - } - elsif ( not &_valid_name($name->{components}{middle_name}) ) - { - $name->{error} = 1; - } - - elsif ( not &_valid_name($name->{components}{surname_1}) ) - { - $name->{error} = 1; - } - elsif ( not &_valid_name($name->{components}{surname_2}) ) - { - $name->{error} = 1; - } - else - { - $name->{error} = 0; - } + my $name = shift; + + if ( $name->{properties}{non_matching} ) + { + $name->{error} = 1; + } + # illegal characters found + elsif ( $name->{input_string} =~ /[^A-Za-z\-\'\.,&\/ ]/ ) + { + $name->{error} = 1; + } + elsif ( not &_valid_name($name->{components}{given_name_1}) ) + { + $name->{error} = 1; + } + elsif ( not &_valid_name($name->{components}{middle_name}) ) + { + $name->{error} = 1; + } + + elsif ( not &_valid_name($name->{components}{surname_1}) ) + { + $name->{error} = 1; + } + elsif ( not &_valid_name($name->{components}{surname_2}) ) + { + $name->{error} = 1; + } + else + { + $name->{error} = 0; + } } #------------------------------------------------------------------------------- # If the name has an assigned value, check that it contains a vowel sound, @@ -1216,30 +1303,30 @@ sub _valid_name { - my ($name) = @_; - if ( not $name ) - { - return(1); - } - # Names should have a vowel sound, - # valid exceptions are Ng, Tsz,Md, Cng,Hng,Chng etc - elsif ( $name and $name =~ /[aeiouyj]|^(ng|tsz|md|(c?h|[pts])ng)$/i ) - { - return(1); - } - else - { - return(0); - } + my ($name) = @_; + if ( not $name ) + { + return(1); + } + # Names should have a vowel sound, + # valid exceptions are Ng, Tsz,Md, Cng,Hng,Chng etc + elsif ( $name and $name =~ /[aeiouyj]|^(ng|tsz|md|(c?h|[pts])ng)$/i ) + { + return(1); + } + else + { + return(0); + } } #------------------------------------------------------------------------------- # Upper case first letter, lower case the rest, for all words in string sub _case_word { - my ($word) = @_; - - $word =~ s/(\w+)/\u\L$1/g; - return($word); + my ($word) = @_; + + $word =~ s/(\w+)/\u\L$1/g; + return($word); } #------------------------------------------------------------------------------- return(1); diff -Nru liblingua-en-nameparse-perl-1.27/MANIFEST liblingua-en-nameparse-perl-1.30/MANIFEST --- liblingua-en-nameparse-perl-1.27/MANIFEST 2010-07-03 00:57:49.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/MANIFEST 2011-01-18 04:52:58.000000000 +0000 @@ -4,7 +4,7 @@ Makefile.PL examples/demo.pl lib/Lingua/EN/NameParse.pm -lib/Lingua/EN/NameGrammar.pm +lib/Lingua/EN/NameParse/Grammar.pm surname_prefs.txt t/main.t t/rules.t diff -Nru liblingua-en-nameparse-perl-1.27/META.yml liblingua-en-nameparse-perl-1.30/META.yml --- liblingua-en-nameparse-perl-1.27/META.yml 2010-07-04 07:52:43.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/META.yml 2011-03-31 00:14:10.000000000 +0000 @@ -1,14 +1,22 @@ --- #YAML:1.0 -name: Lingua-EN-NameParse -version: 1.27 -abstract: Manipulate peoples names, titles and initials -license: perl -author: +name: Lingua-EN-NameParse +version: 1.30 +abstract: Manipulate peoples names, titles and initials +author: - Kim Ryan -generated_by: ExtUtils::MakeMaker version 6.42_01 -distribution_type: module -requires: - Parse::RecDescent: 0 +license: perl +distribution_type: module +configure_requires: + ExtUtils::MakeMaker: 0 +build_requires: + ExtUtils::MakeMaker: 0 +requires: + Parse::RecDescent: 0 +no_index: + directory: + - t + - inc +generated_by: ExtUtils::MakeMaker version 6.56 meta-spec: - url: http://module-build.sourceforge.net/META-spec-v1.3.html - version: 1.3 + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: 1.4 diff -Nru liblingua-en-nameparse-perl-1.27/README liblingua-en-nameparse-perl-1.30/README --- liblingua-en-nameparse-perl-1.27/README 2010-07-01 11:40:07.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/README 2010-12-30 22:20:19.000000000 +0000 @@ -4,45 +4,43 @@ SYNOPSIS - use Lingua::EN::NameParse qw(clean case_surname); + use Lingua::EN::NameParse qw(clean case_surname); - # optional configuration arguments - my %args = - ( - salutation => 'Dear', - sal_default => 'Friend', - auto_clean => 1, - force_case => 1, - lc_prefix => 1, - initials => 3, - allow_reversed => 1, - joint_names => 0, - extended_titles => 1 - ); + # optional configuration arguments + my %args = + ( + auto_clean => 1, + force_case => 1, + lc_prefix => 1, + initials => 3, + allow_reversed => 1, + joint_names => 0, + extended_titles => 0 + ); - my $name = new Lingua::EN::NameParse(%args); + my $name = new Lingua::EN::NameParse(%args); - $error = $name->parse("MR AC DE SILVA"); + $error = $name->parse("MR AC DE SILVA"); - %name_comps = $name->components; - $surname = $name_comps{surname_1}; # DE SILVA + %name_comps = $name->components; + $surname = $name_comps{surname_1}; # DE SILVA - $correct_casing = $name->case_all; # Mr AC de Silva + $correct_casing = $name->case_all; # Mr AC de Silva - $correct_casing = $name->case_all_reversed ; # de Silva, AC + $correct_casing = $name->case_all_reversed ; # de Silva, AC - $good_name = &clean("Bad Na9me "); # "Bad Name" + $good_name = &clean("Bad Na9me "); # "Bad Name" + + $salutation = $name->salutation(salutation => 'Dear',sal_default => 'Friend')); # Dear Mr de Silva - $name->salutation; # Dear Mr de Silva + %my_properties = $name->properties; + $number_surnames = $my_properties{number}; # 1 + $bad_input = $my_properties{non_matching}; - %my_properties = $name->properties; - $number_surnames = $my_properties{number}; # 1 - $bad_input = $my_properties{non_matching}; + $name->report; # create a report listing all information about the parsed name - $name->report; # create a report listing all information about the parsed name - - $lc_prefix = 0; - $correct_case = &case_surname("DE SILVA-MACNAY",$lc_prefix); # De Silva-MacNay + $lc_prefix = 0; + $correct_case = &case_surname("DE SILVA-MACNAY",$lc_prefix); # De Silva-MacNay DESCRIPTION @@ -88,7 +86,7 @@ COPYRIGHT AND LICENSE -Copyright (c) 2008 Kim Ryan. All rights reserved. +Copyright (c) 2011 Kim Ryan. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the Perl Artistic License (see http://www.perl.com/perl/misc/Artistic.html). diff -Nru liblingua-en-nameparse-perl-1.27/t/main.t liblingua-en-nameparse-perl-1.30/t/main.t --- liblingua-en-nameparse-perl-1.27/t/main.t 2010-07-03 00:58:32.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/t/main.t 2011-01-03 05:37:43.000000000 +0000 @@ -4,7 +4,7 @@ #------------------------------------------------------------------------------ use strict; -use Test::Simple tests => 11; +use Test::Simple tests => 12; use Lingua::EN::NameParse qw(clean case_surname); @@ -39,16 +39,22 @@ $name->parse($input); ok( $name->case_all eq "Mr AB Machlin & Jane O'Brien" ,'force casing'); -# Test salutation -$input = "DR. A.B.C. FEELGOOD"; -$name->parse($input); -ok( $name->salutation eq 'Dear Dr. Feelgood' ,'salutation'); - # Test default salutation $input = "john smith"; $name->parse($input); ok( $name->salutation eq 'Dear Friend' ,'default salutation'); +# Test title_plus_surname salutation +$input = "DR. A.B.C. FEELGOOD"; +$name->parse($input); +ok( $name->salutation(sal_type => 'title_plus_surname') eq 'Dear Dr. Feelgood' ,'title_plus_surname salutation'); + +# Test given_name salutation +$input = "DR ANDREW FEELGOOD"; +$name->parse($input); +ok( $name->salutation(sal_type => 'given_name') eq 'Dear Andrew' ,'given_name salutation'); + + # Test component extraction $input = "Estate Of The Late Lieutenant Colonel AB Van Der Heiden Jnr"; $name->parse($input); diff -Nru liblingua-en-nameparse-perl-1.27/t/rules.t liblingua-en-nameparse-perl-1.30/t/rules.t --- liblingua-en-nameparse-perl-1.27/t/rules.t 2010-07-03 00:59:16.000000000 +0000 +++ liblingua-en-nameparse-perl-1.30/t/rules.t 2011-01-18 06:31:26.000000000 +0000 @@ -4,7 +4,7 @@ #------------------------------------------------------------------------------ use strict; -use Test::Simple tests => 17; +use Test::Simple tests => 20; use Lingua::EN::NameParse; my %args = @@ -62,6 +62,16 @@ %props = $name->properties; ok( $props{type} eq 'A_Smith_&_B_Jones', 'A_Smith_&_B_Jones format'); +$input = "MR JOHN FITZGERALD KENNEDY"; +$name->parse($input); +%props = $name->properties; +ok( $props{type} eq 'Mr_John_Adam_Smith', 'Mr_John_Adam_Smith format'); + +$input = "MR J FITZGERALD KENNEDY"; +$name->parse($input); +%props = $name->properties; +ok( $props{type} eq 'Mr_J_Adam_Smith', 'Mr_J_Adam_Smith format'); + $input = "MR JOHN F KENNEDY"; $name->parse($input); %props = $name->properties; @@ -102,4 +112,9 @@ %props = $name->properties; ok( $props{type} eq 'A_Smith', 'A_Smith format'); +$input = "Voltaire"; +$name->parse($input); +%props = $name->properties; +ok( $props{type} eq 'John', 'John format'); +