Wednesday, 24 March 2010
Perl Unicode: precomposed vs. combining-character graphemes
http://www.effectiveperlprogramming.com/blog/102
-----
use charnames ':full';
# A precomposed "ä" followed by a decomposed "ä" (base letter plus a
# combining mark): three code points in total.
my $string = join '',
    "\N{LATIN SMALL LETTER A WITH DIAERESIS}",
    "\N{LATIN SMALL LETTER A}",
    "\N{COMBINING DIAERESIS}";
ää
------
# "." matches a single code point, so the combining mark is captured on its own.
my @g = ($string =~ m/(.)/g);
say scalar @g;    # 3
ä a¨
-----
# \X matches an extended grapheme cluster: a base character together with
# any combining marks that follow it.
my @g = ($string =~ m/(\X)/g);
say scalar @g;    # 2
ä ä
------
# The same visible character spelled two ways: as one precomposed code point...
my $precomposed = "\N{LATIN SMALL LETTER A WITH DIAERESIS}";
# ...and as a base letter followed by a combining mark.
my $combined = join '',
    "\N{LATIN SMALL LETTER A}",
    "\N{COMBINING DIAERESIS}";
------
# eq compares code point by code point, so the two spellings differ.
say( ($precomposed eq $combined) ? 'equal' : 'unequal' );
unequal
------
use Unicode::Normalize;
# NFC composes a base letter plus combining mark into the single precomposed
# code point, so it is the *decomposed* string that has to be normalized
# before comparing. Normalizing $precomposed would only compare that string
# with itself and demonstrate nothing.
my $postcomposed = NFC($combined);
if ($precomposed eq $postcomposed) {
    say 'equal';
}
else {
    say 'unequal';
}
equal
------
# Default layers for handles opened in this lexical scope, set with the open
# pragma. Encodings other than UTF-8 must be wrapped in :encoding(...);
# a bare ':shiftjis' or ':cp1251' is not a PerlIO layer.
use open IO  => ':encoding(UTF-8)';       # input and output; strict, unlike ':utf8'
use open OUT => ':encoding(shiftjis)';    # output only
use open IN  => ':encoding(cp1251)';      # input only
# Alternatively, name the layer per handle. open() fails silently unless its
# return value is checked.
open my $ofh, '>:encoding(UTF-8)', $filename
    or die "Cannot open '$filename' for writing: $!";
open my $ifh, '<:encoding(iso-8859-1)', $filename
    or die "Cannot open '$filename' for reading: $!";
------
But what about command line arguments?
use I18N::Langinfo qw(langinfo CODESET);
use Encode qw(decode);
# Look up the codeset name reported by langinfo, then decode each
# command-line argument with it, replacing the elements of @ARGV in place.
my $codeset = langinfo(CODESET);
$_ = decode($codeset, $_) for @ARGV;
------
------
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment