Loading unicode 0 → 100755 +767 −0 Original line number Diff line number Diff line #!/usr/bin/env perl use v5.30; use utf8; use warnings; use experimental qw(signatures); =encoding utf-8 =head1 NAME C<unicode> — identify Unicode characters or print other useful info =head1 SYNOPSIS unicode [COMMON OPTIONS] COMMAND [OPTIONS] [ARGUMENTS…] Common options: -h Show synopsis and exit. --help Show extended help and exit. -p --pager=[PAGER] Use ‹PAGER› program to display output. If ‹PAGER› is not provided, use ‹$PAGER› from environment. -P --no-pager Do not use program to display output. --no-less Commands and their options identify [--file|--argv] [--ws] [--csv=[SEP]] [@FILE|STRING]… encode [--file|argv] [--ws] [--as=ENCODING] [@FILE|STRING]… categories [KEY…] charblocks|charscripts [--list] charblocks|charscripts [--file|--argv] [--ws] (@FILE|STRING)… charblocks|charscripts (--get|--search) BLOCK… charblocks|charscripts --list-chars [--table] BLOCK… =head1 DESCRIPTION Depending on the command, the script provides information from Unicode database. =head2 C<help> Equivalent to C<--help>. =head2 C<identify> identify [OPTIONS] [@FILE|STRING]… Aliases: C<ident>, C<id>. Identify characters provided read from a C<@FILE> or from a C<STRING>. -F --file[s] Interpret all arguments as files to read from. -A --argv Interpret all arguments as literal strings. --ws Display information about whitespaces as well. --csv=[SEP] Display information as values separated by ‹SEP›. Uses ‹;› by default. =head2 C<encode> encode [OPTIONS] [@FILE|STRING]… Aliases: C<enc>. Display encoding of characters from provided C<@FILE> or C<STRING>. Options C<--file>, C<--files>, C<--argv> and C<--ws> are the same as for C<identify> command. In addition, C<encode> supports the following options: -E --as=ENCODING Use given ‹ENCODING› instead of (default) UTF-8. You may list available encodings with ‹iconv --list›. =head2 C<categories> categories [KEY…] Aliases: C<cat>, C<category>. Display list of categories with their names. If arguments are provided, only categories matching given keys are displayed. =head2 C<charblocks> charblocks [OPTIONS] [ARGS…] Aliases: C<blocks>, C<charblock>, C<block>. Displays various information about character blocks. The behaviour depends on provided options and arguments: =over =item C<charblocks [--list|-l]> Lists all character blocks and their ranges. =item C<charblocks [--file|--argv] [--ws] (@FILE|STRING)…> Displays character blocks from given sources. See C<identify> command for documentation of C<--file>, C<--argv> and C<--ws>. =item C<charblocks (--get|-g) BLOCK…> As C<--list>, except prints only blocks named C<BLOCK>. =item C<charblocks (--search|-s) PATTERN…> As C<--list>, except prints only blocks matching at least one C<PATTERN>. =item C<charblocks (--list-chars|-x) [--table|-t] BLOCK…> Lists characters of the given B<BLOCK>s. The format is same as for C<identify> command, unless C<--table> is also given, in which case a table output of characters is printed. =back =head2 C<charscripts> charscripts [OPTIONS] [ARGS…] Aliases: C<scripts>, C<charscript>, C<script>. This command has the same options and semantics as C<charblocks>, except it works with character scripts instead of character blocks. The difference is explained e.g. here: L<Unicode::UCD|https://perldoc.perl.org/Unicode::UCD#Blocks-versus-Scripts>. =cut #------------------------------------------------------------------------------ package CharSource; #------------------------------------------------------------------------------ # This is an “abstract” class for character sources. sub new($class, $options, @) { return bless { _opt => $options, }, $class; } sub next($self) { ...; } #------------------------------------------------------------------------------ package CharSource::Empty; #------------------------------------------------------------------------------ # Empty character source. use parent -norequire, qw(CharSource); sub next($self) { # Nop, this will return empty list to the caller. } #------------------------------------------------------------------------------ package CharSource::String; #------------------------------------------------------------------------------ # Character source that provides characters from a string literal. use Encode; use parent -norequire, qw(CharSource); sub new($class, $options, $string) { my $self = $class->SUPER::new($options); $self->{_str} = [ split //, $string ]; return $self; } sub next($self) { if (!$self->{_opt}->{whitespaces}) { while ($self->{_str}->@* > 0 && $self->{_str}->[0] =~ /\s/) { shift $self->{_str}->@*; } } return if $self->{_str}->@* == 0; return shift $self->{_str}->@*; } #------------------------------------------------------------------------------ package CharSource::File; #------------------------------------------------------------------------------ # Character source that provides characters from a file. use parent -norequire, qw(CharSource); sub new($class, $options, $filename) { my $self = $class->SUPER::new($options); open $self->{_fh}, '<:encoding(utf-8)', $filename or die "$filename: $!\n"; return $self; } sub next($self) { if ($self->{_cur} && defined(my $chr = $self->{_cur}->next)) { return $chr; } my $line = readline $self->{_fh}; return if !defined $line; $self->{_cur} = CharSource::String->new($self->{_opt}, $line); return $self->next; } #------------------------------------------------------------------------------ package CharSource::File::Armour; #------------------------------------------------------------------------------ use parent -norequire, qw(CharSource::File); # Same as ‹CharSource::File›, except protect against exception. It will # behave like ‹CharSource::Empty› in that case. sub new($class, $options, $filename) { my $self = eval { $class->SUPER::new($options, $filename); }; return $self if !$@; chomp $@; say STDERR "$@"; return CharSource::Empty->new($options); } #------------------------------------------------------------------------------ package CharSource::Dispatcher; #------------------------------------------------------------------------------ # Class that dispatches String or File sources on provided arguments. use Encode; use parent -norequire, qw(CharSource); sub new($class, $options, $argv) { my $self = $class->SUPER::new($options); $self->{_argv} = [ $argv->@* ]; return $self; } sub _get_source($self, $arg) { if ((!$self->{_opt}->{files} && !$self->{_opt}->{argv} && $arg =~ s/^@//) || $self->{_opt}->{files}) { return CharSource::File::Armour->new($self->{_opt}, $arg); } return CharSource::String->new($self->{_opt}, Encode::decode('utf-8', $arg)); } sub next($self) { if ($self->{_cur} && defined(my $chr = $self->{_cur}->next)) { return $chr; } my $arg = shift $self->{_argv}->@*; return if !defined $arg; $self->{_cur} = $self->_get_source($arg); return $self->next; } #------------------------------------------------------------------------------ package Pager; #------------------------------------------------------------------------------ sub _exec_child($pager, $pipe_in, $pipe_out) { close $pipe_out; open STDIN, "<&", $pipe_in or die "pager: Cannot dup() standard input: $!\n"; exec $pager or exit 1; } sub _exec_parent($pipe_in, $pipe_out) { close $pipe_in; open STDOUT, ">&", $pipe_out or die "pager: Cannot dup() to standard output:: $!\n"; } sub _exec($pager) { my ($pipe_in, $pipe_out); die "pager: Cannot create pipe: $!\n" if !pipe $pipe_in, $pipe_out; my $pid = fork; die "pager: Cannot fork: $!\n" if !defined $pid; _exec_child($pager, $pipe_in, $pipe_out) if $pid == 0; _exec_parent($pipe_in, $pipe_out); return $pid; } sub new($class, $pager) { return bless { _pid => Pager::_exec($pager), }, $class; } sub DESTROY($self) { return if !defined $self->{_pid}; close STDOUT; waitpid delete $self->{_pid}, 0; } #------------------------------------------------------------------------------ package main; #------------------------------------------------------------------------------ use Data::Dumper; use Getopt::Long qw(:config posix_default); use List::Util qw(max); use Unicode::UCD; use Pod::Usage; my $CATEGORIES = Unicode::UCD::general_categories; my $CHAR_BLOCKS = Unicode::UCD::charblocks; sub _get_pager($options) { return Pager->new($options->{pager}) if $options->{pager}; return Pager->new($ENV{PAGER} // 'less -R') if defined $options->{pager}; } sub _get_src($options, $args) { return CharSource::Dispatcher->new($options, $args); } sub _get_display_char($char, $info) { return "\N{DOTTED CIRCLE}$char" if $info->{category} eq 'Mn'; return "\N{OPEN BOX}" if $info->{category} eq 'Zs'; return "\N{WHITE BULLET}" if $char !~ /\p{IsGraph}/; return $char; } sub print_char($char, $info, %o) { my $padding = " " x $o{padding}; my $display = _get_display_char($char, $info); if ($display eq $char) { printf "${padding}\e[96m%s\e[0m", $display; } else { printf "${padding}\e[97m%s\e[0m", $display; } } sub _identify_char_output($options, $char, $ci, %o) { $o{padding} //= 0; my $padding = " " x $o{padding}; my $char_offset = 6; my $char_shift = $o{padding} + $char_offset; if (!defined $ci) { printf "${padding}\e[97m%-3s\e[0m ", "\N{OPEN BOX}"; print "${padding}\e[91mUnknown character\e[0m\n"; return; } print_char($char, $ci, %o); printf "%sU+%s \e[92m%s\e[0m\n", " " x ($char_offset - 1), $ci->{code}, $ci->{name}; printf "${padding}%s\e[37m%s: %s\e[0m (\e[96m%s\e[36m:%s\e[0m)\n", " " x $char_offset, $ci->{script}, $ci->{block}, $ci->{category}, $CATEGORIES->{$ci->{category}}; } sub _identify_char_csv($options, $char, $ci) { if (!defined $ci) { printf STDERR "%08x: Unknown character\n", ord $char; return; } if ($char eq $options->{csv}) { printf STDERR "Separator ignored\n"; return; } say join $options->{csv}, $char, $ci->@{qw(code name script block category)}, $CATEGORIES->{$ci->{category}}; } sub identify_char($options, $char, %o) { my $ord = ord $char; my $ci = Unicode::UCD::charinfo($ord); if (defined $options->{csv}) { _identify_char_csv($options, $char, $ci); } else { _identify_char_output($options, $char, $ci); } } sub identify_chars($options, $src) { my $pager = _get_pager($options); my $counter = 0; while (defined(my $char = $src->next)) { if ($counter++ && !exists $options->{csv}) { printf "\n"; } identify_char($options, $char); } } sub utf8_sequences($options, $src) { my $pager = _get_pager($options); while (defined(my $char = $src->next)) { if ($char =~ /\p{IsGraph}/) { printf "\e[96m%s\e[0m\e[1G\e[4C", $char; } else { printf "\e[97m%-3s\e[0m ", "\N{OPEN BOX}"; } my @bytes = unpack 'C*', Encode::encode($options->{encoding} // 'utf-8', $char); my $counter = 0; foreach my $byte (@bytes) { if ($counter++) { print " "; } printf "0x%02x", $byte; } print "\n"; } } sub _maximum_code_point() { return max map { max($_->[0]->@[0, 1]) } values $CHAR_BLOCKS->%*; } sub _categories_short($options, $args) { my $pager = _get_pager($options); my $count = 0; my $max = _maximum_code_point; my $on_line = 0; for (my $ord = 0; $ord <= $max; ++$ord) { my $ci = Unicode::UCD::charinfo($ord) or next; if (grep { $_ eq $ci->{category} || (length $_ == 1 && substr($ci->{category}, 0, 1) eq $_) } $args->@*) { printf " U+%s \e[96m%s\e[0m", $ci->{code}, chr $ord; ++$count; if (++$on_line >= 8) { printf "\n"; $on_line = 0; } } } print "\n"; print "\e[37m$count characters displayed\n"; } sub categories($options, $args) { my $pager = _get_pager($options); foreach my $key (sort keys $CATEGORIES->%*) { next if $args->@* > 0 && !grep { $_ eq $key } $args->@*; if (length $key == 1) { printf "\e[95m%-3s\e[0m \e[95m%s\e[0m\n", $key, $CATEGORIES->{$key}; } else { printf "\e[94m%-3s\e[0m \e[37m%s\e[0m\n", $key, $CATEGORIES->{$key}; } } } sub _charset_print($options, $name, $block, %o) { $o{padding} //= 2; print "\e[92m$name\e[0m\n"; if ($o{range} // 1) { foreach my $rec ($block->@*) { printf "%sU+%04X → U+%04X\n", " " x $o{padding}, $rec->@[0, 1]; } } } sub _charset_list($methods, $options, $args) { my $blocks = $methods->{list}->(); my $re = '(' . join('|', $args->@*) . ')'; foreach my $key (sort keys $blocks->%*) { if ($options->{search} && $key !~ /$re/i) { next; } print "\e[92m• \e[0m"; _charset_print($options, $key, $blocks->{$key}); } } sub _charset_long_ranges($options, $ranges) { foreach my $range ($ranges->@*) { printf "\e[92m · U+%04X → U+%04X\e[0m\n", $range->@[0, 1]; foreach my $ord ($range->[0]..$range->[1]) { identify_char($options, chr $ord, padding => 4); print "\n"; } } } sub _charset_table_ranges($options, $ranges) { my $width = 3; my $cols = 16; foreach my $range ($ranges->@*) { printf "\e[92m • U+%04X → U+%04X\e[0m\n", $range->@[0, 1]; foreach my $ord ($range->[0]..$range->[1]) { if (($ord - $range->[0]) % $cols == 0) { printf " \e[36mU+%04X\e[0m\e[37m│\e[0m", $ord; } my $ci = Unicode::UCD::charinfo($ord); if (!defined $ci) { printf "\e[91m%*s\e[0m", -$width, '×'; } else { my $char = chr $ord; my $display = _get_display_char($char, $ci); if ($display eq $char) { printf " %*s", -$width, $display; } else { printf " \e[97m%*s\e[0m", -$width, $display; } } if (($ord - $range->[0]) % $cols == $cols - 1) { print "\n"; } elsif (($ord - $range->[0]) % 8 == 7) { print "\e[37m│\e[0m"; } } print "\n"; } } sub _charset_list_chars($methods, $options, $args) { foreach my $arg ($args->@*) { my $ranges = $methods->{get}->($arg); if (!defined $ranges) { print "\e[91m$arg: No such character set\e[0m\n"; next; } if ($args->@* > 1) { print "\e[95m◦ $arg\e[0m\n"; } if ($options->{table}) { _charset_table_ranges($options, $ranges); } else { _charset_long_ranges($options, $ranges); } } } sub _charset_get($methods, $options, $args) { foreach my $name ($args->@*) { my $blocks = $methods->{get}->($name); if (!defined $blocks) { print "\e[91m$name: No such code\e[0m\n"; } else { print "\e[92m• \e[0m"; _charset_print($options, $name, $blocks); } } } sub _charset($methods, $options, $arg) { if (defined (my $name = $methods->{get}->(ord $arg))) { my $block = $methods->{get}->($name); print "\e[96m$arg\e[0m "; _charset_print($options, $name, $block, padding => 4, range => 0); } } sub _charset_dispatcher($methods, $options, $args) { if ((grep { $options->{$_} } qw(list_chars get list search)) > 1) { say STDERR "Conflicting operations"; pod2usage; } my $pager = _get_pager($options); return _charset_list_chars($methods, $options, $args) if $options->{list_chars}; return _charset_get($methods, $options, $args) if $options->{get}; return _charset_list($methods, $options, $args) if $options->{list} || $options->{search} || $args->@* == 0; my $src = _get_src($options, $args); while (defined(my $c = $src->next)) { _charset($methods, $options, $c); } } sub charblock($options, $args) { my $dispatcher = { list => \&Unicode::UCD::charblocks, get => \&Unicode::UCD::charblock, }; _charset_dispatcher($dispatcher, $options, $args); } sub charscript($options, $args) { my $dispatcher = { list => \&Unicode::UCD::charscripts, get => \&Unicode::UCD::charscript, }; _charset_dispatcher($dispatcher, $options, $args); } sub Options::fix_pager($options, $cmd, @regices) { return if defined $options->{pager} || $options->{no_pager}; return if !-t STDOUT; # $options->{pager} = '' # if !grep { $cmd =~ $_ } @regices; } sub Options::fix_csv($options) { $options->{csv} = Encode::decode('utf-8', $options->{csv} || ';') if exists $options->{csv}; } my $options = {}; binmode *STDOUT, ':encoding(utf-8)'; # Get common options. Yes, we could be using some argparse equivalent, # but in core modules, we only have ‹Getopt::Long›. C'est la vie. GetOptions($options, qw( h help pager|p:s no_pager|no-pager|P )) or pod2usage; pod2usage(-exitval => 0, -verbose => 0) if $options->{h} || @ARGV == 0; pod2usage(-exitval => 0, -verbose => 1) if $options->{help}; my $cmd = shift @ARGV; my $cmd_identify_regex = qr/^id(ent(ify)?)?$/; my $cmd_encode_regex = qr/^enc(ode)?$/; Options::fix_pager($options, $cmd, $cmd_identify_regex, $cmd_encode_regex); if ($cmd eq 'help') { pod2usage(-exitval => 0, -verbose => 1); } if ($cmd =~ $cmd_identify_regex) { GetOptions($options, qw( argv|A files|file|F csv:s whitespaces|ws|w )) or pod2usage(-verbose => 1); Options::fix_csv($options); my $src = _get_src($options, \@ARGV); identify_chars($options, $src); } elsif ($cmd =~ $cmd_encode_regex) { GetOptions($options, qw( argv|A files|file|F whitespaces|ws|w encoding|as|E=s )) or pod2usage(-verbose => 1); my $src = _get_src($options, \@ARGV); utf8_sequences($options, $src); } elsif ($cmd =~ /^cat(egor(y|ies))?$/) { GetOptions($options) or pod2usage(-verbose => 1); categories($options, \@ARGV); } elsif ($cmd =~ /^(char)?blocks?$/) { GetOptions($options, qw( argv|A files|file|F whitespaces|ws|w list_chars|list-chars|chars|x table|t get|g list|l search|s )) or pod2usage(-verbose => 1); Options::fix_csv($options); charblock($options, \@ARGV); } elsif ($cmd =~ /^(char)?scripts?$/) { charscript($options, \@ARGV); } else { say STDERR "$cmd: Unknown command"; pod2usage(-verbose => 1); } Loading
unicode 0 → 100755 +767 −0 Original line number Diff line number Diff line #!/usr/bin/env perl use v5.30; use utf8; use warnings; use experimental qw(signatures); =encoding utf-8 =head1 NAME C<unicode> — identify Unicode characters or print other useful info =head1 SYNOPSIS unicode [COMMON OPTIONS] COMMAND [OPTIONS] [ARGUMENTS…] Common options: -h Show synopsis and exit. --help Show extended help and exit. -p --pager=[PAGER] Use ‹PAGER› program to display output. If ‹PAGER› is not provided, use ‹$PAGER› from environment. -P --no-pager Do not use program to display output. --no-less Commands and their options identify [--file|--argv] [--ws] [--csv=[SEP]] [@FILE|STRING]… encode [--file|argv] [--ws] [--as=ENCODING] [@FILE|STRING]… categories [KEY…] charblocks|charscripts [--list] charblocks|charscripts [--file|--argv] [--ws] (@FILE|STRING)… charblocks|charscripts (--get|--search) BLOCK… charblocks|charscripts --list-chars [--table] BLOCK… =head1 DESCRIPTION Depending on the command, the script provides information from Unicode database. =head2 C<help> Equivalent to C<--help>. =head2 C<identify> identify [OPTIONS] [@FILE|STRING]… Aliases: C<ident>, C<id>. Identify characters provided read from a C<@FILE> or from a C<STRING>. -F --file[s] Interpret all arguments as files to read from. -A --argv Interpret all arguments as literal strings. --ws Display information about whitespaces as well. --csv=[SEP] Display information as values separated by ‹SEP›. Uses ‹;› by default. =head2 C<encode> encode [OPTIONS] [@FILE|STRING]… Aliases: C<enc>. Display encoding of characters from provided C<@FILE> or C<STRING>. Options C<--file>, C<--files>, C<--argv> and C<--ws> are the same as for C<identify> command. In addition, C<encode> supports the following options: -E --as=ENCODING Use given ‹ENCODING› instead of (default) UTF-8. You may list available encodings with ‹iconv --list›. =head2 C<categories> categories [KEY…] Aliases: C<cat>, C<category>. Display list of categories with their names. If arguments are provided, only categories matching given keys are displayed. =head2 C<charblocks> charblocks [OPTIONS] [ARGS…] Aliases: C<blocks>, C<charblock>, C<block>. Displays various information about character blocks. The behaviour depends on provided options and arguments: =over =item C<charblocks [--list|-l]> Lists all character blocks and their ranges. =item C<charblocks [--file|--argv] [--ws] (@FILE|STRING)…> Displays character blocks from given sources. See C<identify> command for documentation of C<--file>, C<--argv> and C<--ws>. =item C<charblocks (--get|-g) BLOCK…> As C<--list>, except prints only blocks named C<BLOCK>. =item C<charblocks (--search|-s) PATTERN…> As C<--list>, except prints only blocks matching at least one C<PATTERN>. =item C<charblocks (--list-chars|-x) [--table|-t] BLOCK…> Lists characters of the given B<BLOCK>s. The format is same as for C<identify> command, unless C<--table> is also given, in which case a table output of characters is printed. =back =head2 C<charscripts> charscripts [OPTIONS] [ARGS…] Aliases: C<scripts>, C<charscript>, C<script>. This command has the same options and semantics as C<charblocks>, except it works with character scripts instead of character blocks. The difference is explained e.g. here: L<Unicode::UCD|https://perldoc.perl.org/Unicode::UCD#Blocks-versus-Scripts>. =cut #------------------------------------------------------------------------------ package CharSource; #------------------------------------------------------------------------------ # This is an “abstract” class for character sources. sub new($class, $options, @) { return bless { _opt => $options, }, $class; } sub next($self) { ...; } #------------------------------------------------------------------------------ package CharSource::Empty; #------------------------------------------------------------------------------ # Empty character source. use parent -norequire, qw(CharSource); sub next($self) { # Nop, this will return empty list to the caller. } #------------------------------------------------------------------------------ package CharSource::String; #------------------------------------------------------------------------------ # Character source that provides characters from a string literal. use Encode; use parent -norequire, qw(CharSource); sub new($class, $options, $string) { my $self = $class->SUPER::new($options); $self->{_str} = [ split //, $string ]; return $self; } sub next($self) { if (!$self->{_opt}->{whitespaces}) { while ($self->{_str}->@* > 0 && $self->{_str}->[0] =~ /\s/) { shift $self->{_str}->@*; } } return if $self->{_str}->@* == 0; return shift $self->{_str}->@*; } #------------------------------------------------------------------------------ package CharSource::File; #------------------------------------------------------------------------------ # Character source that provides characters from a file. use parent -norequire, qw(CharSource); sub new($class, $options, $filename) { my $self = $class->SUPER::new($options); open $self->{_fh}, '<:encoding(utf-8)', $filename or die "$filename: $!\n"; return $self; } sub next($self) { if ($self->{_cur} && defined(my $chr = $self->{_cur}->next)) { return $chr; } my $line = readline $self->{_fh}; return if !defined $line; $self->{_cur} = CharSource::String->new($self->{_opt}, $line); return $self->next; } #------------------------------------------------------------------------------ package CharSource::File::Armour; #------------------------------------------------------------------------------ use parent -norequire, qw(CharSource::File); # Same as ‹CharSource::File›, except protect against exception. It will # behave like ‹CharSource::Empty› in that case. sub new($class, $options, $filename) { my $self = eval { $class->SUPER::new($options, $filename); }; return $self if !$@; chomp $@; say STDERR "$@"; return CharSource::Empty->new($options); } #------------------------------------------------------------------------------ package CharSource::Dispatcher; #------------------------------------------------------------------------------ # Class that dispatches String or File sources on provided arguments. use Encode; use parent -norequire, qw(CharSource); sub new($class, $options, $argv) { my $self = $class->SUPER::new($options); $self->{_argv} = [ $argv->@* ]; return $self; } sub _get_source($self, $arg) { if ((!$self->{_opt}->{files} && !$self->{_opt}->{argv} && $arg =~ s/^@//) || $self->{_opt}->{files}) { return CharSource::File::Armour->new($self->{_opt}, $arg); } return CharSource::String->new($self->{_opt}, Encode::decode('utf-8', $arg)); } sub next($self) { if ($self->{_cur} && defined(my $chr = $self->{_cur}->next)) { return $chr; } my $arg = shift $self->{_argv}->@*; return if !defined $arg; $self->{_cur} = $self->_get_source($arg); return $self->next; } #------------------------------------------------------------------------------ package Pager; #------------------------------------------------------------------------------ sub _exec_child($pager, $pipe_in, $pipe_out) { close $pipe_out; open STDIN, "<&", $pipe_in or die "pager: Cannot dup() standard input: $!\n"; exec $pager or exit 1; } sub _exec_parent($pipe_in, $pipe_out) { close $pipe_in; open STDOUT, ">&", $pipe_out or die "pager: Cannot dup() to standard output:: $!\n"; } sub _exec($pager) { my ($pipe_in, $pipe_out); die "pager: Cannot create pipe: $!\n" if !pipe $pipe_in, $pipe_out; my $pid = fork; die "pager: Cannot fork: $!\n" if !defined $pid; _exec_child($pager, $pipe_in, $pipe_out) if $pid == 0; _exec_parent($pipe_in, $pipe_out); return $pid; } sub new($class, $pager) { return bless { _pid => Pager::_exec($pager), }, $class; } sub DESTROY($self) { return if !defined $self->{_pid}; close STDOUT; waitpid delete $self->{_pid}, 0; } #------------------------------------------------------------------------------ package main; #------------------------------------------------------------------------------ use Data::Dumper; use Getopt::Long qw(:config posix_default); use List::Util qw(max); use Unicode::UCD; use Pod::Usage; my $CATEGORIES = Unicode::UCD::general_categories; my $CHAR_BLOCKS = Unicode::UCD::charblocks; sub _get_pager($options) { return Pager->new($options->{pager}) if $options->{pager}; return Pager->new($ENV{PAGER} // 'less -R') if defined $options->{pager}; } sub _get_src($options, $args) { return CharSource::Dispatcher->new($options, $args); } sub _get_display_char($char, $info) { return "\N{DOTTED CIRCLE}$char" if $info->{category} eq 'Mn'; return "\N{OPEN BOX}" if $info->{category} eq 'Zs'; return "\N{WHITE BULLET}" if $char !~ /\p{IsGraph}/; return $char; } sub print_char($char, $info, %o) { my $padding = " " x $o{padding}; my $display = _get_display_char($char, $info); if ($display eq $char) { printf "${padding}\e[96m%s\e[0m", $display; } else { printf "${padding}\e[97m%s\e[0m", $display; } } sub _identify_char_output($options, $char, $ci, %o) { $o{padding} //= 0; my $padding = " " x $o{padding}; my $char_offset = 6; my $char_shift = $o{padding} + $char_offset; if (!defined $ci) { printf "${padding}\e[97m%-3s\e[0m ", "\N{OPEN BOX}"; print "${padding}\e[91mUnknown character\e[0m\n"; return; } print_char($char, $ci, %o); printf "%sU+%s \e[92m%s\e[0m\n", " " x ($char_offset - 1), $ci->{code}, $ci->{name}; printf "${padding}%s\e[37m%s: %s\e[0m (\e[96m%s\e[36m:%s\e[0m)\n", " " x $char_offset, $ci->{script}, $ci->{block}, $ci->{category}, $CATEGORIES->{$ci->{category}}; } sub _identify_char_csv($options, $char, $ci) { if (!defined $ci) { printf STDERR "%08x: Unknown character\n", ord $char; return; } if ($char eq $options->{csv}) { printf STDERR "Separator ignored\n"; return; } say join $options->{csv}, $char, $ci->@{qw(code name script block category)}, $CATEGORIES->{$ci->{category}}; } sub identify_char($options, $char, %o) { my $ord = ord $char; my $ci = Unicode::UCD::charinfo($ord); if (defined $options->{csv}) { _identify_char_csv($options, $char, $ci); } else { _identify_char_output($options, $char, $ci); } } sub identify_chars($options, $src) { my $pager = _get_pager($options); my $counter = 0; while (defined(my $char = $src->next)) { if ($counter++ && !exists $options->{csv}) { printf "\n"; } identify_char($options, $char); } } sub utf8_sequences($options, $src) { my $pager = _get_pager($options); while (defined(my $char = $src->next)) { if ($char =~ /\p{IsGraph}/) { printf "\e[96m%s\e[0m\e[1G\e[4C", $char; } else { printf "\e[97m%-3s\e[0m ", "\N{OPEN BOX}"; } my @bytes = unpack 'C*', Encode::encode($options->{encoding} // 'utf-8', $char); my $counter = 0; foreach my $byte (@bytes) { if ($counter++) { print " "; } printf "0x%02x", $byte; } print "\n"; } } sub _maximum_code_point() { return max map { max($_->[0]->@[0, 1]) } values $CHAR_BLOCKS->%*; } sub _categories_short($options, $args) { my $pager = _get_pager($options); my $count = 0; my $max = _maximum_code_point; my $on_line = 0; for (my $ord = 0; $ord <= $max; ++$ord) { my $ci = Unicode::UCD::charinfo($ord) or next; if (grep { $_ eq $ci->{category} || (length $_ == 1 && substr($ci->{category}, 0, 1) eq $_) } $args->@*) { printf " U+%s \e[96m%s\e[0m", $ci->{code}, chr $ord; ++$count; if (++$on_line >= 8) { printf "\n"; $on_line = 0; } } } print "\n"; print "\e[37m$count characters displayed\n"; } sub categories($options, $args) { my $pager = _get_pager($options); foreach my $key (sort keys $CATEGORIES->%*) { next if $args->@* > 0 && !grep { $_ eq $key } $args->@*; if (length $key == 1) { printf "\e[95m%-3s\e[0m \e[95m%s\e[0m\n", $key, $CATEGORIES->{$key}; } else { printf "\e[94m%-3s\e[0m \e[37m%s\e[0m\n", $key, $CATEGORIES->{$key}; } } } sub _charset_print($options, $name, $block, %o) { $o{padding} //= 2; print "\e[92m$name\e[0m\n"; if ($o{range} // 1) { foreach my $rec ($block->@*) { printf "%sU+%04X → U+%04X\n", " " x $o{padding}, $rec->@[0, 1]; } } } sub _charset_list($methods, $options, $args) { my $blocks = $methods->{list}->(); my $re = '(' . join('|', $args->@*) . ')'; foreach my $key (sort keys $blocks->%*) { if ($options->{search} && $key !~ /$re/i) { next; } print "\e[92m• \e[0m"; _charset_print($options, $key, $blocks->{$key}); } } sub _charset_long_ranges($options, $ranges) { foreach my $range ($ranges->@*) { printf "\e[92m · U+%04X → U+%04X\e[0m\n", $range->@[0, 1]; foreach my $ord ($range->[0]..$range->[1]) { identify_char($options, chr $ord, padding => 4); print "\n"; } } } sub _charset_table_ranges($options, $ranges) { my $width = 3; my $cols = 16; foreach my $range ($ranges->@*) { printf "\e[92m • U+%04X → U+%04X\e[0m\n", $range->@[0, 1]; foreach my $ord ($range->[0]..$range->[1]) { if (($ord - $range->[0]) % $cols == 0) { printf " \e[36mU+%04X\e[0m\e[37m│\e[0m", $ord; } my $ci = Unicode::UCD::charinfo($ord); if (!defined $ci) { printf "\e[91m%*s\e[0m", -$width, '×'; } else { my $char = chr $ord; my $display = _get_display_char($char, $ci); if ($display eq $char) { printf " %*s", -$width, $display; } else { printf " \e[97m%*s\e[0m", -$width, $display; } } if (($ord - $range->[0]) % $cols == $cols - 1) { print "\n"; } elsif (($ord - $range->[0]) % 8 == 7) { print "\e[37m│\e[0m"; } } print "\n"; } } sub _charset_list_chars($methods, $options, $args) { foreach my $arg ($args->@*) { my $ranges = $methods->{get}->($arg); if (!defined $ranges) { print "\e[91m$arg: No such character set\e[0m\n"; next; } if ($args->@* > 1) { print "\e[95m◦ $arg\e[0m\n"; } if ($options->{table}) { _charset_table_ranges($options, $ranges); } else { _charset_long_ranges($options, $ranges); } } } sub _charset_get($methods, $options, $args) { foreach my $name ($args->@*) { my $blocks = $methods->{get}->($name); if (!defined $blocks) { print "\e[91m$name: No such code\e[0m\n"; } else { print "\e[92m• \e[0m"; _charset_print($options, $name, $blocks); } } } sub _charset($methods, $options, $arg) { if (defined (my $name = $methods->{get}->(ord $arg))) { my $block = $methods->{get}->($name); print "\e[96m$arg\e[0m "; _charset_print($options, $name, $block, padding => 4, range => 0); } } sub _charset_dispatcher($methods, $options, $args) { if ((grep { $options->{$_} } qw(list_chars get list search)) > 1) { say STDERR "Conflicting operations"; pod2usage; } my $pager = _get_pager($options); return _charset_list_chars($methods, $options, $args) if $options->{list_chars}; return _charset_get($methods, $options, $args) if $options->{get}; return _charset_list($methods, $options, $args) if $options->{list} || $options->{search} || $args->@* == 0; my $src = _get_src($options, $args); while (defined(my $c = $src->next)) { _charset($methods, $options, $c); } } sub charblock($options, $args) { my $dispatcher = { list => \&Unicode::UCD::charblocks, get => \&Unicode::UCD::charblock, }; _charset_dispatcher($dispatcher, $options, $args); } sub charscript($options, $args) { my $dispatcher = { list => \&Unicode::UCD::charscripts, get => \&Unicode::UCD::charscript, }; _charset_dispatcher($dispatcher, $options, $args); } sub Options::fix_pager($options, $cmd, @regices) { return if defined $options->{pager} || $options->{no_pager}; return if !-t STDOUT; # $options->{pager} = '' # if !grep { $cmd =~ $_ } @regices; } sub Options::fix_csv($options) { $options->{csv} = Encode::decode('utf-8', $options->{csv} || ';') if exists $options->{csv}; } my $options = {}; binmode *STDOUT, ':encoding(utf-8)'; # Get common options. Yes, we could be using some argparse equivalent, # but in core modules, we only have ‹Getopt::Long›. C'est la vie. GetOptions($options, qw( h help pager|p:s no_pager|no-pager|P )) or pod2usage; pod2usage(-exitval => 0, -verbose => 0) if $options->{h} || @ARGV == 0; pod2usage(-exitval => 0, -verbose => 1) if $options->{help}; my $cmd = shift @ARGV; my $cmd_identify_regex = qr/^id(ent(ify)?)?$/; my $cmd_encode_regex = qr/^enc(ode)?$/; Options::fix_pager($options, $cmd, $cmd_identify_regex, $cmd_encode_regex); if ($cmd eq 'help') { pod2usage(-exitval => 0, -verbose => 1); } if ($cmd =~ $cmd_identify_regex) { GetOptions($options, qw( argv|A files|file|F csv:s whitespaces|ws|w )) or pod2usage(-verbose => 1); Options::fix_csv($options); my $src = _get_src($options, \@ARGV); identify_chars($options, $src); } elsif ($cmd =~ $cmd_encode_regex) { GetOptions($options, qw( argv|A files|file|F whitespaces|ws|w encoding|as|E=s )) or pod2usage(-verbose => 1); my $src = _get_src($options, \@ARGV); utf8_sequences($options, $src); } elsif ($cmd =~ /^cat(egor(y|ies))?$/) { GetOptions($options) or pod2usage(-verbose => 1); categories($options, \@ARGV); } elsif ($cmd =~ /^(char)?blocks?$/) { GetOptions($options, qw( argv|A files|file|F whitespaces|ws|w list_chars|list-chars|chars|x table|t get|g list|l search|s )) or pod2usage(-verbose => 1); Options::fix_csv($options); charblock($options, \@ARGV); } elsif ($cmd =~ /^(char)?scripts?$/) { charscript($options, \@ARGV); } else { say STDERR "$cmd: Unknown command"; pod2usage(-verbose => 1); }