Verified Commit 69df4323 authored by Roman Lacko's avatar Roman Lacko
Browse files

Add unicode script

parent 7a47f213
Loading
Loading
Loading
Loading

unicode

0 → 100755
+767 −0
Original line number Diff line number Diff line
#!/usr/bin/env perl

use v5.30;

use utf8;
use warnings;

use experimental qw(signatures);

=encoding utf-8

=head1 NAME

C<unicode> — identify Unicode characters or print other useful info

=head1 SYNOPSIS

    unicode [COMMON OPTIONS] COMMAND [OPTIONS] [ARGUMENTS…]

Common options:

    -h                  Show synopsis and exit.
       --help           Show extended help and exit.

    -p --pager=[PAGER]  Use ‹PAGER› program to display output.
                        If ‹PAGER› is not provided, use ‹$PAGER› from environment.
    -P --no-pager       Do not use program to display output.
       --no-less

Commands and their options

    identify [--file|--argv] [--ws] [--csv=[SEP]] [@FILE|STRING]…
    encode [--file|argv] [--ws] [--as=ENCODING] [@FILE|STRING]…
    categories [KEY…]
    charblocks|charscripts [--list]
    charblocks|charscripts [--file|--argv] [--ws] (@FILE|STRING)…
    charblocks|charscripts (--get|--search) BLOCK…
    charblocks|charscripts --list-chars [--table] BLOCK…

=head1 DESCRIPTION

Depending on the command, the script provides information from Unicode
database.

=head2 C<help>

Equivalent to C<--help>.

=head2 C<identify>

    identify [OPTIONS] [@FILE|STRING]…

Aliases: C<ident>, C<id>.

Identify characters provided read from a C<@FILE> or from a C<STRING>.

    -F --file[s]        Interpret all arguments as files to read from.
    -A --argv           Interpret all arguments as literal strings.
       --ws             Display information about whitespaces as well.
       --csv=[SEP]      Display information as values separated by ‹SEP›.
                        Uses ‹;› by default.

=head2 C<encode>

    encode [OPTIONS] [@FILE|STRING]…

Aliases: C<enc>.

Display encoding of characters from provided C<@FILE> or C<STRING>.

Options C<--file>, C<--files>, C<--argv> and C<--ws> are the same as for
C<identify> command. In addition, C<encode> supports the following options:

    -E --as=ENCODING    Use given ‹ENCODING› instead of (default) UTF-8.
                        You may list available encodings with ‹iconv --list›.

=head2 C<categories>

    categories [KEY…]

Aliases: C<cat>, C<category>.

Display list of categories with their names. If arguments are provided,
only categories matching given keys are displayed.

=head2 C<charblocks>

    charblocks [OPTIONS] [ARGS…]

Aliases: C<blocks>, C<charblock>, C<block>.

Displays various information about character blocks. The behaviour depends
on provided options and arguments:

=over

=item C<charblocks [--list|-l]>

Lists all character blocks and their ranges.

=item C<charblocks [--file|--argv] [--ws] (@FILE|STRING)…>

Displays character blocks from given sources.
See C<identify> command for documentation of C<--file>, C<--argv> and C<--ws>.

=item C<charblocks (--get|-g) BLOCK…>

As C<--list>, except prints only blocks named C<BLOCK>.

=item C<charblocks (--search|-s) PATTERN…>

As C<--list>, except prints only blocks matching at least one C<PATTERN>.

=item C<charblocks (--list-chars|-x) [--table|-t] BLOCK…>

Lists characters of the given B<BLOCK>s. The format is same as for C<identify>
command, unless C<--table> is also given, in which case a table output of
characters is printed.

=back

=head2 C<charscripts>

    charscripts [OPTIONS] [ARGS…]

Aliases: C<scripts>, C<charscript>, C<script>.

This command has the same options and semantics as C<charblocks>, except it
works with character scripts instead of character blocks.

The difference is explained e.g. here:
L<Unicode::UCD|https://perldoc.perl.org/Unicode::UCD#Blocks-versus-Scripts>.

=cut

#------------------------------------------------------------------------------
package CharSource;
#------------------------------------------------------------------------------

# This is an “abstract” class for character sources.

sub new($class, $options, @) {
    return bless {
        _opt => $options,
    }, $class;
}

sub next($self) {
    ...;
}

#------------------------------------------------------------------------------
package CharSource::Empty;
#------------------------------------------------------------------------------

# Empty character source.

use parent -norequire, qw(CharSource);

sub next($self) {
    # Nop, this will return empty list to the caller.
}

#------------------------------------------------------------------------------
package CharSource::String;
#------------------------------------------------------------------------------

# Character source that provides characters from a string literal.

use Encode;

use parent -norequire, qw(CharSource);

sub new($class, $options, $string) {
    my $self = $class->SUPER::new($options);

    $self->{_str} = [ split //, $string ];

    return $self;
}

sub next($self) {
    if (!$self->{_opt}->{whitespaces}) {
        while ($self->{_str}->@* > 0 && $self->{_str}->[0] =~ /\s/) {
            shift $self->{_str}->@*;
        }
    }

    return if $self->{_str}->@* == 0;
    return shift $self->{_str}->@*;
}

#------------------------------------------------------------------------------
package CharSource::File;
#------------------------------------------------------------------------------

# Character source that provides characters from a file.

use parent -norequire, qw(CharSource);

sub new($class, $options, $filename) {
    my $self = $class->SUPER::new($options);

    open $self->{_fh}, '<:encoding(utf-8)', $filename
        or die "$filename: $!\n";

    return $self;
}

sub next($self) {
    if ($self->{_cur} && defined(my $chr = $self->{_cur}->next)) {
        return $chr;
    }

    my $line = readline $self->{_fh};
    return if !defined $line;

    $self->{_cur} = CharSource::String->new($self->{_opt}, $line);

    return $self->next;
}

#------------------------------------------------------------------------------
package CharSource::File::Armour;
#------------------------------------------------------------------------------

use parent -norequire, qw(CharSource::File);

# Same as ‹CharSource::File›, except protect against exception. It will
# behave like ‹CharSource::Empty› in that case.

sub new($class, $options, $filename) {
    my $self = eval {
        $class->SUPER::new($options, $filename);
    };

    return $self
        if !$@;

    chomp $@;
    say STDERR "$@";
    return CharSource::Empty->new($options);
}

#------------------------------------------------------------------------------
package CharSource::Dispatcher;
#------------------------------------------------------------------------------

# Class that dispatches String or File sources on provided arguments.

use Encode;

use parent -norequire, qw(CharSource);

sub new($class, $options, $argv) {
    my $self = $class->SUPER::new($options);

    $self->{_argv} = [ $argv->@* ];

    return $self;
}

sub _get_source($self, $arg) {
    if ((!$self->{_opt}->{files} && !$self->{_opt}->{argv} && $arg =~ s/^@//)
            || $self->{_opt}->{files}) {
        return CharSource::File::Armour->new($self->{_opt}, $arg);
    }

    return CharSource::String->new($self->{_opt}, Encode::decode('utf-8', $arg));
}

sub next($self) {
    if ($self->{_cur} && defined(my $chr = $self->{_cur}->next)) {
        return $chr;
    }

    my $arg = shift $self->{_argv}->@*;
    return if !defined $arg;

    $self->{_cur} = $self->_get_source($arg);
    return $self->next;
}

#------------------------------------------------------------------------------
package Pager;
#------------------------------------------------------------------------------

sub _exec_child($pager, $pipe_in, $pipe_out) {
    close $pipe_out;

    open STDIN, "<&", $pipe_in
        or die "pager: Cannot dup() standard input: $!\n";

    exec $pager
        or exit 1;
}

sub _exec_parent($pipe_in, $pipe_out) {
    close $pipe_in;
    open STDOUT, ">&", $pipe_out
        or die "pager: Cannot dup() to standard output:: $!\n";
}

sub _exec($pager) {
    my ($pipe_in, $pipe_out);
    die "pager: Cannot create pipe: $!\n"
        if !pipe $pipe_in, $pipe_out;

    my $pid = fork;
    die "pager: Cannot fork: $!\n"
        if !defined $pid;

    _exec_child($pager, $pipe_in, $pipe_out) if $pid == 0;
    _exec_parent($pipe_in, $pipe_out);
    return $pid;
}

sub new($class, $pager) {
    return bless {
        _pid => Pager::_exec($pager),
    }, $class;
}

sub DESTROY($self) {
    return if !defined $self->{_pid};

    close STDOUT;
    waitpid delete $self->{_pid}, 0;
}

#------------------------------------------------------------------------------
package main;
#------------------------------------------------------------------------------

use Data::Dumper;

use Getopt::Long qw(:config posix_default);
use List::Util qw(max);
use Unicode::UCD;
use Pod::Usage;

my $CATEGORIES = Unicode::UCD::general_categories;
my $CHAR_BLOCKS = Unicode::UCD::charblocks;

sub _get_pager($options) {
    return Pager->new($options->{pager})
        if $options->{pager};

    return Pager->new($ENV{PAGER} // 'less -R')
        if defined $options->{pager};
}

sub _get_src($options, $args) {
    return CharSource::Dispatcher->new($options, $args);
}

sub _get_display_char($char, $info) {
    return "\N{DOTTED CIRCLE}$char"
        if $info->{category} eq 'Mn';
    return "\N{OPEN BOX}"
        if $info->{category} eq 'Zs';
    return "\N{WHITE BULLET}"
        if $char !~ /\p{IsGraph}/;
    return $char;
}

sub print_char($char, $info, %o) {
    my $padding = " " x $o{padding};

    my $display = _get_display_char($char, $info);
    if ($display eq $char) {
        printf "${padding}\e[96m%s\e[0m", $display;
    } else {
        printf "${padding}\e[97m%s\e[0m", $display;
    }
}

sub _identify_char_output($options, $char, $ci, %o) {
    $o{padding} //= 0;

    my $padding = " " x $o{padding};
    my $char_offset = 6;
    my $char_shift = $o{padding} + $char_offset;

    if (!defined $ci) {
        printf "${padding}\e[97m%-3s\e[0m ", "\N{OPEN BOX}";
        print "${padding}\e[91mUnknown character\e[0m\n";
        return;
    }

    print_char($char, $ci, %o);
    printf "%sU+%s \e[92m%s\e[0m\n",
        " " x ($char_offset - 1),
        $ci->{code},
        $ci->{name};
    printf "${padding}%s\e[37m%s: %s\e[0m (\e[96m%s\e[36m:%s\e[0m)\n",
        " " x $char_offset,
        $ci->{script},
        $ci->{block},
        $ci->{category},
        $CATEGORIES->{$ci->{category}};
}

sub _identify_char_csv($options, $char, $ci) {
    if (!defined $ci) {
        printf STDERR "%08x: Unknown character\n", ord $char;
        return;
    }

    if ($char eq $options->{csv}) {
        printf STDERR "Separator ignored\n";
        return;
    }

    say join $options->{csv}, $char, $ci->@{qw(code name script block category)},
            $CATEGORIES->{$ci->{category}};
}

sub identify_char($options, $char, %o) {
    my $ord = ord $char;
    my $ci = Unicode::UCD::charinfo($ord);

    if (defined $options->{csv}) {
        _identify_char_csv($options, $char, $ci);
    } else {
        _identify_char_output($options, $char, $ci);
    }
}

sub identify_chars($options, $src) {
    my $pager = _get_pager($options);

    my $counter = 0;
    while (defined(my $char = $src->next)) {
        if ($counter++ && !exists $options->{csv}) {
            printf "\n";
        }

        identify_char($options, $char);
    }
}

sub utf8_sequences($options, $src) {
    my $pager = _get_pager($options);

    while (defined(my $char = $src->next)) {
        if ($char =~ /\p{IsGraph}/) {
            printf "\e[96m%s\e[0m\e[1G\e[4C", $char;
        } else {
            printf "\e[97m%-3s\e[0m ", "\N{OPEN BOX}";
        }

        my @bytes = unpack 'C*', Encode::encode($options->{encoding} // 'utf-8', $char);

        my $counter = 0;
        foreach my $byte (@bytes) {
            if ($counter++) {
                print " ";
            }

            printf "0x%02x", $byte;
        }

        print "\n";
    }
}

sub _maximum_code_point() {
    return max map { max($_->[0]->@[0, 1]) }
        values $CHAR_BLOCKS->%*;
}

sub _categories_short($options, $args) {
    my $pager = _get_pager($options);

    my $count = 0;
    my $max = _maximum_code_point;
    my $on_line = 0;

    for (my $ord = 0; $ord <= $max; ++$ord) {
        my $ci = Unicode::UCD::charinfo($ord)
            or next;

        if (grep { $_ eq $ci->{category}
                    || (length $_ == 1 && substr($ci->{category}, 0, 1) eq $_) }
                $args->@*) {

            printf "  U+%s \e[96m%s\e[0m", $ci->{code}, chr $ord;

            ++$count;
            if (++$on_line >= 8) {
                printf "\n";
                $on_line = 0;
            }
        }
    }

    print "\n";
    print "\e[37m$count characters displayed\n";

}

sub categories($options, $args) {
    my $pager = _get_pager($options);

    foreach my $key (sort keys $CATEGORIES->%*) {
        next if $args->@* > 0 && !grep { $_ eq $key } $args->@*;

        if (length $key == 1) {
            printf "\e[95m%-3s\e[0m \e[95m%s\e[0m\n", $key, $CATEGORIES->{$key};
        } else {
            printf "\e[94m%-3s\e[0m \e[37m%s\e[0m\n", $key, $CATEGORIES->{$key};
        }
    }
}

sub _charset_print($options, $name, $block, %o) {
    $o{padding} //= 2;

    print "\e[92m$name\e[0m\n";

    if ($o{range} // 1) {
        foreach my $rec ($block->@*) {
            printf "%sU+%04X → U+%04X\n", " " x $o{padding}, $rec->@[0, 1];
        }
    }
}

sub _charset_list($methods, $options, $args) {
    my $blocks = $methods->{list}->();

    my $re = '(' . join('|', $args->@*) . ')';

    foreach my $key (sort keys $blocks->%*) {
        if ($options->{search} && $key !~ /$re/i) {
            next;
        }

        print "\e[92m• \e[0m";
        _charset_print($options, $key, $blocks->{$key});
    }
}

sub _charset_long_ranges($options, $ranges) {
    foreach my $range ($ranges->@*) {
        printf "\e[92m  · U+%04X → U+%04X\e[0m\n", $range->@[0, 1];

        foreach my $ord ($range->[0]..$range->[1]) {
            identify_char($options, chr $ord, padding => 4);
            print "\n";
        }
    }
}

sub _charset_table_ranges($options, $ranges) {
    my $width = 3;
    my $cols = 16;

    foreach my $range ($ranges->@*) {
        printf "\e[92m  • U+%04X → U+%04X\e[0m\n", $range->@[0, 1];

        foreach my $ord ($range->[0]..$range->[1]) {
            if (($ord - $range->[0]) % $cols == 0) {
                printf "    \e[36mU+%04X\e[0m\e[37m│\e[0m", $ord;
            }

            my $ci = Unicode::UCD::charinfo($ord);

            if (!defined $ci) {
                printf "\e[91m%*s\e[0m", -$width, '×';
            } else {
                my $char = chr $ord;
                my $display = _get_display_char($char, $ci);

                if ($display eq $char) {
                    printf " %*s", -$width, $display;
                } else {
                    printf " \e[97m%*s\e[0m", -$width, $display;
                }
            }

            if (($ord - $range->[0]) % $cols == $cols - 1) {
                print "\n";
            } elsif (($ord - $range->[0]) % 8 == 7) {
                print "\e[37m│\e[0m";
            }
        }

        print "\n";
    }
}

sub _charset_list_chars($methods, $options, $args) {
    foreach my $arg ($args->@*) {
        my $ranges = $methods->{get}->($arg);

        if (!defined $ranges) {
            print "\e[91m$arg: No such character set\e[0m\n";
            next;
        }

        if ($args->@* > 1) {
            print "\e[95m◦ $arg\e[0m\n";
        }

        if ($options->{table}) {
            _charset_table_ranges($options, $ranges);
        } else {
            _charset_long_ranges($options, $ranges);
        }
    }
}

sub _charset_get($methods, $options, $args) {
    foreach my $name ($args->@*) {
        my $blocks = $methods->{get}->($name);

        if (!defined $blocks) {
            print "\e[91m$name: No such code\e[0m\n";
        } else {
            print "\e[92m• \e[0m";
            _charset_print($options, $name, $blocks);
        }
    }
}

sub _charset($methods, $options, $arg) {
    if (defined (my $name = $methods->{get}->(ord $arg))) {
        my $block = $methods->{get}->($name);

        print "\e[96m$arg\e[0m   ";
        _charset_print($options, $name, $block, padding => 4, range => 0);
    }
}

sub _charset_dispatcher($methods, $options, $args) {
    if ((grep { $options->{$_} } qw(list_chars get list search)) > 1) {
        say STDERR "Conflicting operations";
        pod2usage;
    }

    my $pager = _get_pager($options);

    return _charset_list_chars($methods, $options, $args)
        if $options->{list_chars};

    return _charset_get($methods, $options, $args)
        if $options->{get};

    return _charset_list($methods, $options, $args)
        if $options->{list} || $options->{search} || $args->@* == 0;

    my $src = _get_src($options, $args);
    while (defined(my $c = $src->next)) {
        _charset($methods, $options, $c);
    }
}

sub charblock($options, $args) {
    my $dispatcher = {
        list => \&Unicode::UCD::charblocks,
        get => \&Unicode::UCD::charblock,
    };

    _charset_dispatcher($dispatcher, $options, $args);
}

sub charscript($options, $args) {
    my $dispatcher = {
        list => \&Unicode::UCD::charscripts,
        get => \&Unicode::UCD::charscript,
    };

    _charset_dispatcher($dispatcher, $options, $args);
}

sub Options::fix_pager($options, $cmd, @regices) {
    return if defined $options->{pager} || $options->{no_pager};
    return if !-t STDOUT;

#   $options->{pager} = ''
#       if !grep { $cmd =~ $_ } @regices;
}

sub Options::fix_csv($options) {
    $options->{csv} = Encode::decode('utf-8', $options->{csv} || ';')
        if exists $options->{csv};
}

my $options = {};

binmode *STDOUT, ':encoding(utf-8)';

# Get common options. Yes, we could be using some argparse equivalent,
# but in core modules, we only have ‹Getopt::Long›. C'est la vie.
GetOptions($options, qw(
    h
    help
    pager|p:s
    no_pager|no-pager|P
)) or pod2usage;

pod2usage(-exitval => 0, -verbose => 0)
    if $options->{h} || @ARGV == 0;
pod2usage(-exitval => 0, -verbose => 1)
    if $options->{help};

my $cmd = shift @ARGV;

my $cmd_identify_regex = qr/^id(ent(ify)?)?$/;
my $cmd_encode_regex = qr/^enc(ode)?$/;

Options::fix_pager($options, $cmd, $cmd_identify_regex, $cmd_encode_regex);

if ($cmd eq 'help') {
    pod2usage(-exitval => 0, -verbose => 1);
}

if ($cmd =~ $cmd_identify_regex) {
    GetOptions($options, qw(
        argv|A
        files|file|F
        csv:s
        whitespaces|ws|w
    )) or pod2usage(-verbose => 1);

    Options::fix_csv($options);

    my $src = _get_src($options, \@ARGV);
    identify_chars($options, $src);
} elsif ($cmd =~ $cmd_encode_regex) {
    GetOptions($options, qw(
        argv|A
        files|file|F
        whitespaces|ws|w
        encoding|as|E=s
    )) or pod2usage(-verbose => 1);

    my $src = _get_src($options, \@ARGV);
    utf8_sequences($options, $src);
} elsif ($cmd =~ /^cat(egor(y|ies))?$/) {
    GetOptions($options)
        or pod2usage(-verbose => 1);

    categories($options, \@ARGV);
} elsif ($cmd =~ /^(char)?blocks?$/) {
    GetOptions($options, qw(
        argv|A
        files|file|F
        whitespaces|ws|w

        list_chars|list-chars|chars|x
        table|t
        get|g
        list|l
        search|s
    )) or pod2usage(-verbose => 1);

    Options::fix_csv($options);

    charblock($options, \@ARGV);
} elsif ($cmd =~ /^(char)?scripts?$/) {
    charscript($options, \@ARGV);
} else {
    say STDERR "$cmd: Unknown command";
    pod2usage(-verbose => 1);
}