#!/usr/bin/env perl

##
## cdif: word context diff
##
## Copyright (c) 1992- Kazumasa Utashiro
##
## Original version on Mar 11 1992
##

use v5.14;
use warnings;

use utf8;
use Carp;
use Encode;
use Encode::Guess;

use Pod::Usage;
use List::Util qw(first sum pairmap);
use Text::ParseWords qw(shellwords);
use Text::VisualWidth::PP qw(vwidth);
use Data::Dumper; {
    no warnings 'redefine';
    *Data::Dumper::qquote = sub { qq["${\(shift)}"] };
    $Data::Dumper::Terse = 1;
    $Data::Dumper::Sortkeys = 1;
    $Data::Dumper::Useperl = 1;
}

use App::sdif;
our $VERSION = $App::sdif::VERSION;

use App::sdif::Util;
use App::cdif::Command::Mecab;

##
## options
##

use charnames ':full';
my %visible = (
    nul => [ 1, "\000" => "\N{SYMBOL FOR NULL}" ],
    soh => [ 1, "\001" => "\N{SYMBOL FOR START OF HEADING}" ],
    bel => [ 1, "\007" => "\N{SYMBOL FOR BELL}" ],
    bs  => [ 1, "\010" => "\N{SYMBOL FOR BACKSPACE}" ],
    ht  => [ 0, "\011" => "\N{SYMBOL FOR HORIZONTAL TABULATION}" ],
    nl  => [ 0, "\012" => "\N{SYMBOL FOR NEWLINE}" . "\n" ],
    vt  => [ 1, "\013" => "\N{SYMBOL FOR VERTICAL TABULATION}" ],
    np  => [ 1, "\014" => "\N{SYMBOL FOR FORM FEED}" ],
    cr  => [ 1, "\015" => "\N{SYMBOL FOR CARRIAGE RETURN}" ],
    esc => [ 1, "\033" => "\N{SYMBOL FOR ESCAPE}" ],
    sp  => [ 0, "\040" => "\N{SYMBOL FOR SPACE}" ],
    del => [ 1, "\177" => "\N{SYMBOL FOR DELETE}" ],
    );
my %opt_visible = do {
    map { $_ => splice @{$visible{$_}}, 0, 1 }
    keys %visible;
};

my @rcsopt;
my @diffopts;
my @sub_diffopts;

binmode STDOUT, ":encoding(utf8)";
binmode STDERR, ":encoding(utf8)";

if (my $env = $ENV{'CDIFOPTS'}) {
    unshift(@ARGV, shellwords($env));
}

my $unit_default = 'word';
my $prefix_default = q/(?:\\| )*(?:  )*/;

my $app;

use Getopt::EX::Hashed; {

    Getopt::EX::Hashed->configure( DEFAULT => [ is => 'rw' ] ) ;

    has help           => '        ' ;
    has version        => '        ' ;
    has man            => '        ' ;
    has debug          => ' d  =s@ ' , default => [] ;
    has diff           => '    =s  ' ;
    has subdiff        => '    =s  ' , default => 'diff' ;
    has color          => '    !   ' , default => 1 ;
    has 256            => '    !   ' , default => 1 ;
    has commandcolor   => ' cc !   ' , default => 1 ;
    has markcolor      => ' mc !   ' , default => 1 ;
    has textcolor      => ' tc !   ' , default => 1 ;
    has unknowncolor   => ' uc !   ' , default => 1 ;
    has colormap       => ' cm =s@ ' , default => [] ;
    has colordump      => '        ' ;
    has sdif           => '        ' ;
    has stat           => '    !   ' ;
    has unit           => ' by :s  ' , default => $unit_default ,
				     , any => [ qw(char word letter mecab 0), '' ];
    has show_old       => '    !   ' , default => 1 , alias => 'old' ;
    has show_new       => '    !   ' , default => 1 , alias => 'new' ;
    has show_mrg       => '    !   ' , default => 1 , alias => 'mrg' ;
    has command        => '    !   ' , default => 1 ;
    has unknown        => '    !   ' , default => 1 ;
    has mark           => '    !   ' , default => 1 ;
    has prefix         => '    !   ' , default => 1 ;
    has prefix_pattern => '    =s  ' , default => $prefix_default ;
    has visible        => '    =s@ ' , default => [] ;
    has lenience       => '    !   ' , default => 1 ;
    has linebyline     => '    :2  ' , alias => 'lxl';
    has style          => '    =s  ' , default => '' ;

    has ignore_case         => 'i' ;
    has ignore_space_change => 'b' ;
    has ignore_all_space    => 'w' ;
    has expand_tabs         => 't' ;

    has rcs                 => '' ;

    has '+sdif' => sub {
	$app->commandcolor = 0;
	$app->markcolor    = 0;
	$app->textcolor    = 0;
	$app->unknowncolor = 0;
    } ;
    has mecab => '!', action => sub {
	$app->unit = $_[1] ? 'mecab' : $unit_default;
    } ;
    has visible_cr  => ' vcr  !  ' , action => sub { $opt_visible{cr} = $_[1] } ;
    has visible_esc => ' vesc !  ' , action => sub { $opt_visible{esc} = $_[1] } ;
    has c           => '         ' , action => sub { push @diffopts, "-c" } ;
    has u           => '         ' , action => sub { push @diffopts, "-u" } ;
    has context     => ' C    =i ' , action => sub { push @diffopts, "-C" . $_[1] } ;
    has unified     => ' U    =i ' , action => sub { push @diffopts, "-U" . $_[1] } ;
    has r           => '      =s ' , action => sub { push @rcsopt, "-r$_[1]" } ;
    has q           => '         ' , action => sub { push @rcsopt, "-q" } ;

    has '+help' => sub {
	pod2usage
	    -verbose  => 99,
	    -sections => [ qw(SYNOPSIS VERSION) ];
    };
    has '+man' => sub {
	pod2usage -verbose => 2;
    };
    has '+version' => sub {
	print "Version: $VERSION\n";
	exit;
    };

} no Getopt::EX::Hashed;

$app = Getopt::EX::Hashed->new() or die;

use Getopt::EX::Long qw(:DEFAULT Configure ExConfigure);
ExConfigure BASECLASS => [ "App::cdif", "Getopt::EX" ];
Configure("bundling");
$app->getopt || usage({status => 1});

my %debug = map { $_ => 1 } map { split // } @{$app->debug};

$App::sdif::Util::NO_WARNINGS = $app->lenience;
$App::cdif::Command::Mecab::debug = 1 if $debug{m};

$app->rcs++ if @rcsopt;
my $diff = $app->rcs ? 'rcsdiff' : 'diff' unless $app->diff;

push @diffopts, map { $_->[1] } grep { $_->[0] } (
    [ $app->ignore_case,         "-i" ],
    [ $app->ignore_space_change, "-b" ],
    [ $app->ignore_all_space,    "-w" ],
    [ $app->expand_tabs,         "-t" ],
);

push @sub_diffopts, map { $_->[1] } grep { $_->[0] } (
    [ $app->ignore_case,      "-i" ],
    [ $app->ignore_all_space, "-w" ],
);

my %colormap = do {
    my $col = $app->{256} ? 0 : 1;
    pairmap { $a => (ref $b eq 'ARRAY') ? $b->[$col] : $b } (
	UNKNOWN =>   ""   ,
	UMARK   => [ "/444" , "/w" ] ,
	CMARK   =>   "GS" ,
	OMARK   =>   "CS" ,
	NMARK   =>   "MS" ,
	MMARK   =>   "YS" ,
	UTEXT   =>   ""  ,
	CTEXT   =>   "G"  ,
	OTEXT   =>   "C"  ,
	NTEXT   =>   "M"  ,
	MTEXT   =>   "Y"  ,
	COMMAND => [ "555/222E" , "w/kE" ] ,
	OCHANGE => [ "K/445"    , "B/w" ] ,
	NCHANGE => [ "K/445"    , "B/w" ] ,
	DELETE  => [ "K/544"    , "R/w" ] ,
	APPEND  => [ "K/544"    , "R/w" ] ,
	VISIBLE => "",
    );
};

use Getopt::EX::Colormap qw(ansi_code);
my $color_handler = Getopt::EX::Colormap
    ->new(HASH => \%colormap)
    ->load_params(@{$app->colormap});

for (
    [ $app->unknowncolor => q/UNKNOWN/ ],
    [ $app->commandcolor => q/COMMAND/ ],
    [ $app->markcolor    => q/MARK/ ],
    [ $app->textcolor    => q/TEXT/ ],
) {
    my($color, $label) = @$_;
    $color and next;
    for (grep /$label/, keys %colormap) {
	$colormap{$_} = '';
    }
}

warn 'colormap = ', Dumper \%colormap if $debug{c};

if ($app->colordump) {
    print $color_handler->colormap(
	name => '--changeme', option => '--colormap');
    exit;
}

sub color {
    $color_handler->color(@_);
}

my $prefix_re = do {
    if ($app->prefix) {
	qr/$app->{prefix_pattern}/;
    } else {
	"";
    }
};

my $DIFF;
my $OLD;
my $NEW;

if ($app->rcs) {
    my $rcsfile = shift || usage("No RCS filename\n\n");
    $DIFF = "$diff @diffopts @rcsopt $rcsfile|";
} elsif (@ARGV == 2) {
    ($OLD, $NEW) = splice(@ARGV, 0, 2);
    $DIFF = "$diff @diffopts $OLD $NEW |";
} elsif (@ARGV < 2) {
    $DIFF = shift || '-';
} else {
    usage("Arguments error.\n\n") if @ARGV;
}
warn "DIFF = \"$DIFF\"\n" if $debug{f};

my %func = do {
    my $col = $app->color ? 0 : 1;
    pairmap { $a => $b->[$col] } (
	DELETE  => [ sub { color("DELETE",  @_) }, \&bd ],
	APPEND  => [ sub { color("APPEND",  @_) }, \&bd ],
	OLD     => [ sub { color("OCHANGE", @_) }, \&ul ],
	NEW     => [ sub { color("NCHANGE", @_) }, \&ul ],
	UNKNOWN => [ sub { color("UNKNOWN", @_) }, undef ],
    );
};

my $w_pattern = do {
    if ($app->unit =~ /^char/) {
	qr/\X/s;
    } else {
	my $w = $app->unit eq 'letter' ? '' : '_';
	qr{
	    \p{Han} | \p{InHiragana}+ | \p{InKatakana}+ |
	    [$w\p{Latin}]+    |
	    [$w\p{Hang}]+     |
	    [$w\p{Cyrillic}]+ |
	    [$w\p{Arabic}]+   |
	    [$w\p{Thai}]+     |
	    \d+ |
	#   (\p{Punct})\g{-1}* |
	    [\h\r\f]*\n | \s+ | (\X)\g{-1}*
	}x;
    }
};

##
## Converter/Effector function for visible characters
##
my($converter, $effector);
use Getopt::EX::LabeledParam;
Getopt::EX::LabeledParam
    ->new(HASH => \%opt_visible)
    ->load_params (@{$app->visible});
if (my @names = grep $opt_visible{$_}, keys %opt_visible) {
    my @chars = map $visible{$_}->[0], @names;
    my %hash = map { @$_ } values %visible;
    my $re = do { local $" = ''; qr/[\Q@chars\E]/ };
    my $sub0 = sub {          s/($re)/$hash{$1}/g };
    my $sub1 = sub { $_[0] =~ s/($re)/$hash{$1}/gr };
    my $sub2 = sub {
	my $mark_re = shift;
	for (@_) {
	    $_ // next;
	    s{^$mark_re\K(?=.*$re)(.*\R?)}{
		$sub1->($1);
	    }mge;
	}
    };
    $converter = $sub2;

    if (my $color = $colormap{'VISIBLE'}) {
	my $s = ansi_code($color);
	my $e = ansi_code($color =~ s/(?=.)/~/gr); # cancel the effect
	my $symbols = join('',
			   map  { $visible{$_}->[-1] =~ s/\s+//gr }
			   @names);
	$effector = sub { s/([\Q$symbols\E]+)/${s}${1}${e}/g };
    }
}

##
## Temporary files
##
use App::cdif::Tmpfile;
my $T1 = App::cdif::Tmpfile->new;
my $T2 = App::cdif::Tmpfile->new;

##
## Total statistic info
##
my %stat;
@stat{'a', 'd', 'c', 'anl', 'dnl', 'cnl'} = (0, 0, 0, 0, 0, 0);
@stat{'anlb', 'dnlb', 'cnlb'} = (0, 0, 0);

open(DIFF, $DIFF) || die "$DIFF: $!\n";
binmode DIFF, ":encoding(utf8)";

my $stdout = App::sdif::Divert->new;
sub sprintln { map { s/(?<=[^\n])\z/\n/r } @_ }
sub  println { print sprintln @_ }
sub print_lxl {
    my($l1, $l2) = (sprintln($_[0]), sprintln($_[1]));
    if ($app->style eq 'diff') {
	printf "1c1\n". "< $l1". "---\n". "> $l2",
    } else {
	print $l1, $l2;
    }
}

while (<DIFF>) {
    #
    # --linebyline, --lxl option
    #
    if ($app->linebyline) {
	my $old = $_;
	my $new = <DIFF> // do {
	    print $old;
	    next;
	};
	compare(\$old, \$new) if $app->unit;
	print_lxl(color("OTEXT", $old),
		  color("NTEXT", $new));
    }
    #
    # normal diff
    #
    elsif (/^([\d,]+)([adc])([\d,]+)\r?$/) {
	my($left, $cmd, $right) = ($1, $2, $3);
	my $command_line = $_;
	my($old, $del, $new);
	eval {
	    if ($cmd =~ /[cd]/) {
		$old =  read_diff(*DIFF, scalar(range($left)));
		$old =~ /^(?!<)/m and die;
	    }
	    if ($cmd =~ /[c]/) {
		$del =  read_diff(*DIFF, 1);
		$del =~ /^(?!---)/m and die;
	    }
	    if ($cmd =~ /[ca]/) {
		$new =  read_diff(*DIFF, scalar(range($right)));
		$new =~ /^(?!>)/m and die;
	    }
	    1;
	}
	or do {
	    defined and print for ($command_line, $old, $del, $new);
	    next;
	};

	print_command($command_line);

	if ($converter) {
	    $converter->(qr/[<>][ \t]/, $old, $new);
	}

	if ($cmd eq 'c') {
	    compare(\$old, \$new, qr/<[ \t]/, qr/>[ \t]/) if $app->unit;
	}

	if ($app->color) {
	    $old =~ s{^(<[ \t])(.*)}{
		color("OMARK", $1) . color("OTEXT", $2)
	    }mge if $old;
	    $new =~ s{^(>[ \t])(.*)}{
		color("NMARK", $1) . color("NTEXT", $2)
	    }mge if $new;
	}

	println $old if $old and $app->show_old;
	println $del if $del;
	println $new if $new and $app->show_new;
    }
    #
    # diff -c
    #
    elsif (/^\*\*\* ([\d,]+) \*\*\*\*\r?$/) {
	my $left = $1;
	print_command($_);
	my(@old, @new);
	my $oline = range($left);
	@old = read_diffc(*DIFF, $oline);
	my $new;
	if (@old and $old[0] =~ /^--- /) {
	    $new = shift @old;
	    @old = ("");
	} else {
	    $new = <DIFF>;
	}
	my $dline = map { /^-/mg } @old;
	if ($new =~ /^--- ([\d,]+) ----$/) {
	    my $right = $1;
	    my $nline = range($right);
	    if (@old == 1 and $old[0] ne "" and $oline - $dline == $nline) {
		@new = ("");
	    } else {
		@new = read_diffc(*DIFF, $nline);
	    }
	    if ($converter) {
		$converter->(qr/[\-\+\!\ ][ \t]/, @old, @new);
	    }
	    my $mark_re = qr/![ \t]/;
	    for my $i (keys @old) {
		my $cmark = "! ";
		if ($i % 2) {
		    compare(\$old[$i], \$new[$i], $mark_re) if $app->unit;
		}
		if ($app->color) {
		    $old[$i] =~ s{^([\-\!][ \t])(.*)}{
			color("OMARK", $1) . color("OTEXT", $2)
		    }mge;
		    $new[$i] =~ s{^([\+\!][ \t])(.*)}{
			color("NMARK", $1) . color("NTEXT", $2)
		    }mge;
		}
	    }
	}
	println @old if $app->show_old;
	println $new;
	println @new if $app->show_new;
    }
    #
    # diff --combined (generic)
    #
    elsif (m{^
	   (?<prefix> $prefix_re)
	   (?<command>
	     (?<mark> \@{2,} ) [ ]
	     (?<lines> (?: [-+]\d+(?:,\d+)? [ ] ){2,} )
	     \g{mark}
	     (?s:.*)
	   )
	   }x) {
	my($prefix, $command, $lines) = @+{qw(prefix command lines)};
	my $column = length $+{mark};
	my @lines = map {
	    $_ eq ' ' ? 1 : int $_
	} $lines =~ /\d+(?|,(\d+)|( ))/g;

	if (@lines != $column) {
	    print;
	    next;
	}

	my($divert, %read_opt);
	if ($prefix) {
	    $read_opt{prefix} = $prefix;
	    use App::sdif::Divert;
	    $divert = App::sdif::Divert->new(FINAL => sub { s/^/$prefix/mg });
	}

	print_command($command);

	my @buf = read_unified \%read_opt, *DIFF, @lines;

	state @mark_re;
	my $mark_re = $mark_re[$column] //= do {
	    my $mark = '.' x ($column - 1);
	    qr/$mark/;
	};

	if ($converter) {
	    map { $converter->($mark_re, @$_) }
	    map { $_->lists }
	    @buf;
	}

	for my $buf (@buf) {
	    my @result = compare_unified($column, $buf, $mark_re);
	    if (@result == 3) {
		$app->show_new or splice @result, 2, 1;
		$app->show_old or splice @result, 1, 1;
	    }
	    println @result;
	}
    }
    #
    # conflict marker
    #
    elsif (/^<<<<<<<\s+(.*)/) {
      CONFLICT:
	{
	    my $c1 = $_;

	    my @old = read_until { /^=======$/ } *DIFF;
	    my $c2 = pop @old // do {
		print $c1, @old;
		last;
	    };

	    my @new = read_until { /^>>>>>>>\s+(.*)/ } *DIFF;
	    my $c3 = pop @new // do {
		print $c1, @old, $c2, @new;
		last;
	    };

	    my $i = first { $old[$_] =~ /^\Q|||||||\E/ } keys @old;
	    my @mrg = defined $i ? splice @old, $i : ();

	    my $old = join '', @old;
	    my $new = join '', @new;
	    $converter->('', $old, $new) if $converter;
	    compare(\$old, \$new) if $app->unit;

	    print $c1 if $app->mark;
	    print color("OTEXT", $old) if $app->show_old;
	    if (@mrg) {
		my $c4 = shift @mrg;
		print $c4 if $app->mark;
		my $mrg = join '', @mrg;
		$converter->('', $mrg) if $converter;
		print color("MTEXT", $mrg) if $app->show_mrg;
	    }
	    println $c2 if $app->mark;
	    println color("NTEXT", $new) if $app->show_new;
	    println $c3 if $app->mark;
	}
    }
    else {
	if ($app->unknown) {
	    if (my $f = $func{UNKNOWN}) {
		print $f->($_);
	    } else {
		print;
	    }
	}
    }
}
continue {
    $stdout->fh->flush;
    local *_ = $stdout->buffer;
    if (length) {
	$_ = decode 'utf8', $_;
	$effector->() if $effector;
	STDOUT->printflush($_);
	$stdout->clear;
    }
}
close DIFF;
my $exit = $DIFF =~ /\|$/ ? $? >> 8 : 0;

select STDOUT;

if ($app->stat) {
    select STDERR;

    print("TOTAL: ");
    printf("append=%d delete=%d change=%d\n",
	   $stat{'a'}, $stat{'d'}, $stat{'c'});
    print("INGORE WHITESPACE: ");
    printf("append=%d delete=%d change=%d\n",
	   $stat{'anl'},
	   $stat{'dnl'},
	   $stat{'cnl'});
    print("INGORE WHITESPACE (bytes): ");
    printf("append=%d delete=%d change=%d\n",
	   $stat{'anlb'},
	   $stat{'dnlb'},
	   $stat{'cnlb'});
}

exit($exit > 1);

######################################################################

sub compare_unified {
    my $column = shift;
    goto &compare_unified_3 if $column == 3;
    goto &compare_unified_generic;
}

sub compare_unified_generic {
    my($buf, $mark_re) = @_;
    my $c = $buf->collect(qr/^[\t ]+$/);
    my $o = $buf->collect(qr/[-]/);
    my $n = $buf->collect(qr/[+]/);
    compare(\$o, \$n, $mark_re) if $app->unit;
    if ($app->color) {
	map {
	    my($buf, $m, $t) = @$_;
	    $$buf =~ s{^($mark_re)(.*)}{
	    ($app->mark ? color($m, $1) : '') . color($t, $2)
	    }mge;
	} ( [ \$c, 'UMARK', 'UTEXT' ],
	    [ \$o, 'OMARK', 'OTEXT' ],
	    [ \$n, 'NMARK', 'NTEXT' ] );
    } else {
	map { s/^$mark_re//mg } $c, $o, $n if not $app->mark;
    }
    ($c, $o, $n);
}

sub compare_unified_3 {
    my($buf, $mark_re) = @_;

    my @mark = ( "  ", "--", "- ", " -", "+ ", " +", "++" );
    my %buf = map { $_ => scalar($buf->collect($_)) } @mark;

    goto SKIP unless $app->unit;

    my @r;
    $r[0] =     compare(\@buf{q/--/, q/++/}, $mark_re);
    $r[1] =     compare(\@buf{q/- /, q/ -/}, $mark_re);
    if ($r[1] == 0) {
	$r[2] = compare(\@buf{q/- /, q/+ /}, $mark_re);
	$r[3] = compare(\@buf{q/ -/, q/ +/}, $mark_re);
    }
    if (sum(@r) == 0) {
	$r[4] = compare(\@buf{q/- /, q/++/}, $mark_re);
	$r[5] = compare(\@buf{q/ -/, q/++/}, $mark_re) unless $r[4];
	$r[6] = compare(\@buf{q/--/, q/+ /}, $mark_re);
	$r[7] = compare(\@buf{q/--/, q/ +/}, $mark_re) unless $r[6];
    }

  SKIP:

    if ($app->color) {
	map { s/^$mark_re//mg } $buf{q/  /} if not $app->mark;
	map {
	    my($s, $m, $t) = @$_;
	    $$s =~ s{^($mark_re)(.*)}{
	    ($app->mark ? color($m, $1) : '') . color($t, $2)
	    }mge if $$s;
	} ( [ \$buf{q/--/}, 'CMARK', 'CTEXT' ],
	    [ \$buf{q/- /}, 'OMARK', 'OTEXT' ],
	    [ \$buf{q/ -/}, 'NMARK', 'NTEXT' ],
	    [ \$buf{q/+ /}, 'MMARK', 'MTEXT' ],
	    [ \$buf{q/ +/}, 'MMARK', 'MTEXT' ],
	    [ \$buf{q/++/}, 'MMARK', 'MTEXT' ] );
    } else {
	map { s/^$mark_re//mg } @buf{@mark} if not $app->mark;
    }

    @buf{$buf->labels};
}

sub print_command {
    return unless $app->command;
    my $line = shift;
    if ($app->color) {
	$line = color($colormap{COMMAND}, $line);
    }
    print $line;
}

sub compare {
    my($old, $new) = splice @_, 0, 2;
    return 0 unless $old && $new && $$old && $$new;

    my $omark_re = shift || undef;
    my $nmark_re = shift || $omark_re;

    my(@omark, @nmark);
    if ($omark_re) {
	$$old =~ s/^($omark_re)/push(@omark, $1), ""/mge;
	$$new =~ s/^($nmark_re)/push(@nmark, $1), ""/mge;
    }

    ($$old, $$new) = context($$old, $$new);

    $$old =~ s/^/shift @omark/mge if @omark;
    $$new =~ s/^/shift @nmark/mge if @nmark;

    1;
}

use Getopt::EX::Colormap qw(colorize);

sub debug_list {
    my $i = 0;
    my @cmap = qw( K/444 K/333 );
    join "", map { colorize $cmap[$i++ % @cmap], $_ } @_;
}

sub context {
    my($old, $new) = @_;
    local $_;

    if ($debug{s}) {
	print STDERR "****************************** Comparing ...\n";
	print STDERR $old;
	print STDERR "****************************** and\n";
	print STDERR $new;
	print STDERR "****************************** .\n";
    }

    my @owlist = wordlist($old);
    my @nwlist = wordlist($new);

    if ($debug{w}) {
	print STDERR "****************************** Comparing ...\n";
	print STDERR debug_list @owlist;
	print STDERR "****************************** and\n";
	print STDERR debug_list @nwlist;
	print STDERR "****************************** .\n";
    }

    maketmp($T1, @owlist);
    maketmp($T2, @nwlist);

    my $diff = sprintf "$app->{subdiff} @sub_diffopts %s %s", $T1->path, $T2->path;
    open my $cdif, "$diff |" or die "diff: $!\n";
    binmode $cdif, ":encoding(utf8)";
    my @command;
    my %c = (a => 0, d => 0, c => 0);
    while (<$cdif>) {
	warn $_ if $debug{d};

	## Quick hack to read `git diff` unified output
	if (/^\@\@ \s+ \-(\d+)(?:,(\d+))? \s+ \+(\d+)(?:,(\d+))? \s+ \@\@/x) {
	    my($o, $ol, $n, $nl) = ($1, $2//1, $3, $4//1);
	    my $cmd = ($ol == 0) ? "a" : ($nl == 0) ? "d" : "c";
	    my $old = ($ol <= 1) ? $o  : sprintf "%d,%d", $o, $o + $ol - 1;
	    my $new = ($nl <= 1) ? $n  : sprintf "%d,%d", $n, $n + $nl - 1;
	    $_ = sprintf "%s%s%s\n", $old, $cmd, $new;
	}

	if (/^[\d,]+([adc])[\d,]+$/) {
	    push @command, $_;
	    $c{$1}++;
	}
    }
    close $cdif;

    if ($debug{d}) {
	printf(STDERR "old=%d new=%d command=%d\n",
	       @owlist+0, @nwlist+0, @command+0);
	printf(STDERR "append=$c{a} delete=$c{d} change=$c{c}\n");
    }

    my($obuf, $nbuf) = makebuf(\@owlist, \@nwlist, \@command);
    my $status = $?>>8;
    die "Unexpected status of subprocess ($status)\n" if $status > 1;

    ($obuf, $nbuf);
}

sub wordlist {
    my $text = shift;
    if ($app->unit eq 'mecab' and $text =~ /\P{ASCII}/) {
	mecab_words($text);
    } else {
	normal_words($text);
    }
}

sub normal_words {
    my $text = shift;
    my @words;
    while ($text =~ /\G($w_pattern)/g) {
	push @words, $1;
    }
    @words;
}

sub mecab_words {
    my $text = shift;
    state $mecab = App::cdif::Command::Mecab->new;
    $mecab->wordlist($text);
}

sub maketmp {
    my($tmpfile, @list) = @_;
    $tmpfile->reset;
    for (@list) {
	s/[ \t]+// if $app->ignore_space_change || $app->ignore_all_space;
	$tmpfile->write($_);
	$tmpfile->write("\n") unless /\n\z/;
    }
    $tmpfile->flush->rewind;
}

##
##  @owlist: old word list
##  @nwlist: new word list
##  @command: how different these lists (`diff' result command lines)
##
sub makebuf {
    my($ol, $nl, $c) = @_;
    my @owlist = @$ol;
    my @nwlist = @$nl;
    my @command = @$c;

    my($o, $n) = (0, 0);	# pointers
    my(@obuf, @nbuf);

    for (@command) {
	my($old, $cmd, $new) = /([\d,]+)([adc])([\d,]+)/ or do {
	    die "Panic! Unexpected diff output";
	};
	my($o1, $o2) = range($old);
	my($n1, $n2) = range($new);
	map { $_-- } $o1, $o2, $n1, $n2; # make them zero origined

	push(@obuf, @owlist[$o .. $o1 - 1]), $o = $o1 if $o < $o1;
	push(@nbuf, @nwlist[$n .. $n1 - 1]), $n = $n1 if $n < $n1;

	$stat{$cmd}++;

	if ($cmd eq 'd') {
	    my $os = join('', @owlist[$o1 .. $o2]);
	    if ($owlist[$o2] =~ /\S/) {
		$stat{'dnl'}++;
		$stat{'dnlb'} += length($os);
	    }
	    push(@obuf, $func{DELETE}->($os));
	    $o = $o2 + 1;
	}
	elsif ($cmd eq 'c') {
	    my $os = join('', @owlist[$o1 .. $o2]);
	    my $ns = join('', @nwlist[$n1 .. $n2]);
	    if (($owlist[$o2] =~ /\S/) || ($nwlist[$n2] =~ /\S/)) {
		$stat{'cnl'}++;
		$stat{'cnlb'} += length($os);
		$stat{'cnlb'} += length($ns);
	    }
	    push(@obuf, $func{OLD}->($os));
	    push(@nbuf, $func{NEW}->($ns));
	    $o = $o2 + 1;
	    $n = $n2 + 1;
	}
	elsif ($cmd eq 'a') {
	    my $ns = join('', @nwlist[$n1 .. $n2]);
	    if ($nwlist[$n2] =~ /\S/) {
		$stat{'anl'}++;
		$stat{'anlb'} += length($ns);
	    }
	    push(@nbuf, $func{APPEND}->($ns));
	    $n = $n2 + 1;
	}
    }
    push(@obuf, @owlist[$o .. $#owlist]);
    push(@nbuf, @nwlist[$n .. $#nwlist]);

    (join('', @obuf), join('', @nbuf));
}

sub read_diff {
    my($FH, $c) = @_;
    my @buf = ();
    while ($c-- > 0) {
	push @buf, scalar <$FH>;
    }
    wantarray ? @buf : join '', @buf;
}

sub read_diffc {
    my($FH, $n) = @_;
    my @buf;
    local $_;
    my $i = 0;
    while ($n-- && ($_ = <$FH>)) {
	$i++ if ($i % 2) != /^!/;
	$buf[$i] .= $_;
	last if /^--- /;
    }
    map { $_ // "" } @buf;
}

sub ul {
    local $_ = join '', @_;
    s/(.)/["", "_\010", "__\010\010"]->[vwidth($1)].$1/ge;
    $_;
}
sub bd {
    local $_ = join '', @_;
    s/(\S)/$1.["", "\010", "\010\010"]->[vwidth($1)].$1/ge;
    $_;
}

sub wc_l {
    my $file = shift;
    my $line;
    $file->rewind;
    $line++ while $file->fh->getline;
    $file->rewind;
    $line;
}

sub eval {
    print STDERR &unctrl($_[0]), "\n" x ($_[0] !~ /\n$/) if $_[1] || $debug{e};
    CORE::eval shift;
    die sprintf("eval failed in file %s on line %s\n$@", (caller)[1,2]) if $@;
}

######################################################################

=head1 NAME

cdif - word context diff

=head1 VERSION

Version 4.41

=head1 SYNOPSIS

cdif [option] file1 file2

cdif [option] [diff-data]

Options:

	-c, -Cn		context diff
	-u, -Un		unified diff
	-i		ignore case
	-b		ignore space change
	-w		ignore whitespace
	-t		expand tabs

	--diff=command      specify diff command
	--subdiff=command   specify backend diff command
	--stat              show statistical information
	--colormap=s        specify color map
	--sdif              sdif friendly option
	--[no]color         color or not            (default true)
	--[no]256           ANSI 256 color mode     (default true)
	--[no]cc            color command line      (default true)
	--[no]mc            color diff mark         (default true)
	--[no]tc            color normal text       (default true)
	--[no]uc            color unknown text      (default true)
	--[no]old	    print old text          (default true)
	--[no]new	    print new text          (default true)
	--[no]mrg	    print merged text       (default true)
	--[no]command	    print diff command line (default true)
	--[no]unknown	    print unknown line      (default true)
	--[no]mark	    print mark or not       (default true)
	--[no]prefix	    read git --graph output (default true)
	--unit=s            word/letter/char/mecab  (default word)
	--[no]mecab	    use mecab tokenizer     (default false)
	--prefix-pattern    prefix pattern
	--visible char=?    set visible attributes
	--[no]lenience      suppress unexpected input warning (default true)
	--lxl               compare input data line-by-line
	--style=diff        print --lxl output in diff style


=head1 DESCRIPTION

B<cdif> is a post-processor of the Unix diff command.  It highlights
deleted, changed and added words based on word context (B<--unit=word>
by default).  If you want to compare text character-by-character, use
option B<--unit=char>.  Option B<--unit=mecab> tells to use external
B<mecab> command as a tokenizer for Japanese text.

If single or no file is specified, B<cdif> reads that file or STDIN as
an output from diff command.  In addition to normal diff, context
diff, and unified (combined) diff, the L<git(1)>-compatible conflict
marker format is supported as input format.

Lines those don't look like diff output are simply ignored and
printed.

=head2 STARTUP and MODULE

B<cdif> utilizes Perl L<Getopt::EX> module, and reads I<~/.cdifrc>
file if available when starting up.  You can define original and
default option there.  Next line enables B<--mecab> option and add
crossed-out effect for deleted words.

    option default --mecab --cm DELETE=+X

Modules under B<App::cdif> can be loaded by B<-M> option without
prefix.  Next command load B<App::cdif::colors> module.

    $ cdif -Mcolors

You can also define options in module file.  Read `perldoc
Getopt::EX::Module` for detail.

=head2 COLOR

Each lines are displayed in different colors.  Each text segment has
own labels, and color for them can be specified by B<--colormap>
option.  Read `perldoc Getopt::EX::Colormap` for detail.

Standard module B<-Mcolors> is loaded by default, and define several
color maps for light and dark screen.  If you want to use CMY colors in
dark screen, place next line in your F<~/.cdifrc>.

    option default --dark-cmy

Option B<--autocolor> is defined in B<default> module to call
L<Getopt::EX::termcolor> module.  It sets B<--light> or B<--dark>
option according to the brightness of the terminal screen.  You can
set preferred color in your F<~/.cdifrc> like:

    option --light --cmy
    option --dark  --dark-cmy

Automatic setting is done by L<Getopt::EX::termcolor> module and it
works with macOS Terminal.app and iTerm.app, and other XTerm
compatible terminals.  This module accept environment variable
L<TERM_BGCOLOR> as a terminal background color.  For example, use
C<000> or C<#000000> for black and C<555> or C<#FFFFFF> for white.

Option B<--autocolor> is set by default, so override it to do nothing
to disable.

    option --autocolor --nop

=head2 EXIT STATUS

B<cdif> always exit with status zero unless error occurred.

=head1 OPTIONS

=over 7

=item B<->[B<cCuUibwtT>]

Almost same as B<diff> command.

=begin deprecated

=item B<--rcs>, B<-r>I<rev>, B<-q>

Use rcsdiff instead of normal diff.  Option B<--rcs> is not required
when B<-r>I<rev> is supplied.

=end deprecated

=item B<-->B<unit>=[C<word>,C<letter>,C<char>,C<mecab>,C<0>,C<>]

=item B<-->B<by>=[C<word>,C<letter>,C<char>,C<mecab>,C<0>,C<>]

Specify the comparing unit.  Default is I<word> and compare each line
word-by-word.  Specify C<char> if you want to compare them
character-by-character.  Unit C<letter> is almost same as C<word> but
does not include underscore.

When C<mecab> is given as an unit, B<mecab> command is called as a
tokenizer for non-ASCII text.  ASCII text is compared word-by-word.
External B<mecab> command has to been installed.

If you give empty string like C<--unit=>, or C<0>, B<cdif> does not
compare text in any way.  You'll still get colorization effect.

=item B<--mecab>

Shortcut for B<--unit=mecab>.

=item B<--diff>=I<command>

Specify the diff command to use.

=item B<--subdiff>=I<command>

Specify the backend diff command to get word differences.  Accept
normal and unified diff format.

If you want to use B<git diff> command, don't forget to set I<-U0>
option.

    --subdiff="git diff -U0 --no-index --histogram"

=item B<-->[B<no->]B<color>

Use ANSI color escape sequence for output.

=item B<--colormap>=I<colormap>, B<--cm>=I<colormap>

Basic I<colormap> format is :

    FIELD=COLOR

where the FIELD is one from these :

    COMMAND  Command line
    OMARK    Old mark
    NMARK    New mark
    UTEXT    Same text
    OTEXT    Old text
    NTEXT    New text
    OCHANGE  Old change part
    NCHANGE  New change part
    APPEND   Appended part
    DELETE   Deleted part

=for comment
    VISIBLE  Visualized invisible chars

and additional I<Common> and I<Merged> FIELDs for git-diff combined
format.

    CMARK    Common mark
    CTEXT    Common text
    MMARK    Merged mark
    MTEXT    Merged text

You can make multiple fields same color joining them by = :

    FIELD1=FIELD2=...=COLOR

Also wildcard can be used for field name :

    *CHANGE=BDw

Multiple fields can be specified by repeating options

    --cm FILED1=COLOR1 --cm FIELD2=COLOR2 ...

or combined with comma (,) :

    --cm FILED1=COLOR1,FIELD2=COLOR2, ...

Color specification is a combination of single uppercase character
representing 8 colors :

    R  Red
    G  Green
    B  Blue
    C  Cyan
    M  Magenta
    Y  Yellow
    K  Black
    W  White

and alternative (usually brighter) colors in lowercase :

    r, g, b, c, m, y, k, w

or RGB values and 24 grey levels if using ANSI 256 or full color
terminal :

    (255,255,255)      : 24bit decimal RGB colors
    #000000 .. #FFFFFF : 24bit hex RGB colors
    #000    .. #FFF    : 12bit hex RGB 4096 colors
    000 .. 555         : 6x6x6 RGB 216 colors
    L00 .. L25         : Black (L00), 24 grey levels, White (L25)

or color names enclosed by angle bracket :

    <red> <blue> <green> <cyan> <magenta> <yellow>
    <aliceblue> <honeydue> <hotpink> <moccasin>
    <medium_aqua_marine>

with other special effects :

    D  Double-struck (boldface)
    I  Italic
    U  Underline
    S  Stand-out (reverse video)

Above color spec is simplified summary so if you want complete
information, read L<Getopt::EX::Colormap>.

Defaults are :

    COMMAND => "555/222E"
    OMARK   => "CS"
    NMARK   => "MS"
    UTEXT   => ""
    OTEXT   => "C"
    NTEXT   => "M"
    OCHANGE => "K/445"
    NCHANGE => "K/445"
    DELETE  => "K/544"
    APPEND  => "K/544"

    CMARK   => "GS"
    MMARK   => "YS"
    CTEXT   => "G"
    MTEXT   => "Y"

This is equivalent to :

    cdif --cm 'COMMAND=555/222E,OMARK=CS,NMARK=MS' \
         --cm 'UTEXT=,OTEXT=C,NTEXT=M,*CHANGE=BD/445,DELETE=APPEND=RD/544' \
         --cm 'CMARK=GS,MMARK=YS,CTEXT=G,MTEXT=Y'

=item B<--colormap>=C<&func>

=item B<--colormap>=C<sub{...}>

You can also set the name of perl subroutine name or definition to be
called handling matched words.  Target word is passed as variable
C<$_>, and the return value of the subroutine will be displayed.

Next option produces L<wdiff>-like formatted output.

    --cm '*'= \
    --cm DELETE=OCHANGE='sub{"[-$_-]"}' \
    --cm APPEND=NCHANGE='sub{"{+$_+}"}'

See L<Getopt::EX::Colormap/FUNCTION SPEC> for detail.

=item B<-->[B<no->]B<cc>, B<-->[B<no->]B<commandcolor>

=item B<-->[B<no->]B<mc>, B<-->[B<no->]B<markcolor>

=item B<-->[B<no->]B<tc>, B<-->[B<no->]B<textcolor>

=item B<-->[B<no->]B<uc>, B<-->[B<no->]B<unknowncolor>

Enable/Disable using color for the corresponding field.

=item B<--sdif>

Disable options appropriate to use for B<sdif>'s input:
B<--commandcolor>, B<--markcolor>, B<--textcolor> and
B<--unknowncolor>.

=item B<-->[B<no->]B<old>, B<-->[B<no->]B<new>, B<-->[B<no->]B<mrg>

Print or not old/new/mrg text in diff output.

=item B<-->[B<no->]B<command>

Print or not command lines preceding diff output.

=item B<-->[B<no->]B<unknown>

Print or not lines not look like diff output.

=item B<-->[B<no->]B<mark>

Print or not marks at the top of diff output lines.  At this point,
this option is effective only for unified diff.

Next example produces the output exactly same as I<new> except visual
effects.

    cdif -U100 --no-mark --no-old --no-command --no-unknown old new

These options are prepared for watchdiff(1) command.

=item B<-->[B<no->]B<prefix>

Understand prefix for diff output including B<git> B<--graph> option.
True by default.

=item B<--prefix-pattern>=I<pattern>

Specify prefix pattern in regex.  Default pattern is:

    (?:\| )*(?:  )*

This pattern matches B<git> graph style and whitespace indented diff
output.

=item B<--visible> I<charname>=[0,1]

Set visible attribute for specified characters.  Visible character is
converted to corresponding Unicode symbol character.  Default visible:
nul, bel, bs, vt, np, cr, esc, del.  Default invisible: ht, nl, sp.

    NAME  CODE  Unicode NAME                      DEFAULT
    ----  ----  --------------------------------  -------
    nul   \000  SYMBOL FOR NULL                   YES
    soh   \001  SYMBOL FOR SOH*                   YES
    bel   \007  SYMBOL FOR BELL                   YES
    bs    \010  SYMBOL FOR BACKSPACE              YES
    ht    \011  SYMBOL FOR HORIZONTAL TABULATION  NO
    nl    \012  SYMBOL FOR NEWLINE                NO
    vt    \013  SYMBOL FOR VERTICAL TABULATION    YES
    np    \014  SYMBOL FOR FORM FEED              YES
    cr    \015  SYMBOL FOR CARRIAGE RETURN        YES
    esc   \033  SYMBOL FOR ESCAPE                 YES
    sp    \040  SYMBOL FOR SPACE                  NO
    del   \177  SYMBOL FOR DELETE                 YES

Multiple characters can be specified at once, by assembling them by
comma (C<,>) like C<--visible ht=1,sp=1>; or connecting them by equal
sign (C<=>) like C<--visible ht=sp=1>.  Character name accept
wildcard; C<--visible '*=1'>.

=begin comment

Colormap label C<VISIBLE> is applied to those characters.  Default
setting is C<S>, and visible characters are displayed in reverse
video.  Unlike other colormaps, only special effects can be set to
this label.  Effect C<D> (double-struck) is exception (See
L<Getopt::EX::Colormap/~>).

=end comment

B<sdif> command also support B<--visible> option for horizontal tab
with better visibility.

=begin deprecated

=item B<-->[B<no->]B<visible-cr>

=item B<-->[B<no->]B<visible-esc>

Set CARRIAGE-RETURN and ESCAPE visible attributes.  These options will
be deprecated soon.  Use B<--visible> option instead.

=end deprecated

=item B<--stat>

Print statistical information at the end of output.  It shows number
of total appended/deleted/changed words in the context of cdif.  It's
common to have many insertions and deletions of newlines because of
text filling process.  So normal information is followed by modified
number which ignores insert/delete newlines.

=item B<-->[B<no->]B<lenience>

Suppress warning message for unexpected input from diff command.  True
by default.

=item B<--linebyline>, B<--lxl>

=item B<--style>=I<style>

Compare input data line-by-line.  Consider the inputs as pairs of two
lines each, and output the result of comparing each two lines.

Suppose you have a document with old and new text on lines beginning
with L<OLD:> and L<NEW:> labels.

    OLD: this is old text
    NEW: and this is updated document

Only this old/new part can be compared using B<greple>'s B<-Mtee>
module as follows.

    greple -Mtee cdif --lxl -- --cm=N -GE '^OLD: (.*\n)^NEW: (.*\n)'

B<-Mtee> module sends matched parts to the filter command and replace
them by its result.  Consult L<App::Greple::tee> for detail.

You can use L<teip(1)> command as well.

    teip -g '^(OLD|NEW):' -- cdif --lxl

If the C<--style=diff> option is given, the two strings are output in
B<diff> format and can be used in combination with the C<sdif> command as
follows.

    cdif --lxl --style=diff ... | sdif --no-cdif

Currently the only valid value for C<--style> is C<diff>, which
affects only the behavior of C<--lxl> option.

=back

=head1 GIT

See `perldoc App::sdif` how to use related commands under the GIT
environment.

=head1 ENVIRONMENT

=over 7

=item B<CDIFOPTS>

Environment variable B<CDIFOPTS> is used to set default options.

=item B<LESS>

=item B<LESSANSIENDCHARS>

Since B<cdif> produces ANSI Erase Line terminal sequence, it is
convenient to set B<less> command understand them.

    LESS=-cR
    LESSANSIENDCHARS=mK

=back

=head1 AUTHOR

Kazumasa Utashiro

=head1 LICENSE

Copyright 1992-2025 Kazumasa Utashiro

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.


=head1 SEE ALSO

L<App::sdif>, L<https://github.com/kaz-utashiro/sdif-tools>

L<sdif(1)>, L<watchdiff(1)>

L<Getopt::EX::Colormap>

L<App::Greple::tee>

L<https://taku910.github.io/mecab/>

=head1 BUGS

B<cdif> is naturally not very fast because it uses normal diff command
as a back-end processor to compare words.

=cut

#  LocalWords:  cdif diff rcs rcsdiff colormap commandcolor markcolor
#  LocalWords:  textcolor stdin OMARK NMARK OTEXT NTEXT OCHANGE CMARK
#  LocalWords:  NCHANGE CTEXT MMARK MTEXT Cyan RGB nomark stat cdifrc
#  LocalWords:  watchdiff mecab tokenizer Unicode lenience CDIFOPTS
#  LocalWords:  Kazumasa Utashiro sdif perldoc CMY cmy autocolor perl
#  LocalWords:  macOS XTerm subdiff LESSANSIENDCHARS
