#!/usr/bin/perl -w -C

=pod

=head1 NAME

tv_grab_jp - Grab TV listings for Japan.

=head1 SYNOPSIS

tv_grab_jp --help

tv_grab_jp [--config-file FILE] --configure

tv_grab_jp [--config-file FILE] [--output FILE] [--days N]
           [--offset N] [--enable-readstr] [--no-category-code]
           [--quiet]

tv_grab_jp --list-channels

=head1 DESCRIPTION

Output TV listings for several channels available in Japan.
The grabber relies on parsing HTML so it might stop working at any
time.

First run B<tv_grab_jp --configure> to choose, which channels you want
to download. Then running B<tv_grab_jp> with no arguments will output
listings in XML format to standard output.

tv_grab_jp always grab 7 days of listings.

B<--configure> Prompt for which channels,
and write the configuration file.

B<--config-file FILE> Set the name of the configuration file, the
default is B<~/.xmltv/tv_grab_jp.conf>.  This is the file written by
B<--configure> and read when grabbing.

B<--output FILE> write to FILE rather than standard output.

B<--days N> grab N days.  The default and the maximum is 7.

B<--offset N> start N days in the future.  The default is to start
from today.

B<--enable-readstr> add Hiragana(read strings) for program title.

B<--no-category-code> suppress category code. English translated
category names will be added to the listings without this option.

B<--quiet> suppress the progress messages normally written to standard
error.

B<--list-channels> write output giving <channel> elements for every
channel available , but no programmes.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Takeru Komoriya<komoriya@paken.org>
Based on tv_grab_fi by Matti Airas and tv_grab_sn by Stefan G:orling.

=head1 BUGS

The data source may not suit recommended XMLTV DTD format.

=cut

######################################################################
# initializations

use strict;
use XMLTV::Version '$Id: tv_grab_jp,v 1.2 2004/02/26 04:28:26 komoriya Exp $ ';
use Getopt::Long;
use Date::Manip;
use HTML::TreeBuilder;
use HTML::Entities; # parse entities
use IO::File;

use XMLTV;
use XMLTV::Memoize;
use XMLTV::Ask;
use XMLTV::Config_file;
use XMLTV::Mode;
use XMLTV::Get_nice;

use Text::Kakasi;
use utf8;
use Encode qw(from_to);

# Todo: perhaps we should internationalize messages and docs?
use XMLTV::Usage <<END
$0: get Japanese television listings in XMLTV format
To configure: $0 --configure [--config-file FILE]
To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
        [--offset N] [--enable-readstr] [--no-category-code]
        [--quiet]
To list channels: $0 --list-channels
END
  ;

# Use Term::ProgressBar if installed.
use constant Have_bar => eval { require Term::ProgressBar; 1 };

# Attributes of the root element in output.
my $HEAD = { 'source-info-url'     => 'http://www.ontvjapan.com/',
	     'source-data-url'     => "http://www.ontvjapan.com/program/",
	     'generator-info-name' => 'XMLTV',
	     'generator-info-url'  => 'http://membled.com/work/apps/xmltv/',
	   };

# The timezone in Japan.
my $TZ="+0900";

# language for XML output
my $XMLLANG = "ja_JP";

# message language
my $lang = $ENV{'LANG'};
$lang = '' if not defined $lang;

# base URL of source data
my $urlbase = "http://www.ontvjapan.com";

# xmltv channel id extension
my $channel_ext = ".ontvjapan.com";

# Global channel data.
our @ch_all;

# category translation data
# (movie,series,sports,tvshow)
my %categories = ('ドラマ'         => 'series',
		  '映画'           => 'movie',
		  'スポーツ'       => 'sports',
		  '演劇・伝統芸'   => 'etc',
		  '音楽'           => 'tvshow',
		  'バラエティー'   => 'tvshow',
		  '教養'           => 'tvshow',
		  'アニメ・キッズ' => 'series',
		  '社会・報道'     => 'tvshow',
		  '趣味・暮らし'   => 'tvshow',
		  'その他'         => 'etc');


######################################################################
# get options

XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
my ($opt_days, $opt_offset, $opt_help, $opt_output,
    $opt_configure, $opt_config_file, $opt_quiet,
    $opt_readstr, $opt_no_cat_code, $opt_list_channels);
$opt_days        = 7; # default
$opt_offset      = 0; # default
$opt_readstr     = 0; # default
$opt_no_cat_code = 0; # default
$opt_quiet       = 0; # default
GetOptions('help'             => \$opt_help,
	   'configure'        => \$opt_configure,
	   'config-file=s'    => \$opt_config_file,
	   'output=s'         => \$opt_output,
	   'quiet'            => \$opt_quiet,
	   'enable-readstr'   => \$opt_readstr,
	   'no-category-code' => \$opt_no_cat_code,
	   'list-channels'    => \$opt_list_channels,
	   'days=i'           => \$opt_days,
	   'offset=i'         => \$opt_offset,
	  )
  or usage(0);
die 'number of days must not be negative'
    if (defined $opt_days && $opt_days < 0);
die 'number of offset must not be negative'
    if (defined $opt_offset && $opt_offset < 0);
usage(1) if $opt_help;

my $mode = XMLTV::Mode::mode('grab', # default
			     $opt_configure => 'configure',
			     $opt_list_channels => 'list-channels',
			    );

# File that stores which channels to download.
my $config_file;
my @config_lines; # used only in grab mode
if ($mode eq 'configure') {
    $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_jp', $opt_quiet);
    XMLTV::Config_file::check_no_overwrite($config_file);
}
elsif ($mode eq 'grab') {
    $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_jp', $opt_quiet);
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}

######################################################################
# write configuration

if ($mode eq 'configure') {
    my $msg;
    open CONF, ">:utf8", "$config_file" or die "cannot write to $config_file: $!";

    # Get region information
    my $barmsg = select_lang('地域情報を取得中',
			     'getting regions');
    my $bar = new Term::ProgressBar($barmsg, 1) if Have_bar;
    my %regions = get_regions();
    update $bar if Have_bar;

    # Ask region
    my %ask_regions;
    my @r;
    my $regionid = '0002';
    my $req_region;
    Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-Ja', '-Ha', '-Ka');
    foreach (sort keys %regions) {
	my $region = $regions{$_};
	if ($lang ne 'ja_JP.UTF-8') {
	      utf8::encode($region);
	      from_to($region, "utf8", "euc-jp"); # convert to EUC
	      if ($lang ne 'ja_JP.eucJP') {
		  $region=Text::Kakasi::do_kakasi($region); # convert to Romaji
	      }
	}
	$ask_regions{$region} = $_;
	push @r, $region;
	if ($_ eq $regionid) {
	    $req_region = $region;
	}
    }
    Text::Kakasi::close_kanwadict();

    $msg = select_lang("\n地域を指定してください: ",
		       "\nSelect your region: ");
    $req_region = askQuestion($msg, $req_region, @r);
    $regionid = $ask_regions{$req_region};

    if ($regionid eq 'CATV') {
	$msg = select_lang("README.CATV.jaを参照して4桁のID番号を入力してください: ",
			   "See README.CATV and input region ID: ");
	$regionid = ask($msg);
	if (length($regionid) != 4) {
	    $msg = select_lang("IDが間違っています．\n",
			       "Invalid region ID.\n");
	    die $msg;
	}
    }
    print CONF "region $regionid\n";

    $barmsg = select_lang('チャンネル情報を取得中',
			     'getting channels');
    $bar = new Term::ProgressBar($barmsg, 1) if Have_bar;
    my @channels = get_channels($regionid);
    update $bar if Have_bar;

    # Ask about each channel.
    Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-Ja', '-Ha', '-Ka', '-Ea');
    my @qs;
    foreach (@channels) {
	my $name = $_->{station};
	my $callsign = $_->{callsign};
	if (($lang ne 'ja_JP.UTF-8') and ($lang ne 'ja_JP.eucJP')) {
	    utf8::encode($name);
	    from_to($name, "utf8", "euc-jp"); # convert to EUC
	    $name = Text::Kakasi::do_kakasi($name); # convert to Romaji
	}
	my $msg = select_lang("「$name($callsign)」を追加しますか？ ",
				  "add $name($callsign)? ");
	push @qs, $msg;
    }
    Text::Kakasi::close_kanwadict();

    my @want = askManyBooleanQuestions(1, @qs);
    foreach (@channels) {
	my $w = shift @want;
	my $id = $_->{id};
	my $name = $_->{station};
	my $callsign = $_->{callsign};
	warn("cannot read input, stopping channel questions"), last
	    if not defined $w;

	# Print a config line, but comment it out if channel not wanted.
	print CONF '#' if not $w;
	print CONF "channel $id\n";
    }

    close CONF or warn "cannot close $config_file: $!";
    $msg = select_lang("設定完了.\n", "Finished\n");
    print $msg;

    exit();
}

# Not configuration, we must be writing something, either full
# listings or just channels.
#
die if $mode ne 'grab' and $mode ne 'list-channels';

# Options to be used for XMLTV::Writer.
my %w_args;
if (defined $opt_output) {
    my $fh = new IO::File(">$opt_output");
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}
$w_args{encoding} = 'UTF-8';
my $writer = new XMLTV::Writer(%w_args);
$writer->start($HEAD);

######################################################################
# Channel listings
if ($mode eq 'list-channels') {
    # Get region information
    my $barmsg = select_lang('地域情報を取得中',
			     'getting regions');
    my $bar = new Term::ProgressBar($barmsg, 1) if Have_bar;
    my %regions = get_regions();
    update $bar if Have_bar;

    # get channels for all region
    my @ch_regions = grep { $_ ne 'CATV' } sort keys %regions;
    $barmsg = select_lang('チャンネル情報を取得中',
			  'getting channels');
    $bar = new Term::ProgressBar($barmsg, scalar keys %regions)
	if Have_bar;
    foreach (@ch_regions) {
	my $regionid = $_;
	my @channels = get_channels($regionid);
	foreach my $ch (@channels) {
	    my $id = $ch->{'id'};
	    my $exist = 0;
	    foreach (@ch_all) {
		$exist = 1 if ($id eq $_->{'id'});
	    }
	    if (not $exist) {
		my $name = $ch->{'station'};
		my $callsign = $ch->{'callsign'};
		utf8::encode($name);
		push @ch_all, { 'id' => $id,
				'station' => $name,
				'callsign' => $callsign };
	    }
	}
	update $bar if Have_bar;
    }

    # Write channels mode.
    foreach (@ch_all) {
	write_channel($_);
    }
    $writer->end();
    exit();
}

# We are producing full listings.
die if $mode ne 'grab';

######################################################################
# begin main program

# Read channels from configuration, push them to @ch_all
my $regionid = '';
my @ch_conf;
my $line_num = 1;
foreach (@config_lines) {
    ++ $line_num;
    next if not defined;
    if (/^region:?\s+(\S+)/) {
	$regionid = $1;
    }
    elsif (/^channel:?\s+(\S+)/) {
	push @ch_conf, $1;
    }
    else {
	warn "$config_file:$line_num: bad line\n";
    }
}

my $ch_number = @ch_conf;
my $diemsg = select_lang("チャンネルが設定されていません．\n"
			 . "まず --configure オプションをつけて設定をしてください．\n",
			 "Channels are not configured. Run with --configure first.\n");
die $diemsg if ($ch_number == 0) or ($regionid eq '');

my $barmsg = select_lang('チャンネル情報を取得中',
			 'getting channels');
my $bar = new Term::ProgressBar($barmsg, 1) if Have_bar;
my @channels = get_channels($regionid);
update $bar if Have_bar;

foreach (@channels) {
    my $id = $_->{id};
    my $name = $_->{station};
    utf8::encode($name);
    my $callsign = $_->{callsign};
    foreach (@ch_conf) {
	if ($_ eq $id) {
	    push @ch_all, { 'id' => $id,
			    'station' => $name,
			    'callsign' => $callsign };
	}
    }
}

# the order in which we fetch the channels matters
write_channel($_) foreach (@ch_all);

# This progress bar is for both downloading and parsing.
$barmsg = select_lang('番組表を取得中',
		      'getting program info');
$bar = new Term::ProgressBar($barmsg, scalar @ch_all)
    if Have_bar && not $opt_quiet;
foreach (@ch_all) {
    process_table($_->{'id'});
    update $bar if Have_bar && not $opt_quiet;
}
$writer->end();

######################################################################
# subroutine definitions

# Use Log::TraceMessages if installed.
BEGIN {
    eval { require Log::TraceMessages };
    if ($@) {
	*t = sub {};
	*d = sub { '' };
    }
    else {
	*t = \&Log::TraceMessages::t;
	*d = \&Log::TraceMessages::d;
	Log::TraceMessages::check_argv();
    }
}

# write out channel data
sub write_channel {
    my ($ch) = @_;
    $writer->startTag('channel', 'id' =>$ch->{'id'});
    $writer->dataElement('display-name', $ch->{'station'}, 'lang'=>$XMLLANG);
    $writer->dataElement('display-name', $ch->{'callsign'}, lang => 'en');
    $writer->endTag('channel');
}

####
# process_table: fetch a URL and process it
#
# arguments:
#    id of channel
# returns: list of program hashes to write
#
sub process_table {
    my ($channel_id) = @_;

    my $data = get_table($channel_id);

    # parse the page to a document object
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($data);
    parse_program_table($channel_id, $tree);
}

# parse program table
sub parse_program_table {
    my ($channel, $tree) = @_;
    t "parse_program_table() ENTRY for tree: $tree";

    # hash for checking program continuity of the day
    my %date_changed;
    my %prog_time;

    # elements with <a> tag of which 'href' contains 'detail.php3'
    my @elems = $tree->look_down( '_tag','a');

    # parse by elements and push program infomation to @data
    foreach my $elem (@elems) {
	my $href = $elem->attr('href');
	next if not defined $href;
	next if not $href =~ m/detail.php3/;
	t 'doing elem: ' . d $elem;
	# get program infomation
	get_content($elem, $channel, \%date_changed, \%prog_time);
    }
}

sub get_content {
    my ($elem, $channel, $date_changed, $prog_time) = @_;
    my $p;

    # program title
    my $title = '';
    foreach $p ($elem->content_list()) {
	if (ref $p) {
	    # News/Weather icons
	    my $src = $p->attr('src');
	    if ( $src =~ m/n.gif/ ) {
		$title .= "[Ｎ]";
	    } elsif ( $src =~ m/w.gif/ ) {
		$title .= "[天]";
	    }
	} else {
	    # title
	    from_to($p, "euc-jp", "utf8"); # convert to utf-8
	    utf8::decode($p);
	    $title .= $p;
	}
    }
    $title =~ s/\s+/　/g;      # convert Hankaku spaces to Zenkaku
    utf8::encode($title);
    return undef if $title eq '';

    # contents
    my $desc = '';
    foreach $p ($elem->right()) {
	if (not ref $p) {
	    $p =~ s/^\s+//;
	    last if ($p =~ m/^\d\d:\d\d/); # if time is inserted, then next program
	    from_to($p, "euc-jp", "utf8"); # convert to utf-8
	    $desc .= $p;
	}
    }

    # air date
    my $hsid = $elem->attr('href');
    $hsid =~ m/hsid=(\d{8})(\d{4})(\d{3})/;
    return undef if (not defined $1) or (not defined $3) ;
    my $date = $1;
    my $prog_id = $3; # incremental program counter of the day

    # air time and genre
    my $time_genre = $elem->attr('title');
    $time_genre =~ m/^(\d\d:\d\d)-(\d\d:\d\d)\s*(\S*)/;
    my $starttime = $1;
    my $endtime = $2;
    my $genre = $3;
    from_to($genre, "euc-jp", "utf8"); # convert to UTF-8
    $genre =~ s/\/.*$//;   # delete after '/'
    return undef if (not defined $starttime) or (not defined $endtime);

    # start/end time
    $starttime =~ s/://;
    $endtime =~ s/://;

    my $startdate = $date;
    my $enddate = $date;
    # for the program over midnight
    if ($starttime > $endtime) {
	# real date of program end
	$enddate = UnixDate(DateCalc($date,"+ 1 day"), '%Q');
    }

    # date transition check
    if (not defined $$date_changed{$date}) {
	if ((not defined $$prog_time{$date}) or ($$prog_time{$date} < $starttime)) {
	    $$prog_time{$date} = $starttime;
	} else {
	    # real date is changed here
	    $$date_changed{$date} = $prog_id;
	}
    }

    # set real date
    if ((defined $$date_changed{$date}) and ($prog_id >= $$date_changed{$date})) {
	$startdate = UnixDate(DateCalc($date,"+ 1 day"), '%Q');
	$enddate = UnixDate(DateCalc($date,"+ 1 day"), '%Q');
    }

    # check the date for --days and --offset option
    my $today = ParseDate("today");
    my $output_sdate = UnixDate(DateCalc($today, "+ $opt_offset days"), '%Q');
    my $edays = $opt_offset + $opt_days - 1;
    my $output_edate = UnixDate(DateCalc($today, "+ $edays days"), '%Q');
    if ($opt_offset > 0 || $opt_days < 7) {
	return undef if ($startdate < $output_sdate);
	return undef if ($startdate > $output_edate);
    }

    # read string (Romaji) of title
    my $t = $title;
    utf8::decode($t);
    $t =~ s/\[.+\]//g;   # delete [NEWS] etc.
    $t =~ s/◇//g;       # delete '◇'
    $t =~ s/[0-9]//g;    # delete figures that represent time
			 # We cannot use \d here, because it's locale-aware and
			 # matches figures in wide charactors. Use [0-9] instead.
    utf8::encode($t);
    Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-JH', '-KH', '-aE');
    from_to($t, "utf8", "euc-jp"); # convert to EUC
    my $readstr = '';
    if ($t ne '') {
	$readstr=Text::Kakasi::do_kakasi($t);
	from_to($readstr, "euc-jp", "utf8"); # convert to UTF-8
    }
    Text::Kakasi::close_kanwadict();

    t "TITLE:$title($readstr) $date $starttime-$endtime DESC:$desc GENRE:$genre\n";

    # write out program
    my $start = $startdate . $starttime . "00 " . $TZ;
    my $stop = $enddate . $endtime . "00 " . $TZ;

    $writer->startTag('programme', 'start'=>$start, 'stop'=>$stop, 'channel'=>$channel);
    $writer->dataElement('title', $title, 'lang'=>$XMLLANG);
    $writer->dataElement('title', $readstr, 'lang'=>$XMLLANG . '@kana')
	if ($readstr ne '') and $opt_readstr;
    $desc =~ s/\s+$//;
    $writer->dataElement('desc', $desc, 'lang'=>$XMLLANG) if $desc ne '';
    $writer->dataElement('category', $genre, 'lang'=>$XMLLANG) if $genre ne '';
    if (($genre ne '') and (not $opt_no_cat_code)) {
	# Convert category code to English for MythTV's benefit.
	utf8::decode($genre);
	my $cat_code = $categories{$genre};
	$writer->dataElement('category', $cat_code, lang => 'en')
	    if (defined $cat_code and $cat_code ne '');
    }
    $writer->endTag('programme');

}

# get channel listing
sub get_channels {
    my ($regionid) = @_;
    my @channels;
    my $local_data = get_channel_table($regionid);

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($local_data);

    # channels elements are specially formatted <a> tags
    # with href="gridChannel.php..."
    my @ch_a_elems = $tree->look_down(_tag => 'a');

    # Zenkaku -> Hankaku conversion
    Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-Ea');

    # get channels
    while (@ch_a_elems) {
	my $elem = shift @ch_a_elems;
	my $href = $elem->attr('href');
	if ($href =~ m/^gridChannel.php\?tikicd=${regionid}&ch=(\d\d\d\d)/) {
	    my $channel_id = $1;
	    $channel_id .= $channel_ext;    # this is xmltv channel id
	    my $callsign = ($elem->content_list)[0];
	    $callsign = Text::Kakasi::do_kakasi($callsign);
	    $callsign =~ s/[\(\)]//g;
	    $elem = shift @ch_a_elems;
	    my $channel = ($elem->content_list)[0];
	    # convert to UTF-8
	    from_to($channel, "euc-jp", "utf8");
	    from_to($callsign, "euc-jp", "utf8");
	    utf8::decode($channel);
	    utf8::decode($callsign);
	    # special fix for NHK-ETV call sign
	    $callsign = 'ETV' if ($callsign eq 'NHK' and $channel eq 'ＮＨＫ教育');
	    # special fix for Housou Daigaku
	    $callsign = 'UAIR' if ($channel eq '放送大学');
	    push @channels, { 'id' => $channel_id,
			      'station' => $channel,
			      'callsign' => $callsign };
        }
    }
    Text::Kakasi::close_kanwadict();

    my $chnum = @channels;
    my $diemsg = select_lang("チャンネル情報の取得に失敗しました\n"
			     . "ネットワークの接続と地域番号を確認してください",
			     "Failed to get channel information.\n"
			     . "Please check if region id and network connections are correct.");
    die $diemsg if $chnum == 0;
    return @channels;
}

# get regions
sub get_regions {
    my %regions;
    my $local_data = get_area_list();

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($local_data);

    # area elements are formatted <a> tags
    # with href="javascript:sendUrl(0,'$id')"
    my @area_elems = $tree->look_down(_tag => 'a');

    # get channels
    foreach (@area_elems) {
	my $href = $_->attr('href');
	if ($href =~ m/^javascript:sendUrl.*'(.{3})'/) {
	    my $region_id = '0' . $1;
	    my $region_name = ($_->content_list)[0];
	    # convert to UTF-8
	    from_to($region_name, "euc-jp", "utf8");
	    utf8::decode($region_name);
	    $regions{$region_id} = $region_name;
	}
    }

    my $diemsg = select_lang("地域情報の取得に失敗しました\n"
			     . "ネットワークの接続を確認してください",
			     "Failed to get region information.\n"
			     . "Please check if network connections are correct.");
    die $diemsg if scalar keys %regions == 0;

    # for CATV station
    $regions{'CATV'} = 'CATV';

    return %regions;
}

# get program table
sub get_table {
    my ($channelid) = @_;
    my $ext = quotemeta $channel_ext;
    $channelid =~ s/$ext//;
    my $url = "$urlbase/program/gridChannel.php?ch=${channelid}&genre=all";
    my $content = XMLTV::Get_nice::get_nice($url);
    return $content;
}

# get channel table
sub get_channel_table {
    my ($regionid) = @_;
    # request channel data
    my $url = "$urlbase/program/gridChannel.php?tikicd=${regionid}";
    my $content = XMLTV::Get_nice::get_nice($url);
    return $content;
}

# get area list
sub get_area_list {
    # request area selection page
    my $url = "$urlbase/areachange/areaselect.php3?tv=1&force=1";
    my $content = XMLTV::Get_nice::get_nice($url);
    return $content;
}

# Select proper language and encoding of output message
sub select_lang {
    my ($ja, $eng) = @_;

    if ($lang eq 'ja_JP.UTF-8') {
	return $ja;
    } elsif ($lang eq 'ja_JP.eucJP') {
	utf8::encode($ja);
	from_to($ja, "utf8", "euc-jp"); # convert to EUC
	return $ja;
    } else {
	return $eng;
    }
}
