#! /usr/bin/perl
##
#	HTML to Doc Text converter for 3Com PalmPilots
#	html2pdbtxt
#
#	Copyright (C) 1998  Paul J. Lucas
#	based on convert.pl by Christopher Heschong <chris@screwdriver.net>
#
#	This program is free software; you can redistribute it and/or modify
#	it under the terms of the GNU General Public License as published by
#	the Free Software Foundation; either version 2 of the License, or
#	(at your option) any later version.
#
#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.
#
#	You should have received a copy of the GNU General Public License
#	along with this program; if not, write to the Free Software
#	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
##

$HR = "-" x 10;			# \x97 = emdash on Pilot

########## You shouldn't have to change anything below this line. #############

use File::Basename;
use Getopt::Std;
use encoding 'utf8';

$me = basename( $0 );
$VERSION = '1.2.1';

sub usage {
	die "usage: $me [-b chars] [-t title] [-u URL] file.html [ file.txt ]\n       $me -v\n";
}

getopts( 'b:t:u:v' ) or usage;

###############################################################################
#	Input checks
###############################################################################

if ( $opt_v ) {
	print "$me version $VERSION\n";
	exit 0;
}

$bookmark = $opt_b ? $opt_b : '(*)';
die "$me: bookmark can not contain > character\n" if $bookmark =~ />/;

usage unless $#ARGV + 1 >= 1;
( $html_file, $txt_file ) = @ARGV;

open( INPUT, '<:encoding(utf8)', "$html_file" ) or die "$me: can not open $html_file for input\n";
$_ = join( '', <INPUT> );		# slurp up all of HTML
close( INPUT );

if ( $txt_file ) {
    open( OUTPUT, '>:encoding(iso-8859-1)', "$txt_file" ) or
	die "$me: can not open $txt_file for output\n";
    select OUTPUT;
}

###############################################################################
#	Subroutines
###############################################################################

sub encode {
	( $_ = $_[0] ) =~ s/./sprintf("%02X",ord($&))/egs;
	return "%PRE%$_%/PRE%";
}
sub decode {
	( $_ = $_[0] ) =~ s/[A-F0-9]{2}/pack('C',hex($&))/eg;
	return "\n$_\n";
}

###############################################################################
#	Do it!
###############################################################################

s!<PRE>(.+?)</PRE>!encode($1)!egs;	# "freeze" <PRE> blocks
s!\s+! !g;				# turn HTML into one big long string

unless ( $title = $opt_t ) {
	$title = $1 if /<TITLE>\s*([^\s].*[^\s])\s*<\/TITLE>/i;
     $title = $1 if /<TITLE>.*?<\/TITLE>/i;
}

##
# Convert various tags
##
s!<TITLE>(.*?)<\/TITLE>!\xF8$1!gi;
s!<BLOCKQUOTE.*?>!\n\n\t!gi;
s!</?(?:BR|CENTER|DIV|OPTION|SELECT|TABLE).*?>!\n!gi;
s!</?(?:ADDRESS|BLOCKQUOTE|DL|DT|H[1-6]|OL|P|UL).*?>!\n\n!gi;
s!<DD>!\n\t!gi;
s!<HR.*?>!\n$HR\n!gio;
s!<LI>!\n\x95 !gi;			# \x95 = bullet character on Pilot
s!<(SCRIPT|STYLE|TITLE).*?>.*?</\1>!!gi;
s!<TD.*?>! !gi;
s!<TR.*?>!\n!gi;
s!<.*?ALT="([^"]+?)".*?>![$1]!gi;	# extract text from ALT attributes

my $endmarker = "---(EOF)---\n";
s!<\/HTML>!$endmarker!gi;   # put end-of-file marker for each file in html archives

# Create bookmarks at <A NAME ...> tags for non-empty anchors!
$bookmarks = s!<A\s+NAME.*?>(.+?)<\/A>!$bookmark$1!gio;
s!<A\s+NAME.*?>(\w+?)!$bookmark$1!gio;

s!<.*?>!!g;				# strip all other HTML tags


##
# Clean up whitespace
##
s! +! !gm;				# collapse spaces
s!^ +!!gm;				# strip leading spaces
s!	 !	!g;			# tab-space -> tab
s!^[ 	]+$!!gm;			# strip lines of whitespace
s!^\n{2,}!\n!gm;			# collapse newlines
s!\s*%PRE%(.*?)%/PRE%\s*!decode($1)!egs; # "thaw" <PRE> blocks
s! +$!!gm;				# strip trailing spaces

# Convert numeric entity references
s!&#(\d+);!pack( 'c', $1 )!eg;
s!&#X([\dA-F]+);!pack( 'c', hex($1) )!eg;

# Convert character entity references
%char_entity = (
	'quot',		'"',
	'amp',		'&',
	'lt',		'<',
	'gt',		'>',
	'emsp',		"\x80", # em space (HTML 2.0)
	'sbquo',	"\x82", # single low-9 (bottom) quotation mark
	'fnof',		"\x83", # Florin or Guilder (currency)
	'bdquo',	"\x84", # double low-9 (bottom) quotation mark
	'hellip',	"\x85", # horizontal ellipsis
	'dagger',	"\x86", # dagger
	'Dagger',	"\x87", # double dagger
	'circ',		"\x88", # modifier letter circumflex accent
	'permil',	"\x89", # per mill sign
	'Scaron',	"\x8A", # latin capital letter S with caron
	'lsaquo',	"\x8B", # left single angle quotation mark
	'OElig',	"\x8C", # latin capital ligature OE
	'diams',	"\x8D", # diamond suit
	'clubs',	"\x8E", # club suit
	'hearts',	"\x8F", # heart suit
	'spades',	"\x90", # spade suit
	'lsquo',	"\x91", # left single quotation mark
	'rsquo',	"\x92", # right single quotation mark
	'ldquo',	"\x93", # left double quotation mark
	'rdquo',	"\x94", # right double quotation mark
	'endash',	"\x96", # dash the width of ensp (Lynx)
	'ndash',	"\x96", # dash the width of ensp (HTML 2.0)
	'emdash',	"\x97", # dash the width of emsp (Lynx)
	'mdash',	"\x97", # dash the width of emsp (HTML 2.0)
	'tilde',	"\x98", # small tilde
	'trade',	"\x99", # trademark sign (HTML 2.0)
	'scaron',	"\x9A", # latin small letter s with caron
	'rsaquo',	"\x8B", # right single angle quotation mark
	'oelig',	"\x9C", # latin small ligature oe
	'Yuml',		"\x9F", # latin capital letter Y with diaeresis
	'ensp',		"\xA0", # en space (HTML 2.0)
	'thinsp',	"\xA0", # thin space (Lynx)
	'nbsp',		"\xA0", # non breaking space
	'iexcl',	"\xA1", # inverted exclamation mark
	'cent',		"\xA2", # cent (currency)
	'pound',	"\xA3", # pound sterling (currency)
	'curren',	"\xA4", # general currency sign (currency)
	'yen',		"\xA5", # yen (currency)
	'brkbar',	"\xA6", # broken vertical bar (Lynx)
	'brvbar',	"\xA6", # broken vertical bar
	'sect',		"\xA7", # section sign
	'die',		"\xA8", # spacing dieresis (Lynx)
	'uml',		"\xA8", # spacing dieresis
	'copy',		"\xA9", # copyright sign
	'ordf',		"\xAA", # feminine ordinal indicator
	'laquo',	"\xAB", # angle quotation mark, left
	'not',		"\xAC", # negation sign
	'shy',		"\xAD", # soft hyphen
	'reg',		"\xAE", # circled R registered sign
	'hibar',	"\xAF", # spacing macron (Lynx)
	'macr',		"\xAF", # spacing macron
	'deg',		"\xB0", # degree sign
	'plusmn',	"\xB1", # plus-or-minus sign
	'sup2',		"\xB2", # superscript 2
	'sup3',		"\xB3", # superscript 3
	'acute',	"\xB4", # spacing acute
	'micro',	"\xB5", # micro sign
	'para',		"\xB6", # paragraph sign
	'middot',	"\xB7", # middle dot
	'cedil',	"\xB8", # spacing cedilla
	'sup1',		"\xB9", # superscript 1
	'ordm',		"\xBA", # masculine ordinal indicator
	'raquo',	"\xBB", # angle quotation mark, right
	'frac12',	"\xBC", # fraction 1/2
	'frac14',	"\xBD", # fraction 1/4
	'frac34',	"\xBE", # fraction 3/4
	'iquest',	"\xBF", # inverted question mark
	'Agrave',	"\xC0", # capital A, grave accent
	'Aacute',	"\xC1", # capital A, acute accent
	'Acirc',	"\xC2", # capital A, circumflex accent
	'Atilde',	"\xC3", # capital A, tilde
	'Auml',		"\xC4", # capital A, dieresis or umlaut mark
	'Aring',	"\xC5", # capital A, ring
	'AElig',	"\xC6", # capital AE diphthong (ligature)
	'Ccedil',	"\xC7", # capital C, cedilla
	'Egrave',	"\xC8", # capital E, grave accent
	'Eacute',	"\xC9", # capital E, acute accent
	'Ecirc',	"\xCA", # capital E, circumflex accent
	'Euml',		"\xCB", # capital E, dieresis or umlaut mark
	'Igrave',	"\xCC", # capital I, grave accent
	'Iacute',	"\xCD", # capital I, acute accent
	'Icirc',	"\xCE", # capital I, circumflex accent
	'Iuml',		"\xCF", # capital I, dieresis or umlaut mark
	'Dstrok',	"\xD0", # capital Eth, Icelandic (Lynx)
	'ETH',		"\xD0", # capital Eth, Icelandic
	'Ntilde',	"\xD1", # capital N, tilde
	'Ograve',	"\xD2", # capital O, grave accent
	'Oacute',	"\xD3", # capital O, acute accent
	'Ocirc',	"\xD4", # capital O, circumflex accent
	'Otilde',	"\xD5", # capital O, tilde
	'Ouml',		"\xD6", # capital O, dieresis or umlaut mark
	'times',	"\xD7", # multiplication sign
	'Oslash',	"\xD8", # capital O, slash
	'Ugrave',	"\xD9", # capital U, grave accent
	'Uacute',	"\xDA", # capital U, acute accent
	'Ucirc',	"\xDB", # capital U, circumflex accent
	'Uuml',		"\xDC", # capital U, dieresis or umlaut mark
	'Yacute',	"\xDD", # capital Y, acute accent
	'THORN',	"\xDE", # capital THORN, Icelandic
	'szlig',	"\xDF", # small sharp s, German (sz ligature)
	'agrave',	"\xE0", # small a, grave accent
	'aacute',	"\xE1", # small a, acute accent
	'acirc',	"\xE2", # small a, circumflex accent
	'atilde',	"\xE3", # small a, tilde
	'auml',		"\xE4", # small a, dieresis or umlaut mark
	'aring',	"\xE5", # small a, ring
	'aelig',	"\xE6", # small ae diphthong (ligature)
	'ccedil',	"\xE7", # small c, cedilla
	'egrave',	"\xE8", # small e, grave accent
	'eacute',	"\xE9", # small e, acute accent
	'ecirc',	"\xEA", # small e, circumflex accent
	'euml',		"\xEB", # small e, dieresis or umlaut mark
	'igrave',	"\xEC", # small i, grave accent
	'iacute',	"\xED", # small i, acute accent
	'icirc',	"\xEE", # small i, circumflex accent
	'iuml',		"\xEF", # small i, dieresis or umlaut mark
	'dstrok',	"\xF0", # small eth, Icelandic (Lynx)
	'eth',		"\xF0", # small eth, Icelandic
	'ntilde',	"\xF1", # small n, tilde
	'ograve',	"\xF2", # small o, grave accent
	'oacute',	"\xF3", # small o, acute accent
	'ocirc',	"\xF4", # small o, circumflex accent
	'otilde',	"\xF5", # small o, tilde
	'ouml',		"\xF6", # small o, dieresis or umlaut mark
	'divide',	"\xF7", # division sign
	'oslash',	"\xF8", # small o, slash
	'ugrave',	"\xF9", # small u, grave accent
	'uacute',	"\xFA", # small u, acute accent
	'ucirc',	"\xFB", # small u, circumflex accent
	'uuml',		"\xFC", # small u, dieresis or umlaut mark
	'yacute',	"\xFD", # small y, acute accent
	'thorn',	"\xFE", # small thorn, Icelandic
	'yuml',		"\xFF", # small y, dieresis or umlaut mark
);
s/&(\w+);/exists $char_entity{ $1 } ? $char_entity{ $1 } : ' '/eg;


# Converted UTF codes for non-matching ISO-8859-1
# Strip it down to basic ASCII
%utf_entity = (
	"\x{2019}",	"'",
	"\x{201c}",	'"',
	"\x{201d}",	'"',
	"\x{2026}",	"...",
        "\x{fffd}",     "",
);
s/(\X)/ exists $utf_entity{$1} ? $utf_entity{$1} : $1 /eg;

##
# Output Doc text
##
print "\xF8$title\n" if $title;
print "$opt_u\n" if $opt_u;
print "$HR\n" if $title || $opt_u;
print "$_\n";
print "<$bookmark>\n" if $bookmarks;