#!/bin/sh
#
# Simple HTML-to-text converter, written for LDAS log
# files but generally usable.
#
# Reads HTML on stdin and writes plain text to stdout,
# so it behaves as a Unix filter.
#
# the next line restarts using -*-Tcl-*-sh \
exec tclsh "$0" ${1+"$@"}

;## htmlFilter -- reduce a chunk of HTML to plain text.
;##
;## Arguments:
;##   text   the HTML source to convert
;##
;## Returns:
;##   the converted text, in which
;##     - every <li> tag has become the bullet "  * "
;##     - every <img ...> tag has become "* "
;##     - every other tag has been deleted
;##     - backslash-escaped curly braces have lost their backslash
;##     - character references of the form &name; found in the
;##       table below, and numeric references (&#NNN; decimal or
;##       &#xHH; hex) inside the Basic Multilingual Plane, have
;##       been replaced by the character they stand for
;##   References that are not recognised are passed through
;##   unchanged instead of raising an error.
;##
;## The input is handled strictly as data.  It is never given to
;## subst or eval, so dollar signs, square brackets and
;## backslashes in the input cannot trigger variable, command or
;## backslash substitution.
proc htmlFilter { text } {

     ;## table of escape characters
     array set esc {
     lt     <    gt     >    quot   \"   ob     \x7b  cb    \x7d
     nbsp   \xa0 iexcl  \xa1 cent   \xa2 pound  \xa3 curren \xa4
     yen    \xa5 brvbar \xa6 sect   \xa7 uml    \xa8 copy   \xa9
     ordf   \xaa laquo  \xab not    \xac shy    \xad reg    \xae
     hibar  \xaf deg    \xb0 plusmn \xb1 sup2   \xb2 sup3   \xb3
     acute  \xb4 micro  \xb5 para   \xb6 middot \xb7 cedil  \xb8
     sup1   \xb9 ordm   \xba raquo  \xbb frac14 \xbc frac12 \xbd
     frac34 \xbe iquest \xbf Agrave \xc0 Aacute \xc1 Acirc  \xc2
     Atilde \xc3 Auml   \xc4 Aring  \xc5 AElig  \xc6 Ccedil \xc7
     Egrave \xc8 Eacute \xc9 Ecirc  \xca Euml   \xcb Igrave \xcc
     Iacute \xcd Icirc  \xce Iuml   \xcf ETH    \xd0 Ntilde \xd1
     Ograve \xd2 Oacute \xd3 Ocirc  \xd4 Otilde \xd5 Ouml   \xd6
     times  \xd7 Oslash \xd8 Ugrave \xd9 Uacute \xda Ucirc  \xdb
     Uuml   \xdc Yacute \xdd THORN  \xde szlig  \xdf agrave \xe0
     aacute \xe1 acirc  \xe2 atilde \xe3 auml   \xe4 aring  \xe5
     aelig  \xe6 ccedil \xe7 egrave \xe8 eacute \xe9 ecirc  \xea
     euml   \xeb igrave \xec iacute \xed icirc  \xee iuml   \xef
     eth    \xf0 ntilde \xf1 ograve \xf2 oacute \xf3 ocirc  \xf4
     otilde \xf5 ouml   \xf6 divide \xf7 oslash \xf8 ugrave \xf9
     uacute \xfa ucirc  \xfb uuml   \xfc yacute \xfd thorn  \xfe
     yuml   \xff amp    &    #013   \n
     }

     ;## special handler for list items
     regsub -all {<[Ll][Ii]>}          $text {  * } text
     ;## and for images
     regsub -all {<[Ii][Mm][Gg][^>]+>} $text {* }   text
     ;## all other tags just GO AWAY
     regsub -all {<[^>]+>}             $text {}     text
     ;## a backslash in front of a curly is dropped, leaving
     ;## the bare curly
     regsub -all {\\\}}                $text \}     text
     regsub -all {\\\{}                $text \{     text

     ;## replace html escape sequences with literals.  the text
     ;## is walked one reference at a time, left to right, and
     ;## each reference is resolved by plain array lookup, so
     ;## nothing taken from the input is ever evaluated.
     set refPattern {&([0-9a-z#]*);}
     set out        {}
     set next       0
     while { [ regexp -nocase -indices -start $next \
                      $refPattern $text ref name ] } {
        foreach { refBeg  refEnd  } $ref  { break }
        foreach { nameBeg nameEnd } $name { break }

        ;## copy the plain text that precedes this reference
        append out [ string range $text $next [ expr { $refBeg - 1 } ] ]

        ;## an empty name (as in "&;") yields an empty key
        set key [ string range $text $nameBeg $nameEnd ]

        if { [ info exists esc($key) ] } {
           ;## the table is consulted first, so its entries
           ;## (including #013, which maps to a newline) take
           ;## precedence over numeric decoding
           append out $esc($key)
        } else {
           ;## try a numeric reference; the digit count is
           ;## bounded so scan cannot overflow
           set code 0
           if { [ regexp {^#([0-9]{1,7})$} $key -> digits ] } {
              scan $digits %d code
           } elseif { [ regexp -nocase {^#x([0-9a-f]{1,6})$} \
                               $key -> digits ] } {
              scan $digits %x code
           }
           ;## stay inside the BMP and skip the surrogate range
           if { $code > 0 && $code <= 0xFFFF
                && ( $code < 0xD800 || $code > 0xDFFF ) } {
              append out [ format %c $code ]
           } else {
              ;## unknown or unusable reference: pass it through
              append out [ string range $text $refBeg $refEnd ]
           }
        }
        set next [ expr { $refEnd + 1 } ]
     }
     ;## whatever follows the last reference
     append out [ string range $text $next end ]

     ;## and ship it back!
     return $out
}

;## filter driver: slurp all of stdin, convert it, and write
;## the plain-text result (plus puts' newline) to stdout
set data [ read stdin ]
puts stdout [ htmlFilter $data ]
