#!/bin/sh -- # A comment mentioning perl
eval 'exec perl -S $0 ${1+"$@"}'
    if 0;
#
# dehtml - crude HTML-to-text filter.
#
# Usage: dehtml [-f] [<file> ...]
#
#   -f    pipe the resulting text through fmt(1); if fmt cannot be
#         started the text is written to standard output unformatted.
#
# Input is read from the named files, or from standard input when no
# file is given.
#
# Start-up trick: /bin/sh treats line 1 as a comment and runs the
# "exec perl" on line 2, which must end at the newline (otherwise sh
# would hand "if 0;" to perl as arguments).  Perl parses lines 2-3 as
# the single statement "eval '...' if 0;" and never runs it.  The word
# "perl" on the #! line stops perl from re-dispatching to /bin/sh.
#
# The replacement characters in the entity table below are non-ASCII
# literals.  There is deliberately no "use utf8" and no I/O layer, so
# they are emitted as the same bytes that are stored in this file.

use strict;
use warnings;

# Try to remove nested <!-- <!-- ... --> --> comments.  Set to 0 to
# strip only simple, non-nested <!-- ... --> comments.
my $NESTED_COMMENTS = 1;

# Ordered list of [ entity-name alternation, replacement text ].
# "&amp;" must stay LAST: decoding it first would turn "&amp;copy;"
# into "&copy;" and then wrongly into "(c)".
my @ENTITY_MAP = (
    [ 'nbsp|#160'        => ' ' ],
    [ 'lt'               => '<' ],
    [ 'gt'               => '>' ],
    # The real entity is "&quot;"; the historical spelling "&quote;"
    # is still accepted.
    [ 'quote?|#34'       => '"' ],
    [ 'cent|#162'        => '¢' ],
    [ 'pound|#163'       => '£' ],
    [ 'copy|#169'        => '(c)' ],
    [ 'reg|#174'         => '(R)' ],
    [ 'deg|#176'         => '°' ],
    [ 'trade|#8482|#153' => '[tm]' ],
    [ 'middot|#183'      => '·' ],
    [ 'frac14|#188'      => '¼' ],
    [ 'frac12|#189'      => '½' ],
    [ 'frac34|#190'      => '¾' ],
    [ 'divide|#247'      => '÷' ],
    [ 'ntilde|#241'      => 'ñ' ],
    [ 'amp'              => '&' ],
);

# Run as a program only when executed directly, so the subs below can
# also be loaded (require/do) without touching STDIN/STDOUT.
main() unless caller;

# Parse the command line, filter the input and write the result.
sub main {
    my $use_fmt = 0;
    if ( @ARGV && $ARGV[0] eq '-f' ) {
        shift @ARGV;    # explicit: a bare shift inside a sub shifts @_
        $use_fmt = 1;
    }

    my $text = dehtml( join( '', <> ), $NESTED_COMMENTS );

    my $fmt_fh;
    if ( $use_fmt && open( $fmt_fh, '|-', 'fmt' ) ) {
        print {$fmt_fh} $text;
        close($fmt_fh)
            or warn "dehtml: fmt did not finish cleanly (status $?)\n";
    }
    else {
        print STDOUT $text;
    }
    return;
}

# Convert an HTML document to plain text.
#
#   $html    the complete document as one string
#   $nested  optional; true (the default) removes nested comments via
#            remove_nested_html_comments(), false removes only simple
#            <!-- ... --> comments
#
# Returns the plain-text rendering.
sub dehtml {
    my ( $html, $nested ) = @_;
    $nested = 1 unless defined $nested;

    my $all = $html;
    $all =~ s/\r//g;
    if ( $all !~ /\n$/ ) {
        $all .= "\n\n";
    }

    # Remove blocks of unwanted stuff.
    if ($nested) {
        $all = remove_nested_html_comments($all);
    }
    else {
        $all =~ s/<!--.*?-->//sg;
    }

    # NOTE(review): the next two patterns were lost from the copy of
    # the script this was restored from (only "s,,,sig" survived).
    # Script and style blocks are the presumed targets -- confirm.
    $all =~ s{<script\b.*?</script\s*>}{}sig;
    $all =~ s{<style\b.*?</style\s*>}{}sig;

    # Simple formatting.
    $all =~ s/<li>/\n\t/ig;
    $all =~ s/<p>/\n\n/ig;
    $all =~ s/<br>/\n/ig;

    # NOTE(review): pattern lost as above (only "s,,*,ig" survived);
    # presumably bold markers rendered as *text* -- confirm.
    $all =~ s{</?b>}{*}ig;

    # Remove the remaining html tags.
    $all =~ s/<[^>]*>//g;

    # Expand some of the &foo; items (trailing ';' is optional).
    for my $entry (@ENTITY_MAP) {
        my ( $names, $replacement ) = @{$entry};
        $all =~ s/&(?:$names);?/$replacement/ig;
    }

    # Empty out lines that contain only blanks and tabs.
    $all =~ s/^[ \t]*$//mg;

    # Collapse any run of three or more newlines to exactly three,
    # i.e. at most two blank lines in a row.
    $all =~ s/\n\n(\n)+/\n\n\n/sg;

    return $all;
}

# Crude removal of (possibly nested) HTML comments.
#
# Comment openers and closers are replaced by the single characters
# '%' and '=' so that an innermost comment can be matched with a
# simple negated character class; innermost comments are then deleted
# repeatedly until none is left.  Literal '%' and '=' in the input are
# protected with PID-tagged placeholder tokens and restored at the end.
#
#   $html   text to clean
#
# Returns the text with every properly closed comment removed.  An
# unmatched opener is dropped (the text after it is kept); an unmatched
# closer is put back as "-->".
sub remove_nested_html_comments {
    my ($html) = @_;

    # Build the tokens outside the regexes: writing "$$_" inside a
    # pattern is parsed as a dereference of $_, not as the PID.
    my $pid           = $$;
    my $percent_token = "___QUOTE_PERCNT_${pid}_";
    my $equals_token  = "___QUOTE_EQUALS_${pid}_";

    $html =~ s/%/$percent_token/g;
    $html =~ s/=/$equals_token/g;

    $html =~ s/<\s*!--/%/g;
    $html =~ s/--\s*>/=/g;

    # Each pass strips the innermost comments; repeat until stable.
    1 while $html =~ s/%[^%=]*=//g;

    $html =~ s/%//g;       # openers that were never closed
    $html =~ s/=/-->/g;    # closers that were never opened

    $html =~ s/\Q$percent_token\E/%/g;
    $html =~ s/\Q$equals_token\E/=/g;

    return $html;
}

1;