#!/bin/bash

# w3m-man
#
# Use wget, w3m and less to download and view man pages.
#
# Homepage:  https://github.com/sc0ttj/w3m-man
#
# Supports using $MANPAGER, $HTMLPAGER, $TERM_BROWSER, and
# falls back to w3m and less if these are not set.
#
# How it works:
#
# - saves manpage from online sources to $HOME/.w3m-manpages/
# - not proper man pages, just plain text output of websites
# - prettifies them a bit (removes junk headers, links, messages)
#
# Usage:
#
#  man <command>        # view man page in plain text in $PAGER
#  man -H <command>     # view man page as HTML in $BROWSER
#  man <command> --url  # print the URL of the man page to STDOUT

# TODO:
#
# - Display the path searched for manpages:
#
#   man --path
#
# - Display the location of a manpage rather than the manpage itself:
#
#   man -w command
#
# - Search for manpages containing a search string:
#
#   man -k "search_string"
#

if [ "$1" = "--help" ];then

  echo "# man (w3m-man)
# Use wget, w3m and less to download and view man pages
# See https://github.com/sc0ttj/w3m-man

Usage:

  man <command>         # (download and) print the man page

  man <command> --url   # print the URL from which the man
                        # page was/would be downloaded

  man --help            # print this help info

Where <command> is the command name you want to read about.

Examples:

  man diff                # view 'diff' command (section 1)

  man mount.8             # view 'mount' in section 8 (config stuff)

  man 8 mount             # same as above, but doesn't work with -H

  man -H mount            # view 'mount' man page as HTML using $BROWSER

  man <command> --url     # only print the URL from which the man page
                          # was/would be downloaded

Man pages are divided into sections, as follows:

 1. User: most user commands and programs.

 2. System: calls by the Linux kernel.

 3. Library: documents provided by the standard C library.

 4. Devices: documents various devices, most of which reside in /dev.

 5. Files: describes various file formats and filesystems and proc(5).

 7. Overviews, conventions, and miscellaneous.

 8. Superuser and system administration commands.

Checks the following URLs:

the Ubuntu or Debian man pages, also:

http://man.he.net/?topic=\${command}&section=\${section}
http://manpages.org/\${command}/\${section}
https://linux.die.net/man/\${section}/\${command}
https://www.mankier.com/\${section}/\${command}
https://man7.org/linux/man-pages/man\${section}/\${command}.\${section}.html
http://manpages.org/\${command}
http://man.he.net/?topic=\${command}&section=all
https://ss64.com/bash/\${command}.html
"
  exit 0
fi

old_man="$(command -v man)"
unset man

command="$1"
section="1"

DISTRO_COMPAT_VERSION="latest"
if [ -f /etc/DISTRO_SPECS ];then
  source /etc/DISTRO_SPECS
fi


# support GNU man -H option (to view as HTML)
if [ "$1" = "-H" ];then
  command="$2"
fi

# support GNU man syntax `man 1 mount`
re='^[1-8]+$'
if [[ $1 =~ $re ]] ; then
  command="${2}.$1"
  section="$1"
fi

# support GNU man syntax `man mount.1`, `man mount.2`, etc
section="${command//*\./}"
command="${command//\.*/}"
if [ "$command" = "$section" ];then
  section=1
fi

manpage_file="$HOME/.w3m-manpages/$command.$section"

# support env var $MANPAGER
pager=${MANPAGER:-$PAGER}
pager=${pager:-less -XR}
w3m='w3m -o auto_image=false -o display_image=false'
browser=${TERM_BROWSER:-$TERMBROWSER}
browser=${browser:-$w3m}
htmlpager=${HTMLPAGER:-w3m -dump}

# Remove some junk from the plain text files generated by w3m,
# such as headers, links to adverts, etc
function prettifier {
  url="$1"
  case "$url" in

    *'ubuntu.com'*|*'debian.org'*)
      sed \
        -e '1,5d' \
        -e 's/ bug$//' \
        -e 's/      / /g'
      ;;

    *'linux.die.net'*)
      sed -e '1d' -e 's/\[INS::INS\]//g' -e 's/    \[          \]//g' \
        -e 's/Site Search//g' 2>/dev/null | head -n -12
      ;;

    *mankier*)
      grep -vEi '  • [A-Z]|      □ [A-Z]' | sed -e '1,7d' -e 's/tldr.sh//g' \
        -e "s/$command /$command($section) /g"
      ;;

    *ss64*com*)
      sed '1,5d' 2>/dev/null
      ;;

    *'man7.org'*)
      sed -e '1,4d' -e '8,13d' -e 's/         top/            /g' \
        | grep -vE 'StatCounter' | head -n -15
    ;;

    *manpages*org*)
      sed -e '1,3d' \
        | head -n -14
      ;;

    *man*he*net*) cat - ;;

    *) cat - ;;
  esac
}

# create the config dir, if needed
[ ! -d $HOME/.w3m-manpages ] && mkdir -p $HOME/.w3m-manpages

# if we already have a proper man page, just print it and exit
if [ ! -z "$MANPATH" ] && [ "$(which groff)" != "" ] && [ "$1" != "-H" ] && [ "$2" != "--url" ];then
  usegroff=false
  groff_file=''
  zipped=false
  paths=$(echo "${MANPATH}" | tr ':' '\n')
  # for each path
  for dir in ./ $paths
  do
    zipped=false
    # let's find the man page file
    groff_file=$dir/man${section}/${command}.${section}
    # it might be gzipped
    [ ! -f $groff_file ] && groff_file=$dir/man${section}/${command}.${section}.gz && zipped=true
    # it might be in the current directory
    [ -f ./${command}.${section} ] && groff_file="./${command}.${section}"
    [ -f ./${command}.${section}.gz ] && groff_file="./${command}.${section}.gz" && zipped=true
    # if we found the file
    if [ -f $groff_file ];then
      # unpack it, if needed
      [ $zipped = true ] && zcat $groff_file > /tmp/unzipped && groff_file=/tmp/unzipped
      # now lets read the man page and exit
      groff -T utf8 -man $groff_file | $pager && exit 0
    fi
  done
fi

# if we already have the man page as plain text, just print it and exit
[ "$1" != "-H" ] && [ "$2" != "--url" ] && [ -f "$manpage_file" ] && cat "$manpage_file" | $pager && exit 0

# set a list of urls to check:

# these man pages cover the correct versions of the program you have installed,
# for the OS you're actually using (if using an Ubuntu or Debian based pup)
if [ "$DISTRO_BINARY_COMPAT" = "ubuntu" ];then
  URLS="https://manpages.ubuntu.com/manpages/${DISTRO_COMPAT_VERSION}/man${section}/${command}.${section}.html"
elif [ "$DISTRO_BINARY_COMPAT" = "debian" ];then
  URLS="https://manpages.debian.org/${DISTRO_COMPAT_VERSION}/${command}/${command}.${section}.en.html"
fi

# add the rest
URLS="$URLS
http://man.he.net/?topic=${command}&section=${section}
http://manpages.org/${command}/${section}
https://linux.die.net/man/${section}/${command}
https://www.mankier.com/${section}/${command}
https://man7.org/linux/man-pages/man${section}/${command}.${section}.html
http://manpages.org/${command}
http://man.he.net/?topic=${command}&section=all
https://ss64.com/bash/${command}.html
"
# for each url in the list
for url in ${URLS}
do
  # skip any empty urls
  [ "$url" != ""  ] || continue
  [ "$url" != " " ] || continue

  # crawl the url
  wget --timeout=2 --spider -S -o /tmp/"$command".html "$url"

  # get the status code
  grep -m1 'HTTP/[1-3].[0-9] [0-9][0-9][0-9]' /tmp/"$command".html | grep -E '200 OK' > /tmp/response

  # if HTTP status not 200, skip this url
  grep -q -m1 "200" /tmp/response || continue

  # print the URL if --url given
  [ "$2" = "--url" ]  && echo "$url" && exit 0

  # grab the URL contents as plain text, put it into a file
  timeout 2 $htmlpager "$url" | prettifier "$url" > "$manpage_file"

  # if file is empty, remove it, skip this url
  [ ! -s "$manpage_file" ] && rm "$manpage_file" && continue

  # check if we if we got a "not found" page
  notfound=false
  grep -qiE "^Couldn|Hmmm|Invalid characters|No matches for \"|t found manual page under category" "$manpage_file" && notfound=true

  # if we DID get a "not found" page, delete the man page, skip this url
  [ "$notfound" = true   ] && rm "$manpage_file" && continue

  # if man page not a file, skip this url
  [ ! -f "$manpage_file" ] && continue

  # if -H was given, print it out as HTML (like GNU man)
  if [ "$1" = "-H" ] ;then
    $browser "$url"
  fi

  #
  # add a footer to the man page
  #
  echo >> "$manpage_file"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >> "$manpage_file"
  echo >> "$manpage_file"
  echo "Generated by https://github.com/sc0ttj/w3m-manpage" >> "$manpage_file"
  echo >> "$manpage_file"
  echo "Source URL   $url" >> "$manpage_file"
  echo >> "$manpage_file"

  # man page created, dont need to do another url
  break
done

# if we still did not _create_ a file, see if we have local
# matching ones, in a different section
if [ ! -f "$manpage_file" ];then
  local_pages=$(ls $HOME/.w3m-manpages | grep "${command}.")
  if [ "$local_pages" != "" ];then
    if [ $(echo "$local_pages" | wc -l) = 1 ];then
      echo
      echo "Loading $local_pages" | tr -d '\n'
      echo
      sleep 2
      manpage_file="$HOME/.w3m-manpages/$local_pages"
    elif [ $(echo "$local_pages" | wc -l) -ge 2 ];then
      echo
      echo "Did you mean any of these?"
      echo
      echo "$local_pages"
      exit 1
    fi
  fi
fi

# if we have a man page, and didnt already print the html, print it here
[ "$1" != "-H" ] && [ -f "$manpage_file" ] \
  && cat "$manpage_file" | $pager && exit 0

# if _still_ nothing, try download it online from the next section
if [ ! -f "$manpage_file" ];then
  if [ -f /tmp/man_loopcount ] && [ $(wc -l /tmp/man_loopcount | cut -f1 -d' ') -ge 8 ];then
    rm /tmp/man_loopcount
    exit 1
  fi
  next_section=$(($section + 1))
  [ $next_section -eq 9 ] && next_section=1
  echo -n "."
  echo "." >> /tmp/man_loopcount
  exec $0 $next_section $command
  retval=$?
  [ $retval -eq 0 ] && rm /tmp/manprogress
  echo
fi

man="$old_man"