#!/bin/bash
#
# capture_ocr -d
#
# Find on-screen text and convert it to actual text using OCR (Optical
# Character Recognition).  The result is placed in the clipboard and,
# optionally, shown in a pop-up window.
#
# OPTIONS:
#    -d   Display the results in an X window editor
#         (the text is also available in the clipboard)
#
# Requires tesseract and imagemagick to be installed.
#
# Anthony Thyssen  10 July 2018
#
# Work out where this script lives and what it is called.
#   PROGDIR  - directory containing the script
#   PROGNAME - base name of the script, used as the prefix of all messages
#
# "type -P" performs the PATH search and prints only the resolved path, so
# no parsing of the "X is /path/X" text is needed and paths containing
# spaces survive.  When the lookup fails (for example the script was started
# as "bash script" and is neither executable nor on the PATH) fall back to
# $0 itself, so PROGNAME is never left empty.
PROGNAME=$(type -P "$0" 2>/dev/null)
[[ -n $PROGNAME ]] || PROGNAME=$0
PROGDIR=$(dirname -- "$PROGNAME")      # extract directory of program
PROGNAME=$(basename -- "$PROGNAME")    # base name of program

# Viewer selection:
#   XPAGER  - taken from the environment, "xless" when unset or empty
#   XEDITOR - taken from the environment, otherwise the same as XPAGER
XPAGER=${XPAGER:-xless}
: "${XEDITOR:=$XPAGER}"

# Space separated list of commands this script cannot run without.
DEPENDENCIES="convert tesseract xsel error_message $XEDITOR"

# cmd_found NAME
#   Succeeds (status 0) when "type -t" recognises NAME as something that can
#   be run, and fails otherwise.  Prints nothing.
# The function is only defined here when no command called cmd_found is
# already available.
if ! type -t cmd_found >/dev/null; then
  cmd_found() {
    type -t "$1" >/dev/null
  }
fi

# Check that every command the script depends on is available.
# The first missing one is reported on stderr and the script exits with
# status 10.
for dep in $DEPENDENCIES; do   # unquoted on purpose: the list is space separated
  cmd_found "$dep" && continue

  msg="$PROGNAME: Required program dependency \"$dep\" missing"
  echo >&2 "$msg"
  # error_message is itself one of the dependencies, so only call it when it
  # actually exists; otherwise the stderr line above is the only report.
  if cmd_found error_message; then
    error_message "$msg"
  fi
  exit 10
done

# Uncomment to have script pause while showing the image, before it is
# passed to tesseract for decoding.
#debug="-write show:"

# Grab and improve the image to be decoded.
#  * increase the size of the image so it decodes better.
#  * quantize (reduce colors) to a 2 colors, and make black and white.
#
# See Image Magick Examples for other things you can do to images.
#   http://www.imagemagick.org/Usage/
# For possible improvements to help OCR decoding in specific situations see...
#   http://community.aiim.org/blogs/richard-medina/2013/03/15
#   https://mathieularose.com/decoding-captchas/
#
# old method...
#convert x: -modulate 100,0 \
#         -resize 300% -set density 300 \
#         $debug png:- |
# new method...
# Capture -> OCR -> clean-up -> clipboard, all as one pipeline.
#
# Stage 1: ImageMagick reads the image from its "x:" source, enlarges it to
# 300% and tags it as 300 dpi, then reduces it to 2 colors with dithering
# turned off and normalizes the result.  $debug is deliberately unquoted so
# that, when it is set, it splits into separate option words.
convert x: \
        -resize 300% -set density 300 \
        +dither -colors 2 -normalize \
        $debug png:- |

  # Stage 2: Optical Character Recognition (OCR) using tesseract, reading
  # the PNG from stdin and writing the text to stdout.
  # NB: the --psm option defines how the text is arranged
  #   --psm 6   block of text
  #   --psm 7   single line
  # To limit which characters are recognised use, for example...
  #   -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ
  #   -c tessedit_char_whitelist=$(echo {A..Z} {a..z} {0..9} | tr -d ' ')
  #   -c tessedit_char_blacklist=0123456789
  tesseract --dpi 300 --psm 6 stdin stdout |

  # Stage 3: replace specific unicode output with the equivalent ASCII
  # (curly double quotes and the em-dash).  There are many ways of doing
  # this, but note that tr did not do it right:
  #   tr '“”—' '""-'
  sed 's/[“”]/"/g; s/—/-/g' |

  # Stage 4: save the result into the clipboard
  xsel -i -b

# Give up when the OCR produced nothing usable.
# The clipboard is read back and stripped of all whitespace; if nothing is
# left there is no text.  Counting lines instead would reject a single line
# that lacks a trailing newline, and would accept output made up of nothing
# but blank lines or a form-feed.
if [[ -z $(xsel -ob | tr -d '[:space:]') ]]; then
  echo >&2 "$PROGNAME: OCR found no text in selection"
  # Run in the background so the script can exit without waiting for it.
  error_message "$PROGNAME: OCR found no text in selection" &
  exit 10
fi

# Popup a window with the OCR text stored in the clipboard.
# "-d" is the option documented in the header of this script; "-e" is the
# spelling previously tested for here and is kept so existing callers still
# work.  $XEDITOR is left unquoted so a value that carries arguments is
# split into separate words.
case "$1" in
  -d|-e)
    xsel -ob | $XEDITOR -      # popup text window, reading from stdin
    ;;
esac