commit 0423d4ad321228d5991f3c355a32ca25f16bc105
parent 9e1a20422644554d418844520926e3d8619a085e
Author: krasjet
Date: 2020-08-07 08:15Z

xmeta: switch to getopt for better flexibility

Diffstat:
Mpdfxmeta/app.py | 182++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
1 file changed, 107 insertions(+), 75 deletions(-)

diff --git a/pdfxmeta/app.py b/pdfxmeta/app.py @@ -1,78 +1,60 @@ """The executable of pdfxmeta""" -import argparse +import getopt import sys import pdfxmeta -from argparse import Namespace +from getopt import GetoptError +from typing import Optional, TextIO from fitzutils import open_pdf -from textwrap import indent, dedent +from textwrap import indent from pdfxmeta import dump_meta, dump_toml, extract_meta -def getargs() -> Namespace: - """parse commandline arguments""" - - app_desc = dedent(""" - pdfxmeta: extract metadata for a string in a pdf document. - - To use this command, first open up the pdf file your favorite pdf reader - and find the string you want to search for. Then use - - $ pdfxmeta -p 1 in.pdf "Subsection One" - - to find the metadata, mainly the font attributes and bounding box, of lines - containing the pattern "Subsection One" on page 1. Specifying a page number - is optional but highly recommended, since it greatly reduces the ambiguity - of matches and execution time. - - The output of this command can be directly copy-pasted to build a recipe - file for pdftocgen. Alternatively, you could also use the --auto or -a flag - to output a valid heading filter directly - - $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml - - where the argument of -a is the level of the heading filter, which in this - case is 2. - """) - parser = argparse.ArgumentParser( - description=app_desc, - formatter_class=argparse.RawDescriptionHelpFormatter - ) - - parser.add_argument('input', - metavar='in.pdf', - help="path to the input pdf file") - parser.add_argument('pattern', - help="the pattern to search for (python regex)") - parser.add_argument('-p', '--page', - type=int, - help="""specify the page in which the string occurs - (1-based index)""") - parser.add_argument('-i', '--ignore-case', - action='store_true', - help="""when flag is set, search will be - case-insensitive""") - parser.add_argument('-a', '--auto', - metavar='level', - type=int, - const=1, - nargs='?', - help="""when flag is set, the output would be a valid - heading filter of the specified level with the most - common settings, directly usable by pdftocgen. the - default level is 1""") - parser.add_argument('-o', '--out', - metavar="file", - type=argparse.FileType('w'), - default='-', - help="""path to the output file. if this flag is not - specified, the default is stdout""") - parser.add_argument('-V', '--version', - action='version', - version='%(prog)s ' + pdfxmeta.__version__) - - return parser.parse_args() +usage_s = """ +usage: pdfxmeta [options] doc.pdf [pattern] +""".strip() + +help_s = """ +usage: pdfxmeta [options] doc.pdf [pattern] + +Extract the metadata for pattern in doc.pdf. + +To use this command, first open up the pdf file your favorite pdf reader and +find the text you want to search for. Then use + + $ pdfxmeta -p 1 in.pdf "Subsection One" + +to find the metadata, mainly the font attributes and bounding box, of lines +containing the pattern "Subsection One" on page 1. Specifying a page number is +optional but highly recommended, since it greatly reduces the ambiguity of +matches and execution time. + +The output of this command can be directly copy-pasted to build a recipe file +for pdftocgen. Alternatively, you could also use the --auto or -a flag to +output a valid heading filter directly + + $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml + +where the argument of -a is the level of the heading filter, which in this case +is 2. + +arguments + doc.pdf path to the input PDF document + [pattern] the pattern to search for (python regex). if not given, + dump the entire document + +options + -h, --help show help + -p PAGE, --page PAGE specify the page to search for (1-based index) + -i, --ignore-case when flag is set, search will be case-insensitive + -a level, --auto level when flag is set, the output would be a valid heading + filter of the specified heading level in default + settings. it is directly usable by pdftocgen. + -o file, --out file path to the output file. if this flag is not + specified, the default is stdout + -V, --version show version number +""".strip() def print_result(meta: str) -> str: @@ -81,21 +63,71 @@ def print_result(meta: str) -> str: def main(): - args = getargs() - - with open_pdf(args.input) as doc: - meta = extract_meta(doc, args.pattern, args.page, args.ignore_case) + # parse arguments + try: + opts, args = getopt.gnu_getopt( + sys.argv[1:], + "hiVp:a:o:", + ["help", "ignore-case", "version", "page=", "auto=", "out="] + ) + except GetoptError as e: + print(e, file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(2) + + ignore_case: bool = False + page: Optional[int] = None + auto_level: Optional[int] = None + out: TextIO = sys.stdout + + for o, a in opts: + if o in ("-i", "--ignore-case"): + ignore_case = True + elif o in ("-p", "--page"): + page = int(a) + elif o in ("-a", "--auto"): + auto_level = int(a) + elif o in ("-o", "--out"): + try: + out = open(a, "w") + except IOError as e: + print(e, file=sys.stderr) + sys.exit(1) + elif o in ("-V", "--version"): + print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr) + sys.exit() + elif o in ("-h", "--help"): + print(help_s, file=sys.stderr) + sys.exit() + + argc = len(args) + + if argc < 1: + print("error: no input pdf is given", file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(1) + + path_in: str = args[0] + pattern: str = "" + + if argc >= 2: + pattern = args[1] + + # done parsing arguments + + with open_pdf(path_in) as doc: + meta = extract_meta(doc, pattern, page, ignore_case) # nothing found if len(meta) == 0: sys.exit(1) # should we add \n between each output? - addnl = not args.out.isatty() + addnl = not out.isatty() - if args.auto: + if auto_level: print('\n'.join( - [dump_toml(m, args.auto, addnl) for m in meta] - ), file=args.out) + [dump_toml(m, auto_level, addnl) for m in meta] + ), file=out) else: - print('\n'.join(map(print_result, meta)), file=args.out) + print('\n'.join(map(print_result, meta)), file=out)