commit 0423d4ad321228d5991f3c355a32ca25f16bc105
parent 9e1a20422644554d418844520926e3d8619a085e
Author: krasjet
Date: 2020-08-07 08:15Z
xmeta: switch to getopt for better flexibility
Diffstat:
M | pdfxmeta/app.py | | | 182 | ++++++++++++++++++++++++++++++++++++++++++++++--------------------------------- |
1 file changed, 107 insertions(+), 75 deletions(-)
diff --git a/pdfxmeta/app.py b/pdfxmeta/app.py
@@ -1,78 +1,60 @@
"""The executable of pdfxmeta"""
-import argparse
+import getopt
import sys
import pdfxmeta
-from argparse import Namespace
+from getopt import GetoptError
+from typing import Optional, TextIO
from fitzutils import open_pdf
-from textwrap import indent, dedent
+from textwrap import indent
from pdfxmeta import dump_meta, dump_toml, extract_meta
-def getargs() -> Namespace:
- """parse commandline arguments"""
-
- app_desc = dedent("""
- pdfxmeta: extract metadata for a string in a pdf document.
-
- To use this command, first open up the pdf file your favorite pdf reader
- and find the string you want to search for. Then use
-
- $ pdfxmeta -p 1 in.pdf "Subsection One"
-
- to find the metadata, mainly the font attributes and bounding box, of lines
- containing the pattern "Subsection One" on page 1. Specifying a page number
- is optional but highly recommended, since it greatly reduces the ambiguity
- of matches and execution time.
-
- The output of this command can be directly copy-pasted to build a recipe
- file for pdftocgen. Alternatively, you could also use the --auto or -a flag
- to output a valid heading filter directly
-
- $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml
-
- where the argument of -a is the level of the heading filter, which in this
- case is 2.
- """)
- parser = argparse.ArgumentParser(
- description=app_desc,
- formatter_class=argparse.RawDescriptionHelpFormatter
- )
-
- parser.add_argument('input',
- metavar='in.pdf',
- help="path to the input pdf file")
- parser.add_argument('pattern',
- help="the pattern to search for (python regex)")
- parser.add_argument('-p', '--page',
- type=int,
- help="""specify the page in which the string occurs
- (1-based index)""")
- parser.add_argument('-i', '--ignore-case',
- action='store_true',
- help="""when flag is set, search will be
- case-insensitive""")
- parser.add_argument('-a', '--auto',
- metavar='level',
- type=int,
- const=1,
- nargs='?',
- help="""when flag is set, the output would be a valid
- heading filter of the specified level with the most
- common settings, directly usable by pdftocgen. the
- default level is 1""")
- parser.add_argument('-o', '--out',
- metavar="file",
- type=argparse.FileType('w'),
- default='-',
- help="""path to the output file. if this flag is not
- specified, the default is stdout""")
- parser.add_argument('-V', '--version',
- action='version',
- version='%(prog)s ' + pdfxmeta.__version__)
-
- return parser.parse_args()
+usage_s = """
+usage: pdfxmeta [options] doc.pdf [pattern]
+""".strip()
+
+help_s = """
+usage: pdfxmeta [options] doc.pdf [pattern]
+
+Extract the metadata for pattern in doc.pdf.
+
+To use this command, first open up the pdf file in your favorite pdf reader
+and find the text you want to search for. Then use
+
+ $ pdfxmeta -p 1 in.pdf "Subsection One"
+
+to find the metadata, mainly the font attributes and bounding box, of lines
+containing the pattern "Subsection One" on page 1. Specifying a page number is
+optional but highly recommended, since it greatly reduces the ambiguity of
+matches and execution time.
+
+The output of this command can be directly copy-pasted to build a recipe file
+for pdftocgen. Alternatively, you could also use the --auto or -a flag to
+output a valid heading filter directly
+
+ $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml
+
+where the argument of -a is the level of the heading filter, which in this case
+is 2.
+
+arguments
+ doc.pdf path to the input PDF document
+ [pattern] the pattern to search for (python regex). if not given,
+ dump the entire document
+
+options
+ -h, --help show help
+ -p PAGE, --page PAGE specify the page to search in (1-based index)
+ -i, --ignore-case when flag is set, search will be case-insensitive
+ -a level, --auto level when flag is set, the output would be a valid heading
+ filter of the specified heading level in default
+ settings. it is directly usable by pdftocgen.
+ -o file, --out file path to the output file. if this flag is not
+ specified, the default is stdout
+ -V, --version show version number
+""".strip()
def print_result(meta: str) -> str:
@@ -81,21 +63,71 @@ def print_result(meta: str) -> str:
def main():
- args = getargs()
-
- with open_pdf(args.input) as doc:
- meta = extract_meta(doc, args.pattern, args.page, args.ignore_case)
+ # parse arguments
+ try:
+ opts, args = getopt.gnu_getopt(
+ sys.argv[1:],
+ "hiVp:a:o:",
+ ["help", "ignore-case", "version", "page=", "auto=", "out="]
+ )
+ except GetoptError as e:
+ print(e, file=sys.stderr)
+ print(usage_s, file=sys.stderr)
+ sys.exit(2)
+
+ ignore_case: bool = False
+ page: Optional[int] = None
+ auto_level: Optional[int] = None
+ out: TextIO = sys.stdout
+
+ for o, a in opts:
+ if o in ("-i", "--ignore-case"):
+ ignore_case = True
+ elif o in ("-p", "--page"):
+ page = int(a)
+ elif o in ("-a", "--auto"):
+ auto_level = int(a)
+ elif o in ("-o", "--out"):
+ try:
+ out = open(a, "w")
+ except IOError as e:
+ print(e, file=sys.stderr)
+ sys.exit(1)
+ elif o in ("-V", "--version"):
+ print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr)
+ sys.exit()
+ elif o in ("-h", "--help"):
+ print(help_s, file=sys.stderr)
+ sys.exit()
+
+ argc = len(args)
+
+ if argc < 1:
+ print("error: no input pdf is given", file=sys.stderr)
+ print(usage_s, file=sys.stderr)
+ sys.exit(1)
+
+ path_in: str = args[0]
+ pattern: str = ""
+
+ if argc >= 2:
+ pattern = args[1]
+
+ # done parsing arguments
+
+ with open_pdf(path_in) as doc:
+ meta = extract_meta(doc, pattern, page, ignore_case)
# nothing found
if len(meta) == 0:
sys.exit(1)
# should we add \n between each output?
- addnl = not args.out.isatty()
+ addnl = not out.isatty()
- if args.auto:
+ if auto_level:
print('\n'.join(
- [dump_toml(m, args.auto, addnl) for m in meta]
- ), file=args.out)
+ [dump_toml(m, auto_level, addnl) for m in meta]
+ ), file=out)
else:
- print('\n'.join(map(print_result, meta)), file=args.out)
+ print('\n'.join(map(print_result, meta)), file=out)