Module `pd3f.parsr_wrapper`

Wrapper to interact with parsr (using parsr's Python client)

Expand source code

"""Wrapper to interaction with parsr (using parsr's Python client)
"""

import importlib.resources
import json
import logging
import tempfile
from pathlib import Path

from parsr_client import ParsrClient as client

from .utils import update_dict, write_dict

logger = logging.getLogger(__name__)


def setup_config(config, adjust_cleaner_config, check_tables, fast):
    """Build the Parsr configuration for one run.

    The base configuration shipped with the package (``pd3fConfig.json``) is
    loaded, merged with ``config`` through ``update_dict`` and then its
    ``"cleaner"`` pipeline is adjusted.

    Args:
        config: Overrides that ``update_dict`` merges into the base config.
        adjust_cleaner_config: Iterable of ``[module_name, options]`` pairs.
            The options are merged into every cleaner module of the same name
            that is configured as a ``[name, options]`` list; modules given
            as a bare string are left untouched.
        check_tables: If false, list-style cleaner modules whose name contains
            ``"table-detection"`` are dropped from the pipeline.
        fast: If true, the ``"drawing-detection"`` and ``"image-detection"``
            modules are dropped from the pipeline.

    Returns:
        The resulting configuration dict.
    """
    # update base config of parsr
    with importlib.resources.path("pd3f", "pd3fConfig.json") as cfg_path:
        jdata = json.loads(cfg_path.read_text())
    jdata = update_dict(jdata, config)

    # Update parsr cleaner config since it's more complicated.
    # The cleaner consists of a pipeline, so we first have to find the matching module.
    # Then update its configuration.
    for new_cl in adjust_cleaner_config:
        for idx, cl in enumerate(jdata["cleaner"]):
            # Only `[name, options]` entries carry options that can be merged.
            if not isinstance(cl, list):
                continue
            if cl[0] != new_cl[0]:
                continue
            jdata["cleaner"][idx] = [cl[0], {**cl[1], **new_cl[1]}]

    if not check_tables:
        jdata["cleaner"] = [
            x
            for x in jdata["cleaner"]
            if isinstance(x, str) or "table-detection" not in x[0]
        ]

    if fast:
        # A pipeline entry is either a bare module name or `[name, options]`.
        # Compare on the module name so both forms are handled; the previous
        # `a and b or c` condition indexed into strings (`x[0]` is one
        # character) and therefore never removed "drawing-detection".
        skipped = {"drawing-detection", "image-detection"}
        jdata["cleaner"] = [
            x
            for x in jdata["cleaner"]
            if (x if isinstance(x, str) else x[0]) not in skipped
        ]
    return jdata


def run_parsr(
    file_path,
    out_dir=None,
    config={},
    adjust_cleaner_config=[],
    text=False,
    markdown=False,
    check_tables=False,
    fast=False,
    parsr_location="localhost:3001",
    **kwargs,
):
    """Wrapper to interact with parsr (using parsr's Python client)

    Sends ``file_path`` to the Parsr server at ``parsr_location``, waits for
    the result and optionally writes the outputs to disk.

    Args:
        file_path: Path of the document to send.
        out_dir: If not ``None``, results are written to
            ``out_dir/<stem of file_path>/`` (created when missing):
            ``data.json`` always, ``text.txt`` / ``text.md`` /
            ``table_<idx>.csv`` depending on the flags below.
        config: Overrides for the base Parsr config, see ``setup_config``.
        adjust_cleaner_config: Cleaner module overrides, see ``setup_config``.
        text: Also write the plain-text output (only with ``out_dir``).
        markdown: Also write the Markdown output (only with ``out_dir``).
        check_tables: Fetch the detected tables and return them as CSV.
        fast: Passed to ``setup_config`` to build a reduced pipeline.
        parsr_location: Address handed to the Parsr client.
        **kwargs: Accepted and ignored.

    Returns:
        A tuple ``(json_result, tables)`` where ``tables`` is a list of CSV
        strings, or ``None`` when ``check_tables`` is false.
    """
    parsr = client(parsr_location)

    parsr_config = setup_config(config, adjust_cleaner_config, check_tables, fast)

    # The config is handed to the client by file name, so it has to exist on
    # disk for the duration of the request.
    with tempfile.NamedTemporaryFile(mode="w+") as tmp_config:
        json.dump(parsr_config, tmp_config)
        tmp_config.flush()  # persist

        # TODO: when upgrading to v3.2, use file_path and config_path
        logger.info("sending PDF to Parsr")

        logger.debug(parsr_config)

        parsr.send_document(
            file=file_path,
            config=tmp_config.name,
            wait_till_finished=True,
            save_request_id=True,
            silent=False,
        )

    logger.info("got response from Parsr")

    # Serialize each table once; the CSV text is used both for the files
    # written below and for the return value.
    tables_csv = None
    if check_tables:
        # table gets returned as pandas DataFrame
        tables_csv = [
            parsr.get_table(page=page, table=table).to_csv()
            for page, table in parsr.get_tables_info()
        ]

    # Fetch the JSON result a single time instead of once per use.
    result = parsr.get_json()

    if out_dir is not None:
        out_dir = Path(out_dir) / Path(file_path).stem
        out_dir.mkdir(exist_ok=True, parents=True)

        # Extracted document text is frequently non-ASCII, so do not rely on
        # the platform's default encoding.
        if text:
            (out_dir / "text.txt").write_text(parsr.get_text(), encoding="utf-8")

        if markdown:
            (out_dir / "text.md").write_text(parsr.get_markdown(), encoding="utf-8")

        if check_tables:
            for idx, csv_text in enumerate(tables_csv):
                (out_dir / f"table_{idx}.csv").write_text(csv_text, encoding="utf-8")

        write_dict(result, out_dir / "data.json")

    return result, tables_csv

Functions

def run_parsr(file_path, out_dir=None, config={}, adjust_cleaner_config=[], text=False, markdown=False, check_tables=False, fast=False, parsr_location='localhost:3001', **kwargs)

Wrapper to interact with parsr (using parsr's Python client)

Expand source code

def run_parsr(
    file_path,
    out_dir=None,
    config={},
    adjust_cleaner_config=[],
    text=False,
    markdown=False,
    check_tables=False,
    fast=False,
    parsr_location="localhost:3001",
    **kwargs,
):
    """Wrapper to interact with parsr (using parsr's Python client)

    Sends ``file_path`` to the Parsr server at ``parsr_location``, waits for
    the result and optionally writes the outputs to disk.

    Args:
        file_path: Path of the document to send.
        out_dir: If not ``None``, results are written to
            ``out_dir/<stem of file_path>/`` (created when missing):
            ``data.json`` always, ``text.txt`` / ``text.md`` /
            ``table_<idx>.csv`` depending on the flags below.
        config: Overrides for the base Parsr config, see ``setup_config``.
        adjust_cleaner_config: Cleaner module overrides, see ``setup_config``.
        text: Also write the plain-text output (only with ``out_dir``).
        markdown: Also write the Markdown output (only with ``out_dir``).
        check_tables: Fetch the detected tables and return them as CSV.
        fast: Passed to ``setup_config`` to build a reduced pipeline.
        parsr_location: Address handed to the Parsr client.
        **kwargs: Accepted and ignored.

    Returns:
        A tuple ``(json_result, tables)`` where ``tables`` is a list of CSV
        strings, or ``None`` when ``check_tables`` is false.
    """
    parsr = client(parsr_location)

    parsr_config = setup_config(config, adjust_cleaner_config, check_tables, fast)

    # The config is handed to the client by file name, so it has to exist on
    # disk for the duration of the request.
    with tempfile.NamedTemporaryFile(mode="w+") as tmp_config:
        json.dump(parsr_config, tmp_config)
        tmp_config.flush()  # persist

        # TODO: when upgrading to v3.2, use file_path and config_path
        logger.info("sending PDF to Parsr")

        logger.debug(parsr_config)

        parsr.send_document(
            file=file_path,
            config=tmp_config.name,
            wait_till_finished=True,
            save_request_id=True,
            silent=False,
        )

    logger.info("got response from Parsr")

    # Serialize each table once; the CSV text is used both for the files
    # written below and for the return value.
    tables_csv = None
    if check_tables:
        # table gets returned as pandas DataFrame
        tables_csv = [
            parsr.get_table(page=page, table=table).to_csv()
            for page, table in parsr.get_tables_info()
        ]

    # Fetch the JSON result a single time instead of once per use.
    result = parsr.get_json()

    if out_dir is not None:
        out_dir = Path(out_dir) / Path(file_path).stem
        out_dir.mkdir(exist_ok=True, parents=True)

        # Extracted document text is frequently non-ASCII, so do not rely on
        # the platform's default encoding.
        if text:
            (out_dir / "text.txt").write_text(parsr.get_text(), encoding="utf-8")

        if markdown:
            (out_dir / "text.md").write_text(parsr.get_markdown(), encoding="utf-8")

        if check_tables:
            for idx, csv_text in enumerate(tables_csv):
                (out_dir / f"table_{idx}.csv").write_text(csv_text, encoding="utf-8")

        write_dict(result, out_dir / "data.json")

    return result, tables_csv

def setup_config(config, adjust_cleaner_config, check_tables, fast)

Expand source code

def setup_config(config, adjust_cleaner_config, check_tables, fast):
    """Build the Parsr configuration for one run.

    The base configuration shipped with the package (``pd3fConfig.json``) is
    loaded, merged with ``config`` through ``update_dict`` and then its
    ``"cleaner"`` pipeline is adjusted.

    Args:
        config: Overrides that ``update_dict`` merges into the base config.
        adjust_cleaner_config: Iterable of ``[module_name, options]`` pairs.
            The options are merged into every cleaner module of the same name
            that is configured as a ``[name, options]`` list; modules given
            as a bare string are left untouched.
        check_tables: If false, list-style cleaner modules whose name contains
            ``"table-detection"`` are dropped from the pipeline.
        fast: If true, the ``"drawing-detection"`` and ``"image-detection"``
            modules are dropped from the pipeline.

    Returns:
        The resulting configuration dict.
    """
    # update base config of parsr
    with importlib.resources.path("pd3f", "pd3fConfig.json") as cfg_path:
        jdata = json.loads(cfg_path.read_text())
    jdata = update_dict(jdata, config)

    # Update parsr cleaner config since it's more complicated.
    # The cleaner consists of a pipeline, so we first have to find the matching module.
    # Then update its configuration.
    for new_cl in adjust_cleaner_config:
        for idx, cl in enumerate(jdata["cleaner"]):
            # Only `[name, options]` entries carry options that can be merged.
            if not isinstance(cl, list):
                continue
            if cl[0] != new_cl[0]:
                continue
            jdata["cleaner"][idx] = [cl[0], {**cl[1], **new_cl[1]}]

    if not check_tables:
        jdata["cleaner"] = [
            x
            for x in jdata["cleaner"]
            if isinstance(x, str) or "table-detection" not in x[0]
        ]

    if fast:
        # A pipeline entry is either a bare module name or `[name, options]`.
        # Compare on the module name so both forms are handled; the previous
        # `a and b or c` condition indexed into strings (`x[0]` is one
        # character) and therefore never removed "drawing-detection".
        skipped = {"drawing-detection", "image-detection"}
        jdata["cleaner"] = [
            x
            for x in jdata["cleaner"]
            if (x if isinstance(x, str) else x[0]) not in skipped
        ]
    return jdata