Module `pd3f.doc_output`

Document represenation after extraction stuff from parsr

Expand source code

"""Document represenation after extraction stuff from parsr
"""

import logging
import re

from .dehyphen_wrapper import is_split_paragraph
from .string_utils import strip_spaces_line_end
from .utils import flatten

logger = logging.getLogger(__name__)


class DocumentOutput:
    def __init__(self, data, header, footer, order, lang):
        self.data = data or []
        self.header = header or []
        self.footer = footer or []
        self.order = order or []
        self.lang = lang
        self.merged_elements = {}

    def __getitem__(self, key):
        return self.data[key]

    def __setitem__(self, key, value):
        self.data[key] = value

    def get_element(self, elem_id):
        """Returns element from the data. Returns `None` if the element is not part out of the output anymore.
        """
        if elem_id in self.merged_elements:
            elem_id = self.merged_elements[elem_id]
        result = list(filter(lambda x: x.id == elem_id, self))

        # `result` may be empty if the elem was port of footer / header and is gone now (due to dudeplication)
        if len(result) == 1:
            return result[0]
        return None

    def get_first_of_type_on_page(self, find_types, idx_page):
        for ele_id in self.order[idx_page]:
            ele = self.get_element(ele_id)
            if ele is None:
                continue
            if ele.type in find_types:
                return ele
        return None

    def get_last_of_type_on_page(self, find_types, idx_page):
        for ele_id in reversed(self.order[idx_page]):
            ele = self.get_element(ele_id)
            if ele is None:
                continue
            if ele.type in find_types:
                return ele
        return None

    def reverse_page_break(self):
        """join paragraphs that were split between pages

        gets complicated when footnotes are not re-ordered
        """
        for idx, page in enumerate(self.order[:-1]):
            logger.info(f"reversing page break page #{idx}")
            last_element = self.get_last_of_type_on_page(("body", "heading"), idx)
            next_element = self.get_first_of_type_on_page(("body", "heading"), idx + 1)

            if last_element is None or next_element is None:
                logger.debug("some element is none, cannot test")
                continue

            if last_element.type == "heading" or next_element.type == "heading":
                logger.debug("some element is a header, cannot test")
                continue

            # It cannot contain newlines
            if last_element.ends_newline:
                logger.debug(
                    f"the last element has ends with a newline. Do not try to join with the next one."
                )
                continue

            fixed = is_split_paragraph(last_element, next_element, self.lang)
            if fixed is None:
                logger.debug("looks like a split paragraph")
                continue

            logger.debug("joining the following paragraphs")
            logger.debug(f"{last_element}\n{next_element}\n{fixed}")

            # set new paragraph
            self[self.data.index(last_element)] = fixed
            self.data.remove(next_element)
            self.merged_elements[next_element.id] = last_element.id

    def reorder_footnotes(self):
        new_data, all_footsnotes = [], []
        for element in self:
            if element.type == "footnotes":
                all_footsnotes.append(element)
            else:
                new_data.append(element)

        self.data = new_data + all_footsnotes

    def markdown(self):
        return self.text(markdown=True)

    def text(self, markdown=False):
        txt = ""

        txt += "\n\n".join([str(x) for x in flatten(self.header)])

        for element in self:
            if markdown and element.type == "heading":
                # prepend dashes
                txt += "#" * element.level + " "
            txt += str(element)

        txt += "\n\n".join([str(x) for x in flatten(self.footer)])

        # hotfix, there are sometimes too many newlines
        txt = re.sub(r"(\n){3,}", "\n\n", txt)
        return txt


class Element:
    def __init__(
        self,
        element_type,
        lines,
        element_id,
        idx_page=None,
        num_newlines=0,
        level=None,
        ends_newline=None,
    ):
        assert element_type in ("body", "heading", "footnotes")
        self.type = element_type
        self.lines = lines
        self.id = element_id
        self.level = level
        self.idx_page = idx_page
        self.num_newlines = num_newlines
        self.ends_newline = ends_newline

        for x in lines:
            assert len(x) > 0

    def __getitem__(self, key):
        return self.lines[key]

    def __str__(self):
        if self.type == "footnotes":
            # In some cases, there were unnesary spaces added befor newlines.
            # So this should generally not happend in the first place.
            fixed_lines = [strip_spaces_line_end(" ".join(line)) for line in self.lines]
            return "".join(fixed_lines) + "\n"

        return "".join([" ".join(line) for line in self.lines]) + "\n\n"

    def __add__(self, other_element):
        assert self.type == other_element.type
        self.lines += other_element.lines
        return self

    def __len__(self):
        return len(self.lines)

Classes

class DocumentOutput (data, header, footer, order, lang)

Expand source code

class DocumentOutput:
    def __init__(self, data, header, footer, order, lang):
        self.data = data or []
        self.header = header or []
        self.footer = footer or []
        self.order = order or []
        self.lang = lang
        self.merged_elements = {}

    def __getitem__(self, key):
        return self.data[key]

    def __setitem__(self, key, value):
        self.data[key] = value

    def get_element(self, elem_id):
        """Returns element from the data. Returns `None` if the element is not part out of the output anymore.
        """
        if elem_id in self.merged_elements:
            elem_id = self.merged_elements[elem_id]
        result = list(filter(lambda x: x.id == elem_id, self))

        # `result` may be empty if the elem was port of footer / header and is gone now (due to dudeplication)
        if len(result) == 1:
            return result[0]
        return None

    def get_first_of_type_on_page(self, find_types, idx_page):
        for ele_id in self.order[idx_page]:
            ele = self.get_element(ele_id)
            if ele is None:
                continue
            if ele.type in find_types:
                return ele
        return None

    def get_last_of_type_on_page(self, find_types, idx_page):
        for ele_id in reversed(self.order[idx_page]):
            ele = self.get_element(ele_id)
            if ele is None:
                continue
            if ele.type in find_types:
                return ele
        return None

    def reverse_page_break(self):
        """join paragraphs that were split between pages

        gets complicated when footnotes are not re-ordered
        """
        for idx, page in enumerate(self.order[:-1]):
            logger.info(f"reversing page break page #{idx}")
            last_element = self.get_last_of_type_on_page(("body", "heading"), idx)
            next_element = self.get_first_of_type_on_page(("body", "heading"), idx + 1)

            if last_element is None or next_element is None:
                logger.debug("some element is none, cannot test")
                continue

            if last_element.type == "heading" or next_element.type == "heading":
                logger.debug("some element is a header, cannot test")
                continue

            # It cannot contain newlines
            if last_element.ends_newline:
                logger.debug(
                    f"the last element has ends with a newline. Do not try to join with the next one."
                )
                continue

            fixed = is_split_paragraph(last_element, next_element, self.lang)
            if fixed is None:
                logger.debug("looks like a split paragraph")
                continue

            logger.debug("joining the following paragraphs")
            logger.debug(f"{last_element}\n{next_element}\n{fixed}")

            # set new paragraph
            self[self.data.index(last_element)] = fixed
            self.data.remove(next_element)
            self.merged_elements[next_element.id] = last_element.id

    def reorder_footnotes(self):
        new_data, all_footsnotes = [], []
        for element in self:
            if element.type == "footnotes":
                all_footsnotes.append(element)
            else:
                new_data.append(element)

        self.data = new_data + all_footsnotes

    def markdown(self):
        return self.text(markdown=True)

    def text(self, markdown=False):
        txt = ""

        txt += "\n\n".join([str(x) for x in flatten(self.header)])

        for element in self:
            if markdown and element.type == "heading":
                # prepend dashes
                txt += "#" * element.level + " "
            txt += str(element)

        txt += "\n\n".join([str(x) for x in flatten(self.footer)])

        # hotfix, there are sometimes too many newlines
        txt = re.sub(r"(\n){3,}", "\n\n", txt)
        return txt

Methods

def get_element(self, elem_id)

Returns element from the data. Returns None if the element is not part out of the output anymore.

Expand source code

def get_element(self, elem_id):
    """Returns element from the data. Returns `None` if the element is not part out of the output anymore.
    """
    if elem_id in self.merged_elements:
        elem_id = self.merged_elements[elem_id]
    result = list(filter(lambda x: x.id == elem_id, self))

    # `result` may be empty if the elem was port of footer / header and is gone now (due to dudeplication)
    if len(result) == 1:
        return result[0]
    return None

def get_first_of_type_on_page(self, find_types, idx_page)

Expand source code

def get_first_of_type_on_page(self, find_types, idx_page):
    for ele_id in self.order[idx_page]:
        ele = self.get_element(ele_id)
        if ele is None:
            continue
        if ele.type in find_types:
            return ele
    return None

def get_last_of_type_on_page(self, find_types, idx_page)

Expand source code

def get_last_of_type_on_page(self, find_types, idx_page):
    for ele_id in reversed(self.order[idx_page]):
        ele = self.get_element(ele_id)
        if ele is None:
            continue
        if ele.type in find_types:
            return ele
    return None

def markdown(self)

Expand source code

def markdown(self):
    return self.text(markdown=True)

def reorder_footnotes(self)

Expand source code

def reorder_footnotes(self):
    new_data, all_footsnotes = [], []
    for element in self:
        if element.type == "footnotes":
            all_footsnotes.append(element)
        else:
            new_data.append(element)

    self.data = new_data + all_footsnotes

def reverse_page_break(self)

join paragraphs that were split between pages

gets complicated when footnotes are not re-ordered

Expand source code

def reverse_page_break(self):
    """join paragraphs that were split between pages

    gets complicated when footnotes are not re-ordered
    """
    for idx, page in enumerate(self.order[:-1]):
        logger.info(f"reversing page break page #{idx}")
        last_element = self.get_last_of_type_on_page(("body", "heading"), idx)
        next_element = self.get_first_of_type_on_page(("body", "heading"), idx + 1)

        if last_element is None or next_element is None:
            logger.debug("some element is none, cannot test")
            continue

        if last_element.type == "heading" or next_element.type == "heading":
            logger.debug("some element is a header, cannot test")
            continue

        # It cannot contain newlines
        if last_element.ends_newline:
            logger.debug(
                f"the last element has ends with a newline. Do not try to join with the next one."
            )
            continue

        fixed = is_split_paragraph(last_element, next_element, self.lang)
        if fixed is None:
            logger.debug("looks like a split paragraph")
            continue

        logger.debug("joining the following paragraphs")
        logger.debug(f"{last_element}\n{next_element}\n{fixed}")

        # set new paragraph
        self[self.data.index(last_element)] = fixed
        self.data.remove(next_element)
        self.merged_elements[next_element.id] = last_element.id

def text(self, markdown=False)

Expand source code

def text(self, markdown=False):
    txt = ""

    txt += "\n\n".join([str(x) for x in flatten(self.header)])

    for element in self:
        if markdown and element.type == "heading":
            # prepend dashes
            txt += "#" * element.level + " "
        txt += str(element)

    txt += "\n\n".join([str(x) for x in flatten(self.footer)])

    # hotfix, there are sometimes too many newlines
    txt = re.sub(r"(\n){3,}", "\n\n", txt)
    return txt

class Element (element_type, lines, element_id, idx_page=None, num_newlines=0, level=None, ends_newline=None)

Expand source code

class Element:
    def __init__(
        self,
        element_type,
        lines,
        element_id,
        idx_page=None,
        num_newlines=0,
        level=None,
        ends_newline=None,
    ):
        assert element_type in ("body", "heading", "footnotes")
        self.type = element_type
        self.lines = lines
        self.id = element_id
        self.level = level
        self.idx_page = idx_page
        self.num_newlines = num_newlines
        self.ends_newline = ends_newline

        for x in lines:
            assert len(x) > 0

    def __getitem__(self, key):
        return self.lines[key]

    def __str__(self):
        if self.type == "footnotes":
            # In some cases, there were unnesary spaces added befor newlines.
            # So this should generally not happend in the first place.
            fixed_lines = [strip_spaces_line_end(" ".join(line)) for line in self.lines]
            return "".join(fixed_lines) + "\n"

        return "".join([" ".join(line) for line in self.lines]) + "\n\n"

    def __add__(self, other_element):
        assert self.type == other_element.type
        self.lines += other_element.lines
        return self

    def __len__(self):
        return len(self.lines)