Module pd3f.doc_info
Statistics and information about document elements.
Expand source code
"""Statistics and information about document elements.
"""
import logging
from collections import Counter
from statistics import median
from textdistance import jaccard
from cleantext import clean, fix_bad_unicode
from .dehyphen_wrapper import single_score
from .geometry import sim_bbox
from .utils import flatten
logger = logging.getLogger(__name__)
def avg_word_space(line):
    """Return the average horizontal gap between consecutive words of a line.

    The gap before a word is its left edge minus the right edge (``l + w``)
    of the preceding word. The first word contributes a gap of 0 and still
    counts towards the divisor, so the result is the sum of all gaps divided
    by the number of words.

    src: https://github.com/axa-group/Parsr/blob/69e6b9bf33f1cc43d5a87d428cedf1132ccc48e8/server/src/types/DocumentRepresentation/Paragraph.ts#L460

    Args:
        line: dict whose ``"content"`` is a sequence of words, each carrying
            a ``"box"`` dict with the keys ``"l"`` (left) and ``"w"`` (width).

    Returns:
        The average gap, or ``0.0`` when the line contains no words.
    """
    words = line["content"]
    if not words:
        # A line without words used to end in a ZeroDivisionError.
        return 0.0

    # The leading 0 is the (non-existent) gap in front of the first word.
    margins = [0]
    for previous, current in zip(words, words[1:]):
        prev_box = previous["box"]
        margins.append(current["box"]["l"] - (prev_box["l"] + prev_box["w"]))
    return sum(margins) / len(margins)
def roughly_same_font(f1, f2):
    """Tell whether two fonts differ in size by less than 20% of the larger one.

    Both arguments are dicts with the keys ``"size"`` and ``"sizeUnit"``;
    the unit has to be ``"px"`` (checked with ``assert``).
    """
    # unreliable
    for font in (f1, f2):
        assert font["sizeUnit"] == "px"
    size_a, size_b = f1["size"], f2["size"]
    tolerance = max(size_a, size_b) * 0.2
    return abs(size_a - size_b) < tolerance
def extract_elements(outer_element, element_type):
    """Collect every nested element whose ``"type"`` equals ``element_type``.

    Dicts are matched on their ``"type"`` key; a non-matching dict is
    searched further through its ``"content"`` value, and lists are searched
    item by item. A matching element is returned as-is, so nothing nested
    inside a match is searched. The nested result is handed to ``flatten``
    (with ``keep_dict=True``) and ``None`` placeholders are dropped.
    """

    def traverse(node):
        if type(node) is list:
            return [traverse(child) for child in node]
        if type(node) is not dict:
            # neither a container nor an element: nothing to collect
            return None
        if "type" in node and node["type"] == element_type:
            return node
        if "content" in node:
            return traverse(node["content"])
        return None

    nested = traverse(outer_element)
    return [found for found in flatten(nested, keep_dict=True) if found is not None]
def font_stats(outer_element):
    """Return the ``"font"`` value of every word element found in ``outer_element``."""
    words = extract_elements(outer_element, "word")
    return [word["font"] for word in words]
def most_used_font(element):
    """Return the font that occurs most often among the words of ``element``.

    Raises IndexError when the element contains no words.
    """
    usage = Counter(font_stats(element))
    ranked = usage.most_common(1)
    top_font, _count = ranked[0]
    return top_font
def get_lineheight(l1, l2):
    """Return the vertical gap between two lines, or None if there is none.

    The lines may be passed in either order; the one with the smaller
    ``box["t"]`` is treated as the upper line. The gap is the lower line's
    top minus the upper line's top and height.
    """
    # l1 or l2 can be the upper line
    upper, lower = (l2, l1) if l2["box"]["t"] < l1["box"]["t"] else (l1, l2)
    upper_box = upper["box"]
    gap = lower["box"]["t"] - upper_box["t"] - upper_box["h"]
    # touching or overlapping lines (gap <= 0) yield no line height
    if gap > 0:
        return gap
    return None
def median_from_counter(c):
    """Return the median of the values in Counter ``c``, each weighted by its count."""
    # elements() repeats every value as often as it was counted
    return median(c.elements())
def only_text(es):
    """Join the stripped text of all words in ``es`` with single spaces.

    The joined string is passed through ``fix_bad_unicode`` before it is
    returned.
    """
    words = [
        word["content"].strip()
        for element in es
        for word in extract_elements(element, "word")
    ]
    return fix_bad_unicode(" ".join(words))
def only_points(es):
    """Return the four bounding-box corners of every element as ``(t, l)`` tuples.

    Per element the order is: top-left, bottom-left, top-right, bottom-right.
    """
    corners = []
    for element in es:
        box = element["box"]
        top, left = box["t"], box["l"]
        bottom, right = top + box["h"], left + box["w"]
        corners.extend(
            [(top, left), (bottom, left), (top, right), (bottom, right)]
        )
    return corners
def super_similiar(es1, es2, sim_factor=0.8, sim_box=0.6):
    """Check whether two element groups match both in text and in position.

    The text similarity (``jaccard``) has to exceed ``sim_factor`` and the
    bounding-box similarity (``sim_bbox``) has to exceed ``sim_box``. Groups
    that yield fewer than four corner points are never considered similar.
    """
    text1, text2 = only_text(es1), only_text(es2)
    points1, points2 = only_points(es1), only_points(es2)

    if len(points1) < 4 or len(points2) < 4:
        return False

    logger.debug("points")
    for points in (points1, points2):
        logger.debug(points)

    j_sim = jaccard(text1, text2)
    b_sim = sim_bbox(points1, points2)
    logger.debug(f"footer/header sims {j_sim} {b_sim}")

    if j_sim <= sim_factor:
        return False
    return b_sim > sim_box
def remove_duplicates(page_items, lang):
    """Filter out items that are near-duplicates of an earlier item.

    Each item is compared (via ``super_similiar``) with the items kept so
    far. For a similar pair the ``single_score`` of both texts decides:

    * earlier score <= current score: the current item is replaced by an
      empty list ``[]`` in the result;
    * otherwise the earlier item is removed from the result and the current
      item is kept.

    Args:
        page_items: list of element groups, e.g. one header/footer per page.
        lang: language passed on to ``single_score``.

    Returns:
        The filtered list; ``[]`` for empty input (used to raise IndexError).
    """
    if not page_items:
        return []

    results = [page_items[0]]
    for elements in page_items[1:]:
        keep_current = True
        # Iterate over a snapshot: `results` can shrink inside this loop, and
        # mutating a list while iterating it silently skips the next entry.
        for r in list(results):
            if len(r) == 0:
                continue
            # only choose the best first one?
            if not super_similiar(r, elements):
                continue
            logger.debug("items are super similiar")
            if single_score(only_text(r), lang) <= single_score(
                only_text(elements), lang
            ):
                logger.debug(
                    "okay, skipping here, the previous one got better / same score"
                )
                keep_current = False
                break
            logger.debug("removing previous one, this is better")
            # NOTE(review): removal shifts later entries, while a skipped item
            # leaves an [] placeholder -- confirm whether callers rely on positions.
            results.remove(r)
        results.append(elements if keep_current else [])
    return results
def remove_page_number_header_footer(page_items):
    """Rough check to remove elements with text such as `Seite $NUM von $NUM` or just `$NUM`.

    The text of every item is normalised with ``clean``; numbers, punctuation
    and the words "seite" / "von" are removed. Items with nothing left
    afterwards are dropped.

    TODO: Make it work if the page number is part of a bigger header/footer. And also consider the language.

    Args:
        page_items: list of element groups (headers or footers).

    Returns:
        The items whose text is more than a bare page number.
    """
    kept = []
    for item in page_items:
        text = clean(
            only_text(item),
            # `replace_with_number` only takes effect together with
            # `no_numbers=True`; without it the digits were never removed.
            no_numbers=True,
            replace_with_number="",
            no_punct=True,
        )
        # NOTE(review): the lower-case needles presumably rely on clean()
        # lower-casing its input by default -- confirm for the installed version.
        leftover = text.replace("seite", "").replace("von", "")
        if leftover.strip() != "":
            kept.append(item)
    return kept
def calc_line_space(lines):
    """Return the gaps between each pair of neighbouring lines.

    Pairs for which ``get_lineheight`` reports no gap (``None``) are left
    out. Fewer than two lines give an empty list.
    """
    if len(lines) < 2:
        return []
    gaps = (get_lineheight(upper, lower) for upper, lower in zip(lines, lines[1:]))
    return [gap for gap in gaps if gap is not None]
class DocumentInfo:
    """Layout statistics of a parsed document.

    On construction the element order per page, the font usage and the line
    geometry (width, height, spacing, left offset) are collected from
    ``input_data``; afterwards the reference to ``input_data`` is dropped.

    Attributes set during construction:
        order_page: per page, the ids of paragraphs/headings that are neither
            header nor footer.
        id_to_elem: maps element ids (top-level elements and lines) to the
            element dicts, each annotated with ``"idx_page"``.
        body_font, font_counter, font_info: font usage statistics.
        counter_* / median_line_*: line geometry statistics.
    """

    def __init__(self, input_data) -> None:
        self.input_data = input_data

        # needs to be done first: it creates `id_to_elem`, which
        # document_paragraph_stats() extends with the line elements
        self.element_order_page()
        self.document_font_stats()
        self.document_paragraph_stats()

        # free memory (code should have been re-written but whatehver)
        del self.input_data

    def document_paragraph_stats(self):
        """Collect line geometry counters and their medians.

        Also registers every line in ``id_to_elem`` and tags it with its page
        index.

        Raises:
            ValueError: if any of the four counters stays empty.
        """
        self.counter_width = Counter()
        self.counter_height = Counter()
        self.counter_lineheight = Counter()
        self.counter_line_left = Counter()

        for n_page, page in enumerate(self.input_data["pages"]):
            for element in page["elements"]:
                lines = extract_elements(element, "line")
                for line in lines:
                    line["idx_page"] = n_page
                    self.id_to_elem[line["id"]] = line

                self.counter_width.update([x["box"]["w"] for x in lines])
                self.counter_height.update([x["box"]["h"] for x in lines])
                self.counter_lineheight.update(calc_line_space(lines))
                self.counter_line_left.update([x["box"]["l"] for x in lines])

        counters = (
            self.counter_width,
            self.counter_height,
            self.counter_lineheight,
            self.counter_line_left,
        )
        if any(len(counter) == 0 for counter in counters):
            raise ValueError(
                "Something is wrong with the document. Is the text in the PDF broken (copy the text out of the doc and see how it looks)?"
            )

        self.median_line_width = median_from_counter(self.counter_width)
        self.median_line_height = median_from_counter(self.counter_height)
        # line space: line height
        self.median_line_space = median_from_counter(self.counter_lineheight)
        self.median_line_left = median_from_counter(self.counter_line_left)

        logger.info(f"media line width: {self.median_line_width}")
        logger.info(f"median line height: {self.median_line_height}")
        logger.info(f"median line space: {self.median_line_space}")
        logger.info(f"counter width: {self.counter_width.most_common(5)}")
        logger.info(f"counter height: {self.counter_height.most_common(5)}")
        logger.info(f"counter lineheight: {self.counter_lineheight.most_common(5)}")

    def document_font_stats(self):
        """Get statistics about font usage in the document

        Raises:
            ValueError: if no word (and therefore no font) was found.
        """
        c = Counter()
        for page in self.input_data["pages"]:
            for element in page["elements"]:
                c.update(font_stats(element))

        if len(c) == 0:
            raise ValueError(
                "Something is wrong with the document. Is the text in the PDF broken (copy the text out of the doc and see how it looks)?"
            )

        self.body_font = c.most_common(1)[0][0]
        self.font_counter = c

        self.font_info = {}
        for font in self.input_data["fonts"]:
            self.font_info[font["id"]] = font
            assert font["sizeUnit"] == "px"

    def seperate_lines(self, l1, l2, factor=0.5):
        """Tell whether the gap between two lines clearly exceeds the usual one.

        Returns False when ``get_lineheight`` finds no gap between the lines.
        """
        lh = get_lineheight(l1, l2)
        if lh is None:
            return False
        # space between lines can only be + 0.5x the body lineheight
        return ((lh - self.median_line_space) / self.median_line_space) > factor

    def on_same_page(self, e1, e2):
        """Check if both elements are on the same page"""
        first_page = self.id_to_elem[e1["id"]]["idx_page"]
        second_page = self.id_to_elem[e2["id"]]["idx_page"]
        return first_page == second_page

    def element_order_page(self):
        """Save the order of paragraphs for each page, exclude header / footer"""
        self.order_page = []
        self.id_to_elem = {}
        for idx_page, page in enumerate(self.input_data["pages"]):
            per_page = []
            for element in page["elements"]:
                # every element is indexed, but not all end up in the order
                element["idx_page"] = idx_page
                self.id_to_elem[element["id"]] = element
                if element["type"] not in ("paragraph", "heading"):
                    continue
                properties = element["properties"]
                if properties.get("isHeader") or properties.get("isFooter"):
                    continue
                per_page.append(element["id"])
            self.order_page.append(per_page)

    def is_body_paragrah(self, para):
        """Tell whether a paragraph's line geometry matches the document medians.

        Compared are the widest line (tolerance 5), the median line height
        (tolerance 2) and the median left offset (tolerance 5).

        Returns:
            True for a match; False otherwise, including for a paragraph
            without any line.
        """
        lines = extract_elements(para, "line")
        logger.debug("is it a body para?")
        if not lines:
            # nothing to measure; max() / median() would raise on empty input
            return False

        w_lines = [x["box"]["w"] for x in lines]
        h_lines = [x["box"]["h"] for x in lines]
        l_lines = [x["box"]["l"] for x in lines]

        if abs(self.median_line_width - max(w_lines)) > 5:
            return False
        if abs(self.median_line_height - median(h_lines)) > 2:
            return False
        if abs(self.median_line_left - median(l_lines)) > 5:
            return False
        logger.debug("yes!")
        return True
Functions
def avg_word_space(line)
-
Average word space on a line, util for words / lines
Expand source code
def avg_word_space(line):
    """Return the average horizontal gap between consecutive words of a line.

    The gap before a word is its left edge minus the right edge (``l + w``)
    of the preceding word. The first word contributes a gap of 0 and still
    counts towards the divisor, so the result is the sum of all gaps divided
    by the number of words.

    src: https://github.com/axa-group/Parsr/blob/69e6b9bf33f1cc43d5a87d428cedf1132ccc48e8/server/src/types/DocumentRepresentation/Paragraph.ts#L460

    Args:
        line: dict whose ``"content"`` is a sequence of words, each carrying
            a ``"box"`` dict with the keys ``"l"`` (left) and ``"w"`` (width).

    Returns:
        The average gap, or ``0.0`` when the line contains no words.
    """
    words = line["content"]
    if not words:
        # A line without words used to end in a ZeroDivisionError.
        return 0.0

    # The leading 0 is the (non-existent) gap in front of the first word.
    margins = [0]
    for previous, current in zip(words, words[1:]):
        prev_box = previous["box"]
        margins.append(current["box"]["l"] - (prev_box["l"] + prev_box["w"]))
    return sum(margins) / len(margins)
def calc_line_space(lines)
-
Expand source code
def calc_line_space(lines):
    """Return the gaps between each pair of neighbouring lines.

    Pairs for which ``get_lineheight`` reports no gap (``None``) are left
    out. Fewer than two lines give an empty list.
    """
    if len(lines) < 2:
        return []
    gaps = (get_lineheight(upper, lower) for upper, lower in zip(lines, lines[1:]))
    return [gap for gap in gaps if gap is not None]
def extract_elements(outer_element, element_type)
-
Expand source code
def extract_elements(outer_element, element_type):
    """Collect every nested element whose ``"type"`` equals ``element_type``.

    Dicts are matched on their ``"type"`` key; a non-matching dict is
    searched further through its ``"content"`` value, and lists are searched
    item by item. A matching element is returned as-is, so nothing nested
    inside a match is searched. The nested result is handed to ``flatten``
    (with ``keep_dict=True``) and ``None`` placeholders are dropped.
    """

    def traverse(node):
        if type(node) is list:
            return [traverse(child) for child in node]
        if type(node) is not dict:
            # neither a container nor an element: nothing to collect
            return None
        if "type" in node and node["type"] == element_type:
            return node
        if "content" in node:
            return traverse(node["content"])
        return None

    nested = traverse(outer_element)
    return [found for found in flatten(nested, keep_dict=True) if found is not None]
def font_stats(outer_element)
-
Expand source code
def font_stats(outer_element):
    """Return the ``"font"`` value of every word element found in ``outer_element``."""
    words = extract_elements(outer_element, "word")
    return [word["font"] for word in words]
def get_lineheight(l1, l2)
-
Expand source code
def get_lineheight(l1, l2):
    """Return the vertical gap between two lines, or None if there is none.

    The lines may be passed in either order; the one with the smaller
    ``box["t"]`` is treated as the upper line. The gap is the lower line's
    top minus the upper line's top and height.
    """
    # l1 or l2 can be the upper line
    upper, lower = (l2, l1) if l2["box"]["t"] < l1["box"]["t"] else (l1, l2)
    upper_box = upper["box"]
    gap = lower["box"]["t"] - upper_box["t"] - upper_box["h"]
    # touching or overlapping lines (gap <= 0) yield no line height
    if gap > 0:
        return gap
    return None
def median_from_counter(c)
-
Expand source code
def median_from_counter(c):
    """Return the median of the values in Counter ``c``, each weighted by its count."""
    # elements() repeats every value as often as it was counted
    return median(c.elements())
def most_used_font(element)
-
Expand source code
def most_used_font(element):
    """Return the font that occurs most often among the words of ``element``.

    Raises IndexError when the element contains no words.
    """
    usage = Counter(font_stats(element))
    ranked = usage.most_common(1)
    top_font, _count = ranked[0]
    return top_font
def only_points(es)
-
Expand source code
def only_points(es):
    """Return the four bounding-box corners of every element as ``(t, l)`` tuples.

    Per element the order is: top-left, bottom-left, top-right, bottom-right.
    """
    corners = []
    for element in es:
        box = element["box"]
        top, left = box["t"], box["l"]
        bottom, right = top + box["h"], left + box["w"]
        corners.extend(
            [(top, left), (bottom, left), (top, right), (bottom, right)]
        )
    return corners
def only_text(es)
-
Expand source code
def only_text(es):
    """Join the stripped text of all words in ``es`` with single spaces.

    The joined string is passed through ``fix_bad_unicode`` before it is
    returned.
    """
    words = [
        word["content"].strip()
        for element in es
        for word in extract_elements(element, "word")
    ]
    return fix_bad_unicode(" ".join(words))
def remove_duplicates(page_items, lang)
-
Expand source code
def remove_duplicates(page_items, lang):
    """Filter out items that are near-duplicates of an earlier item.

    Each item is compared (via ``super_similiar``) with the items kept so
    far. For a similar pair the ``single_score`` of both texts decides:

    * earlier score <= current score: the current item is replaced by an
      empty list ``[]`` in the result;
    * otherwise the earlier item is removed from the result and the current
      item is kept.

    Args:
        page_items: list of element groups, e.g. one header/footer per page.
        lang: language passed on to ``single_score``.

    Returns:
        The filtered list; ``[]`` for empty input (used to raise IndexError).
    """
    if not page_items:
        return []

    results = [page_items[0]]
    for elements in page_items[1:]:
        keep_current = True
        # Iterate over a snapshot: `results` can shrink inside this loop, and
        # mutating a list while iterating it silently skips the next entry.
        for r in list(results):
            if len(r) == 0:
                continue
            # only choose the best first one?
            if not super_similiar(r, elements):
                continue
            logger.debug("items are super similiar")
            if single_score(only_text(r), lang) <= single_score(
                only_text(elements), lang
            ):
                logger.debug(
                    "okay, skipping here, the previous one got better / same score"
                )
                keep_current = False
                break
            logger.debug("removing previous one, this is better")
            # NOTE(review): removal shifts later entries, while a skipped item
            # leaves an [] placeholder -- confirm whether callers rely on positions.
            results.remove(r)
        results.append(elements if keep_current else [])
    return results
-
Rough check to remove elements with text such as `Seite $NUM von $NUM` or just `$NUM`.
TODO: Make it work if the page number is part of a bigger header/footer. And also consider the language.
Expand source code
def remove_page_number_header_footer(page_items):
    """Rough check to remove elements with text such as `Seite $NUM von $NUM` or just `$NUM`.

    The text of every item is normalised with ``clean``; numbers, punctuation
    and the words "seite" / "von" are removed. Items with nothing left
    afterwards are dropped.

    TODO: Make it work if the page number is part of a bigger header/footer. And also consider the language.

    Args:
        page_items: list of element groups (headers or footers).

    Returns:
        The items whose text is more than a bare page number.
    """
    kept = []
    for item in page_items:
        text = clean(
            only_text(item),
            # `replace_with_number` only takes effect together with
            # `no_numbers=True`; without it the digits were never removed.
            no_numbers=True,
            replace_with_number="",
            no_punct=True,
        )
        # NOTE(review): the lower-case needles presumably rely on clean()
        # lower-casing its input by default -- confirm for the installed version.
        leftover = text.replace("seite", "").replace("von", "")
        if leftover.strip() != "":
            kept.append(item)
    return kept
def roughly_same_font(f1, f2)
-
Expand source code
def roughly_same_font(f1, f2):
    """Tell whether two fonts differ in size by less than 20% of the larger one.

    Both arguments are dicts with the keys ``"size"`` and ``"sizeUnit"``;
    the unit has to be ``"px"`` (checked with ``assert``).
    """
    # unreliable
    for font in (f1, f2):
        assert font["sizeUnit"] == "px"
    size_a, size_b = f1["size"], f2["size"]
    tolerance = max(size_a, size_b) * 0.2
    return abs(size_a - size_b) < tolerance
def super_similiar(es1, es2, sim_factor=0.8, sim_box=0.6)
-
Check if two elements are super similar by text (Jaccard) and visually (compare bbox).
Expand source code
def super_similiar(es1, es2, sim_factor=0.8, sim_box=0.6):
    """Check whether two element groups match both in text and in position.

    The text similarity (``jaccard``) has to exceed ``sim_factor`` and the
    bounding-box similarity (``sim_bbox``) has to exceed ``sim_box``. Groups
    that yield fewer than four corner points are never considered similar.
    """
    text1, text2 = only_text(es1), only_text(es2)
    points1, points2 = only_points(es1), only_points(es2)

    if len(points1) < 4 or len(points2) < 4:
        return False

    logger.debug("points")
    for points in (points1, points2):
        logger.debug(points)

    j_sim = jaccard(text1, text2)
    b_sim = sim_bbox(points1, points2)
    logger.debug(f"footer/header sims {j_sim} {b_sim}")

    if j_sim <= sim_factor:
        return False
    return b_sim > sim_box
Classes
class DocumentInfo (input_data)
-
Expand source code
class DocumentInfo:
    """Layout statistics of a parsed document.

    On construction the element order per page, the font usage and the line
    geometry (width, height, spacing, left offset) are collected from
    ``input_data``; afterwards the reference to ``input_data`` is dropped.

    Attributes set during construction:
        order_page: per page, the ids of paragraphs/headings that are neither
            header nor footer.
        id_to_elem: maps element ids (top-level elements and lines) to the
            element dicts, each annotated with ``"idx_page"``.
        body_font, font_counter, font_info: font usage statistics.
        counter_* / median_line_*: line geometry statistics.
    """

    def __init__(self, input_data) -> None:
        self.input_data = input_data

        # needs to be done first: it creates `id_to_elem`, which
        # document_paragraph_stats() extends with the line elements
        self.element_order_page()
        self.document_font_stats()
        self.document_paragraph_stats()

        # free memory (code should have been re-written but whatehver)
        del self.input_data

    def document_paragraph_stats(self):
        """Collect line geometry counters and their medians.

        Also registers every line in ``id_to_elem`` and tags it with its page
        index.

        Raises:
            ValueError: if any of the four counters stays empty.
        """
        self.counter_width = Counter()
        self.counter_height = Counter()
        self.counter_lineheight = Counter()
        self.counter_line_left = Counter()

        for n_page, page in enumerate(self.input_data["pages"]):
            for element in page["elements"]:
                lines = extract_elements(element, "line")
                for line in lines:
                    line["idx_page"] = n_page
                    self.id_to_elem[line["id"]] = line

                self.counter_width.update([x["box"]["w"] for x in lines])
                self.counter_height.update([x["box"]["h"] for x in lines])
                self.counter_lineheight.update(calc_line_space(lines))
                self.counter_line_left.update([x["box"]["l"] for x in lines])

        counters = (
            self.counter_width,
            self.counter_height,
            self.counter_lineheight,
            self.counter_line_left,
        )
        if any(len(counter) == 0 for counter in counters):
            raise ValueError(
                "Something is wrong with the document. Is the text in the PDF broken (copy the text out of the doc and see how it looks)?"
            )

        self.median_line_width = median_from_counter(self.counter_width)
        self.median_line_height = median_from_counter(self.counter_height)
        # line space: line height
        self.median_line_space = median_from_counter(self.counter_lineheight)
        self.median_line_left = median_from_counter(self.counter_line_left)

        logger.info(f"media line width: {self.median_line_width}")
        logger.info(f"median line height: {self.median_line_height}")
        logger.info(f"median line space: {self.median_line_space}")
        logger.info(f"counter width: {self.counter_width.most_common(5)}")
        logger.info(f"counter height: {self.counter_height.most_common(5)}")
        logger.info(f"counter lineheight: {self.counter_lineheight.most_common(5)}")

    def document_font_stats(self):
        """Get statistics about font usage in the document

        Raises:
            ValueError: if no word (and therefore no font) was found.
        """
        c = Counter()
        for page in self.input_data["pages"]:
            for element in page["elements"]:
                c.update(font_stats(element))

        if len(c) == 0:
            raise ValueError(
                "Something is wrong with the document. Is the text in the PDF broken (copy the text out of the doc and see how it looks)?"
            )

        self.body_font = c.most_common(1)[0][0]
        self.font_counter = c

        self.font_info = {}
        for font in self.input_data["fonts"]:
            self.font_info[font["id"]] = font
            assert font["sizeUnit"] == "px"

    def seperate_lines(self, l1, l2, factor=0.5):
        """Tell whether the gap between two lines clearly exceeds the usual one.

        Returns False when ``get_lineheight`` finds no gap between the lines.
        """
        lh = get_lineheight(l1, l2)
        if lh is None:
            return False
        # space between lines can only be + 0.5x the body lineheight
        return ((lh - self.median_line_space) / self.median_line_space) > factor

    def on_same_page(self, e1, e2):
        """Check if both elements are on the same page"""
        first_page = self.id_to_elem[e1["id"]]["idx_page"]
        second_page = self.id_to_elem[e2["id"]]["idx_page"]
        return first_page == second_page

    def element_order_page(self):
        """Save the order of paragraphs for each page, exclude header / footer"""
        self.order_page = []
        self.id_to_elem = {}
        for idx_page, page in enumerate(self.input_data["pages"]):
            per_page = []
            for element in page["elements"]:
                # every element is indexed, but not all end up in the order
                element["idx_page"] = idx_page
                self.id_to_elem[element["id"]] = element
                if element["type"] not in ("paragraph", "heading"):
                    continue
                properties = element["properties"]
                if properties.get("isHeader") or properties.get("isFooter"):
                    continue
                per_page.append(element["id"])
            self.order_page.append(per_page)

    def is_body_paragrah(self, para):
        """Tell whether a paragraph's line geometry matches the document medians.

        Compared are the widest line (tolerance 5), the median line height
        (tolerance 2) and the median left offset (tolerance 5).

        Returns:
            True for a match; False otherwise, including for a paragraph
            without any line.
        """
        lines = extract_elements(para, "line")
        logger.debug("is it a body para?")
        if not lines:
            # nothing to measure; max() / median() would raise on empty input
            return False

        w_lines = [x["box"]["w"] for x in lines]
        h_lines = [x["box"]["h"] for x in lines]
        l_lines = [x["box"]["l"] for x in lines]

        if abs(self.median_line_width - max(w_lines)) > 5:
            return False
        if abs(self.median_line_height - median(h_lines)) > 2:
            return False
        if abs(self.median_line_left - median(l_lines)) > 5:
            return False
        logger.debug("yes!")
        return True
Methods
def document_font_stats(self)
-
Get statistics about font usage in the document
Expand source code
def document_font_stats(self):
    """Get statistics about font usage in the document

    Sets ``body_font`` (the most frequent font), ``font_counter`` and
    ``font_info`` (font id -> font entry). Raises ValueError when no font
    usage was found at all.
    """
    usage = Counter()
    for page in self.input_data["pages"]:
        for element in page["elements"]:
            usage.update(font_stats(element))

    if len(usage) == 0:
        raise ValueError(
            "Something is wrong with the document. Is the text in the PDF broken (copy the text out of the doc and see how it looks)?"
        )

    most_common_font, _count = usage.most_common(1)[0]
    self.body_font = most_common_font
    self.font_counter = usage

    self.font_info = {}
    for font in self.input_data["fonts"]:
        self.font_info[font["id"]] = font
        assert font["sizeUnit"] == "px"
def document_paragraph_stats(self)
-
Expand source code
def document_paragraph_stats(self):
    """Collect line geometry counters and their medians.

    Every line is registered in ``id_to_elem`` and tagged with its page
    index. Raises ValueError when one of the four counters stays empty.
    """
    widths, heights, gaps, lefts = Counter(), Counter(), Counter(), Counter()
    self.counter_width = widths
    self.counter_height = heights
    self.counter_lineheight = gaps
    self.counter_line_left = lefts

    for page_number, page in enumerate(self.input_data["pages"]):
        for element in page["elements"]:
            lines = extract_elements(element, "line")
            for line in lines:
                line["idx_page"] = page_number
                self.id_to_elem[line["id"]] = line

            widths.update(line["box"]["w"] for line in lines)
            heights.update(line["box"]["h"] for line in lines)
            gaps.update(calc_line_space(lines))
            lefts.update(line["box"]["l"] for line in lines)

    if any(len(counter) == 0 for counter in (widths, heights, gaps, lefts)):
        raise ValueError(
            "Something is wrong with the document. Is the text in the PDF broken (copy the text out of the doc and see how it looks)?"
        )

    self.median_line_width = median_from_counter(widths)
    self.median_line_height = median_from_counter(heights)
    # line space: line height
    self.median_line_space = median_from_counter(gaps)
    self.median_line_left = median_from_counter(lefts)

    logger.info(f"media line width: {self.median_line_width}")
    logger.info(f"median line height: {self.median_line_height}")
    logger.info(f"median line space: {self.median_line_space}")
    logger.info(f"counter width: {self.counter_width.most_common(5)}")
    logger.info(f"counter height: {self.counter_height.most_common(5)}")
    logger.info(f"counter lineheight: {self.counter_lineheight.most_common(5)}")
def element_order_page(self)
-
Save the order of paragraphs for each page, exclude header / footer
Expand source code
def element_order_page(self): """Save the order of paragraphes for each page, exclude header / footer """ self.order_page = [] self.id_to_elem = {} for idx_page, p in enumerate(self.input_data["pages"]): per_page = [] for e in p["elements"]: # not all elements are included here e["idx_page"] = idx_page self.id_to_elem[e["id"]] = e if not e["type"] in ("paragraph", "heading"): continue if "isHeader" in e["properties"] and e["properties"]["isHeader"]: continue if "isFooter" in e["properties"] and e["properties"]["isFooter"]: continue per_page.append(e["id"]) self.order_page.append(per_page)
def is_body_paragrah(self, para)
-
Expand source code
def is_body_paragrah(self, para):
    """Tell whether a paragraph's line geometry matches the document medians.

    Compared are the widest line (tolerance 5), the median line height
    (tolerance 2) and the median left offset (tolerance 5).

    Returns:
        True for a match; False otherwise, including for a paragraph
        without any line.
    """
    lines = extract_elements(para, "line")
    logger.debug("is it a body para?")
    if not lines:
        # nothing to measure; max() / median() would raise on empty input
        return False

    w_lines = [x["box"]["w"] for x in lines]
    h_lines = [x["box"]["h"] for x in lines]
    l_lines = [x["box"]["l"] for x in lines]

    if abs(self.median_line_width - max(w_lines)) > 5:
        return False
    if abs(self.median_line_height - median(h_lines)) > 2:
        return False
    if abs(self.median_line_left - median(l_lines)) > 5:
        return False
    logger.debug("yes!")
    return True
def on_same_page(self, e1, e2)
-
Check if both elements are on the same page
Expand source code
def on_same_page(self, e1, e2): """Check if both elements are on the same page """ return ( self.id_to_elem[e1["id"]]["idx_page"] == self.id_to_elem[e2["id"]]["idx_page"] )
def seperate_lines(self, l1, l2, factor=0.5)
-
Expand source code
def seperate_lines(self, l1, l2, factor=0.5):
    """Tell whether the gap between two lines clearly exceeds the usual one.

    The gap is compared with ``median_line_space``: the result is True when
    it is larger by more than ``factor`` times that median. Returns False
    when ``get_lineheight`` finds no gap between the lines.
    """
    gap = get_lineheight(l1, l2)
    if gap is None:
        return False
    # space between lines can only be + 0.5x the body lineheight
    relative_excess = (gap - self.median_line_space) / self.median_line_space
    return relative_excess > factor