Module pd3f.dehyphen_wrapper
Interaction with `dehyphen`, cache results
Expand source code
"""Interaction with `dehyphen`, cache results
"""
from functools import lru_cache
from joblib import Memory
from dehyphen import FlairScorer
# On-disk memoization store shared by the cached wrappers in this module.
# The size budget is 100 MB (decimal megabytes).
# NOTE(review): whether joblib enforces `bytes_limit` on its own, and whether
# the keyword is still accepted, depends on the installed joblib version --
# confirm against the pinned release.
memory = Memory(
    "~/.cache/pd3f/dehyphen",
    bytes_limit=100 * 1000 * 1000,
    compress=5,
    verbose=0,
)
# Most recently requested scorer. Kept as a module attribute for backward
# compatibility with code that reads `scorer` directly.
scorer = None
# One scorer per requested `lang` string, so the model loaded for one language
# is never handed out for another.
_scorers = {}


def get_scorer(lang):
    """Return the `FlairScorer` for `lang`, creating it on first use.

    Scorers are cached per `lang` string to avoid re-initialization of the
    language model. A single shared instance is not enough: it would make
    every call after the first return the scorer of whichever language was
    requested first, regardless of `lang`.

    Args:
        lang: Model name. A trailing "-fast" is stripped and translated into
            `fast=True` to simplify Flair's naming of models.

    Returns:
        The cached `FlairScorer` instance belonging to `lang`.
    """
    global scorer
    if lang not in _scorers:
        # simplify Flair's naming of models
        if lang.endswith("-fast"):
            _scorers[lang] = FlairScorer(lang=lang[:-5], fast=True)
        else:
            _scorers[lang] = FlairScorer(lang=lang)
    scorer = _scorers[lang]
    return scorer
@memory.cache
def dehyphen_paragraph(lines, lang):
    """Run the scorer's `dehyphen_paragraph` on `lines`, caching the result.

    Args:
        lines: Passed through unchanged to the scorer.
        lang: Model name used to obtain the scorer via `get_scorer`.

    Returns:
        Whatever the scorer's `dehyphen_paragraph` returns for `lines`.
    """
    return get_scorer(lang).dehyphen_paragraph(lines)
@memory.cache
def is_split_paragraph(p1, p2, lang):
    """Ask the scorer's `is_split_paragraph` about `p1` and `p2`, with caching.

    Args:
        p1: First paragraph, passed through unchanged to the scorer.
        p2: Second paragraph, passed through unchanged to the scorer.
        lang: Model name used to obtain the scorer via `get_scorer`.

    Returns:
        Whatever the scorer's `is_split_paragraph` returns for the pair.
    """
    return get_scorer(lang).is_split_paragraph(p1, p2)
@memory.cache
def newline_or_not(l1, l2, lang):
    """Decide whether a newline should be added between `l1` and `l2`.

    Both texts and their space-joined concatenation are scored. The answer is
    False only when the joined text has the strictly lowest score.

    Args:
        l1: First text.
        l2: Second text.
        lang: Model name used to obtain the scorer via `get_scorer`.

    Returns:
        True to add a newline, False otherwise.
    """
    # Flair does not work with only one char, so those cases are settled
    # up front without scoring: a lone char in `l1` means newline, a lone char
    # in `l2` means no newline.
    for candidate, verdict in ((l1, True), (l2, False)):
        if len(candidate) == 1 and len(candidate[0]) == 1:
            return verdict

    model = get_scorer(lang)
    joined = l1 + " " + l2
    scores = model.score([l1, l2, joined])
    # Index 2 is the joined text; any other winner means a newline is added.
    lowest = min(scores)
    return scores.index(lowest) != 2
@lru_cache(maxsize=128)
def single_score(text, lang):
    """Return the scorer's score for a single `text`, memoized in memory.

    Args:
        text: Text to score.
        lang: Model name used to obtain the scorer via `get_scorer`.

    Returns:
        `float("inf")` for a one-character text, otherwise the first element
        of the scorer's `score([text])` result.
    """
    # Flair does not work with only one char, thus this special case. It is
    # checked before fetching the scorer so that this path never triggers the
    # initialization of the language model.
    if len(text) == 1:
        return float("inf")
    return get_scorer(lang).score([text])[0]
Functions
def dehyphen_paragraph(lines, lang)
-
Expand source code
@memory.cache
def dehyphen_paragraph(lines, lang):
    """Run the scorer's `dehyphen_paragraph` on `lines`, caching the result.

    Args:
        lines: Passed through unchanged to the scorer.
        lang: Model name used to obtain the scorer via `get_scorer`.

    Returns:
        Whatever the scorer's `dehyphen_paragraph` returns for `lines`.
    """
    return get_scorer(lang).dehyphen_paragraph(lines)
def get_scorer(lang)
-
Simple singleton to avoid re-initialization of the language model.
Expand source code
# One scorer per requested `lang` string, so the model loaded for one language
# is never handed out for another.
_scorers = {}


def get_scorer(lang):
    """Return the `FlairScorer` for `lang`, creating it on first use.

    Scorers are cached per `lang` string to avoid re-initialization of the
    language model. A single shared instance is not enough: it would make
    every call after the first return the scorer of whichever language was
    requested first, regardless of `lang`.

    Args:
        lang: Model name. A trailing "-fast" is stripped and translated into
            `fast=True` to simplify Flair's naming of models.

    Returns:
        The cached `FlairScorer` instance belonging to `lang`.
    """
    # The module-level `scorer` is still updated to the most recently
    # requested instance for code that reads it directly.
    global scorer
    if lang not in _scorers:
        # simplify Flair's naming of models
        if lang.endswith("-fast"):
            _scorers[lang] = FlairScorer(lang=lang[:-5], fast=True)
        else:
            _scorers[lang] = FlairScorer(lang=lang)
    scorer = _scorers[lang]
    return scorer
def is_split_paragraph(p1, p2, lang)
-
Expand source code
@memory.cache
def is_split_paragraph(p1, p2, lang):
    """Ask the scorer's `is_split_paragraph` about `p1` and `p2`, with caching.

    Args:
        p1: First paragraph, passed through unchanged to the scorer.
        p2: Second paragraph, passed through unchanged to the scorer.
        lang: Model name used to obtain the scorer via `get_scorer`.

    Returns:
        Whatever the scorer's `is_split_paragraph` returns for the pair.
    """
    return get_scorer(lang).is_split_paragraph(p1, p2)
def newline_or_not(l1, l2, lang)
-
Decide whether to add a newline or not.
Expand source code
@memory.cache
def newline_or_not(l1, l2, lang):
    """Decide whether a newline should be added between `l1` and `l2`.

    Both texts and their space-joined concatenation are scored. The answer is
    False only when the joined text has the strictly lowest score.

    Args:
        l1: First text.
        l2: Second text.
        lang: Model name used to obtain the scorer via `get_scorer`.

    Returns:
        True to add a newline, False otherwise.
    """
    # Flair does not work with only one char, so those cases are settled
    # up front without scoring: a lone char in `l1` means newline, a lone char
    # in `l2` means no newline.
    for candidate, verdict in ((l1, True), (l2, False)):
        if len(candidate) == 1 and len(candidate[0]) == 1:
            return verdict

    model = get_scorer(lang)
    joined = l1 + " " + l2
    scores = model.score([l1, l2, joined])
    # Index 2 is the joined text; any other winner means a newline is added.
    lowest = min(scores)
    return scores.index(lowest) != 2
def single_score(text, lang)
-
Expand source code
@lru_cache(maxsize=128)
def single_score(text, lang):
    """Return the scorer's score for a single `text`, memoized in memory.

    Args:
        text: Text to score.
        lang: Model name used to obtain the scorer via `get_scorer`.

    Returns:
        `float("inf")` for a one-character text, otherwise the first element
        of the scorer's `score([text])` result.
    """
    # Flair does not work with only one char, thus this special case. It is
    # checked before fetching the scorer so that this path never triggers the
    # initialization of the language model.
    if len(text) == 1:
        return float("inf")
    return get_scorer(lang).score([text])[0]