Source code for sentency.sentency

import re

from spacy.language import Language
from spacy.tokens import Doc, Span

from .logs import get_logger

logger = get_logger(__name__)


@Language.factory(
    "sentex",
    default_config={
        "sentence_regex": "",
        "ignore_regex": "",
        "annotate_ents": False,
        "label": "Sentex",
    },
)
def create_sentex_component(
    nlp: Language,
    name: str,
    sentence_regex: str,
    ignore_regex: str,
    annotate_ents: bool,
    label: str,
):
    """Build the `Sentex` component registered under the factory name "sentex".

    nlp: `Language`,
        The pipeline the component is being added to.
    name : `str`,
        The component's name inside the pipeline; accepted for the factory
        signature but not forwarded to `Sentex`.
    sentence_regex, ignore_regex, annotate_ents, label:
        Passed through to `Sentex` unchanged; their defaults come from
        `default_config` above.
    """
    component = Sentex(
        nlp=nlp,
        sentence_regex=sentence_regex,
        ignore_regex=ignore_regex,
        annotate_ents=annotate_ents,
        label=label,
    )
    return component
class Sentex:
    """
    Sentex is a spaCy pipeline component that adds spans to the list
    `Doc._.sentex` based on regular expression matches within each sentence
    of the document. If an `ignore_regex` is given, sentences matching that
    regular expression will be ignored.

    nlp: `Language`,
        A required argument for spacy to use this as a factory
    sentence_regex : `str`,
        A regular expression to match spans within each sentence of the
        document. An empty pattern adds nothing, and zero-width matches are
        skipped because they carry no text.
    ignore_regex : `str`,
        A regular expression to identify sentences that should be ignored.
        A blank (empty or whitespace-only) value disables ignoring.
    annotate_ents: `bool`,
        Write/overwrite matches to Doc.ents
    label: `str`,
        If annotate_ents == True, the label for the matched entity
    """

    def __init__(
        self,
        nlp: Language,
        sentence_regex: str,
        ignore_regex: str,
        annotate_ents: bool,
        label: str,
    ):
        self.sentence_regex = sentence_regex
        self.ignore_regex = ignore_regex
        self.annotate_ents = annotate_ents
        self.label = label

        if not Doc.has_extension("sentex"):
            Doc.set_extension("sentex", default=[])

    def __call__(self, doc: Doc) -> Doc:
        """Append regex matches to `doc._.sentex` and optionally to `doc.ents`.

        Returns the same `doc` object that was passed in.
        """
        # An empty pattern matches the empty string at every position, which
        # would only produce meaningless spans, so there is nothing to search.
        if self.sentence_regex:
            self._add_matches(doc)

        if self.annotate_ents:
            self.set_annotations(doc)
        return doc

    def _add_matches(self, doc: Doc) -> None:
        """Search every non-ignored sentence and store the matched spans."""
        # Compile once per call instead of re-resolving the patterns for
        # every sentence.
        pattern = re.compile(self.sentence_regex)
        ignore = re.compile(self.ignore_regex) if self.ignore_regex.strip() else None

        for i, sent in enumerate(doc.sents):
            logger.debug(f"sentence {i}: {sent}")

            if ignore is not None and ignore.search(sent.text):
                logger.debug("sentence ignored")
                continue

            for match in pattern.finditer(sent.text):
                logger.debug(f"match: {str(match)}")
                start, end = match.span()
                if start == end:
                    # zero-width match: no text to turn into a span
                    continue

                # Match offsets are relative to the sentence text, so shift
                # them by the sentence's position in the document. 'expand'
                # alignment covers matches that begin or end inside a token.
                span = doc.char_span(
                    sent.start_char + start,
                    sent.start_char + end,
                    alignment_mode="expand",
                )
                if span is None:
                    logger.debug("span is None")
                    continue

                logger.debug(f"match {span.text} start: {start} end: {end}")
                logger.debug(
                    f"start {span.start} end {span.end} "
                    f"start char {span.start_char} end char {span.end_char}"
                )
                logger.debug("adding match to sentex")
                doc._.sentex.append(span)

    def set_annotations(self, doc):
        """Modify the document in place.

        Every span in `doc._.sentex` becomes an entity labelled `self.label`.
        Existing entities that overlap a new one are dropped, and a match that
        overlaps an already accepted match is skipped (first match wins).
        Logic taken from spacy.pipeline.entityruler.EntityRuler
        """
        entities = list(doc.ents)
        logger.debug(f"current entities: {entities}")
        new_entities = []
        seen_tokens = set()

        for match_id, start, end in self._get_matches(doc):
            # Check the whole token range, not just its two boundaries: a
            # match that strictly contains an accepted one must be rejected
            # too, since spaCy's `Doc.ents` does not accept overlapping spans.
            if not seen_tokens.isdisjoint(range(start, end)):
                continue
            span = Span(doc, start, end, label=match_id)
            new_entities.append(span)
            logger.debug(f"new entity: {span.text} start: {start} end: {end}")
            entities = [
                e for e in entities if not (e.start < end and e.end > start)
            ]
            seen_tokens.update(range(start, end))

        doc.ents = entities + new_entities

    def _get_matches(self, doc: Doc):
        """Return `(label, start_token, end_token)` for each stored span."""
        return [(self.label, m.start, m.end) for m in doc._.sentex]
@Language.factory(
    "size",
    default_config={
        # `(?i)` is the inline case-insensitive flag. The previous `(i?)` was
        # an optional literal "i" capture group, so text such as "i5 cm"
        # matched as "i5 " and could not be parsed as a number.
        "size_regex": r"(?i)((\d+(\.\d+)?)\s*([Xx]\s*(\d+(\.\d+)?)\s*([Xx]\s*(\d+(\.\d+)?))?)?\s*)(?=\W?[cm]m)",
        "sentex_only": True,
        "annotate_ents": True,
        "label": "SIZE",
    },
)
def create_size_component(
    nlp: Language,
    name: str,
    size_regex: str,
    sentex_only: bool,
    annotate_ents: bool,
    label: str,
):
    """Build the `Size` component registered under the factory name "size".

    nlp: `Language`,
        The pipeline the component is being added to.
    name : `str`,
        The component's name inside the pipeline; accepted for the factory
        signature but not forwarded to `Size`.
    size_regex, sentex_only, annotate_ents, label:
        Passed through to `Size` unchanged; their defaults come from
        `default_config` above. The default `size_regex` matches one to three
        numbers separated by "x" that are followed by "cm" or "mm".
    """
    return Size(nlp, size_regex, sentex_only, annotate_ents, label)
class Size:
    """
    Size is a spaCy pipeline component that adds `(span, size)` tuples to the
    list `Doc._.size` based on regular expression matches within each
    sentence of the document. At most one match (the first) is recorded per
    sentence.

    nlp: `Language`,
        A required argument for spacy to use this as a factory
    size_regex : `str`,
        A regular expression to match spans within each sentence of the
        document.
    sentex_only : `bool`,
        Only match in sentences with Sentex-matched entities
    annotate_ents: `bool`,
        Write/overwrite matches to Doc.ents
    label: `str`,
        If annotate_ents == True, the label for the matched entity
    """

    def __init__(
        self,
        nlp: Language,
        size_regex: str,
        sentex_only: bool,
        annotate_ents: bool,
        label: str,
    ):
        self.size_regex = size_regex
        self.sentex_only = sentex_only
        self.annotate_ents = annotate_ents
        self.label = label

        if not Doc.has_extension("size"):
            Doc.set_extension("size", default=[])

    def __call__(self, doc: Doc) -> Doc:
        """Record size matches on `doc` and optionally write them to `doc.ents`."""
        self.process(doc)
        if self.annotate_ents:
            self.set_annotations(doc)
        return doc

    def set_annotations(self, doc):
        """Modify the document in place.

        Every span in `doc._.size` becomes an entity labelled `self.label`.
        Existing entities that overlap a new one are dropped, and a match that
        overlaps an already accepted match is skipped (first match wins).
        Logic taken from spacy.pipeline.entityruler.EntityRuler
        """
        entities = list(doc.ents)
        logger.debug(f"current entities: {entities}")
        new_entities = []
        seen_tokens = set()

        for match_id, start, end in self._get_matches(doc):
            # Check the whole token range, not just its two boundaries: a
            # match that strictly contains an accepted one must be rejected
            # too, since spaCy's `Doc.ents` does not accept overlapping spans.
            if not seen_tokens.isdisjoint(range(start, end)):
                continue
            span = Span(doc, start, end, label=match_id)
            new_entities.append(span)
            logger.debug(f"new entity: {span.text} start: {start} end: {end}")
            entities = [
                e for e in entities if not (e.start < end and e.end > start)
            ]
            seen_tokens.update(range(start, end))

        doc.ents = entities + new_entities

    def process(self, doc: Doc) -> Doc:
        """Find the first size expression in each candidate sentence.

        With `sentex_only` set, only sentences containing a span from
        `doc._.sentex` are searched; otherwise every sentence is. Each hit is
        appended to `doc._.size` as `(span, size)`. Returns `doc`.
        """
        if self.sentex_only:
            # De-duplicate while keeping first-seen order, so results do not
            # depend on set iteration order.
            sents = list(dict.fromkeys(span.sent for span in doc._.sentex))
        else:
            sents = doc.sents

        pattern = re.compile(self.size_regex)
        for sent in sents:
            match = pattern.search(sent.text)
            if match is None:
                continue

            # Match offsets are relative to the sentence text; shift them to
            # document character offsets.
            start, end = match.span()
            num_start = sent.start_char + start
            num_end = sent.start_char + end
            span = sent.doc.char_span(num_start, num_end, alignment_mode='contract')
            logger.debug(f'span: {span} indexes: {num_start} to {num_end}')
            if span is None:
                # 'contract' found no whole token inside the match, so there
                # is no span to record.
                continue

            try:
                size = self.parse_size(span.text)
            except ValueError:
                # A custom size_regex may match text that is not numeric;
                # skip it rather than abort the whole pipeline.
                logger.warning(f'could not parse a size from span: {span.text}')
                continue
            doc._.size.append((span, size))
        return doc

    def _get_matches(self, doc: Doc):
        """Return `(label, start_token, end_token)` for each stored size span."""
        return [(self.label, m[0].start, m[0].end) for m in doc._.size]

    def parse_size(self, text: str) -> float:
        """Return the larger of the first two dimensions found in `text`.

        `text` is a list of numbers separated by "x"/"X" and/or whitespace,
        e.g. "3 x 4.5". Dimensions after the second are ignored.

        Raises `ValueError` if `text` contains no numbers or a piece that is
        not a valid float.
        """
        # split() with no argument handles any run of whitespace (spaces,
        # tabs, newlines), all of which the size regex's `\s*` can match.
        numbers = [float(tok) for tok in text.lower().replace('x', ' ').split()]
        if not numbers:
            raise ValueError(f'no numeric dimensions found in {text!r}')
        return max(numbers[:2])