Index: gc_core/js/lang_core/gc_engine.js ================================================================== --- gc_core/js/lang_core/gc_engine.js +++ gc_core/js/lang_core/gc_engine.js @@ -161,24 +161,10 @@ //// Parsing parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) { let oText = new TextParser(sText); return oText.parse(sCountry, bDebug, dOptions, bContext); - }, - - _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"), - _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"), - _zEndOfParagraph: new RegExp ("[-  .,;?!…–—]*$", "ig"), - - getSentenceBoundaries: function* (sText) { - let mBeginOfSentence = this._zBeginOfParagraph.exec(sText); - let iStart = this._zBeginOfParagraph.lastIndex; - let m; - while ((m = this._zEndOfSentence.exec(sText)) !== null) { - yield [iStart, this._zEndOfSentence.lastIndex]; - iStart = this._zEndOfSentence.lastIndex; - } } }; class TextParser { @@ -241,11 +227,11 @@ if (this.sText.includes("@@")) { this.sText = this.sText.replace(/@@+/g, ""); } // parse sentence - for (let [iStart, iEnd] of gc_engine.getSentenceBoundaries(this.sText)) { + for (let [iStart, iEnd] of text.getSentenceBoundaries(this.sText)) { try { this.sSentence = this.sText.slice(iStart, iEnd); this.sSentence0 = this.sText0.slice(iStart, iEnd); this.nOffsetWithinParagraph = iStart; this.lToken = Array.from(_oTokenizer.genTokens(this.sSentence, true)); Index: gc_core/js/text.js ================================================================== --- gc_core/js/text.js +++ gc_core/js/text.js @@ -6,10 +6,25 @@ "use strict"; var text = { + + _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"), + _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"), + + getSentenceBoundaries: function* (sText) { + // generator: returns start and end of sentences found in + let mBeginOfSentence = this._zBeginOfParagraph.exec(sText); + let iStart = 
this._zBeginOfParagraph.lastIndex; + let m; + while ((m = this._zEndOfSentence.exec(sText)) !== null) { + yield [iStart, this._zEndOfSentence.lastIndex]; + iStart = this._zEndOfSentence.lastIndex; + } + }, + getParagraph: function* (sText, sSepParagraph = "\n") { // generator: returns paragraphs of text let iStart = 0; let iEnd = 0; sText = sText.replace("\r\n", "\n").replace("\r", "\n"); Index: gc_core/py/lang_core/gc_engine.py ================================================================== --- gc_core/py/lang_core/gc_engine.py +++ gc_core/py/lang_core/gc_engine.py @@ -8,10 +8,13 @@ #import unicodedata from itertools import chain from ..graphspell.spellchecker import SpellChecker from ..graphspell.echo import echo + +from .. import text + from . import gc_options try: # LibreOffice / OpenOffice from com.sun.star.linguistic2 import SingleProofreadingError @@ -188,21 +191,10 @@ global _dOptions _dOptions = getDefaultOptions() #### Parsing - -_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)') -_zBeginOfParagraph = re.compile(r"^\W*") -_zEndOfParagraph = re.compile(r"\W*$") - -def _getSentenceBoundaries (sText): - iStart = _zBeginOfParagraph.match(sText).end() - for m in _zEndOfSentence.finditer(sText): - yield (iStart, m.end()) - iStart = m.end() - def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False): "init point to analyze a text" oText = TextParser(sText) return oText.parse(sCountry, bDebug, dOptions, bContext) @@ -264,11 +256,11 @@ sText = sText.replace("‑", "-") # nobreakdash if "@@" in sText: sText = re.sub("@@+", "", sText) # parse sentences - for iStart, iEnd in _getSentenceBoundaries(sText): + for iStart, iEnd in text.getSentenceBoundaries(sText): if 4 < (iEnd - iStart) < 2000: try: self.sSentence = sText[iStart:iEnd] self.sSentence0 = self.sText0[iStart:iEnd] self.nOffsetWithinParagraph = iStart Index: gc_core/py/text.py ================================================================== --- 
gc_core/py/text.py +++ gc_core/py/text.py @@ -2,13 +2,25 @@ """ Text tools """ +import re import textwrap from itertools import chain + +_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)') +_zBeginOfParagraph = re.compile(r"^\W*") + +def getSentenceBoundaries (sText): + "generator: returns start and end of sentences found in <sText>" + iStart = _zBeginOfParagraph.match(sText).end() + for m in _zEndOfSentence.finditer(sText): + yield (iStart, m.end()) + iStart = m.end() + def getParagraph (sText): "generator: returns paragraphs of text" iStart = 0 sText = sText.replace("\r\n", "\n").replace("\r", "\n")