Overview
Comment: | [core] move getSentenceBoundaries from gc_engine to text module |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | core |
Files: | files | file ages | folders |
SHA3-256: | bf0a1bdd5dcdb4b2f65922bfa97012a4 |
User & Date: | olr on 2019-05-24 07:42:20 |
Other Links: | manifest | tags |
Context
2019-05-24
08:50 | [core] sentence splitting: code clarification check-in: 311ccab788 user: olr tags: trunk, core | |
07:42 | [core] move getSentenceBoundaries from gc_engine to text module check-in: bf0a1bdd5d user: olr tags: trunk, core | |
07:03 | [fr] faux positifs et ajustements check-in: 2f921877e7 user: olr tags: trunk, fr | |
Changes
Modified gc_core/js/lang_core/gc_engine.js from [842c50636b] to [0ae80cfb32].
︙ | ︙ | |||
159 160 161 162 163 164 165 | }, //// Parsing parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) { let oText = new TextParser(sText); return oText.parse(sCountry, bDebug, dOptions, bContext); | < < < < < < < < < < < < < < | 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | }, //// Parsing parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) { let oText = new TextParser(sText); return oText.parse(sCountry, bDebug, dOptions, bContext); } }; class TextParser { constructor (sText) { |
︙ | ︙ | |||
239 240 241 242 243 244 245 | this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash } if (this.sText.includes("@@")) { this.sText = this.sText.replace(/@@+/g, ""); } // parse sentence | | | 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash } if (this.sText.includes("@@")) { this.sText = this.sText.replace(/@@+/g, ""); } // parse sentence for (let [iStart, iEnd] of text.getSentenceBoundaries(this.sText)) { try { this.sSentence = this.sText.slice(iStart, iEnd); this.sSentence0 = this.sText0.slice(iStart, iEnd); this.nOffsetWithinParagraph = iStart; this.lToken = Array.from(_oTokenizer.genTokens(this.sSentence, true)); this.dTokenPos.clear(); for (let dToken of this.lToken) { |
︙ | ︙ |
Modified gc_core/js/text.js from [1251d5448b] to [dc3ef10e8d].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | // JavaScript /* jshint esversion:6, -W097 */ /* jslint esversion:6 */ /* global require, exports, console */ "use strict"; var text = { getParagraph: function* (sText, sSepParagraph = "\n") { // generator: returns paragraphs of text let iStart = 0; let iEnd = 0; sText = sText.replace("\r\n", "\n").replace("\r", "\n"); while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) { yield sText.slice(iStart, iEnd); | > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | // JavaScript /* jshint esversion:6, -W097 */ /* jslint esversion:6 */ /* global require, exports, console */ "use strict"; var text = { _zEndOfSentence: new RegExp ('([.?!:;…][ .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"), _zBeginOfParagraph: new RegExp ("^[- –—.,;?!…]*", "ig"), getSentenceBoundaries: function* (sText) { // generator: returns start and end of sentences found in <sText> let mBeginOfSentence = this._zBeginOfParagraph.exec(sText); let iStart = this._zBeginOfParagraph.lastIndex; let m; while ((m = this._zEndOfSentence.exec(sText)) !== null) { yield [iStart, this._zEndOfSentence.lastIndex]; iStart = this._zEndOfSentence.lastIndex; } }, getParagraph: function* (sText, sSepParagraph = "\n") { // generator: returns paragraphs of text let iStart = 0; let iEnd = 0; sText = sText.replace("\r\n", "\n").replace("\r", "\n"); while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) { yield sText.slice(iStart, iEnd); |
︙ | ︙ |
Modified gc_core/py/lang_core/gc_engine.py from [b546c7e179] to [c8470c3c39].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | """ Grammalecte Grammar checker engine """ import re import traceback #import unicodedata from itertools import chain from ..graphspell.spellchecker import SpellChecker from ..graphspell.echo import echo from . import gc_options try: # LibreOffice / OpenOffice from com.sun.star.linguistic2 import SingleProofreadingError from com.sun.star.text.TextMarkupType import PROOFREADING from com.sun.star.beans import PropertyValue | > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | """ Grammalecte Grammar checker engine """ import re import traceback #import unicodedata from itertools import chain from ..graphspell.spellchecker import SpellChecker from ..graphspell.echo import echo from .. import text from . import gc_options try: # LibreOffice / OpenOffice from com.sun.star.linguistic2 import SingleProofreadingError from com.sun.star.text.TextMarkupType import PROOFREADING from com.sun.star.beans import PropertyValue |
︙ | ︙ | |||
186 187 188 189 190 191 192 | def resetOptions (): "set options to default values" global _dOptions _dOptions = getDefaultOptions() #### Parsing | < < < < < < < < < < < | 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | def resetOptions (): "set options to default values" global _dOptions _dOptions = getDefaultOptions() #### Parsing def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False): "init point to analyze a text" oText = TextParser(sText) return oText.parse(sCountry, bDebug, dOptions, bContext) |
︙ | ︙ | |||
262 263 264 265 266 267 268 | sText = sText.replace("'", "’") if "‑" in sText: sText = sText.replace("‑", "-") # nobreakdash if "@@" in sText: sText = re.sub("@@+", "", sText) # parse sentences | | | 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 | sText = sText.replace("'", "’") if "‑" in sText: sText = sText.replace("‑", "-") # nobreakdash if "@@" in sText: sText = re.sub("@@+", "", sText) # parse sentences for iStart, iEnd in text.getSentenceBoundaries(sText): if 4 < (iEnd - iStart) < 2000: try: self.sSentence = sText[iStart:iEnd] self.sSentence0 = self.sText0[iStart:iEnd] self.nOffsetWithinParagraph = iStart self.lToken = list(_oTokenizer.genTokens(self.sSentence, True)) self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" } |
︙ | ︙ |
Modified gc_core/py/text.py from [36660e74f0] to [81c9cf089f].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | #!python3 """ Text tools """ import textwrap from itertools import chain def getParagraph (sText): "generator: returns paragraphs of text" iStart = 0 sText = sText.replace("\r\n", "\n").replace("\r", "\n") iEnd = sText.find("\n", iStart) while iEnd != -1: | > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | #!python3 """ Text tools """ import re import textwrap from itertools import chain _zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)') _zBeginOfParagraph = re.compile(r"^\W*") def getSentenceBoundaries (sText): "generator: returns start and end of sentences found in <sText>" iStart = _zBeginOfParagraph.match(sText).end() for m in _zEndOfSentence.finditer(sText): yield (iStart, m.end()) iStart = m.end() def getParagraph (sText): "generator: returns paragraphs of text" iStart = 0 sText = sText.replace("\r\n", "\n").replace("\r", "\n") iEnd = sText.find("\n", iStart) while iEnd != -1: |
︙ | ︙ |