Grammalecte  Check-in [bf0a1bdd5d]

Overview
Comment:[core] move getSentenceBoundaries from gc_engine to text module
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: bf0a1bdd5dcdb4b2f65922bfa97012a4eb4f1ab5df68bf2a797ee178156b1742
User & Date: olr on 2019-05-24 07:42:20
Other Links: manifest | tags
Context
2019-05-24
08:50
[core] sentence splitting: code clarification check-in: 311ccab788 user: olr tags: trunk, core
07:42
[core] move getSentenceBoundaries from gc_engine to text module check-in: bf0a1bdd5d user: olr tags: trunk, core
07:03
[fr] faux positifs et ajustements check-in: 2f921877e7 user: olr tags: trunk, fr
Changes

Modified gc_core/js/lang_core/gc_engine.js from [842c50636b] to [0ae80cfb32].

159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
159
160
161
162
163
164
165














166
167
168
169
170
171
172







-
-
-
-
-
-
-
-
-
-
-
-
-
-







    },

    //// Parsing

    parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
        let oText = new TextParser(sText);
        return oText.parse(sCountry, bDebug, dOptions, bContext);
    },

    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),
    _zEndOfParagraph: new RegExp ("[-  .,;?!…–—]*$", "ig"),

    getSentenceBoundaries: function* (sText) {
        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
        let iStart = this._zBeginOfParagraph.lastIndex;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
    }
};


class TextParser {

    constructor (sText) {
239
240
241
242
243
244
245
246

247
248
249
250
251
252
253
225
226
227
228
229
230
231

232
233
234
235
236
237
238
239







-
+







            this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash
        }
        if (this.sText.includes("@@")) {
            this.sText = this.sText.replace(/@@+/g, "");
        }

        // parse sentence
        for (let [iStart, iEnd] of gc_engine.getSentenceBoundaries(this.sText)) {
        for (let [iStart, iEnd] of text.getSentenceBoundaries(this.sText)) {
            try {
                this.sSentence = this.sText.slice(iStart, iEnd);
                this.sSentence0 = this.sText0.slice(iStart, iEnd);
                this.nOffsetWithinParagraph = iStart;
                this.lToken = Array.from(_oTokenizer.genTokens(this.sSentence, true));
                this.dTokenPos.clear();
                for (let dToken of this.lToken) {

Modified gc_core/js/text.js from [1251d5448b] to [dc3ef10e8d].

1
2
3
4
5
6
7
8
9
10















11
12
13
14
15
16
17
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32










+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







// JavaScript

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */

"use strict";


var text = {

    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),

    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>
        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
        let iStart = this._zBeginOfParagraph.lastIndex;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
    },

    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");
        while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {
            yield sText.slice(iStart, iEnd);

Modified gc_core/py/lang_core/gc_engine.py from [b546c7e179] to [c8470c3c39].

1
2
3
4
5
6
7
8
9
10
11
12



13
14
15
16
17
18
19
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22












+
+
+







"""
Grammalecte
Grammar checker engine
"""

import re
import traceback
#import unicodedata
from itertools import chain

from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo

from .. import text

from . import gc_options

try:
    # LibreOffice / OpenOffice
    from com.sun.star.linguistic2 import SingleProofreadingError
    from com.sun.star.text.TextMarkupType import PROOFREADING
    from com.sun.star.beans import PropertyValue
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
189
190
191
192
193
194
195











196
197
198
199
200
201
202







-
-
-
-
-
-
-
-
-
-
-







def resetOptions ():
    "set options to default values"
    global _dOptions
    _dOptions = getDefaultOptions()


#### Parsing

_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")
_zEndOfParagraph = re.compile(r"\W*$")

def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()


def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    "init point to analyze a text"
    oText = TextParser(sText)
    return oText.parse(sCountry, bDebug, dOptions, bContext)


262
263
264
265
266
267
268
269

270
271
272
273
274
275
276
254
255
256
257
258
259
260

261
262
263
264
265
266
267
268







-
+







            sText = sText.replace("'", "’")
        if "‑" in sText:
            sText = sText.replace("‑", "-") # nobreakdash
        if "@@" in sText:
            sText = re.sub("@@+", "", sText)

        # parse sentences
        for iStart, iEnd in _getSentenceBoundaries(sText):
        for iStart, iEnd in text.getSentenceBoundaries(sText):
            if 4 < (iEnd - iStart) < 2000:
                try:
                    self.sSentence = sText[iStart:iEnd]
                    self.sSentence0 = self.sText0[iStart:iEnd]
                    self.nOffsetWithinParagraph = iStart
                    self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
                    self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }

Modified gc_core/py/text.py from [36660e74f0] to [81c9cf089f].

1
2
3
4
5
6

7
8
9











10
11
12
13
14
15
16
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28






+



+
+
+
+
+
+
+
+
+
+
+







#!python3

"""
Text tools
"""

import re
import textwrap
from itertools import chain


_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")

def getSentenceBoundaries (sText):
    "generator: returns start and end of sentences found in <sText>"
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()


def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)
    while iEnd != -1: