Grammalecte  Check-in [d0bd3382eb]

Overview
Comment:[core] update: paragraph splitting
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: d0bd3382ebaefaa81cbeb41789eacce763b4bb638f12820d10e56db40801d00b
User & Date: olr on 2019-04-16 16:05:28
Other Links: manifest | tags
Context
2019-04-16
16:29
[fr] faux positif (ocr) check-in: 660df51ea1 user: olr tags: trunk, fr
16:05
[core] update: paragraph splitting check-in: d0bd3382eb user: olr tags: trunk, core
15:58
[fr] faux positifs (ocr) check-in: a052c4803b user: olr tags: trunk, fr
Changes

Modified gc_core/js/lang_core/gc_engine.js from [577effc815] to [5d05a79843].

161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
    //// Parsing

    parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
        let oText = new TextParser(sText);
        return oText.parse(sCountry, bDebug, dOptions, bContext);
    },

    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»”")]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),
    _zEndOfParagraph: new RegExp ("[-  .,;?!…–—]*$", "ig"),

    getSentenceBoundaries: function* (sText) {
        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
        let iStart = this._zBeginOfParagraph.lastIndex;
        let m;







|







161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
    //// Parsing

    parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
        let oText = new TextParser(sText);
        return oText.parse(sCountry, bDebug, dOptions, bContext);
    },

    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),
    _zEndOfParagraph: new RegExp ("[-  .,;?!…–—]*$", "ig"),

    getSentenceBoundaries: function* (sText) {
        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
        let iStart = this._zBeginOfParagraph.lastIndex;
        let m;

Modified gc_core/py/lang_core/gc_engine.py from [ca233af37a] to [52335d206d].

187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
    "set options to default values"
    global _dOptions
    _dOptions = dict(gc_options.getOptions(_sAppContext))


#### Parsing

_zEndOfSentence = re.compile(r'([.?!:;…][   .?!…»”"’)]+(?=[A-ZÉÈÎÔ])|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")
_zEndOfParagraph = re.compile(r"\W*$")

def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())







|







187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
    "set options to default values"
    global _dOptions
    _dOptions = dict(gc_options.getOptions(_sAppContext))


#### Parsing

_zEndOfSentence = re.compile(r'([.?!:;…][   .?!…»«“”"‘’)]+(?=[A-ZÉÈÎÔ])|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")
_zEndOfParagraph = re.compile(r"\W*$")

def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())