Grammalecte  Check-in [f5dac644a7]

Overview
Comment: [core] text: sentence splitter update
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: f5dac644a7749463ed3378f7e16207593712bbcb3d87cdb31f9ef3df343fd25f
User & Date: olr on 2019-05-26 08:12:51
Other Links: manifest | tags
Context
2019-05-26
08:17
[cli] command update check-in: 7e80aa3aa5 user: olr tags: trunk, cli
08:12
[core] text: sentence splitter update check-in: f5dac644a7 user: olr tags: trunk, core
08:06
[cli] update full analysis check-in: f9ab0161bc user: olr tags: trunk, cli
Changes

Modified gc_core/js/text.js from [b124979f5a] to [09888a5f30].

1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19











-
+







// JavaScript

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */

"use strict";


var text = {

    _zEndOfSentence: new RegExp ('[.?!:;…]+[   ]+[»”’]?(?=[«"“‘–—   ]?[A-ZÉÈÎÔ])', "g"),
    _zEndOfSentence: new RegExp ('[.?!:;…]+[   ]+[»”’]?(?=[«"“‘–—   ]?[A-ZÀÂÉÈÊÎÔÇ])', "g"),

    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>
        let iStart = 0;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];

Modified gc_core/py/text.py from [bbd48992a7] to [d87792122b].

1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19











-
+







#!python3

"""
Text tools
"""

import re
import textwrap
from itertools import chain


_zEndOfSentence = re.compile(r'[.?!:;…]+[   ]+[»”’]?(?=[«"“‘–—   ]?[A-ZÉÈÎÔ])')
_zEndOfSentence = re.compile(r'[.?!:;…]+[   ]+[»”’]?(?=[«"“‘–—   ]?[A-ZÀÂÉÈÊÎÔÇ])')

def getSentenceBoundaries (sText):
    "generator: returns start and end of sentences found in <sText>"
    iStart = 0
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()