Grammalecte  Check-in [2777d8cef6]

Overview
Comment:[core] new regex for sentence splitting, generator of sentences
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 2777d8cef6c55a83e05311347076867a67f3cf007a7f3e7298d41a770d8510f7
User & Date: olr on 2019-05-24 12:21:35
Other Links: manifest | tags
Context
2019-05-24
14:12
[core] sentence splitting: code clarification check-in: b52cb827b1 user: olr tags: trunk, core
12:21
[core] new regex for sentence splitting, generator of sentences check-in: 2777d8cef6 user: olr tags: trunk, core
12:20
[fr] commentaire check-in: 8fd1fbf7f3 user: olr tags: trunk, fr
Changes

Modified gc_core/js/text.js from [3e4e705e26] to [091c88a16c].

1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
20
21
22
23







24
25
26
27
28
29
30
1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37











-
+











+
+
+
+
+
+
+







// JavaScript

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */

"use strict";


var text = {

    _zEndOfSentence: new RegExp ('[.?!:;…][   .?!…»«”"‘’)–—]+(?=[A-ZÉÈÎÔ])', "g"),
    _zEndOfSentence: new RegExp ('[.?!:;…]+[   ]+[»”’]?(?=[«"‘]?[A-ZÉÈÎÔ–—])', "g"),

    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>
        let iStart = 0;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
        yield [iStart, sText.length];
    },

    getSentence: function* (sText) {
        // generator: returns sentences found in <sText>
        for (let [iStart, iEnd] of this.getSentenceBoundaries(sText)) {
            yield sText.slice(iStart, iEnd);
        }
    },

    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");
        while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {

Modified gc_core/py/text.py from [71de26b7a5] to [dd28033587].

1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
20
21






22
23
24
25
26
27
28
1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34











-
+









+
+
+
+
+
+







#!python3

"""
Text tools
"""

import re
import textwrap
from itertools import chain


_zEndOfSentence = re.compile(r'[.?!:;…]\W+(?=[A-ZÉÈÎÔ])')
_zEndOfSentence = re.compile(r'[.?!:;…]+[   ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])')

def getSentenceBoundaries (sText):
    "generator: yields a (start, end) tuple of offsets for each sentence found in <sText>"
    nBegin = 0
    # each match of the end-of-sentence pattern closes the current sentence
    for nEnd in (oMatch.end() for oMatch in _zEndOfSentence.finditer(sText)):
        yield (nBegin, nEnd)
        nBegin = nEnd
    # the text after the last match (possibly the whole text) is the final sentence
    yield (nBegin, len(sText))


def getSentence (sText):
    "generator: yields each sentence of <sText> as a substring"
    # slice <sText> with the offsets computed by getSentenceBoundaries()
    yield from (sText[nBegin:nEnd]  for nBegin, nEnd in getSentenceBoundaries(sText))


def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)
    while iEnd != -1: