Grammalecte  Check-in [311ccab788]

Overview
Comment:[core] sentence splitting: code clarification
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 311ccab78875d456cb1047be38072b6556d2c927d802c96c3896d65615cad872
User & Date: olr on 2019-05-24 08:50:20
Other Links: manifest | tags
Context
2019-05-24
11:21
[fr] conversion: regex rules -> graph rules check-in: a5b3aff838 user: olr tags: trunk, fr
08:50
[core] sentence splitting: code clarification check-in: 311ccab788 user: olr tags: trunk, core
07:42
[core] move getSentenceBoundaries from gc_engine to text module check-in: bf0a1bdd5d user: olr tags: trunk, core
Changes

Modified gc_core/js/text.js from [dc3ef10e8d] to [3e4e705e26].

1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18

19
20
21
22
23

24
25
26
27
28
29
30
1
2
3
4
5
6
7
8
9
10
11

12

13
14
15


16
17
18
19
20
21
22
23
24
25
26
27
28
29











-
+
-



-
-
+





+







// JavaScript

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */

"use strict";


var text = {

    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    _zEndOfSentence: new RegExp ('[.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])', "g"),
    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),

    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>
        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
        let iStart = this._zBeginOfParagraph.lastIndex;
        let iStart = 0;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
        yield [iStart, sText.length];
    },

    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");

Modified gc_core/py/text.py from [81c9cf089f] to [71de26b7a5].

1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17

18
19
20

21
22
23
24
25
26
27
1
2
3
4
5
6
7
8
9
10
11

12

13
14
15

16
17
18
19
20
21
22
23
24
25
26
27











-
+
-



-
+



+







#!python3

"""
Text tools
"""

import re
import textwrap
from itertools import chain


_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)')
_zEndOfSentence = re.compile(r'[.?!:;…]\W+(?=[A-ZÉÈÎÔ])')
_zBeginOfParagraph = re.compile(r"^\W*")

def getSentenceBoundaries (sText):
    """generator: returns start and end of sentences found in <sText>

    Yields (iStart, iEnd) tuples of offsets into <sText>. Each match of
    _zEndOfSentence closes a sentence at the end of the match, and the next
    sentence starts at that same offset. The text remaining after the last
    match (the whole text when nothing matches) is yielded last, so at least
    one tuple is always produced, even for an empty string.
    """
    iStart = 0
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()
    yield (iStart, len(sText))


def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)

Modified gc_lang/fr/rules.grx from [e6a924b2a3] to [8019fdef02].

1590
1591
1592
1593
1594
1595
1596



1597
1598

1599
1600
1601

1602
1603
1604
1605
1606
1607
1608
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600

1601
1602
1603

1604
1605
1606
1607
1608
1609
1610
1611







+
+
+

-
+


-
+








!!!
!!!
!!! Processeur: épuration des signes inutiles et quelques simplifications                         !!
!!!
!!!

# début de phrase
__<s>(p_début_de_phrase)__      ^[ .?!:;–—•·… »«‘’“”\"'¿¡-]+  <<- ~>> *

# fin de phrase
__<s>(p_fin_de_phrase)__        [.?!:;…][ .?!… »”")]*$  <<- ~>> *
__<s>(p_fin_de_phrase)__        [ .?!:;–—•·… »«‘’“\"'¿¡-]+$  <<- ~>> *

# Guillemets et exposants
__<s>(p_guillemets_exposants)__ [«»“”"„`¹²³⁴⁵⁶⁷⁸⁹⁰]+ <<- ~>> *
__<s>(p_guillemets_exposants)__ [«»“”\"„`¹²³⁴⁵⁶⁷⁸⁹⁰]+ <<- ~>> *

# Chapitres et références
__<s>(p_chapitre_référence)__   [\[({][\dIVXLCDM]+, \d+[\])}]   <js>[\[\(\{][\dIVXLCDM]+, \d+[\]\)\}]</js>   <<- ~>> *

# le, la ou les chose(s)
__[i>(p_le_ou_les)__            l[ea] ou les {w_2}([(]s[)]) @@$ <<- ~1>> s
__[i](p_le_ou_la)__             l(e ou la|a ou le) {w_2} @@1 <<- ~1>> ’