Overview
| Comment: | [core] new regex for sentence splitting, generator of sentences | 
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive | 
| Timelines: | family | ancestors | descendants | both | trunk | core | 
| Files: | files | file ages | folders | 
| SHA3-256: | 
2777d8cef6c55a83e05311347076867a | 
| User & Date: | olr on 2019-05-24 12:21:35 | 
| Other Links: | manifest | tags | 
Context
| 
   2019-05-24 
 | ||
| 14:12 | [core] sentence splitting: code clarification check-in: b52cb827b1 user: olr tags: trunk, core | |
| 12:21 | [core] new regex for sentence splitting, generator of sentences check-in: 2777d8cef6 user: olr tags: trunk, core | |
| 12:20 | [fr] commentaire check-in: 8fd1fbf7f3 user: olr tags: trunk, fr | |
Changes
Modified gc_core/js/text.js from [3e4e705e26] to [091c88a16c].
1 2 3 4 5 6 7 8 9 10 11  | 
// JavaScript
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */
"use strict";
var text = {
 | | > > > > > > >  | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37  | 
// JavaScript
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */
"use strict";
var text = {
    // End-of-sentence pattern: one or more of . ? ! : ; … followed by one or more characters
    // of the bracketed spacing class and an optional closing quote (» ” ’). The lookahead
    // requires, without consuming it, that what follows is an optional opening quote
    // (« " “ ‘) and then one of A-Z É È Î Ô – —.
    // NOTE(review): the spacing class looks like a plain space plus non-breaking variants —
    // confirm the exact code points.
    // The "g" flag makes exec() on this object stateful (it reads and updates lastIndex).
    _zEndOfSentence: new RegExp ('[.?!:;…]+[   ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])', "g"),
    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>
        let iStart = 0;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
        yield [iStart, sText.length];
    },
    getSentence: function* (sText) {
        // generator: returns sentences found in <sText>
        for (let [iStart, iEnd] of this.getSentenceBoundaries(sText)) {
            yield sText.slice(iStart, iEnd);
        }
    },
    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");
        while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {
 | 
| ︙ | ︙ | 
Modified gc_core/py/text.py from [71de26b7a5] to [dd28033587].
1 2 3 4 5 6 7 8 9 10 11  | #!python3 """ Text tools """ import re import textwrap from itertools import chain  | | > > > > > >  | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34  | 
#!python3
"""
Text tools
"""
import re
import textwrap
from itertools import chain
# End-of-sentence pattern: one or more of . ? ! : ; … followed by one or more characters
# of the bracketed spacing class and an optional closing quote (» ” ’). The lookahead
# requires, without consuming it, that what follows is an optional opening quote
# (« " “ ‘) and then one of A-Z É È Î Ô – —.
# NOTE(review): the spacing class looks like a plain space plus non-breaking variants —
# confirm the exact code points.
_zEndOfSentence = re.compile(r'[.?!:;…]+[   ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])')
def getSentenceBoundaries (sText):
    """generator: yields one (start, end) pair of indexes per sentence found in <sText>

    Each match of <_zEndOfSentence> closes a sentence at the end of the match.
    The last pair always runs up to len(sText), so at least one pair is yielded.
    """
    iSentenceStart = 0
    for iSentenceEnd in (m.end() for m in _zEndOfSentence.finditer(sText)):
        yield (iSentenceStart, iSentenceEnd)
        iSentenceStart = iSentenceEnd
    yield (iSentenceStart, len(sText))
def getSentence (sText):
    """generator: yields, in order, each sentence of <sText> as a substring,
    cut at the boundaries given by getSentenceBoundaries()"""
    for tBoundaries in getSentenceBoundaries(sText):
        yield sText[slice(*tBoundaries)]
def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)
    while iEnd != -1:
 | 
| ︙ | ︙ |