Overview
| Comment: | [core] sentence splitting: code clarification |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | core |
| Files: | files | file ages | folders |
| SHA3-256: |
311ccab78875d456cb1047be38072b65 |
| User & Date: | olr on 2019-05-24 08:50:20 |
| Other Links: | manifest | tags |
Context
|
2019-05-24
| ||
| 11:21 | [fr] conversion: regex rules -> graph rules check-in: a5b3aff838 user: olr tags: trunk, fr | |
| 08:50 | [core] sentence splitting: code clarification check-in: 311ccab788 user: olr tags: trunk, core | |
| 07:42 | [core] move getSentenceBoundaries from gc_engine to text module check-in: bf0a1bdd5d user: olr tags: trunk, core | |
Changes
Modified gc_core/js/text.js from [dc3ef10e8d] to [3e4e705e26].
1 2 3 4 5 6 7 8 9 10 11 |
// JavaScript
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */
"use strict";
var text = {
| | < < | > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
// JavaScript
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */
"use strict";
var text = {
_zEndOfSentence: new RegExp ('[.?!:;…][ .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])', "g"),
getSentenceBoundaries: function* (sText) {
// generator: returns start and end of sentences found in <sText>
let iStart = 0;
let m;
while ((m = this._zEndOfSentence.exec(sText)) !== null) {
yield [iStart, this._zEndOfSentence.lastIndex];
iStart = this._zEndOfSentence.lastIndex;
}
yield [iStart, sText.length];
},
getParagraph: function* (sText, sSepParagraph = "\n") {
// generator: returns paragraphs of text
let iStart = 0;
let iEnd = 0;
sText = sText.replace("\r\n", "\n").replace("\r", "\n");
|
| ︙ | ︙ |
Modified gc_core/py/text.py from [81c9cf089f] to [71de26b7a5].
1 2 3 4 5 6 7 8 9 10 11 | #!python3 """ Text tools """ import re import textwrap from itertools import chain | | < | > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
#!python3
"""
Text tools
"""
import re
import textwrap
from itertools import chain
# End of sentence: one closing punctuation mark, then one or more non-word
# characters, immediately followed (not consumed) by an uppercase letter.
_zEndOfSentence = re.compile(r'[.?!:;…]\W+(?=[A-ZÉÈÎÔ])')

def getSentenceBoundaries (sText):
    "generator: yields (start, end) index pairs for each sentence found in <sText>"
    iBegin = 0
    oMatch = _zEndOfSentence.search(sText, iBegin)
    while oMatch is not None:
        iStop = oMatch.end()
        yield (iBegin, iStop)
        # resume scanning right after the separator that closed this sentence
        iBegin = iStop
        oMatch = _zEndOfSentence.search(sText, iBegin)
    # the remainder of the text (possibly empty, possibly everything) is the last sentence
    yield (iBegin, len(sText))
def getParagraph (sText):
"generator: returns paragraphs of text"
iStart = 0
sText = sText.replace("\r\n", "\n").replace("\r", "\n")
iEnd = sText.find("\n", iStart)
|
| ︙ | ︙ |
Modified gc_lang/fr/rules.grx from [e6a924b2a3] to [8019fdef02].
| ︙ | ︙ | |||
1590 1591 1592 1593 1594 1595 1596 1597 | !!! !!! !!! Processeur: épuration des signes inutiles et quelques simplifications !! !!! !!! # fin de phrase | > > > | | | 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 |
!!!
!!!
!!! Processeur: épuration des signes inutiles et quelques simplifications !!
!!!
!!!
# début de phrase
__<s>(p_début_de_phrase)__ ^[ .?!:;–—•·… »«‘’“”\"'¿¡-]+ <<- ~>> *
# fin de phrase
__<s>(p_fin_de_phrase)__ [ .?!:;–—•·… »«‘’“”\"'¿¡-]+$ <<- ~>> *
# Guillemets et exposants
__<s>(p_guillemets_exposants)__ [«»“”\"„`¹²³⁴⁵⁶⁷⁸⁹⁰]+ <<- ~>> *
# Chapitres et références
__<s>(p_chapitre_référence)__ [\[({][\dIVXLCDM]+, \d+[\])}] <js>[\[\(\{][\dIVXLCDM]+, \d+[\]\)\}]</js> <<- ~>> *
# le, la ou les chose(s)
__[i>(p_le_ou_les)__ l[ea] ou les {w_2}([(]s[)]) @@$ <<- ~1>> s
__[i](p_le_ou_la)__ l(e ou la|a ou le) {w_2} @@1 <<- ~1>> ’
|
| ︙ | ︙ |