Index: gc_core/js/text.js ================================================================== --- gc_core/js/text.js +++ gc_core/js/text.js @@ -7,11 +7,11 @@ "use strict"; var text = { - _zEndOfSentence: new RegExp ('[.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])', "g"), + _zEndOfSentence: new RegExp ('[.?!:;…]+[   ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])', "g"), getSentenceBoundaries: function* (sText) { // generator: returns start and end of sentences found in let iStart = 0; let m; @@ -19,10 +19,17 @@ yield [iStart, this._zEndOfSentence.lastIndex]; iStart = this._zEndOfSentence.lastIndex; } yield [iStart, sText.length]; }, + + getSentence: function* (sText) { + // generator: returns sentences found in + for (let [iStart, iEnd] of this.getSentenceBoundaries(sText)) { + yield sText.slice(iStart, iEnd); + } + }, getParagraph: function* (sText, sSepParagraph = "\n") { // generator: returns paragraphs of text let iStart = 0; let iEnd = 0; Index: gc_core/py/text.py ================================================================== --- gc_core/py/text.py +++ gc_core/py/text.py @@ -7,20 +7,26 @@ import re import textwrap from itertools import chain -_zEndOfSentence = re.compile(r'[.?!:;…]\W+(?=[A-ZÉÈÎÔ])') +_zEndOfSentence = re.compile(r'[.?!:;…]+[   ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])') def getSentenceBoundaries (sText): "generator: returns start and end of sentences found in " iStart = 0 for m in _zEndOfSentence.finditer(sText): yield (iStart, m.end()) iStart = m.end() yield (iStart, len(sText)) + +def getSentence (sText): + "generator: returns sentences found in " + for iStart, iEnd in getSentenceBoundaries(sText): + yield sText[iStart:iEnd] + def getParagraph (sText): "generator: returns paragraphs of text" iStart = 0 sText = sText.replace("\r\n", "\n").replace("\r", "\n")