Overview
Comment: | [core] new regex for sentence splitting, generator of sentences |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | core |
Files: | files | file ages | folders |
SHA3-256: |
2777d8cef6c55a83e05311347076867a |
User & Date: | olr on 2019-05-24 12:21:35 |
Other Links: | manifest | tags |
Context
2019-05-24
| ||
14:12 | [core] sentence splitting: code clarification check-in: b52cb827b1 user: olr tags: trunk, core | |
12:21 | [core] new regex for sentence splitting, generator of sentences check-in: 2777d8cef6 user: olr tags: trunk, core | |
12:20 | [fr] commentaire check-in: 8fd1fbf7f3 user: olr tags: trunk, fr | |
Changes
Modified gc_core/js/text.js from [3e4e705e26] to [091c88a16c].
// JavaScript

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */

"use strict";


var text = {

    // End-of-sentence pattern: one or more of . ? ! : ; … followed by one or
    // more spaces and an optional closing quote (» ” ’), provided that what
    // comes next is an optional opening quote (« " “ ‘) and then either one of
    // the listed uppercase letters (A-Z, É, È, Î, Ô) or an en/em dash.
    // It carries the "g" flag, so exec() on this very object is stateful
    // (it reads and writes lastIndex).
    _zEndOfSentence: new RegExp ('[.?!:;…]+[ ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])', "g"),

    /**
     * Generator: yields the boundaries of each sentence found in <sText>.
     *
     * @param {string} sText - text to split
     * @yields {Array<number>} [iStart, iEnd] pair; iEnd is exclusive, so that
     *         sText.slice(iStart, iEnd) is the sentence, trailing spaces (and
     *         closing quote, if any) included. The last pair always ends at
     *         sText.length; an empty text yields the single pair [0, 0].
     */
    getSentenceBoundaries: function* (sText) {
        // Scan with a private copy of the pattern: calling exec() on the shared
        // regex would leave its lastIndex dirty whenever a caller abandons this
        // generator early, and would corrupt results when two generators are
        // consumed in an interleaved fashion.
        const zBoundary = new RegExp(this._zEndOfSentence);
        let iStart = 0;
        while (zBoundary.exec(sText) !== null) {
            yield [iStart, zBoundary.lastIndex];
            iStart = zBoundary.lastIndex;
        }
        // whatever follows the last boundary is the final sentence
        yield [iStart, sText.length];
    },

    /**
     * Generator: yields each sentence found in <sText>.
     *
     * @param {string} sText - text to split
     * @yields {string} the slices of <sText> delimited by getSentenceBoundaries()
     */
    getSentence: function* (sText) {
        for (let [iStart, iEnd] of this.getSentenceBoundaries(sText)) {
            yield sText.slice(iStart, iEnd);
        }
    },

    // The excerpt stops inside the next member, getParagraph. Its visible part
    // is reproduced verbatim below, commented out because it is incomplete
    // here; the object literal is closed right after it only so that this
    // excerpt parses on its own.
    // NOTE(review): String.prototype.replace() with a string pattern replaces
    // only the first occurrence -- confirm that is intended for "\r\n" / "\r".
    //
    // getParagraph: function* (sText, sSepParagraph = "\n") {
    //     // generator: returns paragraphs of text
    //     let iStart = 0;
    //     let iEnd = 0;
    //     sText = sText.replace("\r\n", "\n").replace("\r", "\n");
    //     while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {
};
︙ | ︙ |
Modified gc_core/py/text.py from [71de26b7a5] to [dd28033587].
#!python3

"""
Text tools
"""

import re
import textwrap
from itertools import chain


# End-of-sentence pattern: one or more of . ? ! : ; … followed by one or more
# spaces and an optional closing quote (» ” ’), provided that what comes next
# is an optional opening quote (« " “ ‘) and then either one of the listed
# uppercase letters (A-Z, É, È, Î, Ô) or an en/em dash.
_zEndOfSentence = re.compile(r'[.?!:;…]+[ ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])')


def getSentenceBoundaries (sText):
    """generator: yields a (start, end) tuple for each sentence found in <sText>

    <end> is exclusive, so sText[start:end] is the sentence, trailing spaces
    (and closing quote, if any) included. The last tuple always ends at
    len(sText); an empty text yields the single tuple (0, 0).
    """
    nBegin = 0
    oMatch = _zEndOfSentence.search(sText)
    while oMatch:
        nStop = oMatch.end()
        yield (nBegin, nStop)
        nBegin = nStop
        # resume the scan right after the boundary just reported
        oMatch = _zEndOfSentence.search(sText, nStop)
    # whatever follows the last boundary is the final sentence
    yield (nBegin, len(sText))


def getSentence (sText):
    "generator: yields each sentence of <sText>, as delimited by getSentenceBoundaries()"
    for tSpan in getSentenceBoundaries(sText):
        yield sText[slice(*tSpan)]


# The excerpt stops inside the next function, getParagraph. Its visible part is
# reproduced verbatim below, commented out because it is incomplete here.
#
# def getParagraph (sText):
#     "generator: returns paragraphs of text"
#     iStart = 0
#     sText = sText.replace("\r\n", "\n").replace("\r", "\n")
#     iEnd = sText.find("\n", iStart)
#     while iEnd != -1:
︙ | ︙ |