Overview
| Comment: | [core] move getSentenceBoundaries from gc_engine to text module |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk | core |
| Files: | files | file ages | folders |
| SHA3-256: | bf0a1bdd5dcdb4b2f65922bfa97012a4 |
| User & Date: | olr on 2019-05-24 07:42:20 |
| Other Links: | manifest | tags |
Context
|
2019-05-24
| ||
| 08:50 | [core] sentence splitting: code clarification check-in: 311ccab788 user: olr tags: trunk, core | |
| 07:42 | [core] move getSentenceBoundaries from gc_engine to text module check-in: bf0a1bdd5d user: olr tags: trunk, core | |
| 07:03 | [fr] faux positifs et ajustements check-in: 2f921877e7 user: olr tags: trunk, fr | |
Changes
Modified gc_core/js/lang_core/gc_engine.js from [842c50636b] to [0ae80cfb32].
| ︙ | ︙ | |||
159 160 161 162 163 164 165 |
},
//// Parsing
parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
let oText = new TextParser(sText);
return oText.parse(sCountry, bDebug, dOptions, bContext);
| < < < < < < < < < < < < < < | 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
},
//// Parsing
parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
let oText = new TextParser(sText);
return oText.parse(sCountry, bDebug, dOptions, bContext);
}
};
class TextParser {
constructor (sText) {
|
| ︙ | ︙ | |||
239 240 241 242 243 244 245 |
this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash
}
if (this.sText.includes("@@")) {
this.sText = this.sText.replace(/@@+/g, "");
}
// parse sentence
| | | 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 |
this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash
}
if (this.sText.includes("@@")) {
this.sText = this.sText.replace(/@@+/g, "");
}
// parse sentence
for (let [iStart, iEnd] of text.getSentenceBoundaries(this.sText)) {
try {
this.sSentence = this.sText.slice(iStart, iEnd);
this.sSentence0 = this.sText0.slice(iStart, iEnd);
this.nOffsetWithinParagraph = iStart;
this.lToken = Array.from(_oTokenizer.genTokens(this.sSentence, true));
this.dTokenPos.clear();
for (let dToken of this.lToken) {
|
| ︙ | ︙ |
Modified gc_core/js/text.js from [1251d5448b] to [dc3ef10e8d].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
// JavaScript
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */
"use strict";
var text = {
getParagraph: function* (sText, sSepParagraph = "\n") {
// generator: returns paragraphs of text
let iStart = 0;
let iEnd = 0;
sText = sText.replace("\r\n", "\n").replace("\r", "\n");
while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {
yield sText.slice(iStart, iEnd);
| > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
// JavaScript
/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global require, exports, console */
"use strict";
var text = {
_zEndOfSentence: new RegExp ('([.?!:;…][ .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
_zBeginOfParagraph: new RegExp ("^[- –—.,;?!…]*", "ig"),
getSentenceBoundaries: function* (sText) {
// generator: returns start and end of sentences found in <sText>
let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
let iStart = this._zBeginOfParagraph.lastIndex;
let m;
while ((m = this._zEndOfSentence.exec(sText)) !== null) {
yield [iStart, this._zEndOfSentence.lastIndex];
iStart = this._zEndOfSentence.lastIndex;
}
},
getParagraph: function* (sText, sSepParagraph = "\n") {
// generator: returns paragraphs of text
let iStart = 0;
let iEnd = 0;
sText = sText.replace("\r\n", "\n").replace("\r", "\n");
while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {
yield sText.slice(iStart, iEnd);
|
| ︙ | ︙ |
Modified gc_core/py/lang_core/gc_engine.py from [b546c7e179] to [c8470c3c39].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
"""
Grammalecte
Grammar checker engine
"""
import re
import traceback
#import unicodedata
from itertools import chain
from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo
from . import gc_options
try:
# LibreOffice / OpenOffice
from com.sun.star.linguistic2 import SingleProofreadingError
from com.sun.star.text.TextMarkupType import PROOFREADING
from com.sun.star.beans import PropertyValue
| > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
"""
Grammalecte
Grammar checker engine
"""
import re
import traceback
#import unicodedata
from itertools import chain
from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo
from .. import text
from . import gc_options
try:
# LibreOffice / OpenOffice
from com.sun.star.linguistic2 import SingleProofreadingError
from com.sun.star.text.TextMarkupType import PROOFREADING
from com.sun.star.beans import PropertyValue
|
| ︙ | ︙ | |||
186 187 188 189 190 191 192 |
def resetOptions ():
"set options to default values"
global _dOptions
_dOptions = getDefaultOptions()
#### Parsing
| < < < < < < < < < < < | 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
def resetOptions ():
    "set options to default values"
    # Rebind the module-level option dictionary to a fresh default mapping;
    # any previously customized settings are discarded.
    global _dOptions
    _dOptions = getDefaultOptions()
#### Parsing
def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    "init point to analyze a text"
    # Delegate the whole analysis to a freshly built TextParser.
    return TextParser(sText).parse(sCountry, bDebug, dOptions, bContext)
|
| ︙ | ︙ | |||
262 263 264 265 266 267 268 |
sText = sText.replace("'", "’")
if "‑" in sText:
sText = sText.replace("‑", "-") # nobreakdash
if "@@" in sText:
sText = re.sub("@@+", "", sText)
# parse sentences
| | | 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
sText = sText.replace("'", "’")
if "‑" in sText:
sText = sText.replace("‑", "-") # nobreakdash
if "@@" in sText:
sText = re.sub("@@+", "", sText)
# parse sentences
for iStart, iEnd in text.getSentenceBoundaries(sText):
if 4 < (iEnd - iStart) < 2000:
try:
self.sSentence = sText[iStart:iEnd]
self.sSentence0 = self.sText0[iStart:iEnd]
self.nOffsetWithinParagraph = iStart
self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" }
|
| ︙ | ︙ |
Modified gc_core/py/text.py from [36660e74f0] to [81c9cf089f].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
#!python3
"""
Text tools
"""
import textwrap
from itertools import chain
def getParagraph (sText):
"generator: returns paragraphs of text"
iStart = 0
sText = sText.replace("\r\n", "\n").replace("\r", "\n")
iEnd = sText.find("\n", iStart)
while iEnd != -1:
| > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
#!python3
"""
Text tools
"""
import re
import textwrap
from itertools import chain
_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")


def getSentenceBoundaries (sText):
    "generator: returns (start, end) positions of the sentences found in <sText>"
    # The first sentence starts after any leading non-word characters
    # (dashes, spaces, punctuation) at the head of the paragraph.
    iSentenceStart = _zBeginOfParagraph.match(sText).end()
    # Each regex match marks the end of one sentence; the next sentence
    # begins exactly where the previous one ended.
    for mEndOfSentence in _zEndOfSentence.finditer(sText):
        iSentenceEnd = mEndOfSentence.end()
        yield (iSentenceStart, iSentenceEnd)
        iSentenceStart = iSentenceEnd
def getParagraph (sText):
"generator: returns paragraphs of text"
iStart = 0
sText = sText.replace("\r\n", "\n").replace("\r", "\n")
iEnd = sText.find("\n", iStart)
while iEnd != -1:
|
| ︙ | ︙ |