159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
|
},
//// Parsing
parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
let oText = new TextParser(sText);
return oText.parse(sCountry, bDebug, dOptions, bContext);
},
// End of a sentence. Two alternatives:
//  - one of . ? ! : ; … followed by one or more spaces, punctuation marks, quotes,
//    closing parentheses or dashes, with an uppercase letter (A-Z É È Î Ô) right after
//    (lookahead: the letter is not part of the match);
//  - or the very last character of the text (any character except a line terminator).
// NOTE: the "g" flag makes this regex stateful: exec() resumes from its lastIndex.
_zEndOfSentence: new RegExp ('([.?!:;…][ .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
// Possibly empty run of hyphens, spaces, dashes and punctuation at the very start of the text.
// The pattern contains no letters, so the "i" flag has no effect; "g" makes lastIndex stateful.
_zBeginOfParagraph: new RegExp ("^[- –—.,;?!…]*", "ig"),
// Possibly empty run of hyphens, spaces, punctuation and dashes at the very end of the text.
// The pattern contains no letters, so the "i" flag has no effect; "g" makes lastIndex stateful.
_zEndOfParagraph: new RegExp ("[- .,;?!…–—]*$", "ig"),
getSentenceBoundaries: function* (sText) {
let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
let iStart = this._zBeginOfParagraph.lastIndex;
let m;
while ((m = this._zEndOfSentence.exec(sText)) !== null) {
yield [iStart, this._zEndOfSentence.lastIndex];
iStart = this._zEndOfSentence.lastIndex;
}
}
};
class TextParser {
constructor (sText) {
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
},
//// Parsing
parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
let oText = new TextParser(sText);
return oText.parse(sCountry, bDebug, dOptions, bContext);
}
};
class TextParser {
constructor (sText) {
|
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
|
this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash
}
if (this.sText.includes("@@")) {
this.sText = this.sText.replace(/@@+/g, "");
}
// parse sentence
for (let [iStart, iEnd] of gc_engine.getSentenceBoundaries(this.sText)) {
try {
this.sSentence = this.sText.slice(iStart, iEnd);
this.sSentence0 = this.sText0.slice(iStart, iEnd);
this.nOffsetWithinParagraph = iStart;
this.lToken = Array.from(_oTokenizer.genTokens(this.sSentence, true));
this.dTokenPos.clear();
for (let dToken of this.lToken) {
|
|
|
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
|
this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash
}
if (this.sText.includes("@@")) {
this.sText = this.sText.replace(/@@+/g, "");
}
// parse sentence
for (let [iStart, iEnd] of text.getSentenceBoundaries(this.sText)) {
try {
this.sSentence = this.sText.slice(iStart, iEnd);
this.sSentence0 = this.sText0.slice(iStart, iEnd);
this.nOffsetWithinParagraph = iStart;
this.lToken = Array.from(_oTokenizer.genTokens(this.sSentence, true));
this.dTokenPos.clear();
for (let dToken of this.lToken) {
|