Overview
Comment:      [fr][js] màj: lexicographe, tokenizer
Downloads:    Tarball | ZIP archive | SQL archive
Timelines:    family | ancestors | descendants | both | fr | kill_innerHTML
Files:        files | file ages | folders
SHA3-256:     074cb33c80a58b9d6c4e443e433869f7
User & Date:  olr on 2017-07-12 11:03:48
Other Links:  branch diff | manifest | tags
Context
2017-07-13
06:50  [fx][fr][js] tokeniseur --> lexicographe  check-in: b4cf00a4c8 user: olr tags: fr, fx, kill_innerHTML

2017-07-12
11:03  [fr][js] màj: lexicographe, tokenizer  check-in: 074cb33c80 user: olr tags: fr, kill_innerHTML

2017-07-11
13:58  [fr] pt: à chacun, à s’en +infi  check-in: 52649f7a5b user: olr tags: trunk, fr
Changes
Modified gc_core/js/tokenizer.js from [d06151ccd7] to [4eb5311054].
// JavaScript
// Very simple tokenizer

"use strict";

const helpers = require("resource://grammalecte/helpers.js");


const aPatterns = {
    // All regexps must start with ^.
    "default": [
        [/^[ \t]+/, 'SPACE'],
        [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
        [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
        [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
        [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
        [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
        [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
        [/^\d\d?h\d\d\b/, 'HOUR'],
        [/^-?\d+(?:[.,]\d+|)/, 'NUM'],
        [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
    ],
    "fr": [
        [/^[ \t]+/, 'SPACE'],
        [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
        [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
        [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
        [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
        [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
        [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
        [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
        [/^\d\d?[hm]\d\d\b/, 'HOUR'],
        [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'],
        [/^-?\d+(?:[.,]\d+|)/, 'NUM'],
        [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
    ]
}


class Tokenizer {

    constructor (sLang) {
        this.sLang = sLang;
        if (!aPatterns.hasOwnProperty(sLang)) {
            this.sLang = "default";
        }
        this.aRules = aPatterns[this.sLang];
    };

    * genTokens (sText) {
        let m;
        let i = 0;
        while (sText) {
            let nCut = 1;

︙

                catch (e) {
                    helpers.logerror(e);
                }
            }
            i += nCut;
            sText = sText.slice(nCut);
        }
    };

    getSpellingErrors (sText, oDict) {
        let aSpellErr = [];
        for (let oToken of this.genTokens(sText)) {
            if (oToken.sType === 'WORD' && !oDict.isValidToken(oToken.sValue)) {
                aSpellErr.push(oToken);
            }
        }
        return aSpellErr;
    }
}

exports.Tokenizer = Tokenizer;
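The check-in moves spelling detection into the tokenizer: the new getSpellingErrors method walks genTokens and keeps the WORD tokens the dictionary rejects. Below is a minimal driving sketch, not part of the check-in; the require style mirrors the extension's resource:// URLs, the sample strings are invented, and oDict is a hypothetical stand-in for the real dictionary object.

// Sketch only — just the Tokenizer API (constructor, genTokens, getSpellingErrors)
// comes from the diff above; paths, sample text and the oDict stub are assumptions.
const tkz = require("resource://grammalecte/tokenizer.js");

const oTokenizer = new tkz.Tokenizer("fr");      // unknown languages fall back to "default"

// genTokens() is a generator; tokens carry at least sType and sValue
for (let oToken of oTokenizer.genTokens("Grammalecte vérifie votre texte.")) {
    console.log(oToken.sType, oToken.sValue);
}

// Hypothetical dictionary stub, only to show the interface getSpellingErrors() relies on
const oDict = { isValidToken: (sWord) => sWord !== "texxte" };
const aSpellErr = oTokenizer.getSpellingErrors("Un texxte avec une coquille.", oDict);
console.log(aSpellErr.map(t => t.sValue));       // ["texxte"]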
Modified gc_lang/fr/config.ini from [94e5221453] to [2a6b6ad1cb].
[args]
lang = fr
lang_name = French
locales = fr_FR fr_BE fr_CA fr_CH fr_LU fr_MC fr_BF fr_CI fr_SN fr_ML fr_NE fr_TG fr_BJ
country_default = FR
name = Grammalecte
implname = grammalecte
version = 0.5.18
author = Olivier R.
provider = Dicollecte
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png

︙
Modified gc_lang/fr/modules-js/lexicographe.js from [7e54b6b49b] to [98b82a04c2].
// Grammalecte - Lexicographe
// License: MPL 2

"use strict";

${string}
${map}

const helpers = require("resource://grammalecte/helpers.js");
const tkz = require("resource://grammalecte/tokenizer.js");


const _dTAGS = new Map ([
    [':G', "[mot grammatical]"],
    [':N', " nom,"],
    [':A', " adjectif,"],
    [':M1', " prénom,"],

︙

    ['en', " pronom adverbial"],
    ["m'en", " (me) pronom personnel objet + (en) pronom adverbial"],
    ["t'en", " (te) pronom personnel objet + (en) pronom adverbial"],
    ["s'en", " (se) pronom personnel objet + (en) pronom adverbial"]
]);

const _dSeparator = new Map ([
    ['.', "point"],
    ['·', "point médian"],
    ['…', "points de suspension"],
    [';', "point-virgule"],
    [',', "virgule"],
    ['?', "point d’interrogation"],
    ['!', "point d’exclamation"],
    ['(', "parenthèse ouvrante"],
    [')', "parenthèse fermante"],
    ['[', "crochet ouvrante"],
    [']', "crochet fermante"],
    ['{', "accolade ouvrante"],
    ['}', "accolade fermante"],
    ['—', "tiret cadratin"],
    ['–', "tiret demi-cadratin"],
    ['«', "guillemet ouvrant (chevrons)"],
    ['»', "guillemet fermant (chevrons)"],
    ['“', "guillemet ouvrant double"],
    ['”', "guillemet fermant double"],
    ['‘', "guillemet ouvrant"],
    ['’', "guillemet fermant"],
]);


class Lexicographe {

    constructor (oDict) {
        this.oDict = oDict;
        this._zElidedPrefix = new RegExp ("^([dljmtsncç]|quoiqu|lorsqu|jusqu|puisqu|qu)['’](.+)", "i");
        this._zCompoundWord = new RegExp ("([a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ]+)-((?:les?|la)-(?:moi|toi|lui|[nv]ous|leur)|t-(?:il|elle|on)|y|en|[mts][’'](?:y|en)|les?|l[aà]|[mt]oi|leur|lui|je|tu|ils?|elles?|on|[nv]ous)$", "i");
        this._zTag = new RegExp ("[:;/][a-zA-Zà-ö0-9À-Öø-ÿØ-ßĀ-ʯ*][^:;/]*", "g");
    };

    getInfoForToken (oToken) {
        // Token: .sType, .sValue, .nStart, .nEnd
        let m = null;
        try {
            helpers.echo(oToken);
            switch (oToken.sType) {
                case 'SEPARATOR':
                    return [oToken.sType, oToken.sValue, _dSeparator._get(oToken.sValue, "caractère indéterminé")];
                    break;
                case 'NUM':
                    return [oToken.sType, oToken.sValue, "nombre"];
                    break;
                case 'LINK':
                    return [oToken.sType, oToken.sValue.slice(0,40)+"…", "hyperlien"];
                    break;
                case 'ELPFX':
                    sTemp = oToken.sValue.replace("’", "'").replace("`", "'").toLowerCase();
                    return [oToken.sType, oToken.sValue, _dPFX._get(sTemp, "préfixe élidé inconnu")];
                    break;
                case 'WORD':
                    if (oToken.sValue._count("-") > 4) {
                        return ["COMPLEX", oToken.sValue, "élément complexe indéterminé"];
                    }
                    else if (this.oDict.isValidToken(oToken.sValue)) {
                        let lMorph = this.oDict.getMorph(oToken.sValue);
                        let aElem = [ for (s of lMorph) if (s.includes(":")) this._formatTags(s) ];
                        return [ oToken.sType, oToken.sValue, [aElem] ];
                    }
                    else if (m = this._zCompoundWord.exec(oToken.sValue)) {
                        // mots composés
                        let lMorph = this.oDict.getMorph(m[1]);
                        let aElem = [ for (s of lMorph) if (s.includes(":")) this._formatTags(s) ];
                        aElem.push("-" + m[2] + ": " + this._formatSuffix(m[2].toLowerCase()));
                        return [ oToken.sType, oToken.sValue, [aElem] ];
                    }
                    else {
                        return ["INCONNU", oToken.sValue, "inconnu du dictionnaire"];
                    }
                    break;
            }
        }
        catch (e) {
            helpers.logerror(e);
        }
        return null
    };

    getHTMLForText (sText) {
        sText = sText.replace(/[.,.?!:;…\/()\[\]“”«»"„{}–—#+*<>%=\n]/g, " ").replace(/\s+/g, " ");
        let iStart = 0;
        let iEnd = 0;
        let sHtml = '<div class="paragraph">\n';
        while ((iEnd = sText.indexOf(" ", iStart)) !== -1) {
            sHtml += this.getHTMLForToken(sText.slice(iStart, iEnd));
            iStart = iEnd + 1;
        }
        sHtml += this.getHTMLForToken(sText.slice(iStart));
        return sHtml + '</div>\n';
    };

    getHTMLForToken (sWord) {
        try {
            if (!sWord) {
                return "";
            }
            if (sWord._count("-") > 4) {
                return '<p><b class="mbok">' + sWord + "</b> <s>:</s> élément complexe indéterminé</p>\n";
            }

︙

            let m2 = this._zCompoundWord.exec(sWord);
            if (m2 !== null) {
                sWord = m2[1];
            }
            // Morphologies
            let lMorph = this.oDict.getMorph(sWord);
            if (lMorph.length === 1) {
                sHtml += "<p><b>" + sWord + "</b> <s>:</s> " + this._formatTags(lMorph[0]) + "</p>\n";
            }
            else if (lMorph.length > 1) {
                sHtml += "<p><b>" + sWord + "</b><ul><li>" + [for (s of lMorph) if (s.includes(":")) this._formatTags(s)].join(" </li><li> ") + "</li></ul></p>\n";
            }
            else {
                sHtml += '<p><b class="unknown">' + sWord + "</b> <s>:</s> absent du dictionnaire<p>\n";
            }
            // suffixe d’un mot composé
            if (m2) {
                sHtml += "<p>-<b>" + m2[2] + "</b> <s>:</s> " + this._formatSuffix(m2[2].toLowerCase()) + "</p>\n";
            }
            // Verbes
            //let aVerb = new Set([ for (s of lMorph) if (s.includes(":V")) s.slice(1, s.indexOf(" ")) ]);
            return sHtml;
        }
        catch (e) {
            helpers.logerror(e);
            return "#erreur";
        }
    };

    _formatTags (sTags) {
        let sRes = "";
        sTags = sTags.replace(/V([0-3][ea]?)[itpqnmr_eaxz]+/, "V$1");
        let m;
        while ((m = this._zTag.exec(sTags)) !== null) {
            sRes += _dTAGS.get(m[0]);
            if (sRes.length > 100) {
                break;

︙
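On the lexicographe side, getInfoForToken is the new per-token entry point: it returns [type, value, description] triplets built from _dSeparator, _dPFX and the dictionary's morphologies, while getHTMLForText keeps rendering the word-by-word HTML view. A hedged sketch of how the two modules fit together follows; the lexicographe module path and export name are assumed, and the oDict stub and its return values are illustrative, not the real dictionary.

// Sketch only: oDict mimics the interface Lexicographe uses (isValidToken, getMorph);
// real morphologies come from the dictionary shipped with the extension.
const tkz = require("resource://grammalecte/tokenizer.js");
const lxg = require("resource://grammalecte/fr/lexicographe.js");   // path and export name assumed

const oDict = {
    isValidToken: (sWord) => true,
    getMorph: (sWord) => [">" + sWord + " :N:m:s"]                   // shape only
};

const oTokenizer = new tkz.Tokenizer("fr");
const oLxg = new lxg.Lexicographe(oDict);

let aElem = [];
for (let oToken of oTokenizer.genTokens("Donnez-le-moi, s’il vous plaît.")) {
    let aRes = oLxg.getInfoForToken(oToken);    // null for token types it does not describe
    if (aRes) {
        aElem.push(aRes);                       // e.g. ['SEPARATOR', ',', 'virgule']
    }
}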
Modified gc_lang/fr/xpi/gce_worker.js from [378b08526d] to [31c05a3116].
︙

function parse (sText, sLang, bDebug, bContext) {
    let aGrammErr = gce.parse(sText, sLang, bDebug, bContext);
    return JSON.stringify(aGrammErr);
}

function parseAndSpellcheck (sText, sLang, bDebug, bContext) {
    let aGrammErr = gce.parse(sText, sLang, bDebug, bContext);
    let aSpellErr = oTokenizer.getSpellingErrors(sText, oDict);
    return JSON.stringify({ aGrammErr: aGrammErr, aSpellErr: aSpellErr });
}

function parseAndTag (sText, iParagraph, sLang, bDebug) {
    sText = text.addHtmlEntities(sText);
    let aGrammErr = gce.parse(sText, sLang, bDebug);
    let aSpellErr = oTokenizer.getSpellingErrors(sText, oDict);
    let sHtml = text.tagParagraph(sText, iParagraph, aGrammErr, aSpellErr);
    return sHtml;
}

function parseAndGenerateParagraph (sText, iParagraph, sLang, bDebug) {
    return text.createHTMLBlock(parseAndTag(sText, iParagraph, sLang, bDebug), iParagraph);
}

︙

    return sAllRes;
}


// Lexicographer

function analyzeWords (sText) {
    getListOfElements(sText);
    return oLxg.getHTMLForText(sText);
}

function getListOfElements (sText) {
    try {
        helpers.echo("=================================================");
        let aElem = [];
        let aRes = null;
        for (let oToken of oTokenizer.genTokens(sText)) {
            aRes = oLxg.getInfoForToken(oToken);
            if (aRes) {
                helpers.echo(aRes);
                aElem = aElem.concat(aRes);
            }
        }
        return JSON.stringify(aElem);
    }
    catch (e) {
        helpers.logerror(e);
    }
}
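In the worker, parseAndSpellcheck now bundles grammar and spelling errors into a single JSON payload, so the UI side can post one message and decode both lists together. Roughly, the caller can expect something like the sketch below; the sample text is invented and only the two top-level keys come from the code above.

// Illustrative decode only — the actual error objects are whatever gce.parse()
// and Tokenizer.getSpellingErrors() produce for the given text.
let sResult = parseAndSpellcheck("Les chat dorment.", "fr", false, false);
let oResult = JSON.parse(sResult);
// oResult.aGrammErr: grammar errors from gce.parse()
// oResult.aSpellErr: WORD tokens rejected by oDict.isValidToken(),
//                    i.e. objects carrying at least sType and sValue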
Modified gc_lang/fr/xpi/package.json from [f43a7abb52] to [7e2435c537].
{
    "name": "grammalecte-fr",
    "title": "Grammalecte [fr]",
    "id": "French-GC@grammalecte.net",
    "version": "0.5.18",
    "description": "Correcteur grammatical pour le français",
    "homepage": "http://www.dicollecte.org/grammalecte",
    "main": "ui.js",
    "icon": "data/img/icon-48.png",
    "scripts": {
        "test": "echo \"Error: no test specified\" && exit 1"
    },

︙