Index: gc_core/py/lang_core/gc_engine.py ================================================================== --- gc_core/py/lang_core/gc_engine.py +++ gc_core/py/lang_core/gc_engine.py @@ -231,35 +231,21 @@ #for nPos, dToken in self.dTokenPos.items(): # s += "{}\t{}\n".format(nPos, dToken) return s def parse (self, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False): - "analyses the paragraph sText and returns list of errors" + "analyses sText and returns an iterable of errors" #sText = unicodedata.normalize("NFC", sText) dOpt = dOptions or _dOptions bShowRuleId = option('idrule') - # parse paragraph try: self.parseText(self.sText, self.sText0, True, 0, sCountry, dOpt, bShowRuleId, bDebug, bContext) except: raise - - # cleanup - sText = self.sText - if " " in sText: - sText = sText.replace(" ", ' ') # nbsp - if " " in sText: - sText = sText.replace(" ", ' ') # nnbsp - if "'" in sText: - sText = sText.replace("'", "’") - if "‑" in sText: - sText = sText.replace("‑", "-") # nobreakdash - if "@@" in sText: - sText = re.sub("@@+", "", sText) - # parse sentences + sText = self._getCleanText() for iStart, iEnd in text.getSentenceBoundaries(sText): if 4 < (iEnd - iStart) < 2000: try: self.sSentence = sText[iStart:iEnd] self.sSentence0 = self.sText0[iStart:iEnd] @@ -268,10 +254,57 @@ self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" } self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext) except: raise return self.dError.values() # this is a view (iterable) + + def parseAndGetSentences (self, sCountry="${country_default}", bDebug=False): + "analyses sText and returns a list of sentences with their tokens" + #sText = unicodedata.normalize("NFC", sText) + # parse paragraph + try: + self.parseText(self.sText, self.sText0, True, 0, sCountry, _dOptions, False, bDebug, False) + except: + raise + # parse sentences + sText = self._getCleanText() + 
lSentence = [] + i = 0 + for iStart, iEnd in text.getSentenceBoundaries(sText): + try: + self.sSentence = sText[iStart:iEnd] + self.sSentence0 = self.sText0[iStart:iEnd] + self.nOffsetWithinParagraph = iStart + self.lToken = list(_oTokenizer.genTokens(self.sSentence, True)) + self.dTokenPos = { dToken["nStart"]: dToken for dToken in self.lToken if dToken["sType"] != "INFO" } + i += 1 + lSentence.append({ + "i": i, + "iStart": iStart, + "iEnd": iEnd, + "sSentence": self.sSentence, + "sSentence0": self.sSentence0, + "lToken": list(self.lToken) # this is a copy + }) + self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, _dOptions, False, False, False) + except: + raise + return lSentence + + def _getCleanText (self): + sText = self.sText + if " " in sText: + sText = sText.replace(" ", ' ') # nbsp + if " " in sText: + sText = sText.replace(" ", ' ') # nnbsp + if "'" in sText: + sText = sText.replace("'", "’") + if "‑" in sText: + sText = sText.replace("‑", "-") # nobreakdash + if "@@" in sText: + sText = re.sub("@@+", "", sText) + return sText def parseText (self, sText, sText0, bParagraph, nOffset, sCountry, dOptions, bShowRuleId, bDebug, bContext): "parse the text with rules" bChange = False for sOption, lRuleGroup in _getRules(bParagraph): Index: grammalecte-cli.py ================================================================== --- grammalecte-cli.py +++ grammalecte-cli.py @@ -23,15 +23,17 @@ ?word1 [word2] ... words analysis !word suggestion >word draw path of word in the word graph =filter show all entries whose morphology fits to filter /lopt /lo list options - /+ option1 [option2] ... activate grammar checking options - /- option1 [option2] ... deactivate grammar checking options /lrules [pattern] /lr list rules - /--rule1 [rule2] ... deactivate grammar checking rule - /++rule1 [rule2] ... reactivate grammar checking rule + /o+ option1 [option2] ... activate grammar checking options + /o- option1 [option2] ... 
deactivate grammar checking options + /r+ rule1 [rule2] ... reactivate grammar checking rule + /r- rule1 [rule2] ... deactivate grammar checking rule + /textformatter /tf switch on/off the text formatter + /debug /d switch on/off the debug mode /quit /q exit """ def _getText (sInputText): @@ -223,21 +225,21 @@ else: sFlexPattern = sSearch sTagsPattern = "" for aRes in oSpellChecker.select(sFlexPattern, sTagsPattern): echo("\t".join(aRes)) - elif sText.startswith("/+ "): + elif sText.startswith("/o+ "): oGrammarChecker.gce.setOptions({ opt:True for opt in sText[3:].strip().split() if opt in oGrammarChecker.gce.getOptions() }) echo("done") - elif sText.startswith("/- "): + elif sText.startswith("/o- "): oGrammarChecker.gce.setOptions({ opt:False for opt in sText[3:].strip().split() if opt in oGrammarChecker.gce.getOptions() }) echo("done") - elif sText.startswith("/-- "): + elif sText.startswith("/r- "): for sRule in sText[3:].strip().split(): oGrammarChecker.gce.ignoreRule(sRule) echo("done") - elif sText.startswith("/++ "): + elif sText.startswith("/r+ "): for sRule in sText[3:].strip().split(): oGrammarChecker.gce.reactivateRule(sRule) echo("done") elif sText in ("/debug", "/d"): xArgs.debug = not xArgs.debug