Added a new module for performing the spelling correction. Needs testing

This commit is contained in:
Manuel Cortez 2019-12-03 09:59:54 -06:00
parent 28e2d3df08
commit c74c6f6f5d
7 changed files with 146 additions and 24 deletions

View File

@ -11,6 +11,8 @@
### Changes ### Changes
* Replaced the underlying library we were using for spelling correction, as it is no longer in development. Instead, we started to use a new approach in socializer which, in theory, should allow us to switch the language used for spelling correction, among other benefits, a bit later. For now, the available languages are Russian, Ukrainian, English, Polish and Spanish, but more languages can be added on request.
## Changes in Version 0.23 (11.11.2019) ## Changes in Version 0.23 (11.11.2019)
### New additions ### New additions

View File

@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
""" High level Spell checker module by using the SymSpellPy library. """
import os
import glob
import shutil
import logging
import paths
from symspellpy.symspellpy import SymSpell, Verbosity
from codecs import open as open_
# Logger used by every function and class in this module.
log = logging.getLogger("SpellChecker.checker")
# Maps a dictionary key (the dictionary file name without its extension) to
# the SymSpell instance loaded from that file. Filled by load_dicts().
loaded_dicts = dict()
# Set to True by load_dicts() once it has finished loading dictionaries.
ready = False
def load_dicts():
    """ Load all dictionaries found in the user's config directory.

    Every ``*.txt`` file inside the ``dicts`` folder of the config directory is
    loaded into its own SymSpell instance and stored in the module level
    ``loaded_dicts`` mapping, keyed by the file name without extension.
    Previously loaded dictionaries are discarded. ``ready`` is set to True
    when the function finishes.
    """
    global loaded_dicts, ready
    log.debug("Start dictionary loading for spelling checker module...")
    # Build the new mapping locally and publish it in a single assignment, so
    # code reading loaded_dicts from another thread never sees it half filled.
    new_dicts = dict()
    path = os.path.join(paths.config_path(), "dicts")
    if os.path.isdir(path):
        log.debug("Loading language dictionaries from path %s" % (path,))
        files = glob.glob(os.path.join(path, "*.txt"))
        log.debug("%r files found." % (len(files)))
        for filename in files:
            key = os.path.splitext(os.path.basename(filename))[0]
            dictionary = SymSpell()
            # Columns: term at index 0, frequency count at index 1.
            # load_dictionary reports failure through its return value.
            if not dictionary.load_dictionary(filename, 0, 1, encoding="utf-8"):
                log.error("Unable to load dictionary file %s" % (filename,))
                continue
            new_dicts[key] = dictionary
            log.debug("Added dictionary for language %s " % (key,))
    loaded_dicts = new_dicts
    ready = True
    log.debug("All dicts were loaded.")
def prepare_dicts(language):
    """ Copy the main dictionary file to the user's config directory so it can be modified and read without needing to require privileged sessions.

    Nothing is copied when the application does not ship a dictionary for the
    language, or when the user already has a copy of it.

    @ language: two letter language code.
    """
    log.debug("preparing dictionary data...")
    path = os.path.join(paths.config_path(), "dicts")
    if not os.path.exists(path):
        log.debug("Creating dicts folder in config directory...")
        # makedirs also creates the config directory itself if it is missing.
        os.makedirs(path)
    filename = language+".txt"
    original_file = os.path.join(paths.app_path(), "dictionaries", filename)
    user_file = os.path.join(path, filename)
    if os.path.exists(original_file) and not os.path.exists(user_file):
        log.debug("Dictionary for language %s is not present in user config. Coppying... " % (language,))
        shutil.copy(original_file, user_file)
class SpellChecker(object):
    """ Word by word spell checker backed by a SymSpell dictionary.

    Usage: call set_language() and set_text(), then iterate over
    check_words(). Between iterations, replace(), replace_all() and
    ignore_word() act on the word that was reported last. The resulting text
    is available in the text property.

    Note: the text is split on whitespace and joined back with single spaces,
    so the original spacing and line breaks are not preserved.
    """

    # Maximum number of words included in the context of a misspelled word.
    CONTEXT_SIZE = 10

    def __init__(self, wordlist=None, *args, **kwargs):
        """ wordlist and positional arguments are accepted but not used. """
        self.kwargs = kwargs
        self.dictionary = None
        self.ignored_words = []
        # Defined here so text and word work before set_text() is called.
        self.transformed_words = []
        self.word_index = 0

    def set_language(self, lang):
        """ Select the dictionary to use.

        @ lang: key of a dictionary present in the module level loaded_dicts.
        Raises ValueError if there is no dictionary loaded for that key.
        """
        dictionary = loaded_dicts.get(lang)
        if dictionary is None:
            raise ValueError("Dictionary not found for the specified language")
        self.dictionary = dictionary

    def set_text(self, text):
        """ Set the text to be checked and restart from its first word. """
        self.transformed_words = text.split()
        self.word_index = 0

    def check_words(self):
        """ Generator yielding every misspelled word found in the text.

        Each item is a tuple (suggestions, context, index): the suggestions
        returned by the dictionary lookup, a string with the words surrounding
        the misspelled one, and the position of the word in the text.
        Ignored words, words without suggestions and words found in the
        dictionary (a suggestion with distance 0) are skipped.
        """
        # Words are read by index on every step because replace() and
        # replace_all() may change the list between two iterations.
        for index in range(len(self.transformed_words)):
            word = self.transformed_words[index]
            if word in self.ignored_words:
                continue
            suggestions = self.dictionary.lookup(word, Verbosity.CLOSEST, 2, transfer_casing=True)
            if len(suggestions) == 0:
                continue
            # A suggestion at distance 0 is the word itself: correctly spelled.
            if any(s.distance == 0 for s in suggestions):
                continue
            self.word_index = index
            yield (suggestions, self._context_for(index), index)

    def _context_for(self, index):
        """ Return up to CONTEXT_SIZE words around the word at index, always including that word. """
        total = len(self.transformed_words)
        start = max(0, index - self.CONTEXT_SIZE // 2)
        end = min(total, start + self.CONTEXT_SIZE)
        # Close to the end of the text, move the window back to keep it full.
        start = max(0, end - self.CONTEXT_SIZE)
        return " ".join(self.transformed_words[start:end])

    def replace(self, suggestion):
        """ Replace the current word with suggestion.

        Raises ValueError if the current word index is outside the text.
        """
        if not 0 <= self.word_index < len(self.transformed_words):
            raise ValueError("Word index is not present in the current text")
        self.transformed_words[self.word_index] = suggestion

    def replace_all(self, word):
        """ Replace every occurrence of the current word with word. """
        existing_word = self.word
        if existing_word is None:
            return
        self.transformed_words = [word if item == existing_word else item for item in self.transformed_words]

    def ignore_word(self, word):
        """ Skip word for the rest of the checking process. """
        if word not in self.ignored_words:
            self.ignored_words.append(word)

    @property
    def text(self):
        """ The current text, with its words joined by single spaces. """
        return " ".join(self.transformed_words)

    @property
    def word(self):
        """ The word at the current index, or None if there is no such word. """
        if len(self.transformed_words) == 0 or self.word_index >= len(self.transformed_words):
            return None
        return self.transformed_words[self.word_index]

View File

@ -1,35 +1,31 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals from __future__ import unicode_literals
import os
import logging import logging
import widgetUtils import widgetUtils
import output import output
import config import config
import languageHandler import languageHandler
from enchant.checker import SpellChecker from platform_utils import paths
from enchant.errors import DictNotFoundError from . import checker
from enchant import tokenize
from . import wx_ui from . import wx_ui
log = logging.getLogger("extra.SpellChecker.spellChecker") log = logging.getLogger("extra.SpellChecker.spellChecker")
class spellChecker(object): class spellChecker(object):
def __init__(self, text, dictionary): def __init__(self, text):
super(spellChecker, self).__init__() super(spellChecker, self).__init__()
log.debug("Creating the SpellChecker object. Dictionary: %s" % (dictionary,))
self.active = True self.active = True
self.checker = checker.SpellChecker()
log.debug("Using language: %s" % (languageHandler.getLanguage(),))
try: try:
if config.app["app-settings"]["language"] == "system": self.checker.set_language(languageHandler.curLang)
log.debug("Using the system language") except ValueError:
self.checker = SpellChecker(languageHandler.curLang, filters=[tokenize.EmailFilter, tokenize.URLFilter]) log.exception("Dictionary for language %s not found." % (languageHandler.curLang,))
else:
log.debug("Using language: %s" % (languageHandler.getLanguage(),))
self.checker = SpellChecker(languageHandler.curLang, filters=[tokenize.EmailFilter, tokenize.URLFilter])
self.checker.set_text(text)
except DictNotFoundError:
print("no dict")
log.exception("Dictionary for language %s not found." % (dictionary,))
wx_ui.dict_not_found_error() wx_ui.dict_not_found_error()
self.active = False self.active = False
self.checker.set_text(text)
self.generator = self.checker.check_words()
if self.active == True: if self.active == True:
log.debug("Creating dialog...") log.debug("Creating dialog...")
self.dialog = wx_ui.spellCheckerDialog() self.dialog = wx_ui.spellCheckerDialog()
@ -39,16 +35,16 @@ class spellChecker(object):
widgetUtils.connect_event(self.dialog.replaceAll, widgetUtils.BUTTON_PRESSED, self.replaceAll) widgetUtils.connect_event(self.dialog.replaceAll, widgetUtils.BUTTON_PRESSED, self.replaceAll)
self.check() self.check()
self.dialog.get_response() self.dialog.get_response()
self.fixed_text = self.checker.get_text() self.fixed_text = self.checker.text
def check(self): def check(self):
try: try:
next(self.checker) suggestions, context, self.wordIndex = next(self.generator)
textToSay = _("Misspelled word: %s") % (self.checker.word,) textToSay = _("Misspelled word: %s") % (self.checker.word,)
context = "... %s %s %s" % (self.checker.leading_context(10), self.checker.word, self.checker.trailing_context(10)) context = context
self.dialog.set_title(textToSay) self.dialog.set_title(textToSay)
output.speak(textToSay) output.speak(textToSay)
self.dialog.set_word_and_suggestions(word=self.checker.word, context=context, suggestions=self.checker.suggest()) self.dialog.set_word_and_suggestions(word=self.checker.word, context=context, suggestions=[suggestion.term for suggestion in suggestions])
except StopIteration: except StopIteration:
log.debug("Process finished.") log.debug("Process finished.")
wx_ui.finished() wx_ui.finished()
@ -58,7 +54,7 @@ class spellChecker(object):
self.check() self.check()
def ignoreAll(self, ev): def ignoreAll(self, ev):
self.checker.ignore_always(word=self.checker.word) self.checker.ignore_word(word=self.checker.word)
self.check() self.check()
def replace(self, ev): def replace(self, ev):
@ -66,7 +62,7 @@ class spellChecker(object):
self.check() self.check()
def replaceAll(self, ev): def replaceAll(self, ev):
self.checker.replace_always(self.dialog.get_selected_suggestion()) self.checker.replace_all(self.dialog.get_selected_suggestion())
self.check() self.check()
def clean(self): def clean(self):

View File

@ -19,6 +19,7 @@ if hasattr(sys, "frozen"):
sys.excepthook = lambda x, y, z: logging.critical(''.join(traceback.format_exception(x, y, z))) sys.excepthook = lambda x, y, z: logging.critical(''.join(traceback.format_exception(x, y, z)))
from mysc.thread_utils import call_threaded from mysc.thread_utils import call_threaded
from wxUI import commonMessages from wxUI import commonMessages
from extra.SpellChecker import checker # Load dictionaries in advance for spelling correction
log = logging.getLogger("main") log = logging.getLogger("main")
@ -58,6 +59,10 @@ def setup():
sm = sessionManager.sessionManagerController() sm = sessionManager.sessionManagerController()
sm.show() sm.show()
del sm del sm
log.debug("Loading dictionaries for spelling correction...")
# Let's copy dictionary files for the selected language just in case it is not present already.
checker.prepare_dicts(languageHandler.curLang)
call_threaded(checker.load_dicts)
r = mainController.Controller() r = mainController.Controller()
call_threaded(r.login) call_threaded(r.login)
app.run() app.run()

View File

@ -47,7 +47,7 @@ class createPostPresenter(base.basePresenter):
output.speak(_("Translated")) output.speak(_("Translated"))
def spellcheck(self, text): def spellcheck(self, text):
checker = SpellChecker.spellchecker.spellChecker(text, "") checker = SpellChecker.spellchecker.spellChecker(text)
if hasattr(checker, "fixed_text"): if hasattr(checker, "fixed_text"):
self.send_message("set", control="text", value=checker.fixed_text) self.send_message("set", control="text", value=checker.fixed_text)
self.send_message("focus_control", control="text") self.send_message("focus_control", control="text")

View File

@ -33,7 +33,7 @@ class sessionManagerController(object):
self.sessions = [] self.sessions = []
log.debug("Filling the session list...") log.debug("Filling the session list...")
for i in os.listdir(paths.config_path()): for i in os.listdir(paths.config_path()):
if os.path.isdir(os.path.join(paths.config_path(), i)): if i != "dicts" and os.path.isdir(os.path.join(paths.config_path(), i)):
log.debug("Adding session %s" % (i,)) log.debug("Adding session %s" % (i,))
config_test = Configuration(os.path.join(paths.config_path(), i, "session.conf")) config_test = Configuration(os.path.join(paths.config_path(), i, "session.conf"))
name = config_test["vk"]["user"] name = config_test["vk"]["user"]

View File

@ -34,7 +34,7 @@ build_exe_options = dict(
optimize=2, optimize=2,
include_msvcr=True, include_msvcr=True,
zip_include_packages=["accessible_output2", "sound_lib", "arrow"], zip_include_packages=["accessible_output2", "sound_lib", "arrow"],
include_files=["session.defaults", "cacert.pem", "app-configuration.defaults", "locales", "sounds", "documentation", "../windows-dependencies/x86/oggenc2.exe", "../windows-dependencies/x86/bootstrap.exe", ("../windows-dependencies/dictionaries", "lib/enchant/share/enchant/myspell"), find_sound_lib_datafiles(), find_accessible_output2_datafiles()], include_files=["session.defaults", "cacert.pem", "app-configuration.defaults", "locales", "sounds", "documentation", "../windows-dependencies/x86/oggenc2.exe", "../windows-dependencies/x86/bootstrap.exe", "../windows-dependencies/dictionaries", find_sound_lib_datafiles(), find_accessible_output2_datafiles()],
packages=["interactors", "presenters", "views", "wxUI"], packages=["interactors", "presenters", "views", "wxUI"],
) )