diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f665a64..7bcc3d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,12 +11,12 @@ repos: - id: check-added-large-files args: ['--maxkb=200'] - repo: https://github.com/psf/black - rev: 23.3.0 + rev: 25.1.0 hooks: - id: black args: ['--check', '--diff'] - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 + rev: 7.0.0 hooks: - id: flake8 entry: pflake8 diff --git a/comment_spell_check/__init__.py b/comment_spell_check/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/comment_spell_check/additional_dictionary.txt b/comment_spell_check/additional_dictionary.txt index ccbf38d..28ed7da 100644 --- a/comment_spell_check/additional_dictionary.txt +++ b/comment_spell_check/additional_dictionary.txt @@ -242,6 +242,7 @@ Goldfarb Goodlet Gouaillard Gouraud +Gnuplot GradientDescentOptimizerBasev GradientRecursiveGaussianImageFiter GrayscaleFillhole @@ -267,6 +268,7 @@ Haynor Hee Heimburg Herk +Hermitian Herve Hexahedral Hexahedron @@ -420,6 +422,7 @@ Mamata Manduchi Margolin Mathieu +Matlab Matsumoto Mattmann Mauna @@ -556,8 +559,10 @@ RenyiEntropyThreshold RenyiEntropyThresholdCalculator RenyiEntropyThresholdImageFilter RenyiEntropys +Res ResamplerType Rescale +Resize Reson Ridler Riemannian @@ -638,6 +643,7 @@ Sommerlad Spaeth Splitter Springer +Stat Stejskal Stolfi Stroustrup @@ -645,6 +651,7 @@ Structs Styner Su Subpixel +Subsample Subsampling Suyash SyN @@ -657,6 +664,7 @@ TRE TYPENOTAPPLICABLE Takuji Tcl +Tech Teo Testbed Thevenaz @@ -718,6 +726,7 @@ UnRegisterAllFactories UnRegisterFactory Unary Undefine +Unix Unlinks Unregister Unser @@ -753,6 +762,7 @@ Viergever Vigneault Vincken Vis +VolView Vor Voronoi Wada @@ -814,6 +824,7 @@ alignas alignof allclose allocator +alt amax amin amongst @@ -865,6 +876,7 @@ basefield beginpoint behaviour bellew +bf bigcup bigwww bilinear @@ -882,6 +894,7 @@ bowdler br break bspline +builtin burton bwh byteorder @@ -894,6 +907,7 @@ casted catch cbegin cbrt +cc ccc cccc cdot @@ -912,6 +926,8 @@ chamfer char char16_t char32_t +checkbox +chtml cin cindex circ @@ -942,6 +958,8 @@ compat compl complex128 complex64 +composited +compositing computeboundingbox concat concept @@ -967,6 +985,7 @@ copyreg copysign cout cov +covariant covariances cpp cppcheck @@ -986,6 +1005,7 @@ dataset datasets datastructure datastructures +datatype datatypes dave ddots @@ -1001,6 +1021,7 @@ deconvolution deconvolved decr deeppink +def default deffield defgroup @@ -1026,6 +1047,7 @@ dev devirtualized df diag +dict diffeomorphic diffeomorphism differencing @@ -1035,12 +1057,14 @@ discretization discretized disp displaystyle +dist dll dllexport dlls dm do documentclass +doesn doi double downsample @@ -1090,10 +1114,12 @@ eqnarray equidistribution erosions et +etc euclidian euler eval exe +exp exp2 expander explicit @@ -1101,6 +1127,7 @@ exponentials exponentiate export expr +ext extendible extensibility extern @@ -1118,11 +1145,14 @@ fdata fdgradf fe feret +ff ffdgradf fft fftw fiducials fiji +filename +filenames final findall finditer @@ -1147,10 +1177,15 @@ frexp friend frombuffer fs +ft +ftp func +functionalities functools +functor fv fvisibility +fwd gabor garbow gaussian @@ -1186,6 +1221,7 @@ hashlib hashtable hdl heaviside +hexadecimals hexahedra hexahedron hexahedrons @@ -1325,6 +1361,8 @@ jpg jupyter kd keio +kitware +km kron labelings labelled @@ -1352,6 +1390,7 @@ liness linspace linux ljust +ll llvm lmi localised @@ -1361,7 +1400,9 @@ log2 long longdouble longlong +lookup 
lookups +lossless lossy lshift lt @@ -1383,6 +1424,7 @@ mbox md mediumblue mediumvioletred +metadata metaprogramming metrics metricv @@ -1393,12 +1435,15 @@ minipipeline minkoswki minmax mirr +mm modf module morphologic +mp mpl msvc mul +multi multiband multilabel multimodality @@ -1416,6 +1461,7 @@ na nabla namespace namespaces +namic nan narrowband narrowbanding @@ -1436,6 +1482,7 @@ netlib neurodegenerative neuroimage new +newline nifti nih nii @@ -1450,6 +1497,7 @@ nonpositive nonrectilinear nonuniformity normals +noqa not not_eq np @@ -1460,6 +1508,7 @@ nullptr num numarray numpy +obj objectness oct octree @@ -1476,6 +1525,7 @@ optimizied or or_eq orangered +org organising orthogonally orthonormal @@ -1502,6 +1552,8 @@ parallelized param parameterisation parameterization +parameterize +parameterized params parsers partitioner @@ -1568,6 +1620,7 @@ pyplot qe ql qs +qt quadric quantile quaternion @@ -1576,12 +1629,14 @@ rbegin rdbuf rdstate reallocations +rec recalibrate recoded recognised recomputation recursing reentrant +refactoring register registrator regridding @@ -1601,6 +1656,7 @@ rescaled rescope resizable resizeable +resized return reusability rfind @@ -1666,6 +1722,7 @@ sline smap smaps smartpointer +snarky sobelOperator sourcecode spacings @@ -1689,6 +1746,7 @@ startswith static static_assert static_cast +std stdcall stderr stdin @@ -1765,6 +1823,7 @@ todo toolkits tparam tql +tr tractography transformMov transforms @@ -1779,6 +1838,8 @@ trunc try tt ttest +tuple +tuples tx txt ty @@ -1867,6 +1928,9 @@ wavefronts wchar_t while wilkinson +workaround +workarounds +workflows writeme wrt www diff --git a/comment_spell_check/comment_spell_check.py b/comment_spell_check/comment_spell_check.py index b0a7be1..e5a793b 100755 --- a/comment_spell_check/comment_spell_check.py +++ b/comment_spell_check/comment_spell_check.py @@ -18,27 +18,26 @@ # # ==========================================================================*/ -""" spell check the comments in code. """ +"""spell check the comments in code.""" import sys import os import fnmatch import glob -import argparse import re +import unicodedata +import logging from pathlib import Path from importlib.metadata import version, PackageNotFoundError -from enchant.checker import SpellChecker -from enchant.tokenize import EmailFilter, URLFilter -from enchant import Dict - from comment_parser import comment_parser -try: - from comment_spell_check.lib import bibtex_loader -except ImportError: - from lib import bibtex_loader +from spellchecker import SpellChecker + +from comment_spell_check.utils import parseargs +from comment_spell_check.utils import bibtex_loader +from comment_spell_check.utils import create_checker +from comment_spell_check.utils import url_remove __version__ = "unknown" @@ -110,119 +109,147 @@ def load_text_file(filename): return output +def remove_accents(input_str): + """Removes accents from a string using Unicode normalization.""" + nfkd_form = unicodedata.normalize("NFKD", input_str) + return "".join([c for c in nfkd_form if not unicodedata.combining(c)]) + + +def filter_string(input_str: str): + """Filter out unwanted characters from the input string. 
+    That includes removing single quotes that are not part of a
+    contraction."""
+
+    # map accented characters to their unaccented equivalent
+    line = remove_accents(input_str)
+
+    # Keep letters and single quotes
+    line = re.sub(r"[^a-zA-Z']", " ", line)
+
+    # Split the line into words
+    words = line.split()
+
+    contraction_apostrophe = re.compile(r"\b\w+'\w+\b")
+
+    w2 = []
+
+    # Check each word for contractions and apostrophes
+    for w in words:
+        if "'" in w:
+            matches = contraction_apostrophe.findall(w)
+            if len(matches) > 0:
+                # if there is a contraction, allow it
+                w2.append(w)
+            else:
+                # apostrophe is not in a contraction so remove it
+                new_word = re.sub("'", "", w)
+                if len(new_word) > 0:
+                    w2.append(new_word)
+        else:
+            w2.append(w)
+
+    return w2
+
+
 def spell_check_words(spell_checker: SpellChecker, words: list[str]):
-    """Check each word and report False if at least one has an spelling error."""
+    """Check each word and report False if at least one has a spelling
+    error."""
     for word in words:
-        if not spell_checker.check(word):
+        if not (word in spell_checker or word.lower() in spell_checker):
             return False
     return True
+
+
+def find_misspellings(spell: SpellChecker, line: str) -> list[str]:
+    """Find misspellings in a line of text."""
+
+    logger = logging.getLogger("comment_spell_check")
+    words = filter_string(line)
+
+    mistakes = []
+
+    for word in words:
+        if not (word.lower() in spell or word in spell):
+            logger.info("Misspelled word: %s", word)
+            mistakes.append(word)
+    return mistakes
+
+
+def remove_contractions(word: str):
+    """Remove contractions from the word."""
+
+    logger = logging.getLogger("comment_spell_check")
+    for contraction in CONTRACTIONS:
+        if word.endswith(contraction):
+            logger.info("Contraction: %s -> %s", word, word[: -len(contraction)])
+            return word[: -len(contraction)]
+    return word
+
+
+def remove_prefix(word: str, prefixes: list[str]):
+    """Remove the prefix from the word."""
+    for prefix in prefixes:
+        if word.startswith(prefix):
+            return word[len(prefix) :]
+    return word
+
+
 def spell_check_comment(
-    spell_checker: SpellChecker,
+    spell: SpellChecker,
     c: comment_parser.common.Comment,
     prefixes: list[str] = None,
-    output_lvl=2,
 ) -> list[str]:
     """Check comment and return list of identified issues if any."""
-    if output_lvl > 1:
-        print(f"Line {c.line_number()}: {c}")
+    logger = logging.getLogger("comment_spell_check")
+    logger.info("Line #%d: %s", c.line_number(), c.text())
+
+    line = c.text()
+    if "https://" in line or "http://" in line:
+        line = url_remove.remove_urls(line)
+        logger.debug(" Removed URLs: %s", line)
+
+    bad_words = find_misspellings(spell, line)
     mistakes = []
-    spell_checker.set_text(c.text())
-
-    for error in spell_checker:
-        error_word = error.word
-
-        if output_lvl > 1:
-            print(f" Error: {error_word}")
-
-        valid = False
-
-        # Check for contractions
-        for contraction in CONTRACTIONS:
-            if error_word.endswith(contraction):
-                original_error_word = error_word
-                error_word = error_word[: -len(contraction)]
-                if output_lvl > 1:
-                    print(
-                        " Stripping contraction: "
-                        + f"{original_error_word} -> {error_word}"
-                    )
-                if spell_checker.check(error_word):
-                    valid = True
-                    break
-
-        if valid:
-            continue
+    for error_word in bad_words:
+        logger.debug(" Error: %s", error_word)
-        if prefixes is None:
-            prefixes = []
-
-        # Check if the bad word starts with a prefix.
-        # If so, spell check the word without that prefix.
- - for pre in prefixes: - if error_word.startswith(pre): - # check if the word is only the prefix - if len(pre) == len(error_word): - if output_lvl > 1: - print(f" Prefix '{pre}' matches word") - valid = True - break - - # remove the prefix - wrd = error_word[len(pre) :] - if output_lvl > 1: - print(f" Trying without '{pre}' prefix: {error_word} -> {wrd}") - try: - if spell_checker.check(wrd): - valid = True - else: - # Try splitting camel case words and checking each sub-words - if output_lvl > 1: - print(f" Trying splitting camel case word: {wrd}") - sub_words = split_camel_case(wrd) - if output_lvl > 1: - print(" Sub-words: ", sub_words) - if len(sub_words) > 1 and spell_check_words( - spell_checker, sub_words - ): - valid = True - break - except TypeError: - print(f" Caught an exception for word {error_word} {wrd}") - - if valid: + error_word = remove_contractions(error_word) + + prefixes = prefixes or [] + error_word = remove_prefix(error_word, prefixes) + + if len(error_word) == 0 or error_word in spell or error_word.lower() in spell: continue # Try splitting camel case words and checking each sub-word - if output_lvl > 1: - print(f" Trying splitting camel case word: {error_word}") sub_words = split_camel_case(error_word) - if len(sub_words) > 1 and spell_check_words(spell_checker, sub_words): + logger.debug(" Trying splitting camel case word: %s", error_word) + logger.debug(" Sub-words: %s", sub_words) + + if len(sub_words) > 1 and spell_check_words(spell, sub_words): continue - if output_lvl > 1: - msg = f" Error: '{error_word}', suggestions: {spell_checker.suggest()}" - else: - msg = error_word + msg = f"'{error_word}', " + f"suggestions: {spell.candidates(error_word)}" mistakes.append(msg) return mistakes def spell_check_file( - filename, spell_checker, mime_type="", output_lvl=1, prefixes=None + filename: str, + spell_checker: SpellChecker, + mime_type: str = "", + prefixes=None, ): """Check spelling in ``filename``.""" if len(mime_type) == 0: mime_type = get_mime_type(filename) - if output_lvl > 0: - print(f"spell_check_file: {filename}, {mime_type}") + logger = logging.getLogger("comment_spell_check") + logger.info("spell_check_file: %s, %s", filename, mime_type) # Returns a list of comment_parser.parsers.common.Comments if mime_type == "text/plain": @@ -231,36 +258,32 @@ def spell_check_file( try: clist = comment_parser.extract_comments(filename, mime=mime_type) except TypeError: - print(f"Parser failed, skipping file {filename}") + logger.error("Parser failed, skipping file %s", filename) return [] bad_words = [] + line_count = 0 for c in clist: - mistakes = spell_check_comment( - spell_checker, c, prefixes=prefixes, output_lvl=output_lvl - ) + mistakes = spell_check_comment(spell_checker, c, prefixes=prefixes) if len(mistakes) > 0: - if output_lvl > 0: - print(f"\nLine number {c.line_number()}") - if output_lvl > 0: - print(c.text()) + logger.info("\nLine number %s", c.line_number()) + logger.info(c.text()) for m in mistakes: - if output_lvl >= 0: - print(f" {m}") + logger.info(" %s", m) bad_words.append([m, filename, c.line_number()]) + line_count = line_count + 1 bad_words = sorted(bad_words) - if output_lvl > 1: - print("\nResults") - for x in bad_words: - print(x) + logger.info("Results") + for x in bad_words: + logger.info(x) - return bad_words + return bad_words, line_count -def exclude_check(name, exclude_list): +def exclude_check(name: str, exclude_list: list[str] = None): """Return True if ``name`` matches any of the regular expressions listed in 
``exclude_list``.""" if exclude_list is None: @@ -272,7 +295,7 @@ def exclude_check(name, exclude_list): return False -def skip_check(name, skip_list): +def skip_check(name: str, skip_list: list[str] = None): """Return True if ``name`` matches any of the glob pattern listed in ``skip_list``.""" if skip_list is None: @@ -283,193 +306,115 @@ def skip_check(name, skip_list): return False -def parse_args(): - """parse the command-line arguments.""" - parser = argparse.ArgumentParser() - - parser.add_argument("filenames", nargs="*") - - parser.add_argument( - "--brief", - "-b", - action="store_true", - default=False, - dest="brief", - help="Make output brief", - ) - - parser.add_argument( - "--verbose", - "-v", - action="store_true", - default=False, - dest="verbose", - help="Make output verbose", - ) - - parser.add_argument( - "--first", - "-f", - action="store_true", - default=False, - dest="first", - help="Show only first occurrence of a mispelling", - ) - - parser.add_argument( - "--vim", - "-V", - action="store_true", - default=False, - dest="vim", - help="Output results in vim command format", - ) - - parser.add_argument( - "--dict", - "-d", - "--ignore-words", - "-I", - action="append", - dest="dict", - help="File that contains words that will be ignored." - " Argument can be passed multiple times." - " File must contain 1 word per line.", - ) - - parser.add_argument( - "--exclude", - "-e", - action="append", - dest="exclude", - help="Specify regex for excluding files." - " Argument can be passed multiple times.", - ) - - parser.add_argument( - "--skip", - "-S", - action="append", - help="Comma-separated list of files to skip. It " - "accepts globs as well. E.g.: if you want " - "coment_spell_check.py to skip .eps and .txt files, " - 'you\'d give "*.eps,*.txt" to this option.' - " Argument can be passed multiple times.", - ) - - parser.add_argument( - "--prefix", - "-p", - action="append", - default=[], - dest="prefixes", - help="Add word prefix. Argument can be passed multiple times.", - ) - - parser.add_argument( - "--miss", - "-m", - action="store_true", - default=False, - dest="miss", - help="Only output the misspelt words", - ) - - parser.add_argument( - "--suffix", - "-s", - action="append", - default=[".h"], - dest="suffix", - help="File name suffix. Argument can be passed multiple times.", - ) - - parser.add_argument( - "--type", - "-t", - action="store", - default="", - dest="mime_type", - help="Set file mime type. 
File name suffix will be ignored.", - ) - - parser.add_argument( - "--bibtex", - action="append", - dest="bibtex", - help="Bibtex file to load for additional dictionary words.", - ) - - parser.add_argument("--version", action="version", version=f"{__version__}") - - args = parser.parse_args() - return args - - -def add_dict(enchant_dict, filename, verbose=False): - """Update ``enchant_dict`` spell checking dictionary with the words listed - in ``filename`` (one word per line).""" - if verbose: - print(f"Additional dictionary: {filename}") - - with open(filename, encoding="utf-8") as f: - lines = f.read().splitlines() - - # You better not have more than 1 word in a line - for wrd in lines: - if not wrd.replace("'", "").isidentifier(): +def build_dictionary_list(args): + """build a list of dictionaries to use for spell checking.""" + dict_list = [] + initial_dct = Path(__file__).parent / "additional_dictionary.txt" + + logger = logging.getLogger("comment_spell_check") + if initial_dct.exists(): + dict_list.append(initial_dct) + else: + logger.warning("Initial dictionary not found: %s", initial_dct) + + if not isinstance(args.dict, list): + return dict_list + + for d in args.dict: + dpath = Path(d) + if dpath.exists(): + dict_list.append(dpath) + + return dict_list + + +def add_bibtex_words(spell: SpellChecker, bibtex_files: list[str]): + """Add words from bibtex files to the spell checker.""" + + if list is None: + return + + logger = logging.getLogger("comment_spell_check") + + for bibtex_file in bibtex_files: + logger.info("Loading bibtex file: %s", bibtex_file) + bibtex_loader.add_bibtex(spell, bibtex_file) + + +def output_results(args, bad_words): + """Output the results of the spell check.""" + + print("\nBad words\n" if not args.miss else "", end="") + + previous_word = "" + + for misspelled_word, found_file, line_num in bad_words: + if misspelled_word != previous_word and args.first: + print(f"\n{misspelled_word}:") + + if (misspelled_word == previous_word) and args.first: + sys.stderr.write(".") + continue + + if args.vim: + print(f"vim +{line_num} {found_file}", file=sys.stderr) + else: print( - "Warning: adding word with non-alphanumeric characters to dictionary:", - wrd, + f"file: {found_file:30} line: {line_num:3d} word: {misspelled_word}", + file=sys.stderr, ) - if not enchant_dict.check(wrd): - enchant_dict.add(wrd) + previous_word = misspelled_word -def create_spell_checker(args, output_lvl): - """Create a SpellChecker.""" + print(f"\n{len(bad_words)} misspellings found") - my_dict = Dict("en_US") - # Load the dictionary files - # - initial_dct = Path(__file__).parent / "additional_dictionary.txt" - if not initial_dct.exists(): - initial_dct = None - else: - add_dict(my_dict, str(initial_dct), any([args.brief, output_lvl >= 0])) +def setup_logger(args): + """Sets up a logger that outputs to the console.""" - if args.dict is not None: - for d in args.dict: - add_dict(my_dict, d, any([args.brief, output_lvl >= 0])) + level = logging.INFO + if args.verbose: + level = logging.DEBUG + print("Verbose mode enabled") + if args.miss: + level = logging.ERROR + if args.brief: + level = logging.WARNING - # Load the bibliography files - # - if args.bibtex is not None: - for bib in args.bibtex: - bibtex_loader.add_bibtex(my_dict, bib, any([args.brief, output_lvl >= 0])) + logger = logging.getLogger("comment_spell_check") + logger.setLevel(level) - # Create the spell checking object - spell_checker = SpellChecker(my_dict, filters=[EmailFilter, URLFilter]) + if level in (logging.INFO, 
logging.DEBUG): + # info and debug messages will be printed to the console - return spell_checker + # Create a console handler + ch = logging.StreamHandler() + ch.setLevel(level) + # Create a formatter + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) -def main(): + # Add formatter to ch + ch.setFormatter(formatter) + + # Add ch to logger + logger.addHandler(ch) + + return logger + + +def comment_spell_check(args): """comment_spell_check main function.""" - args = parse_args() + logger = setup_logger(args) - # Set the amount of debugging messages to print. - output_lvl = 1 - if args.brief: - output_lvl = 0 - else: - if args.verbose: - output_lvl = 2 - if args.miss: - output_lvl = -1 + dict_list = build_dictionary_list(args) - spell_checker = create_spell_checker(args, output_lvl) + spell = create_checker.create_checker(dict_list) + + if args.bibtex: + add_bibtex_words(spell, args.bibtex) file_list = [] if len(args.filenames): @@ -483,16 +428,14 @@ def main(): suffixes = [*set(args.suffix)] # remove duplicates - if any([args.brief, output_lvl >= 0]): - print(f"Prefixes: {prefixes}") - print(f"Suffixes: {suffixes}") + logger.info("Prefixes: %s\nSuffixes: %s", prefixes, suffixes) + + counts = [0, 0] # # Spell check the files # for f in file_list: - if not args.miss: - print(f"\nChecking {f}") # If f is a directory, recursively check for files in it. if os.path.isdir(f): @@ -501,75 +444,52 @@ def main(): for s in suffixes: dir_entries = dir_entries + glob.glob(f + "/**/*" + s, recursive=True) - if output_lvl > 0: - print(dir_entries) + logger.info(dir_entries) # spell check the files found in f for x in dir_entries: if exclude_check(x, args.exclude) or skip_check(x, args.skip): - if not args.miss: - print(f"\nExcluding {x}") + logger.info("Excluding %s", x) continue - if not args.miss: - print(f"\nChecking {x}") - result = spell_check_file( + logger.info("Checking %s", x) + result, lc = spell_check_file( x, - spell_checker, + spell, args.mime_type, - output_lvl=output_lvl, prefixes=prefixes, ) bad_words = sorted(bad_words + result) + counts[0] = counts[0] + 1 + counts[1] = counts[1] + lc else: # f is a file if exclude_check(f, args.exclude) or skip_check(f, args.skip): - if not args.miss: - print(f"\nExcluding {x}") + logger.info("Excluding %s", f) continue # f is a file, so spell check it - result = spell_check_file( + result, lc = spell_check_file( f, - spell_checker, + spell, args.mime_type, - output_lvl=output_lvl, prefixes=prefixes, ) - bad_words = sorted(bad_words + result) + counts[0] = counts[0] + 1 + counts[1] = counts[1] + lc - # Done spell checking. Print out all the words not found in our dictionary. 
- # - if not args.miss: - print("\nBad words") - - previous_word = "" - print("") + output_results(args, bad_words) - for misspelled_word, found_file, line_num in bad_words: - if misspelled_word != previous_word and args.first: - print(f"\n{misspelled_word}:") + logger.info("%s files checked, %s lines checked", counts[0], counts[1]) - if (misspelled_word == previous_word) and args.first: - sys.stderr.write(".") - continue - - if args.vim: - print(f"vim +{line_num} {found_file}", file=sys.stderr) - else: - print( - f"file: {found_file:30} line: {line_num:3d} word: {misspelled_word}", - file=sys.stderr, - ) - - previous_word = misspelled_word + sys.exit(len(bad_words)) - print("") - print(f"{len(bad_words)} misspellings found") - sys.exit(len(bad_words)) +def main(): + args = parseargs.parse_args() + comment_spell_check(args) if __name__ == "__main__": diff --git a/comment_spell_check/utils/__init__.py b/comment_spell_check/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/comment_spell_check/lib/bibtex_loader.py b/comment_spell_check/utils/bibtex_loader.py similarity index 52% rename from comment_spell_check/lib/bibtex_loader.py rename to comment_spell_check/utils/bibtex_loader.py index b715a2d..b18d84b 100644 --- a/comment_spell_check/lib/bibtex_loader.py +++ b/comment_spell_check/utils/bibtex_loader.py @@ -1,9 +1,11 @@ -""" Load Bibtex files into a spell checking dictionary. """ +"""Load Bibtex files into a spell checking dictionary.""" +import logging import bibtexparser +import spellchecker -def split_bibtex_name(name): +def split_bibtex_name(name: str): """ Split a Bibtex name, which is two words seperated by a number. """ @@ -17,19 +19,21 @@ def split_bibtex_name(name): return words -def add_bibtex(enchant_dict, filename, verbose=False): - """Update ``enchant_dict`` spell checking dictionary with names +def add_bibtex(spell: spellchecker.SpellChecker, filename: str): + """Update ``spell`` spell checking dictionary with names from ``filename``, a Bibtex file.""" - if verbose: - print(f"Bibtex file: {filename}") + logger = logging.getLogger("comment_spell_check.bibtex_loader") + logger.info("Bibtex file: %s", filename) + + word_list = [] with open(filename, "rt", encoding="utf-8") as biblatex_file: bib_database = bibtexparser.load(biblatex_file) for k in bib_database.get_entry_dict().keys(): words = split_bibtex_name(k) - for w in words: - enchant_dict.add(w) - if verbose: - print("Added Bibtex word:", w) + word_list.extend(words) + + logger.info("Words: %s", word_list) + spell.word_frequency.load_words(word_list) diff --git a/comment_spell_check/utils/create_checker.py b/comment_spell_check/utils/create_checker.py new file mode 100644 index 0000000..3111e85 --- /dev/null +++ b/comment_spell_check/utils/create_checker.py @@ -0,0 +1,34 @@ +"""Create a case sensitive spell checker with the English dictionary and +additional dictionaries if provided. 
+""" + +import logging +import importlib.resources +import spellchecker + + +def create_checker(dict_list: list[str] = None) -> spellchecker.SpellChecker: + """Create a case sensitive spell checker with the English dictionary and + additional dictionaries if provided.""" + + logger = logging.getLogger("comment_spell_check.create_checker") + + # create an empty SpellChecker object, because we want a case + # sensitive checker + checker = spellchecker.SpellChecker(language=None, case_sensitive=True) + + # load the English dictionary + lib_path = importlib.resources.files(spellchecker) + english_dict = str(lib_path) + "/resources/en.json.gz" + logger.info("Loading English dictionary from: %s", english_dict) + checker.word_frequency.load_dictionary(english_dict) + + # load the additional dictionaries + if not isinstance(dict_list, list): + return checker + if len(dict_list) > 0: + for d in dict_list: + logger.info("Loading additional dictionary from: %s", d) + checker.word_frequency.load_text_file(d) + + return checker diff --git a/comment_spell_check/utils/parseargs.py b/comment_spell_check/utils/parseargs.py new file mode 100644 index 0000000..8b84dd1 --- /dev/null +++ b/comment_spell_check/utils/parseargs.py @@ -0,0 +1,137 @@ +import argparse +from importlib.metadata import version, PackageNotFoundError + +__version__ = "unknown" + +try: + __version__ = version("comment_spell_check") +except PackageNotFoundError: + # package is not installed + pass + + +def create_parser(): + parser = argparse.ArgumentParser() + + parser.add_argument("filenames", nargs="*") + + parser.add_argument( + "--brief", + "-b", + action="store_true", + default=False, + dest="brief", + help="Make output brief", + ) + + parser.add_argument( + "--verbose", + "-v", + action="store_true", + default=False, + dest="verbose", + help="Make output verbose", + ) + + parser.add_argument( + "--first", + "-f", + action="store_true", + default=False, + dest="first", + help="Show only first occurrence of a mispelling", + ) + + parser.add_argument( + "--vim", + "-V", + action="store_true", + default=False, + dest="vim", + help="Output results in vim command format", + ) + + parser.add_argument( + "--dict", + "-d", + "--ignore-words", + "-I", + action="append", + dest="dict", + help="File that contains words that will be ignored." + " Argument can be passed multiple times." + " File must contain 1 word per line.", + ) + + parser.add_argument( + "--exclude", + "-e", + action="append", + dest="exclude", + help="Specify regex for excluding files." + " Argument can be passed multiple times.", + ) + + parser.add_argument( + "--skip", + "-S", + action="append", + help="Comma-separated list of files to skip. It " + "accepts globs as well. E.g.: if you want " + "coment_spell_check.py to skip .eps and .txt files, " + 'you\'d give "*.eps,*.txt" to this option.' + " Argument can be passed multiple times.", + ) + + parser.add_argument( + "--prefix", + "-p", + action="append", + default=[], + dest="prefixes", + help="Add word prefix. Argument can be passed multiple times.", + ) + + parser.add_argument( + "--miss", + "-m", + action="store_true", + default=False, + dest="miss", + help="Only output the misspelt words", + ) + + parser.add_argument( + "--suffix", + "-s", + action="append", + default=[".h"], + dest="suffix", + help="File name suffix. Argument can be passed multiple times.", + ) + + parser.add_argument( + "--type", + "-t", + action="store", + default="", + dest="mime_type", + help="Set file mime type. 
File name suffix will be ignored.", + ) + + parser.add_argument( + "--bibtex", + action="append", + dest="bibtex", + help="Bibtex file to load for additional dictionary words.", + ) + + parser.add_argument("--version", action="version", version=f"{__version__}") + return parser + + +def parse_args(parser=create_parser()): + """parse the command-line arguments.""" + + args = parser.parse_args() + return args diff --git a/comment_spell_check/utils/url_remove.py b/comment_spell_check/utils/url_remove.py new file mode 100644 index 0000000..74e346b --- /dev/null +++ b/comment_spell_check/utils/url_remove.py @@ -0,0 +1,20 @@ +"""Module to remove URLs from a string.""" + +import re + + +def remove_urls(text): + """ + Removes URLs from a string using a regular expression. + + Args: + text: The input string. + + Returns: + The string with URLs removed. + """ + url_pattern = re.compile( + r"(?:https?:\/\/)?[\w.-]+\.[\w.-]+[^\s]*", + re.IGNORECASE, + ) + return url_pattern.sub("", text) diff --git a/requirements.txt b/requirements.txt index 1e7beed..552927d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ comment_parser -pyenchant +pyspellchecker bibtexparser diff --git a/tests/example.h b/tests/example.h index 8eee308..da3d44b 100644 --- a/tests/example.h +++ b/tests/example.h @@ -29,7 +29,7 @@ int test_int; int hello_world() { - // Sup, bro? + // Sup, dude? print("Mmmm, pie."); } diff --git a/tests/test_comment_spell_check.py b/tests/test_comment_spell_check.py index f42a7dc..b745b2b 100644 --- a/tests/test_comment_spell_check.py +++ b/tests/test_comment_spell_check.py @@ -33,9 +33,8 @@ def test_basic(self): """Basic test""" runresult = subprocess.run( [ - "python", - "comment_spell_check.py", - "--verbose", + "comment_spell_check", + "--miss", "--dict", "../tests/dict.txt", "--prefix", @@ -51,8 +50,7 @@ def test_codebase(self): """Code base test""" runresult = subprocess.run( [ - "python", - "comment_spell_check.py", + "comment_spell_check", "--verbose", "--prefix", "myprefix", @@ -71,8 +69,7 @@ def test_version(self): """Version test""" runresult = subprocess.run( [ - "python", - "comment_spell_check.py", + "comment_spell_check", "--version", ], cwd="comment_spell_check", @@ -89,9 +86,7 @@ def test_bibtex(self): """Bibtext test""" runresult = subprocess.run( [ - "python", - "comment_spell_check.py", - "--verbose", + "comment_spell_check", "--bibtex", "../tests/itk.bib", "../tests/bibtest.py", @@ -100,3 +95,7 @@ def test_bibtex(self): stdout=subprocess.PIPE, ) self.assertEqual(runresult.returncode, 0, runresult.stdout) + + +if __name__ == "__main__": + unittest.main()
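
A minimal usage sketch, not part of the patch itself, showing how the relocated utilities introduced above are expected to compose after this refactor. It assumes the comment_spell_check package is installed so its modules are importable; the sample comment string and the choice to pass no extra dictionaries are illustrative only.

# Illustrative sketch: exercises create_checker, url_remove, and
# find_misspellings as laid out in this change. The input text is made up.
from comment_spell_check.comment_spell_check import find_misspellings
from comment_spell_check.utils import create_checker, url_remove

# Case-sensitive pyspellchecker instance preloaded with the bundled English
# dictionary; a list of extra word-list file paths could be passed instead.
spell = create_checker.create_checker()

comment = "Teh quick brown fox, see https://example.org for detials."

# URLs are stripped before checking, mirroring spell_check_comment().
cleaned = url_remove.remove_urls(comment)

# Prints whichever words are not found in the loaded dictionary.
print(find_misspellings(spell, cleaned))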