update a 0.1.3, support for more matchings and deleting local data

longemen3000 · Aug 1, 2021 · b805f1b · b805f1b · longemen3000 · Aug 1, 2021
1 parent 09f5dd2
commit b805f1b
Show file tree

Hide file tree

Showing 6 changed files with 86 additions and 32 deletions.
diff --git a/LocalPreferences.toml b/LocalPreferences.toml
@@ -0,0 +1,2 @@
+[ChemicalIdentifiers]
+CLEAR_CACHE = "false"
diff --git a/Project.toml b/Project.toml
@@ -1,13 +1,14 @@
 name = "ChemicalIdentifiers"
 uuid = "fa4ea961-1416-484e-bda2-883ee1634ba5"
-authors = ["longemen3000 <[email protected]> and contributors"]
-version = "0.1.2"
+authors = ["Andrés Riedemann <[email protected]> and contributors"]
+version = "0.1.3"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Scratch = "6c6a2e73-6563-6170-7368-637461726353"
+UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
 [compat]

diff --git a/README.md b/README.md
@@ -87,7 +87,7 @@ filepath = "path/to/my/db.tsv"
 ChemicalIdentifiers.load_data!(:custom,file = filepath)
 ChemicalIdentifiers.load_db!(:custom)
 ```
-`ChemicalIdentifiers.load_data!` will generate a named tuple of file paths (stored in `ChemicalIdentifiers.DATA_INFO`), and `ChemicalIdentifiers.load_db!` will use that data to generate the corresponding Apache Arrow files and store those in a [scratch](https://github.com/JuliaPackaging/Scratch.jl) space (`ChemicalIdentifiers.download_cache`). 
+`ChemicalIdentifiers.load_data!` will generate a named tuple of file paths (stored in `ChemicalIdentifiers.DATA_INFO`), and `ChemicalIdentifiers.load_db!` will use that data to generate the corresponding Apache Arrow files and store those in a [scratch](https://github.com/JuliaPackaging/Scratch.jl) space (`ChemicalIdentifiers.download_cache`). This download cache can be cleared (in case a download goes wrong) with `ChemicalIdentifiers.clear_download_cache!()`; the cache is only marked for deletion, and it is actually removed the next time the package is loaded in a new Julia session.
 
 The raw databases are then stored in `ChemicalIdentifiers.DATA_DB`. If the data was already processed, the Arrow files are read directly, saving significant loading time.
 

diff --git a/src/ChemicalIdentifiers.jl b/src/ChemicalIdentifiers.jl
@@ -3,13 +3,14 @@ module ChemicalIdentifiers
     const DATA_DB = Dict{Symbol,Any}()   
     const DATA_INFO = Dict{Symbol,Any}()
 
-    import Unicode,Downloads,Arrow
+    import UUIDs, Unicode, Downloads
+    import Arrow
     import Scratch, Preferences
     export search_chemical
 
     download_cache = ""    
 
-
+    const PKG_UUID = parse(UUIDs.UUID,"fa4ea961-1416-484e-bda2-883ee1634ba5")
 
 """
     function search_chemical(query,cache=ChemicalIdentifiers.SEARCH_CACHE)
@@ -81,10 +82,10 @@ The package stores each query in `ChemicalIdentifiers.SEARCH_CACHE` as a `Dict{S
 If you don't want to store the query, you could use `search_chemical(query,nothing)`, or, if you want your own cache to be used, pass your own cache via `search_chemical(query,mycache)`. 
     
 """
-    function search_chemical end
-    include("data_script.jl")
-    include("search_types.jl")
-    include("search.jl")
+  function search_chemical end
+  include("data_script.jl")
+  include("search_types.jl")
+  include("search.jl")
 
   function _precompile_()
     ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
@@ -94,17 +95,27 @@ If you don't want to store the query, you could use `search_chemical(query,nothi
     Base.precompile(search_chemical_id,(AnyQuery,))
   end
 
-  function __init__()
-      global download_cache = Scratch.@get_scratch!("databases")
-
-
-      url_short = "https://github.com/CalebBell/chemicals/raw/master/chemicals/Identifiers/chemical%20identifiers%20pubchem%20small.tsv"
-      url_long = "https://github.com/CalebBell/chemicals/raw/master/chemicals/Identifiers/chemical%20identifiers%20pubchem%20large.tsv"
-      load_data!(:short,url= url_short)
-      load_data!(:long,url = url_long)
+  function clear_download_cache!()  # only flags the cache for deletion; the removal itself happens in __init__ on the next load
+    Preferences.@set_preferences!("CLEAR_CACHE" => "true")  # NOTE(review): Preferences.jl may refuse to overwrite an already-set value unless force=true is passed — confirm
+    @info("ChemicalIdentifiers.jl download cache has been marked for deletion; restart your Julia session for this change to take effect.")
+  end  
 
-      load_db!(:short)
-      load_db!(:long)
+  function __init__()  # module load hook: honor a pending cache deletion, then register and load the :short and :long databases
+    clear_cache = Preferences.@load_preference("CLEAR_CACHE","false")  # falls back to "false" when the preference is not set
+    if clear_cache == "true"  # the flag is written by clear_download_cache!()
+      @info "deleting download cache..."
+      Scratch.delete_scratch!(PKG_UUID,"databases")  # removes this package's "databases" scratch space
+      Preferences.@set_preferences!("CLEAR_CACHE" => "false")  # reset the flag so the deletion is not repeated; NOTE(review): may need force=true to overwrite — confirm
+    end  
+    global download_cache = Scratch.@get_scratch!("databases")  # path of the "databases" scratch space, stored in the module-level global
+
+    url_short = "https://github.com/CalebBell/chemicals/raw/master/chemicals/Identifiers/chemical%20identifiers%20pubchem%20small.tsv"
+    url_long = "https://github.com/CalebBell/chemicals/raw/master/chemicals/Identifiers/chemical%20identifiers%20pubchem%20large.tsv"
+    load_data!(:short,url= url_short)  # register the source URL under the :short key
+    load_data!(:long,url = url_long)  # register the source URL under the :long key
+    load_db!(:short)  # load_db! downloads the source file first when the text database file is missing
+    load_db!(:long)
+    return nothing
   end
 end
 
diff --git a/src/data_script.jl b/src/data_script.jl
@@ -22,7 +22,6 @@ end
 
 function load_db!(dbtype::Symbol)
     data = DATA_INFO[dbtype]
-
     if !isfile(data.textdb)
         @info ":" * string(dbtype) * " database file not found, downloading from " * data.url
         url  = data.url
@@ -64,11 +63,10 @@ function parse_and_write_db!(dbtype::Symbol)
     common_name = Vector{String}(undef,i)
     _synonyms  = Vector{Vector{String}}(undef,i)
 
-    #@show i
     i = 0
     for line in eachline(path)
         i += 1
-        strs = line |> z->rstrip(z,'\n') |> z->split(z,'\t')
+        strs = line |> z->strip(z) |> z->split(z,('\t',';'))
         pubchemid[i] = parse(Int64,strs[1])
         CAS[i] = cas_parse(strs[2])
         formula[i] = strs[3]
@@ -80,21 +78,28 @@ function parse_and_write_db!(dbtype::Symbol)
         common_name[i] = strs[9]
         _synonyms[i]  = strs[10:end]
     end
-
+    
     syms_i = mapreduce(length,+,_synonyms)
     synonyms_list = Vector{String}(undef,syms_i)
     synonyms_index = Vector{Int}(undef,syms_i)
 
+    #For some reason, some empty strings are generated as synonyms;
+    #they are skipped here, and both vectors are shrunk to the final count afterwards.
+
     k = 0
     for (ii,sym_vec) in pairs(_synonyms)
 
         for (jj,sym) in pairs(sym_vec)
-            k+=1
-            synonyms_list[k] = sym
-            synonyms_index[k] = ii
+            if !isempty(sym)
+                k+=1
+                synonyms_list[k] = sym
+                synonyms_index[k] = ii
+            end
         end
     end
 
+    resize!(synonyms_list,k)
+    resize!(synonyms_index,k)
 
     pubchemid_sort =sortperm(pubchemid)
     CAS_sort = sortperm(CAS)
@@ -122,6 +127,7 @@ function parse_and_write_db!(dbtype::Symbol)
     arrow_sort_db = Arrow.Table(data.sorteddb)
     return arrow_db,arrow_synonym_db,arrow_sort_db
 end
+
 """
     load_data!(key::Symbol;url=nothing,file=nothing)
 

diff --git a/src/search.jl b/src/search.jl
@@ -1,4 +1,24 @@
 const SEARCH_CACHE = Dict{String,Any}()
+"""
+    synonyms(query) 
+
+Given a chemical search query, return the synonyms associated with that query.
+This function does not cache its results.
+
+# Examples:
+```repl
+synonyms("water")
+```
+"""
+function synonyms(query)
+    compound_id,key = search_chemical_id(detect_query(query))  # resolve the query to a (compound index, database key) pair
+    return __synonyms(compound_id,key)  # NOTE(review): search_chemical_id can return (-1, :not_found); that case is passed on unchecked — confirm DATA_DB[:not_found] is handled
+end
+
+function __synonyms(idx,key)  # return every synonym recorded for compound index idx in the database stored under key
+    db,sdb = DATA_DB[key]  # only sdb (the element with the `list` and `index` columns) is used below
+    return sdb.list[findall(isequal(idx),sdb.index)]  # positions where sdb.index equals idx select the matching sdb.list entries
+end
 
 function build_result(idx,key)
     if idx == -1
@@ -112,21 +132,33 @@ function search_chemical_id(ID::AnyQuery;skip_common_name = false,try_strategies
             return compound_id,key 
         end
     end
-
+ 
     if !search_done
         if !try_strategies #bail out here if requested
             return -1,:not_found
         end
     end
-    #result not found, trying same strategies as present in CalebBell/Chemicals
-    #strategy 1: trying without spaces and dashs
-    _ids = Vector{String}(undef,5)
+    #==
+    result not found, trying same strategies as present in CalebBell/Chemicals
+    #strategy 1: trying without spaces and dashes.
+    ==#
+    _ids = Vector{String}(undef,7)
     _ids[1] = Unicode.normalize(id,casefold = true,stripmark=true)
     _ids[2] = replace(_ids[1]," "=>"")
     _ids[3] = replace(_ids[2],"-"=>"")
     _ids[4] = replace(id," "=>"")
     _ids[5] = replace(_ids[4],"-"=>"")
-
+
+    #these candidates match chemicals of the form n-name
+    #or 1-name by dropping the two-character prefix
+    _ids[6] = begin
+        if occursin(r"^1-[A-Za-z]+$",_ids[1]) |  occursin(r"^n-[A-Za-z]+$",_ids[1])
+            chop(id,head=2,tail=0)
+        else
+            _ids[6] = id
+        end
+    end
+    _ids[7] = Unicode.normalize(_ids[6],casefold = true,stripmark=true) 
     _ids = unique!(_ids)
     _ids = setdiff!(_ids,[id])
 
@@ -137,6 +169,8 @@ function search_chemical_id(ID::AnyQuery;skip_common_name = false,try_strategies
             break
         end
     end
+
+
     #strategy 2: trying to match in the form 'water (H2O)'
     if !search_done 
         re = r"\w+\s+\([\s\w]+\)"     
@@ -152,7 +186,7 @@ function search_chemical_id(ID::AnyQuery;skip_common_name = false,try_strategies
             end
         end
     end
-
+    
     #if something worked, return here, else, return not found
     if search_done
        return compound_id,key