Skip to content

Commit

Permalink
update a 0.1.3, support for more matchings and deleting local data
Browse files Browse the repository at this point in the history
  • Loading branch information
longemen3000 committed Aug 1, 2021
1 parent 09f5dd2 commit b805f1b
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 32 deletions.
2 changes: 2 additions & 0 deletions LocalPreferences.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[ChemicalIdentifiers]
CLEAR_CACHE = "false"
5 changes: 3 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
name = "ChemicalIdentifiers"
uuid = "fa4ea961-1416-484e-bda2-883ee1634ba5"
authors = ["longemen3000 <[email protected]> and contributors"]
version = "0.1.2"
authors = ["Andrés Riedemann <[email protected]> and contributors"]
version = "0.1.3"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[compat]
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ filepath = "path/to/my/db.tsv"
ChemicalIdentifiers.load_data!(:custom,file = filepath)
ChemicalIdentifiers.load_db!(:custom)
```
`ChemicalIdentifiers.load_data!` will generate a named tuple of file paths (stored in `ChemicalIdentifiers.DATA_INFO`), and `ChemicalIdentifiers.load_db!` will use that data to generate the corresponding Apache Arrow files and store those in a [scratch](https://github.com/JuliaPackaging/Scratch.jl) space (`ChemicalIdentifiers.download_cache`).
`ChemicalIdentifiers.load_data!` will generate a named tuple of file paths (stored in `ChemicalIdentifiers.DATA_INFO`), and `ChemicalIdentifiers.load_db!` will use that data to generate the corresponding Apache Arrow files and store those in a [scratch](https://github.com/JuliaPackaging/Scratch.jl) space (`ChemicalIdentifiers.download_cache`). This download cache can be cleaned (in case a download goes wrong) with `ChemicalIdentifiers.clear_download_cache!()`.

The raw databases are then stored in `ChemicalIdentifiers.DATA_DB`. If the data has already been processed, the Arrow files are read directly, saving significant loading time.

Expand Down
43 changes: 27 additions & 16 deletions src/ChemicalIdentifiers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ module ChemicalIdentifiers
const DATA_DB = Dict{Symbol,Any}()
const DATA_INFO = Dict{Symbol,Any}()

import Unicode,Downloads,Arrow
import UUIDs, Unicode, Downloads
import Arrow
import Scratch, Preferences
export search_chemical

download_cache = ""


const PKG_UUID = parse(UUIDs.UUID,"fa4ea961-1416-484e-bda2-883ee1634ba5")

"""
function search_chemical(query,cache=ChemicalIdentifiers.SEARCH_CACHE)
Expand Down Expand Up @@ -81,10 +82,10 @@ The package stores each query in `ChemicalIdentifiers.SEARCH_CACHE` as a `Dict{S
If you don't want to store the query, you could use `search_chemical(query,nothing)`, or, if you want your own cache to be used, pass your own cache via `search_chemical(query,mycache)`.
"""
function search_chemical end
include("data_script.jl")
include("search_types.jl")
include("search.jl")
function search_chemical end
include("data_script.jl")
include("search_types.jl")
include("search.jl")

function _precompile_()
ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
Expand All @@ -94,17 +95,27 @@ If you don't want to store the query, you could use `search_chemical(query,nothi
Base.precompile(search_chemical_id,(AnyQuery,))
end

function __init__()
global download_cache = Scratch.@get_scratch!("databases")


url_short = "https://github.com/CalebBell/chemicals/raw/master/chemicals/Identifiers/chemical%20identifiers%20pubchem%20small.tsv"
url_long = "https://github.com/CalebBell/chemicals/raw/master/chemicals/Identifiers/chemical%20identifiers%20pubchem%20large.tsv"
load_data!(:short,url= url_short)
load_data!(:long,url = url_long)
"""
    clear_download_cache!()

Mark the download cache for deletion by setting the `CLEAR_CACHE` package
preference to `"true"`, and log a notice asking the user to restart Julia.

Nothing is deleted by this call itself; the preference is read back when the
module is initialized (see `__init__`). Returns `nothing`.
"""
function clear_download_cache!()
    # Persist the request so that it survives until the next session start.
    Preferences.@set_preferences!("CLEAR_CACHE" => "true")
    @info "ChemicalIdentifiers.jl download cache has been marked for deletion; restart your Julia session for this change to take effect."
    return nothing
end

load_db!(:short)
load_db!(:long)
# Module initializer. In order, it:
#   1. deletes the "databases" scratch space if the CLEAR_CACHE preference is
#      "true", then resets that preference to "false";
#   2. (re)acquires the "databases" scratch space and stores its path in the
#      module-level `download_cache`;
#   3. registers the :short and :long data sources with their download URLs;
#   4. loads both databases.
function __init__()
    should_clear = Preferences.@load_preference("CLEAR_CACHE", "false")
    if should_clear == "true"
        @info "deleting download cache..."
        Scratch.delete_scratch!(PKG_UUID, "databases")
        # Reset the flag so the cache is only wiped once per request.
        Preferences.@set_preferences!("CLEAR_CACHE" => "false")
    end
    global download_cache = Scratch.@get_scratch!("databases")

    db_urls = (
        :short => "https://github.com/CalebBell/chemicals/raw/master/chemicals/Identifiers/chemical%20identifiers%20pubchem%20small.tsv",
        :long => "https://github.com/CalebBell/chemicals/raw/master/chemicals/Identifiers/chemical%20identifiers%20pubchem%20large.tsv",
    )

    # Register every data source first, then load each database, keeping the
    # :short-before-:long order in both passes.
    for (dbkey, dburl) in db_urls
        load_data!(dbkey, url = dburl)
    end
    for (dbkey, _) in db_urls
        load_db!(dbkey)
    end
    return nothing
end
end

20 changes: 13 additions & 7 deletions src/data_script.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ end

function load_db!(dbtype::Symbol)
data = DATA_INFO[dbtype]

if !isfile(data.textdb)
@info ":" * string(dbtype) * " database file not found, downloading from " * data.url
url = data.url
Expand Down Expand Up @@ -64,11 +63,10 @@ function parse_and_write_db!(dbtype::Symbol)
common_name = Vector{String}(undef,i)
_synonyms = Vector{Vector{String}}(undef,i)

#@show i
i = 0
for line in eachline(path)
i += 1
strs = line |> z->rstrip(z,'\n') |> z->split(z,'\t')
strs = line |> z->strip(z) |> z->split(z,('\t',';'))
pubchemid[i] = parse(Int64,strs[1])
CAS[i] = cas_parse(strs[2])
formula[i] = strs[3]
Expand All @@ -80,21 +78,28 @@ function parse_and_write_db!(dbtype::Symbol)
common_name[i] = strs[9]
_synonyms[i] = strs[10:end]
end

syms_i = mapreduce(length,+,_synonyms)
synonyms_list = Vector{String}(undef,syms_i)
synonyms_index = Vector{Int}(undef,syms_i)

#For some reason, some empty strings are generated as synonyms;
#those are eliminated here.

k = 0
for (ii,sym_vec) in pairs(_synonyms)

for (jj,sym) in pairs(sym_vec)
k+=1
synonyms_list[k] = sym
synonyms_index[k] = ii
if !isempty(sym)
k+=1
synonyms_list[k] = sym
synonyms_index[k] = ii
end
end
end

resize!(synonyms_list,k)
resize!(synonyms_index,k)

pubchemid_sort =sortperm(pubchemid)
CAS_sort = sortperm(CAS)
Expand Down Expand Up @@ -122,6 +127,7 @@ function parse_and_write_db!(dbtype::Symbol)
arrow_sort_db = Arrow.Table(data.sorteddb)
return arrow_db,arrow_synonym_db,arrow_sort_db
end

"""
load_data!(key::Symbol;url=nothing,file=nothing)
Expand Down
46 changes: 40 additions & 6 deletions src/search.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,24 @@
const SEARCH_CACHE = Dict{String,Any}()
"""
synonyms(query)
Given a chemical search query, return the synonyms associated with that query.
This function does not use any cache.
# Examples:
```repl
synonyms("water")
```
"""
function synonyms(query)
    # Normalize the raw query, then resolve it to a (compound id, database key) pair.
    parsed_query = detect_query(query)
    compound_id, key = search_chemical_id(parsed_query)
    # NOTE(review): `search_chemical_id` can return `(-1, :not_found)`; that pair is
    # forwarded unchanged here -- confirm the lookup below handles it as intended.
    return __synonyms(compound_id, key)
end

# Return every entry of the synonym list whose index entry equals `idx`,
# using the synonym table stored under `key` in `DATA_DB`.
function __synonyms(idx, key)
    # The stored value destructures as (main db, synonym db, ...); only the
    # synonym table is needed here.
    _, synonym_db = DATA_DB[key]
    matching_rows = findall(isequal(idx), synonym_db.index)
    return synonym_db.list[matching_rows]
end

function build_result(idx,key)
if idx == -1
Expand Down Expand Up @@ -112,21 +132,33 @@ function search_chemical_id(ID::AnyQuery;skip_common_name = false,try_strategies
return compound_id,key
end
end

if !search_done
if !try_strategies #bail out here if requested
return -1,:not_found
end
end
#result not found, trying same strategies as present in CalebBell/Chemicals
#strategy 1: trying without spaces and dashs
_ids = Vector{String}(undef,5)
#==
result not found, trying same strategies as present in CalebBell/Chemicals
#strategy 1: trying without spaces and dashes.
==#
_ids = Vector{String}(undef,7)
_ids[1] = Unicode.normalize(id,casefold = true,stripmark=true)
_ids[2] = replace(_ids[1]," "=>"")
_ids[3] = replace(_ids[2],"-"=>"")
_ids[4] = replace(id," "=>"")
_ids[5] = replace(_ids[4],"-"=>"")


#those matches find chemicals of the form n-name
#or 1-name
_ids[6] = begin
if occursin(r"^1-[A-Za-z]+$",_ids[1]) | occursin(r"^n-[A-Za-z]+$",_ids[1])
chop(id,head=2,tail=0)
else
_ids[6] = id
end
end
_ids[7] = Unicode.normalize(_ids[6],casefold = true,stripmark=true)
_ids = unique!(_ids)
_ids = setdiff!(_ids,[id])

Expand All @@ -137,6 +169,8 @@ function search_chemical_id(ID::AnyQuery;skip_common_name = false,try_strategies
break
end
end


#strategy 2: trying to match in the form 'water (H2O)'
if !search_done
re = r"\w+\s+\([\s\w]+\)"
Expand All @@ -152,7 +186,7 @@ function search_chemical_id(ID::AnyQuery;skip_common_name = false,try_strategies
end
end
end

#if something worked, return here, else, return not found
if search_done
return compound_id,key
Expand Down

2 comments on commit b805f1b

@longemen3000
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/41952

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.3 -m "<description of version>" b805f1b2e6d93be8f1e71e5fca0888abbaa8d63a
git push origin v0.1.3

Please sign in to comment.