diff --git a/Project.toml b/Project.toml index 04f8db4..279dd4d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PubChemCrawler" uuid = "30e472fa-2b12-4c0b-9705-07d174b7a4e1" authors = ["Tim Holy and contributors"] -version = "1.0.0" +version = "1.1.0" [deps] HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" diff --git a/docs/src/index.md b/docs/src/index.md index f3ece1a..cef37a8 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -49,7 +49,7 @@ julia> using CSV, DataFrames julia> df = CSV.File(get_for_cids(2244; properties="MolecularFormula,MolecularWeight,XLogP,IsomericSMILES", output="CSV")) |> DataFrame 1×5 DataFrame │ Row │ CID │ MolecularFormula │ MolecularWeight │ XLogP │ IsomericSMILES │ -│ │ Int64 │ String │ Float64 │ Float64 │ String │ +│ │ $Int │ String │ Float64 │ Float64 │ String │ ├─────┼───────┼──────────────────┼─────────────────┼─────────┼──────────────────────────┤ │ 1 │ 2244 │ C9H8O4 │ 180.16 │ 1.2 │ CC(=O)OC1=CC=CC=C1C(=O)O │ ``` @@ -65,13 +65,13 @@ julia> open("/tmp/aspirin.sdf", "w") do io 3637 ``` -Finally, you can perform substructure searches. Let's retrieve up to 10 [bicyclic](https://en.wikipedia.org/wiki/Bicyclic_molecule) compounds: +Finally, you can perform substructure searches. Let's retrieve up to 10 [bicyclic](https://en.wikipedia.org/wiki/Bicyclic_molecule) compounds using a [SMARTS](https://en.wikipedia.org/wiki/SMILES_arbitrary_target_specification) search: ```julia julia> cids = query_substructure_pug(smarts = "[\$([*R2]([*R])([*R])([*R]))].[\$([*R2]([*R])([*R])([*R]))]", maxhits = 10) ┌ Warning: maxhits was hit, results are partial └ @ PubChemCrawler ~/.julia/dev/PubChemCrawler/src/pugxml.jl:164 -10-element Vector{Int64}: +10-element Vector{$Int}: 135398658 5280795 5430 diff --git a/src/pugxml.jl b/src/pugxml.jl index 6a31da7..7a74abd 100644 --- a/src/pugxml.jl +++ b/src/pugxml.jl @@ -12,7 +12,7 @@ Retrieve a list of compounds containing a substructure specified via its `cid`, julia> using PubChemCrawler julia> cids = query_substructure_pug(smarts="[r13]Br") # query brominated 13-atom rings -66-element Vector{Int64}: +66-element Vector{$Int}: 54533707 153064026 152829033 diff --git a/src/query.jl b/src/query.jl index 91b2f3e..2e03dd9 100644 --- a/src/query.jl +++ b/src/query.jl @@ -30,8 +30,6 @@ The output is a `Vector{UInt8}`. For `output="CSV"`, a good choice to generate a `DataFrame(CSV.File(msg))` from the DataFrames and CSV packages, respectively. Alternatively `String(msg)` will convert it to a string, which you can write to a file. -For complex queries that risk timing out, consider [`query_substructure_pug`](@ref). - # Example ``` @@ -43,7 +41,7 @@ julia> cid = get_cid(name="estriol") julia> df = CSV.File(query_substructure(;cid)) |> DataFrame # on Julia 1.0, use `(;cid=cid)` 11607×4 DataFrame │ Row │ CID │ MolecularFormula │ MolecularWeight │ XLogP │ -│ │ Int64 │ String │ Float64 │ Float64? │ +│ │ $Int │ String │ Float64 │ Float64? │ ├──────┼───────────┼──────────────────┼─────────────────┼──────────┤ │ 1 │ 5756 │ C18H24O3 │ 288.4 │ 2.5 │ │ 2 │ 5281904 │ C24H32O9 │ 464.5 │ 1.1 │ @@ -52,6 +50,9 @@ julia> df = CSV.File(query_substructure(;cid)) |> DataFrame # on Julia 1.0, ``` will query for derivatives of [estriol](https://en.wikipedia.org/wiki/Estriol). + +!!! info + For complex queries that risk timing out, consider [`query_substructure_pug`](@ref) in combination with [`get_for_cids`](@ref). """ function query_substructure(;cid=nothing, smiles=nothing, smarts=nothing, # inputs properties="MolecularFormula,MolecularWeight,XLogP,", # http://pubchemdocs.ncbi.nlm.nih.gov/pug-rest, "Compound Property Tables" @@ -76,29 +77,50 @@ function query_substructure(;cid=nothing, smiles=nothing, smarts=nothing, end """ - msg = get_for_cids(cids; properties=nothing, xrefs=nothing, output="CSV") + msg = get_for_cids(cids; properties|xrefs|cids_type|record_type, output="CSV") -Retrieve the given `properties` or `xrefs` for a list of compounds specified by their `cids`. +Retrieve the given `properties`, `xrefs`, CIDs, or records, respectively, for a list of compounds specified by their `cids`. +The documentation for these traits can be found at http://pubchemdocs.ncbi.nlm.nih.gov/pug-rest; this URL will be referred to as +PUGREST below. -See [`query_substructure`](@ref) for information about the arguments and return value. -The supported values for `xrefs` are available at https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest under "XRefs". +- `properties` include structural features like the molecular formula, number of undefined stereocenters, and so on. + Specify these as a comma-separated list from among the choices in PUGREST under "Compound Property Tables". +- `xrefs` ("cross-references") include identifiers used by other databases, e.g., the CAS (Registry) number, PubMedID, and so on. + The supported values for `xrefs` are available at PUGREST under "XRefs". +- `cids_type` is used to retrieve CIDs for compounds related to those specified in `cids`; see PUGREST under "SIDS / CIDS / AIDS". +- `record_type` is used to retrieve data files and to specify options for these files, e.g., 2d or 3d SDF files. + See PUGREST under "Full-record Retrieval". -# Example +`output` specifies the output format. Not all options are applicable to all queries; for example, "CSV" is appropriate for +`properties` queries but "SDF" might be used for a `record_type` query. See PUGREST, "Output". + +# Examples ``` -julia> using PubChem, JSON3 +julia> using PubChemCrawler, CSV, DataFrames, JSON3 julia> cids = [get_cid(name="cyclic guanosine monophosphate"), get_cid(name="aspirin")] -2-element Array{Int64,1}: +2-element Array{$Int,1}: 135398570 2244 -julia> dct = JSON3.read(get_for_cids(cids; xrefs="RN,", output="JSON")) # get the Registry Number(s) (CAS) -JSON3.Object{Array{UInt8,1},Array{UInt64,1}} with 1 entry: - :InformationList => {… +julia> CSV.File(get_for_cids(cids; properties="MolecularFormula,XLogP", output="CSV")) |> DataFrame +2×3 DataFrame + Row │ CID MolecularFormula XLogP + │ $Int String Float64 +─────┼────────────────────────────────────── + 1 │ 135398570 C10H12N5O7P -3.4 + 2 │ 2244 C9H8O4 1.2 + +julia> open("/tmp/aspirin_3d.sdf", "w") do io # save the 3d SDF file for aspirin (CID 2244) + write(io, get_for_cids(2244; record_type="3d", output="SDF")) + end +4055 + +julia> dct = JSON3.read(get_for_cids(cids; xrefs="RN,", output="JSON")); # get the Registry Number(s) (CAS) julia> dct[:InformationList][:Information] -2-element JSON3.Array{JSON3.Object,Array{UInt8,1},SubArray{UInt64,1,Array{UInt64,1},Tuple{UnitRange{Int64}},true}}: +2-element JSON3.Array{JSON3.Object,Array{UInt8,1},SubArray{$UInt,1,Array{$UInt,1},Tuple{UnitRange{$Int}},true}}: { "CID": 135398570, "RN": [ @@ -123,19 +145,25 @@ julia> dct[:InformationList][:Information] function get_for_cids(cids; properties=nothing, xrefs=nothing, - output="CSV", + cids_type=nothing, record_type=nothing, + output="CSV", kwargs...) url = prolog * "compound/cid/" if xrefs === nothing if properties !== nothing url *= canonicalize_properties("property/" * properties) + elseif cids_type !== nothing + url *= "cids/" end else properties === nothing || error("cannot specify both xref and properties in a single query") url *= canonicalize_properties("xrefs/" * xrefs) end url = joinpath(url, output) + if cids_type !== nothing + url *= "?cids_type=" * cids_type + end if record_type !== nothing url *= "?record_type=" * record_type end diff --git a/test/http_record/sodium_acetate_parent.bson b/test/http_record/sodium_acetate_parent.bson new file mode 100644 index 0000000..610f16f Binary files /dev/null and b/test/http_record/sodium_acetate_parent.bson differ diff --git a/test/runtests.jl b/test/runtests.jl index 60a19a0..1c945b9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -70,6 +70,11 @@ BrokenRecord.configure!(; path="http_record") flds = split(line) @test parse(Float32, flds[1]) != 0 && parse(Float32, flds[2]) != 0 && parse(Float32, flds[3]) != 0 + # parent compounds (sodium acetate is 517045, acetic acid is 176) + sleep(5.0 * get_recordings) + str = String(playback(() -> get_for_cids(517045; cids_type="parent", output="TXT"), "sodium_acetate_parent.bson")) + @test parse(Int, chomp(str)) == 176 + # xrefs sleep(4.0 * get_recordings) # next one is two requests cids = [playback(() -> get_cid(name="cyclic guanosine monophosphate"), "cGMP_cid.bson")