Merge pull request #16 from JuliaHealth/teh/cids_type

Support `cids_type` queries
JuliaHealth · Dec 7, 2020 · f644e4f · f644e4f · timholy · Dec 7, 2020
2 parents d6b318f + 2e12aa6
commit f644e4f
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 20 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "PubChemCrawler"
 uuid = "30e472fa-2b12-4c0b-9705-07d174b7a4e1"
 authors = ["Tim Holy <[email protected]> and contributors"]
-version = "1.0.0"
+version = "1.1.0"
 
 [deps]
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -49,7 +49,7 @@ julia> using CSV, DataFrames
 julia> df = CSV.File(get_for_cids(2244; properties="MolecularFormula,MolecularWeight,XLogP,IsomericSMILES", output="CSV")) |> DataFrame
 1×5 DataFrame
 │ Row │ CID   │ MolecularFormula │ MolecularWeight │ XLogP   │ IsomericSMILES           │
-│     │ Int64 │ String           │ Float64         │ Float64 │ String                   │
+│     │ Int64 │ String           │ Float64         │ Float64 │ String                   │
 ├─────┼───────┼──────────────────┼─────────────────┼─────────┼──────────────────────────┤
 │ 1   │ 2244  │ C9H8O4           │ 180.16          │ 1.2     │ CC(=O)OC1=CC=CC=C1C(=O)O │
 ```
@@ -65,13 +65,13 @@ julia> open("/tmp/aspirin.sdf", "w") do io
 3637
 ```
 
-Finally, you can perform substructure searches. Let's retrieve up to 10 [bicyclic](https://en.wikipedia.org/wiki/Bicyclic_molecule) compounds:
+Finally, you can perform substructure searches. Let's retrieve up to 10 [bicyclic](https://en.wikipedia.org/wiki/Bicyclic_molecule) compounds using a [SMARTS](https://en.wikipedia.org/wiki/SMILES_arbitrary_target_specification) search:
 
 ```julia
 julia> cids = query_substructure_pug(smarts = "[\$([*R2]([*R])([*R])([*R]))].[\$([*R2]([*R])([*R])([*R]))]", maxhits = 10)
 ┌ Warning: maxhits was hit, results are partial
 └ @ PubChemCrawler ~/.julia/dev/PubChemCrawler/src/pugxml.jl:164
-10-element Vector{Int64}:
+10-element Vector{Int64}:
  135398658
    5280795
       5430

diff --git a/src/pugxml.jl b/src/pugxml.jl
@@ -12,7 +12,7 @@ Retrieve a list of compounds containing a substructure specified via its `cid`,
 julia> using PubChemCrawler
 
 julia> cids = query_substructure_pug(smarts="[r13]Br")   # query brominated 13-atom rings
-66-element Vector{Int64}:
+66-element Vector{$Int}:
   54533707
  153064026
  152829033

diff --git a/src/query.jl b/src/query.jl
@@ -30,8 +30,6 @@ The output is a `Vector{UInt8}`. For `output="CSV"`, a good choice to generate a
 `DataFrame(CSV.File(msg))` from the DataFrames and CSV packages, respectively.
 Alternatively `String(msg)` will convert it to a string, which you can write to a file.
 
-For complex queries that risk timing out, consider [`query_substructure_pug`](@ref).
-
 # Example
 
 ```
@@ -43,7 +41,7 @@ julia> cid = get_cid(name="estriol")
 julia> df = CSV.File(query_substructure(;cid)) |> DataFrame      # on Julia 1.0, use `(;cid=cid)`
 11607×4 DataFrame
 │ Row  │ CID       │ MolecularFormula │ MolecularWeight │ XLogP    │
-│      │ Int64     │ String           │ Float64         │ Float64? │
+│      │ $Int     │ String           │ Float64         │ Float64? │
 ├──────┼───────────┼──────────────────┼─────────────────┼──────────┤
 │ 1    │ 5756      │ C18H24O3         │ 288.4           │ 2.5      │
 │ 2    │ 5281904   │ C24H32O9         │ 464.5           │ 1.1      │
@@ -52,6 +50,9 @@ julia> df = CSV.File(query_substructure(;cid)) |> DataFrame      # on Julia 1.0,
 ```
 
 will query for derivatives of [estriol](https://en.wikipedia.org/wiki/Estriol).
+
+!!! info
+    For complex queries that risk timing out, consider [`query_substructure_pug`](@ref) in combination with [`get_for_cids`](@ref).
 """
 function query_substructure(;cid=nothing, smiles=nothing, smarts=nothing,          # inputs
                              properties="MolecularFormula,MolecularWeight,XLogP,", # http://pubchemdocs.ncbi.nlm.nih.gov/pug-rest, "Compound Property Tables"
@@ -76,29 +77,50 @@ function query_substructure(;cid=nothing, smiles=nothing, smarts=nothing,
 end
 
 """
-    msg = get_for_cids(cids; properties=nothing, xrefs=nothing, output="CSV")
+    msg = get_for_cids(cids; properties|xrefs|cids_type|record_type, output="CSV")
 
-Retrieve the given `properties` or `xrefs` for a list of compounds specified by their `cids`.
+Retrieve the given `properties`, `xrefs`, CIDs, or records, respectively, for a list of compounds specified by their `cids`.
+The documentation for these query options can be found at https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest; this URL will be referred to as
+PUGREST below.
 
-See [`query_substructure`](@ref) for information about the arguments and return value.
-The supported values for `xrefs` are available at https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest under "XRefs".
+- `properties` include structural features like the molecular formula, number of undefined stereocenters, and so on.
+  Specify these as a comma-separated list from among the choices in PUGREST under "Compound Property Tables".
+- `xrefs` ("cross-references") include identifiers used by other databases, e.g., the CAS (Registry) number, PubMedID, and so on.
+  The supported values for `xrefs` are available at PUGREST under "XRefs".
+- `cids_type` is used to retrieve CIDs for compounds related to those specified in `cids`, e.g., `cids_type="parent"` for the parent compound; see PUGREST under "SIDS / CIDS / AIDS".
+- `record_type` is used to retrieve data files and to specify options for these files, e.g., 2d or 3d SDF files.
+  See PUGREST under "Full-record Retrieval".
 
-# Example
+`output` specifies the output format. Not all options are applicable to all queries; for example, "CSV" is appropriate for
+`properties` queries but "SDF" might be used for a `record_type` query. See PUGREST, "Output".
+
+# Examples
 
 ```
-julia> using PubChem, JSON3
+julia> using PubChemCrawler, CSV, DataFrames, JSON3
 
 julia> cids = [get_cid(name="cyclic guanosine monophosphate"), get_cid(name="aspirin")]
-2-element Array{Int64,1}:
+2-element Array{$Int,1}:
  135398570
       2244
 
-julia> dct = JSON3.read(get_for_cids(cids; xrefs="RN,", output="JSON"))   # get the Registry Number(s) (CAS)
-JSON3.Object{Array{UInt8,1},Array{UInt64,1}} with 1 entry:
-  :InformationList => {…
+julia> CSV.File(get_for_cids(cids; properties="MolecularFormula,XLogP", output="CSV")) |> DataFrame
+2×3 DataFrame
+ Row │ CID        MolecularFormula  XLogP
+     │ $Int      String            Float64
+─────┼──────────────────────────────────────
+   1 │ 135398570  C10H12N5O7P          -3.4
+   2 │      2244  C9H8O4                1.2
+
+julia> open("/tmp/aspirin_3d.sdf", "w") do io    # save the 3d SDF file for aspirin (CID 2244)
+           write(io, get_for_cids(2244; record_type="3d", output="SDF"))
+       end
+4055
+
+julia> dct = JSON3.read(get_for_cids(cids; xrefs="RN,", output="JSON"));   # get the Registry Number(s) (CAS)
 
 julia> dct[:InformationList][:Information]
-2-element JSON3.Array{JSON3.Object,Array{UInt8,1},SubArray{UInt64,1,Array{UInt64,1},Tuple{UnitRange{Int64}},true}}:
+2-element JSON3.Array{JSON3.Object,Array{UInt8,1},SubArray{$UInt,1,Array{$UInt,1},Tuple{UnitRange{$Int}},true}}:
  {
    "CID": 135398570,
     "RN": [
@@ -123,19 +145,25 @@ julia> dct[:InformationList][:Information]
 function get_for_cids(cids;
                       properties=nothing,
                       xrefs=nothing,
-                      output="CSV",
+                      cids_type=nothing,
                       record_type=nothing,
+                      output="CSV",
                       kwargs...)
     url = prolog * "compound/cid/"
     if xrefs === nothing
         if properties !== nothing
             url *= canonicalize_properties("property/" * properties)
+        elseif cids_type !== nothing
+            url *= "cids/"
         end
     else
         properties === nothing || error("cannot specify both xref and properties in a single query")
         url *= canonicalize_properties("xrefs/" * xrefs)
     end
     url = joinpath(url, output)
+    if cids_type !== nothing
+        url *= "?cids_type=" * cids_type
+    end
     if record_type !== nothing
         url *= "?record_type=" * record_type
     end

diff --git a/test/http_record/sodium_acetate_parent.bson b/test/http_record/sodium_acetate_parent.bson
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -70,6 +70,11 @@ BrokenRecord.configure!(; path="http_record")
     flds = split(line)
     @test parse(Float32, flds[1]) != 0 && parse(Float32, flds[2]) != 0 && parse(Float32, flds[3]) != 0
 
+    # parent compounds (sodium acetate is 517045, acetic acid is 176)
+    sleep(5.0 * get_recordings)
+    str = String(playback(() -> get_for_cids(517045; cids_type="parent", output="TXT"), "sodium_acetate_parent.bson"))
+    @test parse(Int, chomp(str)) == 176
+
     # xrefs
     sleep(4.0 * get_recordings)  # next one is two requests
     cids = [playback(() -> get_cid(name="cyclic guanosine monophosphate"), "cGMP_cid.bson")