diff --git a/build.sbt b/build.sbt
index 23941fd..167647e 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,4 +1,4 @@
-name := "import"
+name := "release.generic"
 organization := "bio4j"
 description := "generic bio4j data import"
 
@@ -7,24 +7,29 @@ bucketSuffix := "era7.com"
 scalaVersion := "2.11.8"
 
 libraryDependencies ++= Seq (
-  "bio4j"                   % "bio4j"              % "0.12.0-227-g60cce98",
-  "bio4j"                  %% "data-uniprot"       % "0.1.1",
-  "org.scala-lang.modules" %% "scala-xml"          % "1.0.5",
-  "org.scala-lang.modules" %% "scala-java8-compat" % "0.8.0-RC3",
-  "ohnosequences"          %% "fastarious"         % "0.6.0"
-) ++ testDependencies
-
-lazy val testDependencies = Seq (
-  "org.scalatest" %% "scalatest" % "2.2.6" % Test
+  "bio4j"                   % "bio4j"              % "0.12.0-227-g60cce98",
+  "bio4j"                  %% "data-uniprot"       % "0.1.1",
+  "org.scala-lang.modules" %% "scala-xml"          % "1.0.6",
+  "org.scala-lang.modules" %% "scala-java8-compat" % "0.8.0",
+  "ohnosequences"          %% "fastarious"         % "0.6.0",
+  "ohnosequences"          %% "statika"            % "2.0.0-M5",
+  "org.scalatest"          %% "scalatest"          % "2.2.6" % Test
 )
 
 dependencyOverrides := Set (
-  "org.scala-lang.modules" %% "scala-xml"     % "1.0.5",
-  "org.scala-lang"          % "scala-library" % "2.11.8",
-  "com.github.pathikrit"   %% "better-files"  % "2.13.0"
+  // keep in sync with libraryDependencies: an override wins over the declared version
+  "org.scala-lang.modules" %% "scala-xml"     % "1.0.6",
+  // "org.scala-lang"          % "scala-library" % "2.11.8",
+  "com.github.pathikrit"   %% "better-files"  % "2.16.0"
 )
 
-wartremoverExcluded ++= Seq(
-  baseDirectory.value/"src"/"main"/"scala"/"uniprot"/"uniprotEntry.scala",
-  baseDirectory.value/"src"/"test"/"scala"/"ncbiTaxonomy.scala"
-)
+wartremoverErrors in (Compile, compile) := Seq()
+// wartremoverExcluded ++= Seq(
+//   baseDirectory.value/"src"/"main"/"scala"/"uniprot"/"uniprotEntry.scala",
+//   baseDirectory.value/"src"/"test"/"scala"/"ncbiTaxonomy.scala"
+// )
+
+generateStatikaMetadataIn(Compile)
+
+// This turns on fat-jar publishing during release process:
+publishFatArtifact in Release := true
diff --git a/project/build.properties b/project/build.properties
index 35c88ba..27e88aa 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version=0.13.12
+sbt.version=0.13.13
diff --git a/project/plugins.sbt b/project/plugins.sbt
index c04e7c2..24cdb7f 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,3 +1,6 @@
-resolvers += "Era7 maven releases" at "https://s3-eu-west-1.amazonaws.com/releases.era7.com"
+resolvers ++= Seq(
+  "Era7 maven releases" at "https://s3-eu-west-1.amazonaws.com/releases.era7.com",
+  "repo.jenkins-ci.org" at "https://repo.jenkins-ci.org/public"
+)
 
-addSbtPlugin("ohnosequences" % "nice-sbt-settings" % "0.8.0-RC2")
+addSbtPlugin("ohnosequences" % "nice-sbt-settings" % "0.8.0-RC4")
diff --git a/src/main/scala/bundles.scala b/src/main/scala/bundles.scala
new file mode 100644
index 0000000..605b77e
--- /dev/null
+++ b/src/main/scala/bundles.scala
@@ -0,0 +1,134 @@
+package com.bio4j.data
+
+import ohnosequences.statika._
+import com.amazonaws.auth._
+import ohnosequences.awstools._, s3._
+import com.amazonaws.services.s3.transfer._
+import java.net.URL
+import sys.process._
+import better.files._
+
+/** Generic statika bundles for downloading raw data and mirroring it through S3. */
+case object bundles {
+
+  /** S3 folder below which every data source gets its own sub-folder. */
+  val s3ReleasesPrefix = S3Folder("eu-west-1.raw.bio4j.com", "data/2016_11/")
+
+  /** Downloads `urls` into `baseDirectory`, optionally gunzipping on the fly.
+    *
+    * @param urls          remote files to fetch
+    * @param baseDirectory local directory that receives the files
+    * @param gunzip        when true, `.gz` URLs are decompressed and stored without the suffix
+    */
+  abstract class GetRawData(
+    val urls: Seq[URL],
+    val baseDirectory: File,
+    val gunzip: Boolean
+  )(deps: AnyBundle*) extends Bundle(deps: _*) {
+
+    /** Local target of `url`: last path segment (minus `.gz` if gunzipping) in `baseDirectory`. */
+    def destination(url: URL): File = {
+      // `getFile` is the whole URL path ("/pub/.../enzyme.dat"); resolving that absolute
+      // path against baseDirectory would escape it, so only the last segment is kept
+      val urlPath  = url.getFile
+      val fileName = urlPath.substring(urlPath.lastIndexOf('/') + 1)
+      val name =
+        if (gunzip && fileName.endsWith(".gz")) fileName.stripSuffix(".gz")
+        else fileName
+
+      baseDirectory / name
+    }
+
+    /** Local destinations of all `urls`, in the same order. */
+    lazy val files: Seq[File] = urls.map(destination)
+
+    /** Opens `url`, wrapping the stream in a gzip decoder when `gunzip` applies to it. */
+    def inputStream(url: URL) = {
+      val stream = url.openStream
+      if (gunzip && url.getFile.endsWith(".gz")) stream.gzipped
+      else stream
+    }
+
+    def instructions: AnyInstructions = {
+      LazyTry {
+        // done here rather than in `destination`, so that merely referring to `files`
+        // (e.g. as a constructor argument of another bundle) never touches the disk
+        baseDirectory.createIfNotExists(asDirectory = true)
+
+        for {
+          url  <- urls
+          inS  <- inputStream(url).autoClosed
+          outS <- destination(url).createIfNotExists().outputStream
+        } yield inS pipeTo outS
+        // TODO: some retry logic?
+      } ->-
+      say(s"Files are downloaded to ${baseDirectory}")
+    }
+
+  }
+
+
+  /** Uploads `files` to `s3folder`, each one under its own file name.
+    *
+    * @param files    local files to upload
+    * @param s3folder destination S3 folder
+    */
+  abstract class CopyToS3(
+    val files: Seq[File],
+    val s3folder: S3Folder
+  )(deps: AnyBundle*) extends Bundle(deps: _*) {
+
+    lazy val s3client = S3.create(new InstanceProfileCredentialsProvider())
+    lazy val transferManager = new TransferManager(s3client.s3)
+
+    def instructions: AnyInstructions = {
+
+      LazyTry {
+        // shut the transfer manager down even when an upload fails
+        try files.foreach { file =>
+
+          val target = s3folder / file.name
+
+          transferManager.upload(
+            target.bucket, target.key,
+            file.toJava
+          ).waitForCompletion
+        }
+        finally transferManager.shutdownNow()
+      } ->-
+      say(s"Files are uploaded to ${s3folder.url}")
+    }
+
+  }
+
+
+  /** Downloads the S3 folder filled by `s3copy` into `baseDirectory`.
+    *
+    * @param s3copy        upload bundle whose `s3folder` is mirrored
+    * @param baseDirectory local directory that receives the download
+    */
+  abstract class GetS3Copy(
+    val s3copy: CopyToS3,
+    val baseDirectory: File
+  )(deps: AnyBundle*) extends Bundle(deps: _*) {
+
+    lazy val s3client = S3.create(new InstanceProfileCredentialsProvider())
+    lazy val transferManager = new TransferManager(s3client.s3)
+
+    def instructions: AnyInstructions = {
+      LazyTry {
+        // NOTE(review): downloadDirectory seems to recreate the key prefix below the
+        // destination; confirm files land directly in baseDirectory as subclasses assume.
+        // The transfer manager is shut down even when the download fails.
+        try transferManager.downloadDirectory(
+          s3copy.s3folder.bucket, s3copy.s3folder.key,
+          baseDirectory.toJava
+        ).waitForCompletion
+        finally transferManager.shutdownNow()
+      } ->-
+      say(s"Files are downloaded to ${baseDirectory}")
+    }
+
+  }
+
+}
diff --git a/src/main/scala/enzyme/bundles.scala b/src/main/scala/enzyme/bundles.scala
new file mode 100644
index 0000000..13e3cd9
--- /dev/null
+++ b/src/main/scala/enzyme/bundles.scala
@@ -0,0 +1,44 @@
+package com.bio4j.data.enzyme
+
+import com.bio4j.data.bundles._
+import java.net.URL
+import better.files._
+
+/** Download, S3 upload and S3 mirror bundles for the ENZYME data files. */
+case object bundles {
+
+  /** Names of the ENZYME files handled by these bundles. */
+  case object fileNames {
+    val enzyme   = "enzyme.dat"
+    val enzclass = "enzclass.txt"
+  }
+
+  /** Fetches both files from the EBI FTP server as they are (no gunzipping). */
+  case object rawData extends GetRawData(
+    urls = Seq(
+      fileNames.enzyme,
+      fileNames.enzclass
+    ).map { suffix =>
+      new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/enzyme/release/${suffix}")
+    },
+    baseDirectory = file"/media/ephemeral0/data/enzyme/",
+    gunzip = false
+  )()
+
+  /** Uploads the fetched files to the `enzyme` folder below `s3ReleasesPrefix`. */
+  case object copyData extends CopyToS3(
+    rawData.files,
+    s3ReleasesPrefix / "enzyme" /
+  )()
+
+  /** Local copy of what `copyData` uploaded. */
+  case object mirroredData extends GetS3Copy(
+    copyData,
+    file"/media/ephemeral0/data/enzyme/"
+  )() {
+
+    val enzyme   = baseDirectory / fileNames.enzyme
+    val enzclass = baseDirectory / fileNames.enzclass
+  }
+
+}
diff --git a/src/main/scala/go/bundles.scala b/src/main/scala/go/bundles.scala
new file mode 100644
index 0000000..219ec56
--- /dev/null
+++ b/src/main/scala/go/bundles.scala
@@ -0,0 +1,43 @@
+package com.bio4j.data.go
+
+import com.bio4j.data.bundles._
+import java.net.URL
+import better.files._
+
+/** Download, S3 upload and S3 mirror bundles for the Gene Ontology term database. */
+case object bundles {
+
+  /** Path segment of the archive that selects the release to download. */
+  val release: String = "latest"
+
+  /** Names of the GO files handled by these bundles (without the `.gz` suffix). */
+  case object fileNames {
+    val obo = "go_daily-termdb.obo-xml"
+  }
+
+  /** Fetches the gzipped OBO-XML dump and stores it decompressed. */
+  case object rawData extends GetRawData(
+    urls = Seq(
+      // NOTE: this is the daily automatic build; not sure it is the source we want
+      new URL("http", "archive.geneontology.org", s"/termdb/${release}/${fileNames.obo}.gz")
+    ),
+    baseDirectory = file"/media/ephemeral0/data/go/",
+    gunzip = true
+  )()
+
+  /** Uploads the fetched file to the `go` folder below `s3ReleasesPrefix`. */
+  case object copyData extends CopyToS3(
+    rawData.files,
+    s3ReleasesPrefix / "go" /
+  )()
+
+  /** Local copy of what `copyData` uploaded. */
+  case object mirroredData extends GetS3Copy(
+    copyData,
+    file"/media/ephemeral0/data/go/"
+  )() {
+
+    val obo = baseDirectory / fileNames.obo
+  }
+
+}
diff --git a/src/main/scala/ncbiTaxonomy/bundles.scala b/src/main/scala/ncbiTaxonomy/bundles.scala
new file mode 100644
index 0000000..f44c5fe
--- /dev/null
+++ b/src/main/scala/ncbiTaxonomy/bundles.scala
@@ -0,0 +1,48 @@
+package com.bio4j.data.ncbiTaxonomy
+
+import com.bio4j.data.bundles._
+import java.net.URL
+import better.files._
+
+/** Download, S3 upload and S3 mirror bundles for the NCBI taxonomy dump. */
+case object bundles {
+
+  /** Names of the taxonomy dump files handled by these bundles. */
+  case object fileNames {
+    val nodes = "nodes.dmp"
+    val names = "names.dmp"
+  }
+
+  /** Fetches `taxdump.tar.gz` and stores it gunzipped. */
+  case object rawData extends GetRawData(
+    urls = Seq(
+      new URL("ftp", "ftp.ncbi.nih.gov", "/pub/taxonomy/taxdump.tar.gz")
+    ),
+    baseDirectory = file"/media/ephemeral0/data/ncbiTaxonomy/",
+    gunzip = true
+  )() {
+
+    // NOTE(review): GetRawData only gunzips, so the download is stored as taxdump.tar;
+    // nothing visible here unpacks it into taxdump/, so these two files may never exist.
+    // TODO: add an untar step before copyData runs
+    val nodes = baseDirectory / "taxdump" / fileNames.nodes
+    val names = baseDirectory / "taxdump" / fileNames.names
+  }
+
+  /** Uploads the two dump files to the `ncbiTaxonomy` folder below `s3ReleasesPrefix`. */
+  case object copyData extends CopyToS3(
+    Seq(rawData.nodes, rawData.names),
+    s3ReleasesPrefix / "ncbiTaxonomy" /
+  )()
+
+  /** Local copy of what `copyData` uploaded. */
+  case object mirroredData extends GetS3Copy(
+    copyData,
+    file"/media/ephemeral0/data/ncbiTaxonomy/"
+  )() {
+
+    val nodes = baseDirectory / fileNames.nodes
+    val names = baseDirectory / fileNames.names
+  }
+
+}
diff --git a/src/main/scala/uniprot/bundles.scala b/src/main/scala/uniprot/bundles.scala
new file mode 100644
index 0000000..3af0170
--- /dev/null
+++ b/src/main/scala/uniprot/bundles.scala
@@ -0,0 +1,39 @@
+package com.bio4j.data.uniprot
+
+import com.bio4j.data.bundles._
+import java.net.URL
+import better.files._
+
+/** Download bundle for the UniProt knowledgebase files. */
+case object bundles {
+
+  // NOTE: only old releases have a date-tag
+  val release = "current_release"
+
+  /** Names of the UniProt files handled here (without the `.gz` suffix). */
+  case object fileNames {
+    val sprot    = "uniprot_sprot.dat"            // 517MB gz
+    val trembl   = "uniprot_trembl.dat"           // 38.9GB gz
+    val varsplic = "uniprot_sprot_varsplic.fasta" // 7.7MB gz
+  }
+
+  /** Fetches every file listed in `fileNames` and stores it decompressed. */
+  // TODO: probably it's better to make 3 separate data and import bundles
+  case object rawData extends GetRawData(
+    urls = Seq(
+      fileNames.sprot,
+      fileNames.trembl,
+      fileNames.varsplic
+    ).map { suffix =>
+      new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/uniprot/${release}/knowledgebase/complete/${suffix}.gz")
+    },
+    baseDirectory = file"/media/ephemeral0/data/uniprot/",
+    gunzip = true
+  )() {
+
+    val sprot    = baseDirectory / fileNames.sprot
+    val trembl   = baseDirectory / fileNames.trembl
+    val varsplic = baseDirectory / fileNames.varsplic
+  }
+
+}
diff --git a/src/main/scala/uniref/bundles.scala b/src/main/scala/uniref/bundles.scala
new file mode 100644
index 0000000..ed784de
--- /dev/null
+++ b/src/main/scala/uniref/bundles.scala
@@ -0,0 +1,41 @@
+package com.bio4j.data.uniref
+
+import com.bio4j.data.bundles._
+import java.net.URL
+import better.files._
+
+/** Download bundle for the UniRef cluster files. */
+case object bundles {
+
+  // NOTE: only old releases have a date-tag
+  val release = "current_release"
+
+  /** Names of the UniRef files handled here (without the `.gz` suffix). */
+  case object fileNames {
+    val uniref50  = "uniref50.xml"  // 8.5GB gz
+    val uniref90  = "uniref90.xml"  // 15.4GB gz
+    val uniref100 = "uniref100.xml" // 27.7GB gz
+  }
+
+  /** Fetches the three cluster files and stores them decompressed. */
+  // TODO: probably it's better to make 3 separate data and import bundles
+  case object rawData extends GetRawData(
+    urls = Seq(
+      fileNames.uniref50,
+      fileNames.uniref90,
+      fileNames.uniref100
+    ).map { fileName =>
+      // each cluster set sits in a directory named after it, without the `.xml` extension
+      val clusterDir = fileName.stripSuffix(".xml")
+      new URL("ftp", "ftp.ebi.ac.uk", s"/pub/databases/uniprot/${release}/uniref/${clusterDir}/${fileName}.gz")
+    },
+    baseDirectory = file"/media/ephemeral0/data/uniref/",
+    gunzip = true
+  )() {
+
+    val uniref50  = baseDirectory / fileNames.uniref50
+    val uniref90  = baseDirectory / fileNames.uniref90
+    val uniref100 = baseDirectory / fileNames.uniref100
+  }
+
+}