From 3ecf6bd708c50b8e00e27031feded802d32d2a7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 9 Jul 2020 15:04:22 +0200 Subject: [PATCH 01/24] First shot at POM reading, extracts PublicationDate, Description and Name. --- .../crawler/preprocessing/MavenArtifact.scala | 18 +++++++++++- .../preprocessing/MavenDownloadActor.scala | 29 +++++++++++++++---- .../preprocessing/MavenDownloader.scala | 2 +- .../crawler/preprocessing/package.scala | 4 +-- .../processing/MavenDependencyActor.scala | 5 ++-- .../delphi/crawler/tools/HttpDownloader.scala | 18 +++++++++++- .../delphi/crawler/preprocessing/Common.scala | 5 ++-- .../MavenDownloadActorTest.scala | 9 ++++-- .../preprocessing/MavenDownloaderSpec.scala | 2 +- 9 files changed, 74 insertions(+), 18 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index 3025eff..eac35d6 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -16,6 +16,22 @@ package de.upb.cs.swt.delphi.crawler.preprocessing +import java.io.ByteArrayInputStream + import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import org.apache.maven.model.io.xpp3.MavenXpp3Reader +import org.joda.time.DateTime + +case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile, metadata: MavenArtifactMetadata) + +case class MavenArtifactMetadata(publicationDate: DateTime, name: String, description: String) + +object MavenArtifactMetadata { + def readFromPom(pubDate: DateTime, pomFile: PomFile): Option[MavenArtifactMetadata] = { + val pomReader: MavenXpp3Reader = new MavenXpp3Reader() + + val pomObj = pomReader.read(new ByteArrayInputStream(pomFile.content)) -case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile) + Some(MavenArtifactMetadata(pubDate, pomObj.getName, pomObj.getDescription)) + } +} diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala index 52be089..c96f76d 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala @@ -16,11 +16,15 @@ package de.upb.cs.swt.delphi.crawler.preprocessing +import java.util.Locale + import akka.actor.{Actor, ActorLogging, ActorSystem, Props} import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier import de.upb.cs.swt.delphi.crawler.tools.HttpDownloader +import org.joda.time.DateTime +import org.joda.time.format.DateTimeFormat -import scala.util.{Failure, Success} +import scala.util.{Failure, Success, Try} class MavenDownloadActor extends Actor with ActorLogging { override def receive: Receive = { @@ -30,14 +34,29 @@ class MavenDownloadActor extends Actor with ActorLogging { val downloader = new HttpDownloader val jarStream = downloader.downloadFromUri(m.toJarLocation.toString()) - val pomStream = downloader.downloadFromUri(m.toPomLocation.toString()) + val pomResponse = downloader.downloadFromUriWithHeaders(m.toPomLocation.toString()) jarStream match { case Success(jar) => { - pomStream match { - case Success(pom) => { + pomResponse match { + case Success((pomStream, pomHeaders)) => { log.info(s"Downloaded $m") - sender() ! Success(MavenArtifact(m, JarFile(jar, m.toJarLocation.toURL), PomFile(pom))) + + // Extract and parse publication date from header + val datePattern = DateTimeFormat.forPattern("E, dd MMM yyyy HH:mm:ss zzz").withLocale(Locale.ENGLISH) + val pomPublicationDate = pomHeaders.find( _.lowercaseName().equals("last-modified") ) + .map( header => Try(datePattern.parseDateTime(header.value())) ) match { + case Some(Success(date)) => Some(date) + case Some(Failure(x)) => x.printStackTrace(); None + case _ => None + } + + val pomFile = PomFile(Stream.continually(pomStream.read).takeWhile(_ != -1).map(_.toByte).toArray) + + // Build and initialize metadata from POM + val metadata = MavenArtifactMetadata.readFromPom(pomPublicationDate.orNull, pomFile).orNull + + sender() ! Success(MavenArtifact(m, JarFile(jar, m.toJarLocation.toURL), pomFile, metadata)) } case Failure(e) => { // TODO: push error to actor diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloader.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloader.scala index c17d761..1650059 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloader.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloader.scala @@ -51,7 +51,7 @@ class MavenDownloader(identifier: MavenIdentifier) { } def downloadPom(): PomFile = { - PomFile(pomResource.read()) + PomFile(Stream.continually(pomResource.read().read).takeWhile(_ != -1).map(_.toByte).toArray) } def downloadMeta(): MetaFile = { diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/package.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/package.scala index 26e189b..d7d89b6 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/package.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/package.scala @@ -33,9 +33,9 @@ package object preprocessing { /** * Used for identification (Pattern matching) of pom file - * @param is pom file stream + * @param content Pom File Content as Byte Array */ - case class PomFile(is: InputStream) + case class PomFile(content: Array[Byte]) /** * Used for identification (Pattern matching) of metadata file diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala index 040cb55..1f6a6f7 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala @@ -1,5 +1,7 @@ package de.upb.cs.swt.delphi.crawler.processing +import java.io.ByteArrayInputStream + import akka.actor.{Actor, ActorLogging, Props} import de.upb.cs.swt.delphi.crawler.Configuration import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier @@ -91,8 +93,7 @@ class MavenDependencyActor(configuration: Configuration) extends Actor with Acto } def getDependencies(pomFile: PomFile): Set[MavenIdentifier] = { - val pomObj = pomReader.read(pomFile.is) - pomFile.is.close() + val pomObj = pomReader.read(new ByteArrayInputStream(pomFile.content)) val pomSet = pomObj.getDependencies.asScala.toSet[Dependency].map(resolveIdentifier(_, pomObj)) for (util.Success(id) <- pomSet) yield id diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpDownloader.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpDownloader.scala index 452b6cf..f43aec5 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpDownloader.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpDownloader.scala @@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit import akka.actor.ActorSystem import akka.http.scaladsl.Http -import akka.http.scaladsl.model.{HttpRequest, HttpResponse, StatusCodes} +import akka.http.scaladsl.model.{HttpHeader, HttpRequest, HttpResponse, StatusCodes} import akka.stream.ActorMaterializer import akka.stream.scaladsl.{Sink, StreamConverters} import akka.util.ByteString @@ -48,4 +48,20 @@ class HttpDownloader(implicit val system: ActorSystem) { Failure(new HttpException(code)) } } + + def downloadFromUriWithHeaders(requestedUri: String): Try[(InputStream, Seq[HttpHeader])] = { + val responseFuture: Future[HttpResponse] = + Http().singleRequest(HttpRequest(uri = requestedUri)) + + + Await.result(responseFuture, Duration.Inf) match { + case HttpResponse(StatusCodes.OK, headers, entity, _) => + Try(( + new ByteArrayInputStream(Await.result(entity.dataBytes.runFold(ByteString.empty)(_ ++ _).map(_.toArray), Duration.Inf)), + headers)) + case resp@HttpResponse(code, _, _, _) => + resp.discardEntityBytes() + Failure(new HttpException(code)) + } + } } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala index f5cc556..51abd4d 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala @@ -44,11 +44,10 @@ object Common { assert(jarPath.toFile.exists()) assert(jarPath.toFile.length() > 0) } - def checkPom(is:InputStream):Unit={ - val pomBytes = inputStreamToBytes(is) + def checkPom(content: Array[Byte]):Unit={ val tmpDir = System.getProperty("java.io.tmpdir") val pomPath = Paths.get(tmpDir).resolve("pom.xml") - Files.write(pomPath, pomBytes) + Files.write(pomPath, content) assert(pomPath.toFile.exists()) assert(pomPath.toFile.length() > 0) } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala index 022369e..fb61d4d 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala @@ -44,7 +44,7 @@ class MavenDownloadActorTest extends TestKit(ActorSystem("DownloadActor")) "The maven download actor" must { "create a maven artifact with a jar and pom file" in { - val mavenIdentifier = new MavenIdentifier("http://central.maven.org/maven2/", "junit", "junit", "4.12") + val mavenIdentifier = new MavenIdentifier("https://repo1.maven.org/maven2/", "junit", "junit", "4.12") val downloadActor = system.actorOf(MavenDownloadActor.props) implicit val timeout = Timeout(10 seconds) @@ -57,8 +57,13 @@ class MavenDownloadActorTest extends TestKit(ActorSystem("DownloadActor")) assert(msg.isInstanceOf[Success[MavenArtifact]]) val artifact = msg.asInstanceOf[Success[MavenArtifact]].get checkJar(artifact.jarFile.is) - checkPom(artifact.pomFile.is) + checkPom(artifact.pomFile.content) + val metadata = artifact.metadata + assert(metadata != null) + assert(metadata.publicationDate != null) + assert(metadata.description != null && !metadata.description.isEmpty) + assert(metadata.name != null && !metadata.name.isEmpty) } } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloaderSpec.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloaderSpec.scala index 8835857..009f91f 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloaderSpec.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloaderSpec.scala @@ -30,7 +30,7 @@ class MavenDownloaderSpec extends FlatSpec with Matchers { val mavenIdentifier = new MavenIdentifier("http://central.maven.org/maven2/", "junit", "junit", "4.12") val downloader = new MavenDownloader(mavenIdentifier) val pomStream = downloader.downloadPom() - checkPom(pomStream.is) + checkPom(pomStream.content) } } From e96b08e925b474814407016f78ecbe68e2f316f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 9 Jul 2020 15:42:43 +0200 Subject: [PATCH 02/24] Made style of pom file processing more inline with rest of application. --- .../maven/MavenDiscoveryProcess.scala | 5 ++- .../crawler/preprocessing/MavenArtifact.scala | 18 +++----- .../preprocessing/MavenDownloadActor.scala | 9 +--- .../preprocessing/MavenDownloader.scala | 2 +- .../preprocessing/PomFileReadActor.scala | 44 +++++++++++++++++++ .../crawler/preprocessing/package.scala | 4 +- .../processing/MavenDependencyActor.scala | 2 +- .../delphi/crawler/preprocessing/Common.scala | 5 ++- .../MavenDownloadActorTest.scala | 10 ++--- .../preprocessing/MavenDownloaderSpec.scala | 2 +- 10 files changed, 67 insertions(+), 34 deletions(-) create mode 100644 src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala index 14c3825..fc9fe35 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala @@ -28,7 +28,7 @@ import de.upb.cs.swt.delphi.crawler.{AppLogging, Configuration} import de.upb.cs.swt.delphi.crawler.control.Phase import de.upb.cs.swt.delphi.crawler.control.Phase.Phase import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} -import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActor} +import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor, PomFileReadActor} import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults} import de.upb.cs.swt.delphi.crawler.storage.ArtifactExistsQuery import de.upb.cs.swt.delphi.crawler.tools.NotYetImplementedException @@ -57,6 +57,7 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) private val seen = mutable.HashSet[MavenIdentifier]() val downloaderPool = system.actorOf(SmallestMailboxPool(8).props(MavenDownloadActor.props)) + val pomReaderPool = system.actorOf(SmallestMailboxPool(8).props(PomFileReadActor.props)) val hermesPool = system.actorOf(SmallestMailboxPool(configuration.hermesActorPoolSize).props(HermesActor.props())) override def phase: Phase = Phase.Discovery @@ -88,6 +89,8 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) .alsoTo(createSinkFromActorRef[MavenIdentifier](elasticPool)) .mapAsync(8)(identifier => (downloaderPool ? identifier).mapTo[Try[MavenArtifact]]) .filter(artifact => artifact.isSuccess) + .mapAsync(parallelism = 8)(artifact => (pomReaderPool ? artifact.get).mapTo[Try[MavenArtifact]]) + .filter(artifact => artifact.isSuccess) .map(artifact => artifact.get) val finalizer = diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index eac35d6..993c542 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -16,22 +16,16 @@ package de.upb.cs.swt.delphi.crawler.preprocessing -import java.io.ByteArrayInputStream - import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier -import org.apache.maven.model.io.xpp3.MavenXpp3Reader import org.joda.time.DateTime -case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile, metadata: MavenArtifactMetadata) - -case class MavenArtifactMetadata(publicationDate: DateTime, name: String, description: String) - -object MavenArtifactMetadata { - def readFromPom(pubDate: DateTime, pomFile: PomFile): Option[MavenArtifactMetadata] = { - val pomReader: MavenXpp3Reader = new MavenXpp3Reader() +case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile, + publicationDate: Option[DateTime], metadata: Option[MavenArtifactMetadata]) - val pomObj = pomReader.read(new ByteArrayInputStream(pomFile.content)) +case class MavenArtifactMetadata(name: String, description: String) - Some(MavenArtifactMetadata(pubDate, pomObj.getName, pomObj.getDescription)) +object MavenArtifact{ + def withMetadata(artifact: MavenArtifact, metadata: MavenArtifactMetadata): MavenArtifact = { + MavenArtifact(artifact.identifier, artifact.jarFile, artifact.pomFile, artifact.publicationDate, Some(metadata)) } } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala index c96f76d..cf9aea9 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala @@ -21,7 +21,6 @@ import java.util.Locale import akka.actor.{Actor, ActorLogging, ActorSystem, Props} import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier import de.upb.cs.swt.delphi.crawler.tools.HttpDownloader -import org.joda.time.DateTime import org.joda.time.format.DateTimeFormat import scala.util.{Failure, Success, Try} @@ -51,12 +50,8 @@ class MavenDownloadActor extends Actor with ActorLogging { case _ => None } - val pomFile = PomFile(Stream.continually(pomStream.read).takeWhile(_ != -1).map(_.toByte).toArray) - - // Build and initialize metadata from POM - val metadata = MavenArtifactMetadata.readFromPom(pomPublicationDate.orNull, pomFile).orNull - - sender() ! Success(MavenArtifact(m, JarFile(jar, m.toJarLocation.toURL), pomFile, metadata)) + sender() ! Success(MavenArtifact(m, JarFile(jar, m.toJarLocation.toURL), PomFile(pomStream), + pomPublicationDate, None)) } case Failure(e) => { // TODO: push error to actor diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloader.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloader.scala index 1650059..c17d761 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloader.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloader.scala @@ -51,7 +51,7 @@ class MavenDownloader(identifier: MavenIdentifier) { } def downloadPom(): PomFile = { - PomFile(Stream.continually(pomResource.read().read).takeWhile(_ != -1).map(_.toByte).toArray) + PomFile(pomResource.read()) } def downloadMeta(): MetaFile = { diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala new file mode 100644 index 0000000..45f8cd7 --- /dev/null +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala @@ -0,0 +1,44 @@ +// Copyright (C) 2018 The Delphi Team. +// See the LICENCE file distributed with this work for additional +// information regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de.upb.cs.swt.delphi.crawler.preprocessing + +import akka.actor.{Actor, ActorLogging, Props} +import org.apache.maven.model.io.xpp3.MavenXpp3Reader + +import scala.util.Success + +class PomFileReadActor extends Actor with ActorLogging{ + + val pomReader: MavenXpp3Reader = new MavenXpp3Reader() + + override def receive: Receive = { + case artifact@MavenArtifact(_, _ ,PomFile(pomStream), _, _) => { + //TODO: Errorhandling + val pomObject = pomReader.read(pomStream) + pomStream.close() + + val metadata = MavenArtifactMetadata(pomObject.getName, pomObject.getDescription) + + sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) + } + } + +} + +object PomFileReadActor { + def props = Props(new PomFileReadActor) +} diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/package.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/package.scala index d7d89b6..26e189b 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/package.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/package.scala @@ -33,9 +33,9 @@ package object preprocessing { /** * Used for identification (Pattern matching) of pom file - * @param content Pom File Content as Byte Array + * @param is pom file stream */ - case class PomFile(content: Array[Byte]) + case class PomFile(is: InputStream) /** * Used for identification (Pattern matching) of metadata file diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala index 1f6a6f7..b5a1296 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala @@ -93,7 +93,7 @@ class MavenDependencyActor(configuration: Configuration) extends Actor with Acto } def getDependencies(pomFile: PomFile): Set[MavenIdentifier] = { - val pomObj = pomReader.read(new ByteArrayInputStream(pomFile.content)) + val pomObj = pomReader.read(pomFile.is) val pomSet = pomObj.getDependencies.asScala.toSet[Dependency].map(resolveIdentifier(_, pomObj)) for (util.Success(id) <- pomSet) yield id diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala index 51abd4d..4e81f94 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala @@ -44,10 +44,11 @@ object Common { assert(jarPath.toFile.exists()) assert(jarPath.toFile.length() > 0) } - def checkPom(content: Array[Byte]):Unit={ + def checkPom(is: InputStream):Unit={ + val pomBytes = inputStreamToBytes(is) val tmpDir = System.getProperty("java.io.tmpdir") val pomPath = Paths.get(tmpDir).resolve("pom.xml") - Files.write(pomPath, content) + Files.write(pomPath, pomBytes) assert(pomPath.toFile.exists()) assert(pomPath.toFile.length() > 0) } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala index fb61d4d..f8cd7c1 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala @@ -57,14 +57,10 @@ class MavenDownloadActorTest extends TestKit(ActorSystem("DownloadActor")) assert(msg.isInstanceOf[Success[MavenArtifact]]) val artifact = msg.asInstanceOf[Success[MavenArtifact]].get checkJar(artifact.jarFile.is) - checkPom(artifact.pomFile.content) - - val metadata = artifact.metadata - assert(metadata != null) - assert(metadata.publicationDate != null) - assert(metadata.description != null && !metadata.description.isEmpty) - assert(metadata.name != null && !metadata.name.isEmpty) + checkPom(artifact.pomFile.is) + assert(artifact.metadata.isEmpty) + assert(artifact.publicationDate.isDefined && artifact.publicationDate.get != null) } } } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloaderSpec.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloaderSpec.scala index 009f91f..8835857 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloaderSpec.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloaderSpec.scala @@ -30,7 +30,7 @@ class MavenDownloaderSpec extends FlatSpec with Matchers { val mavenIdentifier = new MavenIdentifier("http://central.maven.org/maven2/", "junit", "junit", "4.12") val downloader = new MavenDownloader(mavenIdentifier) val pomStream = downloader.downloadPom() - checkPom(pomStream.content) + checkPom(pomStream.is) } } From b3a2dfab627496fa723793aa21c3ad4005fdf0e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 9 Jul 2020 16:04:53 +0200 Subject: [PATCH 03/24] Add test for pom file reader --- .../preprocessing/PomFileReadActorTest.scala | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala new file mode 100644 index 0000000..90907fa --- /dev/null +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala @@ -0,0 +1,71 @@ +// Copyright (C) 2018 The Delphi Team. +// See the LICENCE file distributed with this work for additional +// information regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de.upb.cs.swt.delphi.crawler.preprocessing + +import akka.actor.ActorSystem +import akka.pattern.ask +import akka.testkit.{ImplicitSender, TestKit} +import akka.util.Timeout +import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike} + +import scala.concurrent.duration._ +import scala.concurrent.{Await, ExecutionContext} +import scala.util.Success + +class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) + with ImplicitSender + with WordSpecLike + with Matchers + with BeforeAndAfterAll { + + override def afterAll { + TestKit.shutdownActorSystem(system) + } + + "The POM file reader actor " must { + "create a maven artifact with valid metadata" in { + val mavenIdentifier = new MavenIdentifier("https://repo1.maven.org/maven2/", "junit", "junit", "4.12") + val downloadActor = system.actorOf(MavenDownloadActor.props) + val readerActor = system.actorOf(PomFileReadActor.props) + + implicit val timeout: Timeout = Timeout(10 seconds) + implicit val ec: ExecutionContext = system.dispatcher + + val f = downloadActor ? mavenIdentifier + + val msg = Await.result(f, 10 seconds) + + assert(msg.isInstanceOf[Success[MavenArtifact]]) + val artifact = msg.asInstanceOf[Success[MavenArtifact]].get + + assert(artifact.metadata.isEmpty) + assert(artifact.publicationDate.isDefined && artifact.publicationDate.get != null) + + val result = Await.result(readerActor ? artifact, 10 seconds) + assert(result.isInstanceOf[Success[MavenArtifact]]) + val annotatedArtifact = result.asInstanceOf[Success[MavenArtifact]].get + + assert(annotatedArtifact.metadata.isDefined) + val metadata = annotatedArtifact.metadata.get + + assert(metadata.name != null && metadata.name.equals("JUnit")) + assert(metadata.description != null && metadata.description.startsWith("JUnit is a unit testing framework for Java,")) + } + } + +} From cc9ffda614b2e3ebb58a2d278d1556d56d2cd74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 9 Jul 2020 16:19:15 +0200 Subject: [PATCH 04/24] Proper error handling in POM file reading actor --- .../preprocessing/PomFileReadActor.scala | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala index 45f8cd7..86a0b08 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala @@ -19,26 +19,32 @@ package de.upb.cs.swt.delphi.crawler.preprocessing import akka.actor.{Actor, ActorLogging, Props} import org.apache.maven.model.io.xpp3.MavenXpp3Reader -import scala.util.Success +import scala.util.{Failure, Success, Try} class PomFileReadActor extends Actor with ActorLogging{ val pomReader: MavenXpp3Reader = new MavenXpp3Reader() override def receive: Receive = { - case artifact@MavenArtifact(_, _ ,PomFile(pomStream), _, _) => { - //TODO: Errorhandling - val pomObject = pomReader.read(pomStream) + case artifact@MavenArtifact(identifier, _ ,PomFile(pomStream), _, _) => + + val pomObject = Try(pomReader.read(pomStream)) pomStream.close() - val metadata = MavenArtifactMetadata(pomObject.getName, pomObject.getDescription) + pomObject match { + case Success(pom) => - sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) - } - } + val metadata = MavenArtifactMetadata(pom.getName, pom.getDescription) + sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) + case Failure(ex) => + log.error(s"Failed to parse POM file for artifact $identifier",ex ) + sender() ! Failure(ex) + } + + } } object PomFileReadActor { - def props = Props(new PomFileReadActor) + def props: Props = Props(new PomFileReadActor) } From ece0b24f5210fa8fc0d600f93a24684e92d0ce28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 9 Jul 2020 16:33:59 +0200 Subject: [PATCH 05/24] Added processing of issue management system to POM reader. --- .../swt/delphi/crawler/preprocessing/MavenArtifact.scala | 3 ++- .../delphi/crawler/preprocessing/PomFileReadActor.scala | 8 +++++++- .../crawler/preprocessing/PomFileReadActorTest.scala | 4 ++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index 993c542..3588a1b 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -22,7 +22,8 @@ import org.joda.time.DateTime case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile, publicationDate: Option[DateTime], metadata: Option[MavenArtifactMetadata]) -case class MavenArtifactMetadata(name: String, description: String) +case class MavenArtifactMetadata(name: String, description: String, issueManagement: Option[IssueManagementData]) +case class IssueManagementData(system: String, url: String) object MavenArtifact{ def withMetadata(artifact: MavenArtifact, metadata: MavenArtifactMetadata): MavenArtifact = { diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala index 86a0b08..27de813 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala @@ -34,7 +34,13 @@ class PomFileReadActor extends Actor with ActorLogging{ pomObject match { case Success(pom) => - val metadata = MavenArtifactMetadata(pom.getName, pom.getDescription) + val issueManagement = if (pom.getIssueManagement != null) { + Some(IssueManagementData(pom.getIssueManagement.getSystem, pom.getIssueManagement.getUrl)) + } else { + None + } + + val metadata = MavenArtifactMetadata(pom.getName, pom.getDescription, issueManagement) sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) case Failure(ex) => diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala index 90907fa..d65e349 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala @@ -65,6 +65,10 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) assert(metadata.name != null && metadata.name.equals("JUnit")) assert(metadata.description != null && metadata.description.startsWith("JUnit is a unit testing framework for Java,")) + + assert(metadata.issueManagement.isDefined) + assertResult("https://github.com/junit-team/junit/issues")(metadata.issueManagement.get.url) + assertResult("github")(metadata.issueManagement.get.system) } } From d254798d3de43c8125f9220275ff8f94ea3419a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Mon, 13 Jul 2020 11:40:23 +0200 Subject: [PATCH 06/24] Revert unnecessary changes (whitespaces) --- .../swt/delphi/crawler/processing/MavenDependencyActor.scala | 3 +-- .../de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala index b5a1296..040cb55 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/MavenDependencyActor.scala @@ -1,7 +1,5 @@ package de.upb.cs.swt.delphi.crawler.processing -import java.io.ByteArrayInputStream - import akka.actor.{Actor, ActorLogging, Props} import de.upb.cs.swt.delphi.crawler.Configuration import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier @@ -94,6 +92,7 @@ class MavenDependencyActor(configuration: Configuration) extends Actor with Acto def getDependencies(pomFile: PomFile): Set[MavenIdentifier] = { val pomObj = pomReader.read(pomFile.is) + pomFile.is.close() val pomSet = pomObj.getDependencies.asScala.toSet[Dependency].map(resolveIdentifier(_, pomObj)) for (util.Success(id) <- pomSet) yield id diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala index 4e81f94..f5cc556 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/Common.scala @@ -44,7 +44,7 @@ object Common { assert(jarPath.toFile.exists()) assert(jarPath.toFile.length() > 0) } - def checkPom(is: InputStream):Unit={ + def checkPom(is:InputStream):Unit={ val pomBytes = inputStreamToBytes(is) val tmpDir = System.getProperty("java.io.tmpdir") val pomPath = Paths.get(tmpDir).resolve("pom.xml") From f063cac46539164e79786f570ebc4f2016671578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Wed, 15 Jul 2020 11:56:05 +0200 Subject: [PATCH 07/24] Moved PomFileReadActor to processing package, changed behavior on failure --- .../discovery/maven/MavenDiscoveryProcess.scala | 7 +++---- .../PomFileReadActor.scala | 13 +++++++++++-- .../PomFileReadActorTest.scala | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) rename src/main/scala/de/upb/cs/swt/delphi/crawler/{preprocessing => processing}/PomFileReadActor.scala (73%) rename src/test/scala/de/upb/cs/swt/delphi/crawler/{preprocessing => processing}/PomFileReadActorTest.scala (93%) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala index fc9fe35..44ba3c7 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala @@ -28,8 +28,8 @@ import de.upb.cs.swt.delphi.crawler.{AppLogging, Configuration} import de.upb.cs.swt.delphi.crawler.control.Phase import de.upb.cs.swt.delphi.crawler.control.Phase.Phase import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} -import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor, PomFileReadActor} -import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults} +import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor} +import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults, PomFileReadActor} import de.upb.cs.swt.delphi.crawler.storage.ArtifactExistsQuery import de.upb.cs.swt.delphi.crawler.tools.NotYetImplementedException @@ -89,12 +89,11 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) .alsoTo(createSinkFromActorRef[MavenIdentifier](elasticPool)) .mapAsync(8)(identifier => (downloaderPool ? identifier).mapTo[Try[MavenArtifact]]) .filter(artifact => artifact.isSuccess) - .mapAsync(parallelism = 8)(artifact => (pomReaderPool ? artifact.get).mapTo[Try[MavenArtifact]]) - .filter(artifact => artifact.isSuccess) .map(artifact => artifact.get) val finalizer = preprocessing + .mapAsync(8)(artifact => (pomReaderPool ? artifact).mapTo[MavenArtifact]) .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[Try[HermesResults]]) .filter(results => results.isSuccess) .map(results => results.get) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala similarity index 73% rename from src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala rename to src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index 27de813..a9d9b5c 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -14,13 +14,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -package de.upb.cs.swt.delphi.crawler.preprocessing +package de.upb.cs.swt.delphi.crawler.processing import akka.actor.{Actor, ActorLogging, Props} +import de.upb.cs.swt.delphi.crawler.preprocessing.{IssueManagementData, MavenArtifact, MavenArtifactMetadata, PomFile} import org.apache.maven.model.io.xpp3.MavenXpp3Reader import scala.util.{Failure, Success, Try} +/** + * An Actor that receives MavenArtifacts and extracts metadata from its POM file. If successful, an + * MavenMetadata object is attached to the artifact and the artifact is returned. If failures occurr, + * the artifact is returned without metadata. + * + * @author Johannes Düsing + */ class PomFileReadActor extends Actor with ActorLogging{ val pomReader: MavenXpp3Reader = new MavenXpp3Reader() @@ -45,7 +53,8 @@ class PomFileReadActor extends Actor with ActorLogging{ case Failure(ex) => log.error(s"Failed to parse POM file for artifact $identifier",ex ) - sender() ! Failure(ex) + // Best effort semantics: If parsing fails, artifact is returned without metadata + sender() ! artifact } } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala similarity index 93% rename from src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala rename to src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index d65e349..503d8a2 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -14,13 +14,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -package de.upb.cs.swt.delphi.crawler.preprocessing +package de.upb.cs.swt.delphi.crawler.processing import akka.actor.ActorSystem import akka.pattern.ask import akka.testkit.{ImplicitSender, TestKit} import akka.util.Timeout import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActor} import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike} import scala.concurrent.duration._ From 27409924bb487648b07aa464f3fba8de4fb9ecf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Sun, 2 Aug 2020 15:29:42 +0200 Subject: [PATCH 08/24] Added storage trait for POM file properties. Now also extracting licenses and developers --- .../maven/MavenDiscoveryProcess.scala | 1 + .../crawler/preprocessing/MavenArtifact.scala | 8 ++++- .../crawler/processing/PomFileReadActor.scala | 12 ++++++-- .../crawler/storage/ElasticStoreQueries.scala | 30 +++++++++++++++++++ .../processing/PomFileReadActorTest.scala | 5 ++++ 5 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala index 44ba3c7..4a9315b 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala @@ -94,6 +94,7 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) val finalizer = preprocessing .mapAsync(8)(artifact => (pomReaderPool ? artifact).mapTo[MavenArtifact]) + .alsoTo(createSinkFromActorRef[MavenArtifact](elasticPool)) .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[Try[HermesResults]]) .filter(results => results.isSuccess) .map(results => results.get) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index 3588a1b..b38224b 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -22,8 +22,14 @@ import org.joda.time.DateTime case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile, publicationDate: Option[DateTime], metadata: Option[MavenArtifactMetadata]) -case class MavenArtifactMetadata(name: String, description: String, issueManagement: Option[IssueManagementData]) +case class MavenArtifactMetadata(name: String, + description: String, + developers: List[String], + licenses: List[ArtifactLicense], + issueManagement: Option[IssueManagementData]) + case class IssueManagementData(system: String, url: String) +case class ArtifactLicense(name: String, url:String) object MavenArtifact{ def withMetadata(artifact: MavenArtifact, metadata: MavenArtifactMetadata): MavenArtifact = { diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index a9d9b5c..6dc6d57 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -17,9 +17,10 @@ package de.upb.cs.swt.delphi.crawler.processing import akka.actor.{Actor, ActorLogging, Props} -import de.upb.cs.swt.delphi.crawler.preprocessing.{IssueManagementData, MavenArtifact, MavenArtifactMetadata, PomFile} +import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactLicense, IssueManagementData, MavenArtifact, MavenArtifactMetadata, PomFile} import org.apache.maven.model.io.xpp3.MavenXpp3Reader +import scala.collection.JavaConverters._ import scala.util.{Failure, Success, Try} /** @@ -48,7 +49,14 @@ class PomFileReadActor extends Actor with ActorLogging{ None } - val metadata = MavenArtifactMetadata(pom.getName, pom.getDescription, issueManagement) + + + val metadata = MavenArtifactMetadata(pom.getName, + pom.getDescription, + pom.getDevelopers.asScala.map(_.getId).toList, + pom.getLicenses.asScala.map(l => ArtifactLicense(l.getName, l.getUrl)).toList, + issueManagement) + sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) case Failure(ex) => diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala index e26ba3a..1162181 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala @@ -23,6 +23,7 @@ import com.sksamuel.elastic4s.http.update.UpdateResponse import com.sksamuel.elastic4s.http.{ElasticClient, Response} import de.upb.cs.swt.delphi.crawler.discovery.git.GitIdentifier import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.preprocessing.MavenArtifact import de.upb.cs.swt.delphi.crawler.processing.{HermesAnalyzer, HermesResults} import org.joda.time.DateTime @@ -49,6 +50,35 @@ trait ElasticStoreQueries { } } + def store(m: MavenArtifact)(implicit client: ElasticClient, log: LoggingAdapter): Option[Response[UpdateResponse]] = { + elasticId(m.identifier) match { + case Some(id) => + log.info(s"Pushing POM file contents for ${m.identifier} under id $id") + + m.metadata match { + case Some(metadata) => + Some(client.execute { + update(id).in(delphiProjectType).doc(fields = "pom" -> Map( + "name" -> metadata.name, + "description" -> metadata.description, + "issueManagement" -> metadata.issueManagement + .map(management => Map("url" -> management.url, "system" -> management.system)).getOrElse("None"), + "developers" -> metadata.developers.mkString(","), + "licenses" -> metadata.licenses.map(l => Map("name" -> l.name, "url" -> l.url)) + ), "published" -> m.publicationDate.getOrElse("Unknown")) + }.await) + case None => + log.warning(s"Tried to push POM file results to database, but no results are present for identifier: ${m.identifier}") + None + } + + + case None => + log.warning(s"Tried to push POM file results for non-existing identifier: ${m.identifier}.") + None + } + } + def store(g: GitIdentifier)(implicit client: ElasticClient, log: LoggingAdapter): Response[IndexResponse] = { log.info("Pushing new git identifier to elastic: [{}]", g) client.execute { diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index 503d8a2..8bf7fcf 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -70,6 +70,11 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) assert(metadata.issueManagement.isDefined) assertResult("https://github.com/junit-team/junit/issues")(metadata.issueManagement.get.url) assertResult("github")(metadata.issueManagement.get.system) + + assertResult(4)(metadata.developers.size) + + assertResult(1)(metadata.licenses.size) + assertResult("Eclipse Public License 1.0")(metadata.licenses.head.name) } } From 7c84ead0216fbed8829647265c0cb90b78e56463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 17 Sep 2020 12:49:17 +0200 Subject: [PATCH 09/24] Add dependency extraction for POM files. Some basic variable resolving, but no parent processing yet. Also no storage yet --- .../maven/MavenDiscoveryProcess.scala | 2 +- .../crawler/preprocessing/MavenArtifact.scala | 3 +- .../crawler/processing/PomFileReadActor.scala | 99 ++++++++++++++++++- .../processing/PomFileReadActorTest.scala | 29 +++++- 4 files changed, 126 insertions(+), 7 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala index 4a9315b..01e2e27 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala @@ -57,7 +57,7 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) private val seen = mutable.HashSet[MavenIdentifier]() val downloaderPool = system.actorOf(SmallestMailboxPool(8).props(MavenDownloadActor.props)) - val pomReaderPool = system.actorOf(SmallestMailboxPool(8).props(PomFileReadActor.props)) + val pomReaderPool = system.actorOf(SmallestMailboxPool(8).props(PomFileReadActor.props(configuration))) val hermesPool = system.actorOf(SmallestMailboxPool(configuration.hermesActorPoolSize).props(HermesActor.props())) override def phase: Phase = Phase.Discovery diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index b38224b..6605b3c 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -26,7 +26,8 @@ case class MavenArtifactMetadata(name: String, description: String, developers: List[String], licenses: List[ArtifactLicense], - issueManagement: Option[IssueManagementData]) + issueManagement: Option[IssueManagementData], + dependencies: Set[MavenIdentifier]) case class IssueManagementData(system: String, url: String) case class ArtifactLicense(name: String, url:String) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index 6dc6d57..066ed6f 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -17,7 +17,10 @@ package de.upb.cs.swt.delphi.crawler.processing import akka.actor.{Actor, ActorLogging, Props} +import de.upb.cs.swt.delphi.crawler.Configuration +import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactLicense, IssueManagementData, MavenArtifact, MavenArtifactMetadata, PomFile} +import org.apache.maven.model.{Dependency, Model} import org.apache.maven.model.io.xpp3.MavenXpp3Reader import scala.collection.JavaConverters._ @@ -30,7 +33,7 @@ import scala.util.{Failure, Success, Try} * * @author Johannes Düsing */ -class PomFileReadActor extends Actor with ActorLogging{ +class PomFileReadActor(configuration: Configuration) extends Actor with ActorLogging{ val pomReader: MavenXpp3Reader = new MavenXpp3Reader() @@ -49,13 +52,14 @@ class PomFileReadActor extends Actor with ActorLogging{ None } - + val dependencies = getDependencies(pom, identifier) val metadata = MavenArtifactMetadata(pom.getName, pom.getDescription, pom.getDevelopers.asScala.map(_.getId).toList, pom.getLicenses.asScala.map(l => ArtifactLicense(l.getName, l.getUrl)).toList, - issueManagement) + issueManagement, + dependencies) sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) @@ -66,8 +70,95 @@ class PomFileReadActor extends Actor with ActorLogging{ } } + + /** + * Retrieve all dependencies specified in the given POM file as MavenIdentifiers. Try to resolve variables as well. + * Only returns successfully resolved dependencies, omits failures. + * @param pomContent Object holding POM file contents + * @param identifier Maven identifier, as sometimes version / groupID is not part of POM file! + * @return Set of MavenIdentifiers for each successfully parsed dependency + */ + private def getDependencies(implicit pomContent: Model, identifier: MavenIdentifier): Set[MavenIdentifier] = { + + val dependencies = pomContent + .getDependencies + .asScala + .toSet[Dependency] + .map(resolveDependency(_)) + + for ( Success(identifier) <- dependencies) yield identifier + } + + /** + * Process raw dependency specification from POM file, validate text values and try to resolve project variables. + * @param dependency Raw dependency specification as given in the POM file + * @param pomContent Contents of the POM file + * @param identifier Artifact identifier, as sometimes version / groupID is not part of POM file + * @return Try object holding the dependency's MavenIdentifier if successful + */ + private def resolveDependency(dependency: Dependency)(implicit pomContent: Model, identifier: MavenIdentifier): Try[MavenIdentifier] = { + Try { + val groupId = resolveProperty(dependency.getGroupId, "groupID") + val artifactId = resolveProperty(dependency.getArtifactId, "artifactID") + val version = resolveProperty(dependency.getVersion, "version") + + MavenIdentifier(configuration.mavenRepoBase.toString, groupId, artifactId, version) + } + } + + /** + * Resolve the given property value of an dependency specification and do input validation + * @param propValue Value to resolve + * @param propName Name of the property (for error logging) + * @param pomContent Contents of the POM file + * @return Fully resolved string value of the property if successful + * @throws NullPointerException If a null values was found for a required property + * @throws RuntimeException If actor failed to resolve a variable inside the POM file + */ + private def resolveProperty(propValue: String, propName: String)(implicit pomContent:Model, identifier:MavenIdentifier): String = { + if(propValue == null){ + throw new NullPointerException(s"Property '$propName' must not be null for dependencies") + } + else if (propValue.startsWith("$")){ + resolveProjectVariable(propValue) + .getOrElse(throw new RuntimeException(s"Failed to resolve variable '$propValue' for property '$propName'")) + } + else { + propValue + } + } + + private def resolveProjectVariable(variableName: String)(implicit pomContent: Model, identifier: MavenIdentifier): Option[String] = { + // Drop Maven Syntax from variable reference (e.g. ${varname}) + val rawVariableName = variableName.drop(2).dropRight(1) + + // Split dot-separated variable names + val variableParts = rawVariableName.split("\\.", 2) + + // Resolve special references to POM attributes + if (variableParts(0).equals("project") || variableParts(0).equals("pom")) { + val result = variableParts(1) match { + // groupID always present in identifier, but not always explicit in POM + case "groupId" => Some(identifier.groupId) + // artifactID always present in POM + case "artifactId" => Some(pomContent.getArtifactId) + // Version always present in identifier, but not always explicit in POM + case "version" => Some(identifier.version) + // Can only extract parent version if explicitly stated + case "parent.version" if pomContent.getParent != null && pomContent.getParent.getVersion != null => + Some(pomContent.getParent.getVersion) + case _ => None + } + result + } + else { + // All other formats are interpreted as POM property names + Option(pomContent.getProperties.getProperty(rawVariableName)) + } + + } } object PomFileReadActor { - def props: Props = Props(new PomFileReadActor) + def props(configuration: Configuration):Props = Props(new PomFileReadActor(configuration)) } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index 8bf7fcf..29d3dbc 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -20,6 +20,7 @@ import akka.actor.ActorSystem import akka.pattern.ask import akka.testkit.{ImplicitSender, TestKit} import akka.util.Timeout +import de.upb.cs.swt.delphi.crawler.Configuration import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActor} import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike} @@ -42,7 +43,7 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) "create a maven artifact with valid metadata" in { val mavenIdentifier = new MavenIdentifier("https://repo1.maven.org/maven2/", "junit", "junit", "4.12") val downloadActor = system.actorOf(MavenDownloadActor.props) - val readerActor = system.actorOf(PomFileReadActor.props) + val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) implicit val timeout: Timeout = Timeout(10 seconds) implicit val ec: ExecutionContext = system.dispatcher @@ -76,6 +77,32 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) assertResult(1)(metadata.licenses.size) assertResult("Eclipse Public License 1.0")(metadata.licenses.head.name) } + + "process dependencies as expected" in { + val mavenIdentifier = new MavenIdentifier("https://repo1.maven.org/maven2/", "org.apache.bookkeeper", "bookkeeper-server", "4.9.2") + val downloadActor = system.actorOf(MavenDownloadActor.props) + val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) + + implicit val timeout: Timeout = Timeout(10 seconds) + implicit val ec: ExecutionContext = system.dispatcher + + val f = downloadActor ? mavenIdentifier + + val msg = Await.result(f, 10 seconds) + + assert(msg.isInstanceOf[Success[MavenArtifact]]) + val artifact = msg.asInstanceOf[Success[MavenArtifact]].get + + val result = Await.result(readerActor ? artifact, 10 seconds) + assert(result.isInstanceOf[Success[MavenArtifact]]) + val annotatedArtifact = result.asInstanceOf[Success[MavenArtifact]].get + + val dependencies = annotatedArtifact.metadata.get.dependencies + + assertResult(8)(dependencies.size) + assertResult(8)(dependencies.count(_.version == "4.9.2")) + assert(dependencies.contains(MavenIdentifier("https://repo1.maven.org/maven2/","org.apache.bookkeeper", "circe-checksum", "4.9.2"))) + } } } From 82748c9be1228df30f9f5ca139788f191ee45cf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 17 Sep 2020 13:27:10 +0200 Subject: [PATCH 10/24] Recursively resolve POM variables in parents if possible. On-Demand and NOT optimized, but working --- .../crawler/processing/PomFileReadActor.scala | 70 ++++++++++++++----- .../processing/PomFileReadActorTest.scala | 3 +- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index 066ed6f..dda3411 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -16,10 +16,11 @@ package de.upb.cs.swt.delphi.crawler.processing -import akka.actor.{Actor, ActorLogging, Props} +import akka.actor.{Actor, ActorLogging, ActorSystem, Props} import de.upb.cs.swt.delphi.crawler.Configuration import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactLicense, IssueManagementData, MavenArtifact, MavenArtifactMetadata, PomFile} +import de.upb.cs.swt.delphi.crawler.tools.HttpDownloader import org.apache.maven.model.{Dependency, Model} import org.apache.maven.model.io.xpp3.MavenXpp3Reader @@ -36,6 +37,7 @@ import scala.util.{Failure, Success, Try} class PomFileReadActor(configuration: Configuration) extends Actor with ActorLogging{ val pomReader: MavenXpp3Reader = new MavenXpp3Reader() + implicit val system : ActorSystem = context.system override def receive: Receive = { case artifact@MavenArtifact(identifier, _ ,PomFile(pomStream), _, _) => @@ -128,6 +130,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } } + //noinspection ScalaStyle private def resolveProjectVariable(variableName: String)(implicit pomContent: Model, identifier: MavenIdentifier): Option[String] = { // Drop Maven Syntax from variable reference (e.g. ${varname}) val rawVariableName = variableName.drop(2).dropRight(1) @@ -135,28 +138,63 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog // Split dot-separated variable names val variableParts = rawVariableName.split("\\.", 2) + var result: Option[String] = None + // Resolve special references to POM attributes if (variableParts(0).equals("project") || variableParts(0).equals("pom")) { - val result = variableParts(1) match { - // groupID always present in identifier, but not always explicit in POM - case "groupId" => Some(identifier.groupId) - // artifactID always present in POM - case "artifactId" => Some(pomContent.getArtifactId) - // Version always present in identifier, but not always explicit in POM - case "version" => Some(identifier.version) - // Can only extract parent version if explicitly stated - case "parent.version" if pomContent.getParent != null && pomContent.getParent.getVersion != null => - Some(pomContent.getParent.getVersion) - case _ => None - } - result + result = variableParts(1) match { + // groupID always present in identifier, but not always explicit in POM + case "groupId" => Some(identifier.groupId) + // artifactID always present in POM + case "artifactId" => Some(pomContent.getArtifactId) + // Version always present in identifier, but not always explicit in POM + case "version" => Some(identifier.version) + // Can only extract parent version if explicitly stated + case "parent.version" if pomContent.getParent != null && pomContent.getParent.getVersion != null => + Some(pomContent.getParent.getVersion) + case _ => None + } + } + else { + // All other formats are interpreted as POM property names + result = Option(pomContent.getProperties.getProperty(rawVariableName)) + } + + // If not resolved -> try to resolve in parent! + if (result.isEmpty){ + recursiveResolveInParent(variableName) } else { - // All other formats are interpreted as POM property names - Option(pomContent.getProperties.getProperty(rawVariableName)) + result } } + + private def recursiveResolveInParent(variableName: String)(implicit pomContent: Model, identifier: MavenIdentifier):Option[String]={ + val parentDefinition = pomContent.getParent + + // Only resolve in parent if parent is explicitly defined! + if (parentDefinition != null && parentDefinition.getGroupId != null && parentDefinition.getArtifactId != null + && parentDefinition.getVersion != null){ + + val parentIdentifier = MavenIdentifier(configuration.mavenRepoBase.toString, parentDefinition.getGroupId, + parentDefinition.getArtifactId, parentDefinition.getVersion) + + // Download parent POM + new HttpDownloader().downloadFromUri(parentIdentifier.toPomLocation.toString) match { + case Success(pomStream) => + val parentPom = pomReader.read(pomStream) + pomStream.close() + // Recursive call to resolve variable in parent POM + resolveProjectVariable(variableName)(parentPom, parentIdentifier) + case Failure(x) => + throw x + } + } + else { + None + } + } } object PomFileReadActor { diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index 29d3dbc..27f0efc 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -99,9 +99,10 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) val dependencies = annotatedArtifact.metadata.get.dependencies - assertResult(8)(dependencies.size) + assertResult(10)(dependencies.size) assertResult(8)(dependencies.count(_.version == "4.9.2")) assert(dependencies.contains(MavenIdentifier("https://repo1.maven.org/maven2/","org.apache.bookkeeper", "circe-checksum", "4.9.2"))) + assert(dependencies.contains(MavenIdentifier("https://repo1.maven.org/maven2/","org.apache.kerby", "kerby-config", "1.1.1"))) } } From 7440e8adaf6633613003a7835fc3740a39762e3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 17 Sep 2020 13:36:48 +0200 Subject: [PATCH 11/24] Fix code smell --- .../crawler/processing/PomFileReadActorTest.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index 27f0efc..260a558 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -35,13 +35,15 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) with Matchers with BeforeAndAfterAll { + final val RepoUrl = "https://repo1.maven.org/maven2/" + override def afterAll { TestKit.shutdownActorSystem(system) } "The POM file reader actor " must { "create a maven artifact with valid metadata" in { - val mavenIdentifier = new MavenIdentifier("https://repo1.maven.org/maven2/", "junit", "junit", "4.12") + val mavenIdentifier = new MavenIdentifier(RepoUrl, "junit", "junit", "4.12") val downloadActor = system.actorOf(MavenDownloadActor.props) val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) @@ -79,7 +81,7 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) } "process dependencies as expected" in { - val mavenIdentifier = new MavenIdentifier("https://repo1.maven.org/maven2/", "org.apache.bookkeeper", "bookkeeper-server", "4.9.2") + val mavenIdentifier = new MavenIdentifier(RepoUrl, "org.apache.bookkeeper", "bookkeeper-server", "4.9.2") val downloadActor = system.actorOf(MavenDownloadActor.props) val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) @@ -101,8 +103,8 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) assertResult(10)(dependencies.size) assertResult(8)(dependencies.count(_.version == "4.9.2")) - assert(dependencies.contains(MavenIdentifier("https://repo1.maven.org/maven2/","org.apache.bookkeeper", "circe-checksum", "4.9.2"))) - assert(dependencies.contains(MavenIdentifier("https://repo1.maven.org/maven2/","org.apache.kerby", "kerby-config", "1.1.1"))) + assert(dependencies.contains(MavenIdentifier(RepoUrl,"org.apache.bookkeeper", "circe-checksum", "4.9.2"))) + assert(dependencies.contains(MavenIdentifier(RepoUrl,"org.apache.kerby", "kerby-config", "1.1.1"))) } } From 24aed36efd5f309fa10158a76fd42817b7f37b42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Thu, 17 Sep 2020 13:41:51 +0200 Subject: [PATCH 12/24] Remove code duplication in test --- .../processing/PomFileReadActorTest.scala | 53 +++++++------------ 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index 260a558..35ae867 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -41,28 +41,31 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) TestKit.shutdownActorSystem(system) } - "The POM file reader actor " must { - "create a maven artifact with valid metadata" in { - val mavenIdentifier = new MavenIdentifier(RepoUrl, "junit", "junit", "4.12") - val downloadActor = system.actorOf(MavenDownloadActor.props) - val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) + private def readPomFileFor(identifier: MavenIdentifier): MavenArtifact = { + val downloadActor = system.actorOf(MavenDownloadActor.props) + val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) + + implicit val timeout: Timeout = Timeout(10 seconds) + implicit val ec: ExecutionContext = system.dispatcher - implicit val timeout: Timeout = Timeout(10 seconds) - implicit val ec: ExecutionContext = system.dispatcher + val f = downloadActor ? identifier - val f = downloadActor ? mavenIdentifier + val msg = Await.result(f, 10 seconds) - val msg = Await.result(f, 10 seconds) + assert(msg.isInstanceOf[Success[MavenArtifact]]) + val artifact = msg.asInstanceOf[Success[MavenArtifact]].get - assert(msg.isInstanceOf[Success[MavenArtifact]]) - val artifact = msg.asInstanceOf[Success[MavenArtifact]].get + assert(artifact.metadata.isEmpty) + assert(artifact.publicationDate.isDefined && artifact.publicationDate.get != null) - assert(artifact.metadata.isEmpty) - assert(artifact.publicationDate.isDefined && artifact.publicationDate.get != null) + val result = Await.result(readerActor ? artifact, 10 seconds) + assert(result.isInstanceOf[Success[MavenArtifact]]) + result.asInstanceOf[Success[MavenArtifact]].get + } - val result = Await.result(readerActor ? artifact, 10 seconds) - assert(result.isInstanceOf[Success[MavenArtifact]]) - val annotatedArtifact = result.asInstanceOf[Success[MavenArtifact]].get + "The POM file reader actor " must { + "create a maven artifact with valid metadata" in { + val annotatedArtifact = readPomFileFor(MavenIdentifier(RepoUrl, "junit", "junit", "4.12")) assert(annotatedArtifact.metadata.isDefined) val metadata = annotatedArtifact.metadata.get @@ -81,23 +84,7 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) } "process dependencies as expected" in { - val mavenIdentifier = new MavenIdentifier(RepoUrl, "org.apache.bookkeeper", "bookkeeper-server", "4.9.2") - val downloadActor = system.actorOf(MavenDownloadActor.props) - val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) - - implicit val timeout: Timeout = Timeout(10 seconds) - implicit val ec: ExecutionContext = system.dispatcher - - val f = downloadActor ? mavenIdentifier - - val msg = Await.result(f, 10 seconds) - - assert(msg.isInstanceOf[Success[MavenArtifact]]) - val artifact = msg.asInstanceOf[Success[MavenArtifact]].get - - val result = Await.result(readerActor ? artifact, 10 seconds) - assert(result.isInstanceOf[Success[MavenArtifact]]) - val annotatedArtifact = result.asInstanceOf[Success[MavenArtifact]].get + val annotatedArtifact = readPomFileFor(MavenIdentifier(RepoUrl, "org.apache.bookkeeper", "bookkeeper-server", "4.9.2")) val dependencies = annotatedArtifact.metadata.get.dependencies From 577f543890c202aee43df1a7239ad7edad6ce7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Fri, 18 Sep 2020 14:57:09 +0200 Subject: [PATCH 13/24] PomReadActor now also resolves dependencies where version is not specified in file itself, but in parent --- .../crawler/processing/PomFileReadActor.scala | 79 +++++++++++++++---- .../processing/PomFileReadActorTest.scala | 3 +- 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index dda3411..e8e17a0 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -82,6 +82,8 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog */ private def getDependencies(implicit pomContent: Model, identifier: MavenIdentifier): Set[MavenIdentifier] = { + implicit val parentContent: Option[Model] = getParentPomModel(pomContent) + val dependencies = pomContent .getDependencies .asScala @@ -98,16 +100,44 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog * @param identifier Artifact identifier, as sometimes version / groupID is not part of POM file * @return Try object holding the dependency's MavenIdentifier if successful */ - private def resolveDependency(dependency: Dependency)(implicit pomContent: Model, identifier: MavenIdentifier): Try[MavenIdentifier] = { + private def resolveDependency(dependency: Dependency) + (implicit pomContent: Model, identifier: MavenIdentifier, parentContent: Option[Model]) + : Try[MavenIdentifier] = { Try { val groupId = resolveProperty(dependency.getGroupId, "groupID") val artifactId = resolveProperty(dependency.getArtifactId, "artifactID") - val version = resolveProperty(dependency.getVersion, "version") + + // Often dependency versions are left empty, as they are specified in the parent! + val version: String = if(dependency.getVersion == null && parentContent.isDefined){ + val parentIdent = MavenIdentifier(configuration.mavenRepoBase.toString, pomContent.getParent.getGroupId, + pomContent.getParent.getArtifactId, pomContent.getParent.getVersion) + + resolveDependencyVersion(dependency, parentContent.get, parentIdent) + } else { + resolveProperty(dependency.getVersion, "version") + } MavenIdentifier(configuration.mavenRepoBase.toString, groupId, artifactId, version) } } + private def resolveDependencyVersion(dependency: Dependency, pomContent: Model, identifier: MavenIdentifier): String = { + implicit val parentContent: Option[Model] = getParentPomModel(pomContent) + + pomContent + .getDependencyManagement.getDependencies + .asScala.toSet[Dependency] + .filter(d => d.getGroupId.equals(dependency.getGroupId) && d.getArtifactId.equals(dependency.getArtifactId)) + .map(_.getVersion) + .find(_ != null) match { + case Some(version) => + resolveProperty(version, "version")(pomContent, identifier, parentContent) + case None => + throw new NullPointerException(s"Version was null and could not be resolved in parent") + } + + } + /** * Resolve the given property value of an dependency specification and do input validation * @param propValue Value to resolve @@ -117,7 +147,9 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog * @throws NullPointerException If a null values was found for a required property * @throws RuntimeException If actor failed to resolve a variable inside the POM file */ - private def resolveProperty(propValue: String, propName: String)(implicit pomContent:Model, identifier:MavenIdentifier): String = { + private def resolveProperty(propValue: String, propName: String) + (implicit pomContent:Model, identifier:MavenIdentifier, parentContent:Option[Model]) + : String = { if(propValue == null){ throw new NullPointerException(s"Property '$propName' must not be null for dependencies") } @@ -131,7 +163,9 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } //noinspection ScalaStyle - private def resolveProjectVariable(variableName: String)(implicit pomContent: Model, identifier: MavenIdentifier): Option[String] = { + private def resolveProjectVariable(variableName: String) + (implicit pomContent: Model, identifier: MavenIdentifier, parentContent:Option[Model]) + : Option[String] = { // Drop Maven Syntax from variable reference (e.g. ${varname}) val rawVariableName = variableName.drop(2).dropRight(1) @@ -161,7 +195,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } // If not resolved -> try to resolve in parent! - if (result.isEmpty){ + if (result.isEmpty && parentContent.isDefined){ recursiveResolveInParent(variableName) } else { @@ -170,25 +204,38 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } - private def recursiveResolveInParent(variableName: String)(implicit pomContent: Model, identifier: MavenIdentifier):Option[String]={ - val parentDefinition = pomContent.getParent + private def recursiveResolveInParent(variableName: String) + (implicit pomContent: Model, identifier: MavenIdentifier, parentContent:Option[Model]) + :Option[String]={ + if(parentContent.isDefined){ + val parentIdentifier = MavenIdentifier(configuration.mavenRepoBase.toString, pomContent.getParent.getGroupId, + pomContent.getParent.getArtifactId, pomContent.getParent.getVersion) - // Only resolve in parent if parent is explicitly defined! - if (parentDefinition != null && parentDefinition.getGroupId != null && parentDefinition.getArtifactId != null - && parentDefinition.getVersion != null){ + val parentsParentContent = getParentPomModel(parentContent.get) - val parentIdentifier = MavenIdentifier(configuration.mavenRepoBase.toString, parentDefinition.getGroupId, - parentDefinition.getArtifactId, parentDefinition.getVersion) + resolveProjectVariable(variableName)(parentContent.get, parentIdentifier, parentsParentContent) + } + else { + None + } + } + + private def getParentPomModel(implicit pomContent: Model): Option[Model] = { + val parentDef = pomContent.getParent + + if (parentDef != null && parentDef.getGroupId != null && parentDef.getArtifactId != null && parentDef.getVersion != null){ + val parentIdentifier = MavenIdentifier(configuration.mavenRepoBase.toString, parentDef.getGroupId, + parentDef.getArtifactId, parentDef.getVersion) - // Download parent POM new HttpDownloader().downloadFromUri(parentIdentifier.toPomLocation.toString) match { case Success(pomStream) => val parentPom = pomReader.read(pomStream) pomStream.close() - // Recursive call to resolve variable in parent POM - resolveProjectVariable(variableName)(parentPom, parentIdentifier) + + Some(parentPom) case Failure(x) => - throw x + log.error(x, s"Failed to download parent POM") + None } } else { diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index 35ae867..e7afe43 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -88,10 +88,11 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) val dependencies = annotatedArtifact.metadata.get.dependencies - assertResult(10)(dependencies.size) + assertResult(23)(dependencies.size) assertResult(8)(dependencies.count(_.version == "4.9.2")) assert(dependencies.contains(MavenIdentifier(RepoUrl,"org.apache.bookkeeper", "circe-checksum", "4.9.2"))) assert(dependencies.contains(MavenIdentifier(RepoUrl,"org.apache.kerby", "kerby-config", "1.1.1"))) + assert(dependencies.contains(MavenIdentifier(RepoUrl,"commons-codec", "commons-codec", "1.6"))) } } From 5e535d2ebc899e7eb7c6050b9e24164895a9b15b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Sat, 19 Sep 2020 00:12:02 +0200 Subject: [PATCH 14/24] Optimized dependency resolving. Versions are now resolved throughout the whole parent hierarchy. Parents are only downloaded once, however, currently for every POM, not on-demand. --- .../crawler/processing/PomFileReadActor.scala | 150 ++++++++++-------- 1 file changed, 86 insertions(+), 64 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index e8e17a0..ebef877 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -73,6 +73,48 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } + /** + * Tries to resolve, download and parse the parent POM file of the given POM. + * @param pomContent Content of a POM file to resolve parent for + * @return Content of Parent POM, or None if no parent is specified or an error occurred + */ + private def getParentPomModel(implicit pomContent: Model): Option[Model] = { + val parentDef = pomContent.getParent + + if (parentDef != null && parentDef.getGroupId != null && parentDef.getArtifactId != null && parentDef.getVersion != null){ + val parentIdentifier = MavenIdentifier(configuration.mavenRepoBase.toString, parentDef.getGroupId, + parentDef.getArtifactId, parentDef.getVersion) + + new HttpDownloader().downloadFromUri(parentIdentifier.toPomLocation.toString) match { + case Success(pomStream) => + val parentPom = pomReader.read(pomStream) + pomStream.close() + + Some(parentPom) + case Failure(x) => + log.error(x, s"Failed to download parent POM") + None + } + } + else { + None + } + } + + private def buildParentHierarchy(implicit pomContent: Model): List[Model] = { + getParentPomModel(pomContent) match { + case Some(parentContent) => + List(parentContent) ++ buildParentHierarchy(parentContent) + case _ => + List() + } + } + + private def buildParentIdentifier(implicit pomContent:Model): MavenIdentifier = { + MavenIdentifier(configuration.mavenRepoBase.toString, pomContent.getParent.getGroupId, + pomContent.getParent.getArtifactId, pomContent.getParent.getVersion) + } + /** * Retrieve all dependencies specified in the given POM file as MavenIdentifiers. Try to resolve variables as well. * Only returns successfully resolved dependencies, omits failures. @@ -82,7 +124,9 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog */ private def getDependencies(implicit pomContent: Model, identifier: MavenIdentifier): Set[MavenIdentifier] = { - implicit val parentContent: Option[Model] = getParentPomModel(pomContent) + implicit lazy val parentHierarchy: List[Model] = buildParentHierarchy + + //implicit val parentContent: Option[Model] = getParentPomModel(pomContent) val dependencies = pomContent .getDependencies @@ -101,18 +145,16 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog * @return Try object holding the dependency's MavenIdentifier if successful */ private def resolveDependency(dependency: Dependency) - (implicit pomContent: Model, identifier: MavenIdentifier, parentContent: Option[Model]) + (implicit pomContent: Model, identifier: MavenIdentifier, parentHierarchy: List[Model]) : Try[MavenIdentifier] = { Try { val groupId = resolveProperty(dependency.getGroupId, "groupID") val artifactId = resolveProperty(dependency.getArtifactId, "artifactID") // Often dependency versions are left empty, as they are specified in the parent! - val version: String = if(dependency.getVersion == null && parentContent.isDefined){ - val parentIdent = MavenIdentifier(configuration.mavenRepoBase.toString, pomContent.getParent.getGroupId, - pomContent.getParent.getArtifactId, pomContent.getParent.getVersion) - - resolveDependencyVersion(dependency, parentContent.get, parentIdent) + val version: String = if(dependency.getVersion == null && parentHierarchy.nonEmpty){ + // Will recurse parent hierarchy to resolve missing version + resolveDependencyVersion(dependency, pomContent, identifier) } else { resolveProperty(dependency.getVersion, "version") } @@ -121,21 +163,37 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } } - private def resolveDependencyVersion(dependency: Dependency, pomContent: Model, identifier: MavenIdentifier): String = { - implicit val parentContent: Option[Model] = getParentPomModel(pomContent) - - pomContent - .getDependencyManagement.getDependencies - .asScala.toSet[Dependency] - .filter(d => d.getGroupId.equals(dependency.getGroupId) && d.getArtifactId.equals(dependency.getArtifactId)) - .map(_.getVersion) - .find(_ != null) match { - case Some(version) => - resolveProperty(version, "version")(pomContent, identifier, parentContent) - case None => - throw new NullPointerException(s"Version was null and could not be resolved in parent") + @scala.annotation.tailrec + private def resolveDependencyVersion(dependency: Dependency, pomContent: Model, identifier: MavenIdentifier, level: Int = 0) + (implicit parentHierarchy: List[Model]): String = { + if(pomContent.getDependencyManagement != null){ + pomContent + .getDependencyManagement.getDependencies + .asScala.toSet[Dependency] + .filter(d => d.getGroupId.equals(dependency.getGroupId) && d.getArtifactId.equals(dependency.getArtifactId)) + .map(_.getVersion) + .find(_ != null) match { + case Some(version) => + // Found something, try to resolve it if its a variable + resolveProperty(version, "version", level)(pomContent, identifier, parentHierarchy) + case None if level < parentHierarchy.length => + // Recursive call to find version definition in upper parent definitions + resolveDependencyVersion(dependency, parentHierarchy(level), buildParentIdentifier(pomContent), level + 1) + case None if level >= parentHierarchy.length => + // No parent left to recurse, so this really is a dependency without a version + throw new NullPointerException(s"Version was null and could not be resolved in parent") + } + } + else if(level < parentHierarchy.length) { + // Recursive call to find version definition in upper parent definitions + resolveDependencyVersion(dependency, parentHierarchy(level), buildParentIdentifier(pomContent), level + 1) + } + else { + // No parent left to recurse, so this really is a dependency without a version + throw new NullPointerException(s"Version was null and could not be resolved in parent") } + } /** @@ -147,14 +205,14 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog * @throws NullPointerException If a null values was found for a required property * @throws RuntimeException If actor failed to resolve a variable inside the POM file */ - private def resolveProperty(propValue: String, propName: String) - (implicit pomContent:Model, identifier:MavenIdentifier, parentContent:Option[Model]) + private def resolveProperty(propValue: String, propName: String, level: Int = 0) + (implicit pomContent:Model, identifier:MavenIdentifier, parentHierarchy: List[Model]) : String = { if(propValue == null){ throw new NullPointerException(s"Property '$propName' must not be null for dependencies") } else if (propValue.startsWith("$")){ - resolveProjectVariable(propValue) + resolveProjectVariable(propValue, level) .getOrElse(throw new RuntimeException(s"Failed to resolve variable '$propValue' for property '$propName'")) } else { @@ -163,8 +221,9 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } //noinspection ScalaStyle - private def resolveProjectVariable(variableName: String) - (implicit pomContent: Model, identifier: MavenIdentifier, parentContent:Option[Model]) + @scala.annotation.tailrec + private def resolveProjectVariable(variableName: String, level: Int) + (implicit pomContent: Model, identifier: MavenIdentifier, parentHierarchy: List[Model]) : Option[String] = { // Drop Maven Syntax from variable reference (e.g. ${varname}) val rawVariableName = variableName.drop(2).dropRight(1) @@ -195,8 +254,8 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } // If not resolved -> try to resolve in parent! - if (result.isEmpty && parentContent.isDefined){ - recursiveResolveInParent(variableName) + if (result.isEmpty && level <= parentHierarchy.length){ + resolveProjectVariable(variableName, level + 1)(parentHierarchy(level), buildParentIdentifier(pomContent), parentHierarchy) } else { result @@ -204,44 +263,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } - private def recursiveResolveInParent(variableName: String) - (implicit pomContent: Model, identifier: MavenIdentifier, parentContent:Option[Model]) - :Option[String]={ - if(parentContent.isDefined){ - val parentIdentifier = MavenIdentifier(configuration.mavenRepoBase.toString, pomContent.getParent.getGroupId, - pomContent.getParent.getArtifactId, pomContent.getParent.getVersion) - - val parentsParentContent = getParentPomModel(parentContent.get) - resolveProjectVariable(variableName)(parentContent.get, parentIdentifier, parentsParentContent) - } - else { - None - } - } - - private def getParentPomModel(implicit pomContent: Model): Option[Model] = { - val parentDef = pomContent.getParent - - if (parentDef != null && parentDef.getGroupId != null && parentDef.getArtifactId != null && parentDef.getVersion != null){ - val parentIdentifier = MavenIdentifier(configuration.mavenRepoBase.toString, parentDef.getGroupId, - parentDef.getArtifactId, parentDef.getVersion) - - new HttpDownloader().downloadFromUri(parentIdentifier.toPomLocation.toString) match { - case Success(pomStream) => - val parentPom = pomReader.read(pomStream) - pomStream.close() - - Some(parentPom) - case Failure(x) => - log.error(x, s"Failed to download parent POM") - None - } - } - else { - None - } - } } object PomFileReadActor { From 8853ed7b44dbb995a1db5c2ae9a7e04c185e443f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Sat, 19 Sep 2020 12:21:56 +0200 Subject: [PATCH 15/24] Optimization: Parent hierarchy is now lazy, ie only downloaded if at least one version / attribute failed to resolve locally. However, if any parent is required the whole hierarchy will be downloaded! Fixed a bug in test shutdown. --- .../crawler/processing/PomFileReadActor.scala | 97 ++++++++++++------- .../processing/PomFileReadActorTest.scala | 4 - 2 files changed, 64 insertions(+), 37 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index ebef877..de4de71 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -65,6 +65,8 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) + log.info(s"Successfully processed POM file for $identifier") + case Failure(ex) => log.error(s"Failed to parse POM file for artifact $identifier",ex ) // Best effort semantics: If parsing fails, artifact is returned without metadata @@ -101,6 +103,12 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } } + /** + * Recursive method building the parent hierarchy of the given POM. Will download and parse all parent POMs and + * return them in a list. + * @param pomContent POM file to build the parent hierarchy for + * @return List of parent POMs. Might be empty, if no parent is specified at all + */ private def buildParentHierarchy(implicit pomContent: Model): List[Model] = { getParentPomModel(pomContent) match { case Some(parentContent) => @@ -110,6 +118,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } } + private def buildParentIdentifier(implicit pomContent:Model): MavenIdentifier = { MavenIdentifier(configuration.mavenRepoBase.toString, pomContent.getParent.getGroupId, pomContent.getParent.getArtifactId, pomContent.getParent.getVersion) @@ -124,17 +133,22 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog */ private def getDependencies(implicit pomContent: Model, identifier: MavenIdentifier): Set[MavenIdentifier] = { - implicit lazy val parentHierarchy: List[Model] = buildParentHierarchy - - //implicit val parentContent: Option[Model] = getParentPomModel(pomContent) + // Always build the parent hierarchy exactly once + lazy val parentHierarchy: List[Model] = buildParentHierarchy(pomContent) + // Try to resolve each dependency specified in the POM val dependencies = pomContent .getDependencies .asScala .toSet[Dependency] - .map(resolveDependency(_)) + .map(resolveDependency(_, parentHierarchy)) + + if (dependencies.count(_.isFailure) > 0) { + log.warning(s"Failed to resolve some dependencies for $identifier") + } - for ( Success(identifier) <- dependencies) yield identifier + // Only return those dependencies that have been successfully resolved + for (Success(identifier) <- dependencies) yield identifier } /** @@ -144,29 +158,46 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog * @param identifier Artifact identifier, as sometimes version / groupID is not part of POM file * @return Try object holding the dependency's MavenIdentifier if successful */ - private def resolveDependency(dependency: Dependency) - (implicit pomContent: Model, identifier: MavenIdentifier, parentHierarchy: List[Model]) + private def resolveDependency(dependency: Dependency, parentHierarchy: => List[Model]) + (implicit pomContent: Model, identifier: MavenIdentifier) : Try[MavenIdentifier] = { + lazy val parents = parentHierarchy + Try { - val groupId = resolveProperty(dependency.getGroupId, "groupID") - val artifactId = resolveProperty(dependency.getArtifactId, "artifactID") + // Resolve groupID and artifact id in current POM + val groupId = resolveProperty(dependency.getGroupId, "groupID", parents) + val artifactId = resolveProperty(dependency.getArtifactId, "artifactID", parents) // Often dependency versions are left empty, as they are specified in the parent! - val version: String = if(dependency.getVersion == null && parentHierarchy.nonEmpty){ - // Will recurse parent hierarchy to resolve missing version - resolveDependencyVersion(dependency, pomContent, identifier) + val version: String = if(dependency.getVersion == null && parents.nonEmpty){ + // If there are parents and version is empty => Try to resolve version in parents + resolveDependencyVersion(dependency, pomContent, identifier, parents) } else { - resolveProperty(dependency.getVersion, "version") + // If no parents are present or version is specified => Resolve as regular property + resolveProperty(dependency.getVersion, "version", parents) } MavenIdentifier(configuration.mavenRepoBase.toString, groupId, artifactId, version) } } + /** + * Resolve the version of the given dependency by inspecting the tag of all parent POMs. + * @param dependency Dependency to resolve version for, ie. no explicit version is defined for this dependency! + * @param pomContent Content of the current POM file to inspect + * @param identifier Identifier of the current POM file + * @param level Level in the parent hierarchy, needed for recursion + * @param parentHierarchy Parent hierarchy object + * @return String value of the resolved version + * @throws NullPointerException If version could not be resolved in any parent + */ @scala.annotation.tailrec - private def resolveDependencyVersion(dependency: Dependency, pomContent: Model, identifier: MavenIdentifier, level: Int = 0) - (implicit parentHierarchy: List[Model]): String = { + private def resolveDependencyVersion(dependency: Dependency, pomContent: Model, identifier: MavenIdentifier, + parentHierarchy: => List[Model], level: Int = 0): String = { + lazy val parents = parentHierarchy + if(pomContent.getDependencyManagement != null){ + // If there is a dependency management tag: Try to find matching groupID and artifactID pomContent .getDependencyManagement.getDependencies .asScala.toSet[Dependency] @@ -174,19 +205,19 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog .map(_.getVersion) .find(_ != null) match { case Some(version) => - // Found something, try to resolve it if its a variable - resolveProperty(version, "version", level)(pomContent, identifier, parentHierarchy) - case None if level < parentHierarchy.length => - // Recursive call to find version definition in upper parent definitions - resolveDependencyVersion(dependency, parentHierarchy(level), buildParentIdentifier(pomContent), level + 1) - case None if level >= parentHierarchy.length => + // Found matching version definition, try to resolve it if its a variable + resolveProperty(version, "version", parents, level)(pomContent, identifier) + case None if level < parents.length => + // Found no matching version definition, but there is parents left to recurse to + resolveDependencyVersion(dependency, parents(level), buildParentIdentifier(pomContent), parents, level + 1) + case None if level >= parents.length => // No parent left to recurse, so this really is a dependency without a version throw new NullPointerException(s"Version was null and could not be resolved in parent") } } else if(level < parentHierarchy.length) { - // Recursive call to find version definition in upper parent definitions - resolveDependencyVersion(dependency, parentHierarchy(level), buildParentIdentifier(pomContent), level + 1) + // There is no dependency management tag, immediately recurse into parent if parent left + resolveDependencyVersion(dependency, parents(level), buildParentIdentifier(pomContent), parents, level + 1) } else { // No parent left to recurse, so this really is a dependency without a version @@ -205,14 +236,15 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog * @throws NullPointerException If a null values was found for a required property * @throws RuntimeException If actor failed to resolve a variable inside the POM file */ - private def resolveProperty(propValue: String, propName: String, level: Int = 0) - (implicit pomContent:Model, identifier:MavenIdentifier, parentHierarchy: List[Model]) + private def resolveProperty(propValue: String, propName: String, parentHierarchy: => List[Model], level: Int = 0) + (implicit pomContent:Model, identifier:MavenIdentifier) : String = { + lazy val parents = parentHierarchy if(propValue == null){ throw new NullPointerException(s"Property '$propName' must not be null for dependencies") } else if (propValue.startsWith("$")){ - resolveProjectVariable(propValue, level) + resolveProjectVariable(propValue, parents, level) .getOrElse(throw new RuntimeException(s"Failed to resolve variable '$propValue' for property '$propName'")) } else { @@ -222,9 +254,11 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog //noinspection ScalaStyle @scala.annotation.tailrec - private def resolveProjectVariable(variableName: String, level: Int) - (implicit pomContent: Model, identifier: MavenIdentifier, parentHierarchy: List[Model]) + private def resolveProjectVariable(variableName: String, parentHierarchy: => List[Model], level: Int) + (implicit pomContent: Model, identifier: MavenIdentifier) : Option[String] = { + lazy val parents = parentHierarchy + // Drop Maven Syntax from variable reference (e.g. ${varname}) val rawVariableName = variableName.drop(2).dropRight(1) @@ -254,16 +288,13 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } // If not resolved -> try to resolve in parent! - if (result.isEmpty && level <= parentHierarchy.length){ - resolveProjectVariable(variableName, level + 1)(parentHierarchy(level), buildParentIdentifier(pomContent), parentHierarchy) + if (result.isEmpty && level <= parents.length){ + resolveProjectVariable(variableName, parents, level + 1)(parents(level), buildParentIdentifier(pomContent)) } else { result } - } - - } object PomFileReadActor { diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index e7afe43..635de58 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -37,10 +37,6 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) final val RepoUrl = "https://repo1.maven.org/maven2/" - override def afterAll { - TestKit.shutdownActorSystem(system) - } - private def readPomFileFor(identifier: MavenIdentifier): MavenArtifact = { val downloadActor = system.actorOf(MavenDownloadActor.props) val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) From 0992b64f349cea74ddc6dd09bdacd5e064dacdaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Mon, 21 Sep 2020 14:03:31 +0200 Subject: [PATCH 16/24] Now extracting scopes for dependencies from POM files --- .../crawler/preprocessing/MavenArtifact.scala | 3 ++- .../crawler/processing/PomFileReadActor.scala | 10 ++++++---- .../processing/PomFileReadActorTest.scala | 17 ++++++++++------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index 6605b3c..3f16739 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -27,10 +27,11 @@ case class MavenArtifactMetadata(name: String, developers: List[String], licenses: List[ArtifactLicense], issueManagement: Option[IssueManagementData], - dependencies: Set[MavenIdentifier]) + dependencies: Set[ArtifactDependency]) case class IssueManagementData(system: String, url: String) case class ArtifactLicense(name: String, url:String) +case class ArtifactDependency(identifier: MavenIdentifier, scope: Option[String]) object MavenArtifact{ def withMetadata(artifact: MavenArtifact, metadata: MavenArtifactMetadata): MavenArtifact = { diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index de4de71..7497c78 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -19,7 +19,7 @@ package de.upb.cs.swt.delphi.crawler.processing import akka.actor.{Actor, ActorLogging, ActorSystem, Props} import de.upb.cs.swt.delphi.crawler.Configuration import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier -import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactLicense, IssueManagementData, MavenArtifact, MavenArtifactMetadata, PomFile} +import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactDependency, ArtifactLicense, IssueManagementData, MavenArtifact, MavenArtifactMetadata, PomFile} import de.upb.cs.swt.delphi.crawler.tools.HttpDownloader import org.apache.maven.model.{Dependency, Model} import org.apache.maven.model.io.xpp3.MavenXpp3Reader @@ -131,7 +131,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog * @param identifier Maven identifier, as sometimes version / groupID is not part of POM file! * @return Set of MavenIdentifiers for each successfully parsed dependency */ - private def getDependencies(implicit pomContent: Model, identifier: MavenIdentifier): Set[MavenIdentifier] = { + private def getDependencies(implicit pomContent: Model, identifier: MavenIdentifier): Set[ArtifactDependency] = { // Always build the parent hierarchy exactly once lazy val parentHierarchy: List[Model] = buildParentHierarchy(pomContent) @@ -160,7 +160,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog */ private def resolveDependency(dependency: Dependency, parentHierarchy: => List[Model]) (implicit pomContent: Model, identifier: MavenIdentifier) - : Try[MavenIdentifier] = { + : Try[ArtifactDependency] = { lazy val parents = parentHierarchy Try { @@ -177,7 +177,9 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog resolveProperty(dependency.getVersion, "version", parents) } - MavenIdentifier(configuration.mavenRepoBase.toString, groupId, artifactId, version) + val scope = Option(dependency.getScope) + + ArtifactDependency(MavenIdentifier(configuration.mavenRepoBase.toString, groupId, artifactId, version), scope) } } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index 635de58..ce9b6bf 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -22,7 +22,7 @@ import akka.testkit.{ImplicitSender, TestKit} import akka.util.Timeout import de.upb.cs.swt.delphi.crawler.Configuration import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier -import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActor} +import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactDependency, MavenArtifact, MavenDownloadActor} import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike} import scala.concurrent.duration._ @@ -35,7 +35,7 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) with Matchers with BeforeAndAfterAll { - final val RepoUrl = "https://repo1.maven.org/maven2/" + final val RepoUrl = new Configuration().mavenRepoBase.toString private def readPomFileFor(identifier: MavenIdentifier): MavenArtifact = { val downloadActor = system.actorOf(MavenDownloadActor.props) @@ -84,11 +84,14 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) val dependencies = annotatedArtifact.metadata.get.dependencies - assertResult(23)(dependencies.size) - assertResult(8)(dependencies.count(_.version == "4.9.2")) - assert(dependencies.contains(MavenIdentifier(RepoUrl,"org.apache.bookkeeper", "circe-checksum", "4.9.2"))) - assert(dependencies.contains(MavenIdentifier(RepoUrl,"org.apache.kerby", "kerby-config", "1.1.1"))) - assert(dependencies.contains(MavenIdentifier(RepoUrl,"commons-codec", "commons-codec", "1.6"))) + assertResult(25)(dependencies.size) + assertResult(9)(dependencies.count(_.identifier.version == "4.9.2")) + // Version is local POM reference + assert(dependencies.contains(ArtifactDependency(MavenIdentifier(RepoUrl,"org.apache.bookkeeper", "circe-checksum", "4.9.2"), None))) + // Version in a variable which is defined in parent POM + assert(dependencies.contains(ArtifactDependency(MavenIdentifier(RepoUrl,"org.apache.kerby", "kerby-config", "1.1.1"), Some("test")))) + // Version is not defined in local POM, and must be derived from parent POM + assert(dependencies.contains(ArtifactDependency(MavenIdentifier(RepoUrl,"commons-codec", "commons-codec", "1.6"), None))) } } From 221ff7dde15837d3ba830ceea7c7d9a94a4e2e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Mon, 21 Sep 2020 14:25:08 +0200 Subject: [PATCH 17/24] Now extracting parent and packaging. Fixed some storage issues --- .../crawler/preprocessing/MavenArtifact.scala | 4 +++- .../crawler/processing/PomFileReadActor.scala | 7 ++++++- .../swt/delphi/crawler/storage/ElasticActor.scala | 5 +++++ .../crawler/storage/ElasticStoreQueries.scala | 14 +++++++++++++- 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index 3f16739..2709ebf 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -27,7 +27,9 @@ case class MavenArtifactMetadata(name: String, developers: List[String], licenses: List[ArtifactLicense], issueManagement: Option[IssueManagementData], - dependencies: Set[ArtifactDependency]) + dependencies: Set[ArtifactDependency], + parent:Option[MavenIdentifier], + packaging: String) case class IssueManagementData(system: String, url: String) case class ArtifactLicense(name: String, url:String) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index 7497c78..e8a54dd 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -54,6 +54,9 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog None } + val parent = Option(pom.getParent).map(p => MavenIdentifier(configuration.mavenRepoBase.toString, + p.getGroupId, p.getArtifactId, p.getVersion)) + val dependencies = getDependencies(pom, identifier) val metadata = MavenArtifactMetadata(pom.getName, @@ -61,7 +64,9 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog pom.getDevelopers.asScala.map(_.getId).toList, pom.getLicenses.asScala.map(l => ArtifactLicense(l.getName, l.getUrl)).toList, issueManagement, - dependencies) + dependencies, + parent, + pom.getPackaging) sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala index 7235f50..98e44b2 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala @@ -23,6 +23,7 @@ import de.upb.cs.swt.delphi.crawler.Identifier import de.upb.cs.swt.delphi.crawler.discovery.git.GitIdentifier import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.preprocessing.MavenArtifact import de.upb.cs.swt.delphi.crawler.processing.HermesResults /** @@ -47,6 +48,10 @@ class ElasticActor(client: ElasticClient) extends Actor with ActorLogging with A store(m) sender() ! Ack } + case a : MavenArtifact => { + store(a) + sender() ! Ack + } case g : GitIdentifier => { store(g) sender() ! Ack diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala index 1162181..91dc91e 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala @@ -64,7 +64,19 @@ trait ElasticStoreQueries { "issueManagement" -> metadata.issueManagement .map(management => Map("url" -> management.url, "system" -> management.system)).getOrElse("None"), "developers" -> metadata.developers.mkString(","), - "licenses" -> metadata.licenses.map(l => Map("name" -> l.name, "url" -> l.url)) + "licenses" -> metadata.licenses.map(l => Map("name" -> l.name, "url" -> l.url)), + "dependencies" -> metadata.dependencies.map(d => Map( + "groupId" -> d.identifier.groupId, + "artifactId" -> d.identifier.artifactId, + "version" -> d.identifier.version, + "scope" -> d.scope.getOrElse("default") + )), + "parent" -> metadata.parent.map(p => Map( + "groupId" -> p.groupId, + "artifactId" -> p.artifactId, + "version" -> p.version + )).getOrElse("None"), + "packaging" -> metadata.packaging ), "published" -> m.publicationDate.getOrElse("Unknown")) }.await) case None => From 1b0e71f3c2fbfcee4b85aa7de9f140f8c30a8984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Mon, 21 Sep 2020 15:49:32 +0200 Subject: [PATCH 18/24] Fixed a bug in actor communication. Code style improvements --- .../crawler/processing/PomFileReadActor.scala | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index e8a54dd..77eedaa 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -29,7 +29,7 @@ import scala.util.{Failure, Success, Try} /** * An Actor that receives MavenArtifacts and extracts metadata from its POM file. If successful, an - * MavenMetadata object is attached to the artifact and the artifact is returned. If failures occurr, + * MavenMetadata object is attached to the artifact and the artifact is returned. If failures occur, * the artifact is returned without metadata. * * @author Johannes Düsing @@ -47,15 +47,11 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog pomObject match { case Success(pom) => + val issueManagement = Option(pom.getIssueManagement) + .map(i => IssueManagementData(i.getSystem, i.getUrl)) - val issueManagement = if (pom.getIssueManagement != null) { - Some(IssueManagementData(pom.getIssueManagement.getSystem, pom.getIssueManagement.getUrl)) - } else { - None - } - - val parent = Option(pom.getParent).map(p => MavenIdentifier(configuration.mavenRepoBase.toString, - p.getGroupId, p.getArtifactId, p.getVersion)) + val parent = Option(pom.getParent) + .map(p => MavenIdentifier(configuration.mavenRepoBase.toString, p.getGroupId, p.getArtifactId, p.getVersion)) val dependencies = getDependencies(pom, identifier) @@ -68,7 +64,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog parent, pom.getPackaging) - sender() ! Success(MavenArtifact.withMetadata(artifact, metadata)) + sender() ! MavenArtifact.withMetadata(artifact, metadata) log.info(s"Successfully processed POM file for $identifier") From 9ae5e3c31a1dd484a2c311403e4bf0b6d819bf73 Mon Sep 17 00:00:00 2001 From: Johannes Duesing Date: Thu, 8 Oct 2020 16:19:26 +0200 Subject: [PATCH 19/24] Adapt tests to latest actor api change --- .../swt/delphi/crawler/processing/PomFileReadActorTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index ce9b6bf..2fd338a 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -55,8 +55,8 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) assert(artifact.publicationDate.isDefined && artifact.publicationDate.get != null) val result = Await.result(readerActor ? artifact, 10 seconds) - assert(result.isInstanceOf[Success[MavenArtifact]]) - result.asInstanceOf[Success[MavenArtifact]].get + assert(result.isInstanceOf[MavenArtifact]) + result.asInstanceOf[MavenArtifact] } "The POM file reader actor " must { From 06e1b748f75e5da7331acd8a241b1b681fdf7bc7 Mon Sep 17 00:00:00 2001 From: Johannes Duesing Date: Thu, 8 Oct 2020 17:26:48 +0200 Subject: [PATCH 20/24] Some restructuring to prepare persistent error storage --- .../maven/MavenDiscoveryProcess.scala | 14 ++-- .../crawler/preprocessing/MavenArtifact.scala | 2 +- .../preprocessing/MavenDownloadActor.scala | 83 +++++++++++-------- .../processing/OPALFunctionality.scala | 4 +- .../ProcessingFailureStorageActor.scala | 21 +++++ .../MavenDownloadActorTest.scala | 2 +- 6 files changed, 80 insertions(+), 46 deletions(-) create mode 100644 src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala index 01e2e27..cbcbb94 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala @@ -28,8 +28,8 @@ import de.upb.cs.swt.delphi.crawler.{AppLogging, Configuration} import de.upb.cs.swt.delphi.crawler.control.Phase import de.upb.cs.swt.delphi.crawler.control.Phase.Phase import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} -import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor} -import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults, PomFileReadActor} +import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor, MavenDownloadActorResponse} +import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults, PomFileReadActor, ProcessingFailureStorageActor} import de.upb.cs.swt.delphi.crawler.storage.ArtifactExistsQuery import de.upb.cs.swt.delphi.crawler.tools.NotYetImplementedException @@ -58,6 +58,7 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) val downloaderPool = system.actorOf(SmallestMailboxPool(8).props(MavenDownloadActor.props)) val pomReaderPool = system.actorOf(SmallestMailboxPool(8).props(PomFileReadActor.props(configuration))) + val errorHandlerPool = system.actorOf(SmallestMailboxPool(8).props(ProcessingFailureStorageActor.props)) val hermesPool = system.actorOf(SmallestMailboxPool(configuration.hermesActorPoolSize).props(HermesActor.props())) override def phase: Phase = Phase.Discovery @@ -87,13 +88,14 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) val preprocessing = filteredSource .alsoTo(createSinkFromActorRef[MavenIdentifier](elasticPool)) - .mapAsync(8)(identifier => (downloaderPool ? identifier).mapTo[Try[MavenArtifact]]) - .filter(artifact => artifact.isSuccess) - .map(artifact => artifact.get) + .mapAsync(8)(identifier => (downloaderPool ? identifier).mapTo[MavenDownloadActorResponse]) + .alsoTo(createSinkFromActorRef[MavenDownloadActorResponse](errorHandlerPool)) + .filter(!_.pomDownloadFailed) + // TODO: Adapt to new response model val finalizer = preprocessing - .mapAsync(8)(artifact => (pomReaderPool ? artifact).mapTo[MavenArtifact]) + .mapAsync(8)(artifact => (pomReaderPool ? artifact.artifact.get).mapTo[MavenArtifact]) .alsoTo(createSinkFromActorRef[MavenArtifact](elasticPool)) .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[Try[HermesResults]]) .filter(results => results.isSuccess) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index 2709ebf..d68be81 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -19,7 +19,7 @@ package de.upb.cs.swt.delphi.crawler.preprocessing import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier import org.joda.time.DateTime -case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile, +case class MavenArtifact(identifier : MavenIdentifier, jarFile: Option[JarFile], pomFile: PomFile, publicationDate: Option[DateTime], metadata: Option[MavenArtifactMetadata]) case class MavenArtifactMetadata(name: String, diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala index cf9aea9..b705489 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala @@ -26,52 +26,63 @@ import org.joda.time.format.DateTimeFormat import scala.util.{Failure, Success, Try} class MavenDownloadActor extends Actor with ActorLogging { + override def receive: Receive = { - case m : MavenIdentifier => { + case m : MavenIdentifier => implicit val system : ActorSystem = context.system val downloader = new HttpDownloader - val jarStream = downloader.downloadFromUri(m.toJarLocation.toString()) - val pomResponse = downloader.downloadFromUriWithHeaders(m.toPomLocation.toString()) - - jarStream match { - case Success(jar) => { - pomResponse match { - case Success((pomStream, pomHeaders)) => { - log.info(s"Downloaded $m") - - // Extract and parse publication date from header - val datePattern = DateTimeFormat.forPattern("E, dd MMM yyyy HH:mm:ss zzz").withLocale(Locale.ENGLISH) - val pomPublicationDate = pomHeaders.find( _.lowercaseName().equals("last-modified") ) - .map( header => Try(datePattern.parseDateTime(header.value())) ) match { - case Some(Success(date)) => Some(date) - case Some(Failure(x)) => x.printStackTrace(); None - case _ => None - } - - sender() ! Success(MavenArtifact(m, JarFile(jar, m.toJarLocation.toURL), PomFile(pomStream), - pomPublicationDate, None)) - } - case Failure(e) => { - // TODO: push error to actor - log.warning(s"Failed pom download for $m") - sender() ! Failure(e) - } + val pomResponse = downloader.downloadFromUriWithHeaders(m.toPomLocation.toString) + + pomResponse match { + case Success((pomStream, pomHeaders)) => + log.info(s"Downloaded $m") + + // Extract and parse publication date from header + val datePattern = DateTimeFormat.forPattern("E, dd MMM yyyy HH:mm:ss zzz").withLocale(Locale.ENGLISH) + val pomPublicationDate = pomHeaders.find( _.lowercaseName().equals("last-modified") ) + .map( header => Try(datePattern.parseDateTime(header.value())) ) match { + case Some(Success(date)) => Some(date) + case Some(Failure(x)) => + log.warning(s"Failed to extract publication date for $m: ${x.getMessage}") + None + case _ => None } - } - case Failure(e) => { - // TODO: push error to actor - log.warning(s"Failed jar download for $m") - sender() ! Failure(e) - } - } + downloader.downloadFromUri(m.toJarLocation.toString) match { + case Success(jar) => + sender() ! MavenDownloadActorResponse( + m, + Some(MavenArtifact(m, Some(JarFile(jar, m.toJarLocation.toURL)), PomFile(pomStream), pomPublicationDate, None)), + dateParsingFailed = pomPublicationDate.isEmpty) + case Failure(ex) => + log.warning(s"Failed to download jar file for $m") + sender() ! MavenDownloadActorResponse( + m, + Some(MavenArtifact(m, None, PomFile(pomStream), pomPublicationDate, None)), + jarDownloadFailed = true, + dateParsingFailed = pomPublicationDate.isEmpty, + errorMessage = ex.getMessage + ) + } + + case Failure(ex) => + log.error(s"Failed to download pom file for $m with message: ${ex.getMessage}") + sender() ! MavenDownloadActorResponse(m, None, pomDownloadFailed = true, errorMessage = ex.getMessage) + } - } } } + +case class MavenDownloadActorResponse(identifier: MavenIdentifier, + artifact: Option[MavenArtifact], + pomDownloadFailed: Boolean = false, + jarDownloadFailed: Boolean = false, + dateParsingFailed: Boolean = false, + errorMessage: String = "") + object MavenDownloadActor { - def props = Props(new MavenDownloadActor) + def props: Props = Props(new MavenDownloadActor) } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/OPALFunctionality.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/OPALFunctionality.scala index 23b8502..9b105d6 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/OPALFunctionality.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/OPALFunctionality.scala @@ -28,8 +28,8 @@ trait OPALFunctionality { def reifyProject(m: MavenArtifact): Project[URL] = { val project = new ClassStreamReader {}.createProject(m.identifier.toJarLocation.toURL, - new JarInputStream(m.jarFile.is)) - Try(m.jarFile.is.close()) + new JarInputStream(m.jarFile.get.is)) + Try(m.jarFile.get.is.close()) project } } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala new file mode 100644 index 0000000..9e9306d --- /dev/null +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala @@ -0,0 +1,21 @@ +package de.upb.cs.swt.delphi.crawler.processing + +import akka.actor.{Actor, ActorLogging, Props} +import de.upb.cs.swt.delphi.crawler.preprocessing.MavenDownloadActorResponse +import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.Ack + +class ProcessingFailureStorageActor extends Actor with ActorLogging{ + override def receive: Receive = { + case response@MavenDownloadActorResponse(identifier, None,true, _, _, errorMessage) => + log.info(s"Processing failed pom download for $identifier, message: $errorMessage") + sender() ! Ack + + case response@MavenDownloadActorResponse(_, Some(_), false, false, false, _) => + // This is the "all good" case, no need to do anything + sender() ! Ack + } +} + +object ProcessingFailureStorageActor { + def props: Props = Props(new ProcessingFailureStorageActor) +} diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala index f8cd7c1..5c2eb9f 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala @@ -56,7 +56,7 @@ class MavenDownloadActorTest extends TestKit(ActorSystem("DownloadActor")) assert(msg.isInstanceOf[Success[MavenArtifact]]) val artifact = msg.asInstanceOf[Success[MavenArtifact]].get - checkJar(artifact.jarFile.is) + checkJar(artifact.jarFile.get.is) checkPom(artifact.pomFile.is) assert(artifact.metadata.isEmpty) From 6d3eafa53f3d2de261a0e7acc477acf710a20d92 Mon Sep 17 00:00:00 2001 From: Johannes Duesing Date: Tue, 13 Oct 2020 14:37:40 +0200 Subject: [PATCH 21/24] First version of full error redirecting, not yet stored anywhere --- .../maven/MavenDiscoveryProcess.scala | 16 +++--- .../crawler/processing/HermesActor.scala | 2 +- .../crawler/processing/PomFileReadActor.scala | 19 +++++-- .../ProcessingFailureStorageActor.scala | 53 ++++++++++++++++++- .../delphi/crawler/storage/ElasticActor.scala | 6 +-- .../delphi/crawler/tools/HttpException.scala | 2 + 6 files changed, 81 insertions(+), 17 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala index cbcbb94..72c699d 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala @@ -29,7 +29,7 @@ import de.upb.cs.swt.delphi.crawler.control.Phase import de.upb.cs.swt.delphi.crawler.control.Phase.Phase import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor, MavenDownloadActorResponse} -import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults, PomFileReadActor, ProcessingFailureStorageActor} +import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults, PomFileReadActor, PomFileReadActorResponse, ProcessingFailureStorageActor} import de.upb.cs.swt.delphi.crawler.storage.ArtifactExistsQuery import de.upb.cs.swt.delphi.crawler.tools.NotYetImplementedException @@ -95,11 +95,15 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) // TODO: Adapt to new response model val finalizer = preprocessing - .mapAsync(8)(artifact => (pomReaderPool ? artifact.artifact.get).mapTo[MavenArtifact]) - .alsoTo(createSinkFromActorRef[MavenArtifact](elasticPool)) - .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[Try[HermesResults]]) - .filter(results => results.isSuccess) - .map(results => results.get) + .mapAsync(8)(downloadResponse=> (pomReaderPool ? downloadResponse).mapTo[PomFileReadActorResponse]) + .alsoTo(createSinkFromActorRef[PomFileReadActorResponse](errorHandlerPool)) + .alsoTo(createSinkFromActorRef[PomFileReadActorResponse](elasticPool)) + .filter(response => !response.jarDownloadFailed) + .map(_.artifact) + .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[(MavenIdentifier, Try[HermesResults])]) + .alsoTo(createSinkFromActorRef[(MavenIdentifier, Try[HermesResults])](errorHandlerPool)) + .filter(result => result._2.isSuccess) + .map(result => result._2.get) .alsoTo(createSinkFromActorRef[HermesResults](elasticPool)) .to(Sink.ignore) .run() diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala index 6e8cad3..e0fb569 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala @@ -35,7 +35,7 @@ class HermesActor() extends Actor with ActorLogging with OPALFunctionality with computeHermesResult(m, reifyProject(m)) } - sender() ! hermesResult + sender() ! (m.identifier, hermesResult) } } } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index 77eedaa..bb18871 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -19,7 +19,7 @@ package de.upb.cs.swt.delphi.crawler.processing import akka.actor.{Actor, ActorLogging, ActorSystem, Props} import de.upb.cs.swt.delphi.crawler.Configuration import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier -import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactDependency, ArtifactLicense, IssueManagementData, MavenArtifact, MavenArtifactMetadata, PomFile} +import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactDependency, ArtifactLicense, IssueManagementData, MavenArtifact, MavenArtifactMetadata, MavenDownloadActorResponse, PomFile} import de.upb.cs.swt.delphi.crawler.tools.HttpDownloader import org.apache.maven.model.{Dependency, Model} import org.apache.maven.model.io.xpp3.MavenXpp3Reader @@ -40,7 +40,10 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog implicit val system : ActorSystem = context.system override def receive: Receive = { - case artifact@MavenArtifact(identifier, _ ,PomFile(pomStream), _, _) => + + case MavenDownloadActorResponse(identifier, Some(artifact),_,jarDownloadFailed,_,_) => + + val pomStream = artifact.pomFile.is val pomObject = Try(pomReader.read(pomStream)) pomStream.close() @@ -64,14 +67,15 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog parent, pom.getPackaging) - sender() ! MavenArtifact.withMetadata(artifact, metadata) + sender() ! PomFileReadActorResponse(MavenArtifact.withMetadata(artifact, metadata), + jarDownloadFailed, pomParsingFailed = false, "") log.info(s"Successfully processed POM file for $identifier") case Failure(ex) => - log.error(s"Failed to parse POM file for artifact $identifier",ex ) + log.error(s"Failed to parse POM file for artifact $identifier",ex) // Best effort semantics: If parsing fails, artifact is returned without metadata - sender() ! artifact + sender() ! PomFileReadActorResponse(artifact, jarDownloadFailed, pomParsingFailed = true, ex.getMessage) } } @@ -300,6 +304,11 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog } } +case class PomFileReadActorResponse(artifact: MavenArtifact, + jarDownloadFailed: Boolean, + pomParsingFailed: Boolean, + errorMessage: String) + object PomFileReadActor { def props(configuration: Configuration):Props = Props(new PomFileReadActor(configuration)) } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala index 9e9306d..bfae880 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala @@ -1,17 +1,66 @@ package de.upb.cs.swt.delphi.crawler.processing import akka.actor.{Actor, ActorLogging, Props} -import de.upb.cs.swt.delphi.crawler.preprocessing.MavenDownloadActorResponse -import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.Ack +import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActorResponse} +import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} + +import scala.util.{Failure, Success} class ProcessingFailureStorageActor extends Actor with ActorLogging{ override def receive: Receive = { + + case StreamInitialized => + log.info(s"Stream initialized!") + sender() ! Ack + case StreamCompleted => + log.info(s"Stream completed!") + case StreamFailure(ex) => + log.error(ex, s"Stream failed!") + case response@MavenDownloadActorResponse(identifier, None,true, _, _, errorMessage) => + // POM Download failed, this is always an error log.info(s"Processing failed pom download for $identifier, message: $errorMessage") sender() ! Ack + case response@MavenDownloadActorResponse(identifier, _, false, false, true, errorMessage) => + // Publish date parsing failed, does not hinder further processing + log.info(s"Processing failed publish date extraction for $identifier, message: $errorMessage") + sender() ! Ack + + case response@PomFileReadActorResponse(MavenArtifact(identifier,_,_,_,Some(meta)),true, _, _) + if meta.packaging.equalsIgnoreCase("jar") => + // JAR Download failed although POM file said jar should exist, this is an error + log.info(s"Processing failed jar download for $identifier") + sender() ! Ack + + case response@PomFileReadActorResponse(MavenArtifact(identifier,_,_,_,_), _, true, errorMessage) => + // POM parsing failed + log.info(s"Processing failed pom processing for $identifier, message: $errorMessage") + sender() ! Ack + + case (identifier: MavenIdentifier, Failure(ex)) => + // Hermes processing failed + log.info(s"Processing failed Hermes analysis for $identifier, message: ${ex.getMessage}") + sender() ! Ack + case response@MavenDownloadActorResponse(_, Some(_), false, false, false, _) => // This is the "all good" case, no need to do anything + log.info("Got an all good response with no errors from MavenDownloadActor!") + sender() ! Ack + + case response@PomFileReadActorResponse(_, false, false, _) => + // This is the "all good" case, no need to do anything + log.info("Got an all good response with no errors from PomFileReadActor!") + sender() ! Ack + + case (_, Success(_)) => + // This is the "all good" case, no need to do anything + log.info(s"Got an all good response with no errors from HermesActor!") + sender() ! Ack + + case response@_ => + log.info(s"Got unexpected response format: $response") sender() ! Ack } } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala index 98e44b2..321def1 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala @@ -24,7 +24,7 @@ import de.upb.cs.swt.delphi.crawler.discovery.git.GitIdentifier import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier import de.upb.cs.swt.delphi.crawler.preprocessing.MavenArtifact -import de.upb.cs.swt.delphi.crawler.processing.HermesResults +import de.upb.cs.swt.delphi.crawler.processing.{HermesResults, PomFileReadActorResponse} /** * An actor reacting to item which should be pushed to elasticsearch @@ -48,8 +48,8 @@ class ElasticActor(client: ElasticClient) extends Actor with ActorLogging with A store(m) sender() ! Ack } - case a : MavenArtifact => { - store(a) + case PomFileReadActorResponse(artifact,_,false,_) => { + store(artifact) sender() ! Ack } case g : GitIdentifier => { diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpException.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpException.scala index 5ca31b9..b575867 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpException.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpException.scala @@ -4,4 +4,6 @@ import akka.http.scaladsl.model.StatusCode class HttpException(code: StatusCode) extends Throwable { + override def getMessage: String = s"Got an unexpected HTTP response, code $code." + } From b160740d30e18358b99ed37d3ebbe59ced97f5a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Mon, 19 Oct 2020 14:27:46 +0200 Subject: [PATCH 22/24] First working version that stores errors in elastic using a new type 'error' in the 'delphi' index --- .../maven/MavenDiscoveryProcess.scala | 15 ++--- .../maven/MavenProcessingError.scala | 49 ++++++++++++++ .../crawler/processing/HermesActor.scala | 6 +- .../crawler/processing/PomFileReadActor.scala | 4 +- .../ProcessingFailureStorageActor.scala | 67 ++++++++++++------- .../delphi/crawler/storage/ElasticActor.scala | 6 +- .../storage/ElasticIndexMaintenance.scala | 11 +-- .../crawler/storage/ElasticStoreQueries.scala | 14 +++- .../swt/delphi/crawler/storage/package.scala | 2 + 9 files changed, 130 insertions(+), 44 deletions(-) create mode 100644 src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenProcessingError.scala diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala index 72c699d..0e4e59b 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala @@ -29,7 +29,7 @@ import de.upb.cs.swt.delphi.crawler.control.Phase import de.upb.cs.swt.delphi.crawler.control.Phase.Phase import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor, MavenDownloadActorResponse} -import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults, PomFileReadActor, PomFileReadActorResponse, ProcessingFailureStorageActor} +import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesActorResponse, HermesResults, PomFileReadActor, PomFileReadActorResponse, ProcessingFailureStorageActor} import de.upb.cs.swt.delphi.crawler.storage.ArtifactExistsQuery import de.upb.cs.swt.delphi.crawler.tools.NotYetImplementedException @@ -58,7 +58,7 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) val downloaderPool = system.actorOf(SmallestMailboxPool(8).props(MavenDownloadActor.props)) val pomReaderPool = system.actorOf(SmallestMailboxPool(8).props(PomFileReadActor.props(configuration))) - val errorHandlerPool = system.actorOf(SmallestMailboxPool(8).props(ProcessingFailureStorageActor.props)) + val errorHandlerPool = system.actorOf(SmallestMailboxPool(8).props(ProcessingFailureStorageActor.props(elasticPool))) val hermesPool = system.actorOf(SmallestMailboxPool(configuration.hermesActorPoolSize).props(HermesActor.props())) override def phase: Phase = Phase.Discovery @@ -92,18 +92,17 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) .alsoTo(createSinkFromActorRef[MavenDownloadActorResponse](errorHandlerPool)) .filter(!_.pomDownloadFailed) - // TODO: Adapt to new response model val finalizer = preprocessing - .mapAsync(8)(downloadResponse=> (pomReaderPool ? downloadResponse).mapTo[PomFileReadActorResponse]) + .mapAsync(8)(downloadResponse => (pomReaderPool ? downloadResponse).mapTo[PomFileReadActorResponse]) .alsoTo(createSinkFromActorRef[PomFileReadActorResponse](errorHandlerPool)) .alsoTo(createSinkFromActorRef[PomFileReadActorResponse](elasticPool)) .filter(response => !response.jarDownloadFailed) .map(_.artifact) - .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[(MavenIdentifier, Try[HermesResults])]) - .alsoTo(createSinkFromActorRef[(MavenIdentifier, Try[HermesResults])](errorHandlerPool)) - .filter(result => result._2.isSuccess) - .map(result => result._2.get) + .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[HermesActorResponse]) + .alsoTo(createSinkFromActorRef[HermesActorResponse](errorHandlerPool)) + .filter(_.result.isSuccess) + .map(_.result.get) .alsoTo(createSinkFromActorRef[HermesResults](elasticPool)) .to(Sink.ignore) .run() diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenProcessingError.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenProcessingError.scala new file mode 100644 index 0000000..50f4d79 --- /dev/null +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenProcessingError.scala @@ -0,0 +1,49 @@ +// Copyright (C) 2018 The Delphi Team. +// See the LICENCE file distributed with this work for additional +// information regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de.upb.cs.swt.delphi.crawler.discovery.maven + +import org.joda.time.DateTime + +case class MavenProcessingError(identifier: MavenIdentifier, + occurredAt: DateTime, + errorType: MavenErrorType.Value, + message: String) + +object MavenErrorType extends Enumeration { + type MavenErrorType = Value + + val PomDownloadFailed, JarDownloadFailed, PomParsingFailed, HermesProcessingFailed = Value +} + + +object MavenProcessingError { + + private def createError(identifier: MavenIdentifier, errorType: MavenErrorType.Value, message: String) = + MavenProcessingError(identifier, DateTime.now(), errorType, message) + + def createPomDownloadError(identifier: MavenIdentifier, message: String): MavenProcessingError = + createError(identifier, MavenErrorType.PomDownloadFailed, message) + + def createJarDownloadError(identifier: MavenIdentifier, message: String): MavenProcessingError = + createError(identifier, MavenErrorType.JarDownloadFailed, message) + + def createPomParsingError(identifier: MavenIdentifier, message: String): MavenProcessingError = + createError(identifier, MavenErrorType.PomParsingFailed, message) + + def createHermesProcessingError(identifier: MavenIdentifier, message: String): MavenProcessingError = + createError(identifier, MavenErrorType.HermesProcessingFailed, message) +} \ No newline at end of file diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala index e0fb569..ae683f0 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala @@ -35,7 +35,7 @@ class HermesActor() extends Actor with ActorLogging with OPALFunctionality with computeHermesResult(m, reifyProject(m)) } - sender() ! (m.identifier, hermesResult) + sender() ! HermesActorResponse(m.identifier, hermesResult) } } } @@ -46,4 +46,6 @@ object HermesActor { } -case class HermesResults(identifier: MavenIdentifier, featureMap: Map[String, Int]) \ No newline at end of file +case class HermesResults(identifier: MavenIdentifier, featureMap: Map[String, Int]) + +case class HermesActorResponse(identifier: MavenIdentifier, result: Try[HermesResults]) \ No newline at end of file diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala index bb18871..83f9508 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -41,7 +41,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog override def receive: Receive = { - case MavenDownloadActorResponse(identifier, Some(artifact),_,jarDownloadFailed,_,_) => + case MavenDownloadActorResponse(identifier, Some(artifact),_,jarDownloadFailed,_,errorMessage) => val pomStream = artifact.pomFile.is @@ -68,7 +68,7 @@ class PomFileReadActor(configuration: Configuration) extends Actor with ActorLog pom.getPackaging) sender() ! PomFileReadActorResponse(MavenArtifact.withMetadata(artifact, metadata), - jarDownloadFailed, pomParsingFailed = false, "") + jarDownloadFailed, pomParsingFailed = false, if(jarDownloadFailed) errorMessage else "") log.info(s"Successfully processed POM file for $identifier") diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala index bfae880..cf11c4a 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala @@ -1,70 +1,85 @@ +// Copyright (C) 2018 The Delphi Team. +// See the LICENCE file distributed with this work for additional +// information regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package de.upb.cs.swt.delphi.crawler.processing -import akka.actor.{Actor, ActorLogging, Props} -import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import akka.actor.{Actor, ActorLogging, ActorRef, Props} +import de.upb.cs.swt.delphi.crawler.discovery.maven.{MavenIdentifier, MavenProcessingError} import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActorResponse} import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} -import scala.util.{Failure, Success} +import scala.util.Failure + +class ProcessingFailureStorageActor(elasticPool: ActorRef) extends Actor with ActorLogging { -class ProcessingFailureStorageActor extends Actor with ActorLogging{ override def receive: Receive = { case StreamInitialized => log.info(s"Stream initialized!") sender() ! Ack + case StreamCompleted => log.info(s"Stream completed!") + case StreamFailure(ex) => log.error(ex, s"Stream failed!") - case response@MavenDownloadActorResponse(identifier, None,true, _, _, errorMessage) => + case MavenDownloadActorResponse(identifier, None,true, _, _, errorMessage) => // POM Download failed, this is always an error log.info(s"Processing failed pom download for $identifier, message: $errorMessage") + storeError(MavenProcessingError.createPomDownloadError(identifier, errorMessage)) sender() ! Ack - case response@MavenDownloadActorResponse(identifier, _, false, false, true, errorMessage) => + case MavenDownloadActorResponse(identifier, _, false, false, true, errorMessage) => // Publish date parsing failed, does not hinder further processing log.info(s"Processing failed publish date extraction for $identifier, message: $errorMessage") sender() ! Ack - case response@PomFileReadActorResponse(MavenArtifact(identifier,_,_,_,Some(meta)),true, _, _) + case PomFileReadActorResponse(MavenArtifact(identifier,_,_,_,Some(meta)),true, _, errorMessage) if meta.packaging.equalsIgnoreCase("jar") => // JAR Download failed although POM file said jar should exist, this is an error log.info(s"Processing failed jar download for $identifier") + storeError(MavenProcessingError.createJarDownloadError(identifier, errorMessage)) sender() ! Ack - case response@PomFileReadActorResponse(MavenArtifact(identifier,_,_,_,_), _, true, errorMessage) => + case PomFileReadActorResponse(MavenArtifact(identifier,_,_,_,_), _, true, errorMessage) => // POM parsing failed log.info(s"Processing failed pom processing for $identifier, message: $errorMessage") + storeError(MavenProcessingError.createPomParsingError(identifier, errorMessage)) sender() ! Ack - case (identifier: MavenIdentifier, Failure(ex)) => + case HermesActorResponse(identifier: MavenIdentifier, Failure(ex)) => // Hermes processing failed log.info(s"Processing failed Hermes analysis for $identifier, message: ${ex.getMessage}") + storeError(MavenProcessingError.createHermesProcessingError(identifier, ex.getMessage)) sender() ! Ack - case response@MavenDownloadActorResponse(_, Some(_), false, false, false, _) => - // This is the "all good" case, no need to do anything - log.info("Got an all good response with no errors from MavenDownloadActor!") - sender() ! Ack - - case response@PomFileReadActorResponse(_, false, false, _) => - // This is the "all good" case, no need to do anything - log.info("Got an all good response with no errors from PomFileReadActor!") - sender() ! Ack - - case (_, Success(_)) => - // This is the "all good" case, no need to do anything - log.info(s"Got an all good response with no errors from HermesActor!") + case response + if response.isInstanceOf[MavenDownloadActorResponse] || response.isInstanceOf[PomFileReadActorResponse] || + response.isInstanceOf[HermesActorResponse] => sender() ! Ack - case response@_ => - log.info(s"Got unexpected response format: $response") - sender() ! Ack + case msg@_ => + log.error(s"Invalid message format: $msg") + sender() ! StreamFailure } + + private def storeError(error: MavenProcessingError): Unit = elasticPool forward error } object ProcessingFailureStorageActor { - def props: Props = Props(new ProcessingFailureStorageActor) + def props(elasticPool: ActorRef): Props = Props(new ProcessingFailureStorageActor(elasticPool)) } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala index 321def1..554a0a0 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala @@ -22,7 +22,7 @@ import com.sksamuel.elastic4s.http.ElasticClient import de.upb.cs.swt.delphi.crawler.Identifier import de.upb.cs.swt.delphi.crawler.discovery.git.GitIdentifier import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} -import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.discovery.maven.{MavenIdentifier, MavenProcessingError} import de.upb.cs.swt.delphi.crawler.preprocessing.MavenArtifact import de.upb.cs.swt.delphi.crawler.processing.{HermesResults, PomFileReadActorResponse} @@ -60,6 +60,10 @@ class ElasticActor(client: ElasticClient) extends Actor with ActorLogging with A store(h) sender() ! Ack } + case e : MavenProcessingError => { + store(e) + sender() ! Ack + } case x => log.warning("Received unknown message: [{}] ", x) } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala index 84c8874..aec0141 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala @@ -59,11 +59,14 @@ trait ElasticIndexMaintenance extends AppLogging { objectField("identifier") fields identifierFields, textField("methods") analyzer KeywordAnalyzer ), - objectField("features") fields featureList - ) - ) - + ), + mapping(processingError) as ( + keywordField("type"), + keywordField("message"), + dateField("occurred"), + keywordField("identifier") + )) }.await //Increases maximum number of nested fields diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala index 91dc91e..588549f 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala @@ -22,7 +22,7 @@ import com.sksamuel.elastic4s.http.index.IndexResponse import com.sksamuel.elastic4s.http.update.UpdateResponse import com.sksamuel.elastic4s.http.{ElasticClient, Response} import de.upb.cs.swt.delphi.crawler.discovery.git.GitIdentifier -import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.discovery.maven.{MavenIdentifier, MavenProcessingError} import de.upb.cs.swt.delphi.crawler.preprocessing.MavenArtifact import de.upb.cs.swt.delphi.crawler.processing.{HermesAnalyzer, HermesResults} import org.joda.time.DateTime @@ -91,6 +91,18 @@ trait ElasticStoreQueries { } } + def store(error: MavenProcessingError)(implicit client: ElasticClient, log: LoggingAdapter): Response[IndexResponse]= { + log.info(s"Pushing new error to elastic regarding identifier ${error.identifier}") + client.execute { + indexInto(delphiProcessingErrorType).id(error.occurredAt.getMillis.toString).fields( + "identifier" -> error.identifier.toUniqueString, + "occurred" -> error.occurredAt, + "message" -> error.message, + "type" -> error.errorType.toString + ) + }.await + } + def store(g: GitIdentifier)(implicit client: ElasticClient, log: LoggingAdapter): Response[IndexResponse] = { log.info("Pushing new git identifier to elastic: [{}]", g) client.execute { diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/package.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/package.scala index 1979e48..a13f167 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/package.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/package.scala @@ -21,5 +21,7 @@ import com.sksamuel.elastic4s.IndexAndType package object storage { val delphi = "delphi" val project = "project" + val processingError = "error" val delphiProjectType: IndexAndType = IndexAndType(delphi,project) + val delphiProcessingErrorType: IndexAndType = IndexAndType(delphi, processingError) } From 2078bd7e9052156bca91f0fa80020edb8f8a17d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Mon, 19 Oct 2020 15:18:04 +0200 Subject: [PATCH 23/24] Adapted tests to last change in actor APIs --- .../MavenDownloadActorTest.scala | 18 +++++----- .../processing/PomFileReadActorTest.scala | 36 ++++++++++++------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala index 5c2eb9f..1ae1385 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala @@ -27,7 +27,6 @@ import scala.concurrent.duration._ import de.upb.cs.swt.delphi.crawler.preprocessing.Common._ import scala.concurrent.Await -import scala.util.{Success, Try} /** * @author Hariharan. @@ -38,24 +37,27 @@ class MavenDownloadActorTest extends TestKit(ActorSystem("DownloadActor")) with WordSpecLike with Matchers with BeforeAndAfterAll { - override def afterAll { - TestKit.shutdownActorSystem(system) - } "The maven download actor" must { "create a maven artifact with a jar and pom file" in { val mavenIdentifier = new MavenIdentifier("https://repo1.maven.org/maven2/", "junit", "junit", "4.12") val downloadActor = system.actorOf(MavenDownloadActor.props) - implicit val timeout = Timeout(10 seconds) - implicit val ec = system.dispatcher + implicit val timeout: Timeout = Timeout(10 seconds) val f = downloadActor ? mavenIdentifier val msg = Await.result(f, 10 seconds) - assert(msg.isInstanceOf[Success[MavenArtifact]]) - val artifact = msg.asInstanceOf[Success[MavenArtifact]].get + assert(msg.isInstanceOf[MavenDownloadActorResponse]) + val response = msg.asInstanceOf[MavenDownloadActorResponse] + + assert(!response.pomDownloadFailed) + assert(!response.dateParsingFailed) + assert(!response.jarDownloadFailed) + assert(response.artifact.isDefined) + + val artifact = response.artifact.get checkJar(artifact.jarFile.get.is) checkPom(artifact.pomFile.is) diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala index 2fd338a..4b69ec0 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -22,22 +22,20 @@ import akka.testkit.{ImplicitSender, TestKit} import akka.util.Timeout import de.upb.cs.swt.delphi.crawler.Configuration import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier -import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactDependency, MavenArtifact, MavenDownloadActor} -import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike} +import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactDependency, MavenDownloadActor, MavenDownloadActorResponse} +import org.scalatest.{Matchers, WordSpecLike} import scala.concurrent.duration._ import scala.concurrent.{Await, ExecutionContext} -import scala.util.Success class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) with ImplicitSender with WordSpecLike - with Matchers - with BeforeAndAfterAll { + with Matchers { final val RepoUrl = new Configuration().mavenRepoBase.toString - private def readPomFileFor(identifier: MavenIdentifier): MavenArtifact = { + private def readPomFileFor(identifier: MavenIdentifier): PomFileReadActorResponse = { val downloadActor = system.actorOf(MavenDownloadActor.props) val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) @@ -48,20 +46,29 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) val msg = Await.result(f, 10 seconds) - assert(msg.isInstanceOf[Success[MavenArtifact]]) - val artifact = msg.asInstanceOf[Success[MavenArtifact]].get + assert(msg.isInstanceOf[MavenDownloadActorResponse]) + + val response = msg.asInstanceOf[MavenDownloadActorResponse] + + assert(!response.pomDownloadFailed && !response.jarDownloadFailed && + !response.dateParsingFailed && response.artifact.isDefined) + + val artifact = response.artifact.get assert(artifact.metadata.isEmpty) assert(artifact.publicationDate.isDefined && artifact.publicationDate.get != null) - val result = Await.result(readerActor ? artifact, 10 seconds) - assert(result.isInstanceOf[MavenArtifact]) - result.asInstanceOf[MavenArtifact] + val result = Await.result(readerActor ? response, 10 seconds) + assert(result.isInstanceOf[PomFileReadActorResponse]) + result.asInstanceOf[PomFileReadActorResponse] } "The POM file reader actor " must { "create a maven artifact with valid metadata" in { - val annotatedArtifact = readPomFileFor(MavenIdentifier(RepoUrl, "junit", "junit", "4.12")) + val readActorResponse = readPomFileFor(MavenIdentifier(RepoUrl, "junit", "junit", "4.12")) + assert(!readActorResponse.pomParsingFailed) + + val annotatedArtifact = readActorResponse.artifact assert(annotatedArtifact.metadata.isDefined) val metadata = annotatedArtifact.metadata.get @@ -80,7 +87,10 @@ class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) } "process dependencies as expected" in { - val annotatedArtifact = readPomFileFor(MavenIdentifier(RepoUrl, "org.apache.bookkeeper", "bookkeeper-server", "4.9.2")) + val readActorResponse = readPomFileFor(MavenIdentifier(RepoUrl, "org.apache.bookkeeper", "bookkeeper-server", "4.9.2")) + assert(!readActorResponse.pomParsingFailed) + + val annotatedArtifact = readActorResponse.artifact val dependencies = annotatedArtifact.metadata.get.dependencies From 73560af7a0854b12cd638e857d6ad8a2d49f692f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20D=C3=BCsing?= Date: Mon, 19 Oct 2020 15:53:13 +0200 Subject: [PATCH 24/24] Fixed bug in elastic data model regarding error storage --- .../delphi/crawler/storage/ElasticIndexMaintenance.scala | 2 +- .../swt/delphi/crawler/storage/ElasticStoreQueries.scala | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala index aec0141..808f374 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala @@ -65,7 +65,7 @@ trait ElasticIndexMaintenance extends AppLogging { keywordField("type"), keywordField("message"), dateField("occurred"), - keywordField("identifier") + objectField("identifier") fields identifierFields )) }.await diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala index 588549f..28cd05e 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala @@ -95,7 +95,10 @@ trait ElasticStoreQueries { log.info(s"Pushing new error to elastic regarding identifier ${error.identifier}") client.execute { indexInto(delphiProcessingErrorType).id(error.occurredAt.getMillis.toString).fields( - "identifier" -> error.identifier.toUniqueString, + "identifier" -> Map( + "groupId" -> error.identifier.groupId, + "artifactId" -> error.identifier.artifactId, + "version" -> error.identifier.version), "occurred" -> error.occurredAt, "message" -> error.message, "type" -> error.errorType.toString @@ -115,7 +118,7 @@ trait ElasticStoreQueries { } def store(m: MavenIdentifier)(implicit client: ElasticClient, log: LoggingAdapter): Response[IndexResponse] = { - log.info("Pushing new maven identifier to elastic: [{}]", m) + log.info("Pushing new maven identifier to elastic: [{}]", m.toUniqueString) client.execute { indexInto(delphiProjectType).id(m.toUniqueString) .fields("name" -> m.toUniqueString,