diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala index 14c3825..0e4e59b 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenDiscoveryProcess.scala @@ -28,8 +28,8 @@ import de.upb.cs.swt.delphi.crawler.{AppLogging, Configuration} import de.upb.cs.swt.delphi.crawler.control.Phase import de.upb.cs.swt.delphi.crawler.control.Phase.Phase import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} -import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActor} -import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults} +import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor, MavenDownloadActorResponse} +import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesActorResponse, HermesResults, PomFileReadActor, PomFileReadActorResponse, ProcessingFailureStorageActor} import de.upb.cs.swt.delphi.crawler.storage.ArtifactExistsQuery import de.upb.cs.swt.delphi.crawler.tools.NotYetImplementedException @@ -57,6 +57,8 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) private val seen = mutable.HashSet[MavenIdentifier]() val downloaderPool = system.actorOf(SmallestMailboxPool(8).props(MavenDownloadActor.props)) + val pomReaderPool = system.actorOf(SmallestMailboxPool(8).props(PomFileReadActor.props(configuration))) + val errorHandlerPool = system.actorOf(SmallestMailboxPool(8).props(ProcessingFailureStorageActor.props(elasticPool))) val hermesPool = system.actorOf(SmallestMailboxPool(configuration.hermesActorPoolSize).props(HermesActor.props())) override def phase: Phase = Phase.Discovery @@ -86,15 +88,21 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef) val preprocessing = filteredSource .alsoTo(createSinkFromActorRef[MavenIdentifier](elasticPool)) - .mapAsync(8)(identifier => (downloaderPool ? identifier).mapTo[Try[MavenArtifact]]) - .filter(artifact => artifact.isSuccess) - .map(artifact => artifact.get) + .mapAsync(8)(identifier => (downloaderPool ? identifier).mapTo[MavenDownloadActorResponse]) + .alsoTo(createSinkFromActorRef[MavenDownloadActorResponse](errorHandlerPool)) + .filter(!_.pomDownloadFailed) val finalizer = preprocessing - .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[Try[HermesResults]]) - .filter(results => results.isSuccess) - .map(results => results.get) + .mapAsync(8)(downloadResponse => (pomReaderPool ? downloadResponse).mapTo[PomFileReadActorResponse]) + .alsoTo(createSinkFromActorRef[PomFileReadActorResponse](errorHandlerPool)) + .alsoTo(createSinkFromActorRef[PomFileReadActorResponse](elasticPool)) + .filter(response => !response.jarDownloadFailed) + .map(_.artifact) + .mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[HermesActorResponse]) + .alsoTo(createSinkFromActorRef[HermesActorResponse](errorHandlerPool)) + .filter(_.result.isSuccess) + .map(_.result.get) .alsoTo(createSinkFromActorRef[HermesResults](elasticPool)) .to(Sink.ignore) .run() diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenProcessingError.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenProcessingError.scala new file mode 100644 index 0000000..50f4d79 --- /dev/null +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/discovery/maven/MavenProcessingError.scala @@ -0,0 +1,49 @@ +// Copyright (C) 2018 The Delphi Team. +// See the LICENCE file distributed with this work for additional +// information regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de.upb.cs.swt.delphi.crawler.discovery.maven + +import org.joda.time.DateTime + +case class MavenProcessingError(identifier: MavenIdentifier, + occurredAt: DateTime, + errorType: MavenErrorType.Value, + message: String) + +object MavenErrorType extends Enumeration { + type MavenErrorType = Value + + val PomDownloadFailed, JarDownloadFailed, PomParsingFailed, HermesProcessingFailed = Value +} + + +object MavenProcessingError { + + private def createError(identifier: MavenIdentifier, errorType: MavenErrorType.Value, message: String) = + MavenProcessingError(identifier, DateTime.now(), errorType, message) + + def createPomDownloadError(identifier: MavenIdentifier, message: String): MavenProcessingError = + createError(identifier, MavenErrorType.PomDownloadFailed, message) + + def createJarDownloadError(identifier: MavenIdentifier, message: String): MavenProcessingError = + createError(identifier, MavenErrorType.JarDownloadFailed, message) + + def createPomParsingError(identifier: MavenIdentifier, message: String): MavenProcessingError = + createError(identifier, MavenErrorType.PomParsingFailed, message) + + def createHermesProcessingError(identifier: MavenIdentifier, message: String): MavenProcessingError = + createError(identifier, MavenErrorType.HermesProcessingFailed, message) +} \ No newline at end of file diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala index 3025eff..d68be81 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenArtifact.scala @@ -17,5 +17,26 @@ package de.upb.cs.swt.delphi.crawler.preprocessing import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import org.joda.time.DateTime -case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile) +case class MavenArtifact(identifier : MavenIdentifier, jarFile: Option[JarFile], pomFile: PomFile, + publicationDate: Option[DateTime], metadata: Option[MavenArtifactMetadata]) + +case class MavenArtifactMetadata(name: String, + description: String, + developers: List[String], + licenses: List[ArtifactLicense], + issueManagement: Option[IssueManagementData], + dependencies: Set[ArtifactDependency], + parent:Option[MavenIdentifier], + packaging: String) + +case class IssueManagementData(system: String, url: String) +case class ArtifactLicense(name: String, url:String) +case class ArtifactDependency(identifier: MavenIdentifier, scope: Option[String]) + +object MavenArtifact{ + def withMetadata(artifact: MavenArtifact, metadata: MavenArtifactMetadata): MavenArtifact = { + MavenArtifact(artifact.identifier, artifact.jarFile, artifact.pomFile, artifact.publicationDate, Some(metadata)) + } +} diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala index 52be089..b705489 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActor.scala @@ -16,48 +16,73 @@ package de.upb.cs.swt.delphi.crawler.preprocessing +import java.util.Locale + import akka.actor.{Actor, ActorLogging, ActorSystem, Props} import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier import de.upb.cs.swt.delphi.crawler.tools.HttpDownloader +import org.joda.time.format.DateTimeFormat -import scala.util.{Failure, Success} +import scala.util.{Failure, Success, Try} class MavenDownloadActor extends Actor with ActorLogging { + override def receive: Receive = { - case m : MavenIdentifier => { + case m : MavenIdentifier => implicit val system : ActorSystem = context.system val downloader = new HttpDownloader - val jarStream = downloader.downloadFromUri(m.toJarLocation.toString()) - val pomStream = downloader.downloadFromUri(m.toPomLocation.toString()) - - jarStream match { - case Success(jar) => { - pomStream match { - case Success(pom) => { - log.info(s"Downloaded $m") - sender() ! Success(MavenArtifact(m, JarFile(jar, m.toJarLocation.toURL), PomFile(pom))) - } - case Failure(e) => { - // TODO: push error to actor - log.warning(s"Failed pom download for $m") - sender() ! Failure(e) - } + val pomResponse = downloader.downloadFromUriWithHeaders(m.toPomLocation.toString) + + pomResponse match { + case Success((pomStream, pomHeaders)) => + log.info(s"Downloaded $m") + + // Extract and parse publication date from header + val datePattern = DateTimeFormat.forPattern("E, dd MMM yyyy HH:mm:ss zzz").withLocale(Locale.ENGLISH) + val pomPublicationDate = pomHeaders.find( _.lowercaseName().equals("last-modified") ) + .map( header => Try(datePattern.parseDateTime(header.value())) ) match { + case Some(Success(date)) => Some(date) + case Some(Failure(x)) => + log.warning(s"Failed to extract publication date for $m: ${x.getMessage}") + None + case _ => None } - } - case Failure(e) => { - // TODO: push error to actor - log.warning(s"Failed jar download for $m") - sender() ! Failure(e) - } - } + downloader.downloadFromUri(m.toJarLocation.toString) match { + case Success(jar) => + sender() ! MavenDownloadActorResponse( + m, + Some(MavenArtifact(m, Some(JarFile(jar, m.toJarLocation.toURL)), PomFile(pomStream), pomPublicationDate, None)), + dateParsingFailed = pomPublicationDate.isEmpty) + case Failure(ex) => + log.warning(s"Failed to download jar file for $m") + sender() ! MavenDownloadActorResponse( + m, + Some(MavenArtifact(m, None, PomFile(pomStream), pomPublicationDate, None)), + jarDownloadFailed = true, + dateParsingFailed = pomPublicationDate.isEmpty, + errorMessage = ex.getMessage + ) + } + + case Failure(ex) => + log.error(s"Failed to download pom file for $m with message: ${ex.getMessage}") + sender() ! MavenDownloadActorResponse(m, None, pomDownloadFailed = true, errorMessage = ex.getMessage) + } - } } } + +case class MavenDownloadActorResponse(identifier: MavenIdentifier, + artifact: Option[MavenArtifact], + pomDownloadFailed: Boolean = false, + jarDownloadFailed: Boolean = false, + dateParsingFailed: Boolean = false, + errorMessage: String = "") + object MavenDownloadActor { - def props = Props(new MavenDownloadActor) + def props: Props = Props(new MavenDownloadActor) } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala index 6e8cad3..ae683f0 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/HermesActor.scala @@ -35,7 +35,7 @@ class HermesActor() extends Actor with ActorLogging with OPALFunctionality with computeHermesResult(m, reifyProject(m)) } - sender() ! hermesResult + sender() ! HermesActorResponse(m.identifier, hermesResult) } } } @@ -46,4 +46,6 @@ object HermesActor { } -case class HermesResults(identifier: MavenIdentifier, featureMap: Map[String, Int]) \ No newline at end of file +case class HermesResults(identifier: MavenIdentifier, featureMap: Map[String, Int]) + +case class HermesActorResponse(identifier: MavenIdentifier, result: Try[HermesResults]) \ No newline at end of file diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/OPALFunctionality.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/OPALFunctionality.scala index 23b8502..9b105d6 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/OPALFunctionality.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/OPALFunctionality.scala @@ -28,8 +28,8 @@ trait OPALFunctionality { def reifyProject(m: MavenArtifact): Project[URL] = { val project = new ClassStreamReader {}.createProject(m.identifier.toJarLocation.toURL, - new JarInputStream(m.jarFile.is)) - Try(m.jarFile.is.close()) + new JarInputStream(m.jarFile.get.is)) + Try(m.jarFile.get.is.close()) project } } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala new file mode 100644 index 0000000..83f9508 --- /dev/null +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActor.scala @@ -0,0 +1,314 @@ +// Copyright (C) 2018 The Delphi Team. +// See the LICENCE file distributed with this work for additional +// information regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de.upb.cs.swt.delphi.crawler.processing + +import akka.actor.{Actor, ActorLogging, ActorSystem, Props} +import de.upb.cs.swt.delphi.crawler.Configuration +import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactDependency, ArtifactLicense, IssueManagementData, MavenArtifact, MavenArtifactMetadata, MavenDownloadActorResponse, PomFile} +import de.upb.cs.swt.delphi.crawler.tools.HttpDownloader +import org.apache.maven.model.{Dependency, Model} +import org.apache.maven.model.io.xpp3.MavenXpp3Reader + +import scala.collection.JavaConverters._ +import scala.util.{Failure, Success, Try} + +/** + * An Actor that receives MavenArtifacts and extracts metadata from its POM file. If successful, an + * MavenMetadata object is attached to the artifact and the artifact is returned. If failures occur, + * the artifact is returned without metadata. + * + * @author Johannes Düsing + */ +class PomFileReadActor(configuration: Configuration) extends Actor with ActorLogging{ + + val pomReader: MavenXpp3Reader = new MavenXpp3Reader() + implicit val system : ActorSystem = context.system + + override def receive: Receive = { + + case MavenDownloadActorResponse(identifier, Some(artifact),_,jarDownloadFailed,_,errorMessage) => + + val pomStream = artifact.pomFile.is + + val pomObject = Try(pomReader.read(pomStream)) + pomStream.close() + + pomObject match { + case Success(pom) => + val issueManagement = Option(pom.getIssueManagement) + .map(i => IssueManagementData(i.getSystem, i.getUrl)) + + val parent = Option(pom.getParent) + .map(p => MavenIdentifier(configuration.mavenRepoBase.toString, p.getGroupId, p.getArtifactId, p.getVersion)) + + val dependencies = getDependencies(pom, identifier) + + val metadata = MavenArtifactMetadata(pom.getName, + pom.getDescription, + pom.getDevelopers.asScala.map(_.getId).toList, + pom.getLicenses.asScala.map(l => ArtifactLicense(l.getName, l.getUrl)).toList, + issueManagement, + dependencies, + parent, + pom.getPackaging) + + sender() ! PomFileReadActorResponse(MavenArtifact.withMetadata(artifact, metadata), + jarDownloadFailed, pomParsingFailed = false, if(jarDownloadFailed) errorMessage else "") + + log.info(s"Successfully processed POM file for $identifier") + + case Failure(ex) => + log.error(s"Failed to parse POM file for artifact $identifier",ex) + // Best effort semantics: If parsing fails, artifact is returned without metadata + sender() ! PomFileReadActorResponse(artifact, jarDownloadFailed, pomParsingFailed = true, ex.getMessage) + } + + } + + /** + * Tries to resolve, download and parse the parent POM file of the given POM. + * @param pomContent Content of a POM file to resolve parent for + * @return Content of Parent POM, or None if no parent is specified or an error occurred + */ + private def getParentPomModel(implicit pomContent: Model): Option[Model] = { + val parentDef = pomContent.getParent + + if (parentDef != null && parentDef.getGroupId != null && parentDef.getArtifactId != null && parentDef.getVersion != null){ + val parentIdentifier = MavenIdentifier(configuration.mavenRepoBase.toString, parentDef.getGroupId, + parentDef.getArtifactId, parentDef.getVersion) + + new HttpDownloader().downloadFromUri(parentIdentifier.toPomLocation.toString) match { + case Success(pomStream) => + val parentPom = pomReader.read(pomStream) + pomStream.close() + + Some(parentPom) + case Failure(x) => + log.error(x, s"Failed to download parent POM") + None + } + } + else { + None + } + } + + /** + * Recursive method building the parent hierarchy of the given POM. Will download and parse all parent POMs and + * return them in a list. + * @param pomContent POM file to build the parent hierarchy for + * @return List of parent POMs. Might be empty, if no parent is specified at all + */ + private def buildParentHierarchy(implicit pomContent: Model): List[Model] = { + getParentPomModel(pomContent) match { + case Some(parentContent) => + List(parentContent) ++ buildParentHierarchy(parentContent) + case _ => + List() + } + } + + + private def buildParentIdentifier(implicit pomContent:Model): MavenIdentifier = { + MavenIdentifier(configuration.mavenRepoBase.toString, pomContent.getParent.getGroupId, + pomContent.getParent.getArtifactId, pomContent.getParent.getVersion) + } + + /** + * Retrieve all dependencies specified in the given POM file as MavenIdentifiers. Try to resolve variables as well. + * Only returns successfully resolved dependencies, omits failures. + * @param pomContent Object holding POM file contents + * @param identifier Maven identifier, as sometimes version / groupID is not part of POM file! + * @return Set of MavenIdentifiers for each successfully parsed dependency + */ + private def getDependencies(implicit pomContent: Model, identifier: MavenIdentifier): Set[ArtifactDependency] = { + + // Always build the parent hierarchy exactly once + lazy val parentHierarchy: List[Model] = buildParentHierarchy(pomContent) + + // Try to resolve each dependency specified in the POM + val dependencies = pomContent + .getDependencies + .asScala + .toSet[Dependency] + .map(resolveDependency(_, parentHierarchy)) + + if (dependencies.count(_.isFailure) > 0) { + log.warning(s"Failed to resolve some dependencies for $identifier") + } + + // Only return those dependencies that have been successfully resolved + for (Success(identifier) <- dependencies) yield identifier + } + + /** + * Process raw dependency specification from POM file, validate text values and try to resolve project variables. + * @param dependency Raw dependency specification as given in the POM file + * @param pomContent Contents of the POM file + * @param identifier Artifact identifier, as sometimes version / groupID is not part of POM file + * @return Try object holding the dependency's MavenIdentifier if successful + */ + private def resolveDependency(dependency: Dependency, parentHierarchy: => List[Model]) + (implicit pomContent: Model, identifier: MavenIdentifier) + : Try[ArtifactDependency] = { + lazy val parents = parentHierarchy + + Try { + // Resolve groupID and artifact id in current POM + val groupId = resolveProperty(dependency.getGroupId, "groupID", parents) + val artifactId = resolveProperty(dependency.getArtifactId, "artifactID", parents) + + // Often dependency versions are left empty, as they are specified in the parent! + val version: String = if(dependency.getVersion == null && parents.nonEmpty){ + // If there are parents and version is empty => Try to resolve version in parents + resolveDependencyVersion(dependency, pomContent, identifier, parents) + } else { + // If no parents are present or version is specified => Resolve as regular property + resolveProperty(dependency.getVersion, "version", parents) + } + + val scope = Option(dependency.getScope) + + ArtifactDependency(MavenIdentifier(configuration.mavenRepoBase.toString, groupId, artifactId, version), scope) + } + } + + /** + * Resolve the version of the given dependency by inspecting the tag of all parent POMs. + * @param dependency Dependency to resolve version for, ie. no explicit version is defined for this dependency! + * @param pomContent Content of the current POM file to inspect + * @param identifier Identifier of the current POM file + * @param level Level in the parent hierarchy, needed for recursion + * @param parentHierarchy Parent hierarchy object + * @return String value of the resolved version + * @throws NullPointerException If version could not be resolved in any parent + */ + @scala.annotation.tailrec + private def resolveDependencyVersion(dependency: Dependency, pomContent: Model, identifier: MavenIdentifier, + parentHierarchy: => List[Model], level: Int = 0): String = { + lazy val parents = parentHierarchy + + if(pomContent.getDependencyManagement != null){ + // If there is a dependency management tag: Try to find matching groupID and artifactID + pomContent + .getDependencyManagement.getDependencies + .asScala.toSet[Dependency] + .filter(d => d.getGroupId.equals(dependency.getGroupId) && d.getArtifactId.equals(dependency.getArtifactId)) + .map(_.getVersion) + .find(_ != null) match { + case Some(version) => + // Found matching version definition, try to resolve it if its a variable + resolveProperty(version, "version", parents, level)(pomContent, identifier) + case None if level < parents.length => + // Found no matching version definition, but there is parents left to recurse to + resolveDependencyVersion(dependency, parents(level), buildParentIdentifier(pomContent), parents, level + 1) + case None if level >= parents.length => + // No parent left to recurse, so this really is a dependency without a version + throw new NullPointerException(s"Version was null and could not be resolved in parent") + } + } + else if(level < parentHierarchy.length) { + // There is no dependency management tag, immediately recurse into parent if parent left + resolveDependencyVersion(dependency, parents(level), buildParentIdentifier(pomContent), parents, level + 1) + } + else { + // No parent left to recurse, so this really is a dependency without a version + throw new NullPointerException(s"Version was null and could not be resolved in parent") + } + + + } + + /** + * Resolve the given property value of an dependency specification and do input validation + * @param propValue Value to resolve + * @param propName Name of the property (for error logging) + * @param pomContent Contents of the POM file + * @return Fully resolved string value of the property if successful + * @throws NullPointerException If a null values was found for a required property + * @throws RuntimeException If actor failed to resolve a variable inside the POM file + */ + private def resolveProperty(propValue: String, propName: String, parentHierarchy: => List[Model], level: Int = 0) + (implicit pomContent:Model, identifier:MavenIdentifier) + : String = { + lazy val parents = parentHierarchy + if(propValue == null){ + throw new NullPointerException(s"Property '$propName' must not be null for dependencies") + } + else if (propValue.startsWith("$")){ + resolveProjectVariable(propValue, parents, level) + .getOrElse(throw new RuntimeException(s"Failed to resolve variable '$propValue' for property '$propName'")) + } + else { + propValue + } + } + + //noinspection ScalaStyle + @scala.annotation.tailrec + private def resolveProjectVariable(variableName: String, parentHierarchy: => List[Model], level: Int) + (implicit pomContent: Model, identifier: MavenIdentifier) + : Option[String] = { + lazy val parents = parentHierarchy + + // Drop Maven Syntax from variable reference (e.g. ${varname}) + val rawVariableName = variableName.drop(2).dropRight(1) + + // Split dot-separated variable names + val variableParts = rawVariableName.split("\\.", 2) + + var result: Option[String] = None + + // Resolve special references to POM attributes + if (variableParts(0).equals("project") || variableParts(0).equals("pom")) { + result = variableParts(1) match { + // groupID always present in identifier, but not always explicit in POM + case "groupId" => Some(identifier.groupId) + // artifactID always present in POM + case "artifactId" => Some(pomContent.getArtifactId) + // Version always present in identifier, but not always explicit in POM + case "version" => Some(identifier.version) + // Can only extract parent version if explicitly stated + case "parent.version" if pomContent.getParent != null && pomContent.getParent.getVersion != null => + Some(pomContent.getParent.getVersion) + case _ => None + } + } + else { + // All other formats are interpreted as POM property names + result = Option(pomContent.getProperties.getProperty(rawVariableName)) + } + + // If not resolved -> try to resolve in parent! + if (result.isEmpty && level <= parents.length){ + resolveProjectVariable(variableName, parents, level + 1)(parents(level), buildParentIdentifier(pomContent)) + } + else { + result + } + } +} + +case class PomFileReadActorResponse(artifact: MavenArtifact, + jarDownloadFailed: Boolean, + pomParsingFailed: Boolean, + errorMessage: String) + +object PomFileReadActor { + def props(configuration: Configuration):Props = Props(new PomFileReadActor(configuration)) +} diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala new file mode 100644 index 0000000..cf11c4a --- /dev/null +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/processing/ProcessingFailureStorageActor.scala @@ -0,0 +1,85 @@ +// Copyright (C) 2018 The Delphi Team. +// See the LICENCE file distributed with this work for additional +// information regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de.upb.cs.swt.delphi.crawler.processing + +import akka.actor.{Actor, ActorLogging, ActorRef, Props} +import de.upb.cs.swt.delphi.crawler.discovery.maven.{MavenIdentifier, MavenProcessingError} +import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActorResponse} +import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} + +import scala.util.Failure + +class ProcessingFailureStorageActor(elasticPool: ActorRef) extends Actor with ActorLogging { + + override def receive: Receive = { + + case StreamInitialized => + log.info(s"Stream initialized!") + sender() ! Ack + + case StreamCompleted => + log.info(s"Stream completed!") + + case StreamFailure(ex) => + log.error(ex, s"Stream failed!") + + case MavenDownloadActorResponse(identifier, None,true, _, _, errorMessage) => + // POM Download failed, this is always an error + log.info(s"Processing failed pom download for $identifier, message: $errorMessage") + storeError(MavenProcessingError.createPomDownloadError(identifier, errorMessage)) + sender() ! Ack + + case MavenDownloadActorResponse(identifier, _, false, false, true, errorMessage) => + // Publish date parsing failed, does not hinder further processing + log.info(s"Processing failed publish date extraction for $identifier, message: $errorMessage") + sender() ! Ack + + case PomFileReadActorResponse(MavenArtifact(identifier,_,_,_,Some(meta)),true, _, errorMessage) + if meta.packaging.equalsIgnoreCase("jar") => + // JAR Download failed although POM file said jar should exist, this is an error + log.info(s"Processing failed jar download for $identifier") + storeError(MavenProcessingError.createJarDownloadError(identifier, errorMessage)) + sender() ! Ack + + case PomFileReadActorResponse(MavenArtifact(identifier,_,_,_,_), _, true, errorMessage) => + // POM parsing failed + log.info(s"Processing failed pom processing for $identifier, message: $errorMessage") + storeError(MavenProcessingError.createPomParsingError(identifier, errorMessage)) + sender() ! Ack + + case HermesActorResponse(identifier: MavenIdentifier, Failure(ex)) => + // Hermes processing failed + log.info(s"Processing failed Hermes analysis for $identifier, message: ${ex.getMessage}") + storeError(MavenProcessingError.createHermesProcessingError(identifier, ex.getMessage)) + sender() ! Ack + + case response + if response.isInstanceOf[MavenDownloadActorResponse] || response.isInstanceOf[PomFileReadActorResponse] || + response.isInstanceOf[HermesActorResponse] => + sender() ! Ack + + case msg@_ => + log.error(s"Invalid message format: $msg") + sender() ! StreamFailure + } + + private def storeError(error: MavenProcessingError): Unit = elasticPool forward error +} + +object ProcessingFailureStorageActor { + def props(elasticPool: ActorRef): Props = Props(new ProcessingFailureStorageActor(elasticPool)) +} diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala index 7235f50..554a0a0 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticActor.scala @@ -22,8 +22,9 @@ import com.sksamuel.elastic4s.http.ElasticClient import de.upb.cs.swt.delphi.crawler.Identifier import de.upb.cs.swt.delphi.crawler.discovery.git.GitIdentifier import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized} -import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier -import de.upb.cs.swt.delphi.crawler.processing.HermesResults +import de.upb.cs.swt.delphi.crawler.discovery.maven.{MavenIdentifier, MavenProcessingError} +import de.upb.cs.swt.delphi.crawler.preprocessing.MavenArtifact +import de.upb.cs.swt.delphi.crawler.processing.{HermesResults, PomFileReadActorResponse} /** * An actor reacting to item which should be pushed to elasticsearch @@ -47,6 +48,10 @@ class ElasticActor(client: ElasticClient) extends Actor with ActorLogging with A store(m) sender() ! Ack } + case PomFileReadActorResponse(artifact,_,false,_) => { + store(artifact) + sender() ! Ack + } case g : GitIdentifier => { store(g) sender() ! Ack @@ -55,6 +60,10 @@ class ElasticActor(client: ElasticClient) extends Actor with ActorLogging with A store(h) sender() ! Ack } + case e : MavenProcessingError => { + store(e) + sender() ! Ack + } case x => log.warning("Received unknown message: [{}] ", x) } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala index 84c8874..808f374 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticIndexMaintenance.scala @@ -59,11 +59,14 @@ trait ElasticIndexMaintenance extends AppLogging { objectField("identifier") fields identifierFields, textField("methods") analyzer KeywordAnalyzer ), - objectField("features") fields featureList - ) - ) - + ), + mapping(processingError) as ( + keywordField("type"), + keywordField("message"), + dateField("occurred"), + objectField("identifier") fields identifierFields + )) }.await //Increases maximum number of nested fields diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala index e26ba3a..28cd05e 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/ElasticStoreQueries.scala @@ -22,7 +22,8 @@ import com.sksamuel.elastic4s.http.index.IndexResponse import com.sksamuel.elastic4s.http.update.UpdateResponse import com.sksamuel.elastic4s.http.{ElasticClient, Response} import de.upb.cs.swt.delphi.crawler.discovery.git.GitIdentifier -import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.discovery.maven.{MavenIdentifier, MavenProcessingError} +import de.upb.cs.swt.delphi.crawler.preprocessing.MavenArtifact import de.upb.cs.swt.delphi.crawler.processing.{HermesAnalyzer, HermesResults} import org.joda.time.DateTime @@ -49,6 +50,62 @@ trait ElasticStoreQueries { } } + def store(m: MavenArtifact)(implicit client: ElasticClient, log: LoggingAdapter): Option[Response[UpdateResponse]] = { + elasticId(m.identifier) match { + case Some(id) => + log.info(s"Pushing POM file contents for ${m.identifier} under id $id") + + m.metadata match { + case Some(metadata) => + Some(client.execute { + update(id).in(delphiProjectType).doc(fields = "pom" -> Map( + "name" -> metadata.name, + "description" -> metadata.description, + "issueManagement" -> metadata.issueManagement + .map(management => Map("url" -> management.url, "system" -> management.system)).getOrElse("None"), + "developers" -> metadata.developers.mkString(","), + "licenses" -> metadata.licenses.map(l => Map("name" -> l.name, "url" -> l.url)), + "dependencies" -> metadata.dependencies.map(d => Map( + "groupId" -> d.identifier.groupId, + "artifactId" -> d.identifier.artifactId, + "version" -> d.identifier.version, + "scope" -> d.scope.getOrElse("default") + )), + "parent" -> metadata.parent.map(p => Map( + "groupId" -> p.groupId, + "artifactId" -> p.artifactId, + "version" -> p.version + )).getOrElse("None"), + "packaging" -> metadata.packaging + ), "published" -> m.publicationDate.getOrElse("Unknown")) + }.await) + case None => + log.warning(s"Tried to push POM file results to database, but no results are present for identifier: ${m.identifier}") + None + } + + + case None => + log.warning(s"Tried to push POM file results for non-existing identifier: ${m.identifier}.") + None + } + } + + def store(error: MavenProcessingError)(implicit client: ElasticClient, log: LoggingAdapter): Response[IndexResponse]= { + log.info(s"Pushing new error to elastic regarding identifier ${error.identifier}") + client.execute { + indexInto(delphiProcessingErrorType).id(error.occurredAt.getMillis.toString).fields( + "identifier" -> Map( + "groupId" -> error.identifier.groupId, + "artifactId" -> error.identifier.artifactId, + "version" -> error.identifier.version), + "occurred" -> error.occurredAt, + "message" -> error.message, + "type" -> error.errorType.toString + ) + }.await + } + def store(g: GitIdentifier)(implicit client: ElasticClient, log: LoggingAdapter): Response[IndexResponse] = { log.info("Pushing new git identifier to elastic: [{}]", g) client.execute { @@ -61,7 +118,7 @@ trait ElasticStoreQueries { } def store(m: MavenIdentifier)(implicit client: ElasticClient, log: LoggingAdapter): Response[IndexResponse] = { - log.info("Pushing new maven identifier to elastic: [{}]", m) + log.info("Pushing new maven identifier to elastic: [{}]", m.toUniqueString) client.execute { indexInto(delphiProjectType).id(m.toUniqueString) .fields("name" -> m.toUniqueString, diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/package.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/package.scala index 1979e48..a13f167 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/package.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/storage/package.scala @@ -21,5 +21,7 @@ import com.sksamuel.elastic4s.IndexAndType package object storage { val delphi = "delphi" val project = "project" + val processingError = "error" val delphiProjectType: IndexAndType = IndexAndType(delphi,project) + val delphiProcessingErrorType: IndexAndType = IndexAndType(delphi, processingError) } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpDownloader.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpDownloader.scala index 452b6cf..f43aec5 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpDownloader.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpDownloader.scala @@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit import akka.actor.ActorSystem import akka.http.scaladsl.Http -import akka.http.scaladsl.model.{HttpRequest, HttpResponse, StatusCodes} +import akka.http.scaladsl.model.{HttpHeader, HttpRequest, HttpResponse, StatusCodes} import akka.stream.ActorMaterializer import akka.stream.scaladsl.{Sink, StreamConverters} import akka.util.ByteString @@ -48,4 +48,20 @@ class HttpDownloader(implicit val system: ActorSystem) { Failure(new HttpException(code)) } } + + def downloadFromUriWithHeaders(requestedUri: String): Try[(InputStream, Seq[HttpHeader])] = { + val responseFuture: Future[HttpResponse] = + Http().singleRequest(HttpRequest(uri = requestedUri)) + + + Await.result(responseFuture, Duration.Inf) match { + case HttpResponse(StatusCodes.OK, headers, entity, _) => + Try(( + new ByteArrayInputStream(Await.result(entity.dataBytes.runFold(ByteString.empty)(_ ++ _).map(_.toArray), Duration.Inf)), + headers)) + case resp@HttpResponse(code, _, _, _) => + resp.discardEntityBytes() + Failure(new HttpException(code)) + } + } } diff --git a/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpException.scala b/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpException.scala index 5ca31b9..b575867 100644 --- a/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpException.scala +++ b/src/main/scala/de/upb/cs/swt/delphi/crawler/tools/HttpException.scala @@ -4,4 +4,6 @@ import akka.http.scaladsl.model.StatusCode class HttpException(code: StatusCode) extends Throwable { + override def getMessage: String = s"Got an unexpected HTTP response, code $code." + } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala index 022369e..1ae1385 100644 --- a/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/preprocessing/MavenDownloadActorTest.scala @@ -27,7 +27,6 @@ import scala.concurrent.duration._ import de.upb.cs.swt.delphi.crawler.preprocessing.Common._ import scala.concurrent.Await -import scala.util.{Success, Try} /** * @author Hariharan. @@ -38,28 +37,32 @@ class MavenDownloadActorTest extends TestKit(ActorSystem("DownloadActor")) with WordSpecLike with Matchers with BeforeAndAfterAll { - override def afterAll { - TestKit.shutdownActorSystem(system) - } "The maven download actor" must { "create a maven artifact with a jar and pom file" in { - val mavenIdentifier = new MavenIdentifier("http://central.maven.org/maven2/", "junit", "junit", "4.12") + val mavenIdentifier = new MavenIdentifier("https://repo1.maven.org/maven2/", "junit", "junit", "4.12") val downloadActor = system.actorOf(MavenDownloadActor.props) - implicit val timeout = Timeout(10 seconds) - implicit val ec = system.dispatcher + implicit val timeout: Timeout = Timeout(10 seconds) val f = downloadActor ? mavenIdentifier val msg = Await.result(f, 10 seconds) - assert(msg.isInstanceOf[Success[MavenArtifact]]) - val artifact = msg.asInstanceOf[Success[MavenArtifact]].get - checkJar(artifact.jarFile.is) - checkPom(artifact.pomFile.is) + assert(msg.isInstanceOf[MavenDownloadActorResponse]) + val response = msg.asInstanceOf[MavenDownloadActorResponse] + assert(!response.pomDownloadFailed) + assert(!response.dateParsingFailed) + assert(!response.jarDownloadFailed) + assert(response.artifact.isDefined) + + val artifact = response.artifact.get + checkJar(artifact.jarFile.get.is) + checkPom(artifact.pomFile.is) + assert(artifact.metadata.isEmpty) + assert(artifact.publicationDate.isDefined && artifact.publicationDate.get != null) } } } diff --git a/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala new file mode 100644 index 0000000..4b69ec0 --- /dev/null +++ b/src/test/scala/de/upb/cs/swt/delphi/crawler/processing/PomFileReadActorTest.scala @@ -0,0 +1,108 @@ +// Copyright (C) 2018 The Delphi Team. +// See the LICENCE file distributed with this work for additional +// information regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de.upb.cs.swt.delphi.crawler.processing + +import akka.actor.ActorSystem +import akka.pattern.ask +import akka.testkit.{ImplicitSender, TestKit} +import akka.util.Timeout +import de.upb.cs.swt.delphi.crawler.Configuration +import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier +import de.upb.cs.swt.delphi.crawler.preprocessing.{ArtifactDependency, MavenDownloadActor, MavenDownloadActorResponse} +import org.scalatest.{Matchers, WordSpecLike} + +import scala.concurrent.duration._ +import scala.concurrent.{Await, ExecutionContext} + +class PomFileReadActorTest extends TestKit(ActorSystem("DownloadActor")) + with ImplicitSender + with WordSpecLike + with Matchers { + + final val RepoUrl = new Configuration().mavenRepoBase.toString + + private def readPomFileFor(identifier: MavenIdentifier): PomFileReadActorResponse = { + val downloadActor = system.actorOf(MavenDownloadActor.props) + val readerActor = system.actorOf(PomFileReadActor.props(new Configuration())) + + implicit val timeout: Timeout = Timeout(10 seconds) + implicit val ec: ExecutionContext = system.dispatcher + + val f = downloadActor ? identifier + + val msg = Await.result(f, 10 seconds) + + assert(msg.isInstanceOf[MavenDownloadActorResponse]) + + val response = msg.asInstanceOf[MavenDownloadActorResponse] + + assert(!response.pomDownloadFailed && !response.jarDownloadFailed && + !response.dateParsingFailed && response.artifact.isDefined) + + val artifact = response.artifact.get + + assert(artifact.metadata.isEmpty) + assert(artifact.publicationDate.isDefined && artifact.publicationDate.get != null) + + val result = Await.result(readerActor ? response, 10 seconds) + assert(result.isInstanceOf[PomFileReadActorResponse]) + result.asInstanceOf[PomFileReadActorResponse] + } + + "The POM file reader actor " must { + "create a maven artifact with valid metadata" in { + val readActorResponse = readPomFileFor(MavenIdentifier(RepoUrl, "junit", "junit", "4.12")) + assert(!readActorResponse.pomParsingFailed) + + val annotatedArtifact = readActorResponse.artifact + + assert(annotatedArtifact.metadata.isDefined) + val metadata = annotatedArtifact.metadata.get + + assert(metadata.name != null && metadata.name.equals("JUnit")) + assert(metadata.description != null && metadata.description.startsWith("JUnit is a unit testing framework for Java,")) + + assert(metadata.issueManagement.isDefined) + assertResult("https://github.com/junit-team/junit/issues")(metadata.issueManagement.get.url) + assertResult("github")(metadata.issueManagement.get.system) + + assertResult(4)(metadata.developers.size) + + assertResult(1)(metadata.licenses.size) + assertResult("Eclipse Public License 1.0")(metadata.licenses.head.name) + } + + "process dependencies as expected" in { + val readActorResponse = readPomFileFor(MavenIdentifier(RepoUrl, "org.apache.bookkeeper", "bookkeeper-server", "4.9.2")) + assert(!readActorResponse.pomParsingFailed) + + val annotatedArtifact = readActorResponse.artifact + + val dependencies = annotatedArtifact.metadata.get.dependencies + + assertResult(25)(dependencies.size) + assertResult(9)(dependencies.count(_.identifier.version == "4.9.2")) + // Version is local POM reference + assert(dependencies.contains(ArtifactDependency(MavenIdentifier(RepoUrl,"org.apache.bookkeeper", "circe-checksum", "4.9.2"), None))) + // Version in a variable which is defined in parent POM + assert(dependencies.contains(ArtifactDependency(MavenIdentifier(RepoUrl,"org.apache.kerby", "kerby-config", "1.1.1"), Some("test")))) + // Version is not defined in local POM, and must be derived from parent POM + assert(dependencies.contains(ArtifactDependency(MavenIdentifier(RepoUrl,"commons-codec", "commons-codec", "1.6"), None))) + } + } + +}