Add anomaly detection Markdown summary report

JohT · JohT · commit 03af6da3472d · 2025-09-15T17:32:46.000+02:00
diff --git a/domains/anomaly-detection/anomalyDetectionCsv.sh b/domains/anomaly-detection/anomalyDetectionCsv.sh
@@ -67,6 +67,8 @@ anomaly_detection_features() {
 # Required Parameters:
 # - projection_node_label=...
 #   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Default: "Java". Example: "Typescript"
 anomaly_detection_queries() {
     local nodeLabel
     nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -92,6 +94,8 @@ anomaly_detection_queries() {
 # Required Parameters:
 # - projection_node_label=...
 #   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_labels() {
     local nodeLabel
     nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -109,6 +113,63 @@ anomaly_detection_labels() {
     # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"
 }
 
+# TODO delete if not needed anymore
+# # Initialize anomaly detail (Markdown) report. 
+# # Intended to be run before the first call of "anomaly_detection_detail_report".
+# initialize_anomaly_detection_detail_report() {
+#   archetype_summary_directory=${FULL_REPORT_DIRECTORY}/archetype-summary-${language}-${nodeLabel}
+#   rm -rf "${archetype_summary_directory}"
+# }
+
+# Appends a Markdown table to an existing file and
+# removes redundant header + separator rows.
+#
+# Usage:
+#   cat newTable.md | append_table myMarkdownFile.md
+#
+#   append_table myMarkdownFile.md <<'EOF'
+#   | Name | Score | Archetype |
+#   | ---  | ---   | ---       |
+#   | Bar  | 0.9   | Something |
+#   EOF
+#
+# Behavior:
+#   - Keeps the first header row and its following separator row.
+#   - Removes all subsequent duplicate header + separator pairs.
+#   - Leaves all data rows untouched.
+append_to_markdown_table() {
+  local file="$1"
+
+  # Append stdin to the target file
+  cat >> "${file}"
+  
+  # Clean up duplicate headers (header row + --- row)
+  awk '!seen[$0]++ || NR <= 2' "${file}" > "${file}.tmp" && mv "${file}.tmp" "${file}"
+}
+
+# Aggregates all results in a Markdown report.
+# Writes "TopAnomaliesByArchetype.md" into the directory
+# "${FULL_REPORT_DIRECTORY}/archetype-summary-<language>-<nodeLabel>", which is created if it is missing.
+# Note: "initialize_anomaly_detection_detail_report" is currently commented out and doesn't need
+# to be called first, since the report file is overwritten (not appended to) on every call.
+#
+# Required Parameters:
+# - projection_node_label=...
+#   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_detail_report() {
+    local nodeLabel
+    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
+
+    local language
+    language=$( extractQueryParameter "projection_language" "${@}" )
+
+    echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} anomaly summary Markdown report..."
+
+    # Declared local so that the directory name doesn't leak into the global scope of the script.
+    local archetype_summary_directory="${FULL_REPORT_DIRECTORY}/archetype-summary-${language}-${nodeLabel}"
+    mkdir -p "${archetype_summary_directory}"
+    # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeSummary.cypher" "${@}" --output-markdown-table | append_to_markdown_table "${FULL_REPORT_DIRECTORY}/TopAnomaliesByArchetype.md"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeSummary.cypher" "${@}" --output-markdown-table > "${archetype_summary_directory}/TopAnomaliesByArchetype.md"
+}
+
 # Run the anomaly detection pipeline.
 # 
 # Required Parameters:
@@ -118,10 +179,13 @@ anomaly_detection_labels() {
 #   Label of the nodes that will be used for the projection. Example: "Package"
 # - projection_weight_property=...
 #   Name of the node property that contains the dependency weight. Example: "weight"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
 anomaly_detection_csv_reports() {
     time anomaly_detection_features "${@}" # every stage gets the same query parameters and is timed individually
     time anomaly_detection_queries "${@}"
     time anomaly_detection_labels "${@}"
+    time anomaly_detection_detail_report "${@}" # last stage: writes the Markdown summary report
 }
 
 # Create report directory
@@ -146,6 +210,9 @@ ALGORITHM_LANGUAGE="projection_language"
 COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
 EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClustering"
 
+#TODO delete if not needed anymore
+#initialize_anomaly_detection_detail_report
+
 # -- Java Artifact Node Embeddings -------------------------------
 
 if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java"; then
diff --git a/domains/anomaly-detection/summary/AnomaliesInTotal.cypher b/domains/anomaly-detection/summary/AnomaliesInTotal.cypher
@@ -0,0 +1,22 @@
+// Anomaly Detection Summary: Overview of all analyzed code units in total. Requires all other labels/*.cypher queries to run first. Variables: none
+
+MATCH (codeUnit)
+WHERE (codeUnit.incomingDependencies IS NOT NULL 
+   OR  codeUnit.outgoingDependencies IS NOT NULL) // only nodes that carry at least one of the dependency count properties
+  WITH count(DISTINCT codeUnit)                  AS codeUnitCount
+      ,sum(codeUnit.anomalyLabel)                AS anomalyCount // presumably anomalyLabel is 1 for anomalies and 0 otherwise — TODO confirm
+      ,sum(sign(codeUnit.anomalyAuthorityRank))  AS authorityCount // sign() maps every positive rank to 1, so the sum counts ranked units; missing ranks are null and ignored by sum()
+      ,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
+      ,sum(sign(codeUnit.anomalyBridgeRank))     AS bridgeCount
+      ,sum(sign(codeUnit.anomalyHubRank))        AS hubCount
+      ,sum(sign(codeUnit.anomalyOutlierRank))    AS outlierCount
+      //,collect(codeUnit.name)[0..4]  AS exampleNames
+ RETURN codeUnitCount    AS `Analyzed Units`
+       ,anomalyCount     AS `Anomalies`
+       ,authorityCount   AS `Authorities`
+       ,bottleNeckCount  AS `Bottlenecks`
+       ,bridgeCount      AS `Bridges`
+       ,hubCount         AS `Hubs`
+       ,outlierCount     AS `Outliers`
+       //,exampleNames
+ ORDER BY anomalyCount DESC, codeUnitCount DESC // the aggregation above yields a single row, so this ordering has no visible effect
diff --git a/domains/anomaly-detection/summary/AnomaliesPerAbstractionLayer.cypher b/domains/anomaly-detection/summary/AnomaliesPerAbstractionLayer.cypher
@@ -0,0 +1,30 @@
+// Anomaly Detection Summary: Overview of analyzed code units and the number of anomalies detected. Requires all other labels/*.cypher queries to run first. Variables: none
+
+MATCH (codeUnit)
+WHERE (codeUnit.incomingDependencies IS NOT NULL 
+   OR  codeUnit.outgoingDependencies IS NOT NULL) // only nodes that carry at least one of the dependency count properties
+UNWIND labels(codeUnit) AS codeUnitLabel // one row per node label
+  WITH *
+ WHERE NOT codeUnitLabel STARTS WITH 'Mark4' // excluded labels don't become part of the abstraction level name
+   AND NOT codeUnitLabel IN ['File', 'Directory', 'ByteCode', 'GenericDeclaration']
+  WITH collect(codeUnitLabel) AS codeUnitLabels // back to one row per code unit with its remaining labels
+      ,codeUnit
+  WITH apoc.text.join(codeUnitLabels, ',')       AS codeUnitLabels // grouping key: comma separated label combination
+      ,count(DISTINCT codeUnit)                  AS codeUnitCount
+      ,sum(codeUnit.anomalyLabel)                AS anomalyCount // presumably anomalyLabel is 1 for anomalies and 0 otherwise — TODO confirm
+      ,sum(sign(codeUnit.anomalyAuthorityRank))  AS authorityCount // sign() maps every positive rank to 1, so the sum counts ranked units; missing ranks are null and ignored by sum()
+      ,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
+      ,sum(sign(codeUnit.anomalyBridgeRank))     AS bridgeCount
+      ,sum(sign(codeUnit.anomalyHubRank))        AS hubCount
+      ,sum(sign(codeUnit.anomalyOutlierRank))    AS outlierCount
+      //,collect(codeUnit.name)[0..4]  AS exampleNames
+ RETURN codeUnitLabels   AS `Abstraction Level`
+       ,codeUnitCount    AS `Units`
+       ,anomalyCount     AS `Anomalies`
+       ,authorityCount   AS `Authorities`
+       ,bottleNeckCount  AS `Bottlenecks`
+       ,bridgeCount      AS `Bridges`
+       ,hubCount         AS `Hubs`
+       ,outlierCount     AS `Outliers`
+       //,exampleNames
+ ORDER BY anomalyCount DESC, codeUnitCount DESC // label combinations with the most anomalies first
diff --git a/domains/anomaly-detection/summary/AnomalyDetectionReportTopArchetypes.cypher b/domains/anomaly-detection/summary/AnomalyDetectionReportTopArchetypes.cypher
@@ -0,0 +1,38 @@
+// Anomaly Detection Labels: Summarizes all labelled archetypes by their anomaly score including their archetype rank. For code units with more than one archetype, the one with the higher rank is shown. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
+
+ MATCH (codeUnit)
+ WHERE $projection_node_label IN labels(codeUnit)
+UNWIND keys(codeUnit) AS codeUnitProperty // one row per property name of the code unit
+  WITH *
+ WHERE codeUnitProperty STARTS WITH 'anomaly' // keeps only the archetype rank properties, e.g. 'anomalyHubRank'
+   AND codeUnitProperty ENDS   WITH 'Rank'
+  WITH *
+      ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
+      ,split(split(codeUnitProperty, 'anomaly')[1], 'Rank')[0] AS archetype             // e.g. 'anomalyHubRank' -> 'Hub'
+      ,codeUnit[codeUnitProperty]                              AS archetypeRank
+      ,codeUnit.anomalyScore                                   AS anomalyScore
+ ORDER BY codeUnit.anomalyScore DESC, archetypeRank ASC, codeUnitName ASC, archetype ASC // NOTE(review): collect(...)[0] below presumably relies on this order surviving the OPTIONAL MATCHes — confirm
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
+    WITH *, artifact.name                                             AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
+    WITH *, last(split(projectRoot.absoluteFileName, '/'))            AS projectName
+OPTIONAL MATCH (codeDirectory:File:Directory)-[:CONTAINS]->(codeUnit)
+    WITH *, split(replace(codeDirectory.fileName, './', ''), '/')[-2] AS directoryName
+    WITH *, coalesce(artifactName, projectName, directoryName, "")    AS projectName // first available container name, empty text if there is none
+RETURN projectName                                                            AS `Contained in`
+      //$projection_language + ' ' +  $projection_node_label                  AS `Code Unit`
+      ,codeUnitName                                                           AS `Name`
+      ,round(anomalyScore, 4, 'HALF_UP')                                      AS `Score`
+      ,collect(archetype)[0]                                                  AS `Archetype` // aggregated over all non-aggregated columns of this RETURN; [0] picks the first collected entry
+      ,collect(archetypeRank)[0]                                              AS `Archetype Rank`
+      ,nullif(codeUnit.anomalyTopFeature1, "")                                AS `Top Feature 1` // empty feature names are returned as null
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue1, 4, 'HALF_UP'), 0.0) AS `Top Feature 1 SHAP` // values that round to 0.0 are returned as null
+      ,nullif(codeUnit.anomalyTopFeature2, "")                                AS `Top Feature 2`
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue2, 4, 'HALF_UP'), 0.0) AS `Top Feature 2 SHAP`
+      ,nullif(codeUnit.anomalyTopFeature3, "")                                AS `Top Feature 3`
+      ,nullif(round(codeUnit.anomalyTopFeatureSHAPValue3, 4, 'HALF_UP'), 0.0) AS `Top Feature 3 SHAP`
+      ,CASE WHEN codeUnit.anomalyScore <= 0          THEN 'Typical'
+            WHEN codeUnit.anomalyTopFeature1 IS NULL THEN 'Undetermined'
+                                                     ELSE 'Anomalous'     END AS `Model Status`
+      //,collect(archetype)[1]     AS secondaryArchetype
+      //,collect(archetypeRank)[1] AS secondaryArchetypeRank
diff --git a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+
+# Creates a Markdown report that contains all results of all the anomaly detection methods.
+# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
+# The results will be written into the sub directory reports/anomaly-detection.
+
+# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
+# Note that either "anomalyDetectionCsv.sh" or "anomalyDetectionPython.sh" is required to run prior to this script.
+
+# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh
+
+# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
+set -o errexit -o pipefail
+
+# Overrideable Constants (defaults also defined in sub scripts)
+REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
+MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"}
+
+## Get this "domains/anomaly-detection/summary" directory if not already set
+# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
+# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
+# This way non-standard tools like readlink aren't needed.
+ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
+echo "anomalyDetectionSummary: ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR}"
+# Get the "scripts" directory by taking the path of this script and going three directories up to the repository root.
+SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SUMMARY_DIR}/../../../scripts"} # Repository directory containing the shell scripts
+
+# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
+source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
+
+# Appends a Markdown table to an existing file and
+# removes redundant header + separator rows.
+#
+# Usage:
+#   cat newTable.md | append_table myMarkdownFile.md
+#
+#   append_table myMarkdownFile.md <<'EOF'
+#   | Name | Score | Archetype |
+#   | ---  | ---   | ---       |
+#   | Bar  | 0.9   | Something |
+#   EOF
+#
+# Behavior:
+#   - Keeps the first header row and its following separator row.
+#   - Removes all subsequent duplicate header + separator pairs.
+#   - Leaves all data rows untouched.
+append_to_markdown_table() {
+  local file="$1"
+
+  # Append stdin to the target file
+  cat >> "${file}"
+  
+  # Clean up duplicate headers (header row + --- row)
+  awk '!seen[$0]++ || NR <= 2' "${file}" > "${file}.tmp" && mv "${file}.tmp" "${file}"
+}
+
+# Aggregates all results in a Markdown report.
+#
+# Required Parameters:
+# - projection_node_label=...
+#   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_summary_detail_report() {
+    local nodeLabel
+    nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
+    
+    local language
+    language=$( extractQueryParameter "projection_language" "${@}" )
+    
+    echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} anomaly summary Markdown report..."
+    
+    anomaly_summary_directory=${FULL_REPORT_DIRECTORY}/anomaly_summary_${language}_${nodeLabel}
+    mkdir -p "${anomaly_summary_directory}"
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomalyDetectionReportTopArchetypes.cypher" "${@}" --output-markdown-table > "${anomaly_summary_directory}/TopAnomaliesByArchetype.md"
+    # Clean-up after report generation. Empty reports will be deleted.
+    source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${anomaly_summary_directory}"
+}
+
+# Run the anomaly detection overview report generation.
+anomaly_detection_overview_report() {
+    local report_markdown_includes_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
+    mkdir -p "${report_markdown_includes_directory}"
+    
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesPerAbstractionLayer.cypher" --output-markdown-table > "${report_markdown_includes_directory}/AnomaliesPerAbstractionLayer.md"
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesInTotal.cypher" --output-markdown-table > "${report_markdown_includes_directory}/AnomaliesInTotal.md"
+}
+
+# Run the anomaly detection report generation.
+# 
+# Required Parameters:
+# - projection_node_label=...
+#   Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_report() {
+    time anomaly_detection_summary_detail_report "${@}" # passes all parameters on unchanged and reports the elapsed time
+}
+
+# Create report directory
+REPORT_NAME="anomaly-detection"
+FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
+mkdir -p "${FULL_REPORT_DIRECTORY}"
+
+# Query Parameter key pairs for projection and algorithm side
+ALGORITHM_NODE="projection_node_label"
+ALGORITHM_LANGUAGE="projection_language"
+
+# -- Overview Report for all code types ------------------------------
+
+anomaly_detection_overview_report
+
+# -- Detail Reports for each code type -------------------------------
+
+anomaly_detection_report "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_LANGUAGE}=Java"
+anomaly_detection_report "${ALGORITHM_NODE}=Package" "${ALGORITHM_LANGUAGE}=Java"
+anomaly_detection_report "${ALGORITHM_NODE}=Type" "${ALGORITHM_LANGUAGE}=Java"
+anomaly_detection_report "${ALGORITHM_NODE}=Module" "${ALGORITHM_LANGUAGE}=Typescript"
+
+# ---------------------------------------------------------------
+
+echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."
diff --git a/scripts/cleanupAfterReportGeneration.sh b/scripts/cleanupAfterReportGeneration.sh
@@ -29,6 +29,16 @@ find "${report_directory}" -type f -name "*.csv" | sort | while read -r report_f
     fi
 done
 
+# Find all Markdown (md) files in the report directory
+# and delete the ones that contain less than 3 lines.
+find "${report_directory}" -type f -name "*.md" | sort | while read -r report_file; do
+    number_of_lines=$(wc -l < "${report_file}" | awk '{print $1}') # awk reduces the wc output to the bare number
+    if [[ "${number_of_lines}" -le 2 ]]; then # presumably a Markdown table that only consists of header and separator row — TODO confirm
+        echo "cleanupReports: deleting empty (${number_of_lines} lines) report file ${report_file}"
+        rm -f "${report_file}"
+    fi
+done
+
 # Delete reports directory if its empty
 number_files_in_report_directory=$( find "${report_directory}" -type f | wc -l | awk '{print $1}' )
 if [[ "${number_files_in_report_directory}" -lt 1 ]]; then