Skip to content

Commit 03af6da

Browse files
committed
Add anomaly detection Markdown summary report
1 parent d0a7bf7 commit 03af6da

File tree

6 files changed

+289
-0
lines changed

6 files changed

+289
-0
lines changed

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ anomaly_detection_features() {
6767
# Required Parameters:
6868
# - projection_node_label=...
6969
# Label of the nodes that will be used for the projection. Example: "Package"
70+
# - projection_language=...
71+
# Name of the associated programming language. Default: "Java". Example: "Typescript"
7072
anomaly_detection_queries() {
7173
local nodeLabel
7274
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -92,6 +94,8 @@ anomaly_detection_queries() {
9294
# Required Parameters:
9395
# - projection_node_label=...
9496
# Label of the nodes that will be used for the projection. Example: "Package"
97+
# - projection_language=...
98+
# Name of the associated programming language. Examples: "Java", "Typescript"
9599
anomaly_detection_labels() {
96100
local nodeLabel
97101
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -109,6 +113,63 @@ anomaly_detection_labels() {
109113
# execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"
110114
}
111115

116+
# TODO delete if not needed anymore
117+
# # Initialize anomaly detail (Markdown) report.
118+
# # Intended to be run before the first call of "anomaly_detection_detail_report".
119+
# initialize_anomaly_detection_detail_report() {
120+
# archetype_summary_directory=${FULL_REPORT_DIRECTORY}/archetype-summary-${language}-${nodeLabel}
121+
# rm -rf "${archetype_summary_directory}"
122+
# }
123+
124+
# Appends a Markdown table (read from stdin) to a file and
# removes repeated header + separator rows.
#
# Usage:
#   cat newTable.md | append_to_markdown_table myMarkdownFile.md
#
#   append_to_markdown_table myMarkdownFile.md <<'EOF'
#   | Name | Score | Archetype |
#   | --- | --- | --- |
#   | Bar | 0.9 | Something |
#   EOF
#
# Arguments:
#   $1 - Path of the Markdown file to append to. It is created if it doesn't exist yet.
#
# Behavior:
# - Keeps the first header row and its following separator row.
# - Removes every later header row that is directly followed by the separator row.
# - Leaves all data rows untouched, including data rows that occur more than once.
append_to_markdown_table() {
  local file="$1"

  # Append stdin to the target file
  cat >> "${file}"

  # The first two lines of the file define the header and the separator row.
  # A later header row is held back until the next line shows whether it starts
  # a repeated header + separator pair (dropped) or is ordinary content (kept).
  awk '
    NR == 1 { header = $0; print; next }
    NR == 2 { separator = $0; print; next }
    held {
      held = 0
      if ($0 == separator) { next }
      print header
    }
    $0 == header { held = 1; next }
    { print }
    END { if (held) { print header } }
  ' "${file}" > "${file}.tmp" && mv "${file}.tmp" "${file}"
}
149+
150+
# Writes the top anomalies by archetype as a Markdown table for one language and node label.
# The table is written to
#   ${FULL_REPORT_DIRECTORY}/archetype-summary-<language>-<nodeLabel>/TopAnomaliesByArchetype.md
# and replaces a previously written file of the same name.
#
# Required Parameters:
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_detail_report() {
  local nodeLabel
  nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

  local language
  language=$( extractQueryParameter "projection_language" "${@}" )

  echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} anomaly summary Markdown report..."

  # Declared local so that the directory name doesn't leak into the calling script.
  local archetype_summary_directory="${FULL_REPORT_DIRECTORY}/archetype-summary-${language}-${nodeLabel}"
  mkdir -p "${archetype_summary_directory}"
  # Alternative that collects all tables in one shared file (currently disabled):
  # execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeSummary.cypher" "${@}" --output-markdown-table | append_to_markdown_table "${FULL_REPORT_DIRECTORY}/TopAnomaliesByArchetype.md"
  execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeSummary.cypher" "${@}" --output-markdown-table > "${archetype_summary_directory}/TopAnomaliesByArchetype.md"
}
172+
112173
# Run the anomaly detection pipeline.
113174
#
114175
# Required Parameters:
@@ -118,10 +179,13 @@ anomaly_detection_labels() {
118179
# Label of the nodes that will be used for the projection. Example: "Package"
119180
# - projection_weight_property=...
120181
# Name of the node property that contains the dependency weight. Example: "weight"
182+
# - projection_language=...
183+
# Name of the associated programming language. Examples: "Java", "Typescript"
121184
anomaly_detection_csv_reports() {
  # Pipeline steps in execution order. Each step receives all query parameters
  # unchanged and its run time is reported by "time".
  local pipeline_step
  for pipeline_step in \
    anomaly_detection_features \
    anomaly_detection_queries \
    anomaly_detection_labels \
    anomaly_detection_detail_report
  do
    time "${pipeline_step}" "${@}"
  done
}
126190

127191
# Create report directory
@@ -146,6 +210,9 @@ ALGORITHM_LANGUAGE="projection_language"
146210
COMMUNITY_PROPERTY="community_property=communityLeidenIdTuned"
147211
EMBEDDING_PROPERTY="embedding_property=embeddingsFastRandomProjectionTunedForClustering"
148212

213+
#TODO delete if not needed anymore
214+
#initialize_anomaly_detection_detail_report
215+
149216
# -- Java Artifact Node Embeddings -------------------------------
150217

151218
if createUndirectedDependencyProjection "${PROJECTION_NAME}=artifact-anomaly-detection" "${PROJECTION_NODE}=Artifact" "${PROJECTION_WEIGHT}=weight" "${PROJECTION_LANGUAGE}=Java"; then
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Anomaly Detection Summary: Overview of all analyzed code units in total. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
2+
3+
MATCH (codeUnit)
4+
WHERE (codeUnit.incomingDependencies IS NOT NULL
5+
OR codeUnit.outgoingDependencies IS NOT NULL)
6+
WITH count(DISTINCT codeUnit) AS codeUnitCount
7+
,sum(codeUnit.anomalyLabel) AS anomalyCount
8+
,sum(sign(codeUnit.anomalyAuthorityRank)) AS authorityCount
9+
,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
10+
,sum(sign(codeUnit.anomalyBridgeRank)) AS bridgeCount
11+
,sum(sign(codeUnit.anomalyHubRank)) AS hubCount
12+
,sum(sign(codeUnit.anomalyOutlierRank)) AS outlierCount
13+
//,collect(codeUnit.name)[0..4] AS exampleNames
14+
RETURN codeUnitCount AS `Analyzed Units`
15+
,anomalyCount AS `Anomalies`
16+
,authorityCount AS `Authorities`
17+
,bottleNeckCount AS `Bottlenecks`
18+
,bridgeCount AS `Bridges`
19+
,hubCount AS `Hubs`
20+
,outlierCount AS `Outliers`
21+
//,exampleNames
22+
ORDER BY anomalyCount DESC, codeUnitCount DESC
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Anomaly Detection Summary: Overview of analyzed code units and the number of anomalies detected. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
2+
3+
MATCH (codeUnit)
4+
WHERE (codeUnit.incomingDependencies IS NOT NULL
5+
OR codeUnit.outgoingDependencies IS NOT NULL)
6+
UNWIND labels(codeUnit) AS codeUnitLabel
7+
WITH *
8+
WHERE NOT codeUnitLabel STARTS WITH 'Mark4'
9+
AND NOT codeUnitLabel IN ['File', 'Directory', 'ByteCode', 'GenericDeclaration']
10+
WITH collect(codeUnitLabel) AS codeUnitLabels
11+
,codeUnit
12+
WITH apoc.text.join(codeUnitLabels, ',') AS codeUnitLabels
13+
,count(DISTINCT codeUnit) AS codeUnitCount
14+
,sum(codeUnit.anomalyLabel) AS anomalyCount
15+
,sum(sign(codeUnit.anomalyAuthorityRank)) AS authorityCount
16+
,sum(sign(codeUnit.anomalyBottleneckRank)) AS bottleNeckCount
17+
,sum(sign(codeUnit.anomalyBridgeRank)) AS bridgeCount
18+
,sum(sign(codeUnit.anomalyHubRank)) AS hubCount
19+
,sum(sign(codeUnit.anomalyOutlierRank)) AS outlierCount
20+
//,collect(codeUnit.name)[0..4] AS exampleNames
21+
RETURN codeUnitLabels AS `Abstraction Level`
22+
,codeUnitCount AS `Units`
23+
,anomalyCount AS `Anomalies`
24+
,authorityCount AS `Authorities`
25+
,bottleNeckCount AS `Bottlenecks`
26+
,bridgeCount AS `Bridges`
27+
,hubCount AS `Hubs`
28+
,outlierCount AS `Outliers`
29+
//,exampleNames
30+
ORDER BY anomalyCount DESC, codeUnitCount DESC
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Anomaly Detection Labels: Summarizes all labelled archetypes by their anomaly score including their archetype rank. For code units with more than one archetype, the one with the higher rank is shown. Requires all other labels/*.cypher queries to run first. Variables: projection_language, projection_node_label
2+
3+
MATCH (codeUnit)
4+
WHERE $projection_node_label IN labels(codeUnit)
5+
UNWIND keys(codeUnit) AS codeUnitProperty
6+
WITH *
7+
WHERE codeUnitProperty STARTS WITH 'anomaly'
8+
AND codeUnitProperty ENDS WITH 'Rank'
9+
WITH *
10+
,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName
11+
,split(split(codeUnitProperty, 'anomaly')[1], 'Rank')[0] AS archetype
12+
,codeUnit[codeUnitProperty] AS archetypeRank
13+
,codeUnit.anomalyScore AS anomalyScore
14+
ORDER BY codeUnit.anomalyScore DESC, archetypeRank ASC, codeUnitName ASC, archetype ASC
15+
OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit)
16+
WITH *, artifact.name AS artifactName
17+
OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit)
18+
WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName
19+
OPTIONAL MATCH (codeDirectory:File:Directory)-[:CONTAINS]->(codeUnit)
20+
WITH *, split(replace(codeDirectory.fileName, './', ''), '/')[-2] AS directoryName
21+
WITH *, coalesce(artifactName, projectName, directoryName, "") AS projectName
22+
RETURN projectName AS `Contained in`
23+
//$projection_language + ' ' + $projection_node_label AS `Code Unit`
24+
,codeUnitName AS `Name`
25+
,round(anomalyScore, 4, 'HALF_UP') AS `Score`
26+
,collect(archetype)[0] AS `Archetype`
27+
,collect(archetypeRank)[0] AS `Archetype Rank`
28+
,nullif(codeUnit.anomalyTopFeature1, "") AS `Top Feature 1`
29+
,nullif(round(codeUnit.anomalyTopFeatureSHAPValue1, 4, 'HALF_UP'), 0.0) AS `Top Feature 1 SHAP`
30+
,nullif(codeUnit.anomalyTopFeature2, "") AS `Top Feature 2`
31+
,nullif(round(codeUnit.anomalyTopFeatureSHAPValue2, 4, 'HALF_UP'), 0.0) AS `Top Feature 2 SHAP`
32+
,nullif(codeUnit.anomalyTopFeature3, "") AS `Top Feature 3`
33+
,nullif(round(codeUnit.anomalyTopFeatureSHAPValue3, 4, 'HALF_UP'), 0.0) AS `Top Feature 3 SHAP`
34+
,CASE WHEN codeUnit.anomalyScore <= 0 THEN 'Typical'
35+
WHEN codeUnit.anomalyTopFeature1 IS NULL THEN 'Undetermined'
36+
ELSE 'Anomalous' END AS `Model Status`
37+
//,collect(archetype)[1] AS secondaryArchetype
38+
//,collect(archetypeRank)[1] AS secondaryArchetypeRank
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#!/usr/bin/env bash
2+
3+
# Creates a Markdown report that contains all results of all the anomaly detection methods.
4+
# It requires an already running Neo4j graph database with already scanned and analyzed artifacts.
5+
# The results will be written into the sub directory reports/anomaly-detection.
6+
7+
# Note that "scripts/prepareAnalysis.sh" is required to run prior to this script.
8+
# Note that either "anomalyDetectionCsv.sh" or "anomalyDetectionPython.sh" is required to run prior to this script.
9+
10+
# Requires executeQueryFunctions.sh, projectionFunctions.sh, cleanupAfterReportGeneration.sh
11+
12+
# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
13+
set -o errexit -o pipefail
14+
15+
# Overrideable Constants (defaults also defined in sub scripts)
16+
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
17+
MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"}
18+
19+
## Get this "domains/anomaly-detection/summary" directory if not already set
20+
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
21+
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
22+
# This way non-standard tools like readlink aren't needed.
23+
ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
24+
echo "anomalyDetectionSummary: ANOMALY_DETECTION_SUMMARY_DIR=${ANOMALY_DETECTION_SUMMARY_DIR}"
25+
# Get the "scripts" directory by taking the path of this script and going three directories up.
26+
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SUMMARY_DIR}/../../../scripts"} # Repository directory containing the shell scripts
27+
28+
# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
29+
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
30+
31+
# Appends a Markdown table (read from stdin) to a file and
# removes repeated header + separator rows.
#
# Usage:
#   cat newTable.md | append_to_markdown_table myMarkdownFile.md
#
#   append_to_markdown_table myMarkdownFile.md <<'EOF'
#   | Name | Score | Archetype |
#   | --- | --- | --- |
#   | Bar | 0.9 | Something |
#   EOF
#
# Arguments:
#   $1 - Path of the Markdown file to append to. It is created if it doesn't exist yet.
#
# Behavior:
# - Keeps the first header row and its following separator row.
# - Removes every later header row that is directly followed by the separator row.
# - Leaves all data rows untouched, including data rows that occur more than once.
append_to_markdown_table() {
  local file="$1"

  # Append stdin to the target file
  cat >> "${file}"

  # The first two lines of the file define the header and the separator row.
  # A later header row is held back until the next line shows whether it starts
  # a repeated header + separator pair (dropped) or is ordinary content (kept).
  awk '
    NR == 1 { header = $0; print; next }
    NR == 2 { separator = $0; print; next }
    held {
      held = 0
      if ($0 == separator) { next }
      print header
    }
    $0 == header { held = 1; next }
    { print }
    END { if (held) { print header } }
  ' "${file}" > "${file}.tmp" && mv "${file}.tmp" "${file}"
}
56+
57+
# Writes the top anomalies by archetype as a Markdown table for one language and node label.
# The table is written to
#   ${FULL_REPORT_DIRECTORY}/anomaly_summary_<language>_<nodeLabel>/TopAnomaliesByArchetype.md
# Afterwards, "cleanupAfterReportGeneration.sh" is sourced for that directory.
#
# Required Parameters:
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_summary_detail_report() {
  local nodeLabel
  nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )

  local language
  language=$( extractQueryParameter "projection_language" "${@}" )

  echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Creating ${language} ${nodeLabel} anomaly summary Markdown report..."

  # Declared local so that the directory name doesn't leak into the calling script.
  local anomaly_summary_directory="${FULL_REPORT_DIRECTORY}/anomaly_summary_${language}_${nodeLabel}"
  mkdir -p "${anomaly_summary_directory}"
  execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomalyDetectionReportTopArchetypes.cypher" "${@}" --output-markdown-table > "${anomaly_summary_directory}/TopAnomaliesByArchetype.md"
  # Clean-up after report generation. Empty reports will be deleted.
  source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${anomaly_summary_directory}"
}
79+
80+
# Generates the overview tables that cover all code unit types at once.
# Every overview query is written as a Markdown table of the same base name into
# "${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}".
anomaly_detection_overview_report() {
  local includes_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
  mkdir -p "${includes_directory}"

  local overview_query
  for overview_query in "AnomaliesPerAbstractionLayer" "AnomaliesInTotal"; do
    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/${overview_query}.cypher" --output-markdown-table > "${includes_directory}/${overview_query}.md"
  done
}
88+
89+
# Entry point for the detail report of one code unit type.
# Delegates to "anomaly_detection_summary_detail_report" and reports its run time.
#
# Required Parameters:
# - projection_node_label=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - projection_language=...
#   Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_report() { time anomaly_detection_summary_detail_report "$@"; }
99+
100+
# Create report directory
101+
REPORT_NAME="anomaly-detection"
102+
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
103+
mkdir -p "${FULL_REPORT_DIRECTORY}"
104+
105+
# Query Parameter key pairs for projection and algorithm side
106+
ALGORITHM_NODE="projection_node_label"
107+
ALGORITHM_LANGUAGE="projection_language"
108+
109+
# -- Overview Report for all code types -------------------------------
110+
111+
anomaly_detection_overview_report
112+
113+
# -- Detail Reports for each code type -------------------------------
114+
115+
anomaly_detection_report "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_LANGUAGE}=Java"
116+
anomaly_detection_report "${ALGORITHM_NODE}=Package" "${ALGORITHM_LANGUAGE}=Java"
117+
anomaly_detection_report "${ALGORITHM_NODE}=Type" "${ALGORITHM_LANGUAGE}=Java"
118+
anomaly_detection_report "${ALGORITHM_NODE}=Module" "${ALGORITHM_LANGUAGE}=Typescript"
119+
120+
# ---------------------------------------------------------------
121+
122+
echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."

scripts/cleanupAfterReportGeneration.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ find "${report_directory}" -type f -name "*.csv" | sort | while read -r report_f
2929
fi
3030
done
3131

32+
# Find all Markdown (md) files in the report directory
# and delete the ones that contain fewer than 3 lines
# (at most a header row and a separator row, i.e. no data rows).
find "${report_directory}" -type f -name "*.md" | sort | while IFS= read -r report_file; do
  # Lines are counted with awk instead of "wc -l": "wc -l" counts newline characters
  # and would therefore miss a last line that doesn't end with a newline.
  # The file is passed via stdin so that its name is never interpreted by awk.
  number_of_lines=$(awk 'END { print NR }' < "${report_file}")
  if [[ "${number_of_lines}" -le 2 ]]; then
    echo "cleanupReports: deleting empty (${number_of_lines} lines) report file ${report_file}"
    rm -f "${report_file}"
  fi
done
41+
3242
# Delete reports directory if it's empty
3343
number_files_in_report_directory=$( find "${report_directory}" -type f | wc -l | awk '{print $1}' )
3444
if [[ "${number_files_in_report_directory}" -lt 1 ]]; then

0 commit comments

Comments
 (0)