Skip to content

Commit e453288

Browse files
authored
Merge pull request #79 from codefuse-ai/sync-cli
[sync] latest cli changes.
2 parents 8523a73 + 92f2acc commit e453288

17 files changed

+913
-184
lines changed

Diff for: cli/database/create.py

+53-3
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,65 @@ def memory_statistics():
4848
logging.info(f"final -Xmx is : {max(total_memory - 1, 6):.2f} {size_units[unit_index]}")
4949

5050

51+
def is_valid_regex(pattern):
    """Return True when *pattern* can be compiled as a regular expression.

    Args:
        pattern: Candidate regular expression, normally a string taken from
            the extraction config file.

    Returns:
        True if ``re.compile`` accepts the pattern, False otherwise.
    """
    try:
        re.compile(pattern)
    except (re.error, TypeError):
        # re.error: syntactically broken pattern.
        # TypeError: the value is not a string at all (e.g. a JSON number,
        # null, list or object), which is just as invalid as a bad pattern.
        return False
    return True
57+
58+
5159
def conf_option_deal(args):
5260
options = dict()
5361
if args.extraction_config_file:
5462
try:
5563
with open(args.extraction_config_file, "r") as f:
56-
options = json.load(f)
64+
extract_options = json.load(f)
65+
for conf in extract_options:
66+
language = conf["extractor"]
67+
# all 先不处理
68+
if language == "all":
69+
continue
70+
if language not in args.language:
71+
logging.error("%s language will not be extracted and the configuration is invalid", language)
72+
continue
73+
for option in conf["extractor_options"]:
74+
if "name" not in option:
75+
logging.error("option language error: please check name not in this conf : %s",
76+
json.dumps(option))
77+
return -1
78+
key = option["name"]
79+
if "value" not in option:
80+
logging.error("option value error: value not in this conf : %s", json.dumps(option))
81+
return -1
82+
if "config" not in option["value"]:
83+
logging.error("option config error:config not in this conf[\"value\"]: %s",
84+
json.dumps(option))
85+
return -1
86+
value = option["value"]["config"]
87+
if "pattern" in option["value"]:
88+
pattern = option["value"]["pattern"]
89+
if is_valid_regex(pattern):
90+
if re.search(pattern, value):
91+
logging.warning("option pattern error: this conf will be ignore: %s",
92+
json.dumps(option))
93+
continue
94+
else:
95+
logging.warning("option pattern error: this conf will be ignore: %s",
96+
json.dumps(option))
97+
continue
98+
if language not in options:
99+
options[language] = dict()
100+
if key in options[language]:
101+
logging.error("in %s extract, %s redefine", language, key)
102+
return -1
103+
options[language][key] = value
57104
except Exception as e:
58105
logging.error(e)
59106
return -1
60107
for language in args.language:
61-
options[language] = dict()
108+
if language not in options:
109+
options[language] = dict()
62110
if args.extraction_config:
63111
# 要求option必须是a.b=c的形式,a为语言名,若不是报错
64112
pattern = r'^(.+)\.(.+)\=(.+)$'
@@ -72,6 +120,9 @@ def conf_option_deal(args):
72120
if language not in args.language:
73121
logging.error("option language error: %s does not need to be extracted", language)
74122
return -1
123+
if key in options[language]:
124+
logging.error("in %s extract, %s redefine", language, key)
125+
return -1
75126
options[language][key] = value
76127
else:
77128
logging.error("option format error: %s, it need like java.a=b", tmp)
@@ -87,7 +138,6 @@ def database_create(args):
87138
if options == -1:
88139
logging.error("configuration error, Please check conf")
89140
raise ValueError("configuration error")
90-
memory_statistics()
91141
timeout = args.timeout
92142
extractor_fail = list()
93143
for language in args.language:

Diff for: cli/extractor/extractor.py

+70-31
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22

33
import psutil
4-
4+
import shlex
55
from run.runner import Runner
66
from sparrow_schema.schema import sparrow
77

@@ -17,6 +17,7 @@ class Extractor:
1717
sql_extractor = ""
1818
swift_extractor = ""
1919
xml_extractor = ""
20+
arkts_extractor = ""
2021

2122
def __init__(self):
2223
Extractor.cfamily_extractor = sparrow.home / "language" / "cfamily" / "extractor" / "usr" / "bin" / "coref-cfamily-src-extractor"
@@ -28,6 +29,7 @@ def __init__(self):
2829
Extractor.sql_extractor = sparrow.home / "language" / "sql" / "extractor" / "coref-sql-src-extractor_deploy.jar"
2930
Extractor.swift_extractor = sparrow.home / "language" / "swift" / "extractor" / "usr" / "bin" / "coref-swift-src-extractor"
3031
Extractor.xml_extractor = sparrow.home / "language" / "xml" / "extractor" / "coref-xml-extractor_deploy.jar"
32+
Extractor.arkts_extractor = sparrow.home / "language" / "arkts" / "extractor" / "coref-arkts-src-extractor"
3133

3234

3335
def cfamily_extractor_cmd(source_root, database, options):
@@ -58,15 +60,19 @@ def go_extractor_cmd(source_root, database, options):
5860

5961
def java_extractor_cmd(source_root, database, options):
6062
cmd = list()
61-
cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database)
63+
cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database, options)
6264
if options:
65+
if "white-list" in options and "whiteList" in options:
66+
logging.error("white-list and whiteList cannot be configured at the same time")
67+
return -1
68+
if "cp" in options and "classpath" in options:
69+
logging.error("cp and classpath cannot be configured at the same time")
70+
return -1
6371
for (key, value) in options.items():
6472
if key == "white-list" or key == "whiteList":
65-
cmd += ["-w=", value]
66-
elif key == "cp":
67-
cmd += ["-cp=", value]
68-
elif key == "classpath":
69-
cmd += ["--classpath=", value]
73+
cmd += ["-w=" + value]
74+
elif key == "cp" or key == "classpath":
75+
cmd += ["-cp=" + value]
7076
elif key == "incremental":
7177
if value == "true":
7278
cmd += ["--incremental"]
@@ -80,8 +86,9 @@ def java_extractor_cmd(source_root, database, options):
8086
logging.warning("java.incremental does not take effect, please use java.incremental=true")
8187
else:
8288
if key != "cache-dir" and key != "commit" and key != "remote-cache-type" and \
83-
key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix":
84-
logging.warning("unsupported config name:%s for java extractor.", key)
89+
key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix" and \
90+
key != "jvm_opts":
91+
logging.warning("unsupported config name: %s for java extractor.", key)
8592
if "incremental" not in options or options["incremental"] != "true":
8693
cmd += ["--parallel"]
8794
return cmd
@@ -124,7 +131,7 @@ def javascript_extractor_cmd(source_root, database, options):
124131

125132

126133
def properties_extractor_cmd(source_root, database, options):
    """Build the command line for the properties extractor.

    Delegates entirely to ``jar_extractor_cmd`` using
    ``Extractor.properties_extractor`` as the jar path.
    """
    return jar_extractor_cmd(
        Extractor.properties_extractor,
        source_root,
        database,
        options,
    )
129136

130137

@@ -136,13 +143,13 @@ def python_extractor_cmd(source_root, database, options):
136143

137144
def sql_extractor_cmd(source_root, database, options):
    """Build the command line for the SQL extractor.

    Args:
        source_root: Source directory handed to the extractor.
        database: Output database location handed to the extractor.
        options: Mapping of extractor option names to values; may be empty
            or None.

    Returns:
        The command as a list of arguments: the generic jar command from
        ``jar_extractor_cmd`` followed by ``--sql-dialect-type <value>`` when
        that option is configured.
    """
    cmd = list()
    cmd += jar_extractor_cmd(Extractor.sql_extractor, source_root, database, options)
    # Guard against options being None/empty, like the sibling command
    # builders do, before probing it for the dialect setting.
    if options and "sql-dialect-type" in options:
        cmd += ["--sql-dialect-type", options["sql-dialect-type"]]
    return cmd
143150

144151

145-
def swift_extractor(source_root, database, options):
152+
def swift_extractor_cmd(source_root, database, options):
146153
cmd = list()
147154
cmd += [str(Extractor.swift_extractor), str(source_root), str(database)]
148155
if options:
@@ -156,23 +163,59 @@ def swift_extractor(source_root, database, options):
156163

157164

158165
def xml_extractor_cmd(source_root, database, options):
    """Build the command line for the XML extractor.

    Delegates entirely to ``jar_extractor_cmd`` using
    ``Extractor.xml_extractor`` as the jar path.
    """
    return jar_extractor_cmd(
        Extractor.xml_extractor,
        source_root,
        database,
        options,
    )
161168

162169

163-
def jar_extractor_cmd(extractor_path, source_root, database):
164-
# 获取内存信息
165-
mem = psutil.virtual_memory()
166-
total_memory = mem.total
167-
pod_memory_limit = get_pod_memory_limit()
168-
if pod_memory_limit != 0:
169-
total_memory = pod_memory_limit
170-
total_memory_gb = round(total_memory / (1024 ** 3))
171-
logging.info("current memory is : %s GB", total_memory_gb)
172-
xmx = max(total_memory_gb - 1, 6)
173-
logging.info("final -Xmx is: %s GB", xmx)
170+
def arkts_extractor_cmd(source_root, database, options):
    """Build the command line for the ArkTS extractor.

    Args:
        source_root: Source directory passed after ``-s``.
        database: Output directory; must support the ``/`` operator, since
            the database file ``coref_arkts_src.db`` is placed inside it.
        options: Mapping of option names to string values; may be empty
            or None.

    Returns:
        The command as a list of arguments.
    """
    cmd = [
        str(Extractor.arkts_extractor), "extract",
        "--extract-text", "-s", str(source_root),
        "-d", str(database / "coref_arkts_src.db"),
    ]
    if not options:
        return cmd

    # Options that are emitted as a bare switch; their value is not consulted.
    switch_options = ("use-gitignore", "extract-text", "extract-deps")
    # Options that are emitted as "--name value".
    valued_options = ("file-size-limit", "paths")

    for name, setting in options.items():
        if name in ("blacklist", "b"):
            # Comma-separated entries become separate arguments.
            cmd.append("--blacklist")
            cmd.extend(setting.split(","))
        elif name in switch_options:
            cmd.append("--" + name)
        elif name in valued_options:
            cmd.extend(["--" + name, setting])
        else:
            logging.warning("unsupported config name:%s for arkts extractor.", name)
    return cmd
192+
193+
194+
def jar_extractor_cmd(extractor_path, source_root, database, options):
    """Build the ``java ... -jar`` command line for a jar-packaged extractor.

    Args:
        extractor_path: Path of the extractor jar.
        source_root: Source directory, passed as the first jar argument.
        database: Output database location, passed as the second jar argument.
        options: Mapping of extractor option names to values; may be empty
            or None. A non-empty ``jvm_opts`` entry is split shell-style and
            used verbatim as the JVM options.

    Returns:
        The command as a list of arguments.
    """
    # jvm_opts from user specified extract config, if any.
    jvm_opts = options.get("jvm_opts") if options else None

    # If no jvm_opts came from the extract config, derive -Xmx from memory.
    if not jvm_opts:
        mem = psutil.virtual_memory()
        total_memory = mem.total
        # A non-zero pod limit overrides the machine total.
        pod_memory_limit = get_pod_memory_limit()
        if pod_memory_limit != 0:
            total_memory = pod_memory_limit
        total_memory_gb = round(total_memory / (1024 ** 3))
        # Cap the heap basis at 32G, but keep the real figure for the log so
        # "current memory" is not misreported on larger machines.
        usable_memory_gb = min(total_memory_gb, 32)
        xmx = max(usable_memory_gb - 1, 6)
        logging.info("current memory is: %s GB, will use xmx: %s GB.", total_memory_gb, xmx)
        jvm_opts = f"-Xmx{xmx}g"

    logging.info("extract jvm_opts is: %s .", jvm_opts)

    cmd = ["java"] + shlex.split(jvm_opts) + ["-jar", str(extractor_path)]
    cmd += [str(source_root), str(database)]
    return cmd
178221

@@ -190,10 +233,9 @@ def extractor_run(language, source_root, database, timeout, options):
190233
tmp = Runner(cmd, timeout)
191234
return tmp.subrun()
192235
else:
193-
logging.error("Not supported language: %s", language)
236+
logging.error("Failed to obtain the %s extractor", language)
194237
return -1
195238

196-
197239
def get_pod_memory_limit():
198240
# cgroup 文件系统路径
199241
memory_limit_path = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
@@ -209,7 +251,4 @@ def get_pod_memory_limit():
209251
logging.error(f"IO error occurred when accessing cgroup files: {e}")
210252
except Exception as e:
211253
logging.error(f"An unexpected error occurred: {e}")
212-
return memory_limit
213-
214-
215-
254+
return memory_limit

Diff for: cli/godel/godel_compiler.py

+39-23
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,25 @@
11
import logging
2-
import tempfile
3-
import time
4-
from pathlib import Path
2+
import re
3+
import chardet
54

65
from run.runner import Runner
76
from sparrow_schema.schema import sparrow
87

98

9+
def get_encoding(file_path):
    """Return the encoding chardet reports for the file at *file_path*.

    The whole file is read in binary mode and handed to ``chardet.detect``;
    the ``'encoding'`` entry of its result is returned as-is.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
        detection = chardet.detect(raw_bytes)
    return detection['encoding']
13+
14+
1015
def godel_version_judge(path) -> str:
    """Decide which godel compiler version a script targets.

    Reads the first line of the file at *path*: a line beginning with
    ``//`` followed by optional spaces/tabs and ``script`` yields
    ``"script"``; anything else, or any error while reading, yields
    ``"0.3"``. Errors are logged, not raised.
    """
    # Determine the godel compiler version that matches the script.
    version = "0.3"
    try:
        with open(path, "r", encoding=get_encoding(path)) as script_file:
            first_line = script_file.readline()
            if re.match(r'//[ \t]*script', first_line) is not None:
                version = "script"
    except Exception as err:
        logging.error(f"godel version judge error: {str(err)}")
    return version
@@ -23,8 +28,8 @@ def godel_version_judge(path) -> str:
2328
def get_godel_compile(path):
    """Return the godel compiler matching the script at *path*.

    Uses ``godel_version_judge`` to classify the script and returns
    ``sparrow.godel_0_3`` or ``sparrow.godel_script`` accordingly; an empty
    string is returned for any other version value.
    """
    version = godel_version_judge(path)
    if version == "0.3":
        return sparrow.godel_0_3
    if version == "script":
        return sparrow.godel_script
    return ""
@@ -35,7 +40,8 @@ def backend_execute(path, database, output, timeout, output_format, verbose):
3540
version = godel_version_judge(path)
3641
cmd = list()
3742
cmd += [str(godel), str(path), "--run-souffle-directly", "--package-path"]
38-
cmd += [str(sparrow.lib_1_0)]
43+
if version == "0.3":
44+
cmd += [str(sparrow.lib_03)]
3945
if database is not None:
4046
cmd += ["--souffle-fact-dir", database]
4147
cmd += ["--souffle-output-format", output_format, "--souffle-output-path", output]
@@ -45,23 +51,33 @@ def backend_execute(path, database, output, timeout, output_format, verbose):
4551
return tmp.subrun()
4652

4753

54+
def precompiled(path, timeout):
    """Run the godel-script compiler in ``--semantic-only`` mode on *path*.

    Returns:
        True when the runner reports status 0, False otherwise.
    """
    cmd = [
        str(sparrow.godel_script),
        "-p", str(sparrow.lib_script),
        "--semantic-only",
        str(path),
    ]
    status = Runner(cmd, timeout).subrun()
    return status == 0
61+
62+
4863
def execute(path, database, output, timeout, output_format, verbose):
    """Run a godel query script and return the runner's status.

    Scripts judged to be godel-script are executed directly with the
    godel-script compiler; every other script is handed to
    ``backend_execute``.

    Args:
        path: Script to run.
        database: Fact directory passed after ``-f`` (godel-script branch).
        output: Output location appended after the output-format switch.
        timeout: Timeout forwarded to ``Runner``.
        output_format: ``"sqlite"`` or ``"csv"``; any other value selects
            JSON output.
        verbose: When truthy, ``--verbose`` is appended.
    """
    godel = get_godel_compile(path)
    version = godel_version_judge(path)
    if version != "script":
        return backend_execute(path, database, output, timeout, output_format, verbose)

    # godel-script is executed directly.
    format_switches = {"sqlite": "--output-sqlite", "csv": "--output-csv"}
    cmd = [
        str(godel), "-p", str(sparrow.lib_script), "-f", database,
        "-Of", "-r", str(path),
        format_switches.get(output_format, "--output-json"),
        output,
    ]
    if verbose:
        cmd.append("--verbose")
    return Runner(cmd, timeout).subrun()

Diff for: cli/package/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)