1
1
import logging
2
2
3
3
import psutil
4
-
4
+ import shlex
5
5
from run .runner import Runner
6
6
from sparrow_schema .schema import sparrow
7
7
@@ -17,6 +17,7 @@ class Extractor:
17
17
sql_extractor = ""
18
18
swift_extractor = ""
19
19
xml_extractor = ""
20
+ arkts_extractor = ""
20
21
21
22
def __init__ (self ):
22
23
Extractor .cfamily_extractor = sparrow .home / "language" / "cfamily" / "extractor" / "usr" / "bin" / "coref-cfamily-src-extractor"
@@ -28,6 +29,7 @@ def __init__(self):
28
29
Extractor .sql_extractor = sparrow .home / "language" / "sql" / "extractor" / "coref-sql-src-extractor_deploy.jar"
29
30
Extractor .swift_extractor = sparrow .home / "language" / "swift" / "extractor" / "usr" / "bin" / "coref-swift-src-extractor"
30
31
Extractor .xml_extractor = sparrow .home / "language" / "xml" / "extractor" / "coref-xml-extractor_deploy.jar"
32
+ Extractor .arkts_extractor = sparrow .home / "language" / "arkts" / "extractor" / "coref-arkts-src-extractor"
31
33
32
34
33
35
def cfamily_extractor_cmd (source_root , database , options ):
@@ -58,15 +60,19 @@ def go_extractor_cmd(source_root, database, options):
58
60
59
61
def java_extractor_cmd (source_root , database , options ):
60
62
cmd = list ()
61
- cmd += jar_extractor_cmd (Extractor .java_extractor , source_root , database )
63
+ cmd += jar_extractor_cmd (Extractor .java_extractor , source_root , database , options )
62
64
if options :
65
+ if "white-list" in options and "whiteList" in options :
66
+ logging .error ("white-list and whiteList cannot be configured at the same time" )
67
+ return - 1
68
+ if "cp" in options and "classpath" in options :
69
+ logging .error ("cp and classpath cannot be configured at the same time" )
70
+ return - 1
63
71
for (key , value ) in options .items ():
64
72
if key == "white-list" or key == "whiteList" :
65
- cmd += ["-w=" , value ]
66
- elif key == "cp" :
67
- cmd += ["-cp=" , value ]
68
- elif key == "classpath" :
69
- cmd += ["--classpath=" , value ]
73
+ cmd += ["-w=" + value ]
74
+ elif key == "cp" or key == "classpath" :
75
+ cmd += ["-cp=" + value ]
70
76
elif key == "incremental" :
71
77
if value == "true" :
72
78
cmd += ["--incremental" ]
@@ -80,8 +86,9 @@ def java_extractor_cmd(source_root, database, options):
80
86
logging .warning ("java.incremental does not take effect, please use java.incremental=true" )
81
87
else :
82
88
if key != "cache-dir" and key != "commit" and key != "remote-cache-type" and \
83
- key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix" :
84
- logging .warning ("unsupported config name:%s for java extractor." , key )
89
+ key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix" and \
90
+ key != "jvm_opts" :
91
+ logging .warning ("unsupported config name: %s for java extractor." , key )
85
92
if "incremental" not in options or options ["incremental" ] != "true" :
86
93
cmd += ["--parallel" ]
87
94
return cmd
@@ -124,7 +131,7 @@ def javascript_extractor_cmd(source_root, database, options):
124
131
125
132
126
133
def properties_extractor_cmd(source_root, database, options):
    """Build the command line for the properties extractor.

    The properties extractor is jar-packaged, so the whole command is
    delegated to ``jar_extractor_cmd``.

    Args:
        source_root: Root directory of the sources to extract.
        database: Location of the output database.
        options: Extractor config mapping, forwarded unchanged.

    Returns:
        The argument list produced by ``jar_extractor_cmd``.
    """
    return jar_extractor_cmd(
        Extractor.properties_extractor, source_root, database, options
    )
129
136
130
137
@@ -136,13 +143,13 @@ def python_extractor_cmd(source_root, database, options):
136
143
137
144
def sql_extractor_cmd(source_root, database, options):
    """Build the command line for the SQL extractor.

    Starts from the generic jar command and appends the SQL dialect when the
    config provides one.

    Args:
        source_root: Root directory of the sources to extract.
        database: Location of the output database.
        options: Extractor config mapping; may be empty or ``None``.

    Returns:
        The argument list to execute.
    """
    cmd = list(
        jar_extractor_cmd(Extractor.sql_extractor, source_root, database, options)
    )
    # Guard against options being None/empty, like the other command builders
    # do, so the membership test cannot raise TypeError.
    if options and "sql-dialect-type" in options:
        cmd += ["--sql-dialect-type", options["sql-dialect-type"]]
    return cmd
143
150
144
151
145
- def swift_extractor (source_root , database , options ):
152
+ def swift_extractor_cmd (source_root , database , options ):
146
153
cmd = list ()
147
154
cmd += [str (Extractor .swift_extractor ), str (source_root ), str (database )]
148
155
if options :
@@ -156,23 +163,59 @@ def swift_extractor(source_root, database, options):
156
163
157
164
158
165
def xml_extractor_cmd(source_root, database, options):
    """Build the command line for the XML extractor.

    The XML extractor is jar-packaged, so the whole command is delegated to
    ``jar_extractor_cmd``.

    Args:
        source_root: Root directory of the sources to extract.
        database: Location of the output database.
        options: Extractor config mapping, forwarded unchanged.

    Returns:
        The argument list produced by ``jar_extractor_cmd``.
    """
    return jar_extractor_cmd(
        Extractor.xml_extractor, source_root, database, options
    )
161
168
162
169
163
- def jar_extractor_cmd (extractor_path , source_root , database ):
164
- # 获取内存信息
165
- mem = psutil .virtual_memory ()
166
- total_memory = mem .total
167
- pod_memory_limit = get_pod_memory_limit ()
168
- if pod_memory_limit != 0 :
169
- total_memory = pod_memory_limit
170
- total_memory_gb = round (total_memory / (1024 ** 3 ))
171
- logging .info ("current memory is : %s GB" , total_memory_gb )
172
- xmx = max (total_memory_gb - 1 , 6 )
173
- logging .info ("final -Xmx is: %s GB" , xmx )
170
def arkts_extractor_cmd(source_root, database, options):
    """Build the command line for the ArkTS source extractor.

    Args:
        source_root: Root directory of the sources to extract.
        database: Output database directory. It is joined with a file name
            using ``/``, so it presumably is a ``pathlib.Path`` -- confirm
            against the caller.
        options: Extractor config mapping; may be empty or ``None``.

    Returns:
        The argument list to execute.
    """
    cmd = [
        str(Extractor.arkts_extractor), "extract",
        "--extract-text", "-s", str(source_root),
        "-d", str(database / "coref_arkts_src.db"),
    ]
    if not options:
        return cmd

    for key, value in options.items():
        if key in ("blacklist", "b"):
            # The config value is a comma-separated list of entries.
            cmd += ["--blacklist"] + value.split(",")
        elif key == "use-gitignore":
            cmd.append("--use-gitignore")
        elif key == "extract-text":
            # --extract-text is already part of the base command above;
            # accept the option but do not pass the flag a second time.
            continue
        elif key == "extract-deps":
            cmd.append("--extract-deps")
        elif key == "file-size-limit":
            cmd += ["--file-size-limit", value]
        elif key == "paths":
            cmd += ["--paths", value]
        else:
            logging.warning("unsupported config name:%s for arkts extractor.", key)
    return cmd
192
+
193
+
194
def jar_extractor_cmd(extractor_path, source_root, database, options):
    """Build the ``java`` command line that runs a jar-packaged extractor.

    JVM options come from the ``jvm_opts`` entry of ``options`` when it is
    present and non-empty. Otherwise a single ``-Xmx`` option is derived from
    the available memory.

    Args:
        extractor_path: Path of the extractor jar.
        source_root: Root directory of the sources to extract.
        database: Location of the output database.
        options: Extractor config mapping; may be empty or ``None``.

    Returns:
        The argument list: ``java``, the JVM options, ``-jar`` and the jar
        path, followed by the source root and the database.
    """
    # jvm_opts from the user-specified extract config takes precedence.
    jvm_opts = options.get("jvm_opts") if options else None

    # If the extract config gives no jvm_opts, size the heap from memory.
    if not jvm_opts:
        total_memory = psutil.virtual_memory().total
        # A non-zero pod memory limit overrides the host memory figure.
        pod_memory_limit = get_pod_memory_limit()
        if pod_memory_limit != 0:
            total_memory = pod_memory_limit
        total_memory_gb = min(round(total_memory / (1024 ** 3)), 32)  # limit to 32G
        # Leave 1 GB of headroom, but never go below a 6 GB heap.
        xmx = max(total_memory_gb - 1, 6)
        logging.info("current memory is: %s GB, will use xmx: %s GB.", total_memory_gb, xmx)
        # Must be one token with no embedded spaces, or shlex.split below
        # would break it into separate arguments.
        jvm_opts = f"-Xmx{xmx}g"

    logging.info("extract jvm_opts is: %s .", jvm_opts)

    # shlex.split lets the config carry several JVM options in one string.
    return [
        "java",
        *shlex.split(jvm_opts),
        "-jar",
        str(extractor_path),
        str(source_root),
        str(database),
    ]
178
221
@@ -190,10 +233,9 @@ def extractor_run(language, source_root, database, timeout, options):
190
233
tmp = Runner (cmd , timeout )
191
234
return tmp .subrun ()
192
235
else :
193
- logging .error ("Not supported language: %s " , language )
236
+ logging .error ("Failed to obtain the %s extractor " , language )
194
237
return - 1
195
238
196
-
197
239
def get_pod_memory_limit ():
198
240
# cgroup 文件系统路径
199
241
memory_limit_path = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
@@ -209,7 +251,4 @@ def get_pod_memory_limit():
209
251
logging .error (f"IO error occurred when accessing cgroup files: { e } " )
210
252
except Exception as e :
211
253
logging .error (f"An unexpected error occurred: { e } " )
212
- return memory_limit
213
-
214
-
215
-
254
+ return memory_limit
0 commit comments