diff --git a/mubench.pipeline/crossproject_create_index.py b/mubench.pipeline/crossproject_create_index.py deleted file mode 100644 index 527bcd8f2..000000000 --- a/mubench.pipeline/crossproject_create_index.py +++ /dev/null @@ -1,43 +0,0 @@ -import os - -import sys - -from data.misuse import Misuse -from data.project import Project -from data.project_version import ProjectVersion -from tasks.implementations.collect_misuses import CollectMisusesTask -from tasks.implementations.collect_projects import CollectProjectsTask -from tasks.implementations.collect_versions import CollectVersionsTask -from tasks.task_runner import TaskRunner -from utils.data_entity_lists import DataEntityLists -from utils.dataset_util import get_white_list - - -__MUBENCH_ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)) -__MUBENCH_DATA_PATH = os.path.join(__MUBENCH_ROOT_PATH, "data") -__MUBENCH_DATASETS_FILE = os.path.join(__MUBENCH_DATA_PATH, "datasets.yml") -_INDEX_PATH = os.path.join(__MUBENCH_ROOT_PATH, "checkouts-xp", "index.csv") - - -class PrintIndexTask: - def run(self, project: Project, version: ProjectVersion, misuse: Misuse): - print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(project.id, version.version_id, misuse.misuse_id, - ':'.join(version.source_dirs), - misuse.location.file, misuse.location.method, - "\t".join(misuse.apis)), file=open(_INDEX_PATH, "a")) - - -datasets = sys.argv[1:] - -white_list = [] -for dataset in datasets: - white_list.extend(get_white_list(__MUBENCH_DATASETS_FILE, dataset.lower())) -initial_parameters = [DataEntityLists(white_list, [])] - -runner = TaskRunner( - [CollectProjectsTask(__MUBENCH_DATA_PATH), CollectVersionsTask(False), CollectMisusesTask(), PrintIndexTask()]) - -if os.path.exists(_INDEX_PATH): - os.remove(_INDEX_PATH) - -runner.run(*initial_parameters) diff --git a/mubench.pipeline/crossproject_create_project_list.py b/mubench.pipeline/crossproject_create_project_list.py deleted file mode 100644 index 
b6564d107..000000000 --- a/mubench.pipeline/crossproject_create_project_list.py +++ /dev/null @@ -1,51 +0,0 @@ -import csv -import logging -import os -from datetime import datetime -from os.path import exists, join - -from utils.io import open_yamls_if_exists, write_yaml -from utils.logging import IndentFormatter -from utils.shell import Shell - -MUBENCH_ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)) -CHECKOUTS_PATH = os.path.join(MUBENCH_ROOT_PATH, "checkouts-xp") -INDEX_PATH = os.path.join(CHECKOUTS_PATH, "index.csv") - -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) -handler = logging.StreamHandler() -handler.setFormatter(IndentFormatter("%(asctime)s %(indent)s%(message)s")) -handler.setLevel(logging.INFO) -logger.addHandler(handler) -LOG_DIR = "logs" -if not exists(LOG_DIR): - os.makedirs(LOG_DIR) -log_name = datetime.now().strftime("prepare_ex4_%Y%m%d_%H%M%S") + ".log" -handler = logging.FileHandler(join(LOG_DIR, log_name)) -handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) -handler.setLevel(logging.DEBUG) -logger.addHandler(handler) - -example_projects_by_API = {} -with open(INDEX_PATH) as index: - for row in csv.reader(index, delimiter="\t"): - # skip blank lines, e.g., on trailing newline - if not row: - continue - - target_type = row[6] - try: - if target_type not in example_projects_by_API: - logger.info("Preparing examples for type: %s...", target_type) - target_example_file = os.path.join(CHECKOUTS_PATH, target_type + ".yml") - example_projects = {} - with open_yamls_if_exists(target_example_file) as projects: - for project in projects: - hash = Shell.exec("cd \"{}\"; git rev-parse HEAD".format(join(MUBENCH_ROOT_PATH, project["path"]))) - example_projects[project["url"]] = hash.strip() - example_projects_by_API[target_type] = example_projects - except Exception as error: - logger.exception("failed", exc_info=error) - 
-write_yaml(example_projects_by_API, join(CHECKOUTS_PATH, "example_projects_by_API.yml")) diff --git a/mubench.pipeline/crossproject_prepare.py b/mubench.pipeline/crossproject_prepare.py deleted file mode 100644 index 8ae4b5c14..000000000 --- a/mubench.pipeline/crossproject_prepare.py +++ /dev/null @@ -1,129 +0,0 @@ -import calendar -import csv -import logging -import os -import sys -from datetime import datetime -from os.path import exists, join -from typing import List - -from boa.BOA import BOA -from buildtools.maven import Project -from utils.io import write_yamls, write_yaml, is_empty -from utils.logging import IndentFormatter -from utils.shell import CommandFailedError - -MUBENCH_ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)) -XP_CHECKOUTS_PATH = os.path.join(MUBENCH_ROOT_PATH, "checkouts-xp") -CHECKOUTS_PATH = os.path.join(XP_CHECKOUTS_PATH, "checkouts") -BOA_RESULTS_PATH = join(XP_CHECKOUTS_PATH, "boa-results") -INDEX_PATH = os.path.join(XP_CHECKOUTS_PATH, "index.csv") -SUBTYPES_PATH = os.path.join(XP_CHECKOUTS_PATH, "subtypes.csv") -MAX_SUBTYPES_SAMPLE_SIZE = 25 -MAX_PROJECT_SAMPLE_SIZE = 50 - -_SUBTYPES = {} - -username = sys.argv[1] -password = sys.argv[2] - -now = datetime.utcnow() -run_timestamp = calendar.timegm(now.timetuple()) - - -def _get_subtypes(target_type): - if not _SUBTYPES: - with open(SUBTYPES_PATH) as subtypes_file: - for subtypes_row in csv.reader(subtypes_file, delimiter="\t"): - _SUBTYPES[subtypes_row[0]] = subtypes_row[1:] - - all_subtypes = _SUBTYPES.get(target_type, []) - subtypes_sample = [subtype for subtype in all_subtypes if "sun." 
not in subtype] # filter Sun-specific types - return subtypes_sample - - -def _get_type_and_subtypes_list(target_type): - return [target_type] + _get_subtypes(target_type) - - -def _create_type_combinations(target_types: List): - if len(target_types) == 1: - return ([type] for type in _get_type_and_subtypes_list(target_types[0])) - else: - return ([target_type] + tail - for target_type in _get_type_and_subtypes_list(target_types[0]) - for tail in _create_type_combinations(target_types[1:])) - - -def _prepare_example_projects(target_types: List, boa: BOA, metadata_path: str): - data = [] - for type_combination in _create_type_combinations(target_types): - projects = boa.query_projects_with_type_usages(target_types, type_combination) - for project in projects: - checkout = project.get_checkout(CHECKOUTS_PATH) - if not checkout.exists(): - try: - logger.info(" Checking out %r...", str(project)) - checkout.clone() - except CommandFailedError as error: - logger.warning(" Checkout failed: %r", error) - checkout.delete() - continue - else: - logger.info(" Already checked out %r.", str(project)) - - try: - project_entry = {"id": project.id, "url": project.repository_url, - "path": os.path.relpath(checkout.path, MUBENCH_ROOT_PATH), - "source_paths": Project(checkout.path).get_sources_paths(), - "checkout_timestamp": run_timestamp} - write_yaml(project_entry) # check for encoding problems - data.append(project_entry) - except UnicodeEncodeError: - logger.warning(" Illegal characters in project data.") - - if len(data) >= MAX_PROJECT_SAMPLE_SIZE: - logger.warning(" Stopping after %r of %r example projects.", MAX_PROJECT_SAMPLE_SIZE, len(projects)) - write_yamls(data, metadata_path) - return - - write_yamls(data, metadata_path) - - -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) -handler = logging.StreamHandler() -handler.setFormatter(IndentFormatter("%(asctime)s %(indent)s%(message)s")) -handler.setLevel(logging.INFO) -logger.addHandler(handler) -LOG_DIR = 
"logs" -if not exists(LOG_DIR): - os.makedirs(LOG_DIR) -log_name = datetime.now().strftime("prepare_ex4_%Y%m%d_%H%M%S") + ".log" -handler = logging.FileHandler(join(LOG_DIR, log_name)) -handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) -handler.setLevel(logging.DEBUG) -logger.addHandler(handler) - - -with open(INDEX_PATH) as index: - boa = BOA(username, password, BOA_RESULTS_PATH) - for row in csv.reader(index, delimiter="\t"): - # skip blank lines, e.g., on trailing newline - if not row: - continue - - project_id = row[0] - version_id = row[1] - target_types = sorted(row[6:]) - try: - target_example_file = os.path.join(XP_CHECKOUTS_PATH, "-".join(sorted(target_types)) + ".yml") - if not exists(target_example_file): - logger.info("Preparing examples for %s.%s (type(s): %s)...", project_id, version_id, target_types) - _prepare_example_projects(target_types, boa, target_example_file) - elif is_empty(target_example_file): - logger.info("No example projects for %s.%s (type(s): %s)", project_id, version_id, target_types) - else: - logger.info("Already prepared examples for %s.%s (type(s): %s)", project_id, version_id, target_types) - except Exception as error: - logger.exception("failed", exc_info=error) diff --git a/mubench.pipeline/tasks/configurations/configurations.py b/mubench.pipeline/tasks/configurations/configurations.py index 185ddcc92..61d3fc9fe 100644 --- a/mubench.pipeline/tasks/configurations/configurations.py +++ b/mubench.pipeline/tasks/configurations/configurations.py @@ -1,5 +1,6 @@ from typing import List +from tasks.implementations.crossproject_create_index import CrossProjectCreateIndexTask from tasks.implementations import stats from tasks.implementations.checkout import CheckoutTask from tasks.implementations.collect_misuses import CollectMisusesTask @@ -7,6 +8,9 @@ from tasks.implementations.collect_versions import CollectVersionsTask from tasks.implementations.compile_misuse import CompileMisuseTask 
from tasks.implementations.compile_version import CompileVersionTask +from tasks.implementations.crossproject_create_project_list import CrossProjectCreateProjectListTask +from tasks.implementations.crossproject_prepare import CrossProjectPrepareTask +from tasks.implementations.crossproject_read_index import CrossProjectReadIndexTask, CrossProjectSkipReadIndexTask from tasks.implementations.dataset_check_misuse import MisuseCheckTask from tasks.implementations.dataset_check_project import ProjectCheckTask from tasks.implementations.dataset_check_version import VersionCheckTask @@ -18,6 +22,7 @@ from tasks.implementations.load_detector import LoadDetectorTask from tasks.implementations.publish_findings import PublishFindingsTask from tasks.implementations.publish_metadata import PublishMetadataTask +from tasks.task_runner import TaskRunner from utils.dataset_util import get_available_datasets @@ -103,9 +108,10 @@ def tasks(self, config) -> List: config.java_options) detect = DetectProvidedCorrectUsagesTask(config.findings_path, config.force_detect, config.timeout, config.run_timestamp) - return [load_detector] + CheckoutTaskConfiguration().tasks(config) + [compile_version, collect_misuses, - filter_misuses_without_correct_usages, - compile_misuse, detect] + + # noinspection PyTypeChecker + return [load_detector] + CheckoutTaskConfiguration().tasks(config) + \ + [compile_version, collect_misuses, filter_misuses_without_correct_usages, compile_misuse] + [detect] class PublishProvidedPatternsExperiment(TaskConfiguration): @@ -132,8 +138,18 @@ def tasks(self, config) -> List: config.force_compile, config.use_tmp_wrkdir) load_detector = LoadDetectorTask(config.detectors_path, config.detector, config.requested_release, config.java_options) + + create_index = TaskRunner([CollectMisusesTask(), CrossProjectCreateIndexTask(config.xp_index_file)]) + read_index = CrossProjectReadIndexTask(config.xp_index_file) if config.with_xp \ + else CrossProjectSkipReadIndexTask() + 
prepare_cross_project = [create_index, read_index, + CrossProjectPrepareTask(config.root_path, config.xp_checkouts_path, + config.run_timestamp, + config.max_project_sample_size, config.boa_user, + config.boa_password)] + detect = DetectAllFindingsTask(config.findings_path, config.force_detect, config.timeout, config.run_timestamp) - return [load_detector] + CheckoutTaskConfiguration().tasks(config) + [compile_version, detect] + return [load_detector] + CheckoutTaskConfiguration().tasks(config) + prepare_cross_project + [compile_version, detect] class PublishAllFindingsExperiment(TaskConfiguration): @@ -161,7 +177,36 @@ def tasks(self, config) -> List: load_detector = LoadDetectorTask(config.detectors_path, config.detector, config.requested_release, config.java_options) detect = DetectAllFindingsTask(config.findings_path, config.force_detect, config.timeout, config.run_timestamp) - return [load_detector] + CheckoutTaskConfiguration().tasks(config) + [compile_version, detect] + + create_index = TaskRunner([CollectMisusesTask(), CrossProjectCreateIndexTask(config.xp_index_file)]) + read_index = CrossProjectReadIndexTask(config.xp_index_file) if config.with_xp \ + else CrossProjectSkipReadIndexTask() + prepare_cross_project = [create_index, read_index, + CrossProjectPrepareTask(config.root_path, config.xp_checkouts_path, + config.run_timestamp, + config.max_project_sample_size, config.boa_user, + config.boa_password)] + + # noinspection PyTypeChecker + return [load_detector] + CheckoutTaskConfiguration().tasks(config) + [compile_version] + \ + prepare_cross_project + [detect] + + +class RunCrossProjectPrepare(TaskConfiguration): + @staticmethod + def mode() -> str: + return "checkout-xp" + + def tasks(self, config) -> List: + create_index_tasks = [CollectProjectsTask(config.data_path), CollectVersionsTask(config.development_mode), + CollectMisusesTask(), CrossProjectCreateIndexTask(config.xp_index_file)] + create_index = TaskRunner(create_index_tasks) + # noinspection 
class CrossProjectCreateIndexTask:
    """Write one tab-separated index row per misuse into a per-version index file.

    Rows are written to ``<index_file>-<version.id>``. The columns are: project
    id, version id, misuse id, ':'-joined source dirs of the version, misuse
    file, misuse method, followed by one column per API of the misuse.
    """

    def __init__(self, index_file: str):
        """
        :param index_file: base path of the index; a file at exactly this path
            is removed, the per-version files are derived from it in ``run``
        """
        self.index_file = index_file
        # Ids of the versions whose index file was already restarted by this
        # task instance; later rows for these versions are appended.
        self._started_version_ids = set()

        if os.path.exists(index_file):
            os.remove(index_file)

    def run(self, project: Project, version: ProjectVersion, misuse: Misuse):
        """Append the index row of ``misuse`` to the index file of ``version``."""
        row = "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(project.id, version.version_id, misuse.misuse_id,
                                                  ':'.join(version.source_dirs),
                                                  misuse.location.file, misuse.location.method,
                                                  "\t".join(misuse.apis))

        # The first row written for a version truncates the file, so rows left
        # over from a previous run are not duplicated; further rows append.
        mode = "a" if version.id in self._started_version_ids else "w"

        # Close the file deterministically instead of leaking one handle per
        # misuse (assumes safe_open returns a regular file object, as open does).
        with safe_open(self.index_file + '-' + version.id, mode) as index:
            print(row, file=index)

        self._started_version_ids.add(version.id)
class CrossProjectCreateProjectListTask:
    """Summarize the prepared cross-project example projects per target API type.

    For every target type found in the cross-project index, the example
    projects listed in ``<base_checkout_path>/<target_type>.yml`` are mapped
    from their repository URL to the commit hash that is currently checked
    out. The result is written to
    ``<base_checkout_path>/example_projects_by_API.yml``.
    """

    def __init__(self, root_path: str, index_file: str, base_checkout_path: str):
        """
        :param root_path: directory the ``path`` entries of the example projects are relative to
        :param index_file: base path of the cross-project index
        :param base_checkout_path: directory holding the example-project metadata files
        """
        self.root_path = root_path
        self.index_file = index_file
        self.base_checkout_path = base_checkout_path

    def run(self):
        """Read the index and write ``example_projects_by_API.yml``.

        :raises FileNotFoundError: if no index file exists at all
        """
        logger = logging.getLogger("tasks.cross_project_create_project_list")
        example_projects_by_API = {}
        for index_path in self._get_index_paths():
            with open(index_path) as index:
                for row in csv.reader(index, delimiter="\t"):
                    # skip blank lines, e.g., on trailing newline
                    if not row:
                        continue

                    try:
                        # Inside the try block, so that a malformed (too short)
                        # row is logged and skipped instead of aborting the task.
                        # NOTE(review): CrossProjectPrepareTask names its metadata
                        # files after all '-'-joined sorted API types of a misuse,
                        # while only the first API column is looked up here;
                        # confirm this is intended for multi-API misuses.
                        target_type = row[6]
                        if target_type in example_projects_by_API:
                            continue

                        logger.info("Preparing examples for type: %s...", target_type)
                        target_example_file = os.path.join(self.base_checkout_path, target_type + ".yml")
                        example_projects = {}
                        with open_yamls_if_exists(target_example_file) as projects:
                            for project in projects:
                                project_path = join(self.root_path, project["path"])
                                commit_hash = Shell.exec("cd \"{}\"; git rev-parse HEAD".format(project_path))
                                example_projects[project["url"]] = commit_hash.strip()
                        example_projects_by_API[target_type] = example_projects
                    except Exception as error:
                        logger.exception("failed", exc_info=error)

        write_yaml(example_projects_by_API, join(self.base_checkout_path, "example_projects_by_API.yml"))

    def _get_index_paths(self):
        """Return the index files to read.

        CrossProjectCreateIndexTask writes one file per version, named
        ``<index_file>-<version.id>``, so those files are collected in addition
        to a file at ``index_file`` itself. If nothing is found, ``index_file``
        is returned nevertheless, such that opening it reports the missing index.
        """
        import glob  # function-scope import: glob is not among the module-level imports

        # Only matches files next to index_file, i.e., assumes that version ids
        # contain no path separator.
        index_paths = sorted(glob.glob(glob.escape(self.index_file) + "-*"))
        if os.path.exists(self.index_file) or not index_paths:
            index_paths.insert(0, self.index_file)
        return index_paths
class CrossProjectPrepareTask:
    """Check out example projects that use the API types involved in indexed misuses.

    For every misuse, the projects using its API types (or their known
    subtypes) are queried via BOA and cloned below
    ``<checkouts_base_path>/checkouts``. The metadata of the example projects
    is stored in ``<checkouts_base_path>/<'-'-joined sorted types>.yml``, which
    is where CrossProjectCreateProjectListTask looks for it. ``run`` returns
    the source paths of all example projects prepared for the given misuses.
    """

    def __init__(self, root_path: str, checkouts_base_path: str, timestamp: int,
                 max_project_sample_size: int, boa_user: str, boa_password: str):
        """
        :param root_path: directory the stored checkout paths are made relative to
        :param checkouts_base_path: directory for metadata files, clones and BOA results
        :param timestamp: timestamp stored with every example project
        :param max_project_sample_size: maximum number of example projects per set of target types
        :param boa_user: BOA user name
        :param boa_password: BOA password
        """
        self.root_path = root_path
        self.checkouts_base_path = checkouts_base_path
        self.project_checkouts_path = join(checkouts_base_path, "checkouts")
        self.boa_results_path = join(checkouts_base_path, "boa-results")
        self.subtypes_path = os.path.join(checkouts_base_path, "subtypes.csv")
        self.timestamp = timestamp
        self.max_project_sample_size = max_project_sample_size
        self.boa_user = boa_user
        self.boa_password = boa_password

        # Lazily filled cache of subtypes.csv: type name -> names of its subtypes.
        self._subtypes = {}

    def run(self, apis: "CrossProjectMisuseApis"):
        """Prepare the example projects for ``apis`` and return their source paths.

        Failures for individual misuses are logged and do not abort the task.
        """
        logger = logging.getLogger("tasks.cross_project_prepare")
        sources_paths = []

        misuse_apis = apis.get()
        if not misuse_apis:
            # Nothing to prepare, hence, there is no need for a BOA client.
            return CrossProjectSourcesPaths(sources_paths)

        boa = BOA(self.boa_user, self.boa_password, self.boa_results_path)
        for api in misuse_apis:
            project_id = api.project_id
            version_id = api.version_id
            target_types = api.target_types
            try:
                # The metadata files live directly in the base path; only the
                # clones go to the "checkouts" subdirectory.
                target_example_file = os.path.join(self.checkouts_base_path,
                                                   "-".join(sorted(target_types)) + ".yml")
                if not exists(target_example_file):
                    logger.info("Preparing examples for %s.%s (type(s): %s)...", project_id, version_id,
                                target_types)
                    sources_paths.extend(self._prepare_example_projects(target_types, boa, target_example_file))
                elif is_empty(target_example_file):
                    logger.info("No example projects for %s.%s (type(s): %s)", project_id, version_id, target_types)
                else:
                    logger.info("Already prepared examples for %s.%s (type(s): %s)", project_id, version_id,
                                target_types)
                    # Reuse the recorded source paths, otherwise a re-run would
                    # silently provide no example sources at all.
                    sources_paths.extend(self._read_prepared_sources_paths(target_example_file))
            except Exception as error:
                logger.exception("failed", exc_info=error)

        # Several misuses may share example projects; report every path once,
        # in order of first occurrence.
        unique_sources_paths = []
        known_sources_paths = set()
        for sources_path in sources_paths:
            if sources_path not in known_sources_paths:
                known_sources_paths.add(sources_path)
                unique_sources_paths.append(sources_path)

        return CrossProjectSourcesPaths(unique_sources_paths)

    def _prepare_example_projects(self, target_types: List, boa: "BOA", metadata_path: str) -> List[str]:
        """Clone the example projects for ``target_types`` and write their metadata.

        Stops after ``max_project_sample_size`` example projects.

        :return: the source paths of the example projects written to ``metadata_path``
        """
        logger = logging.getLogger("tasks.cross_project_prepare")
        data = []
        sources_paths = []
        for type_combination in self._create_type_combinations(target_types):
            projects = boa.query_projects_with_type_usages(target_types, type_combination)
            for project in projects:
                checkout = project.get_checkout(self.project_checkouts_path)
                if not checkout.exists():
                    try:
                        logger.info(" Checking out %r...", str(project))
                        checkout.clone()
                    except CommandFailedError as error:
                        logger.warning(" Checkout failed: %r", error)
                        checkout.delete()
                        continue
                else:
                    logger.info(" Already checked out %r.", str(project))

                try:
                    project_sources_paths = Project(checkout.path).get_sources_paths()
                    project_entry = {"id": project.id, "url": project.repository_url,
                                     "path": os.path.relpath(checkout.path, self.root_path),
                                     "source_paths": project_sources_paths,
                                     "checkout_timestamp": self.timestamp}
                    write_yaml(project_entry)  # check for encoding problems
                    data.append(project_entry)
                    sources_paths.extend(project_sources_paths)
                except UnicodeEncodeError:
                    logger.warning(" Illegal characters in project data.")

                if len(data) >= self.max_project_sample_size:
                    logger.warning(" Stopping after %r of %r example projects.", self.max_project_sample_size,
                                   len(projects))
                    write_yamls(data, metadata_path)
                    return sources_paths

        write_yamls(data, metadata_path)
        return sources_paths

    @staticmethod
    def _read_prepared_sources_paths(metadata_path: str) -> List[str]:
        """Return the source paths recorded in a metadata file written by this task."""
        # Function-scope import from the module this file already takes its
        # YAML helpers from.
        from utils.io import open_yamls_if_exists

        sources_paths = []
        with open_yamls_if_exists(metadata_path) as projects:
            for project in projects:
                sources_paths.extend(project.get("source_paths", []))
        return sources_paths

    def _get_subtypes(self, target_type):
        """Return the known subtypes of ``target_type``, without Sun-specific types."""
        if not self._subtypes and exists(self.subtypes_path):
            with open(self.subtypes_path, newline="") as subtypes_file:
                for subtypes_row in csv.reader(subtypes_file, delimiter="\t"):
                    # skip blank lines, e.g., on trailing newline
                    if subtypes_row:
                        self._subtypes[subtypes_row[0]] = subtypes_row[1:]

        all_subtypes = self._subtypes.get(target_type, [])
        subtypes_sample = [subtype for subtype in all_subtypes if "sun." not in subtype]  # filter Sun-specific types
        return subtypes_sample

    def _get_type_and_subtypes_list(self, target_type):
        """Return ``target_type`` followed by its (filtered) subtypes."""
        return [target_type] + self._get_subtypes(target_type)

    def _create_type_combinations(self, target_types: List):
        """Lazily yield every combination that replaces each target type by itself or one of its subtypes.

        The first type varies slowest. ``target_types`` must not be empty.
        """
        if len(target_types) == 1:
            return ([type_] for type_ in self._get_type_and_subtypes_list(target_types[0]))
        else:
            return ([target_type] + tail
                    for target_type in self._get_type_and_subtypes_list(target_types[0])
                    for tail in self._create_type_combinations(target_types[1:]))
class CrossProjectMisuseApi:
    """One row of the cross-project index: a misuse and the API types it involves."""

    def __init__(self, row):
        """
        :param row: the columns of an index row; columns 0-2 hold the project,
            version and misuse id, the API types start at column 6
        """
        self.project_id = row[0]
        self.version_id = row[1]
        self.misuse_id = row[2]
        # Sorted to get an order that does not depend on the order in the index.
        self.target_types = sorted(row[6:])


class CrossProjectMisuseApis:
    """Wrapper around a list of index rows, to pass it between tasks as one value."""

    def __init__(self, apis: List[CrossProjectMisuseApi]):
        self.__apis = apis

    def get(self):
        """Return a copy of the wrapped list."""
        return list(self.__apis)


class CrossProjectReadIndexTask:
    """Read the cross-project index rows of a project version."""

    def __init__(self, index_file: str):
        """
        :param index_file: base path of the index; the rows of a version are
            read from ``<index_file>-<version.id>``
        """
        self.index = index_file

    def run(self, version: "ProjectVersion"):
        """Return the index rows of ``version``, which are none if it has no index file."""
        try:
            # newline="" leaves the line handling to the csv module.
            index_file = open(self.index + '-' + version.id, newline="")
        except FileNotFoundError:
            # An index file is only written when a misuse of the version is
            # indexed; a version without indexed misuses has no rows.
            return CrossProjectMisuseApis([])

        apis = []
        with index_file:
            for row in csv.reader(index_file, delimiter="\t"):
                # skip blank lines, e.g., on trailing newline
                if row:
                    apis.append(CrossProjectMisuseApi(row))

        return CrossProjectMisuseApis(apis)


class CrossProjectSkipReadIndexTask:
    """Stand-in for CrossProjectReadIndexTask that provides no index rows."""

    def run(self):
        return CrossProjectMisuseApis([])
+++ b/mubench.pipeline/tasks/implementations/detect_all_findings.py @@ -1,13 +1,14 @@ import logging from os.path import join -from typing import Optional +from typing import Optional, List from data.detector import Detector from data.detector_run import DetectorRun from data.project_version import ProjectVersion from data.version_compile import VersionCompile from tasks.configurations.detector_interface_configuration import key_detector_mode, \ - key_target_src_paths, key_target_classes_paths, key_dependency_classpath + key_target_src_paths, key_target_classes_paths, key_dependency_classpath, key_training_src_path +from tasks.implementations.crossproject_prepare import CrossProjectSourcesPaths class DetectAllFindingsTask: @@ -20,24 +21,29 @@ def __init__(self, findings_base_path: str, force_detect: bool, timeout: Optiona self.timeout = timeout self.current_timestamp = current_timestamp - def run(self, detector: Detector, version: ProjectVersion, version_compile: VersionCompile): - run = DetectorRun(detector, version, self._get_findings_path(detector, version)) + def run(self, detector: Detector, version: ProjectVersion, version_compile: VersionCompile, + xp_sources_paths: CrossProjectSourcesPaths): + run = self._get_detector_run(detector, version) - run.ensure_executed(self._get_detector_arguments(version_compile), + run.ensure_executed(self._get_detector_arguments(version_compile, xp_sources_paths.get()), self.timeout, self.force_detect, self.current_timestamp, version_compile.timestamp, logging.getLogger("task.detect")) return run + def _get_detector_run(self, detector, version): + return DetectorRun(detector, version, self._get_findings_path(detector, version)) + def _get_findings_path(self, detector: Detector, version: ProjectVersion): return join(self.findings_base_path, DetectAllFindingsTask.__RUN_MODE_NAME, detector.id, version.project_id, version.version_id) @staticmethod - def _get_detector_arguments(version_compile: VersionCompile): - return { - 
key_detector_mode: DetectAllFindingsTask.__DETECTOR_MODE, - key_target_src_paths: version_compile.original_sources_paths, - key_target_classes_paths: version_compile.original_classes_paths, - key_dependency_classpath: version_compile.get_full_classpath() - } + def _get_detector_arguments(version_compile: VersionCompile, xp_sources_paths: List[str]): + detector_args = {key_detector_mode: DetectAllFindingsTask.__DETECTOR_MODE, + key_target_src_paths: version_compile.original_sources_paths, + key_target_classes_paths: version_compile.original_classes_paths, + key_dependency_classpath: version_compile.get_full_classpath()} + if xp_sources_paths: + detector_args[key_training_src_path] = xp_sources_paths + return detector_args diff --git a/mubench.pipeline/tasks/implementations/detect_provided_correct_usages.py b/mubench.pipeline/tasks/implementations/detect_provided_correct_usages.py index ee72a4b79..8bb87b0dc 100644 --- a/mubench.pipeline/tasks/implementations/detect_provided_correct_usages.py +++ b/mubench.pipeline/tasks/implementations/detect_provided_correct_usages.py @@ -1,6 +1,6 @@ import logging from os.path import join -from typing import Optional +from typing import Optional, List from data.detector import Detector from data.detector_run import DetectorRun @@ -9,7 +9,9 @@ from data.project_version import ProjectVersion from data.version_compile import VersionCompile from tasks.configurations.detector_interface_configuration import key_detector_mode, \ - key_training_src_path, key_training_classes_path, key_target_src_paths, key_target_classes_paths, key_dependency_classpath + key_training_src_path, key_training_classes_path, key_target_src_paths, key_target_classes_paths, \ + key_dependency_classpath +from tasks.implementations.crossproject_prepare import CrossProjectSourcesPaths class DetectProvidedCorrectUsagesTask: @@ -24,14 +26,19 @@ def __init__(self, findings_base_path: str, force_detect: bool, timeout: Optiona def run(self, detector: Detector, version: 
ProjectVersion, version_compile: VersionCompile, misuse: Misuse, misuse_compile: MisuseCompile): - run = DetectorRun(detector, version, self._get_findings_path(detector, version, misuse)) + run = self._get_detector_run(detector, misuse, version) - run.ensure_executed(self._get_detector_arguments(version_compile, misuse_compile), + detector_arguments = self._get_detector_arguments(version_compile, misuse_compile) + + run.ensure_executed(detector_arguments, self.timeout, self.force_detect, self.current_timestamp, misuse_compile.timestamp, logging.getLogger("task.detect")) return run + def _get_detector_run(self, detector, misuse, version): + return DetectorRun(detector, version, self._get_findings_path(detector, version, misuse)) + def _get_findings_path(self, detector: Detector, version: ProjectVersion, misuse: Misuse): return join(self.findings_base_path, DetectProvidedCorrectUsagesTask.__RUN_MODE_NAME, detector.id, version.project_id, version.version_id, misuse.misuse_id) diff --git a/mubench.pipeline/tasks/task_runner.py b/mubench.pipeline/tasks/task_runner.py index e772a0b88..7ca292eb0 100644 --- a/mubench.pipeline/tasks/task_runner.py +++ b/mubench.pipeline/tasks/task_runner.py @@ -13,15 +13,18 @@ class TaskRunner: def __init__(self, tasks: List): self.tasks = tasks self.logger = logging.getLogger("task_runner") + self.__accumulated_result = None def run(self, *initial_parameters: Tuple[Any]): if not self.tasks: return + self.__accumulated_result = None self.__run(0, list(initial_parameters)) for task in self.tasks: if callable(getattr(task, 'end', None)): task.end() + return self.__accumulated_result def __run(self, current_task_index: int, previous_results: List): task = self.tasks[current_task_index] @@ -39,6 +42,14 @@ def __run(self, current_task_index: int, previous_results: List): logger.debug("Full exception:", exc_info=True) return + is_leaf_task = current_task_index == len(self.tasks) - 1 + is_accumulable_result = hasattr(results, '__add__') + if 
@patch("tasks.implementations.detect_all_findings.DetectAllFindingsTask._get_detector_run")
class TestDetectAllFindingsTask:
    """Tests for the detector arguments that DetectAllFindingsTask passes on."""

    def setup(self):
        version = create_version("-version-", meta={})
        self.detector = StubDetector()
        self.version = version
        self.version_compile = version.get_compile("-compile-")

    def test_adds_xp_training_sources(self, get_detector_run_mock):
        # The patched factory hands out a mock run that records the arguments
        # it is executed with.
        run_mock = MagicMock()
        get_detector_run_mock.return_value = run_mock
        uut = DetectAllFindingsTask("-findings-", False, None, -1)

        uut.run(self.detector, self.version, self.version_compile,
                CrossProjectSourcesPaths(["xp_sources1", "xp_sources2"]))

        assert_equals(1, run_mock.ensure_executed.call_count)
        # The detector arguments are the first positional argument of ensure_executed.
        positional_args = run_mock.ensure_executed.call_args[0]
        detector_args = positional_args[0]
        assert_equals(["xp_sources1", "xp_sources2"], detector_args[key_training_src_path])
a/mubench.pipeline/tests/tasks/test_task_runner.py b/mubench.pipeline/tests/tasks/test_task_runner.py index 2ef2dd04b..17bc97564 100644 --- a/mubench.pipeline/tests/tasks/test_task_runner.py +++ b/mubench.pipeline/tests/tasks/test_task_runner.py @@ -64,13 +64,13 @@ def test_runs_subsequent_task_with_results_of_previous_tasks_in_any_order(self): third_task.assert_called_once_with(42, ":some string:") def test_runs_subsequent_task_with_generic_result_of_previous_task(self): - first_task = VoidTask([[1,2]]) + first_task = VoidTask([[1, 2]]) second_task = ListConsumingTask() uut = TaskRunner([first_task, second_task]) uut.run() - second_task.assert_called_once_with([1,2]) + second_task.assert_called_once_with([1, 2]) def test_reports_if_a_task_requires_an_unavailable_parameter(self): first_task = VoidTask([42]) @@ -186,10 +186,28 @@ def test_handles_empty_tasks(self): uut = TaskRunner([]) uut.run() + def test_does_not_attempt_to_accumulate_non_accumulable_results(self): + branch_three_times = VoidTask(['-some string-', '-some string-', '-some string-']) + return_string = VoidTask(object()) + uut = TaskRunner([branch_three_times, return_string]) + + result = uut.run() + + assert_equals(None, result) + + def test_returns_accumulated_results_of_last_task(self): + branch_three_times = VoidTask(['-some string-', '-some string-', '-some string-']) + return_string = VoidTask(42) + uut = TaskRunner([branch_three_times, return_string]) + + result = uut.run() + + assert_equals(126, result) + class MockTask: - def __init__(self, results: List = None): - self.results = results or [] + def __init__(self, results: Any = None): + self.results = results self.calls = [] def assert_called_once_with(self, *args): @@ -240,7 +258,7 @@ def run(self, i: int, j: int): class FailingTask(MockTask): - def __init__(self, message: str = "", results = None): + def __init__(self, message: str = "", results=None): super().__init__(results) self.message = message @@ -250,7 +268,7 @@ def run(self): 
class FailingStringConsumingTask(MockTask): - def __init__(self, message: str = "", results = None): + def __init__(self, message: str = "", results=None): super().__init__(results) self.message = message diff --git a/mubench.pipeline/tests/utils/test_config_util.py b/mubench.pipeline/tests/utils/test_config_util.py index 89a0b8a29..5b3784e76 100644 --- a/mubench.pipeline/tests/utils/test_config_util.py +++ b/mubench.pipeline/tests/utils/test_config_util.py @@ -1,4 +1,5 @@ import sys +from unittest.mock import patch from nose.tools import assert_raises, assert_equals, nottest @@ -179,3 +180,19 @@ def test_allow_zero_limit(): def test_fails_on_negative_limit(): parser = _get_command_line_parser(['DemoDetector'], [], []) assert_raises(SystemExit, parser.parse_args, ['publish', 'ex2', 'DemoDetector', '-s', 'site', '--limit', '-1']) + + +def test_run_with_xp(): + parser = _get_command_line_parser(['DemoDetector'], [], []) + result = parser.parse_args(['run', 'ex2', 'DemoDetector', '--with-xp', '-bp', 'aaa', '-bu', 'bbb']) + assert_equals(True, result.with_xp) + assert_equals('aaa', result.boa_password) + assert_equals('bbb', result.boa_user) + + +@patch("utils.config_util.sys") +def test_requires_boa_credentials_on_with_xp(sys_mock): + args = ['run', 'ex1', 'DemoDetector', '--with-xp'] + sys_mock.argv = args + parser = _get_command_line_parser(['DemoDetector'], [], []) + assert_raises(SystemExit, parser.parse_args, args) diff --git a/mubench.pipeline/utils/config_util.py b/mubench.pipeline/utils/config_util.py index c0227c326..5ffb2b9e3 100644 --- a/mubench.pipeline/utils/config_util.py +++ b/mubench.pipeline/utils/config_util.py @@ -5,6 +5,8 @@ from os.path import join, abspath, dirname from typing import List, Any +import sys + from data.detector import get_available_detector_ids, Detector from tasks.implementations import stats from utils.dataset_util import get_available_dataset_ids @@ -17,6 +19,8 @@ __FINDINGS_PATH = join(MUBENCH_ROOT_PATH, "findings") 
__DATASETS_FILE_PATH = join(MUBENCH_ROOT_PATH, 'data', 'datasets.yml') __DETECTORS_PATH = join(MUBENCH_ROOT_PATH, "detectors") +__XP_CHECKOUTS_PATH = join(MUBENCH_ROOT_PATH, "checkouts-xp") +__XP_INDEX_FILE = join(__XP_CHECKOUTS_PATH, "index") class SortingHelpFormatter(HelpFormatter): @@ -68,6 +72,8 @@ def _get_command_line_parser(available_detectors: List[str], available_scripts: subparsers.required = True + parser.add_argument('--root-path', dest='root_path', default=__get_default('root-path', MUBENCH_ROOT_PATH), + help=argparse.SUPPRESS) parser.add_argument('--use-tmp-wrkdir', dest='use_tmp_wrkdir', default=__get_default('use-tmp-wrkdir', False), help=argparse.SUPPRESS, action='store_true') parser.add_argument('--data-path', dest='data_path', default=__get_default('data-path', __DATA_PATH), @@ -84,6 +90,12 @@ def _get_command_line_parser(available_detectors: List[str], available_scripts: default=__get_default('detectors-path', __DETECTORS_PATH), help=argparse.SUPPRESS) parser.add_argument('--development-mode', dest='development_mode', default=__get_default('development-mode', False), help=argparse.SUPPRESS, action='store_true') + parser.add_argument('--xp-checkouts-path', dest='xp_checkouts_path', + default=__get_default('xp-checkouts-path', __XP_CHECKOUTS_PATH), help=argparse.SUPPRESS) + parser.add_argument('--xp-index-file', dest='xp_index_file', + default=__get_default('xp-index-file', __XP_INDEX_FILE), help=argparse.SUPPRESS) + parser.add_argument('--max-project-sample-size', dest='max_project_sample_size', + default=__get_default('max-project-sample-size', 50), help=argparse.SUPPRESS) __add_check_subprocess(available_datasets, subparsers) __add_info_subprocess(available_datasets, subparsers) @@ -92,6 +104,7 @@ def _get_command_line_parser(available_detectors: List[str], available_scripts: __add_run_subprocess(available_detectors, available_datasets, subparsers) __add_publish_subprocess(available_detectors, available_datasets, subparsers) 
__add_stats_subprocess(available_scripts, available_datasets, subparsers) + __add_checkout_cross_project_subprocess(available_datasets, subparsers) # Add subprocesses provided by the ./mubench script __add_browse_subprocess(subparsers) @@ -226,6 +239,7 @@ def __add_run_ex2_subprocess(available_detectors: List[str], available_datasets: __setup_compile_arguments(experiment_parser) __setup_run_arguments(experiment_parser, available_detectors) __setup_publish_precision_arguments(experiment_parser) + __setup_cross_project_arguments(experiment_parser) def __add_run_ex3_subprocess(available_detectors: List[str], available_datasets: List[str], subparsers) -> None: @@ -239,6 +253,22 @@ def __add_run_ex3_subprocess(available_detectors: List[str], available_datasets: __setup_checkout_arguments(experiment_parser) __setup_compile_arguments(experiment_parser) __setup_run_arguments(experiment_parser, available_detectors) + __setup_cross_project_arguments(experiment_parser) + + +def __add_checkout_cross_project_subprocess(available_datasets: List[str], subparsers) -> None: + parser = subparsers.add_parser("checkout-xp", formatter_class=SortingHelpFormatter, + help="TODO", + description="TODO") + + __setup_filter_arguments(parser, available_datasets) + + boa_user = __get_default('boa-user', None) + boa_password = __get_default('boa-password', None) + parser.add_argument("-bu", "--boa-user", metavar="BOAUSER", required=not boa_user, + default=boa_user, help="Your boa username.") + parser.add_argument("-bp", "--boa-password", metavar="BOAPASSWORD", required=not boa_password, + default=boa_password, help="Your boa password.") def __add_publish_subprocess(available_detectors: List[str], available_datasets: List[str], subparsers) -> None: @@ -306,6 +336,7 @@ def __add_publish_ex2_subprocess(available_detectors: List[str], available_datas __setup_run_arguments(experiment_parser, available_detectors) __setup_publish_arguments(experiment_parser) 
__setup_publish_precision_arguments(experiment_parser) + __setup_cross_project_arguments(experiment_parser) def __add_publish_ex3_subprocess(available_detectors: List[str], available_datasets: List[str], @@ -325,6 +356,7 @@ def __add_publish_ex3_subprocess(available_detectors: List[str], available_datas __setup_compile_arguments(experiment_parser) __setup_run_arguments(experiment_parser, available_detectors) __setup_publish_arguments(experiment_parser) + __setup_cross_project_arguments(experiment_parser) def __setup_filter_arguments(parser: ArgumentParser, available_datasets: List[str]) -> None: @@ -398,5 +430,19 @@ def upload_limit(x): "Use `--limit 0` to publish only run stats.".format(default_limit)) +def __setup_cross_project_arguments(parser: ArgumentParser) -> None: + parser.add_argument('--with-xp', dest='with_xp', action='store_true', default=__get_default('with-xp', False), + help="use sampled projects with usages for learning.") + + boa_user = __get_default('boa-user', None) + boa_password = __get_default('boa-password', None) + parser.add_argument("-bu", "--boa-user", metavar="BOAUSER", + required='--with-xp' in sys.argv and not boa_user, + default=boa_user, help="Your boa username.") + parser.add_argument("-bp", "--boa-password", metavar="BOAPASSWORD", + required='--with-xp' in sys.argv and not boa_password, + default=boa_password, help="Your boa password.") + + def __add_browse_subprocess(subparsers) -> None: subparsers.add_parser('browse', help="Open a Linux shell in a container mounting the MUBench Docker Volumes.")