Skip to content

Commit

Permalink
various fixes for the slurm implementation
Browse files Browse the repository at this point in the history
  - patch: Added getter for JobState enum.
  - patch: Fix SLURM API.
    * Reworked queryExtendedJobStateById to always execute both scontrol and sacct and combine the output
    * Some refactoring and added comments
    * Fixed time parsing for jobs longer than 24h
    * Added "--propagate=none" to getAdditionalCommandParameters
    * Added tests
  • Loading branch information
Gordi committed Jul 3, 2023
1 parent 7ba788e commit 87f1b5b
Show file tree
Hide file tree
Showing 9 changed files with 2,457 additions and 119 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ To start the integration tests, please fill in host and user settings (password

# Change Logs

* 0.1.3
- patch: Added getter for JobState enum.
- patch: Fix SLURM API.
* Reworked queryExtendedJobStateById to always execute both scontrol and sacct and combine the output
* Some refactoring and added comments
* Fixed time parsing for jobs longer than 24h
* Added "--propagate=none" to getAdditionalCommandParameters
* Added tests

* 0.1.2
- patch: Fix SLURM API.
* Always return an asked resource for SLURM with requested memory, cores, and nodes.
Expand Down
7 changes: 7 additions & 0 deletions src/main/groovy/de/dkfz/roddy/execution/jobs/JobState.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ public boolean isDummy() {
return this == DUMMY;
}

/**
 * Tells whether this state is the {@code STARTED} constant.
 *
 * @return {@code true} only when this instance is {@code STARTED}
 */
public boolean isStarted() {
    return STARTED.equals(this);
}

/**
 * Tells whether this state is the {@code RUNNING} constant.
 *
 * @return {@code true} only when this instance is {@code JobState.RUNNING}
 */
public boolean isRunning() {
    return JobState.RUNNING.equals(this);
}
Expand All @@ -89,4 +93,7 @@ public boolean isSuccessful() {
return this == COMPLETED_SUCCESSFUL;
}

/**
 * Tells whether this state counts as a failure.
 *
 * @return {@code true} when this instance is {@code FAILED} or {@code ABORTED}
 */
public boolean isFailed() {
    return this == JobState.FAILED || this == JobState.ABORTED;
}

/**
 * Tells whether this state is one of the "completed" states.
 *
 * @return {@code true} when this instance is {@code COMPLETED_SUCCESSFUL} or
 *         {@code COMPLETED_UNKNOWN}
 */
public boolean isCompleted() {
    return this == JobState.COMPLETED_SUCCESSFUL || this == JobState.COMPLETED_UNKNOWN;
}
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class SlurmSubmissionCommand extends GridEngineBasedSubmissionCommand {

/**
 * Builds the additional parameter string for the Slurm submission command: the value of
 * getParsableParameter() followed by fixed command-line flags.
 *
 * NOTE(review): two return statements are visible here. They look like the removed and the
 * added line of a diff; the second one additionally passes --propagate=none. As written, only
 * the first return would ever execute. Confirm which line the real file contains.
 */
@Override
protected String getAdditionalCommandParameters() {
return "${getParsableParameter()} --kill-on-invalid-dep=yes" as String
return "${getParsableParameter()} --kill-on-invalid-dep=yes --propagate=none" as String
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package de.dkfz.roddy.execution.jobs.cluster.slurm

import de.dkfz.roddy.execution.jobs.BEJobID
import de.dkfz.roddy.execution.jobs.GenericJobInfo
import de.dkfz.roddy.execution.jobs.JobManagerOptions
import de.dkfz.roddy.execution.jobs.JobState
import de.dkfz.roddy.execution.jobs.TestHelper
import de.dkfz.roddy.tools.BufferUnit
import de.dkfz.roddy.tools.BufferValue
import spock.lang.Specification
import spock.lang.Unroll

import java.time.Duration

/**
 * Spock specification for SlurmJobManager.
 *
 * Covers duration parsing, conversion of recorded scontrol / sacct output (read from fixture
 * files) into GenericJobInfo objects, and the merging of two job info objects.
 */
class SlurmJobManagerSpec extends Specification {

    /** Job manager under test, built from the test execution service and default options. */
    SlurmJobManager jobManager = new SlurmJobManager(
            TestHelper.makeExecutionService(),
            JobManagerOptions.create().build())


    /** Resolves a fixture file name against the Slurm test resource directory. */
    static final File getResourceFile(String file) {
        String fixtureDirectory = "src/test/resources/de/dkfz/roddy/execution/jobs/cluster/slurm/"
        return new File(fixtureDirectory, file)
    }

    /** Creates the minimal job info object used by the merge test. */
    private static GenericJobInfo newJobInfo() {
        return new GenericJobInfo("jobName", new File("command"), new BEJobID("1"), [:], [])
    }

    @Unroll
    def "test safelyParseColonSeparatedDuration with input '#value'"() {
        expect: "the colon separated input equals the expected ISO-8601 duration"
        Duration.parse(expected) == SlurmJobManager.safelyParseColonSeparatedDuration(value)

        where:
        value         || expected
        "12-00:00:00" || "PT288H"
        "12-12:30:15" || "PT300H30M15S"
        "12:00:00"    || "PT12H"
        "01:30:15"    || "PT1H30M15S"
    }

    def "test processSControlOutput with scontrol.txt"() {
        when:
        String scontrolText = getResourceFile("scontrol.txt").text
        GenericJobInfo parsed = jobManager.processSControlOutput(scontrolText)

        then:
        with(parsed) {
            // Directories and files
            inputFile == new File("/dev/null")
            logFile == new File("/path/to/outputFile")
            user == "user(192456)"
            submissionHost == "compute038"
            executionHosts == ["compute038"]
            errorLogFile == new File("/path/to/errorFile")
            execHome == "/home/user"

            // Status info
            jobState == JobState.RUNNING
            exitCode == 0
            pendReason == "None"

            // Resources
            runTime == Duration.parse("PT3H21M58S")
        }
        with(parsed.askedResources) {
            mem == new BufferValue(300, BufferUnit.G, BufferUnit.K)
            cores == 76
            nodes == 1
            walltime == Duration.parse("PT20H")
            queue == "compute"
        }
        with(parsed.usedResources) {
            mem == new BufferValue(300, BufferUnit.G, BufferUnit.K)
            cores == 76
            nodes == 1
            walltime == Duration.parse("PT3H21M58S")
            queue == "compute"
        }
        with(parsed) {
            // Timestamps, compared by their textual prefix
            submitTime.toString().startsWith("2023-06-20T07:07:33")
            eligibleTime.toString().startsWith("2023-06-20T07:07:33")
            startTime.toString().startsWith("2023-06-20T07:07:34")
            endTime.toString().startsWith("2023-06-21T03:07:34")
        }
    }

    def "test processSacctOutputFromJson with sacct.json"() {
        when:
        String sacctJson = getResourceFile("sacct.json").text
        GenericJobInfo parsed = jobManager.processSacctOutputFromJson(sacctJson)

        then:
        with(parsed) {
            // Common
            user == "user"
            userGroup == "group"
            jobGroup == "group"
            priority == "0"
            executionHosts == ["compute013"]

            // Status info
            jobState == JobState.COMPLETED_SUCCESSFUL
            exitCode == 0
            pendReason == "Dependency"
        }
        with(parsed.askedResources) {
            mem == new BufferValue(7168, BufferUnit.M, BufferUnit.K)
            cores == 1
            nodes == 1
            walltime == Duration.parse("PT5H")
            queue == "compute"
        }
        with(parsed.usedResources) {
            mem == new BufferValue(7168, BufferUnit.M, BufferUnit.K)
            cores == 1
            nodes == 1
            walltime == Duration.parse("PT3M58S")
            queue == "compute"
        }
        with(parsed) {
            runTime == Duration.parse("PT3M58S")

            // Directories and files
            execHome == "/path/to/file"

            // Timestamps, compared as epoch seconds
            submitTime.toInstant().toEpochMilli() / 1000 == 1687222030
            eligibleTime.toInstant().toEpochMilli() / 1000 == 1687234198
            startTime.toInstant().toEpochMilli() / 1000 == 1687234198
            endTime.toInstant().toEpochMilli() / 1000 == 1687234436
        }
    }

    def "test processSacctOutputFromJson with sacct_requeued.json"() {
        when:
        String sacctJson = getResourceFile("sacct_requeued.json").text
        GenericJobInfo parsed = jobManager.processSacctOutputFromJson(sacctJson)

        then:
        parsed.executionHosts == ["compute015"]
    }

    def "test fillFromSupplement"() {
        given: "a primary info with user and group set"
        GenericJobInfo primary = newJobInfo()
        primary.user = "User"
        primary.userGroup = "User Group"

        and: "a supplement with a different group and an account"
        GenericJobInfo supplement = newJobInfo()
        supplement.userGroup = "Other User Group"
        supplement.account = "Account"

        when:
        GenericJobInfo merged = jobManager.fillFromSupplement(primary, supplement)

        then: "values of the primary are kept, the account comes from the supplement"
        with(merged) {
            user == "User"
            userGroup == "User Group"
            account == "Account"
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ class SlurmSubmissionCommandSpec extends Specification {
SlurmSubmissionCommand cmd = new SlurmSubmissionCommand(jobManager, makeJob([:]), "jobname",
null, [:], null, "/tmp/test.sh")
then:
cmd.toBashCommandString() == 'sbatch --job-name jobname --hold --chdir $HOME --mem=1024M --time=1:00:00 --nodes=1 --cores-per-socket=4 --parsable --kill-on-invalid-dep=yes /tmp/test.sh'
cmd.toBashCommandString() == 'sbatch --job-name jobname --hold --chdir $HOME --mem=1024M --time=1:00:00 --nodes=1 --cores-per-socket=4 --parsable --kill-on-invalid-dep=yes --propagate=none /tmp/test.sh'
}

// Checks the bash command string produced for a job created with "accountingProject":
// the expected sbatch call starts with --account="accountingProject".
// NOTE(review): the two expectations below look like the removed and the added line of a diff
// (the second one additionally contains --propagate=none). They cannot both hold for the same
// command, so confirm which one the real file keeps.
def "command with accounting name"() {
when:
SlurmSubmissionCommand cmd = new SlurmSubmissionCommand(jobManager, makeJob([:], "accountingProject"),
"jobname", null, [:], null, "/tmp/test.sh")
then:
cmd.toBashCommandString() == 'sbatch --account="accountingProject" --job-name jobname --hold --chdir $HOME --mem=1024M --time=1:00:00 --nodes=1 --cores-per-socket=4 --parsable --kill-on-invalid-dep=yes /tmp/test.sh'
cmd.toBashCommandString() == 'sbatch --account="accountingProject" --job-name jobname --hold --chdir $HOME --mem=1024M --time=1:00:00 --nodes=1 --cores-per-socket=4 --parsable --kill-on-invalid-dep=yes --propagate=none /tmp/test.sh'
}

}
Loading

0 comments on commit 87f1b5b

Please sign in to comment.