Skip to content

Commit

Permalink
Merge pull request #790 from NREL/requeue-updates
Browse files Browse the repository at this point in the history
Requeue updates
  • Loading branch information
brianlball authored Aug 23, 2024
2 parents a5df754 + f2d5dd8 commit 0578b0b
Show file tree
Hide file tree
Showing 23 changed files with 193 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/openstudio-server-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ jobs:
if: |
github.ref == 'refs/heads/master' ||
github.ref == 'refs/heads/develop'
# github.ref == 'refs/heads/3.6.1-4'
# github.ref == 'refs/heads/3.6.1-3'
shell: bash
run: ./docker/deployment/scripts/deploy_docker_github_actions.sh
env:
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ services:
image: redis:6.0.9
ports:
- "6379:6379"
command: "redis-server --requirepass ${REDIS_PASSWORD}"
command: "redis-server --requirepass ${REDIS_PASSWORD} --maxclients 40000"
web:
image: nrel/openstudio-server:latest
ports:
Expand Down Expand Up @@ -72,7 +72,7 @@ services:
worker:
image: nrel/openstudio-server:latest
environment:
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=${REDIS_URL}
- MONGO_USER=${MONGO_USER}
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ services:
image: redis:6.0.9
ports:
- "6379:6379"
command: "redis-server --requirepass ${REDIS_PASSWORD}"
command: "redis-server --requirepass ${REDIS_PASSWORD} --maxclients 40000"
web:
image: 127.0.0.1:5000/openstudio-server
build:
Expand Down Expand Up @@ -69,7 +69,7 @@ services:
rails_env: docker
environment:
- OS_SERVER_NUMBER_OF_WORKERS=${OS_SERVER_NUMBER_OF_WORKERS}
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=${REDIS_URL}
- MONGO_USER=${MONGO_USER}
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ services:
image: redis:6.0.9
ports:
- "6379:6379"
command: "redis-server --requirepass ${REDIS_PASSWORD}"
command: "redis-server --requirepass ${REDIS_PASSWORD} --maxclients 40000"
web:
image: nrel/openstudio-server:latest
build:
Expand Down Expand Up @@ -75,7 +75,7 @@ services:
bundle_args: ''
environment:
- OS_SERVER_NUMBER_OF_WORKERS=1
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=${REDIS_URL}
- MONGO_USER=${MONGO_USER}
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ services:
image: redis:6.0.9
ports:
- "6379:6379"
command: "redis-server --requirepass ${REDIS_PASSWORD}"
command: "redis-server --requirepass ${REDIS_PASSWORD} --maxclients 40000"
web:
image: nrel/openstudio-server:latest
build:
Expand Down Expand Up @@ -69,7 +69,7 @@ services:
rails_env: docker
environment:
- OS_SERVER_NUMBER_OF_WORKERS=${OS_SERVER_NUMBER_OF_WORKERS}
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=${REDIS_URL}
- MONGO_USER=${MONGO_USER}
Expand Down
2 changes: 1 addition & 1 deletion docker/server/start-web-background.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ echo "Waiting for Redis to start"

#cd /opt/openstudio/server && bundle exec rake environment resque:work
echo "Startup two resque workers"
cd /opt/openstudio/server && COUNT=2 bundle exec rake environment resque:workers
cd /opt/openstudio/server && COUNT=6 bundle exec rake environment resque:workers
4 changes: 2 additions & 2 deletions local_setup_scripts/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ services:
resources:
reservations:
cpus: "1"
command: "redis-server --requirepass openstudio"
command: "redis-server --requirepass openstudio --maxclients 40000"
web:
image: 127.0.0.1:5000/openstudio-server
ports:
Expand Down Expand Up @@ -84,7 +84,7 @@ services:
worker:
image: 127.0.0.1:5000/openstudio-server
environment:
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=redis://:openstudio@queue:6379
- MONGO_USER=openstudio
Expand Down
4 changes: 2 additions & 2 deletions local_setup_scripts/win64/docker-compose-mock.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ services:
placement:
constraints:
- node.role == manager
command: "redis-server --requirepass openstudio"
command: "redis-server --requirepass openstudio --maxclients 40000"
web:
image: 127.0.0.1:5000/openstudio-server
ports:
Expand Down Expand Up @@ -72,7 +72,7 @@ services:
worker:
image: 127.0.0.1:5000/openstudio-server
environment:
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=redis://:openstudio@queue:6379
- MONGO_USER=openstudio
Expand Down
4 changes: 2 additions & 2 deletions local_setup_scripts/win64/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ services:
placement:
constraints:
- node.role == manager
command: "redis-server --requirepass openstudio"
command: "redis-server --requirepass openstudio --maxclients 40000"
web:
image: 127.0.0.1:5000/openstudio-server
ports:
Expand Down Expand Up @@ -72,7 +72,7 @@ services:
worker:
image: 127.0.0.1:5000/openstudio-server
environment:
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=redis://:openstudio@queue:6379
- MONGO_USER=openstudio
Expand Down
40 changes: 40 additions & 0 deletions server/app/controllers/admin_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,46 @@ def index
@os_cli = version ? version.strip : 'Unknown'
end

def prune_resque_workers
Rails.logger.warn "Pruning Dead Resque Workers"

# Enqueue a new job
worker = Resque::Worker.new()
worker.prune_dead_workers
worker.shutdown

respond_to do |format|
format.html { redirect_to admin_index_path, notice: 'Resque Workers Pruned.' }
format.json { head :no_content }
end
end

# POST /jobs/requeue_failed
def requeue_failed
Rails.logger.warn "Requeueing Failed Jobs to 'requeued' Queue"

requeued_count = 0

Resque::Failure.each do |id, failed_job|
payload = failed_job['payload']
job_class = payload['class']
job_args = payload['args']

# Requeue the job to the 'requeued' queue
Resque.enqueue_to('requeued', job_class.constantize, *job_args)
# Remove the job from the failed queue
Resque::Failure.remove(id)
requeued_count += 1
end

Rails.logger.info "#{requeued_count} jobs successfully requeued to 'requeued' queue by requeue_failed"

respond_to do |format|
format.html { redirect_to admin_index_path, notice: "#{requeued_count} Resque jobs requeued." }
format.json { render json: { message: "#{requeued_count} jobs requeued to 'requeued' queue" }, status: :ok }
end
end

def backup_database
logger.info params
write_and_send_data
Expand Down
22 changes: 22 additions & 0 deletions server/app/controllers/analyses_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ def destroy
# stop analysis button action
def stop
@analysis = Analysis.find(params[:id])
if @analysis.nil?
logger.error "Analysis with ID #{params[:id]} not found."
redirect_to analyses_path, alert: 'Analysis not found.' and return
end
res = @analysis.stop_analysis

respond_to do |format|
Expand All @@ -208,6 +212,24 @@ def stop
end
end
end

# stop analysis button action
def soft_stop
@analysis = Analysis.find(params[:id])
if @analysis.nil?
logger.error "Analysis with ID #{params[:id]} not found."
redirect_to analyses_path, alert: 'Analysis not found.' and return
end
res = @analysis.soft_stop_analysis

respond_to do |format|
if res[0]
format.html { redirect_to @analysis, notice: 'Analysis flag changed to stop. Will NOT wait until the last submitted run finishes before killing.' }
else
format.html { redirect_to @analysis, notice: 'Analysis flag did NOT change.' }
end
end
end

# Controller for submitting the action via post. This right now only works with the API
# and will only return a JSON response based on whether or not the analysis has been
Expand Down
20 changes: 19 additions & 1 deletion server/app/controllers/data_points_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -302,13 +302,15 @@ def requeue
Rails.logger.warn "data_points_controller.REQUEUE"
@data_point = DataPoint.find(params[:id])
analysis_id = @data_point.analysis
Rails.logger.warn "data_points_contoller.REQUEUEing #{@data_point.id}"
Rails.logger.debug "data_points_controller.id: #{@data_point.id}"
Rails.logger.debug "data_points_controller.job_id: #{@data_point.job_id}"
# Destroy the existing job in Resque queue; this is tied to a worker_host:PID:uuid
Resque::Job.destroy(:requeue, 'ResqueJobs::RunSimulateDataPoint', @data_point.job_id)
Resque::Job.destroy(:simulations, 'ResqueJobs::RunSimulateDataPoint', @data_point.job_id)

# Enqueue a new job
Resque.enqueue(ResqueJobs::RunSimulateDataPoint, @data_point.job_id)
Resque.enqueue_to(:requeued, ResqueJobs::RunSimulateDataPoint, @data_point.job_id)

# Attempt to find the worker processing this job
#worker = find_resque_worker_by_job_id(@data_point.job_id)
Expand All @@ -335,6 +337,22 @@ def requeue
end
end

def requeue_started
@analysis = Analysis.find(params[:id])
Rails.logger.warn "Requeueing all NOT completed normal simulations for analysis #{@analysis.id}"
data_points_to_requeue = @analysis.data_points.where.not(status_message: 'completed normal')

data_points_to_requeue.each do |dp|
Rails.logger.warn "Requeueing DataPoint #{dp.id}"
Resque.enqueue_to(:requeued, ResqueJobs::RunSimulateDataPoint, dp.job_id)
end

respond_to do |format|
format.html { redirect_to analysis_path(@analysis), notice: 'Simulations were successfully requeued.' }
format.json { head :no_content }
end
end

def find_resque_worker_by_job_id(job_id)
Rails.logger.debug "data_points_controller.find_resque_worker_by_job_id"
Resque.workers.each do |worker|
Expand Down
31 changes: 29 additions & 2 deletions server/app/controllers/pages_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def dashboard
# data for dashboard header
@projects = Project.all
# sort works because the states are queued, started, completed, na. started is the last in the list...
@analyses = Analysis.all.order_by(:updated_at.asc)
#@analyses = Analysis.all.order_by(:updated_at.asc)
failed_runs = DataPoint.where(status_message: 'datapoint failure').count
total_runs = DataPoint.all.count
completed_cnt = DataPoint.where(status: 'completed').count
Expand All @@ -50,7 +50,34 @@ def dashboard

# Finding the current analysis
#candidates = Analysis.includes(:jobs).order_by(:updated_at.asc)
@current = @analyses.detect { |analysis| analysis.jobs.any? { |job| job.status == 'started' } }
#@current = @analyses.detect { |analysis| analysis.jobs.any? { |job| job.status == 'started' } }

# Step 1: Fetch all analyses ordered by updated_at descending (newest first)
all_analyses = Analysis.includes(:jobs).order_by(updated_at: :desc)

# Step 2: Select the first two 'started' analyses
#started_analyses = all_analyses.select { |analysis| analysis.jobs.any? { |job| job.status == 'started' } }.first(2)
# Step 2: Efficiently select the first two 'started' analyses, leveraging Ruby for fine-tuned sorting
started_analyses = all_analyses.select { |analysis| analysis.jobs.any? { |job| job.status == 'started' }
}.sort_by { |analysis|
# This finds the earliest start time among the started jobs for sorting
analysis.jobs.select { |job| job.status == 'started' }.min_by(&:created_at).created_at
}.first(2)

# Step 3: Set @current and prepare @analyses
if started_analyses.any?
# Assume @current is the most recently updated
@current = started_analyses.first
# Ensure @analyses includes other analyses without changing the original order too much
# Remove @current from all_analyses and prepend the second 'started' analysis if it exists
all_analyses -= started_analyses
all_analyses.prepend(started_analyses.second) if started_analyses.length > 1
else
# Fallback if no 'started' analyses found
@current = all_analyses.first
end

@analyses = all_analyses
# If no 'started' analysis is currently running, optionally set @current to the most recently updated analysis
@current ||= @analyses.first

Expand Down
18 changes: 11 additions & 7 deletions server/app/jobs/resque_jobs/run_simulate_data_point.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,25 @@ def self.perform(data_point_id, options = {})
if !(statuses[:status] == 'completed' && statuses[:status_message] == 'completed normal')
msg = "RUNNING DJ: #{statuses[:status]} and #{statuses[:status_message]}"
d.add_to_rails_log(msg)
puts msg
job = DjJobs::RunSimulateDataPoint.new(data_point_id, options)
job.perform
else
msg = "SKIPPING #{data_point_id} since it is #{statuses[:status]} and #{statuses[:status_message]}"
d.add_to_rails_log(msg)
puts msg
end
rescue Errno::ENOSPC, Resque::DirtyExit, Resque::TermException, Resque::PruneDeadWorkerDirtyExit => e
rescue SignalException, Errno::ENOSPC, Resque::DirtyExit, Resque::TermException, Resque::PruneDeadWorkerDirtyExit => e
# Log the termination and re-enqueue attempt
d.add_to_rails_log("Worker Caught Exception: #{e.inspect}: Re-enqueueing DataPoint ID #{data_point_id}")
Resque.enqueue(self, data_point_id, options)
d.add_to_rails_log("DataPoint #{data_point_id} re-enqueued.")
d.add_to_rails_log("Worker Caught Exception: #{e.inspect}")#: Re-enqueueing DataPoint ID #{data_point_id}")
#Resque.enqueue_to(:requeued, self, data_point_id, options)
#puts "DataPoint #{data_point_id} re-enqueued."
puts "Worker Caught Exception: #{e.inspect}"
rescue => e
d.add_to_rails_log("Worker Caught Unhandled Exception: #{e.message}: Re-enqueueing DataPoint ID #{data_point_id}")
Resque.enqueue(self, data_point_id, options)
d.add_to_rails_log("Unhandled exception, re-enqueued DataPoint.")
d.add_to_rails_log("Worker Caught Unhandled Exception: #{e.message}")#: Re-enqueueing DataPoint ID #{data_point_id}")
#Resque.enqueue_to(:requeued, self, data_point_id, options)
#puts "Unhandled exception, re-enqueued DataPoint."
puts "Worker Caught Unhandled Exception: #{e.message}"
end
end
end
20 changes: 20 additions & 0 deletions server/app/models/analysis.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,26 @@ def stop_analysis
[save!, errors]
end

def soft_stop_analysis
Rails.logger.info('attempting to stop analysis')

self.run_flag = false

jobs.each do |j|
unless j.status == 'completed'
j.status = 'completed'
j.end_time = Time.new
j.status_message = 'datapoint canceled'
j.save!
end
end

# Remove all the queued background jobs for this analysis
data_points.where(status: 'queued').each(&:set_soft_canceled_state)

[save!, errors]
end

# Method that pulls out the variables from the uploaded problem/analysis JSON.
def pull_out_os_variables
pat_json = false
Expand Down
11 changes: 11 additions & 0 deletions server/app/models/data_point.rb
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ def set_canceled_state
self.status_message = 'datapoint canceled'
save!
end

def set_soft_canceled_state
Rails.logger.debug "data_point.set_soft_canceled_state"
#destroy_background_job # destroy queued job
self.run_start_time ||= Time.now
self.run_end_time = Time.now
self.status = :completed
self.status_message = 'datapoint canceled'
save!
end

def set_queued_state
Rails.logger.debug "data_point.set_queued_state"
Expand Down Expand Up @@ -171,6 +181,7 @@ def destroy_background_job
elsif Rails.application.config.job_manager == :resque
if job_id
Resque::Job.destroy(:simulations, 'ResqueJobs::RunSimulateDataPoint', job_id)
Resque::Job.destroy(:requeued, 'ResqueJobs::RunSimulateDataPoint', job_id)
end
else
raise 'Rails.application.config.job_manager must be set to :resque or :delayed_job'
Expand Down
Loading

0 comments on commit 0578b0b

Please sign in to comment.