diff --git a/.gitignore b/.gitignore index 59c74047..c76c4f53 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,7 @@ /tmp /log /public +fixtures/1M.json +fixtures/10M.json +fixtures/*.gz + diff --git a/.rspec b/.rspec new file mode 100644 index 00000000..c99d2e73 --- /dev/null +++ b/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/Gemfile b/Gemfile index 34074dfd..3cc15cb9 100644 --- a/Gemfile +++ b/Gemfile @@ -8,7 +8,12 @@ gem 'pg' gem 'puma' gem 'listen' gem 'bootsnap' +gem 'pry' +gem 'rspec' +gem 'rspec-rails' gem 'rack-mini-profiler' +gem 'ruby-prof' +gem 'strong_migrations' # Windows does not include zoneinfo files, so bundle the tzinfo-data gem gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] diff --git a/Gemfile.lock b/Gemfile.lock index a9ddd818..44ba277d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -78,13 +78,16 @@ GEM bootsnap (1.18.4) msgpack (~> 1.2) builder (3.3.0) + coderay (1.1.3) concurrent-ruby (1.3.5) connection_pool (2.5.0) crass (1.0.6) date (3.4.1) + diff-lcs (1.5.1) drb (2.2.1) erubi (1.13.1) ffi (1.17.1-arm64-darwin) + ffi (1.17.1-x86_64-linux-gnu) globalid (1.2.1) activesupport (>= 6.1) i18n (1.14.7) @@ -107,6 +110,7 @@ GEM net-pop net-smtp marcel (1.0.4) + method_source (1.1.0) mini_mime (1.1.5) minitest (5.25.4) msgpack (1.8.0) @@ -122,10 +126,15 @@ GEM nio4r (2.7.4) nokogiri (1.18.2-arm64-darwin) racc (~> 1.4) + nokogiri (1.18.2-x86_64-linux-gnu) + racc (~> 1.4) pg (1.5.9) pp (0.6.2) prettyprint prettyprint (0.2.0) + pry (0.15.2) + coderay (~> 1.1) + method_source (~> 1.0) psych (5.2.3) date stringio @@ -179,8 +188,32 @@ GEM psych (>= 4.0.0) reline (0.6.0) io-console (~> 0.5) + rspec (3.13.0) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.3) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.3) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.2) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-rails 
(7.1.1) + actionpack (>= 7.0) + activesupport (>= 7.0) + railties (>= 7.0) + rspec-core (~> 3.13) + rspec-expectations (~> 3.13) + rspec-mocks (~> 3.13) + rspec-support (~> 3.13) + rspec-support (3.13.2) + ruby-prof (1.7.1) securerandom (0.4.1) stringio (3.1.2) + strong_migrations (2.2.0) + activerecord (>= 7) thor (1.3.2) timeout (0.4.3) tzinfo (2.0.6) @@ -195,14 +228,20 @@ GEM PLATFORMS arm64-darwin-24 + x86_64-linux DEPENDENCIES bootsnap listen pg + pry puma rack-mini-profiler rails (~> 8.0.1) + rspec + rspec-rails + ruby-prof + strong_migrations tzinfo-data RUBY VERSION diff --git a/app/controllers/trips_controller.rb b/app/controllers/trips_controller.rb index acb38be2..fa03e01a 100644 --- a/app/controllers/trips_controller.rb +++ b/app/controllers/trips_controller.rb @@ -2,6 +2,6 @@ class TripsController < ApplicationController def index @from = City.find_by_name!(params[:from]) @to = City.find_by_name!(params[:to]) - @trips = Trip.where(from: @from, to: @to).order(:start_time) + @trips = Trip.where(from: @from, to: @to).includes(bus: :services).order(:start_time) end end diff --git a/app/services/trips_importer.rb b/app/services/trips_importer.rb new file mode 100644 index 00000000..abf5d339 --- /dev/null +++ b/app/services/trips_importer.rb @@ -0,0 +1,99 @@ +# Imports trips from a JSON file, replacing every existing city, bus, service and trip. +# Reference rows are bulk-inserted first; trips and bus-service pairs follow in a second pass. +class TripsImporter + def initialize(file = 'fixtures/small.json') + @file = file + end + + def self.call(...) 
+ new(...).call + end + + # Parses the file, then wipes the old data and loads the new data in one transaction, + # so a failed import rolls back and the previous data is kept. + def call + json = JSON.parse(File.read(file)) + + ActiveRecord::Base.transaction do + clean_database + + city_names = Set.new + service_names = Set.new + buses = {} + + # first pass: collect the reference data (cities, services, buses) + # into a Set or a hash so that it is easy to insert into the DB + json.each do |trip| + city_names.add trip['from'] + city_names.add trip['to'] + + service_names.merge trip['bus']['services'] + buses[trip['bus']['number']] = trip['bus']['model'] + end + + # insert the reference data + City.insert_all city_names.map { |name| { name: name } } + Service.insert_all service_names.map { |name| { name: name } } + Bus.insert_all buses.map { |number, model| { number: number, model: model }} + + # build lookup hashes so the ids are easy to reach during the second pass + cities = City.all.each_with_object({}) { |city, hash| hash[city.name] = city.id } + services = Service.all.each_with_object({}) { |service, hash| hash[service.name] = service.id } + buses = Bus.all.each_with_object({}) { |bus, hash| hash[bus.number] = bus.id } + + # bus-service pairs are collected here + buses_services = Set.new + + # trip rows to insert are collected here + trips = [] + + json.each do |trip| + from_id = cities[trip['from']] + to_id = cities[trip['to']] + bus_id = buses[trip['bus']['number']] + + # fill in the bus-service pairs + service_ids = services.values_at(*trip['bus']['services']) + service_ids.each do |service_id| + buses_services.add([bus_id, service_id]) + end + + trips.push({ + from_id: from_id, + to_id: to_id, + bus_id: bus_id, + start_time: trip['start_time'], + duration_minutes: trip['duration_minutes'], + price_cents: trip['price_cents'], + }) + end + + # insert the trips + # not batched for now, because it is already fast enough + Trip.insert_all trips + + # raw INSERT INTO buses_services + # a has_many :through join model (rather than has_and_belongs_to_many) is usually preferable, because the join often needs extra 
data, timestamps and its own id later + # with such a model insert_all could be used here + if buses_services.present? + # %d guarantees that only integer ids are interpolated into the SQL + values = buses_services.map { |arr| sprintf("(%d, %d)", arr[0], arr[1]) }.join(", ") + sql = "INSERT INTO buses_services (bus_id, service_id) VALUES #{values};" + ActiveRecord::Base.connection.execute(sql) + end + end + end + + private + + attr_reader :file + + # Deletes all previously imported rows, including the buses_services join rows. + def clean_database + City.delete_all + Bus.delete_all + Service.delete_all + Trip.delete_all + ActiveRecord::Base.connection.execute('delete from buses_services;') + end +end diff --git a/app/views/trips/_delimiter.html.erb b/app/views/trips/_delimiter.html.erb deleted file mode 100644 index 3f845ad0..00000000 --- a/app/views/trips/_delimiter.html.erb +++ /dev/null @@ -1 +0,0 @@ -==================================================== diff --git a/app/views/trips/_service.html.erb b/app/views/trips/_service.html.erb deleted file mode 100644 index 178ea8c0..00000000 --- a/app/views/trips/_service.html.erb +++ /dev/null @@ -1 +0,0 @@ -
  • <%= "#{service.name}" %>
  • diff --git a/app/views/trips/_services.html.erb b/app/views/trips/_services.html.erb index 2de639fc..0be503e1 100644 --- a/app/views/trips/_services.html.erb +++ b/app/views/trips/_services.html.erb @@ -1,6 +1,6 @@
  • Сервисы в автобусе:
  • diff --git a/app/views/trips/index.html.erb b/app/views/trips/index.html.erb index a60bce41..3be71bce 100644 --- a/app/views/trips/index.html.erb +++ b/app/views/trips/index.html.erb @@ -12,5 +12,5 @@ <%= render "services", services: trip.bus.services %> <% end %> - <%= render "delimiter" %> + ==================================================== <% end %> diff --git a/case_study.md b/case_study.md new file mode 100644 index 00000000..bcc0edee --- /dev/null +++ b/case_study.md @@ -0,0 +1,72 @@ +## A. Импорт данных + +Метрика: +- изначально замерила время на small.json - было около 15 секунд +- далее меряла small.json -> medium.json -> large.json + +### Подготовка + +Сначала вынесла код в сервис-обжект (PORO) `TripsImporter` для удобства + написала базовый тест. +Далее стала смотреть, как оптимизировать. + +### Оптимизация - исследование + +Сначала попробовала поэтапно идти: + +- переписала на один insert trips, не трогая другие части; стало ещё медленнее - откатила пока +- добавляла уникальные индексы на справочные данные + использовала upsert, но во-первых не так уж сильно оптимизировало, во-вторых оказалось, что `upsert` не возвращает id, если нет вставки. Тест не отловил, т.к. там одна поездка. Дописывать тест не стала (поленилась), но в реальном приложении нужно. +- попробовала рубипрофом попрофилировать, но такое себе - всё размазано, и так видно, что 100500 запросов идёт +- попробовала проверить, какая часть занимает много времени, комментируя куски кода и запуская, в принципе довольно наглядно. 
Понятно, что insert trips и services долго работают (ожидаемо) +- также смотрела рельсовые логи, видно, что также идёт 100500 мелких запросов +- также померяла, что сама загрузка (parse и обход) json занимает не так много времени, поэтому уж пару раз можно пройтись + +### Оптимизация + +Решила сначала собрать справочные данные по городам, автобусам и тд, потом вставить справочные данные и запросить получившиеся id из бд и сформировать подходящие структуры данных для дальнейшего поиска при подготовке данных для trips. Попутно добавила уникальные индексы на `name` и тд. + +Далее пройтись ещё раз, подготовить данные по поездкам и услугам автобусов, и уже вставить их отдельно. + +Эта оптимизация была эффективной - файл large стал обрабатываться за ~ 3-3,35 секунды. + +``` +anna@vivosaurus:~/apps/rails-optimization-task3$ be rake reload_json[fixtures/large.json] +3.356575947 +``` + +```ruby +task :reload_json, [:file_name] => :environment do |_task, args| + start_time = Time.current + + TripsImporter.call(args.file_name) + + end_time = Time.current + + p end_time - start_time +end +``` + +Решила рискнуть и попробовать на `1M.json`, получилось за 30 секунд. Как так? Все записи на месте в бд. + +## Б. Отображение расписаний + +Изначально время: 8329 мс + +Сразу видим в rack-mini-profiler 1437 sql-запросов, делаем includes: +```ruby +@trips = Trip.where(from: @from, to: @to).includes(bus: :services).order(:start_time) +``` + +Время: 3930 мс + +Дальше видно, что очень много partials загружается: + +- убрала `partial` `service` (незачем в отдельном файле рендерить одну строчку, это не бесплатно) + +Время: 2336 мс + +- убрала аналогично `partial` `delimiter` + +Время: 632 мс + +Решила, что этого хватит. 
+ diff --git a/config/database.yml b/config/database.yml index e116cfa6..5feec182 100644 --- a/config/database.yml +++ b/config/database.yml @@ -17,6 +17,9 @@ default: &default adapter: postgresql encoding: unicode + host: localhost + user: postgres + password: postgres # For details on connection pooling, see Rails configuration guide # http://guides.rubyonrails.org/configuring.html#database-pooling pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %> diff --git a/db/migrate/20250214163423_add_unique_index_to_services.rb b/db/migrate/20250214163423_add_unique_index_to_services.rb new file mode 100644 index 00000000..b4623761 --- /dev/null +++ b/db/migrate/20250214163423_add_unique_index_to_services.rb @@ -0,0 +1,7 @@ +class AddUniqueIndexToServices < ActiveRecord::Migration[8.0] + disable_ddl_transaction! + + def change + add_index :services, :name, unique: true, algorithm: :concurrently + end +end diff --git a/db/migrate/20250214164236_add_unique_index_to_bus_numbers.rb b/db/migrate/20250214164236_add_unique_index_to_bus_numbers.rb new file mode 100644 index 00000000..929a49a7 --- /dev/null +++ b/db/migrate/20250214164236_add_unique_index_to_bus_numbers.rb @@ -0,0 +1,7 @@ +class AddUniqueIndexToBusNumbers < ActiveRecord::Migration[8.0] + disable_ddl_transaction! + + def change + add_index :buses, :number, unique: true, algorithm: :concurrently + end +end diff --git a/db/migrate/20250214175040_add_unique_index_to_cities.rb b/db/migrate/20250214175040_add_unique_index_to_cities.rb new file mode 100644 index 00000000..850e3007 --- /dev/null +++ b/db/migrate/20250214175040_add_unique_index_to_cities.rb @@ -0,0 +1,7 @@ +class AddUniqueIndexToCities < ActiveRecord::Migration[8.0] + disable_ddl_transaction! 
+ + def change + add_index :cities, :name, unique: true, algorithm: :concurrently + end +end diff --git a/db/migrate/20250214185429_add_unique_index_to_buses_services.rb b/db/migrate/20250214185429_add_unique_index_to_buses_services.rb new file mode 100644 index 00000000..f759751a --- /dev/null +++ b/db/migrate/20250214185429_add_unique_index_to_buses_services.rb @@ -0,0 +1,7 @@ +class AddUniqueIndexToBusesServices < ActiveRecord::Migration[8.0] + disable_ddl_transaction! + + def change + add_index :buses_services, [:bus_id, :service_id], unique: true, algorithm: :concurrently + end +end diff --git a/db/schema.rb b/db/schema.rb index f6921e45..fe1390a2 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -2,35 +2,38 @@ # of editing this file, please use the migrations feature of Active Record to # incrementally modify your database, and then regenerate this schema definition. # -# Note that this schema.rb definition is the authoritative source for your -# database schema. If you need to create the application database on another -# system, you should be using db:schema:load, not running all the migrations -# from scratch. The latter is a flawed and unsustainable approach (the more migrations -# you'll amass, the slower it'll run and the greater likelihood for issues). +# This file is the source Rails uses to define your schema when running `bin/rails +# db:schema:load`. When creating a new database, `bin/rails db:schema:load` tends to +# be faster and is potentially less error prone than running all of your +# migrations from scratch. Old migrations may fail to apply correctly if those +# migrations use external dependencies or application code. # # It's strongly recommended that you check this file into your version control system. 
-ActiveRecord::Schema.define(version: 2019_03_30_193044) do - +ActiveRecord::Schema[8.0].define(version: 2025_02_14_185429) do # These are extensions that must be enabled in order to support this database - enable_extension "plpgsql" + enable_extension "pg_catalog.plpgsql" create_table "buses", force: :cascade do |t| t.string "number" t.string "model" + t.index ["number"], name: "index_buses_on_number", unique: true end create_table "buses_services", force: :cascade do |t| t.integer "bus_id" t.integer "service_id" + t.index ["bus_id", "service_id"], name: "index_buses_services_on_bus_id_and_service_id", unique: true end create_table "cities", force: :cascade do |t| t.string "name" + t.index ["name"], name: "index_cities_on_name", unique: true end create_table "services", force: :cascade do |t| t.string "name" + t.index ["name"], name: "index_services_on_name", unique: true end create_table "trips", force: :cascade do |t| @@ -41,5 +44,4 @@ t.integer "price_cents" t.integer "bus_id" end - end diff --git a/lib/tasks/utils.rake b/lib/tasks/utils.rake index 540fe871..beb3b4f2 100644 --- a/lib/tasks/utils.rake +++ b/lib/tasks/utils.rake @@ -1,34 +1,12 @@ # Наивная загрузка данных из json-файла в БД # rake reload_json[fixtures/small.json] + task :reload_json, [:file_name] => :environment do |_task, args| - json = JSON.parse(File.read(args.file_name)) + start_time = Time.current - ActiveRecord::Base.transaction do - City.delete_all - Bus.delete_all - Service.delete_all - Trip.delete_all - ActiveRecord::Base.connection.execute('delete from buses_services;') + TripsImporter.call(args.file_name) - json.each do |trip| - from = City.find_or_create_by(name: trip['from']) - to = City.find_or_create_by(name: trip['to']) - services = [] - trip['bus']['services'].each do |service| - s = Service.find_or_create_by(name: service) - services << s - end - bus = Bus.find_or_create_by(number: trip['bus']['number']) - bus.update(model: trip['bus']['model'], services: services) + end_time = 
Time.current - Trip.create!( - from: from, - to: to, - bus: bus, - start_time: trip['start_time'], - duration_minutes: trip['duration_minutes'], - price_cents: trip['price_cents'], - ) - end - end + p end_time - start_time end diff --git a/spec/rails_helper.rb b/spec/rails_helper.rb new file mode 100644 index 00000000..cb2ba553 --- /dev/null +++ b/spec/rails_helper.rb @@ -0,0 +1,70 @@ +# This file is copied to spec/ when you run 'rails generate rspec:install' +require 'spec_helper' +ENV['RAILS_ENV'] ||= 'test' +require_relative '../config/environment' +# Prevent database truncation if the environment is production +abort("The Rails environment is running in production mode!") if Rails.env.production? +# Uncomment the line below in case you have `--require rails_helper` in the `.rspec` file +# that will avoid rails generators crashing because migrations haven't been run yet +# return unless Rails.env.test? +require 'rspec/rails' +# Add additional requires below this line. Rails is not loaded until this point! + +# Requires supporting ruby files with custom matchers and macros, etc, in +# spec/support/ and its subdirectories. Files matching `spec/**/*_spec.rb` are +# run as spec files by default. This means that files in spec/support that end +# in _spec.rb will both be required and run as specs, causing the specs to be +# run twice. It is recommended that you do not name files matching this glob to +# end with _spec.rb. You can configure this pattern with the --pattern +# option on the command line or in ~/.rspec, .rspec or `.rspec-local`. +# +# The following line is provided for convenience purposes. It has the downside +# of increasing the boot-up time by auto-requiring all files in the support +# directory. Alternatively, in the individual `*_spec.rb` files, manually +# require only the support files necessary. 
+# +# Rails.root.glob('spec/support/**/*.rb').sort_by(&:to_s).each { |f| require f } + +# Checks for pending migrations and applies them before tests are run. +# If you are not using ActiveRecord, you can remove these lines. +begin + ActiveRecord::Migration.maintain_test_schema! +rescue ActiveRecord::PendingMigrationError => e + abort e.to_s.strip +end +RSpec.configure do |config| + # Remove this line if you're not using ActiveRecord or ActiveRecord fixtures + config.fixture_paths = [ + Rails.root.join('spec/fixtures') + ] + + # If you're not using ActiveRecord, or you'd prefer not to run each of your + # examples within a transaction, remove the following line or assign false + # instead of true. + config.use_transactional_fixtures = true + + # You can uncomment this line to turn off ActiveRecord support entirely. + # config.use_active_record = false + + # RSpec Rails uses metadata to mix in different behaviours to your tests, + # for example enabling you to call `get` and `post` in request specs. e.g.: + # + # RSpec.describe UsersController, type: :request do + # # ... + # end + # + # The different available types are documented in the features, such as in + # https://rspec.info/features/7-1/rspec-rails + # + # You can also this infer these behaviours automatically by location, e.g. + # /spec/models would pull in the same behaviour as `type: :model` but this + # behaviour is considered legacy and will be removed in a future version. + # + # To enable this behaviour uncomment the line below. + # config.infer_spec_type_from_file_location! + + # Filter lines from Rails gems in backtraces. + config.filter_rails_from_backtrace! 
+ # arbitrary gems may also be filtered via: + # config.filter_gems_from_backtrace("gem name") +end diff --git a/spec/services/trips_importer_spec.rb b/spec/services/trips_importer_spec.rb new file mode 100644 index 00000000..08fb997f --- /dev/null +++ b/spec/services/trips_importer_spec.rb @@ -0,0 +1,11 @@ +require "rails_helper" + +RSpec.describe TripsImporter, type: :service do + it "imports cities and trips" do + expect do + described_class.call(Rails.root.join("fixtures/example.json")) + end.to change(City, :count).by(2).and change(Service, :count).by(2).and change(Bus, :count).by(1).and change(Trip, :count).by(10) + + expect(Bus.last.services.count).to eq(2) + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 00000000..327b58ea --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,94 @@ +# This file was generated by the `rails generate rspec:install` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. 
+ config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # This option will default to `:apply_to_host_groups` in RSpec 4 (and will + # have no way to turn it off -- the option exists only for backwards + # compatibility in RSpec 3). It causes shared context metadata to be + # inherited by the metadata hash of host groups and examples, rather than + # triggering implicit auto-inclusion in groups with matching metadata. + config.shared_context_metadata_behavior = :apply_to_host_groups + +# The settings below are suggested to provide a good initial experience +# with RSpec, but feel free to customize to your heart's content. +=begin + # This allows you to limit a spec run to individual examples or groups + # you care about by tagging them with `:focus` metadata. When nothing + # is tagged with `:focus`, all examples get run. RSpec also provides + # aliases for `it`, `describe`, and `context` that include `:focus` + # metadata: `fit`, `fdescribe` and `fcontext`, respectively. + config.filter_run_when_matching :focus + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. 
We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = "spec/examples.txt" + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/ + config.disable_monkey_patching! + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = "doc" + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed +=end +end