Skip to content

Latest commit

 

History

History
357 lines (313 loc) · 14.8 KB

SQL queries and explanation of output.md

File metadata and controls

357 lines (313 loc) · 14.8 KB

Data cleaning

with cte as (select Price, substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station, 
substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', -1) as Gas_Station_Address,
Area,
substring_index(replace(replace(replace(Thanks, "('", ' '), "'", ''), ")", ''), ', ', 1) as Thanks_By, 
substring_index(replace(replace(replace(Thanks, "('", ' '), "'", ''), ")", ''), ', ', -1) as thanks_last_updated,
last_updated_price
from Regular_Gas),
cte1 as (
select Price, substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station, 
substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', -1) as Gas_Station_Address,
Area,
substring_index(replace(replace(replace(Thanks, "('", ' '), "'", ''), ")", ''), ', ', 1) as Thanks_By, 
substring_index(replace(replace(replace(Thanks, "('", ' '), "'", ''), ")", ''), ', ', -1) as thanks_last_updated,
last_updated_price
from midgrade_Gas),
cte2 as (
select Price, substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station, 
substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', -1) as Gas_Station_Address,
Area,
substring_index(replace(replace(replace(Thanks, "('", ' '), "'", ''), ")", ''), ', ', 1) as Thanks_By, 
substring_index(replace(replace(replace(Thanks, "('", ' '), "'", ''), ")", ''), ', ', -1) as thanks_last_updated,
last_updated_price
from premium_Gas),
cte3 as (
select Price, substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station, 
substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', -1) as Gas_Station_Address,
Area,
substring_index(replace(replace(replace(Thanks, "('", ' '), "'", ''), ")", ''), ', ', 1) as Thanks_By, 
substring_index(replace(replace(replace(Thanks, "('", ' '), "'", ''), ")", ''), ', ', -1) as thanks_last_updated,
last_updated_price
from diesel_fuel_Gas)
select distinct cte.Price as regular_price, cte1.price as midgrade_price, cte2.price as premium_price, cte3.price as diesel_fuel_price,
cte.gas_station, cte.gas_station_address, cte.Area, cte.last_updated_price
from cte
join cte1 on cte.gas_station = cte1.gas_station and cte.gas_station_address = cte1.gas_station_address
join cte2 on cte.gas_station = cte2.gas_station and cte.gas_station_address = cte2.gas_station_address
join cte3 on cte.gas_station = cte3.gas_station and cte.gas_station_address = cte3.gas_station_address
order by cte.last_updated_price desc
limit 10
;

Output

Screen Shot 2024-06-12 at 9 25 01 PM

-> It looks nicer now, after cleaning the CSV files

1. Lowest and highest price in each gas type?

Regular gas

select min(price), max(price) from Regular_Gas;

Output

Screen Shot 2024-06-12 at 9 28 03 PM

-> Highest regular gas price is $8.55, lowest is $3.69.

Midgrade gas

select min(price), max(price) from Midgrade_Gas;

Output

Screen Shot 2024-06-12 at 9 29 07 PM

-> Highest midgrade gas price is $7.23, lowest is $3.75.

Premium gas

select min(price), max(price) from Premium_Gas;

Output

Screen Shot 2024-06-12 at 9 30 08 PM

-> Highest premium gas price is $7.39, lowest is $3.85.

Diesel Fuel gas

select min(price), max(price) from Diesel_Fuel_Gas;

Output

Screen Shot 2024-06-12 at 9 31 10 PM

-> Highest diesel fuel gas price is $8.55, lowest is $3.79.

2. Which gas station has the lowest and highest price in each gas type? (let's look at the first 3)

Regular gas

min(price), max(price)
from Regular_Gas
group by Gas_Station
limit 3;

Output

Screen Shot 2024-06-12 at 9 33 34 PM

Midgrade gas

min(price), max(price)
from Midgrade_gas
group by Gas_Station
limit 3;

Output

Screen Shot 2024-06-12 at 9 34 52 PM

Premium gas

select distinct substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station,
min(price), max(price)
from Premium_gas
group by Gas_Station
limit 3;

Output

Screen Shot 2024-06-12 at 9 35 25 PM

Diesel Fuel gas

select distinct substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station,
min(price), max(price)
from Diesel_Fuel_gas
group by Gas_Station;

Output

Screen Shot 2024-06-12 at 9 35 58 PM

3. Does the price change a lot from Jun 6 to Jun 12 for regular gas?

price difference

with cte as (select date_format(last_updated_price, '%Y/%m/%d %H') as formatted_date, round(avg(price), 2) as average_price, 
lead(round(avg(price), 2)) over(order by date_format(last_updated_price, '%Y/%m/%d %H')) as next_value,
((lead(round(avg(price), 2)) over(order by date_format(last_updated_price, '%Y/%m/%d %H')) - round(avg(price), 2) ) * 100)/round(avg(price), 2) 
as price_difference
from regular_gas
group by formatted_date
order by formatted_date)
select formatted_date, average_price, round(lag(price_difference) over(order by formatted_date), 2) as price_difference
from cte
order by formatted_date 
;

Output

Screen Shot 2024-06-12 at 9 37 45 PM

-> The price went up and down quite frequently, which see its peak on Jun 9 at 3:00 PM (it went down by 6.5%)

average price difference

with cte as (select date_format(last_updated_price, '%Y/%m/%d %H') as formatted_date, round(avg(price), 2) as average_price, 
lead(round(avg(price), 2)) over(order by date_format(last_updated_price, '%Y/%m/%d %H')) as next_value,
((lead(round(avg(price), 2)) over(order by date_format(last_updated_price, '%Y/%m/%d %H')) - round(avg(price), 2) ) * 100)/round(avg(price), 2) 
as price_difference
from regular_gas
group by formatted_date
order by formatted_date)
select round(avg(price_difference), 2) as average_price_difference
from cte;

Output

Screen Shot 2024-06-12 at 9 44 14 PM

-> On average, the price went down by 0.12%.

4. Which area has the highest average gas price?

Regular gas

select distinct Area, round(avg(price), 0) as average_price_per_Area from Regular_Gas
group by Area 
order by average_price_per_Area desc
;

Output

Screen Shot 2024-06-12 at 9 46 41 PM

-> Essex has the highest average regular gas price

Midgrade gas

select distinct Area, round(avg(price), 0) as average_price_per_Area from Midgrade_Gas
group by Area 
order by average_price_per_Area desc
;

Output

Screen Shot 2024-06-12 at 9 47 43 PM

-> Vidal Junction has the highest average midgrade gas price

Premium gas

select distinct Area, round(avg(price), 0) as average_price_per_Area from Premium_Gas
group by Area 
order by average_price_per_Area desc
;

Output

Screen Shot 2024-06-12 at 9 48 39 PM

-> Ludlow has the highest average premium gas price

Diesel Fuel gas

select distinct Area, round(avg(price), 0) as average_price_per_Area from Diesel_Fuel_Gas
group by Area 
order by average_price_per_Area desc
;

Output

Screen Shot 2024-06-12 at 9 49 50 PM

-> Essex has the highest average diesel fuel gas price

5. What's the lowest average price per gas brand in the last 12 hours?

Regular Gas

select substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station, round(avg(price), 2) as 
average_price_per_gas_brand
from Regular_Gas
where last_updated_price > now() - interval 12 hour
group by Gas_Station
order by average_price_per_gas_brand 
limit 2
;

Output

Screen Shot 2024-06-12 at 9 50 58 PM

-> Foxxy Gasoline and Diamond Gas & Mart each has the lowest average regular gas

Midgrade Gas

select substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station, round(avg(price), 2) as 
average_price_per_gas_brand
from Midgrade_Gas
where last_updated_price > now() - interval 12 hour
group by Gas_Station
order by average_price_per_gas_brand 
limit 1
;

Output

Screen Shot 2024-06-12 at 9 52 12 PM

-> Fastrip has the lowest average midgrade gas

Premium Gas

select substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station, round(avg(price), 2) as 
average_price_per_gas_brand
from Premium_Gas
where last_updated_price > now() - interval 12 hour
group by Gas_Station
order by average_price_per_gas_brand 
limit 1
;

Output

Screen Shot 2024-06-12 at 9 53 10 PM

-> Fastrip has the lowest average premium gas

Diesel Fuel Gas

select substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station, round(avg(price), 2) as 
average_price_per_gas_brand
from Diesel_Fuel_Gas
where last_updated_price > now() - interval 12 hour
group by Gas_Station
order by average_price_per_gas_brand 
limit 1
;

Output

Screen Shot 2024-06-12 at 9 53 56 PM

-> Wintun Mini Market has the lowest average diesel fuel gas

6. Which part of California sees the cheapeast average regular gas price, most expensive average regular gas price? Which gas station address sees most regular gas price fluctuation (highest delta), and the most price changes (more changes in price) in the last 12 hours according to this dataset?

The cheapest part of California

select Area, round(avg(price), 2) as 
average_price_per_gas_brand
from Regular_Gas
where last_updated_price > now() - interval 12 hour
group by Area
order by average_price_per_gas_brand 
limit 1
;

Output

Screen Shot 2024-06-12 at 9 56 18 PM

-> Livermore has the cheapest average regular gas price

The most expensive part

select Area, round(avg(price), 2) as 
average_price_per_gas_brand
from Regular_Gas
where last_updated_price > now() - interval 12 hour
group by Area
order by average_price_per_gas_brand desc
limit 1
;

Output

Screen Shot 2024-06-12 at 9 58 10 PM

-> Vidal Junction has the most expensive average regular gas price

most regular gas price fluctuation (highest delta, if there are multiple deltas per gas station address then we will get the average of the deltas) gas station address in the last 12 hours (zero does not count because it means there's no changes)

with cte2 as (with cte1 as (with cte as (select distinct last_updated_price, substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station,
substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', -1) as Gas_Station_Address, price
from regular_gas
where last_updated_price > now() - interval 12 hour
order by gas_station, gas_station_address, last_updated_price)
select *, round(lead(price) over (partition by Gas_Station, Gas_Station_Address order by last_updated_price), 2) as next_value
from cte)
select *, round(next_value-price, 2) as delta from cte1)
select Gas_Station, Gas_Station_Address, avg(delta) as average_delta
from cte2
where delta != 0 and delta != -0
group by Gas_Station, Gas_Station_Address
order by average_delta desc
;

Output

Screen Shot 2024-06-12 at 10 00 21 PM

-> Fastrip on 195 W Hermosa St & N Sweetbriar Ave has the highest delta, it went down by $0.04.

Gas station address with the most price changes (i.e. number of deltas per Gas Station Address) in the last 12 hours

with cte2 as (with cte1 as (with cte as (select distinct last_updated_price, substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', 1) as Gas_Station,
substring_index(replace(replace(replace(Station, "('", ' '), "'", ''), ")", ''), ', ', -1) as Gas_Station_Address, price
from regular_gas
where last_updated_price > now() - interval 12 hour
order by gas_station, gas_station_address, last_updated_price)
select *, round(lead(price) over (partition by Gas_Station, Gas_Station_Address order by last_updated_price), 2) as next_value
from cte)
select *, round(next_value-price, 2) as delta from cte1)
select Gas_Station, Gas_Station_Address, count(delta) as number_of_deltas
from cte2
where delta != 0 and delta != -0
group by Gas_Station, Gas_Station_Address
order by number_of_deltas desc
;

Output

Screen Shot 2024-06-12 at 10 06 35 PM

-> In the last 12 hours, only Fastrip on 195 W Hermosa St & N Sweetbriar Ave has one delta.