Skip to content

Commit cd1dbe9

Browse files
committedJul 22, 2024
new codes
1 parent ee2ff11 commit cd1dbe9

9 files changed

+553
-11
lines changed
 

‎30_question/Log_Table.sql

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
3+
Given a log table (user_id, date, send_on_ios, send_on_android)
4+
5+
- Find out how many times each user signed in a day?
6+
- How many messages were sent?
7+
- First sign-in date?
8+
- How many messages were sent since the first sign-in date?
9+
- Whether the user is active today?
10+
11+
*/
12+
13+
-- 1) Sessionisation for login if no events are there.
14+
15+
-- Count sessions per user. A new session starts whenever the gap between a
-- user's consecutive events is 30 minutes (30 * 60 seconds) or more.
with flagged_events as (
    select
        user_id,
        event_time,
        -- lag() is NULL on a user's first event, so the comparison is not true
        -- and the flag falls through to 0.
        case
            when extract(epoch from (event_time - lag(event_time) over (partition by user_id order by event_time))) >= 30 * 60 then 1
            else 0
        end as is_new_session
    from user_sessions
),
session_groups as (
    select
        user_id,
        event_time,
        -- Running total of the flags: all events of one session share a value.
        sum(is_new_session) over (partition by user_id order by event_time) as session_seq
    from flagged_events
)
select
    user_id,
    count(distinct session_seq) as session_count
from session_groups
group by user_id
order by user_id;
26+
27+
28+
-- 2) How many messages were sent?
29+
30+
-- Messages sent per user. Grouping by user_id alone gives one total per user;
-- also grouping by event_time produced one row per distinct timestamp.
select
    user_id,
    count(*) as messages_sent
from user_sessions
where event_type = 'message_sent'
group by user_id;
31+
32+
-- 3) First sign-in date?
33+
-- Earliest event_time recorded in user_sessions for each user, used here as
-- the first sign-in. Grouping by user_id only, so min() spans all of a user's rows.
select
    user_id,
    min(event_time) as first_sign_in
from user_sessions
group by user_id;
34+
35+
-- 4) How many messages were sent since the first sign-in date?
36+
37+
-- 5) Whether the user is active today?
38+
39+
-- Users with at least one event today.
-- Filters on event_time (the column the other user_sessions queries read) with
-- a half-open range, and uses CURRENT_DATE without parentheses as PostgreSQL requires.
-- NOTE(review): confirm user_sessions has no separate date column that should be used instead.
select distinct user_id
from user_sessions
where event_time >= current_date
  and event_time < current_date + interval '1 day';
40+
41+
s
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
/*
2+
3+
The cloud storage company would like to design a data model to capture all critical data elements and answer the following questions.
4+
5+
Questions -
6+
7+
Track how many files are Shared and % of Shared Files in a week.
8+
9+
what file type is shared more frequently ?
10+
11+
How many files had more than one owner at a given time?
12+
13+
Total File Actions by Content Categories.
14+
15+
File is shared among multiple people
16+
17+
One to record upload/download and the other to record shared assets
18+
19+
How to record file ownership transfer
20+
21+
Find people who only upload photos
22+
23+
*/
24+
25+
Dim_Date
26+
- date_id
27+
- timestamp
28+
- day
29+
- month
30+
- Quarter
31+
- year
32+
33+
Dim_user
34+
- user_id
35+
- date_id
36+
- user_name
37+
- user_email
38+
- user_phone
39+
40+
Dim_Plan
41+
- plan_id
42+
- plan_name (lite, pro, enterprise)
43+
- plan_price
44+
- plan_start_date (date_id)
45+
- plan_end_date (date_id)
46+
47+
Dim_Subscriptions
48+
- subscription_id
49+
- user_id
50+
- plan_id
51+
- subscription_type (monthly, yearly)
52+
- subscription_start_date (date_id)
53+
- subscription_end_date (date_id)
54+
- isActive
55+
56+
Fact_File
57+
- file_id pk
58+
- user_id pk
59+
- file_name
60+
- file_type (video, audio, image, document)
61+
- file_size
62+
- IsFileShared (True, False)
63+
- file_upload_date (date_id)
64+
- file_upload_starttimestamp (date_id)
65+
- file_upload_completetimestamp (date_id)
66+
- file_status (Completed, Cancelled, Progress)
67+
- UNIQUE(file_id, user_id)
68+
69+
Fact_Shared_Files
70+
- file_id
71+
- user_id
72+
- shared_user_id
73+
- share_date (date_id)
74+
- share_type (email, link, social_media)
75+
- access_type (view, edit, owner)
76+
77+
Fact_Logs
78+
- action_id
79+
- date_id
80+
- file_id
81+
- user_id
82+
- action_type (upload, download, share, delete)
83+
84+
85+
86+
-- 1) Track how many files are Shared and % of Shared Files in a week.
87+
-- Share actions per date_id. The action_type condition is a row filter, so it
-- belongs in WHERE rather than HAVING; date_id is returned so each count can
-- be tied to its date.
-- NOTE(review): the weekly roll-up and the % of shared files are still to be added.
select
    date_id,
    count(file_id) as shared_files
from Fact_Logs
where action_type = 'share'
group by date_id;
88+
89+
90+
-- 2) what file type is shared more frequently ?
91+
-- Share actions per file type, most shared first.
-- Fact_File is keyed on (file_id, user_id), so it is reduced to one row per
-- file before joining to avoid multiplying log rows.
-- NOTE(review): assumes file_type is the same on every Fact_File row of a file_id.
select
    f.file_type,
    count(*) as share_count
from Fact_Logs l
inner join (
    select distinct file_id, file_type
    from Fact_File
) f on f.file_id = l.file_id
where l.action_type = 'share'
group by f.file_type
order by share_count desc;
92+
93+
-- 3) How many files had more than one owner at a given time?
94+
95+
-- 4) Total File Actions by Content Categories.
96+
97+
-- 5) File is shared among multiple people
98+
99+
-- 6) One to record upload/download and the other to record shared assets
100+
101+
-- 7) How to record file ownership transfer
102+
103+
-- 8) Find people who only upload photos
104+
105+
106+

‎Data-Modelling/pending1.txt ‎Data-Modelling/Extras.sql

+6-2
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,13 @@ Follow up sql questions
5757
6) I want to launch the same taxi app in a different city (e.g. London), so what data points do we use to make it successful?
5858
8) Few SOL to find out people wo took taxi for airport any country they i caly tent to open a newcount report)
5959
60+
*/
61+
6062
-- Solution
6163

62-
1. Designing a Data Model for a Taxi Company
64+
-- 1. Designing a Data Model for a Taxi Company
6365

64-
Entity-Relationship Diagram (ERD)
66+
-- Entity-Relationship Diagram (ERD)
6567

6668
The data model for a taxi company should capture various aspects of the business, including drivers, vehicles, trips,
6769
customers, and payments. Here’s a high-level overview of the entities and their relationships:
@@ -205,6 +207,8 @@ JOIN Trip t ON v.vehicle_id = t.vehicle_id
205207
WHERE t.status = 'completed'
206208
GROUP BY v.vehicle_id, v.license_plate;
207209

210+
/*
211+
208212
Summary
209213
210214
The data model consists of Driver, Vehicle, Customer, Trip, Payment, and Location tables. Key Performance Indicators (KPIs) such as total revenue, number of trips,

‎Data-Modelling/Online_Streaming.sql

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
3+
Model the video streaming platform to answer common Questions
4+
5+
Questions -
6+
7+
How many users stream Daily/Monthly
8+
9+
Average viewing per user
10+
11+
Users who viewed certain content
12+
13+
Users who watched certain content on release date.
14+
15+
*/
16+
17+
Dim_Date
18+
- date_id
19+
- date
20+
- day
21+
- month
22+
- year
23+
- day_of_week
24+
- week_of_year
25+
26+
Dim_user
27+
- user_id
28+
- date_id (fk)
29+
- user_name
30+
- user_phone
31+
- user_email
32+
33+
Dim_Plan
34+
- plan_id
35+
- plan_name (lite, pro, enterprise)
36+
- plan_price
37+
- plan_start_date (date_id) (fk)
38+
- plan_end_date (date_id) (fk)
39+
40+
Dim_Subscriptions
41+
- subscription_id
42+
- user_id (fk)
43+
- plan_id (fk)
44+
- subscription_type (monthly, yearly)
45+
- subscription_start_date (date_id) (fk)
46+
- subscription_end_date (date_id) (fk)
47+
- isActive (true, false)
48+
49+
Dim_Content
50+
- content_id
51+
- content_name
52+
- content_type (video, audio, image, document)
53+
- content_category
54+
- content_release_date (date_id)
55+
56+
Fact_Streaming
57+
- streaming_id
58+
- user_id (fk)
59+
- content_id (fk)
60+
- subscription_id (fk)
61+
- streaming_date (date_id)
62+
- streaming_duration
63+
- streaming_device
64+
- streaming_location
65+
- streaming_quality
66+
- streaming_status
67+
- streaming_rating
68+
- streaming_feedback
69+
- UNIQUE(streaming_id, user_id, content_id)
70+
71+
-- 1) How many users stream Daily/Monthly
72+
-- Distinct streaming users at daily, weekly and monthly grain, stacked into
-- one result set (one row per grain and period) instead of a cartesian
-- product of the three CTEs.
-- NOTE(review): assumes streaming_date is a date/timestamp, as the EXTRACT
-- calls in the original query imply; confirm against Fact_Streaming.
with dau as (
    select
        cast(streaming_date as date) as period_start,
        count(distinct user_id) as active_users
    from Fact_Streaming
    group by cast(streaming_date as date)
),
wau as (
    -- date_trunc keeps the year, so the same week number in different years stays separate.
    select
        cast(date_trunc('week', streaming_date) as date) as period_start,
        count(distinct user_id) as active_users
    from Fact_Streaming
    group by cast(date_trunc('week', streaming_date) as date)
),
mau as (
    select
        cast(date_trunc('month', streaming_date) as date) as period_start,
        count(distinct user_id) as active_users
    from Fact_Streaming
    group by cast(date_trunc('month', streaming_date) as date)
)
select 'daily' as grain, period_start, active_users from dau
union all
select 'weekly', period_start, active_users from wau
union all
select 'monthly', period_start, active_users from mau
order by grain, period_start;
76+
77+
-- 2) Average viewing per user
78+
-- Mean streaming_duration over each user's Fact_Streaming rows.
SELECT
    user_id,
    AVG(streaming_duration) AS avg_viewing_duration
FROM Fact_Streaming
GROUP BY user_id;
79+
80+
-- 3) Users who viewed certain content
81+
-- Users who streamed a given title, with how many times each streamed it.
-- content_name is a Dim_Content attribute, so Fact_Streaming is joined to it
-- through content_id before filtering.
select
    s.user_id,
    count(s.user_id) as stream_count
from Fact_Streaming s
inner join Dim_Content c on c.content_id = s.content_id
where c.content_name = 'Toy story2'
group by s.user_id;
82+
83+
-- 4) Users who watched certain content on release date.
84+
-- Per user, the number of streams whose streaming_date equals the release
-- date of the streamed content.
SELECT
    fs.user_id,
    COUNT(1)
FROM Fact_Streaming AS fs
JOIN Dim_Content AS dc
    ON fs.content_id = dc.content_id
WHERE fs.streaming_date = dc.content_release_date
GROUP BY fs.user_id;

‎Data-Modelling/Taxi_Ride_Service.sql

+181
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/*
2+
3+
Taxi Company would like to design a data model to capture all critical data elements.
4+
5+
Questions -
6+
7+
Track rides done by driver and their Performance
8+
9+
How many rides are happening to a common/famous destinations each day( Airports , Parks , Museums etc)
10+
11+
How many trips are cancelled per day.
12+
13+
How many rides and the average price during the peak hour per day.
14+
15+
What data points do you add to measure success - DAU, MAU, WAU
16+
17+
About driver and customer in the same table
18+
19+
Find out the percentage of people who took a taxi directly from an airport in any country
20+
21+
Find customers who have only taken taxis from airports (exclusive to airports).
22+
23+
I want to launch the same taxi app in a different city (e.g. London), so what data points do we use to make it successful?
24+
25+
*/
26+
27+
Dim_Date
28+
- date_id
29+
- date
30+
- day
31+
- month
32+
- year
33+
- day_of_week
34+
- week_of_year
35+
36+
Dim_user
37+
- user_id
38+
- user_name
39+
- user_email
40+
- user_phone
41+
- user_address
42+
43+
Dim_vehicle
44+
- vehicle_id
45+
- vehicle_type
46+
- vehicle_number
47+
- vehicle_model
48+
- vehicle_color
49+
50+
Dim_Driver
51+
- driver_id
52+
- user_id
53+
- vehicle_id
54+
- driver_license
55+
- driver_rating
56+
- driver_experience
57+
58+
Dim_Location
59+
- Location_id,
60+
- Location_name
61+
- Latitude
62+
- Longitude
63+
- Landmark_Type (Airport, Park, Museum)
64+
- Landmark Name
65+
- Landmark City
66+
- State
67+
- country
68+
69+
Fact_payment
70+
- payment_id
71+
- ride_id
72+
- payment_type (cash, card)
73+
- payment_amount
74+
- payment_date (date_id)
75+
- payment_status (completed, pending)
76+
- taxes
77+
- discount
78+
- total_amount
79+
- payment_gateway
80+
- base_rate
81+
- surge_rate
82+
- tip_amount
83+
84+
Fact_trips
85+
- trip_id
86+
- user_id
87+
- driver_id
88+
- pick_up_Location_id
89+
- drop_Location_id
90+
- payment_id
91+
- trip_starttimestamp (date_id)
92+
- trip_endtimestamp (date_id)
93+
- trip_status (completed, cancelled, progress)
94+
- trip_rating
95+
96+
97+
-- 1) Track rides done by driver and their Performance
98+
99+
-- Rides and average rating per driver.
-- Counts trips rather than distinct driver_id (which is always 1 inside a
-- driver_id group) and reads Fact_trips, the trips table this model defines.
SELECT
    driver_id,
    COUNT(trip_id) AS total_rides,
    AVG(trip_rating) AS avg_rating
FROM Fact_trips
GROUP BY driver_id;
100+
101+
-- 2) How many rides are happening to a common/famous destinations each day( Airports , Parks , Museums etc)
102+
103+
-- Top two famous destinations (airports, museums, parks) by number of trips.
with landmark_trips as (
    select
        loc.Location_name,
        count(1) as trip_counts
    from Fact_trips t
    -- Rides *to* a destination are matched on the drop-off location.
    inner join Dim_Location loc on t.drop_Location_id = loc.Location_id
    where loc.Landmark_Type in ('Airport', 'Museum', 'Park')
    group by loc.Location_name
),
ranked_landmarks as (
    -- Join aliases are not visible outside the CTE that defines them, so the
    -- columns are referenced unqualified from here on.
    select
        Location_name,
        trip_counts,
        dense_rank() over (order by trip_counts desc) as rnk
    from landmark_trips
)
select
    Location_name,
    trip_counts
from ranked_landmarks
where rnk <= 2;
112+
113+
-- 3) How many trips are cancelled per day.
114+
115+
-- Cancelled trips per calendar day.
-- Grouping on the raw start timestamp yields one group per distinct instant,
-- so the timestamp is reduced to its date first and that date is returned.
-- NOTE(review): assumes trip_starttimestamp is a timestamp, as the
-- EXTRACT(HOUR ...) usage elsewhere in this file implies.
select
    cast(trip_starttimestamp as date) as trip_date,
    count(1) as cancelled_trips
from Fact_trips
where trip_status = 'cancelled'
group by cast(trip_starttimestamp as date)
order by trip_date;
116+
117+
-- 4) How many rides and the average price during the peak hour per day.
118+
119+
-- Peak hour of each day: ride count and average price in the busiest hour.
with hourly_trips as (
    select
        -- Full date rather than day-of-month, so different months do not merge.
        cast(t.trip_starttimestamp as date) as trip_date,
        extract(hour from t.trip_starttimestamp) as trip_hour,
        count(1) as trip_count,
        avg(p.tip_amount) as avg_tip_amount,
        -- NOTE(review): total_amount is taken as the ride price; confirm
        -- whether payment_amount is the intended measure.
        avg(p.total_amount) as avg_trip_amount
    from Fact_trips t
    inner join Fact_payment p on t.payment_id = p.payment_id
    group by
        cast(t.trip_starttimestamp as date),
        extract(hour from t.trip_starttimestamp)
),
ranked_hours as (
    select
        trip_date,
        trip_hour,
        trip_count,
        avg_tip_amount,
        avg_trip_amount,
        -- Rank hours within each day; dense_rank keeps ties for the busiest hour.
        dense_rank() over (partition by trip_date order by trip_count desc) as rnk
    from hourly_trips
)
select
    trip_date,
    trip_hour,
    trip_count,
    avg_tip_amount,
    avg_trip_amount
from ranked_hours
where rnk = 1
order by trip_date;
131+
132+
133+
-- What data points do you add to measure success (DAU, MAU, WAU)
134+
-- Daily Active Users (DAU): Number of unique users using the app per day.
135+
-- Weekly Active Users (WAU): Number of unique users using the app per week.
136+
-- Monthly Active Users (MAU): Number of unique users using the app per month.
137+
-- Ride Completion Rate: Percentage of completed rides out of total requested rides.
138+
-- Customer Satisfaction: Average rating given by users.
139+
-- Driver Retention Rate: Percentage of drivers continuing to use the platform over a period.
140+
141+
-- About driver and customer in the same table
142+
143+
-- This is generally not recommended as drivers and customers have distinct attributes and roles. Keeping them in separate tables ensures better normalization and easier data management.
144+
145+
-- Find out percentage of people who took taxi directly from any airport in any country
146+
147+
-- Percentage of riders who have taken at least one taxi directly from an airport.
-- Both sides of the ratio count distinct users, so the result is a share of
-- people rather than one user's trips divided by everyone's trips.
SELECT
    COUNT(DISTINCT a.user_id) FILTER (WHERE b.landmark_type = 'Airport') AS airport_users,
    COUNT(DISTINCT a.user_id) AS total_users,
    COUNT(DISTINCT a.user_id) FILTER (WHERE b.landmark_type = 'Airport') * 100.0
        / NULLIF(COUNT(DISTINCT a.user_id), 0) AS percentage_from_airport
FROM Fact_Trips a
-- LEFT JOIN keeps riders whose pick-up location has no Dim_Location row in the denominator.
LEFT JOIN Dim_Location b ON a.pick_up_location_id = b.location_id;
155+
156+
-- Find customers who have only taken taxis from airports (exclusive for airports)
157+
158+
-- Customers whose every trip started at an airport: their number of distinct
-- airport pick-up trips equals their total row count in Fact_Trips.
select
    t.user_id
from Fact_Trips as t
join Dim_Location as loc
    on t.pick_up_location_id = loc.location_id
where loc.landmark_type = 'Airport'
group by t.user_id
having count(distinct t.trip_id) = (
    select count(*)
    from Fact_Trips as all_trips
    where all_trips.user_id = t.user_id
);
169+
170+
171+
-- Data Points for Launching in a New City (e.g., London)
172+
-- Market Demand: Population density, tourist influx, business hubs.
173+
-- Competitor Analysis: Existing transportation options, pricing strategies.
174+
-- Regulations: Local laws, licensing requirements.
175+
-- User Demographics: Preferences, income levels, frequent destinations.
176+
-- Traffic and Transportation: Public transport availability, traffic patterns.
177+
-- Pricing Strategy: Dynamic pricing, competitive rates.
178+
-- Operational Logistics: Fleet size, driver recruitment.
179+
-- Technology Integration: Localization of app features, payment systems.
180+
-- Marketing Strategy: Targeted campaigns, partnerships.
181+
-- Customer Feedback: Continuous improvement based on user reviews and performance metrics.

‎Notes.md

+12-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
# By default (range between unbounded preceding and current row):
2-
- This specifies that the window frame includes all rows from the start of the partition up to and including the current row.
2+
- This means that the frame includes all rows from the beginning of the partition up to the current row, based on the order of the rows by the specified value.
33

4-
# Explicitly stated (range between unbounded preceding and unbounded following):
5-
- This specifies that the window frame includes all rows from the start of the partition up to the end of the partition, regardless of the current row.
4+
# ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
5+
- This means that the frame includes the current row and the six preceding rows, making a total of 7 rows
6+
7+
# ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
8+
- Includes the current row and the previous row.
9+
10+
# ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
11+
- Includes the current row and the two preceding rows.
12+
13+
# ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
14+
- Includes all rows from the start up to the current row.

‎Sessionisation.sql

+1-5
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,11 @@ with cte as (
22
SELECT
33
user_id,
44
mytimestamp as ogtimestamp,
5-
case when EXTRACT(EPOCH FROM mytimestamp) - LAG(
6-
EXTRACT(EPOCH FROM mytimestamp)
7-
) OVER (PARTITION BY user_id ORDER BY mytimestamp) >= 30 * 60 then 1 else 0 end AS time_interval
8-
-- , row_number() over(PARTITION BY user_id ORDER BY mytimestamp) rn
5+
case when EXTRACT(EPOCH FROM (mytimestamp - LAG(mytimestamp) OVER (PARTITION BY user_id ORDER BY mytimestamp))) >= 30 * 60 then 1 else 0 end AS time_interval
96
FROM user_timestamps
107
ORDER BY 1, 2
118
)
129
select
1310
user_id, ogtimestamp, time_interval, user_id || '_' || time_interval,
14-
sum(time_interval) over(PARTITION BY user_id, ogtimestamp) global_session_id,
1511
sum(time_interval) over(PARTITION BY user_id ORDER BY ogtimestamp) user_session_id
1612
from cte

‎Success_Metrics.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
-- Daily Active Users (DAU)
22

33
SELECT
4-
DATE(activity_timestamp) AS activity_date,
4+
activity_date,
55
COUNT(DISTINCT user_id) AS dau
66
FROM
77
user_activity_log

‎Year_on_Year.sql

+121
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
-- window function to calculate a rolling average. For simplicity, let's use a 7-day rolling average.
2+
3+
-- DAU YoY Growth Rate
4+
5+
/*
6+
CREATE TABLE user_activity (
7+
user_id INT,
8+
activity_date DATE
9+
);
10+
11+
-- Sample data for January 2024
12+
INSERT INTO user_activity (user_id, activity_date) VALUES
13+
(1, '2024-01-01'), (2, '2024-01-01'), (3, '2024-01-01'),
14+
(1, '2024-01-02'), (2, '2024-01-02'), (4, '2024-01-02'),
15+
(1, '2024-01-03'), (3, '2024-01-03'), (4, '2024-01-03'),
16+
(2, '2024-01-04'), (3, '2024-01-04'), (5, '2024-01-04'),
17+
(1, '2024-01-05'), (2, '2024-01-05'), (5, '2024-01-05'),
18+
(3, '2024-01-06'), (4, '2024-01-06'), (5, '2024-01-06'),
19+
(1, '2024-01-07'), (2, '2024-01-07'), (3, '2024-01-07');
20+
21+
-- Sample data for January 2023
22+
INSERT INTO user_activity (user_id, activity_date) VALUES
23+
(1, '2023-01-01'), (2, '2023-01-01'),
24+
(1, '2023-01-02'), (3, '2023-01-02'),
25+
(2, '2023-01-03'), (4, '2023-01-03'),
26+
(1, '2023-01-04'), (3, '2023-01-04'),
27+
(2, '2023-01-05'), (4, '2023-01-05'),
28+
(1, '2023-01-06'), (3, '2023-01-06'),
29+
(2, '2023-01-07'), (4, '2023-01-07');
30+
31+
*/
32+
33+
34+
-- DAU per day, its growth versus the same date one year earlier, and the
-- average of that growth over the current and six preceding rows.
with daily_dau as (
    select
        activity_date,
        count(distinct user_id) as daily_active_users
    from user_activity
    group by activity_date
),
yoy_growth as (
    select
        cur.activity_date,
        cur.daily_active_users,
        prev.daily_active_users as previous_year_daily_active_users,
        -- Left join: a day with no row one year earlier keeps a NULL growth rate.
        (cast(cur.daily_active_users as float) / prev.daily_active_users - 1) * 100 as yoy_growth_rate
    from daily_dau as cur
    left join daily_dau as prev
        on cur.activity_date = prev.activity_date + interval '1 year'
)
select
    activity_date,
    daily_active_users,
    yoy_growth_rate,
    avg(yoy_growth_rate) over (
        order by activity_date
        rows between 6 preceding and current row
    ) as rolling_avg_yoy_growth_rate
from yoy_growth
order by activity_date;
68+
69+
-- MAU YoY Growth Rate
70+
71+
-- MAU YoY growth rate, with its average over the current and six preceding rows.
with mau as (
    select
        date_trunc('month', activity_date) as activity_date,
        count(distinct user_id) as cn
    from user_activity
    group by date_trunc('month', activity_date)
),
mau_yoy_growth as (
    select
        a.cn as mau,
        a.activity_date,
        -- b.cn is a count over a non-empty group, so it is never zero.
        round((a.cn - b.cn) * 100.0 / b.cn, 2) as mau_yoy
    from mau a
    -- Inner join: only months that have a counterpart one year earlier are kept.
    inner join mau b on a.activity_date = b.activity_date + interval '1 year'
)
select
    activity_date,
    mau,
    mau_yoy,
    round(avg(mau_yoy) over (order by activity_date rows between 6 preceding and current row), 2) as r_yoy
from mau_yoy_growth
-- Ordering belongs on the final result; an ORDER BY inside a CTE does not carry through.
order by activity_date;
93+
94+
95+
96+
97+
/*
98+
99+
CREATE TABLE revenue (
100+
transaction_date DATE,
101+
total_revenue NUMERIC
102+
);
103+
104+
-- Sample data for 2023 and 2024
105+
INSERT INTO revenue (transaction_date, total_revenue) VALUES
106+
('2023-01-01', 1000), ('2023-01-02', 1100), ('2023-01-03', 1050),
107+
('2023-01-04', 1200), ('2023-01-05', 1250), ('2023-01-06', 1300),
108+
('2023-01-07', 1400), ('2023-02-01', 1500), ('2023-02-02', 1600),
109+
('2023-02-03', 1700), ('2023-02-04', 1800), ('2023-02-05', 1900),
110+
('2024-01-01', 2000), ('2024-01-02', 2200), ('2024-01-03', 2100),
111+
('2024-01-04', 2400), ('2024-01-05', 2500), ('2024-01-06', 2600),
112+
('2024-01-07', 2800), ('2024-02-01', 3000), ('2024-02-02', 3200),
113+
('2024-02-03', 3400), ('2024-02-04', 3600), ('2024-02-05', 3800);
114+
*/
115+
116+
-- Daily revenue YoY growth: each date's revenue against the same date one year earlier.
with daily_revenue as (
    -- GROUP BY already yields one row per date, so DISTINCT is not needed.
    select
        transaction_date,
        sum(total_revenue) as sm
    from revenue
    group by transaction_date
),
dau_yoy_revenue as (
    select
        a.transaction_date,
        -- NULLIF guards against a prior-year day with zero revenue.
        ((a.sm - b.sm) * 100.0) / nullif(b.sm, 0) as revenue_yoy_pct
    from daily_revenue a
    inner join daily_revenue b on a.transaction_date = b.transaction_date + interval '1 year'
)
select
    transaction_date,
    revenue_yoy_pct
from dau_yoy_revenue
order by transaction_date;
121+

0 commit comments

Comments
 (0)
Please sign in to comment.