This repository was archived by the owner on Apr 14, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
96 lines (72 loc) · 4.11 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# TSC
# 2014-06-19
# run_analysis.R for Getting and Cleaning data course project
# requirements are listing in comments as they are attressed
# setwd('/xxxxx/classes/getting_cleaning_data/course_project')
fileURL <- 'https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip '
download.file(fileURL, 'uci_har_dataset.zip', method='curl')
unzip('uci_har_dataset.zip')
# test/subject_test.txt has the subject_id
# test/y_test.txt has the activity_id
# test/X_test.txt has the measurement_vector
# activity_labels.txt has id and activity_label
# features.txt has the label for the measurement_vectors
# load common labels
activity_labels_temp <- read.table('UCI HAR Dataset/activity_labels.txt', header=FALSE)
names(activity_labels_temp) <- c("activity_id", "activity_name") # need nice names later
measurement_labels_temp <- read.table('UCI HAR Dataset/features.txt', header=FALSE)
# we're going to do this twice, so we might as well write a function
get_df <- function (splitname) {
# read the train or test data
df_subject_id <- read.table(sprintf('UCI HAR Dataset/%s/subject_%s.txt', splitname, splitname), header=FALSE)
df_activity_id <- read.table(sprintf('UCI HAR Dataset/%s/y_%s.txt', splitname, splitname), header=FALSE)
df_measurement <- read.table(sprintf('UCI HAR Dataset/%s/X_%s.txt', splitname, splitname), header=FALSE)
# 4. Appropriately labels the data set with descriptive variable names.
names(df_measurement) <- measurement_labels_temp[,2]
# add the subject_id and activity_id
df <- cbind(df_subject_id, df_activity_id, df_measurement)
# give them nice names
colnames(df)[1] <- "subject_id"
colnames(df)[2] <- "activity_id"
# 3. Uses descriptive activity names to name the activities in the data set
# print(dim(df)) # before merge
df <- merge(df, activity_labels_temp, by="activity_id")
# print(dim(df)) # after merge, just to make sure we didn't lose any rows
return(df)
}
# 1. Merges the training and the test sets to create one data set.
df_test = get_df("test")
df_train = get_df("train")
df_full <- rbind(df_test, df_train)
# dim(df_full) ## number of obs matches expected from website
# table(df_full$subject_id)
# OK, we have 30 subjects
# plot(table(df_full$subject_id), type="l")
# abline(h=mean(table(df_full$subject_id)))
# OK, the rows are pretty evenly distributed among the 30 subjects
# table(df_full$activity_name)
# 2. Extracts only the measurements on the mean and standard deviation for each measurement.
# My interpretation is that "mean" should only include names "mean()" not gravityMean or meanFreq
meanCols <- grep('mean\\(\\)', names(df_full)) # 33
stdCols <- grep('std\\(\\)', names(df_full)) # 33
# so let's keep subject_id=2, activity_name=564, and the meanCols and stdCols
df_final <- df_full[,c(2,564,meanCols,stdCols)]
# dim(df_final)
# [1] 10299 68
# sum(complete.cases(df_final)) # no NAs
# 5. Creates a second, independent tidy data set with the average of each variable for each activity and each subject.
# This is ambiguous as to whether to use the variables in the original or the set just created.
# I'm assuming that it is based on the variables in df_final. It is OK to have one row per subject per activiy
# (see http://vita.had.co.nz/papers/tidy-data.pdf page 5)
# So, we should end up with
# subject_id, activity_name, mean(col_1), mean(col2), etc.
# looks like aggregate does everything we need
# I would expect 2 id columns and 66 data columns with 180 rows (30 subs X 6 activities)
df_subject_activity_means <- aggregate(subset(df_final, select= -c(subject_id, activity_name)),
by=list(subject_id=df_final$subject_id, activity_name=df_final$activity_name),
mean)
# table(df_subject_activity_means$activity_name) # 30 for each activity
# table(df_subject_activity_means$subject_id) # 6 for each subject
# sum(complete.cases(df_subject_activity_means))
# writing out the smaller (tidy) set for submission
write.csv(df_subject_activity_means, 'getdata_course_project_submission.csv', row.names = FALSE)