Skip to content

Commit

Permalink
Edits
Browse files Browse the repository at this point in the history
  • Loading branch information
neelgupta2112 committed Jan 29, 2025
1 parent 3f8f470 commit 57720f4
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 1 deletion.
Binary file modified .DS_Store
Binary file not shown.
157 changes: 157 additions & 0 deletions JoE/.Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
# 4. Test whether another die is fair
set.seed(123)
View(df_2)
library(XNomial) # for xmulti
library(RVAideMemoire) # for G.test
library(coin) # for symmetry_test
# 4. Test whether another die is fair
set.seed(123)
a = sample(seq(1,6), size=100, replace=TRUE, prob=c(0.1,0.2,0.1,0.4,0.1,0.1))
df_2 = data.frame(
Roll = factor(seq(1, 100, 1)),
Face = factor(a)
)
# Create and view a crosstabs
# TODO
xt_2 = xtabs( ~ Face, data=df)
# Conduct a multinomial test
xmulti(xt_2, rep(1/length(xt_2), length(xt_2)), statName="Prob")
View(df_2)
# Create and view a crosstabs
# TODO
xt_2 = xtabs( ~ Face, data=df)
View(df_2)
# Create and view a crosstabs
# TODO
xt_2 = xtabs( ~Face, data=df)
# Create and view a crosstabs
# TODO
xt_2 = xtabs( ~ Face, data=df)
# Conduct six binomial tests
# TODO
binom.test(sum(df_2$Face == 1), nrow(df), p=1/6)
binom.test(sum(df_2$Face == 2), nrow(df), p=1/6)
# 3. Test whether a die is fair
set.seed(123)
a = sample(seq(1,6), size=100, replace=TRUE, prob=rep(1/6, 6))
df = data.frame(
Roll = factor(seq(1, 100, 1)),
Face = factor(a)
)
View(df)
# Make and view a crosstabs
# TODO
xt = xtabs( ~ Face, data=df)
xt
# Conduct a multinomial test
# TODO
xmulti(xt, rep(1/length(xt), length(xt)), statName="Prob")
xmulti
# 4. Test whether another die is fair
set.seed(123)
a = sample(seq(1,6), size=100, replace=TRUE, prob=c(0.1,0.2,0.1,0.4,0.1,0.1))
df_2 = data.frame(
Roll = factor(seq(1, 100, 1)),
Face = factor(a)
)
View(df_2)
# Create and view a crosstabs
# TODO
xt_2 = xtabs( ~ Face, data=df)
# Conduct a multinomial test
xmulti(xt_2, rep(1/length(xt_2), length(xt_2)), statName="Prob")
xt_2
# Conduct six binomial tests
# TODO
binom.test(sum(df_2$Face == 1), nrow(df), p=1/6)
binom.test(sum(df_2$Face == 2), nrow(df), p=1/6)
binom.test(sum(df_2$Face == 3), nrow(df), p=1/6)
binom.test(sum(df_2$Face == 4), nrow(df), p=1/6)
binom.test(sum(df_2$Face == 5), nrow(df), p=1/6)
binom.test(sum(df_2$Face == 6), nrow(df), p=1/6)
# Correct the p-values for multiple comparisons
# TODO
p.adjust(c(0.006882, 0.2296, 0.1408, 9.755e-08, 0.1408, 0.1408), method="holm")
# Conduct six binomial tests
# TODO
binom.test(sum(df_2$Face == 1), nrow(df_2), p=1/6)
binom.test(sum(df_2$Face == 2), nrow(df_2), p=1/6)
binom.test(sum(df_2$Face == 3), nrow(df_2), p=1/6)
binom.test(sum(df_2$Face == 4), nrow(df_2), p=1/6)
binom.test(sum(df_2$Face == 5), nrow(df_2), p=1/6)
binom.test(sum(df_2$Face == 6), nrow_2(df), p=1/6)
binom.test(sum(df_2$Face == 6), nrow(df_2), p=1/6)
# Correct the p-values for multiple comparisons
# TODO
p.adjust(c(0.006882, 0.2296, 0.1408, 9.755e-08, 0.1408, 0.1408), method="holm")
# 5. Test association between Major and GPA
set.seed(123)
a = sample(c("gte3.4","lt3.4"), size=50, replace=TRUE, prob=c(0.3, 0.7))
b = sample(c("gte3.4","lt3.4"), size=50, replace=TRUE, prob=c(0.7, 0.3))
df_3 = data.frame(
PId = factor(seq(1, 100, 1)),
Major = factor(rep(c("STEM","nonSTEM"), each=50), levels=c("STEM","nonSTEM")),
GPA = factor(c(a,b), levels=c("gte3.4","lt3.4"))
)
# Create and view a crosstabs
xt_3 = xtabs(~ GPA + Major, data = df_3)
xt_3
# Conduct three tests of association
fisher.test(xt_3)
chisq.test(xt_3)
G.test(xt_3)
df_3
xt_3
# 6. Compare GPAs within STEM and non-STEM majors
binom.test(xt_3[,1], p=51/100)
binom.test(xt_3[,2], p=51/100)
p.adjust(c(0.004183, 0.002857), method="holm")
#8. Create dependent counts
set.seed(123)
fall = sample(c("vanilla","chocolate","strawberry"), size=25, replace=TRUE, prob=c(0.3, 0.1, 0.6))
winter = sample(c("vanilla","chocolate","strawberry"), size=25, replace=TRUE, prob=c(0.6, 0.3, 0.1))
spring = sample(c("vanilla","chocolate","strawberry"), size=25, replace=TRUE, prob=c(0.1, 0.6, 0.3))
summer = sample(c("vanilla","chocolate","strawberry"), size=25, replace=TRUE, prob=c(0.4, 0.4, 0.2))
df_4 = data.frame(
PId = factor(rep(1:25, times=4)),
Season = factor(rep(c("fall","winter","spring","summer"), each=25), levels=c("fall","winter","spring","summer")),
Flavor = factor(c(fall, winter, spring, summer), levels=c("vanilla","chocolate","strawberry"))
)
df_4 <- df_4[order(df$PId),] # sort by PId
row.names(df_4) <- 1:nrow(df_4) # renumber row names
View(df_4)
print(xt_4) # view table
# conduct a symmetry test
symmetry_test(Flavor ~ Season | PId, data=df_4)
# 9. Analyze Winter flavor choices
print(xt_4[2,]) # Winter
# Perform a multinomial test on Winter flavor choices xt[2,]
# TODO
xmulti(xt_4[2,], c(37/100, 35/100, 28/100), statName="Prob") #lib=XNomial
# 10. Which Winter flavors differ from chance?
binom.test(sum(df_4$Season == "winter" & df_4$Flavor == "vanilla"), 25, p = 1/3)
binom.test(sum(df_4$Season == "winter" & df_4$Flavor == "chocolate"), 25, p = 1/3)
binom.test(sum(df_4$Season == "winter" & df_4$Flavor == "strawberry"), 25, p = 1/3)
p.adjust(c(0.0001279, 0.2037, 0.005154), method="holm")
# 5. Test association between Major and GPA
set.seed(123)
a = sample(c("gte3.4","lt3.4"), size=50, replace=TRUE, prob=c(0.3, 0.7))
b = sample(c("gte3.4","lt3.4"), size=50, replace=TRUE, prob=c(0.7, 0.3))
df_3 = data.frame(
PId = factor(seq(1, 100, 1)),
Major = factor(rep(c("STEM","nonSTEM"), each=50), levels=c("STEM","nonSTEM")),
GPA = factor(c(a,b), levels=c("gte3.4","lt3.4"))
)
View(df_3)
# Create and view a crosstabs
xt_3 = xtabs(~ GPA + Major, data = df_3)
xt_3
# Conduct three tests of association
fisher.test(xt_3)
chisq.test(xt_3)
G.test(xt_3)
df_3
xt_3
?binom.test
xt_3[,1]
xt_3[,2]
4 changes: 3 additions & 1 deletion JoE/paper.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,11 @@ Data cannot be analyzed responsibly without deep knowledge of its social and his

But despite its significance, social and historical knowledge and methodologies are one of the most neglected areas in undergraduate computing education. In classes, it is very common for students to use datasets that they find on websites like Kaggle, datasets that are poorly documented and that students thus don’t fully understand. This is a recipe for irresponsible data work and a bad habit that can become a dangerous habit as the stakes get higher.

The motivation for our data essays is also in spired by "Datasheets for Datasets" `[@Gebru:2021]` which argues that "every dataset be accompanied with a datasheet that documents its motivation, composition, collection process, recommended uses, and so on." Our data essays resemble these datasheets and do more, with a focus on sites of uncertainty and ambiguity within the datasets and the collection process.

# Our Data Essays

The general structure of our data essays are inspired by Heather Krause’s concept of a data biography and the content builds on "Datasheets for Datasets" `[@Gebru:2021]` which argues that "every dataset be accompanied with a datasheet that documents its motivation, composition, collection process, recommended uses, and so on." Our data essays resemble these datasheets and do more, with a focus on sites of uncertainty and ambiguity within the datasets and the collection process.
The general structure of our data essays are inspired by Heather Krause’s concept of a data biography.

A data essay will and should differ depending on the data it describes, but most of our data essays try to address the following questions:

Expand Down

0 comments on commit 57720f4

Please sign in to comment.