Edits

melaniewalsh · Jan 29, 2025 · 57720f4 · 57720f4
1 parent 3f8f470
commit 57720f4
Show file tree

Hide file tree

Showing 3 changed files with 160 additions and 1 deletion.
diff --git a/.DS_Store b/.DS_Store
diff --git a/JoE/.Rhistory b/JoE/.Rhistory
@@ -0,0 +1,157 @@
+# 4. Test whether another die is fair
+set.seed(123)
+View(df_2)
+library(XNomial) # for xmulti
+library(RVAideMemoire) # for G.test
+library(coin) # for symmetry_test
+# 4. Test whether another die is fair
+set.seed(123)
+a = sample(seq(1,6), size=100, replace=TRUE, prob=c(0.1,0.2,0.1,0.4,0.1,0.1))
+df_2 = data.frame(
+Roll = factor(seq(1, 100, 1)),
+Face = factor(a)
+)
+# Create and view a crosstabs
+# TODO
+xt_2 = xtabs( ~ Face, data=df)
+# Conduct a multinomial test
+xmulti(xt_2, rep(1/length(xt_2), length(xt_2)), statName="Prob")
+View(df_2)
+# Create and view a crosstabs
+# TODO
+xt_2 = xtabs( ~ Face, data=df)
+View(df_2)
+# Create and view a crosstabs
+# TODO
+xt_2 = xtabs( ~Face, data=df)
+# Create and view a crosstabs
+# TODO
+xt_2 = xtabs( ~ Face, data=df)
+# Conduct six binomial tests
+# TODO
+binom.test(sum(df_2$Face == 1), nrow(df), p=1/6)
+binom.test(sum(df_2$Face == 2), nrow(df), p=1/6)
+# 3. Test whether a die is fair
+set.seed(123)
+a = sample(seq(1,6), size=100, replace=TRUE, prob=rep(1/6, 6))
+df = data.frame(
+Roll = factor(seq(1, 100, 1)),
+Face = factor(a)
+)
+View(df)
+# Make and view a crosstabs
+# TODO
+xt = xtabs( ~ Face, data=df)
+xt
+# Conduct a multinomial test
+# TODO
+xmulti(xt, rep(1/length(xt), length(xt)), statName="Prob")
+xmulti
+# 4. Test whether another die is fair
+set.seed(123)
+a = sample(seq(1,6), size=100, replace=TRUE, prob=c(0.1,0.2,0.1,0.4,0.1,0.1))
+df_2 = data.frame(
+Roll = factor(seq(1, 100, 1)),
+Face = factor(a)
+)
+View(df_2)
+# Create and view a crosstabs
+# TODO
+xt_2 = xtabs( ~ Face, data=df)
+# Conduct a multinomial test
+xmulti(xt_2, rep(1/length(xt_2), length(xt_2)), statName="Prob")
+xt_2
+# Conduct six binomial tests
+# TODO
+binom.test(sum(df_2$Face == 1), nrow(df), p=1/6)
+binom.test(sum(df_2$Face == 2), nrow(df), p=1/6)
+binom.test(sum(df_2$Face == 3), nrow(df), p=1/6)
+binom.test(sum(df_2$Face == 4), nrow(df), p=1/6)
+binom.test(sum(df_2$Face == 5), nrow(df), p=1/6)
+binom.test(sum(df_2$Face == 6), nrow(df), p=1/6)
+# Correct the p-values for multiple comparisons
+# TODO
+p.adjust(c(0.006882, 0.2296, 0.1408, 9.755e-08, 0.1408, 0.1408), method="holm")
+# Conduct six binomial tests
+# TODO
+binom.test(sum(df_2$Face == 1), nrow(df_2), p=1/6)
+binom.test(sum(df_2$Face == 2), nrow(df_2), p=1/6)
+binom.test(sum(df_2$Face == 3), nrow(df_2), p=1/6)
+binom.test(sum(df_2$Face == 4), nrow(df_2), p=1/6)
+binom.test(sum(df_2$Face == 5), nrow(df_2), p=1/6)
+binom.test(sum(df_2$Face == 6), nrow_2(df), p=1/6)
+binom.test(sum(df_2$Face == 6), nrow(df_2), p=1/6)
+# Correct the p-values for multiple comparisons
+# TODO
+p.adjust(c(0.006882, 0.2296, 0.1408, 9.755e-08, 0.1408, 0.1408), method="holm")
+# 5. Test association between Major and GPA
+set.seed(123)
+a = sample(c("gte3.4","lt3.4"), size=50, replace=TRUE, prob=c(0.3, 0.7))
+b = sample(c("gte3.4","lt3.4"), size=50, replace=TRUE, prob=c(0.7, 0.3))
+df_3 = data.frame(
+PId = factor(seq(1, 100, 1)),
+Major = factor(rep(c("STEM","nonSTEM"), each=50), levels=c("STEM","nonSTEM")),
+GPA = factor(c(a,b), levels=c("gte3.4","lt3.4"))
+)
+# Create and view a crosstabs
+xt_3 = xtabs(~ GPA + Major, data = df_3)
+xt_3
+# Conduct three tests of association
+fisher.test(xt_3)
+chisq.test(xt_3)
+G.test(xt_3)
+df_3
+xt_3
+# 6. Compare GPAs within STEM and non-STEM majors
+binom.test(xt_3[,1], p=51/100)
+binom.test(xt_3[,2], p=51/100)
+p.adjust(c(0.004183, 0.002857), method="holm")
+#8. Create dependent counts
+set.seed(123)
+fall   = sample(c("vanilla","chocolate","strawberry"), size=25, replace=TRUE, prob=c(0.3, 0.1, 0.6))
+winter = sample(c("vanilla","chocolate","strawberry"), size=25, replace=TRUE, prob=c(0.6, 0.3, 0.1))
+spring = sample(c("vanilla","chocolate","strawberry"), size=25, replace=TRUE, prob=c(0.1, 0.6, 0.3))
+summer = sample(c("vanilla","chocolate","strawberry"), size=25, replace=TRUE, prob=c(0.4, 0.4, 0.2))
+df_4 = data.frame(
+PId = factor(rep(1:25, times=4)),
+Season = factor(rep(c("fall","winter","spring","summer"), each=25), levels=c("fall","winter","spring","summer")),
+Flavor = factor(c(fall, winter, spring, summer), levels=c("vanilla","chocolate","strawberry"))
+)
+df_4 <- df_4[order(df$PId),]    # sort by PId
+row.names(df_4) <- 1:nrow(df_4) # renumber row names
+View(df_4)
+print(xt_4) # view table
+# conduct a symmetry test
+symmetry_test(Flavor ~ Season | PId, data=df_4)
+# 9. Analyze Winter flavor choices
+print(xt_4[2,]) # Winter
+# Perform a multinomial test on Winter flavor choices xt[2,]
+# TODO
+xmulti(xt_4[2,], c(37/100, 35/100, 28/100), statName="Prob") #lib=XNomial
+# 10. Which Winter flavors differ from chance?
+binom.test(sum(df_4$Season == "winter" & df_4$Flavor == "vanilla"), 25, p = 1/3)
+binom.test(sum(df_4$Season == "winter" & df_4$Flavor == "chocolate"), 25, p = 1/3)
+binom.test(sum(df_4$Season == "winter" & df_4$Flavor == "strawberry"), 25, p = 1/3)
+p.adjust(c(0.0001279, 0.2037, 0.005154), method="holm")
+# 5. Test association between Major and GPA
+set.seed(123)
+a = sample(c("gte3.4","lt3.4"), size=50, replace=TRUE, prob=c(0.3, 0.7))
+b = sample(c("gte3.4","lt3.4"), size=50, replace=TRUE, prob=c(0.7, 0.3))
+df_3 = data.frame(
+PId = factor(seq(1, 100, 1)),
+Major = factor(rep(c("STEM","nonSTEM"), each=50), levels=c("STEM","nonSTEM")),
+GPA = factor(c(a,b), levels=c("gte3.4","lt3.4"))
+)
+View(df_3)
+# Create and view a crosstabs
+xt_3 = xtabs(~ GPA + Major, data = df_3)
+xt_3
+# Conduct three tests of association
+fisher.test(xt_3)
+chisq.test(xt_3)
+G.test(xt_3)
+df_3
+xt_3
+?binom.test
+xt_3[,1]
+xt_3[,2]
diff --git a/JoE/paper.md b/JoE/paper.md
@@ -33,9 +33,11 @@ Data cannot be analyzed responsibly without deep knowledge of its social and his
 
 But despite its significance, social and historical knowledge and methodologies are one of the most neglected areas in undergraduate computing education. In classes, it is very common for students to use datasets that they find on websites like Kaggle, datasets that are poorly documented and that students thus don’t fully understand. This is a recipe for irresponsible data work and a bad habit that can become a dangerous habit as the stakes get higher. 
 
+The motivation for our data essays is also in spired by "Datasheets for Datasets" `[@Gebru:2021]`  which argues that "every dataset be accompanied with a datasheet that documents its motivation, composition, collection process, recommended uses, and so on." Our data essays resemble these datasheets and do more, with a focus on sites of uncertainty and ambiguity within the datasets and the collection process. 
+
 # Our Data Essays
 
-The general structure of our data essays are inspired by Heather Krause’s concept of a data biography and the content builds on "Datasheets for Datasets" `[@Gebru:2021]`  which argues that "every dataset be accompanied with a datasheet that documents its motivation, composition, collection process, recommended uses, and so on." Our data essays resemble these datasheets and do more, with a focus on sites of uncertainty and ambiguity within the datasets and the collection process. 
+The general structure of our data essays are inspired by Heather Krause’s concept of a data biography. 
 
 A data essay will and should differ depending on the data it describes, but most of our data essays try to address the following questions: