-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2.2_prepare_taxonomic_data.R
138 lines (91 loc) · 4.11 KB
/
2.2_prepare_taxonomic_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# ##############################################################################
#
## TAXONOMIc TABLE PREPARATION & MERGING OF SAMPLES
#
# ##############################################################################
# Setting working directory
setwd("~/Desktop/crc_analysis/scripts") #macbook
setwd("C:/Users/Erika Dvarionaite/iCloudDrive/Desktop/crc_analysis/scripts") #windows
# Packages
library(dplyr)
library(tibble)
library(readr)
library(stringr)
library(yaml)
# ##############################################################################
# Get data
# Meta
meta.all <- read_tsv(file = '../data/meta/meta.crc.tsv')
# Feature table
fn.path <- paste0('../data/species/mpa.txt')
feat.tax <- read_delim(fn.path, "\t", escape_double = FALSE, trim_ws = TRUE)
# List of all studies and sample information
load("../data/names.RData")
# Library sizes
library_size <- read.delim("../data/library_size/library_size_raw.txt", header = T, sep = ':')
rownames(library_size) <- library_size$Sample
library_size_filtered <- read.delim("../data/library_size/library_size_filtered.txt", header = T, sep = ',')
rownames(library_size_filtered) <- library_size_filtered$Sample
# ##############################################################################
# Clean feature table
feat <- as.data.frame(feat.tax) %>%
rename_all(funs(str_replace(., "_mphlan", ""))) #%>%
#select(-NCBI_tax_id)
# ##############################################################################
# Merge samples in taxonomic feature table
studies <- c("Thomas", "Wirbel", "Hannigan", "Feng", "Zeller", "Yu", "Vogtmann", "Yamada", "Qin", "Indian")
for (study in studies) {
sample_names <- names[[study]]
if (study == "Thomas") {
sample_names$run_accession <- paste0(sample_names$run_accession, '_', sample_names$sample_alias)
}
if (study == "Hannigan") {
sample_names$run_accession <- paste0(sample_names$run_accession, '_CRC_Virome')
}
if (study == "Feng") {
colnames(sample_names) <- c("study_accession", "sample_alias", "run_accession")
}
if (study == "Yu") {
sample_names$sample_alias <- sample_names$run_accession
}
rownames(sample_names) <- sample_names$run_accession
run_acc <- sample_names$run_accession
n = 0
for (run in run_acc) {
if (paste0(run) %in% colnames(feat)) {
reads <- library_size_filtered[paste0(run), "Reads"]
sample_alias <- sample_names[run, "sample_alias"]
if (!sample_alias %in% colnames(feat)) {
paste_run <- paste0(run)
index <- match(paste_run, colnames(feat))
stopifnot(length(index)==1)
feat[, index] <- round(feat[, index] * reads)
colnames(feat)[index] <- sample_alias
} else {
paste_run <- paste0(run)
index <- match(paste_run, colnames(feat))
index_alias <- match(sample_alias, colnames(feat))
stopifnot(length(index)==1 & length(index_alias)== 1)
feat[, index] <- round(feat[, index] * reads)
feat[, sample_alias] <- rowSums(feat[, c(index, index_alias)])
feat <- feat[, -index]
}
n = n+1
}
}
}
rownames(feat) <- make.names(feat[,1])
check <- colnames(feat) %in% meta.all$Sample_ID
colnames(feat)[check == F]
feat <- feat[, check == T]
stopifnot(all(colnames(feat) %in% meta.all$Sample_ID))
###############################################################################
## Convert to relative abundances
feat.relab <- prop.table(as.matrix(feat), 2)
###############################################################################
## Save files
write.table(feat, file=paste0("../data/species/species_full.tsv"),
col.names = T, quote = F, row.names = T, sep = '\t')
write.table(feat.relab, file=paste0("../data/species/species_full_relab.tsv"),
col.names = T, quote = F, row.names = T, sep = '\t')
################################# DONE #####################################