-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2.3_clean_functional_data.R
87 lines (62 loc) · 2.74 KB
/
2.3_clean_functional_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# ##############################################################################
#
## Feature table pre-processing
#
# ##############################################################################
# Setting working directory
setwd("~/Desktop/crc_analysis/scripts") #macbook
setwd("C:/Users/Erika Dvarionaite/iCloudDrive/Desktop/crc_analysis/scripts") #windows
# Packages
library(readr)
library(stringr)
library(tidyr)
library(yaml)
library(tidyverse)
library(yaml)
library(matrixStats)
# ##############################################################################
# General
# This script is for cleaning and filtering feature tables for each individual study
# ##############################################################################
# Get data
parameters <- yaml.load_file('../parameters.yaml')
study.tag <- parameters$all.studies
feat.tag <- parameters$functional.feat
memory.limit(56000)
memory.size(max = T)
# Metadata
meta.all <- read_tsv(file = '../data/meta/meta.crc.tsv')
# Load each feature table and filter each study individually
sink("../feature.count.filtering.txt", append = T)
for (tag in study.tag) {
cat('##############################################################################\n')
cat('# Starting cleaning', tag, 'study...\n')
for (f.tag in feat.tag) {
fn.path <- paste0("../data/", f.tag, "/", f.tag, "_relab.tsv")
feat <- read.table(fn.path,
sep = "\t", stringsAsFactors = F,
header = T, check.names = F,
row.names = 1, quote ="", fill = F)
cat(tag, '-', f.tag, 'gene table loaded...\n')
meta <- meta.all %>%
filter(Study == tag)
feat <- as.matrix(feat)
feat <- feat[, meta %>% pull(Sample_ID)]
print(rowMeans(feat[c(1,2),]))
feat <- feat[-c(1,2),]
feat<- feat[rowSums(feat[])>0,] # removing all zeroes for the actual number of features per study
# Filter low abundant features
temp.max.ab <- sapply(unique(meta$Group), FUN=function(s){
rowMaxs(feat[,meta %>% filter(Group == s) %>% pull(Sample_ID)])
})
f.idx = rowSums(temp.max.ab >= 1e-06) >= 1
feat.filt <- feat[f.idx,]
cat('Out of', nrow(feat), 'features,', 'retaining', sum(f.idx), f.tag, 'features for', tag, 'study after low-abundance filtering...\n')
# Save filtered feature table
filtered.feat <- paste0('../data/', f.tag, '/filtered_', f.tag, '_', tag, '.tsv')
write.table(feat.filt, file=filtered.feat, quote=FALSE, sep='\t',
row.names=TRUE, col.names=TRUE)
cat(f.tag, 'has now been processed...\n')
}
}
################################# DONE #####################################