-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathInitialTesting.R
39 lines (27 loc) · 941 Bytes
/
InitialTesting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Preview the first two lines of the en_US Twitter corpus.
# Passing the path directly lets readLines() open the file and close it again
# itself, so no connection is left open if the read fails part-way.
readLines("final/en_US/en_US.twitter.txt", n = 2)
# Scratch calls; none of the results below are assigned to a variable.
# NOTE(review): "Textname.txt" looks like a placeholder file name - confirm
# the file exists before sourcing this script.
# Read every line of Textname.txt, passing encoding = "UTF-8".
readLines("Textname.txt",encoding="UTF-8")
# Read the same file with scan() as character data, one element per line.
scan("Textname.txt","character",sep="\n")
# NOTE(review): `string` is not defined anywhere in this script; this call
# only works if such an object already exists in the session.
# An empty split pattern breaks each string into its individual characters.
strsplit(string,"")
# NOTE(review): `sentences` is first assigned on the next line of the script,
# so this tabulation relies on an object left over from an earlier session.
table(sentences)
# Build a word-frequency table for the en_US Twitter corpus.

# Read the corpus as character data, one element per line of the file.
sentences <- scan("final/en_US/en_US.twitter.txt", what = "character", sep = "\n")

# Strip periods and commas so "word." and "word," are counted as "word".
sentences <- gsub("[.,]", "", sentences)

# Split every line on single spaces. Leading or repeated spaces produce empty
# strings in the result; `words` keeps them so it still mirrors the raw split.
words <- strsplit(sentences, " ", fixed = TRUE)

# Drop the empty strings before counting, otherwise "" is tallied as a word.
tokens <- unlist(words)
words.freq <- table(tokens[nzchar(tokens)])

# Two-column data frame with explicit names (word, freq) instead of the
# expression-derived names that unnamed arguments would produce.
wordFrq <- data.frame(
  word = names(words.freq),
  freq = as.integer(words.freq)
)
library(tau)
library(data.table)
# Tabulate the n-grams of a character vector.
#
# Calls tau::textcnt() with method = "string", n = ngramSize and
# tolower = FALSE, then arranges the result as a data.table.
#
# Arguments:
#   stringVector  text passed straight through to textcnt().
#   ngramSize     n-gram order passed to textcnt() as `n`.
#
# Returns a data.table with three columns:
#   w1 (when ngramSize == 1) or w1w2 (any other size): the names of the
#     textcnt() result, i.e. the n-gram strings;
#   freq: the textcnt() values with the class attribute removed
#     (presumably the occurrence counts - confirm against the tau docs);
#   length: number of characters in each n-gram string.
createNgram <- function(stringVector, ngramSize) {
  counts <- textcnt(
    stringVector,
    method = "string",
    n = ngramSize,
    tolower = FALSE
  )
  terms <- names(counts)
  freqs <- unclass(counts)
  widths <- nchar(terms)

  # Every size other than 1 shares the same column name for the term.
  if (ngramSize != 1) {
    return(data.table(w1w2 = terms, freq = freqs, length = widths))
  }
  data.table(w1 = terms, freq = freqs, length = widths)
}
# Bigram (n = 2) table built from the lines held in `sentences`.
res <- createNgram(stringVector = sentences, ngramSize = 2)