Reference: https://www.ptt.cc/bbs/movie/index.html
Load all the libraries you need.
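The exact list depends on your setup, but judging from the functions used below, a minimal set would be:
library(stringr)     # str_replace_all(), str_count()
library(data.table)  # data.table()
library(plyr)        # llply(), ldply()
library(dplyr)       # %>% and arrange(); load after plyr so its verbs are not masked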
Read the data.
DataAll <- read.csv('All_article.csv')
DataAll[,1] <- as.character(DataAll[,1])  # column 1: article text
DataAll[,2] <- as.character(DataAll[,2])  # column 2: article title
Since we are mining Chinese text, which has no spaces between words, we have to build our own n-gram function.
myngram <- function(text, n){
  WordsList <- list()
  size <- n - 1
  if (nchar(text) < n) return(WordsList)  # too short to form any n-gram
  # slide an n-character window across the string
  # (nchar(), not length(), counts characters in a string)
  for (i in 1:(nchar(text) - size)) {
    WordsList[[i]] <- substr(text, i, i + size)
  }
  return(WordsList)
}
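A quick sanity check on a four-character string:
myngram("異形聖約", 2)  # returns a list of three bigrams: "異形", "形聖", "聖約"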
Then apply the function to the dataset to generate bigrams, and clean them up.
Bigram <- lapply(DataAll[,1], myngram, n = 2)
BigramList <- unlist(Bigram)
BigramList <- str_replace_all(BigramList, "[[:punct:]]", "")      # strip punctuation
BigramList <- str_replace_all(BigramList, "^[A-Za-z0-9]+$", "")   # blank out pure ASCII tokens
BigramList <- str_replace_all(BigramList, '[[:digit:]]+', "")     # strip digits
BigramList <- BigramList[! BigramList %in% ""]                    # drop empty strings
BigramList <- BigramList[grep('[\u4e00-\u9fa5]{2}', BigramList)]  # keep only two-Chinese-character bigrams
Lastly, count each bigram and sort by frequency.
WordCount <- data.table(table(BigramList))
WordCount <- WordCount %>%
  arrange(desc(N))  # table() names the count column N
devtools::install_github("lchiffon/wordcloud2")
library(wordcloud2)
And you can have your own word cloud!
wordcloud2(WordCount[1:100,], fontFamily = "LiHei Pro")
## Warning in if (class(data) == "table") {: the condition has length > 1 and
## only the first element will be used
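The warning is harmless: wordcloud2 tests class(data), which has length two for a data.table. If you want to silence it, one option is to coerce to a plain data.frame first:
wordcloud2(as.data.frame(WordCount[1:100,]), fontFamily = "LiHei Pro")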
Jieba is an efficient and effective tool for Chinese word segmentation, and jiebaR is its interface for R users. Install it with install.packages(). worker() creates and configures a segmenter, and <= is jiebaR's special operator for sending text to it.
# install.packages('jiebaR')
library("jiebaR")
mixseg <- worker(stop_word = 'stop_words.txt')  # segmenter with a custom stop-word list
SegList <- list()
for (i in 1:nrow(DataAll)){
  SegList[[i]] <- mixseg <= as.character(DataAll[i,1])  # segment each article
}
SegList <- unlist(SegList)
With this list, we can apply the same cleaning process we used above.
SegList <- str_replace_all(SegList, '[[:digit:]]+', "")
SegList <- str_replace_all(SegList, "[[:punct:]]", "")
SegList <- str_replace_all(SegList, "^[A-Za-z0-9]+$", "")
SegTable <- data.table(table(SegList))
SegTable <- SegTable %>%
  arrange(desc(N))
You can extract keywords with jiebaR's built-in TF-IDF keyword worker.
keys <- worker("keywords", topn = 20)  # TF-IDF-based keyword extractor
k <- vector_keywords(SegList, keys)    # keywords over the whole segmented corpus
k
## 119223 19463.6 9003.97 8886.58 7783.09 5810.91 5270.9 5153.51 5036.12
## "" "電影" "最後" "這部" "劇情" "開始" "女兒" "異形" "導演"
## 4895.25 4883.51 4413.94 4394.32 4296.55 4179.16 3815.24 3721.33 3627.41
## "喜歡" "雖然" "大衛" "真的" "觀眾" "父親" "不過" "人類" "之後"
## 3533.5 3521.76
## "對於" "然後"
Or you can get the part-of-speech tag for each term.
cutter <- worker("tag")                      # POS-tagging segmenter
tag <- cutter <= as.character(DataAll[1,1])  # tag the first article
names(tag)                                   # the tags are stored in the names
## [1] "f" "n" "v" "x" "t" "x" "c" "t" "uj" "zg" "n"
## [12] "a" "v" "ns" "a" "n" "d" "x" "v" "uj" "d" "v"
## [23] "x" "r" "p" "n" "f" "d" "d" "v" "ul" "n" "c"
## [34] "v" "n" "v" "c" "c" "n" "v" "n" "uj" "n" "x"
## [45] "v" "n" "v" "r" "uj" "n" "d" "v" "ul" "c" "m"
## [56] "d" "p" "n" "v" "v" "v" "ul" "n" "p" "n" "d"
## [67] "p" "n" "m" "v" "x" "zg" "n" "r" "v" "r" "n"
## [78] "a" "v" "uj" "n" "v" "p" "n" "n" "d" "zg" "a"
## [89] "a" "uj" "vn" "v" "x" "d" "v" "v" "n" "n" "v"
## [100] "uj" "n" "d" "v" "r" "uj" "x" "l" "p" "n" "v"
## [111] "r" "d" "m" "r" "d" "d" "d" "x" "c" "x" "v"
## [122] "m" "n" "uj" "a" "n" "v" "n" "zg" "eng" "d" "a"
## [133] "v" "zg"
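The tags follow jieba's ICTCLAS-style convention. Roughly: n = noun, v = verb, a = adjective, d = adverb, r = pronoun, p = preposition, c = conjunction, m = numeral, ns = place name, x = punctuation or other non-word symbols, eng = English token, and uj/ul/zg mark particles.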
There is no good TF-IDF function in R for Chinese, so we need to create our own.
# term frequency: count occurrences of each feature in each text
tf <- function(feature, text){
  llply(feature, function(x) str_count(text, x)) %>%
    ldply %>%
    t %>%
    as.data.frame()
}
# document frequency: does each text contain the feature at all?
df <- function(feature, text){
  llply(feature, function(x) grepl(x, text)) %>%
    ldply %>%
    t %>%
    as.data.frame()
}
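A quick check on two toy strings (hypothetical inputs, just to show the shape of the result: rows are texts, columns are features):
tf(c("雷", "好"), c("好雷好雷", "負雷"))  # counts: text 1 -> 2 2, text 2 -> 1 0
df(c("雷", "好"), c("好雷好雷", "負雷"))  # flags:  text 1 -> TRUE TRUE, text 2 -> TRUE FALSE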
Then, let's get TF-IDF done! We use the standard log-weighted scheme: weight = (1 + log10(tf)) * log10(N / df), where N is the number of articles and df is the number of articles containing the term.
aa <- tf(WordCount$BigramList, DataAll[,1])  # term frequencies per article
bb <- df(WordCount$BigramList, DataAll[,1])  # occurrence flags per article
dfTable <- apply(bb, MARGIN = 2, sum)        # document frequency per bigram
tfidfarray_ngram <- aa
tfidfarray_ngram[tfidfarray_ngram == 0] <- NA  # so log() skips absent terms
tfidfarray_ngram <- apply(tfidfarray_ngram, MARGIN = 1,
  FUN = function(x) (1 + log(x, base = 10)) * log(nrow(DataAll)/dfTable, base = 10))
tfidfarray_ngram <- as.data.frame(t(tfidfarray_ngram))
tfidfarray_ngram[is.na(tfidfarray_ngram)] <- 0  # absent terms get weight 0
# repeat for the jieba-segmented terms
a <- tf(SegTable$SegList, DataAll[,1])
b <- df(SegTable$SegList, DataAll[,1])
dfTable <- apply(b, MARGIN = 2, sum)
tfidfarray <- a
tfidfarray[tfidfarray == 0] <- NA
tfidfarray <- apply(tfidfarray, MARGIN = 1,
  FUN = function(x) (1 + log(x, base = 10)) * log(nrow(DataAll)/dfTable, base = 10))
tfidfarray[is.na(tfidfarray)] <- 0
tfidfarray <- t(tfidfarray)
First, calculate the pairwise similarity between articles. The cosine() function below actually returns the cosine distance (1 minus cosine similarity), so smaller values mean more similar articles.
cosine <- function(x) {
  y <- x %*% t(x)                                      # dot products between rows
  res <- 1 - y / (sqrt(diag(y)) %*% t(sqrt(diag(y))))  # 1 - cosine similarity
  return(res)
}
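A quick sanity check on a toy matrix: identical rows get distance 0, orthogonal rows get distance 1:
m <- rbind(c(1, 0), c(0, 1), c(1, 0))
cosine(m)  # rows 1 and 3: 0; rows 1 and 2: 1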
CosSim <- cosine(tfidfarray)
CosSim <- data.table(CosSim)
Let's look at the articles most similar to the 50th article. We sort distances in increasing order and take positions 2 to 5, since position 1 is the article itself (distance 0).
N50 <- which(as.numeric(CosSim[50,]) %in% sort(as.numeric(CosSim[50,]),
decreasing = F)[2:5])
DataAll[N50,2]
## [1] "[好雷] 亞瑟-王者之劍,蓋瑞奇的童話"
## [2] "[好雷] 亞瑟:王者之劍 "
## [3] "[好雷]亞瑟-王者之劍 "
## [4] "[負雷] 混亂的亞瑟:王者之劍 "
N200 <- which(as.numeric(CosSim[200,]) %in% sort(as.numeric(CosSim[200,]),
decreasing = F)[2:5])
DataAll[N200,2]
## [1] "[普好雷] 異形3"
## [2] "[好雷]異形-聖約 "
## [3] "[微好雷] 聖約"
## [4] "Re: [請益] 異形聖約:好雷加問題(種類,大魔王目的)"
Use akmeans(), which lets us use cosine similarity as the distance measure in the clustering algorithm (d.metric = 2 selects the cosine metric).
library(akmeans)
k <- akmeans(tfidfarray, ths1 = 2,
             d.metric = 2, min.k = 7, iter.max = 100)
## [1] "converged at k= 7"
DataCluster <- cbind(DataAll, k$cluster)
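To see how the articles are spread across the clusters, tabulate the assignments:
table(k$cluster)  # article count per cluster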
Use svm() to classify articles as 好雷 (positive review) or 負雷 (negative review). We choose the top 2000 features by total TF-IDF weight.
Raw <- data.frame(tfidfarray)
ScoreSum <- t(apply(Raw, 2, sum))  # total TF-IDF weight per term
train <- rbind(Raw, ScoreSum)
colnames(train) <- SegTable$SegList
index <- train[nrow(train),] %in% sort(ScoreSum, decreasing = TRUE)[1:2000]
train <- train[, which(index)]      # keep the top-2000 terms
train <- train[-nrow(train),]       # drop the helper score row
train <- cbind(DataAll[,3], train)  # column 3 holds the 好雷/負雷 label
train <- train[sample(nrow(train)),]  # shuffle; call set.seed() first for reproducibility
TrainSet <- train[c(1:round(0.7*nrow(train))),]
TestSet <- train[c((round(0.7*nrow(train))+1):nrow(train)),]
library(e1071)
S <- svm(as.factor(TrainSet[,1]) ~ ., data = TrainSet, kernel = 'sigmoid',
         type = 'C-classification', gamma = 1/2000)
p <- predict(S, newdata = TestSet)
result <- table(TestSet[,1], p)           # confusion matrix
acc <- sum(diag(result)) / nrow(TestSet)  # overall accuracy
acc
## [1] 0.8855422
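Accuracy alone can hide class imbalance, so it is worth printing the confusion matrix as well:
result  # rows: true labels, columns: predicted labels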