Text Mining on the PTT Movie Board


Reference: https://www.ptt.cc/bbs/movie/index.html


Load all the libraries you need.
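The original library chunk is not shown here; a minimal set that covers the functions used below would be something like this (loading plyr before dplyr keeps dplyr's arrange() on top):

library(stringr)     # str_replace_all(), str_count()
library(data.table)  # data.table()
library(plyr)        # llply(), ldply()
library(dplyr)       # %>%, arrange(), desc()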


Read the data.

DataAll <- read.csv('All_article.csv')
DataAll[,1] <- as.character(DataAll[,1])   # column 1: article text (convert from factor)
DataAll[,2] <- as.character(DataAll[,2])   # column 2: article title

Part 1: N-gram Algorithm and WordCloud

Since we are mining Chinese text, we have to build our own n-gram function.

myngram <- function(text, n){
  WordsList <- list()
  size <- n - 1
  for (i in 1:(nchar(text) - size)) {          # slide a window of n characters over the text
    WordsList[[i]] <- substr(text, i, i + size)
  }
  return(WordsList)
}
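A quick check on a made-up string (not from the data) shows the sliding window of size two:

unlist(myngram("電影很好看", 2))   # "電影" "影很" "很好" "好看"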

Then, apply the function to the dataset to generate bigrams and clean them up.

Bigram <- lapply(DataAll[,1], function(x) myngram(x, 2))
BigramList <- unlist(Bigram)


BigramList <- str_replace_all(BigramList, "[[:punct:]]", "")       # drop punctuation
BigramList <- str_replace_all(BigramList, "^[A-Za-z0-9]+$", "")    # drop purely alphanumeric tokens
BigramList <- str_replace_all(BigramList, '[[:digit:]]+', "")      # drop digits
BigramList <- BigramList[! BigramList %in% ""]                     # drop empty strings
BigramList <- BigramList[grep('[\u4e00-\u9fa5]{2}', BigramList)]   # keep only two-character Chinese bigrams

Lastly, count each bigram and sort by frequency.

WordCount <- data.table(table(BigramList))   # frequency table: bigram + count (N)

WordCount <- WordCount %>%
  arrange(desc(N))                           # most frequent bigrams first

# devtools::install_github("lchiffon/wordcloud2")   # install once if needed
library(wordcloud2)

And you can have your own WordCloud!

wordcloud2(WordCount[1:100,],fontFamily = "LiHei Pro")
## Warning in if (class(data) == "table") {: the condition has length > 1 and
## only the first element will be used
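The warning is harmless: wordcloud2 compares class(data) against "table", and a data.table carries a two-element class vector. Converting to a plain data.frame first should silence it:

wordcloud2(as.data.frame(WordCount[1:100,]), fontFamily = "LiHei Pro")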

Part 2: jiebaR

Jieba is an efficient and effective tool for Chinese word segmentation, and jiebaR is its interface for R users.


Start with jiebaR

Use install.packages() to install the library. worker() sets up the segmenter, and the <= operator calls the word segmentation tool.

# install.packages('jiebaR')
library("jiebaR")

mixseg <- worker(stop_word = 'stop_words.txt')   # segmenter with a custom stop-word list

SegList <- list()
for (i in 1:nrow(DataAll)){
  SegList[[i]] <- mixseg <= as.character(DataAll[i,1])   # segment each article
}

SegList <- unlist(SegList)
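The <= operator is just jiebaR's shorthand for segmentation; segment() does the same job, so the two calls below should return the same tokens (the sentence is made up for illustration):

mixseg <= "這部電影的劇情很精彩"
segment("這部電影的劇情很精彩", mixseg)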

With this list, we can apply the same cleaning process as before.

SegList <- str_replace_all(SegList, '[[:digit:]]+', "")    # drop digits
SegList <- str_replace_all(SegList, "[[:punct:]]", "")     # drop punctuation
SegList <- str_replace_all(SegList, "^[A-Za-z0-9]+$", "")  # drop purely alphanumeric tokens
SegTable <- data.table(table(SegList))

SegTable <- SegTable %>%
  arrange(desc(N))

Other cool features in jiebaR

You can extract keywords; the "keywords" worker returns the top topn terms along with their scores.

keys = worker("keywords",topn=20)
k <- vector_keywords(SegList,keys)
k
##  119223 19463.6 9003.97 8886.58 7783.09 5810.91  5270.9 5153.51 5036.12 
##      ""  "電影"  "最後"  "這部"  "劇情"  "開始"  "女兒"  "異形"  "導演" 
## 4895.25 4883.51 4413.94 4394.32 4296.55 4179.16 3815.24 3721.33 3627.41 
##  "喜歡"  "雖然"  "大衛"  "真的"  "觀眾"  "父親"  "不過"  "人類"  "之後" 
##  3533.5 3521.76 
##  "對於"  "然後"

Or you can get the part-of-speech tag for each term.

cutter <- worker("tag")                        # POS-tagging worker
tag <- cutter <= as.character(DataAll[1,1])    # tag the first article
names(tag)                                     # the tags are stored in the names of the result
##   [1] "f"   "n"   "v"   "x"   "t"   "x"   "c"   "t"   "uj"  "zg"  "n"  
##  [12] "a"   "v"   "ns"  "a"   "n"   "d"   "x"   "v"   "uj"  "d"   "v"  
##  [23] "x"   "r"   "p"   "n"   "f"   "d"   "d"   "v"   "ul"  "n"   "c"  
##  [34] "v"   "n"   "v"   "c"   "c"   "n"   "v"   "n"   "uj"  "n"   "x"  
##  [45] "v"   "n"   "v"   "r"   "uj"  "n"   "d"   "v"   "ul"  "c"   "m"  
##  [56] "d"   "p"   "n"   "v"   "v"   "v"   "ul"  "n"   "p"   "n"   "d"  
##  [67] "p"   "n"   "m"   "v"   "x"   "zg"  "n"   "r"   "v"   "r"   "n"  
##  [78] "a"   "v"   "uj"  "n"   "v"   "p"   "n"   "n"   "d"   "zg"  "a"  
##  [89] "a"   "uj"  "vn"  "v"   "x"   "d"   "v"   "v"   "n"   "n"   "v"  
## [100] "uj"  "n"   "d"   "v"   "r"   "uj"  "x"   "l"   "p"   "n"   "v"  
## [111] "r"   "d"   "m"   "r"   "d"   "d"   "d"   "x"   "c"   "x"   "v"  
## [122] "m"   "n"   "uj"  "a"   "n"   "v"   "n"   "zg"  "eng" "d"   "a"  
## [133] "v"   "zg"

Part 3: TF-IDF Calculation

There is no good TF-IDF function in R for Chinese text, so we need to create our own. We start with two helpers: tf() counts how many times each feature appears in each article, and df() flags whether a feature appears in an article at all.

# tf: for each feature (term), count its occurrences in every article
tf <- function(feature, text){
  llply(feature, function(x) str_count(text, x)) %>%
    ldply %>%
    t %>%
    as.data.frame()
}

# df: for each feature, flag whether it appears in each article at all
df <- function(feature, text){
  llply(feature, function(x) grepl(x, text)) %>%
    ldply %>%
    t %>%
    as.data.frame()
}
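A tiny illustration with made-up strings shows the orientation of the output: after the transpose, each row is an article and each column a feature.

toy_text    <- c("這部電影很好看", "電影電影")   # two made-up "articles"
toy_feature <- c("電影", "好看")
tf(toy_feature, toy_text)   # counts: article 1 -> 1, 1 ; article 2 -> 2, 0
df(toy_feature, toy_text)   # flags:  article 1 -> TRUE, TRUE ; article 2 -> TRUE, FALSE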

Then, let's get TF-IDF done! The weight used below is (1 + log10(tf)) * log10(N / df), where tf is the term count within an article, df is the number of articles containing the term, and N is the total number of articles.

aa <- tf(WordCount$BigramList, DataAll[,1])   # term counts per article (bigram features)
bb <- df(WordCount$BigramList, DataAll[,1])   # appearance flags per article

dfTable <- apply(bb, MARGIN = 2, sum)         # document frequency of each bigram

tfidfarray_ngram <- aa
tfidfarray_ngram[tfidfarray_ngram == 0] <- NA   # mark terms absent from an article
tfidfarray_ngram <- apply(tfidfarray_ngram, MARGIN = 1,
                          FUN = function(x) (1 + log(x, base = 10)) * log(nrow(DataAll)/dfTable, base = 10))
tfidfarray_ngram <- as.data.frame(t(tfidfarray_ngram))
tfidfarray_ngram[is.na(tfidfarray_ngram)] <- 0  # absent terms get weight 0
a <- tf(SegTable$SegList, DataAll[,1])   # repeat with the jiebaR tokens as features
b <- df(SegTable$SegList, DataAll[,1])
dfTable <- apply(b, MARGIN = 2, sum)

tfidfarray <- a
tfidfarray[tfidfarray == 0] <- NA
tfidfarray <- apply(tfidfarray, MARGIN = 1,
                    FUN = function(x) (1 + log(x, base = 10)) * log(nrow(DataAll)/dfTable, base = 10))

tfidfarray[is.na(tfidfarray)] <- 0
tfidfarray <- t(tfidfarray)
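As a sanity check of the weighting (the numbers here are made up, not taken from the data): a term that appears 3 times in one article and shows up in 10 out of 500 articles gets a weight of about 2.51.

tf_count <- 3     # occurrences of the term in one article (made-up)
df_count <- 10    # number of articles containing the term (made-up)
N        <- 500   # total number of articles (made-up)
(1 + log10(tf_count)) * log10(N / df_count)   # about 2.51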

Part 4: Application

Cosine Similarity

First, calculate the pairwise distance between articles. Note that the cosine() function below returns the cosine distance (1 minus the cosine similarity), so smaller values mean more similar articles.

cosine <- function(x) {
  y <- x %*% t(x)                                       # pairwise dot products
  res <- 1 - y / (sqrt(diag(y)) %*% t(sqrt(diag(y))))   # 1 - cosine similarity
  return(res)
}

CosSim <- cosine(tfidfarray)
CosSim <- data.table(CosSim)
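A quick sanity check with a made-up matrix: identical rows get a distance of 0 and rows with no overlap get a distance of 1.

m <- rbind(c(1, 0, 1),
           c(1, 0, 1),
           c(0, 1, 0))
cosine(m)   # rows 1 and 2: distance 0; row 3 against the others: distance 1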

Let's take a look at the articles most similar to the 50th article, i.e. the ones with the smallest distances, excluding the article itself.

N50 <- which(as.numeric(CosSim[50,]) %in% sort(as.numeric(CosSim[50,]),
                                               decreasing = F)[2:5])   # [1] would be the article itself (distance 0)
DataAll[N50,2]
## [1] "[好雷] 亞瑟-王者之劍,蓋瑞奇的童話"
## [2] "[好雷] 亞瑟:王者之劍 "            
## [3] "[好雷]亞瑟-王者之劍 "              
## [4] "[負雷] 混亂的亞瑟:王者之劍 "
N200 <-  which(as.numeric(CosSim[200,]) %in% sort(as.numeric(CosSim[200,]),
                                                decreasing = F)[2:5]) 

DataAll[N200,2]
## [1] "[普好雷] 異形3"                                   
## [2] "[好雷]異形-聖約 "                                 
## [3] "[微好雷] 聖約"                                    
## [4] "Re: [請益] 異形聖約:好雷加問題(種類,大魔王目的)"

Clustering

Use akmeans(), whose d.metric argument lets us plug a cosine-based distance into an adaptive k-means clustering (d.metric = 2 selects it here).

library(akmeans)

k <- akmeans(tfidfarray, ths1 = 2,
             d.metric = 2, min.k = 7, iter.max = 100)
## [1] "converged at k= 7"
DataCluster <- cbind(DataAll, k$cluster)
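To get a feel for the result, you can check the cluster sizes and peek at a few titles from one cluster:

table(k$cluster)                   # how many articles fall into each cluster
head(DataAll[k$cluster == 1, 2])   # titles of some articles in cluster 1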

Classification

Use svm() to classify articles as 好雷 (positive review) or 負雷 (negative review).

Step 1: Variable Selection

We keep the 2,000 terms with the largest total TF-IDF weight across all articles.

Raw <- data.frame(tfidfarray)
ScoreSum <- t(apply(Raw, 2, sum))              # total TF-IDF weight of each term

train <- rbind(Raw, ScoreSum)                  # append the column sums as an extra row
colnames(train) <- SegTable$SegList

index <- train[nrow(train),] %in% sort(ScoreSum, decreasing = T)[1:2000]   # flag the top-2000 terms
train <- train[,which(index)]
train <- train[-nrow(train),]                  # drop the helper row

Step 2: Train / Test Split

train <- cbind(DataAll[,3], train)     # column 3 of DataAll holds the rating label
train <- train[sample(nrow(train)),]   # shuffle the rows before splitting

TrainSet <- train[c(1:round(0.7*nrow(train))),]
TestSet <- train[c((round(0.7*nrow(train))+1):nrow(train)),]
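Because sample() shuffles at random, the split (and the accuracy below) will vary between runs; fixing a seed before shuffling makes the result reproducible (an optional addition, not in the original code):

set.seed(123)   # any fixed value works; 123 is an arbitrary choice
train <- train[sample(nrow(train)),]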

Step 3: Modeling

library(e1071)

S <- svm(as.factor(TrainSet[,1]) ~ ., data = TrainSet,
         kernel = 'sigmoid', type = 'C-classification',
         gamma = 1/2000)   # gamma = 1 / number of selected features

p <- predict(S, newdata = TestSet)
result <- table(TestSet[,1], p)            # confusion matrix: actual vs. predicted
acc <- sum(diag(result)) / nrow(TestSet)   # overall accuracy
acc
## [1] 0.8855422