library(stringr)
library(dplyr)
library(tidyr)

# Restore the objects saved by an earlier step; per the original note this
# file provides `finalshort`.
load(file = "C:\\Users\\es901\\Documents\\dsR\\data\\project\\11-finalshort.RData")

# Word table read from CSV; the code below uses its `word`, `Freq` and `name`
# columns.
allword <- read.csv(file = "C:\\Users\\es901\\Documents\\dsR\\data\\project\\wordfinal.csv")

#--0101---------------------------------------
#allword <- read.csv("D:\\word.csv", head = T)

# Normalise column types. `Freq` goes through as.character() first: if
# read.csv() delivered it as a factor, as.integer() alone would return the
# internal level codes instead of the printed counts.
allword$Freq <- as.integer(as.character(allword$Freq))
allword$word <- as.character(allword$word)
allword$name <- as.character(allword$name)
names(allword)[names(allword) == "Freq"] <- "n"

# Keep only tokens that pass every test below (conditions are AND-ed):
#   * contain no digit
#   * contain no punctuation character
#   * contain none of the listed characters (pronouns, particles, and the
#     day/month characters)
#   * are longer than one character
allword.nonumber <- filter(
  allword,
  !grepl("\\d", word),
  !grepl("[[:punct:]]", word),
  !grepl("妳|你|我|他|她|它|牠|們|的|了|啊|吧|日|月", word),
  nchar(word) > 1
)

# Total count of every word over all people (column `sum`), then keep only
# the words whose overall total is above 50.
testall <- summarise(group_by(allword.nonumber, word), sum = sum(n))
testall <- filter(testall, sum > 50)

# One-column lookup (character `name`) of every person that has at least one
# row in `allword`.
namesss <- data.frame(
  name = as.character(unique(allword$name)),
  stringsAsFactors = FALSE
)
# Keep only the `finalshort` rows whose `name` appears in that lookup.
finalava <- final11 <- semi_join(finalshort, namesss, by = "name")

# Compute the personality-type distribution of the people in `finalava`.
# The label is read from column 3 (presumably `selected` -- confirm against
# the layout of `finalshort`). NA labels are skipped. Any non-NA label other
# than "I".."VIII" is counted under "IX", exactly like the final else-branch
# of the original loop. Vectorised so it also works when `finalava` has no
# rows.
type.labels <- c("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX")
type.values <- as.character(finalava[[3]])
type.values <- type.values[!is.na(type.values)]
# Position 1..8 for an exact match on "I".."VIII", otherwise slot 9.
type.slot <- match(type.values, type.labels[1:8], nomatch = 9L)
# as.numeric() keeps `s` a double vector, as rep(0, 9) + 1 produced before.
s <- as.numeric(tabulate(type.slot, nbins = 9L))
names(s) <- type.labels

s.original <- s


#--0101---------------------------------------
# Build the person-by-word count table.
#
# For every name in column 1 of `finalava`:
#   1. take that person's rows from `allword.nonumber`,
#   2. drop exact duplicate rows,
#   3. keep only words present in `testall` (the frequent-word list),
#   4. spread to wide form: one column per word, holding its count `n`.
# Words a person never used are absent from that person's piece and become NA
# when the pieces are stacked.
#
# The pieces are collected in a list and bound once, instead of growing
# `tdf.full` with bind_rows() on every iteration, and lapply() copes with an
# empty `finalava`. The former `total` column was added and then dropped
# again before spreading, so it is no longer computed; to normalise counts,
# add `mutate(n = n / sum(n))` after the `unique()` step.
person.names <- as.character(finalava[[1]])

person.rows <- lapply(person.names, function(namee) {
  allword.nonumber %>%
    filter(name == namee) %>%
    unique() %>%
    semi_join(testall, by = "word") %>%
    spread(word, n)
})

# as_tibble() keeps `tdf.full` a tibble, as it was when seeded from an empty
# tibble.
tdf.full <- as_tibble(bind_rows(person.rows))

# A word a person never used is NA after stacking; treat it as a count of
# zero. One masked assignment replaces the cell-by-cell double loop, which
# was very slow on a wide table and failed on a table with no rows.
tdf.full[is.na(tdf.full)] <- 0

# Join each person's `finalshort` columns onto their word counts, then drop
# the `highiest` column.
tdf.answer <- merge(finalshort, tdf.full, by = "name")
tdf.answer <- subset(tdf.answer, select = -highiest)
tdf.answer$selected <- as.factor(tdf.answer$selected)
# Remove the erroneous result recorded under this name.
tdf.answer <- tdf.answer[with(tdf.answer, name != "YanTong Lin"), ]
# Renumber the rows 1..n after the removal.
rownames(tdf.answer) <- seq(from = 1, to = nrow(tdf.answer), by = 1)

index <- seq_len(nrow(tdf.answer)) # row ids of all records
## The seed and the draw must be run together so the split is reproducible.
set.seed(5251)
testindex <- sample(index, trunc(length(index)*30/100)) # hold out 30% of the row ids
##
# Pick the training rows by positive index. `tdf.answer[-testindex, ]` would
# return zero rows whenever `testindex` is empty (fewer than 4 records),
# because an empty negative index selects nothing.
trainindex <- setdiff(index, testindex)
trainset.name  <- tdf.answer[trainindex, ] # the remaining 70%
testset.name   <- tdf.answer[testindex, ] # the held-out 30%

# Same sets without the `name` identifier column.
trainset <- subset(trainset.name, select = -name)
testset <- subset(testset.name, select = -name)

library(class)

# Compute the personality-type distribution of the training set.
# The label is read from column 1 of `trainset` (the same column later passed
# to knn() as the class vector). NA labels are skipped. Any non-NA label
# other than "I".."VIII" is counted under "IX", exactly like the final
# else-branch of the original loop. Vectorised so it also works when
# `trainset` has no rows.
type.labels <- c("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX")
train.types <- as.character(trainset[[1]])
train.types <- train.types[!is.na(train.types)]
# Position 1..8 for an exact match on "I".."VIII", otherwise slot 9.
train.slot <- match(train.types, type.labels[1:8], nomatch = 9L)
# as.numeric() keeps `s` a double vector, as rep(0, 9) + 1 produced before.
s <- as.numeric(tabulate(train.slot, nbins = 9L))
names(s) <- type.labels
s # show the distribution
s.trainset <- s



# 3-nearest-neighbour classification: column 1 of each set is the class
# label, every other column is a feature.
knn.pred <- knn(
  train = trainset[, -1],
  test = testset[, -1],
  cl = trainset[, 1],
  k = 3
)

# Cross-tabulate predicted against actual labels (rows = predictions,
# columns = actual) and print the table.
conf.mat <- table("Predictions" = knn.pred, Actual = testset$selected)
conf.mat

# Micro-averaged metrics from the confusion matrix.
# Layout of `conf.mat`: rows are predictions, columns are actual labels.
TP <- sum(diag(conf.mat)) # true positives: predicted label equals actual label

# Row i holds every case predicted as class i, so its off-diagonal cells are
# false positives for that class. (The original looped over rows but labelled
# this total FN.)
FP <- sum(rowSums(conf.mat) - diag(conf.mat))

# Column j holds every case that actually is class j, so its off-diagonal
# cells are false negatives for that class. (The original labelled this total
# FP.)
FN <- sum(colSums(conf.mat) - diag(conf.mat))

# Summed over all classes both totals equal the off-diagonal sum, so the
# three metrics below coincide and the numbers match the original script.
Precision <- TP / (TP + FP)
Precision
Recall <- TP / (TP + FN)
Recall
F1score <- 2*Precision*Recall/(Precision+Recall)
F1score

# Persist the confusion matrix; load() on this file restores it under the
# name `conf.mat`.
save(list = "conf.mat", file = "C:\\Users\\es901\\Documents\\dsR\\data\\project\\conf.mat.RData")