library(stringr)
library(dplyr)
library(tidyr)
# Load the saved workspace; it is expected to provide `finalshort`.
load("C:\\Users\\es901\\Documents\\dsR\\data\\project\\11-finalshort.RData")
# Word-frequency table with (at least) the columns `name`, `word` and `Freq`.
allword <- read.csv("C:\\Users\\es901\\Documents\\dsR\\data\\project\\wordfinal.csv")
# Alternative input kept for reference:
# allword <- read.csv("D:\\word.csv", head = T)

# Normalise the column types and rename the frequency column `Freq` to `n`.
allword <- allword %>%
  mutate(
    Freq = as.integer(Freq),
    word = as.character(word),
    name = as.character(name)
  ) %>%
  rename(n = Freq)

# Drop tokens that contain a digit, contain punctuation, contain one of the
# listed characters, or are only one character long.
allword.nonumber <- allword %>%
  filter(!grepl("\\d", word)) %>%
  filter(!grepl("[[:punct:]]", word)) %>%
  filter(!grepl("妳|你|我|他|她|它|牠|們|的|了|啊|吧|日|月", word)) %>%
  filter(nchar(word) > 1)
# Total frequency of every remaining word across all people ...
word.totals <- summarise(group_by(allword.nonumber, word), sum = sum(n))
# ... keeping only the words whose overall frequency exceeds 50.
testall <- filter(word.totals, sum > 50)
# Distinct names that have at least one row in `allword`.
namesss <- data.frame(
  name = as.character(unique(allword$name)),
  stringsAsFactors = FALSE
)
# Keep only the rows of `finalshort` whose name has word data in `allword`.
final11 <- semi_join(finalshort, namesss, by = "name")
finalava <- final11
# Distribution of the nine types over the third column of `finalava`.
# Vectorised replacement for a row-by-row if/else chain; unlike the loop over
# `1:nrow(finalava)`, it also works when `finalava` has zero rows.
type.labels <- c("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX")
type.values <- finalava[[3]]
# Missing values are not counted at all.
type.values <- as.character(type.values[!is.na(type.values)])
# Any non-missing value that is not exactly one of I..VIII is counted under
# the ninth slot ("IX"), i.e. it acts as the catch-all bucket.
type.slot <- match(type.values, type.labels[1:8], nomatch = 9L)
# as.numeric() keeps the counts as doubles, as the accumulating loop produced.
s <- as.numeric(tabulate(type.slot, nbins = length(type.labels)))
names(s) <- type.labels
s.original <- s
#--0101---------------------------------------
# Build the person-by-word table `tdf.full`: one wide row per entry of the
# first column of `finalava` (the person's name), one column per word in
# `testall`, cells holding that person's count `n` for the word.
#
# Changes compared with the earlier loop-based version:
#   * the per-person rows are collected in a list and bound once instead of
#     growing `tdf.full` with bind_rows() on every iteration;
#   * iterating over the names directly avoids `1:nrow()` misbehaving when
#     `finalava` has no rows;
#   * the unused helper column `total` (only needed by a disabled
#     normalisation step, n / total) is no longer created and dropped;
#   * tibble() replaces the deprecated data_frame().
person.rows <- lapply(as.character(finalava[[1]]), function(namee) {
  allword.nonumber %>%
    filter(name == namee) %>%
    unique() %>%
    semi_join(testall, by = "word") %>%
    spread(word, n)
})
# Starting from an empty tibble keeps the result a tibble, as before.
tdf.full <- bind_rows(tibble(), person.rows)
# A word a person never used shows up as NA after binding; treat it as 0.
tdf.full[is.na(tdf.full)] <- 0
# Attach the word-count features to each person's record, drop the
# `highiest` column and turn the label column `selected` into a factor.
tdf.answer <- merge(finalshort, tdf.full, by = "name") %>%
  subset(select = -highiest)
tdf.answer$selected <- as.factor(tdf.answer$selected)
# Remove one record with a known erroneous result.
tdf.answer <- tdf.answer[tdf.answer$name != "YanTong Lin", ]
# Renumber the rows 1..n; seq_len() is also safe when no rows are left.
row.names(tdf.answer) <- seq_len(nrow(tdf.answer))
index <- seq_len(nrow(tdf.answer)) # one index per record

## The seed and the sampling call must be run together so that the split is
## reproducible.
set.seed(5251)
# Hold out 30% of the row indices (rounded down) as the test set.
testindex <- sample(index, trunc(length(index) * 30 / 100))
##

trainset.name <- tdf.answer[-testindex, ] # remaining 70%: training set
testset.name <- tdf.answer[testindex, ] # sampled 30%: test set
# Same sets without the `name` column; this leaves `selected` as column 1.
trainset <- subset(trainset.name, select = -name)
testset <- subset(testset.name, select = -name)
library(class)
# Distribution of the nine types over the first column of `trainset`.
# Vectorised replacement for a row-by-row if/else chain; unlike the loop over
# `1:nrow(trainset)`, it also works when `trainset` has zero rows.
type.labels <- c("I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX")
train.values <- trainset[[1]]
# Missing values are not counted at all.
train.values <- as.character(train.values[!is.na(train.values)])
# Any non-missing value that is not exactly one of I..VIII is counted under
# the ninth slot ("IX"), i.e. it acts as the catch-all bucket.
train.slot <- match(train.values, type.labels[1:8], nomatch = 9L)
# as.numeric() keeps the counts as doubles, as the accumulating loop produced.
s <- as.numeric(tabulate(train.slot, nbins = length(type.labels)))
names(s) <- type.labels
s # print the training-set distribution
s.trainset <- s
# Classify the test rows with 3-nearest-neighbours: column 1 (`selected`) is
# the class label, every other column is a feature.
knn.pred <- knn(trainset[, -1], testset[, -1], trainset[, 1], k = 3)

# Confusion matrix: rows are the predicted classes, columns the actual ones.
conf.mat <- table("Predictions" = knn.pred, Actual = testset$selected)
conf.mat

# Pooled (micro-averaged) counts over all classes.
hits <- diag(conf.mat)
TP <- sum(hits) # correctly classified cases
# Row i holds everything predicted as class i, so the off-diagonal part of a
# row is that class's false positives ...
FP <- sum(rowSums(conf.mat) - hits)
# ... and the off-diagonal part of a column is the class's false negatives.
FN <- sum(colSums(conf.mat) - hits)

# NOTE: pooled over all classes, FP and FN are both the total number of
# misclassified cases, so Precision, Recall and F1score all equal the overall
# accuracy here.
Precision <- TP / (TP + FP)
Precision
Recall <- TP / (TP + FN)
Recall
F1score <- 2 * Precision * Recall / (Precision + Recall)
F1score

save(conf.mat, file = "C:\\Users\\es901\\Documents\\dsR\\data\\project\\conf.mat.RData")