# Principles
# 1. Data file: Data1.csv
# 2. Target variable: Gender
# 3. Convert the target variable to a factor; check missing values with colSums(is.na(df))
# 4. Preprocessing is up to you (e.g. Z-score, Min-Max scaling)
# 5. Any analysis technique may be used
# 6. Use predict() and build a confusion matrix of actual vs. predicted values
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
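# The masking messages above are expected: plyr is attached before dplyr, so the
# dplyr versions of mutate/summarise take precedence. If a quieter report is
# preferred, the library calls could be wrapped (a minor suggestion, not applied here):
# suppressPackageStartupMessages({
#   library(plyr); library(dplyr); library(randomForest); library(caret)
# })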
setwd("C:/data")
getwd()
## [1] "C:/data"
df <- read.csv("Data1.csv")
df$Gender <- as.factor(df$Gender)   # target variable to factor (principle #3)
df$Gender <- revalue(df$Gender, replace = c("0" = "female", "1" = "male"))
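# The startup messages show plyr masking several dplyr verbs, and revalue() is the
# only thing plyr is used for here. A base-R alternative sketch (commented out
# because Gender has already been recoded above; it would apply to the raw 0/1 column):
# df$Gender <- factor(df$Gender, levels = c(0, 1), labels = c("female", "male"))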
colSums(is.na(df))   # missing-value check (principle #3)
## X Q1 Q2 Q3 Q4 Q5 Q6 Q7
## 0 0 0 0 0 0 0 0
## Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15
## 0 0 0 0 0 0 0 0
## Q16 Q17 Q18 Q19 Q20 Gender EDU BF
## 0 0 0 0 0 0 0 0
## BM Happiness Peace
## 0 0 0
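# No missing values in any column, so nothing needs to be handled. If NAs did
# appear, a simple hedged approach would be to drop incomplete rows or impute, e.g.:
# df <- na.omit(df)                                    # drop rows with any NA
# df$BF[is.na(df$BF)] <- median(df$BF, na.rm = TRUE)   # or impute a single column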
numeric_cols <- sapply(df, is.numeric)   # flag numeric columns for optional scaling
glimpse(df)
## Rows: 1,925
## Columns: 27
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ Q1 <int> 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Q2 <int> 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 2, 2, …
## $ Q3 <int> 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 3, 2, 3, …
## $ Q4 <int> 3, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 4, 2, 2, 4, …
## $ Q5 <int> 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 4, 4, 3, 1, 2, …
## $ Q6 <int> 2, 3, 4, 4, 4, 4, 4, 4, 1, 2, 2, 2, 4, 4, 3, 5, 2, 2, 1, 4, …
## $ Q7 <int> 2, 2, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 5, 4, 4, 5, 4, 3, 4, 4, …
## $ Q8 <int> 4, 4, 4, 4, 4, 4, 5, 5, 2, 2, 4, 4, 4, 4, 3, 5, 4, 2, 4, 4, …
## $ Q9 <int> 4, 4, 4, 4, 2, 4, 5, 5, 3, 4, 4, 4, 2, 2, 4, 5, 2, 4, 2, 4, …
## $ Q10 <int> 4, 4, 2, 4, 4, 4, 5, 5, 2, 4, 2, 4, 4, 4, 3, 4, 4, 3, 2, 3, …
## $ Q11 <int> 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 3, 4, 4, 4, 4, 5, 4, 3, 3, …
## $ Q12 <int> 4, 4, 4, 4, 4, 4, 5, 5, 3, 4, 4, 3, 4, 3, 3, 4, 5, 4, 4, 2, …
## $ Q13 <int> 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 2, 4, 4, 4, 5, 4, 4, 4, …
## $ Q14 <int> 4, 4, 4, 4, 4, 4, 5, 5, 5, 4, 4, 4, 3, 4, 5, 4, 5, 4, 4, 4, …
## $ Q15 <int> 4, 4, 3, 4, 4, 4, 4, 2, 3, 4, 4, 3, 1, 4, 4, 4, 5, 4, 4, 4, …
## $ Q16 <int> 4, 4, 4, 4, 4, 4, 5, 2, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, …
## $ Q17 <int> 4, 3, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 3, 2, 4, 5, 4, 4, 3, 4, …
## $ Q18 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 2, 4, 4, 4, …
## $ Q19 <int> 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 1, 4, 4, 4, 5, 4, 2, 3, …
## $ Q20 <int> 4, 1, 3, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 2, 4, 5, 5, 4, 2, 4, …
## $ Gender <fct> female, female, female, female, female, female, female, fema…
## $ EDU <int> 1, 1, 2, 1, 2, 1, 1, 1, 4, 3, 2, 1, 1, 3, 3, 2, 1, 1, 1, 4, …
## $ BF <dbl> 3.4, 4.0, 3.6, 4.2, 4.0, 4.0, 3.6, 3.6, 3.6, 3.2, 4.0, 3.2, …
## $ BM <dbl> 3.2, 3.4, 3.6, 4.0, 3.6, 4.0, 4.6, 4.6, 2.2, 3.2, 3.2, 3.6, …
## $ Happiness <dbl> 4.0, 4.0, 3.8, 4.0, 4.0, 4.0, 4.8, 4.4, 3.8, 4.0, 4.0, 3.4, …
## $ Peace <dbl> 4.0, 2.8, 3.8, 4.0, 4.0, 4.0, 3.8, 2.4, 4.0, 3.2, 4.0, 3.9, …
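# numeric_cols is computed above but not used further; principle #4 leaves
# preprocessing open (e.g. Z-score or Min-Max). A sketch, kept commented out so
# the results reported below are unchanged (the row index X is excluded):
# scale_cols <- setdiff(names(df)[numeric_cols], "X")
# df_z <- df
# df_z[scale_cols] <- scale(df_z[scale_cols])               # Z-score standardization
# minmax <- function(x) (x - min(x)) / (max(x) - min(x))
# df_mm <- df
# df_mm[scale_cols] <- lapply(df_mm[scale_cols], minmax)    # Min-Max scaling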
idx <- createDataPartition(y = df$Gender, p = 0.8, list = FALSE)   # stratified 80/20 split
train <- df[idx, ]
test <- df[-idx, ]
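# Suggestion (not applied above): the partition and the forest are stochastic, so
# without a seed the numbers below will vary from run to run; a seed plus dropping
# the row index X, which carries no information, would be a reasonable refinement:
# set.seed(123)
# idx   <- createDataPartition(y = df$Gender, p = 0.8, list = FALSE)
# train <- df[idx,  setdiff(names(df), "X")]
# test  <- df[-idx, setdiff(names(df), "X")]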
rf <- randomForest(Gender ~ ., data = train)        # random forest with default settings
pred <- predict(rf, newdata = test)                 # predicted classes on the hold-out set
conf_matrix <- confusionMatrix(pred, test$Gender)   # predicted vs. actual (principle #6)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction female male
## female 191 94
## male 36 63
##
## Accuracy : 0.6615
## 95% CI : (0.6117, 0.7087)
## No Information Rate : 0.5911
## P-Value [Acc > NIR] : 0.00275
##
## Kappa : 0.2573
##
## Mcnemar's Test P-Value : 5.756e-07
##
## Sensitivity : 0.8414
## Specificity : 0.4013
## Pos Pred Value : 0.6702
## Neg Pred Value : 0.6364
## Prevalence : 0.5911
## Detection Rate : 0.4974
## Detection Prevalence : 0.7422
## Balanced Accuracy : 0.6213
##
## 'Positive' Class : female
##
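# Follow-up sketch (output not shown): the headline metrics can be pulled from the
# caret object directly, and randomForest's importance() shows which questions
# drive the Gender prediction.
# conf_matrix$overall[c("Accuracy", "Kappa")]
# conf_matrix$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]
# importance(rf)      # mean decrease in Gini per predictor
# varImpPlot(rf)      # the same information as a plot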