# Principles
# 1. Data file: Data1.csv
# 2. Target variable: Gender
# 3. Convert the target variable to a factor; check missing values with colSums(is.na(df))
# 4. Preprocessing is up to you (e.g. Z-score, Min-Max scaling)
# 5. Any analysis technique may be used
# 6. Use predict() and build a confusion matrix of actual vs. predicted values
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
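# The masking messages above are expected: plyr is attached before dplyr, so the
# dplyr versions of mutate/summarise take precedence. If a quieter report is
# preferred, the library calls could be wrapped (a minor suggestion, not applied here):
# suppressPackageStartupMessages({
#   library(plyr); library(dplyr); library(randomForest); library(caret)
# })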
setwd("C:/data")
getwd()
## [1] "C:/data"
df <- read.csv("Data1.csv")
df$Gender <- as.factor(df$Gender)   # target variable to factor (principle #3)
df$Gender <- revalue(df$Gender, replace = c("0" = "female", "1" = "male"))
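# The startup messages show plyr masking several dplyr verbs, and revalue() is the
# only thing plyr is used for here. A base-R alternative sketch (commented out
# because Gender has already been recoded above; it would apply to the raw 0/1 column):
# df$Gender <- factor(df$Gender, levels = c(0, 1), labels = c("female", "male"))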
colSums(is.na(df))   # missing-value check (principle #3)
## X Q1 Q2 Q3 Q4 Q5 Q6 Q7
## 0 0 0 0 0 0 0 0
## Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15
## 0 0 0 0 0 0 0 0
## Q16 Q17 Q18 Q19 Q20 Gender EDU BF
## 0 0 0 0 0 0 0 0
## BM Happiness Peace
## 0 0 0
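# No missing values in any column, so nothing needs to be handled. If NAs did
# appear, a simple hedged approach would be to drop incomplete rows or impute, e.g.:
# df <- na.omit(df)                                    # drop rows with any NA
# df$BF[is.na(df$BF)] <- median(df$BF, na.rm = TRUE)   # or impute a single column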
numeric_cols <- sapply(df, is.numeric)   # flag numeric columns for optional scaling
glimpse(df)
## Rows: 1,925
## Columns: 27
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ Q1 <int> 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Q2 <int> 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 2, 2, …
## $ Q3 <int> 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 3, 2, 3, …
## $ Q4 <int> 3, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 4, 2, 2, 4, …
## $ Q5 <int> 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 4, 4, 3, 1, 2, …
## $ Q6 <int> 2, 3, 4, 4, 4, 4, 4, 4, 1, 2, 2, 2, 4, 4, 3, 5, 2, 2, 1, 4, …
## $ Q7 <int> 2, 2, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 5, 4, 4, 5, 4, 3, 4, 4, …
## $ Q8 <int> 4, 4, 4, 4, 4, 4, 5, 5, 2, 2, 4, 4, 4, 4, 3, 5, 4, 2, 4, 4, …
## $ Q9 <int> 4, 4, 4, 4, 2, 4, 5, 5, 3, 4, 4, 4, 2, 2, 4, 5, 2, 4, 2, 4, …
## $ Q10 <int> 4, 4, 2, 4, 4, 4, 5, 5, 2, 4, 2, 4, 4, 4, 3, 4, 4, 3, 2, 3, …
## $ Q11 <int> 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 3, 4, 4, 4, 4, 5, 4, 3, 3, …
## $ Q12 <int> 4, 4, 4, 4, 4, 4, 5, 5, 3, 4, 4, 3, 4, 3, 3, 4, 5, 4, 4, 2, …
## $ Q13 <int> 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 2, 4, 4, 4, 5, 4, 4, 4, …
## $ Q14 <int> 4, 4, 4, 4, 4, 4, 5, 5, 5, 4, 4, 4, 3, 4, 5, 4, 5, 4, 4, 4, …
## $ Q15 <int> 4, 4, 3, 4, 4, 4, 4, 2, 3, 4, 4, 3, 1, 4, 4, 4, 5, 4, 4, 4, …
## $ Q16 <int> 4, 4, 4, 4, 4, 4, 5, 2, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, …
## $ Q17 <int> 4, 3, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 3, 2, 4, 5, 4, 4, 3, 4, …
## $ Q18 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 2, 4, 4, 4, …
## $ Q19 <int> 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 1, 4, 4, 4, 5, 4, 2, 3, …
## $ Q20 <int> 4, 1, 3, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 2, 4, 5, 5, 4, 2, 4, …
## $ Gender <fct> female, female, female, female, female, female, female, fema…
## $ EDU <int> 1, 1, 2, 1, 2, 1, 1, 1, 4, 3, 2, 1, 1, 3, 3, 2, 1, 1, 1, 4, …
## $ BF <dbl> 3.4, 4.0, 3.6, 4.2, 4.0, 4.0, 3.6, 3.6, 3.6, 3.2, 4.0, 3.2, …
## $ BM <dbl> 3.2, 3.4, 3.6, 4.0, 3.6, 4.0, 4.6, 4.6, 2.2, 3.2, 3.2, 3.6, …
## $ Happiness <dbl> 4.0, 4.0, 3.8, 4.0, 4.0, 4.0, 4.8, 4.4, 3.8, 4.0, 4.0, 3.4, …
## $ Peace <dbl> 4.0, 2.8, 3.8, 4.0, 4.0, 4.0, 3.8, 2.4, 4.0, 3.2, 4.0, 3.9, …
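# numeric_cols is computed above but not used further; principle #4 leaves
# preprocessing open (e.g. Z-score or Min-Max). A sketch, kept commented out so
# the results reported below are unchanged (the row index X is excluded):
# scale_cols <- setdiff(names(df)[numeric_cols], "X")
# df_z <- df
# df_z[scale_cols] <- scale(df_z[scale_cols])               # Z-score standardization
# minmax <- function(x) (x - min(x)) / (max(x) - min(x))
# df_mm <- df
# df_mm[scale_cols] <- lapply(df_mm[scale_cols], minmax)    # Min-Max scaling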
idx <- createDataPartition(y = df$Gender, p = 0.8, list = FALSE)   # stratified 80/20 split
train <- df[idx, ]
test <- df[-idx, ]
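# Suggestion (not applied above): the partition and the forest are stochastic, so
# without a seed the numbers below will vary from run to run; a seed plus dropping
# the row index X, which carries no information, would be a reasonable refinement:
# set.seed(123)
# idx   <- createDataPartition(y = df$Gender, p = 0.8, list = FALSE)
# train <- df[idx,  setdiff(names(df), "X")]
# test  <- df[-idx, setdiff(names(df), "X")]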
rf <- randomForest(Gender ~ ., data = train)        # random forest with default settings
pred <- predict(rf, newdata = test)                 # predicted classes on the hold-out set
conf_matrix <- confusionMatrix(pred, test$Gender)   # predicted vs. actual (principle #6)
conf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction female male
## female 191 94
## male 36 63
##
## Accuracy : 0.6615
## 95% CI : (0.6117, 0.7087)
## No Information Rate : 0.5911
## P-Value [Acc > NIR] : 0.00275
##
## Kappa : 0.2573
##
## Mcnemar's Test P-Value : 5.756e-07
##
## Sensitivity : 0.8414
## Specificity : 0.4013
## Pos Pred Value : 0.6702
## Neg Pred Value : 0.6364
## Prevalence : 0.5911
## Detection Rate : 0.4974
## Detection Prevalence : 0.7422
## Balanced Accuracy : 0.6213
##
## 'Positive' Class : female
##
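# Follow-up sketch (output not shown): the headline metrics can be pulled from the
# caret object directly, and randomForest's importance() shows which questions
# drive the Gender prediction.
# conf_matrix$overall[c("Accuracy", "Kappa")]
# conf_matrix$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]
# importance(rf)      # mean decrease in Gini per predictor
# varImpPlot(rf)      # the same information as a plot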