rm(list=ls())
getwd()
## [1] "C:/data"
setwd("c:/data")
getwd()
## [1] "c:/data"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Simple linear regression
# BM: healthy self-management
# Happiness: degree of happiness
# Research question: the effect of BM on Happiness --
# does better healthy self-management increase happiness?
df<-read.csv("DATA1.csv")
glimpse(df)
## Rows: 1,925
## Columns: 26
## $ Q1 <int> 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Q2 <int> 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 2, 2, …
## $ Q3 <int> 2, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 3, 2, 3, …
## $ Q4 <int> 3, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 4, 2, 2, 4, …
## $ Q5 <int> 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 4, 4, 3, 1, 2, …
## $ Q6 <int> 2, 3, 4, 4, 4, 4, 4, 4, 1, 2, 2, 2, 4, 4, 3, 5, 2, 2, 1, 4, …
## $ Q7 <int> 2, 2, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 5, 4, 4, 5, 4, 3, 4, 4, …
## $ Q8 <int> 4, 4, 4, 4, 4, 4, 5, 5, 2, 2, 4, 4, 4, 4, 3, 5, 4, 2, 4, 4, …
## $ Q9 <int> 4, 4, 4, 4, 2, 4, 5, 5, 3, 4, 4, 4, 2, 2, 4, 5, 2, 4, 2, 4, …
## $ Q10 <int> 4, 4, 2, 4, 4, 4, 5, 5, 2, 4, 2, 4, 4, 4, 3, 4, 4, 3, 2, 3, …
## $ Q11 <int> 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 3, 4, 4, 4, 4, 5, 4, 3, 3, …
## $ Q12 <int> 4, 4, 4, 4, 4, 4, 5, 5, 3, 4, 4, 3, 4, 3, 3, 4, 5, 4, 4, 2, …
## $ Q13 <int> 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 2, 4, 4, 4, 5, 4, 4, 4, …
## $ Q14 <int> 4, 4, 4, 4, 4, 4, 5, 5, 5, 4, 4, 4, 3, 4, 5, 4, 5, 4, 4, 4, …
## $ Q15 <int> 4, 4, 3, 4, 4, 4, 4, 2, 3, 4, 4, 3, 1, 4, 4, 4, 5, 4, 4, 4, …
## $ Q16 <int> 4, 4, 4, 4, 4, 4, 5, 2, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, …
## $ Q17 <int> 4, 3, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 3, 2, 4, 5, 4, 4, 3, 4, …
## $ Q18 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 2, 4, 4, 4, …
## $ Q19 <int> 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 1, 4, 4, 4, 5, 4, 2, 3, …
## $ Q20 <int> 4, 1, 3, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 2, 4, 5, 5, 4, 2, 4, …
## $ Gender1 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, …
## $ EDU1 <int> 1, 1, 2, 1, 2, 1, 1, 1, 4, 3, 2, 1, 1, 3, 3, 2, 1, 1, 1, 4, …
## $ BF <dbl> 3.4, 4.0, 3.6, 4.2, 4.0, 4.0, 3.6, 3.6, 3.6, 3.2, 4.0, 3.2, …
## $ BM <dbl> 3.2, 3.4, 3.6, 4.0, 3.6, 4.0, 4.6, 4.6, 2.2, 3.2, 3.2, 3.6, …
## $ Happiness <dbl> 4.0, 4.0, 3.8, 4.0, 4.0, 4.0, 4.8, 4.4, 3.8, 4.0, 4.0, 3.4, …
## $ Peace <dbl> 4.0, 2.8, 3.8, 4.0, 4.0, 4.0, 3.8, 2.4, 4.0, 3.2, 4.0, 3.9, …
# Regression requires variables measured on at least an interval scale.
# A categorical independent variable can still be used: convert it to dummy variables.
# If the dependent variable is binary (0/1, success/failure), use logistic
# regression instead (see the sketch below).
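# A minimal hedged sketch (not run) of that logistic case: glm() with
# family=binomial; Gender1 (0/1) is used here purely for illustration.
#glm(Gender1~BM,data=df,family=binomial)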
bs.out2<-lm(Happiness~BM,data=df) # lm() fits a linear regression model
summary(bs.out2)
##
## Call:
## lm(formula = Happiness ~ BM, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1591 -0.4577 0.0418 0.4409 1.9386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.06599 0.05777 35.77 <2e-16 ***
## BM 0.49771 0.01878 26.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6404 on 1923 degrees of freedom
## Multiple R-squared: 0.2675, Adjusted R-squared: 0.2671
## F-statistic: 702.2 on 1 and 1923 DF, p-value: < 2.2e-16
# In the formula, the dependent variable goes on the left of ~, the predictor on the right.
# A 1-unit increase in BM is associated with a 0.498 increase in Happiness.
# Durbin-Watson test
# The Durbin-Watson statistic ranges from 0 to 4:
# close to 0: positive autocorrelation in the errors
# close to 4: negative autocorrelation
# close to 2: no autocorrelation in the error terms
# BM Estimate: the slope
# Intercept: the intercept term
# Residuals: observed minus fitted values
# Fitted simple regression: Happiness = 2.066 + 0.498*BM; modeling done.
# Error term: unknowable, since the population is never observed; it is the gap
# between an observed value and the population regression line.
# Residual: the gap between an observed value and the sample regression line;
# residuals are used to check the error-term assumptions => residual analysis.
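# A quick use of the fitted line (hedged sketch; pred4 is an illustrative name):
# the predicted Happiness at BM = 4 is about 2.066 + 0.498*4 = 4.06.
pred4 <- predict(bs.out2, newdata = data.frame(BM = 4))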
library(car) # needed for the Durbin-Watson test
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
sreg.res1<-residuals(bs.out2)
durbinWatsonTest(sreg.res1)
## [1] 1.787942
# The statistic (1.79) is close to 2, so the errors are judged independent.
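# Sketch: car::durbinWatsonTest() also accepts the lm object itself, in which
# case it reports a p-value alongside the statistic:
#durbinWatsonTest(bs.out2)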
library(caret)
# Homoscedasticity of the residuals
par(mfrow=c(2,2))
plot(bs.out2)

# Q-Q plot: checks normality of the residuals.
# To support homoscedasticity, the spread of the residuals in the scatterplot
# should stay roughly constant regardless of the fitted values.
# Normality test
# Shapiro-Wilk test
shapiro.test(sreg.res1) # judge by the p-value
##
## Shapiro-Wilk normality test
##
## data: sreg.res1
## W = 0.99439, p-value = 1.148e-06
options(scipen=999) # prefer fixed (non-scientific) notation
shapiro.test(sreg.res1)
##
## Shapiro-Wilk normality test
##
## data: sreg.res1
## W = 0.99439, p-value = 0.000001148
options(scipen=-999) # swing back toward scientific notation (the default is scipen=0)
shapiro.test(sreg.res1)
##
## Shapiro-Wilk normality test
##
## data: sreg.res1
## W = 9.9439e-01, p-value = 1.148e-06
# Null hypothesis: the residuals are normally distributed.
# Alternative hypothesis: they are not.
# p-value < significance level 0.05 => reject the null:
# the residuals are not normally distributed.
# When normality fails, a Box-Cox variable transformation can be applied.
# Re-check the normality of the residuals with fixed notation.
options(scipen=999)
shapiro.test(sreg.res1)
##
## Shapiro-Wilk normality test
##
## data: sreg.res1
## W = 0.99439, p-value = 0.000001148
# The residual analysis above: homoscedasticity and normality are not satisfied.
# When normality is not met, a Box-Cox transformation can be applied, as in the
# sketch below.
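# A hedged Box-Cox sketch (not run; assumes the MASS package is installed):
#library(MASS)
#bc<-boxcox(bs.out2)                  # profile log-likelihood over lambda
#lambda<-bc$x[which.max(bc$y)]        # lambda with the highest likelihood
#bc.fit<-lm(((Happiness^lambda-1)/lambda)~BM,data=df)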
# Polynomial regression: the independent variable enters at degree 2 or higher.
# Regression (as a whole) is worth one question per exam.
# Multiple regression
# A multiple linear regression model assumes a distribution for the dependent
# variable given two or more independent variables.
# This requires the independent variables to be mutually independent, which is
# checked for multicollinearity with the vif() function.
bs.out3<-lm(Happiness~BM+BF,data=df)
summary(bs.out3)
##
## Call:
## lm(formula = Happiness ~ BM + BF, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23134 -0.40553 0.02014 0.41352 1.86210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.60995 0.06412 25.11 <0.0000000000000002 ***
## BM 0.29054 0.02331 12.47 <0.0000000000000002 ***
## BF 0.33817 0.02435 13.89 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6106 on 1922 degrees of freedom
## Multiple R-squared: 0.3343, Adjusted R-squared: 0.3336
## F-statistic: 482.6 on 2 and 1922 DF, p-value: < 0.00000000000000022
library(car)
vif(bs.out3)
## BM BF
## 1.693504 1.693504
# Both VIF values are below 10, so the two predictors do not show serious
# multicollinearity.
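# With two predictors, each VIF equals 1/(1 - R^2) from regressing one
# predictor on the other; this sketch should reproduce the 1.6935 above:
#1/(1-summary(lm(BM~BF,data=df))$r.squared)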
# Split ratio 8:2
library(caret)
dim(df)
## [1] 1925 26
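# Note: set.seed() is not called before this first split, so train/test
# membership (and every number computed from it below) varies between runs.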
idx<-createDataPartition(df$Happiness,p=0.8,list=FALSE)
train<-df[idx,]
test<-df[-idx,]
library(dplyr)
glimpse(train)
## Rows: 1,541
## Columns: 26
## $ Q1 <int> 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, …
## $ Q2 <int> 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 2, 3, 2, …
## $ Q3 <int> 2, 4, 4, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 3, 3, 3, 2, …
## $ Q4 <int> 3, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 1, …
## $ Q5 <int> 4, 4, 2, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 4, 4, 3, 2, 2, 1, …
## $ Q6 <int> 2, 3, 4, 4, 4, 4, 4, 1, 2, 2, 2, 4, 4, 3, 5, 2, 2, 4, 2, 1, …
## $ Q7 <int> 2, 2, 4, 4, 4, 4, 4, 3, 4, 4, 4, 5, 4, 4, 5, 4, 3, 4, 4, 2, …
## $ Q8 <int> 4, 4, 4, 4, 4, 4, 5, 2, 2, 4, 4, 4, 4, 3, 5, 4, 2, 4, 4, 4, …
## $ Q9 <int> 4, 4, 4, 4, 2, 4, 5, 3, 4, 4, 4, 2, 2, 4, 5, 2, 4, 4, 4, 4, …
## $ Q10 <int> 4, 4, 2, 4, 4, 4, 5, 2, 4, 2, 4, 4, 4, 3, 4, 4, 3, 3, 3, 3, …
## $ Q11 <int> 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 3, 4, 4, 4, 4, 5, 4, 3, 4, 3, …
## $ Q12 <int> 4, 4, 4, 4, 4, 4, 5, 3, 4, 4, 3, 4, 3, 3, 4, 5, 4, 2, 3, 4, …
## $ Q13 <int> 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 2, 4, 4, 4, 5, 4, 4, 3, 2, …
## $ Q14 <int> 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 3, 4, 5, 4, 5, 4, 4, 4, 4, …
## $ Q15 <int> 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 3, 1, 4, 4, 4, 5, 4, 4, 3, 3, …
## $ Q16 <int> 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 4, …
## $ Q17 <int> 4, 3, 4, 4, 4, 4, 2, 4, 4, 4, 4, 3, 2, 4, 5, 4, 4, 4, 2, 4, …
## $ Q18 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 2, 4, 4, 4, 4, …
## $ Q19 <int> 4, 2, 4, 4, 4, 4, 4, 4, 2, 4, 4, 1, 4, 4, 4, 5, 4, 3, 3, 3, …
## $ Q20 <int> 4, 1, 3, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 5, 5, 4, 4, 3, 3, …
## $ Gender1 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ EDU1 <int> 1, 1, 2, 1, 2, 1, 1, 4, 3, 2, 1, 1, 3, 3, 2, 1, 1, 4, 3, 2, …
## $ BF <dbl> 3.4, 4.0, 3.6, 4.2, 4.0, 4.0, 3.6, 3.6, 3.2, 4.0, 3.2, 4.0, …
## $ BM <dbl> 3.2, 3.4, 3.6, 4.0, 3.6, 4.0, 4.6, 2.2, 3.2, 3.2, 3.6, 3.8, …
## $ Happiness <dbl> 4.0, 4.0, 3.8, 4.0, 4.0, 4.0, 4.8, 3.8, 4.0, 4.0, 3.4, 2.8, …
## $ Peace <dbl> 4.0, 2.8, 3.8, 4.0, 4.0, 4.0, 3.8, 4.0, 3.2, 4.0, 3.9, 3.2, …
# Linear regression model
fit<-lm(Happiness~BM+BF+Peace,data=train)
summary(fit)
##
## Call:
## lm(formula = Happiness ~ BM + BF + Peace, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88286 -0.32719 -0.00337 0.32718 1.55630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47312 0.08285 5.711 0.0000000134899153 ***
## BM 0.17765 0.02315 7.674 0.0000000000000295 ***
## BF 0.25863 0.02430 10.644 < 0.0000000000000002 ***
## Peace 0.48398 0.02292 21.120 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5375 on 1537 degrees of freedom
## Multiple R-squared: 0.4845, Adjusted R-squared: 0.4835
## F-statistic: 481.5 on 3 and 1537 DF, p-value: < 0.00000000000000022
# Happiness = 0.47312 + 0.17765*BM + 0.25863*BF + 0.48398*Peace (from the
# summary above; without set.seed() the split, and so these coefficients,
# change between runs).
predict(fit,newdata=test)
## 8 19 30 34 38 40 44 49
## 3.382949 2.955942 3.321167 3.695929 4.102446 3.670195 2.313646 3.285636
## 54 55 60 79 93 105 110 111
## 3.682356 3.688636 3.979658 2.355387 4.154172 4.021846 3.708348 3.998994
## 112 113 119 127 154 162 171 182
## 3.898422 3.243188 4.047580 4.312232 3.023864 3.401767 4.083111 3.979658
## 183 185 189 201 211 221 224 226
## 4.050720 3.398367 3.521155 3.979658 3.569365 3.682615 4.066915 3.998994
## 227 234 243 247 248 250 268 269
## 4.102446 2.417099 3.776529 3.614434 3.811801 3.408165 3.563603 3.536715
## 271 277 287 295 299 300 301 312
## 3.814941 2.271458 3.944127 3.786067 3.814941 3.717886 3.772753 3.175267
## 315 327 333 336 337 340 343 346
## 4.118382 3.908596 3.743879 4.189703 2.384708 3.853989 3.760334 3.689013
## 349 359 376 377 385 391 402 409
## 3.537351 2.878483 3.298055 3.259902 4.012049 3.472310 3.649965 3.689013
## 412 413 414 416 417 427 430 431
## 3.023864 3.405284 3.524555 3.401767 3.979658 3.973001 3.679545 4.286240
## 434 436 442 446 448 450 451 460
## 3.647084 3.995853 3.698551 3.062276 3.016948 3.204141 3.314250 3.805144
## 469 472 479 483 491 493 496 498
## 4.748518 4.324652 4.118641 3.503806 3.970120 4.154172 3.559567 4.012049
## 503 508 523 530 536 543 559 568
## 4.137977 3.595357 3.960582 2.743016 3.003893 3.378655 4.154172 2.807421
## 569 573 575 576 580 591 593 602
## 3.876206 1.997267 2.077608 3.208176 3.821598 3.463031 3.737222 4.557549
## 612 618 619 621 622 632 633 642
## 4.134837 4.018447 4.088873 4.515621 3.550029 3.925051 3.576022 3.375515
## 654 658 674 683 684 689 691 694
## 3.585560 2.755695 4.118382 2.726821 3.546889 3.685496 3.427500 3.449458
## 695 698 702 708 710 712 713 716
## 2.788085 3.317131 3.524036 4.154172 3.992454 4.170368 4.085733 4.214919
## 721 725 726 730 731 736 737 746
## 2.462168 2.594494 2.197515 2.417099 2.462168 4.409028 4.118641 3.776270
## 748 749 755 762 766 772 776 778
## 3.979658 3.585819 4.338225 2.761833 2.901335 2.216851 2.207054 3.040059
## 790 791 803 811 821 830 834 841
## 2.939488 3.298055 2.817218 4.406147 3.181924 3.488765 3.669677 3.385312
## 842 862 865 878 880 883 884 885
## 3.611553 4.076194 3.408165 3.963463 3.595098 3.233650 3.934848 3.255867
## 903 905 909 919 926 930 931 934
## 3.397991 3.666160 3.566225 3.927932 2.797623 4.005651 3.750536 3.866408
## 936 939 942 945 954 956 958 964
## 3.349970 3.023864 2.122937 3.365977 4.102446 4.095530 3.595357 3.653741
## 969 970 972 975 977 979 986 991
## 3.236791 4.092649 3.520260 3.911736 3.533575 3.579162 3.363095 4.056859
## 994 996 1001 1020 1028 1031 1032 1037
## 4.386175 4.092649 4.208780 3.466807 3.491646 4.154172 3.502338 4.066915
## 1044 1053 1063 1072 1077 1083 1089 1091
## 3.776529 3.740362 3.701691 3.668782 3.301831 4.138236 3.059136 3.349522
## 1092 1095 1101 1110 1118 1120 1122 1127
## 3.730942 2.695325 3.605414 3.715005 3.233909 4.412804 2.538992 3.052738
## 1130 1134 1137 1138 1139 1140 1141 1143
## 3.679734 2.655759 3.818199 3.356179 2.965222 3.488765 3.143771 4.018070
## 1150 1156 1159 1160 1166 1191 1193 1198
## 3.572882 3.237685 2.975278 3.950784 4.141494 3.443955 3.527177 3.573400
## 1202 1208 1211 1214 1215 1222 1225 1229
## 3.198119 3.479226 3.417962 3.427241 3.478967 3.683250 3.182183 2.846469
## 1233 1234 1235 1238 1240 1241 1252 1257
## 3.295433 3.531212 3.278719 3.573400 2.833154 3.491646 3.101324 3.417444
## 1258 1272 1275 1277 1278 1281 1282 1285
## 3.443436 4.154172 4.448335 3.621091 3.085647 4.121523 3.056255 3.659503
## 1286 1293 1298 1313 1318 1323 1325 1329
## 3.576022 3.731460 3.834017 3.436779 4.047580 3.863786 2.723940 2.868685
## 1332 1335 1337 1345 1354 1357 1360 1363
## 3.540491 3.236531 3.437298 3.437038 3.243188 3.491646 3.537351 2.571383
## 1364 1370 1375 1380 1386 1387 1392 1393
## 3.859751 3.614434 3.156450 3.698739 3.233909 3.143771 3.510981 3.772494
## 1394 1396 1406 1410 1412 1415 1434 1436
## 3.779151 3.282236 3.349781 3.375515 3.152414 4.066915 3.573141 4.286240
## 1440 1456 1457 1458 1461 1466 1469 1480
## 3.566484 3.443955 3.050116 3.050116 3.153309 3.592217 4.260506 3.385312
## 1485 1492 1498 1500 1502 1508 1512 1513
## 3.359319 3.598239 4.477208 3.417703 3.424360 3.559567 3.766472 2.972138
## 1520 1530 1535 1545 1554 1558 1560 1566
## 3.963463 3.036543 3.947527 3.449717 3.866667 4.189444 3.440179 3.614175
## 1568 1571 1574 1576 1579 1583 1587 1588
## 4.073313 4.037523 3.963204 3.918394 4.441678 4.047320 4.083370 3.643943
## 1589 1592 1596 1604 1606 1608 1612 1613
## 3.040319 2.094192 3.947008 3.724544 3.027640 4.083111 3.391969 3.082247
## 1627 1637 1640 1641 1644 1648 1652 1654
## 3.882603 3.198119 4.405887 3.330446 3.802263 3.860010 3.007669 2.752813
## 1655 1659 1665 1673 1678 1681 1694 1705
## 3.153309 3.324683 3.695670 2.898454 2.736359 4.073572 3.446577 3.324683
## 1708 1710 1712 1722 1739 1744 1749 1752
## 3.417962 3.146652 3.420843 3.595993 4.550892 3.637546 3.398367 4.064034
## 1760 1761 1763 1770 1771 1775 1776 1778
## 2.898454 3.830877 3.252986 3.514757 4.044699 4.625730 3.595098 4.021587
## 1782 1783 1787 1791 1792 1797 1802 1808
## 3.434157 4.163711 3.427500 3.750277 3.040059 3.966720 3.879722 3.191462
## 1812 1817 1820 1830 1833 1835 1853 1854
## 3.778892 2.271199 2.655500 3.463031 3.485624 3.082247 2.933467 3.672817
## 1859 1862 1864 1866 1867 1870 1882 1883
## 3.510981 2.810938 3.460150 3.943232 3.814682 3.340502 3.602015 4.673940
## 1886 1892 1894 1903 1907 1908 1911 1916
## 3.123281 3.572882 3.676593 4.479830 3.179043 3.123800 3.740998 3.155931
lm_p<-predict(fit,newdata=test)
#round(predict(fit,newdata=test),1)
test$Happiness1<-round(predict(fit,newdata=test),1)
View(test)
# Check the prediction error
# Happiness - Happiness1 = prediction error
# Caution: the exponent below is 1, so this is the mean signed error, not the
# MSE; the MSE uses squared errors (^2), as in the sketch further below.
mean((test$Happiness-test$Happiness1)^1)
## [1] -0.001302083
mean((test$Happiness-test$Happiness2)^2)
## [1] NaN
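# Corrected sketch: squaring the errors gives the MSE (its square root is the
# RMSE). The NaN above appears because test$Happiness2 is only created further
# below, so Happiness minus a missing column gives an empty vector and mean()
# returns NaN.
mse <- mean((test$Happiness - test$Happiness1)^2)
rmse <- sqrt(mse)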
# Linear regression model (fit1 refits the same formula as fit, so its summary is identical)
fit1<-lm(Happiness~BM+BF+Peace,data=train)
summary(fit1)
##
## Call:
## lm(formula = Happiness ~ BM + BF + Peace, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88286 -0.32719 -0.00337 0.32718 1.55630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47312 0.08285 5.711 0.0000000134899153 ***
## BM 0.17765 0.02315 7.674 0.0000000000000295 ***
## BF 0.25863 0.02430 10.644 < 0.0000000000000002 ***
## Peace 0.48398 0.02292 21.120 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5375 on 1537 degrees of freedom
## Multiple R-squared: 0.4845, Adjusted R-squared: 0.4835
## F-statistic: 481.5 on 3 and 1537 DF, p-value: < 0.00000000000000022
lm_p<-predict(fit1,newdata=test)
test$Happiness2<-round(predict(fit1,newdata=test),1)
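# Now that Happiness2 exists, the MSE for fit1 can be computed the same way (sketch):
mse2 <- mean((test$Happiness - test$Happiness2)^2)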
# Change the split ratio to 7:3
library(caret)
dim(df)
## [1] 1925 26
set.seed(1)
idx<-createDataPartition(df$Happiness,p=0.7,list=FALSE)
train<-df[idx,]
test<-df[-idx,]
library(dplyr)
glimpse(train)
## Rows: 1,350
## Columns: 26
## $ Q1 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 5, …
## $ Q2 <int> 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4, 2, 2, 3, 2, 5, …
## $ Q3 <int> 2, 4, 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 3, 2, 3, 3, 2, 5, …
## $ Q4 <int> 3, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 4, 2, 2, 4, 4, 1, 5, …
## $ Q5 <int> 4, 2, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 4, 3, 1, 2, 2, 1, 5, …
## $ Q6 <int> 2, 4, 4, 4, 4, 4, 1, 2, 2, 2, 4, 4, 3, 2, 2, 1, 4, 2, 1, 5, …
## $ Q7 <int> 2, 4, 4, 4, 4, 4, 3, 4, 4, 4, 5, 4, 4, 4, 3, 4, 4, 4, 2, 5, …
## $ Q8 <int> 4, 4, 4, 4, 5, 5, 2, 2, 4, 4, 4, 4, 3, 4, 2, 4, 4, 4, 4, 5, …
## $ Q9 <int> 4, 4, 2, 4, 5, 5, 3, 4, 4, 4, 2, 2, 4, 2, 4, 2, 4, 4, 4, 5, …
## $ Q10 <int> 4, 2, 4, 4, 5, 5, 2, 4, 2, 4, 4, 4, 3, 4, 3, 2, 3, 3, 3, 5, …
## $ Q11 <int> 4, 4, 4, 4, 5, 5, 4, 4, 4, 3, 4, 4, 4, 5, 4, 3, 3, 4, 3, 5, …
## $ Q12 <int> 4, 4, 4, 4, 5, 5, 3, 4, 4, 3, 4, 3, 3, 5, 4, 4, 2, 3, 4, 5, …
## $ Q13 <int> 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 2, 4, 4, 5, 4, 4, 4, 3, 2, 5, …
## $ Q14 <int> 4, 4, 4, 4, 5, 5, 5, 4, 4, 4, 3, 4, 5, 5, 4, 4, 4, 4, 4, 5, …
## $ Q15 <int> 4, 3, 4, 4, 4, 2, 3, 4, 4, 3, 1, 4, 4, 5, 4, 4, 4, 3, 3, 4, …
## $ Q16 <int> 4, 4, 4, 4, 5, 2, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4, 4, …
## $ Q17 <int> 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 3, 2, 4, 4, 4, 3, 4, 2, 4, 4, …
## $ Q18 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, …
## $ Q19 <int> 4, 4, 4, 4, 4, 2, 4, 2, 4, 4, 1, 4, 4, 5, 4, 2, 3, 3, 3, 4, …
## $ Q20 <int> 4, 3, 4, 4, 4, 2, 4, 2, 4, 4, 4, 2, 4, 5, 4, 2, 4, 3, 3, 5, …
## $ Gender1 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, …
## $ EDU1 <int> 1, 2, 2, 1, 1, 1, 4, 3, 2, 1, 1, 3, 3, 1, 1, 1, 4, 3, 2, 1, …
## $ BF <dbl> 3.4, 3.6, 4.0, 4.0, 3.6, 3.6, 3.6, 3.2, 4.0, 3.2, 4.0, 3.2, …
## $ BM <dbl> 3.2, 3.6, 3.6, 4.0, 4.6, 4.6, 2.2, 3.2, 3.2, 3.6, 3.8, 3.6, …
## $ Happiness <dbl> 4.0, 3.8, 4.0, 4.0, 4.8, 4.4, 3.8, 4.0, 4.0, 3.4, 2.8, 3.8, …
## $ Peace <dbl> 4.0, 3.8, 4.0, 4.0, 3.8, 2.4, 4.0, 3.2, 4.0, 3.9, 3.2, 3.2, …
# Theory text p. 220, Example 3
x1<-c(7,1,11,11,7,11,3,1,2,21,1,11,10)
x2<-c(26,29,56,31,52,55,71,31,54,47,40,66,68)
x3<-c(6,15,8,8,6,9,17,22,18,4,23,9,8)
x4<-c(60,52,20,47,33,22,6,44,22,26,34,12,12)
y<-c(78.5,74.3,104.3,87.6,95.9,109.2,102.7,72.5,93.1,115.9,83.8,113.3,109.4)
df<-data.frame(x1,x2,x3,x4,y)
step(lm(y~1,df),scope=list(lower=~1,upper=~x1+x2+x3+x4),direction="forward")
## Start: AIC=71.44
## y ~ 1
##
## Df Sum of Sq RSS AIC
## + x4 1 1831.90 883.87 58.852
## + x2 1 1809.43 906.34 59.178
## + x1 1 1450.08 1265.69 63.519
## + x3 1 776.36 1939.40 69.067
## <none> 2715.76 71.444
##
## Step: AIC=58.85
## y ~ x4
##
## Df Sum of Sq RSS AIC
## + x1 1 809.10 74.76 28.742
## + x3 1 708.13 175.74 39.853
## <none> 883.87 58.852
## + x2 1 14.99 868.88 60.629
##
## Step: AIC=28.74
## y ~ x4 + x1
##
## Df Sum of Sq RSS AIC
## + x2 1 26.789 47.973 24.974
## + x3 1 23.926 50.836 25.728
## <none> 74.762 28.742
##
## Step: AIC=24.97
## y ~ x4 + x1 + x2
##
## Df Sum of Sq RSS AIC
## <none> 47.973 24.974
## + x3 1 0.10909 47.864 26.944
##
## Call:
## lm(formula = y ~ x4 + x1 + x2, data = df)
##
## Coefficients:
## (Intercept) x4 x1 x2
## 71.6483 -0.2365 1.4519 0.4161
# Intercept: the intercept term
# scope sets the range of variables considered; ~1 means the intercept-only model
# direction picks the method: "forward" (forward selection), "backward"
# (backward elimination), "both" (stepwise selection)
library(ISLR)
data("attitude") # attitude actually ships with base R's datasets package
glimpse(attitude)
## Rows: 30
## Columns: 7
## $ rating <dbl> 43, 63, 71, 61, 81, 43, 58, 71, 72, 67, 64, 67, 69, 68, 77,…
## $ complaints <dbl> 51, 64, 70, 63, 78, 55, 67, 75, 82, 61, 53, 60, 62, 83, 77,…
## $ privileges <dbl> 30, 51, 68, 45, 56, 49, 42, 50, 72, 45, 53, 47, 57, 83, 54,…
## $ learning <dbl> 39, 54, 69, 47, 66, 44, 56, 55, 67, 47, 58, 39, 42, 45, 72,…
## $ raises <dbl> 61, 63, 76, 54, 71, 54, 66, 70, 71, 62, 58, 59, 55, 59, 79,…
## $ critical <dbl> 92, 73, 86, 84, 83, 49, 68, 66, 83, 80, 67, 74, 63, 77, 77,…
## $ advance <dbl> 45, 47, 48, 35, 47, 34, 35, 41, 31, 41, 34, 41, 25, 35, 46,…
step(lm(rating~.,data=attitude),direction="backward")
## Start: AIC=123.36
## rating ~ complaints + privileges + learning + raises + critical +
## advance
##
## Df Sum of Sq RSS AIC
## - critical 1 3.41 1152.4 121.45
## - raises 1 6.80 1155.8 121.54
## - privileges 1 14.47 1163.5 121.74
## - advance 1 74.11 1223.1 123.24
## <none> 1149.0 123.36
## - learning 1 180.50 1329.5 125.74
## - complaints 1 724.80 1873.8 136.04
##
## Step: AIC=121.45
## rating ~ complaints + privileges + learning + raises + advance
##
## Df Sum of Sq RSS AIC
## - raises 1 10.61 1163.0 119.73
## - privileges 1 14.16 1166.6 119.82
## - advance 1 71.27 1223.7 121.25
## <none> 1152.4 121.45
## - learning 1 177.74 1330.1 123.75
## - complaints 1 724.70 1877.1 134.09
##
## Step: AIC=119.73
## rating ~ complaints + privileges + learning + advance
##
## Df Sum of Sq RSS AIC
## - privileges 1 16.10 1179.1 118.14
## - advance 1 61.60 1224.6 119.28
## <none> 1163.0 119.73
## - learning 1 197.03 1360.0 122.42
## - complaints 1 1165.94 2328.9 138.56
##
## Step: AIC=118.14
## rating ~ complaints + learning + advance
##
## Df Sum of Sq RSS AIC
## - advance 1 75.54 1254.7 118.00
## <none> 1179.1 118.14
## - learning 1 186.12 1365.2 120.54
## - complaints 1 1259.91 2439.0 137.94
##
## Step: AIC=118
## rating ~ complaints + learning
##
## Df Sum of Sq RSS AIC
## <none> 1254.7 118.00
## - learning 1 114.73 1369.4 118.63
## - complaints 1 1370.91 2625.6 138.16
##
## Call:
## lm(formula = rating ~ complaints + learning, data = attitude)
##
## Coefficients:
## (Intercept) complaints learning
## 9.8709 0.6435 0.2112
step(lm(rating~.,data=attitude),direction="forward")
## Start: AIC=123.36
## rating ~ complaints + privileges + learning + raises + critical +
## advance
##
## Call:
## lm(formula = rating ~ complaints + privileges + learning + raises +
## critical + advance, data = attitude)
##
## Coefficients:
## (Intercept) complaints privileges learning raises critical
## 10.78708 0.61319 -0.07305 0.32033 0.08173 0.03838
## advance
## -0.21706
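# Note: forward selection started from the full model has nothing left to add,
# so it returns the full model unchanged; to search forward, start from the
# intercept-only model lm(rating~1,data=attitude) as in the p. 220 example.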
step(lm(rating~.,data=attitude),direction="both")
## Start: AIC=123.36
## rating ~ complaints + privileges + learning + raises + critical +
## advance
##
## Df Sum of Sq RSS AIC
## - critical 1 3.41 1152.4 121.45
## - raises 1 6.80 1155.8 121.54
## - privileges 1 14.47 1163.5 121.74
## - advance 1 74.11 1223.1 123.24
## <none> 1149.0 123.36
## - learning 1 180.50 1329.5 125.74
## - complaints 1 724.80 1873.8 136.04
##
## Step: AIC=121.45
## rating ~ complaints + privileges + learning + raises + advance
##
## Df Sum of Sq RSS AIC
## - raises 1 10.61 1163.0 119.73
## - privileges 1 14.16 1166.6 119.82
## - advance 1 71.27 1223.7 121.25
## <none> 1152.4 121.45
## + critical 1 3.41 1149.0 123.36
## - learning 1 177.74 1330.1 123.75
## - complaints 1 724.70 1877.1 134.09
##
## Step: AIC=119.73
## rating ~ complaints + privileges + learning + advance
##
## Df Sum of Sq RSS AIC
## - privileges 1 16.10 1179.1 118.14
## - advance 1 61.60 1224.6 119.28
## <none> 1163.0 119.73
## + raises 1 10.61 1152.4 121.45
## + critical 1 7.21 1155.8 121.54
## - learning 1 197.03 1360.0 122.42
## - complaints 1 1165.94 2328.9 138.56
##
## Step: AIC=118.14
## rating ~ complaints + learning + advance
##
## Df Sum of Sq RSS AIC
## - advance 1 75.54 1254.7 118.00
## <none> 1179.1 118.14
## + privileges 1 16.10 1163.0 119.73
## + raises 1 12.54 1166.6 119.82
## + critical 1 7.18 1171.9 119.96
## - learning 1 186.12 1365.2 120.54
## - complaints 1 1259.91 2439.0 137.94
##
## Step: AIC=118
## rating ~ complaints + learning
##
## Df Sum of Sq RSS AIC
## <none> 1254.7 118.00
## + advance 1 75.54 1179.1 118.14
## - learning 1 114.73 1369.4 118.63
## + privileges 1 30.03 1224.6 119.28
## + raises 1 1.19 1253.5 119.97
## + critical 1 0.00 1254.7 120.00
## - complaints 1 1370.91 2625.6 138.16
##
## Call:
## lm(formula = rating ~ complaints + learning, data = attitude)
##
## Coefficients:
## (Intercept) complaints learning
## 9.8709 0.6435 0.2112
# Recommended preference among selection methods: both > backward > forward.
# Health-management section (recipe kept as comments, as in the class notes)
#rm(list=ls())
#getwd()
#setwd("c:/data")
#getwd()
#df<-read.csv("Data1.csv")
#table(df$Gender1)
# female: 0, male: 1
#glimpse(df)
#df$Gender1<-factor(df$Gender1)
# EDU1: 1 middle school, 2 high school, 3 college, 4 graduate school
# Convert to a factor to create dummy variables: df$EDU1<-factor(df$EDU1)
#lmfit<-lm(Happiness~EDU1,data=df)
#summary(lmfit)
# The results show education level is statistically unrelated to happiness
# (no significance stars); the same holds for gender.
#bs.out2<-lm(Happiness~BM,data=df)
#summary(bs.out2)
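# A consolidated runnable version of the recipe above (hedged; df was
# overwritten by the textbook example, so the survey file is re-read into df1,
# an illustrative name):
#df1<-read.csv("DATA1.csv")
#df1$EDU1<-factor(df1$EDU1)                # dummy-code education level
#summary(lm(Happiness~EDU1,data=df1))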
# Example 12
#library(datasets)
#data("USArrests") # arrests per 100,000 residents for murder, assault, and
# rape in the 50 US states, plus the percent of the population living in urban areas
#head(USArrests)
#fit<-prcomp(USArrests,scale=TRUE)
#summary(fit)
# prcomp(): principal component analysis
# scale=TRUE: standardize the variables
# With 4 variables there are 4 principal components (see the sketch below).
# Note pp. 240 and 242.
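# Runnable sketch of the commented example above (USArrests ships with base R;
# fit.pca is an illustrative name):
fit.pca<-prcomp(USArrests,scale=TRUE) # standardize, then extract the components
#summary(fit.pca) would show each component's share of the total variance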
# p. 244, Example 13
Nile
## Time Series:
## Start = 1871
## End = 1970
## Frequency = 1
## [1] 1120 1160 963 1210 1160 1160 813 1230 1370 1140 995 935 1110 994 1020
## [16] 960 1180 799 958 1140 1100 1210 1150 1250 1260 1220 1030 1100 774 840
## [31] 874 694 940 833 701 916 692 1020 1050 969 831 726 456 824 702
## [46] 1120 1100 832 764 821 768 845 864 862 698 845 744 796 1040 759
## [61] 781 865 845 944 984 897 822 1010 771 676 649 846 812 742 801
## [76] 1040 860 874 848 890 744 749 838 1050 918 986 797 923 975 815
## [91] 1020 906 901 1170 912 746 919 718 714 740
plot(Nile)
# First-order differencing
Nile.diff1<-diff(Nile,differences=1)
plot(Nile.diff1)
# Second-order differencing
Nile.diff2<-diff(Nile,differences=2)
plot(Nile.diff2)
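# Optional check (hedged; assumes the forecast package is installed):
#forecast::ndiffs(Nile) estimates how many differences make the series stationary.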
