## Select CRAN mirror
(r = getOption("repos"))
## CRAN
## "https://mran.revolutionanalytics.com/snapshot/2016-11-01"
## CRANextra
## "http://www.stats.ox.ac.uk/pub/RWin"
r["CRAN"] = "http://cran.seoul.go.kr"
options(repos = r)
rm(r)
options(repos=r) : Rmarkdown이 한동안 오류로 실행되지 않았는데, 잘못된 CRAN에서 패키지를 깔고 있어서였다.. CRAN mirror가 알고보니 굉장히 중요했던 것.. * 위의 코드는 R 스크립트를 실행하기 전 CRAN mirror를 미리 설정하는 코드입니다. 위와 같이 설정해도 되고, 세션에서 말씀드렸던 chooseCRANmirror(ind=51)을 사용하셔도 됩니다.
## Import Data
working.directory <- "C:/Users/iihsk/Desktop/SeongJin Kim/6. ESC/ESC 회장 문서모음/1. 회장 업무/ESC 2018-2/18-2 정규세션/Fall 18'_ Final Project/final project"
setwd(working.directory)
data <- read.csv("train.csv", header=TRUE, stringsAsFactors=TRUE, na.strings=c("NA","NaN", "?"))
dim(data)
## [1] 1894 132
na.strings=c("NA","NaN","?") : 본래 데이터를 불러오는 chunk는 보여드리지 않지만, 결측치(Missing Values)가 물음표로 인코딩 되어있었음. na.strings를 이용해 결측치로 불러옴
## Columns with NA values
colnames(data)[colSums(is.na(data))!=0]
## [1] "countyCode" "communityCode" "otherPerCap"
## [4] "numPolice" "policePerPop" "policeField"
## [7] "policeFieldPerPop" "policeCalls" "policCallPerPop"
## [10] "policCallPerOffic" "policePerPop2" "racialMatch"
## [13] "pctPolicWhite" "pctPolicBlack" "pctPolicHisp"
## [16] "pctPolicAsian" "pctPolicMinority" "officDrugUnits"
## [19] "numDiffDrugsSeiz" "policAveOT" "policCarsAvail"
## [22] "policOperBudget" "pctPolicPatrol" "gangUnit"
## [25] "policBudgetPerPop" "burglaries" "larcenies"
## [28] "autoTheft" "arsons"
NA가 있습니다# 범주형 변수 골라내기
categorical.variable <- c()
i <- 0
while(i<ncol(data)){
i <- i + 1
if(is.factor(data[,i])==TRUE) {
categorical.variable <- c(categorical.variable,i)
if(length(categorical.variable)<=20){
cat(paste(colnames(data)[i]),"is categorical", sep=" ","\n")
}
}
}
## communityname is categorical
## state is categorical
cat(paste("Thera are",length(categorical.variable),"Categorical Variables"),"\n") # 2개의 변수(communityname, state)이 범주형 변수
## Thera are 2 Categorical Variables
# 숫자형 변수 골라내기
numeric.variable <- c()
i <- 0
while(i<ncol(data)){
i <- i + 1
if(is.numeric(data[,i])==TRUE) {
numeric.variable <- c(numeric.variable,i)
if(length(numeric.variable)<=20){
cat(paste(colnames(data)[i]),"is numeric", sep=" ","\n")
}
}
}
## countyCode is numeric
## communityCode is numeric
## pop is numeric
## perHoush is numeric
## pctBlack is numeric
## pctWhite is numeric
## pctAsian is numeric
## pctHisp is numeric
## pct12.21 is numeric
## pct12.29 is numeric
## pct16.24 is numeric
## pct65up is numeric
## persUrban is numeric
## pctUrban is numeric
## medIncome is numeric
## pctWwage is numeric
## pctWfarm is numeric
## pctWdiv is numeric
## pctWsocsec is numeric
## pctPubAsst is numeric
cat(paste("Thera are",length(numeric.variable),"Numeric Variables"),"\n") # 130개 숫자형 변수
## Thera are 130 Numeric Variables
## 종속변수가 `정규분포`를 따르는가
y <- data$violentPerPop
hist(sqrt(y))
hist(log(y))
## fitting y with theoretical normal distribution
n <- length(y)
i <- 1:n
names(fitdistr(y, "normal"))
## [1] "estimate" "sd" "vcov" "n" "loglik"
fitdistr(y, "normal")$estimate[1]
## mean
## 584.5203
qnorm.log.rivers <- qnorm((i-0.5)/n,fitdistr(y,"normal")$estimate[1],fitdistr(y,"normal")$estimate[2])
plot(qnorm.log.rivers, sort(y), col="blue")
abline(0,1,col="green", lty=2) # does not follow normal distribution
y 와 정규분포가 잘 맞지 않는다
## `boxcox transformation` : Computes and optionally plots profile log-likelihoods for the parameter of the Box-Cox power transformation.
y <- violentPerPop[violentPerPop!=0]
x <- pop[violentPerPop!=0]
lm.fit <- lm(y~x)
boxcox(lm.fit, lambda = seq(-2, 2, 1/10), plotit=TRUE)
boxcox.transformation <- boxcox(lm.fit, lambda = seq(-2, 2, 1/10), plotit=TRUE)
names(boxcox.transformation)
## [1] "x" "y"
which.max(boxcox.transformation$y)
## [1] 54
p <- boxcox.transformation$x[which.max(boxcox.transformation$y)]
## `p` maximizes log likelihood of normal distribution
par(mfrow=c(2,1))
hist(y)
hist(y^p)
## 정규분포 확률변수와 같이 그려보자
fitdistr(y^p, densfun="normal") # 정규분포에 적합시켰을 때 평균과 표준편차
## mean sd
## 2.312755842 0.353574949
## (0.008126549) (0.005746338)
i <- seq(min(y^p),max(y^p),0.2)
dnorm.var <- dnorm(i, mean=fitdistr(y^p, densfun="normal")$estimate[1], sd=fitdistr(y^p, densfun="normal")$estimate[2])
dnorm.var <- dnorm.var*0.2 #
sum(dnorm.var) # 1과 거의 가까워진다.
## [1] 0.9982345
## 상대도수로 히스토그램 그리기
h <- hist(y^p, plot=FALSE)
h$counts <- h$counts/sum(h$counts)
## 참조 : legend, points w/ type="l"
plot(h, freq=TRUE, ylab="Relative Frequency", main="Transformation of Y alongside w/ Theoretical Normal Distr.",
sub = paste("Theoretical Norm. Dist.n w/ mean =",round(fitdistr(y^p,densfun="normal")$estimate[1],4), "sd =", round(fitdistr(y^p,densfun="normal")$estimate[2],4)))
points(i,dnorm.var, type="l", col="green", lty=2)
legend("topleft", legend=c(paste("p=",round(p,4)),"Theo Norm. Distr"), col=c("black","green"), lty=1:2, cex=0.8)
## Draw qqline : 거의 일직선 그리네 ㄱㅇㄷ
i <- seq(0,1,length=length(y^p))
qnorm.var <- qnorm(i, mean=fitdistr(y^p, densfun="normal")$estimate[1], sd=fitdistr(y^p, densfun="normal")$estimate[2])
plot(qnorm.var, sort(y^p), col="blue", cex=1.1)
abline(0,1,col="green", lty=2)
Y Transformtion으로 설명력은 잃을 것. p=1.4141