ESC_18FALL_Final Project_Transformation of Y

## Select CRAN mirror
(r = getOption("repos"))

##                                                       CRAN 
## "https://mran.revolutionanalytics.com/snapshot/2016-11-01" 
##                                                  CRANextra 
##                       "http://www.stats.ox.ac.uk/pub/RWin"

r["CRAN"] = "http://cran.seoul.go.kr"
options(repos = r)
rm(r)

options(repos=r) : Rmarkdown이 한동안 오류로 실행되지 않았는데, 잘못된 CRAN에서 패키지를 깔고 있어서였다.. CRAN mirror가 알고보니 굉장히 중요했던 것.. * 위의 코드는 R 스크립트를 실행하기 전 CRAN mirror를 미리 설정하는 코드입니다. 위와 같이 설정해도 되고, 세션에서 말씀드렸던 chooseCRANmirror(ind=51)을 사용하셔도 됩니다.

## Import Data
working.directory <- "C:/Users/iihsk/Desktop/SeongJin Kim/6. ESC/ESC 회장 문서모음/1. 회장 업무/ESC 2018-2/18-2 정규세션/Fall 18'_ Final Project/final project"
setwd(working.directory)
data <- read.csv("train.csv", header=TRUE, stringsAsFactors=TRUE, na.strings=c("NA","NaN", "?"))
dim(data)

## [1] 1894  132

na.strings=c("NA","NaN","?") : 본래 데이터를 불러오는 chunk는 보여드리지 않지만, 결측치(Missing Values)가 물음표로 인코딩 되어있었음. na.strings를 이용해 결측치로 불러옴

## Columns with NA values
colnames(data)[colSums(is.na(data))!=0]

##  [1] "countyCode"        "communityCode"     "otherPerCap"      
##  [4] "numPolice"         "policePerPop"      "policeField"      
##  [7] "policeFieldPerPop" "policeCalls"       "policCallPerPop"  
## [10] "policCallPerOffic" "policePerPop2"     "racialMatch"      
## [13] "pctPolicWhite"     "pctPolicBlack"     "pctPolicHisp"     
## [16] "pctPolicAsian"     "pctPolicMinority"  "officDrugUnits"   
## [19] "numDiffDrugsSeiz"  "policAveOT"        "policCarsAvail"   
## [22] "policOperBudget"   "pctPolicPatrol"    "gangUnit"         
## [25] "policBudgetPerPop" "burglaries"        "larcenies"        
## [28] "autoTheft"         "arsons"

위의 변수들은 NA가 있습니다

# 범주형 변수 골라내기
categorical.variable <- c()
i <- 0
while(i<ncol(data)){
    i <- i + 1
    if(is.factor(data[,i])==TRUE) {
    categorical.variable <- c(categorical.variable,i)
        if(length(categorical.variable)<=20){
        cat(paste(colnames(data)[i]),"is categorical", sep=" ","\n")
        }
    }
}

## communityname is categorical 
## state is categorical

cat(paste("Thera are",length(categorical.variable),"Categorical Variables"),"\n") # 2개의 변수(communityname, state)이 범주형 변수

## Thera are 2 Categorical Variables

# 숫자형 변수 골라내기
numeric.variable <- c()
i <- 0
while(i<ncol(data)){
    i <- i + 1
    if(is.numeric(data[,i])==TRUE) {
    numeric.variable <- c(numeric.variable,i)
        if(length(numeric.variable)<=20){
        cat(paste(colnames(data)[i]),"is numeric", sep=" ","\n")
        }
    }
}

## countyCode is numeric 
## communityCode is numeric 
## pop is numeric 
## perHoush is numeric 
## pctBlack is numeric 
## pctWhite is numeric 
## pctAsian is numeric 
## pctHisp is numeric 
## pct12.21 is numeric 
## pct12.29 is numeric 
## pct16.24 is numeric 
## pct65up is numeric 
## persUrban is numeric 
## pctUrban is numeric 
## medIncome is numeric 
## pctWwage is numeric 
## pctWfarm is numeric 
## pctWdiv is numeric 
## pctWsocsec is numeric 
## pctPubAsst is numeric

cat(paste("Thera are",length(numeric.variable),"Numeric Variables"),"\n") # 130개 숫자형 변수

## Thera are 130 Numeric Variables

## 종속변수가 `정규분포`를 따르는가
y <- data$violentPerPop
hist(sqrt(y))

hist(log(y))

## fitting y with theoretical normal distribution
n <- length(y)
i <- 1:n
names(fitdistr(y, "normal"))

## [1] "estimate" "sd"       "vcov"     "n"        "loglik"

fitdistr(y, "normal")$estimate[1]

##     mean 
## 584.5203

qnorm.log.rivers <- qnorm((i-0.5)/n,fitdistr(y,"normal")$estimate[1],fitdistr(y,"normal")$estimate[2])
plot(qnorm.log.rivers, sort(y), col="blue")
abline(0,1,col="green", lty=2)  # does not follow normal distribution

y 와 정규분포가 잘 맞지 않는다

## `boxcox transformation` : Computes and optionally plots profile log-likelihoods for the parameter of the Box-Cox power transformation.
y <- violentPerPop[violentPerPop!=0]
x <- pop[violentPerPop!=0]
lm.fit <- lm(y~x)
boxcox(lm.fit, lambda = seq(-2, 2, 1/10), plotit=TRUE)
boxcox.transformation <- boxcox(lm.fit, lambda = seq(-2, 2, 1/10), plotit=TRUE)

names(boxcox.transformation)

## [1] "x" "y"

which.max(boxcox.transformation$y)

## [1] 54

p <- boxcox.transformation$x[which.max(boxcox.transformation$y)]

## `p` maximizes log likelihood of normal distribution
par(mfrow=c(2,1))
hist(y)
hist(y^p)

## 정규분포 확률변수와 같이 그려보자
fitdistr(y^p, densfun="normal") # 정규분포에 적합시켰을 때 평균과 표준편차

##       mean           sd     
##   2.312755842   0.353574949 
##  (0.008126549) (0.005746338)

i <- seq(min(y^p),max(y^p),0.2)
dnorm.var <- dnorm(i,  mean=fitdistr(y^p, densfun="normal")$estimate[1], sd=fitdistr(y^p, densfun="normal")$estimate[2])
dnorm.var <- dnorm.var*0.2 # 
sum(dnorm.var) # 1과 거의 가까워진다.

## [1] 0.9982345

## 상대도수로 히스토그램 그리기
h <- hist(y^p, plot=FALSE)
h$counts <- h$counts/sum(h$counts)

## 참조 : legend, points w/ type="l"
plot(h, freq=TRUE, ylab="Relative Frequency", main="Transformation of Y alongside w/ Theoretical Normal Distr.",
    sub = paste("Theoretical Norm. Dist.n w/ mean =",round(fitdistr(y^p,densfun="normal")$estimate[1],4), "sd =", round(fitdistr(y^p,densfun="normal")$estimate[2],4)))
points(i,dnorm.var, type="l", col="green", lty=2)
legend("topleft", legend=c(paste("p=",round(p,4)),"Theo Norm. Distr"), col=c("black","green"), lty=1:2, cex=0.8)

## Draw qqline : 거의 일직선 그리네 ㄱㅇㄷ
i <- seq(0,1,length=length(y^p))
qnorm.var <- qnorm(i,  mean=fitdistr(y^p, densfun="normal")$estimate[1], sd=fitdistr(y^p, densfun="normal")$estimate[2])
plot(qnorm.var, sort(y^p), col="blue", cex=1.1)
abline(0,1,col="green", lty=2)

Y Transformtion으로 설명력은 잃을 것. p=1.4141

ESC_18FALL_Final Project_Transformation of Y

SeongJin Kim

2018년 11월 20일