working.directory<-"C:/Users/iihsk/Desktop/SeongJin Kim/1. Yonsei University/6. 2018 SPRING/4. Exploratory Data Analysis/3. Assignments/Assignment 3"
setwd(working.directory)
getwd()
## [1] "C:/Users/iihsk/Desktop/SeongJin Kim/1. Yonsei University/6. 2018 SPRING/4. Exploratory Data Analysis/3. Assignments/Assignment 3"
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
attach(airquality)
summary(Ozone)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 18.00 31.50 42.13 63.25 168.00 37
z<-fivenum(Ozone)
skew.before<-((z[4]-z[3])-(z[3]-z[2]))/((z[4]-z[3])+(z[3]-z[2]))
skew.before
## [1] 0.4065934
boxplot(Ozone)
1) With Boxplot of Ozone, we can visualy verify data’s skewness to right
Let’s check on the definition of each variables in dataset ‘airquality’
Link : dataset ’airquality
‘airquality’ data in NY measured by Ozone and solar radiation.
Variables
Skewness is initially 0.41 -> skewed to the right
Ozone data is skewed to the right. So we have to apply square root transformation
par(mfrow=c(1,2))
boxplot(-Ozone^(-0.5))
boxplot(-Ozone^(-1))
* First, we carry out Power Transformation. Because Ozone is skewed to the right we should conduct transformation with powers that are at right side of X (for example 0.5, log(X), -0.5, -1 etc)
Ozone1<-Ozone^(0.5)
Ozone2<-log(Ozone)
Ozone3<--Ozone^(-0.5)
Ozone4<--Ozone^(-1)
a1<-fivenum(Ozone1)
a2<-fivenum(Ozone2)
a3<-fivenum(Ozone3)
a4<-fivenum(Ozone4)
skewness.Ozone1<-((a1[4]-a1[3])-(a1[3]-a1[2]))/((a1[4]-a1[3])+(a1[3]-a1[2]))
skewness.Ozone2<-((a2[4]-a2[3])-(a2[3]-a2[2]))/((a2[4]-a2[3])+(a2[3]-a2[2]))
skewness.Ozone3<-((a3[4]-a3[3])-(a3[3]-a3[2]))/((a3[4]-a3[3])+(a3[3]-a3[2]))
skewness.Ozone4<-((a4[4]-a4[3])-(a4[3]-a4[2]))/((a4[4]-a4[3])+(a4[3]-a4[2]))
c(skewness.Ozone1,skewness.Ozone2, skewness.Ozone3, skewness.Ozone4)
## [1] 0.26480211 0.11236981 -0.04368407 -0.19585971
p<-1-2*z[3]*((z[4]-z[3])+(z[2]-z[3]))/((z[4]-z[3])^2+(z[2]-z[3])^2)
p
## [1] 0.03378238
Ozone3<-Ozone^(p)
b<-fivenum(Ozone3)
skewness.Ozone.p<-((b[4]-b[3])-(b[3]-b[2]))/((b[4]-b[3])+(b[3]-b[2]))
skewness.Ozone.p
## [1] 0.1228674
par(mfrow=c(1,2))
stem(Ozone3)
##
## The decimal point is 2 digit(s) to the left of the |
##
## 100 | 0
## 101 |
## 102 |
## 103 |
## 104 | 8
## 105 |
## 106 | 2888
## 107 | 3777
## 108 | 144488
## 109 | 111133338888
## 110 | 3333577778888
## 111 | 0222222338999
## 112 | 022344478899
## 113 | 0022346667789
## 114 | 0113889
## 115 | 0111256688999
## 116 | 0112245777
## 117 | 12456
## 118 | 09
boxplot(Ozone3, main="Boxplot of -Ozone^(-1)", col="bisque")
skewness.Ozone3
## [1] -0.04368407
dataQ2<-read.csv("Samsung Electronics Stock Prices 2017.csv", header=T)
colnames(dataQ2)<-c("Y.M.D", "StockPrice")
str(dataQ2) #second column is factor with character variables
## 'data.frame': 243 obs. of 2 variables:
## $ Y.M.D : Factor w/ 243 levels "2017/01/02","2017/01/03",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ StockPrice: Factor w/ 208 levels "1,778,000","1,805,000",..: 2 5 3 1 4 10 11 22 27 12 ...
StockPrice<-as.numeric(dataQ2[,2])
f<-fivenum(StockPrice)
f
## [1] 1.0 54.5 105.0 154.5 208.0
skewness.stock<-((f[4]-f[3])-(f[3]-f[2]))/((f[4]-f[3])+(f[3]-f[2]))
skewness.stock
## [1] -0.01
boxplot(StockPrice)
1-2*f[3]*((f[4]-f[3])+(f[2]-f[3]))/((f[4]-f[3])^2+(f[2]-f[3])^2)
## [1] 1.041996
StockPrice2<-StockPrice^(1.041996)
g<-fivenum(StockPrice2)
skewness.stock2<-((g[4]-g[3])-(g[3]-g[2]))/((g[4]-g[3])+(g[3]-g[2]))
skewness.stock2
## [1] 0.0004440838
boxplot(StockPrice2)
* Boxplot of StockPrice seems pretty symmetric too. So now I tried to draw stem-and-leaf plot
par(mfrow=c(1,2))
stem(StockPrice, scale =0.5)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 0 | 12345678901234567899
## 2 | 0123445678901234556789
## 4 | 012223456778901233455678899
## 6 | 012345678901223456789
## 8 | 0123456789012234556788999
## 10 | 001234567890001123445677899
## 12 | 0123456789901234456789
## 14 | 01123456789012234456789
## 16 | 00112345678901234567789
## 18 | 0123456778901123456789
## 20 | 01234445678
stem(StockPrice2)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 0 | 12345689012346789
## 2 | 0223456779012356789
## 4 | 11234578999023455689
## 6 | 01334556899001245679
## 8 | 01245667901245679
## 10 | 0124567901124556899
## 12 | 00011345689013444557899
## 14 | 0233455789123467889
## 16 | 12355678012445679
## 18 | 01345688900234578899
## 20 | 123467801245679
## 22 | 0013456890233467889
## 24 | 12356790124555689
## 26 | 0
stem(StockPrice, scale=0.25)
##
## The decimal point is 2 digit(s) to the right of the |
##
## 0 | 0000111111111122222222222233333333334444444444444
## 0 | 555555555555666666666666677777777777888888888899999999999
## 1 | 0000000000000001111111111111122222222222233333333333344444444444
## 1 | 555555555555666666666666777777777788888888888999999999999
## 2 | 0000000000001111
dataQ3<-read.delim("DISTRESS.DAT", header=F)
str(dataQ3) #str() reveals 3 functions in dataQ3
## 'data.frame': 10 obs. of 5 variables:
## $ V1: Factor w/ 10 levels "1.050*","1.175*",..: 1 2 3 4 5 6 7 8 9 10
## $ V2: Factor w/ 10 levels "1.030*","1.100*",..: 10 1 2 3 4 5 6 7 8 9
## $ V3: Factor w/ 10 levels "1.130","1.575",..: 4 5 6 7 8 9 10 1 2 3
## $ V4: num 1.76 1.93 2.02 2.09 2.6 ...
## $ V5: num 2.83 1.41 1.72 1.72 2.04 ...
attach(dataQ3)
dataQ3.char<-c(
as.character(V1),
as.character(V2),
as.character(V3),
as.character(V4),
as.character(V5))
dataQ3.modified<-chartr("*","a",dataQ3.char)
location.death<-grep("a", dataQ3.modified)
sad<-dataQ3.modified[location.death]
death<-as.numeric(sub("a","",sad))
survived<-as.numeric(dataQ3.modified[-location.death])
total <- c(death,survived)
total
## [1] 1.050 1.175 1.230 1.310 1.500 1.600 1.720 1.750 1.770 2.275 2.500
## [12] 1.030 1.100 1.185 1.225 1.262 1.295 1.300 1.550 1.820 1.890 1.940
## [23] 2.200 2.270 2.440 2.560 2.730 1.130 1.575 1.680 1.760 1.930 2.015
## [34] 2.090 2.600 2.700 2.950 3.160 3.400 3.640 2.830 1.410 1.715 1.720
## [45] 2.040 2.200 2.400 2.550 2.570 3.005
par(mfrow=c(1,2))
stem(total)
##
## The decimal point is at the |
##
## 1 | 0111222233334
## 1 | 566677778888999
## 2 | 001223344
## 2 | 56666778
## 3 | 0024
## 3 | 6
boxplot(total, main="Boxplot of total data")
fn<-fivenum(total)
skew<-((fn[4]-fn[3])-(fn[3]-fn[2]))/((fn[4]-fn[3])+(fn[3]-fn[2]))
p<-1-2*fn[3]*((fn[4]-fn[3])+(fn[2]-fn[3]))/((fn[4]-fn[3])^2+(fn[2]-fn[3])^2)
par(mfrow=c(1,2))
stem(-total^(-p))
##
## The decimal point is 1 digit(s) to the left of the |
##
## -13 | 1
## -12 | 9765
## -12 | 4332222100
## -11 | 998876655
## -11 | 433322221000
## -10 | 976665
## -10 | 44433211
boxplot(-total^(-p))
survival<-as.factor(c(rep("death", length(death)), rep("survived", length(survived))))
arranged.data<-data.frame(survival, total)
arranged.data
## survival total
## 1 death 1.050
## 2 death 1.175
## 3 death 1.230
## 4 death 1.310
## 5 death 1.500
## 6 death 1.600
## 7 death 1.720
## 8 death 1.750
## 9 death 1.770
## 10 death 2.275
## 11 death 2.500
## 12 death 1.030
## 13 death 1.100
## 14 death 1.185
## 15 death 1.225
## 16 death 1.262
## 17 death 1.295
## 18 death 1.300
## 19 death 1.550
## 20 death 1.820
## 21 death 1.890
## 22 death 1.940
## 23 death 2.200
## 24 death 2.270
## 25 death 2.440
## 26 death 2.560
## 27 death 2.730
## 28 survived 1.130
## 29 survived 1.575
## 30 survived 1.680
## 31 survived 1.760
## 32 survived 1.930
## 33 survived 2.015
## 34 survived 2.090
## 35 survived 2.600
## 36 survived 2.700
## 37 survived 2.950
## 38 survived 3.160
## 39 survived 3.400
## 40 survived 3.640
## 41 survived 2.830
## 42 survived 1.410
## 43 survived 1.715
## 44 survived 1.720
## 45 survived 2.040
## 46 survived 2.200
## 47 survived 2.400
## 48 survived 2.550
## 49 survived 2.570
## 50 survived 3.005
boxplot(total~survival, data=arranged.data)
attach(arranged.data)
## The following objects are masked _by_ .GlobalEnv:
##
## survival, total
dead<-subset(arranged.data, subset=survival=='death')
survived<-subset(arranged.data, subset=survival=='survived')
fnA<-fivenum(dead[,2])
fnB<-fivenum(survived[,2])
HsprA<-fnA[4]-fnA[2]
HsprB<-fnB[4]-fnB[2]
Hspr<-c(HsprA, HsprB)
M<-c(fnA[3], fnB[3])
plot(log(M), log(Hspr), type="l")
(RegLine<-lm(log(Hspr) ~ log(M)))
##
## Call:
## lm(formula = log(Hspr) ~ log(M))
##
## Coefficients:
## (Intercept) log(M)
## -0.5157 0.6854
1-coef(RegLine)[2]
## log(M)
## 0.3145712
p<-0.3145712
boxplot(total^(0.3145712)~survival, data=arranged.data)
attach(arranged.data)
## The following objects are masked _by_ .GlobalEnv:
##
## survival, total
## The following objects are masked from arranged.data (pos = 3):
##
## survival, total
fn<-fivenum(total^(0.3145712))
((fn[4]-fn[3])-(fn[3]-fn[2]))/((fn[4]-fn[3])+(fn[3]-fn[2]))
## [1] 0.0873964
median(survived[,2])
## [1] 2.2
dataQ4<-read.delim("PLASMA.DAT", header=T)
str(dataQ4)
## 'data.frame': 9 obs. of 5 variables:
## $ X93 : int 116 125 144 105 109 89 116 151 137
## $ X121 : int 135 137 173 119 83 95 128 149 139
## $ X112 : int 114 119 148 125 109 88 122 141 125
## $ X117 : int 98 105 124 91 80 91 107 126 109
## $ X121.1: int 135 102 122 133 104 116 119 138 107
boxplot(dataQ4)
* Comparing between each boxplots is difficult due to significant diffrence in spread.
attach(dataQ4)
p.vector<-c()
for (i in 1:ncol(dataQ4)){
g<-fivenum(dataQ4[,i])
p<-1-2*g[3]*((g[4]-g[3])+(g[2]-g[3]))/((g[4]-g[3])^2+(g[2]-g[3])^2)
p.vector[i]<-p
}
p.vector
## [1] -5.628571 12.911765 17.712329 10.905660 -0.400000
new.dataQ4<-rbind(dataQ4[,1]^(p.vector[1]),dataQ4[,2]^(p.vector[2]),dataQ4[,3]^(p.vector[3]),dataQ4[,4]^(p.vector[4]),dataQ4[,5]^(p.vector[5]))
boxplot(new.dataQ4)
* Usually symmetry and similar spread is achieved simultaneously.
fnA<-fivenum(dataQ4[,1])
fnB<-fivenum(dataQ4[,2])
fnC<-fivenum(dataQ4[,3])
fnD<-fivenum(dataQ4[,4])
fnE<-fivenum(dataQ4[,5])
HsprA<-fnA[4]-fnA[2]
HsprB<-fnB[4]-fnB[2]
HsprC<-fnC[4]-fnC[2]
HsprD<-fnD[4]-fnD[2]
HsprE<-fnE[4]-fnE[2]
Hspr<-c(HsprA, HsprB, HsprC, HsprD, HsprE)
M<-c(fnA[3], fnB[3], fnC[3], fnD[3], fnE[3])
(RegLine<-lm(log(Hspr) ~ log(M)))
##
## Call:
## lm(formula = log(Hspr) ~ log(M))
##
## Coefficients:
## (Intercept) log(M)
## 4.471 -0.313
1-coef(RegLine)[2]
## log(M)
## 1.312961
Error in executing in Markdown: plot(log(M), log(Hspr))
Error in executing Markdown : par(mfrow=c(1,2)) / boxplot(dataQ4) / boxplot(dataQ4^(1.5))
curve(x^(1/2), col="red", xlim=c(0,1),ylim=c(-5,2))
curve(x^(1/3),col="orange", add=TRUE)
curve(log(x), col="green", lwd=3, add=TRUE)
curve(-x^(-1/3), col="blue", add=TRUE)
curve(-x^(-1/2), col="purple", add=TRUE)