working.directory<-"C:/Users/iihsk/Desktop/SeongJin Kim/1. Yonsei University/6. 2018 SPRING/4. Exploratory Data Analysis/3. Assignments/Assignment 3"
setwd(working.directory)
getwd()
## [1] "C:/Users/iihsk/Desktop/SeongJin Kim/1. Yonsei University/6. 2018 SPRING/4. Exploratory Data Analysis/3. Assignments/Assignment 3"

1. Make it symmetry the ‘airquality’ dataset. Compare between symmetric and assymmetric results and evaluate if the result is satisfiable. Try additional transformation if neccessary.

summary(airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
attach(airquality)
summary(Ozone)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   18.00   31.50   42.13   63.25  168.00      37
z<-fivenum(Ozone)
skew.before<-((z[4]-z[3])-(z[3]-z[2]))/((z[4]-z[3])+(z[3]-z[2]))
skew.before
## [1] 0.4065934
Skewness=0.4065934. So data is skewed to the right. Let’s check on Ozone’s boxplot
boxplot(Ozone)

1) With Boxplot of Ozone, we can visualy verify data’s skewness to right

  1. Let’s check on the definition of each variables in dataset ‘airquality’

  2. Link : dataset ’airquality

  3. ‘airquality’ data in NY measured by Ozone and solar radiation.

  4. Variables

    1. Ozone : Mean ozone concentration in parts per billion from 1300 to 1500 hours at Roosevelt Island(Island between Manhatten and Queens)
    2. Solar.R : Solar radiation in frequency band 4000-7700 Angstroms from 8am ~ 12pm at Central Park
    3. Wind : Average wind speed measured at LaGuardia Airtport, Long Islands
    4. Temp : Maximum daily temperature in degrees Fahrenheit at La Guardia Airport.
  5. Skewness is initially 0.41 -> skewed to the right

  6. Ozone data is skewed to the right. So we have to apply square root transformation

par(mfrow=c(1,2))
boxplot(-Ozone^(-0.5))
boxplot(-Ozone^(-1))

* First, we carry out Power Transformation. Because Ozone is skewed to the right we should conduct transformation with powers that are at right side of X (for example 0.5, log(X), -0.5, -1 etc)

Ozone1<-Ozone^(0.5)
Ozone2<-log(Ozone)
Ozone3<--Ozone^(-0.5)
Ozone4<--Ozone^(-1)
a1<-fivenum(Ozone1)
a2<-fivenum(Ozone2)
a3<-fivenum(Ozone3)
a4<-fivenum(Ozone4)
skewness.Ozone1<-((a1[4]-a1[3])-(a1[3]-a1[2]))/((a1[4]-a1[3])+(a1[3]-a1[2]))
skewness.Ozone2<-((a2[4]-a2[3])-(a2[3]-a2[2]))/((a2[4]-a2[3])+(a2[3]-a2[2]))
skewness.Ozone3<-((a3[4]-a3[3])-(a3[3]-a3[2]))/((a3[4]-a3[3])+(a3[3]-a3[2]))
skewness.Ozone4<-((a4[4]-a4[3])-(a4[3]-a4[2]))/((a4[4]-a4[3])+(a4[3]-a4[2]))
c(skewness.Ozone1,skewness.Ozone2, skewness.Ozone3, skewness.Ozone4)
## [1]  0.26480211  0.11236981 -0.04368407 -0.19585971
  • We can conclude by comparing skewness of above transformations, -Ozone(-1) obtained best symmetry.
p<-1-2*z[3]*((z[4]-z[3])+(z[2]-z[3]))/((z[4]-z[3])^2+(z[2]-z[3])^2)
p
## [1] 0.03378238
Ozone3<-Ozone^(p)
b<-fivenum(Ozone3)
skewness.Ozone.p<-((b[4]-b[3])-(b[3]-b[2]))/((b[4]-b[3])+(b[3]-b[2]))
skewness.Ozone.p
## [1] 0.1228674
  • Perfomring taylor expansion with f(x)=x^p and applying x=upper hinge, x=lower hinge would yeild approximate value of p such that when x^p skewness=0.
  • The result shows, square root x obtains better skewness than x^p. So we will stick with -Ozone^(-1)
Final Results
par(mfrow=c(1,2))
stem(Ozone3)
## 
##   The decimal point is 2 digit(s) to the left of the |
## 
##   100 | 0
##   101 | 
##   102 | 
##   103 | 
##   104 | 8
##   105 | 
##   106 | 2888
##   107 | 3777
##   108 | 144488
##   109 | 111133338888
##   110 | 3333577778888
##   111 | 0222222338999
##   112 | 022344478899
##   113 | 0022346667789
##   114 | 0113889
##   115 | 0111256688999
##   116 | 0112245777
##   117 | 12456
##   118 | 09
boxplot(Ozone3, main="Boxplot of -Ozone^(-1)", col="bisque")
skewness.Ozone3
## [1] -0.04368407

2. Symmetrizing Re-expression with Samsung Electronics Stock Prices.

dataQ2<-read.csv("Samsung Electronics Stock Prices 2017.csv", header=T)
colnames(dataQ2)<-c("Y.M.D", "StockPrice")
str(dataQ2)   #second column is factor with character variables
## 'data.frame':    243 obs. of  2 variables:
##  $ Y.M.D     : Factor w/ 243 levels "2017/01/02","2017/01/03",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ StockPrice: Factor w/ 208 levels "1,778,000","1,805,000",..: 2 5 3 1 4 10 11 22 27 12 ...
StockPrice<-as.numeric(dataQ2[,2])
f<-fivenum(StockPrice)
f
## [1]   1.0  54.5 105.0 154.5 208.0
skewness.stock<-((f[4]-f[3])-(f[3]-f[2]))/((f[4]-f[3])+(f[3]-f[2]))
skewness.stock
## [1] -0.01
boxplot(StockPrice)

1-2*f[3]*((f[4]-f[3])+(f[2]-f[3]))/((f[4]-f[3])^2+(f[2]-f[3])^2)
## [1] 1.041996
StockPrice2<-StockPrice^(1.041996)
g<-fivenum(StockPrice2)
skewness.stock2<-((g[4]-g[3])-(g[3]-g[2]))/((g[4]-g[3])+(g[3]-g[2]))
skewness.stock2
## [1] 0.0004440838
boxplot(StockPrice2)

* Boxplot of StockPrice seems pretty symmetric too. So now I tried to draw stem-and-leaf plot

par(mfrow=c(1,2))
stem(StockPrice, scale =0.5)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##    0 | 12345678901234567899
##    2 | 0123445678901234556789
##    4 | 012223456778901233455678899
##    6 | 012345678901223456789
##    8 | 0123456789012234556788999
##   10 | 001234567890001123445677899
##   12 | 0123456789901234456789
##   14 | 01123456789012234456789
##   16 | 00112345678901234567789
##   18 | 0123456778901123456789
##   20 | 01234445678
stem(StockPrice2)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##    0 | 12345689012346789
##    2 | 0223456779012356789
##    4 | 11234578999023455689
##    6 | 01334556899001245679
##    8 | 01245667901245679
##   10 | 0124567901124556899
##   12 | 00011345689013444557899
##   14 | 0233455789123467889
##   16 | 12355678012445679
##   18 | 01345688900234578899
##   20 | 123467801245679
##   22 | 0013456890233467889
##   24 | 12356790124555689
##   26 | 0
stem(StockPrice, scale=0.25)
## 
##   The decimal point is 2 digit(s) to the right of the |
## 
##   0 | 0000111111111122222222222233333333334444444444444
##   0 | 555555555555666666666666677777777777888888888899999999999
##   1 | 0000000000000001111111111111122222222222233333333333344444444444
##   1 | 555555555555666666666666777777777788888888888999999999999
##   2 | 0000000000001111

3. Severe Idiopathic Respiratory Distress Syndrom

  1. Draw boxplot, stem-and-leaf plot regardless of infant’s survival. Conduct variance-stablizing re-expression and compare it with previous results.
dataQ3<-read.delim("DISTRESS.DAT", header=F)
str(dataQ3)   #str() reveals 3 functions in dataQ3
## 'data.frame':    10 obs. of  5 variables:
##  $ V1: Factor w/ 10 levels "1.050*","1.175*",..: 1 2 3 4 5 6 7 8 9 10
##  $ V2: Factor w/ 10 levels "1.030*","1.100*",..: 10 1 2 3 4 5 6 7 8 9
##  $ V3: Factor w/ 10 levels "1.130","1.575",..: 4 5 6 7 8 9 10 1 2 3
##  $ V4: num  1.76 1.93 2.02 2.09 2.6 ...
##  $ V5: num  2.83 1.41 1.72 1.72 2.04 ...
attach(dataQ3)
dataQ3.char<-c(
as.character(V1),
as.character(V2),
as.character(V3),
as.character(V4),
as.character(V5))
dataQ3.modified<-chartr("*","a",dataQ3.char) 
location.death<-grep("a", dataQ3.modified)
sad<-dataQ3.modified[location.death]
death<-as.numeric(sub("a","",sad))
survived<-as.numeric(dataQ3.modified[-location.death])
total <- c(death,survived)
total
##  [1] 1.050 1.175 1.230 1.310 1.500 1.600 1.720 1.750 1.770 2.275 2.500
## [12] 1.030 1.100 1.185 1.225 1.262 1.295 1.300 1.550 1.820 1.890 1.940
## [23] 2.200 2.270 2.440 2.560 2.730 1.130 1.575 1.680 1.760 1.930 2.015
## [34] 2.090 2.600 2.700 2.950 3.160 3.400 3.640 2.830 1.410 1.715 1.720
## [45] 2.040 2.200 2.400 2.550 2.570 3.005
par(mfrow=c(1,2))
stem(total)
## 
##   The decimal point is at the |
## 
##   1 | 0111222233334
##   1 | 566677778888999
##   2 | 001223344
##   2 | 56666778
##   3 | 0024
##   3 | 6
boxplot(total, main="Boxplot of total data")

fn<-fivenum(total)
skew<-((fn[4]-fn[3])-(fn[3]-fn[2]))/((fn[4]-fn[3])+(fn[3]-fn[2]))
p<-1-2*fn[3]*((fn[4]-fn[3])+(fn[2]-fn[3]))/((fn[4]-fn[3])^2+(fn[2]-fn[3])^2)
par(mfrow=c(1,2))
stem(-total^(-p))
## 
##   The decimal point is 1 digit(s) to the left of the |
## 
##   -13 | 1
##   -12 | 9765
##   -12 | 4332222100
##   -11 | 998876655
##   -11 | 433322221000
##   -10 | 976665
##   -10 | 44433211
boxplot(-total^(-p))

  1. Separate data of dead infant and survived infant. Conduct variance stablization re-expression and compare between two subsets. What is the threshold of weight to believe infant lighter than some degree of weight is more likely to die.
survival<-as.factor(c(rep("death", length(death)), rep("survived", length(survived))))
arranged.data<-data.frame(survival, total)
arranged.data
##    survival total
## 1     death 1.050
## 2     death 1.175
## 3     death 1.230
## 4     death 1.310
## 5     death 1.500
## 6     death 1.600
## 7     death 1.720
## 8     death 1.750
## 9     death 1.770
## 10    death 2.275
## 11    death 2.500
## 12    death 1.030
## 13    death 1.100
## 14    death 1.185
## 15    death 1.225
## 16    death 1.262
## 17    death 1.295
## 18    death 1.300
## 19    death 1.550
## 20    death 1.820
## 21    death 1.890
## 22    death 1.940
## 23    death 2.200
## 24    death 2.270
## 25    death 2.440
## 26    death 2.560
## 27    death 2.730
## 28 survived 1.130
## 29 survived 1.575
## 30 survived 1.680
## 31 survived 1.760
## 32 survived 1.930
## 33 survived 2.015
## 34 survived 2.090
## 35 survived 2.600
## 36 survived 2.700
## 37 survived 2.950
## 38 survived 3.160
## 39 survived 3.400
## 40 survived 3.640
## 41 survived 2.830
## 42 survived 1.410
## 43 survived 1.715
## 44 survived 1.720
## 45 survived 2.040
## 46 survived 2.200
## 47 survived 2.400
## 48 survived 2.550
## 49 survived 2.570
## 50 survived 3.005
boxplot(total~survival, data=arranged.data)

attach(arranged.data)
## The following objects are masked _by_ .GlobalEnv:
## 
##     survival, total
dead<-subset(arranged.data, subset=survival=='death')
survived<-subset(arranged.data, subset=survival=='survived')
fnA<-fivenum(dead[,2])
fnB<-fivenum(survived[,2])

HsprA<-fnA[4]-fnA[2]
HsprB<-fnB[4]-fnB[2]
Hspr<-c(HsprA, HsprB)
M<-c(fnA[3], fnB[3])

plot(log(M), log(Hspr), type="l")

(RegLine<-lm(log(Hspr) ~ log(M)))
## 
## Call:
## lm(formula = log(Hspr) ~ log(M))
## 
## Coefficients:
## (Intercept)       log(M)  
##     -0.5157       0.6854
1-coef(RegLine)[2]
##    log(M) 
## 0.3145712
p<-0.3145712
boxplot(total^(0.3145712)~survival, data=arranged.data)

attach(arranged.data)
## The following objects are masked _by_ .GlobalEnv:
## 
##     survival, total
## The following objects are masked from arranged.data (pos = 3):
## 
##     survival, total
fn<-fivenum(total^(0.3145712))
((fn[4]-fn[3])-(fn[3]-fn[2]))/((fn[4]-fn[3])+(fn[3]-fn[2]))
## [1] 0.0873964
median(survived[,2])
## [1] 2.2

4. Plasma Citrate Concentration

dataQ4<-read.delim("PLASMA.DAT", header=T)
str(dataQ4)
## 'data.frame':    9 obs. of  5 variables:
##  $ X93   : int  116 125 144 105 109 89 116 151 137
##  $ X121  : int  135 137 173 119 83 95 128 149 139
##  $ X112  : int  114 119 148 125 109 88 122 141 125
##  $ X117  : int  98 105 124 91 80 91 107 126 109
##  $ X121.1: int  135 102 122 133 104 116 119 138 107
boxplot(dataQ4)

* Comparing between each boxplots is difficult due to significant diffrence in spread.

A. To achieve unique representative of location, conduct symmetrizing re-expression first
attach(dataQ4)
p.vector<-c()
for (i in 1:ncol(dataQ4)){
  g<-fivenum(dataQ4[,i])
  p<-1-2*g[3]*((g[4]-g[3])+(g[2]-g[3]))/((g[4]-g[3])^2+(g[2]-g[3])^2)
  p.vector[i]<-p
}
p.vector
## [1] -5.628571 12.911765 17.712329 10.905660 -0.400000
new.dataQ4<-rbind(dataQ4[,1]^(p.vector[1]),dataQ4[,2]^(p.vector[2]),dataQ4[,3]^(p.vector[3]),dataQ4[,4]^(p.vector[4]),dataQ4[,5]^(p.vector[5]))
boxplot(new.dataQ4)

* Usually symmetry and similar spread is achieved simultaneously.

fnA<-fivenum(dataQ4[,1])
fnB<-fivenum(dataQ4[,2])
fnC<-fivenum(dataQ4[,3])
fnD<-fivenum(dataQ4[,4])
fnE<-fivenum(dataQ4[,5])
HsprA<-fnA[4]-fnA[2]
HsprB<-fnB[4]-fnB[2]
HsprC<-fnC[4]-fnC[2]
HsprD<-fnD[4]-fnD[2]
HsprE<-fnE[4]-fnE[2]
Hspr<-c(HsprA, HsprB, HsprC, HsprD, HsprE)
M<-c(fnA[3], fnB[3], fnC[3], fnD[3], fnE[3])
(RegLine<-lm(log(Hspr) ~ log(M)))
## 
## Call:
## lm(formula = log(Hspr) ~ log(M))
## 
## Coefficients:
## (Intercept)       log(M)  
##       4.471       -0.313
1-coef(RegLine)[2]
##   log(M) 
## 1.312961
  • Error in executing in Markdown: plot(log(M), log(Hspr)) Plotting log(M), log(Hspr)

  • Error in executing Markdown : par(mfrow=c(1,2)) / boxplot(dataQ4) / boxplot(dataQ4^(1.5)) Boxplot ~ Time deviation

5. Visually demonstrate that log function has intermediate transformation effect by drawing square root of x, cube root of x, reciprocal square root of x, reciprocal cube root of x

curve(x^(1/2), col="red", xlim=c(0,1),ylim=c(-5,2))
curve(x^(1/3),col="orange", add=TRUE)
curve(log(x), col="green", lwd=3, add=TRUE)
curve(-x^(-1/3), col="blue", add=TRUE)
curve(-x^(-1/2), col="purple", add=TRUE)