library(LearnEDAfunctions)
head(lake)
##   Area Depth  PH Wshed Hions
## 1   55    19 7.1   0.8 1e-07
## 2   26    14 6.1   0.3 8e-07
## 3 1065    36 7.6   6.3 0e+00
## 4  213    71 7.6   4.0 0e+00
## 5 1463    35 8.2  33.0 0e+00
## 6  180    24 7.1   5.0 1e-07

Let us see which two of these variables are not symmetric.

aplpack::stem.leaf(lake$Area)
## 1 | 2: represents 120
##  leaf unit: 10
##             n: 71
##    12    0* | 222223333344
##    26    0. | 55555778899999
##   (11)   1* | 01122223344
##    34    1. | 6777788889
##    24    2* | 0112233
##    17    2. | 8
##    16    3* | 0
##    15    3. | 5579
##    11    4* | 33
##          4. | 
##     9    5* | 3
## HI: 599 610 716 1065 1285 1352 1463 3585
aplpack::stem.leaf(lake$Depth)
## 1 | 2: represents 12
##  leaf unit: 1
##             n: 71
##    6    0. | 777999
##   15    1* | 011334444
##   26    1. | 55677777899
##   32    2* | 133444
##   (6)   2. | 556678
##   33    3* | 012233334
##   24    3. | 5667899
##   17    4* | 0233
##   13    4. | 55
##   11    5* | 00
##    9    5. | 568
##    6    6* | 0
##    5    6. | 5
##    4    7* | 011
## HI: 89
aplpack::stem.leaf(lake$PH)
## 1 | 2: represents 1.2
##  leaf unit: 0.1
##             n: 71
##     1     s | 7
##     4    5. | 889
##    10    6* | 000111
##    14     t | 2222
##    19     f | 44455
##    24     s | 66677
##    34    6. | 8888899999
##   (14)   7* | 00000000111111
##    23     t | 2222333
##    16     f | 444455
##    10     s | 666
##     7    7. | 88
##          8* | 
##     5     t | 2
##           f | 
##     4     s | 666
##     1    8. | 8
aplpack::stem.leaf(lake$Wshed)
## 1 | 2: represents 1.2
##  leaf unit: 0.1
##             n: 71
##    9    0* | 222333444
##   22    0. | 5567777888899
##   35    1* | 0000000000024
##   (6)   1. | 555578
##   30    2* | 000000000
##   21    2. | 55
##   19    3* | 000
##         3. | 
##   16    4* | 00
##         4. | 
##   14    5* | 0
##         5. | 
##   13    6* | 34
## HI: 8 8 10 12 12 13 17 33 65 80 88
hist(lake$Area)

hist(lake$Depth)

hist(lake$PH)

hist(lake$Wshed)

We find that Area and Wshed are not symmetric (both are right-skewed)

#### 1

let us start with the variable Area

fivenum(lake$Area)
## [1]   22.0   76.5  148.0  237.5 3585.0
mean(lake$Area)
## [1] 286.493

Area: LO = 22.0 , FL = 76.5 , M = 148.0 , FU = 237.5 , HI = 3585.0, depth of fourth = (36 + 1) / 2 = 18 1/2, mean = 286.493

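We will attempt Hinkley's quick method, which divides the difference between the mean and the median by a measure of scale, here the fourth-spread FU - FL = 237.5 - 76.5 = 161.0

d = (286.493 - 148.0) / 161.0 = 0.8602

Since d is positive and well away from 0, this clearly indicates right skewness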

# Candidate re-expressions of Area for the symmetry comparison that follows:
# square roots and natural logs (log() defaults to base e).
roots <- sqrt(lake$Area)
logs <- log(lake$Area)
hinkley(lake$Area)
##         h 
## 0.8602047
hinkley(roots)
##         h 
## 0.3018693
hinkley(logs)
##            h 
## -0.006043786

we see that the value closest to 0 corresponds to the logs reexpression. Let us see if the logs reexpression will yield symmetry

aplpack::stem.leaf(logs)
## 1 | 2: represents 1.2
##  leaf unit: 0.1
##             n: 71
##     8    3* | 01233444
##    14    3. | 558899
##    21    4* | 0003344
##   (15)   4. | 555556778888889
##    35    5* | 001111122223334444
##    17    5. | 678899
##    11    6* | 00234
##     6    6. | 59
##     4    7* | 122
##          7. | 
##     1    8* | 1
symplot(logs)

(letter.values <- lval(logs))
##   depth       lo       hi     mids  spreads
## M  36.0 4.997212 4.997212 4.997212 0.000000
## H  18.5 4.337269 5.470165 4.903717 1.132896
## E   9.5 3.540854 6.179227 4.860041 2.638373
## D   5.0 3.367296 6.970730 5.169013 3.603434
## C   3.0 3.258097 7.209340 5.233718 3.951244
## B   2.0 3.135494 7.288244 5.211869 4.152750
## A   1.0 3.091042 8.184514 5.637778 5.093471

Looking at the stemplot, we can see that the logs look fairly symmetric. There is one outlier at the high end. For the symmetry plot, almost all the points fall under the line u = v, indicating left-skewness in the middle portion of the data. The only points above the line are at the far right, which include the outlier.

We can also check the histogram for our reexpression using logs and see that symmetry is now much more present than prior to our reexpression.

hist(logs)

#### 2

let us do the variable Wshed now

fivenum(lake$Wshed)
## [1]  0.2  0.8  1.5  3.0 88.0
mean(lake$Wshed)
## [1] 6.18169

Wshed: LO = 0.2 , FL = 0.8 , M = 1.5 , FU = 3.0 , HI = 88.0, depth of fourth = (36 + 1) / 2 = 18 1/2, mean = 6.18169

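We will attempt Hinkley's quick method, which divides the difference between the mean and the median by a measure of scale, here the fourth-spread FU - FL = 3.0 - 0.8 = 2.2

d = (6.18169 - 1.5) / 2.2 = 2.1280

Since d is positive and well away from 0, this indicates right skewness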

# Candidate re-expressions of Wshed for the symmetry comparison that follows:
# square roots and natural logs (log() defaults to base e).
roots1 <- sqrt(lake$Wshed)
logs1 <- log(lake$Wshed)
hinkley(lake$Wshed)
##        h 
## 2.128041
hinkley(roots1)
##         h 
## 0.6344084
hinkley(logs1)
##         h 
## 0.1242494

We see that the value closest to 0 corresponds to the logs reexpression. However, let us explore further to see if there is a better reexpression. Let’s try a p = −1/2 reexpression, which corresponds to reciprocal roots (−1/√Wshed). We will also simultaneously compare to see if the logs reexpression is a better choice.

# Reciprocal-root (p = -1/2) re-expression of Wshed. The leading minus sign
# keeps the ordering of the batch: larger Wshed values still map to larger
# re-expressed values.
recroots1 <- - 1 / sqrt(lake$Wshed)
aplpack::stem.leaf(logs1)
## 1 | 2: represents 1.2
##  leaf unit: 0.1
##             n: 71
##     3    -1. | 666
##     6    -1* | 222
##    12    -0. | 999665
##    22    -0* | 3333222211
##   (17)    0* | 00000000000134444
##    32     0. | 5566666666699
##    19     1* | 00033
##    14     1. | 688
##    11     2* | 00344
##     6     2. | 58
##     4     3* | 4
## HI: 4.17438726989564 4.38202663467388 4.47733681447821
aplpack::stem.leaf(recroots1)
## 1 | 2: represents 0.12
##  leaf unit: 0.01
##             n: 71
## LO: -2.23606797749979 -2.23606797749979 -2.23606797749979
##    6    -18 | 222
##         -17 | 
##         -16 | 
##    9    -15 | 888
##   11    -14 | 11
##         -13 | 
##   12    -12 | 9
##   20    -11 | 99991111
##   33    -10 | 5500000000000
##   34     -9 | 1
##   (5)    -8 | 41111
##   32     -7 | 64000000000
##   21     -6 | 33
##   19     -5 | 77700
##   14     -4 | 4
##   13     -3 | 99551
##    8     -2 | 8874
##    4     -1 | 7210
symplot(logs1)

symplot(recroots1)

(letter.values1 <- lval(logs1))
##   depth         lo        hi      mids  spreads
## M  36.0  0.4054651 0.4054651 0.4054651 0.000000
## H  18.5 -0.2231436 1.0986123 0.4377344 1.321756
## E   9.5 -0.8047190 2.1910133 0.6931472 2.995732
## D   5.0 -1.2039728 2.8332133 0.8146203 4.037186
## C   3.0 -1.6094379 4.1743873 1.2824747 5.783825
## B   2.0 -1.6094379 4.3820266 1.3862944 5.991465
## A   1.0 -1.6094379 4.4773368 1.4339495 6.086775

Looking at the stemplot, we can see that the logs look fairly symmetric, whereas for recroots1 there are some gaps. For the symmetry plot for logs, there are some points which fall under the line u = v but also some that fall above the line. For the recroots1 symmetry plot, we can gather that the recroots1 reexpression will not achieve symmetry. It seems that, out of the three options (roots, logs, and reciprocal roots), logs is the best option to get our batch closer to symmetry. We can also check the histogram for our reexpression using logs and see that symmetry is now much more present than prior to our reexpression (although we can see that there is a little bit of right skewness).

hist(logs1)

#### 3

We will take the variable density in the dataset largest.us.cities.by.population. We can see that the data set is right skewed.

# Load the city data. The absolute path below only exists on the original
# author's machine, so when it is missing fall back to a file of the same name
# in the current working directory instead of failing outright.
cities_csv <- "C:/Users/eclai/Downloads/largest.us.cities.by.population.csv"
if (!file.exists(cities_csv)) {
  cities_csv <- basename(cities_csv)
}
largest.us.cities.by.population <- read.csv(cities_csv)
hist(largest.us.cities.by.population$density)

fivenum(largest.us.cities.by.population$density)
## [1]   166.0  2242.0  3293.0  4598.5 26260.0
mean(largest.us.cities.by.population$density)
## [1] 4030.74

Density: LO = 166.0 , FL = 2242.0 , M = 3293.0 , FU = 4598.5 , HI = 26260.0, depth of fourth = (150 + 1) / 2 = 75 1/2, mean = 4030.74

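We will attempt Hinkley's quick method, which divides the difference between the mean and the median by a measure of scale, here the fourth-spread FU - FL = 4598.5 - 2242.0 = 2356.5

d = (4030.74 - 3293.0) / 2356.5 = 0.3131

Since d is positive, this indicates right skewness (the value is close to the hinkley() result of 0.3140 reported below)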

# Candidate re-expressions of city density for the symmetry comparison that
# follows: square roots and natural logs (log() defaults to base e).
roots2 <- sqrt(largest.us.cities.by.population$density)
logs2 <- log(largest.us.cities.by.population$density)
hinkley(largest.us.cities.by.population$density)
##         h 
## 0.3140321
hinkley(roots2)
##         h 
## 0.1356144
hinkley(logs2)
##           h 
## -0.01610969

we see that the value closest to 0 corresponds to the logs reexpression. Let us see if the logs reexpression will achieve symmetry for our batch

aplpack::stem.leaf(logs2)
## 1 | 2: represents 1.2
##  leaf unit: 0.1
##             n: 300
## LO: 5.11198778835654 5.71702770140622
##     3      t | 3
##     5      f | 45
##     6      s | 6
##    10     6. | 8999
##    25     7* | 000000111111111
##    38      t | 2222222333333
##    61      f | 44444445555555555555555
##    96      s | 66666666666777777777777777777777777
##   127     7. | 8888888888888888899999999999999
##   (43)    8* | 0000000000000000000000011111111111111111111
##   130      t | 222222222222222222222222223333333333333333333333333
##    79      f | 4444444444444455555555555
##    54      s | 666666666677777
##    39     8. | 888888888899999
##    24     9* | 0000111
##    17      t | 222223333
##     8      f | 444
##     5      s | 6
##     4     9. | 888
##     1    10* | 1
symplot(logs2)

# Letter values of the log densities. Stored under a numbered name, matching
# logs2/roots2, so the letter values computed earlier for the lake data are
# not overwritten. The outer parentheses print the result as well as assign it.
(letter.values2 <- lval(logs2))
##   depth       lo        hi     mids   spreads
## M 150.5 8.099553  8.099553 8.099553 0.0000000
## H  75.5 7.715124  8.433481 8.074302 0.7183576
## E  38.0 7.388328  8.824825 8.106576 1.4364971
## D  19.5 7.163170  9.137162 8.150166 1.9739925
## C  10.0 6.961296  9.347665 8.154480 2.3863685
## B   5.5 6.604454  9.544079 8.074266 2.9396247
## A   3.0 6.381816  9.810056 8.095936 3.4282396
## Z   2.0 5.717028  9.818801 7.767914 4.1017730
## Y   1.0 5.111988 10.175802 7.643895 5.0638144

Looking at the stemplot, we can see that the logs look fairly symmetric. For the symmetry plot for logs, there are some points which fall under the line u = v but also some that fall above the line. Overall they look evenly scattered about the line, except for 3 outliers towards the far right. When checking the mids, we can see there is no particular pattern, which is an indication of symmetry. Now, we can also check the histogram for our reexpression using logs and see that symmetry is now much more present than prior to our reexpression of this density batch.

hist(logs2)

#### 4

We will take the variable area in the dataset largest.us.cities.by.population. We can see that there is right skewness, which can be observed in the histogram.

hist(largest.us.cities.by.population$area)

fivenum(largest.us.cities.by.population$area)
## [1]    6.390   37.290   59.325  107.710 1706.800
mean(largest.us.cities.by.population$area)
## [1] 98.945

Area: LO = 6.390 , FL = 37.290 , M = 59.325 , FU = 107.710 , HI = 1706.800, depth of fourth = (150 + 1) / 2 = 75 1/2, mean = 98.945

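We will attempt Hinkley's quick method, which divides the difference between the mean and the median by a measure of scale, here the fourth-spread FU - FL = 107.710 - 37.290 = 70.420

d = (98.945 - 59.325) / 70.420 = 0.5626

Since d is positive, this indicates right skewness (the value is close to the hinkley() result of 0.5641 reported below)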

# Candidate re-expressions of city area for the symmetry comparison that
# follows: square roots and natural logs (log() defaults to base e).
roots3 <- sqrt(largest.us.cities.by.population$area)
logs3 <- log(largest.us.cities.by.population$area)
hinkley(largest.us.cities.by.population$area)
##         h 
## 0.5641062
hinkley(roots3)
##         h 
## 0.2748477
hinkley(logs3)
##          h 
## 0.08422885

we see that the value closest to 0 corresponds to the logs reexpression. Let us see if the logs reexpression will achieve symmetry for our batch

aplpack::stem.leaf(logs3)
## 1 | 2: represents 1.2
##  leaf unit: 0.1
##             n: 300
## LO: 1.85473426838944
##     2    2* | 1
##     3     t | 3
##     5     f | 55
##    10     s | 66777
##    18    2. | 88889999
##    32    3* | 00001111111111
##    47     t | 222222223333333
##    72     f | 4444444444455555555555555
##   105     s | 666666666666666666677777777777777
##   138    3. | 888888888888888889999999999999999
##   (35)   4* | 00000000000001111111111111111111111
##   127     t | 2222222222233333333333333
##   102     f | 44444445555555555
##    85     s | 66666666666677777777777
##    62    4. | 888888999999999999
##    44    5* | 0000001111
##    34     t | 22222333
##    26     f | 44555
##    21     s | 67777777
##    13    5. | 88889
##     8    6* | 111
##     5     t | 2
##     4     f | 44
##     2     s | 6
## HI: 7.44237555131384
symplot(logs3)

# Letter values of the log areas. Stored under a numbered name, matching
# logs3/roots3, so an earlier `letter.values` result is not overwritten.
# The outer parentheses print the result as well as assign it.
(letter.values3 <- lval(logs3))
##   depth       lo       hi     mids  spreads
## M 150.5 4.082981 4.082981 4.082981 0.000000
## H  75.5 3.618725 4.679437 4.149081 1.060712
## E  38.0 3.273364 5.132440 4.202902 1.859076
## D  19.5 3.046583 5.708184 4.377384 2.661601
## C  10.0 2.776332 5.890456 4.333394 3.114124
## B   5.5 2.564654 6.216857 4.390755 3.652203
## A   3.0 2.348514 6.461765 4.405140 4.113251
## Z   2.0 2.129421 6.616694 4.373058 4.487273
## Y   1.0 1.854734 7.442376 4.648555 5.587641

Looking at the stemplot, we can see that the logs look fairly symmetric. For the symmetry plot for logs, there are some points which fall under the line u = v but also some that fall above the line, especially after the first few points or so. Towards the far right, we can also see a few outliers that lie way above the line. Now, we can also check the histogram for our reexpression using logs and see that symmetry is now much more present than prior to our reexpression of this area batch.

hist(logs3)