Objective: Conduct exploratory data analysis to determine the major factors associated with “living potential” in a state.

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

## Loading required package: usethis

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:data.table':
## 
##     first, last

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

## corrplot 0.92 loaded

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:plotly':
## 
##     subplot

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize

## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

EDA Step 0: Import the Excel data - states and overall summary

## states 
## 
##  10  Variables      50  Observations
## --------------------------------------------------------------------------------
## Rank 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------
## State 
##        n  missing distinct 
##       50        0       50 
## 
## lowest : Alabama      Alaska       Arizona      Arkansas     California  
## highest: Virginia     Washington   WestVirginia Wisconsin    Wyoming     
## --------------------------------------------------------------------------------
## Health Care 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------
## Education 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------
## Economy 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------
## Infrastructure 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------
## Opportunity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------
## Fiscal Stability 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------
## Crime & Corrections 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------
## Natural Environment 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       50        0       50        1     25.5       17     3.45     5.90 
##      .25      .50      .75      .90      .95 
##    13.25    25.50    37.75    45.10    47.55 
## 
## lowest :  1  2  3  4  5, highest: 46 47 48 49 50
## --------------------------------------------------------------------------------

##  [1] "Rank"                "State"               "Health Care"        
##  [4] "Education"           "Economy"             "Infrastructure"     
##  [7] "Opportunity"         "Fiscal Stability"    "Crime & Corrections"
## [10] "Natural Environment"

## [1] 50 10

## Rows: 50
## Columns: 10
## $ Rank                  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ State                 <chr> "Washington", "Minnesota", "Utah", "NewHampshire…
## $ `Health Care`         <dbl> 8, 16, 11, 13, 24, 28, 12, 15, 2, 25, 18, 20, 30…
## $ Education             <dbl> 4, 17, 10, 13, 29, 9, 12, 8, 2, 3, 15, 18, 7, 25…
## $ Economy               <dbl> 4, 15, 1, 11, 3, 20, 13, 26, 5, 8, 25, 27, 17, 3…
## $ Infrastructure        <dbl> 3, 9, 5, 34, 10, 6, 39, 24, 42, 20, 12, 19, 22, …
## $ Opportunity           <dbl> 25, 2, 30, 3, 24, 10, 8, 9, 36, 33, 17, 1, 28, 2…
## $ `Fiscal Stability`    <dbl> 6, 21, 5, 33, 4, 17, 18, 9, 43, 8, 37, 23, 7, 16…
## $ `Crime & Corrections` <dbl> 19, 15, 8, 1, 10, 31, 9, 25, 4, 26, 3, 14, 20, 1…
## $ `Natural Environment` <dbl> 15, 10, 47, 2, 12, 6, 19, 17, 4, 18, 9, 20, 27, …

## # A tibble: 5 × 10
##    Rank State        `Health Care` Education Economy Infrastructure Opportunity
##   <dbl> <chr>                <dbl>     <dbl>   <dbl>          <dbl>       <dbl>
## 1     1 Washington               8         4       4              3          25
## 2     2 Minnesota               16        17      15              9           2
## 3     3 Utah                    11        10       1              5          30
## 4     4 NewHampshire            13        13      11             34           3
## 5     5 Idaho                   24        29       3             10          24
## # … with 3 more variables: `Fiscal Stability` <dbl>,
## #   `Crime & Corrections` <dbl>, `Natural Environment` <dbl>

##       Rank          State            Health Care      Education    
##  Min.   : 1.00   Length:50          Min.   : 1.00   Min.   : 1.00  
##  1st Qu.:13.25   Class :character   1st Qu.:13.25   1st Qu.:13.25  
##  Median :25.50   Mode  :character   Median :25.50   Median :25.50  
##  Mean   :25.50                      Mean   :25.50   Mean   :25.50  
##  3rd Qu.:37.75                      3rd Qu.:37.75   3rd Qu.:37.75  
##  Max.   :50.00                      Max.   :50.00   Max.   :50.00  
##     Economy      Infrastructure   Opportunity    Fiscal Stability
##  Min.   : 1.00   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00   
##  1st Qu.:13.25   1st Qu.:13.25   1st Qu.:13.25   1st Qu.:13.25   
##  Median :25.50   Median :25.50   Median :25.50   Median :25.50   
##  Mean   :25.50   Mean   :25.50   Mean   :25.50   Mean   :25.50   
##  3rd Qu.:37.75   3rd Qu.:37.75   3rd Qu.:37.75   3rd Qu.:37.75   
##  Max.   :50.00   Max.   :50.00   Max.   :50.00   Max.   :50.00   
##  Crime & Corrections Natural Environment
##  Min.   : 1.00       Min.   : 1.00      
##  1st Qu.:13.25       1st Qu.:13.25      
##  Median :25.50       Median :25.50      
##  Mean   :25.50       Mean   :25.50      
##  3rd Qu.:37.75       3rd Qu.:37.75      
##  Max.   :50.00       Max.   :50.00

Based on the above analysis, we can conclude the following:

- This table contains 50 rows and 10 columns.
- Names of the columns (factors):

[1] “Rank” “State”
[3] “Health Care” “Education”
[5] “Economy” “Infrastructure”
[7] “Opportunity” “Fiscal Stability”
[9] “Crime & Corrections” “Natural Environment”

- There is no missing data.
- The data is ordered by the “Rank” column.
- The table contains data for 50 states. Order of the rank: best = 1, worst = 50.
- According to the above criteria, Washington State has the highest living potential (rank 1), while Louisiana has the lowest (rank 50).

In order to understand the factors responsible for determining the living potential of a state, we will conduct the rest of the Exploratory Data Analysis (EDA).

EDA Step 1: Gain visual insights

Box-whisker chart top 10 states

Objective:

To understand the factors associated with the best states to live in, we draw a box-and-whisker plot for the top 10 US states. The plot shows the factors on the X-axis and the states' ranks on the Y-axis.

Results:

- Five factors are important: Healthcare, Education, Economy, Crime and Natural Environment. These factors positively influence the living potential in these states.
- There are two outliers, in Education and Natural Environment, which might be driving the spread of the data.

## Warning in melt(data = top10States, id.vars = c("Rank", "State"), variable.name
## = "Factors", : The melt generic in data.table has been passed a tbl_df and will
## attempt to redirect to the relevant reshape2 method; please note that reshape2
## is deprecated, and this redirection is now deprecated as well. To continue using
## melt methods from reshape2 while both libraries are attached, e.g. melt.list,
## you can prepend the namespace like reshape2::melt(top10States). In the next
## version, this warning will become an error.

box and whisker chart for bottom 10 states

Objective:

To understand the factors associated with the worst states to live in, we draw a box-and-whisker plot for the bottom 10 US states. The plot shows the factors on the X-axis and the states' ranks on the Y-axis.

Results:

- As identified above, the same five factors (Healthcare, Education, Economy, Crime and Natural Environment) drive the living potential, here negatively.
- It is noticeable that crime is higher in the bottom 10 states than in the top 10.
- The outlier in Economy influences the spread of the data.

## Warning in melt(data = bottom10States, id.vars = c("Rank", "State"),
## variable.name = "Categories", : The melt generic in data.table has been
## passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(bottom10States). In the next version, this warning will become an
## error.

EDA Step 2A: Correlation Analysis

Column correlation Analysis: Factors and Sub-Factors

Objective:

To further understand the importance of the identified factors, and of the sub-factors that drive a state's categorization as one of the best or worst states to live in, we examine the correlations among the factors and sub-factors.

Results:

- The dark blue circles indicate significant correlation between factor-to-factor and factor-to-sub-factor.
- From the plot, it can be observed that five factors (Healthcare, Education, Economy, Crime and Natural Environment) and their interactions with the sub-factors drive the living potential both positively and negatively.

##                      Health Care Access Health Care Quality Public Health
## Health Care Access                 1.00                0.25          0.54
## Health Care Quality                0.25                1.00          0.65
## Public Health                      0.54                0.65          1.00
## Business Environment               0.11                0.40          0.53
## Employment                         0.12                0.28          0.50
## Growth                            -0.20                0.21          0.16
## Higher Education                  -0.11                0.30          0.41
## Pre-K-12                           0.67                0.33          0.58
## Energy                            -0.34               -0.03          0.02
## Internet Access                   -0.03                0.22          0.36
## Transportation                    -0.38               -0.06         -0.08
## Affordability                     -0.27               -0.71         -0.72
## Economic Opportunity               0.49                0.60          0.73
## Equality                           0.43               -0.02         -0.03
## Long-Term                         -0.16                0.10          0.00
## Short-Term                        -0.59               -0.12         -0.27
## Corrections                        0.28                0.40          0.49
## Public Safety                      0.56                0.10          0.38
## Air & Water Quality                0.38               -0.05          0.24
## Pollution                          0.17                0.16          0.37
##                      Business Environment Employment Growth Higher Education
## Health Care Access                   0.11       0.12  -0.20            -0.11
## Health Care Quality                  0.40       0.28   0.21             0.30
## Public Health                        0.53       0.50   0.16             0.41
## Business Environment                 1.00       0.46   0.50             0.27
## Employment                           0.46       1.00   0.63             0.41
## Growth                               0.50       0.63   1.00             0.28
## Higher Education                     0.27       0.41   0.28             1.00
## Pre-K-12                             0.36       0.47   0.13             0.12
## Energy                              -0.03       0.36   0.49             0.46
## Internet Access                      0.43       0.42   0.34             0.53
## Transportation                       0.04       0.27   0.37             0.43
## Affordability                       -0.37      -0.26  -0.14            -0.22
## Economic Opportunity                 0.30       0.55   0.11             0.33
## Equality                             0.00      -0.16  -0.06            -0.28
## Long-Term                            0.26       0.35   0.49             0.42
## Short-Term                          -0.06       0.14   0.24             0.17
## Corrections                          0.27       0.35   0.20             0.15
## Public Safety                        0.06       0.24  -0.03            -0.10
## Air & Water Quality                 -0.10       0.29  -0.08             0.13
## Pollution                           -0.01       0.18   0.04             0.24
##                      Pre-K-12 Energy Internet Access Transportation
## Health Care Access       0.67  -0.34           -0.03          -0.38
## Health Care Quality      0.33  -0.03            0.22          -0.06
## Public Health            0.58   0.02            0.36          -0.08
## Business Environment     0.36  -0.03            0.43           0.04
## Employment               0.47   0.36            0.42           0.27
## Growth                   0.13   0.49            0.34           0.37
## Higher Education         0.12   0.46            0.53           0.43
## Pre-K-12                 1.00  -0.09            0.18          -0.13
## Energy                  -0.09   1.00            0.09           0.64
## Internet Access          0.18   0.09            1.00           0.33
## Transportation          -0.13   0.64            0.33           1.00
## Affordability           -0.19   0.06           -0.22           0.03
## Economic Opportunity     0.54   0.08            0.35           0.13
## Equality                 0.24  -0.39           -0.07          -0.22
## Long-Term                0.11   0.36            0.13           0.30
## Short-Term              -0.36   0.35            0.08           0.43
## Corrections              0.30  -0.06            0.34          -0.05
## Public Safety            0.64  -0.10           -0.03          -0.11
## Air & Water Quality      0.34  -0.07            0.06          -0.10
## Pollution                0.17   0.22            0.00          -0.01
##                      Affordability Economic Opportunity Equality Long-Term
## Health Care Access           -0.27                 0.49     0.43     -0.16
## Health Care Quality          -0.71                 0.60    -0.02      0.10
## Public Health                -0.72                 0.73    -0.03      0.00
## Business Environment         -0.37                 0.30     0.00      0.26
## Employment                   -0.26                 0.55    -0.16      0.35
## Growth                       -0.14                 0.11    -0.06      0.49
## Higher Education             -0.22                 0.33    -0.28      0.42
## Pre-K-12                     -0.19                 0.54     0.24      0.11
## Energy                        0.06                 0.08    -0.39      0.36
## Internet Access              -0.22                 0.35    -0.07      0.13
## Transportation                0.03                 0.13    -0.22      0.30
## Affordability                 1.00                -0.56    -0.08      0.14
## Economic Opportunity         -0.56                 1.00     0.04      0.00
## Equality                     -0.08                 0.04     1.00      0.10
## Long-Term                     0.14                 0.00     0.10      1.00
## Short-Term                    0.09                -0.18    -0.39      0.33
## Corrections                  -0.57                 0.33     0.24     -0.05
## Public Safety                -0.10                 0.41     0.17     -0.05
## Air & Water Quality          -0.06                 0.26     0.24      0.20
## Pollution                    -0.35                 0.38    -0.13     -0.02
##                      Short-Term Corrections Public Safety Air & Water Quality
## Health Care Access        -0.59        0.28          0.56                0.38
## Health Care Quality       -0.12        0.40          0.10               -0.05
## Public Health             -0.27        0.49          0.38                0.24
## Business Environment      -0.06        0.27          0.06               -0.10
## Employment                 0.14        0.35          0.24                0.29
## Growth                     0.24        0.20         -0.03               -0.08
## Higher Education           0.17        0.15         -0.10                0.13
## Pre-K-12                  -0.36        0.30          0.64                0.34
## Energy                     0.35       -0.06         -0.10               -0.07
## Internet Access            0.08        0.34         -0.03                0.06
## Transportation             0.43       -0.05         -0.11               -0.10
## Affordability              0.09       -0.57         -0.10               -0.06
## Economic Opportunity      -0.18        0.33          0.41                0.26
## Equality                  -0.39        0.24          0.17                0.24
## Long-Term                  0.33       -0.05         -0.05                0.20
## Short-Term                 1.00       -0.18         -0.24               -0.12
## Corrections               -0.18        1.00          0.07                0.07
## Public Safety             -0.24        0.07          1.00                0.29
## Air & Water Quality       -0.12        0.07          0.29                1.00
## Pollution                 -0.05        0.25          0.30                0.25
##                      Pollution
## Health Care Access        0.17
## Health Care Quality       0.16
## Public Health             0.37
## Business Environment     -0.01
## Employment                0.18
## Growth                    0.04
## Higher Education          0.24
## Pre-K-12                  0.17
## Energy                    0.22
## Internet Access           0.00
## Transportation           -0.01
## Affordability            -0.35
## Economic Opportunity      0.38
## Equality                 -0.13
## Long-Term                -0.02
## Short-Term               -0.05
## Corrections               0.25
## Public Safety             0.30
## Air & Water Quality       0.25
## Pollution                 1.00

## EDA Step 2B: Correlation Analysis # evaluate the entire dataset for correlation # Objective: In order to further quantify the significance of the major factors driving the living potential of the top/bottom 10 states, we have conducted this analysis.
# Results: - The dark blue circles indicate significant correlation between factor-to-factor and factor-to-sub-factor. - From the plot, it can be observed that five factors (Healthcare, Education, Economy, Crime and Natural Environment) and their interactions with each other are significant for living potential consideration. - Significant factor-factor interactions (>0.5) are: Healthcare - Education: 0.63; Education - Crime: 0.59; Healthcare - Crime: 0.58. These results indicate that the living potential of a state depends on providing Healthcare and Education and on reducing crime.

# Remove the Rank and State columns so that only the factor columns remain
drop <- c("Rank", "State")
states_trunk <- states[, setdiff(names(states), drop)]
states_trunk # print the truncated table

## # A tibble: 50 × 8
##    `Health Care` Education Economy Infrastructure Opportunity `Fiscal Stability`
##            <dbl>     <dbl>   <dbl>          <dbl>       <dbl>              <dbl>
##  1             8         4       4              3          25                  6
##  2            16        17      15              9           2                 21
##  3            11        10       1              5          30                  5
##  4            13        13      11             34           3                 33
##  5            24        29       3             10          24                  4
##  6            28         9      20              6          10                 17
##  7            12        12      13             39           8                 18
##  8            15         8      26             24           9                  9
##  9             2         2       5             42          36                 43
## 10            25         3       8             20          33                  8
## # … with 40 more rows, and 2 more variables: `Crime & Corrections` <dbl>,
## #   `Natural Environment` <dbl>

# Pairwise correlation matrix of the remaining columns, printed to two decimals
res <- stats::cor(states_trunk)
round(res, digits = 2)

##                     Health Care Education Economy Infrastructure Opportunity
## Health Care                1.00      0.63    0.34          -0.06       -0.02
## Education                  0.63      1.00    0.44           0.26        0.27
## Economy                    0.34      0.44    1.00           0.50        0.05
## Infrastructure            -0.06      0.26    0.50           1.00        0.12
## Opportunity               -0.02      0.27    0.05           0.12        1.00
## Fiscal Stability          -0.10      0.08    0.32           0.42        0.09
## Crime & Corrections        0.58      0.59    0.13          -0.03        0.27
## Natural Environment        0.33      0.43    0.03           0.09        0.15
##                     Fiscal Stability Crime & Corrections Natural Environment
## Health Care                    -0.10                0.58                0.33
## Education                       0.08                0.59                0.43
## Economy                         0.32                0.13                0.03
## Infrastructure                  0.42               -0.03                0.09
## Opportunity                     0.09                0.27                0.15
## Fiscal Stability                1.00               -0.24               -0.01
## Crime & Corrections            -0.24                1.00                0.50
## Natural Environment            -0.01                0.50                1.00

# Hmisc::rcorr() is used to obtain further correlation data
# (read below via the `r` and `P` elements of its result)
res2 <- Hmisc::rcorr(as.matrix(states_trunk))

res2[["r"]] # correlation coefficients

##                     Health Care Education   Economy Infrastructure Opportunity
## Health Care          1.00000000 0.6301561 0.3357983    -0.05805522 -0.02194478
## Education            0.63015606 1.0000000 0.4397119     0.26223289  0.26943577
## Economy              0.33579832 0.4397119 1.0000000     0.49771909  0.04749100
## Infrastructure      -0.05805522 0.2622329 0.4977191     1.00000000  0.11529412
## Opportunity         -0.02194478 0.2694358 0.0474910     0.11529412  1.00000000
## Fiscal Stability    -0.10271309 0.0847539 0.3157263     0.42376951  0.09493397
## Crime & Corrections  0.57599040 0.5945258 0.1300840    -0.03068427  0.26780312
## Natural Environment  0.33243697 0.4339496 0.0337575     0.08993998  0.14545018
##                     Fiscal Stability Crime & Corrections Natural Environment
## Health Care              -0.10271309          0.57599040          0.33243697
## Education                 0.08475390          0.59452581          0.43394958
## Economy                   0.31572629          0.13008403          0.03375750
## Infrastructure            0.42376951         -0.03068427          0.08993998
## Opportunity               0.09493397          0.26780312          0.14545018
## Fiscal Stability          1.00000000         -0.23822329         -0.01109244
## Crime & Corrections      -0.23822329          1.00000000          0.49896759
## Natural Environment      -0.01109244          0.49896759          1.00000000

res2[["P"]] # p-values of the pairwise correlations

##                      Health Care    Education      Economy Infrastructure
## Health Care                   NA 9.401790e-07 0.0171219838   0.6888122563
## Education           9.401790e-07           NA 0.0013980313   0.0658110886
## Economy             1.712198e-02 1.398031e-03           NA   0.0002353927
## Infrastructure      6.888123e-01 6.581109e-02 0.0002353927             NA
## Opportunity         8.797656e-01 5.846856e-02 0.7432872647   0.4252788839
## Fiscal Stability    4.778270e-01 5.584152e-01 0.0255142955   0.0021647915
## Crime & Corrections 1.204713e-05 5.302387e-06 0.3679063788   0.8324717662
## Natural Environment 1.833668e-02 1.641386e-03 0.8159707474   0.5345029056
##                     Opportunity Fiscal Stability Crime & Corrections
## Health Care          0.87976559      0.477827006        1.204713e-05
## Education            0.05846856      0.558415202        5.302387e-06
## Economy              0.74328726      0.025514295        3.679064e-01
## Infrastructure       0.42527888      0.002164791        8.324718e-01
## Opportunity                  NA      0.511959606        6.007224e-02
## Fiscal Stability     0.51195961               NA        9.571981e-02
## Crime & Corrections  0.06007224      0.095719811                  NA
## Natural Environment  0.31352789      0.939058053        2.257294e-04
##                     Natural Environment
## Health Care                0.0183366794
## Education                  0.0016413862
## Economy                    0.8159707474
## Infrastructure             0.5345029056
## Opportunity                0.3135278911
## Fiscal Stability           0.9390580534
## Crime & Corrections        0.0002257294
## Natural Environment                  NA

#
# Matrix of p-values from pairwise correlation tests, for use as the
# significance layer of a correlogram.
#
# Args:
#   mat: Matrix or data frame (coerced with as.matrix()); every column is
#        treated as one variable.
#   ...: Further arguments forwarded unchanged to cor.test().
#
# Returns:
#   A square, symmetric matrix with one row/column per column of `mat`.
#   Entry [i, j] is the p-value of cor.test() on columns i and j; the
#   diagonal is 0. Row and column names are copied from colnames(mat).
#
# NOTE(review): corrplot appears to export a function with this same name;
# this definition would mask it in the session - confirm that is intended.
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA_real_, n, n)
  diag(p.mat) <- 0
  # seq_len() is empty when there are fewer than two columns, whereas
  # 1:(n - 1) would count down (1, 0) and index a non-existent column.
  for (i in seq_len(max(n - 1L, 0L))) {
    for (j in (i + 1L):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# p-value for every pair of columns in the truncated table
p.mat <- cor.mtest(states_trunk)

# Upper-triangle correlogram in hierarchical-clustering order.
# Correlations that are not significant at the 0.01 level are left blank.
corrplot(
  res,
  type = "upper",
  order = "hclust",
  p.mat = p.mat,
  sig.level = 0.01,
  insig = "blank"
)

# Palette generator running from red through white to blue
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")
)

# Same correlogram as coloured cells annotated with the coefficients
corrplot(
  res,
  method = "color",
  col = col(50),
  type = "upper",
  order = "hclust",
  diag = FALSE,            # hide the principal diagonal
  addCoef.col = "black",   # print each correlation coefficient in black
  tl.col = "black",        # text label colour
  tl.srt = 25,             # text label rotation
  # combine with the significance test: blank out p-values above 0.01
  p.mat = p.mat,
  sig.level = 0.01,
  insig = "blank"
)

```

Exploratory Data Analysis: USNews-State-Rankings

Mir Quddus

5/1/2022

Objective: Conduct exploratory data analysis to determine the major factors associated with “living potential” in a state.

EDA Step 0: Import the Excel data - states and overall summary

EDA Step 1: Gain visual insights

Box-whisker chart top 10 states

Objective:

box and whisker chart for bottom 10 states

Objective:

EDA Step 2A: Correlation Analysis

Column correlation Analysis: Factors and Sub-Factors

Objective: