library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DT)
library(knitr)
  1. CDC’s Division of Population Health provides cross-cutting set of 124 indicators that were developed by consensus and that allows states and territories and large metropolitan areas to uniformly define, collect, and report chronic disease data that are important to public health practice and available for states, territories and large metropolitan areas.
    source: https://data.cdc.gov/Chronic-Disease-Indicators/U-S-Chronic-Disease-Indicators-CDI-/g4ie-h725

Data Acquisition

cdi.data <- read.csv('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 607\\week6\\U.S._Chronic_Disease_Indicators__CDI_.csv')
dim(cdi.data)
## [1] 523486     34
head(cdi.data)
##   YearStart YearEnd LocationAbbr  LocationDesc DataSource   Topic
## 1      2016    2016           US United States      BRFSS Alcohol
## 2      2016    2016           AL       Alabama      BRFSS Alcohol
## 3      2016    2016           AK        Alaska      BRFSS Alcohol
## 4      2016    2016           AZ       Arizona      BRFSS Alcohol
## 5      2016    2016           AR      Arkansas      BRFSS Alcohol
## 6      2016    2016           CA    California      BRFSS Alcohol
##                                                  Question Response
## 1 Binge drinking prevalence among adults aged >= 18 years       NA
## 2 Binge drinking prevalence among adults aged >= 18 years       NA
## 3 Binge drinking prevalence among adults aged >= 18 years       NA
## 4 Binge drinking prevalence among adults aged >= 18 years       NA
## 5 Binge drinking prevalence among adults aged >= 18 years       NA
## 6 Binge drinking prevalence among adults aged >= 18 years       NA
##   DataValueUnit    DataValueType DataValue DataValueAlt
## 1             % Crude Prevalence      16.9         16.9
## 2             % Crude Prevalence      13.0         13.0
## 3             % Crude Prevalence      18.2         18.2
## 4             % Crude Prevalence      15.6         15.6
## 5             % Crude Prevalence      15.0         15.0
## 6             % Crude Prevalence      16.3         16.3
##   DataValueFootnoteSymbol         DatavalueFootnote LowConfidenceLimit
## 1                       * 50 States + DC: US Median               16.0
## 2                                                                 11.9
## 3                                                                 16.0
## 4                                                                 14.3
## 5                                                                 13.0
## 6                                                                 15.4
##   HighConfidenceLimit StratificationCategory1 Stratification1
## 1                18.0                 Overall         Overall
## 2                14.1                 Overall         Overall
## 3                20.6                 Overall         Overall
## 4                16.9                 Overall         Overall
## 5                17.2                 Overall         Overall
## 6                17.2                 Overall         Overall
##   StratificationCategory2 Stratification2 StratificationCategory3
## 1                      NA              NA                      NA
## 2                      NA              NA                      NA
## 3                      NA              NA                      NA
## 4                      NA              NA                      NA
## 5                      NA              NA                      NA
## 6                      NA              NA                      NA
##   Stratification3                               GeoLocation ResponseID
## 1              NA                                                   NA
## 2              NA   (32.84057112200048, -86.63186076199969)         NA
## 3              NA  (64.84507995700051, -147.72205903599973)         NA
## 4              NA (34.865970280000454, -111.76381127699972)         NA
## 5              NA   (34.74865012400045, -92.27449074299966)         NA
## 6              NA  (37.63864012300047, -120.99999953799971)         NA
##   LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1
## 1         59     ALC     ALC2_2         CrdPrev                   OVERALL
## 2          1     ALC     ALC2_2         CrdPrev                   OVERALL
## 3          2     ALC     ALC2_2         CrdPrev                   OVERALL
## 4          4     ALC     ALC2_2         CrdPrev                   OVERALL
## 5          5     ALC     ALC2_2         CrdPrev                   OVERALL
## 6          6     ALC     ALC2_2         CrdPrev                   OVERALL
##   StratificationID1 StratificationCategoryID2 StratificationID2
## 1               OVR                        NA                NA
## 2               OVR                        NA                NA
## 3               OVR                        NA                NA
## 4               OVR                        NA                NA
## 5               OVR                        NA                NA
## 6               OVR                        NA                NA
##   StratificationCategoryID3 StratificationID3
## 1                        NA                NA
## 2                        NA                NA
## 3                        NA                NA
## 4                        NA                NA
## 5                        NA                NA
## 6                        NA                NA

Let’s perform anaylysis for US (national) data

cdi.data.us <- cdi.data %>%
  select(YearStart,LocationAbbr,DataSource,Topic,Question,QuestionID,DataValueType,DataValue,DataValueAlt,Stratification1) %>%
  filter(LocationAbbr=='US' , DataValueAlt!='NA')
kable(head(cdi.data.us))
YearStart LocationAbbr DataSource Topic Question QuestionID DataValueType DataValue DataValueAlt Stratification1
2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years ALC2_2 Crude Prevalence 16.9 16.9 Overall
2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years ALC2_2 Age-adjusted Prevalence 17.9 17.9 Overall
2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years ALC2_2 Crude Prevalence 21.9 21.9 Male
2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years ALC2_2 Age-adjusted Prevalence 22.5 22.5 Male
2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years ALC2_2 Crude Prevalence 12.0 12.0 Female
2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years ALC2_2 Age-adjusted Prevalence 13.1 13.1 Female

Checking for existing CDI Topics in the dataset

unique(cdi.data.us$Topic)
##  [1] Alcohol                                        
##  [2] Arthritis                                      
##  [3] Asthma                                         
##  [4] Cancer                                         
##  [5] Chronic Obstructive Pulmonary Disease          
##  [6] Cardiovascular Disease                         
##  [7] Diabetes                                       
##  [8] Immunization                                   
##  [9] Mental Health                                  
## [10] Chronic Kidney Disease                         
## [11] Nutrition, Physical Activity, and Weight Status
## [12] Older Adults                                   
## [13] Oral Health                                    
## [14] Overarching Conditions                         
## [15] Reproductive Health                            
## [16] Tobacco                                        
## [17] Disability                                     
## 17 Levels: Alcohol Arthritis Asthma Cancer ... Tobacco

Checking for existing CDI data sources in the dataset

unique(cdi.data.us$DataSource)
##  [1] BRFSS                                  
##  [2] YRBSS                                  
##  [3] ACS 5-Year Estimates                   
##  [4] USRDS                                  
##  [5] NVSS                                   
##  [6] Statewide central cancer registries    
##  [7] Death certificate                      
##  [8] CDC Breastfeeding Report Card          
##  [9] mPINC                                  
## [10] HRSA, UDS                              
## [11] ANRF                                   
## [12] AEDS                                   
## [13] InfoUSA; USDA                          
## [14] WFRS                                   
## [15] National Immunization Survey           
## [16] USDA National Farmers' Market Directory
## [17] Current Population Survey              
## [18] ACS 1-Year Estimates                   
## [19] NSCH                                   
## [20] Birth Certificate, NVSS                
## [21] NVSS, Mortality                        
## [22] CMS CCW                                
## [23] CMS Part A Claims Data                 
## 32 Levels: ACS 1-Year Estimates ACS 5-Year Estimates AEDS ANRF ... YRBSS

Checking for existing CDI Datavalue type in the dataset

unique(cdi.data.us$DataValueType)
##  [1] Crude Prevalence                        
##  [2] Age-adjusted Prevalence                 
##  [3] Mean                                    
##  [4] Age-adjusted Mean                       
##  [5] Number                                  
##  [6] Median                                  
##  [7] Adjusted by age, sex, race and ethnicity
##  [8] Age-adjusted Rate                       
##  [9] Crude Rate                              
## [10] Average Annual Age-adjusted Rate        
## [11] Average Annual Crude Rate               
## [12] Average Annual Number                   
## [13] Percent                                 
## [14] Per capita alcohol consumption          
## [15] Prevalence                              
## 19 Levels: Adjusted by age, sex, race and ethnicity ... Yes/No

Checking for existing CDI years in the dataset

unique(cdi.data.us$YearStart)
##  [1] 2016 2015 2014 2013 2012 2011 2010 2009 2001 2008
nrow(cdi.data.us)
## [1] 3635

Checking for Data for datavaluetype Crude Prevalence

cdi.data.us.crdprv <- cdi.data.us %>%
    filter(DataValueType=='Crude Prevalence')
nrow(cdi.data.us.crdprv)
## [1] 1242

EDA Analysis

ggplot(data = cdi.data.us.crdprv, aes(x=Topic,y=DataValueAlt))+
  geom_bar(stat = 'identity',aes(fill=Topic))+
  facet_wrap(~Stratification1, ncol = 2)+
  theme_bw()

Performing EDA for CDI group Alchol

ggplot(data=filter(cdi.data.us.crdprv,Topic=='Alcohol'),aes(x=YearStart, y=DataValueAlt, group=Stratification1
                                                                ,color=Stratification1)) +
    geom_line()+
    geom_point()+
    facet_wrap(~Question,ncol = 1)+
    theme_bw()

Binge drinking prevalence among adults aged >= 18 years

library(dplyr)
cdi.data.us.crdprv.ALC2_2 <- filter(cdi.data.us.crdprv,QuestionID=='ALC2_2')
summary(cdi.data.us.crdprv.ALC2_2$DataValueAlt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.10   12.15   16.85   16.96   21.40   24.20

Statistical Analysis

fill <- '#56B4E9' 
ggplot(cdi.data.us.crdprv.ALC2_2, aes(x =Stratification1, y = DataValueAlt)) +
        geom_boxplot(fill=fill)+
        theme_bw()

Analyzing for Normal Distribution and Probability

cdi.data.us.ALC2_2.male <- filter(cdi.data.us,Stratification1=='Overall',QuestionID=='ALC2_2')
hist(cdi.data.us.ALC2_2.male$DataValueAlt)

The plot appears to have normal distribution with right skewedness

alc2_2_mean <- mean(cdi.data.us.ALC2_2.male$DataValueAlt)
alc2_2_mean
## [1] 17.23333
alc2_2_sd <- sd(cdi.data.us.ALC2_2.male$DataValueAlt)
alc2_2_sd
## [1] 0.8616404
IQR(cdi.data.us.ALC2_2.male$DataValueAlt)
## [1] 0.725
hist(cdi.data.us.ALC2_2.male$DataValueAlt, probability = TRUE)
x <- 16:19.5
y <- dnorm(x = x, mean = alc2_2_mean, sd = alc2_2_sd)
lines(x = x, y = y, col = "blue")

qqnorm(cdi.data.us.ALC2_2.male$DataValueAlt)
qqline(cdi.data.us.ALC2_2.male$DataValueAlt)

Probability of randomly chosing a man with datavalue=19

1 - pnorm(q = 19, mean = alc2_2_mean, sd = alc2_2_sd)
## [1] 0.02016502

Probability of randomly chosing a man with datavalue=16

1 - pnorm(q = 16, mean = alc2_2_mean, sd = alc2_2_sd)
## [1] 0.9238391

Analysis for Crude Prevalence data type and Percentage increase in data value for year 2016 and 2015

Let’s perform the data analysis for US (national) for the year 2016 and for datavalue type ‘Crude Prevalence’

cdi.data.us.cp.2016 <- filter(cdi.data.us,DataValueType=='Crude Prevalence',YearStart=='2016')
head(cdi.data.us.cp.2016)
##   YearStart LocationAbbr DataSource   Topic
## 1      2016           US      BRFSS Alcohol
## 2      2016           US      BRFSS Alcohol
## 3      2016           US      BRFSS Alcohol
## 4      2016           US      BRFSS Alcohol
## 5      2016           US      BRFSS Alcohol
## 6      2016           US      BRFSS Alcohol
##                                                  Question QuestionID
## 1 Binge drinking prevalence among adults aged >= 18 years     ALC2_2
## 2 Binge drinking prevalence among adults aged >= 18 years     ALC2_2
## 3 Binge drinking prevalence among adults aged >= 18 years     ALC2_2
## 4  Binge drinking prevalence among women aged 18-44 years     ALC2_3
## 5            Heavy drinking among adults aged >= 18 years     ALC5_1
## 6            Heavy drinking among adults aged >= 18 years     ALC5_1
##      DataValueType DataValue DataValueAlt Stratification1
## 1 Crude Prevalence      16.9         16.9         Overall
## 2 Crude Prevalence      21.9         21.9            Male
## 3 Crude Prevalence      12.0         12.0          Female
## 4 Crude Prevalence      18.7         18.7         Overall
## 5 Crude Prevalence       6.5          6.5         Overall
## 6 Crude Prevalence       7.1          7.1            Male

Let’s perform the data analysis for US (national) for the year 2015 and for datavalue type ‘Crude Prevalence’

cdi.data.us.cp.2015 <- filter(cdi.data.us,DataValueType=='Crude Prevalence',YearStart=='2015')
head(cdi.data.us.cp.2015)
##   YearStart LocationAbbr DataSource   Topic
## 1      2015           US      YRBSS Alcohol
## 2      2015           US      YRBSS Alcohol
## 3      2015           US      BRFSS Alcohol
## 4      2015           US      BRFSS Alcohol
## 5      2015           US      BRFSS Alcohol
## 6      2015           US      BRFSS Alcohol
##                                                  Question QuestionID
## 1                                 Alcohol use among youth     ALC1_1
## 2                   Binge drinking prevalence among youth     ALC2_1
## 3 Binge drinking prevalence among adults aged >= 18 years     ALC2_2
## 4 Binge drinking prevalence among adults aged >= 18 years     ALC2_2
## 5 Binge drinking prevalence among adults aged >= 18 years     ALC2_2
## 6  Binge drinking prevalence among women aged 18-44 years     ALC2_3
##      DataValueType DataValue DataValueAlt Stratification1
## 1 Crude Prevalence      32.8         32.8         Overall
## 2 Crude Prevalence      17.7         17.7         Overall
## 3 Crude Prevalence      11.7         11.7          Female
## 4 Crude Prevalence      21.4         21.4            Male
## 5 Crude Prevalence      16.3         16.3         Overall
## 6 Crude Prevalence      17.7         17.7         Overall

Now use merge function to combine the 2016 an 2015 dataset using QuestionID and Stratification1 features

cdi.data.us.cp.2016.2015.tidy <- merge(cdi.data.us.cp.2016,cdi.data.us.cp.2015,by.x =c('QuestionID','Stratification1'),
                                     by.y = c('QuestionID','Stratification1'))
kable(head(cdi.data.us.cp.2016.2015.tidy))
QuestionID Stratification1 YearStart.x LocationAbbr.x DataSource.x Topic.x Question.x DataValueType.x DataValue.x DataValueAlt.x YearStart.y LocationAbbr.y DataSource.y Topic.y Question.y DataValueType.y DataValue.y DataValueAlt.y
ALC2_2 Female 2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years Crude Prevalence 12.0 12.0 2015 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years Crude Prevalence 11.7 11.7
ALC2_2 Male 2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years Crude Prevalence 21.9 21.9 2015 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years Crude Prevalence 21.4 21.4
ALC2_2 Overall 2016 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years Crude Prevalence 16.9 16.9 2015 US BRFSS Alcohol Binge drinking prevalence among adults aged >= 18 years Crude Prevalence 16.3 16.3
ALC2_3 Overall 2016 US BRFSS Alcohol Binge drinking prevalence among women aged 18-44 years Crude Prevalence 18.7 18.7 2015 US BRFSS Alcohol Binge drinking prevalence among women aged 18-44 years Crude Prevalence 17.7 17.7
ALC5_1 Female 2016 US BRFSS Alcohol Heavy drinking among adults aged >= 18 years Crude Prevalence 5.9 5.9 2015 US BRFSS Alcohol Heavy drinking among adults aged >= 18 years Crude Prevalence 5.0 5.0
ALC5_1 Male 2016 US BRFSS Alcohol Heavy drinking among adults aged >= 18 years Crude Prevalence 7.1 7.1 2015 US BRFSS Alcohol Heavy drinking among adults aged >= 18 years Crude Prevalence 6.4 6.4
colnames(cdi.data.us.cp.2016.2015.tidy)
##  [1] "QuestionID"      "Stratification1" "YearStart.x"    
##  [4] "LocationAbbr.x"  "DataSource.x"    "Topic.x"        
##  [7] "Question.x"      "DataValueType.x" "DataValue.x"    
## [10] "DataValueAlt.x"  "YearStart.y"     "LocationAbbr.y" 
## [13] "DataSource.y"    "Topic.y"         "Question.y"     
## [16] "DataValueType.y" "DataValue.y"     "DataValueAlt.y"

Let’s clean the data

cp.percent.change.2016.2015.clean <- cdi.data.us.cp.2016.2015.tidy %>%
    select(QuestionID,Question.x,Stratification1,YearStart.x,YearStart.y,DataSource.x,Topic.x,DataValue.x,DataValue.y)
kable(head(cp.percent.change.2016.2015.clean))
QuestionID Question.x Stratification1 YearStart.x YearStart.y DataSource.x Topic.x DataValue.x DataValue.y
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Female 2016 2015 BRFSS Alcohol 12.0 11.7
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Male 2016 2015 BRFSS Alcohol 21.9 21.4
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Overall 2016 2015 BRFSS Alcohol 16.9 16.3
ALC2_3 Binge drinking prevalence among women aged 18-44 years Overall 2016 2015 BRFSS Alcohol 18.7 17.7
ALC5_1 Heavy drinking among adults aged >= 18 years Female 2016 2015 BRFSS Alcohol 5.9 5.0
ALC5_1 Heavy drinking among adults aged >= 18 years Male 2016 2015 BRFSS Alcohol 7.1 6.4
cp.percent.change.2016.2015.clean <- rename(cp.percent.change.2016.2015.clean,Question=Question.x,Year2016=YearStart.x,
                                           Year2015=YearStart.y,DataSource=DataSource.x,Topic=Topic.x,
                                            DataValue2016=DataValue.x,DataValue2015=DataValue.y)
kable(head(cp.percent.change.2016.2015.clean))
QuestionID Question Stratification1 Year2016 Year2015 DataSource Topic DataValue2016 DataValue2015
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Female 2016 2015 BRFSS Alcohol 12.0 11.7
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Male 2016 2015 BRFSS Alcohol 21.9 21.4
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Overall 2016 2015 BRFSS Alcohol 16.9 16.3
ALC2_3 Binge drinking prevalence among women aged 18-44 years Overall 2016 2015 BRFSS Alcohol 18.7 17.7
ALC5_1 Heavy drinking among adults aged >= 18 years Female 2016 2015 BRFSS Alcohol 5.9 5.0
ALC5_1 Heavy drinking among adults aged >= 18 years Male 2016 2015 BRFSS Alcohol 7.1 6.4
Now calcualte the percent change in 2016 as compared to 2015 for Nation al data
cp.percent.change <- cp.percent.change.2016.2015.clean %>%
    mutate(PercentChange = (as.numeric(as.character(DataValue2016))-
                   as.numeric(as.character(DataValue2015)))/as.numeric(as.character(DataValue2015))*100)
kable(head(cp.percent.change))
QuestionID Question Stratification1 Year2016 Year2015 DataSource Topic DataValue2016 DataValue2015 PercentChange
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Female 2016 2015 BRFSS Alcohol 12.0 11.7 2.564103
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Male 2016 2015 BRFSS Alcohol 21.9 21.4 2.336449
ALC2_2 Binge drinking prevalence among adults aged >= 18 years Overall 2016 2015 BRFSS Alcohol 16.9 16.3 3.680982
ALC2_3 Binge drinking prevalence among women aged 18-44 years Overall 2016 2015 BRFSS Alcohol 18.7 17.7 5.649718
ALC5_1 Heavy drinking among adults aged >= 18 years Female 2016 2015 BRFSS Alcohol 5.9 5.0 18.000000
ALC5_1 Heavy drinking among adults aged >= 18 years Male 2016 2015 BRFSS Alcohol 7.1 6.4 10.937500

Let’s visualize the percent increase by stratification

library(ggplot2)
ggplot(data = cp.percent.change, aes(x=Topic,y=PercentChange))+
  geom_bar(stat = 'identity',aes(fill=Topic))+
  facet_wrap(~Stratification1, ncol = 1)

Conclusion:
It appears that Female stratification has visbile increase percentage change as compared to Male for Alcohol

Let’s visualize the top 6 percent increase by stratification

cp.percent.change.arrange <- cp.percent.change %>%
    arrange(desc(PercentChange))
ggplot(data = head(cp.percent.change.arrange), aes(x=Topic,y=PercentChange))+
  geom_bar(stat = 'identity',aes(fill=Topic))+
  facet_wrap(~Stratification1, ncol = 1) 

Conclusion:
Top 6 percentage increase in CDI topics are Alchol, Arhtiritis and Chrocnic Kidney Disease

Compare US national data with state data and calculate percent change

cdi.data.states <- filter(cdi.data,LocationAbbr!='US',YearStart=='2016')
cdi.data.states.clean <- select(cdi.data.states,YearStart,LocationAbbr,DataSource,Topic,Question,QuestionID,DataValueType,DataValue,Stratification1)
cdi.data.states.cp <- filter(cdi.data.states.clean,DataValueType=='Crude Prevalence',YearStart=='2016')
unique(cdi.data.states.cp$LocationAbbr)
##  [1] AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA KS KY LA ME MD MA MI
## [24] MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT
## [47] VA WA WV WI WY GU PR VI
## 55 Levels: AK AL AR AZ CA CO CT DC DE FL GA GU HI IA ID IL IN KS KY ... WY
cp.us.states <- merge(cdi.data.us.cp.2016,cdi.data.states.cp,by.x =c('QuestionID','Stratification1'),
                                     by.y = c('QuestionID','Stratification1'))
cp.us.states.clean <- cp.us.states %>%
    select(QuestionID,Question.x,LocationAbbr.x,LocationAbbr.y,Stratification1,YearStart.x,YearStart.y,DataSource.x,Topic.x,DataValue.x,DataValue.y)
cp.us.states.clean <- rename(cp.us.states.clean,Question=Question.x,US2016=YearStart.x,
                             USLcoation=LocationAbbr.x,StatesLocation=LocationAbbr.y,
                           States2016=YearStart.y,DataSource=DataSource.x,Topic=Topic.x,
                            USDataValue2016=DataValue.x,StatesValue2016=DataValue.y)
cp.percent.change <- cp.us.states.clean %>%
    mutate(PercentChange = (as.numeric(as.character(StatesValue2016))-
                   as.numeric(as.character(USDataValue2016)))/as.numeric(as.character(USDataValue2016))*100)
kable(head(cp.percent.change))
QuestionID Question USLcoation StatesLocation Stratification1 US2016 States2016 DataSource Topic USDataValue2016 StatesValue2016 PercentChange
ALC2_2 Binge drinking prevalence among adults aged >= 18 years US OH Female 2016 2016 BRFSS Alcohol 12.0 12.8 6.666667
ALC2_2 Binge drinking prevalence among adults aged >= 18 years US PR Female 2016 2016 BRFSS Alcohol 12.0 7.7 -35.833333
ALC2_2 Binge drinking prevalence among adults aged >= 18 years US AL Female 2016 2016 BRFSS Alcohol 12.0 8.8 -26.666667
ALC2_2 Binge drinking prevalence among adults aged >= 18 years US KS Female 2016 2016 BRFSS Alcohol 12.0 10.4 -13.333333
ALC2_2 Binge drinking prevalence among adults aged >= 18 years US DC Female 2016 2016 BRFSS Alcohol 12.0 21.3 77.500000
ALC2_2 Binge drinking prevalence among adults aged >= 18 years US VI Female 2016 2016 BRFSS Alcohol 12.0 8.6 -28.333333

Let’s compare few states..

ggplot(data = filter(cp.percent.change,StatesLocation %in% c('NY','OH','CA','FL'), 
                     Topic=='Alcohol'), aes(x=Question,y=PercentChange))+
  geom_bar(stat = 'identity',aes(fill=Question))+

  facet_wrap(~StatesLocation, ncol = 2)+
  theme_bw()

Conclusion:
For CDI topic Alchol, Florida apperars to have highest percentage change for bing driniking prevalance among women aged 18-44 years whereas NY appears to have the lowest perecent change in the same category

Let’s compare the national vs states percent change by stratification

ggplot(data = cp.percent.change, aes(x=Topic,y=PercentChange))+
  geom_bar(stat = 'identity',aes(fill=Topic))+
  facet_wrap(~Stratification1, ncol = 1)+
  theme_bw()
## Warning: Removed 215 rows containing missing values (position_stack).

Mental Health data and percent change

Let’s analyze the Mental Health percenet change and find top 6 states with highest percent change

cp.percent.change.MH.rank <- cp.percent.change %>%
    filter(Topic=='Mental Health') %>%
    arrange(desc(PercentChange))
kable(head(cp.percent.change.MH.rank))
QuestionID Question USLcoation StatesLocation Stratification1 US2016 States2016 DataSource Topic USDataValue2016 StatesValue2016 PercentChange
MTH2_0 At least 14 recent mentally unhealthy days among women aged 18-44 years US NH Overall 2016 2016 BRFSS Mental Health 15.7 24.6 56.68790
MTH2_0 At least 14 recent mentally unhealthy days among women aged 18-44 years US AR Overall 2016 2016 BRFSS Mental Health 15.7 22.5 43.31210
MTH2_0 At least 14 recent mentally unhealthy days among women aged 18-44 years US WV Overall 2016 2016 BRFSS Mental Health 15.7 21.7 38.21656
MTH2_0 At least 14 recent mentally unhealthy days among women aged 18-44 years US KY Overall 2016 2016 BRFSS Mental Health 15.7 21.5 36.94268
MTH2_0 At least 14 recent mentally unhealthy days among women aged 18-44 years US MS Overall 2016 2016 BRFSS Mental Health 15.7 20.0 27.38854
MTH2_0 At least 14 recent mentally unhealthy days among women aged 18-44 years US OH Overall 2016 2016 BRFSS Mental Health 15.7 19.8 26.11465
ggplot(data = head(cp.percent.change.MH.rank), aes(x=Question,y=PercentChange))+
  geom_bar(stat = 'identity',aes(fill=Question))+
  facet_wrap(~StatesLocation, ncol = 3)+
  theme_bw()

Conclusion:
New Hampshire (NH) appears to have highest percentage change in Mental Hygiene CDI topic as compared to national data

Let’s analyze the yearly data provided by difference sources by topic and calculate their perncetage in the group

cdi.data.us.01 <- cdi.data.us %>%
        group_by(YearStart,DataSource,Topic) %>%
        summarize(count=n())
cdi.data.us.stat <- cdi.data.us.01 %>%
    arrange(desc(count))
cdi.data.us.stat.countPercent <- mutate(cdi.data.us.stat,percentTotal = round(count/sum(count)*100,2))
cd.data.brfss <- filter(cdi.data.us.stat.countPercent,DataSource=='BRFSS')
kable(head(cd.data.brfss))
YearStart DataSource Topic count percentTotal
2011 BRFSS Diabetes 73 19.47
2013 BRFSS Diabetes 73 18.53
2015 BRFSS Diabetes 73 18.48
2012 BRFSS Diabetes 67 18.06
2011 BRFSS Cardiovascular Disease 49 13.07
2013 BRFSS Cardiovascular Disease 49 12.44
ggplot(data = cd.data.brfss, aes(x=Topic,y=percentTotal))+
  geom_bar(stat = 'identity',aes(fill=Topic))+
  facet_wrap(~YearStart, ncol = 1)+
  theme_bw()

Conclusion:
Diabetes appears to be the highest and consistent as compared to other CDI topic for all given years

  1. Population by Country (1980 - 2010). Total population (in millions) by country, 1980 to 2010. Compiled by Energy Information Administration (EIA).
    Source: https://catalog.data.gov/dataset/population-by-country-1980-2010

Data Acquisition

population.data <- read.csv('https://raw.githubusercontent.com/niteen11/MSDS/master/DATA607/week6/dataset/populationbycountry19802010millions.csv')
head(population.data)
##                           X     X1980     X1981     X1982     X1983
## 1             North America 320.27638 324.44694 328.62014 332.72487
## 2                   Bermuda   0.05473   0.05491   0.05517   0.05551
## 3                    Canada   24.5933      24.9   25.2019   25.4563
## 4                 Greenland   0.05021   0.05103   0.05166   0.05211
## 5                    Mexico  68.34748  69.96926   71.6409  73.36288
## 6 Saint Pierre and Miquelon   0.00599   0.00601   0.00605   0.00607
##       X1984     X1985     X1986     X1987    X1988     X1989    X1990
## 1 336.72143 340.74811 344.89548 349.07829 353.2939 357.68457 362.4468
## 2   0.05585   0.05618   0.05651   0.05683  0.05717   0.05749  0.05778
## 3   25.7018   25.9416   26.2038   26.5497  26.8948   27.3793  27.7906
## 4   0.05263   0.05315   0.05364    0.0541  0.05485   0.05541  0.05563
## 5  75.08014  76.76723  78.44243  80.12249 81.78182  83.36684 84.91365
## 6   0.00611   0.00616   0.00621   0.00625  0.00628   0.00631  0.00632
##       X1991     X1992     X1993    X1994     X1995    X1996     X1997
## 1 367.70684 373.29069 378.74233 383.9166 388.97216 393.9428 398.97205
## 2    0.0581    0.0587   0.05924  0.05975   0.06029  0.06087   0.06145
## 3   28.1179  28.54489  28.95334 29.33081  29.69053 30.02632   30.3056
## 4   0.05554   0.05549   0.05564  0.05592   0.05619  0.05634   0.05651
## 5  86.48803  88.11103  89.74914  91.3379  92.88035 94.39858  95.89515
## 6   0.00633   0.00636   0.00638   0.0064    0.0064  0.00641   0.00642
##       X1998     X1999    X2000     X2001     X2002     X2003     X2004
## 1 403.85585 408.60296 413.3245 417.83236 422.05268 426.06238 430.26938
## 2   0.06198   0.06251  0.06306   0.06361   0.06418   0.06476   0.06534
## 3  30.55166  30.82026 31.09956  31.37674  31.64096  31.88931  32.13476
## 4   0.05661    0.0567  0.05689   0.05713   0.05736   0.05754    0.0577
## 5  97.32506  98.61691 99.92662 101.24696 102.47993 103.71806 104.95959
## 6   0.00642   0.00643  0.00641   0.00637   0.00633   0.00629   0.00625
##       X2005     X2006     X2007     X2008     X2009     X2010
## 1 434.47232 438.82964  443.3473 447.67394 451.83698 456.59331
## 2   0.06591   0.06644   0.06692   0.06739   0.06784   0.06827
## 3  32.38638  32.65668  32.93596   33.2127  33.48721  33.75974
## 4   0.05778   0.05764   0.05753   0.05756    0.0576   0.05764
## 5  106.2029 107.44953 108.70089  109.9554 111.21179 112.46886
## 6    0.0062   0.00615    0.0061   0.00605     0.006   0.00594

Let’s rename the country column

population.data <- rename(population.data,Country=X)
head(population.data)
##                     Country     X1980     X1981     X1982     X1983
## 1             North America 320.27638 324.44694 328.62014 332.72487
## 2                   Bermuda   0.05473   0.05491   0.05517   0.05551
## 3                    Canada   24.5933      24.9   25.2019   25.4563
## 4                 Greenland   0.05021   0.05103   0.05166   0.05211
## 5                    Mexico  68.34748  69.96926   71.6409  73.36288
## 6 Saint Pierre and Miquelon   0.00599   0.00601   0.00605   0.00607
##       X1984     X1985     X1986     X1987    X1988     X1989    X1990
## 1 336.72143 340.74811 344.89548 349.07829 353.2939 357.68457 362.4468
## 2   0.05585   0.05618   0.05651   0.05683  0.05717   0.05749  0.05778
## 3   25.7018   25.9416   26.2038   26.5497  26.8948   27.3793  27.7906
## 4   0.05263   0.05315   0.05364    0.0541  0.05485   0.05541  0.05563
## 5  75.08014  76.76723  78.44243  80.12249 81.78182  83.36684 84.91365
## 6   0.00611   0.00616   0.00621   0.00625  0.00628   0.00631  0.00632
##       X1991     X1992     X1993    X1994     X1995    X1996     X1997
## 1 367.70684 373.29069 378.74233 383.9166 388.97216 393.9428 398.97205
## 2    0.0581    0.0587   0.05924  0.05975   0.06029  0.06087   0.06145
## 3   28.1179  28.54489  28.95334 29.33081  29.69053 30.02632   30.3056
## 4   0.05554   0.05549   0.05564  0.05592   0.05619  0.05634   0.05651
## 5  86.48803  88.11103  89.74914  91.3379  92.88035 94.39858  95.89515
## 6   0.00633   0.00636   0.00638   0.0064    0.0064  0.00641   0.00642
##       X1998     X1999    X2000     X2001     X2002     X2003     X2004
## 1 403.85585 408.60296 413.3245 417.83236 422.05268 426.06238 430.26938
## 2   0.06198   0.06251  0.06306   0.06361   0.06418   0.06476   0.06534
## 3  30.55166  30.82026 31.09956  31.37674  31.64096  31.88931  32.13476
## 4   0.05661    0.0567  0.05689   0.05713   0.05736   0.05754    0.0577
## 5  97.32506  98.61691 99.92662 101.24696 102.47993 103.71806 104.95959
## 6   0.00642   0.00643  0.00641   0.00637   0.00633   0.00629   0.00625
##       X2005     X2006     X2007     X2008     X2009     X2010
## 1 434.47232 438.82964  443.3473 447.67394 451.83698 456.59331
## 2   0.06591   0.06644   0.06692   0.06739   0.06784   0.06827
## 3  32.38638  32.65668  32.93596   33.2127  33.48721  33.75974
## 4   0.05778   0.05764   0.05753   0.05756    0.0576   0.05764
## 5  106.2029 107.44953 108.70089  109.9554 111.21179 112.46886
## 6    0.0062   0.00615    0.0061   0.00605     0.006   0.00594

Let’s Tidy and Trnasform data

population.data.US.tidy <- population.data %>%
        filter(Country == "United States") %>%
        gather(Year, Population, X1980:X2010) %>%
        separate(Year, c("left","Year"), sep="X") %>%
        select(-left) %>%
        arrange(Year)
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(population.data.US.tidy)
##         Country Year Population
## 1 United States 1980  227.22468
## 2 United States 1981  229.46571
## 3 United States 1982  231.66446
## 4 United States 1983  233.79199
## 5 United States 1984   235.8249
## 6 United States 1985   237.9238
# extract 1995 value as the baseline value and add to US population dataframe
us_base <- population.data.US.tidy[population.data.US.tidy$Year==1995,3]
population.data.US.tidy$us_baseline <- us_base

population.data.US.tidy <- population.data.US.tidy %>% 
  mutate(us_change = (as.numeric(as.character(Population))-as.numeric(as.character(us_baseline)))/as.numeric(as.character(us_baseline)))
head(population.data.US.tidy)
##         Country Year Population us_baseline  us_change
## 1 United States 1980  227.22468   266.27839 -0.1466650
## 2 United States 1981  229.46571   266.27839 -0.1382488
## 3 United States 1982  231.66446   266.27839 -0.1299915
## 4 United States 1983  233.79199   266.27839 -0.1220016
## 5 United States 1984   235.8249   266.27839 -0.1143671
## 6 United States 1985   237.9238   266.27839 -0.1064848
# create a world dataframe, clean and turn into long data format
world.data <- population.data %>%
        filter(Country != "United States") %>%
        gather(Year, Population, X1980:X2010) %>%
        separate(Year, c("left","Year"), sep="X") %>%
        select(-left) %>%
        arrange(Year) %>%
        filter(Population != "NA")
## Warning: attributes are not identical across measure variables;
## they will be dropped
# create baselines for each country
world_base <- world.data %>%
        filter(Year == 1995) %>%
        select(Country, world_Baseline = Population)

world.data <- world.data %>%
        left_join(world_base) %>%
        arrange(Country) %>%
        mutate(population_change = (as.numeric(as.character(Population))-as.numeric(as.character(world_Baseline)))/
                 as.numeric(as.character(world_Baseline)))
## Joining, by = "Country"
## Warning in evalq((as.numeric(as.character(Population))
## - as.numeric(as.character(world_Baseline)))/
## as.numeric(as.character(world_Baseline)), : NAs introduced by coercion
## Warning in evalq((as.numeric(as.character(Population))
## - as.numeric(as.character(world_Baseline)))/
## as.numeric(as.character(world_Baseline)), : NAs introduced by coercion

## Warning in evalq((as.numeric(as.character(Population))
## - as.numeric(as.character(world_Baseline)))/
## as.numeric(as.character(world_Baseline)), : NAs introduced by coercion
head(world.data)
##       Country Year Population world_Baseline population_change
## 1 Afghanistan 1980    15.0436       19.22594        -0.2175363
## 2 Afghanistan 1981   13.67368       19.22594        -0.2887900
## 3 Afghanistan 1982   12.57743       19.22594        -0.3458094
## 4 Afghanistan 1983   12.43058       19.22594        -0.3534475
## 5 Afghanistan 1984   12.75384       19.22594        -0.3366337
## 6 Afghanistan 1985   13.09371       19.22594        -0.3189561
# change year variables from character to numeric
world.data$Year <- as.numeric(world.data$Year)
population.data.US.tidy$Year <- as.numeric(population.data.US.tidy$Year)

Let’s Rank world population top 5 and bottom 5 in top 50 list

rank <- world.data %>% 
        filter(Year == 2010) %>% 
        arrange(desc(population_change)) %>%
        mutate(rank = seq(1,length(Country), by=1)) %>%
        filter(rank < 6 | rank > 45  )
kable(head(rank,10))
Country Year Population world_Baseline population_change rank
United Arab Emirates 2010 4.97559 2.45798 1.0242598 1
Rwanda 2010 11.05598 5.47298 1.0201024 2
Liberia 2010 3.68508 1.90028 0.9392300 3
Western Sahara 2010 0.49152 0.26387 0.8627354 4
Niger 2010 15.87827 9.19933 0.7260246 5
Cameroon 2010 19.29415 13.60316 0.4183579 46
Macau 2010 0.56796 0.4013 0.4153003 47
Tanzania 2010 41.8929 29.7529 0.4080275 48
Mozambique 2010 22.06145 15.76475 0.3994164 49
Middle East 2010 212.33692 152.34103 0.3938262 50

Let’s visualize ppulation change the world data

p <- ggplot(world.data, aes(Year, population_change, group=Country)) +
        theme_bw() +
        theme(plot.background = element_blank(),
              panel.grid.minor = element_blank(),
              panel.grid.major.x = element_blank(),
              panel.grid.major.y = element_line(linetype = 3, colour = "grey50"),
              panel.border = element_blank(),
              panel.background = element_blank(),
              axis.ticks = element_blank(),  
              axis.title = element_blank()) +
        geom_line(colour="grey90", alpha=.9)

print(p)
## Warning: Removed 312 rows containing missing values (geom_path).

Now indicate the US data as a reference line

p <- p +
        geom_line(data=population.data.US.tidy, aes(Year, us_change, group=1), linetype=5)

print(p)
## Warning: Removed 312 rows containing missing values (geom_path).

Now Add the UAE data to visulize the population change

p <- p +
        geom_line(data=filter(world.data, Country=="United Arab Emirates"), 
                  aes(Year, population_change, group=Country), colour="dodgerblue", size = 1)

print(p)
## Warning: Removed 312 rows containing missing values (geom_path).

Conclusion:
We looked into the world population data and ranked them according to percent population change. We took 1995 as a baseline year and UAE appears to be have highest population change w.r.t. US population.

  1. US Daily Electricity Data - Demand, Net Generation and Total Net Actual Change
    Source: us_daily_electric_system_operating_data

Data Acquisition

us.daily.elec.data <- read.csv('https://raw.githubusercontent.com/niteen11/MSDS/master/DATA607/week6/dataset/us_daily_electric_system_operating_data.csv',skip=4,header = T)
dim(us.daily.elec.data)
## [1] 56 32
head(us.daily.elec.data)
##                  megawatthours X01.01.2018 X01.02.2018 X01.03.2018
## 1          California (region)                      NA          NA
## 2                       Demand      645599      704074      727216
## 3               Net generation      463629      502312      504456
## 4 Total net actual interchange     -179525     -201654     -222762
## 5           Carolinas (region)                      NA          NA
## 6                       Demand      847133      938726      948954
##   X01.04.2018 X01.05.2018 X01.06.2018 X01.07.2018 X01.08.2018 X01.09.2018
## 1          NA          NA          NA          NA          NA          NA
## 2      714305      712021      668299      655802      731881      729380
## 3      505148      484644      447821      447952      500797      493884
## 4     -209163     -213905     -219968     -207861     -237089     -246264
## 5          NA          NA          NA          NA          NA          NA
## 6      901939      939699      932742      946665      840094      646932
##   X01.10.2018 X01.11.2018 X01.12.2018 X01.13.2018 X01.14.2018 X01.15.2018
## 1          NA          NA          NA          NA          NA          NA
## 2      716698      675260      687504      665136      642080      703473
## 3      482843      435107      460105      452264      428364      474015
## 4     -234361     -240146     -227413     -231833     -213729     -229468
## 5          NA          NA          NA          NA          NA          NA
## 6      590003      561119      513368      587356      761043      823663
##   X01.16.2018 X01.17.2018 X01.18.2018 X01.19.2018 X01.20.2018 X01.21.2018
## 1          NA          NA          NA          NA          NA          NA
## 2      713050      715733      718298      712862      669421      665340
## 3      487101      484557      488237      484276      458660      437506
## 4     -225955     -231182     -230069     -228598     -210774     -227842
## 5          NA          NA          NA          NA          NA          NA
## 6      778424      779099      858007      755994      663462      575237
##   X01.22.2018 X01.23.2018 X01.24.2018 X01.25.2018 X01.26.2018 X01.27.2018
## 1          NA          NA          NA          NA          NA          NA
## 2      727071      722833      728743      732552      699533      666415
## 3      502041      510543      500446      501674      464103      432315
## 4     -225042     -212304     -228303     -230401     -233814     -231742
## 5          NA          NA          NA          NA          NA          NA
## 6      588743      523900      603365      676274      685613      575060
##   X01.28.2018 X01.29.2018 X01.30.2018 X01.31.2018
## 1          NA          NA          NA          NA
## 2      647844      725483      720384      717446
## 3      433092      491871      480451      492836
## 4     -213582     -233589     -239945     -224620
## 5          NA          NA          NA          NA
## 6      511872      567562      697175      752846

Tidying and Transforming data

us.daily.elec.data.tidy <- us.daily.elec.data %>%
    gather(date, usage, X01.01.2018:X01.31.2018) %>%
    separate(date, c('left','date'), sep='X')
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(us.daily.elec.data.tidy,3)
##         megawatthours left       date  usage
## 1 California (region)      01.01.2018       
## 2              Demand      01.01.2018 645599
## 3      Net generation      01.01.2018 463629

Let’s check for numeric value in usage and replace non numeric values with NA

us.data.clean <- us.daily.elec.data.tidy %>%
        select(megawatthours,date,usage)  %>%
        mutate(usage = ifelse(grepl("^[-]{0,1}[0-9]{0,}.{0,1}[0-9]{1,}$",usage),usage,NA))
head(us.data.clean)
##                  megawatthours       date   usage
## 1          California (region) 01.01.2018    <NA>
## 2                       Demand 01.01.2018  645599
## 3               Net generation 01.01.2018  463629
## 4 Total net actual interchange 01.01.2018 -179525
## 5           Carolinas (region) 01.01.2018    <NA>
## 6                       Demand 01.01.2018  847133

Using muatate function to create location attribute

us.data.clean.loc <- us.data.clean %>%
     mutate(location = ifelse(is.na(usage),as.character(megawatthours),lag(as.character(megawatthours))))
head(us.data.clean.loc,10)
##                   megawatthours       date   usage            location
## 1           California (region) 01.01.2018    <NA> California (region)
## 2                        Demand 01.01.2018  645599 California (region)
## 3                Net generation 01.01.2018  463629              Demand
## 4  Total net actual interchange 01.01.2018 -179525      Net generation
## 5            Carolinas (region) 01.01.2018    <NA>  Carolinas (region)
## 6                        Demand 01.01.2018  847133  Carolinas (region)
## 7                Net generation 01.01.2018  802908              Demand
## 8  Total net actual interchange 01.01.2018  -43153      Net generation
## 9              Central (region) 01.01.2018    <NA>    Central (region)
## 10                       Demand 01.01.2018  890403    Central (region)

Now populating location value for each categorical data

location <- c('')
for(row in 1:nrow(us.data.clean.loc)) {
    loc.value <- us.data.clean.loc[row,'megawatthours']
   
    if(is.na(us.data.clean.loc[row,'usage'])) {
        location <- loc.value }  
    rbind(us.data.clean.loc$location[row] <- as.character(location) )
}
head(us.data.clean.loc,10)
##                   megawatthours       date   usage            location
## 1           California (region) 01.01.2018    <NA> California (region)
## 2                        Demand 01.01.2018  645599 California (region)
## 3                Net generation 01.01.2018  463629 California (region)
## 4  Total net actual interchange 01.01.2018 -179525 California (region)
## 5            Carolinas (region) 01.01.2018    <NA>  Carolinas (region)
## 6                        Demand 01.01.2018  847133  Carolinas (region)
## 7                Net generation 01.01.2018  802908  Carolinas (region)
## 8  Total net actual interchange 01.01.2018  -43153  Carolinas (region)
## 9              Central (region) 01.01.2018    <NA>    Central (region)
## 10                       Demand 01.01.2018  890403    Central (region)

Once the location is populated for each data point, we can get rid of NA usage values

us.data.clean.loc.df <- filter(us.data.clean.loc,!is.na(usage))
kable(head(us.data.clean.loc.df,15))
megawatthours date usage location
Demand 01.01.2018 645599 California (region)
Net generation 01.01.2018 463629 California (region)
Total net actual interchange 01.01.2018 -179525 California (region)
Demand 01.01.2018 847133 Carolinas (region)
Net generation 01.01.2018 802908 Carolinas (region)
Total net actual interchange 01.01.2018 -43153 Carolinas (region)
Demand 01.01.2018 890403 Central (region)
Net generation 01.01.2018 898131 Central (region)
Total net actual interchange 01.01.2018 7727 Central (region)
Demand 01.01.2018 1289538 Electric Reliability Council of Texas
Net generation 01.01.2018 1296554 Electric Reliability Council of Texas
Total net actual interchange 01.01.2018 6981 Electric Reliability Council of Texas
Demand 01.01.2018 588600 Florida (region)
Net generation 01.01.2018 577335 Florida (region)
Total net actual interchange 01.01.2018 1032 Florida (region)

let’s fix the day in the date for Jan month 2018

us.data.clean.loc.df.spread <- us.data.clean.loc.df %>%
  spread(megawatthours,usage) %>%
  mutate(Day= as.numeric(substr(date,4,5))) %>%
  select(2:6)
kable(head(us.data.clean.loc.df.spread))
location Demand Net generation Total net actual interchange Day
California (region) 645599 463629 -179525 1
Carolinas (region) 847133 802908 -43153 1
Central (region) 890403 898131 7727 1
Electric Reliability Council of Texas 1289538 1296554 6981 1
Florida (region) 588600 577335 1032 1
Mid-Atlantic (region) 2828052 2893869 64583 1

Data Visulization

ggplot(us.data.clean.loc.df.spread, aes(x=Demand, y=Day, group=location,color=location)) +
    geom_line()+
    facet_wrap(~location,ncol = 2)

ggplot(data = us.data.clean.loc.df.spread, aes(x=Day,y=Demand))+
  geom_bar(stat = 'identity',aes(fill=location))+
  facet_wrap(~location, ncol = 2)+
  theme_bw()

Conclusion:
By alanyzing above data and plots, it appears that there is consistency in demand of electicity in california, carolinas, central, southeast regions and is higher as compared to other regions.