library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DT)
library(knitr)
cdi.data <- read.csv('C:\\NITEEN\\CUNY\\Spring 2018\\DATA 607\\week6\\U.S._Chronic_Disease_Indicators__CDI_.csv')
dim(cdi.data)
## [1] 523486 34
head(cdi.data)
## YearStart YearEnd LocationAbbr LocationDesc DataSource Topic
## 1 2016 2016 US United States BRFSS Alcohol
## 2 2016 2016 AL Alabama BRFSS Alcohol
## 3 2016 2016 AK Alaska BRFSS Alcohol
## 4 2016 2016 AZ Arizona BRFSS Alcohol
## 5 2016 2016 AR Arkansas BRFSS Alcohol
## 6 2016 2016 CA California BRFSS Alcohol
## Question Response
## 1 Binge drinking prevalence among adults aged >= 18 years NA
## 2 Binge drinking prevalence among adults aged >= 18 years NA
## 3 Binge drinking prevalence among adults aged >= 18 years NA
## 4 Binge drinking prevalence among adults aged >= 18 years NA
## 5 Binge drinking prevalence among adults aged >= 18 years NA
## 6 Binge drinking prevalence among adults aged >= 18 years NA
## DataValueUnit DataValueType DataValue DataValueAlt
## 1 % Crude Prevalence 16.9 16.9
## 2 % Crude Prevalence 13.0 13.0
## 3 % Crude Prevalence 18.2 18.2
## 4 % Crude Prevalence 15.6 15.6
## 5 % Crude Prevalence 15.0 15.0
## 6 % Crude Prevalence 16.3 16.3
## DataValueFootnoteSymbol DatavalueFootnote LowConfidenceLimit
## 1 * 50 States + DC: US Median 16.0
## 2 11.9
## 3 16.0
## 4 14.3
## 5 13.0
## 6 15.4
## HighConfidenceLimit StratificationCategory1 Stratification1
## 1 18.0 Overall Overall
## 2 14.1 Overall Overall
## 3 20.6 Overall Overall
## 4 16.9 Overall Overall
## 5 17.2 Overall Overall
## 6 17.2 Overall Overall
## StratificationCategory2 Stratification2 StratificationCategory3
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## Stratification3 GeoLocation ResponseID
## 1 NA NA
## 2 NA (32.84057112200048, -86.63186076199969) NA
## 3 NA (64.84507995700051, -147.72205903599973) NA
## 4 NA (34.865970280000454, -111.76381127699972) NA
## 5 NA (34.74865012400045, -92.27449074299966) NA
## 6 NA (37.63864012300047, -120.99999953799971) NA
## LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1
## 1 59 ALC ALC2_2 CrdPrev OVERALL
## 2 1 ALC ALC2_2 CrdPrev OVERALL
## 3 2 ALC ALC2_2 CrdPrev OVERALL
## 4 4 ALC ALC2_2 CrdPrev OVERALL
## 5 5 ALC ALC2_2 CrdPrev OVERALL
## 6 6 ALC ALC2_2 CrdPrev OVERALL
## StratificationID1 StratificationCategoryID2 StratificationID2
## 1 OVR NA NA
## 2 OVR NA NA
## 3 OVR NA NA
## 4 OVR NA NA
## 5 OVR NA NA
## 6 OVR NA NA
## StratificationCategoryID3 StratificationID3
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
cdi.data.us <- cdi.data %>%
select(YearStart,LocationAbbr,DataSource,Topic,Question,QuestionID,DataValueType,DataValue,DataValueAlt,Stratification1) %>%
filter(LocationAbbr=='US' , DataValueAlt!='NA')
kable(head(cdi.data.us))
YearStart | LocationAbbr | DataSource | Topic | Question | QuestionID | DataValueType | DataValue | DataValueAlt | Stratification1 |
---|---|---|---|---|---|---|---|---|---|
2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | ALC2_2 | Crude Prevalence | 16.9 | 16.9 | Overall |
2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | ALC2_2 | Age-adjusted Prevalence | 17.9 | 17.9 | Overall |
2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | ALC2_2 | Crude Prevalence | 21.9 | 21.9 | Male |
2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | ALC2_2 | Age-adjusted Prevalence | 22.5 | 22.5 | Male |
2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | ALC2_2 | Crude Prevalence | 12.0 | 12.0 | Female |
2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | ALC2_2 | Age-adjusted Prevalence | 13.1 | 13.1 | Female |
unique(cdi.data.us$Topic)
## [1] Alcohol
## [2] Arthritis
## [3] Asthma
## [4] Cancer
## [5] Chronic Obstructive Pulmonary Disease
## [6] Cardiovascular Disease
## [7] Diabetes
## [8] Immunization
## [9] Mental Health
## [10] Chronic Kidney Disease
## [11] Nutrition, Physical Activity, and Weight Status
## [12] Older Adults
## [13] Oral Health
## [14] Overarching Conditions
## [15] Reproductive Health
## [16] Tobacco
## [17] Disability
## 17 Levels: Alcohol Arthritis Asthma Cancer ... Tobacco
unique(cdi.data.us$DataSource)
## [1] BRFSS
## [2] YRBSS
## [3] ACS 5-Year Estimates
## [4] USRDS
## [5] NVSS
## [6] Statewide central cancer registries
## [7] Death certificate
## [8] CDC Breastfeeding Report Card
## [9] mPINC
## [10] HRSA, UDS
## [11] ANRF
## [12] AEDS
## [13] InfoUSA; USDA
## [14] WFRS
## [15] National Immunization Survey
## [16] USDA National Farmers' Market Directory
## [17] Current Population Survey
## [18] ACS 1-Year Estimates
## [19] NSCH
## [20] Birth Certificate, NVSS
## [21] NVSS, Mortality
## [22] CMS CCW
## [23] CMS Part A Claims Data
## 32 Levels: ACS 1-Year Estimates ACS 5-Year Estimates AEDS ANRF ... YRBSS
unique(cdi.data.us$DataValueType)
## [1] Crude Prevalence
## [2] Age-adjusted Prevalence
## [3] Mean
## [4] Age-adjusted Mean
## [5] Number
## [6] Median
## [7] Adjusted by age, sex, race and ethnicity
## [8] Age-adjusted Rate
## [9] Crude Rate
## [10] Average Annual Age-adjusted Rate
## [11] Average Annual Crude Rate
## [12] Average Annual Number
## [13] Percent
## [14] Per capita alcohol consumption
## [15] Prevalence
## 19 Levels: Adjusted by age, sex, race and ethnicity ... Yes/No
unique(cdi.data.us$YearStart)
## [1] 2016 2015 2014 2013 2012 2011 2010 2009 2001 2008
nrow(cdi.data.us)
## [1] 3635
cdi.data.us.crdprv <- cdi.data.us %>%
filter(DataValueType=='Crude Prevalence')
nrow(cdi.data.us.crdprv)
## [1] 1242
ggplot(data = cdi.data.us.crdprv, aes(x=Topic,y=DataValueAlt))+
geom_bar(stat = 'identity',aes(fill=Topic))+
facet_wrap(~Stratification1, ncol = 2)+
theme_bw()
ggplot(data=filter(cdi.data.us.crdprv,Topic=='Alcohol'),aes(x=YearStart, y=DataValueAlt, group=Stratification1
,color=Stratification1)) +
geom_line()+
geom_point()+
facet_wrap(~Question,ncol = 1)+
theme_bw()
library(dplyr)
cdi.data.us.crdprv.ALC2_2 <- filter(cdi.data.us.crdprv,QuestionID=='ALC2_2')
summary(cdi.data.us.crdprv.ALC2_2$DataValueAlt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.10 12.15 16.85 16.96 21.40 24.20
fill <- '#56B4E9'
ggplot(cdi.data.us.crdprv.ALC2_2, aes(x =Stratification1, y = DataValueAlt)) +
geom_boxplot(fill=fill)+
theme_bw()
cdi.data.us.ALC2_2.male <- filter(cdi.data.us,Stratification1=='Overall',QuestionID=='ALC2_2')
hist(cdi.data.us.ALC2_2.male$DataValueAlt)
The plot appears to have normal distribution with right skewedness
alc2_2_mean <- mean(cdi.data.us.ALC2_2.male$DataValueAlt)
alc2_2_mean
## [1] 17.23333
alc2_2_sd <- sd(cdi.data.us.ALC2_2.male$DataValueAlt)
alc2_2_sd
## [1] 0.8616404
IQR(cdi.data.us.ALC2_2.male$DataValueAlt)
## [1] 0.725
hist(cdi.data.us.ALC2_2.male$DataValueAlt, probability = TRUE)
x <- 16:19.5
y <- dnorm(x = x, mean = alc2_2_mean, sd = alc2_2_sd)
lines(x = x, y = y, col = "blue")
qqnorm(cdi.data.us.ALC2_2.male$DataValueAlt)
qqline(cdi.data.us.ALC2_2.male$DataValueAlt)
Probability of randomly chosing a man with datavalue=19
1 - pnorm(q = 19, mean = alc2_2_mean, sd = alc2_2_sd)
## [1] 0.02016502
Probability of randomly chosing a man with datavalue=16
1 - pnorm(q = 16, mean = alc2_2_mean, sd = alc2_2_sd)
## [1] 0.9238391
Let’s perform the data analysis for US (national) for the year 2016 and for datavalue type ‘Crude Prevalence’
cdi.data.us.cp.2016 <- filter(cdi.data.us,DataValueType=='Crude Prevalence',YearStart=='2016')
head(cdi.data.us.cp.2016)
## YearStart LocationAbbr DataSource Topic
## 1 2016 US BRFSS Alcohol
## 2 2016 US BRFSS Alcohol
## 3 2016 US BRFSS Alcohol
## 4 2016 US BRFSS Alcohol
## 5 2016 US BRFSS Alcohol
## 6 2016 US BRFSS Alcohol
## Question QuestionID
## 1 Binge drinking prevalence among adults aged >= 18 years ALC2_2
## 2 Binge drinking prevalence among adults aged >= 18 years ALC2_2
## 3 Binge drinking prevalence among adults aged >= 18 years ALC2_2
## 4 Binge drinking prevalence among women aged 18-44 years ALC2_3
## 5 Heavy drinking among adults aged >= 18 years ALC5_1
## 6 Heavy drinking among adults aged >= 18 years ALC5_1
## DataValueType DataValue DataValueAlt Stratification1
## 1 Crude Prevalence 16.9 16.9 Overall
## 2 Crude Prevalence 21.9 21.9 Male
## 3 Crude Prevalence 12.0 12.0 Female
## 4 Crude Prevalence 18.7 18.7 Overall
## 5 Crude Prevalence 6.5 6.5 Overall
## 6 Crude Prevalence 7.1 7.1 Male
Let’s perform the data analysis for US (national) for the year 2015 and for datavalue type ‘Crude Prevalence’
cdi.data.us.cp.2015 <- filter(cdi.data.us,DataValueType=='Crude Prevalence',YearStart=='2015')
head(cdi.data.us.cp.2015)
## YearStart LocationAbbr DataSource Topic
## 1 2015 US YRBSS Alcohol
## 2 2015 US YRBSS Alcohol
## 3 2015 US BRFSS Alcohol
## 4 2015 US BRFSS Alcohol
## 5 2015 US BRFSS Alcohol
## 6 2015 US BRFSS Alcohol
## Question QuestionID
## 1 Alcohol use among youth ALC1_1
## 2 Binge drinking prevalence among youth ALC2_1
## 3 Binge drinking prevalence among adults aged >= 18 years ALC2_2
## 4 Binge drinking prevalence among adults aged >= 18 years ALC2_2
## 5 Binge drinking prevalence among adults aged >= 18 years ALC2_2
## 6 Binge drinking prevalence among women aged 18-44 years ALC2_3
## DataValueType DataValue DataValueAlt Stratification1
## 1 Crude Prevalence 32.8 32.8 Overall
## 2 Crude Prevalence 17.7 17.7 Overall
## 3 Crude Prevalence 11.7 11.7 Female
## 4 Crude Prevalence 21.4 21.4 Male
## 5 Crude Prevalence 16.3 16.3 Overall
## 6 Crude Prevalence 17.7 17.7 Overall
Now use merge function to combine the 2016 an 2015 dataset using QuestionID and Stratification1 features
cdi.data.us.cp.2016.2015.tidy <- merge(cdi.data.us.cp.2016,cdi.data.us.cp.2015,by.x =c('QuestionID','Stratification1'),
by.y = c('QuestionID','Stratification1'))
kable(head(cdi.data.us.cp.2016.2015.tidy))
QuestionID | Stratification1 | YearStart.x | LocationAbbr.x | DataSource.x | Topic.x | Question.x | DataValueType.x | DataValue.x | DataValueAlt.x | YearStart.y | LocationAbbr.y | DataSource.y | Topic.y | Question.y | DataValueType.y | DataValue.y | DataValueAlt.y |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ALC2_2 | Female | 2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | Crude Prevalence | 12.0 | 12.0 | 2015 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | Crude Prevalence | 11.7 | 11.7 |
ALC2_2 | Male | 2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | Crude Prevalence | 21.9 | 21.9 | 2015 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | Crude Prevalence | 21.4 | 21.4 |
ALC2_2 | Overall | 2016 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | Crude Prevalence | 16.9 | 16.9 | 2015 | US | BRFSS | Alcohol | Binge drinking prevalence among adults aged >= 18 years | Crude Prevalence | 16.3 | 16.3 |
ALC2_3 | Overall | 2016 | US | BRFSS | Alcohol | Binge drinking prevalence among women aged 18-44 years | Crude Prevalence | 18.7 | 18.7 | 2015 | US | BRFSS | Alcohol | Binge drinking prevalence among women aged 18-44 years | Crude Prevalence | 17.7 | 17.7 |
ALC5_1 | Female | 2016 | US | BRFSS | Alcohol | Heavy drinking among adults aged >= 18 years | Crude Prevalence | 5.9 | 5.9 | 2015 | US | BRFSS | Alcohol | Heavy drinking among adults aged >= 18 years | Crude Prevalence | 5.0 | 5.0 |
ALC5_1 | Male | 2016 | US | BRFSS | Alcohol | Heavy drinking among adults aged >= 18 years | Crude Prevalence | 7.1 | 7.1 | 2015 | US | BRFSS | Alcohol | Heavy drinking among adults aged >= 18 years | Crude Prevalence | 6.4 | 6.4 |
colnames(cdi.data.us.cp.2016.2015.tidy)
## [1] "QuestionID" "Stratification1" "YearStart.x"
## [4] "LocationAbbr.x" "DataSource.x" "Topic.x"
## [7] "Question.x" "DataValueType.x" "DataValue.x"
## [10] "DataValueAlt.x" "YearStart.y" "LocationAbbr.y"
## [13] "DataSource.y" "Topic.y" "Question.y"
## [16] "DataValueType.y" "DataValue.y" "DataValueAlt.y"
Let’s clean the data
cp.percent.change.2016.2015.clean <- cdi.data.us.cp.2016.2015.tidy %>%
select(QuestionID,Question.x,Stratification1,YearStart.x,YearStart.y,DataSource.x,Topic.x,DataValue.x,DataValue.y)
kable(head(cp.percent.change.2016.2015.clean))
QuestionID | Question.x | Stratification1 | YearStart.x | YearStart.y | DataSource.x | Topic.x | DataValue.x | DataValue.y |
---|---|---|---|---|---|---|---|---|
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Female | 2016 | 2015 | BRFSS | Alcohol | 12.0 | 11.7 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Male | 2016 | 2015 | BRFSS | Alcohol | 21.9 | 21.4 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Overall | 2016 | 2015 | BRFSS | Alcohol | 16.9 | 16.3 |
ALC2_3 | Binge drinking prevalence among women aged 18-44 years | Overall | 2016 | 2015 | BRFSS | Alcohol | 18.7 | 17.7 |
ALC5_1 | Heavy drinking among adults aged >= 18 years | Female | 2016 | 2015 | BRFSS | Alcohol | 5.9 | 5.0 |
ALC5_1 | Heavy drinking among adults aged >= 18 years | Male | 2016 | 2015 | BRFSS | Alcohol | 7.1 | 6.4 |
cp.percent.change.2016.2015.clean <- rename(cp.percent.change.2016.2015.clean,Question=Question.x,Year2016=YearStart.x,
Year2015=YearStart.y,DataSource=DataSource.x,Topic=Topic.x,
DataValue2016=DataValue.x,DataValue2015=DataValue.y)
kable(head(cp.percent.change.2016.2015.clean))
QuestionID | Question | Stratification1 | Year2016 | Year2015 | DataSource | Topic | DataValue2016 | DataValue2015 |
---|---|---|---|---|---|---|---|---|
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Female | 2016 | 2015 | BRFSS | Alcohol | 12.0 | 11.7 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Male | 2016 | 2015 | BRFSS | Alcohol | 21.9 | 21.4 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Overall | 2016 | 2015 | BRFSS | Alcohol | 16.9 | 16.3 |
ALC2_3 | Binge drinking prevalence among women aged 18-44 years | Overall | 2016 | 2015 | BRFSS | Alcohol | 18.7 | 17.7 |
ALC5_1 | Heavy drinking among adults aged >= 18 years | Female | 2016 | 2015 | BRFSS | Alcohol | 5.9 | 5.0 |
ALC5_1 | Heavy drinking among adults aged >= 18 years | Male | 2016 | 2015 | BRFSS | Alcohol | 7.1 | 6.4 |
Now calcualte | the percent change in 2016 as compared to 2015 for Nation | al data |
cp.percent.change <- cp.percent.change.2016.2015.clean %>%
mutate(PercentChange = (as.numeric(as.character(DataValue2016))-
as.numeric(as.character(DataValue2015)))/as.numeric(as.character(DataValue2015))*100)
kable(head(cp.percent.change))
QuestionID | Question | Stratification1 | Year2016 | Year2015 | DataSource | Topic | DataValue2016 | DataValue2015 | PercentChange |
---|---|---|---|---|---|---|---|---|---|
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Female | 2016 | 2015 | BRFSS | Alcohol | 12.0 | 11.7 | 2.564103 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Male | 2016 | 2015 | BRFSS | Alcohol | 21.9 | 21.4 | 2.336449 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | Overall | 2016 | 2015 | BRFSS | Alcohol | 16.9 | 16.3 | 3.680982 |
ALC2_3 | Binge drinking prevalence among women aged 18-44 years | Overall | 2016 | 2015 | BRFSS | Alcohol | 18.7 | 17.7 | 5.649718 |
ALC5_1 | Heavy drinking among adults aged >= 18 years | Female | 2016 | 2015 | BRFSS | Alcohol | 5.9 | 5.0 | 18.000000 |
ALC5_1 | Heavy drinking among adults aged >= 18 years | Male | 2016 | 2015 | BRFSS | Alcohol | 7.1 | 6.4 | 10.937500 |
Let’s visualize the percent increase by stratification
library(ggplot2)
ggplot(data = cp.percent.change, aes(x=Topic,y=PercentChange))+
geom_bar(stat = 'identity',aes(fill=Topic))+
facet_wrap(~Stratification1, ncol = 1)
Conclusion:
It appears that Female stratification has visbile increase percentage change as compared to Male for Alcohol
Let’s visualize the top 6 percent increase by stratification
cp.percent.change.arrange <- cp.percent.change %>%
arrange(desc(PercentChange))
ggplot(data = head(cp.percent.change.arrange), aes(x=Topic,y=PercentChange))+
geom_bar(stat = 'identity',aes(fill=Topic))+
facet_wrap(~Stratification1, ncol = 1)
Conclusion:
Top 6 percentage increase in CDI topics are Alchol, Arhtiritis and Chrocnic Kidney Disease
cdi.data.states <- filter(cdi.data,LocationAbbr!='US',YearStart=='2016')
cdi.data.states.clean <- select(cdi.data.states,YearStart,LocationAbbr,DataSource,Topic,Question,QuestionID,DataValueType,DataValue,Stratification1)
cdi.data.states.cp <- filter(cdi.data.states.clean,DataValueType=='Crude Prevalence',YearStart=='2016')
unique(cdi.data.states.cp$LocationAbbr)
## [1] AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA KS KY LA ME MD MA MI
## [24] MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT
## [47] VA WA WV WI WY GU PR VI
## 55 Levels: AK AL AR AZ CA CO CT DC DE FL GA GU HI IA ID IL IN KS KY ... WY
cp.us.states <- merge(cdi.data.us.cp.2016,cdi.data.states.cp,by.x =c('QuestionID','Stratification1'),
by.y = c('QuestionID','Stratification1'))
cp.us.states.clean <- cp.us.states %>%
select(QuestionID,Question.x,LocationAbbr.x,LocationAbbr.y,Stratification1,YearStart.x,YearStart.y,DataSource.x,Topic.x,DataValue.x,DataValue.y)
cp.us.states.clean <- rename(cp.us.states.clean,Question=Question.x,US2016=YearStart.x,
USLcoation=LocationAbbr.x,StatesLocation=LocationAbbr.y,
States2016=YearStart.y,DataSource=DataSource.x,Topic=Topic.x,
USDataValue2016=DataValue.x,StatesValue2016=DataValue.y)
cp.percent.change <- cp.us.states.clean %>%
mutate(PercentChange = (as.numeric(as.character(StatesValue2016))-
as.numeric(as.character(USDataValue2016)))/as.numeric(as.character(USDataValue2016))*100)
kable(head(cp.percent.change))
QuestionID | Question | USLcoation | StatesLocation | Stratification1 | US2016 | States2016 | DataSource | Topic | USDataValue2016 | StatesValue2016 | PercentChange |
---|---|---|---|---|---|---|---|---|---|---|---|
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | US | OH | Female | 2016 | 2016 | BRFSS | Alcohol | 12.0 | 12.8 | 6.666667 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | US | PR | Female | 2016 | 2016 | BRFSS | Alcohol | 12.0 | 7.7 | -35.833333 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | US | AL | Female | 2016 | 2016 | BRFSS | Alcohol | 12.0 | 8.8 | -26.666667 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | US | KS | Female | 2016 | 2016 | BRFSS | Alcohol | 12.0 | 10.4 | -13.333333 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | US | DC | Female | 2016 | 2016 | BRFSS | Alcohol | 12.0 | 21.3 | 77.500000 |
ALC2_2 | Binge drinking prevalence among adults aged >= 18 years | US | VI | Female | 2016 | 2016 | BRFSS | Alcohol | 12.0 | 8.6 | -28.333333 |
Let’s compare few states..
ggplot(data = filter(cp.percent.change,StatesLocation %in% c('NY','OH','CA','FL'),
Topic=='Alcohol'), aes(x=Question,y=PercentChange))+
geom_bar(stat = 'identity',aes(fill=Question))+
facet_wrap(~StatesLocation, ncol = 2)+
theme_bw()
Conclusion:
For CDI topic Alchol, Florida apperars to have highest percentage change for bing driniking prevalance among women aged 18-44 years whereas NY appears to have the lowest perecent change in the same category
Let’s compare the national vs states percent change by stratification
ggplot(data = cp.percent.change, aes(x=Topic,y=PercentChange))+
geom_bar(stat = 'identity',aes(fill=Topic))+
facet_wrap(~Stratification1, ncol = 1)+
theme_bw()
## Warning: Removed 215 rows containing missing values (position_stack).
Let’s analyze the Mental Health percenet change and find top 6 states with highest percent change
cp.percent.change.MH.rank <- cp.percent.change %>%
filter(Topic=='Mental Health') %>%
arrange(desc(PercentChange))
kable(head(cp.percent.change.MH.rank))
QuestionID | Question | USLcoation | StatesLocation | Stratification1 | US2016 | States2016 | DataSource | Topic | USDataValue2016 | StatesValue2016 | PercentChange |
---|---|---|---|---|---|---|---|---|---|---|---|
MTH2_0 | At least 14 recent mentally unhealthy days among women aged 18-44 years | US | NH | Overall | 2016 | 2016 | BRFSS | Mental Health | 15.7 | 24.6 | 56.68790 |
MTH2_0 | At least 14 recent mentally unhealthy days among women aged 18-44 years | US | AR | Overall | 2016 | 2016 | BRFSS | Mental Health | 15.7 | 22.5 | 43.31210 |
MTH2_0 | At least 14 recent mentally unhealthy days among women aged 18-44 years | US | WV | Overall | 2016 | 2016 | BRFSS | Mental Health | 15.7 | 21.7 | 38.21656 |
MTH2_0 | At least 14 recent mentally unhealthy days among women aged 18-44 years | US | KY | Overall | 2016 | 2016 | BRFSS | Mental Health | 15.7 | 21.5 | 36.94268 |
MTH2_0 | At least 14 recent mentally unhealthy days among women aged 18-44 years | US | MS | Overall | 2016 | 2016 | BRFSS | Mental Health | 15.7 | 20.0 | 27.38854 |
MTH2_0 | At least 14 recent mentally unhealthy days among women aged 18-44 years | US | OH | Overall | 2016 | 2016 | BRFSS | Mental Health | 15.7 | 19.8 | 26.11465 |
ggplot(data = head(cp.percent.change.MH.rank), aes(x=Question,y=PercentChange))+
geom_bar(stat = 'identity',aes(fill=Question))+
facet_wrap(~StatesLocation, ncol = 3)+
theme_bw()
Conclusion:
New Hampshire (NH) appears to have highest percentage change in Mental Hygiene CDI topic as compared to national data
Let’s analyze the yearly data provided by difference sources by topic and calculate their perncetage in the group
cdi.data.us.01 <- cdi.data.us %>%
group_by(YearStart,DataSource,Topic) %>%
summarize(count=n())
cdi.data.us.stat <- cdi.data.us.01 %>%
arrange(desc(count))
cdi.data.us.stat.countPercent <- mutate(cdi.data.us.stat,percentTotal = round(count/sum(count)*100,2))
cd.data.brfss <- filter(cdi.data.us.stat.countPercent,DataSource=='BRFSS')
kable(head(cd.data.brfss))
YearStart | DataSource | Topic | count | percentTotal |
---|---|---|---|---|
2011 | BRFSS | Diabetes | 73 | 19.47 |
2013 | BRFSS | Diabetes | 73 | 18.53 |
2015 | BRFSS | Diabetes | 73 | 18.48 |
2012 | BRFSS | Diabetes | 67 | 18.06 |
2011 | BRFSS | Cardiovascular Disease | 49 | 13.07 |
2013 | BRFSS | Cardiovascular Disease | 49 | 12.44 |
ggplot(data = cd.data.brfss, aes(x=Topic,y=percentTotal))+
geom_bar(stat = 'identity',aes(fill=Topic))+
facet_wrap(~YearStart, ncol = 1)+
theme_bw()
Conclusion:
Diabetes appears to be the highest and consistent as compared to other CDI topic for all given years
population.data <- read.csv('https://raw.githubusercontent.com/niteen11/MSDS/master/DATA607/week6/dataset/populationbycountry19802010millions.csv')
head(population.data)
## X X1980 X1981 X1982 X1983
## 1 North America 320.27638 324.44694 328.62014 332.72487
## 2 Bermuda 0.05473 0.05491 0.05517 0.05551
## 3 Canada 24.5933 24.9 25.2019 25.4563
## 4 Greenland 0.05021 0.05103 0.05166 0.05211
## 5 Mexico 68.34748 69.96926 71.6409 73.36288
## 6 Saint Pierre and Miquelon 0.00599 0.00601 0.00605 0.00607
## X1984 X1985 X1986 X1987 X1988 X1989 X1990
## 1 336.72143 340.74811 344.89548 349.07829 353.2939 357.68457 362.4468
## 2 0.05585 0.05618 0.05651 0.05683 0.05717 0.05749 0.05778
## 3 25.7018 25.9416 26.2038 26.5497 26.8948 27.3793 27.7906
## 4 0.05263 0.05315 0.05364 0.0541 0.05485 0.05541 0.05563
## 5 75.08014 76.76723 78.44243 80.12249 81.78182 83.36684 84.91365
## 6 0.00611 0.00616 0.00621 0.00625 0.00628 0.00631 0.00632
## X1991 X1992 X1993 X1994 X1995 X1996 X1997
## 1 367.70684 373.29069 378.74233 383.9166 388.97216 393.9428 398.97205
## 2 0.0581 0.0587 0.05924 0.05975 0.06029 0.06087 0.06145
## 3 28.1179 28.54489 28.95334 29.33081 29.69053 30.02632 30.3056
## 4 0.05554 0.05549 0.05564 0.05592 0.05619 0.05634 0.05651
## 5 86.48803 88.11103 89.74914 91.3379 92.88035 94.39858 95.89515
## 6 0.00633 0.00636 0.00638 0.0064 0.0064 0.00641 0.00642
## X1998 X1999 X2000 X2001 X2002 X2003 X2004
## 1 403.85585 408.60296 413.3245 417.83236 422.05268 426.06238 430.26938
## 2 0.06198 0.06251 0.06306 0.06361 0.06418 0.06476 0.06534
## 3 30.55166 30.82026 31.09956 31.37674 31.64096 31.88931 32.13476
## 4 0.05661 0.0567 0.05689 0.05713 0.05736 0.05754 0.0577
## 5 97.32506 98.61691 99.92662 101.24696 102.47993 103.71806 104.95959
## 6 0.00642 0.00643 0.00641 0.00637 0.00633 0.00629 0.00625
## X2005 X2006 X2007 X2008 X2009 X2010
## 1 434.47232 438.82964 443.3473 447.67394 451.83698 456.59331
## 2 0.06591 0.06644 0.06692 0.06739 0.06784 0.06827
## 3 32.38638 32.65668 32.93596 33.2127 33.48721 33.75974
## 4 0.05778 0.05764 0.05753 0.05756 0.0576 0.05764
## 5 106.2029 107.44953 108.70089 109.9554 111.21179 112.46886
## 6 0.0062 0.00615 0.0061 0.00605 0.006 0.00594
Let’s rename the country column
population.data <- rename(population.data,Country=X)
head(population.data)
## Country X1980 X1981 X1982 X1983
## 1 North America 320.27638 324.44694 328.62014 332.72487
## 2 Bermuda 0.05473 0.05491 0.05517 0.05551
## 3 Canada 24.5933 24.9 25.2019 25.4563
## 4 Greenland 0.05021 0.05103 0.05166 0.05211
## 5 Mexico 68.34748 69.96926 71.6409 73.36288
## 6 Saint Pierre and Miquelon 0.00599 0.00601 0.00605 0.00607
## X1984 X1985 X1986 X1987 X1988 X1989 X1990
## 1 336.72143 340.74811 344.89548 349.07829 353.2939 357.68457 362.4468
## 2 0.05585 0.05618 0.05651 0.05683 0.05717 0.05749 0.05778
## 3 25.7018 25.9416 26.2038 26.5497 26.8948 27.3793 27.7906
## 4 0.05263 0.05315 0.05364 0.0541 0.05485 0.05541 0.05563
## 5 75.08014 76.76723 78.44243 80.12249 81.78182 83.36684 84.91365
## 6 0.00611 0.00616 0.00621 0.00625 0.00628 0.00631 0.00632
## X1991 X1992 X1993 X1994 X1995 X1996 X1997
## 1 367.70684 373.29069 378.74233 383.9166 388.97216 393.9428 398.97205
## 2 0.0581 0.0587 0.05924 0.05975 0.06029 0.06087 0.06145
## 3 28.1179 28.54489 28.95334 29.33081 29.69053 30.02632 30.3056
## 4 0.05554 0.05549 0.05564 0.05592 0.05619 0.05634 0.05651
## 5 86.48803 88.11103 89.74914 91.3379 92.88035 94.39858 95.89515
## 6 0.00633 0.00636 0.00638 0.0064 0.0064 0.00641 0.00642
## X1998 X1999 X2000 X2001 X2002 X2003 X2004
## 1 403.85585 408.60296 413.3245 417.83236 422.05268 426.06238 430.26938
## 2 0.06198 0.06251 0.06306 0.06361 0.06418 0.06476 0.06534
## 3 30.55166 30.82026 31.09956 31.37674 31.64096 31.88931 32.13476
## 4 0.05661 0.0567 0.05689 0.05713 0.05736 0.05754 0.0577
## 5 97.32506 98.61691 99.92662 101.24696 102.47993 103.71806 104.95959
## 6 0.00642 0.00643 0.00641 0.00637 0.00633 0.00629 0.00625
## X2005 X2006 X2007 X2008 X2009 X2010
## 1 434.47232 438.82964 443.3473 447.67394 451.83698 456.59331
## 2 0.06591 0.06644 0.06692 0.06739 0.06784 0.06827
## 3 32.38638 32.65668 32.93596 33.2127 33.48721 33.75974
## 4 0.05778 0.05764 0.05753 0.05756 0.0576 0.05764
## 5 106.2029 107.44953 108.70089 109.9554 111.21179 112.46886
## 6 0.0062 0.00615 0.0061 0.00605 0.006 0.00594
population.data.US.tidy <- population.data %>%
filter(Country == "United States") %>%
gather(Year, Population, X1980:X2010) %>%
separate(Year, c("left","Year"), sep="X") %>%
select(-left) %>%
arrange(Year)
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(population.data.US.tidy)
## Country Year Population
## 1 United States 1980 227.22468
## 2 United States 1981 229.46571
## 3 United States 1982 231.66446
## 4 United States 1983 233.79199
## 5 United States 1984 235.8249
## 6 United States 1985 237.9238
# extract 1995 value as the baseline value and add to US population dataframe
us_base <- population.data.US.tidy[population.data.US.tidy$Year==1995,3]
population.data.US.tidy$us_baseline <- us_base
population.data.US.tidy <- population.data.US.tidy %>%
mutate(us_change = (as.numeric(as.character(Population))-as.numeric(as.character(us_baseline)))/as.numeric(as.character(us_baseline)))
head(population.data.US.tidy)
## Country Year Population us_baseline us_change
## 1 United States 1980 227.22468 266.27839 -0.1466650
## 2 United States 1981 229.46571 266.27839 -0.1382488
## 3 United States 1982 231.66446 266.27839 -0.1299915
## 4 United States 1983 233.79199 266.27839 -0.1220016
## 5 United States 1984 235.8249 266.27839 -0.1143671
## 6 United States 1985 237.9238 266.27839 -0.1064848
# create a world dataframe, clean and turn into long data format
world.data <- population.data %>%
filter(Country != "United States") %>%
gather(Year, Population, X1980:X2010) %>%
separate(Year, c("left","Year"), sep="X") %>%
select(-left) %>%
arrange(Year) %>%
filter(Population != "NA")
## Warning: attributes are not identical across measure variables;
## they will be dropped
# create baselines for each country
world_base <- world.data %>%
filter(Year == 1995) %>%
select(Country, world_Baseline = Population)
world.data <- world.data %>%
left_join(world_base) %>%
arrange(Country) %>%
mutate(population_change = (as.numeric(as.character(Population))-as.numeric(as.character(world_Baseline)))/
as.numeric(as.character(world_Baseline)))
## Joining, by = "Country"
## Warning in evalq((as.numeric(as.character(Population))
## - as.numeric(as.character(world_Baseline)))/
## as.numeric(as.character(world_Baseline)), : NAs introduced by coercion
## Warning in evalq((as.numeric(as.character(Population))
## - as.numeric(as.character(world_Baseline)))/
## as.numeric(as.character(world_Baseline)), : NAs introduced by coercion
## Warning in evalq((as.numeric(as.character(Population))
## - as.numeric(as.character(world_Baseline)))/
## as.numeric(as.character(world_Baseline)), : NAs introduced by coercion
head(world.data)
## Country Year Population world_Baseline population_change
## 1 Afghanistan 1980 15.0436 19.22594 -0.2175363
## 2 Afghanistan 1981 13.67368 19.22594 -0.2887900
## 3 Afghanistan 1982 12.57743 19.22594 -0.3458094
## 4 Afghanistan 1983 12.43058 19.22594 -0.3534475
## 5 Afghanistan 1984 12.75384 19.22594 -0.3366337
## 6 Afghanistan 1985 13.09371 19.22594 -0.3189561
# change year variables from character to numeric
world.data$Year <- as.numeric(world.data$Year)
population.data.US.tidy$Year <- as.numeric(population.data.US.tidy$Year)
rank <- world.data %>%
filter(Year == 2010) %>%
arrange(desc(population_change)) %>%
mutate(rank = seq(1,length(Country), by=1)) %>%
filter(rank < 6 | rank > 45 )
kable(head(rank,10))
Country | Year | Population | world_Baseline | population_change | rank |
---|---|---|---|---|---|
United Arab Emirates | 2010 | 4.97559 | 2.45798 | 1.0242598 | 1 |
Rwanda | 2010 | 11.05598 | 5.47298 | 1.0201024 | 2 |
Liberia | 2010 | 3.68508 | 1.90028 | 0.9392300 | 3 |
Western Sahara | 2010 | 0.49152 | 0.26387 | 0.8627354 | 4 |
Niger | 2010 | 15.87827 | 9.19933 | 0.7260246 | 5 |
Cameroon | 2010 | 19.29415 | 13.60316 | 0.4183579 | 46 |
Macau | 2010 | 0.56796 | 0.4013 | 0.4153003 | 47 |
Tanzania | 2010 | 41.8929 | 29.7529 | 0.4080275 | 48 |
Mozambique | 2010 | 22.06145 | 15.76475 | 0.3994164 | 49 |
Middle East | 2010 | 212.33692 | 152.34103 | 0.3938262 | 50 |
Let’s visualize ppulation change the world data
p <- ggplot(world.data, aes(Year, population_change, group=Country)) +
theme_bw() +
theme(plot.background = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = 3, colour = "grey50"),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank()) +
geom_line(colour="grey90", alpha=.9)
print(p)
## Warning: Removed 312 rows containing missing values (geom_path).
Now indicate the US data as a reference line
p <- p +
geom_line(data=population.data.US.tidy, aes(Year, us_change, group=1), linetype=5)
print(p)
## Warning: Removed 312 rows containing missing values (geom_path).
Now Add the UAE data to visulize the population change
p <- p +
geom_line(data=filter(world.data, Country=="United Arab Emirates"),
aes(Year, population_change, group=Country), colour="dodgerblue", size = 1)
print(p)
## Warning: Removed 312 rows containing missing values (geom_path).
Conclusion:
We looked into the world population data and ranked them according to percent population change. We took 1995 as a baseline year and UAE appears to be have highest population change w.r.t. US population.
us.daily.elec.data <- read.csv('https://raw.githubusercontent.com/niteen11/MSDS/master/DATA607/week6/dataset/us_daily_electric_system_operating_data.csv',skip=4,header = T)
dim(us.daily.elec.data)
## [1] 56 32
head(us.daily.elec.data)
## megawatthours X01.01.2018 X01.02.2018 X01.03.2018
## 1 California (region) NA NA
## 2 Demand 645599 704074 727216
## 3 Net generation 463629 502312 504456
## 4 Total net actual interchange -179525 -201654 -222762
## 5 Carolinas (region) NA NA
## 6 Demand 847133 938726 948954
## X01.04.2018 X01.05.2018 X01.06.2018 X01.07.2018 X01.08.2018 X01.09.2018
## 1 NA NA NA NA NA NA
## 2 714305 712021 668299 655802 731881 729380
## 3 505148 484644 447821 447952 500797 493884
## 4 -209163 -213905 -219968 -207861 -237089 -246264
## 5 NA NA NA NA NA NA
## 6 901939 939699 932742 946665 840094 646932
## X01.10.2018 X01.11.2018 X01.12.2018 X01.13.2018 X01.14.2018 X01.15.2018
## 1 NA NA NA NA NA NA
## 2 716698 675260 687504 665136 642080 703473
## 3 482843 435107 460105 452264 428364 474015
## 4 -234361 -240146 -227413 -231833 -213729 -229468
## 5 NA NA NA NA NA NA
## 6 590003 561119 513368 587356 761043 823663
## X01.16.2018 X01.17.2018 X01.18.2018 X01.19.2018 X01.20.2018 X01.21.2018
## 1 NA NA NA NA NA NA
## 2 713050 715733 718298 712862 669421 665340
## 3 487101 484557 488237 484276 458660 437506
## 4 -225955 -231182 -230069 -228598 -210774 -227842
## 5 NA NA NA NA NA NA
## 6 778424 779099 858007 755994 663462 575237
## X01.22.2018 X01.23.2018 X01.24.2018 X01.25.2018 X01.26.2018 X01.27.2018
## 1 NA NA NA NA NA NA
## 2 727071 722833 728743 732552 699533 666415
## 3 502041 510543 500446 501674 464103 432315
## 4 -225042 -212304 -228303 -230401 -233814 -231742
## 5 NA NA NA NA NA NA
## 6 588743 523900 603365 676274 685613 575060
## X01.28.2018 X01.29.2018 X01.30.2018 X01.31.2018
## 1 NA NA NA NA
## 2 647844 725483 720384 717446
## 3 433092 491871 480451 492836
## 4 -213582 -233589 -239945 -224620
## 5 NA NA NA NA
## 6 511872 567562 697175 752846
us.daily.elec.data.tidy <- us.daily.elec.data %>%
gather(date, usage, X01.01.2018:X01.31.2018) %>%
separate(date, c('left','date'), sep='X')
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(us.daily.elec.data.tidy,3)
## megawatthours left date usage
## 1 California (region) 01.01.2018
## 2 Demand 01.01.2018 645599
## 3 Net generation 01.01.2018 463629
Let’s check for numeric value in usage and replace non numeric values with NA
us.data.clean <- us.daily.elec.data.tidy %>%
select(megawatthours,date,usage) %>%
mutate(usage = ifelse(grepl("^[-]{0,1}[0-9]{0,}.{0,1}[0-9]{1,}$",usage),usage,NA))
head(us.data.clean)
## megawatthours date usage
## 1 California (region) 01.01.2018 <NA>
## 2 Demand 01.01.2018 645599
## 3 Net generation 01.01.2018 463629
## 4 Total net actual interchange 01.01.2018 -179525
## 5 Carolinas (region) 01.01.2018 <NA>
## 6 Demand 01.01.2018 847133
Using muatate function to create location attribute
us.data.clean.loc <- us.data.clean %>%
mutate(location = ifelse(is.na(usage),as.character(megawatthours),lag(as.character(megawatthours))))
head(us.data.clean.loc,10)
## megawatthours date usage location
## 1 California (region) 01.01.2018 <NA> California (region)
## 2 Demand 01.01.2018 645599 California (region)
## 3 Net generation 01.01.2018 463629 Demand
## 4 Total net actual interchange 01.01.2018 -179525 Net generation
## 5 Carolinas (region) 01.01.2018 <NA> Carolinas (region)
## 6 Demand 01.01.2018 847133 Carolinas (region)
## 7 Net generation 01.01.2018 802908 Demand
## 8 Total net actual interchange 01.01.2018 -43153 Net generation
## 9 Central (region) 01.01.2018 <NA> Central (region)
## 10 Demand 01.01.2018 890403 Central (region)
Now populating location value for each categorical data
location <- c('')
for(row in 1:nrow(us.data.clean.loc)) {
loc.value <- us.data.clean.loc[row,'megawatthours']
if(is.na(us.data.clean.loc[row,'usage'])) {
location <- loc.value }
rbind(us.data.clean.loc$location[row] <- as.character(location) )
}
head(us.data.clean.loc,10)
## megawatthours date usage location
## 1 California (region) 01.01.2018 <NA> California (region)
## 2 Demand 01.01.2018 645599 California (region)
## 3 Net generation 01.01.2018 463629 California (region)
## 4 Total net actual interchange 01.01.2018 -179525 California (region)
## 5 Carolinas (region) 01.01.2018 <NA> Carolinas (region)
## 6 Demand 01.01.2018 847133 Carolinas (region)
## 7 Net generation 01.01.2018 802908 Carolinas (region)
## 8 Total net actual interchange 01.01.2018 -43153 Carolinas (region)
## 9 Central (region) 01.01.2018 <NA> Central (region)
## 10 Demand 01.01.2018 890403 Central (region)
Once the location is populated for each data point, we can get rid of NA usage values
us.data.clean.loc.df <- filter(us.data.clean.loc,!is.na(usage))
kable(head(us.data.clean.loc.df,15))
megawatthours | date | usage | location |
---|---|---|---|
Demand | 01.01.2018 | 645599 | California (region) |
Net generation | 01.01.2018 | 463629 | California (region) |
Total net actual interchange | 01.01.2018 | -179525 | California (region) |
Demand | 01.01.2018 | 847133 | Carolinas (region) |
Net generation | 01.01.2018 | 802908 | Carolinas (region) |
Total net actual interchange | 01.01.2018 | -43153 | Carolinas (region) |
Demand | 01.01.2018 | 890403 | Central (region) |
Net generation | 01.01.2018 | 898131 | Central (region) |
Total net actual interchange | 01.01.2018 | 7727 | Central (region) |
Demand | 01.01.2018 | 1289538 | Electric Reliability Council of Texas |
Net generation | 01.01.2018 | 1296554 | Electric Reliability Council of Texas |
Total net actual interchange | 01.01.2018 | 6981 | Electric Reliability Council of Texas |
Demand | 01.01.2018 | 588600 | Florida (region) |
Net generation | 01.01.2018 | 577335 | Florida (region) |
Total net actual interchange | 01.01.2018 | 1032 | Florida (region) |
let’s fix the day in the date for Jan month 2018
us.data.clean.loc.df.spread <- us.data.clean.loc.df %>%
spread(megawatthours,usage) %>%
mutate(Day= as.numeric(substr(date,4,5))) %>%
select(2:6)
kable(head(us.data.clean.loc.df.spread))
location | Demand | Net generation | Total net actual interchange | Day |
---|---|---|---|---|
California (region) | 645599 | 463629 | -179525 | 1 |
Carolinas (region) | 847133 | 802908 | -43153 | 1 |
Central (region) | 890403 | 898131 | 7727 | 1 |
Electric Reliability Council of Texas | 1289538 | 1296554 | 6981 | 1 |
Florida (region) | 588600 | 577335 | 1032 | 1 |
Mid-Atlantic (region) | 2828052 | 2893869 | 64583 | 1 |
ggplot(us.data.clean.loc.df.spread, aes(x=Demand, y=Day, group=location,color=location)) +
geom_line()+
facet_wrap(~location,ncol = 2)
ggplot(data = us.data.clean.loc.df.spread, aes(x=Day,y=Demand))+
geom_bar(stat = 'identity',aes(fill=location))+
facet_wrap(~location, ncol = 2)+
theme_bw()
Conclusion:
By alanyzing above data and plots, it appears that there is consistency in demand of electicity in california, carolinas, central, southeast regions and is higher as compared to other regions.