I used the CMS website to download a csv containing wide data. The goal is to produce analyze which states perform best on the “Overall Rating of the Hospital” question and to determine which questions are most associated with the Overall Rating.
library(tidyr)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
##Read in wide data from github
u_data<-read.csv("https://raw.githubusercontent.com/scottogden10/607-Assignment2/master/VBP_EXP.csv")
##Now clean up and label data, take only where the outcome interest (Overall Rating of Hospital) is Available. Use only Achievement data (for current year's score).
reduc<-data.frame("HospNum"=u_data$Provider.Number,"State"=u_data$State,u_data[grep("Achie",colnames(u_data))])
clean<-subset(reduc,reduc$Overall..Rating.of.Hospital.Achievement.Points!='Not Available')
#Use the package to turn wide data into long, usable for analysis, with the only columns being for measures (dimensions collapse)
t<-gather(clean,"Metric","Score",3:10)
head(t)
## HospNum State Metric Score
## 1 10005 AL Communication.with.Nurses.Achievement.Points 6 out of 10
## 2 10012 AL Communication.with.Nurses.Achievement.Points 4 out of 10
## 3 10131 AL Communication.with.Nurses.Achievement.Points 0 out of 10
## 4 20001 AK Communication.with.Nurses.Achievement.Points 0 out of 10
## 5 20018 AK Communication.with.Nurses.Achievement.Points 0 out of 10
## 6 30014 AZ Communication.with.Nurses.Achievement.Points 1 out of 10
#Change scores to their numeric values
tidy<-data.frame(subset(t,select = HospNum:Metric),"Score"=as.numeric(str_sub(t$Score,1,2)))
By_State<-aggregate(subset(tidy,select=Score),by=list(tidy$State,tidy$Metric),FUN=mean)
#Now we have the results we need, a mean score grouped by question and by state.
#For example Lets Look at the overall rating of the hospital by State and Identify the top ten pefroming sttates
OR<-subset(By_State,Group.2=="Overall..Rating.of.Hospital.Achievement.Points")
head(OR[order(-OR$Score),],10)
## Group.1 Group.2 Score
## 291 SD Overall..Rating.of.Hospital.Achievement.Points 5.375000
## 279 NE Overall..Rating.of.Hospital.Achievement.Points 4.652174
## 298 WI Overall..Rating.of.Hospital.Achievement.Points 4.484375
## 266 IN Overall..Rating.of.Hospital.Achievement.Points 4.432099
## 267 KS Overall..Rating.of.Hospital.Achievement.Points 3.954545
## 269 LA Overall..Rating.of.Hospital.Achievement.Points 3.904110
## 256 CO Overall..Rating.of.Hospital.Achievement.Points 3.704545
## 294 UT Overall..Rating.of.Hospital.Achievement.Points 3.700000
## 262 HI Overall..Rating.of.Hospital.Achievement.Points 3.500000
## 276 MT Overall..Rating.of.Hospital.Achievement.Points 3.416667
#Notice they tend to be southern and midwestern states, now let's see the bottom performers.
head(OR[order(OR$Score),],10)
## Group.1 Group.2 Score
## 258 DC Overall..Rating.of.Hospital.Achievement.Points 0.0000000
## 278 ND Overall..Rating.of.Hospital.Achievement.Points 0.5000000
## 259 DE Overall..Rating.of.Hospital.Achievement.Points 0.6666667
## 284 NY Overall..Rating.of.Hospital.Achievement.Points 0.7659574
## 282 NM Overall..Rating.of.Hospital.Achievement.Points 0.8400000
## 283 NV Overall..Rating.of.Hospital.Achievement.Points 0.8500000
## 281 NJ Overall..Rating.of.Hospital.Achievement.Points 0.9344262
## 299 WV Overall..Rating.of.Hospital.Achievement.Points 1.0740741
## 257 CT Overall..Rating.of.Hospital.Achievement.Points 1.2222222
## 251 AK Overall..Rating.of.Hospital.Achievement.Points 1.2500000
#Notice they tend to be primarily north/eastern states
##Now let's analyze which factors are most correlated with the overall score.
library(corrplot)
Overall<-subset(tidy,tidy$Metric=="Overall..Rating.of.Hospital.Achievement.Points",select = Score)
Comm_Nurse<-subset(tidy,tidy$Metric=="Communication.with.Nurses.Achievement.Points",select = Score)
Comm_MD<-subset(tidy,tidy$Metric=="Communication.with.Doctors.Achievement.Points",select = Score)
Environment<-subset(tidy,tidy$Metric=="Cleanliness.and.Quietness.of.Hospital.Environment.Achievement.Points",select = Score)
Responsiveness<-subset(tidy,tidy$Metric=="Responsiveness.of.Hospital.Staff.Achievement.Points",select = Score)
Discharge<-subset(tidy,tidy$Metric=="Discharge.Information.Achievement.Points",select = Score)
Medication<-subset(tidy,tidy$Metric=="Communication.about.Medicines.Achievement.Points",select = Score)
Pain_Control<-subset(tidy,tidy$Metric=="Pain.Management.Achievement.Points",select = Score)
d<-data.frame(Overall,Comm_Nurse,Comm_MD,Environment,Responsiveness,Discharge,Medication,Pain_Control)
dd<-plyr::rename(d,c("Score"="Overall","Score.1"="Comm_Nurse","Score.2"="Comm_MD","Score.3"="Environment","Score.4"="Responsiveness","Score.5"="Discharge","Score.6"="Medication","Score.7"="Pain_Control"))
head(dd)
## Overall Comm_Nurse Comm_MD Environment Responsiveness Discharge
## 20679 5 6 7 6 2 7
## 20680 2 4 3 3 2 4
## 20681 1 0 3 1 0 2
## 20682 3 0 0 0 1 5
## 20683 0 0 0 0 0 0
## 20684 2 1 0 0 0 4
## Medication Pain_Control
## 20679 7 5
## 20680 6 5
## 20681 1 0
## 20682 0 0
## 20683 5 0
## 20684 3 0
M<-cor(dd)
#Let's plot all of the correlations!
corrplot(M,method="circle")
#Now specific to Overall Rating
M[,1]
## Overall Comm_Nurse Comm_MD Environment Responsiveness
## 1.0000000 0.7031152 0.5352141 0.6365816 0.5966346
## Discharge Medication Pain_Control
## 0.5085098 0.6187822 0.6626712
##We see that Nursing Communication and Pain Control are the two factors most correlated with Success on the Overall Rating!
The goal of this analysis is to analyze which types of pokemon are best for attack, defense and speed.
##Read in wide data from github
p_data<-read.csv("https://raw.githubusercontent.com/scottogden10/607-Assignment2/master/pokemon.csv")
##Now clean up and label data, take only where the outcome interest
##Ideally this would be normalized where you'd have tables describing which types are good against which others, but since our analysis calls attack, defense and speed we don't need
#We need to get rid of Outliers, the so called Mega pokemon
p<-p_data[-grep("Mega",p_data$Name),]
preduc<-subset(p,select=Name:Speed)
#Prelim analysis based on these fields
par(mfrow=c(2,2))
hist(preduc$Attack.Level)
hist(preduc$Defense.Level)
hist(preduc$Speed)
hist(preduc$Horse.Power)
#We see that these are relatively well-behaved with a slight right skew, we can analyze with a mean or median and get a good sense for which types are good for what.
#Use the package to turn wide data into long, usable for analysis, with the only columns being for measures (dimensions collapse)
tp<-gather(preduc,"Stat","Score",3:6)
head(tp)
## Name Type Stat Score
## 1 Abomasnow Grass Attack.Level 92
## 2 Abra Psychic Attack.Level 20
## 3 Absol Dark Attack.Level 130
## 4 Accelgor Bug Attack.Level 70
## 5 Aerodactyl Rock Attack.Level 105
## 6 Aipom Normal Attack.Level 70
#Change scores to their numeric values
tidyp<-data.frame(subset(tp,select = Type:Score),"Score"=as.numeric(str_sub(tp$Score,1,2)))
By_Type<-aggregate(subset(tidyp,select=Score),by=list(tidyp$Type,tidyp$Stat),FUN=median)
#Now we have the results we need, a median score grouped by question and by state.
Atk<-subset(By_Type,Group.2=="Attack.Level")
Def<-subset(By_Type,Group.2=="Defense.Level")
HP<-subset(By_Type,Group.2=="Horse.Power")
Sp<-subset(By_Type,Group.2=="Speed")
Atk
## Group.1 Group.2 Score
## 1 Bug Attack.Level 65.0
## 2 Dark Attack.Level 86.5
## 3 Electric Attack.Level 65.0
## 4 Fighting Attack.Level 100.0
## 5 Fire Attack.Level 82.5
## 6 Flying Attack.Level 85.0
## 7 Ghost Attack.Level 66.0
## 8 Grass Attack.Level 68.0
## 9 Ground Attack.Level 85.0
## 10 Ice Attack.Level 65.0
## 11 Normal Attack.Level 70.0
## 12 Poison Attack.Level 74.0
## 13 Psychic Attack.Level 55.0
## 14 Rock Attack.Level 89.0
## 15 Water Attack.Level 70.0
Def
## Group.1 Group.2 Score
## 16 Bug Defense.Level 59.0
## 17 Dark Defense.Level 67.5
## 18 Electric Defense.Level 60.0
## 19 Fighting Defense.Level 62.0
## 20 Fire Defense.Level 60.0
## 21 Flying Defense.Level 75.0
## 22 Ghost Defense.Level 70.0
## 23 Grass Defense.Level 65.0
## 24 Ground Defense.Level 84.5
## 25 Ice Defense.Level 70.0
## 26 Normal Defense.Level 60.0
## 27 Poison Defense.Level 67.0
## 28 Psychic Defense.Level 62.5
## 29 Rock Defense.Level 100.0
## 30 Water Defense.Level 68.0
HP
## Group.1 Group.2 Score
## 31 Bug Horse.Power 60.0
## 32 Dark Horse.Power 65.0
## 33 Electric Horse.Power 60.0
## 34 Fighting Horse.Power 70.0
## 35 Fire Horse.Power 67.5
## 36 Flying Horse.Power 79.0
## 37 Ghost Horse.Power 58.5
## 38 Grass Horse.Power 65.0
## 39 Ground Horse.Power 75.0
## 40 Ice Horse.Power 70.0
## 41 Normal Horse.Power 70.0
## 42 Poison Horse.Power 67.5
## 43 Psychic Horse.Power 66.0
## 44 Rock Horse.Power 67.0
## 45 Water Horse.Power 69.0
Sp
## Group.1 Group.2 Score
## 46 Bug Speed 50.0
## 47 Dark Speed 62.5
## 48 Electric Speed 87.5
## 49 Fighting Speed 40.0
## 50 Fire Speed 80.5
## 51 Flying Speed 103.5
## 52 Ghost Speed 65.0
## 53 Grass Speed 74.5
## 54 Ground Speed 47.5
## 55 Ice Speed 75.0
## 56 Normal Speed 50.0
## 57 Poison Speed 60.0
## 58 Psychic Speed 92.5
## 59 Rock Speed 55.0
## 60 Water Speed 70.0
#For more specificity lets add some box plots.
tidypa<-subset(tidyp,tidyp$Stat=="Attack.Level")
tidypd<-subset(tidyp,tidyp$Stat=="Defense.Level")
tidyph<-subset(tidyp,tidyp$Stat=="Horse.Power")
tidyps<-subset(tidyp,tidyp$Stat=="Speed")
#Attack
ggplot(tidypa, aes(x = tidypa$Type, y = tidypa$Score, fill = tidypa$Type))+geom_boxplot()
#Fighting is the strongest type
#Defense
ggplot(tidypd, aes(x = tidypd$Type, y = tidypd$Score, fill = tidypd$Type))+geom_boxplot()
#Rock is the strongest type
#HP
ggplot(tidyph, aes(x = tidyph$Type, y = tidyph$Score, fill = tidyph$Type))+geom_boxplot()
#Flying is the strongest
#Speed
ggplot(tidyps, aes(x = tidyps$Type, y = tidyps$Score, fill = tidyps$Type))+geom_boxplot()
#Flying is the strongest
#Use overall Stats
ggplot(tidyp, aes(x = tidyp$Type, y = tidyp$Score, fill = tidyp$Type))+geom_boxplot()
#Flying is the strongest
##Read in wide data from github and rename for east of use.
r_data<-read.csv("https://raw.githubusercontent.com/scottogden10/607-Assignment2/master/Relig.csv")
colnames(r_data)<-c("Religion","LT30","30.49","50.99","GT100","n")
#Use the package to turn wide data into long, usable for analysis, with the only columns being for measures (dimensions collapse)
tr<-gather(r_data,"Metric","Score",2:5)
## Warning: attributes are not identical across measure variables; they will
## be dropped
head(tr,15)
## Religion n Metric Score
## 1 Buddhist 233 LT30 36%
## 2 Catholic 6,137 LT30 36%
## 3 Evangelical Protestant 7,462 LT30 35%
## 4 Hindu 172 LT30 17%
## 5 Historically Black Protestant 1,704 LT30 53%
## 6 Jehovah's Witness 208 LT30 48%
## 7 Jewish 708 LT30 16%
## 8 Mainline Protestant 5,208 LT30 29%
## 9 Mormon 594 LT30 27%
## 10 Muslim 205 LT30 34%
## 11 Orthodox Christian 155 LT30 18%
## 12 Unaffiliated (religious "nones") 6,79 LT30 33%
## 13 Buddhist 233 30.49 18%
## 14 Catholic 6,137 30.49 19%
## 15 Evangelical Protestant 7,462 30.49 22%
#Add the n to the data by multiplying score by sample size,
tr$n_sub<-round(as.numeric(gsub("%","",tr$Score))*as.numeric(gsub(",","",tr$n))/100)
#Now spread the data by the sub groups n to get a cross tab
cross<-spread(subset(tr,select=c(Religion,Metric,n_sub)),Metric,n_sub)
#The most basic test we can do is to see if these n's follow what would be expected of them if the rows were independent: a Chi Square test.
chisq.test(cross[-1])
##
## Pearson's Chi-squared test
##
## data: cross[-1]
## X-squared = 1037.6, df = 33, p-value < 2.2e-16
chisq.test(cross[-1])$p.value
## [1] 1.173154e-196
#So the chance that the null hypothesis that there is no effect on the distribution by religion, were it true, would produce a result this extreme in very few cases.
#We can go a little stronger with the analysis here, we can ask which religion is "the wealthiest". We can't get means, because we don't have row level data, but we can get an ordering because they categories we do have are ordinal.
cross2<-(cross)
cross2$ntot<-cross2$`30.49`+cross2$`50.99`+cross2$GT100+cross2$LT30
#We will assign 1,2,3,4 to the different categories to produce a wealth ordering among religions and then compute a weighted average of the ordinal designations.
cross2$rank<-(2*cross2$`30.49`+3*cross2$`50.99`+4*cross2$GT100+1*cross2$LT30)/cross2$ntot
rank_2<-cross2[order(-cross2$rank),c(1,7)]
rank_2
## Religion rank
## 7 Jewish 2.971469
## 4 Hindu 2.894737
## 11 Orthodox Christian 2.761290
## 9 Mormon 2.461279
## 8 Mainline Protestant 2.450077
## 12 Unaffiliated (religious "nones") 2.351471
## 10 Muslim 2.346341
## 2 Catholic 2.280104
## 1 Buddhist 2.220779
## 3 Evangelical Protestant 2.212101
## 6 Jehovah's Witness 1.815534
## 5 Historically Black Protestant 1.799883
##We see that the jewish faith tend to be on the wealthier end while the historically black protestants are on the lower end, implying some sort of disparity. Let's plot to finish:
col<-c("red","green","blue","orange","grey","purple","yellow","black","white","darkblue","darkred","darkgreen")
barplot(rank_2$rank,ylim=c(0,4),width=22,col=col,xlab="Religion",ylab="Wealth Order")
legend("top",
legend=rank_2$Religion,
fill=col,cex=.6, ncol=3
)