Data Set 1: Hospital Experience Data

I used the CMS website to download a csv containing wide data. The goal is to produce analyze which states perform best on the “Overall Rating of the Hospital” question and to determine which questions are most associated with the Overall Rating.

library(tidyr)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)

##Read in wide data from github
u_data<-read.csv("https://raw.githubusercontent.com/scottogden10/607-Assignment2/master/VBP_EXP.csv")

##Now clean up and label data, take only where the outcome interest (Overall Rating of Hospital) is Available.  Use only Achievement data (for current year's score).

reduc<-data.frame("HospNum"=u_data$Provider.Number,"State"=u_data$State,u_data[grep("Achie",colnames(u_data))])


clean<-subset(reduc,reduc$Overall..Rating.of.Hospital.Achievement.Points!='Not Available')

#Use the package to turn wide data into long, usable for analysis, with the only columns being for measures (dimensions collapse)

t<-gather(clean,"Metric","Score",3:10)
head(t)
##   HospNum State                                       Metric       Score
## 1   10005    AL Communication.with.Nurses.Achievement.Points 6 out of 10
## 2   10012    AL Communication.with.Nurses.Achievement.Points 4 out of 10
## 3   10131    AL Communication.with.Nurses.Achievement.Points 0 out of 10
## 4   20001    AK Communication.with.Nurses.Achievement.Points 0 out of 10
## 5   20018    AK Communication.with.Nurses.Achievement.Points 0 out of 10
## 6   30014    AZ Communication.with.Nurses.Achievement.Points 1 out of 10
#Change scores to their numeric values
tidy<-data.frame(subset(t,select = HospNum:Metric),"Score"=as.numeric(str_sub(t$Score,1,2)))
By_State<-aggregate(subset(tidy,select=Score),by=list(tidy$State,tidy$Metric),FUN=mean)

#Now we have the results we need, a mean score grouped by question and by state.

#For example Lets Look at the overall rating of the hospital by State and Identify the top ten pefroming sttates

OR<-subset(By_State,Group.2=="Overall..Rating.of.Hospital.Achievement.Points")


head(OR[order(-OR$Score),],10)
##     Group.1                                        Group.2    Score
## 291      SD Overall..Rating.of.Hospital.Achievement.Points 5.375000
## 279      NE Overall..Rating.of.Hospital.Achievement.Points 4.652174
## 298      WI Overall..Rating.of.Hospital.Achievement.Points 4.484375
## 266      IN Overall..Rating.of.Hospital.Achievement.Points 4.432099
## 267      KS Overall..Rating.of.Hospital.Achievement.Points 3.954545
## 269      LA Overall..Rating.of.Hospital.Achievement.Points 3.904110
## 256      CO Overall..Rating.of.Hospital.Achievement.Points 3.704545
## 294      UT Overall..Rating.of.Hospital.Achievement.Points 3.700000
## 262      HI Overall..Rating.of.Hospital.Achievement.Points 3.500000
## 276      MT Overall..Rating.of.Hospital.Achievement.Points 3.416667
#Notice they tend to be southern and midwestern states, now let's see the bottom performers.

head(OR[order(OR$Score),],10)
##     Group.1                                        Group.2     Score
## 258      DC Overall..Rating.of.Hospital.Achievement.Points 0.0000000
## 278      ND Overall..Rating.of.Hospital.Achievement.Points 0.5000000
## 259      DE Overall..Rating.of.Hospital.Achievement.Points 0.6666667
## 284      NY Overall..Rating.of.Hospital.Achievement.Points 0.7659574
## 282      NM Overall..Rating.of.Hospital.Achievement.Points 0.8400000
## 283      NV Overall..Rating.of.Hospital.Achievement.Points 0.8500000
## 281      NJ Overall..Rating.of.Hospital.Achievement.Points 0.9344262
## 299      WV Overall..Rating.of.Hospital.Achievement.Points 1.0740741
## 257      CT Overall..Rating.of.Hospital.Achievement.Points 1.2222222
## 251      AK Overall..Rating.of.Hospital.Achievement.Points 1.2500000
#Notice they tend to be primarily north/eastern states

##Now let's analyze which factors are most correlated with the overall score.

library(corrplot)

Overall<-subset(tidy,tidy$Metric=="Overall..Rating.of.Hospital.Achievement.Points",select = Score)
Comm_Nurse<-subset(tidy,tidy$Metric=="Communication.with.Nurses.Achievement.Points",select = Score)
Comm_MD<-subset(tidy,tidy$Metric=="Communication.with.Doctors.Achievement.Points",select = Score)
Environment<-subset(tidy,tidy$Metric=="Cleanliness.and.Quietness.of.Hospital.Environment.Achievement.Points",select = Score)
Responsiveness<-subset(tidy,tidy$Metric=="Responsiveness.of.Hospital.Staff.Achievement.Points",select = Score)
Discharge<-subset(tidy,tidy$Metric=="Discharge.Information.Achievement.Points",select = Score)
Medication<-subset(tidy,tidy$Metric=="Communication.about.Medicines.Achievement.Points",select = Score)
Pain_Control<-subset(tidy,tidy$Metric=="Pain.Management.Achievement.Points",select = Score)

d<-data.frame(Overall,Comm_Nurse,Comm_MD,Environment,Responsiveness,Discharge,Medication,Pain_Control)

dd<-plyr::rename(d,c("Score"="Overall","Score.1"="Comm_Nurse","Score.2"="Comm_MD","Score.3"="Environment","Score.4"="Responsiveness","Score.5"="Discharge","Score.6"="Medication","Score.7"="Pain_Control"))
head(dd)
##       Overall Comm_Nurse Comm_MD Environment Responsiveness Discharge
## 20679       5          6       7           6              2         7
## 20680       2          4       3           3              2         4
## 20681       1          0       3           1              0         2
## 20682       3          0       0           0              1         5
## 20683       0          0       0           0              0         0
## 20684       2          1       0           0              0         4
##       Medication Pain_Control
## 20679          7            5
## 20680          6            5
## 20681          1            0
## 20682          0            0
## 20683          5            0
## 20684          3            0
M<-cor(dd)

#Let's plot all of the correlations!

corrplot(M,method="circle")

#Now specific to Overall Rating
M[,1]
##        Overall     Comm_Nurse        Comm_MD    Environment Responsiveness 
##      1.0000000      0.7031152      0.5352141      0.6365816      0.5966346 
##      Discharge     Medication   Pain_Control 
##      0.5085098      0.6187822      0.6626712
##We see that Nursing Communication and Pain Control are the two factors most correlated with Success on the Overall Rating!

Data Set 2: Pokemon

The goal of this analysis is to analyze which types of pokemon are best for attack, defense and speed.

##Read in wide data from github
p_data<-read.csv("https://raw.githubusercontent.com/scottogden10/607-Assignment2/master/pokemon.csv")

##Now clean up and label data, take only where the outcome interest 


##Ideally this would be normalized where you'd have tables describing which types are good against which others, but since our analysis calls attack, defense and speed we don't need 

#We need to get rid of Outliers, the so called Mega pokemon
p<-p_data[-grep("Mega",p_data$Name),]
preduc<-subset(p,select=Name:Speed)

#Prelim analysis based on these fields
par(mfrow=c(2,2))
hist(preduc$Attack.Level)
hist(preduc$Defense.Level)
hist(preduc$Speed)
hist(preduc$Horse.Power)

#We see that these are relatively well-behaved with a slight right skew, we can analyze with a mean or median and get a good sense for which types are good for what.


#Use the package to turn wide data into long, usable for analysis, with the only columns being for measures (dimensions collapse)

tp<-gather(preduc,"Stat","Score",3:6)
head(tp)
##         Name    Type         Stat Score
## 1  Abomasnow   Grass Attack.Level    92
## 2       Abra Psychic Attack.Level    20
## 3      Absol    Dark Attack.Level   130
## 4   Accelgor     Bug Attack.Level    70
## 5 Aerodactyl    Rock Attack.Level   105
## 6      Aipom  Normal Attack.Level    70
#Change scores to their numeric values
tidyp<-data.frame(subset(tp,select = Type:Score),"Score"=as.numeric(str_sub(tp$Score,1,2)))
By_Type<-aggregate(subset(tidyp,select=Score),by=list(tidyp$Type,tidyp$Stat),FUN=median)

#Now we have the results we need, a median score grouped by question and by state.

Atk<-subset(By_Type,Group.2=="Attack.Level")
Def<-subset(By_Type,Group.2=="Defense.Level")
HP<-subset(By_Type,Group.2=="Horse.Power")
Sp<-subset(By_Type,Group.2=="Speed")

Atk
##     Group.1      Group.2 Score
## 1       Bug Attack.Level  65.0
## 2      Dark Attack.Level  86.5
## 3  Electric Attack.Level  65.0
## 4  Fighting Attack.Level 100.0
## 5      Fire Attack.Level  82.5
## 6    Flying Attack.Level  85.0
## 7     Ghost Attack.Level  66.0
## 8     Grass Attack.Level  68.0
## 9    Ground Attack.Level  85.0
## 10      Ice Attack.Level  65.0
## 11   Normal Attack.Level  70.0
## 12   Poison Attack.Level  74.0
## 13  Psychic Attack.Level  55.0
## 14     Rock Attack.Level  89.0
## 15    Water Attack.Level  70.0
Def
##     Group.1       Group.2 Score
## 16      Bug Defense.Level  59.0
## 17     Dark Defense.Level  67.5
## 18 Electric Defense.Level  60.0
## 19 Fighting Defense.Level  62.0
## 20     Fire Defense.Level  60.0
## 21   Flying Defense.Level  75.0
## 22    Ghost Defense.Level  70.0
## 23    Grass Defense.Level  65.0
## 24   Ground Defense.Level  84.5
## 25      Ice Defense.Level  70.0
## 26   Normal Defense.Level  60.0
## 27   Poison Defense.Level  67.0
## 28  Psychic Defense.Level  62.5
## 29     Rock Defense.Level 100.0
## 30    Water Defense.Level  68.0
HP
##     Group.1     Group.2 Score
## 31      Bug Horse.Power  60.0
## 32     Dark Horse.Power  65.0
## 33 Electric Horse.Power  60.0
## 34 Fighting Horse.Power  70.0
## 35     Fire Horse.Power  67.5
## 36   Flying Horse.Power  79.0
## 37    Ghost Horse.Power  58.5
## 38    Grass Horse.Power  65.0
## 39   Ground Horse.Power  75.0
## 40      Ice Horse.Power  70.0
## 41   Normal Horse.Power  70.0
## 42   Poison Horse.Power  67.5
## 43  Psychic Horse.Power  66.0
## 44     Rock Horse.Power  67.0
## 45    Water Horse.Power  69.0
Sp
##     Group.1 Group.2 Score
## 46      Bug   Speed  50.0
## 47     Dark   Speed  62.5
## 48 Electric   Speed  87.5
## 49 Fighting   Speed  40.0
## 50     Fire   Speed  80.5
## 51   Flying   Speed 103.5
## 52    Ghost   Speed  65.0
## 53    Grass   Speed  74.5
## 54   Ground   Speed  47.5
## 55      Ice   Speed  75.0
## 56   Normal   Speed  50.0
## 57   Poison   Speed  60.0
## 58  Psychic   Speed  92.5
## 59     Rock   Speed  55.0
## 60    Water   Speed  70.0
#For more specificity lets add some box plots.
tidypa<-subset(tidyp,tidyp$Stat=="Attack.Level")
tidypd<-subset(tidyp,tidyp$Stat=="Defense.Level")
tidyph<-subset(tidyp,tidyp$Stat=="Horse.Power")
tidyps<-subset(tidyp,tidyp$Stat=="Speed")

#Attack
ggplot(tidypa, aes(x = tidypa$Type, y = tidypa$Score, fill = tidypa$Type))+geom_boxplot()

#Fighting is the strongest type

#Defense
ggplot(tidypd, aes(x = tidypd$Type, y = tidypd$Score, fill = tidypd$Type))+geom_boxplot()

#Rock is the strongest type

#HP
ggplot(tidyph, aes(x = tidyph$Type, y = tidyph$Score, fill = tidyph$Type))+geom_boxplot()

#Flying is the strongest

#Speed
ggplot(tidyps, aes(x = tidyps$Type, y = tidyps$Score, fill = tidyps$Type))+geom_boxplot()

#Flying is the strongest

#Use overall Stats
ggplot(tidyp, aes(x = tidyp$Type, y = tidyp$Score, fill = tidyp$Type))+geom_boxplot()

#Flying is the strongest

Data Source 3: Religion and Income

##Read in wide data from github and rename for east of use.
r_data<-read.csv("https://raw.githubusercontent.com/scottogden10/607-Assignment2/master/Relig.csv")
colnames(r_data)<-c("Religion","LT30","30.49","50.99","GT100","n")

#Use the package to turn wide data into long, usable for analysis, with the only columns being for measures (dimensions collapse)

tr<-gather(r_data,"Metric","Score",2:5)
## Warning: attributes are not identical across measure variables; they will
## be dropped
head(tr,15)
##                            Religion     n Metric Score
## 1                          Buddhist   233   LT30   36%
## 2                          Catholic 6,137   LT30   36%
## 3            Evangelical Protestant 7,462   LT30   35%
## 4                             Hindu   172   LT30   17%
## 5     Historically Black Protestant 1,704   LT30   53%
## 6                 Jehovah's Witness   208   LT30   48%
## 7                            Jewish   708   LT30   16%
## 8               Mainline Protestant 5,208   LT30   29%
## 9                            Mormon   594   LT30   27%
## 10                           Muslim   205   LT30   34%
## 11               Orthodox Christian   155   LT30   18%
## 12 Unaffiliated (religious "nones")  6,79   LT30   33%
## 13                         Buddhist   233  30.49   18%
## 14                         Catholic 6,137  30.49   19%
## 15           Evangelical Protestant 7,462  30.49   22%
#Add the n to the data by multiplying score by sample size,

tr$n_sub<-round(as.numeric(gsub("%","",tr$Score))*as.numeric(gsub(",","",tr$n))/100)

#Now spread the data by the sub groups n to get a cross tab

cross<-spread(subset(tr,select=c(Religion,Metric,n_sub)),Metric,n_sub)

#The most basic test we can do is to see if these n's follow what would be expected of them if the rows were independent: a Chi Square test.


chisq.test(cross[-1])
## 
##  Pearson's Chi-squared test
## 
## data:  cross[-1]
## X-squared = 1037.6, df = 33, p-value < 2.2e-16
chisq.test(cross[-1])$p.value
## [1] 1.173154e-196
#So the chance that the null hypothesis that there is no effect on the distribution by religion, were it true, would produce a result this extreme in very few cases.

#We can go a little stronger with the analysis here, we can ask which religion is "the wealthiest".  We can't get means, because we don't have row level data, but we can get an ordering because they categories we do have are ordinal.

cross2<-(cross)

cross2$ntot<-cross2$`30.49`+cross2$`50.99`+cross2$GT100+cross2$LT30

#We will assign 1,2,3,4 to the different categories to produce a wealth ordering among religions and then compute a weighted average of the ordinal designations.

cross2$rank<-(2*cross2$`30.49`+3*cross2$`50.99`+4*cross2$GT100+1*cross2$LT30)/cross2$ntot

rank_2<-cross2[order(-cross2$rank),c(1,7)]
rank_2
##                            Religion     rank
## 7                            Jewish 2.971469
## 4                             Hindu 2.894737
## 11               Orthodox Christian 2.761290
## 9                            Mormon 2.461279
## 8               Mainline Protestant 2.450077
## 12 Unaffiliated (religious "nones") 2.351471
## 10                           Muslim 2.346341
## 2                          Catholic 2.280104
## 1                          Buddhist 2.220779
## 3            Evangelical Protestant 2.212101
## 6                 Jehovah's Witness 1.815534
## 5     Historically Black Protestant 1.799883
##We see that the jewish faith tend to be on the wealthier end while the historically black protestants are on the lower end, implying some sort of disparity.  Let's plot to finish:

col<-c("red","green","blue","orange","grey","purple","yellow","black","white","darkblue","darkred","darkgreen")

barplot(rank_2$rank,ylim=c(0,4),width=22,col=col,xlab="Religion",ylab="Wealth Order")

legend("top",
  legend=rank_2$Religion,
  fill=col,cex=.6, ncol=3
)