This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

library(readr)
df = read_csv("~/Desktop/nycHospitalReviews.csv")
Parsed with column specification:
cols(
  hospital = col_character(),
  address = col_character(),
  numRev = col_character(),
  reviews = col_character(),
  Latitude = col_double(),
  Longitude = col_double(),
  `Accuracy Score` = col_double(),
  `Accuracy Type` = col_character(),
  Number = col_character(),
  Street = col_character(),
  City = col_character(),
  State = col_character(),
  County = col_character(),
  Zip = col_character(),
  Country = col_character(),
  Source = col_character()
)
library(dplyr)
# Count the reviews each hospital received.
# is.null() inspects the whole column object, not its elements, so the old
# flag was a constant FALSE that only became a usable 1 after a "+ 1" patch.
# Flag each row by whether its review text is present instead, so the sum per
# hospital is the number of actual reviews.
df <- mutate(df, count = as.integer(!is.na(reviews)))
totComp <- aggregate(count ~ hospital, data = df, FUN = sum)
# Hospitals ordered from most to fewest reviews.
tot_sort <- totComp[order(-totComp$count), ]
library(stringr)
library(syuzhet)
#df$reviewText = as.character(df$reviewText)
#find the sentiment for each hospital
df$sentiment = get_sentiment(df$reviews)
totSent = aggregate(sentiment ~ hospital, data = df, FUN = mean) 
totSent = totSent[order(-totSent$sent ),]
#find the word count over sentiment for each of 6 words
df$word_count = str_count(df$reviews, "rude")
totRude = aggregate(word_count ~ hospital, data = df, FUN = sum) 
totRude1 = totRude
totRude1$word = "Rude"
totRude1 = merge(totRude1,totSent, by = 'hospital', sort = TRUE)
totRude1 = totRude1[order(totRude1$sent ),]
df$word_count = str_count(df$reviews, "unprofessional")
totUnprofes = aggregate(word_count ~ hospital, data = df, FUN = sum) 
totUnprofes1 = totUnprofes
totUnprofes1$word = "Unprofessional"
totUnprofes1 = merge(totUnprofes1,totSent, by = 'hospital', sort = TRUE)
totUnprofes1 = totUnprofes1[order(totUnprofes1$sent ),]
df$word_count = str_count(df$reviews, "pain")
totPain = aggregate(word_count ~ hospital, data = df, FUN = sum)
totPain1 = totPain
totPain1$word = "Pain"
totPain1 = merge(totPain1,totSent, by = 'hospital', sort = TRUE)
totPain1 = totPain1[order(totPain1$sent ),]
df$word_count = str_count(df$reviews, "friendly")
totFriend = aggregate(word_count ~ hospital, data = df, FUN = sum) 
totFriend1 = totFriend
totFriend1$word = "Friendly"
totFriend1 = merge(totFriend1,totSent, by = 'hospital', sort = TRUE)
totFriend1 = totFriend1[order(totFriend1$sent ),]
df$word_count = str_count(df$reviews, "kind")
totKind = aggregate(word_count ~ hospital, data = df, FUN = sum) 
totKind1 = totKind
totKind1$word = "Kind"
totKind1 = merge(totKind1,totSent, by = 'hospital', sort = TRUE)
totKind1 = totKind1[order(totKind1$sent ),]
df$word_count = str_count(df$reviews, "baby")
totBaby = aggregate(word_count ~ hospital, data = df, FUN = sum )
totBaby1 = totBaby
totBaby1$word = "Baby"
totBaby1 = merge(totBaby1,totSent, by = 'hospital', sort = TRUE)
totBaby1 = totBaby1[order(totBaby1$sent ),]
wordword = rbind(totRude1, totPain1,totUnprofes1, totFriend1, totBaby1, totKind1)
#plot the data 
library(ggplot2)
ggplot(wordword, aes(x=sentiment, y=word_count, color = word)) + 
    geom_point() +
    facet_wrap(~ word)

#Northwell Health-GoHealth Urgent Care
word_northwell = wordword[(wordword$hospital == 'Northwell Health-GoHealth Urgent Care'),]
word_northwell$hospital = NULL
word_northwell$sentiment = NULL
names(word_northwell)[2]<-"Words in NorthwellHealth-GoHealth Urgent Care"
word_northwell =  word_northwell[order(-word_northwell$word_count),]
library(huxtable)
word_northwell = as_hux(word_northwell)
# Select columns by name:
word_northwell= word_northwell[, c("Words in NorthwellHealth-GoHealth Urgent Care", "word_count")] 
word_northwell<- huxtable::add_colnames(word_northwell)
theme_basic(word_northwell)
  Words in NorthwellHealth-GoHealth Urgent Care │ word_count     
────────────────────────────────────────────────┼────────────────
                                       Friendly │ 19             
                                           Kind │ 17             
                                           Pain │ 15             
                                           Rude │ 6              
                                 Unprofessional │ 2              
                                           Baby │ 0              

Column names: Words in NorthwellHealth-GoHealth Urgent Care, word_count
#urgent care!
# Flag hospitals whose name mentions urgent care, in either capitalisation.
df$urgent <- str_detect(df$hospital, "urgent")
df$Urgent <- str_detect(df$hospital, "Urgent")
is_urgent <- df$Urgent | df$urgent
# Split the reviews into two disjoint groups. The previous non-urgent filter
# (`Urgent == FALSE | urgent == FALSE`) kept any row that failed *either*
# spelling, so "Urgent Care" hospitals ended up in both groups.
urgent <- df[which(is_urgent), ]
nonUrgent <- df[which(!is_urgent), ]
urgent$pain_sent = get_sentiment(urgent$reviews)
nonUrgent$pain_sent = get_sentiment(nonUrgent$reviews)
urgent$word_count = str_count(urgent$reviews, "pain")
totPain = aggregate(word_count ~ hospital, data = urgent, FUN = sum)
totPain$word = "Urgent- Pain"
totPain = merge(totPain,totSent, by = 'hospital', sort = TRUE)
totPain = totPain[order(totPain$sent ),]
nonUrgent$word_count = str_count(nonUrgent$reviews, "pain")
totPain1 = aggregate(word_count ~ hospital, data = nonUrgent, FUN = sum)
totPain1$word = "Non Urgent - Pain"
totPain1 = merge(totPain1,totSent, by = 'hospital', sort = TRUE)
totPain1 = totPain1[order(totPain1$sent ),]
pain_urgent =rbind(totPain, totPain1)
ggplot(pain_urgent, aes(x=sentiment, y=word_count, color = word)) + 
    geom_point() +
    facet_wrap(~ word)

#prepare data for extraction to be used in Tableau
df2 = df[,c("hospital", "Longitude", "Latitude", "numRev")]
to_export = merge(totSent, df2,by = "hospital")
to_export = to_export[!duplicated(to_export$hospital), ]
to_export$sent = to_export$sentiment
to_export = mutate(to_export, isNeg = (sentiment < 0))
to_export$sentiment = abs(to_export$sentiment)
setwd("~/Desktop/")
The working directory was changed to /Users/ariellaschneider/Desktop inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the the working directory for notebook chunks.
write.csv(to_export, "long_lat_rev1.csv") 
library(tm)
library(wordcloud)
#make the corpus
corp <- VCorpus(VectorSource(df$reviews))
#clean the corpus 
corp = tm_map(corp, removePunctuation) 
corp = tm_map(corp, content_transformer(tolower) ,lazy=TRUE) 
corp = tm_map(corp, content_transformer(removeWords), stopwords("english") ,lazy=TRUE)
corp = tm_map(corp, stripWhitespace)
corp = tm_map(corp, removeNumbers)
corp = tm_map(corp, removeWords, c("hello","is","it","me","you're","looking","for?", "said", "the", "will", "read", "one", "also", "doctor", "doctors", "hospital", "get", "just", "like", "told"))
#make a document term matrix 
dtm = DocumentTermMatrix(corp)
#remove sparse terms (those absent from more than 99% of the reviews)
dtms = removeSparseTerms(dtm, .99)
dim(dtms)
[1] 1940 1034
m = as.matrix(dtms)
corr = cor(df$sentiment, m)
sentRev = data.frame(colnames(m))
sentRev$corr = corr[1,1:nrow(sentRev)]
sentRev$corr = sentRev$corr *100
sentRev =  sentRev[order(-sentRev$corr),]
# Keep only those in the top 100
pos100 = order(corr, decreasing=T)[1:100]
pos100words = colnames(corr)[pos100]
posCorr = corr[pos100]
#make a word cloud with the size based on the correlation
wordcloud(pos100words, freq = posCorr, scale = c(2.3, 0.005), max.words = 110)

# Keep only those in the bottom 100
neg100 <- order(corr, decreasing = FALSE)[1:100]
neg100words <- colnames(corr)[neg100]
negCorr <- corr[neg100]
#make a word cloud with the size based on the (absolute) correlation
# The words vector is neg100words; the previous call referenced an undefined
# `neg30words` and failed before drawing anything.
wordcloud(neg100words, scale = c(2.5, 0.005), freq = abs(negCorr))

library(readr)
infe = read_csv("~/Desktop/infection.csv")
Parsed with column specification:
cols(
  `Facility ID` = col_double(),
  `Hospital Name` = col_character(),
  `Indicator Name` = col_character(),
  Year = col_double(),
  `Infections Observed` = col_double(),
  `Infections Predicted` = col_double(),
  Denominator = col_double(),
  `Indicator Value` = col_double(),
  `Indicator Lower Confidence Limit` = col_double(),
  `Indicator Upper Confidence Limit` = col_double(),
  `Indicator Units` = col_character(),
  `Comparison Results` = col_character(),
  Latitude = col_double(),
  Longitude = col_double()
)
#get the relevant data 
infections = infe[which(infe$`Hospital Name` %in% totSent$hospital),]
#find the total infections
totInf = aggregate(`Infections Observed` ~ `Hospital Name`, data = infections, FUN = sum) 
totInf = totInf[order(-totInf$`Infections Observed` ),]
#find all of the hospital codes
totCodes = aggregate(`Facility ID` ~ `Hospital Name`, data = infections, FUN = mean) 
#merge the data sets
tots = merge(totSent,totInf, by.x = 'hospital', by.y = 'Hospital Name', sort = TRUE)
tots = merge(tots,totCodes, by.x = 'hospital', by.y = 'Hospital Name', sort = TRUE)
#get rid of the outlier
tots = tots[!(tots$sentiment >7.1),]
#make a linear regression model
fit = lm(tots$`Infections Observed` ~ tots$sentiment)
summary(fit)

Call:
lm(formula = tots$`Infections Observed` ~ tots$sentiment)

Residuals:
    Min      1Q  Median      3Q     Max 
-1752.1 -1035.5  -552.1   747.0  4845.3 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      2054.0      309.6   6.634 1.31e-07 ***
tots$sentiment   -109.1      211.0  -0.517    0.609    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1596 on 34 degrees of freedom
Multiple R-squared:  0.007795,  Adjusted R-squared:  -0.02139 
F-statistic: 0.2671 on 1 and 34 DF,  p-value: 0.6086
library(ggplot2)
ggplot(tots, aes(x=sentiment , y=`Infections Observed`)) + geom_point(color="darkred", size=4, alpha=.5) + 
    geom_smooth(method="lm", se=TRUE) + xlab("Sentiment") +
    ylab("Infections Observed")

library(readr)
death = read_csv("~/Desktop/deaths.csv")
Parsed with column specification:
cols(
  `Hospital Name` = col_character(),
  `Facility ID` = col_double(),
  `APR Medical Surgical Description` = col_character(),
  `Type of Insurance` = col_character(),
  `Patient Gender` = col_character(),
  live_status = col_character(),
  `Patient Age Group` = col_character(),
  `Discharge Year` = col_double(),
  `Number of Discharges` = col_double(),
  `Average Length of Stay` = col_double(),
  `Yearly Rank by Volume` = col_double()
)
#get relevant deaths based on facility id 
deaths = death[which(death$`Facility ID` %in% tots$`Facility ID`),]
#count the dead vs. alive patients
deaths = mutate(deaths, dead = (live_status == "Dea"))
deaths$dead = as.integer(deaths$dead)
deaths$dead = deaths$dead * deaths$`Number of Discharges`
totDead= aggregate(dead ~ `Hospital Name`, data = deaths, FUN = sum) 
totPatients= aggregate(`Number of Discharges` ~ `Hospital Name`, data = deaths, FUN = sum) 
totID= aggregate(`Facility ID` ~ `Hospital Name`, data = deaths, FUN = mean) 
totsID = merge(totID,totDead, by.x = 'Hospital Name', by.y = 'Hospital Name', sort = TRUE)
totsID = merge(totsID,totPatients, by.x = 'Hospital Name', by.y = 'Hospital Name', sort = TRUE)
totsID$`Hospital Name` <- NULL 
tots = merge(tots,totsID, by = 'Facility ID', sort = TRUE)
#find the percentage of deaths
tots$percent_death = as.double(tots$dead)/as.double(tots$`Number of Discharges`)
tots = tots[!(tots$sentiment >7.1),]
#linear regression model
ggplot(tots, aes(x=sentiment , y=percent_death)) + geom_point(color="darkred", size=4, alpha=.5) + 
    geom_smooth(method="lm", se=FALSE) + xlab("Sentiment") +
    ylab("Percent of Discharges that Result in Death")

fit = lm(tots$percent_death ~ tots$sentiment)
summary(fit)

Call:
lm(formula = tots$percent_death ~ tots$sentiment)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.11530 -0.03042 -0.01316 -0.00254  0.74786 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)
(Intercept)     0.02205    0.02626   0.840    0.407
tots$sentiment  0.02671    0.01789   1.493    0.145

Residual standard error: 0.1353 on 34 degrees of freedom
Multiple R-squared:  0.06151,   Adjusted R-squared:  0.03391 
F-statistic: 2.228 on 1 and 34 DF,  p-value: 0.1447
#remove another outlier!!
tots = tots[!(tots$percent_death >0.8),]
tots = tots[!(tots$sentiment >7.1),]
#linear regression
ggplot(tots, aes(x=sentiment , y=percent_death)) + geom_point(color="darkred", size=4, alpha=.5) + 
    geom_smooth(method="lm", se=TRUE) + xlab("Sentiment") +
    ylab("Percent of Discharges that Result in Death")

fit = lm(tots$percent_death ~ tots$sentiment)
summary(fit)

Call:
lm(formula = tots$percent_death ~ tots$sentiment)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.012048 -0.005454 -0.001620  0.003623  0.020663 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     0.020980   0.001513  13.863 2.57e-15 ***
tots$sentiment -0.002552   0.001071  -2.382   0.0231 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.0078 on 33 degrees of freedom
Multiple R-squared:  0.1467,    Adjusted R-squared:  0.1209 
F-statistic: 5.675 on 1 and 33 DF,  p-value: 0.02312
#find length of stay 
totLength= aggregate(`Average Length of Stay` ~ `Facility ID`, data = deaths, FUN = mean) 
tots = merge(tots,totLength, by = 'Facility ID', sort = TRUE)
tots = tots[!(tots$sentiment >6.0),]
#linear regression
ggplot(tots, aes(x=sentiment , y=`Average Length of Stay`)) + geom_point(color="darkred", size=4, alpha=.5) + 
    geom_smooth(method="lm", se=TRUE) + xlab("Sentiment") +
    ylab("Average Length of Stay")

fit = lm(tots$`Average Length of Stay` ~ tots$sentiment)
summary(fit)

Call:
lm(formula = tots$`Average Length of Stay` ~ tots$sentiment)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.4412 -1.3363 -0.1588  1.0601  4.6385 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)     10.2303     0.4233  24.169   <2e-16 ***
tots$sentiment  -0.4180     0.2996  -1.395    0.172    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.182 on 33 degrees of freedom
Multiple R-squared:  0.05568,   Adjusted R-squared:  0.02706 
F-statistic: 1.946 on 1 and 33 DF,  p-value: 0.1724
#read in readmission rates
library(readr)
readmis = read_csv("~/Desktop/readmission.csv")
Parsed with column specification:
cols(
  `Discharge Year` = col_double(),
  `Software Version` = col_double(),
  `Facility ID` = col_double(),
  `Facility Name` = col_character(),
  `Hospital County` = col_character(),
  `At Risk Admissions` = col_double(),
  `Observed PPR Chains` = col_double(),
  `Observed PPR Rate` = col_double(),
  `Expected PPR Rate` = col_double(),
  `Risk Adjusted PPR Rate` = col_double()
)
#count up the readmission rates 
readmish = readmis[which(readmis$`Facility ID` %in% tots$`Facility ID`),]
totReadmis= aggregate(`Observed PPR Rate` ~ `Facility ID`, data = readmish, FUN = mean) 
tots = merge(tots,totReadmis, by = 'Facility ID', sort = TRUE)
tots = tots[!(tots$sentiment >6.0),]
#linear regression model
ggplot(tots, aes(x=sentiment , y=`Observed PPR Rate`)) + geom_point(color="darkred", size=4, alpha=.5) + 
    geom_smooth(method="lm", se=TRUE) + xlab("Sentiment") +
    ylab("Observed PPR Rate")

fit = lm(tots[,9] ~ tots$sentiment)
summary(fit)

Call:
lm(formula = tots[, 9] ~ tots$sentiment)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.0586 -1.0208 -0.2028  0.8365  3.6944 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)      7.7797     0.3048  25.520   <2e-16 ***
tots$sentiment  -0.7718     0.2158  -3.576   0.0011 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.571 on 33 degrees of freedom
Multiple R-squared:  0.2793,    Adjusted R-squared:  0.2575 
F-statistic: 12.79 on 1 and 33 DF,  p-value: 0.001099

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
library(readr)
# Load the hospital review data from the Desktop CSV into `df`.
df <- read_csv(file = "~/Desktop/nycHospitalReviews.csv")
```

```{r}
library(dplyr)
# Count the reviews each hospital received.
# is.null() inspects the whole column object, not its elements, so the old
# flag was a constant FALSE that only became a usable 1 after a "+ 1" patch.
# Flag each row by whether its review text is present instead, so the sum per
# hospital is the number of actual reviews.
df <- mutate(df, count = as.integer(!is.na(reviews)))
totComp <- aggregate(count ~ hospital, data = df, FUN = sum)
# Hospitals ordered from most to fewest reviews.
tot_sort <- totComp[order(-totComp$count), ]

```

```{r}

library(stringr)
library(syuzhet)
#df$reviewText = as.character(df$reviewText)

#find the sentiment for each hospital
# Score every review with get_sentiment()'s default method, then average the
# scores per hospital.
df$sentiment <- get_sentiment(df$reviews)
totSent <- aggregate(sentiment ~ hospital, data = df, FUN = mean)
# Most positive hospitals first. The column name is written out in full:
# `$sent` only resolved through partial name matching, which silently breaks
# (returns NULL) as soon as a second column starting with "sent" exists.
totSent <- totSent[order(-totSent$sentiment), ]

#find the word count over sentiment for each of 6 words

# Total number of matches of `pattern` across each hospital's reviews.
#   reviews_df: data frame with `hospital` and `reviews` columns.
#   pattern:    pattern handed to str_count(); it is matched as a
#               case-sensitive substring, so "pain" also counts "painful".
# Returns a data frame with columns `hospital` and `word_count`.
# Works on a local copy, so `df` no longer carries a scratch `word_count`
# column that each word overwrote in turn.
count_word_by_hospital <- function(reviews_df, pattern) {
  counts <- data.frame(
    hospital = reviews_df$hospital,
    word_count = str_count(reviews_df$reviews, pattern),
    stringsAsFactors = FALSE
  )
  aggregate(word_count ~ hospital, data = counts, FUN = sum)
}

# Label a per-hospital tally with its word and attach each hospital's mean
# sentiment, sorted from least to most positive.
#   totals:                output of count_word_by_hospital().
#   label:                 display name stored in the `word` column.
#   sentiment_by_hospital: data frame with `hospital` and `sentiment` columns.
# The sort column is spelled out (`sentiment`) rather than relying on `$sent`
# partial matching.
attach_sentiment <- function(totals, label, sentiment_by_hospital) {
  totals$word <- label
  merged <- merge(totals, sentiment_by_hospital, by = "hospital", sort = TRUE)
  merged[order(merged$sentiment), ]
}

totRude <- count_word_by_hospital(df, "rude")
totRude1 <- attach_sentiment(totRude, "Rude", totSent)

totUnprofes <- count_word_by_hospital(df, "unprofessional")
totUnprofes1 <- attach_sentiment(totUnprofes, "Unprofessional", totSent)

totPain <- count_word_by_hospital(df, "pain")
totPain1 <- attach_sentiment(totPain, "Pain", totSent)

totFriend <- count_word_by_hospital(df, "friendly")
totFriend1 <- attach_sentiment(totFriend, "Friendly", totSent)

totKind <- count_word_by_hospital(df, "kind")
totKind1 <- attach_sentiment(totKind, "Kind", totSent)

totBaby <- count_word_by_hospital(df, "baby")
totBaby1 <- attach_sentiment(totBaby, "Baby", totSent)

# Long table: one row per (hospital, word) with the count and mean sentiment.
wordword <- rbind(totRude1, totPain1, totUnprofes1, totFriend1, totBaby1, totKind1)


#plot the data 
library(ggplot2)


# Per-hospital word counts against mean sentiment, one panel (and one colour)
# per word.
ggplot(data = wordword,
       mapping = aes(x = sentiment, y = word_count, colour = word)) +
  facet_wrap(~ word) +
  geom_point()

# Word-count table for a single site: Northwell Health-GoHealth Urgent Care
northwell_rows <- wordword$hospital == 'Northwell Health-GoHealth Urgent Care'
word_northwell <- wordword[northwell_rows, ]

# Drop the columns the table does not show; `word_count` and `word` remain,
# in that order.
shown_cols <- setdiff(names(word_northwell), c("hospital", "sentiment"))
word_northwell <- word_northwell[, shown_cols]

# Give the word column (second position) its display header, then list the
# most frequent words first.
table_header <- "Words in NorthwellHealth-GoHealth Urgent Care"
names(word_northwell)[2] <- table_header
word_northwell <- word_northwell[order(-word_northwell$word_count), ]

library(huxtable)
word_northwell <- as_hux(word_northwell)

# Put the word column before the count, add the header row, and print with
# the basic theme.
word_northwell <- word_northwell[, c(table_header, "word_count")]
word_northwell <- huxtable::add_colnames(word_northwell)

theme_basic(word_northwell)

```

```{r}

#decided not to use this part! 


#urgent care!
# Flag hospitals whose name mentions urgent care, in either capitalisation.
df$urgent <- str_detect(df$hospital, "urgent")
df$Urgent <- str_detect(df$hospital, "Urgent")
is_urgent <- df$Urgent | df$urgent

# Split the reviews into two disjoint groups. The previous non-urgent filter
# (`Urgent == FALSE | urgent == FALSE`) kept any row that failed *either*
# spelling, so "Urgent Care" hospitals ended up in both groups.
urgent <- df[which(is_urgent), ]
nonUrgent <- df[which(!is_urgent), ]

# Per-review sentiment within each group.
urgent$pain_sent <- get_sentiment(urgent$reviews)
nonUrgent$pain_sent <- get_sentiment(nonUrgent$reviews)




# Total "pain" mentions per hospital in each group, paired with the
# hospital's mean sentiment from totSent and sorted from least to most
# positive. The sort column is spelled out in full; `$sent` only worked via
# partial name matching on the merged data frame.
urgent$word_count <- str_count(urgent$reviews, "pain")
totPain <- aggregate(word_count ~ hospital, data = urgent, FUN = sum)
totPain$word <- "Urgent- Pain"
totPain <- merge(totPain, totSent, by = 'hospital', sort = TRUE)
totPain <- totPain[order(totPain$sentiment), ]

nonUrgent$word_count <- str_count(nonUrgent$reviews, "pain")
totPain1 <- aggregate(word_count ~ hospital, data = nonUrgent, FUN = sum)
totPain1$word <- "Non Urgent - Pain"
totPain1 <- merge(totPain1, totSent, by = 'hospital', sort = TRUE)
totPain1 <- totPain1[order(totPain1$sentiment), ]

# Stack both groups so they can be drawn as two panels.
pain_urgent <- rbind(totPain, totPain1)

# "pain" mentions against mean sentiment, one panel per group.
ggplot(data = pain_urgent,
       mapping = aes(x = sentiment, y = word_count, colour = word)) +
  facet_wrap(~ word) +
  geom_point()

```

```{r}
#prepare data for extraction to be used in Tableau
# One row per hospital: mean sentiment joined with the coordinates and the
# numRev column taken from the first matching review row.
df2 <- df[, c("hospital", "Longitude", "Latitude", "numRev")]
to_export <- merge(totSent, df2, by = "hospital")
to_export <- to_export[!duplicated(to_export$hospital), ]

# Keep the signed score in `sent`; `sentiment` becomes its magnitude and
# `isNeg` records whether the signed score was below zero.
to_export$sent <- to_export$sentiment
to_export <- mutate(to_export, isNeg = (sentiment < 0))
to_export$sentiment <- abs(to_export$sentiment)

# Write to an explicit path instead of calling setwd(): changing the working
# directory inside a chunk is reset by the notebook afterwards and leaks
# global state when the code is run as a plain script.
write.csv(to_export, file.path("~/Desktop", "long_lat_rev1.csv"))


```

```{r}

library(tm)
library(wordcloud)


# Build a corpus with one document per review.
corp <- VCorpus(VectorSource(df$reviews))

# Extra words to drop on top of the standard English stop-word list.
extra_stopwords <- c(
  "hello", "is", "it", "me", "you're", "looking", "for?", "said", "the",
  "will", "read", "one", "also", "doctor", "doctors", "hospital", "get",
  "just", "like", "told"
)

# Clean the corpus. The steps run in this exact order.
# NOTE(review): punctuation is stripped first, so entries that contain
# punctuation ("you're", "for?") may never match afterwards - confirm.
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, content_transformer(tolower), lazy = TRUE)
corp <- tm_map(corp, content_transformer(removeWords), stopwords("english"), lazy = TRUE)
corp <- tm_map(corp, stripWhitespace)
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, removeWords, extra_stopwords)


#make a document term matrix 
dtm <- DocumentTermMatrix(corp)

# Remove sparse terms: anything absent from more than 99% of the reviews.
# (This threshold does not cap the vocabulary at a fixed number of words;
# dim() below shows how many terms survive.)
dtms <- removeSparseTerms(dtm, .99)
dim(dtms)
m <- as.matrix(dtms)

# Correlate the per-review sentiment score with every term column of the
# matrix. The result is a 1 x n_terms matrix whose column names are the terms.
corr <- cor(df$sentiment, m)

# Term-by-term table of correlations scaled by 100, strongest positive first.
# Taking the whole first row avoids the `1:nrow()` index, which counts
# backwards (1, 0) when there are no terms.
sentRev <- data.frame(colnames(m))
sentRev$corr <- corr[1, ] * 100
sentRev <- sentRev[order(-sentRev$corr), ]


# Keep only those in the top 100
# head() returns at most 100 indices, so a vocabulary smaller than 100 terms
# does not pad the selection with NA the way `[1:100]` does.
pos100 <- head(order(corr, decreasing = TRUE), 100)
pos100words <- colnames(corr)[pos100]

posCorr <- corr[pos100]
#make a word cloud with the size based on the correlation
wordcloud(pos100words, freq = posCorr, scale = c(2.3, 0.005), max.words = 110)


# Keep only those in the bottom 100
neg100 <- head(order(corr, decreasing = FALSE), 100)
neg100words <- colnames(corr)[neg100]
negCorr <- corr[neg100]
#make a word cloud with the size based on the (absolute) correlation
# The words vector is neg100words; the previous call referenced an undefined
# `neg30words` and failed before drawing anything.
wordcloud(neg100words, scale = c(2.5, 0.005), freq = abs(negCorr))

```


```{r}
library(readr)
# Load the hospital infection indicator data into `infe`.
infe <- read_csv(file = "~/Desktop/infection.csv")
```


```{r}

#get the relevant data 
# Keep infection records only for hospitals that also appear in the review
# sentiment table (matched on the exact hospital name).
infections <- infe[which(infe$`Hospital Name` %in% totSent$hospital), ]

#find the total infections
totInf <- aggregate(`Infections Observed` ~ `Hospital Name`, data = infections, FUN = sum)
totInf <- totInf[order(-totInf$`Infections Observed`), ]

#find all of the hospital codes
# Take the ID as recorded instead of averaging it: the mean of identifiers is
# only valid when every row of a hospital carries the same ID, and otherwise
# invents an ID that belongs to no facility.
totCodes <- aggregate(`Facility ID` ~ `Hospital Name`, data = infections,
                      FUN = function(id) id[1])

#merge the data sets
tots <- merge(totSent, totInf, by.x = 'hospital', by.y = 'Hospital Name', sort = TRUE)
tots <- merge(tots, totCodes, by.x = 'hospital', by.y = 'Hospital Name', sort = TRUE)

#get rid of the outlier
tots <- tots[!(tots$sentiment > 7.1), ]

#make a linear regression model
# Formula + `data =` instead of `tots$...` vectors, so the coefficient is
# named after the column and the model stays usable with predict().
fit <- lm(`Infections Observed` ~ sentiment, data = tots)
summary(fit)

library(ggplot2)

ggplot(tots, aes(x = sentiment, y = `Infections Observed`)) +
  geom_point(color = "darkred", size = 4, alpha = .5) +
  geom_smooth(method = "lm", se = TRUE) +
  xlab("Sentiment") +
  ylab("Infections Observed")
```



```{r}
library(readr)
death <- read_csv("~/Desktop/deaths.csv")


#get relevant deaths based on facility id 
deaths <- death[which(death$`Facility ID` %in% tots$`Facility ID`), ]

#count the dead vs. alive patients
# `dead` holds the row's discharge count when the row is a death record and
# 0 otherwise.
# NOTE(review): "Dea" looks like a truncated status code - confirm it matches
# the values actually stored in live_status.
deaths <- mutate(deaths, dead = (live_status == "Dea"))
deaths$dead <- as.integer(deaths$dead) * deaths$`Number of Discharges`
totDead <- aggregate(dead ~ `Hospital Name`, data = deaths, FUN = sum)
totPatients <- aggregate(`Number of Discharges` ~ `Hospital Name`, data = deaths, FUN = sum)
# Take the ID as recorded instead of averaging it: the mean of identifiers is
# only valid when every row of a hospital carries the same ID.
totID <- aggregate(`Facility ID` ~ `Hospital Name`, data = deaths,
                   FUN = function(id) id[1])


# Combine the per-hospital totals, then join onto `tots` by facility ID (the
# hospital name from this file is dropped so it does not clash on merge).
totsID <- merge(totID, totDead, by.x = 'Hospital Name', by.y = 'Hospital Name', sort = TRUE)
totsID <- merge(totsID, totPatients, by.x = 'Hospital Name', by.y = 'Hospital Name', sort = TRUE)
totsID$`Hospital Name` <- NULL
tots <- merge(tots, totsID, by = 'Facility ID', sort = TRUE)

#find the percentage of deaths
# Stored as a proportion (deaths / discharges), i.e. on a 0-1 scale.
tots$percent_death <- tots$dead / tots$`Number of Discharges`
tots <- tots[!(tots$sentiment > 7.1), ]

#linear regression model
ggplot(tots, aes(x = sentiment, y = percent_death)) +
  geom_point(color = "darkred", size = 4, alpha = .5) +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("Sentiment") +
  ylab("Percent of Discharges that Result in Death")

# Formula + `data =` instead of `tots$...` vectors, so the coefficient is
# named after the column and the model stays usable with predict().
fit <- lm(percent_death ~ sentiment, data = tots)
summary(fit)

```

```{r}
#remove another outlier!!
# Drop rows above either threshold. which() keeps the filter NA-safe: a bare
# logical index containing NA (e.g. percent_death from 0 / 0 discharges)
# would insert an all-NA row instead of dropping anything.
within_range <- !(tots$percent_death > 0.8) & !(tots$sentiment > 7.1)
tots <- tots[which(within_range), ]

#linear regression
ggplot(tots, aes(x = sentiment, y = percent_death)) +
  geom_point(color = "darkred", size = 4, alpha = .5) +
  geom_smooth(method = "lm", se = TRUE) +
  xlab("Sentiment") +
  ylab("Percent of Discharges that Result in Death")

# Formula + `data =` instead of `tots$...` vectors, so the coefficient is
# named after the column and the model stays usable with predict().
fit <- lm(percent_death ~ sentiment, data = tots)
summary(fit)
```

```{r}

#find length of stay 
# Plain mean of the `Average Length of Stay` rows per facility.
# NOTE(review): the rows are already averages over differing discharge
# counts, so an unweighted mean may not equal the facility's true average -
# consider weighting by `Number of Discharges`.
totLength <- aggregate(`Average Length of Stay` ~ `Facility ID`, data = deaths, FUN = mean)
tots <- merge(tots, totLength, by = 'Facility ID', sort = TRUE)
tots <- tots[!(tots$sentiment > 6.0), ]

#linear regression
ggplot(tots, aes(x = sentiment, y = `Average Length of Stay`)) +
  geom_point(color = "darkred", size = 4, alpha = .5) +
  geom_smooth(method = "lm", se = TRUE) +
  xlab("Sentiment") +
  ylab("Average Length of Stay")

# Formula + `data =` instead of `tots$...` vectors, so the coefficient is
# named after the column and the model stays usable with predict().
fit <- lm(`Average Length of Stay` ~ sentiment, data = tots)
summary(fit)

```

```{r}

#read in readmission rates
library(readr)
readmis <- read_csv("~/Desktop/readmission.csv")

#count up the readmission rates 
# Keep the facilities already in `tots` and average their observed rate
# across the rows of the readmission file.
readmish <- readmis[which(readmis$`Facility ID` %in% tots$`Facility ID`), ]
totReadmis <- aggregate(`Observed PPR Rate` ~ `Facility ID`, data = readmish, FUN = mean)
tots <- merge(tots, totReadmis, by = 'Facility ID', sort = TRUE)
tots <- tots[!(tots$sentiment > 6.0), ]

#linear regression model
ggplot(tots, aes(x = sentiment, y = `Observed PPR Rate`)) +
  geom_point(color = "darkred", size = 4, alpha = .5) +
  geom_smooth(method = "lm", se = TRUE) +
  xlab("Sentiment") +
  ylab("Observed PPR Rate")

# Name the response column instead of using `tots[,9]`: the positional index
# silently points at a different variable whenever a merge above adds,
# drops, or reorders a column.
fit <- lm(`Observed PPR Rate` ~ sentiment, data = tots)
summary(fit)
```

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file).
