Descriptive Statistics

library(readxl)
# Load the RateMDs data set from the Excel workbook in the working directory.
ratemdsfinal <- read_excel(path = "ratemdsfinal.xlsx")

## New names:
## * `` -> ...1

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(xtable)
library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(knitr)
library(psych)
# Number of missing values in each column of the RateMDs data.
ratemdsfinal %>%
  is.na() %>%
  colSums()

##                        ...1                Doctor_names 
##                           0                           0 
##                      Gender         Years of Experience 
##                           0                          59 
##              Doctor_reviews                      Rating 
##                           0                           0 
##            Review_sentences          Count of Sentiment 
##                           0                           0 
##            Sum of Pos Score Count of Positive Sentiment 
##                           0                           0 
##         Positive_Proportion           Average pos_score 
##                           0                           0 
##            Sum of Neg Score         Negative_Proportion 
##                           0                           0 
## Count of Negative Sentiment           Average neg_score 
##                           0                           0 
##      Communication_positive           Expertisepositive 
##                           0                           0 
##                Timepositive            Bedside_positive 
##                           0                           0 
##              Officepositive                Costpositive 
##                           0                           0 
##      Communication_negative           Expertisenegative 
##                           0                           0 
##               Time_negative            Bedside_negative 
##                           0                           0 
##             Office_negative               Cost_negative 
##                           0                           0 
##                  Word Count                Review Count 
##                           0                           0 
##                Phrase Count    Average words per review 
##                           0                           0 
##               Overall_score 
##                           0

# Impute the missing `Years of Experience` values (59 in the NA summary above)
# with the mean of the observed values, rounded to a whole number of years.
ratemdsfinal$`Years of Experience`[is.na(ratemdsfinal$`Years of Experience`)] <-
  round(mean(ratemdsfinal$`Years of Experience`, na.rm = TRUE))

# Keep only the numeric columns for the descriptive statistics.
# vapply() guarantees one logical flag per column.
num_cols <- vapply(ratemdsfinal, is.numeric, logical(1))
rate <- ratemdsfinal[, num_cols]

# Drop `...1`, the unnamed spreadsheet column that was auto-named on import;
# it is numeric, so it would otherwise be summarised like a measurement.
rate[["...1"]] <- NULL
str(rate)

## tibble [793 × 28] (S3: tbl_df/tbl/data.frame)
##  $ Years of Experience        : num [1:793] 21 15 26 21 27 38 49 36 25 26 ...
##  $ Rating                     : num [1:793] 4.85 4.47 4.92 4.89 4.89 5 4.44 4.75 4.93 4.78 ...
##  $ Count of Sentiment         : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
##  $ Sum of Pos Score           : num [1:793] 9.81 18.01 7.11 8.41 6.54 ...
##  $ Count of Positive Sentiment: num [1:793] 12 21 8 9 9 7 15 10 7 8 ...
##  $ Positive_Proportion        : num [1:793] 0.857 0.913 0.889 1 1 ...
##  $ Average pos_score          : num [1:793] 0.817 0.858 0.889 0.934 0.726 ...
##  $ Sum of Neg Score           : num [1:793] -0.9394 -0.6902 -0.0018 0 0 ...
##  $ Negative_Proportion        : num [1:793] 0.143 0.087 0.111 0 0 ...
##  $ Count of Negative Sentiment: num [1:793] 2 2 1 0 0 0 2 1 0 0 ...
##  $ Average neg_score          : num [1:793] -0.4697 -0.3451 -0.0018 0 0 ...
##  $ Communication_positive     : num [1:793] 0.0323 0.0876 0 0.0513 0.1 ...
##  $ Expertisepositive          : num [1:793] 0.145 0.073 0.139 0.103 0.267 ...
##  $ Timepositive               : num [1:793] 0.0806 0.0438 0.0556 0.1538 0.1667 ...
##  $ Bedside_positive           : num [1:793] 0 0.19 0.194 0.103 0.167 ...
##  $ Officepositive             : num [1:793] 0.0645 0.0949 0.1111 0.0513 0.0333 ...
##  $ Costpositive               : num [1:793] 0.0161 0.073 0.0833 0 0.0667 ...
##  $ Communication_negative     : num [1:793] 0.0484 0.0438 0.0278 0 0 ...
##  $ Expertisenegative          : num [1:793] 0.0645 0.0219 0.0278 0 0.0333 ...
##  $ Time_negative              : num [1:793] 0.0968 0.0511 0 0 0 ...
##  $ Bedside_negative           : num [1:793] 0.0161 0.0219 0 0 0 ...
##  $ Office_negative            : num [1:793] 0.0323 0.0365 0 0 0 ...
##  $ Cost_negative              : num [1:793] 0 0.0219 0 0 0 ...
##  $ Word Count                 : num [1:793] 523 1604 288 358 280 ...
##  $ Review Count               : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
##  $ Phrase Count               : num [1:793] 67 165 38 42 48 32 133 93 41 67 ...
##  $ Average words per review   : num [1:793] 37 70 32 40 31 36 69 100 38 77 ...
##  $ Overall_score              : num [1:793] 0.348 0.513 0.887 0.934 0.726 ...

# Shorten the name of the first column (`Years of Experience`) to "Exp".
names(rate)[1L] <- "Exp"

# Descriptive statistics for RateMDs: keep only mean, sd, median, min, max
# and skew from the psych::describe() summary and render them as a table.
describe(rate)[c("mean", "sd", "median", "min", "max", "skew")] %>%
  xtable() %>%
  kable()

	mean	sd	median	min	max	skew
Exp	25.9394704	10.7023017	26.0000000	2.0000	56.0000	0.2925266
Rating	3.9887516	1.1456304	4.5000000	1.0000	5.0000	-1.0690984
Count of Sentiment	2.9621690	3.2622855	2.0000000	1.0000	30.0000	3.0905276
Sum of Pos Score	1.6952861	2.0505289	0.9493000	0.0000	18.0120	2.9120548
Count of Positive Sentiment	2.1601513	2.5047079	1.0000000	0.0000	21.0000	2.8523571
Positive_Proportion	0.7329744	0.3628249	1.0000000	0.0000	1.0000	-1.0791118
Average pos_score	0.6597428	0.3194366	0.7778000	0.0000	0.9923	-1.1775197
Sum of Neg Score	-0.4450890	0.8923222	0.0000000	-10.8437	0.0000	-4.3611618
Negative_Proportion	0.2263399	0.3439964	0.0000000	0.0000	1.0000	1.3315686
Count of Negative Sentiment	0.6935687	1.2951625	0.0000000	0.0000	15.0000	4.1368418
Average neg_score	-0.2453372	0.3380014	0.0000000	-0.9810	0.0000	-0.8781404
Communication_positive	0.0519260	0.1105353	0.0000000	0.0000	1.0000	4.0196863
Expertisepositive	0.1172052	0.1680621	0.0666667	0.0000	1.0000	2.6532162
Timepositive	0.0530472	0.1028384	0.0000000	0.0000	1.0000	3.2624407
Bedside_positive	0.1393430	0.1792870	0.1000000	0.0000	1.0000	2.2700317
Officepositive	0.0567404	0.1138064	0.0000000	0.0000	1.0000	3.5556046
Costpositive	0.0388190	0.0780689	0.0000000	0.0000	0.5000	2.6948463
Communication_negative	0.0427782	0.0923958	0.0000000	0.0000	1.0000	3.9127936
Expertisenegative	0.0620597	0.1284017	0.0000000	0.0000	1.0000	3.9608482
Time_negative	0.0499659	0.1058386	0.0000000	0.0000	1.0000	3.8477383
Bedside_negative	0.0335470	0.1045708	0.0000000	0.0000	1.0000	6.1851972
Office_negative	0.0337728	0.0817816	0.0000000	0.0000	0.6000	3.6060950
Cost_negative	0.0140710	0.0598381	0.0000000	0.0000	1.0000	8.8894980
Word Count	169.6973518	250.4917975	78.0000000	0.0000	2350.0000	3.5365532
Review Count	2.9621690	3.2622855	2.0000000	1.0000	30.0000	3.0905276
Phrase Count	17.4186633	23.0809676	9.0000000	1.0000	195.0000	3.4406515
Average words per review	52.6557377	53.6781351	40.0000000	0.0000	737.0000	5.8080336
Overall_score	0.4144056	0.5385908	0.6249000	-0.9810	0.9914	-0.9302287

# Checking for Normality 
library(ggpubr)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Loading required package: magrittr

# Normality checks for the main RateMDs variables. Each variable gets a normal
# Q-Q plot (ggqqplot) followed by a Shapiro-Wilk test (shapiro.test).
# Every test below reports p-value < 2.2e-16, so normality is rejected for all
# five variables.

# Rating
ggqqplot(rate$Rating)

shapiro.test(rate$Rating)

## 
##  Shapiro-Wilk normality test
## 
## data:  rate$Rating
## W = 0.83136, p-value < 2.2e-16

# Proportion of positive sentiment
ggqqplot(rate$Positive_Proportion)

shapiro.test(rate$Positive_Proportion)

## 
##  Shapiro-Wilk normality test
## 
## data:  rate$Positive_Proportion
## W = 0.72269, p-value < 2.2e-16

# Proportion of negative sentiment
ggqqplot(rate$Negative_Proportion)

shapiro.test(rate$Negative_Proportion)

## 
##  Shapiro-Wilk normality test
## 
## data:  rate$Negative_Proportion
## W = 0.67902, p-value < 2.2e-16

# Average positive sentiment score
ggqqplot(rate$`Average pos_score`)

shapiro.test(rate$`Average pos_score`)

## 
##  Shapiro-Wilk normality test
## 
## data:  rate$`Average pos_score`
## W = 0.78975, p-value < 2.2e-16

# Average negative sentiment score
ggqqplot(rate$`Average neg_score`)

shapiro.test(rate$`Average neg_score`)

## 
##  Shapiro-Wilk normality test
## 
## data:  rate$`Average neg_score`
## W = 0.7132, p-value < 2.2e-16

## Two-sample t-tests (equal variances assumed) for RateMDs:
## one test per numeric column of `rate`, comparing the two Gender groups.
## The result is a list of test objects named after the columns.
a <- lapply(
  X = rate,
  FUN = function(x) {
    t.test(x ~ ratemdsfinal$Gender, var.equal = TRUE)
  }
)

Statistics based on Gender

#str(ratemdsfinal)
# Treat Gender as a factor so the fill scale gets one colour per level.
ratemdsfinal$Gender <- factor(ratemdsfinal$Gender)

# Overlaid density curves of one column, one curve per gender.
#
#   data     data frame holding the column to plot
#   gender   vector with one gender value per row of `data`
#   column   name of the column of `data` shown on the x axis
#   title, x_label, y_label   text handed to labs()
#   alpha    fill transparency of the curves
#
# Returns the ggplot object; at top level it is printed automatically.
#
# The legend is hidden and every title carries a "(blue=Female, red=Male)"
# note instead. ggplot2's default hue scale would give the first factor level
# (Female) the red hue and Male the blue one, i.e. the opposite of that note,
# so the colours are pinned explicitly. The hex values are the two default
# ggplot2 hues, only assigned the other way round.
plot_density_by_gender <- function(data, gender, column,
                                   title, x_label, y_label, alpha = 0.5) {
  plot_data <- data.frame(value = data[[column]], Gender = gender)
  ggplot(plot_data, aes(x = value, fill = Gender)) +
    geom_density(alpha = alpha, position = "identity") +
    scale_fill_manual(values = c(Female = "#00BFC4", Male = "#F8766D")) +
    labs(title = title, x = x_label, y = y_label) + # add title and axis labels
    theme(legend.position = "none") # hide the legend of the fill aesthetic
}

# plot Rating over Gender
plot_density_by_gender(
  rate, ratemdsfinal$Gender, "Rating",
  title = "Density of Rating by Gender on RateMDs\n(blue=Female, red=Male)",
  x_label = "Rating",
  y_label = "Density of Rating",
  alpha = 0.4
)

# plot Positive_Proportion over Gender
plot_density_by_gender(
  rate, ratemdsfinal$Gender, "Positive_Proportion",
  title = "Density of Positive Proportion by Gender on RateMDs\n(blue=Female, red=Male)",
  x_label = "Positive_Proportion",
  y_label = "Density of Positive Proportion"
)

# plot Negative_Proportion over Gender
plot_density_by_gender(
  rate, ratemdsfinal$Gender, "Negative_Proportion",
  title = "Density of Negative_Proportion by Gender on RateMDs\n(blue=Female, red=Male)",
  x_label = "Negative_Proportion",
  y_label = "Density of Negative_Proportion"
)

# plot `Average neg_score` over Gender
plot_density_by_gender(
  rate, ratemdsfinal$Gender, "Average neg_score",
  title = "Density of `Average neg_score` by Gender on RateMDs\n(blue=Female, red=Male)",
  x_label = "`Average neg_score`",
  y_label = "Density of `Average neg_score`"
)

# plot `Average pos_score` over Gender
plot_density_by_gender(
  rate, ratemdsfinal$Gender, "Average pos_score",
  title = "Density of `Average pos_score` by Gender on RateMDs\n(blue=Female, red=Male)",
  x_label = "`Average pos_score`",
  y_label = "Density of `Average pos_score`"
)

# plot Years of Experience (column `Exp`) over Gender
plot_density_by_gender(
  rate, ratemdsfinal$Gender, "Exp",
  title = "Density of Years of Experience by Gender on RateMDs\n(blue=Female, red=Male)",
  x_label = "Years of Experience - in years",
  y_label = "Density of Years of Experience"
)

# plot Overall_score over Gender
plot_density_by_gender(
  rate, ratemdsfinal$Gender, "Overall_score",
  title = "Density of Overall_score by Gender on RateMDs\n(blue=Female, red=Male)",
  x_label = "Overall_score",
  y_label = "Density of Overall_score"
)

## Statistics by Number of Years of Experience

# Re-count the missing values per column.
ratemdsfinal %>%
  is.na() %>%
  colSums()

##                        ...1                Doctor_names 
##                           0                           0 
##                      Gender         Years of Experience 
##                           0                           0 
##              Doctor_reviews                      Rating 
##                           0                           0 
##            Review_sentences          Count of Sentiment 
##                           0                           0 
##            Sum of Pos Score Count of Positive Sentiment 
##                           0                           0 
##         Positive_Proportion           Average pos_score 
##                           0                           0 
##            Sum of Neg Score         Negative_Proportion 
##                           0                           0 
## Count of Negative Sentiment           Average neg_score 
##                           0                           0 
##      Communication_positive           Expertisepositive 
##                           0                           0 
##                Timepositive            Bedside_positive 
##                           0                           0 
##              Officepositive                Costpositive 
##                           0                           0 
##      Communication_negative           Expertisenegative 
##                           0                           0 
##               Time_negative            Bedside_negative 
##                           0                           0 
##             Office_negative               Cost_negative 
##                           0                           0 
##                  Word Count                Review Count 
##                           0                           0 
##                Phrase Count    Average words per review 
##                           0                           0 
##               Overall_score 
##                           0

# Keep a copy of the data frame as it is at this point.
r <- ratemdsfinal

# Five-number summary (plus mean) of the years of experience.
summary(ratemdsfinal[["Years of Experience"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   19.00   26.00   25.94   33.00   56.00

# Bin the years of experience into four ordered groups.
# cut() works on the numeric values directly, so a value that is not a whole
# number or lies above 56 still lands in a bin instead of silently becoming NA
# (which is what matching against hard-coded lists of years would do).
# Intervals are right-closed: (-Inf, 6], (6, 14], (14, 23], (23, Inf).
#
# NOTE(review): the first bin includes 6, so its label "Less than 6 years" is
# slightly off, and "7 to 14 years " carries a trailing space. Both labels are
# kept unchanged in case later code refers to them - confirm and tidy up.
#
# This overwrites the numeric column in place; the un-binned values remain
# available in the copy `r` taken above.
ratemdsfinal$`Years of Experience` <- cut(
  ratemdsfinal$`Years of Experience`,
  breaks = c(-Inf, 6, 14, 23, Inf),
  labels = c(
    "Less than 6 years",
    "7 to 14 years ",
    "15 to 23 years",
    "24 and above"
  )
)
summary(ratemdsfinal$`Years of Experience`)

## Less than 6 years    7 to 14 years     15 to 23 years      24 and above 
##                19                86               240               448

# Density of the rating, one panel per experience group of the doctor.
# The experience group is added to the plot data as a proper column so that
# the facets refer to it by name rather than through `ratemdsfinal$...`.
rate %>%
  mutate(Experience = ratemdsfinal$`Years of Experience`) %>%
  ggplot(aes(x = Rating)) +
  geom_density(alpha = .4, position = "identity") +
  facet_wrap(~ Experience) +
  ggtitle("Visualising Years of Experience over Rating") +
  xlab("Rating") +
  theme_bw()

# Same densities overlaid in a single panel, coloured by experience group.
# The legend title is set explicitly; otherwise it would show the raw
# expression used for the fill mapping.
rate %>%
  mutate(Experience = ratemdsfinal$`Years of Experience`) %>%
  ggplot(aes(x = Rating, fill = Experience)) +
  geom_density(alpha = .7, position = "identity") +
  labs(
    title = "Density of Rating over Years of Experience on RateMDs",
    x = "Rating Received on RateMDs",
    y = "Density of Rating",
    fill = "Years of Experience"
  )

# Structure of the full data set after the Gender and experience conversions.
str(ratemdsfinal)

## tibble [793 × 33] (S3: tbl_df/tbl/data.frame)
##  $ ...1                       : num [1:793] 0 1 2 3 4 5 6 7 8 9 ...
##  $ Doctor_names               : chr [1:793] "Dr. Martine T. Nelson" "Dr. Jason C. Tjaden" "Dr. Paul Tortoriello" "Dr. Maria Rosa" ...
##  $ Gender                     : Factor w/ 2 levels "Female","Male": 1 2 2 1 1 1 2 2 2 1 ...
##  $ Years of Experience        : Factor w/ 4 levels "Less than 6 years",..: 3 3 4 3 4 4 4 4 4 4 ...
##  $ Doctor_reviews             : chr [1:793] "['Dr Nelson and her staff are wonderful! They are always on time and very caring!', 'Dr. Nelson was our family "| __truncated__ "['We love Dr. Tjaden! Our daughter had medical issues when she was born and he was incredibly supportive, affir"| __truncated__ "['Excellent doctor! Absolute best pediatrician. We switched from another local doctor who misdiagnosed my daugh"| __truncated__ "[\"Dr. Rosa is one of a kind. Genuine, caring and full of kindness. She's gentle with my girls and they LOVE he"| __truncated__ ...
##  $ Rating                     : num [1:793] 4.85 4.47 4.92 4.89 4.89 5 4.44 4.75 4.93 4.78 ...
##  $ Review_sentences           : chr [1:793] "[\"['Dr Nelson and her staff are wonderful\", ' They are always on time and very caring', \", 'Dr\", ' Nelson w"| __truncated__ "[\"['We love Dr\", ' Tjaden', ' Our daughter had medical issues when she was born and he was incredibly support"| __truncated__ "[\"['Excellent doctor\", ' Absolute best pediatrician', ' We switched from another local doctor who misdiagnose"| __truncated__ "['[\"Dr', ' Rosa is one of a kind', ' Genuine, caring and full of kindness', \" She's gentle with my girls and "| __truncated__ ...
##  $ Count of Sentiment         : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
##  $ Sum of Pos Score           : num [1:793] 9.81 18.01 7.11 8.41 6.54 ...
##  $ Count of Positive Sentiment: num [1:793] 12 21 8 9 9 7 15 10 7 8 ...
##  $ Positive_Proportion        : num [1:793] 0.857 0.913 0.889 1 1 ...
##  $ Average pos_score          : num [1:793] 0.817 0.858 0.889 0.934 0.726 ...
##  $ Sum of Neg Score           : num [1:793] -0.9394 -0.6902 -0.0018 0 0 ...
##  $ Negative_Proportion        : num [1:793] 0.143 0.087 0.111 0 0 ...
##  $ Count of Negative Sentiment: num [1:793] 2 2 1 0 0 0 2 1 0 0 ...
##  $ Average neg_score          : num [1:793] -0.4697 -0.3451 -0.0018 0 0 ...
##  $ Communication_positive     : num [1:793] 0.0323 0.0876 0 0.0513 0.1 ...
##  $ Expertisepositive          : num [1:793] 0.145 0.073 0.139 0.103 0.267 ...
##  $ Timepositive               : num [1:793] 0.0806 0.0438 0.0556 0.1538 0.1667 ...
##  $ Bedside_positive           : num [1:793] 0 0.19 0.194 0.103 0.167 ...
##  $ Officepositive             : num [1:793] 0.0645 0.0949 0.1111 0.0513 0.0333 ...
##  $ Costpositive               : num [1:793] 0.0161 0.073 0.0833 0 0.0667 ...
##  $ Communication_negative     : num [1:793] 0.0484 0.0438 0.0278 0 0 ...
##  $ Expertisenegative          : num [1:793] 0.0645 0.0219 0.0278 0 0.0333 ...
##  $ Time_negative              : num [1:793] 0.0968 0.0511 0 0 0 ...
##  $ Bedside_negative           : num [1:793] 0.0161 0.0219 0 0 0 ...
##  $ Office_negative            : num [1:793] 0.0323 0.0365 0 0 0 ...
##  $ Cost_negative              : num [1:793] 0 0.0219 0 0 0 ...
##  $ Word Count                 : num [1:793] 523 1604 288 358 280 ...
##  $ Review Count               : num [1:793] 14 23 9 9 9 7 19 11 7 8 ...
##  $ Phrase Count               : num [1:793] 67 165 38 42 48 32 133 93 41 67 ...
##  $ Average words per review   : num [1:793] 37 70 32 40 31 36 69 100 38 77 ...
##  $ Overall_score              : num [1:793] 0.348 0.513 0.887 0.934 0.726 ...

Healthgrades

library(readxl)
# Load the Healthgrades data set from the Excel workbook in the working
# directory and inspect its structure.
healthgrades <- read_excel(path = "healthgradesfinal.xlsx")

str(healthgrades)

## tibble [247 × 33] (S3: tbl_df/tbl/data.frame)
##  $ DoctorName              : chr [1:247] "Dr. Amy Williams, MD" "Dr. Roma Franzia, MD" "Dr. Elizabeth Manjooran, MD" "Dr. Jason Canel, MD" ...
##  $ Age                     : chr [1:247] "• Age 42" "• Age 50" "• Age 59" "• Age 48" ...
##  $ Gender                  : chr [1:247] "Female" "Female" "Female" "Male" ...
##  $ Speciality              : chr [1:247] "Pediatrics" "Pediatrics" "Pediatrics" "Pediatrics" ...
##  $ Years of Experience     : num [1:247] 12 21 28 11 43 24 42 39 16 41 ...
##  $ Biography               : chr [1:247] "Dr. Amy Williams, MD is a pediatrics specialist in Chicago, IL. She specializes in pediatrics." "Dr. Roma Franzia, MD is a pediatrics specialist in Winnetka, IL and has been practicing for 21 years. She gradu"| __truncated__ "Dr. Elizabeth Manjooran, MD is a pediatrics specialist in Des Plaines, IL. She specializes in pediatrics." "Dr. Jason Canel, MD is a pediatrics specialist in Glenview, IL and has been practicing for 11 years. He graduat"| __truncated__ ...
##  $ Rating                  : num [1:247] 4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
##  $ Reviews                 : chr [1:247] "[\"I've been seeing Dr. Williams for years and through 2 pregnancies. She is the absolute best. Caring, compass"| __truncated__ "[\"Dr. Roma Franzia is the best pediatrician ever! So caring about my kids and family as a whole, always availa"| __truncated__ "['She’s not only a very good doctor but also a wonderful person! Also her office staff is very professional, wa"| __truncated__ "[\"We were assigned Dr Canel at my daughter's birth. I am so glad he was on call that day! My daughter is now t"| __truncated__ ...
##  $ Review_cleaned          : chr [1:247] "['[\"I\\'ve been seeing Dr', ' Williams for years ', ' through 2 pregnancies', ' She is the absolute best', ' C"| __truncated__ "['[\"Dr', ' Roma Franzia is the best pediatrician ever', ' So caring about my kids ', ' family as a whole', ' a"| __truncated__ "[\"['She’s not only a very good doctor \", ' also a wonderful person', ' Also her office staff is very professi"| __truncated__ "['[\"We were assigned Dr Canel at my daughter\\'s birth', ' I am so glad he was on call that day', ' My daughte"| __truncated__ ...
##  $ Word Count              : num [1:247] 614 793 176 199 84 182 144 690 50 68 ...
##  $ Review Count            : num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
##  $ Phrase Count            : num [1:247] 166 143 35 47 16 42 41 152 11 11 ...
##  $ Total Count of Sentiment: num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
##  $ Positive_Proportion     : num [1:247] 1 1 1 1 1 ...
##  $ Count of pos Sentiment  : num [1:247] 21 11 4 5 2 3 5 16 3 2 ...
##  $ Average pos_score       : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
##  $ Negative_Proportion     : num [1:247] 0 0 0 0 0 ...
##  $ Count of neg Sentiment  : num [1:247] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Average neg_score       : num [1:247] 0 0 0 0 0 0 0 -0.612 0 0 ...
##  $ Communication_positive  : num [1:247] 0.0676 0.0286 0.1 0.1538 0.3333 ...
##  $ Expertisepositive       : num [1:247] 0.189 0.114 0.1 0.154 0.167 ...
##  $ Timepositive            : num [1:247] 0.1216 0.0714 0 0.1154 0 ...
##  $ Bedside_positive        : num [1:247] 0.108 0.186 0.35 0.115 0.333 ...
##  $ Officepositive          : num [1:247] 0.027 0.0429 0.15 0.0769 0 ...
##  $ Costpositive            : num [1:247] 0.0541 0.0286 0 0.0385 0 ...
##  $ Communication_negative  : num [1:247] 0 0.0143 0 0 0 ...
##  $ Expertisenegative       : num [1:247] 0.027 0.0143 0 0 0 ...
##  $ Time_negative           : num [1:247] 0 0.0429 0 0 0 ...
##  $ Bedside_negative        : num [1:247] 0 0.0429 0 0 0 ...
##  $ Office_negative         : num [1:247] 0 0.0143 0 0 0 ...
##  $ Cost_negative           : num [1:247] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Average words per review: num [1:247] 29 72 44 40 42 61 29 41 17 34 ...
##  $ Overall_score           : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...

# Convert the two categorical text columns to factors in one step.
healthgrades[c("Gender", "Speciality")] <-
  lapply(healthgrades[c("Gender", "Speciality")], factor)

# Number of doctors per speciality.
table(healthgrades[["Speciality"]])

## 
## Allergy & Immunology          Dermatology      Family Medicine 
##                    3                    1                    2 
##    Internal Medicine    Neonatal Medicine   Pediatric Medicine 
##                   18                    1                    2 
##           Pediatrics 
##                  220

# Keep only the numeric columns of the Healthgrades data.
num_cols <- vapply(healthgrades, is.numeric, logical(1))
health <- healthgrades[, num_cols]

str(health)

## tibble [247 × 26] (S3: tbl_df/tbl/data.frame)
##  $ Years of Experience     : num [1:247] 12 21 28 11 43 24 42 39 16 41 ...
##  $ Rating                  : num [1:247] 4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
##  $ Word Count              : num [1:247] 614 793 176 199 84 182 144 690 50 68 ...
##  $ Review Count            : num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
##  $ Phrase Count            : num [1:247] 166 143 35 47 16 42 41 152 11 11 ...
##  $ Total Count of Sentiment: num [1:247] 21 11 4 5 2 3 5 17 3 2 ...
##  $ Positive_Proportion     : num [1:247] 1 1 1 1 1 ...
##  $ Count of pos Sentiment  : num [1:247] 21 11 4 5 2 3 5 16 3 2 ...
##  $ Average pos_score       : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...
##  $ Negative_Proportion     : num [1:247] 0 0 0 0 0 ...
##  $ Count of neg Sentiment  : num [1:247] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Average neg_score       : num [1:247] 0 0 0 0 0 0 0 -0.612 0 0 ...
##  $ Communication_positive  : num [1:247] 0.0676 0.0286 0.1 0.1538 0.3333 ...
##  $ Expertisepositive       : num [1:247] 0.189 0.114 0.1 0.154 0.167 ...
##  $ Timepositive            : num [1:247] 0.1216 0.0714 0 0.1154 0 ...
##  $ Bedside_positive        : num [1:247] 0.108 0.186 0.35 0.115 0.333 ...
##  $ Officepositive          : num [1:247] 0.027 0.0429 0.15 0.0769 0 ...
##  $ Costpositive            : num [1:247] 0.0541 0.0286 0 0.0385 0 ...
##  $ Communication_negative  : num [1:247] 0 0.0143 0 0 0 ...
##  $ Expertisenegative       : num [1:247] 0.027 0.0143 0 0 0 ...
##  $ Time_negative           : num [1:247] 0 0.0429 0 0 0 ...
##  $ Bedside_negative        : num [1:247] 0 0.0429 0 0 0 ...
##  $ Office_negative         : num [1:247] 0 0.0143 0 0 0 ...
##  $ Cost_negative           : num [1:247] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Average words per review: num [1:247] 29 72 44 40 42 61 29 41 17 34 ...
##  $ Overall_score           : num [1:247] 0.824 0.792 0.916 0.897 0.936 ...

# Descriptive statistics for Healthgrades: keep only mean, sd, median, min,
# max and skew from the psych::describe() summary and render them as a table.
describe(health)[c("mean", "sd", "median", "min", "max", "skew")] %>%
  xtable() %>%
  kable()

	mean	sd	median	min	max	skew
Years of Experience	25.8600823	10.9100838	26.0000000	2.0000	53.0000000	0.1548193
Rating	4.5979757	0.3479848	4.6000000	3.5000	5.0000000	-0.5336239
Word Count	194.0971660	270.5785255	124.0000000	11.0000	2018.0000000	4.1629431
Review Count	4.4089069	6.7522054	3.0000000	1.0000	54.0000000	4.9852808
Phrase Count	39.4777328	56.4946083	23.0000000	3.0000	432.0000000	4.4631481
Total Count of Sentiment	4.7692308	9.5512835	3.0000000	1.0000	90.0000000	6.8203153
Positive_Proportion	0.9075924	0.1964834	1.0000000	0.0000	1.0000000	-2.6376345
Count of pos Sentiment	4.2995951	9.0171088	2.0000000	0.0000	86.0000000	7.0230331
Average pos_score	0.7945390	0.1824017	0.8376093	0.0000	0.9857000	-2.5855164
Negative_Proportion	0.0924076	0.1964834	0.0000000	0.0000	1.0000000	2.6376345
Count of neg Sentiment	0.4696356	1.0997455	0.0000000	0.0000	9.0000000	4.0034054
Average neg_score	-0.1475103	0.2808211	0.0000000	-0.9471	0.0000000	-1.6245998
Communication_positive	0.1074784	0.1175589	0.0769231	0.0000	0.7500000	1.5961346
Expertisepositive	0.1554605	0.1370798	0.1428571	0.0000	1.0000000	1.6127717
Timepositive	0.0940978	0.1083136	0.0666667	0.0000	0.5000000	1.4619393
Bedside_positive	0.1563275	0.1493452	0.1428571	0.0000	1.0000000	1.9710141
Officepositive	0.0472850	0.0756680	0.0000000	0.0000	0.3333333	1.7511825
Costpositive	0.0276534	0.0562286	0.0000000	0.0000	0.3333333	2.7729653
Communication_negative	0.0310048	0.0852062	0.0000000	0.0000	1.0000000	6.8384268
Expertisenegative	0.0308365	0.0874027	0.0000000	0.0000	1.0000000	6.7252228
Time_negative	0.0365831	0.0713895	0.0000000	0.0000	0.5000000	2.8473511
Bedside_negative	0.0183953	0.0478848	0.0000000	0.0000	0.3333333	3.7851412
Office_negative	0.0151664	0.0443944	0.0000000	0.0000	0.3333333	4.4347950
Cost_negative	0.0052563	0.0278533	0.0000000	0.0000	0.3333333	8.2021183
Average words per review	45.6032389	19.9854741	44.0000000	8.0000	135.0000000	0.7046350
Overall_score	0.6470287	0.3555858	0.7937000	-0.8750	0.9857000	-1.5541893

# Checking for Normality 
library(ggpubr)
# Normality checks for the main Healthgrades variables. Each variable gets a
# normal Q-Q plot (ggqqplot) followed by a Shapiro-Wilk test (shapiro.test).
# All five tests below reject normality (p = 8.461e-11 for Rating,
# p < 2.2e-16 for the others).

# Rating
ggqqplot(health$Rating)

shapiro.test(health$Rating)

## 
##  Shapiro-Wilk normality test
## 
## data:  health$Rating
## W = 0.91311, p-value = 8.461e-11

# Proportion of positive sentiment
ggqqplot(health$Positive_Proportion)

shapiro.test(health$Positive_Proportion)

## 
##  Shapiro-Wilk normality test
## 
## data:  health$Positive_Proportion
## W = 0.54009, p-value < 2.2e-16

# Proportion of negative sentiment
# NOTE(review): W is identical to the Positive_Proportion test above;
# presumably the two proportions are complements here (their means sum to 1
# and their sds match in the descriptive table) - confirm.
ggqqplot(health$Negative_Proportion)

shapiro.test(health$Negative_Proportion)

## 
##  Shapiro-Wilk normality test
## 
## data:  health$Negative_Proportion
## W = 0.54009, p-value < 2.2e-16

# Average positive sentiment score
ggqqplot(health$`Average pos_score`)

shapiro.test(health$`Average pos_score`)

## 
##  Shapiro-Wilk normality test
## 
## data:  health$`Average pos_score`
## W = 0.72734, p-value < 2.2e-16

# Average negative sentiment score
ggqqplot(health$`Average neg_score`)

shapiro.test(health$`Average neg_score`)

## 
##  Shapiro-Wilk normality test
## 
## data:  health$`Average neg_score`
## W = 0.57777, p-value < 2.2e-16

## Two-sample t-tests (equal variances assumed) for Healthgrades:
## one test per numeric column of `health`, comparing the two Gender groups.
## Note that this reuses (and therefore overwrites) the variable `a`.
a <- lapply(
  X = health,
  FUN = function(x) {
    t.test(x ~ healthgrades$Gender, var.equal = TRUE)
  }
)

## Plots with Gender

# Overlaid density curves of one column, one curve per gender.
#
#   data     data frame holding the column to plot
#   gender   vector with one gender value per row of `data`
#   column   name of the column of `data` shown on the x axis
#   title, x_label, y_label   text handed to labs()
#   alpha    fill transparency of the curves
#
# Returns the ggplot object; at top level it is printed automatically.
#
# The legend is hidden and every title carries a "(blue=Female, red=Male)"
# note instead. ggplot2's default hue scale would give the first factor level
# (Female) the red hue and Male the blue one, i.e. the opposite of that note,
# so the colours are pinned explicitly. The hex values are the two default
# ggplot2 hues, only assigned the other way round.
plot_density_by_gender <- function(data, gender, column,
                                   title, x_label, y_label, alpha = 0.5) {
  plot_data <- data.frame(value = data[[column]], Gender = gender)
  ggplot(plot_data, aes(x = value, fill = Gender)) +
    geom_density(alpha = alpha, position = "identity") +
    scale_fill_manual(values = c(Female = "#00BFC4", Male = "#F8766D")) +
    labs(title = title, x = x_label, y = y_label) + # add title and axis labels
    theme(legend.position = "none") # hide the legend of the fill aesthetic
}

# plot Rating over Gender
plot_density_by_gender(
  health, healthgrades$Gender, "Rating",
  title = "Density of Rating by Gender on HealthGrades\n(blue=Female, red=Male)",
  x_label = "Rating",
  y_label = "Density of Rating",
  alpha = 0.4
)

# plot Positive_Proportion over Gender
plot_density_by_gender(
  health, healthgrades$Gender, "Positive_Proportion",
  title = "Density of Positive Proportion by Gender on HealthGrades\n(blue=Female, red=Male)",
  x_label = "Positive_Proportion",
  y_label = "Density of Positive Proportion"
)

# plot Negative_Proportion over Gender
plot_density_by_gender(
  health, healthgrades$Gender, "Negative_Proportion",
  title = "Density of Negative_Proportion by Gender on HealthGrades\n(blue=Female, red=Male)",
  x_label = "Negative_Proportion",
  y_label = "Density of Negative_Proportion"
)

# plot `Average neg_score` over Gender
plot_density_by_gender(
  health, healthgrades$Gender, "Average neg_score",
  title = "Density of `Average neg_score` by Gender on HealthGrades\n(blue=Female, red=Male)",
  x_label = "`Average neg_score`",
  y_label = "Density of `Average neg_score`"
)

# plot `Average pos_score` over Gender
plot_density_by_gender(
  health, healthgrades$Gender, "Average pos_score",
  title = "Density of `Average pos_score` by Gender on HealthGrades\n(blue=Female, red=Male)",
  x_label = "`Average pos_score`",
  y_label = "Density of `Average pos_score`"
)

# plot Years of Experience over Gender
plot_density_by_gender(
  health, healthgrades$Gender, "Years of Experience",
  title = "Density of Years of Experience by Gender on HealthGrades\n(blue=Female, red=Male)",
  x_label = "Years of Experience - in years",
  y_label = "Density of Years of Experience"
)

## Warning: Removed 4 rows containing non-finite values (stat_density).

# plot Overall_score over Gender
plot_density_by_gender(
  health, healthgrades$Gender, "Overall_score",
  title = "Density of Overall_score by Gender on HealthGrades\n(blue=Female, red=Male)",
  x_label = "Overall_score",
  y_label = "Density of Overall_score"
)

library(effsize)

## 
## Attaching package: 'effsize'

## The following object is masked from 'package:psych':
## 
##     cohen.d

library(DT)

# Interactive table of the descriptive statistics of `rate`.
# NOTE(review): this summarises the RateMDs data (`rate`) although it sits in
# the Healthgrades section - confirm that `health` was not intended.
data_descriptives <- describe(rate)

# Show mean, sd, median, min, max and skew, rounded to two decimals.
# Only these six columns are displayed, so the rounding targets columns 1:6;
# the earlier range 1:13 pointed at columns that do not exist in the table.
datatable(
  data_descriptives[, c("mean", "sd", "median", "min", "max", "skew")]
) %>%
  formatRound(columns = 1:6, digits = 2)

Combined

hcommon=data.frame(healthgrades[,c(3,5,20,21,22,23,24,25,26,27,28,29,30,31,32,7,14,17,16,19,33)])
dim(hcommon)

## [1] 247  21

rcommon=data.frame(ratemdsfinal[,c(3,4,17,18,19,20,21,22,23,24,25,26,27,28,32,6,11,14,12,16,33)])
dim(rcommon)

## [1] 793  21

dcombine=data.frame(rbind(hcommon,rcommon))
str(dcombine)

## 'data.frame':    1040 obs. of  21 variables:
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 1 2 2 2 1 ...
##  $ Years.of.Experience     : chr  "12" "21" "28" "11" ...
##  $ Communication_positive  : num  0.0676 0.0286 0.1 0.1538 0.3333 ...
##  $ Expertisepositive       : num  0.189 0.114 0.1 0.154 0.167 ...
##  $ Timepositive            : num  0.1216 0.0714 0 0.1154 0 ...
##  $ Bedside_positive        : num  0.108 0.186 0.35 0.115 0.333 ...
##  $ Officepositive          : num  0.027 0.0429 0.15 0.0769 0 ...
##  $ Costpositive            : num  0.0541 0.0286 0 0.0385 0 ...
##  $ Communication_negative  : num  0 0.0143 0 0 0 ...
##  $ Expertisenegative       : num  0.027 0.0143 0 0 0 ...
##  $ Time_negative           : num  0 0.0429 0 0 0 ...
##  $ Bedside_negative        : num  0 0.0429 0 0 0 ...
##  $ Office_negative         : num  0 0.0143 0 0 0 ...
##  $ Cost_negative           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Average.words.per.review: num  29 72 44 40 42 61 29 41 17 34 ...
##  $ Rating                  : num  4.9 4.7 5 4.8 5 5 4.8 4.6 5 5 ...
##  $ Positive_Proportion     : num  1 1 1 1 1 ...
##  $ Negative_Proportion     : num  0 0 0 0 0 ...
##  $ Average.pos_score       : num  0.824 0.792 0.916 0.897 0.936 ...
##  $ Average.neg_score       : num  0 0 0 0 0 0 0 -0.612 0 0 ...
##  $ Overall_score           : num  0.824 0.792 0.916 0.897 0.936 ...

# Years.of.Experience is character after the rbind (see the str() output).
# Convert it to numeric rather than to a factor: a factor makes describe()
# summarise the level codes instead of the years, and makes density plots
# split the data into one group per distinct value.
# as.character() first keeps this correct even if the column is already a
# factor; entries that are not numbers become NA.
dcombine$Years.of.Experience <- as.numeric(
  as.character(dcombine$Years.of.Experience)
)

# Descriptive statistics for the combined data, rendered as a table.
# The six statistics are columns 3, 4, 5, 8, 9 and 11 of the describe() result.
describe(dcombine)[c("mean", "sd", "median", "min", "max", "skew")] %>%
  xtable() %>%
  kable()

	mean	sd	median	min	max	skew
Gender*	1.4730769	0.4995148	1.0000000	1.000	2.0000	0.1076933
Years.of.Experience*	19.4922780	12.6408180	18.0000000	1.000	51.0000	1.2210775
Communication_positive	0.0651197	0.1146517	0.0000000	0.000	1.0000	3.1973305
Expertisepositive	0.1262908	0.1620018	0.0909091	0.000	1.0000	2.4456190
Timepositive	0.0627967	0.1055687	0.0000000	0.000	1.0000	2.7014127
Bedside_positive	0.1433768	0.1727294	0.1111111	0.000	1.0000	2.2205717
Officepositive	0.0544948	0.1060410	0.0000000	0.000	1.0000	3.5336472
Costpositive	0.0361672	0.0736005	0.0000000	0.000	0.5000	2.7902871
Communication_negative	0.0399820	0.0908382	0.0000000	0.000	1.0000	4.4908606
Expertisenegative	0.0546442	0.1206358	0.0000000	0.000	1.0000	4.3331605
Time_negative	0.0467875	0.0988836	0.0000000	0.000	1.0000	3.8977334
Bedside_negative	0.0299485	0.0944457	0.0000000	0.000	1.0000	6.6026214
Office_negative	0.0293538	0.0750176	0.0000000	0.000	0.6000	3.8922822
Cost_negative	0.0119775	0.0541032	0.0000000	0.000	1.0000	9.5204150
Average.words.per.review	50.9807692	47.9577896	41.0000000	0.000	737.0000	6.3078409
Rating	4.1334423	1.0470945	4.5000000	1.000	5.0000	-1.3967250
Positive_Proportion	0.7744462	0.3391379	1.0000000	0.000	1.0000	-1.3426505
Negative_Proportion	0.1945310	0.3203035	0.0000000	0.000	1.0000	1.5719105
Average.pos_score	0.6917569	0.2982497	0.7988333	0.000	0.9923	-1.4246449
Average.neg_score	-0.2221033	0.3278596	0.0000000	-0.981	0.0000	-1.0272195
Overall_score	0.4696536	0.5107502	0.6907250	-0.981	0.9914	-1.1026585

# Multiple t-tests on the combined data (HealthGrades + RateMDs):
# every column except the first two (Gender, Years.of.Experience) is tested
# against Gender with equal variances assumed.
# The lambda argument stays `x` because t.test() stores it in the result's
# data.name.
a <- lapply(
  dcombine[, -(1:2)],
  function(x) {
    t.test(x ~ dcombine$Gender, var.equal = TRUE)
  }
)

# Density of Rating by gender for the combined data.
dcombine %>%
  # Refer to the column directly; `dcombine$Gender` inside aes() triggers
  # ggplot2's "Use of `dcombine$Gender` is discouraged" warning.
  ggplot(aes(x = Rating, fill = Gender)) +
  geom_density(alpha = 0.4, position = "identity") +
  # Pin the fill colours so they match the colour key given in the title
  # (the default palette would not give Female blue and Male red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  # Title and axis labels
  labs(
    title = "Density of Rating by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Rating",
    y = "Density of Rating"
  ) +
  # Drop the fill legend; the title carries the colour key instead
  theme(legend.position = "none")

## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Positive_Proportion by gender for the combined data.
dcombine %>%
  # Refer to the column directly; `dcombine$Gender` inside aes() triggers
  # ggplot2's "Use of `dcombine$Gender` is discouraged" warning.
  ggplot(aes(x = Positive_Proportion, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  # Pin the fill colours so they match the colour key given in the title
  # (the default palette would not give Female blue and Male red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  # Title and axis labels
  labs(
    title = "Density of Positive Proportion by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Positive_Proportion",
    y = "Density of Positive Proportion"
  ) +
  # Drop the fill legend; the title carries the colour key instead
  theme(legend.position = "none")

## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Negative_Proportion by gender for the combined data.
dcombine %>%
  # Refer to the column directly; `dcombine$Gender` inside aes() triggers
  # ggplot2's "Use of `dcombine$Gender` is discouraged" warning.
  ggplot(aes(x = Negative_Proportion, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  # Pin the fill colours so they match the colour key given in the title
  # (the default palette would not give Female blue and Male red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  # Title and axis labels
  labs(
    title = "Density of Negative_Proportion by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Negative_Proportion",
    y = "Density of Negative_Proportion"
  ) +
  # Drop the fill legend; the title carries the colour key instead
  theme(legend.position = "none")

## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Average.neg_score by gender for the combined data.
dcombine %>%
  # Refer to the column directly; `dcombine$Gender` inside aes() triggers
  # ggplot2's "Use of `dcombine$Gender` is discouraged" warning.
  ggplot(aes(x = Average.neg_score, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  # Pin the fill colours so they match the colour key given in the title
  # (the default palette would not give Female blue and Male red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  # Title and axis labels
  labs(
    title = "Density of `Average neg_score` by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "`Average neg_score`",
    y = "Density of `Average neg_score`"
  ) +
  # Drop the fill legend; the title carries the colour key instead
  theme(legend.position = "none")

## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Average.pos_score by gender for the combined data.
dcombine %>%
  # Refer to the column directly; `dcombine$Gender` inside aes() triggers
  # ggplot2's "Use of `dcombine$Gender` is discouraged" warning.
  ggplot(aes(x = Average.pos_score, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  # Pin the fill colours so they match the colour key given in the title
  # (the default palette would not give Female blue and Male red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  # Title and axis labels
  labs(
    title = "Density of `Average pos_score` by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "`Average pos_score`",
    y = "Density of `Average pos_score`"
  ) +
  # Drop the fill legend; the title carries the colour key instead
  theme(legend.position = "none")

## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

# Density of Years of Experience by gender for the combined data.
dcombine %>%
  # x is coerced to numeric here: when the column is a factor (or character),
  # geom_density() treats every distinct value as its own group and drops
  # them with "Groups with fewer than two data points" warnings.
  # The column is referenced as `Gender` rather than `dcombine$Gender` to
  # avoid ggplot2's "Use of `dcombine$Gender` is discouraged" warning.
  ggplot(aes(
    x = as.numeric(as.character(Years.of.Experience)),
    fill = Gender
  )) +
  geom_density(alpha = 0.5, position = "identity") +
  # Pin the fill colours so they match the colour key given in the title
  # (the default palette would not give Female blue and Male red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  # Title and axis labels
  labs(
    title = "Density of Years of Experience by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Years of Experience - in years",
    y = "Density of Years of Experience"
  ) +
  # Drop the fill legend; the title carries the colour key instead
  theme(legend.position = "none")

## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

# Density of Overall_score by gender for the combined data.
dcombine %>%
  # Refer to the column directly; `dcombine$Gender` inside aes() triggers
  # ggplot2's "Use of `dcombine$Gender` is discouraged" warning.
  ggplot(aes(x = Overall_score, fill = Gender)) +
  geom_density(alpha = 0.5, position = "identity") +
  # Pin the fill colours so they match the colour key given in the title
  # (the default palette would not give Female blue and Male red).
  scale_fill_manual(values = c(Female = "blue", Male = "red")) +
  # Title and axis labels
  labs(
    title = "Density of Overall_score by Gender on Both - Healthgrades and RateMDs\n(blue=Female, red=Male)",
    x = "Overall_score",
    y = "Density of Overall_score"
  ) +
  # Drop the fill legend; the title carries the colour key instead
  theme(legend.position = "none")

## Warning: Use of `dcombine$Gender` is discouraged. Use `Gender` instead.

EDA on Text

Chirag Ahluwalia

4/24/2020

Descriptive Statistics

Statistics based on Gender

Healthgrades

Combined