##Assignment

Load the dataset

healthcare <- read.csv("C:/Users/murth/healthcare_dataset.csv")

#1. Print the structure

str(healthcare)
## 'data.frame':    10000 obs. of  15 variables:
##  $ Name              : chr  "Tiffany Ramirez" "Ruben Burns" "Chad Byrd" "Antonio Frederick" ...
##  $ Age               : int  81 35 61 49 51 41 82 55 33 39 ...
##  $ Gender            : chr  "Female" "Male" "Male" "Male" ...
##  $ Blood.Type        : chr  "O-" "O+" "B-" "B-" ...
##  $ Medical.Condition : chr  "Diabetes" "Asthma" "Obesity" "Asthma" ...
##  $ Date.of.Admission : chr  "2022-11-17" "2023-06-01" "2019-01-09" "2020-05-02" ...
##  $ Doctor            : chr  "Patrick Parker" "Diane Jackson" "Paul Baker" "Brian Chandler" ...
##  $ Hospital          : chr  "Wallace-Hamilton" "Burke, Griffin and Cooper" "Walton LLC" "Garcia Ltd" ...
##  $ Insurance.Provider: chr  "Medicare" "UnitedHealthcare" "Medicare" "Medicare" ...
##  $ Billing.Amount    : num  37491 47304 36875 23303 18086 ...
##  $ Room.Number       : int  146 404 292 480 477 180 161 384 215 310 ...
##  $ Admission.Type    : chr  "Elective" "Emergency" "Emergency" "Urgent" ...
##  $ Discharge.Date    : chr  "2022-12-01" "2023-06-15" "2019-02-08" "2020-05-03" ...
##  $ Medication        : chr  "Aspirin" "Lipitor" "Lipitor" "Penicillin" ...
##  $ Test.Results      : chr  "Inconclusive" "Normal" "Normal" "Abnormal" ...

#2. List the variables in your dataset

names(healthcare)
##  [1] "Name"               "Age"                "Gender"            
##  [4] "Blood.Type"         "Medical.Condition"  "Date.of.Admission" 
##  [7] "Doctor"             "Hospital"           "Insurance.Provider"
## [10] "Billing.Amount"     "Room.Number"        "Admission.Type"    
## [13] "Discharge.Date"     "Medication"         "Test.Results"

#3. Print the top 15 rows of your dataset

head(healthcare, 15)
##                     Name Age Gender Blood.Type Medical.Condition
## 1        Tiffany Ramirez  81 Female         O-          Diabetes
## 2            Ruben Burns  35   Male         O+            Asthma
## 3              Chad Byrd  61   Male         B-           Obesity
## 4      Antonio Frederick  49   Male         B-            Asthma
## 5    Mrs. Brandy Flowers  51   Male         O-         Arthritis
## 6         Patrick Parker  41   Male        AB+         Arthritis
## 7         Charles Horton  82   Male        AB+      Hypertension
## 8           Patty Norman  55 Female         O-         Arthritis
## 9             Ryan Hayes  33   Male         A+          Diabetes
## 10          Sharon Perez  39 Female         O-            Asthma
## 11           Amy Roberts  45   Male         B-            Cancer
## 12 Mrs. Caroline Farrell  23 Female         O-      Hypertension
## 13    Christina Williams  85 Female         A+          Diabetes
## 14          William Page  72 Female         A+          Diabetes
## 15      Michael Bradshaw  65 Female        AB+            Cancer
##    Date.of.Admission          Doctor                   Hospital
## 1         2022-11-17  Patrick Parker           Wallace-Hamilton
## 2         2023-06-01   Diane Jackson  Burke, Griffin and Cooper
## 3         2019-01-09      Paul Baker                 Walton LLC
## 4         2020-05-02  Brian Chandler                 Garcia Ltd
## 5         2021-07-09  Dustin Griffin    Jones, Brown and Murray
## 6         2020-08-20     Robin Green                   Boyd PLC
## 7         2021-03-22 Patricia Bishop  Wheeler, Bryant and Johns
## 8         2019-05-16   Brian Kennedy                  Brown Inc
## 9         2020-12-17    Kristin Dunn  Smith, Edwards and Obrien
## 10        2022-12-15  Jessica Bailey               Brown-Golden
## 11        2021-04-13 Anthony Roberts             Little-Spencer
## 12        2019-06-09  William Miller                   Rose Inc
## 13        2021-11-29   Laura Roberts Malone, Thompson and Mejia
## 14        2021-07-29    James Carney          Richardson-Powell
## 15        2021-06-05  Katherine Lowe            Castaneda-Hardy
##    Insurance.Provider Billing.Amount Room.Number Admission.Type Discharge.Date
## 1            Medicare      37490.983         146       Elective     2022-12-01
## 2    UnitedHealthcare      47304.065         404      Emergency     2023-06-15
## 3            Medicare      36874.897         292      Emergency     2019-02-08
## 4            Medicare      23303.322         480         Urgent     2020-05-03
## 5    UnitedHealthcare      18086.344         477         Urgent     2021-08-02
## 6               Aetna      22522.363         180         Urgent     2020-08-23
## 7               Cigna      39593.436         161         Urgent     2021-04-15
## 8          Blue Cross      13546.817         384       Elective     2019-06-02
## 9               Aetna      24903.037         215       Elective     2020-12-22
## 10         Blue Cross      22788.236         310         Urgent     2022-12-16
## 11              Aetna      40325.071         306      Emergency     2021-05-11
## 12           Medicare       6185.904         126      Emergency     2019-06-26
## 13              Aetna       4835.946         444       Elective     2021-12-14
## 14              Cigna      13669.378         492       Elective     2021-08-14
## 15              Cigna      10342.836         120      Emergency     2021-06-25
##     Medication Test.Results
## 1      Aspirin Inconclusive
## 2      Lipitor       Normal
## 3      Lipitor       Normal
## 4   Penicillin     Abnormal
## 5  Paracetamol       Normal
## 6      Aspirin     Abnormal
## 7      Lipitor     Abnormal
## 8      Aspirin       Normal
## 9      Aspirin     Abnormal
## 10     Aspirin       Normal
## 11  Penicillin     Abnormal
## 12 Paracetamol Inconclusive
## 13     Aspirin Inconclusive
## 14     Aspirin       Normal
## 15   Ibuprofen Inconclusive

#4. Write a user defined function using any of the variables from the data set.

# Created classify_age function and used case_when statement to define 
# the age values, mutate() to create new column while assigning values
classify_age <- function(age) {
  case_when(
    age <= 12 ~ "Child",
    age > 12 & age <= 19 ~ "Teenager",
    age > 19 & age <= 64 ~ "Adult",
    age > 64 ~ "Senior",
    TRUE ~ "NA"
  )
}
healthcare <- healthcare %>% 
              mutate(Age_Classification = classify_age(Age))

head(healthcare, 5)
##                  Name Age Gender Blood.Type Medical.Condition Date.of.Admission
## 1     Tiffany Ramirez  81 Female         O-          Diabetes        2022-11-17
## 2         Ruben Burns  35   Male         O+            Asthma        2023-06-01
## 3           Chad Byrd  61   Male         B-           Obesity        2019-01-09
## 4   Antonio Frederick  49   Male         B-            Asthma        2020-05-02
## 5 Mrs. Brandy Flowers  51   Male         O-         Arthritis        2021-07-09
##           Doctor                  Hospital Insurance.Provider Billing.Amount
## 1 Patrick Parker          Wallace-Hamilton           Medicare       37490.98
## 2  Diane Jackson Burke, Griffin and Cooper   UnitedHealthcare       47304.06
## 3     Paul Baker                Walton LLC           Medicare       36874.90
## 4 Brian Chandler                Garcia Ltd           Medicare       23303.32
## 5 Dustin Griffin   Jones, Brown and Murray   UnitedHealthcare       18086.34
##   Room.Number Admission.Type Discharge.Date  Medication Test.Results
## 1         146       Elective     2022-12-01     Aspirin Inconclusive
## 2         404      Emergency     2023-06-15     Lipitor       Normal
## 3         292      Emergency     2019-02-08     Lipitor       Normal
## 4         480         Urgent     2020-05-03  Penicillin     Abnormal
## 5         477         Urgent     2021-08-02 Paracetamol       Normal
##   Age_Classification
## 1             Senior
## 2              Adult
## 3              Adult
## 4              Adult
## 5              Adult

#5. Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset

#Created new dataframe and filtered Seniors and Emergency admission type from the main dataframe, and assigned it t new one

Healthcare_emergency_senior <- as.data.frame(healthcare %>%
                               filter(Admission.Type == "Emergency", Age_Classification == "Senior"
                              ))
head(Healthcare_emergency_senior, 10)
##                      Name Age Gender Blood.Type Medical.Condition
## 1        Michael Bradshaw  65 Female        AB+            Cancer
## 2              Sally Shaw  80   Male         O-         Arthritis
## 3          Francis Newman  74 Female        AB+      Hypertension
## 4          Rodney Maynard  81   Male        AB-           Obesity
## 5      Jennifer Rodriguez  83 Female        AB-            Asthma
## 6              Anna Adams  70 Female         A+           Obesity
## 7  Mr. Christopher Miller  81   Male        AB+      Hypertension
## 8     Christina Hernandez  84 Female         O+      Hypertension
## 9            John Griffin  78 Female         B-            Cancer
## 10      Gabrielle Russell  68   Male        AB+            Cancer
##    Date.of.Admission             Doctor                 Hospital
## 1         2021-06-05     Katherine Lowe          Castaneda-Hardy
## 2         2019-07-10 Zachary Horton DDS  Rush, Owens and Johnson
## 3         2021-05-25   Carolyn Baker MD                Hess-Lowe
## 4         2023-07-09   Dr. Kyle Dickson Nixon, Evans and Bradley
## 5         2019-11-12    Rachel Sullivan               Duke Group
## 6         2022-09-06      Samuel Taylor               Snow Group
## 7         2021-08-15     Amber Gonzalez          Ortega and Sons
## 8         2019-10-13        Cody Wright          Harris and Sons
## 9         2020-04-03       Kelsey Clark              Aguirre LLC
## 10        2020-02-29        Tina Rogers          Berry-Gallagher
##    Insurance.Provider Billing.Amount Room.Number Admission.Type Discharge.Date
## 1               Cigna      10342.836         120      Emergency     2021-06-25
## 2          Blue Cross      16609.312         366      Emergency     2019-08-07
## 3               Cigna      12680.731         298      Emergency     2021-06-08
## 4          Blue Cross      21001.275         197      Emergency     2023-07-17
## 5               Cigna       5098.663         192      Emergency     2019-11-24
## 6            Medicare       1000.181         258      Emergency     2022-09-18
## 7               Cigna      22519.336         465      Emergency     2021-09-06
## 8               Aetna      10803.730         388      Emergency     2019-10-29
## 9               Aetna      25948.507         359      Emergency     2020-04-06
## 10           Medicare       2628.948         380      Emergency     2020-03-12
##     Medication Test.Results Age_Classification
## 1    Ibuprofen Inconclusive             Senior
## 2    Ibuprofen Inconclusive             Senior
## 3      Lipitor     Abnormal             Senior
## 4      Lipitor       Normal             Senior
## 5      Lipitor     Abnormal             Senior
## 6  Paracetamol     Abnormal             Senior
## 7      Lipitor Inconclusive             Senior
## 8      Aspirin Inconclusive             Senior
## 9      Lipitor       Normal             Senior
## 10 Paracetamol       Normal             Senior

#6. Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

Billing_details <- cbind(healthcare$Age , healthcare$Medical.Condition, healthcare$Admission.Type,            healthcare$Billing.Amount)

Billing_details <- as.data.frame(Billing_details)
Billing_details <- rename (Billing_details, Age = V1 ,Medical.Condition = V2, Admission.Type = V3, Billing.Amount = V4 )
head(Billing_details, 5)
##   Age Medical.Condition Admission.Type   Billing.Amount
## 1  81          Diabetes       Elective 37490.9833635282
## 2  35            Asthma      Emergency 47304.0648454751
## 3  61           Obesity      Emergency 36874.8969966128
## 4  49            Asthma         Urgent 23303.3220921969
## 5  51         Arthritis         Urgent 18086.3441835639

#7. Remove missing values in your dataset.

#import dataset
Medical_data <- read.csv("C:/Users/murth/Medical_data.csv")
#head(Medical_data,10)
#Filter missing values
Medical_data %>% filter(is.na(Age))
##    Patient_ID Age Gender Blood_Pressure Cholesterol  BMI Smoker     Diagnosis
## 1       P1012  NA   MALE            160         180 27.8    Yes      Diabetes
## 2       P1016  NA   Male            120         180 22.5     No           nan
## 3       P1017  NA Female            130          NA 27.8    Yes      Diabetes
## 4       P1024  NA   MALE            140         220 22.5      N Heart Disease
## 5       P1034  NA   Male            120          NA 35.4    Yes          None
## 6       P1035  NA FEMALE             NA         220 35.4     No          None
## 7       P1036  NA   male            160         220 35.4      Y Heart Disease
## 8       P1050  NA FEMALE            150         180 30.0     No Heart Disease
## 9       P1063  NA    nan            140         180 22.5    Yes      Diabetes
## 10      P1069  NA Female            130         250   NA    nan          None
## 11      P1070  NA Female            140         300 22.5    Yes      Diabetes
## 12      P1071  NA Female            150         300 40.1      Y      Diabetes
## 13      P1074  NA   MALE            150         200 40.1      N Heart Disease
## 14      P1081  NA   Male            160         200 30.0    Yes          None
## 15      P1083  NA    nan            140         200   NA    Yes Heart Disease
## 16      P1084  NA   Male            120         220 22.5    Yes Heart Disease
## 17      P1087  NA   Male            140         200 35.4    Yes Heart Disease
##    Admission_Date              Notes
## 1      2023-06-22                   
## 2      2023-03-21        Checkup due
## 3      2023-08-01 Follow-up required
## 4      2023-01-20 Follow-up required
## 5      2023-01-12 Follow-up required
## 6      2023-11-26                N/A
## 7      2023-08-12                N/A
## 8      2023-08-08                N/A
## 9      2023-03-23                   
## 10     2023-10-13                N/A
## 11     2023-08-30        Checkup due
## 12     2023-09-08                N/A
## 13     2023-04-29        Checkup due
## 14     2023-09-24 Follow-up required
## 15     2023-10-09                N/A
## 16     2023-03-04                   
## 17     2023-11-11            Pending
#Remove missing values in Age 
is.na(Medical_data$Age)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13]  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
##  [37]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
##  [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE
##  [85]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE
na.omit(Medical_data)
##     Patient_ID Age Gender Blood_Pressure Cholesterol  BMI Smoker     Diagnosis
## 2        P1001  65   Male            150         180 27.8    nan          None
## 3        P1002  45   Male            120         220 22.5     No           nan
## 5        P1004  65   Male            120         300 40.1    nan          None
## 6        P1005  35 Female            130         200 35.4      N Heart Disease
## 11       P1010  55   MALE            140         220 30.0      Y          None
## 12       P1011  45   Male            120         300 35.4     No           nan
## 14       P1013  65 FEMALE            120         250 27.8    nan           nan
## 16       P1015  55   male            140         250 30.0      N      Diabetes
## 20       P1019  55 Female            150         250 40.1      Y           nan
## 22       P1021  25   Male            160         180 27.8      N          None
## 23       P1022  55    nan            120         180 27.8     No      Diabetes
## 33       P1032  55    nan            140         300 22.5      Y          None
## 34       P1033  55   MALE            140         220 40.1    Yes      Diabetes
## 38       P1037  45    nan            150         200 40.1      N          None
## 39       P1038  55   MALE            130         300 35.4    nan           nan
## 42       P1041  45   male            140         250 35.4     No Heart Disease
## 44       P1043  45 FEMALE            120         300 30.0      Y      Diabetes
## 45       P1044  65   MALE            150         250 35.4     No      Diabetes
## 46       P1045  25   Male            140         300 40.1      Y Heart Disease
## 48       P1047  55 FEMALE            160         250 35.4     No Heart Disease
## 49       P1048  25 Female            140         220 27.8    nan      Diabetes
## 52       P1051  35 FEMALE            140         250 22.5     No      Diabetes
## 53       P1052  35   MALE            150         250 30.0     No Heart Disease
## 55       P1054  35 Female            130         300 27.8    Yes Heart Disease
## 57       P1056  35   MALE            140         300 40.1     No      Diabetes
## 59       P1058  55    nan            150         220 40.1     No      Diabetes
## 61       P1060  55   male            120         180 22.5      Y          None
## 62       P1061  65   male            130         300 35.4    nan           nan
## 63       P1062  45 FEMALE            120         300 40.1      N           nan
## 66       P1065  55 FEMALE            120         300 27.8    nan Heart Disease
## 67       P1066  35    nan            120         220 22.5    Yes           nan
## 68       P1067  55 Female            130         250 27.8      N      Diabetes
## 73       P1072  35    nan            130         180 40.1    Yes           nan
## 74       P1073  55    nan            120         220 22.5      Y           nan
## 76       P1075  65   male            150         180 40.1     No           nan
## 79       P1078  55    nan            120         200 30.0      N      Diabetes
## 86       P1085  55 Female            120         200 40.1    Yes          None
## 89       P1088  65   MALE            140         180 40.1    nan          None
## 90       P1089  65    nan            150         180 22.5      Y      Diabetes
## 94       P1093  25 FEMALE            140         300 35.4      N Heart Disease
## 95       P1094  55   Male            120         200 40.1    nan Heart Disease
## 96       P1095  55   MALE            150         200 35.4    Yes           nan
## 98       P1097  65 Female            150         200 27.8      N           nan
## 100      P1099  65    nan            140         300 35.4      N          None
##     Admission_Date              Notes
## 2       2023-01-01                   
## 3       2023-12-14                   
## 5       2023-07-10 Follow-up required
## 6       2023-11-08 Follow-up required
## 11      2023-06-22        Checkup due
## 12      2023-06-09                N/A
## 14      2023-11-13                   
## 16      2023-05-29        Checkup due
## 20      2023-09-09 Follow-up required
## 22      2023-06-13 Follow-up required
## 23      2023-08-15                N/A
## 33      2023-09-16                   
## 34      2023-05-24                   
## 38      2023-09-29            Pending
## 39      2023-12-24 Follow-up required
## 42      2023-05-02 Follow-up required
## 44      2023-06-13                N/A
## 45      2023-03-31 Follow-up required
## 46      2023-05-16            Pending
## 48      2023-11-12        Checkup due
## 49      2023-06-27            Pending
## 52      2023-12-23        Checkup due
## 53      2023-02-10                   
## 55      2023-07-11            Pending
## 57      2023-07-20        Checkup due
## 59      2023-08-08        Checkup due
## 61      2023-02-15            Pending
## 62      2023-02-04                   
## 63      2023-09-10 Follow-up required
## 66      2023-07-16                   
## 67      2023-10-30                N/A
## 68      2023-01-10                N/A
## 73      2023-12-31            Pending
## 74      2023-01-05            Pending
## 76      2023-10-16                N/A
## 79      2023-08-12                   
## 86      2023-08-05                   
## 89      2023-10-10 Follow-up required
## 90      2023-06-26                   
## 94      2023-11-07        Checkup due
## 95      2023-01-06            Pending
## 96      2023-07-16 Follow-up required
## 98      2023-05-13        Checkup due
## 100     2023-09-16
Medical_data_missing_removed = na.omit(Medical_data)
print(Medical_data_missing_removed)
##     Patient_ID Age Gender Blood_Pressure Cholesterol  BMI Smoker     Diagnosis
## 2        P1001  65   Male            150         180 27.8    nan          None
## 3        P1002  45   Male            120         220 22.5     No           nan
## 5        P1004  65   Male            120         300 40.1    nan          None
## 6        P1005  35 Female            130         200 35.4      N Heart Disease
## 11       P1010  55   MALE            140         220 30.0      Y          None
## 12       P1011  45   Male            120         300 35.4     No           nan
## 14       P1013  65 FEMALE            120         250 27.8    nan           nan
## 16       P1015  55   male            140         250 30.0      N      Diabetes
## 20       P1019  55 Female            150         250 40.1      Y           nan
## 22       P1021  25   Male            160         180 27.8      N          None
## 23       P1022  55    nan            120         180 27.8     No      Diabetes
## 33       P1032  55    nan            140         300 22.5      Y          None
## 34       P1033  55   MALE            140         220 40.1    Yes      Diabetes
## 38       P1037  45    nan            150         200 40.1      N          None
## 39       P1038  55   MALE            130         300 35.4    nan           nan
## 42       P1041  45   male            140         250 35.4     No Heart Disease
## 44       P1043  45 FEMALE            120         300 30.0      Y      Diabetes
## 45       P1044  65   MALE            150         250 35.4     No      Diabetes
## 46       P1045  25   Male            140         300 40.1      Y Heart Disease
## 48       P1047  55 FEMALE            160         250 35.4     No Heart Disease
## 49       P1048  25 Female            140         220 27.8    nan      Diabetes
## 52       P1051  35 FEMALE            140         250 22.5     No      Diabetes
## 53       P1052  35   MALE            150         250 30.0     No Heart Disease
## 55       P1054  35 Female            130         300 27.8    Yes Heart Disease
## 57       P1056  35   MALE            140         300 40.1     No      Diabetes
## 59       P1058  55    nan            150         220 40.1     No      Diabetes
## 61       P1060  55   male            120         180 22.5      Y          None
## 62       P1061  65   male            130         300 35.4    nan           nan
## 63       P1062  45 FEMALE            120         300 40.1      N           nan
## 66       P1065  55 FEMALE            120         300 27.8    nan Heart Disease
## 67       P1066  35    nan            120         220 22.5    Yes           nan
## 68       P1067  55 Female            130         250 27.8      N      Diabetes
## 73       P1072  35    nan            130         180 40.1    Yes           nan
## 74       P1073  55    nan            120         220 22.5      Y           nan
## 76       P1075  65   male            150         180 40.1     No           nan
## 79       P1078  55    nan            120         200 30.0      N      Diabetes
## 86       P1085  55 Female            120         200 40.1    Yes          None
## 89       P1088  65   MALE            140         180 40.1    nan          None
## 90       P1089  65    nan            150         180 22.5      Y      Diabetes
## 94       P1093  25 FEMALE            140         300 35.4      N Heart Disease
## 95       P1094  55   Male            120         200 40.1    nan Heart Disease
## 96       P1095  55   MALE            150         200 35.4    Yes           nan
## 98       P1097  65 Female            150         200 27.8      N           nan
## 100      P1099  65    nan            140         300 35.4      N          None
##     Admission_Date              Notes
## 2       2023-01-01                   
## 3       2023-12-14                   
## 5       2023-07-10 Follow-up required
## 6       2023-11-08 Follow-up required
## 11      2023-06-22        Checkup due
## 12      2023-06-09                N/A
## 14      2023-11-13                   
## 16      2023-05-29        Checkup due
## 20      2023-09-09 Follow-up required
## 22      2023-06-13 Follow-up required
## 23      2023-08-15                N/A
## 33      2023-09-16                   
## 34      2023-05-24                   
## 38      2023-09-29            Pending
## 39      2023-12-24 Follow-up required
## 42      2023-05-02 Follow-up required
## 44      2023-06-13                N/A
## 45      2023-03-31 Follow-up required
## 46      2023-05-16            Pending
## 48      2023-11-12        Checkup due
## 49      2023-06-27            Pending
## 52      2023-12-23        Checkup due
## 53      2023-02-10                   
## 55      2023-07-11            Pending
## 57      2023-07-20        Checkup due
## 59      2023-08-08        Checkup due
## 61      2023-02-15            Pending
## 62      2023-02-04                   
## 63      2023-09-10 Follow-up required
## 66      2023-07-16                   
## 67      2023-10-30                N/A
## 68      2023-01-10                N/A
## 73      2023-12-31            Pending
## 74      2023-01-05            Pending
## 76      2023-10-16                N/A
## 79      2023-08-12                   
## 86      2023-08-05                   
## 89      2023-10-10 Follow-up required
## 90      2023-06-26                   
## 94      2023-11-07        Checkup due
## 95      2023-01-06            Pending
## 96      2023-07-16 Follow-up required
## 98      2023-05-13        Checkup due
## 100     2023-09-16

#8. Identify and remove duplicated data in your dataset

#Remove duplicates based on healthcare - Name variable 
Healthcare_unique = healthcare %>% distinct(healthcare$Name)
Healthcare_unique = healthcare %>% distinct(healthcare$Name, .keep_all = TRUE)
head(Healthcare_unique, 10)
##                   Name Age Gender Blood.Type Medical.Condition
## 1      Tiffany Ramirez  81 Female         O-          Diabetes
## 2          Ruben Burns  35   Male         O+            Asthma
## 3            Chad Byrd  61   Male         B-           Obesity
## 4    Antonio Frederick  49   Male         B-            Asthma
## 5  Mrs. Brandy Flowers  51   Male         O-         Arthritis
## 6       Patrick Parker  41   Male        AB+         Arthritis
## 7       Charles Horton  82   Male        AB+      Hypertension
## 8         Patty Norman  55 Female         O-         Arthritis
## 9           Ryan Hayes  33   Male         A+          Diabetes
## 10        Sharon Perez  39 Female         O-            Asthma
##    Date.of.Admission          Doctor                  Hospital
## 1         2022-11-17  Patrick Parker          Wallace-Hamilton
## 2         2023-06-01   Diane Jackson Burke, Griffin and Cooper
## 3         2019-01-09      Paul Baker                Walton LLC
## 4         2020-05-02  Brian Chandler                Garcia Ltd
## 5         2021-07-09  Dustin Griffin   Jones, Brown and Murray
## 6         2020-08-20     Robin Green                  Boyd PLC
## 7         2021-03-22 Patricia Bishop Wheeler, Bryant and Johns
## 8         2019-05-16   Brian Kennedy                 Brown Inc
## 9         2020-12-17    Kristin Dunn Smith, Edwards and Obrien
## 10        2022-12-15  Jessica Bailey              Brown-Golden
##    Insurance.Provider Billing.Amount Room.Number Admission.Type Discharge.Date
## 1            Medicare       37490.98         146       Elective     2022-12-01
## 2    UnitedHealthcare       47304.06         404      Emergency     2023-06-15
## 3            Medicare       36874.90         292      Emergency     2019-02-08
## 4            Medicare       23303.32         480         Urgent     2020-05-03
## 5    UnitedHealthcare       18086.34         477         Urgent     2021-08-02
## 6               Aetna       22522.36         180         Urgent     2020-08-23
## 7               Cigna       39593.44         161         Urgent     2021-04-15
## 8          Blue Cross       13546.82         384       Elective     2019-06-02
## 9               Aetna       24903.04         215       Elective     2020-12-22
## 10         Blue Cross       22788.24         310         Urgent     2022-12-16
##     Medication Test.Results Age_Classification     healthcare$Name
## 1      Aspirin Inconclusive             Senior     Tiffany Ramirez
## 2      Lipitor       Normal              Adult         Ruben Burns
## 3      Lipitor       Normal              Adult           Chad Byrd
## 4   Penicillin     Abnormal              Adult   Antonio Frederick
## 5  Paracetamol       Normal              Adult Mrs. Brandy Flowers
## 6      Aspirin     Abnormal              Adult      Patrick Parker
## 7      Lipitor     Abnormal             Senior      Charles Horton
## 8      Aspirin       Normal              Adult        Patty Norman
## 9      Aspirin     Abnormal              Adult          Ryan Hayes
## 10     Aspirin       Normal              Adult        Sharon Perez
# Remove duplicates based on medical condition 
Healthcare_unique1 = healthcare %>% distinct(healthcare$Medical.Condition, .keep_all = TRUE)
head(Healthcare_unique1, 5)
##                  Name Age Gender Blood.Type Medical.Condition Date.of.Admission
## 1     Tiffany Ramirez  81 Female         O-          Diabetes        2022-11-17
## 2         Ruben Burns  35   Male         O+            Asthma        2023-06-01
## 3           Chad Byrd  61   Male         B-           Obesity        2019-01-09
## 4 Mrs. Brandy Flowers  51   Male         O-         Arthritis        2021-07-09
## 5      Charles Horton  82   Male        AB+      Hypertension        2021-03-22
##            Doctor                  Hospital Insurance.Provider Billing.Amount
## 1  Patrick Parker          Wallace-Hamilton           Medicare       37490.98
## 2   Diane Jackson Burke, Griffin and Cooper   UnitedHealthcare       47304.06
## 3      Paul Baker                Walton LLC           Medicare       36874.90
## 4  Dustin Griffin   Jones, Brown and Murray   UnitedHealthcare       18086.34
## 5 Patricia Bishop Wheeler, Bryant and Johns              Cigna       39593.44
##   Room.Number Admission.Type Discharge.Date  Medication Test.Results
## 1         146       Elective     2022-12-01     Aspirin Inconclusive
## 2         404      Emergency     2023-06-15     Lipitor       Normal
## 3         292      Emergency     2019-02-08     Lipitor       Normal
## 4         477         Urgent     2021-08-02 Paracetamol       Normal
## 5         161         Urgent     2021-04-15     Lipitor     Abnormal
##   Age_Classification healthcare$Medical.Condition
## 1             Senior                     Diabetes
## 2              Adult                       Asthma
## 3              Adult                      Obesity
## 4              Adult                    Arthritis
## 5             Senior                 Hypertension

#9. Reorder multiple rows in descending order

# Reorder rows in descending order by Billing Amount
Healthcare_sorted = as.data.frame(healthcare %>% arrange(desc(Billing.Amount)))
head(Healthcare_sorted, 10)
##                 Name Age Gender Blood.Type Medical.Condition Date.of.Admission
## 1        Daniel Hall  77   Male         A+      Hypertension        2018-12-04
## 2    Teresa Buchanan  75   Male         B+            Cancer        2020-11-22
## 3           Roy Beck  79 Female         A-         Arthritis        2020-11-05
## 4         Mary Stein  35   Male         A+      Hypertension        2020-12-07
## 5      Richard Jones  59 Female         A+            Asthma        2020-05-16
## 6      Holly Clayton  43   Male         O-           Obesity        2021-11-30
## 7       Jason Miller  74   Male         B-            Cancer        2019-01-01
## 8        John Oneill  31   Male        AB-          Diabetes        2019-08-16
## 9  Elizabeth Johnson  50 Female         A-      Hypertension        2022-07-10
## 10      Robert Potts  67 Female         A-           Obesity        2019-01-17
##               Doctor                        Hospital Insurance.Provider
## 1    Timothy Serrano                Arellano-Mahoney              Aetna
## 2        Joseph Rice                 Ellison-Johnson   UnitedHealthcare
## 3        Aaron Mills       Thompson, Carlson and Kim           Medicare
## 4    Alice Gross DVM      Morales, Ferrell and Clark           Medicare
## 5      Rebecca Parks        Smith, Cooper and Chavez              Aetna
## 6  Zachary Castaneda    Webster, Oconnell and Norton           Medicare
## 7      Matthew Lewis         Ford, Gibson and Parker         Blue Cross
## 8       Travis Gibbs                        Dunn Ltd              Aetna
## 9         Dawn Haley Sanders, Robertson and Williams           Medicare
## 10    William Wilson                 Harmon-Anderson         Blue Cross
##    Billing.Amount Room.Number Admission.Type Discharge.Date  Medication
## 1        49995.90         196      Emergency     2018-12-11 Paracetamol
## 2        49994.98         296         Urgent     2020-11-28     Aspirin
## 3        49985.97         483       Elective     2020-11-12 Paracetamol
## 4        49974.81         390      Emergency     2020-12-28     Lipitor
## 5        49974.30         242      Emergency     2020-05-31     Aspirin
## 6        49974.16         441      Emergency     2021-12-16  Penicillin
## 7        49958.00         292      Emergency     2019-01-29   Ibuprofen
## 8        49954.97         455      Emergency     2019-09-03 Paracetamol
## 9        49951.26         476      Emergency     2022-08-06     Aspirin
## 10       49947.56         154      Emergency     2019-01-19     Aspirin
##    Test.Results Age_Classification
## 1      Abnormal             Senior
## 2  Inconclusive             Senior
## 3  Inconclusive             Senior
## 4        Normal              Adult
## 5      Abnormal              Adult
## 6      Abnormal              Adult
## 7  Inconclusive             Senior
## 8      Abnormal              Adult
## 9  Inconclusive              Adult
## 10 Inconclusive             Senior

#10. Rename some of the column names in your dataset

# Create a new dataframe using cbind() with selected columns
Healthcare_New = cbind(healthcare$Name, healthcare$Age, healthcare$Billing.Amount, healthcare$Medical.Condition)
Healthcare_New = as.data.frame(Healthcare_New)
names(Healthcare_New)
## [1] "V1" "V2" "V3" "V4"
Healthcare_New <- rename (Healthcare_New, Patient_Name = V1 ,Patient_Age = V2, Billing_Amount = V3, Medical_Condition = V4 )
names(Healthcare_New)
## [1] "Patient_Name"      "Patient_Age"       "Billing_Amount"   
## [4] "Medical_Condition"

#11. Add new variables in your data frame by using a mathematical function (for e.g. – multiply an existing column by 2 and add it as a new variable to your data frame)

#20% discount on billing amount for senior
Healthcare_discount = healthcare %>%
                      mutate(Discounted_Billing_Amount = 
                      case_when( Age_Classification == "Senior" ~ Billing.Amount * 0.80,
                                  TRUE ~ Billing.Amount
                                ))
head(Healthcare_discount, 5)
##                  Name Age Gender Blood.Type Medical.Condition Date.of.Admission
## 1     Tiffany Ramirez  81 Female         O-          Diabetes        2022-11-17
## 2         Ruben Burns  35   Male         O+            Asthma        2023-06-01
## 3           Chad Byrd  61   Male         B-           Obesity        2019-01-09
## 4   Antonio Frederick  49   Male         B-            Asthma        2020-05-02
## 5 Mrs. Brandy Flowers  51   Male         O-         Arthritis        2021-07-09
##           Doctor                  Hospital Insurance.Provider Billing.Amount
## 1 Patrick Parker          Wallace-Hamilton           Medicare       37490.98
## 2  Diane Jackson Burke, Griffin and Cooper   UnitedHealthcare       47304.06
## 3     Paul Baker                Walton LLC           Medicare       36874.90
## 4 Brian Chandler                Garcia Ltd           Medicare       23303.32
## 5 Dustin Griffin   Jones, Brown and Murray   UnitedHealthcare       18086.34
##   Room.Number Admission.Type Discharge.Date  Medication Test.Results
## 1         146       Elective     2022-12-01     Aspirin Inconclusive
## 2         404      Emergency     2023-06-15     Lipitor       Normal
## 3         292      Emergency     2019-02-08     Lipitor       Normal
## 4         480         Urgent     2020-05-03  Penicillin     Abnormal
## 5         477         Urgent     2021-08-02 Paracetamol       Normal
##   Age_Classification Discounted_Billing_Amount
## 1             Senior                  29992.79
## 2              Adult                  47304.06
## 3              Adult                  36874.90
## 4              Adult                  23303.32
## 5              Adult                  18086.34

#12. Create a training set using random number generator engine.

# set.seed() must always be run before any random sampling
set.seed(1234)
#75 % training set 
Healthcare_training75 = healthcare %>% sample_frac(.75, replace = FALSE)
str(Healthcare_training75)
## 'data.frame':    7500 obs. of  16 variables:
##  $ Name              : chr  "Joseph Smith" "Timothy Ford" "Christopher Oconnell" "Gilbert Gordon" ...
##  $ Age               : int  30 26 77 77 74 82 21 63 76 70 ...
##  $ Gender            : chr  "Male" "Male" "Male" "Female" ...
##  $ Blood.Type        : chr  "O+" "B-" "AB+" "A+" ...
##  $ Medical.Condition : chr  "Diabetes" "Asthma" "Asthma" "Obesity" ...
##  $ Date.of.Admission : chr  "2019-12-27" "2023-10-16" "2019-08-30" "2023-04-06" ...
##  $ Doctor            : chr  "Rachel Baker" "Richard Ross" "Tyler Torres" "Sandra Johnson" ...
##  $ Hospital          : chr  "Archer Ltd" "Lynch-Martin" "Davis Ltd" "Ayala, Hernandez and Collins" ...
##  $ Insurance.Provider: chr  "Blue Cross" "Aetna" "Blue Cross" "UnitedHealthcare" ...
##  $ Billing.Amount    : num  7843 7005 15067 20940 43028 ...
##  $ Room.Number       : int  494 179 324 222 396 104 245 345 289 263 ...
##  $ Admission.Type    : chr  "Emergency" "Emergency" "Emergency" "Emergency" ...
##  $ Discharge.Date    : chr  "2020-01-01" "2023-11-05" "2019-09-17" "2023-04-27" ...
##  $ Medication        : chr  "Lipitor" "Ibuprofen" "Paracetamol" "Penicillin" ...
##  $ Test.Results      : chr  "Abnormal" "Normal" "Inconclusive" "Abnormal" ...
##  $ Age_Classification: chr  "Adult" "Adult" "Senior" "Senior" ...

set.seed(1234) #75 % training set Healthcare_training75 = healthcare %>% sample_frac(.75, replace = FALSE) str(Healthcare_training75)

#13. Print the summary statistics of your dataset

summary(healthcare)
##         Name            Age              Gender          Blood.Type   
##  Length   :10000   Min.   :18.00   Length   :10000   Length   :10000  
##  N.unique : 9378   1st Qu.:35.00   N.unique :    2   N.unique :    8  
##  N.blank  :    0   Median :52.00   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    7   Mean   :51.45   Min.nchar:    4   Min.nchar:    2  
##  Max.nchar:   25   3rd Qu.:68.00   Max.nchar:    6   Max.nchar:    3  
##                    Max.   :85.00                                      
##  Medical.Condition Date.of.Admission       Doctor           Hospital    
##  Length   :10000   Length   :10000   Length   :10000   Length   :10000  
##  N.unique :    6   N.unique : 1815   N.unique : 9416   N.unique : 8639  
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0  
##  Min.nchar:    6   Min.nchar:   10   Min.nchar:    6   Min.nchar:    6  
##  Max.nchar:   12   Max.nchar:   10   Max.nchar:   26   Max.nchar:   34  
##                                                                         
##  Insurance.Provider Billing.Amount   Room.Number      Admission.Type 
##  Length   :10000    Min.   : 1000   Min.   :101.0   Length   :10000  
##  N.unique :    5    1st Qu.:13507   1st Qu.:199.0   N.unique :    3  
##  N.blank  :    0    Median :25258   Median :299.0   N.blank  :    0  
##  Min.nchar:    5    Mean   :25517   Mean   :300.1   Min.nchar:    6  
##  Max.nchar:   16    3rd Qu.:37734   3rd Qu.:400.0   Max.nchar:    9  
##                     Max.   :49996   Max.   :500.0                    
##    Discharge.Date      Medication       Test.Results   Age_Classification
##  Length   :10000   Length   :10000   Length   :10000   Length   :10000   
##  N.unique : 1834   N.unique :    5   N.unique :    3   N.unique :    3   
##  N.blank  :    0   N.blank  :    0   N.blank  :    0   N.blank  :    0   
##  Min.nchar:   10   Min.nchar:    7   Min.nchar:    6   Min.nchar:    5   
##  Max.nchar:   10   Max.nchar:   11   Max.nchar:   12   Max.nchar:    8   
## 

#14. Use any of the numerical variables from the dataset and perform the following statistical functions • Mean , • Median , • Mode , • Range

#Using Billing.Amount as the numerical variable
mean(healthcare$Billing.Amount)
## [1] 25516.81
# Median
median(healthcare$Billing.Amount)
## [1] 25258.11
#Mode

# Mode - no built-in function in R, took reference from https://www.geeksforgeeks.org/r-language/mean-median-and-mode-in-r-programming/ 
get_mode <- function(x){
  unique_x <- unique(x)
  unique_x[which.max(tabulate(match(x, unique_x)))]
}
get_mode(healthcare$Billing.Amount)
## [1] 37490.98
# Range
range(healthcare$Billing.Amount)
## [1]  1000.181 49995.902

#15. Plot a scatter plot for any 2 variables in your dataset

ggplot(data = healthcare, aes(x = healthcare$Age, 
                              y=healthcare$Billing.Amount, 
                              col= healthcare$Gender  )) + geom_point()
## Warning: Use of `healthcare$Age` is discouraged.
## ℹ Use `Age` instead.
## Warning: Use of `healthcare$Billing.Amount` is discouraged.
## ℹ Use `Billing.Amount` instead.
## Warning: Use of `healthcare$Gender` is discouraged.
## ℹ Use `Gender` instead.

#16. Plot a bar plot for any 2 variables in your dataset

ggplot(data = healthcare, aes(x = healthcare$Medical.Condition, 
                          fill = healthcare$Age_Classification)) + geom_bar()
## Warning: Use of `healthcare$Medical.Condition` is discouraged.
## ℹ Use `Medical.Condition` instead.
## Warning: Use of `healthcare$Age_Classification` is discouraged.
## ℹ Use `Age_Classification` instead.

#17. Find the correlation between any 2 variables by applying Pearson correlation

cor(healthcare$Age, healthcare$Billing.Amount, method = "pearson")
## [1] -0.009483329