PROJECT CODE

Load necessary libraries

library(dplyr)#for data manipulation(filtering,summarizing etc)
library(tidyverse)#this package is contains other packages like ggplot2,dplr,tidyr,etc
library(ggplot2)#for creating complex data visualization
library(gridExtra)#for arranging multiple ggplot objects in a grid layout.

Importing data set

The data set is imported with the read.csv() function (CSV = comma-separated values). Its quality is then assessed by checking the structure, the missing values, the duplicate rows, and the dimensions of the data set.

# Read the Central Hospital line list from CSV, then print a per-column
# summary (type, quartiles, NA counts) as a first data-quality check.
centhos <- read.csv(file = "C:/Users/User/Documents/Central Hospital.csv")
summary(object = centhos)

##    case_id            generation    date_infection      date_onset       
##  Length:454         Min.   : 2.00   Length:454         Length:454        
##  Class :character   1st Qu.:13.00   Class :character   Class :character  
##  Mode  :character   Median :16.00   Mode  :character   Mode  :character  
##                     Mean   :16.88                                        
##                     3rd Qu.:20.00                                        
##                     Max.   :36.00                                        
##                                                                          
##  date_hospitalisation date_outcome         outcome             gender         
##  Length:454           Length:454         Length:454         Length:454        
##  Class :character     Class :character   Class :character   Class :character  
##  Mode  :character     Mode  :character   Mode  :character   Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##       age         age_unit           age_years       age_cat         
##  Min.   : 0.0   Length:454         Min.   : 0.00   Length:454        
##  1st Qu.: 7.0   Class :character   1st Qu.: 7.00   Class :character  
##  Median :15.0   Mode  :character   Median :15.00   Mode  :character  
##  Mean   :17.4                      Mean   :17.38                     
##  3rd Qu.:24.0                      3rd Qu.:24.00                     
##  Max.   :87.0                      Max.   :87.00                     
##  NA's   :9                         NA's   :9                         
##    age_cat5           hospital              lon              lat       
##  Length:454         Length:454         Min.   :-13.27   Min.   :8.448  
##  Class :character   Class :character   1st Qu.:-13.25   1st Qu.:8.460  
##  Mode  :character   Mode  :character   Median :-13.23   Median :8.468  
##                                        Mean   :-13.23   Mean   :8.469  
##                                        3rd Qu.:-13.22   3rd Qu.:8.479  
##                                        Max.   :-13.21   Max.   :8.490  
##                                                                        
##    infector            source              wt_kg            ht_cm      
##  Length:454         Length:454         Min.   : -2.00   Min.   : 15.0  
##  Class :character   Class :character   1st Qu.: 43.00   1st Qu.: 97.0  
##  Mode  :character   Mode  :character   Median : 57.50   Median :135.0  
##                                        Mean   : 55.13   Mean   :129.7  
##                                        3rd Qu.: 67.00   3rd Qu.:161.0  
##                                        Max.   :103.00   Max.   :335.0  
##                                                                        
##     ct_blood        fever              chills             cough          
##  Min.   :17.00   Length:454         Length:454         Length:454        
##  1st Qu.:20.00   Class :character   Class :character   Class :character  
##  Median :22.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :21.19                                                           
##  3rd Qu.:22.00                                                           
##  Max.   :25.00                                                           
##                                                                          
##     aches              vomit                temp      time_admission    
##  Length:454         Length:454         Min.   :35.7   Length:454        
##  Class :character   Class :character   1st Qu.:37.8   Class :character  
##  Mode  :character   Mode  :character   Median :38.8   Mode  :character  
##                                        Mean   :38.5                     
##                                        3rd Qu.:39.2                     
##                                        Max.   :40.4                     
##                                        NA's   :3                        
##       bmi         days_onset_hosp 
##  Min.   :-41.32   Min.   : 0.000  
##  1st Qu.: 24.10   1st Qu.: 1.000  
##  Median : 32.23   Median : 1.000  
##  Mean   : 44.57   Mean   : 1.852  
##  3rd Qu.: 47.71   3rd Qu.: 2.000  
##  Max.   :370.37   Max.   :12.000  
##

# Open the spreadsheet viewer only in an interactive session: View() is a GUI
# call that produces no report output and can fail when the script is knitted
# or run non-interactively.
if (interactive()) {
  View(centhos)
}
# Print the first 10 rows for a quick look at the raw values.
head(centhos, 10)

##    case_id generation date_infection date_onset date_hospitalisation
## 1   275cc7          5     2014-05-24 2014-05-27           2014-05-28
## 2   64c8ef          6                2014-07-14           2014-07-15
## 3   e56412          9     2014-07-12 2014-07-15           2014-07-17
## 4   62a2ef          9     2014-07-02 2014-07-17           2014-07-19
## 5   92e129         10     2014-08-01 2014-08-10           2014-08-12
## 6   240da4         10                2014-08-20           2014-08-20
## 7   31e797         13     2014-08-21 2014-08-26           2014-08-28
## 8   9b4647          9     2014-08-03 2014-08-29           2014-08-29
## 9   fd4dd4         13     2014-09-04 2014-09-05           2014-09-05
## 10  24ef7d         11     2014-09-06 2014-09-12           2014-09-14
##    date_outcome outcome gender age age_unit age_years age_cat age_cat5
## 1    2014-06-07   Death      f   4    years         4     0-4      0-4
## 2    2014-07-25 Recover      m  30    years        30   30-49    30-34
## 3    2014-07-19   Death      f  23    years        23   20-29    20-24
## 4    2014-07-23   Death      f  14    years        14   10-14    10-14
## 5    2014-08-22 Recover      f  42    years        42   30-49    40-44
## 6                 Death      m   1    years         1     0-4      0-4
## 7                 Death      m  49    years        49   30-49    45-49
## 8    2014-08-27   Death      m  44    years        44   30-49    40-44
## 9    2014-09-07   Death      f   2    years         2     0-4      0-4
## 10   2014-10-09 Recover      m  52    years        52   50-69    50-54
##            hospital       lon      lat infector  source wt_kg ht_cm ct_blood
## 1  Central Hospital -13.23694 8.469161   e02f66   other    41    75       23
## 2  Central Hospital -13.26147 8.457506                     66   177       23
## 3  Central Hospital -13.23433 8.478321   894024 funeral    55   150       23
## 4  Central Hospital -13.23794 8.469774   b0c500   other    59   136       22
## 5  Central Hospital -13.23573 8.474329   4b501e   other    85   196       23
## 6  Central Hospital -13.26799 8.462502                     17    47       23
## 7  Central Hospital -13.26021 8.457613   c276a4   other    82   220       21
## 8  Central Hospital -13.21540 8.482433   2b36fa   other    73   207       23
## 9  Central Hospital -13.21277 8.464897   ea996f funeral    21    39       23
## 10 Central Hospital -13.22377 8.461412   a8c00d   other    90   236       22
##    fever chills cough aches vomit temp time_admission       bmi days_onset_hosp
## 1     no     no   yes    no    no 37.2          09:21  72.88889               1
## 2     no    yes   yes    no    no 36.8          10:12  21.06674               1
## 3     no     no    no    no   yes 37.4          16:46  24.44444               2
## 4     no    yes   yes    no    no 36.6          13:52  31.89879               2
## 5                                 36.7          14:25  22.12620               2
## 6     no    yes   yes    no   yes 36.3          16:06  76.95790               0
## 7     no     no   yes    no    no 36.2          13:10  16.94215               2
## 8                                 37.8          04:00  17.03657               0
## 9     no    yes    no    no    no 37.1          15:28 138.06706               0
## 10                                37.5          13:46  16.15915               2

# Total number of cells in the data frame that are NA or an empty string.
blank_cells <- is.na(centhos) | centhos == ""
sum(blank_cells)

## [1] 890

# Per-column count of cells that are NA or an empty string.
is_blank <- is.na(centhos) | centhos == ""
colSums(is_blank)

##              case_id           generation       date_infection 
##                    0                    0                  154 
##           date_onset date_hospitalisation         date_outcome 
##                    0                    0                   74 
##              outcome               gender                  age 
##                   96                   26                    9 
##             age_unit            age_years              age_cat 
##                    0                    9                    9 
##             age_cat5             hospital                  lon 
##                    9                    0                    0 
##                  lat             infector               source 
##                    0                  154                  154 
##                wt_kg                ht_cm             ct_blood 
##                    0                    0                    0 
##                fever               chills                cough 
##                   26                   26                   26 
##                aches                vomit                 temp 
##                   26                   26                    3 
##       time_admission                  bmi      days_onset_hosp 
##                   63                    0                    0

# Count NA-or-empty cells per column, then keep only the columns that have
# at least one. vapply() pins the result to exactly one integer per column,
# so the return type cannot silently change with the input as sapply()'s can.
missing_summary <- vapply(
  centhos,
  function(x) sum(is.na(x) | x == ""),
  integer(1)
)
missing_summary <- missing_summary[missing_summary > 0]
missing_summary

## date_infection   date_outcome        outcome         gender            age 
##            154             74             96             26              9 
##      age_years        age_cat       age_cat5       infector         source 
##              9              9              9            154            154 
##          fever         chills          cough          aches          vomit 
##             26             26             26             26             26 
##           temp time_admission 
##              3             63

# Number of rows that are exact duplicates of an earlier row.
dup_flags <- duplicated(centhos)
sum(dup_flags)

## [1] 0

# Size of the raw data set: number of rows, then number of columns.
c(nrow(centhos), ncol(centhos))

## [1] 454  30

Handling missing data

To handle the missing values, I found that several columns have too many missing values and others are irrelevant to this analysis, so I removed them. The date_infection, date_outcome, source and infector columns have many missing values and are not needed, and age_cat carries the same information as age_cat5, so I removed age_cat. The code below also drops date_onset, date_hospitalisation, age_unit, age_years and hospital.

# Drop the unwanted columns by name instead of by position, so the selection
# stays correct (and readable) even if the column order of the CSV changes.
# These are the same ten columns that positions 3, 4, 5, 6, 10, 11, 12, 14,
# 17 and 18 referred to.
drop_cols <- c(
  "date_infection", "date_onset", "date_hospitalisation", "date_outcome",
  "age_unit", "age_years", "age_cat", "hospital", "infector", "source"
)
newcenthos <- centhos[, !(names(centhos) %in% drop_cols)]
head(newcenthos)

##   case_id generation outcome gender age age_cat5       lon      lat wt_kg ht_cm
## 1  275cc7          5   Death      f   4      0-4 -13.23694 8.469161    41    75
## 2  64c8ef          6 Recover      m  30    30-34 -13.26147 8.457506    66   177
## 3  e56412          9   Death      f  23    20-24 -13.23433 8.478321    55   150
## 4  62a2ef          9   Death      f  14    10-14 -13.23794 8.469774    59   136
## 5  92e129         10 Recover      f  42    40-44 -13.23573 8.474329    85   196
## 6  240da4         10   Death      m   1      0-4 -13.26799 8.462502    17    47
##   ct_blood fever chills cough aches vomit temp time_admission      bmi
## 1       23    no     no   yes    no    no 37.2          09:21 72.88889
## 2       23    no    yes   yes    no    no 36.8          10:12 21.06674
## 3       23    no     no    no    no   yes 37.4          16:46 24.44444
## 4       22    no    yes   yes    no    no 36.6          13:52 31.89879
## 5       23                                36.7          14:25 22.12620
## 6       23    no    yes   yes    no   yes 36.3          16:06 76.95790
##   days_onset_hosp
## 1               1
## 2               1
## 3               2
## 4               2
## 5               2
## 6               0

# Size of the reduced data set: number of rows, then number of columns.
c(nrow(newcenthos), ncol(newcenthos))

## [1] 454  20

Handling missing data in the age and time_admission column

The age and time_admission columns are important, so I kept them. In the age column I replaced the missing values with the mean age (rounded to a whole number), since only 9 rows are missing.

# Show the missing age entries. Indexing with the string "NA" looks up an
# element *named* "NA" (there is none) and therefore always returned a single
# NA regardless of the data; is.na() selects the entries that are missing.
centhos$age[is.na(centhos$age)]

## [1] NA NA NA NA NA NA NA NA NA

# Five-number summary, mean and NA count of the age column.
summary(newcenthos[["age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     7.0    15.0    17.4    24.0    87.0       9

# Number of age entries that are NA or an empty string.
age_values <- newcenthos[["age"]]
sum(is.na(age_values) | age_values == "")

## [1] 9

# Mean imputation: fill the missing ages with the mean of the observed ages,
# then round every age to a whole number.
# NOTE(review): age_cat5 is not recomputed here and still shows 9 blanks
# afterwards - presumably the same rows, which would then be dropped when
# incomplete rows are removed. Confirm whether that is intended.
age_missing <- is.na(newcenthos$age) | newcenthos$age == ""
newcenthos$age[age_missing] <- mean(newcenthos$age, na.rm = TRUE)
newcenthos$age <- round(newcenthos$age)
head(newcenthos$age)

## [1]  4 30 23 14 42  1

Handling missing values in the time_admission column

The time_admission column has some missing values. I did not want to delete those rows, so I replaced the missing times with the placeholder 00:00.

# Number of admission times that are NA or an empty string.
admit_time <- newcenthos[["time_admission"]]
sum(is.na(admit_time) | admit_time == "")

## [1] 63

# Confirm the column's length and storage type (character).
summary(newcenthos[["time_admission"]])

##    Length     Class      Mode 
##       454 character character

# Replace missing admission times (NA or empty string) with the placeholder
# "00:00" so that these rows are not lost when incomplete rows are removed.
# NOTE(review): "00:00" is also a valid clock time, so placeholders cannot be
# told apart from real midnight admissions afterwards.
no_time <- is.na(newcenthos$time_admission) | newcenthos$time_admission == ""
newcenthos$time_admission[no_time] <- "00:00"
head(newcenthos$time_admission)

## [1] "09:21" "10:12" "16:46" "13:52" "14:25" "16:06"

# Re-check the per-column count of NA-or-empty cells after the imputations.
still_blank <- is.na(newcenthos) | newcenthos == ""
colSums(still_blank)

##         case_id      generation         outcome          gender             age 
##               0               0              96              26               0 
##        age_cat5             lon             lat           wt_kg           ht_cm 
##               9               0               0               0               0 
##        ct_blood           fever          chills           cough           aches 
##               0              26              26              26              26 
##           vomit            temp  time_admission             bmi days_onset_hosp 
##              26               3               0               0               0

Removing all rows with missing data

After all this cleaning there are still some missing rows, so i would be removing them.

# Collect every row that still has at least one NA or empty-string cell.
incomplete <- rowSums(is.na(newcenthos) | newcenthos == "") > 0
rows_with_missing <- newcenthos[incomplete, ]
head(rows_with_missing)

##    case_id generation outcome gender age age_cat5       lon      lat wt_kg
## 5   92e129         10 Recover      f  42    40-44 -13.23573 8.474329    85
## 8   9b4647          9   Death      m  44    40-44 -13.21540 8.482433    73
## 10  24ef7d         11 Recover      m  52    50-54 -13.22377 8.461412    90
## 12  f5e8d0         11              m  22    20-24 -13.25317 8.457906    77
## 13  7c67f1         14 Recover      f  18    15-19 -13.21197 8.483053    59
## 14  5d6f13          7 Recover      m  27    25-29 -13.26767 8.462242    71
##    ht_cm ct_blood fever chills cough aches vomit temp time_admission      bmi
## 5    196       23                                36.7          14:25 22.12620
## 8    207       23                                37.8          04:00 17.03657
## 10   236       22                                37.5          13:46 16.15915
## 12   147       21                                36.9          09:32 35.63330
## 13   185       22                                37.8          00:00 17.23886
## 14   175       23                                35.8          10:35 23.18367
##    days_onset_hosp
## 5                2
## 8                0
## 10               2
## 12               0
## 13               0
## 14               2

# How many rows are incomplete.
dim(rows_with_missing)[1]

## [1] 141

# Keep only the complete rows: anti_join() returns the rows of newcenthos that
# have no match in rows_with_missing. No `by` is given, so dplyr matches on all
# shared columns and reports them in a message.
centralhospital <- anti_join(x = newcenthos, y = rows_with_missing)

## Joining with `by = join_by(case_id, generation, outcome, gender, age, age_cat5,
## lon, lat, wt_kg, ht_cm, ct_blood, fever, chills, cough, aches, vomit, temp,
## time_admission, bmi, days_onset_hosp)`

# Size of the cleaned data set: number of rows, then number of columns.
c(nrow(centralhospital), ncol(centralhospital))

## [1] 313  20

# Preview the first six rows of the cleaned data set.
head(centralhospital, n = 6L)

##   case_id generation outcome gender age age_cat5       lon      lat wt_kg ht_cm
## 1  275cc7          5   Death      f   4      0-4 -13.23694 8.469161    41    75
## 2  64c8ef          6 Recover      m  30    30-34 -13.26147 8.457506    66   177
## 3  e56412          9   Death      f  23    20-24 -13.23433 8.478321    55   150
## 4  62a2ef          9   Death      f  14    10-14 -13.23794 8.469774    59   136
## 5  240da4         10   Death      m   1      0-4 -13.26799 8.462502    17    47
## 6  31e797         13   Death      m  49    45-49 -13.26021 8.457613    82   220
##   ct_blood fever chills cough aches vomit temp time_admission      bmi
## 1       23    no     no   yes    no    no 37.2          09:21 72.88889
## 2       23    no    yes   yes    no    no 36.8          10:12 21.06674
## 3       23    no     no    no    no   yes 37.4          16:46 24.44444
## 4       22    no    yes   yes    no    no 36.6          13:52 31.89879
## 5       23    no    yes   yes    no   yes 36.3          16:06 76.95790
## 6       21    no     no   yes    no    no 36.2          13:10 16.94215
##   days_onset_hosp
## 1               1
## 2               1
## 3               2
## 4               2
## 5               0
## 6               2

# Per-column summary of the cleaned data set; used to spot implausible values.
summary(object = centralhospital)

##    case_id            generation      outcome             gender         
##  Length:313         Min.   : 2.00   Length:313         Length:313        
##  Class :character   1st Qu.:13.00   Class :character   Class :character  
##  Mode  :character   Median :16.00   Mode  :character   Mode  :character  
##                     Mean   :16.93                                        
##                     3rd Qu.:20.00                                        
##                     Max.   :35.00                                        
##       age          age_cat5              lon              lat       
##  Min.   : 0.00   Length:313         Min.   :-13.27   Min.   :8.449  
##  1st Qu.: 7.00   Class :character   1st Qu.:-13.25   1st Qu.:8.460  
##  Median :15.00   Mode  :character   Median :-13.23   Median :8.468  
##  Mean   :17.68                      Mean   :-13.23   Mean   :8.469  
##  3rd Qu.:24.00                      3rd Qu.:-13.22   3rd Qu.:8.479  
##  Max.   :73.00                      Max.   :-13.21   Max.   :8.490  
##      wt_kg            ht_cm          ct_blood        fever          
##  Min.   :  0.00   Min.   : 26.0   Min.   :17.00   Length:313        
##  1st Qu.: 44.00   1st Qu.: 96.0   1st Qu.:20.00   Class :character  
##  Median : 58.00   Median :135.0   Median :22.00   Mode  :character  
##  Mean   : 55.53   Mean   :129.7   Mean   :21.16                     
##  3rd Qu.: 68.00   3rd Qu.:161.0   3rd Qu.:22.00                     
##  Max.   :100.00   Max.   :281.0   Max.   :25.00                     
##     chills             cough              aches              vomit          
##  Length:313         Length:313         Length:313         Length:313        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       temp       time_admission          bmi         days_onset_hosp 
##  Min.   :35.90   Length:313         Min.   :  0.00   Min.   : 0.000  
##  1st Qu.:38.20   Class :character   1st Qu.: 24.45   1st Qu.: 1.000  
##  Median :38.80   Mode  :character   Median : 32.63   Median : 1.000  
##  Mean   :38.57                      Mean   : 44.15   Mean   : 1.904  
##  3rd Qu.:39.20                      3rd Qu.: 47.90   3rd Qu.: 3.000  
##  Max.   :40.40                      Max.   :370.37   Max.   :11.000

removing negative values from bmi and wt_kg

These columns contain some inconsistencies: a few values carry a negative sign, which should not be there because bmi and wt_kg cannot be negative. I removed the sign by taking the absolute value.

# Strip any negative sign from bmi by taking the absolute value.
# NOTE(review): the summary of centralhospital printed earlier reports a
# minimum bmi of 0, so there may be no negative values left at this point.
centralhospital[["bmi"]] <- abs(centralhospital[["bmi"]])
head(centralhospital[["bmi"]])

## [1] 72.88889 21.06674 24.44444 31.89879 76.95790 16.94215

# Strip any negative sign from wt_kg by taking the absolute value.
# NOTE(review): the summary of centralhospital printed earlier reports a
# minimum wt_kg of 0, so there may be no negative values left at this point.
centralhospital[["wt_kg"]] <- abs(centralhospital[["wt_kg"]])
head(centralhospital[["wt_kg"]])

## [1] 41 66 55 59 17 82

Checking for outliers

Outliers are data points that differ significantly from the other observations in a data set. To check for outliers in this data, I use box plots drawn with ggplot2's geom_boxplot().

Age Distribution: There are several dots to the right of the upper whisker, indicating some individuals in this dataset are considerably older than the majority.
Height Distribution (ht_cm): There’s a single dot to the right of the upper whisker, suggesting one person is notably taller than most. There’s also a dot to the left, indicating a shorter individual.
BMI Distribution: This one has quite a few outliers on the right side, indicating some individuals have a much higher Body Mass Index than the rest. There are also a few on the left, suggesting very low BMIs.
Days Onset Distribution (days_onset_hosp): There are several outliers to the right, showing some individuals had a much longer time between the onset of symptoms and hospitalization.
Weight Distribution (wt_kg): Similar to height, there’s a dot on the higher end, indicating a heavier individual, and a few on the lower end, indicating lighter individuals.
Temperature Distribution (temp): There’s a single dot on the higher end, suggesting one instance of a higher temperature reading

# Build one box plot for a single column of centralhospital.
#   column: name of the column to plot (character scalar)
#   title:  plot title
# Returns a ggplot object. The x-axis label is set to the column name
# explicitly so it matches what aes(<column>) would produce by default.
outlier_boxplot <- function(column, title) {
  ggplot(centralhospital, aes(x = .data[[column]])) +
    geom_boxplot(fill = "skyblue") +
    labs(title = title, x = column)
}

# One plot per numeric variable that is inspected for outliers.
p1 <- outlier_boxplot("age", "Age Distribution")
p2 <- outlier_boxplot("ht_cm", "Height Distribution")
p3 <- outlier_boxplot("bmi", "bmi Distribution")
p4 <- outlier_boxplot("days_onset_hosp", "days onset distribution")
p5 <- outlier_boxplot("wt_kg", "wt_kg Distribution")
p6 <- outlier_boxplot("temp", "temp Distribution")

# Arrange the six plots in a grid with three columns (two rows).
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)

Bar plot of outcome against gender

This plot shows that more males died than females, and that the number of recovered patients is almost equal for both genders.

# Side-by-side bar chart of case counts per gender, split by outcome.
outcome_colours <- c("Recover" = "yellow", "Death" = "blue")
ggplot(data = centralhospital, mapping = aes(x = gender, fill = outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = outcome_colours) +
  labs(title = "Gender vs Outcome", x = "Gender", y = "count") +
  theme_minimal()

Plot of the outcome

This plot shows that more people died and less people recovered

# Bar chart of the number of cases per outcome.
ggplot(data = centralhospital, mapping = aes(x = outcome)) +
  geom_bar(fill = "orange") +
  ggtitle("Outcome Distribution") +
  xlab("Outcome") +
  ylab("count") +
  theme_minimal()

PROJECT TWO: Cognifyz Technologies Certified Internship – Restaurant Dataset Analysis

The Cognifyz data set contains detailed information on over 9,500 restaurants across various cities and countries. It includes data on restaurant names, locations, cuisines offered, average costs, delivery and booking options, customer ratings, and reviews. I would be analyzing trends, customer preferences, service availability, and geographic distribution of restaurants.

Aim

To derive insights from a restaurant data set through various data analytics tasks using R.

Research Questions

What are the most common cuisines served?
Which cities have the highest-rated restaurants?
Does online delivery or table booking correlate with higher prices or ratings?

Objectives

Perform comprehensive exploratory data analysis (EDA).
Identify patterns and trends in customer reviews, pricing, and service features.
Visualize findings through charts and statistical summaries

Loading necessary libraries

library(dplyr)
library(labeling)
library(ggplot2)
library(sf)
library(ggmap)
library(dbscan)
library(labeling)
library(stringr)
library(tidyverse)  
library(tidytext)   
library(wordcloud)   
library(textdata)

Loading and checking for missing values

Loading the data set and then checking for missing values: there are just 9 missing values, so the affected rows are removed.

# Read the Cognifyz restaurant data set from CSV and preview the first 5 rows.
cognifyz <- read.csv(
  file = "C:/Users/User/Desktop/GIT presentation/COGNIFYZ DATASET.csv"
)
head(cognifyz, n = 5)

##   Restaurant.ID        Restaurant.Name Country.Code             City
## 1       6317637       Le Petit Souffle          162      Makati City
## 2       6304287       Izakaya Kikufuji          162      Makati City
## 3       6300002 Heat - Edsa Shangri-La          162 Mandaluyong City
## 4       6318506                   Ooma          162 Mandaluyong City
## 5       6314302            Sambo Kojin          162 Mandaluyong City
##                                                                   Address
## 1 Third Floor, Century City Mall, Kalayaan Avenue, Poblacion, Makati City
## 2     Little Tokyo, 2277 Chino Roces Avenue, Legaspi Village, Makati City
## 3                Edsa Shangri-La, 1 Garden Way, Ortigas, Mandaluyong City
## 4  Third Floor, Mega Fashion Hall, SM Megamall, Ortigas, Mandaluyong City
## 5        Third Floor, Mega Atrium, SM Megamall, Ortigas, Mandaluyong City
##                                     Locality
## 1  Century City Mall, Poblacion, Makati City
## 2 Little Tokyo, Legaspi Village, Makati City
## 3 Edsa Shangri-La, Ortigas, Mandaluyong City
## 4     SM Megamall, Ortigas, Mandaluyong City
## 5     SM Megamall, Ortigas, Mandaluyong City
##                                               Locality.Verbose Longitude
## 1       Century City Mall, Poblacion, Makati City, Makati City  121.0275
## 2      Little Tokyo, Legaspi Village, Makati City, Makati City  121.0141
## 3 Edsa Shangri-La, Ortigas, Mandaluyong City, Mandaluyong City  121.0568
## 4     SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City  121.0565
## 5     SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City  121.0575
##   Latitude                         Cuisines Average.Cost.for.two
## 1 14.56544       French, Japanese, Desserts                 1100
## 2 14.55371                         Japanese                 1200
## 3 14.58140 Seafood, Asian, Filipino, Indian                 4000
## 4 14.58532                  Japanese, Sushi                 1500
## 5 14.58445                 Japanese, Korean                 1500
##           Currency Has.Table.booking Has.Online.delivery Is.delivering.now
## 1 Botswana Pula(P)               Yes                  No                No
## 2 Botswana Pula(P)               Yes                  No                No
## 3 Botswana Pula(P)               Yes                  No                No
## 4 Botswana Pula(P)                No                  No                No
## 5 Botswana Pula(P)               Yes                  No                No
##   Switch.to.order.menu Price.range Aggregate.rating Rating.color Rating.text
## 1                   No           3              4.8   Dark Green   Excellent
## 2                   No           3              4.5   Dark Green   Excellent
## 3                   No           4              4.4        Green   Very Good
## 4                   No           4              4.9   Dark Green   Excellent
## 5                   No           4              4.8   Dark Green   Excellent
##   Votes
## 1   314
## 2   591
## 3   270
## 4   365
## 5   229

# Per-column summary (type, quartiles) of the restaurant data set.
summary(object = cognifyz)

##  Restaurant.ID      Restaurant.Name     Country.Code        City          
##  Min.   :      53   Length:9551        Min.   :  1.00   Length:9551       
##  1st Qu.:  301962   Class :character   1st Qu.:  1.00   Class :character  
##  Median : 6004089   Mode  :character   Median :  1.00   Mode  :character  
##  Mean   : 9051128                      Mean   : 18.37                     
##  3rd Qu.:18352292                      3rd Qu.:  1.00                     
##  Max.   :18500652                      Max.   :216.00                     
##    Address            Locality         Locality.Verbose     Longitude      
##  Length:9551        Length:9551        Length:9551        Min.   :-157.95  
##  Class :character   Class :character   Class :character   1st Qu.:  77.08  
##  Mode  :character   Mode  :character   Mode  :character   Median :  77.19  
##                                                           Mean   :  64.13  
##                                                           3rd Qu.:  77.28  
##                                                           Max.   : 174.83  
##     Latitude        Cuisines         Average.Cost.for.two   Currency        
##  Min.   :-41.33   Length:9551        Min.   :     0       Length:9551       
##  1st Qu.: 28.48   Class :character   1st Qu.:   250       Class :character  
##  Median : 28.57   Mode  :character   Median :   400       Mode  :character  
##  Mean   : 25.85                      Mean   :  1199                         
##  3rd Qu.: 28.64                      3rd Qu.:   700                         
##  Max.   : 55.98                      Max.   :800000                         
##  Has.Table.booking  Has.Online.delivery Is.delivering.now  Switch.to.order.menu
##  Length:9551        Length:9551         Length:9551        Length:9551         
##  Class :character   Class :character    Class :character   Class :character    
##  Mode  :character   Mode  :character    Mode  :character   Mode  :character    
##                                                                                
##                                                                                
##                                                                                
##   Price.range    Aggregate.rating Rating.color       Rating.text       
##  Min.   :1.000   Min.   :0.000    Length:9551        Length:9551       
##  1st Qu.:1.000   1st Qu.:2.500    Class :character   Class :character  
##  Median :2.000   Median :3.200    Mode  :character   Mode  :character  
##  Mean   :1.805   Mean   :2.666                                         
##  3rd Qu.:2.000   3rd Qu.:3.700                                         
##  Max.   :4.000   Max.   :4.900                                         
##      Votes        
##  Min.   :    0.0  
##  1st Qu.:    5.0  
##  Median :   31.0  
##  Mean   :  156.9  
##  3rd Qu.:  131.0  
##  Max.   :10934.0

# Total number of cells that are NA or an empty string.
blank_restaurant_cells <- is.na(cognifyz) | cognifyz == ""
sum(blank_restaurant_cells)

## [1] 9

checking for missing rows and removing them

# Locate every missing cell (NA or empty string) as a (row, col) index pair.
missing_mask <- is.na(cognifyz) | cognifyz == ""
na_location <- which(missing_mask, arr.ind = TRUE)
na_location

##       row col
##  [1,]  85  10
##  [2,]  88  10
##  [3,]  95  10
##  [4,] 298  10
##  [5,] 329  10
##  [6,] 347  10
##  [7,] 369  10
##  [8,] 419  10
##  [9,] 456  10

# Only a few rows lack a Cuisines value, so those rows are dropped rather
# than imputed
cuisine_missing <- is.na(cognifyz$Cuisines) | cognifyz$Cuisines == ""
cognifyz_1 <- cognifyz[!cuisine_missing, ]
head(cognifyz_1, n = 3)

##   Restaurant.ID        Restaurant.Name Country.Code             City
## 1       6317637       Le Petit Souffle          162      Makati City
## 2       6304287       Izakaya Kikufuji          162      Makati City
## 3       6300002 Heat - Edsa Shangri-La          162 Mandaluyong City
##                                                                   Address
## 1 Third Floor, Century City Mall, Kalayaan Avenue, Poblacion, Makati City
## 2     Little Tokyo, 2277 Chino Roces Avenue, Legaspi Village, Makati City
## 3                Edsa Shangri-La, 1 Garden Way, Ortigas, Mandaluyong City
##                                     Locality
## 1  Century City Mall, Poblacion, Makati City
## 2 Little Tokyo, Legaspi Village, Makati City
## 3 Edsa Shangri-La, Ortigas, Mandaluyong City
##                                               Locality.Verbose Longitude
## 1       Century City Mall, Poblacion, Makati City, Makati City  121.0275
## 2      Little Tokyo, Legaspi Village, Makati City, Makati City  121.0141
## 3 Edsa Shangri-La, Ortigas, Mandaluyong City, Mandaluyong City  121.0568
##   Latitude                         Cuisines Average.Cost.for.two
## 1 14.56544       French, Japanese, Desserts                 1100
## 2 14.55371                         Japanese                 1200
## 3 14.58140 Seafood, Asian, Filipino, Indian                 4000
##           Currency Has.Table.booking Has.Online.delivery Is.delivering.now
## 1 Botswana Pula(P)               Yes                  No                No
## 2 Botswana Pula(P)               Yes                  No                No
## 3 Botswana Pula(P)               Yes                  No                No
##   Switch.to.order.menu Price.range Aggregate.rating Rating.color Rating.text
## 1                   No           3              4.8   Dark Green   Excellent
## 2                   No           3              4.5   Dark Green   Excellent
## 3                   No           4              4.4        Green   Very Good
##   Votes
## 1   314
## 2   591
## 3   270

Determining the three most common cuisines

# Level 1 Task 1: the three most frequent values of the Cuisines column.
# NOTE(review): every distinct Cuisines string (e.g. "North Indian, Chinese")
# is counted as one category; individual cuisines are not split out —
# confirm this is the intended definition of "cuisine".
top3Cuisines <- head(count(cognifyz_1, Cuisines, sort = TRUE), 3)
print(top3Cuisines)

##                Cuisines   n
## 1          North Indian 936
## 2 North Indian, Chinese 511
## 3               Chinese 354

Calculate the percentage of restaurants that serve each of the top cuisines

# Percentage of restaurants that serve each of the top cuisines.
# Step 1: the total number of restaurants (rows of the cleaned data) is the
# denominator for the percentages.
total_no_restaurants <- dim(cognifyz_1)[1]
print(total_no_restaurants)

## [1] 9542

# Frequency table: one row per distinct Cuisines string with its count n
no_ofentries_cuisine <- count(cognifyz_1, Cuisines)
head(no_ofentries_cuisine)

##                                    Cuisines n
## 1                                   Afghani 4
## 2                 Afghani, Mughlai, Chinese 1
## 3                     Afghani, North Indian 1
## 4 Afghani, North Indian, Pakistani, Arabian 1
## 5                                   African 1
## 6                       African, Portuguese 1

# Keep the three most frequent Cuisines entries (sorted by count, descending)
top3cuisines <- head(arrange(no_ofentries_cuisine, desc(n)), 3)
print(top3cuisines)

##                Cuisines   n
## 1          North Indian 936
## 2 North Indian, Chinese 511
## 3               Chinese 354

# Share of all restaurants accounted for by each of the top three entries
top3cuisinesPER <- mutate(
  top3cuisines,
  percentage = (n / total_no_restaurants) * 100
)
print(top3cuisinesPER)

##                Cuisines   n percentage
## 1          North Indian 936   9.809264
## 2 North Indian, Chinese 511   5.355271
## 3               Chinese 354   3.709914

# Level 1 Task 2
# Identify the city with the highest number of restaurants in the data set.
# Step 1: count restaurants per city, largest count first.
city_counts <- count(cognifyz_1, City, sort = TRUE)

# Step 2: the table is sorted in descending order, so row 1 is the city with
# the most restaurants.
higest_no_res <- city_counts[1, ]
print(higest_no_res)

##        City    n
## 1 New Delhi 5473

# Mean Aggregate.rating per city, ordered from highest to lowest.
# NOTE(review): restaurants labelled "Not rated" carry Aggregate.rating 0 in
# this data and are included in the mean — confirm that is intended.
ave_ratings <- arrange(
  summarise(
    group_by(cognifyz_1, City),
    avg_rating = mean(Aggregate.rating)
  ),
  desc(avg_rating)
)
print(ave_ratings)

## # A tibble: 140 × 2
##    City             avg_rating
##    <chr>                 <dbl>
##  1 Inner City             4.9 
##  2 Quezon City            4.8 
##  3 Makati City            4.65
##  4 Pasig City             4.63
##  5 Mandaluyong City       4.62
##  6 Beechworth             4.6 
##  7 London                 4.54
##  8 Taguig City            4.53
##  9 Lincoln                4.5 
## 10 Secunderabad           4.5 
## # ℹ 130 more rows

# Determine the city with the highest average rating: ave_ratings is already
# sorted by avg_rating in descending order, so its first row is the top city
city_highest_rating <- ave_ratings[1, ]
print(city_highest_rating)

## # A tibble: 1 × 2
##   City       avg_rating
##   <chr>           <dbl>
## 1 Inner City        4.9

# Level 1 Task 3: Price Range Distribution
# Bar chart of the number of restaurants in each price range.
# (A geom_histogram(binwidth = 0.5, boundary = 0) layer was tried as an
# alternative and is left disabled.)
ggplot(data = cognifyz_1, mapping = aes(x = Price.range)) +
  geom_bar(fill = "skyblue", colour = "black") +
  ggtitle("Distribution of Price Ranges Among Restaurants") +
  xlab("Price Range") +
  ylab("Count") +
  theme_minimal()

# Number of restaurants in each price range and the percentage of all
# restaurants that each range represents
percentage_restaurants <- mutate(
  summarise(group_by(cognifyz_1, Price.range), count = n()),
  percentage = (count / sum(count)) * 100
)
print(percentage_restaurants)

## # A tibble: 4 × 3
##   Price.range count percentage
##         <int> <int>      <dbl>
## 1           1  4438      46.5 
## 2           2  3113      32.6 
## 3           3  1405      14.7 
## 4           4   586       6.14

# # Optional: plot the percentage of restaurants in each price range category
# ggplot(percentage_restaurants, aes(x = `Price.range`, y = percentage)) +
#   geom_bar(stat = "identity", fill = "skyblue", color = "black") +
#   labs(
#     title = "Percentage of Restaurants in Each Price Range category",
#     x = "Price Range",
#     y = "Percentage (%)"
#   ) +
#   theme_minimal()

# Level 1 Task 4: Online Delivery
# Percentage of restaurants that offer online delivery: total rows, rows with
# Has.Online.delivery == "Yes", and the ratio of the two as a percentage.
online_delivery_percentage <- cognifyz_1 %>%
  summarise(
    total_restaurants = n(),
    num_online_delivery = sum(Has.Online.delivery == "Yes", na.rm = TRUE),
    percentage_online_delivery = (num_online_delivery / total_restaurants) * 100
  )
print(online_delivery_percentage)

##   total_restaurants num_online_delivery percentage_online_delivery
## 1              9542                2451                   25.68644

# Mean rating for restaurants with ("Yes") and without ("No") online delivery
ave_ratings_online_delivery <- summarise(
  group_by(cognifyz_1, Has.Online.delivery),
  avg_rating = mean(Aggregate.rating, na.rm = TRUE)
)
print(ave_ratings_online_delivery)

## # A tibble: 2 × 2
##   Has.Online.delivery avg_rating
##   <chr>                    <dbl>
## 1 No                        2.46
## 2 Yes                       3.25

# Extra (exploratory) bar chart comparing the two average ratings;
# the bar heights are the pre-computed avg_rating values
ggplot(
  data = ave_ratings_online_delivery,
  mapping = aes(x = Has.Online.delivery, y = avg_rating, fill = Has.Online.delivery)
) +
  geom_col() +
  ggtitle("Average Ratings Based on Online Delivery") +
  xlab("Online Delivery") +
  ylab("Average Rating") +
  theme_minimal()

Part two

# Set the working directory and load the cleaned data set.
# NOTE(review): setwd() ties the script to one machine's folder layout; a
# project-relative path would be more portable. Left unchanged because other
# code may rely on this working directory.
setwd("C:/Users/User/Desktop/GIT presentation")
cognifyz_1 <- read.csv("Cleaned_Dataset.csv")
view(cognifyz_1)

# Level 1 task 1
# Analyze the text reviews to identify the most common positive and negative
# keywords.
# Convert cognifyz_1 into a tidy format: one row per word token of
# Rating.text, then drop stop words. The join key is stated explicitly so the
# join cannot silently pick up another shared column name and no "Joining
# with" message is emitted.
# NOTE(review): the sentiment counts computed from these tokens contain
# "excellent" and "poor" but not "good", although Rating.text holds values such
# as "Very Good" — presumably the stop-word list removes them; verify whether
# stop-word removal is wanted before sentiment matching.
review_words <- cognifyz_1 %>%
  unnest_tokens(word, Rating.text) %>%
  anti_join(stop_words, by = "word")

## Joining with `by = join_by(word)`

# Preview the tokenised words; common stop words such as "the", "and", "is"
# were removed by the anti_join step that built review_words
head(review_words, n = 6)

##   Restaurant.ID                          Restaurant.Name Country.Code
## 1       6317637                         Le Petit Souffle          162
## 2       6304287                         Izakaya Kikufuji          162
## 3       6318506                                     Ooma          162
## 4       6314302                              Sambo Kojin          162
## 5       6300010 Spiral - Sofitel Philippine Plaza Manila          162
## 6       6314987                                 Locavore          162
##               City
## 1      Makati City
## 2      Makati City
## 3 Mandaluyong City
## 4 Mandaluyong City
## 5       Pasay City
## 6       Pasig City
##                                                                   Address
## 1 Third Floor, Century City Mall, Kalayaan Avenue, Poblacion, Makati City
## 2     Little Tokyo, 2277 Chino Roces Avenue, Legaspi Village, Makati City
## 3  Third Floor, Mega Fashion Hall, SM Megamall, Ortigas, Mandaluyong City
## 4        Third Floor, Mega Atrium, SM Megamall, Ortigas, Mandaluyong City
## 5   Plaza Level, Sofitel Philippine Plaza Manila, CCP Complex, Pasay City
## 6     Brixton Technology Center, 10 Brixton Street, Kapitolyo, Pasig City
##                                      Locality
## 1   Century City Mall, Poblacion, Makati City
## 2  Little Tokyo, Legaspi Village, Makati City
## 3      SM Megamall, Ortigas, Mandaluyong City
## 4      SM Megamall, Ortigas, Mandaluyong City
## 5 Sofitel Philippine Plaza Manila, Pasay City
## 6                                   Kapitolyo
##                                           Locality.Verbose Longitude Latitude
## 1   Century City Mall, Poblacion, Makati City, Makati City  121.0275 14.56544
## 2  Little Tokyo, Legaspi Village, Makati City, Makati City  121.0141 14.55371
## 3 SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City  121.0565 14.58532
## 4 SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City  121.0575 14.58445
## 5  Sofitel Philippine Plaza Manila, Pasay City, Pasay City  120.9801 14.55299
## 6                                    Kapitolyo, Pasig City  121.0565 14.57204
##                     Cuisines Average.Cost.for.two         Currency
## 1 French, Japanese, Desserts                 1100 Botswana Pula(P)
## 2                   Japanese                 1200 Botswana Pula(P)
## 3            Japanese, Sushi                 1500 Botswana Pula(P)
## 4           Japanese, Korean                 1500 Botswana Pula(P)
## 5    European, Asian, Indian                 6000 Botswana Pula(P)
## 6                   Filipino                 1100 Botswana Pula(P)
##   Has.Table.booking Has.Online.delivery Is.delivering.now Switch.to.order.menu
## 1               Yes                  No                No                   No
## 2               Yes                  No                No                   No
## 3                No                  No                No                   No
## 4               Yes                  No                No                   No
## 5               Yes                  No                No                   No
## 6               Yes                  No                No                   No
##   Price.range Aggregate.rating Rating.color Votes      word
## 1           3              4.8   Dark Green   314 excellent
## 2           3              4.5   Dark Green   591 excellent
## 3           4              4.9   Dark Green   365 excellent
## 4           4              4.8   Dark Green   229 excellent
## 5           4              4.9   Dark Green   621 excellent
## 6           3              4.8   Dark Green   532 excellent

# Bing sentiment lexicon
bing_lexicon <- get_sentiments("bing")

# Keep only the tokens that appear in the lexicon, tagging each with its
# sentiment
review_sentiment <- inner_join(review_words, bing_lexicon, by = "word")

# Frequency of each (word, sentiment) pair, most frequent first
top_words <- review_sentiment %>%
  count(word, sentiment) %>%
  arrange(desc(n))

# Show the most frequent words
head(top_words)

##        word sentiment   n
## 1 excellent  positive 300
## 2      poor  negative 186

# Plot the 10 most frequent words per sentiment (ties included).
# slice_max() replaces the superseded top_n(): the ranking column (n) is named
# explicitly instead of being guessed from the last column of the data.
top_words %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip() +  # horizontal bars so the word labels are readable
  labs(title = "Most Common Positive & Negative Words", x = "Word", y = "Count") +
  theme_minimal()

## Selecting by n

# Calculate the average length of reviews and explore whether review length is
# related to rating.
# review_length = number of words (runs of word characters) in Rating.text.
# NOTE(review): Rating.text holds short rating labels such as "Excellent" or
# "Very Good", so this measures label length — confirm it is the intended
# stand-in for review text.
reviews <- cognifyz_1
reviews$review_length <- str_count(reviews$Rating.text, "\\w+")
head(reviews)

##   Restaurant.ID        Restaurant.Name Country.Code             City
## 1       6317637       Le Petit Souffle          162      Makati City
## 2       6304287       Izakaya Kikufuji          162      Makati City
## 3       6300002 Heat - Edsa Shangri-La          162 Mandaluyong City
## 4       6318506                   Ooma          162 Mandaluyong City
## 5       6314302            Sambo Kojin          162 Mandaluyong City
## 6      18189371           Din Tai Fung          162 Mandaluyong City
##                                                                   Address
## 1 Third Floor, Century City Mall, Kalayaan Avenue, Poblacion, Makati City
## 2     Little Tokyo, 2277 Chino Roces Avenue, Legaspi Village, Makati City
## 3                Edsa Shangri-La, 1 Garden Way, Ortigas, Mandaluyong City
## 4  Third Floor, Mega Fashion Hall, SM Megamall, Ortigas, Mandaluyong City
## 5        Third Floor, Mega Atrium, SM Megamall, Ortigas, Mandaluyong City
## 6 Ground Floor, Mega Fashion Hall, SM Megamall, Ortigas, Mandaluyong City
##                                     Locality
## 1  Century City Mall, Poblacion, Makati City
## 2 Little Tokyo, Legaspi Village, Makati City
## 3 Edsa Shangri-La, Ortigas, Mandaluyong City
## 4     SM Megamall, Ortigas, Mandaluyong City
## 5     SM Megamall, Ortigas, Mandaluyong City
## 6     SM Megamall, Ortigas, Mandaluyong City
##                                               Locality.Verbose Longitude
## 1       Century City Mall, Poblacion, Makati City, Makati City  121.0275
## 2      Little Tokyo, Legaspi Village, Makati City, Makati City  121.0141
## 3 Edsa Shangri-La, Ortigas, Mandaluyong City, Mandaluyong City  121.0568
## 4     SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City  121.0565
## 5     SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City  121.0575
## 6     SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City  121.0563
##   Latitude                         Cuisines Average.Cost.for.two
## 1 14.56544       French, Japanese, Desserts                 1100
## 2 14.55371                         Japanese                 1200
## 3 14.58140 Seafood, Asian, Filipino, Indian                 4000
## 4 14.58532                  Japanese, Sushi                 1500
## 5 14.58445                 Japanese, Korean                 1500
## 6 14.58376                          Chinese                 1000
##           Currency Has.Table.booking Has.Online.delivery Is.delivering.now
## 1 Botswana Pula(P)               Yes                  No                No
## 2 Botswana Pula(P)               Yes                  No                No
## 3 Botswana Pula(P)               Yes                  No                No
## 4 Botswana Pula(P)                No                  No                No
## 5 Botswana Pula(P)               Yes                  No                No
## 6 Botswana Pula(P)                No                  No                No
##   Switch.to.order.menu Price.range Aggregate.rating Rating.color Rating.text
## 1                   No           3              4.8   Dark Green   Excellent
## 2                   No           3              4.5   Dark Green   Excellent
## 3                   No           4              4.4        Green   Very Good
## 4                   No           4              4.9   Dark Green   Excellent
## 5                   No           4              4.8   Dark Green   Excellent
## 6                   No           3              4.4        Green   Very Good
##   Votes review_length
## 1   314             1
## 2   591             1
## 3   270             2
## 4   365             1
## 5   229             1
## 6   336             2

# List the column names to confirm review_length was appended
names(reviews)

##  [1] "Restaurant.ID"        "Restaurant.Name"      "Country.Code"        
##  [4] "City"                 "Address"              "Locality"            
##  [7] "Locality.Verbose"     "Longitude"            "Latitude"            
## [10] "Cuisines"             "Average.Cost.for.two" "Currency"            
## [13] "Has.Table.booking"    "Has.Online.delivery"  "Is.delivering.now"   
## [16] "Switch.to.order.menu" "Price.range"          "Aggregate.rating"    
## [19] "Rating.color"         "Rating.text"          "Votes"               
## [22] "review_length"

# Mean word count across all rows (missing lengths ignored), reported to two
# decimal places
avg_length <- mean(reviews[["review_length"]], na.rm = TRUE)
print(paste("Average review length:", round(avg_length, digits = 2)))

## [1] "Average review length: 1.34"

# Review length against aggregate rating, with a linear-model trend line
ggplot(data = reviews, mapping = aes(x = review_length, y = Aggregate.rating)) +
  geom_point(alpha = 0.5, colour = "blue") +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Review Length vs Rating") +
  xlab("Review Length (words)") +
  ylab("Aggregate Rating") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Task 2
# Identify the restaurants with the highest and lowest number of votes.
# Sort all rows by Votes, largest first, and show the five most-voted rows.
top_restaurants <- arrange(cognifyz_1, desc(Votes))
head(top_restaurants, n = 5)

##   Restaurant.ID           Restaurant.Name Country.Code      City
## 1         51705                      Toit            1 Bangalore
## 2         51040                  Truffles            1 Bangalore
## 3        308322          Hauz Khas Social            1 New Delhi
## 4         20404                 Peter Cat            1   Kolkata
## 5         56618 AB's - Absolute Barbecues            1 Bangalore
##                                                                           Address
## 1               298, Namma Metro Pillar 62, 100 Feet Road, Indiranagar, Bangalore
## 2                             28, 4th 'B' Cross, Koramangala 5th Block, Bangalore
## 3                                          9-A & 12, Hauz Khas Village, New Delhi
## 4                                     18A, Park Street, Park Street Area, Kolkata
## 5 90/4, 3rd Floor, Outer Ring Road, Munnekollaly Village, Marathahalli, Bangalore
##                Locality                 Locality.Verbose Longitude Latitude
## 1           Indiranagar           Indiranagar, Bangalore  77.64071 12.97917
## 2 Koramangala 5th Block Koramangala 5th Block, Bangalore  77.61429 12.93330
## 3     Hauz Khas Village     Hauz Khas Village, New Delhi  77.19447 28.55429
## 4      Park Street Area        Park Street Area, Kolkata  88.35289 22.55267
## 5          Marathahalli          Marathahalli, Bangalore  77.69939 12.94993
##                                     Cuisines Average.Cost.for.two
## 1                   Italian, American, Pizza                 2000
## 2                     American, Burger, Cafe                  800
## 3 Continental, American, Asian, North Indian                 1600
## 4                  Continental, North Indian                 1000
## 5      European, Mediterranean, North Indian                 1400
##             Currency Has.Table.booking Has.Online.delivery Is.delivering.now
## 1 Indian Rupees(Rs.)                No                  No                No
## 2 Indian Rupees(Rs.)                No                 Yes                No
## 3 Indian Rupees(Rs.)               Yes                 Yes                No
## 4 Indian Rupees(Rs.)                No                 Yes                No
## 5 Indian Rupees(Rs.)                No                  No                No
##   Switch.to.order.menu Price.range Aggregate.rating Rating.color Rating.text
## 1                   No           4              4.8   Dark Green   Excellent
## 2                   No           2              4.7   Dark Green   Excellent
## 3                   No           3              4.3        Green   Very Good
## 4                   No           3              4.3        Green   Very Good
## 5                   No           3              4.6   Dark Green   Excellent
##   Votes
## 1 10934
## 2  9667
## 3  7931
## 4  7574
## 5  6907

# Sort all rows by Votes, smallest first, and show the five least-voted rows
bottom_restaurants <- arrange(cognifyz_1, Votes)
head(bottom_restaurants, n = 5)

##   Restaurant.ID    Restaurant.Name Country.Code       City
## 1       6710645   Cantinho da Gula           30 S��o Paulo
## 2      18433852      The Chaiwalas            1  Faridabad
## 3      18465871 Fusion Food Corner            1  Faridabad
## 4      18472646      Punjabi Rasoi            1  Faridabad
## 5      18471268      Baskin Robbin            1  Faridabad
##                                                                  Address
## 1                 Rua Pedroso Alvarenga, 522, Itaim Bibi, S��o Paulo, SP
## 2     Sector 21 A, Asian Hospital, Badhkal Chowk, Badkal Lake, Faridabad
## 3 158/7, Opposite DDA Flat, Pul Pehlad Pur, Charmwood Village, Faridabad
## 4                   1, Deepak Complex, Eros Charmwood Village, Faridabad
## 5               Ground Floor, Crown Interiorz Mall, Sector 35, Faridabad
##                                     Locality
## 1                                 Itaim Bibi
## 2                                Badkal Lake
## 3                          Charmwood Village
## 4                          Charmwood Village
## 5 Crown Interiorz Mall, Sector 35, Faridabad
##                                        Locality.Verbose Longitude  Latitude
## 1                                Itaim Bibi, S��o Paulo -46.67567 -23.58100
## 2                                Badkal Lake, Faridabad  77.30009  28.42628
## 3                          Charmwood Village, Faridabad   0.00000   0.00000
## 4                          Charmwood Village, Faridabad  77.29243  28.49233
## 5 Crown Interiorz Mall, Sector 35, Faridabad, Faridabad  77.30745  28.46959
##                Cuisines Average.Cost.for.two           Currency
## 1             Brazilian                   55 Brazilian Real(R$)
## 2                  Cafe                  300 Indian Rupees(Rs.)
## 3 North Indian, Chinese                  300 Indian Rupees(Rs.)
## 4          North Indian                  400 Indian Rupees(Rs.)
## 5              Desserts                  300 Indian Rupees(Rs.)
##   Has.Table.booking Has.Online.delivery Is.delivering.now Switch.to.order.menu
## 1                No                  No                No                   No
## 2                No                  No                No                   No
## 3                No                  No                No                   No
## 4                No                  No                No                   No
## 5                No                  No                No                   No
##   Price.range Aggregate.rating Rating.color Rating.text Votes
## 1           2                0        White   Not rated     0
## 2           1                0        White   Not rated     0
## 3           1                0        White   Not rated     0
## 4           1                0        White   Not rated     0
## 5           1                0        White   Not rated     0

# Analyze whether the number of votes is correlated with a restaurant's rating.
# One row per Restaurant.Name: votes summed and ratings averaged across all
# rows that share the name, so outlets of the same chain are pooled together.
votes_summary <- summarise(
  group_by(cognifyz_1, Restaurant.Name),
  total_votes = sum(Votes),
  avg_rating = mean(Aggregate.rating, na.rm = TRUE)
)

# Names with the highest pooled vote totals. This assignment replaces any
# earlier value of top_restaurants with the per-name summary.
top_restaurants <- arrange(votes_summary, desc(total_votes))
head(top_restaurants, n = 5)

## # A tibble: 5 × 3
##   Restaurant.Name           total_votes avg_rating
##   <chr>                           <int>      <dbl>
## 1 Barbeque Nation                 28142       4.35
## 2 AB's - Absolute Barbecues       13400       4.82
## 3 Toit                            10934       4.8 
## 4 Big Chill                       10853       4.47
## 5 Farzi Cafe                      10098       4.37

# Names with the lowest pooled vote totals. This assignment replaces any
# earlier value of bottom_restaurants with the per-name summary.
bottom_restaurants <- arrange(votes_summary, total_votes)
head(bottom_restaurants, n = 5)

## # A tibble: 5 × 3
##   Restaurant.Name total_votes avg_rating
##   <chr>                 <int>      <dbl>
## 1 #hashtag                  0          0
## 2 13 Cafe                   0          0
## 3 3x Cafe                   0          0
## 4 44 Grills                 0          0
## 5 6 Packs Momos             0          0

# Correlation between total votes and average rating on the per-name summary,
# using only complete pairs of observations
correlation <- with(
  votes_summary,
  cor(total_votes, avg_rating, use = "complete.obs")
)
print(paste("Correlation between votes and rating:", round(correlation, digits = 2)))

## [1] "Correlation between votes and rating: 0.24"

# Total votes against average rating per restaurant name, with a linear-model
# trend line
ggplot(data = votes_summary, mapping = aes(x = total_votes, y = avg_rating)) +
  geom_point(colour = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Relationship Between Votes and Rating") +
  xlab("Total Votes") +
  ylab("Average Rating") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Task 3
# Analyze whether price range is related to the availability of online
# delivery and table booking.

# Contingency table: price range (rows) by online delivery (columns)
online_delivery_table <- table(
  cognifyz_1[["Price.range"]],
  cognifyz_1[["Has.Online.delivery"]]
)
print(online_delivery_table)

##    
##       No  Yes
##   1 3737  701
##   2 1827 1286
##   3  994  411
##   4  533   53

# Contingency table: price range (rows) by table booking (columns)
table_booking_table <- table(
  cognifyz_1[["Price.range"]],
  cognifyz_1[["Has.Table.booking"]]
)
print(table_booking_table)

##    
##       No  Yes
##   1 4437    1
##   2 2874  239
##   3  761  644
##   4  312  274

# Online delivery by price range; position = "fill" scales every bar to 1 so
# the segments show proportions rather than counts
ggplot(
  data = cognifyz_1,
  mapping = aes(x = Price.range, fill = Has.Online.delivery)
) +
  geom_bar(position = "fill") +
  ggtitle("Proportion of Restaurants Offering Online Delivery by Price Range") +
  xlab("Price Range") +
  ylab("Proportion") +
  labs(fill = "Online Delivery") +
  theme_minimal()

# Table booking by price range, again shown as within-bar proportions
ggplot(
  data = cognifyz_1,
  mapping = aes(x = Price.range, fill = Has.Table.booking)
) +
  geom_bar(position = "fill") +
  ggtitle("Proportion of Restaurants Offering Table Booking by Price Range") +
  xlab("Price Range") +
  ylab("Proportion") +
  labs(fill = "Table Booking") +
  theme_minimal()

# chi_online <- chisq.test(online_delivery_table)
# print(chi_online)
# 
# chi_booking <- chisq.test(table_booking_table)
# print(chi_booking)

Project 3: Oral Cancer Prediction Using Random Forest in R (Kaggle data set)

The Oral Cancer Prediction Data set contains patient data used to predict the likelihood of developing oral cancer. It typically includes features like age, gender, lifestyle habits (such as tobacco or alcohol use), and possibly clinical indicators.

Aim

The aims of this project were to:

Build a machine learning model to predict oral cancer diagnosis based on various patient and behavioral attributes (such as age, tobacco use, country, etc.) (to do later).
Understand the key factors that contribute to the likelihood of developing oral cancer.

Research Questions

Which age group is most frequently diagnosed with oral cancer?
How does tobacco use influence oral cancer occurrence?
Are there noticeable geographic (country-wise) patterns in oral cancer cases?
Can we develop a reliable machine learning model to predict oral cancer diagnoses using patient data?

Objectives

Perform an initial exploration and visualization of the data set to identify patterns and distributions.
Clean and preprocess the data to ensure it’s suitable for modeling.
Build a predictive model using the Random Forest algorithm.
Evaluate the model’s performance and identify the most important features influencing the diagnosis.

# Load necessary libraries
library(tidyverse)
library(ggplot2)
library(dplyr)
library(caret)
library(corrplot)
library(ggpubr)
library(skimr)

# Load the oral cancer prediction data set and preview its first rows
cancer <- read.csv(file = "C:/Users/User/Downloads/oral_cancer_prediction_dataset.csv")
head(cancer, n = 6)

##   ID      Country Age Gender Tobacco.Use Alcohol.Consumption HPV.Infection
## 1  1        Italy  36 Female         Yes                 Yes           Yes
## 2  2        Japan  64   Male         Yes                 Yes           Yes
## 3  3           UK  37 Female          No                 Yes            No
## 4  4    Sri Lanka  55   Male         Yes                 Yes            No
## 5  5 South Africa  68   Male          No                  No            No
## 6  6       Taiwan  70   Male         Yes                  No           Yes
##   Betel.Quid.Use Chronic.Sun.Exposure Poor.Oral.Hygiene
## 1             No                   No               Yes
## 2             No                  Yes               Yes
## 3             No                  Yes               Yes
## 4            Yes                   No               Yes
## 5             No                   No               Yes
## 6            Yes                   No               Yes
##   Diet..Fruits...Vegetables.Intake. Family.History.of.Cancer
## 1                               Low                       No
## 2                              High                       No
## 3                          Moderate                       No
## 4                          Moderate                       No
## 5                              High                       No
## 6                          Moderate                      Yes
##   Compromised.Immune.System Oral.Lesions Unexplained.Bleeding
## 1                        No           No                   No
## 2                        No           No                  Yes
## 3                        No           No                   No
## 4                        No          Yes                   No
## 5                        No           No                   No
## 6                        No          Yes                  Yes
##   Difficulty.Swallowing White.or.Red.Patches.in.Mouth Tumor.Size..cm.
## 1                    No                            No        0.000000
## 2                    No                            No        1.782186
## 3                    No                           Yes        3.523895
## 4                    No                            No        0.000000
## 5                    No                            No        2.834789
## 6                    No                            No        1.692675
##   Cancer.Stage Treatment.Type Survival.Rate..5.Year.... Cost.of.Treatment..USD.
## 1            0   No Treatment                 100.00000                    0.00
## 2            1   No Treatment                  83.34010                77772.50
## 3            2        Surgery                  63.22287               101164.50
## 4            0   No Treatment                 100.00000                    0.00
## 5            3   No Treatment                  44.29320                45354.75
## 6            2        Surgery                  67.40727                96504.00
##   Economic.Burden..Lost.Workdays.per.Year. Early.Diagnosis
## 1                                        0              No
## 2                                      177              No
## 3                                      130             Yes
## 4                                        0             Yes
## 5                                       52              No
## 6                                       91             Yes
##   Oral.Cancer..Diagnosis.
## 1                      No
## 2                     Yes
## 3                     Yes
## 4                      No
## 5                     Yes
## 6                     Yes

# Summarise every column: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns.
summary(cancer)

##        ID          Country               Age            Gender         
##  Min.   :    1   Length:84922       Min.   : 15.00   Length:84922      
##  1st Qu.:21231   Class :character   1st Qu.: 48.00   Class :character  
##  Median :42462   Mode  :character   Median : 55.00   Mode  :character  
##  Mean   :42462                      Mean   : 54.51                     
##  3rd Qu.:63692                      3rd Qu.: 61.00                     
##  Max.   :84922                      Max.   :101.00                     
##  Tobacco.Use        Alcohol.Consumption HPV.Infection      Betel.Quid.Use    
##  Length:84922       Length:84922        Length:84922       Length:84922      
##  Class :character   Class :character    Class :character   Class :character  
##  Mode  :character   Mode  :character    Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##  Chronic.Sun.Exposure Poor.Oral.Hygiene  Diet..Fruits...Vegetables.Intake.
##  Length:84922         Length:84922       Length:84922                     
##  Class :character     Class :character   Class :character                 
##  Mode  :character     Mode  :character   Mode  :character                 
##                                                                           
##                                                                           
##                                                                           
##  Family.History.of.Cancer Compromised.Immune.System Oral.Lesions      
##  Length:84922             Length:84922              Length:84922      
##  Class :character         Class :character          Class :character  
##  Mode  :character         Mode  :character          Mode  :character  
##                                                                       
##                                                                       
##                                                                       
##  Unexplained.Bleeding Difficulty.Swallowing White.or.Red.Patches.in.Mouth
##  Length:84922         Length:84922          Length:84922                 
##  Class :character     Class :character      Class :character             
##  Mode  :character     Mode  :character      Mode  :character             
##                                                                          
##                                                                          
##                                                                          
##  Tumor.Size..cm.  Cancer.Stage   Treatment.Type     Survival.Rate..5.Year....
##  Min.   :0.000   Min.   :0.000   Length:84922       Min.   : 10.00           
##  1st Qu.:0.000   1st Qu.:0.000   Class :character   1st Qu.: 65.23           
##  Median :0.000   Median :0.000   Mode  :character   Median :100.00           
##  Mean   :1.747   Mean   :1.119                      Mean   : 79.50           
##  3rd Qu.:3.480   3rd Qu.:2.000                      3rd Qu.:100.00           
##  Max.   :6.000   Max.   :4.000                      Max.   :100.00           
##  Cost.of.Treatment..USD. Economic.Burden..Lost.Workdays.per.Year.
##  Min.   :     0          Min.   :  0.00                          
##  1st Qu.:     0          1st Qu.:  0.00                          
##  Median :     0          Median :  0.00                          
##  Mean   : 39110          Mean   : 52.03                          
##  3rd Qu.: 76468          3rd Qu.:104.00                          
##  Max.   :159988          Max.   :179.00                          
##  Early.Diagnosis    Oral.Cancer..Diagnosis.
##  Length:84922       Length:84922           
##  Class :character   Class :character       
##  Mode  :character   Mode  :character       
##                                            
##                                            
##

# Count empty-string cells across the whole data frame. Comparing an NA cell
# with "" yields NA, so na.rm = TRUE keeps the count defined even when the
# data contains missing values.
sum(cancer == "", na.rm = TRUE)

## [1] 0

# Count missing (NA) cells across the whole data frame; 0 means none.
sum(is.na(cancer))

## [1] 0

# This confirms that there are no missing (NA) or empty-string values in this data set.

# Inspect the data set structure: dimensions, column names, column types,
# and the first few values of each column.
str(cancer)

## 'data.frame':    84922 obs. of  25 variables:
##  $ ID                                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Country                                 : chr  "Italy" "Japan" "UK" "Sri Lanka" ...
##  $ Age                                     : int  36 64 37 55 68 70 41 53 62 50 ...
##  $ Gender                                  : chr  "Female" "Male" "Female" "Male" ...
##  $ Tobacco.Use                             : chr  "Yes" "Yes" "No" "Yes" ...
##  $ Alcohol.Consumption                     : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ HPV.Infection                           : chr  "Yes" "Yes" "No" "No" ...
##  $ Betel.Quid.Use                          : chr  "No" "No" "No" "Yes" ...
##  $ Chronic.Sun.Exposure                    : chr  "No" "Yes" "Yes" "No" ...
##  $ Poor.Oral.Hygiene                       : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Diet..Fruits...Vegetables.Intake.       : chr  "Low" "High" "Moderate" "Moderate" ...
##  $ Family.History.of.Cancer                : chr  "No" "No" "No" "No" ...
##  $ Compromised.Immune.System               : chr  "No" "No" "No" "No" ...
##  $ Oral.Lesions                            : chr  "No" "No" "No" "Yes" ...
##  $ Unexplained.Bleeding                    : chr  "No" "Yes" "No" "No" ...
##  $ Difficulty.Swallowing                   : chr  "No" "No" "No" "No" ...
##  $ White.or.Red.Patches.in.Mouth           : chr  "No" "No" "Yes" "No" ...
##  $ Tumor.Size..cm.                         : num  0 1.78 3.52 0 2.83 ...
##  $ Cancer.Stage                            : int  0 1 2 0 3 2 1 0 3 3 ...
##  $ Treatment.Type                          : chr  "No Treatment" "No Treatment" "Surgery" "No Treatment" ...
##  $ Survival.Rate..5.Year....               : num  100 83.3 63.2 100 44.3 ...
##  $ Cost.of.Treatment..USD.                 : num  0 77773 101165 0 45355 ...
##  $ Economic.Burden..Lost.Workdays.per.Year.: int  0 177 130 0 52 91 105 0 136 82 ...
##  $ Early.Diagnosis                         : chr  "No" "No" "Yes" "Yes" ...
##  $ Oral.Cancer..Diagnosis.                 : chr  "No" "Yes" "Yes" "No" ...

# Skim the data set for a more detailed per-column description
# (completeness, unique counts, and distribution statistics).
skimr::skim(cancer)

Data summary
Name	cancer
Number of rows	84922
Number of columns	25
_______________________
Column type frequency:
character	18
numeric	7
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Country	1	2	12	17
Gender	1	4	6	2
Tobacco.Use	1	2	3	2
Alcohol.Consumption	1	2	3	2
HPV.Infection	1	2	3	2
Betel.Quid.Use	1	2	3	2
Chronic.Sun.Exposure	1	2	3	2
Poor.Oral.Hygiene	1	2	3	2
Diet..Fruits…Vegetables.Intake.	1	3	8	3
Family.History.of.Cancer	1	2	3	2
Compromised.Immune.System	1	2	3	2
Oral.Lesions	1	2	3	2
Unexplained.Bleeding	1	2	3	2
Difficulty.Swallowing	1	2	3	2
White.or.Red.Patches.in.Mouth	1	2	3	2
Treatment.Type	1	7	16	5
Early.Diagnosis	1	2	3	2
Oral.Cancer..Diagnosis.	1	2	3	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
ID	1	42461.50	24515.01	1	21231.25	42461.5	63691.75	84922	▇▇▇▇▇
Age	1	54.51	10.01	15	48.00	55.0	61.00	101	▁▅▇▂▁
Tumor.Size..cm.	1	1.75	2.03	0	0.00	0.0	3.48	6	▇▂▂▂▂
Cancer.Stage	1	1.12	1.34	0	0.00	0.0	2.00	4	▇▂▂▂▁
Survival.Rate..5.Year….	1	79.50	26.48	10	65.23	100.0	100.00	100	▁▂▁▂▇
Cost.of.Treatment..USD.	1	39109.88	44710.69	0	0.00	0.0	76468.44	159988	▇▂▃▂▁
Economic.Burden..Lost.Workdays.per.Year.	1	52.03	60.39	0	0.00	0.0	104.00	179	▇▂▂▂▂

# Check the class distribution of the target variable.
# The target column is `Oral.Cancer..Diagnosis.`; `cancer$Diagnosis` does not
# exist, so `$` returned NULL and the table came back empty.
table(cancer$Oral.Cancer..Diagnosis.)

## < table of extent 0 >

# Proportion of each diagnosis class. Uses the real target column
# `Oral.Cancer..Diagnosis.`; the non-existent `Diagnosis` gave numeric(0).
prop.table(table(cancer$Oral.Cancer..Diagnosis.))

## numeric(0)

# Age distribution of all patients in the data set, in 5-year bins.
# Note: this plots every patient, not only those diagnosed with oral cancer.
# `bins` is not passed: ggplot2 ignores it whenever `binwidth` is supplied,
# so specifying both was contradictory.
ggplot(cancer, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(min(cancer$Age), max(cancer$Age), by = 5)) +
  labs(title = "Age Distribution of Patients", x = "Age", y = "Count")

# Number of patients with and without an oral cancer diagnosis, split by
# tobacco use (side-by-side bars). geom_bar() plots row counts, so the y axis
# is labelled "Count" rather than "Proportion".
ggplot(cancer, aes(x = Tobacco.Use, fill = Oral.Cancer..Diagnosis.)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Effect of Tobacco Use on Oral Cancer",
    x = "Tobacco Use",
    y = "Count"
  )

# Stacked count of patients per country, coloured by oral cancer diagnosis.
# Country names on the x axis are rotated 90 degrees.
ggplot(data = cancer) +
  geom_bar(mapping = aes(x = Country, fill = Oral.Cancer..Diagnosis.)) +
  ggtitle("Oral Cancer Cases by Country") +
  xlab("Country") +
  ylab("Count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# #building a model for this dataset
# # Convert categorical variables to factors
# cancer$Oral.Cancer..Diagnosis. <- as.factor(cancer$Oral.Cancer..Diagnosis.)
# cancer <- cancer %>% mutate_if(is.character, as.factor)
# 
# # Split data into training and test sets (80/20 split)
# set.seed(123)
# trainIndex <- createDataPartition(cancer$Oral.Cancer..Diagnosis., p = 0.8, list = FALSE)
# trainData <- cancer[trainIndex, ]
# testData <- cancer[-trainIndex, ]
# head(trainIndex)
# 
# # Train model
# rf_model <- randomForest(Oral.Cancer..Diagnosis. ~ ., data = trainData, ntree = 100, mtry = 3, importance = TRUE)
# 
# # Print model summary
# print(rf_model)
# 
# # Feature importance
# varImpPlot(rf_model)
# 
# # Predictions on test set
# rf_predictions <- predict(rf_model, testData)
# 
# # Confusion Matrix
# confusionMatrix(rf_predictions, testData$Oral.Cancer..Diagnosis.)
#

International Center for Applied Mathematical Modelling and Data Analysis (ICAMMDA)

Graduate Internship Training (GIT) Presentation

Table of contents

Introduction

Section A

skill(s) Acquired

Projects

PROJECT CODE

PROJECT TWO: Cognifyz Technologies Certified Internship – Restaurant Dataset Analysis

part two

Project 3: Oral Cancer Prediction Using Random Forest in R(Kaggle data set)

Section B.