#1.ASK
#1.0 Business task
#Which parks have the highest number of visitors in a year.
#Which parks have the highest lowest number of visitors in the year.
#Which months bring in the highest and lowest number of visitors in the parks
#Find out the total revenue per month for all the parks

#1.1 Stakeholders
#stakeholders include the following
#KWS Marketing team
#Kenya Tourist board
#Data Analytics Team

#2.PREPARE
#There are 3 data sets, KWS data  which is public 
#.Data is organized in csv files

#3.PROCESS
#.For this project I choose RStudio Desktop in order to prepare, process, clean, analyze and create the visualizations.

#3.1 COLLECT & DATA WRANGLING
library(tidyverse) #tidyverse for data wrangling
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate) #load lubridate for data wrangling
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2) #load lubridate for data wrangling
library(dplyr) #load dplyr for to enable data frame manipulation
library(data.table) #load data.table for fast aggregation of large data sets
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggmosaic) #load ggmosaic for creating visualizations of categorical data
## Warning: package 'ggmosaic' was built under R version 4.1.3
library(spatial)
library(ggspatial)
## Warning: package 'ggspatial' was built under R version 4.1.3
#3.1.1.COLLECT  AND READ DATA
#Upload data here
KWS_2019<-read_csv("C:/Users/user/Desktop/DATA SETS/KWS/KWS 2019 DATA.csv")
## Rows: 372 Columns: 17
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): park/Reserve, Transport-NONPAYING, Transport-VEHICLES, TOTAL REVENU...
## dbl (7): Tourist-Resident Children, Tourist-WCK, Transport-CONCESSIONS, Tran...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
KWS_2020<-read_csv("C:/Users/user/Desktop/DATA SETS/KWS/KWS 2020 DATA.csv")
## Rows: 217 Columns: 17
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (5): park/Reserve, Tourist-Non Resident Children, Transport-VEHICLES, T...
## dbl (12): Tourist-Citizen adults, Tourist-Citizen Children, Tourist-Resident...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
KWS_2021<-read_csv("C:/Users/user/Desktop/DATA SETS/KWS/KWS 2021 DATA.csv")
## Rows: 372 Columns: 17
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (2): park/Reserve, PERIOD
## dbl (12): Tourist-Citizen adults, Tourist-Citizen Children, Tourist-Resident...
## lgl  (3): Tourist-WCK, Transport-CONCESSIONS, Transport-BOATS
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#3.1.1.CONDUCT EXPLORATORY DATA ANALYSIS
colnames(KWS_2019)#List of column names
##  [1] "park/Reserve"                  "Tourist-Citizen adults"       
##  [3] "Tourist-Citizen Children"      "Tourist-Resident adults"      
##  [5] "Tourist-Resident Children"     "Tourist-Non Resident adults"  
##  [7] "Tourist-Non Resident Children" "Tourist-WCK"                  
##  [9] "Transport-CONCESSIONS"         "Transport-NONPAYING"          
## [11] "Transport-VEHICLES"            "Transport-AIRCRAFTS"          
## [13] "Transport-BOATS"               "Transport-BICYCLES"           
## [15] "Transport-HORSES"              "TOTAL REVENUE"                
## [17] "PERIOD"
colnames(KWS_2021)
##  [1] "park/Reserve"                  "Tourist-Citizen adults"       
##  [3] "Tourist-Citizen Children"      "Tourist-Resident adults"      
##  [5] "Tourist-Resident Children"     "Tourist-Non Resident adults"  
##  [7] "Tourist-Non Resident Children" "Tourist-WCK"                  
##  [9] "Transport-CONCESSIONS"         "Transport-NONPAYING"          
## [11] "Transport-VEHICLES"            "Transport-AIRCRAFTS"          
## [13] "Transport-BOATS"               "Transport-BICYCLES"           
## [15] "Transport-HORSES"              "TOTAL REVENUE"                
## [17] "PERIOD"
colnames(KWS_2020)
##  [1] "park/Reserve"                  "Tourist-Citizen adults"       
##  [3] "Tourist-Citizen Children"      "Tourist-Resident adults"      
##  [5] "Tourist-Resident Children"     "Tourist-Non Resident adults"  
##  [7] "Tourist-Non Resident Children" "Tourist-WCK"                  
##  [9] "Transport-CONCESSIONS"         "Transport-NONPAYING"          
## [11] "Transport-VEHICLES"            "Transport-AIRCRAFTS"          
## [13] "Transport-BOATS"               "Transport-BICYCLES"           
## [15] "Transport-HORSES"              "TOTAL REVENUE"                
## [17] "PERIOD"
dim(KWS_2019)  #Dimensions of the data frame,how many rows and columns?
## [1] 372  17
dim(KWS_2020)
## [1] 217  17
dim(KWS_2021)
## [1] 372  17
str(KWS_2019)  #See list of columns and data types (numeric, character, etc)
## spec_tbl_df [372 x 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ park/Reserve                 : chr [1:372] "Aberdares National Park" "Amboseli National Park" "Central Island National Park" "Chyulu Hills National Park" ...
##  $ Tourist-Citizen adults       : num [1:372] 865 3477 46 10 4057 ...
##  $ Tourist-Citizen Children     : num [1:372] 118 231 0 8 1166 ...
##  $ Tourist-Resident adults      : num [1:372] 299 333 13 16 554 14 111 64 10 17 ...
##  $ Tourist-Resident Children    : num [1:372] 129 115 0 0 153 30 59 14 0 267 ...
##  $ Tourist-Non Resident adults  : num [1:372] 799 8153 33 11 2442 ...
##  $ Tourist-Non Resident Children: num [1:372] 54 450 0 0 247 1 354 7 0 248 ...
##  $ Tourist-WCK                  : num [1:372] NA NA NA NA NA NA NA NA NA NA ...
##  $ Transport-CONCESSIONS        : num [1:372] NA NA NA NA NA NA NA NA NA NA ...
##  $ Transport-NONPAYING          : chr [1:372] NA NA NA NA ...
##  $ Transport-VEHICLES           : chr [1:372] NA NA NA NA ...
##  $ Transport-AIRCRAFTS          : num [1:372] NA NA NA NA NA NA NA NA NA NA ...
##  $ Transport-BOATS              : num [1:372] NA NA NA NA NA NA NA NA NA NA ...
##  $ Transport-BICYCLES           : num [1:372] NA NA NA NA NA NA NA NA NA NA ...
##  $ Transport-HORSES             : num [1:372] NA NA NA NA NA NA NA NA NA NA ...
##  $ TOTAL REVENUE                : chr [1:372] "6139933.00" "72614095.00" NA NA ...
##  $ PERIOD                       : chr [1:372] "1/1/2019" "1/1/2019" "1/1/2019" "1/1/2019" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `park/Reserve` = col_character(),
##   ..   `Tourist-Citizen adults` = col_number(),
##   ..   `Tourist-Citizen Children` = col_number(),
##   ..   `Tourist-Resident adults` = col_number(),
##   ..   `Tourist-Resident Children` = col_double(),
##   ..   `Tourist-Non Resident adults` = col_number(),
##   ..   `Tourist-Non Resident Children` = col_number(),
##   ..   `Tourist-WCK` = col_double(),
##   ..   `Transport-CONCESSIONS` = col_double(),
##   ..   `Transport-NONPAYING` = col_character(),
##   ..   `Transport-VEHICLES` = col_character(),
##   ..   `Transport-AIRCRAFTS` = col_double(),
##   ..   `Transport-BOATS` = col_double(),
##   ..   `Transport-BICYCLES` = col_double(),
##   ..   `Transport-HORSES` = col_double(),
##   ..   `TOTAL REVENUE` = col_character(),
##   ..   PERIOD = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(KWS_2020)
## spec_tbl_df [217 x 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ park/Reserve                 : chr [1:217] "Aberdares National Park" "Amboseli National Park" "Central Island National Park" "Chyulu Hills National Park" ...
##  $ Tourist-Citizen adults       : num [1:217] 858 3503 18 18 2835 ...
##  $ Tourist-Citizen Children     : num [1:217] 129 264 0 3 965 ...
##  $ Tourist-Resident adults      : num [1:217] 223 391 6 14 418 20 197 87 28 11 ...
##  $ Tourist-Resident Children    : num [1:217] 141 171 0 0 79 0 56 19 0 271 ...
##  $ Tourist-Non Resident adults  : num [1:217] 1216 8701 5 19 1703 ...
##  $ Tourist-Non Resident Children: chr [1:217] "142.00" "558.00" "0.00" "5.00" ...
##  $ Tourist-WCK                  : num [1:217] 0 0 0 0 201 0 0 2 0 0 ...
##  $ Transport-CONCESSIONS        : num [1:217] 0 0 0 0 0 0 22 0 0 NA ...
##  $ Transport-NONPAYING          : num [1:217] 0 0 0 0 0 0 0 3 0 16 ...
##  $ Transport-VEHICLES           : chr [1:217] "463.00" "2742.00" "-" "21.00" ...
##  $ Transport-AIRCRAFTS          : num [1:217] 0 40 0 0 0 0 0 0 0 2 ...
##  $ Transport-BOATS              : num [1:217] 0 0 7 0 0 0 335 4 8 0 ...
##  $ Transport-BICYCLES           : num [1:217] 0 0 0 0 1372 ...
##  $ Transport-HORSES             : num [1:217] 0 0 0 0 0 0 0 0 0 0 ...
##  $ TOTAL REVENUE                : chr [1:217] "11998081.00" "76736730.00" "23660.00" "109520.00" ...
##  $ PERIOD                       : chr [1:217] "1/1/2020" "1/1/2020" "1/1/2020" "1/1/2020" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `park/Reserve` = col_character(),
##   ..   `Tourist-Citizen adults` = col_double(),
##   ..   `Tourist-Citizen Children` = col_double(),
##   ..   `Tourist-Resident adults` = col_double(),
##   ..   `Tourist-Resident Children` = col_double(),
##   ..   `Tourist-Non Resident adults` = col_double(),
##   ..   `Tourist-Non Resident Children` = col_character(),
##   ..   `Tourist-WCK` = col_double(),
##   ..   `Transport-CONCESSIONS` = col_double(),
##   ..   `Transport-NONPAYING` = col_double(),
##   ..   `Transport-VEHICLES` = col_character(),
##   ..   `Transport-AIRCRAFTS` = col_double(),
##   ..   `Transport-BOATS` = col_double(),
##   ..   `Transport-BICYCLES` = col_double(),
##   ..   `Transport-HORSES` = col_double(),
##   ..   `TOTAL REVENUE` = col_character(),
##   ..   PERIOD = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(KWS_2021)
## spec_tbl_df [372 x 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ park/Reserve                 : chr [1:372] "Aberdares National Park" "Amboseli National Park" "Central Island National Park" "Chyulu Hills National Park" ...
##  $ Tourist-Citizen adults       : num [1:372] 2386 6658 0 0 10145 ...
##  $ Tourist-Citizen Children     : num [1:372] 532 2435 0 0 6667 ...
##  $ Tourist-Resident adults      : num [1:372] 290 791 0 0 124 0 483 18 0 388 ...
##  $ Tourist-Resident Children    : num [1:372] 124 223 0 0 82 0 164 1 0 107 ...
##  $ Tourist-Non Resident adults  : num [1:372] 509 4022 0 0 1473 ...
##  $ Tourist-Non Resident Children: num [1:372] 53 584 0 0 136 0 185 2 0 272 ...
##  $ Tourist-WCK                  : logi [1:372] NA NA NA NA NA NA ...
##  $ Transport-CONCESSIONS        : logi [1:372] NA NA NA NA NA NA ...
##  $ Transport-NONPAYING          : num [1:372] NA NA NA NA NA NA NA NA NA NA ...
##  $ Transport-VEHICLES           : num [1:372] 699 2748 0 0 2795 ...
##  $ Transport-AIRCRAFTS          : num [1:372] 0 107 0 0 0 0 0 0 0 0 ...
##  $ Transport-BOATS              : logi [1:372] NA NA NA NA NA NA ...
##  $ Transport-BICYCLES           : num [1:372] 0 0 0 0 1646 ...
##  $ Transport-HORSES             : num [1:372] NA NA NA NA NA NA NA NA NA NA ...
##  $ TOTAL REVENUE                : num [1:372] 3687692 35285935 0 0 9908574 ...
##  $ PERIOD                       : chr [1:372] "12/1/2021" "12/1/2021" "12/1/2021" "12/1/2021" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `park/Reserve` = col_character(),
##   ..   `Tourist-Citizen adults` = col_double(),
##   ..   `Tourist-Citizen Children` = col_double(),
##   ..   `Tourist-Resident adults` = col_double(),
##   ..   `Tourist-Resident Children` = col_double(),
##   ..   `Tourist-Non Resident adults` = col_double(),
##   ..   `Tourist-Non Resident Children` = col_double(),
##   ..   `Tourist-WCK` = col_logical(),
##   ..   `Transport-CONCESSIONS` = col_logical(),
##   ..   `Transport-NONPAYING` = col_double(),
##   ..   `Transport-VEHICLES` = col_double(),
##   ..   `Transport-AIRCRAFTS` = col_double(),
##   ..   `Transport-BOATS` = col_logical(),
##   ..   `Transport-BICYCLES` = col_double(),
##   ..   `Transport-HORSES` = col_double(),
##   ..   `TOTAL REVENUE` = col_double(),
##   ..   PERIOD = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
#Columns Transport non paying,transport vehicles,Total revenue are currently in character data type
#We will change this columns to numeric
#We will also change period column to date data type
# STEP 4: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS
# Convert the Transport non paying,transport vehicles, total revenue,Tourist non resident
# tourist wck,transport concessions,transport boat columns to numeric data type
# # Convert the period column to date data type
KWS_2019<-KWS_2019%>%
  mutate(`Transport-NONPAYING`=as.double(`Transport-NONPAYING`),
         `Transport-VEHICLES`=as.double(`Transport-VEHICLES`),
         `TOTAL REVENUE`=as.double(`TOTAL REVENUE`))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
KWS_2020<-KWS_2020%>%
  mutate(`Transport-VEHICLES`=as.double(`Transport-VEHICLES`),
         `TOTAL REVENUE`=as.double(`TOTAL REVENUE`),
         `Tourist-Non Resident Children`=as.double(`Tourist-Non Resident Children`))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
KWS_2021<-KWS_2021%>%
  mutate(`Tourist-WCK`=as.double(`Tourist-WCK`),
         `Transport-CONCESSIONS`=as.double(`Transport-CONCESSIONS`),
         `Transport-BOATS`=as.double(`Transport-BOATS`))

# Convert the period column to date data type for all the data sets
KWS_2019$PERIOD<-as.Date(KWS_2019$PERIOD, format = "%m/%d/%Y")
KWS_2020$PERIOD<-as.Date(KWS_2020$PERIOD, format = "%m/%d/%Y")
KWS_2021$PERIOD<-as.Date(KWS_2021$PERIOD, format = "%m/%d/%Y")

#Inspect the data frame to make the data is analysis ready
view(KWS_2019)
view(KWS_2020)
view(KWS_2021)
#There are null values in all the three data sets, replace null values with zero to enable calculations.
KWS_2019 <- KWS_2019 %>% 
  mutate(`Tourist-WCK` = coalesce(`Tourist-WCK`, 0),
         `Transport-CONCESSIONS` = coalesce(`Transport-CONCESSIONS`, 0),
         `Transport-NONPAYING` = coalesce(`Transport-NONPAYING`, 0),
         `Transport-VEHICLES` = coalesce(`Transport-VEHICLES`, 0),
         `Transport-AIRCRAFTS` = coalesce(`Transport-AIRCRAFTS`, 0),
         `Transport-BOATS` = coalesce(`Transport-BOATS`, 0),
         `Transport-BICYCLES` = coalesce(`Transport-BICYCLES`, 0),
         `Transport-HORSES` = coalesce(`Transport-HORSES`, 0),
         `TOTAL REVENUE` = coalesce(`TOTAL REVENUE`, 0))

KWS_2020 <- KWS_2020 %>% 
  mutate(`Tourist-WCK` = coalesce(`Tourist-WCK`, 0),
         `Transport-CONCESSIONS` = coalesce(`Transport-CONCESSIONS`, 0),
         `Transport-NONPAYING` = coalesce(`Transport-NONPAYING`, 0),
         `Transport-VEHICLES` = coalesce(`Transport-VEHICLES`, 0),
         `Transport-AIRCRAFTS` = coalesce(`Transport-AIRCRAFTS`, 0),
         `Transport-BOATS` = coalesce(`Transport-BOATS`, 0),
         `Transport-BICYCLES` = coalesce(`Transport-BICYCLES`, 0),
         `Transport-HORSES` = coalesce(`Transport-HORSES`, 0),
         `TOTAL REVENUE` = coalesce(`TOTAL REVENUE`, 0))

KWS_2021 <- KWS_2021 %>% 
  mutate(`Tourist-WCK` = coalesce(`Tourist-WCK`, 0),
         `Transport-CONCESSIONS` = coalesce(`Transport-CONCESSIONS`, 0),
         `Transport-NONPAYING` = coalesce(`Transport-NONPAYING`, 0),
         `Transport-VEHICLES` = coalesce(`Transport-VEHICLES`, 0),
         `Transport-AIRCRAFTS` = coalesce(`Transport-AIRCRAFTS`, 0),
         `Transport-BOATS` = coalesce(`Transport-BOATS`, 0),
         `Transport-BICYCLES` = coalesce(`Transport-BICYCLES`, 0),
         `Transport-HORSES` = coalesce(`Transport-HORSES`, 0),
         `TOTAL REVENUE` = coalesce(`TOTAL REVENUE`, 0))

#There are null value in the first 5 columns,we need to remove this so as to be able to explore the data.
KWS_2019<-KWS_2019%>%
  drop_na(c("Tourist-Citizen adults","Tourist-Citizen Children","Tourist-Resident adults","Tourist-Resident Children"
            ,"Tourist-Non Resident adults","Tourist-Non Resident Children"))
  
KWS_2020<-KWS_2020%>%
  drop_na(c("Tourist-Citizen adults","Tourist-Citizen Children","Tourist-Resident adults","Tourist-Resident Children"
            ,"Tourist-Non Resident adults","Tourist-Non Resident Children"))     

KWS_2021<-KWS_2021%>%
  drop_na(c("Tourist-Citizen adults","Tourist-Citizen Children","Tourist-Resident adults","Tourist-Resident Children"
            ,"Tourist-Non Resident adults","Tourist-Non Resident Children"))



# 4: CONDUCT DESCRIPTIVE ANALYSIS
#Which category of tourists make the most visits in the park in a year.
#Visits in 2019
Total_tourist19<-KWS_2019%>%
  summarise(sum(`Tourist-Citizen adults`),sum(`Tourist-Citizen Children`),sum(`Tourist-Resident adults`),
            sum(`Tourist-Non Resident Children`),sum(`Tourist-Non Resident adults`),sum(`Tourist-Non Resident Children`),
            sum(`Tourist-WCK`))
#Visits in 2020
Total_tourist20<-KWS_2020%>%
  summarise(sum(`Tourist-Citizen adults`),sum(`Tourist-Citizen Children`),sum(`Tourist-Resident adults`),
            sum(`Tourist-Non Resident Children`),sum(`Tourist-Non Resident adults`),sum(`Tourist-Non Resident Children`),
            sum(`Tourist-WCK`))
#Visits in 2019
Total_tourist21<-KWS_2021%>%
  summarise(sum(`Tourist-Citizen adults`),sum(`Tourist-Citizen Children`),sum(`Tourist-Resident adults`),
            sum(`Tourist-Non Resident Children`),sum(`Tourist-Non Resident adults`),sum(`Tourist-Non Resident Children`),
            sum(`Tourist-WCK`))


#The Highest category of visits to all the parks in Kenya is by Tourist-Citizen children
#Lets visualize this
#FirSt let us create a data frame from the Total tourist data set that will allow us to plot a bar graph
#Year 2019 visualization
data19=data.frame(Category = c('Tourist-Citizen adults', "Tourist-Citizen Children", "Tourist-Resident adults",
               "Tourist-Non Resident Children", "Tourist-Non Resident adults", "Tourist-WCK"),
  Total_Visits = c(758586,840230,54607,52990,548061,88664))

ggplot(data19,aes(x=Category,y=Total_Visits))+
  geom_bar(stat="identity",fill="steelblue")+
  labs(x="Category",y="Total_Visits")+
  scale_y_continuous()

#year 2020 visualization
data20=data.frame(Category = c('Tourist-Citizen adults', "Tourist-Citizen Children", "Tourist-Resident adults",
                             "Tourist-Non Resident Children", "Tourist-Non Resident adults", "Tourist-WCK"),
                Total_Visits = c(   222683,101032,22181,11555,130682,4495))
ggplot(data20,aes(x=Category,y=Total_Visits,fill=Category))+
  geom_bar(stat="identity")+
  labs(x="Category",y="Total_Visits")+
  scale_y_continuous()

#year 2021 visualization
data21=data.frame(Category = c('Tourist-Citizen adults', "Tourist-Citizen Children", "Tourist-Resident adults",
                               "Tourist-Non Resident Children", "Tourist-Non Resident adults", "Tourist-WCK"),
                  Total_Visits = c( 645218,322789,50310,94731,172351,0))

ggplot(data21,aes(x=Category,y=Total_Visits,fill=Category))+
  geom_bar(stat="identity")+
  labs(x="Category",y="Total_Visits")+
  scale_y_continuous()

#Which parks have the highest & lowest number of visitors in a year.
#Create another column called Total visits to capture sum of visists from all categories of visitors
KWS_2019<-KWS_2019%>%
  mutate(alltourist=`Tourist-Citizen adults`+`Tourist-Citizen Children`+`Tourist-Non Resident adults`,`Tourist-Resident Children`+
           `Tourist-Non Resident adults`+`Tourist-Non Resident Children`+`Tourist-WCK`)

Most_visited_parks19<-KWS_2019%>%
  group_by(`park/Reserve`)%>%
  summarise(alltouristv=max(alltourist))

Most_visited_parks19<-Most_visited_parks19%>%
  mutate(park=`park/Reserve`)

ggplot(Most_visited_parks19, aes(x = park, y = alltouristv)) +
  geom_bar(stat = "identity", fill = "steelblue")+
  labs(x = "park", y = "Visitor Count", title = "Visitors to Parks/Reserves2019") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()

#In 2019 the park with highest number of visitors are as follows
 #1.Kisumu Impala sanctuary-106,132,2.Nairobi mini orphanage-88,114,3.Lake Nakuru national park-42,773
#in 2019 the park with the lowest number of visitors are as follow
  #1.South Island national park-30,2.Central Island national park-79,3.Sibiloi Nationalpark-89
 
  KWS_2020<-KWS_2020%>%
  mutate(alltourist=`Tourist-Citizen adults`+`Tourist-Citizen Children`+`Tourist-Non Resident adults`,`Tourist-Resident Children`+
           `Tourist-Non Resident adults`+`Tourist-Non Resident Children`+`Tourist-WCK`)

Most_visited_parks20<-KWS_2020%>%
  group_by(`park/Reserve`)%>%
  summarise(alltouristv=max(alltourist)) 

Most_visited_parks20<-Most_visited_parks20%>%
  mutate(park=`park/Reserve`)

ggplot(Most_visited_parks20, aes(x = park, y = alltouristv)) +
  geom_bar(stat = "identity", fill = "steelblue")+
  labs(x = "park", y = "Visitor Count", title = "Visitors to Parks/Reserves2020") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()

#In 2020 the park with highest number of visitors are as follows:
 #1.Nairobi National Park-22,806,2.Tsavo National East Park-20,104,3.Amboseli National Park-14,828
#In 2020 the park with lowest number of visitors are as follows:
 #1.South Island National park-7,2.Central Island Park-23,3.Chyulu hills national park

KWS_2021<-KWS_2021%>%
  mutate(alltourist=`Tourist-Citizen adults`+`Tourist-Citizen Children`+`Tourist-Non Resident adults`,`Tourist-Resident Children`+
           `Tourist-Non Resident adults`+`Tourist-Non Resident Children`+`Tourist-WCK`)

Most_visited_parks21<-KWS_2021%>%
  group_by(`park/Reserve`)%>%
  summarise(alltouristv=max(alltourist))

str(Most_visited_parks21)
## tibble [31 x 2] (S3: tbl_df/tbl/data.frame)
##  $ park/Reserve: chr [1:31] "Aberdares National Park" "Amboseli National Park" "Central Island National Park" "Chyulu Hills National Park" ...
##  $ alltouristv : num [1:31] 3639 13115 0 0 18285 ...
Most_visited_parks21<-Most_visited_parks21%>%
  mutate(park=`park/Reserve`)

ggplot(Most_visited_parks21, aes(x = park, y = alltouristv)) +
  geom_bar(stat = "identity", fill = "steelblue")+
  labs(x = "park", y = "Visitor Count", title = "Visitors to Parks/Reserves2021") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()

#In 2021 the park with highest number of visitors are as follows:
 #1.Nairobi mini orphanage-44,510,2.Kisumu Impala sanctuary-43,326, 3.Nairobi National park-24,535
#In 2021 the park with lowest number of visitors are as follows:
 #1.Sibiloi par, 2.South Island, 3.Chyulu hills national park
 
##3.Which months bring in the highest and lowest number of visitors in the parks 
#YEAR 2019
KWS_2019_2<-KWS_2019%>%
  collapse("month")%>%
  group_by(PERIOD)%>%
  summarize(sumtourist=sum(alltourist))

KWS_2019_2%>%
  arrange(sumtourist)
## # A tibble: 12 x 2
##    PERIOD     sumtourist
##    <date>          <dbl>
##  1 2019-05-01      94379
##  2 2019-01-01     122048
##  3 2019-11-01     126566
##  4 2019-10-01     131466
##  5 2019-02-01     136461
##  6 2019-09-01     140677
##  7 2019-04-01     142779
##  8 2019-03-01     158858
##  9 2019-12-01     181260
## 10 2019-06-01     201464
## 11 2019-08-01     326703
## 12 2019-07-01     384216
#The months with the highest number of visitors in 2019 are as follows:
#1.July-384,216 2.August-326,703  3.June-201,464

#The months with the lowest number of visitors in 2019 are as follows:
#1.May-94,379  2.January-122,048  3.November-126,566


#Visualize the data
ggplot(KWS_2019_2, aes(x = PERIOD, y = sumtourist)) +
  geom_line() +
  geom_point() +
  labs(x = "Period", y = "Sum of Tourists", title = "Tourist Data") +
  theme_minimal()+
  scale_y_continuous()

#YEAR 2020
KWS_2020_2<-KWS_2020%>%
  collapse("month")%>%
  group_by(PERIOD)%>%
  summarize(sumtourist=sum(alltourist))

KWS_2020_2%>%
  arrange(sumtourist)
## # A tibble: 7 x 2
##   PERIOD     sumtourist
##   <date>          <dbl>
## 1 2020-04-01       9350
## 2 2020-05-01      24390
## 3 2020-06-01      35957
## 4 2020-07-01      44707
## 5 2020-03-01      76057
## 6 2020-01-01     125467
## 7 2020-02-01     138469
#The months with the highest number of visitors in 2020 are as follows:
#1.February-138,469 2.January-125,467  3.March-76,057

#The months with the lowest number of visitors in 2020 are as follows:
#1.April-9,350  2.May-24,390  3.June-35,957

#Visualize the data
  ggplot(KWS_2020_2, aes(x = PERIOD, y = sumtourist)) +
  geom_line() +
  geom_point() +
  labs(x = "Period", y = "Sum of Tourists", title = "Tourist Data") +
  theme_minimal()

  scale_y_continuous()
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1
#YEAR 2021
  KWS_2021_2<-KWS_2021%>%
    collapse("month")%>%
    group_by(PERIOD)%>%
    summarize(sumtourist=sum(alltourist))
  
  KWS_2021_2%>%
    arrange(sumtourist)
## # A tibble: 12 x 2
##    PERIOD     sumtourist
##    <date>          <dbl>
##  1 2021-04-01      24445
##  2 2021-02-01      56797
##  3 2021-03-01      59760
##  4 2021-05-01      68716
##  5 2021-06-01      69746
##  6 2021-01-01      75347
##  7 2021-09-01      85299
##  8 2021-07-01      98914
##  9 2021-08-01     100403
## 10 2021-11-01     123370
## 11 2021-10-01     134144
## 12 2021-12-01     243417
  #The months with the highest number of visitors in 2021 are as follows:
  #1.December-243,417 2.October-134,144  3.November-123,370
  
  #The months with the lowest number of visitors in 2021 are as follows:
  #1.April-24,445  2.February-56,797  3.March-59,760
  
  
  #Visualize the data
  ggplot(KWS_2021_2, aes(x = PERIOD, y = sumtourist)) +
    geom_line() +
    geom_point() +
    labs(x = "Period", y = "Sum of Tourists", title = "Tourist Data") +
    theme_minimal()+
    scale_y_continuous()

#Find out the total revenue per month for all the parks
  KWS_REVENUE19<-KWS_2019%>%
    group_by(`park/Reserve`)%>%
    summarize(Tot_rev=sum(`TOTAL REVENUE`))%>%
    rename("park"="park/Reserve")
  
  ggplot(KWS_REVENUE19, aes(x = park, y = Tot_rev)) +
    geom_bar(stat = "identity", fill = "steelblue")+
    labs(x = "park", y = "Total revenue", title = "Total revenue per park 2020") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    coord_flip()

  #year2020
  KWS_REVENUE20<-KWS_2020%>%
    group_by(`park/Reserve`)%>%
    summarize(Tot_rev=sum(`TOTAL REVENUE`))%>%
    rename("park"="park/Reserve")
    
#Visusalize Total revenue
  ggplot(KWS_REVENUE20, aes(x = park, y = Tot_rev)) +
    geom_bar(stat = "identity", fill = "steelblue")+
    labs(x = "park", y = "Total revenue", title = "Total revenue per park 2020") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    coord_flip()

  #year2021
  KWS_REVENUE21<-KWS_2021%>%
    group_by(`park/Reserve`)%>%
    summarize(Tot_rev=sum(`TOTAL REVENUE`))%>%
    rename("park"="park/Reserve")
  
  #Visusalize Total revenue
  ggplot(KWS_REVENUE21, aes(x = park, y = Tot_rev)) +
    geom_bar(stat = "identity", fill = "steelblue")+
    labs(x = "park", y = "Total revenue", title = "Total revenue per park 2020") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    coord_flip()

  #Results:
  #1.The Highest category of visits to all the parks in Kenya in 2019,2020 $ 2021 is by Tourist-Citizen children
  
  #2.Which parks have the highest and lowest number of visitors in the year
  #In 2019 the park with highest number of visitors are as follows:
  #1.Kisumu Impala sanctuary-106,132,2.Nairobi mini orphanage-88,114,3.Lake Nakuru national park-42,773
  #in 2019 the park with the lowest number of visitors are as follow
  #.South Island national park-30,2.Central Island national park-79,3.Sibiloi Nationalpark-89
  
  #In 2020 the park with highest number of visitors are as follows:
  #1.Nairobi National Park-22,806,2.Tsavo National East Park-20,104,3.Amboseli National Park-14,828
  #In 2020 the park with lowest number of visitors are as follows:
  #.1.South Island Nation 2.Central Island Park-23,3.Chyulu hills national park
  
  #In 2021 the park with highest number of visitors are as follows:
  #1.Nairobi mini orphanage-44,510,2.Kisumu Impala sanctuary-43,326, 3.Nairobi National park-24,535
  #In 2021 the park with lowest number of visitors are as follows:
  #1.Sibiloi par, 2.South Island, 3.Chyulu hills national park
  
  #3.Which months bring in the highest and lowest number of visitors in the parks
  
  #The months with the highest number of visitors in 2019 are as follows:
  #1.July-384,216 2.August-326,703  3.June-201,464
  
  #The months with the lowest number of visitors in 2019 are as follows:
  #1.May-94,379  2.January-122,048  3.November-126,566
  
  #The months with the highest number of visitors in 2020 are as follows:
  #1.February-138,469 2.January-125,467  3.March-76,057
  
  #The months with the lowest number of visitors in 2020 are as follows:
  #1.April-9,350  2.May-24,390  3.June-35,957
  
  #The months with the highest number of visitors in 2021 are as follows:
  #1.December-243,417 2.October-134,144  3.November-123,370
  
  #The months with the lowest number of visitors in 2021 are as follows:
  #1.April-24,445  2.February-56,797  3.March-59,760