Telangana

Data Description

# Combine the per-year CSV exports into one file per visitor type.
# Yearly inputs are read from ./<prefix>_<year>.csv and the combined result
# is written to ./<prefix>.csv.
# NOTE(review): `years` is not defined in this section; it is assumed to be
# set earlier in the document -- confirm.

input_path <- "./%s_%s.csv"

domestic_path <- "domestic_visitors/domestic_visitors"
foreign_path <- "foreign_visitors/foreign_visitors"

# Read every yearly file for one path prefix and stack the rows in a single
# rbind() call instead of growing a data frame inside a loop.
# The leading empty data frame keeps the result a data.frame even when
# `years` is empty.
read_all_years <- function(path_prefix) {
  yearly_data <- lapply(years, function(year) {
    read.csv(sprintf(input_path, path_prefix, year))
  })
  do.call(rbind, c(list(data.frame()), yearly_data))
}

all_domestic_data <- read_all_years(domestic_path)
all_foreign_data <- read_all_years(foreign_path)

output_path <- "./%s.csv"

write.csv(all_domestic_data, sprintf(output_path, domestic_path), row.names = FALSE)
write.csv(all_foreign_data, sprintf(output_path, foreign_path), row.names = FALSE)
# Read the combined domestic and foreign visitor files
domestic_vis <- read.csv(file = "./domestic_visitors/domestic_visitors.csv")
foreign_vis <- read.csv(file = "./foreign_visitors/foreign_visitors.csv")

# Preview the first rows of each data set
head(domestic_vis)
##   district       date    month year visitors
## 1 Adilabad 01-01-2016  January 2016   792136
## 2 Adilabad 01-02-2016 February 2016   937820
## 3 Adilabad 01-03-2016    March 2016   582946
## 4 Adilabad 01-04-2016    April 2016   341948
## 5 Adilabad 01-05-2016      May 2016   252887
## 6 Adilabad 01-06-2016     June 2016   368237
head(foreign_vis)
##   district       date    month year visitors
## 1 Adilabad 01-01-2016  January 2016        2
## 2 Adilabad 01-02-2016 February 2016        0
## 3 Adilabad 01-03-2016    March 2016        2
## 4 Adilabad 01-04-2016    April 2016        0
## 5 Adilabad 01-05-2016      May 2016        0
## 6 Adilabad 01-06-2016     June 2016        0

Data Cleaning

In the domestic visitors data, there are 345 null values out of 1512 values.

So, 22.82% of the values are null.

In the foreign visitors data, there are 320 null values out of 1512 values.

So, 21.16% of the values are null.

# Keep only the domestic rows whose visitor count is missing, to see where
# the gaps sit in the data
null_vis <- subset(domestic_vis, is.na(visitors))


# Number of missing entries for every district/year combination
count_data <- summarize(group_by(null_vis, district, year), count = n())

# count_data %>% as_tibble() %>% print(n=40)


# Stacked bar chart: one bar per district, one segment per year, with the
# count printed in the middle of each segment
dom_plot <- ggplot(
  data = count_data,
  mapping = aes(x = district, y = count, fill = as.factor(year))
) +
  geom_bar(stat = "identity", color = "black") +
  geom_text(
    mapping = aes(label = count, group = year),
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Domestic Missing Values", x = "District", y = "Count") +
  scale_fill_discrete(name = "Year") +
  scale_y_continuous(breaks = seq(0, 50, by = 12)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Keep only the foreign rows whose visitor count is missing
null_vis_fore <- subset(foreign_vis, is.na(visitors))


# Number of missing entries for every district/year combination
count_data_fore <- summarize(group_by(null_vis_fore, district, year), count = n())

# Stacked bar chart with the same layout as the domestic one
fore_plot <- ggplot(
  data = count_data_fore,
  mapping = aes(x = district, y = count, fill = as.factor(year))
) +
  geom_bar(stat = "identity", color = "black") +
  geom_text(
    mapping = aes(label = count, group = year),
    position = position_stack(vjust = 0.5)
  ) +
  labs(title = "Foreign Missing Values", x = "District", y = "Count") +
  scale_fill_discrete(name = "Year") +
  scale_y_continuous(breaks = seq(0, 50, by = 12)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Display both charts
dom_plot

fore_plot

Cleaning and Imputing the significant values

#' Find groups whose share of missing values exceeds a threshold
#'
#' @param data A data frame.
#' @param group_columns Character vector naming the columns to group by.
#' @param value_column Unquoted name of the column checked for `NA`.
#' @param threshold Groups are kept only when their missing percentage is
#'   strictly greater than this value (default 25).
#' @return An ungrouped tibble with the grouping columns plus `total_rows`,
#'   `missing_values` and `missing_percentage`, restricted to the groups
#'   above `threshold`.
calculate_missing_percentage <- function(data, group_columns, value_column,
                                         threshold = 25) {
  data %>%
    group_by(across(all_of(group_columns))) %>%
    # Row count and NA count per group; drop the grouping so the result is a
    # plain tibble regardless of how many grouping columns were supplied
    summarize(
      total_rows = n(),
      missing_values = sum(is.na({{ value_column }})),
      .groups = "drop"
    ) %>%
    mutate(missing_percentage = (missing_values / total_rows) * 100) %>%
    # Keep only the groups whose missing share is above the threshold
    filter(missing_percentage > threshold)
}
# Groups with a high share of missing visitor counts: per district for both
# data sets, and per month/year for the domestic data
result_district <- domestic_vis %>%
  calculate_missing_percentage(group_columns = "district", value_column = visitors)
result_foreign <- foreign_vis %>%
  calculate_missing_percentage(group_columns = "district", value_column = visitors)
result_district_year_month <- domestic_vis %>%
  calculate_missing_percentage(group_columns = c("month", "year"), value_column = visitors)
result_district
## # A tibble: 5 × 4
##   district      total_rows missing_values missing_percentage
##   <chr>              <int>          <int>              <dbl>
## 1 "Medchal "            48             48              100  
## 2 "Narayanapet"         12              6               50  
## 3 "Ranga Reddy"         48             48              100  
## 4 "Suryapet"            48             33               68.8
## 5 "Vikarabad"           48             48              100
result_foreign
## # A tibble: 3 × 4
##   district      total_rows missing_values missing_percentage
##   <chr>              <int>          <int>              <dbl>
## 1 "Medchal "            48             48                100
## 2 "Ranga Reddy"         48             48                100
## 3 "Vikarabad"           48             48                100

We will delete the insignificant districts from the data, i.e. districts in which 75% or more of the values are missing.

# Districts to remove: those with at least 75% missing visitor counts
districts_above_threshold_d <- subset(result_district, missing_percentage >= 75)$district
districts_above_threshold_f <- subset(result_foreign, missing_percentage >= 75)$district

# Domestic data without the districts listed above, and the number of
# missing values that remain
cleaned_domestic_vis <- filter(domestic_vis, !district %in% districts_above_threshold_d)
sum(is.na(cleaned_domestic_vis$visitors))
## [1] 201
# Foreign data without the districts listed above, and the number of
# missing values that remain
cleaned_foreign_vis <- filter(foreign_vis, !district %in% districts_above_threshold_f)
sum(is.na(cleaned_foreign_vis$visitors))
## [1] 176
#' Plot visitor distributions before and after handling missing values
#'
#' Draws two 30-bin histograms of `visitors` side by side and prints the
#' combined figure.
#'
#' @param data_with_na Data frame shown in the left panel ("With Null Values").
#' @param data_without_na Data frame shown in the right panel
#'   ("Without Null Values").
#' @param main1 Optional title used for both panels. When `NULL`, each panel
#'   keeps its own descriptive title (previously `ggtitle(NULL)` removed it).
#' @return The combined cowplot figure, invisibly (the value of `print()`).
plot_before_after_NA <- function(data_with_na, data_without_na, main1 = NULL) {
  # One histogram panel; `drop_na` is forwarded to geom_histogram(na.rm = )
  build_histogram <- function(data, default_title, drop_na) {
    panel_title <- if (is.null(main1)) default_title else main1
    ggplot(data = data, aes(x = visitors)) +
      geom_histogram(fill = "skyblue", color = "black", bins = 30, na.rm = drop_na) +
      labs(title = panel_title, x = "Visitors") +
      theme_minimal()
  }

  df_with_na <- build_histogram(
    data_with_na,
    "Distribution of Visitors with Null Values",
    drop_na = FALSE
  )
  df_without_na <- build_histogram(
    data_without_na,
    "Distribution of Visitors after removing Null Values",
    drop_na = TRUE
  )

  # Faceting on a constant string is used only to put a strip label on each
  # panel
  combined_plot <- cowplot::plot_grid(
    df_with_na + facet_wrap(~ "With Null Values"),
    df_without_na + facet_wrap(~ "Without Null Values"),
    ncol = 2
  )

  # Display the combined plot
  print(combined_plot)
}

# Compare each original data set with its cleaned counterpart
plot_before_after_NA(
  data_with_na = domestic_vis,
  data_without_na = cleaned_domestic_vis,
  main1 = "Distribution plot for domestic_visitors"
)

plot_before_after_NA(
  data_with_na = foreign_vis,
  data_without_na = cleaned_foreign_vis,
  main1 = "Distribution plot for foreign_visitors"
)

# Share of visitor counts still missing in the cleaned domestic data
total_rows_clean_d <- nrow(cleaned_domestic_vis)
total_missing_clean_d <- sum(is.na(cleaned_domestic_vis[["visitors"]]))
percentage_missing_clean_d <- total_missing_clean_d / total_rows_clean_d * 100

# Share of visitor counts still missing in the cleaned foreign data
total_rows_clean_f <- nrow(cleaned_foreign_vis)
total_missing_clean_f <- sum(is.na(cleaned_foreign_vis[["visitors"]]))
percentage_missing_clean_f <- total_missing_clean_f / total_rows_clean_f * 100
# paste(percentage_missing_clean_d,"Remaining missing values for domestic_vis")
# paste(percentage_missing_clean_f,"Remaining missing values for foreign_vis")

But even after removing those districts, 14.69% of the domestic visitor values are still null.

Similarly, even after removing those districts, 12.87% of the foreign visitor values are still null.

So we will impute the data with the Mean with respect to districts.

And then we will again plot distribution of data to make sure imputation doesn’t change the distribution of original data.

# mean_visitors_d <- mean(cleaned_domestic_vis$visitors, na.rm = TRUE)
# mean_visitors_f <- mean(cleaned_foreign_vis$visitors, na.rm = TRUE)
# 
# cleaned_domestic_vis$visitors[is.na(cleaned_domestic_vis$visitors)] <- mean_visitors_d
# cleaned_foreign_vis$visitors[is.na(cleaned_foreign_vis$visitors)] <- mean_visitors_f
# 
# imputed_data_d = cleaned_domestic_vis
# imputed_data_f = cleaned_foreign_vis

# imputed_data_d <- cleaned_domestic_vis %>%
#   group_by(district) %>%
#   mutate(visitors = ifelse(is.na(visitors), mean(visitors, na.rm = TRUE), visitors))
# 
# imputed_data_f <- cleaned_foreign_vis %>%
#   group_by(district) %>%
#   mutate(visitors = ifelse(is.na(visitors), mean(visitors, na.rm = TRUE), visitors))

# imputed_data_d <- cleaned_domestic_vis %>%
#   group_by(district, year) %>%
#   mutate(visitors = ifelse(is.na(visitors), mean(visitors, na.rm = TRUE), visitors))
# 
# imputed_data_f <- cleaned_foreign_vis %>%
#   group_by(district, year) %>%
#   mutate(visitors = ifelse(is.na(visitors), mean(visitors, na.rm = TRUE), visitors))

#' Impute missing visitor counts district by district
#'
#' Missing `visitors` values are replaced with the rounded mean of the same
#' district's values in `reference_year`. If a district has no usable
#' observation in that year (the mean would be NaN), the mean of all of the
#' district's available values is used instead, so that districts without
#' reference-year data are no longer left unimputed.
#'
#' @param data Data frame with `district`, `year` and `visitors` columns.
#' @param reference_year Year whose district mean is preferred (default 2017).
#' @return `data` with `visitors` imputed, ungrouped.
impute_visitors_by_district <- function(data, reference_year = 2017) {
  data %>%
    group_by(district) %>%
    mutate(visitors = {
      reference_mean <- mean(visitors[year == reference_year], na.rm = TRUE)
      # mean() of an empty / all-NA selection is NaN: fall back to the
      # district's overall mean in that case
      fill_value <- if (is.na(reference_mean)) {
        mean(visitors, na.rm = TRUE)
      } else {
        reference_mean
      }
      ifelse(is.na(visitors), round(fill_value), visitors)
    }) %>%
    ungroup()
}

imputed_data_d <- impute_visitors_by_district(cleaned_domestic_vis)

imputed_data_f <- impute_visitors_by_district(cleaned_foreign_vis)


# Compare the distributions before and after imputation
plot_before_after_NA(
  data_with_na = cleaned_domestic_vis,
  data_without_na = imputed_data_d,
  main1 = "Distribution for Imputed Domestic_vis"
)

plot_before_after_NA(
  data_with_na = cleaned_foreign_vis,
  data_without_na = imputed_data_f,
  main1 = "Distribution for Imputed Foreign_vis"
)

# Count what is still missing, then continue with the imputed domestic data
total_missing_values_d <- sum(is.na(imputed_data_d[["visitors"]]))
paste(total_missing_values_d, "null values in domestic_vis")
## [1] "6 null values in domestic_vis"
domestic_vis <- imputed_data_d


# Same check for the foreign data
total_missing_values_f <- sum(is.na(imputed_data_f[["visitors"]]))
paste(total_missing_values_f, "null values in foreign_vis")
## [1] "0 null values in foreign_vis"
foreign_vis <- imputed_data_f

Data Analysis

Top 10 Districts with highest visitors

#' Top 10 districts by total visitors, with a bar chart
#'
#' @param visitors_df Data frame with `district` and `visitors` columns.
#' @param type Visitor type. For `"domestic"` (the default) the totals are
#'   converted to millions; any other value leaves raw counts. The value is
#'   also used (capitalised) in the chart title.
#' @return A list with `df` (the ten districts with the largest totals,
#'   sorted in descending order) and `bar_plot` (a ggplot bar chart of `df`).
top_10_districts <- function(visitors_df, type = "domestic") {
  is_domestic <- type == "domestic"

  # Total visitors per district, largest first, top ten only
  top_10_districts_df <- visitors_df %>%
    group_by(district) %>%
    summarise(total_visitors = sum(visitors, na.rm = TRUE)) %>%
    arrange(desc(total_visitors)) %>%
    head(10)

  # Only domestic totals are expressed in millions
  if (is_domestic) {
    top_10_districts_df$total_visitors <- top_10_districts_df$total_visitors / 1000000
  }

  # Title and axis label follow `type` instead of always saying
  # "Domestic ... (in Millions)"
  type_label <- paste0(toupper(substring(type, 1, 1)), substring(type, 2))
  plot_title <- sprintf("Top 10 Districts by Total %s Visitors", type_label)
  y_label <- if (is_domestic) "Total Visitors (in Millions)" else "Total Visitors"

  bar_plot <- ggplot(
    top_10_districts_df,
    aes(x = reorder(district, -total_visitors), y = total_visitors)
  ) +
    geom_bar(stat = "identity", fill = "skyblue", color = "black") +
    # scale_y_continuous(labels = scales::comma) +
    labs(title = plot_title, x = "District", y = y_label) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  list("df" = top_10_districts_df, "bar_plot" = bar_plot)
}
# Domestic: show the bar chart
dom_top_10_dis <- top_10_districts(visitors_df = domestic_vis)
print(dom_top_10_dis[["bar_plot"]])

# Foreign: show the table of totals
fore_top_10_dis <- top_10_districts(visitors_df = foreign_vis, type = "foreign")
print(fore_top_10_dis[["df"]])
## # A tibble: 10 × 2
##    district                   total_visitors
##    <chr>                               <dbl>
##  1 "Hyderabad"                       1044898
##  2 "Warangal (Urban)"                   8821
##  3 "Mahbubnagar"                        2282
##  4 "Jayashankar Bhoopalpally"           1684
##  5 "Jogulamba Gadwal "                  1170
##  6 "Nagarkurnool "                       995
##  7 "Mulugu"                              575
##  8 "Warangal (Rural)"                    540
##  9 "Adilabad"                             32
## 10 "Mancherial"                           10

Top and Bottom 3 CAGR districts for domestic visitors

# Total domestic visitors for every district/year combination
total_vis_year <- summarise(
  group_by(domestic_vis, district, year),
  visitors = sum(visitors, na.rm = TRUE)
)

# Districts that have rows for the year 2016
districts <- domestic_vis %>%
  filter(year == 2016) %>%
  distinct(district) %>%
  pull(district)

#' Compound annual growth rate of domestic visitors, 2016 to 2019
#'
#' Reads the yearly totals from the global `total_vis_year` table.
#'
#' @param district_name Name of the district to look up.
#' @return A named character vector `c(district = , cagr = )`. `cagr` is the
#'   percentage rounded to two decimals, `0` when the 2016 total is zero, and
#'   `NA` when either yearly total is absent or missing (previously this
#'   raised an error or produced a malformed result).
calculate_cagr <- function(district_name) {
  # Total visitors of this district in one year (length 0 if no such row)
  yearly_total <- function(target_year) {
    total_vis_year %>%
      filter(district == district_name & year == target_year) %>%
      pull(visitors)
  }

  fv <- yearly_total(2019)
  pv <- yearly_total(2016)

  # Without exactly one non-missing value per year the rate is undefined
  if (length(fv) != 1 || length(pv) != 1 || is.na(fv) || is.na(pv)) {
    return(c(district = district_name, cagr = NA_character_))
  }

  if (pv != 0) {
    # Three yearly periods lie between 2016 and 2019
    cagr <- ((fv / pv)^(1 / 3) - 1) * 100
    c(district = district_name, cagr = round(cagr, 2))
  } else {
    c(district = district_name, cagr = 0)
  }
}

# Compute the CAGR for every district and bind the results row-wise
cagr_list <- map_df(.x = districts, .f = calculate_cagr)

# Plain data frame with the growth rate converted from text to numeric
district_cagr <- as.data.frame(cagr_list)
district_cagr[["cagr"]] <- as.numeric(district_cagr[["cagr"]])

# Three highest and three lowest growth rates
top3_districts_cagr <- district_cagr %>%
  arrange(desc(cagr)) %>%
  head(n = 3)
bottom3_districts_cagr <- district_cagr %>%
  arrange(cagr) %>%
  head(n = 3)

# Print the results
Top 3 Districts by CAGR:
##                district  cagr
## 1             Nizamabad 93.03
## 2 Bhadradri Kothagudem  47.93
## 3      Warangal (Rural) 40.45

Bottom 3 Districts by CAGR:
##           district   cagr
## 1      Karimnagar  -79.63
## 2         Nalgonda -71.13
## 3 Warangal (Urban) -58.86

Peak and Low Season for Hyderabad District

#' Month-wise visitor totals for one district, with peak and low months
#'
#' @param vis_df Data frame with `district`, `month` and `visitors` columns.
#' @param district_name District to analyse (default "Hyderabad").
#' @return A list with `df` (total visitors per month), `max_vis` (row or
#'   rows with the highest total) and `min_vis` (row or rows with the lowest
#'   total).
month_wise_hyb <- function(vis_df, district_name = "Hyderabad") {
  # Missing counts are ignored, as in the other aggregations in this
  # document, so a single NA cannot blank out a whole month
  month_wise_hyb_df <- vis_df %>%
    filter(district == district_name) %>%
    group_by(district, month) %>%
    summarize(visitors = sum(visitors, na.rm = TRUE), .groups = "drop")

  month_with_max_vis <- month_wise_hyb_df %>%
    filter(visitors == max(visitors))

  month_with_min_vis <- month_wise_hyb_df %>%
    filter(visitors == min(visitors))

  list(
    "df" = month_wise_hyb_df,
    "max_vis" = month_with_max_vis,
    "min_vis" = month_with_min_vis
  )
}

# Monthly summaries for both visitor types
month_wise_hyb_dom <- domestic_vis %>% month_wise_hyb()
month_wise_hyb_fore <- foreign_vis %>% month_wise_hyb()

# Month with the maximum visitors
month_with_max_vis_dom <- month_wise_hyb_dom[["max_vis"]]
month_with_max_vis_fore <- month_wise_hyb_fore[["max_vis"]]

# Month with the minimum visitors
# NOTE(review): "mix" in the foreign variable name looks like a typo for
# "min"; the name is kept because other code may refer to it
month_with_min_vis_dom <- month_wise_hyb_dom[["min_vis"]]
month_with_mix_vis_fore <- month_wise_hyb_fore[["min_vis"]]
Month with the highest domestic visitors: June, visitors: 16.898 Million

Month with the lowest domestic visitors: February, visitors: $5.01443 \times 10^{6}$ (about 5.014 Million)


Month with the highest foreign visitors: December, visitors: 0.12 Million

Month with the lowest foreign visitors: May, visitors: $6.0376 \times 10^{4}$ (about 0.06 Million)
# Create a mapping of month names to numeric values
month_mapping <- c("January" = 1, "February" = 2, "March" = 3, "April" = 4,
                   "May" = 5, "June" = 6, "July" = 7, "August" = 8,
                   "September" = 9, "October" = 10, "November" = 11, "December" = 12)

# Add a numeric 'month_num' column (position of the month name in the
# mapping) so the months can be plotted in calendar order
add_month_number <- function(monthly_df) {
  mutate(monthly_df, month_num = match(month, names(month_mapping)))
}

# Line-and-point chart of monthly visitors with month names on the x-axis
plot_monthly_visitors <- function(monthly_df, plot_title) {
  ggplot(monthly_df, aes(x = month_num, y = visitors)) +
    geom_line() +
    geom_point(alpha = 1) +
    labs(title = plot_title, x = "Month", y = "Visitors") +
    scale_x_continuous(breaks = 1:12, labels = names(month_mapping)) + # Customize x-axis labels
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# From here on the two names hold only the monthly data frames (the `df`
# element of the earlier result lists)
month_wise_hyb_dom <- add_month_number(month_wise_hyb_dom$df)
month_wise_hyb_fore <- add_month_number(month_wise_hyb_fore$df)

dom_plot <- plot_monthly_visitors(
  month_wise_hyb_dom,
  "Total Monthly Domestic Visitors in Hyderabad"
)

# Title typo fixed: "TOtal" -> "Total"
fore_plot <- plot_monthly_visitors(
  month_wise_hyb_fore,
  "Total Monthly Foreign Visitors in Hyderabad"
)

print(dom_plot)

print(fore_plot)

Top and Bottom 3 districts in terms of domestic to foreign tourist ratio

# Pair every domestic row with the foreign row for the same district, date,
# month and year, and give the two visitor columns descriptive names
dom_for_v <- domestic_vis %>%
  inner_join(foreign_vis, by = c("district", "date", "month", "year")) %>%
  rename(domestic_vis = visitors.x, foreign_vis = visitors.y)

# Per-district totals, restricted to districts with at least some foreign
# visitors (a zero total would make the ratio undefined), plus the
# domestic-to-foreign ratio
dom_for_v_dis_sum <- dom_for_v %>%
  group_by(district) %>%
  summarise(
    domestic_vis = sum(domestic_vis, na.rm = TRUE),
    foreign_vis = sum(foreign_vis, na.rm = TRUE)
  ) %>%
  filter(foreign_vis != 0) %>%
  mutate(dom_foreign_ratio = domestic_vis / foreign_vis)

# "Bottom" here means the three largest ratios and "top" the three smallest
bottom_3_districts <- dom_for_v_dis_sum %>%
  select(district, dom_foreign_ratio) %>%
  arrange(desc(dom_foreign_ratio)) %>%
  head(n = 3)
top_3_districts <- dom_for_v_dis_sum %>%
  select(district, dom_foreign_ratio) %>%
  arrange(dom_foreign_ratio) %>%
  head(n = 3)


print(bottom_3_districts)
## # A tibble: 3 × 2
##   district   dom_foreign_ratio
##   <chr>                  <dbl>
## 1 "Nirmal"            8309803 
## 2 "Jangaon "           475280.
## 3 "Adilabad"           228799.
# Bar chart of the three districts with the largest domestic-to-foreign
# ratio, shown in millions. Title typo fixed: "Wrost" -> "Worst".
ggplot(bottom_3_districts, aes(x = reorder(district, -dom_foreign_ratio), y = dom_foreign_ratio / 1000000)) +
  geom_bar(stat = "identity") +
  labs(title = "Bottom 3 Districts ( Worst domestic to foreign ratio)",
       x = "District",
       y = "Domestic to Foreign ratio (in Millions)")

print(top_3_districts)
## # A tibble: 3 × 2
##   district         dom_foreign_ratio
##   <chr>                        <dbl>
## 1 Hyderabad                     80.3
## 2 Warangal (Rural)            1717. 
## 3 Mulugu                      3165.
# Bar chart of the three districts with the smallest domestic-to-foreign
# ratio, bars ordered from the largest ratio to the smallest
ggplot(
  data = top_3_districts,
  mapping = aes(x = reorder(district, -dom_foreign_ratio), y = dom_foreign_ratio)
) +
  geom_bar(stat = "identity") +
  labs(
    x = "District",
    y = "Domestic to Foreign ratio",
    title = "Top 3 Districts ( Best domestic to foreign ratio)"
  )

Population to Tourist footfall ratio

# District population figures, with a preview of the first rows
population <- read.csv(file = "./population.csv")
head(population)
##                   district Population_2011 Population_2019
## 1                 Adilabad          708972          755637
## 2    Bhadradri Kothagudem          1069261         1139641
## 3                Hyderabad         3943323         4202876
## 4                 Jagtial           985417         1050278
## 5                 Jangaon           566376          603655
## 6 Jayashankar Bhoopalpally          711434          758261
# Keep only the 2019 rows of the joined domestic/foreign data
dom_for_v_dis_2019 <- filter(dom_for_v, year == '2019')

# Total domestic and foreign visitors per district in 2019
dom_for_v_dis_sum_2019 <- dom_for_v_dis_2019 %>%
  group_by(district) %>%
  summarise(
    domestic_vis = sum(domestic_vis, na.rm = TRUE),
    foreign_vis = sum(foreign_vis, na.rm = TRUE)
  )

# Attach the population figures by district name
data_with_population <- left_join(dom_for_v_dis_sum_2019, population, by = c('district'))

# Combined footfall (domestic + foreign) for districts that had any
# visitors, and its ratio to the 2019 population
dom_for_v_dis_sum_pop <- data_with_population %>%
  mutate(dom_foreign_sum = domestic_vis + foreign_vis) %>%
  filter(dom_foreign_sum != 0) %>%
  mutate(dom_foreign_pop_ratio = dom_foreign_sum / Population_2019)

# Five largest ratios
dom_for_v_dis_sum_pop_top5 <- dom_for_v_dis_sum_pop %>%
  select(district, dom_foreign_pop_ratio) %>%
  arrange(desc(dom_foreign_pop_ratio)) %>%
  head(n = 5)

# Five smallest ratios
dom_for_v_dis_sum_pop_bottom5 <- dom_for_v_dis_sum_pop %>%
  select(district, dom_foreign_pop_ratio) %>%
  arrange(dom_foreign_pop_ratio) %>%
  head(n = 5)

dom_for_v_dis_sum_pop_top5
## # A tibble: 5 × 2
##   district                dom_foreign_pop_ratio
##   <chr>                                   <dbl>
## 1 "Rajanna Sircilla "                     28.6 
## 2 "Bhadradri Kothagudem "                 11.2 
## 3 "Medak "                                 6.67
## 4 "Yadadri Bhongir"                        5.70
## 5 "Nirmal"                                 5.05
dom_for_v_dis_sum_pop_bottom5
## # A tibble: 5 × 2
##   district                 dom_foreign_pop_ratio
##   <chr>                                    <dbl>
## 1 "Kamareddy "                          0.000515
## 2 "Peddapalli"                          0.0196  
## 3 "Nizamabad"                           0.0277  
## 4 "Komaram Bheem Asifabad"              0.0349  
## 5 "Karimnagar "                         0.0723
# Bar chart of the five districts with the highest tourists-to-population
# ratio (stray ")" removed from the y-axis label)
ggplot(dom_for_v_dis_sum_pop_top5, aes(x = reorder(district, -dom_foreign_pop_ratio), y = dom_foreign_pop_ratio)) +
  geom_bar(stat = "identity", fill = "lightgreen", color = "black") +
  labs(title = "Top 5 Districts ( Best tourists to population ratio)",
       x = "District",
       y = "Tourists to population ratio")

# Bar chart of the five districts with the lowest tourists-to-population
# ratio; the title previously said "Best" (copy-paste error)
ggplot(dom_for_v_dis_sum_pop_bottom5, aes(x = reorder(district, -dom_foreign_pop_ratio), y = dom_foreign_pop_ratio)) +
  geom_bar(stat = "identity", fill = "#ffcccb", color = "black") +
  labs(title = "Bottom 5 Districts ( Worst tourists to population ratio)",
       x = "District",
       y = "Tourists to population ratio")

Conclusion

Hyderabad

Hyderabad is a major economic and cultural hub in South India, attracting a large number of domestic and foreign tourists to its historical landmarks and business centers. Must-visit places:

  • Charminar
  • Golconda Fort
  • Qutb Shahi Tombs
  • Ramoji Film City
Mancherial

Mancherial has strong contributions from coal mining and diverse industrial development, along with a favourable climate for agriculture and significant infrastructure

  • Coal Mines and Singareni Collieries Company Limited (SCCL) areas
  • Kawal Tiger Reserve
  • Sri Satyanarayana Swamy Temple
Rajanna Sircilla

Rajanna Sircilla has a high tourist-footfall-to-population ratio, which can be attributed to the handloom industry, Kargil Park, the historical Sircilla Fort, and its high population.

  • Sircilla Textile Park
  • Sri Raja Rajeshwara Swamy Devasthanam
  • Ananthagiri Fort
Mulugu

Mulugu is a district with a high ratio of domestic tourists to foreign tourists, primarily due to its popularity as a weekend getaway spot for locals and the presence of famous Hindu temples.

  • Ramappa Temple
  • Bogatha Waterfall
  • Medaram Jathara