#Introduction The dataset contains transactional information from purchases made across various cities in the USA, spanning the period from January to December 2012. My focus is specifically on analyzing purchase trends during August and December, periods that coincide with the start of a new school session and the Christmas holiday season, respectively. The data was obtained from the Kaggle website at the following URL: https://www.kaggle.com/datasets/dsfelix/purchasestxt/data
library(tidyverse)
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(readr)
library(R.utils)
## Warning: package 'R.utils' was built under R version 4.3.2
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 4.3.2
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.26.0 (2024-01-24 05:12:50 UTC) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
##
## The following object is masked from 'package:R.methodsS3':
##
## throw
##
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
##
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
##
## R.utils v2.12.3 (2023-11-18 01:00:02 UTC) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following object is masked from 'package:utils':
##
## timestamp
##
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, isOpen, nullfile, parse, warnings
# Location of the raw purchase log (a delimited text file without a header row)
urlfile <- "https://media.githubusercontent.com/media/topkelama/lfsStorage/main/purchases_log.txt"
# read.delim() is read.table() with tab-delimited presets; the presets are
# written out here. Plain read.table()/read.csv() presumably misparse this
# file because they split on whitespace/commas and read.table() also treats
# "'" as a quote character (item names such as "Men's Clothing" contain one).
purchases_log <- read.table(
  file = urlfile,
  header = FALSE,
  sep = "\t",
  quote = "\"",
  dec = ".",
  fill = TRUE,
  comment.char = "",
  stringsAsFactors = FALSE
)
List first 6 rows
# Preview the first six rows of the freshly imported data
head(purchases_log, n = 6L)
## V1 V2 V3 V4 V5 V6
## 1 2012-01-01 09:00 San Jose Men's Clothing 214.05 Amex
## 2 2012-01-01 09:00 Fort Worth Women's Clothing 153.57 Visa
## 3 2012-01-01 09:00 San Diego Music 66.08 Cash
## 4 2012-01-01 09:00 Pittsburgh Pet Supplies 493.51 Discover
## 5 2012-01-01 09:00 Omaha Children's Clothing 235.63 MasterCard
## 6 2012-01-01 09:00 Stockton Men's Clothing 247.18 MasterCard
Assign with the appropriate column names
# Replace the default V1..V6 headers with descriptive column names
names(purchases_log) <- c(
  "Date", "Time", "City",
  "Purchased_Item", "Amount", "Payment_method"
)
head(purchases_log, n = 6L)
## Date Time City Purchased_Item Amount Payment_method
## 1 2012-01-01 09:00 San Jose Men's Clothing 214.05 Amex
## 2 2012-01-01 09:00 Fort Worth Women's Clothing 153.57 Visa
## 3 2012-01-01 09:00 San Diego Music 66.08 Cash
## 4 2012-01-01 09:00 Pittsburgh Pet Supplies 493.51 Discover
## 5 2012-01-01 09:00 Omaha Children's Clothing 235.63 MasterCard
## 6 2012-01-01 09:00 Stockton Men's Clothing 247.18 MasterCard
Drop the unnecessary column
# Keep every column except Time
purchases_log_cleaned <- select(purchases_log, !Time)
head(purchases_log_cleaned, n = 6L)
## Date City Purchased_Item Amount Payment_method
## 1 2012-01-01 San Jose Men's Clothing 214.05 Amex
## 2 2012-01-01 Fort Worth Women's Clothing 153.57 Visa
## 3 2012-01-01 San Diego Music 66.08 Cash
## 4 2012-01-01 Pittsburgh Pet Supplies 493.51 Discover
## 5 2012-01-01 Omaha Children's Clothing 235.63 MasterCard
## 6 2012-01-01 Stockton Men's Clothing 247.18 MasterCard
Transform the data to long format with City as key and City_name as value
# Transform data to long format.
# Every column other than Date, Amount, Purchased_Item and Payment_method
# (i.e. City) is folded into a key/value pair: Variable holds the original
# column name and Value holds its content.
# pivot_longer() is used instead of the superseded gather(); note that it
# returns a tibble.
purchases_log_long <- purchases_log_cleaned %>%
  pivot_longer(
    cols = -c(Date, Amount, Purchased_Item, Payment_method),
    names_to = "Variable",
    values_to = "Value"
  )
# Display the first few rows of the long-format data
head(purchases_log_long)
## Date Purchased_Item Amount Payment_method Variable Value
## 1 2012-01-01 Men's Clothing 214.05 Amex City San Jose
## 2 2012-01-01 Women's Clothing 153.57 Visa City Fort Worth
## 3 2012-01-01 Music 66.08 Cash City San Diego
## 4 2012-01-01 Pet Supplies 493.51 Discover City Pittsburgh
## 5 2012-01-01 Children's Clothing 235.63 MasterCard City Omaha
## 6 2012-01-01 Men's Clothing 247.18 MasterCard City Stockton
Assign appropriate names to the variable and value column.
# Give the key/value columns meaningful names:
# "Variable" becomes "City" and "Value" becomes "City_Name"
long_col_names <- names(purchases_log_long)
long_col_names[long_col_names == "Variable"] <- "City"
long_col_names[long_col_names == "Value"] <- "City_Name"
names(purchases_log_long) <- long_col_names
# Preview the renamed data
head(purchases_log_long, n = 6L)
## Date Purchased_Item Amount Payment_method City City_Name
## 1 2012-01-01 Men's Clothing 214.05 Amex City San Jose
## 2 2012-01-01 Women's Clothing 153.57 Visa City Fort Worth
## 3 2012-01-01 Music 66.08 Cash City San Diego
## 4 2012-01-01 Pet Supplies 493.51 Discover City Pittsburgh
## 5 2012-01-01 Children's Clothing 235.63 MasterCard City Omaha
## 6 2012-01-01 Men's Clothing 247.18 MasterCard City Stockton
Analysis on August and December purchase trend
# Parse the date strings into Date objects
purchases_log_long[["Date"]] <- as.Date(purchases_log_long[["Date"]])
# Keep only transactions from August (8) and December (12)
purchases_log_filtered_months <- filter(
  purchases_log_long,
  month(Date) %in% c(8, 12)
)
# Daily spend: sum Amount over all transactions that share a Date
purchases_trend <- summarise(
  group_by(purchases_log_filtered_months, Date),
  Total_Amount = sum(Amount)
)
head(purchases_trend, n = 6L)
## # A tibble: 6 × 2
## Date Total_Amount
## <date> <dbl>
## 1 2012-08-01 2847568.
## 2 2012-08-02 2828845.
## 3 2012-08-03 2843417.
## 4 2012-08-04 2836889.
## 5 2012-08-05 2806366.
## 6 2012-08-06 2841956.
Extract only month from the date
# Per-transaction amount used for the monthly totals.
# Amount is already numeric (sum(Amount) is applied to it directly when the
# daily trend is built), so it is used as-is. Stripping non-digit characters
# with gsub("\\D", "", Amount) would also remove the decimal point
# (214.05 -> 21405) and inflate every total.
# NOTE(review): any monthly totals recorded before this change were computed
# with the decimal point stripped and need to be regenerated.
purchases_log_filtered_months <- purchases_log_filtered_months %>%
  mutate(Total_Amount = as.numeric(Amount))
# Extract the month number from Date
purchases_log_filtered_months$Month <- month(purchases_log_filtered_months$Date)
# Group by month and calculate the total amount spent in each month
monthly_totals <- purchases_log_filtered_months %>%
  group_by(Month) %>%
  summarise(Total_Amount = sum(Total_Amount, na.rm = TRUE))
Total amount spent before new school session and before Christmas
# Show the total amount spent in August (8) and December (12)
print(monthly_totals)
## # A tibble: 2 × 2
## Month Total_Amount
## <dbl> <dbl>
## 1 8 7979907059
## 2 12 7967988271
# Per-month descriptive statistics of the individual purchase amounts:
# mean, median, minimum and maximum of Amount within each Month.
# The raw transactions are summarised (not the two-row monthly_totals table),
# so the result has one row of statistics per month.
monthly_stats <- purchases_log_filtered_months %>%
  group_by(Month) %>%
  summarise(
    Mean_Amount = mean(Amount, na.rm = TRUE),
    Median_Amount = median(Amount, na.rm = TRUE),
    Min_Amount = min(Amount, na.rm = TRUE),
    Max_Amount = max(Amount, na.rm = TRUE)
  )
# Print monthly statistics
print(monthly_stats)
## # A tibble: 1 × 4
## Mean_Amount Median_Amount Min_Amount Max_Amount
## <dbl> <dbl> <dbl> <dbl>
## 1 7973947665 7973947665 7967988271 7979907059
Plot the purchase trend for August
# Daily totals restricted to August
august_trend <- purchases_trend %>%
  filter(month(Date) == 8)
# Line chart of the daily totals with a LOESS smoother on top.
# The y values are divided by 10000 to keep the axis numbers small, so the
# axis label states that scaling explicitly.
ggplot(august_trend, aes(x = Date, y = Total_Amount / 10000)) +
  geom_line() +
  geom_smooth(method = "loess") +
  labs(
    title = "Trend of Total Amount Spent in August",
    x = "Date",
    y = "Total Amount (in units of 10,000)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The blue smoothed line shows that purchases decrease around
August 15th.
Plot the purchase trend for December
# Daily totals restricted to December, with incomplete rows removed
december_trend <- purchases_trend %>%
  filter(month(Date) == 12) %>%
  na.omit()
# Line chart of the daily totals with a LOESS smoother on top.
# The y values are divided by 10000 to keep the axis numbers small, so the
# axis label states that scaling explicitly.
ggplot(december_trend, aes(x = Date, y = Total_Amount / 10000)) +
  geom_line() +
  geom_smooth(method = "loess") +
  labs(
    title = "Trend of Total Amount Spent in December",
    x = "Date",
    y = "Total Amount (in units of 10,000)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
We can see that the blue smoothed line moves slightly downward after
the second week of December.
Conclusion: Based on the analysis, we observed a consistent spending pattern in August and December, with the amount spent ranging from approximately 14710000 to 14810000 during the second week of August and the first and second weeks of December. This indicates a potential correlation between spending and the festive season or the beginning of a new school session. However, further analysis is needed to confirm this correlation.
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Anna Moy intends to utilize this dataset to analyze the changes in ridership trends over time for various modes of transportation. Specifically, she is interested in studying the ridership patterns for Subway, Buses, LIRR, and Metro North across the years 2020, 2021, and 2022. Furthermore, she plans to compare the ridership between Buses and Subways, and identify the maximum and minimum ridership levels across all modes of transportation. My focus is on exploring the ridership trends for all modes of transportation during the pandemic lockdown period in NYC. The data was obtained from the following URL: https://data.ny.gov/Transportation/MTA-Daily-Ridership-Data-Beginning-2020/vxuj-8kew/data_preview
# Download the MTA daily ridership CSV and preview its first six rows
mta_ridership_url <- "https://raw.githubusercontent.com/topkelama/lfsStorage/main/MTA_Daily_Ridership_Data__Beginning_2020_20240227.csv"
mta_ridership <- read.csv(mta_ridership_url)
head(mta_ridership, n = 6L)
## Date Subways..Total.Estimated.Ridership
## 1 03/01/2020 2212965
## 2 03/02/2020 5329915
## 3 03/03/2020 5481103
## 4 03/04/2020 5498809
## 5 03/05/2020 5496453
## 6 03/06/2020 5189447
## Subways....of.Comparable.Pre.Pandemic.Day Buses..Total.Estimated.Ridership
## 1 0.97 984908
## 2 0.96 2209066
## 3 0.98 2228608
## 4 0.99 2177165
## 5 0.99 2244515
## 6 0.93 2066743
## Buses....of.Comparable.Pre.Pandemic.Day LIRR..Total.Estimated.Ridership
## 1 0.99 NA
## 2 0.99 321569
## 3 0.99 319727
## 4 0.97 311662
## 5 1.00 307597
## 6 0.92 289171
## LIRR....of.Comparable.Pre.Pandemic.Day Metro.North..Total.Estimated.Ridership
## 1 NA 55826
## 2 1.03 180702
## 3 1.02 190648
## 4 0.99 192689
## 5 0.98 194387
## 6 0.92 205056
## Metro.North....of.Comparable.Pre.Pandemic.Day
## 1 0.59
## 2 0.66
## 3 0.69
## 4 0.70
## 5 0.70
## 6 0.74
## Access.A.Ride..Total.Scheduled.Trips
## 1 19922
## 2 30338
## 3 32767
## 4 34297
## 5 33209
## 6 30970
## Access.A.Ride....of.Comparable.Pre.Pandemic.Day
## 1 1.13
## 2 1.02
## 3 1.10
## 4 1.15
## 5 1.12
## 6 1.04
## Bridges.and.Tunnels..Total.Traffic
## 1 786961
## 2 874620
## 3 882175
## 4 905558
## 5 929298
## 6 945408
## Bridges.and.Tunnels....of.Comparable.Pre.Pandemic.Day
## 1 0.98
## 2 0.95
## 3 0.96
## 4 0.98
## 5 1.01
## 6 1.03
## Staten.Island.Railway..Total.Estimated.Ridership
## 1 1636
## 2 17140
## 3 17453
## 4 17136
## 5 17203
## 6 15285
## Staten.Island.Railway....of.Comparable.Pre.Pandemic.Day
## 1 0.52
## 2 1.07
## 3 1.09
## 4 1.07
## 5 1.08
## 6 0.96
# Map each short column name (left) to the source column it comes from
# (right); only these five columns are kept, in this order.
ridership_columns <- c(
  "Date" = "Date",
  "Total_SubwEst_ride/day" = "Subways..Total.Estimated.Ridership",
  "Total_BusEst_ride/day" = "Buses..Total.Estimated.Ridership",
  "Total_LirrEst_ride/day" = "LIRR..Total.Estimated.Ridership",
  "Total_MetroNorthEst_ride/day" = "Metro.North..Total.Estimated.Ridership"
)
# Select the source columns, then apply the short names
mta_ridership_clean <- mta_ridership[, unname(ridership_columns)]
colnames(mta_ridership_clean) <- names(ridership_columns)
# Preview the trimmed, renamed data
head(mta_ridership_clean, n = 6L)
## Date Total_SubwEst_ride/day Total_BusEst_ride/day
## 1 03/01/2020 2212965 984908
## 2 03/02/2020 5329915 2209066
## 3 03/03/2020 5481103 2228608
## 4 03/04/2020 5498809 2177165
## 5 03/05/2020 5496453 2244515
## 6 03/06/2020 5189447 2066743
## Total_LirrEst_ride/day Total_MetroNorthEst_ride/day
## 1 NA 55826
## 2 321569 180702
## 3 319727 190648
## 4 311662 192689
## 5 307597 194387
## 6 289171 205056
Convert the date string to Date data type, remove the rows with NA.
# Drop every row that has a missing value in any of the kept columns
mta_ridership_clean <- na.omit(mta_ridership_clean)
# Parse the month/day/year strings into Date objects
mta_date_format <- "%m/%d/%Y"
mta_ridership_clean[["Date"]] <- as.Date(
  mta_ridership_clean[["Date"]],
  format = mta_date_format
)
# Preview the result: complete rows only, Date now a Date column
head(mta_ridership_clean, n = 6L)
## Date Total_SubwEst_ride/day Total_BusEst_ride/day
## 2 2020-03-02 5329915 2209066
## 3 2020-03-03 5481103 2228608
## 4 2020-03-04 5498809 2177165
## 5 2020-03-05 5496453 2244515
## 6 2020-03-06 5189447 2066743
## 7 2020-03-07 2814637 1249085
## Total_LirrEst_ride/day Total_MetroNorthEst_ride/day
## 2 321569 180702
## 3 319727 190648
## 4 311662 192689
## 5 307597 194387
## 6 289171 205056
## 7 106058 75839
Check the data type
# Inspect the column types of the cleaned ridership data
utils::str(mta_ridership_clean)
## 'data.frame': 1456 obs. of 5 variables:
## $ Date : Date, format: "2020-03-02" "2020-03-03" ...
## $ Total_SubwEst_ride/day : int 5329915 5481103 5498809 5496453 5189447 2814637 2120656 4973513 4867818 4697122 ...
## $ Total_BusEst_ride/day : int 2209066 2228608 2177165 2244515 2066743 1249085 957163 2124770 2111989 2112967 ...
## $ Total_LirrEst_ride/day : int 321569 319727 311662 307597 289171 106058 81565 277001 259324 245798 ...
## $ Total_MetroNorthEst_ride/day: int 180702 190648 192689 194387 205056 75839 60800 183953 179050 175074 ...
## - attr(*, "na.action")= 'omit' Named int 1
## ..- attr(*, "names")= chr "1"
Narrow down to specified date range and convert the data frame to long format
# Restrict the data to the date window 2020-03-22 .. 2020-05-07 (inclusive)
lockdown_start <- as.Date("2020-03-22")
lockdown_end <- as.Date("2020-05-07")
in_lockdown <- mta_ridership_clean$Date >= lockdown_start &
  mta_ridership_clean$Date <= lockdown_end
lockdown_period <- mta_ridership_clean[in_lockdown, ]
# Convert to long format: every column except Date becomes a
# (Transportation_Mode, Ridership) pair.
# pivot_longer() replaces the superseded gather(); cols_vary = "slowest"
# keeps all rows of one mode together, the same row order gather() produced.
# Note that pivot_longer() returns a tibble.
lockdown_long <- lockdown_period %>%
  pivot_longer(
    cols = -Date,
    names_to = "Transportation_Mode",
    values_to = "Ridership",
    cols_vary = "slowest"
  )
# Display the first few rows of the long format data
head(lockdown_long)
## Date Transportation_Mode Ridership
## 1 2020-03-22 Total_SubwEst_ride/day 408723
## 2 2020-03-23 Total_SubwEst_ride/day 709499
## 3 2020-03-24 Total_SubwEst_ride/day 741587
## 4 2020-03-25 Total_SubwEst_ride/day 690032
## 5 2020-03-26 Total_SubwEst_ride/day 680360
## 6 2020-03-27 Total_SubwEst_ride/day 656817
# Mean, median, maximum and minimum ridership for each transportation mode
summary_stats <- summarise(
  group_by(lockdown_long, Transportation_Mode),
  Mean_Ridership = mean(Ridership, na.rm = TRUE),
  Median_Ridership = median(Ridership, na.rm = TRUE),
  Max_Ridership = max(Ridership, na.rm = TRUE),
  Min_Ridership = min(Ridership, na.rm = TRUE)
)
# Show the per-mode summary table
print(summary_stats)
## # A tibble: 4 × 5
## Transportation_Mode Mean_Ridership Median_Ridership Max_Ridership
## <chr> <dbl> <int> <int>
## 1 Total_BusEst_ride/day 21416. 16377 73517
## 2 Total_LirrEst_ride/day 10021. 8816 30564
## 3 Total_MetroNorthEst_ride/day 28396. 9694 139285
## 4 Total_SubwEst_ride/day 434554. 437852 741587
## # ℹ 1 more variable: Min_Ridership <int>
# Box plot of Ridership, one box per transportation mode
ggplot(lockdown_long, aes(x = Transportation_Mode, y = Ridership)) +
  geom_boxplot() +
  ggtitle("Distribution of Ridership Across Transportation Modes") +
  ylab("Ridership")
Three of the box plots are flattened against the bottom of the chart and
convey almost nothing.
Here, I have log-transformed the ridership values to get a clearer visualization.
# Same box plot with the natural log of Ridership on the y-axis
ggplot(lockdown_long, aes(x = Transportation_Mode, y = log(Ridership))) +
  geom_boxplot() +
  ggtitle("Distribution of Log-Transformed Ridership Across Transportation Modes") +
  ylab("Log-Ridership")
The y-axis shows the natural log of ridership. Exponentiating these values
converts them back to the original scale, where the medians match those
reported in summary_stats.
To sum up, during the NYC lockdown period from March 22nd to May 7th, 2020, the Subway had the highest ridership, with an average of 434,554 passengers per day. Metro North followed with an average of 28,396 passengers, while Buses had an average of 21,416 and LIRR had the lowest at 10,021 passengers per day.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
The dataset provided offers a glimpse into global inflation rates, highlighting the changing economic landscapes of various regions. To align with Mohammed Rehman’s suggestion, I begin by examining the broader trend of inflation in South Asia, focusing specifically on developing countries such as Nepal. This analysis aims to uncover the general inflation trajectory in South Asia and, subsequently, to construct a linear regression model for Nepal. Overall, this endeavor seeks to offer concise yet comprehensive insights into South Asia’s inflation trends and provide a forecast of inflation rates in Nepal. The dataset is accessible at the following URL: https://raw.githubusercontent.com/topkelama/lfsStorage/main/global_inflation_data.csv
# Download the global inflation CSV (one row per country, one column per year)
inflation_url <- "https://raw.githubusercontent.com/topkelama/lfsStorage/main/global_inflation_data.csv"
inflationDF <- read.csv(inflation_url)
List first 6 rows
# Preview the first six countries
head(inflationDF, n = 6L)
## country_name indicator_name X1980
## 1 Afghanistan Annual average inflation (consumer prices) rate 13.4
## 2 Albania Annual average inflation (consumer prices) rate NA
## 3 Algeria Annual average inflation (consumer prices) rate 9.7
## 4 Andorra Annual average inflation (consumer prices) rate NA
## 5 Angola Annual average inflation (consumer prices) rate 46.7
## 6 Antigua and Barbuda Annual average inflation (consumer prices) rate 19.0
## X1981 X1982 X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992
## 1 22.2 18.2 15.9 20.4 8.7 -2.1 18.4 27.5 71.5 47.4 43.8 58.19
## 2 NA NA NA NA NA NA NA NA NA -0.2 35.7 226.00
## 3 14.6 6.6 7.8 6.3 10.4 14.0 5.9 5.9 9.2 9.3 25.9 31.70
## 4 NA NA NA NA NA NA NA NA NA NA NA NA
## 5 1.4 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 85.3 299.10
## 6 11.5 4.2 2.3 3.8 1.0 0.5 3.6 6.8 4.4 6.6 4.5 3.00
## X1993 X1994 X1995 X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003
## 1 33.99 20.01 14.0 14.01 14.01 14.01 14.01 0.0 -43.4 51.93 35.66
## 2 85.00 22.60 7.8 12.70 33.20 20.60 0.40 0.0 3.1 5.20 2.40
## 3 20.50 29.00 29.8 18.70 5.70 5.00 2.60 0.3 4.2 1.40 4.30
## 4 NA NA NA NA NA NA NA NA NA 3.10 3.10
## 5 1379.50 949.80 2672.2 4146.00 221.50 107.40 248.20 325.0 152.6 108.90 98.20
## 6 3.10 6.50 2.7 3.00 0.40 3.30 1.10 -0.2 1.9 2.40 2.00
## X2004 X2005 X2006 X2007 X2008 X2009 X2010 X2011 X2012 X2013 X2014 X2015 X2016
## 1 16.36 10.57 6.78 8.68 26.42 -6.81 2.18 11.8 6.44 7.39 4.67 -0.66 4.38
## 2 2.90 2.40 2.40 3.00 3.30 2.20 3.60 3.4 2.00 1.90 1.60 1.90 1.30
## 3 4.00 1.40 2.30 3.70 4.90 5.70 3.90 4.5 8.90 3.30 2.90 4.80 6.40
## 4 2.90 3.50 3.70 2.70 4.30 -1.20 1.70 2.6 1.50 0.50 -0.10 -1.10 -0.40
## 5 43.50 23.00 13.30 12.20 12.50 13.70 14.50 13.5 10.30 8.80 7.30 9.20 30.70
## 6 2.00 2.10 1.80 1.40 5.30 -0.60 3.40 3.5 3.40 1.10 1.10 1.00 -0.50
## X2017 X2018 X2019 X2020 X2021 X2022 X2023 X2024
## 1 4.98 0.63 2.3 5.44 5.06 13.71 9.1 NA
## 2 2.00 2.00 1.4 1.60 2.00 6.70 4.8 4.0
## 3 5.60 4.30 2.0 2.40 7.20 9.30 9.0 6.8
## 4 2.60 1.00 0.5 0.10 1.70 6.20 5.2 3.5
## 5 29.80 19.60 17.1 22.30 25.80 21.40 13.1 22.3
## 6 2.40 1.20 1.4 1.10 1.60 7.50 5.0 2.9
select the data from 2007 to 2024
# Keep the two identifier columns plus the year columns X2007 through X2024
inflationDF <- select(
  inflationDF,
  country_name,
  indicator_name,
  X2007:X2024
)
#inflationDF
I am interested in the inflation rate trend of South Asia
# Countries treated as South Asia in this analysis
south_asian_countries <- c(
  "Afghanistan", "Bangladesh", "Bhutan", "India",
  "Maldives", "Nepal", "Pakistan", "Sri Lanka"
)
# Keep only the rows whose country_name is in that list
df_south_asian <- filter(
  inflationDF,
  country_name %in% south_asian_countries
)
#df_south_asian
Filter the rows with NAs
# Collect the South Asian rows that have at least one missing value in the
# year columns X2007..X2024. if_any() tests all of those columns in one
# vectorised pass and returns an ordinary data frame, so no row-wise
# grouping is left attached to the result.
rows_with_na <- df_south_asian %>%
  filter(if_any(X2007:X2024, is.na))
Calculate the mean value from the row that has NA and fill the NA with mean value.
# Row-wise mean of X2007..X2024 for the rows that contain NAs; stored next
# to those rows for inspection.
rows_with_na$mean_value <- rows_with_na %>%
  ungroup() %>%
  select(X2007:X2024) %>%
  rowMeans(na.rm = TRUE)
# Row-wise mean for EVERY row of df_south_asian. ifelse() picks its
# replacement by position, so the fill vector must have one entry per row of
# the data being filled; a vector holding means only for the NA rows would be
# recycled and could hand a country another country's mean.
south_asian_row_means <- df_south_asian %>%
  select(X2007:X2024) %>%
  rowMeans(na.rm = TRUE)
# Replace each NA with the mean of the non-missing years in the same row.
# (A row with no observed years has a NaN mean and stays effectively missing.)
df_south_asian <- df_south_asian %>%
  mutate(across(X2007:X2024, ~ ifelse(is.na(.x), south_asian_row_means, .x)))
#df_south_asian
# Long format: one row per country and year column, with the column name in
# Year and the cell value in Inflation_Rate
df_south_asian_long <- pivot_longer(
  df_south_asian,
  cols = X2007:X2024,
  names_to = "Year",
  values_to = "Inflation_Rate"
)
#df_south_asian_long
Remove the ‘X’ from Year
# Strip the 'X' from the Year labels and store the result as integers
df_south_asian_long[["Year"]] <- as.integer(
  gsub("X", "", df_south_asian_long[["Year"]])
)
# Confirm the resulting column types
str(df_south_asian_long)
## tibble [144 × 4] (S3: tbl_df/tbl/data.frame)
## $ country_name : chr [1:144] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ indicator_name: chr [1:144] "Annual average inflation (consumer prices) rate" "Annual average inflation (consumer prices) rate" "Annual average inflation (consumer prices) rate" "Annual average inflation (consumer prices) rate" ...
## $ Year : int [1:144] 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 ...
## $ Inflation_Rate: num [1:144] 8.68 26.42 -6.81 2.18 11.8 ...
Plot the South Asian Inflation trend from 2007 to 2024
# Turn Year into a factor so it is plotted on a discrete axis
df_south_asian_long[["Year"]] <- as.factor(df_south_asian_long[["Year"]])
# One line per country, coloured by country, across the year levels;
# x-axis tick labels are rotated 90 degrees
ggplot(
  df_south_asian_long,
  aes(x = Year, y = Inflation_Rate, color = country_name, group = country_name)
) +
  geom_line() +
  ggtitle("Inflation Rates of South Asian Countries (2007-2024)") +
  xlab("Year") +
  ylab("Inflation Rate (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_discrete(breaks = unique(df_south_asian_long$Year))
Filter the country Nepal and put it in long format
# Keep the Nepal row(s) only
df_nepal <- filter(df_south_asian, country_name == "Nepal")
# Long format: one row per year column, with the column name in Year and the
# cell value in Inflation_Rate
df_nepal_long <- pivot_longer(
  df_nepal,
  cols = X2007:X2024,
  names_to = "Year",
  values_to = "Inflation_Rate"
)
Remove the ‘X’ prefix from the ‘Year’ column, then convert ‘Year’ to the integer data type
# Strip the 'X' from the Year labels and store the result as integers
df_nepal_long[["Year"]] <- as.integer(gsub("X", "", df_nepal_long[["Year"]]))
#df_nepal_long
# Fit a simple linear regression of inflation rate on year
lm_model <- lm(Inflation_Rate ~ as.integer(Year), data = df_nepal_long)
# Coefficients, residual summary and fit statistics
summary(lm_model)
##
## Call:
## lm(formula = Inflation_Rate ~ as.integer(Year), data = df_nepal_long)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1029 -2.2761 0.2685 1.2799 3.7514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 465.17568 197.82319 2.351 0.0318 *
## as.integer(Year) -0.22714 0.09815 -2.314 0.0343 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.16 on 16 degrees of freedom
## Multiple R-squared: 0.2508, Adjusted R-squared: 0.204
## F-statistic: 5.356 on 1 and 16 DF, p-value: 0.03427
In summary, this analysis of the global inflation dataset has delved into inflation trends in South Asia, with a particular focus on developing nations such as Nepal. Through this exploration, a predictive model for Nepal’s inflation rates has been developed, providing insights that can aid in strategic planning and decision-making for economic stability and growth in the region.