ad <- read.csv("C:/Users/user/Downloads/advertising.csv") #Loading the dataset
head(ad) #previewing the first 5 elements of the data
## Daily.Time.Spent.on.Site Age Area.Income Daily.Internet.Usage
## 1 68.95 35 61833.90 256.09
## 2 80.23 31 68441.85 193.77
## 3 69.47 26 59785.94 236.50
## 4 74.15 29 54806.18 245.89
## 5 68.37 35 73889.99 225.58
## 6 59.99 23 59761.56 226.74
## Ad.Topic.Line City Male Country
## 1 Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2 Monitored national standardization West Jodi 1 Nauru
## 3 Organic bottom-line service-desk Davidton 0 San Marino
## 4 Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5 Robust logistical utilization South Manuel 0 Iceland
## 6 Sharable client-driven software Jamieberg 1 Norway
## Timestamp Clicked.on.Ad
## 1 2016-03-27 00:53:11 0
## 2 2016-04-04 01:39:02 0
## 3 2016-03-13 20:35:42 0
## 4 2016-01-10 02:31:19 0
## 5 2016-06-03 03:36:18 0
## 6 2016-05-19 14:30:17 0
## Daily.Time.Spent.on.Site Age Area.Income
## "numeric" "integer" "numeric"
## Daily.Internet.Usage Ad.Topic.Line City
## "numeric" "character" "character"
## Male Country Timestamp
## "integer" "character" "character"
## Clicked.on.Ad
## "integer"
ad$Timestamp <- as.POSIXct(ad$Timestamp, "%Y-%m-%d %H:%M:%S",tz = "GMT")
duplicates <- ad[duplicated(ad),] #storing duplicates in a table called "duplicates"
duplicates #previewing the table
## [1] Daily.Time.Spent.on.Site Age Area.Income
## [4] Daily.Internet.Usage Ad.Topic.Line City
## [7] Male Country Timestamp
## [10] Clicked.on.Ad
## <0 rows> (or 0-length row.names)
colSums(is.na(ad)) #Checking the total number of null values in each column
## Daily.Time.Spent.on.Site Age Area.Income
## 0 0 0
## Daily.Internet.Usage Ad.Topic.Line City
## 0 0 0
## Male Country Timestamp
## 0 0 0
## Clicked.on.Ad
## 0
names(ad) #Displaying column names
## [1] "Daily.Time.Spent.on.Site" "Age"
## [3] "Area.Income" "Daily.Internet.Usage"
## [5] "Ad.Topic.Line" "City"
## [7] "Male" "Country"
## [9] "Timestamp" "Clicked.on.Ad"
names(ad) <- gsub("[.]", "_", names(ad)) #Replacing "." with "_"
names(ad) #Displaying column names
## [1] "Daily_Time_Spent_on_Site" "Age"
## [3] "Area_Income" "Daily_Internet_Usage"
## [5] "Ad_Topic_Line" "City"
## [7] "Male" "Country"
## [9] "Timestamp" "Clicked_on_Ad"
summary(ad)
## Daily_Time_Spent_on_Site Age Area_Income Daily_Internet_Usage
## Min. :32.60 Min. :19.00 Min. :13996 Min. :104.8
## 1st Qu.:51.36 1st Qu.:29.00 1st Qu.:47032 1st Qu.:138.8
## Median :68.22 Median :35.00 Median :57012 Median :183.1
## Mean :65.00 Mean :36.01 Mean :55000 Mean :180.0
## 3rd Qu.:78.55 3rd Qu.:42.00 3rd Qu.:65471 3rd Qu.:218.8
## Max. :91.43 Max. :61.00 Max. :79485 Max. :270.0
## Ad_Topic_Line City Male Country
## Length:1000 Length:1000 Min. :0.000 Length:1000
## Class :character Class :character 1st Qu.:0.000 Class :character
## Mode :character Mode :character Median :0.000 Mode :character
## Mean :0.481
## 3rd Qu.:1.000
## Max. :1.000
## Timestamp Clicked_on_Ad
## Min. :2016-01-01 02:52:10 Min. :0.0
## 1st Qu.:2016-02-18 02:55:42 1st Qu.:0.0
## Median :2016-04-07 17:27:29 Median :0.5
## Mean :2016-04-10 10:34:06 Mean :0.5
## 3rd Qu.:2016-05-31 03:18:14 3rd Qu.:1.0
## Max. :2016-07-24 00:22:16 Max. :1.0
describe(ad)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad
## Daily_Time_Spent_on_Site 1 1000 65.00 15.85 68.22 65.74 17.92
## Age 2 1000 36.01 8.79 35.00 35.51 8.90
## Area_Income 3 1000 55000.00 13414.63 57012.30 56038.94 13316.62
## Daily_Internet_Usage 4 1000 180.00 43.90 183.13 179.99 58.61
## Ad_Topic_Line* 5 1000 500.50 288.82 500.50 500.50 370.65
## City* 6 1000 487.32 279.31 485.50 487.51 356.57
## Male 7 1000 0.48 0.50 0.00 0.48 0.00
## Country* 8 1000 116.41 69.94 114.50 115.82 89.70
## Timestamp 9 1000 NaN NA NA NaN NA
## Clicked_on_Ad 10 1000 0.50 0.50 0.50 0.50 0.74
## min max range skew kurtosis se
## Daily_Time_Spent_on_Site 32.60 91.43 58.83 -0.37 -1.10 0.50
## Age 19.00 61.00 42.00 0.48 -0.41 0.28
## Area_Income 13996.50 79484.80 65488.30 -0.65 -0.11 424.21
## Daily_Internet_Usage 104.78 269.96 165.18 -0.03 -1.28 1.39
## Ad_Topic_Line* 1.00 1000.00 999.00 0.00 -1.20 9.13
## City* 1.00 969.00 968.00 0.00 -1.19 8.83
## Male 0.00 1.00 1.00 0.08 -2.00 0.02
## Country* 1.00 237.00 236.00 0.08 -1.23 2.21
## Timestamp Inf -Inf -Inf NA NA NA
## Clicked_on_Ad 0.00 1.00 1.00 0.00 -2.00 0.02
From the “summary()” and “describe()” functions, the following measures of central tendency can be gathered:
mode <- function(v){
uniq <- unique(v)
uniq[which.max(tabulate(match(v,uniq)))]
}
The most recurrent Ad Topic Line:
## [1] "Cloned 5thgeneration orchestration"
The most recurrent City:
## [1] "Lisamouth"
The most recurrent Country:
## [1] "Czech Republic"
From the plot, the modal age is 31.
Checking the distribution in terms of gender where 1 is Male and 0 is Female:
## gender
## 0 1
## 519 481
From this, there are More women than men, making female the modal gender.
# scatterplot
plot((ad$Daily_Time_Spent_on_Site), (ad$Age),
main = "A scatterplot of Time Spent on site against age",
xlab = 'Time spent',
ylab = 'Age')
# scatterplot of Time on site vs income
plot((ad$Daily_Time_Spent_on_Site), (ad$Area_Income),
main = "A scatterplot of Time Spent on site against income",
xlab = 'Time Spent on Site',
ylab = 'Income')
# scatterplot of Time on site vs Internet usage
plot((ad$Daily_Time_Spent_on_Site), (ad$Daily_Internet_Usage),
main = "A scatterplot of Time Spent on site against Daily Internet Usage",
xlab = 'Time Spent on Site',
ylab = 'Daily Internet Usage')
# Heat map
# Checking the relationship between the variables
# Using Numeric variables only
numeric_tbl <- ad %>%
select_if(is.numeric) %>%
select(Daily_Time_Spent_on_Site, Age, Area_Income,Daily_Internet_Usage)
# Calculate the correlations
corr <- cor(numeric_tbl, use = "complete.obs")
ggcorrplot(round(corr, 2),
type = "full", lab = T)
# Analysis of people who click on the ads
ad_click <- ad[which(ad$Clicked_on_Ad == 1),] # Creating a new dataset that only has those who clicked on an ad
# Most popular age group of people clicking on ads
hist((ad_click$Age),
main = "Histogram of Age of those who click ads",
xlab = 'Age',
ylab = 'Frequency',
col = "blue")
gender2 <- (ad_click$Male)
gender2.frequency <- table(gender2)
gender2.frequency
## gender2
## 0 1
## 269 231
# plotting to visualize the gender distribution
barplot(gender2.frequency,
main="A bar chart showing Gender of those who clicked",
xlab="Gender(0 = Female, 1 = Male)",
ylab = "Frequency",
col=c("darkblue","red"),
)
# scatterplot
plot((ad_click$Daily_Time_Spent_on_Site), (ad_click$Age),
main = "A scatterplot of Time Spent on site and clicked ad against age",
xlab = 'Time spent',
ylab = 'Age')
# scatterplot of Time on site vs income
plot((ad_click$Daily_Time_Spent_on_Site), (ad_click$Area_Income),
main = "A scatterplot of Time Spent on site and ad clicked against income",
xlab = 'Time Spent on Site',
ylab = 'Income')
# scatterplot of Time on site vs Internet usage
plot((ad_click$Daily_Time_Spent_on_Site), (ad_click$Daily_Internet_Usage),
main = "A scatterplot of Time Spent on site and ad clicked against Daily Internet Usage",
xlab = 'Time Spent on Site',
ylab = 'Daily Internet Usage')
# Heat map
# Checking the relationship between the variables
# Using Numeric variables only
numeric_tbl <- ad_click %>%
select_if(is.numeric) %>%
select(Daily_Time_Spent_on_Site, Age, Area_Income,Daily_Internet_Usage)
# Calculate the correlations
corr <- cor(numeric_tbl, use = "complete.obs")
ggcorrplot(round(corr, 2),
type = "full", lab = T)
There is low correlation between the numerical variables.
The country with the most ad clicks:
mode(ad_click$Country)
## [1] "Australia"
mode(ad_click$Area_Income)
## [1] 24593.33
## [1] "Reactive local challenge"
create_report(ad)
##
##
## processing file: report.rmd
##
|
| | 0%
|
|.. | 2%
## inline R code fragments
##
##
|
|... | 5%
## label: global_options (with options)
## List of 1
## $ include: logi FALSE
##
##
|
|..... | 7%
## ordinary text without R code
##
##
|
|....... | 10%
## label: introduce
##
|
|........ | 12%
## ordinary text without R code
##
##
|
|.......... | 14%
## label: plot_intro
##
|
|............ | 17%
## ordinary text without R code
##
##
|
|............. | 19%
## label: data_structure
##
|
|............... | 21%
## ordinary text without R code
##
##
|
|................. | 24%
## label: missing_profile
##
|
|.................. | 26%
## ordinary text without R code
##
##
|
|.................... | 29%
## label: univariate_distribution_header
##
|
|...................... | 31%
## ordinary text without R code
##
##
|
|....................... | 33%
## label: plot_histogram
##
|
|......................... | 36%
## ordinary text without R code
##
##
|
|........................... | 38%
## label: plot_density
##
|
|............................ | 40%
## ordinary text without R code
##
##
|
|.............................. | 43%
## label: plot_frequency_bar
##
|
|................................ | 45%
## ordinary text without R code
##
##
|
|................................. | 48%
## label: plot_response_bar
##
|
|................................... | 50%
## ordinary text without R code
##
##
|
|..................................... | 52%
## label: plot_with_bar
##
|
|...................................... | 55%
## ordinary text without R code
##
##
|
|........................................ | 57%
## label: plot_normal_qq
##
|
|.......................................... | 60%
## ordinary text without R code
##
##
|
|........................................... | 62%
## label: plot_response_qq
##
|
|............................................. | 64%
## ordinary text without R code
##
##
|
|............................................... | 67%
## label: plot_by_qq
##
|
|................................................ | 69%
## ordinary text without R code
##
##
|
|.................................................. | 71%
## label: correlation_analysis
##
|
|.................................................... | 74%
## ordinary text without R code
##
##
|
|..................................................... | 76%
## label: principal_component_analysis
##
|
|....................................................... | 79%
## ordinary text without R code
##
##
|
|......................................................... | 81%
## label: bivariate_distribution_header
##
|
|.......................................................... | 83%
## ordinary text without R code
##
##
|
|............................................................ | 86%
## label: plot_response_boxplot
##
|
|.............................................................. | 88%
## ordinary text without R code
##
##
|
|............................................................... | 90%
## label: plot_by_boxplot
##
|
|................................................................. | 93%
## ordinary text without R code
##
##
|
|................................................................... | 95%
## label: plot_response_scatterplot
##
|
|.................................................................... | 98%
## ordinary text without R code
##
##
|
|......................................................................| 100%
## label: plot_by_scatterplot
## output file: C:/Users/user/Documents/Geoffrey Chege Moringa IP W12/report.knit.md
## "C:/Program Files/RStudio/bin/quarto/bin/pandoc" +RTS -K512m -RTS "C:/Users/user/Documents/Geoffrey Chege Moringa IP W12/report.knit.md" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc245043a566d9.html --lua-filter "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmarkdown\lua\latex-div.lua" --self-contained --variable bs3=TRUE --standalone --section-divs --table-of-contents --toc-depth 6 --template "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\user\AppData\Local\Temp\RtmpG4gVy0\rmarkdown-str24502fe73d52.html"
##
## Output created: report.html
# Heat map
# Checking the relationship between the variables
# Using Numeric variables only
numeric_tbl2 <- ad %>%
select_if(is.numeric) %>%
select(Daily_Time_Spent_on_Site, Age, Area_Income,Daily_Internet_Usage, Clicked_on_Ad)
# Calculate the correlations
corr <- cor(numeric_tbl2, use = "complete.obs")
ggcorrplot(round(corr, 2),
type = "full", lab = T)
Clicked_on_Ad is the target variable so I will get correlation without it included.
# Heat map
# Checking the relationship between the variables
# Using Numeric variables only
numeric_tbl3 <- ad %>%
select_if(is.numeric) %>%
select(Daily_Time_Spent_on_Site, Age, Area_Income,Daily_Internet_Usage)
# Calculating the correlations
corr <- cor(numeric_tbl3, use = "complete.obs")
ggcorrplot(round(corr, 2),
type = "full", lab = T)
# Normalizing the dataset
normalize <- function(x){
return ((x-min(x)) / (max(x)-min(x)))
}
ad$Daily_Time_Spent_on_Site <- normalize(ad$Daily_Time_Spent_on_Site)
ad$Age <- normalize(ad$Age)
ad$Area_Income <- normalize(ad$Area_Income)
ad$Male <- normalize(ad$Male)
#previewing normalized dataset
head(ad)
## Daily_Time_Spent_on_Site Age Area_Income Daily_Internet_Usage
## 1 0.6178820 0.3809524 0.7304725 256.09
## 2 0.8096209 0.2857143 0.8313752 193.77
## 3 0.6267211 0.1666667 0.6992003 236.50
## 4 0.7062723 0.2380952 0.6231599 245.89
## 5 0.6080231 0.3809524 0.9145678 225.58
## 6 0.4655788 0.0952381 0.6988280 226.74
## Ad_Topic_Line City Male Country
## 1 Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2 Monitored national standardization West Jodi 1 Nauru
## 3 Organic bottom-line service-desk Davidton 0 San Marino
## 4 Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5 Robust logistical utilization South Manuel 0 Iceland
## 6 Sharable client-driven software Jamieberg 1 Norway
## Timestamp Clicked_on_Ad
## 1 2016-03-27 00:53:11 0
## 2 2016-04-04 01:39:02 0
## 3 2016-03-13 20:35:42 0
## 4 2016-01-10 02:31:19 0
## 5 2016-06-03 03:36:18 0
## 6 2016-05-19 14:30:17 0
# splitting the data into training and testing sets
# I will split it 70:30
intrain <- createDataPartition(y = ad$Clicked_on_Ad, p = 0.7, list = FALSE)
training <- ad[intrain,]
testing <- ad[-intrain,]
# checking the dimensions of our training and testing sets
dim(training)
## [1] 700 10
dim(testing)
## [1] 300 10
# checking the dimensions of our split
prop.table(table(ad$Clicked_on_Ad)) * 100
##
## 0 1
## 50 50
prop.table(table(training$Clicked_on_Ad)) * 100
##
## 0 1
## 50 50
prop.table(table(testing$Clicked_on_Ad)) * 100
##
## 0 1
## 50 50
# Specifying target and predictor variables
m <- rpart(Clicked_on_Ad ~ . ,
data = numeric_tbl2,
method = "class")
# Plotting model
rpart.plot(m)
# Making predictions
p <- predict(m, numeric_tbl2, type ="class")
# Printing the confusion matrix
table(p, numeric_tbl2$Clicked_on_Ad)
##
## p 0 1
## 0 485 28
## 1 15 472
# Printing the Accuracy
(mean(numeric_tbl2$Clicked_on_Ad == p))*100
## [1] 95.7