Dataset

The data consists of Google Trends data for AutoML (automated machine learning). It was extracted from the 2020 Kaggle online survey: https://www.kaggle.com/c/kaggle-survey-2020

URL: https://www.kaggle.com/parulpandey/automl-google-trends-data?select=relatedEntities.csv

Exploratory Data Analysis

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the four Google Trends exports (search timeline, per-country interest,
# related entities, related queries).
# The directory is defined once so the script can be pointed at another
# location by editing a single line.
data_dir <- "C:/Users/Anhuynh/Desktop/Data Science Project/AI & Technology Trend"

sea <- read.csv(file.path(data_dir, "Searches.csv"))
geo <- read.csv(file.path(data_dir, "geoMap.csv"))
ent <- read.csv(file.path(data_dir, "relatedEntities.csv"))
que <- read.csv(file.path(data_dir, "relatedQueries.csv"))
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Convert the year-month strings in the Date column to Date objects.
sea[["Date"]] <- ym(sea[["Date"]])
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
library(ggmap)
## Warning: package 'ggmap' was built under R version 4.1.1
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
# Reduce the per-country table to one row per country, then geocode each
# country name. mutate_geocode() queries the Google geocoding API, so this
# step needs network access and a registered API key.
country <- geo %>% distinct(Country)
country_df <- country %>% as.data.frame()
locations_df <- mutate_geocode(country_df, Country)
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=China&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=South+Korea&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Hong+Kong&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Singapore&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Taiwan&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Israel&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Ireland&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Japan&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Switzerland&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Belgium&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=United+States&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Netherlands&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Canada&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=India&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Germany&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Norway&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=United+Kingdom&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Denmark&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=France&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Malaysia&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Sweden&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Australia&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Philippines&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Portugal&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=South+Africa&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Spain&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Russia&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Italy&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Pakistan&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Vietnam&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Mexico&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Thailand&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Poland&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Brazil&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Chile&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Argentina&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Ukraine&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Indonesia&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Colombia&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Turkey&key=xxx
# Attach the geocoded coordinates to the per-country search data,
# matching rows on the shared Country column.
locations <- locations_df %>% as_tibble()
geo_per_destination <- geo %>% left_join(locations, by = "Country")
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.1.1
# World map of AutoML search interest: one blue point per country, sized by
# Number.of.Searches, with a repelled text label for every country.
world <- map_data("world")
ggplot() +
  # geom_map() only understands map_id; x/y aesthetics are ignored with a
  # warning, so the map extent is supplied through expand_limits() instead.
  geom_map(
    data = world, map = world,
    aes(map_id = region),
    color = "white", fill = "lightgrey", size = 0.1
  ) +
  expand_limits(x = world$long, y = world$lat) +
  # The point colour is a constant, so it is set outside aes() rather than
  # mapped to a dummy string and resolved through a manual colour scale.
  geom_point(
    data = geo_per_destination,
    aes(lon, lat, size = Number.of.Searches),
    color = "blue", alpha = 0.5
  ) +
  # label.size is not an aesthetic or parameter of geom_text_repel(), so it
  # is dropped. max.overlaps = Inf keeps every country label instead of
  # silently discarding the ones that overlap.
  geom_text_repel(
    data = locations,
    aes(x = lon, y = lat, label = Country),
    max.overlaps = Inf
  ) +
  scale_size_continuous(range = c(2, 8)) +
  labs(
    title = "Google Searches of Automated Machine Learning across Countries in 2020",
    subtitle = paste(
      "Values in correspondence of location & popularity",
      "|",
      "0 : not enough data; 50 : half as popular; 100 : most popular"
    )
  ) +
  theme_void() +
  # The legend is hidden, so no legend titles or guide overrides are needed.
  theme(
    legend.position = "none",
    plot.subtitle = element_text(
      size = 9, hjust = 0.25, face = "italic", color = "blue"
    )
  )

# Search interest over time as a line, with a LOESS trend and its
# confidence band drawn on top.
ggplot(sea, aes(x = Date, y = Search.Interest)) +
  # Colours are fixed constants; no aesthetic is mapped to colour or fill,
  # so no manual colour/fill scales are required.
  geom_line(color = "#00AFBB", size = 1) +
  stat_smooth(color = "#FC4E07", fill = "#FC4E07", method = "loess") +
  scale_x_date(date_labels = "%b-%Y") +
  labs(
    x = "Date", y = "",
    title = "Google Search for Automated ML over Years",
    subtitle = "Timeline Data (2015-2020)"
  ) +
  # theme_dark() is a complete theme and replaces any theme() settings added
  # before it, so all tweaks are applied afterwards.
  theme_dark() +
  theme(
    legend.position = "none",
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  )
## `geom_smooth()` using formula 'y ~ x'

library(ggplot2)

# Horizontal bar chart of related entities, ordered by search interest, with
# each score printed in the middle of its bar.
ggplot(
  ent,
  aes(
    x = Search.Interest,
    y = reorder(Related.Entities, Search.Interest),
    label = Search.Interest
  )
) +
  # geom_col() always plots the values as given; it has no `stat` parameter.
  geom_col(fill = "blue", alpha = 0.9, width = 0.8) +
  geom_text(position = position_stack(vjust = 0.5), size = 3, color = "white") +
  labs(
    x = "Scoring of Search Interest", y = "",
    title = "Category of Search Entities",
    subtitle = "Scoring is on a relative scale."
  ) +
  # theme_dark() is a complete theme and replaces any theme() settings added
  # before it, so all tweaks are applied afterwards.
  theme_dark() +
  theme(
    legend.position = "none",
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "darkgrey")
  )

# Horizontal bar chart of related queries, ordered by search interest, with
# each score printed in the middle of its bar.
ggplot(
  que,
  aes(
    x = Search.Interest,
    y = reorder(Related.Queries, Search.Interest),
    label = Search.Interest
  )
) +
  # geom_col() always plots the values as given; it has no `stat` parameter.
  geom_col(fill = "purple", alpha = 0.9, width = 0.8) +
  geom_text(position = position_stack(vjust = 0.5), size = 3, color = "white") +
  labs(
    x = "Scoring of Search Interest", y = "",
    title = "Category of Search Queries",
    subtitle = "Scoring is on a relative scale."
  ) +
  # theme_dark() is a complete theme and replaces any theme() settings added
  # before it, so all tweaks are applied afterwards.
  theme_dark() +
  theme(
    legend.position = "none",
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.line = element_line(color = "darkgrey")
  )