Including all the required libraries

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr, warn.conflicts = FALSE)
library(dlookr)
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
library(ggplot2)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.6     ✓ purrr   0.3.4
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x igraph::%--%()           masks lubridate::%--%()
## x tibble::as_data_frame()  masks igraph::as_data_frame(), dplyr::as_data_frame()
## x lubridate::as.difftime() masks base::as.difftime()
## x purrr::compose()         masks igraph::compose()
## x tidyr::crossing()        masks igraph::crossing()
## x lubridate::date()        masks base::date()
## x tidyr::expand()          masks reshape::expand()
## x tidyr::extract()         masks dlookr::extract()
## x dplyr::filter()          masks stats::filter()
## x igraph::groups()         masks dplyr::groups()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x reshape::rename()        masks dplyr::rename()
## x lubridate::setdiff()     masks base::setdiff()
## x purrr::simplify()        masks igraph::simplify()
## x reshape::stamp()         masks lubridate::stamp()
## x igraph::union()          masks lubridate::union(), base::union()
library(ggraph)
library(tidygraph)
## 
## Attaching package: 'tidygraph'
## The following object is masked from 'package:reshape':
## 
##     rename
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:stats':
## 
##     filter
library(ggpubr)
library(sf)
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1; sf_use_s2() is TRUE
library("wordcloud2")
library(wordcloud)
## Loading required package: RColorBrewer
library(webshot)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following object is masked from 'package:reshape':
## 
##     rename
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:ggpubr':
## 
##     gghistogram
library(knitr)

#To display map in mac
# webshot::install_phantomjs()

# To display map in Windows
# install_phantomjs(version = "2.1.1", baseURL = "https://github.com/wch/webshot/releases/download/v0.3.1/", force = FALSE)

Loading dataset

df_bike_sharing <-  read.csv("/cloud/project/Bike_Sharing/hour.csv", header = TRUE, sep = ",")
df_bluebikes <- read.csv("/cloud/project/Bike_Sharing/Bluebikes_edited.csv", header = TRUE,
                                strip.white = TRUE, sep = ",")

# View(df_bike_sharing)
# View(df_bluebikes_edited)

Renaming columns to incorporate sensible column names

col_names <- c("ID", "Date", "Season", "Year", "Month", "Hour", "Holiday", "Weekday", "Working_Day",
       "Weather_Situation", "Norm_Temp", "Norm_Feels_Temp", "Norm_Humidity", "Norm_Windspeed",
       "Users_Unregistered", "Users_Registered", "Users_Total" )
colnames(df_bike_sharing) <- col_names

Number of instances and attributes

dim(df_bike_sharing)
## [1] 17379    17

Checking the datatypes for the columns

sapply(df_bike_sharing, class)
##                 ID               Date             Season               Year 
##          "integer"        "character"          "integer"          "integer" 
##              Month               Hour            Holiday            Weekday 
##          "integer"          "integer"          "integer"          "integer" 
##        Working_Day  Weather_Situation          Norm_Temp    Norm_Feels_Temp 
##          "integer"          "integer"          "numeric"          "numeric" 
##      Norm_Humidity     Norm_Windspeed Users_Unregistered   Users_Registered 
##          "numeric"          "numeric"          "integer"          "integer" 
##        Users_Total 
##          "integer"

Dataset summary

summary(df_bike_sharing)
##        ID            Date               Season           Year       
##  Min.   :    1   Length:17379       Min.   :1.000   Min.   :0.0000  
##  1st Qu.: 4346   Class :character   1st Qu.:2.000   1st Qu.:0.0000  
##  Median : 8690   Mode  :character   Median :3.000   Median :1.0000  
##  Mean   : 8690                      Mean   :2.502   Mean   :0.5026  
##  3rd Qu.:13034                      3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :17379                      Max.   :4.000   Max.   :1.0000  
##      Month             Hour          Holiday           Weekday     
##  Min.   : 1.000   Min.   : 0.00   Min.   :0.00000   Min.   :0.000  
##  1st Qu.: 4.000   1st Qu.: 6.00   1st Qu.:0.00000   1st Qu.:1.000  
##  Median : 7.000   Median :12.00   Median :0.00000   Median :3.000  
##  Mean   : 6.538   Mean   :11.55   Mean   :0.02877   Mean   :3.004  
##  3rd Qu.:10.000   3rd Qu.:18.00   3rd Qu.:0.00000   3rd Qu.:5.000  
##  Max.   :12.000   Max.   :23.00   Max.   :1.00000   Max.   :6.000  
##   Working_Day     Weather_Situation   Norm_Temp     Norm_Feels_Temp 
##  Min.   :0.0000   Min.   :1.000     Min.   :0.020   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.000     1st Qu.:0.340   1st Qu.:0.3333  
##  Median :1.0000   Median :1.000     Median :0.500   Median :0.4848  
##  Mean   :0.6827   Mean   :1.425     Mean   :0.497   Mean   :0.4758  
##  3rd Qu.:1.0000   3rd Qu.:2.000     3rd Qu.:0.660   3rd Qu.:0.6212  
##  Max.   :1.0000   Max.   :4.000     Max.   :1.000   Max.   :1.0000  
##  Norm_Humidity    Norm_Windspeed   Users_Unregistered Users_Registered
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00     Min.   :  0.0   
##  1st Qu.:0.4800   1st Qu.:0.1045   1st Qu.:  4.00     1st Qu.: 34.0   
##  Median :0.6300   Median :0.1940   Median : 17.00     Median :115.0   
##  Mean   :0.6272   Mean   :0.1901   Mean   : 35.68     Mean   :153.8   
##  3rd Qu.:0.7800   3rd Qu.:0.2537   3rd Qu.: 48.00     3rd Qu.:220.0   
##  Max.   :1.0000   Max.   :0.8507   Max.   :367.00     Max.   :886.0   
##   Users_Total   
##  Min.   :  1.0  
##  1st Qu.: 40.0  
##  Median :142.0  
##  Mean   :189.5  
##  3rd Qu.:281.0  
##  Max.   :977.0

———- Data Cleaning and Preprocessing ———-

Diagnose dataset to look for missing values

diagnose(df_bike_sharing) # no missing values found

Changing attributes, creating new ones for more interpretability

1.) Changing the numerical month column to their month names

2.) Converting binary year column to its respective year values (0 represents year 2020 and 1 represents year 2021)

3.) Converting the weekday numbers to weekday (Mon, Tue, etc.)

4.) Adding a yearly quarter column (eg 2021 Q1)

5.) Converting season numerical column to denote respective season (1 Winter, 2 - Spring, 3 - Summer, 4 - Fall)

6.) Converting the column Weather Situation to its respective assigned values given below:

1: Clear, Few clouds, Partly cloudy, Partly cloudy

2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist

3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds

4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog

df_bike_sharing$Date <- dplyr::case_when(
      substring(df_bike_sharing$Date,1,4) == "2011" ~ 
        sub("2011", "2020", df_bike_sharing$Date),
      substring(df_bike_sharing$Date,1,4) =="2012" ~
        sub("2012", "2021", df_bike_sharing$Date),
      TRUE ~ as.character(df_bike_sharing$Date)
)

# Changing the 'Date' column datatype to Date
df_bike_sharing$Date <- as.Date(df_bike_sharing$Date)

# Extracting the Day from Date and storing in a new column
df_bike_sharing$Day_of_Month <- format(df_bike_sharing$Date, format = "%d")

# Converting Month numerical values to their designated month names
df_bike_sharing$Month <- month.abb[df_bike_sharing$Month]

# Converting Year column from values 0 and 1 to actual year values
# The value 0 represents year 2020 and 1 represents 2021
df_bike_sharing$Year <- ifelse(df_bike_sharing$Year == 0, 2020, 2021)

# Converting Weekday numerical values to their weekday names
df_bike_sharing$Day_of_Week <- dplyr::case_when(
  df_bike_sharing$Weekday == 0 ~ "Sun",
  df_bike_sharing$Weekday == 1 ~ "Mon",
  df_bike_sharing$Weekday == 2 ~ "Tue",
  df_bike_sharing$Weekday == 3 ~ "Wed",
  df_bike_sharing$Weekday == 4 ~ "Thur",
  df_bike_sharing$Weekday == 5 ~ "Fri",
  df_bike_sharing$Weekday == 6 ~ "Sat",
  TRUE ~ as.character(df_bike_sharing$Weekday)
)

# Converting integer values of Hour to HH:MM format
df_bike_sharing$Hour <- sprintf("%02s", df_bike_sharing$Hour)

# Adding Yearly Quarter Column
df_bike_sharing$Quarter = as.yearqtr(df_bike_sharing$Date, format = "%Yq%q")

# Converting numerical Season column to categorical to denote seasons
df_bike_sharing$Season <- as.character(factor(df_bike_sharing$Season, levels = 1:4,
                                 labels = c("Winter", "Spring", "Summer", "Fall")))

# Converting Weather Situation to its respective assigned values
df_bike_sharing$Weather_Situation <- as.character(factor(df_bike_sharing$Weather_Situation, levels = 1:4,
                                 labels = c("Clear or Partly Cloudy",
                                            "Mist and Cloudy",
                                            "Light Rain or Snow",
                                            "Heavy Rain or Snow")))

Creating new columns to convert normalized values to actual values

The temperature columns Temp, Feels_like_temp are normalized and calculated using:

(t - t_min) / (t_max - t_min)

Temp : t_min = -8, t_max = +39 (Celsius scale)

Feels_like_temp : t_min = -16, t_max = +50 (Celsius scale)

Humidity and Windspeed columns are normalized to the scale:

Humidity - max value of 100

Windspeed - max value of 67

# Converting temperature values to their actual values and storing in new columns
df_bike_sharing$Temp <- (df_bike_sharing$Norm_Temp * (39 + 8)) - 8
df_bike_sharing$Feels_Temp <- (df_bike_sharing$Norm_Temp * (50 + 16)) - 16

# Converting Humidity and Windspeed
df_bike_sharing$Humidity <- df_bike_sharing$Norm_Humidity * 100
df_bike_sharing$Windspeed <- df_bike_sharing$Norm_Windspeed * 67

Fabricating the routes for bike sharing data

df_bluebikes_routes_unique <- df_bluebikes %>%
  filter(start.station.name != end.station.name) 

# Reading the start and end locations into a dataframe
df_bluebikes_routes <- dplyr::select(df_bluebikes_routes_unique,
                          c(start.station.name,end.station.name,
                          Source.Longitude,Source.Latitude,
                          Destination.Longitude,Destination.Latitude))

df_bluebikes_routes_dist <- distinct(df_bluebikes_routes)

# Sampling the routes to create random routes for each row of original data
df_random_routes <- sample_n(df_bluebikes_routes_dist, nrow(df_bike_sharing), replace = TRUE)
df_bike_sharing <- cbind(df_bike_sharing, df_random_routes)
ncol(df_bike_sharing)
## [1] 30
colnames(df_bike_sharing)[c(25, 26, 27, 28, 29, 30)] <- c("Source", "Destination",
                                                  "Src_Long", "Src_Lat", "Dest_Long", "Dest_Lat")

Viewing the Final Data – Ready for Visualization

# View(df_bike_sharing)

————— Data Visualization —————

Grouped Bar Chart

This chart helps us to visually compare the monthly bike usage for years 2020 ans 2021

options(dplyr.summarise.inform = FALSE)

df_bike_grouped_bar <- df_bike_sharing %>%
  select(Month, Year, Users_Total) %>%
  group_by(Month, Year) %>%
  summarise(Users = sum(Users_Total))

df_bike_grouped_bar$Month = factor(df_bike_grouped_bar$Month, levels = month.abb)

ggplot(df_bike_grouped_bar, aes(x = Month, y = Users, fill = as.factor(Year))) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  geom_text(aes(label = Users),
            vjust = 0.5, hjust = -0.1, color = "black", size = 1.7, position = position_dodge(0.9)) +
  scale_fill_brewer(palette = "Dark2") +
  ggtitle("Comparison of Monthly Bike Usage for 2020 and 2021") +
  xlab("Months") +
  ylab("Total Number of Users") +
  labs(fill = "Year") +
  scale_x_discrete(limits = ~month.abb) +
  theme(plot.title = element_text(size = 13, face = "bold.italic", color = "black", hjust = 0.5),
        axis.title.x = element_text(size = 7, face = "bold"),
        axis.title.y = element_text(size = 7, face = "bold"),
        axis.text.x = element_text(size = 5, angle = 0),
        axis.text.y = element_text(size = 5, angle = 90),
        legend.position = "right",
        legend.title = element_text(size = 7, face = "bold"),
        legend.text = element_text(size = 7, face = "bold"),
        strip.text = element_text(size = 7))

Box Plot

This plot shows the weekly distribution of registered and unregistered users

df_bike_boxplot <- df_bike_sharing %>%
  select(Weekday, Users_Unregistered, Users_Registered)

df_bike_boxplot_long <- df_bike_boxplot %>% 
  pivot_longer(-Weekday,
               names_to = "Type_of_User",
               values_to = "Users")

ggplot(data = df_bike_boxplot_long, aes(x = as.character(Weekday), y = Users)) +
  geom_boxplot(aes(fill = Type_of_User)) +
  scale_x_discrete(labels = c("0" = "Sunday",
                              "1" = "Monday",
                              "2" = "Tuesday",
                              "3" = "Wednesday",
                              "4" = "Thursday",
                              "5" = "Friday",
                              "6" = "Saturday")) +
  ggtitle("Weekly Distribution of Registered and Unregistered Users") +
  xlab("Weekday") +
  ylab("Number of Users") +
  labs(fill = "Type of Users") +
  theme(plot.title = element_text(size = 13, face = "bold.italic", color = "black", hjust = 0.5),
        axis.title.x = element_text(size = 7, face = "bold"),
        axis.title.y = element_text(size = 7, face = "bold"),
        axis.text.x = element_text(size = 7),
        axis.text.y = element_text(size = 7),
        legend.position = "bottom",
        legend.title = element_text(size = 7, face = "bold"),
        legend.text = element_text(size = 7),
        strip.text = element_text(size = 7)) +
  stat_summary(fun.y = "mean", colour = "darkred", geom = "point",
               shape = 19, size = 3, show.legend = FALSE) 

Heatmap

This heatmap shows the correlation between Registerd and Unregistered Users with the Weather Attributes

# Create a dataframe with numerical columns
df_numeric <- df_bike_sharing %>%
  select(Temp, Feels_Temp, Humidity, Windspeed, Users_Unregistered, Users_Registered)

# Correlation Matrix of all numerical columns
df_bike_heatmap <- round(cor(df_numeric), 2)
df_bike_heatmap <- melt(df_bike_heatmap)

# Plotting heatmap
ggplot(data = df_bike_heatmap, aes(x = X1, y = X2, fill = value)) + 
  geom_tile() +
  geom_text(aes(X1, X2, label = value), color = "black", size = 4) +
  ggtitle("Correlation Heatmap of Users and Weather Attributes") +
  theme_minimal() +
  labs(fill = "Correlation Value") +
  theme(plot.title = element_text(size = 13, face = "bold.italic", hjust = 0.5),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(size = 7, angle = 90),
        axis.text.y = element_text(size = 7),
        legend.position = "right",
        legend.title = element_text(size = 7, face = "bold"),
        legend.text = element_text(size = 7, face = "bold"))

Bar Plot

This chart shows the hourly usage of bikes based on different seasons

options(dplyr.summarise.inform = FALSE)

df_bike_bar <- df_bike_sharing %>%
  select(Hour, Season, Users_Total) %>%
  group_by(Hour, Season) %>%
  summarise(Users = ceiling(mean(Users_Total)))

# Plotting bar graph

# Since the x label used in the bar graph is ordinal categorical (hours of the day), hence we have not sorted the bars in decreasing order of their heights.
ggplot(df_bike_bar, aes(x = Hour, y = Users)) +
  geom_bar(stat = "identity", position="dodge", aes(fill = Season)) +
  facet_wrap(~Season, scales = "free_x") +
  geom_text(aes(label = Users), angle = 0, vjust = -0.5,
            color = "black", size = 1.5, angle = 90) +
  scale_fill_brewer(palette = "Dark2") +
  ggtitle("Hourly Usage of Bikes Based on Different Seasons") +
  xlab("Hour of the Day") +
  ylab("Number of Users") +
  theme(plot.title = element_text(size = 13, face = "bold.italic", color = "black", hjust = 0.5),
        axis.title.x = element_text(size = 7, face = "bold"),
        axis.title.y = element_text(size = 7, face = "bold"),
        axis.text.x = element_text(size = 5, angle = 90),
        axis.text.y = element_text(size = 5),
        legend.position = "right",
        legend.title = element_text(size = 7, face = "bold"),
        legend.text = element_text(size = 7, face = "bold"),
        strip.text = element_text(size = 7))

Line Chart

This chart shows the quarterlu usage of bikes based on weather situation. We are also predicting the number of users for next quarters

options(dplyr.summarise.inform = FALSE)

df_bike_line <- df_bike_sharing %>%
  group_by(Quarter, Weather_Situation) %>%
  summarise(Users = ceiling(mean(Users_Registered)))

ggplot(df_bike_line, aes(x = Quarter,  y = Users)) +
  geom_line() +
  geom_smooth(formula = y ~ x,
              aes(color = Weather_Situation,
                  fill = Weather_Situation), method = "lm") +
  geom_point() +
  ylim(0, 1000) +
  facet_wrap(~ Weather_Situation, scales = "free_x") +
  geom_forecast(stat = "forecast", position = "identity",
                colour = "lightblue", showgap = FALSE) +
  geom_text(aes(label = Users),
            vjust = -2.5, color = "black", size = 1.7) +
  scale_fill_brewer(palette = "Dark2") +
  ggtitle("Quarterly Usage of Bikes Based on Weather Situation with Future Prediction") +
  xlab("Year and Quarters") +
  ylab("Number of Users") +
  scale_x_yearqtr(format = "%YQ%q") +
  theme(plot.title = element_text(size = 13, face = "bold.italic", color = "red", hjust = 0.5),
        axis.title.x = element_text(size = 7, face = "bold"),
        axis.title.y = element_text(size = 7, face = "bold"),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 5),
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(colour = "blue", size = 7, face = "bold"),
        strip.text = element_text(size = 7))

Network Graph

The graph is showcasing the 20 minimally used routes.

# Selecting the source and destination and summarizing the total users
plot_net <- dplyr::select(df_bike_sharing, c(Source, Destination, Users_Total))

plot_net_filtered <- plot_net %>%
  group_by(Source, Destination) %>%
  summarise(Users_Avg = mean(Users_Total)) %>%
  arrange(Users_Avg) %>%
  head(20)

# Creating a network
grph_net <- graph.data.frame(plot_net_filtered[c(1, 2, 3)], directed = TRUE)

V(grph_net)$size <- plot_net_filtered$Users_Avg

# Plotting the network
plot(grph_net,
     layout = layout.auto,
     vertex.size = 10,
     vertex.color = "blue",
     vertex.label.cex = 0.8,
     vertex.label.color = "black",
     edge.arrow.size = 0.1,
)
title(main = list("Rarely Used Routes", cex=1.5))

Map View of stations

The map shows network of various bike stations and relative locations in Boston

# Selecting the source and destination with respective longitude and latitude columns
plot_map_filtered <- df_bike_sharing %>%
  dplyr::select(c(Source, Destination, Src_Long,
                  Src_Lat, Dest_Long, Dest_Lat, Users_Total)) %>%
  group_by(Source, Destination, Src_Long,
                  Src_Lat, Dest_Long, Dest_Lat) %>%
  summarise(Users = round(mean(Users_Total), 0))

# Using Google API to plot the mapview
api_key <- register_google(key = "AIzaSyCO_xreF1k7Gx-grRe5Dzz17BdzRca5398")
map_canvas <- get_map(c(left = min(plot_map_filtered$Src_Long), 
                  bottom = min(plot_map_filtered$Src_Lat), 
                  right = max(plot_map_filtered$Dest_Long), 
                  top = max(plot_map_filtered$Dest_Lat)),
                  maptype = "satellite",
                  source = "google",
                  zoom = 10)

# Plotting using ggmap
ggmap(map_canvas, darken = c(0.6, "white")) + 
  geom_segment(data = plot_map_filtered,
               aes(x = Src_Long, 
                   y = Src_Lat,
                   xend = Dest_Long,
                   yend = Dest_Lat,
                   alpha = sqrt(Users)),
               color = "#000000") +
  # coord_cartesian() +
  scale_alpha(range = c(0.0001, .5)) +
  geom_point(data = plot_map_filtered %>% 
               group_by(longitude = Src_Long,
                        latitude = Src_Lat) %>%
               summarize(rides = sum(Users)),
             aes(x = longitude, y = latitude, size = rides),
             color = "#009900", alpha = .4) + 
  scale_size_continuous(range(4, 100)) +
  scale_color_viridis_c() + 
  scale_fill_viridis_c() + 
  theme_nothing()

# Saving the html output as png in local drive
# ggsave(filename = "station-network.jpg", width = 8, units = "in")

Word Cloud

This visualization is displaying the routes, the size of the text depends on the number of users travelling on the routes.

# Aggregating Total Number of users on a particular route using Group by
plot_cloud_filtered <- df_bike_sharing %>%
  dplyr::select(c(Source, Destination, Users_Total)) %>%
  group_by(Source, Destination) %>%
  summarise(Users = ceiling(mean(Users_Total)))

# Concatenating source and destination
plot_cloud_filtered$concat <- paste(plot_cloud_filtered$Source,
                                    plot_cloud_filtered$Destination,
                                    sep = " --> ")

set.seed(1234)
word_cloud <- wordcloud2(data = plot_cloud_filtered[c(4,3)],
                        size = 0.5, color = 'random-dark', gridSize = 15,
                        shape = "diamond", ellipticity = 0.2,
                        fontWeight = "normal", shuffle = FALSE) 
word_cloud