We’ve identified several of the most popular data science websites, along with a set of popular data science concepts and sectors. Our goal is to crawl the sites and measure how popular each of these topics is within their articles.
Below is the list of websites we used:
datasciencecentral.com
smartdatacollective.com
whatsthebigdata.com
blog.kaggle.com
simplystatistics.org
Below is the list of concepts we searched for:
Quantitative
Predictive Modeling
Personalization
Big Data
Data Mining
Visualization
Machine Learning
Business Intelligence
Forecast
Deep Learning
Below is a list of the sectors and associated keywords we searched for:
Agriculture: (“Agriculture”, “Botany”, “Botanical”, “Farming”)
Disease: (“Disease”, “Health”, “Medicine”, “Clinic”, “Epidemiology”)
DNA: (“DNA”, “Genetics”, “Biology”)
Weather: (“Weather”, “Climate”, “Meteorology”)
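To make the counting concrete: each sector’s keyword list can be collapsed into a single case-insensitive pattern and matched against page text. Below is a minimal sketch in base R (the page_text string is a made-up example; in our workflow Screaming Frog performs the equivalent search during the crawl):
# Collapse a sector's keyword list into one alternation pattern
weather_keywords <- c("Weather", "Climate", "Meteorology")
weather_pattern <- paste(weather_keywords, collapse = "|")

# Hypothetical page text standing in for a crawled article
page_text <- "Climate models keep improving our weather forecasts."

# Count case-insensitive matches; gregexpr() returns -1 when nothing matches
matches <- gregexpr(weather_pattern, page_text, ignore.case = TRUE)[[1]]
sum(matches > 0)
## [1] 2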
The process will be as follows:
Identify the list of URLs for each website. In our example we performed a top-down crawl starting at the homepage and following all links to deeper pages. For this we used the popular web crawler “Screaming Frog” (https://www.screamingfrog.co.uk/seo-spider/).
Count occurrences of the specified topics (Screaming Frog has built-in search capabilities).
Manually inspect each site to determine whether sitewide elements (such as navigation) include the topics. We’ll have to subtract these sitewide instances from the counts in order to get a true count for each URL.
The number of URLs mentioning a topic (regardless of how many times it’s mentioned) will be our metric for the popularity of that topic.
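As a small worked example of this metric, take hypothetical counts for one topic across five crawled URLs, with one sitewide occurrence to subtract:
# Hypothetical raw counts of a topic on five URLs
counts <- c(0, 3, 1, 0, 7)

# Subtract the sitewide occurrence from every URL that had any matches
counts[counts > 0] <- counts[counts > 0] - 1

# Popularity metric: share of URLs still mentioning the topic at least once
mean(counts > 0)
## [1] 0.4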
Let’s load up all the data sets and start with some visualization of the DataScienceCentral data set:
# Helper for getting a new connection to Cloud SQL
getSqlConnection <- function() {
  con <- dbConnect(RMySQL::MySQL(),
                   username = 'sjones',        # other ids set up are 'achan' and 'mhayes'
                   password = 'ac.mh.sj.607',  # we all can use the same password
                   host = '35.202.129.190',    # IP address of the cloud instance
                   dbname = 'softskills')
  return(con)
}
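As an aside, hardcoding shared credentials is convenient for a class project, but the same helper could read them from environment variables instead. A quick sketch (the MYSQL_USER and MYSQL_PWD variable names are placeholders, not part of our actual setup):
# Sketch: same connection helper, credentials from placeholder env variables
getSqlConnectionEnv <- function() {
  dbConnect(RMySQL::MySQL(),
            username = Sys.getenv("MYSQL_USER"),  # placeholder env variable
            password = Sys.getenv("MYSQL_PWD"),   # placeholder env variable
            host = '35.202.129.190',
            dbname = 'softskills')
}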
connection <- getSqlConnection()
dsc_data <- dbGetQuery(connection,"select * from blog_topics.dsc_data")
kgl_data <- dbGetQuery(connection,"select * from blog_topics.kgl_data")
ss_data <- dbGetQuery(connection,"select * from blog_topics.ss_data")
sdc_data <- dbGetQuery(connection,"select * from blog_topics.sdc_data")
wbg_data <- dbGetQuery(connection,"select * from blog_topics.wbg_data")
# Fallback: the same crawl data is available as CSV exports on GitHub
#dsc_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/datasciencecentral-urls.csv")
#kgl_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/kaggle-urls.csv")
#ss_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/simplystatistics-urls.csv")
#sdc_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/smartdatacollective-urls.csv")
#wbg_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/whatsthebigdata-urls.csv")
#head(dsc_data)
Upon manual inspection we’ll need to modify these counts to account for sitewide occurrences. We only adjust counts greater than 0 (a count of 0 has nothing to subtract), removing the number of sitewide instances we found on each site.
# datasciencecentral: subtract the one sitewide occurrence of "big data"
dsc_data$big_data[dsc_data$big_data > 0] <- dsc_data$big_data[dsc_data$big_data > 0] - 1
# kaggle: subtract the three sitewide occurrences of "forecast"
kgl_data$forecast[kgl_data$forecast > 0] <- kgl_data$forecast[kgl_data$forecast > 0] - 3
# smartdatacollective: sitewide mentions can't be cleanly separated for these two topics, so exclude them
sdc_data$big_data <- NA
sdc_data$business_intelligence[sdc_data$business_intelligence > 0] <- NA
# smartdatacollective: subtract the two sitewide occurrences of "machine learning"
sdc_data$machine_learning[sdc_data$machine_learning > 0] <- sdc_data$machine_learning[sdc_data$machine_learning > 0] - 2
# whatsthebigdata: each of these topics appears once in sitewide elements
wbg_data$data_mining[wbg_data$data_mining > 0] <- wbg_data$data_mining[wbg_data$data_mining > 0] - 1
wbg_data$deep_learning[wbg_data$deep_learning > 0] <- wbg_data$deep_learning[wbg_data$deep_learning > 0] - 1
wbg_data$machine_learning[wbg_data$machine_learning > 0] <- wbg_data$machine_learning[wbg_data$machine_learning > 0] - 1
Next we convert these counts into binary values, since we only care about how many URLs mention a given topic, not how many times it’s mentioned within a specific URL.
topic_cols <- c("big_data", "business_intelligence", "data_mining",
                "deep_learning", "forecast", "machine_learning",
                "personalization", "predictive_modeling", "quantitative")

# Recode any positive count to 1; zeros and NA values are left as-is
to_binary <- function(df, cols) {
  for (col in cols) {
    df[[col]][df[[col]] > 0] <- 1
  }
  df
}

dsc_data <- to_binary(dsc_data, topic_cols)
ss_data  <- to_binary(ss_data,  topic_cols)
kgl_data <- to_binary(kgl_data, topic_cols)
sdc_data <- to_binary(sdc_data, topic_cols)
wbg_data <- to_binary(wbg_data, topic_cols)
Now that we have binary values, we can compute sums representing the number of URLs in which each topic was found.
Furthermore, by dividing by the total count of URLs we get a ratio, which is much more appropriate for comparing across data sets of different sizes.
dsc_sums <- c(sum(dsc_data$big_data), sum(dsc_data$business_intelligence),
              sum(dsc_data$data_mining), sum(dsc_data$deep_learning),
              sum(dsc_data$forecast), sum(dsc_data$machine_learning),
              sum(dsc_data$personalization), sum(dsc_data$predictive_modeling),
              sum(dsc_data$quantitative)) / length(dsc_data$url)
dsc_sums
## [1] 0.296726067 0.061748860 0.191462909 0.217985910 0.067965189 0.342312474
## [7] 0.002900953 0.031910485 0.040613344
# wbg_data$big_data appears to have come through as text; coerce to integer so
# sum() works (values that fail to parse become NA, which propagates below)
wbg_data$big_data <- as.integer(wbg_data$big_data)
wbg_sums <- c(sum(wbg_data$big_data), sum(wbg_data$business_intelligence),
              sum(wbg_data$data_mining), sum(wbg_data$deep_learning),
              sum(wbg_data$forecast), sum(wbg_data$machine_learning),
              sum(wbg_data$personalization), sum(wbg_data$predictive_modeling),
              sum(wbg_data$quantitative)) / length(wbg_data$url)
wbg_sums
## [1] NA 0.080793763 0.035435861 0.074415308 0.063075833 0.153082920
## [7] 0.008504607 0.006378455 0.016300496
kgl_sums <- c(sum(kgl_data$big_data), sum(kgl_data$business_intelligence),
              sum(kgl_data$data_mining), sum(kgl_data$deep_learning),
              sum(kgl_data$forecast), sum(kgl_data$machine_learning),
              sum(kgl_data$personalization), sum(kgl_data$predictive_modeling),
              sum(kgl_data$quantitative)) / length(kgl_data$url)
kgl_sums
## [1] 0.06437768 0.01144492 0.09012876 0.09012876 0.14592275 0.47067239
## [7] 0.01001431 0.03290415 0.02432046
sdc_sums <- c(sum(sdc_data$big_data), sum(sdc_data$business_intelligence),
              sum(sdc_data$data_mining), sum(sdc_data$deep_learning),
              sum(sdc_data$forecast), sum(sdc_data$machine_learning),
              sum(sdc_data$personalization), sum(sdc_data$predictive_modeling),
              sum(sdc_data$quantitative)) / length(sdc_data$url)
sdc_sums
## [1] NA NA 0.085155351 0.032604526 0.053701573 0.137706176
## [7] 0.025700038 0.006137323 0.016110472
ss_sums <- c(sum(ss_data$big_data), sum(ss_data$business_intelligence),
             sum(ss_data$data_mining), sum(ss_data$deep_learning),
             sum(ss_data$forecast), sum(ss_data$machine_learning),
             sum(ss_data$personalization), sum(ss_data$predictive_modeling),
             sum(ss_data$quantitative)) / length(ss_data$url)
ss_sums
## [1] 0.1014705882 0.0007352941 0.0051470588 0.0154411765 0.0161764706
## [6] 0.0573529412 0.0000000000 0.0014705882 0.0382352941
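As an aside, the same ratios can be computed more compactly with colSums() over the topic columns. A sketch (this assumes every topic column is numeric, so wbg_data$big_data must already be coerced as above):
# Sketch: share of URLs mentioning each topic, one call per data set
topic_ratios <- function(df, cols) {
  unname(colSums(df[cols]) / nrow(df))
}
topic_ratios(dsc_data, topic_cols)  # identical values to dsc_sums above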
Now let’s look at some visualizations across these topics:
var_names <- c("Big Data", "Business Intelligence", "Data Mining", "Deep Learning", "Forecast", "Machine Learning", "Personalization", "Predictive Modeling", "Quantitative")
data_df <- data.frame(var_names, dsc_sums, wbg_sums, kgl_sums, sdc_sums, ss_sums)
data_df
## var_names dsc_sums wbg_sums kgl_sums sdc_sums
## 1 Big Data 0.296726067 NA 0.06437768 NA
## 2 Business Intelligence 0.061748860 0.080793763 0.01144492 NA
## 3 Data Mining 0.191462909 0.035435861 0.09012876 0.085155351
## 4 Deep Learning 0.217985910 0.074415308 0.09012876 0.032604526
## 5 Forecast 0.067965189 0.063075833 0.14592275 0.053701573
## 6 Machine Learning 0.342312474 0.153082920 0.47067239 0.137706176
## 7 Personalization 0.002900953 0.008504607 0.01001431 0.025700038
## 8 Predictive Modeling 0.031910485 0.006378455 0.03290415 0.006137323
## 9 Quantitative 0.040613344 0.016300496 0.02432046 0.016110472
## ss_sums
## 1 0.1014705882
## 2 0.0007352941
## 3 0.0051470588
## 4 0.0154411765
## 5 0.0161764706
## 6 0.0573529412
## 7 0.0000000000
## 8 0.0014705882
## 9 0.0382352941
Now that we’ve got all our data pulled in, it’s actually not completely tidy yet. Let’s gather and spread to get it set up correctly.
To visualize it easily via ggplot, we’ll gather it into long format.
However, to present it as a data frame, we’ll gather and then spread to transpose the rows and columns.
data_df_gather <- tidyr::gather(data_df, var, ratio, -var_names)
data_df <- tidyr::spread(data_df_gather, var_names, ratio)
data_df
## var Big Data Business Intelligence Data Mining Deep Learning
## 1 dsc_sums 0.29672607 0.0617488603 0.191462909 0.21798591
## 2 kgl_sums 0.06437768 0.0114449213 0.090128755 0.09012876
## 3 sdc_sums NA NA 0.085155351 0.03260453
## 4 ss_sums 0.10147059 0.0007352941 0.005147059 0.01544118
## 5 wbg_sums NA 0.0807937633 0.035435861 0.07441531
## Forecast Machine Learning Personalization Predictive Modeling
## 1 0.06796519 0.34231247 0.002900953 0.031910485
## 2 0.14592275 0.47067239 0.010014306 0.032904149
## 3 0.05370157 0.13770618 0.025700038 0.006137323
## 4 0.01617647 0.05735294 0.000000000 0.001470588
## 5 0.06307583 0.15308292 0.008504607 0.006378455
## Quantitative
## 1 0.04061334
## 2 0.02432046
## 3 0.01611047
## 4 0.03823529
## 5 0.01630050
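As a side note, in current versions of tidyr, gather() and spread() are superseded by pivot_longer() and pivot_wider(). The same reshaping could be written as below; this is a sketch meant to replace, not follow, the gather/spread calls above (data_df has already been spread at this point):
# Run these in place of the gather()/spread() pair above
data_df_gather <- tidyr::pivot_longer(data_df, -var_names,
                                      names_to = "var", values_to = "ratio")
data_df <- tidyr::pivot_wider(data_df_gather,
                              names_from = var_names, values_from = ratio)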
ggplot(data_df_gather, aes(x = factor(var_names), y = ratio)) +
  facet_wrap(~var) +
  geom_bar(stat = 'identity', aes(fill = factor(var_names))) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
## Warning: Removed 3 rows containing missing values (position_stack).
The warning above comes from the three NA ratios we introduced earlier (big_data and business_intelligence for smartdatacollective, big_data for whatsthebigdata); ggplot simply omits those bars. Next we take a look at a few market sectors to see how often they are mentioned on these popular data science blogs.
dsc_sec_data <- dbGetQuery(connection,"select * from blog_topics.dsc_sec_data")
kgl_sec_data <- dbGetQuery(connection,"select * from blog_topics.kgl_sec_data")
ss_sec_data <- dbGetQuery(connection,"select * from blog_topics.ss_sec_data")
sdc_sec_data <- dbGetQuery(connection,"select * from blog_topics.sdc_sec_data")
wbg_sec_data <- dbGetQuery(connection,"select * from blog_topics.wgb_sec_data")
# Fallback: the same sector crawl data is available as CSV exports on GitHub
#dsc_sec_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/datascience-central-urls-sectors.csv")
#kgl_sec_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/kaggle-urls-sectors.csv")
#ss_sec_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/simplystatistics-urls-sectors.csv")
#sdc_sec_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/smartdatacollective-urls-sectors.csv")
#wbg_sec_data <- read.csv("https://raw.githubusercontent.com/murphystout/data-607/master/whatsthebigdata-urls-sectors.csv")
# whatsthebigdata sitewide adjustments, again based on manual inspection
wbg_sec_data$agriculture[wbg_sec_data$agriculture > 0] <- wbg_sec_data$agriculture[wbg_sec_data$agriculture > 0] - 1
# "disease" keywords show up several times in sitewide elements, so treat counts under 5 as zero
wbg_sec_data$disease[wbg_sec_data$disease < 5] <- 0
wbg_sec_data$dna[wbg_sec_data$dna > 0] <- wbg_sec_data$dna[wbg_sec_data$dna > 0] - 1
sector_cols <- c("agriculture", "disease", "dna", "weather")

# Reuse the binarizing helper from the topics analysis above
dsc_sec_data <- to_binary(dsc_sec_data, sector_cols)
ss_sec_data  <- to_binary(ss_sec_data,  sector_cols)
kgl_sec_data <- to_binary(kgl_sec_data, sector_cols)
sdc_sec_data <- to_binary(sdc_sec_data, sector_cols)
wbg_sec_data <- to_binary(wbg_sec_data, sector_cols)
dsc_sec_sums <- c(sum(dsc_sec_data$agriculture), sum(dsc_sec_data$disease),
                  sum(dsc_sec_data$dna), sum(dsc_sec_data$weather)) / length(dsc_sec_data$url)
dsc_sec_sums
## [1] 0.00000000 0.17813765 0.01781377 0.00000000
wbg_sec_sums <- c(sum(wbg_sec_data$agriculture), sum(wbg_sec_data$disease),
                  sum(wbg_sec_data$dna), sum(wbg_sec_data$weather)) / length(wbg_sec_data$url)
wbg_sec_sums
## [1] 0.014180672 0.123949580 0.039915966 0.007352941
kgl_sec_sums <- c(sum(kgl_sec_data$agriculture), sum(kgl_sec_data$disease),
                  sum(kgl_sec_data$dna), sum(kgl_sec_data$weather)) / length(kgl_sec_data$url)
kgl_sec_sums
## [1] 0.001169591 0.185964912 0.025730994 0.056140351
sdc_sec_sums <- c(sum(sdc_sec_data$agriculture), sum(sdc_sec_data$disease),
                  sum(sdc_sec_data$dna), sum(sdc_sec_data$weather)) / length(sdc_sec_data$url)
sdc_sec_sums
## [1] 0.009381898 0.187086093 0.019867550 0.043046358
ss_sec_sums <- c(sum(ss_sec_data$agriculture), sum(ss_sec_data$disease),
                 sum(ss_sec_data$dna), sum(ss_sec_data$weather)) / length(ss_sec_data$url)
ss_sec_sums
## [1] 0.004 0.278 0.140 0.030
sec_names <- c("Agriculture", "Disease", "DNA", "Weather")
sec_df <- data.frame(sec_names, dsc_sec_sums, wbg_sec_sums, kgl_sec_sums, sdc_sec_sums, ss_sec_sums)
sec_df
## sec_names dsc_sec_sums wbg_sec_sums kgl_sec_sums sdc_sec_sums
## 1 Agriculture 0.00000000 0.014180672 0.001169591 0.009381898
## 2 Disease 0.17813765 0.123949580 0.185964912 0.187086093
## 3 DNA 0.01781377 0.039915966 0.025730994 0.019867550
## 4 Weather 0.00000000 0.007352941 0.056140351 0.043046358
## ss_sec_sums
## 1 0.004
## 2 0.278
## 3 0.140
## 4 0.030
sec_df_gather <- tidyr::gather(sec_df, var, ratio, -sec_names)
sec_df <- tidyr::spread(sec_df_gather, sec_names, ratio)
sec_df
## var Agriculture Disease DNA Weather
## 1 dsc_sec_sums 0.000000000 0.1781377 0.01781377 0.000000000
## 2 kgl_sec_sums 0.001169591 0.1859649 0.02573099 0.056140351
## 3 sdc_sec_sums 0.009381898 0.1870861 0.01986755 0.043046358
## 4 ss_sec_sums 0.004000000 0.2780000 0.14000000 0.030000000
## 5 wbg_sec_sums 0.014180672 0.1239496 0.03991597 0.007352941
ggplot(sec_df_gather, aes(x = factor(sec_names), y = ratio)) +
  facet_wrap(~var) +
  geom_bar(stat = 'identity', aes(fill = factor(sec_names))) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
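Finally, with all of our queries complete, we can release the connection to the Cloud SQL instance:
# Close the database connection opened at the start of the analysis
dbDisconnect(connection)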