The data used here can be obtained from my GitHub repo. I wrote a spider using the Scrapy framework and used Python for some data cleaning. All of the spider and data-processing code is available; please refer to my GitHub repo if you are interested.
library(DBI)
library(RSQLite)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(leaflet)
# Load the two tables this report needs from the SQLite database file.
file <- "../data/Items.db"
driver <- dbDriver("SQLite")
conn <- dbConnect(driver, file)
# Per-city summary table.
city <- dbGetQuery(conn, 'SELECT * FROM cities')
# Geocoding results joined to the job postings they belong to.
geo_job <- dbGetQuery(conn, 'SELECT url,lat,lon,title,company,city,location FROM geo_job INNER JOIN jobs ON geo_job.id_foreign==jobs.id')
# Both queries are finished, so release the database handle instead of
# leaving it open for the rest of the session. invisible() keeps the return
# value out of the rendered report.
invisible(dbDisconnect(conn))
# TRUE for job rows in which every selected column is non-NA.
index <- complete.cases(geo_job)
# Rows that can be plotted on the map.
complete_geo <- geo_job[index, ]
# Number of rows with at least one missing value.
missing_sum <- sum(!index)
# Incomplete rows; columns are picked by name rather than by position so the
# selection does not silently change if the SELECT column order changes.
missing_geo <- geo_job[!index, c("lat", "lon", "title", "location")]
head(missing_geo)
## lat lon title
## 3 NA NA Java/Python服务器后台开发
## 6 NA NA python开发工程师—入职就买6险1金/工资不打折
## 12 NA NA python程序员
## 28 NA NA Python高级软件工程师
## 33 NA NA python开发工程师
## 34 NA NA Python软件工程师--国服
## location
## 3 浦江镇浦锦路2049弄万科VMO-28号楼
## 6 成都市南延线高新孵化园8栋德商国际A座9楼
## 12 上海市普陀区谈家渡路28号盛泉商务大厦19H
## 28 长宁区福泉路99号携程网络技术大楼
## 33 天府大道中段666号希顿国际广场c座31楼3109号
## 34 深圳市龙岗区坂田华为基地五和大道北中控大厦
184 job records are missing latitude and longitude.
# TRUE for city rows with no missing values; print the incomplete ones.
index_city <- complete.cases(city)
city[!index_city, ]
## city count lat lon
## 22 无锡 3 NA NA
## 24 济南 4 NA NA
## 29 泰州 1 NA NA
# Keep only the part of each job's city string before the first "-".
# fixed = TRUE treats "-" as a literal separator rather than a regex.
spl <- strsplit(geo_job$city, "-", fixed = TRUE)
# vapply() guarantees a character vector with one entry per row (NA when the
# split produced nothing), including when geo_job has zero rows.
geo_job$city_simple <- vapply(spl, function(x) x[1], character(1))
# Turn city into a factor whose levels are ordered by descending count, so
# the bars appear from largest to smallest.
city$city <- reorder(city$city, city$count, function(x) -mean(x))
city <- arrange(city, desc(count))
# Bar chart of the 20 largest cities. head() is used instead of city[1:20, ]
# so that fewer than 20 cities does not introduce all-NA padding rows.
p <- ggplot(data = head(city, 20), aes(city, count)) +
  geom_bar(fill = "blue", stat = "identity") +
  labs(x = "City", y = "Count", title = "Python Jobs Count by City")
ggplotly(p)
The top 5 cities with the most Python jobs are 北京, 上海, 深圳, 广州, 成都.
# Hover text for each city marker. paste() joins the pieces with its default
# single-space separator.
city$hover <- paste(city$city, ': ', city$count, " python jobs")
# Break points at the 0/25/50/75/100% quantiles of the job counts.
quan <- quantile(city$count)
# Lower the first break to 0 so the bottom bin starts below the smallest count.
quan[1] <- 0
# Bin each city into one of four ordered groups. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
city$q <- cut(
  city$count,
  breaks = quan,
  labels = paste(c('1st', '2nd', '3rd', '4th'), " Quantile"),
  include.lowest = TRUE,
  ordered_result = TRUE
)
# Geo layout for the bubble map: Asia scope, grey land, white subunit borders
# and black country borders.
g <- list(
  scope = "asia",
  showland = TRUE,
  landcolor = toRGB("gray85"),
  subunitwidth = 1,
  countrywidth = 1,
  subunitcolor = toRGB("white"),
  countrycolor = toRGB("black")
)
# One marker per city, sized by sqrt(count) + 9 and coloured by quantile bin.
# Columns of `city` are referenced with formulas (~col): plotly >= 4.0 no
# longer evaluates bare column names inside plot_ly().
# The former locationmode = "china" argument is dropped: it is not one of
# plotly's documented location modes, and that attribute only applies when
# `locations` is supplied, not to lon/lat markers.
plot_ly(
  city,
  lon = ~lon,
  lat = ~lat,
  text = ~hover,
  marker = list(size = ~sqrt(count) + 9, line = list(width = 0)),
  color = ~q,
  type = "scattergeo"
) %>%
  layout(title = "Python Jobs Count by Cities", geo = g)
The number of Python jobs in each city is represented by the size of its circle. Note that this map is interactive: you can click the quantile labels in the legend at the top right of the map to hide or show the corresponding circles.
# Interactive map with one marker per job that has coordinates. Each popup
# shows the job title as a link to the posting, followed by the company name.
m <- leaflet() %>%
  addTiles() %>%
  addMarkers(
    lng = complete_geo$lon,
    lat = complete_geo$lat,
    popup = paste(
      '<br>', "<a href='", complete_geo$url, "'>",
      complete_geo$title, "</a>", '<br>', complete_geo$company
    )
  )
# Print the widget so it renders in the report.
m
This map shows exactly where each job is located. You can click on a marker to get the URL of each job, which I think is the most amazing part.