US_sates <- data.frame(
"ANSI_let" = c("AL", "AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA",
"HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT",
"VA","WA","WV","WI","WY"),
"ANSI_num" = c("01","02","04","05","06","08","09","10","11","12",
"13","15","16","17","18","19","20","21","22","23","24","25",
"26","27","28","29","30","31","32","33","34","35","36",
"37","38","39","40","41","42","44","45","46","47","48","49",
"50","51","53","54","55","56"),
"USPS" = c("AL", "AK","AZ","AR","CA","CO","CT","DE","DC","FL","GA",
"HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI",
"MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC",
"ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT",
"VA","WA","WV","WI","WY")
)
The goals of this document is to summarize some important key facts about the FCC data (June 2022 version).
The data can be downloaded at https://broadbandmap.fcc.gov/data-download/nationwide-data
In January 2023 data can be accessed at:
“Street address” level (called location)
States level (per release)
Providers level
One location can have multiple providers and sometimes multiple services.
Once selected a state (no option to get an US region or all the States) the data is provided by technology (zip files).
We downloaded all the data in the CORI “Rstudio server”. To list all
the files used (368 files, 79GB) is made with
R/list_files_used.R
This list of files is provided below:
library(DT)
files_used <- readRDS("../shareable_data/files_used.rds")
files_used <- merge(files_used, US_sates,
by.x = "States", by.y = "ANSI_num", all = TRUE)
files_used <- files_used[,c("ANSI_let", "Technology", "nb_lines")]
DT::datatable(files_used, rownames = FALSE,
extensions = 'Buttons',
options = list(
dom = 'Blfrtip',
buttons = list('copy', 'print', list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download')
)
)
)
cat *.csv | wc -l
# 866703684
ls | wc -l
# 368
# keep in mind that wc -l count headers
We have a total of 866 703 316 rows (866 703 684 - 368 headers) for 112 288 407 unique locations.
A row: location x provider x technology (fiber, cable) x services (25/3)
Every rows has:
frn: 10 digit FCC register number of the enity that
submitted the data (difference with provider_id and
brand_name ?).provider_id/brand_name: unique
identifier/name of service providertechnology: see below (side note: 72,
Licensed-by-Rule Terrestrial Fixed Wireless is not present in
this data set)max_advertised_download_speed: Mbpsmax_advertised_upload_speed: Mbpslow_latency: low latency, defined as having
round-trip latency of less than or equal to 100 ms based on the 95th
percentile of measurements.business_residential_code: * B:
Business-only locationR: Residential-only locationX: Bothstate_usps : DCblock_geoid : 15-digit U.S. Census Bureau FIPS code
(latest U.S. Census Bureau decennial data.)h3_res8_id: H3 index (resolution 8/9)select
technology,
count(location_id)
from
staging.raw
group by
technology;
Name <- c("Other", "Copper Wire", "Coaxial Cable", "Fiber to the Premises", "Geostationary Satellite", "Non-geostationary Satellite", "Unlicensed Terrestrial Fixed Wireless", "Licensed Terrestrial Fixed Wireless")
Code <- c(0, 10, 40, 50, 60, 61, 70, 71)
technology <- data.frame(Name,
Code,
Count = c(3660, 64747570, 100089306, 52607977, 336639409, 110410618, 37493532, 164711244))
technology$Count <- format(technology$Count, big.mark = " ")
DT::datatable(technology,
rownames = FALSE,
extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = list('copy', 'print', list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download')
)
)
)
select
technology,
sum(case when low_latency = true then 1 else 0 end) as cnt_low_lty,
count(technology)
from
staging.raw
group by
technology;
low_lty <- data.frame(Name,
Code,
Count_low_lty =c(2757, 64627908, 100076508, 52533143,
0, 110410618, 37135215, 164604085),
Count = c(3660, 64747570, 100089306, 52607977, 336639409, 110410618, 37493532, 164711244))
low_lty$Perc <- round((low_lty$Count_low_lty / low_lty$Count * 100), 2)
low_lty$Count <- format(low_lty$Count, big.mark = " ")
low_lty$Count_low_lty <- format(low_lty$Count_low_lty, big.mark = " ")
DT::datatable(low_lty,
rownames = FALSE,
extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = list('copy', 'print', list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download')
)
)
)
-- data-sat_only.csv
select
geoid_st,
count(geoid_st) as cnt_loc,
sum(case when only_sat = true then 1 else 0 end) as sat_only_loc,
count(distinct geoid_bl) as cnt_bl
from
staging.broadband_map_loc_2022june
group by
geoid_st;
-- data-non_sat.csv
select
geoid_st,
sum(cnt_locations) as tot,
sum(cnt_non_sat_wireless_locations) as cnt_non_air
from
staging.block_test
group by
geoid_st;
sat <- read.csv("../shareable_data/data-sat_only.csv",
colClasses = c("character", "integer", "integer", "integer"))
sat$Perc_sat <- round((sat$sat_only_loc / sat$cnt_loc) * 100, 2)
non_sat <- read.csv("../shareable_data/data-non_sat.csv",
colClasses = c("character", "integer", "integer"))
non_sat$sat_wireles_only <- sat$cnt_loc - non_sat$cnt_non_air
non_sat$Perc_sat_wireles_only <- round((non_sat$sat_wireles_only / sat$cnt_loc) * 100, 2)
sat <- merge(sat, US_sates,
by.x = "geoid_st", by.y = "ANSI_num", all = TRUE)
sat <- merge(sat, non_sat,
by.x = "geoid_st", by.y = "geoid_st", all = TRUE)
sat$cnt_loc <- format(sat$cnt_loc, big.mark = " ")
sat$sat_only_loc <- format(sat$sat_only_loc, big.mark = " ")
sat$sat_wireles_only <- format(sat$sat_wireles_only, big.mark = " ")
sat <- sat[,c("ANSI_let", "cnt_loc", "sat_only_loc",
"sat_wireles_only","Perc_sat", "Perc_sat_wireles_only")]
If you look satellite only Alaska and Arkansans display particular numbers (10,68% and 7,11%)
If you include Wireless (locations are only covered by satellite or wireless or both) :
DT::datatable(sat, rownames = FALSE,
extensions = 'Buttons',
options = list(
dom = 'Blfrtip',
buttons = list('copy', 'print', list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download')
)
)
)
select
geoid_st,
sum(case when bb_access_flag_25_3 = true then 1 else 0 end ) as cnt_bl_25_3,
sum(case when bb_access_flag_100_20 = true then 1 else 0 end ) as cnt_bl_100_20,
count(geoid_st) as total_block,
sum(cnt_bb_access_flag_25_3_served) as cnt_loc_25_3,
sum(cnt_bb_access_flag_100_20_served) as cnt_loc_100_20,
sum(cnt_locations) as total_location
from
sch_broadband.bb_map_block_2022june
group by
geoid_st;
--- w/o sat and wireless
select
geoid_st,
sum(cnt_25_3) as cnt_loc_25_3,
sum(cnt_100_20) as cnt_loc_100_20,
sum(case when flag_25_3 = true then 1 else 0 end ) as cnt_bl_25_3,
sum(case when flag_100_20 = true then 1 else 0 end ) as cnt_bl_100_20
from
staging.block_test
group by
geoid_st;
served_st_loc <- read.csv("../shareable_data/data-served_states_sat.csv",
colClasses = c("character", "integer",
"integer", "integer",
"integer", "integer", "integer"))
served_st_loc_no_sat <- read.csv("../shareable_data/data-served_states_nosat.csv",
colClasses = c("character", "integer",
"integer", "integer", "integer"))
names(served_st_loc_no_sat) <- c("geoid_st", "cnt_loc_25_3_noA","cnt_loc_100_20_noA", "cnt_bl_25_3_noA", "cnt_bl_100_20_noA")
served_st_loc <- merge(served_st_loc, served_st_loc_no_sat,
by.x = "geoid_st", by.y = "geoid_st", all = TRUE)
served_st_loc <- merge(served_st_loc, US_sates,
by.x = "geoid_st", by.y = "ANSI_num", all = TRUE)
served_st_loc$Perc_25_3 <- round((served_st_loc$cnt_loc_25_3 / served_st_loc$total_location * 100), 2)
served_st_loc$Perc_25_3_noA <- round((served_st_loc$cnt_loc_25_3_noA / served_st_loc$total_location * 100), 2)
served_st_loc$Perc_100_20 <- round((served_st_loc$cnt_loc_100_20 / served_st_loc$total_location * 100), 2)
served_st_loc$Perc_100_20_noA <- round((served_st_loc$cnt_loc_100_20_noA / served_st_loc$total_location * 100), 2)
served_st_loc$Total_locations <- format(served_st_loc$total_location, big.mark = " ")
served_st_loc <- served_st_loc[ ,c("ANSI_let", "Perc_25_3", "Perc_25_3_noA", "Perc_100_20", "Perc_100_20_noA","Total_locations")]
In US we have 112 288 407 unique locations.
The coverage (>= 25/3) per states vary from 85 - 100% (average: 99.56 %)
The coverage (>= 100/20) per states vary from 70 -100% (average: 91.93 %)
The coverage (>= 25/3) per states range from 72 - 99.77% (average: 87.71 %)
The coverage (>= 100/20) per states range from 66 - 99.77% (average: 85.35 %)
DT::datatable(served_st_loc,rownames = FALSE ,
extensions = 'Buttons',
options = list(
dom = 'Blfrtip',
buttons = list('copy', 'print', list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download')
)
)
)
served_st_bl <- read.csv("../shareable_data/data-served_states_sat.csv",
colClasses = c("character", "integer",
"integer", "integer",
"integer", "integer", "integer"))
served_st_bl_no_sat <- read.csv("../shareable_data/data-served_states_nosat.csv",
colClasses = c("character", "integer",
"integer", "integer", "integer"))
names(served_st_bl_no_sat) <- c("geoid_st", "cnt_loc_25_3_noA","cnt_loc_100_20_noA", "cnt_bl_25_3_noA", "cnt_bl_100_20_noA")
served_st_bl <- merge(served_st_bl, US_sates,
by.x = "geoid_st", by.y = "ANSI_num", all = TRUE)
served_st_bl <- merge(served_st_bl, served_st_bl_no_sat,
by.x = "geoid_st", by.y = "geoid_st", all = TRUE)
served_st_bl$Perc_25_3 <- round((served_st_bl$cnt_bl_25_3 / served_st_bl$total_block * 100), 2)
served_st_bl$Perc_25_3_noA <- round((served_st_bl$cnt_bl_25_3_noA / served_st_bl$total_block * 100), 2)
served_st_bl$Perc_100_20 <- round((served_st_bl$cnt_bl_100_20 / served_st_bl$total_block * 100), 2)
served_st_bl$Perc_100_20_noA <- round((served_st_bl$cnt_bl_100_20_noA / served_st_bl$total_block * 100), 2)
served_st_bl$Total_block <- format(served_st_bl$total_block, big.mark = " ")
served_st_bl <- served_st_bl[ ,c("ANSI_let", "Perc_25_3", "Perc_25_3_noA", "Perc_100_20", "Perc_100_20_noA", "Total_block")]
Last census (2020) has 8 132 968 blocks, we have 6 366 045 of them covered. If we exclude satellite and wireless we have 5 791 623 blocks covered. 574 422 blocks are only covered by satellite and/or wireless (7 571 597 locations).
For coverage (>= 25/3): Only one State (AK) has less than 100%
Range from 69 - 100 (average: 90.79 %) for 100/20
The coverage (>= 25/3) per states range from 62 - 99.21% (average: 83.74 %)
The coverage (>= 100/20) per states range from 54 - 99.21% (average: 80.01 %)
DT::datatable(served_st_bl,rownames = FALSE,
extensions = 'Buttons',
options = list(
dom = 'Blfrtip',
buttons = list('copy', 'print', list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download')
)
)
)
We have block that are:
“empty”
block that has at least a location that match download/upload (or technology)
Should we take into account the coverage per block?
block that are fully covered (all locations match)
(Perc_all_covered)
2.61 - 14.52% for 25/3 (w/o Satellite and Wireless)
2.61 - 14.42% for 100/20 (w/o Satellite and Wireless)
block containing location that does not match criteria
all_covered <- read.csv("../shareable_data/data-all_covered_st.csv",
colClasses = c("character", "integer",
"integer", "double",
"double","integer"))
all_covered <- merge(all_covered, US_sates,
by.x = "geoid_st", by.y = "ANSI_num", all = TRUE)
all_covered$Perc_all_covered_25_3 <- round((all_covered$all_covered_25_3/all_covered$total_location * 100),2)
all_covered$Perc_all_covered_100_20 <- round((all_covered$all_covered_100_20/all_covered$total_location * 100),2)
all_covered$avg_no_25_3 <- round(all_covered$avg_no_25_3 ,2)
all_covered$avg_no_100_20 <- round(all_covered$avg_no_100_20 ,2)
all_covered <- all_covered[,c("ANSI_let", "Perc_all_covered_25_3", "Perc_all_covered_100_20", "avg_no_25_3", "avg_no_100_20")]
DT::datatable(all_covered,rownames = FALSE,
extensions = 'Buttons',
options = list(
dom = 'Blfrtip',
buttons = list('copy', 'print', list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download')
)
)
)
Both (X) to
Residential(R) and Business(B).Empty block : 4440.025 km^2
block not matching 25/3: 3213.279 km^2 (2337.121 in 2021)
block matching 25/3: 17200.1 km^2 (22441.2 in 2021)
block matching 100/20: 15732.16 km^2 (15864.31)
(size of VT: 24 910 km^2, these number exclude Satellite and Wireless)
311530106291112) we have one block
(6 locations) with 1 000 000 in max down/upload.nb of locations with max download >= 10 000 : 2 894 717
nb of locations with max download >= 100 000: 70 190 (9719
blocks)
nb of locations with max download > 100 000: 46 (16 blocks)
nb of locations with max download >= 500 000: 11 (3 blocks)
We have other weird/strange numbers (raw data: services x providers):
raw_speed <- read.csv("../shareable_data/raw_data_technology_speed.csv")
DT::datatable(raw_speed, rownames = FALSE,
extensions = 'Buttons',
options = list(
dom = 'Blfrtip',
buttons = list('copy', 'print', list(
extend = 'collection',
buttons = c('csv', 'excel'),
text = 'Download')
)
)
)
FCC also provides percentage of coverage by technology (or multiple at once) per geography types.
We did not explore this data set outside of testing validity of pipeline.
TODO block empty: should we keep track of them ?
[ ] TODO adding empty block and document that (not urgent).
what are the worst location/block in services < 25/3
[x] 1 TODO counting block 25/3 / 100/20. [x] 1 poorly served (cnt at the
state lvl for location and block). [ ] map with poolry plus
unserved
[ ] 4 TODO: calc sum of area for block 2021/2022 -> same for
VT
[ ] 4 TODO: calc sum of area for block 2021/2022 filter with 25/3
TODO presentation and verification county
TODO: maybe use Vermont counties and compare the old data to new data. I
would use blocks that have at least one location with ≥ 25/3 as the
filtering criteria. Maybe do 100/20 too.
[ ] 2 TODO block served by at least one versus fully served ie all
location has 25/3
[ ] –> 2 maybe merge fully served / not fully served ?
[ ] –> 2 2021/2022 for both tiger 2019 for 2021 ?
[ ] 3 TODO block served by at least one versus fully served ie all
location has 100/20. [ ] –> 2021/2022 for both. [ ] 99 TODO check
with household at county ? block ?
[ ] 10 TODO Housekeeping: move the table in a correct place + add it to metadata schema
[ ] 9 TODO redo pipeline vt for all us adding copper
- 20 TODO all pipeline in SQL instead of R so we have the table in
DB.
[ ] TODO get some example ?
[ ] table for providers (at least for normalization)