library(here)
library(ggplot2)
library(tidyverse)
library(tidycensus)
library(sf)
library(patchwork)
census_api_key("ce143e4c445f1ccaed3aa214715bebcb9a487cfa")
# https://walker-data.com/tidycensus/articles/basic-usage.html#searching-for-variables
load_variables(year = 2020, dataset = "pl")
racevars <- c(White = "P2_005N", Black = "P2_006N", AmericanIndian = "P2_007N",
Asian = "P2_008N", HawaiianPacificIslander = "P2_009N", Other = "P2_010N",
TwoOrMoreRaces = "P2_011N", Hispanic = "P2_002N")
# Retrieve race/ethnicity counts by tract, include
# geospatial data (geometry = TRUE) and summary value
# (summary_var) with each record, and calculate percent See
# basic usage:
# https://walker-data.com/tidycensus/articles/basic-usage.html
# See available geographies:
# https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus
ca_tracts <- get_decennial(year = 2020, sumfile = "pl", variables = racevars,
geography = "tract", state = "CA", geometry = TRUE, summary_var = "P2_001N") %>%
mutate(percent = 100 * (value/summary_value))
# Flatten tract data, so that there is one record for each
# tract
flat_ca_tracts <- pivot_wider(ca_tracts, names_from = variable,
values_from = c(value, percent))
flat_ca_tracts
# Save flattened tract data as shapefile for further
# analysis outside of R
st_write(flat_ca_tracts, here("analyses", "output", "tract_ca_race_ethnicity.shp"),
delete_dsn = TRUE)
## Warning in abbreviate_shapefile_names(obj): Field names abbreviated for ESRI
## Shapefile driver
## Deleting source `/Users/hannah/DataMade/waterboard-coaching/analyses/output/tract_ca_race_ethnicity.shp' using driver `ESRI Shapefile'
## Writing layer `tract_ca_race_ethnicity' to data source
## `/Users/hannah/DataMade/waterboard-coaching/analyses/output/tract_ca_race_ethnicity.shp' using driver `ESRI Shapefile'
## Writing 9129 features with 19 fields and geometry type Multi Polygon.
# Determine whether your denominator is correct
select(as_tibble(ca_tracts), -geometry) %>%
group_by(GEOID) %>%
summarise(total_pct = sum(percent)) %>%
head()
For more on retrieving Census data with R: https://walker-data.com/census-r/mapping-census-data-with-r.html
# Data retrieved from
# https://data.ca.gov/dataset/?q=Inspections&organization=california-state-water-resources-control-board
complaints <- read_csv(here("data", "raw", "water-rights-complaints.csv"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
## dat <- vroom(...)
## problems(dat)
investigations <- read_csv(here("data", "raw", "water-rights-investigations.csv"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
## dat <- vroom(...)
## problems(dat)
# Add investigation data to complaint, where applicable
complaints_with_investigations <- left_join(complaints, investigations,
by = join_by(RELATED_INVESTIGATION_ID == INVESTIGATION_ID),
keep = TRUE) %>%
select(CID_NUMBER.x, DATE_COMPLAINT_RECEIVED, DATE_COMPLAINT_CLOSED,
REASON_FOR_CLOSING, COMPLAINT_COUNTY, COMPLAINT_LOCATION_LATITUDE,
COMPLAINT_LONGITUDE_LATITUDE, INVESTIGATION_ID, INVESTIGATION_START_DATE.x,
INVESTIGATION_CLOSURE_DATE.x, POI_COUNTY, LATITUDE, LONGITUDE,
NUM_VIOLATION, NUM_ENFORCEMENT_ACTION)
complaints_with_investigations
# Remove records without coordinates, as st_as_sf will
# error on null values
complaints_with_coordinates <- filter(complaints_with_investigations,
!is.na(COMPLAINT_LOCATION_LATITUDE) & !is.na(COMPLAINT_LONGITUDE_LATITUDE))
# Project tracts and complaints using the same coordinate
# reference system Albers California - https://epsg.io/3310
flat_ca_tracts_projected <- st_transform(flat_ca_tracts, 3310)
complaints_projected <- complaints_with_coordinates %>%
st_as_sf(coords = c("COMPLAINT_LONGITUDE_LATITUDE", "COMPLAINT_LOCATION_LATITUDE"),
crs = 4326) %>%
st_transform(3310)
# Perform spatial join (will fail if CRS does not match)
# and calculate count of complaints by tract
complaints_by_tract <- st_join(flat_ca_tracts_projected, complaints_projected) %>%
group_by(GEOID) %>%
summarise(n_complaints = n_distinct(CID_NUMBER.x, na.rm = TRUE))
complaints_by_tract
# Map choropleth of complaints by tract
# https://ggplot2.tidyverse.org/reference/scale_brewer.html
ggplot() + geom_sf(complaints_by_tract, mapping = aes(geometry = geometry,
fill = n_complaints), color = NA) + scale_fill_distiller(palette = "OrRd",
direction = 1)
# Calculate number of complaints investigated by tract
response_by_tract <- st_join(complaints_projected, flat_ca_tracts_projected) %>%
mutate(INVESTIGATED = !is.na(INVESTIGATION_ID)) %>%
group_by(GEOID) %>%
count(INVESTIGATED)
# Calculate investigation rate by tract
investigation_rate_by_tract <- as_tibble(response_by_tract) %>%
select(-geometry) %>%
pivot_wider(names_from = INVESTIGATED, names_prefix = "INVESTIGATED_",
values_from = n, values_fill = 0) %>%
mutate(N_COMPLAINTS = INVESTIGATED_TRUE + INVESTIGATED_FALSE,
INVESTIGATION_RATE = INVESTIGATED_TRUE/N_COMPLAINTS *
100)
# Are we making a population map? The number of complaints
# per tract could reflect the population of that tract.
# Join investigation count and rate back to demographics to
# calculate complaints per capita by tract.
investigation_rate_and_complaints_per_capita_by_tract <- left_join(flat_ca_tracts_projected,
investigation_rate_by_tract, by = "GEOID") %>%
mutate(COMPLAINTS_PER_CAPITA = N_COMPLAINTS/summary_value)
investigation_rate_and_complaints_per_capita_by_tract
# Map choropleth of complaints per capita by tract
complaints_plt <- ggplot() + geom_sf(investigation_rate_and_complaints_per_capita_by_tract,
mapping = aes(geometry = geometry, fill = COMPLAINTS_PER_CAPITA),
color = NA) + scale_fill_distiller(palette = "OrRd", direction = 1)
# Map choropleth of investigation rate by tract
investigations_plt <- ggplot() + geom_sf(investigation_rate_and_complaints_per_capita_by_tract,
mapping = aes(geometry = geometry, fill = INVESTIGATION_RATE),
color = NA) + scale_fill_distiller(palette = "OrRd", direction = 1)
complaints_plt + investigations_plt
For more on spatial joins and analysis: https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html
# Add binary variables complaints filed and complaints
# investigated
complaint_and_investigation_demographics <- investigation_rate_and_complaints_per_capita_by_tract %>%
mutate(complaints_filed = ifelse(is.na(N_COMPLAINTS), FALSE,
N_COMPLAINTS > 0), complaints_investigated = ifelse(is.na(INVESTIGATED_TRUE),
FALSE, INVESTIGATED_TRUE > 0))
# Chart histogram of White population by tract
ggplot(complaint_and_investigation_demographics, aes(x = percent_White)) +
geom_histogram()
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Chart histogram of White population by tract by whether a
# complaint was filed
ggplot(complaint_and_investigation_demographics, aes(x = percent_White)) +
geom_histogram() + facet_wrap(~complaints_filed)
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Calculate mean percent of each race/ethnicity for group
# of tracts where complaints not filed and group of tracts
# where complaints filed
mean_complaint_demographics <- as_tibble(complaint_and_investigation_demographics) %>%
group_by(complaints_filed) %>%
summarise(white = mean(percent_White, na.rm = TRUE), black = mean(percent_Black,
na.rm = TRUE), amind = mean(percent_AmericanIndian, na.rm = TRUE),
asian = mean(percent_Asian, na.rm = TRUE), hpi = mean(percent_HawaiianPacificIslander,
na.rm = TRUE), other = mean(percent_Other, na.rm = TRUE),
mixed = mean(percent_TwoOrMoreRaces, na.rm = TRUE), hispanic = mean(percent_Hispanic,
na.rm = TRUE)) %>%
pivot_longer(!complaints_filed, names_to = "race_ethnicity",
values_to = "mean")
# Calculate median percent of each race/ethnicity for group
# of tracts where complaints not filed and group of tracts
# where complaints filed
median_complaint_demographics <- as_tibble(complaint_and_investigation_demographics) %>%
group_by(complaints_filed) %>%
summarise(white = median(percent_White, na.rm = TRUE), black = median(percent_Black,
na.rm = TRUE), amind = median(percent_AmericanIndian,
na.rm = TRUE), asian = median(percent_Asian, na.rm = TRUE),
hpi = median(percent_HawaiianPacificIslander, na.rm = TRUE),
other = median(percent_Other, na.rm = TRUE), mixed = median(percent_TwoOrMoreRaces,
na.rm = TRUE), hispanic = median(percent_Hispanic,
na.rm = TRUE)) %>%
pivot_longer(!complaints_filed, names_to = "race_ethnicity",
values_to = "median")
# Chart mean and median population makeup for group of
# tracts where complaints not filed and group of tracts
# where complaints filed
mean_plt <- ggplot(mean_complaint_demographics, aes(x = complaints_filed,
y = mean, fill = race_ethnicity)) + geom_bar(stat = "identity")
median_plt <- ggplot(median_complaint_demographics, aes(x = complaints_filed,
y = median, fill = race_ethnicity)) + geom_bar(stat = "identity")
mean_plt + median_plt
# Chart histogram of White population by tract and whether
# a complaint was filed plus whether a filed complaint was
# investigated
ggplot(complaint_and_investigation_demographics, aes(x = percent_White)) +
geom_histogram() + facet_wrap(~complaints_filed + complaints_investigated)
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Calculate mean percent of each race/ethnicity for groups
# of tracts where complaint not filed, complaint filed but
# not investigated, and complaint investigated
mean_investigation_demographics <- as_tibble(complaint_and_investigation_demographics) %>%
group_by(complaints_filed + complaints_investigated) %>%
summarise(white = mean(percent_White, na.rm = TRUE), black = mean(percent_Black,
na.rm = TRUE), amind = mean(percent_AmericanIndian, na.rm = TRUE),
asian = mean(percent_Asian, na.rm = TRUE), hpi = mean(percent_HawaiianPacificIslander,
na.rm = TRUE), other = mean(percent_Other, na.rm = TRUE),
mixed = mean(percent_TwoOrMoreRaces, na.rm = TRUE), hispanic = mean(percent_Hispanic,
na.rm = TRUE)) %>%
pivot_longer(!`complaints_filed + complaints_investigated`,
names_to = "race_ethnicity", values_to = "mean")
# Calculate median percent of each race/ethnicity for
# groups of tracts where complaint not filed, complaint
# filed but not investigated, and complaint investigated
median_investigation_demographics <- as_tibble(complaint_and_investigation_demographics) %>%
group_by(complaints_filed + complaints_investigated) %>%
summarise(white = median(percent_White, na.rm = TRUE), black = median(percent_Black,
na.rm = TRUE), amind = median(percent_AmericanIndian,
na.rm = TRUE), asian = median(percent_Asian, na.rm = TRUE),
hpi = median(percent_HawaiianPacificIslander, na.rm = TRUE),
other = median(percent_Other, na.rm = TRUE), mixed = median(percent_TwoOrMoreRaces,
na.rm = TRUE), hispanic = median(percent_Hispanic,
na.rm = TRUE)) %>%
pivot_longer(!`complaints_filed + complaints_investigated`,
names_to = "race_ethnicity", values_to = "median")
# Chart mean and median population makeup for groups of
# tracts where complaint not filed, complaint filed but not
# investigated, and complaint investigated
mean_investigation_plt <- ggplot(mean_investigation_demographics,
aes(x = factor(`complaints_filed + complaints_investigated`),
y = mean, fill = race_ethnicity)) + geom_bar(stat = "identity") +
scale_x_discrete(name = "condition", labels = c(`0` = "complaint not filed",
`1` = "complaint filed not investigated", `2` = "complaint investigated"))
median_investigation_plt <- ggplot(median_investigation_demographics,
aes(x = factor(`complaints_filed + complaints_investigated`),
y = median, fill = race_ethnicity)) + geom_bar(stat = "identity") +
scale_x_discrete(name = "condition", labels = c(`0` = "complaint not filed",
`1` = "complaint filed not investigated", `2` = "complaint investigated"))
mean_investigation_plt + median_investigation_plt
We might infer that, because tracts where complaints are made and investigated are proportionally more white than tracts where complaints are not made, the white population is receiving a disproportionate amount of CWB resources.
But, there are many reasons why a complaint might not be investigated.
# Why are complaints closed?
count(complaints, REASON_FOR_CLOSING, sort = TRUE)
Of these, “Discretionary dismissal by Division or Office” represents a decision point - something we can effect by adjusting operations. Let’s take a look at the demographics of tracts where discretionary dismissals are made.
# Calculate number of discretionary dismissals by tract
discretionary_dismissal_by_tract <- st_join(flat_ca_tracts_projected,
filter(complaints_projected, REASON_FOR_CLOSING == "Discretionary dismissal by Division or Office")) %>%
group_by(GEOID) %>%
summarise(n_discretionary_dismissals = n_distinct(CID_NUMBER.x,
na.rm = TRUE))
# Calculate dismissal rate and create binary variable
# indicating whether rate is higher than average
dismissal_rate_by_tract <- st_join(discretionary_dismissal_by_tract,
complaints_by_tract) %>%
filter(n_complaints > 0) %>%
mutate(dismissal_rate = n_discretionary_dismissals/n_complaints,
higher_than_average = dismissal_rate > mean(dismissal_rate))
# Join dismissal rate to tracts
dismissal_demographics <- st_join(flat_ca_tracts_projected, dismissal_rate_by_tract)
dismissal_demographics
# Calculate mean population for groups where dismissal rate
# below and above average
mean_dismissal_demographics <- as_tibble(dismissal_demographics) %>%
group_by(higher_than_average) %>%
summarise(white = mean(percent_White, na.rm = TRUE), black = mean(percent_Black,
na.rm = TRUE), amind = mean(percent_AmericanIndian, na.rm = TRUE),
asian = mean(percent_Asian, na.rm = TRUE), hpi = mean(percent_HawaiianPacificIslander,
na.rm = TRUE), other = mean(percent_Other, na.rm = TRUE),
mixed = mean(percent_TwoOrMoreRaces, na.rm = TRUE), hispanic = mean(percent_Hispanic,
na.rm = TRUE)) %>%
filter(!is.na(higher_than_average)) %>%
pivot_longer(!higher_than_average, names_to = "race_ethnicity",
values_to = "mean")
# Plot demographics of tracts where dismissal rate below
# and above average
ggplot(mean_dismissal_demographics, aes(x = higher_than_average,
y = mean, fill = race_ethnicity)) + geom_bar(stat = "identity")
Here, we see a much smaller difference in the racial/ethnic makeup between tracts with higher and lower than average dismissal rates.