Analyzing data for racial equity

library(here)
library(ggplot2)
library(tidyverse)
library(tidycensus)
library(sf)
library(patchwork)

census_api_key("ce143e4c445f1ccaed3aa214715bebcb9a487cfa")

Getting Census data

# https://walker-data.com/tidycensus/articles/basic-usage.html#searching-for-variables
load_variables(year = 2020, dataset = "pl")
racevars <- c(White = "P2_005N", Black = "P2_006N", AmericanIndian = "P2_007N",
    Asian = "P2_008N", HawaiianPacificIslander = "P2_009N", Other = "P2_010N",
    TwoOrMoreRaces = "P2_011N", Hispanic = "P2_002N")

# Retrieve race/ethnicity counts by tract, include
# geospatial data (geometry = TRUE) and summary value
# (summary_var) with each record, and calculate percent See
# basic usage:
# https://walker-data.com/tidycensus/articles/basic-usage.html
# See available geographies:
# https://walker-data.com/tidycensus/articles/basic-usage.html#geography-in-tidycensus
ca_tracts <- get_decennial(year = 2020, sumfile = "pl", variables = racevars,
    geography = "tract", state = "CA", geometry = TRUE, summary_var = "P2_001N") %>%
    mutate(percent = 100 * (value/summary_value))

# Flatten tract data, so that there is one record for each
# tract
flat_ca_tracts <- pivot_wider(ca_tracts, names_from = variable,
    values_from = c(value, percent))

flat_ca_tracts
# Save flattened tract data as shapefile for further
# analysis outside of R
st_write(flat_ca_tracts, here("analyses", "output", "tract_ca_race_ethnicity.shp"),
    delete_dsn = TRUE)
## Warning in abbreviate_shapefile_names(obj): Field names abbreviated for ESRI
## Shapefile driver
## Deleting source `/Users/hannah/DataMade/waterboard-coaching/analyses/output/tract_ca_race_ethnicity.shp' using driver `ESRI Shapefile'
## Writing layer `tract_ca_race_ethnicity' to data source 
##   `/Users/hannah/DataMade/waterboard-coaching/analyses/output/tract_ca_race_ethnicity.shp' using driver `ESRI Shapefile'
## Writing 9129 features with 19 fields and geometry type Multi Polygon.
# Determine whether your denominator is correct
select(as_tibble(ca_tracts), -geometry) %>%
    group_by(GEOID) %>%
    summarise(total_pct = sum(percent)) %>%
    head()

For more on retrieving Census data with R: https://walker-data.com/census-r/mapping-census-data-with-r.html

Getting inspection data

# Data retrieved from
# https://data.ca.gov/dataset/?q=Inspections&organization=california-state-water-resources-control-board
complaints <- read_csv(here("data", "raw", "water-rights-complaints.csv"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
##   dat <- vroom(...)
##   problems(dat)
investigations <- read_csv(here("data", "raw", "water-rights-investigations.csv"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Add investigation data to complaint, where applicable
complaints_with_investigations <- left_join(complaints, investigations,
    by = join_by(RELATED_INVESTIGATION_ID == INVESTIGATION_ID),
    keep = TRUE) %>%
    select(CID_NUMBER.x, DATE_COMPLAINT_RECEIVED, DATE_COMPLAINT_CLOSED,
        REASON_FOR_CLOSING, COMPLAINT_COUNTY, COMPLAINT_LOCATION_LATITUDE,
        COMPLAINT_LONGITUDE_LATITUDE, INVESTIGATION_ID, INVESTIGATION_START_DATE.x,
        INVESTIGATION_CLOSURE_DATE.x, POI_COUNTY, LATITUDE, LONGITUDE,
        NUM_VIOLATION, NUM_ENFORCEMENT_ACTION)

complaints_with_investigations

Exploratory analysis

Where are complaints made? Where are they investigated?

# Remove records without coordinates, as st_as_sf will
# error on null values
complaints_with_coordinates <- filter(complaints_with_investigations,
    !is.na(COMPLAINT_LOCATION_LATITUDE) & !is.na(COMPLAINT_LONGITUDE_LATITUDE))

# Project tracts and complaints using the same coordinate
# reference system Albers California - https://epsg.io/3310
flat_ca_tracts_projected <- st_transform(flat_ca_tracts, 3310)

complaints_projected <- complaints_with_coordinates %>%
    st_as_sf(coords = c("COMPLAINT_LONGITUDE_LATITUDE", "COMPLAINT_LOCATION_LATITUDE"),
        crs = 4326) %>%
    st_transform(3310)

# Perform spatial join (will fail if CRS does not match)
# and calculate count of complaints by tract
complaints_by_tract <- st_join(flat_ca_tracts_projected, complaints_projected) %>%
    group_by(GEOID) %>%
    summarise(n_complaints = n_distinct(CID_NUMBER.x, na.rm = TRUE))

complaints_by_tract
# Map choropleth of complaints by tract
# https://ggplot2.tidyverse.org/reference/scale_brewer.html
ggplot() + geom_sf(complaints_by_tract, mapping = aes(geometry = geometry,
    fill = n_complaints), color = NA) + scale_fill_distiller(palette = "OrRd",
    direction = 1)

# Calculate number of complaints investigated by tract
response_by_tract <- st_join(complaints_projected, flat_ca_tracts_projected) %>%
    mutate(INVESTIGATED = !is.na(INVESTIGATION_ID)) %>%
    group_by(GEOID) %>%
    count(INVESTIGATED)

# Calculate investigation rate by tract
investigation_rate_by_tract <- as_tibble(response_by_tract) %>%
    select(-geometry) %>%
    pivot_wider(names_from = INVESTIGATED, names_prefix = "INVESTIGATED_",
        values_from = n, values_fill = 0) %>%
    mutate(N_COMPLAINTS = INVESTIGATED_TRUE + INVESTIGATED_FALSE,
        INVESTIGATION_RATE = INVESTIGATED_TRUE/N_COMPLAINTS *
            100)

# Are we making a population map? The number of complaints
# per tract could reflect the population of that tract.
# Join investigation count and rate back to demographics to
# calculate complaints per capita by tract.
investigation_rate_and_complaints_per_capita_by_tract <- left_join(flat_ca_tracts_projected,
    investigation_rate_by_tract, by = "GEOID") %>%
    mutate(COMPLAINTS_PER_CAPITA = N_COMPLAINTS/summary_value)

investigation_rate_and_complaints_per_capita_by_tract
# Map choropleth of complaints per capita by tract
complaints_plt <- ggplot() + geom_sf(investigation_rate_and_complaints_per_capita_by_tract,
    mapping = aes(geometry = geometry, fill = COMPLAINTS_PER_CAPITA),
    color = NA) + scale_fill_distiller(palette = "OrRd", direction = 1)

# Map choropleth of investigation rate by tract
investigations_plt <- ggplot() + geom_sf(investigation_rate_and_complaints_per_capita_by_tract,
    mapping = aes(geometry = geometry, fill = INVESTIGATION_RATE),
    color = NA) + scale_fill_distiller(palette = "OrRd", direction = 1)

complaints_plt + investigations_plt

For more on spatial joins and analysis: https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html

What are the demographics of areas where complaints are filed?

# Add binary variables complaints filed and complaints
# investigated
complaint_and_investigation_demographics <- investigation_rate_and_complaints_per_capita_by_tract %>%
    mutate(complaints_filed = ifelse(is.na(N_COMPLAINTS), FALSE,
        N_COMPLAINTS > 0), complaints_investigated = ifelse(is.na(INVESTIGATED_TRUE),
        FALSE, INVESTIGATED_TRUE > 0))
# Chart histogram of White population by tract
ggplot(complaint_and_investigation_demographics, aes(x = percent_White)) +
    geom_histogram()
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Chart histogram of White population by tract by whether a
# complaint was filed
ggplot(complaint_and_investigation_demographics, aes(x = percent_White)) +
    geom_histogram() + facet_wrap(~complaints_filed)
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Calculate mean percent of each race/ethnicity for group
# of tracts where complaints not filed and group of tracts
# where complaints filed
mean_complaint_demographics <- as_tibble(complaint_and_investigation_demographics) %>%
    group_by(complaints_filed) %>%
    summarise(white = mean(percent_White, na.rm = TRUE), black = mean(percent_Black,
        na.rm = TRUE), amind = mean(percent_AmericanIndian, na.rm = TRUE),
        asian = mean(percent_Asian, na.rm = TRUE), hpi = mean(percent_HawaiianPacificIslander,
            na.rm = TRUE), other = mean(percent_Other, na.rm = TRUE),
        mixed = mean(percent_TwoOrMoreRaces, na.rm = TRUE), hispanic = mean(percent_Hispanic,
            na.rm = TRUE)) %>%
    pivot_longer(!complaints_filed, names_to = "race_ethnicity",
        values_to = "mean")

# Calculate median percent of each race/ethnicity for group
# of tracts where complaints not filed and group of tracts
# where complaints filed
median_complaint_demographics <- as_tibble(complaint_and_investigation_demographics) %>%
    group_by(complaints_filed) %>%
    summarise(white = median(percent_White, na.rm = TRUE), black = median(percent_Black,
        na.rm = TRUE), amind = median(percent_AmericanIndian,
        na.rm = TRUE), asian = median(percent_Asian, na.rm = TRUE),
        hpi = median(percent_HawaiianPacificIslander, na.rm = TRUE),
        other = median(percent_Other, na.rm = TRUE), mixed = median(percent_TwoOrMoreRaces,
            na.rm = TRUE), hispanic = median(percent_Hispanic,
            na.rm = TRUE)) %>%
    pivot_longer(!complaints_filed, names_to = "race_ethnicity",
        values_to = "median")

# Chart mean and median population makeup for group of
# tracts where complaints not filed and group of tracts
# where complaints filed
mean_plt <- ggplot(mean_complaint_demographics, aes(x = complaints_filed,
    y = mean, fill = race_ethnicity)) + geom_bar(stat = "identity")

median_plt <- ggplot(median_complaint_demographics, aes(x = complaints_filed,
    y = median, fill = race_ethnicity)) + geom_bar(stat = "identity")

mean_plt + median_plt

What are demographics of areas where complaints are investigated?

# Chart histogram of White population by tract and whether
# a complaint was filed plus whether a filed complaint was
# investigated
ggplot(complaint_and_investigation_demographics, aes(x = percent_White)) +
    geom_histogram() + facet_wrap(~complaints_filed + complaints_investigated)
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Calculate mean percent of each race/ethnicity for groups
# of tracts where complaint not filed, complaint filed but
# not investigated, and complaint investigated
mean_investigation_demographics <- as_tibble(complaint_and_investigation_demographics) %>%
    group_by(complaints_filed + complaints_investigated) %>%
    summarise(white = mean(percent_White, na.rm = TRUE), black = mean(percent_Black,
        na.rm = TRUE), amind = mean(percent_AmericanIndian, na.rm = TRUE),
        asian = mean(percent_Asian, na.rm = TRUE), hpi = mean(percent_HawaiianPacificIslander,
            na.rm = TRUE), other = mean(percent_Other, na.rm = TRUE),
        mixed = mean(percent_TwoOrMoreRaces, na.rm = TRUE), hispanic = mean(percent_Hispanic,
            na.rm = TRUE)) %>%
    pivot_longer(!`complaints_filed + complaints_investigated`,
        names_to = "race_ethnicity", values_to = "mean")

# Calculate median percent of each race/ethnicity for
# groups of tracts where complaint not filed, complaint
# filed but not investigated, and complaint investigated
median_investigation_demographics <- as_tibble(complaint_and_investigation_demographics) %>%
    group_by(complaints_filed + complaints_investigated) %>%
    summarise(white = median(percent_White, na.rm = TRUE), black = median(percent_Black,
        na.rm = TRUE), amind = median(percent_AmericanIndian,
        na.rm = TRUE), asian = median(percent_Asian, na.rm = TRUE),
        hpi = median(percent_HawaiianPacificIslander, na.rm = TRUE),
        other = median(percent_Other, na.rm = TRUE), mixed = median(percent_TwoOrMoreRaces,
            na.rm = TRUE), hispanic = median(percent_Hispanic,
            na.rm = TRUE)) %>%
    pivot_longer(!`complaints_filed + complaints_investigated`,
        names_to = "race_ethnicity", values_to = "median")

# Chart mean and median population makeup for groups of
# tracts where complaint not filed, complaint filed but not
# investigated, and complaint investigated
mean_investigation_plt <- ggplot(mean_investigation_demographics,
    aes(x = factor(`complaints_filed + complaints_investigated`),
        y = mean, fill = race_ethnicity)) + geom_bar(stat = "identity") +
    scale_x_discrete(name = "condition", labels = c(`0` = "complaint not filed",
        `1` = "complaint filed not investigated", `2` = "complaint investigated"))

median_investigation_plt <- ggplot(median_investigation_demographics,
    aes(x = factor(`complaints_filed + complaints_investigated`),
        y = median, fill = race_ethnicity)) + geom_bar(stat = "identity") +
    scale_x_discrete(name = "condition", labels = c(`0` = "complaint not filed",
        `1` = "complaint filed not investigated", `2` = "complaint investigated"))

mean_investigation_plt + median_investigation_plt

Are we looking at the right thing?

We might infer that, because tracts where complaints are made and investigated are proportionally more white than tracts where complaints are not made, the white population is receiving a disproportionate amount of CWB resources.

But, there are many reasons why a complaint might not be investigated.

# Why are complaints closed?
count(complaints, REASON_FOR_CLOSING, sort = TRUE)

Of these, “Discretionary dismissal by Division or Office” represents a decision point - something we can effect by adjusting operations. Let’s take a look at the demographics of tracts where discretionary dismissals are made.

# Calculate number of discretionary dismissals by tract
discretionary_dismissal_by_tract <- st_join(flat_ca_tracts_projected,
    filter(complaints_projected, REASON_FOR_CLOSING == "Discretionary dismissal by Division or Office")) %>%
    group_by(GEOID) %>%
    summarise(n_discretionary_dismissals = n_distinct(CID_NUMBER.x,
        na.rm = TRUE))

# Calculate dismissal rate and create binary variable
# indicating whether rate is higher than average
dismissal_rate_by_tract <- st_join(discretionary_dismissal_by_tract,
    complaints_by_tract) %>%
    filter(n_complaints > 0) %>%
    mutate(dismissal_rate = n_discretionary_dismissals/n_complaints,
        higher_than_average = dismissal_rate > mean(dismissal_rate))

# Join dismissal rate to tracts
dismissal_demographics <- st_join(flat_ca_tracts_projected, dismissal_rate_by_tract)

dismissal_demographics
# Calculate mean population for groups where dismissal rate
# below and above average
mean_dismissal_demographics <- as_tibble(dismissal_demographics) %>%
    group_by(higher_than_average) %>%
    summarise(white = mean(percent_White, na.rm = TRUE), black = mean(percent_Black,
        na.rm = TRUE), amind = mean(percent_AmericanIndian, na.rm = TRUE),
        asian = mean(percent_Asian, na.rm = TRUE), hpi = mean(percent_HawaiianPacificIslander,
            na.rm = TRUE), other = mean(percent_Other, na.rm = TRUE),
        mixed = mean(percent_TwoOrMoreRaces, na.rm = TRUE), hispanic = mean(percent_Hispanic,
            na.rm = TRUE)) %>%
    filter(!is.na(higher_than_average)) %>%
    pivot_longer(!higher_than_average, names_to = "race_ethnicity",
        values_to = "mean")

# Plot demographics of tracts where dismissal rate below
# and above average
ggplot(mean_dismissal_demographics, aes(x = higher_than_average,
    y = mean, fill = race_ethnicity)) + geom_bar(stat = "identity")

Here, we see a much smaller difference in the racial/ethnic makeup between tracts with higher and lower than average dismissal rates.