STI Analysis

Author

Cody Appa

Published

May 4, 2023

Preamble

This project lets the user interactively explore infection rates of the most prevalent STIs: chlamydia, gonorrhea, and syphilis. For each infection, you can mouse over a map of the United States to see CDC data on infection rates by county.

Data

Data was gathered from the CDC website at https://www.cdc.gov/std/statistics/2021/figures.htm.

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(rnaturalearthdata))
suppressPackageStartupMessages(library(rnaturalearth))
suppressPackageStartupMessages(library(sf))
suppressPackageStartupMessages(library(tigris))
library(tidyverse)
library(dplyr)
library(ggplot2)
library(readxl)
library(scales)
library(rnaturalearth)
library(rnaturalearthdata)
library(sf)
library(tigris)

suppressPackageStartupMessages(library(tigris))

# Load the data dictionary (one row per attribute: name, type, description)
# from the accompanying workbook and render it as a table in the report.
STIDictionary <- readxl::read_excel(path = "STISheet.xlsx")
knitr::kable(x = STIDictionary)
Attribute Type Description
STD Categorical Which STI infection the data is for
Year Categorical The year of data acquisition
State Categorical The state the data was acquired in
County Categorical The county the data was acquired in
State FIPS Ordinal Numerical values to differentiate states
County FIPS Ordinal Numerical values to differentiate counties
Rate Quantitative The rate of infection of the STI in question
Rate Category Categorical Infection rate categories from low to high
Code
# Read the 2021 county-level chlamydia extract. The file name is reproduced
# exactly as written, including the space before ".csv".
data <- read.csv(
  file = "Chlamydia - Rates of Reported Cases by County United States 2021 .csv",
  header = TRUE
)

Visualizations

Each of these visualizations is an interactive spatial heat map of the United States. Mousing over an individual county shows the county name and its infection rate.

Code
library(sf)
library(ggplot2)
library(dplyr)
library(ggiraph)

# Chlamydia: interactive county-level map of reported infection rates.

# Read the CDC extract. gsub() strips commas (thousands separators) so the
# Rate column parses as a number; anything that still is not numeric becomes
# NA and is drawn with the fill scale's na.value colour.
data <- read.csv(
  "Chlamydia - Rates of Reported Cases by County United States 2021 .csv",
  header = TRUE
)
data$Rate <- as.numeric(gsub(",", "", data$Rate))

# County polygons (cartographic boundary file, 2020, 1:20m) as an sf object.
# `progress_bar` is spelled out in full rather than relying on partial
# matching of `progress`.
invisible(suppressWarnings({
  us_counties <- tigris::counties(
    cb = TRUE,
    resolution = "20m",
    year = 2020,
    class = "sf",
    progress_bar = FALSE
  )
}))

# State FIPS codes left off the map: Alaska (02), Hawaii (15), American Samoa
# (60), Guam (66), Northern Mariana Islands (69), Puerto Rico (72) and the
# U.S. Virgin Islands (78).
excluded_statefp <- c("02", "15", "60", "66", "69", "72", "78")

# Filter once, then join: this yields the same rows as joining first and
# filtering afterwards, without writing the filter twice.
us_counties_contiguous <- us_counties %>%
  filter(!(STATEFP %in% excluded_statefp))

# Attach rates by county and state name. Counties with no matching row in the
# CSV keep Rate = NA. Rate is already numeric, so no second conversion.
# NOTE(review): name-based matching is sensitive to spelling differences
# between the two sources; a FIPS-based key may be more robust - confirm
# against the CSV columns.
us_counties_data_contiguous <- left_join(
  us_counties_contiguous,
  data,
  by = c("NAMELSAD" = "County", "STATE_NAME" = "State")
)

my_colors <- c("blue", "red", "orange", "yellow")

gg <- ggplot() +
  geom_sf_interactive(
    data = us_counties_data_contiguous,
    aes(
      fill = Rate,
      # The tooltip is HTML: county, state and rate on separate lines.
      tooltip = paste(NAMELSAD, "<br>", STATE_NAME, "<br>", "Rate:", Rate)
    ),
    color = "grey",
    # NOTE(review): ggplot2 >= 3.4.0 sets polygon outline width through
    # `linewidth`; confirm `size` still has the intended effect with the
    # installed version.
    size = 0.1
  ) +
  scale_fill_gradientn(colors = my_colors, na.value = "grey70", name = "Rate") +
  labs(
    title = "Chlamydia Infection Rate by County 2021",
    caption = "Figure 1: Infection rate of chlamydia by county per 100k people"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.caption = element_text(
      hjust = .5,
      size = 8,
      margin = margin(t = 10, r = 10)
    )
  )

# Render the ggplot as an interactive widget.
girafe(code = print(gg))
Code
library(sf)
library(ggplot2)
library(dplyr)
library(ggiraph)

# Gonorrhea: interactive county-level map of reported infection rates.

# Read the CDC extract. gsub() strips commas (thousands separators) so the
# Rate column parses as a number; anything that still is not numeric becomes
# NA and is drawn with the fill scale's na.value colour. as.numeric() has no
# `na.rm` argument, so the previously supplied `na.rm = TRUE` was a silent
# no-op and has been dropped.
data2 <- read.csv(
  "Gonorrhea - Rates of Reported Cases by County United States 2021 .csv",
  header = TRUE
)
data2$Rate <- as.numeric(gsub(",", "", data2$Rate))

# County polygons (cartographic boundary file, 2020, 1:20m) as an sf object.
# `progress_bar` is spelled out in full rather than relying on partial
# matching of `progress`.
invisible(suppressWarnings({
  us_counties <- tigris::counties(
    cb = TRUE,
    resolution = "20m",
    year = 2020,
    class = "sf",
    progress_bar = FALSE
  )
}))

# State FIPS codes left off the map: Alaska (02), Hawaii (15), American Samoa
# (60), Guam (66), Northern Mariana Islands (69), Puerto Rico (72) and the
# U.S. Virgin Islands (78).
excluded_statefp <- c("02", "15", "60", "66", "69", "72", "78")

# Filter once, then join: this yields the same rows as joining first and
# filtering afterwards, without writing the filter twice.
us_counties_contiguous <- us_counties %>%
  filter(!(STATEFP %in% excluded_statefp))

# Attach rates by county and state name. Counties with no matching row in the
# CSV keep Rate = NA. Rate is already numeric, so no second conversion.
# NOTE(review): name-based matching is sensitive to spelling differences
# between the two sources; a FIPS-based key may be more robust - confirm
# against the CSV columns.
us_counties_data_contiguous <- left_join(
  us_counties_contiguous,
  data2,
  by = c("NAMELSAD" = "County", "STATE_NAME" = "State")
)

my_colors <- c("blue", "red", "orange", "yellow")

gg <- ggplot() +
  geom_sf_interactive(
    data = us_counties_data_contiguous,
    aes(
      fill = Rate,
      # The tooltip is HTML: county, state and rate on separate lines.
      tooltip = paste(NAMELSAD, "<br>", STATE_NAME, "<br>", "Rate:", Rate)
    ),
    color = "grey",
    # NOTE(review): ggplot2 >= 3.4.0 sets polygon outline width through
    # `linewidth`; confirm `size` still has the intended effect with the
    # installed version.
    size = 0.1
  ) +
  scale_fill_gradientn(colors = my_colors, na.value = "grey70", name = "Rate") +
  labs(
    title = "Gonorrhea Infection Rate by County 2021",
    caption = "Figure 2: Infection rate of gonorrhea by county per 100k people"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.caption = element_text(
      hjust = .5,
      size = 8,
      margin = margin(t = 10, r = 10)
    )
  )

# Render the ggplot as an interactive widget.
girafe(code = print(gg))
Code
library(sf)
library(ggplot2)
library(dplyr)
library(ggiraph)

# Primary and secondary syphilis: interactive county-level map of reported
# infection rates.

# Read the CDC extract. gsub() strips commas (thousands separators) so the
# Rate column parses as a number; anything that still is not numeric becomes
# NA and is drawn with the fill scale's na.value colour. as.numeric() has no
# `na.rm` argument, so the previously supplied `na.rm = TRUE` was a silent
# no-op and has been dropped.
data3 <- read.csv(
  "Primary and Secondary Syphilis - Rates of Reported Cases by County United States 2021 .csv",
  header = TRUE
)
data3$Rate <- as.numeric(gsub(",", "", data3$Rate))

# County polygons (cartographic boundary file, 2020, 1:20m) as an sf object.
# `progress_bar` is spelled out in full rather than relying on partial
# matching of `progress`.
invisible(suppressWarnings({
  us_counties <- tigris::counties(
    cb = TRUE,
    resolution = "20m",
    year = 2020,
    class = "sf",
    progress_bar = FALSE
  )
}))

# State FIPS codes left off the map: Alaska (02), Hawaii (15), American Samoa
# (60), Guam (66), Northern Mariana Islands (69), Puerto Rico (72) and the
# U.S. Virgin Islands (78).
excluded_statefp <- c("02", "15", "60", "66", "69", "72", "78")

# Filter once, then join: this yields the same rows as joining first and
# filtering afterwards, without writing the filter twice.
us_counties_contiguous <- us_counties %>%
  filter(!(STATEFP %in% excluded_statefp))

# Attach rates by county and state name. Counties with no matching row in the
# CSV keep Rate = NA. Rate is already numeric, so no second conversion.
# NOTE(review): name-based matching is sensitive to spelling differences
# between the two sources; a FIPS-based key may be more robust - confirm
# against the CSV columns.
us_counties_data_contiguous <- left_join(
  us_counties_contiguous,
  data3,
  by = c("NAMELSAD" = "County", "STATE_NAME" = "State")
)

my_colors <- c("blue", "red", "orange", "yellow")

gg <- ggplot() +
  geom_sf_interactive(
    data = us_counties_data_contiguous,
    aes(
      fill = Rate,
      # The tooltip is HTML: county, state and rate on separate lines.
      tooltip = paste(NAMELSAD, "<br>", STATE_NAME, "<br>", "Rate:", Rate)
    ),
    color = "grey",
    # NOTE(review): ggplot2 >= 3.4.0 sets polygon outline width through
    # `linewidth`; confirm `size` still has the intended effect with the
    # installed version.
    size = 0.1
  ) +
  scale_fill_gradientn(colors = my_colors, na.value = "grey70", name = "Rate") +
  labs(
    title = "Primary and Secondary Syphilis Infection Rate by County 2021",
    caption = "Figure 3: Infection rate of primary and secondary Syphilis by county per 100k people"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.caption = element_text(
      hjust = .5,
      size = 8,
      margin = margin(t = 10, r = 10)
    )
  )

# Render the ggplot as an interactive widget.
girafe(code = print(gg))

Conclusion

Chlamydia has the highest rate of infection of the three STIs, though gonorrhea follows a similar pattern of infection rate by county. Syphilis, which has the lowest infection rate of the three, seems to have a few hot spots, but it’s hard to say whether there is a pattern. One interesting thing to note is Todd County, South Dakota: the reporting there seems suspicious, as the chlamydia and gonorrhea infection rates are identical and the syphilis rate is 10 times that of the next-highest county.