Data wrangling for variant analysis

Author

Shane Hogle

Published

May 24, 2024

Setup

Libraries and global variables

library(here)
library(tidyverse)
library(Polychrome)
library(withr)
library(fs)
library(archive)
library(slider)
library(fpeek)
source(here::here("R", "utils_generic.R"))

Set up some directories

data_raw <- here::here("_data_raw", "wgs")
shared <- here::here("_data_raw", "shared")
vcftar <- here::here(data_raw, "haplotypecaller", "tables.tar.gz")

# create temporary location to decompress
tmpdir <- fs::file_temp()

# make processed data directory if it doesn't exist
data <- here::here("data", "wgs")
fs::dir_create(data)

Contamination analysis

Mashscreen

Nothing really suspicious… HAMBI_0403 has some overlap with HAMBI_1279… We’ll look into that more below

mashscreenfiles <- fs::dir_ls(
  path =  here::here(data_raw, "contamination", "mashscreen"),
  all = FALSE,
  recurse = TRUE,
  type = "file",
  glob = "*.tsv",
  regexp = NULL,
  invert = FALSE,
  fail = TRUE
)

mashscreenslurped <- readr::read_tsv(
  mashscreenfiles,
  col_names = c(
    "identity",
    "shared-hashes",
    "median-multiplicity",
    "p-value",
    "query-ID",
    "query-comment"
  ),
  col_types = c("dcddcc"),
  id = "sample"
) 

mashscreenslurped_fmt <- mashscreenslurped %>% 
  mutate(strainID = str_extract(sample, 
                              regex("(HAMBI[:digit:]{4})"))) %>%
  mutate(strainID = str_replace(strainID, "HAMBI", "HAMBI_")) %>% 
  mutate(matching_strain = str_extract(`query-ID`, 
                              regex("(HAMBI_[:digit:]{4})"))) %>% 
  dplyr::select(-sample, -`query-ID`, -`query-comment`, 
                shared_hashes=`shared-hashes`, 
                median_multiplicity=`median-multiplicity`,
                p_value = `p-value`) %>% 
  relocate(strainID, matching_strain) %>% 
  filter(!shared_hashes %in% c("1/1000", "2/1000", "3/1000", "4/1000", "5/1000"))

write_tsv(mashscreenslurped_fmt, here::here(data, "mashscreen_contamination.tsv"))

coverage

cmfiles <- fs::dir_ls(
  path =  here::here(data_raw, "contamination", "competitive_mapping"),
  all = FALSE,
  recurse = TRUE,
  type = "file",
  glob = "*.txt",
  regexp = NULL,
  invert = FALSE,
  fail = TRUE
)
cmslurped <- readr::read_tsv(
  cmfiles,
  comment = "#",
  col_names = c(
    "matching_strain",
    "avg_fold_coverage",
    "ref_length",
    "ref_gc",
    "percent_covered",
    "bases_covered",
    "plus_reads",
    "minus_reads",
    "read_gc",
    "median_fold_coverage",
    "std_dev_coverage"
  ),
  col_types = c("cdddddddddd"),
  id = "sample"
) 

cmslurped_fmt <- cmslurped %>% 
   mutate(strainID = str_extract(sample, 
                              regex("(HAMBI[:digit:]{4})"))) %>%
  mutate(strainID = str_replace(strainID, "HAMBI", "HAMBI_")) %>% 
  mutate(matching_chrom = str_extract(matching_strain, 
                              regex("(HAMBI_[:digit:]{4}_[:alnum:]+)")),
         matching_strain = str_extract(matching_strain, 
                              regex("(HAMBI_[:digit:]{4})"))) %>% 
  dplyr::select(-sample) %>% 
  relocate(strainID, matching_strain, matching_chrom, median_fold_coverage, avg_fold_coverage, std_dev_coverage) %>% 
  filter(median_fold_coverage > 0)

write_tsv(cmslurped_fmt, here::here(data, "competitive_mapping_contamination.tsv"))

cmslurped_fmt

So the competitive mapping analysis also shows that some small percentage of the reads from HAMBI_0403 are mapping to HAMBI_1279 (33,281 reads out of 3,242,608 total mapping reads)

cmslurped_fmt %>% 
  filter(strainID != matching_strain)

Reading and small formatting of data

Untar and decompress

# untar directory containing variant tables 
archive::archive_extract(
  vcftar,
  dir = tmpdir,
  files = NULL,
  options = character(),
  strip_components = 0L
)

vcfdir <- here::here(tmpdir, "tables")

Variants

SnpEff annotations

snpefffiles <- fs::dir_ls(
  path = vcfdir,
  all = FALSE,
  recurse = TRUE,
  type = "file",
  glob = "*.snpeff.tsv",
  regexp = NULL,
  invert = FALSE,
  fail = TRUE
)

snpefffiles_notempty <- map_dfr(snpefffiles, peek_count_lines) %>% 
  pivot_longer(everything()) %>%
  filter(value > 1) %>% 
  pull(name)

snpeffslurped <- readr::read_tsv(
  snpefffiles_notempty,
  skip = 1,
  col_names = c(
    "chrom",
    "pos",
    "ref",
    "alt",
    "filter",
    # Annotated using Sequence Ontology terms. Multiple effects can be concatenated using '&'.
    "effect",
    # A simple estimation of putative impact / deleteriousness : HIGH, MODERATE, LOW, or MODIFIER
    "impact",
    # gene id from prokka
    "locus_tag",
    # whether variant is coding or noncoding
    "biotype",
    # Variant using HGVS notation (DNA level)
    "hgvs_c",
    # Variant using HGVS notation (Protein level).
    "hgvs_p",
    # Position in coding bases (one based includes START and STOP codons).
    "cds_pos",
    # Total number of coding bases (one based includes START and STOP codons).
    "cds_len",
    # Position in translated product (one based includes START and STOP codons).
    "aa_pos",
    # Total length of translated product (one based includes START and STOP codons).
    "aa_len"
  ),
  col_types = c("cdcccccccccdddd"),
  id = "sample"
)

snpeffslurpedfmt <- snpeffslurped %>% 
  # this is weird, but joining on column like filter that contains characters like ';' doesnt work
  dplyr::select(-filter) %>% 
  mutate(strainID = str_extract(sample, 
                              regex("(HAMBI_[:digit:]{4})"))) %>% 
  dplyr::select(-sample) %>% 
  relocate(strainID) %>% 
  # this is because SNPeff does not give the correct closest locus_tag when 
  # the mutation is not in a CDS!!! this doesn't affect earlier work because I 
  # always focus on NS mutations, but here since a lot of these mutations are 
  # not in CDS we need to deal with this manually
  mutate(locus_tag = if_else(is.na(hgvs_p), NA_character_, locus_tag))

Allele frequencies

varfiles <- fs::dir_ls(
  path = vcfdir,
  all = FALSE,
  recurse = TRUE,
  type = "file",
  glob = "*.variants.tsv",
  regexp = NULL,
  invert = FALSE,
  fail = TRUE
)

varfiles_notempty <- map_dfr(varfiles, peek_count_lines) %>% 
  pivot_longer(everything()) %>%
  filter(value > 1) %>% 
  pull(name)

varslurped <- readr::read_tsv(
  varfiles_notempty,
  skip = 1,
  col_names = c(
    "chrom",
    "pos",
    "ref",
    "alt",
    "filter",
    "type",
    # Allelic depths for the ref and alt alleles in the order listed
    "allele_depth",
    # Allele fractions of alternate alleles. Excludes filtering
    "depth_total",
    # Count of fragments supporting each allele.
    "variant_freq"
  ),
  col_types = c("cdccccccddd"),
  id = "sample"
)

varslurpedfmt <- varslurped %>% 
  mutate(strainID = str_extract(sample, 
                              regex("(HAMBI_[:digit:]{4})"))) %>% 
  dplyr::select(-sample) %>% 
  relocate(strainID) %>% 
  # taken only first two most abundant alleles
  separate(allele_depth, into =  c("depth_ref", "depth_alt"), 
           sep=",", extra="drop") %>% 
  # format depths to numberic
  mutate(depth_ref = as.numeric(depth_ref), 
         depth_alt = as.numeric(depth_alt))

Clean up tmp directories

# remove decompressed vcf and tables directory from temp location
fs::dir_delete(vcfdir)

Annotations

hambi_annotations <- readr::read_rds(here::here(shared, "combined_annotations_final.rds"))
mgefinder <- readr::read_tsv(here::here(shared, "MGE_finder_HAMBI_combined.tsv"))

Rows: 475 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): strainID, name, chrom, prediction_tool
dbl (2): start, end

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

genomad <- readr::read_tsv(here::here(shared, "genomad_HAMBI_combined.tsv"))

Rows: 99 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): strainID, chrom, name, prediction_tool
dbl (2): start, end

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Formatting and filtering

Combine snpeff annotations and var frequencies

wgsvars <- left_join(varslurpedfmt, snpeffslurpedfmt,
          by = join_by(strainID, chrom, pos, ref, alt))

Mark regions with mobile elements in metagenomes

wgsvars_mb <- wgsvars %>% 
  left_join(bind_rows(mgefinder, genomad), 
            by = join_by(strainID, chrom, between(pos, start, end))) %>% 
  rename(mge_name = name, mge_start = start, mge_end = end, mge_prediction_tool = prediction_tool)

Join with annotations

Separate CDS and non CDS mutations

wgsvars_mb_nocds <- filter(wgsvars_mb, is.na(locus_tag))
wgsvars_mb_ann_cds <- left_join(filter(wgsvars_mb, !is.na(locus_tag)), hambi_annotations, 
                                by = join_by(strainID, chrom, locus_tag))

Use GFF file and make lagged/lead start and end sites to identify upstream regions of plus/minus strand genes

hambi_annotations_lagged <- hambi_annotations %>% 
  group_by(chrom) %>% 
  mutate(us_plus_end = lag(end),
         us_minus_end = lead(start)) %>% 
  ungroup()

Intergenic mutations upstream gene plus strand

plus_no_cds <- left_join(wgsvars_mb_nocds, hambi_annotations_lagged, 
            by = join_by(strainID, chrom, between(pos, us_plus_end, start)))  %>% 
  filter(strand == "plus") %>% 
  dplyr::select(strainID:impact, hgvs_c, mge_name:mge_prediction_tool,
                locus_tag = locus_tag.y, 
                pk_ft_type, start, end, strand,
                pk_gene, pk_COG,  
                pk_product, pk_ec_num) %>% 
  mutate(pk_ft_type = "intergenic/possible promoter region")

Intergenic mutations upstream gene minus strand

minus_no_cds <- left_join(wgsvars_mb_nocds, hambi_annotations_lagged, 
            by = join_by(strainID, chrom, between(pos, end, us_minus_end))) %>% 
  filter(strand == "minus") %>% 
  dplyr::select(strainID:impact, hgvs_c, mge_name:mge_prediction_tool,
                locus_tag = locus_tag.y, 
                pk_ft_type, start, end, strand,
                pk_gene, pk_COG,  
                pk_product, pk_ec_num) %>%
  mutate(pk_ft_type = "intergenic/possible promoter region")

Intergenic mutations not in proper orientation

full_no_cds <- bind_rows(minus_no_cds, plus_no_cds)

# these mutations are not in a proper promoter orientation of any gene. E.G.
# they are downstream of the 5' start site of all genes they are next to. 

# 5' =======>-#--------- 5'
# 3' ---------#-<======= 3'

bad_orientation <- anti_join(dplyr::select(wgsvars_mb_nocds, strainID, chrom, pos), 
          dplyr::select(full_no_cds, strainID, chrom, pos)) %>% 
  left_join(wgsvars_mb_nocds) %>% 
  mutate(pk_ft_type = "intergenic/Not in proper promoter strand orientation")

Joining with `by = join_by(strainID, chrom, pos)`
Joining with `by = join_by(strainID, chrom, pos)`

Export

final <- bind_rows(minus_no_cds, plus_no_cds, bad_orientation) %>% 
  left_join(hambi_annotations) %>% 
  bind_rows(wgsvars_mb_ann_cds) %>% 
  arrange(chrom, pos)

Joining with `by = join_by(strainID, chrom, locus_tag, pk_ft_type, start, end,
strand, pk_gene, pk_COG, pk_product, pk_ec_num)`

readr::write_tsv(final, here::here(data, "wgs_variant_freqs_annotations.tsv"))

Interpretation/analysis

Inspectable tables

final %>% 
  filter(filter == "PASS") %>% 
  count(strainID)

final %>% 
  filter(filter == "PASS") %>% 
  dplyr::select(strainID, pk_ft_type, locus_tag:strand, pk_gene, Preferred_name, pk_product, Description, chrom:hgvs_c, biotype:aa_len)

First off this was the first experiment where we sequenced evolved strains that had many high-confidence mutations in intergenic regions. We have included genes here where the mutations were upstream of the 5’ of the gene in a potential promoter region. Predicting promoters is not easy and the method here is admittedly crude and should be interpreted as providing some additional functional context and not predcition of a promoter/gene regulation. In the case of ampR and ampC in HAMBI_1279 and HAMBI_1977 the mutation was in an intergenic region where the genes are in minus, forward orientation, e.g.,

5' <=======-#--------- 5'
3' ---------#-=======> 3'

So that it could potentially be in promoter region for both genes. Either way the mutation is in close proximity to ampR (transcriptional regulator of beta lactamase) and ampC (beta lactamase) which is consistent with experimental adaptation to ampicillin.

Recently it has been shown that membrane transporters, especially those using proton motive force(Wan, Wai Chi Chan, and Chen 2023), are important for ampicillin tolerance under nutrient stress conditions(Wang et al. 2021) and over long term maintanence (Wan et al. 2021). We also found PMF-dependent membrane transporter genes (puuP, nhaA, actP), were mutated in our dataset. The full summary of mutations is below

Mutations in ampicillin evolved strains and their functional potential
Species	StrainID	Gene	Mutation type	Description
Pseudomonas_E putida	HAMBI_0006	puuP	SNP, intergenic/possible promoter region	proton dependent putrescine transporter(Kurihara et al. 2009)
Pseudomonas_E putida	HAMBI_0006	ttgR	INDEL, CDS, non-synonymous	drug binding repressor protein for the ttgABC multidrug efflux pump system (Terań et al. 2003)
Agrobacterium tumefaciens	HAMBI_0105	ropA	SNP, CDS, synonymous	outer membrane porin-like protein(González-Sánchez et al. 2017)
Agrobacterium tumefaciens	HAMBI_0105	rbsA	SNP, CDS, non-synonymous	ATP binding protein for ribose import
Agrobacterium tumefaciens	HAMBI_0105	cysA	intergenic/possible promoter region	ATP binding protein for sulfate/thiosulfate import
Hafnia alvei; Pseudomonas_E chlororaphis	HAMBI_1279; HAMBI_1977	ampR	intergenic/possible promoter region; SNP, CDS, non-synonymous	HTH-type transcriptional activator. This protein is a positive regulator of gene expression of beta-lactamase (AmpC)(Lodge, Busby, and Piddock 1993)
Hafnia alvei; Pseudomonas_E chlororaphis	HAMBI_1279, HAMBI_1977	ampC	intergenic/possible promoter region	This protein is a serine beta-lactamase. It degrates beta-lactam antibiotics like ampicillin
Kluyvera intermedia	HAMBI_1299	mazG	intergenic/possible promoter region	Nucleoside triphosphate pyrophosphohydrolase/pyrophosphatase
Sphingobacterium spiritivorum	HAMBI_1896	rpsL	SNP, CDS, non-synonymous	Small ribosomal subunit protein S12
Sphingobacterium spiritivorum	HAMBI_1896	nrdE/nrdA	SNP, CDS, non-synonymous	Ribonucleoside-diphosphate reductase subunit alpha
Aeromonas caviae	HAMBI_1972	exeA	INDEL, CDS, non-synonymous	extracellular protein secretion gene(Jahagirdar and Howard 1994)
Chitinophaga sancti	HAMBI_1988	nhaA	SNP, CDS, non-synonymous	Na(+)/H(+) antiporter NhaA(Wan et al. 2021)
Chitinophaga sancti	HAMBI_1988	lacZ-like	SNP, CDS, non-synonymous	Beta-galactosidase
Chitinophaga sancti	HAMBI_1988	rhlE	intergenic/possible promoter region	ATP-dependent RNA helicase RhlE
Bordetella avium	HAMBI_2160	cysS	SNP, CDS, non-synonymous	Cysteine - tRNA ligase
Cupriavidus oxalaticus	HAMBI_2164	ypdB/lytT	INDEL, CDS, non-synonymous	Member of the two-component regulatory system YpdA/YpdB, which is part of a nutrient-sensing regulatory network composed of YpdA/YpdB, the high-affinity pyruvate signaling system(Behr, Fried, and Jung 2014)
Moraxella canis	HAMBI_2792	actP	SNP, CDS, non-synonymous	ActP belongs to the sodium:solute symporter family. The driving force used by ActP is a transmembrane electrochemical potential(Gimenez et al. 2003)
Moraxella canis	HAMBI_2792	ftsI	INDEL, CDS, non-synonymous	Peptidoglycan D,D-transpeptidase FtsI (aka penicillin binding protein)(Spratt 1975)
Microvirga lotononidis	HAMBI_3237	flgJ	SNP, CDS, non-synonymous	Peptidoglycan hydrolase. Flagellum-specific muramidase which hydrolyzes the peptidoglycan layer to assemble the rod structure in the periplasmic space(Nambu et al. 1999)

Some publications potentially worth checking out

Many of the mutations seem are in genes related to a more general stress response and in different core metabolic pathways. There is some precedent changes in core metabolism processes and different kinds of transporters conferring resistance or antibiotic tolerance. The references below are worth reading further.

References

Behr, S., L. Fried, and K. Jung. 2014. “Identification of a Novel Nutrient-Sensing Histidine Kinase/Response Regulator Network in Escherichia Coli.” Journal of Bacteriology 196 (11): 2023–29. https://doi.org/10.1128/jb.01554-14.

Gimenez, Rosa, Mariá Felisa Nunẽz, Josefa Badia, Juan Aguilar, and Laura Baldoma. 2003. “The Gene yjcG , Cotranscribed with the Gene Acs , Encodes an Acetate Permease in Escherichia Coli.” Journal of Bacteriology 185 (21): 6448–55. https://doi.org/10.1128/jb.185.21.6448-6455.2003.

González-Sánchez, Antonio, Ciro A. Cubillas, Fabiola Miranda, Araceli Dávalos, and Alejandro García-de los Santos. 2017. “The ropAe Gene Encodes a Porin-Like Protein Involved in Copper Transit in Rhizobium Etli CFN42.” MicrobiologyOpen 7 (3). https://doi.org/10.1002/mbo3.573.

Jahagirdar, R, and S P Howard. 1994. “Isolation and Characterization of a Second Exe Operon Required for Extracellular Protein Secretion in Aeromonas Hydrophila.” Journal of Bacteriology 176 (22): 6819–26. https://doi.org/10.1128/jb.176.22.6819-6826.1994.

Kurihara, Shin, Yuichi Tsuboi, Shinpei Oda, Hyeon Guk Kim, Hidehiko Kumagai, and Hideyuki Suzuki. 2009. “The Putrescine Importer PuuP of Escherichia Coli K-12.” Journal of Bacteriology 191 (8): 2776–82. https://doi.org/10.1128/jb.01314-08.

Lodge, Julia, Stephen Busby, and Laura Piddock. 1993. “Investigation of thePseudomonas Aeruginosa ampRgene and Its Role at the ChromosomalampCÎ²-Lactamase Promoter.” FEMS Microbiology Letters 111 (2-3): 315–19. https://doi.org/10.1111/j.1574-6968.1993.tb06404.x.

Nambu, Takayuki, Tohru Minamino, Robert M. Macnab, and Kazuhiro Kutsukake. 1999. “Peptidoglycan-Hydrolyzing Activity of the FlgJ Protein, Essential for Flagellar Rod Formation in Salmonella Typhimurium.” Journal of Bacteriology 181 (5): 1555–61. https://doi.org/10.1128/jb.181.5.1555-1561.1999.

Spratt, B G. 1975. “Distinct Penicillin Binding Proteins Involved in the Division, Elongation, and Shape of Escherichia Coli K12.” Proceedings of the National Academy of Sciences 72 (8): 2999–3003. https://doi.org/10.1073/pnas.72.8.2999.

Terań, Wilson, Antonia Felipe, Ana Segura, Antonia Rojas, Juan-Luis Ramos, and Mariá-Trinidad Gallegos. 2003. “Antibiotic-Dependent Induction of Pseudomonas Putida DOT-T1E TtgABC Efflux Pump Is Mediated by the Drug Binding Repressor TtgR.” Antimicrobial Agents and Chemotherapy 47 (10): 3067–72. https://doi.org/10.1128/aac.47.10.3067-3072.2003.

Wan, Yingkun, Edward Wai Chi Chan, and Sheng Chen. 2023. “Maintenance and Generation of Proton Motive Force Are Both Essential for Expression of Phenotypic Antibiotic Tolerance in Bacteria.” Edited by Brian Conlon. Microbiology Spectrum 11 (5). https://doi.org/10.1128/spectrum.00832-23.

Wan, Yingkun, Miaomiao Wang, Edward Wai Chi Chan, and Sheng Chen. 2021. “Membrane Transporters of the Major Facilitator Superfamily Are Essential for Long-Term Maintenance of Phenotypic Tolerance to Multiple Antibiotics in E. Coli.” Edited by Cezar M. Khursigara. Microbiology Spectrum 9 (3). https://doi.org/10.1128/spectrum.01846-21.

Wang, Miaomiao, Edward Wai Chi Chan, Yingkun Wan, Marcus Ho-yin Wong, and Sheng Chen. 2021. “Active Maintenance of Proton Motive Force Mediates Starvation-Induced Bacterial Antibiotic Tolerance in Escherichia Coli.” Communications Biology 4 (1). https://doi.org/10.1038/s42003-021-02612-1.