Data wrangling for variant analysis
Setup
Libraries and global variables
Set up some directories
data_raw <- here::here("_data_raw", "wgs")
shared <- here::here("_data_raw", "shared")
vcftar <- here::here(data_raw, "haplotypecaller", "tables.tar.gz")
# create temporary location to decompress
tmpdir <- fs::file_temp()
# make processed data directory if it doesn't exist
data <- here::here("data", "wgs")
fs::dir_create(data)
Contamination analysis
Mashscreen
Nothing really suspicious… HAMBI_0403 has some overlap with HAMBI_1279… We’ll look into that more below
mashscreenfiles <- fs::dir_ls(
path = here::here(data_raw, "contamination", "mashscreen"),
all = FALSE,
recurse = TRUE,
type = "file",
glob = "*.tsv",
regexp = NULL,
invert = FALSE,
fail = TRUE
)
mashscreenslurped <- readr::read_tsv(
mashscreenfiles,
col_names = c(
"identity",
"shared-hashes",
"median-multiplicity",
"p-value",
"query-ID",
"query-comment"
),
col_types = c("dcddcc"),
id = "sample"
)
mashscreenslurped_fmt <- mashscreenslurped %>%
mutate(strainID = str_extract(sample,
regex("(HAMBI[:digit:]{4})"))) %>%
mutate(strainID = str_replace(strainID, "HAMBI", "HAMBI_")) %>%
mutate(matching_strain = str_extract(`query-ID`,
regex("(HAMBI_[:digit:]{4})"))) %>%
dplyr::select(-sample, -`query-ID`, -`query-comment`,
shared_hashes=`shared-hashes`,
median_multiplicity=`median-multiplicity`,
p_value = `p-value`) %>%
relocate(strainID, matching_strain) %>%
filter(!shared_hashes %in% c("1/1000", "2/1000", "3/1000", "4/1000", "5/1000"))
write_tsv(mashscreenslurped_fmt, here::here(data, "mashscreen_contamination.tsv"))
coverage
cmfiles <- fs::dir_ls(
path = here::here(data_raw, "contamination", "competitive_mapping"),
all = FALSE,
recurse = TRUE,
type = "file",
glob = "*.txt",
regexp = NULL,
invert = FALSE,
fail = TRUE
)
cmslurped <- readr::read_tsv(
cmfiles,
comment = "#",
col_names = c(
"matching_strain",
"avg_fold_coverage",
"ref_length",
"ref_gc",
"percent_covered",
"bases_covered",
"plus_reads",
"minus_reads",
"read_gc",
"median_fold_coverage",
"std_dev_coverage"
),
col_types = c("cdddddddddd"),
id = "sample"
)
cmslurped_fmt <- cmslurped %>%
mutate(strainID = str_extract(sample,
regex("(HAMBI[:digit:]{4})"))) %>%
mutate(strainID = str_replace(strainID, "HAMBI", "HAMBI_")) %>%
mutate(matching_chrom = str_extract(matching_strain,
regex("(HAMBI_[:digit:]{4}_[:alnum:]+)")),
matching_strain = str_extract(matching_strain,
regex("(HAMBI_[:digit:]{4})"))) %>%
dplyr::select(-sample) %>%
relocate(strainID, matching_strain, matching_chrom, median_fold_coverage, avg_fold_coverage, std_dev_coverage) %>%
filter(median_fold_coverage > 0)
write_tsv(cmslurped_fmt, here::here(data, "competitive_mapping_contamination.tsv"))
cmslurped_fmt
So the competitive mapping analysis also shows that some small percentage of the reads from HAMBI_0403 are mapping to HAMBI_1279 (33,281 reads out of 3,242,608 total mapping reads)
Reading and small formatting of data
Untar and decompress
Variants
SnpEff annotations
snpefffiles <- fs::dir_ls(
path = vcfdir,
all = FALSE,
recurse = TRUE,
type = "file",
glob = "*.snpeff.tsv",
regexp = NULL,
invert = FALSE,
fail = TRUE
)
snpefffiles_notempty <- map_dfr(snpefffiles, peek_count_lines) %>%
pivot_longer(everything()) %>%
filter(value > 1) %>%
pull(name)
snpeffslurped <- readr::read_tsv(
snpefffiles_notempty,
skip = 1,
col_names = c(
"chrom",
"pos",
"ref",
"alt",
"filter",
# Annotated using Sequence Ontology terms. Multiple effects can be concatenated using '&'.
"effect",
# A simple estimation of putative impact / deleteriousness : HIGH, MODERATE, LOW, or MODIFIER
"impact",
# gene id from prokka
"locus_tag",
# whether variant is coding or noncoding
"biotype",
# Variant using HGVS notation (DNA level)
"hgvs_c",
# Variant using HGVS notation (Protein level).
"hgvs_p",
# Position in coding bases (one based includes START and STOP codons).
"cds_pos",
# Total number of coding bases (one based includes START and STOP codons).
"cds_len",
# Position in translated product (one based includes START and STOP codons).
"aa_pos",
# Total length of translated product (one based includes START and STOP codons).
"aa_len"
),
col_types = c("cdcccccccccdddd"),
id = "sample"
)
snpeffslurpedfmt <- snpeffslurped %>%
# this is weird, but joining on column like filter that contains characters like ';' doesnt work
dplyr::select(-filter) %>%
mutate(strainID = str_extract(sample,
regex("(HAMBI_[:digit:]{4})"))) %>%
dplyr::select(-sample) %>%
relocate(strainID) %>%
# this is because SNPeff does not give the correct closest locus_tag when
# the mutation is not in a CDS!!! this doesn't affect earlier work because I
# always focus on NS mutations, but here since a lot of these mutations are
# not in CDS we need to deal with this manually
mutate(locus_tag = if_else(is.na(hgvs_p), NA_character_, locus_tag))
Allele frequencies
varfiles <- fs::dir_ls(
path = vcfdir,
all = FALSE,
recurse = TRUE,
type = "file",
glob = "*.variants.tsv",
regexp = NULL,
invert = FALSE,
fail = TRUE
)
varfiles_notempty <- map_dfr(varfiles, peek_count_lines) %>%
pivot_longer(everything()) %>%
filter(value > 1) %>%
pull(name)
varslurped <- readr::read_tsv(
varfiles_notempty,
skip = 1,
col_names = c(
"chrom",
"pos",
"ref",
"alt",
"filter",
"type",
# Allelic depths for the ref and alt alleles in the order listed
"allele_depth",
# Allele fractions of alternate alleles. Excludes filtering
"depth_total",
# Count of fragments supporting each allele.
"variant_freq"
),
col_types = c("cdccccccddd"),
id = "sample"
)
varslurpedfmt <- varslurped %>%
mutate(strainID = str_extract(sample,
regex("(HAMBI_[:digit:]{4})"))) %>%
dplyr::select(-sample) %>%
relocate(strainID) %>%
# taken only first two most abundant alleles
separate(allele_depth, into = c("depth_ref", "depth_alt"),
sep=",", extra="drop") %>%
# format depths to numberic
mutate(depth_ref = as.numeric(depth_ref),
depth_alt = as.numeric(depth_alt))
Clean up tmp directories
Annotations
hambi_annotations <- readr::read_rds(here::here(shared, "combined_annotations_final.rds"))
mgefinder <- readr::read_tsv(here::here(shared, "MGE_finder_HAMBI_combined.tsv"))
Rows: 475 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): strainID, name, chrom, prediction_tool
dbl (2): start, end
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 99 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): strainID, chrom, name, prediction_tool
dbl (2): start, end
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Formatting and filtering
Combine snpeff annotations and var frequencies
Mark regions with mobile elements in metagenomes
Join with annotations
Separate CDS and non CDS mutations
Use GFF file and make lagged/lead start and end sites to identify upstream regions of plus/minus strand genes
Intergenic mutations upstream gene plus strand
plus_no_cds <- left_join(wgsvars_mb_nocds, hambi_annotations_lagged,
by = join_by(strainID, chrom, between(pos, us_plus_end, start))) %>%
filter(strand == "plus") %>%
dplyr::select(strainID:impact, hgvs_c, mge_name:mge_prediction_tool,
locus_tag = locus_tag.y,
pk_ft_type, start, end, strand,
pk_gene, pk_COG,
pk_product, pk_ec_num) %>%
mutate(pk_ft_type = "intergenic/possible promoter region")
Intergenic mutations upstream gene minus strand
minus_no_cds <- left_join(wgsvars_mb_nocds, hambi_annotations_lagged,
by = join_by(strainID, chrom, between(pos, end, us_minus_end))) %>%
filter(strand == "minus") %>%
dplyr::select(strainID:impact, hgvs_c, mge_name:mge_prediction_tool,
locus_tag = locus_tag.y,
pk_ft_type, start, end, strand,
pk_gene, pk_COG,
pk_product, pk_ec_num) %>%
mutate(pk_ft_type = "intergenic/possible promoter region")
Intergenic mutations not in proper orientation
full_no_cds <- bind_rows(minus_no_cds, plus_no_cds)
# these mutations are not in a proper promoter orientation of any gene. E.G.
# they are downstream of the 5' start site of all genes they are next to.
# 5' =======>-#--------- 5'
# 3' ---------#-<======= 3'
bad_orientation <- anti_join(dplyr::select(wgsvars_mb_nocds, strainID, chrom, pos),
dplyr::select(full_no_cds, strainID, chrom, pos)) %>%
left_join(wgsvars_mb_nocds) %>%
mutate(pk_ft_type = "intergenic/Not in proper promoter strand orientation")
Joining with `by = join_by(strainID, chrom, pos)`
Joining with `by = join_by(strainID, chrom, pos)`
Export
Interpretation/analysis
Inspectable tables
final %>%
filter(filter == "PASS") %>%
dplyr::select(strainID, pk_ft_type, locus_tag:strand, pk_gene, Preferred_name, pk_product, Description, chrom:hgvs_c, biotype:aa_len)
First off this was the first experiment where we sequenced evolved strains that had many high-confidence mutations in intergenic regions. We have included genes here where the mutations were upstream of the 5’ of the gene in a potential promoter region. Predicting promoters is not easy and the method here is admittedly crude and should be interpreted as providing some additional functional context and not predcition of a promoter/gene regulation. In the case of ampR and ampC in HAMBI_1279 and HAMBI_1977 the mutation was in an intergenic region where the genes are in minus, forward orientation, e.g.,
So that it could potentially be in promoter region for both genes. Either way the mutation is in close proximity to ampR (transcriptional regulator of beta lactamase) and ampC (beta lactamase) which is consistent with experimental adaptation to ampicillin.
Recently it has been shown that membrane transporters, especially those using proton motive force(Wan, Wai Chi Chan, and Chen 2023), are important for ampicillin tolerance under nutrient stress conditions(Wang et al. 2021) and over long term maintanence (Wan et al. 2021). We also found PMF-dependent membrane transporter genes (puuP, nhaA, actP), were mutated in our dataset. The full summary of mutations is below
Species | StrainID | Gene | Mutation type | Description |
---|---|---|---|---|
Pseudomonas_E putida | HAMBI_0006 | puuP | SNP, intergenic/possible promoter region | proton dependent putrescine transporter(Kurihara et al. 2009) |
Pseudomonas_E putida | HAMBI_0006 | ttgR | INDEL, CDS, non-synonymous | drug binding repressor protein for the ttgABC multidrug efflux pump system (Terań et al. 2003) |
Agrobacterium tumefaciens | HAMBI_0105 | ropA | SNP, CDS, synonymous | outer membrane porin-like protein(González-Sánchez et al. 2017) |
Agrobacterium tumefaciens | HAMBI_0105 | rbsA | SNP, CDS, non-synonymous | ATP binding protein for ribose import |
Agrobacterium tumefaciens | HAMBI_0105 | cysA | intergenic/possible promoter region | ATP binding protein for sulfate/thiosulfate import |
Hafnia alvei; Pseudomonas_E chlororaphis | HAMBI_1279; HAMBI_1977 | ampR | intergenic/possible promoter region; SNP, CDS, non-synonymous | HTH-type transcriptional activator. This protein is a positive regulator of gene expression of beta-lactamase (AmpC)(Lodge, Busby, and Piddock 1993) |
Hafnia alvei; Pseudomonas_E chlororaphis | HAMBI_1279, HAMBI_1977 | ampC | intergenic/possible promoter region | This protein is a serine beta-lactamase. It degrates beta-lactam antibiotics like ampicillin |
Kluyvera intermedia | HAMBI_1299 | mazG | intergenic/possible promoter region | Nucleoside triphosphate pyrophosphohydrolase/pyrophosphatase |
Sphingobacterium spiritivorum | HAMBI_1896 | rpsL | SNP, CDS, non-synonymous | Small ribosomal subunit protein S12 |
Sphingobacterium spiritivorum | HAMBI_1896 | nrdE/nrdA | SNP, CDS, non-synonymous | Ribonucleoside-diphosphate reductase subunit alpha |
Aeromonas caviae | HAMBI_1972 | exeA | INDEL, CDS, non-synonymous | extracellular protein secretion gene(Jahagirdar and Howard 1994) |
Chitinophaga sancti | HAMBI_1988 | nhaA | SNP, CDS, non-synonymous | Na(+)/H(+) antiporter NhaA(Wan et al. 2021) |
Chitinophaga sancti | HAMBI_1988 | lacZ-like | SNP, CDS, non-synonymous | Beta-galactosidase |
Chitinophaga sancti | HAMBI_1988 | rhlE | intergenic/possible promoter region | ATP-dependent RNA helicase RhlE |
Bordetella avium | HAMBI_2160 | cysS | SNP, CDS, non-synonymous | Cysteine - tRNA ligase |
Cupriavidus oxalaticus | HAMBI_2164 | ypdB/lytT | INDEL, CDS, non-synonymous | Member of the two-component regulatory system YpdA/YpdB, which is part of a nutrient-sensing regulatory network composed of YpdA/YpdB, the high-affinity pyruvate signaling system(Behr, Fried, and Jung 2014) |
Moraxella canis | HAMBI_2792 | actP | SNP, CDS, non-synonymous | ActP belongs to the sodium:solute symporter family. The driving force used by ActP is a transmembrane electrochemical potential(Gimenez et al. 2003) |
Moraxella canis | HAMBI_2792 | ftsI | INDEL, CDS, non-synonymous | Peptidoglycan D,D-transpeptidase FtsI (aka penicillin binding protein)(Spratt 1975) |
Microvirga lotononidis | HAMBI_3237 | flgJ | SNP, CDS, non-synonymous | Peptidoglycan hydrolase. Flagellum-specific muramidase which hydrolyzes the peptidoglycan layer to assemble the rod structure in the periplasmic space(Nambu et al. 1999) |
Some publications potentially worth checking out
Many of the mutations seem are in genes related to a more general stress response and in different core metabolic pathways. There is some precedent changes in core metabolism processes and different kinds of transporters conferring resistance or antibiotic tolerance. The references below are worth reading further.
- RpoS gene: RpoS and the bacterial general stress response
- nhaA gene: Modulating the evolutionary trajectory of tolerance using antibiotics with different metabolic dependencies
- Persister Heterogeneity Arising from a Single Metabolic Stress
- cysS gene?? Reducing the fitness cost of antibiotic resistance by amplification of initiator tRNA genes
- Ribosomal mutations promote the evolution of antibiotic resistance in a multidrug environment
- rpoS and ypdB genes: Interspecific bacterial sensing through airborne signals modulates locomotion and drug resistance