Skip to content

Sequence processing for Intermediate R workshop

Processing of 16S rRNA amplicon data from BioProject PRJNA608458 using QIIME2.

1. SRA download and qiime import

Get SRA accessions from NCBI based on BioProject accession. Use SRAToolkit to download sequences in batch. Filter to only download field samples.

code

mkdir -p seq_prep/1.fastq
cd seq_prep

module purge
module load sratoolkit/3.0.2

grep -v "[CNX]-" SraRunInfo.csv \
  | cut -d ',' -f 1 \
  | grep "SRR" \
  | xargs -n 1 -P 8 fastq-dump --split-files --gzip --skip-technical --outdir 1.fastq/

Create Manifest file for QIIME import.

code

grep -v "[CNX]-" SraRunInfo.csv \
  | cut -d ',' -f 1,30 \
  | sed '1d' \
  | awk 'BEGIN {FS=","; OFS="\t"} {print $2,"$PWD/1.fastq/"$1"_1.fastq.gz","$PWD/1.fastq/"$1"_2.fastq.gz"}' \
  | sed '1i sample-id\tforward-absolute-filepath\treverse-absolute-filepath' \
  > sra_manifest.tsv

Import as QIIME archive.

code

module purge
module load QIIME2/2022.2

qiime tools import \
  --type 'SampleData[PairedEndSequencesWithQuality]' \
  --input-path sra_manifest.tsv \
  --output-path 1.paired-end-demux.qza \
  --input-format PairedEndFastqManifestPhred33V2

2. Join paired-end reads

code

qiime vsearch join-pairs \
  --i-demultiplexed-seqs 1.paired-end-demux.qza \
  --o-joined-sequences 2.paired-end-demux-joined.qza

Check quality.

code

qiime demux summarize \
  --i-data 2.paired-end-demux-joined.qza \
  --o-visualization 2.paired-end-demux-joined.qzv

3. Quality trimming

code

qiime quality-filter q-score \
  --p-min-quality 30 \
  --i-demux 2.paired-end-demux-joined.qza \
  --o-filtered-sequences 3.paired-end-demux-joined-filtered.qza \
  --o-filter-stats 3.paired-end-demux-joined-filter-stats.qza

Check quality.

code

qiime demux summarize \
  --i-data 3.paired-end-demux-joined-filtered.qza \
  --o-visualization 3.paired-end-demux-joined-filtered.qzv

4. Denoise with Deblur

code

qiime deblur denoise-16S \
  --i-demultiplexed-seqs 3.paired-end-demux-joined-filtered.qza \
  --p-trim-length 258 \
  --p-sample-stats \
  --o-representative-sequences 4.rep-seqs.qza \
  --o-table 4.table.qza \
  --o-stats 4.deblur-stats.qza \
  --verbose \
  --p-jobs-to-start 12 \
  --output-dir 4.deblur_outputs

Visually inspect statistics.

code

qiime deblur visualize-stats \
  --i-deblur-stats 4.deblur-stats.qza \
  --o-visualization 4.deblur-stats.qzv

Summarize feature table stats.

code

qiime feature-table summarize \
  --i-table 4.table.qza \
  --o-visualization 4.table.qzv

5. Classify taxonomy

Download SILVA SSU NR99 archive from QIIME's data resources.

code

wget https://data.qiime2.org/2022.2/common/silva-138-99-seqs.qza
wget https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza

SLURM script for classifying taxonomy.

code

#!/bin/bash -e
#SBATCH --job-name=5.classify_taxonomy
#SBATCH --account=nesi02659
#SBATCH --time=8:00:00
#SBATCH --mem=128GB
#SBATCH --partition=milan
#SBATCH --cpus-per-task=24
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=jian.sheng.boey@auckland.ac.nz

# Module
module purge
module load QIIME2/2022.2

# In-silico PCR
qiime feature-classifier extract-reads \
  --i-sequences silva-138-99-seqs.qza \
  --p-f-primer GTGYCAGCMGCCGGGGTAA \
  --p-r-primer GGACTACNVGGGTWTCTAAT \
  --p-n-jobs $SLURM_CPUS_PER_TASK \
  --o-reads 5.ref-seqs.qza

# Train classifier
qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads 5.ref-seqs.qza \
  --i-reference-taxonomy silva-138-99-tax.qza \
  --o-classifier 5.classifier.qza

# Annotate representative sequences
qiime feature-classifier classify-sklearn \
  --i-classifier 5.classifier.qza \
  --i-reads 4.rep-seqs.qza \
  --o-classification 5.taxonomy.qza \
  --p-n-jobs $SLURM_CPUS_PER_TASK

# Tabulate taxonomy
qiime metadata tabulate \
  --m-input-file 5.taxonomy.qza \
  --o-visualization 5.taxonomy.qzv

6. Clean up tables

Remove taxon with Mitochondria or Chloroplast.

code

qiime taxa filter-table \
  --i-table 4.table.qza \
  --i-taxonomy 5.taxonomy.qza \
  --p-exclude mitochondria,chloroplast \
  --o-filtered-table 6.filtered-table.qza

Visualize.

code

qiime feature-table summarize \
  --i-table 6.filtered-table.qza \
  --o-visualization 6.filtered-table.qzv

6. Create phylogeny

Create alignments and phylogenetic tree.

code

qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences 4.rep-seqs.qza \
  --output-dir 7.phylogeny \
  --p-n-threads 12

Generate phylogeny-informed distance matrices.

code

qiime diversity-lib unweighted-unifrac \
  --i-phylogeny 7.phylogeny/rooted_tree.qza \
  --i-table 6.filtered-table.qza \
  --p-threads 8 \
  --o-distance-matrix 7.unweighted-unifrac.qza

code

qiime diversity-lib weighted-unifrac \
  --i-phylogeny 7.phylogeny/rooted_tree.qza \
  --i-table 6.filtered-table.qza \
  --p-threads 8 \
  --o-distance-matrix 7.weighted-unifrac.qza

7. Export tables

QIIME artifacts are Zip archives. Simply unzip.

code

# Everything was previously done in seq_prep/
cd ../

mkdir -p data/

for i in seq_prep/{5.taxonomy,6.filtered-table,7.unweighted-unifrac,7.weighted-unifrac}.qza; do
  name=$(basename $i .qza)
  qiime tools export \
    --input-path $i \
    --output-path data/$name
done

Convert BIOM feature table into TSV.

code

biom convert \
  -i data/6.filtered-table/feature-table.biom \
  -o data/6.filtered-table/feature-table.tsv \
  --to-tsv

Clean up headers.

code

# Feature table
sed '1d' data/6.filtered-table/feature-table.tsv \
  | sed 's/#OTU ID/ASVID/' \
  > filtered_feature_table.tsv

# Taxonomy table
sed 's/ /_/' data/5.taxonomy/taxonomy.tsv > taxonomy.tsv

# Distance matrices
for i in data/7.*; do
  name=$(basename $i .tsv | sed 's/-/_/g' | sed 's/7.//g')
  echo $name
  sed 's/^\t/sample\t/' $i/distance-matrix.tsv > $name.tsv
done