2023-11-14: DNA to Protein in Rust

A Rust implementation of the DNA to Protein conversion program given in Python 3 at: https://www.geeksforgeeks.org/dna-protein-python-3/

This program is a demonstration of file handling and the use of a HashMap. There are two functions: read_seq, which reads text from a given filename and returns it as a string with newline and carriage-return characters removed; and translate, which uses a HashMap to convert character triples into single characters.

For read_seq, we use read_to_string to read the contents of the named file into a string, before removing the newline and carriage-return characters.

For translate, we build a HashMap from an array of tuples, with collect doing the necessary work to build a HashMap.

Final Program

Download code: dna-protein-source.zip

// This is a copy of the DNA translation program from 
// https://www.geeksforgeeks.org/dna-protein-python-3/

// Except that dna_sequence file is edited to only use the required text
// instead of including a range limit in code.

use std::collections::HashMap;
use std::{env, fs};

// Entry point. Expects the name of a DNA sequence file as the first
// command-line argument; an optional second argument names a file holding
// the protein sequence the translation is compared against.
fn main() {
    // The first item yielded by env::args() is the program name, so skip it.
    let mut cli_args = env::args().skip(1);

    let filename = match cli_args.next() {
        Some(name) => name,
        None => {
            println!("Provide filename of DNA sequence to translate");
            return;
        }
    };

    let dna_seq = read_seq(&filename);
    let protein_seq = translate(&dna_seq);

    match cli_args.next() {
        // A second filename was given: report whether the translation
        // matches the sequence stored in that file.
        Some(protein_filename) => {
            let target_seq = read_seq(&protein_filename);
            let matches_target = protein_seq == target_seq;
            println!("Translated sequence compared to target: {}", matches_target);
        }
        // Otherwise just print the translation.
        None => println!("Translated sequence: {}", protein_seq),
    }
}

// Loads the whole file named by `filename` and returns its text with every
// line-feed ('\n') and carriage-return ('\r') character stripped out.
// Panics if the file cannot be read.
fn read_seq(filename: &str) -> String {
    let raw = fs::read_to_string(filename)
        .expect("Something went wrong reading the file");

    // Keep every character except the two line-ending characters.
    raw.chars()
        .filter(|ch| *ch != '\n' && *ch != '\r')
        .collect()
}

// Translates a given DNA sequence into a string of single-letter codes,
// one letter per three-character codon.
//
// The sequence is read left to right in consecutive, non-overlapping groups
// of three bytes. Each group is looked up in the codon table below:
//   - a group found in the table contributes its letter to the result
//     (TAA, TAG and TGA contribute '_');
//   - a group not found in the table (lowercase, unknown letters, ...) is
//     skipped without error;
//   - one or two leftover bytes at the end of the sequence are ignored, so
//     the sequence does not have to be an exact multiple of three long.
//
// The sequence is walked as bytes rather than sliced as a str, so input
// containing multi-byte (non-ASCII) characters cannot cause a panic: any
// group that is not valid UTF-8 on its own is simply skipped.
fn translate(seq: &str) -> String {
    let table: HashMap<&str, char> = [
        ("ATA", 'I'), ("ATC", 'I'), ("ATT", 'I'), ("ATG", 'M'),
        ("ACA", 'T'), ("ACC", 'T'), ("ACG", 'T'), ("ACT", 'T'),
        ("AAC", 'N'), ("AAT", 'N'), ("AAA", 'K'), ("AAG", 'K'),
        ("AGC", 'S'), ("AGT", 'S'), ("AGA", 'R'), ("AGG", 'R'),
        ("CTA", 'L'), ("CTC", 'L'), ("CTG", 'L'), ("CTT", 'L'),
        ("CCA", 'P'), ("CCC", 'P'), ("CCG", 'P'), ("CCT", 'P'),
        ("CAC", 'H'), ("CAT", 'H'), ("CAA", 'Q'), ("CAG", 'Q'),
        ("CGA", 'R'), ("CGC", 'R'), ("CGG", 'R'), ("CGT", 'R'),
        ("GTA", 'V'), ("GTC", 'V'), ("GTG", 'V'), ("GTT", 'V'),
        ("GCA", 'A'), ("GCC", 'A'), ("GCG", 'A'), ("GCT", 'A'),
        ("GAC", 'D'), ("GAT", 'D'), ("GAA", 'E'), ("GAG", 'E'),
        ("GGA", 'G'), ("GGC", 'G'), ("GGG", 'G'), ("GGT", 'G'),
        ("TCA", 'S'), ("TCC", 'S'), ("TCG", 'S'), ("TCT", 'S'),
        ("TTC", 'F'), ("TTT", 'F'), ("TTA", 'L'), ("TTG", 'L'),
        ("TAC", 'Y'), ("TAT", 'Y'), ("TAA", '_'), ("TAG", '_'),
        ("TGC", 'C'), ("TGT", 'C'), ("TGA", '_'), ("TGG", 'W'),
    ].iter().cloned().collect();

    seq.as_bytes()
        // chunks_exact drops a trailing partial group, which is what lets
        // non-triplet sequence lengths be tolerated.
        .chunks_exact(3)
        // A group that splits a multi-byte character is not valid UTF-8 and
        // cannot be a codon; skip it instead of panicking.
        .filter_map(|chunk| std::str::from_utf8(chunk).ok())
        // Groups missing from the table are skipped.
        .filter_map(|codon| table.get(codon).copied())
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    // Runs translate over (input, expected) pairs: empty input, input
    // shorter than one codon, whole codons, and a trailing partial codon.
    #[test]
    fn test_translate() {
        let cases = [
            ("", ""),
            ("TA", ""),
            ("ATA", "I"),
            ("ATATTG", "IL"),
            ("ATATTGT", "IL"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(expected, translate(input));
        }
    }
}

Page from Peter's Scrapbook, output from a VimWiki on 2024-04-02.