091-data-generation-code.rmd

---
title: "091-data-generation-code"
author: "mcc"
date: "4/17/2020"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Calculate the amino acid compositions (AAC) and Di-peptide compositions (DPC)

 from .fasta formats, {Myoglobin, Non-Myoglobin}

Calculating the Amino Acid and Di-peptide composition of a protein string is a 
simple calculation requiring the total amino acid length of the peptide or 
poly-peptide of interest and a count of substrings. Initially, the command 
`seqinr::read.fasta` reads .fasta file formats and returns a list of proteins 
stripping away all other information. Secondly, the command 
`stringr::str_count()` produces an integer value of the number of substrings 
in a larger string, i.e. `peptide`. 

For example, `aa_nums[j] = str_count(peptide, col_titles[j]) / total_aa`, 

Where `aa_nums[j]` is an element of the array that stores the values for later 
writing to file, `peptide` is the string to check, i.e. the protein of interest, 
and `col_titles[j]` is the substring, which is either a single amino acid or a 
di-peptide.

Input: .fasta
Output: .csv

Libraries 

```
# Packages this document depends on.
Libraries <- c("stringr", "knitr", "seqinr")

# Attach each package. library() does not install anything; it stops with an
# error if a package is missing.
for (p in Libraries) {
    library(p, character.only = TRUE)
}

# Default options for every chunk. The knitr option that positions figures is
# `fig.align`; a bare `align` is not a knitr option and would be ignored.
opts_chunk$set(cache = TRUE,
               warning = FALSE,
               message = FALSE,
               fig.align = "center")
```

Import uniprot-myoglobin.fasta - Read peptide lines

```
# Read a protein .fasta file.
#
# Args:
#   file: Path to the .fasta file.
#
# Returns:
#   The value of read.fasta() called with seqtype = "AA", as.string = TRUE,
#   seqonly = FALSE and strip.desc = TRUE.
read_fasta <- function(file) {
    read.fasta(
        file       = file,
        seqtype    = "AA",
        as.string  = TRUE,
        seqonly    = FALSE,
        strip.desc = TRUE
    )
}

# Path of the myoglobin sequences, then load them.
file <- "./00-data/ORIGINAL_DATA/uniprot-myoglobin.fasta"
myoglobins <- read_fasta(file = file)
```

Column_titles
```
# Build the 423 column titles of the AAC/DPC table.
#
# Returns:
#   Character vector: "Class", "TotalAA", "PID", the 20 single-letter amino
#   acid codes, then the 400 di-peptide codes ("AA", "AC", ..., "YY").
column_titles <- function() {
    amino_acids <- c("A", "C", "D", "E", "F",
                     "G", "H", "I", "K", "L",
                     "M", "N", "P", "Q", "R",
                     "S", "T", "V", "W", "Y")
    n_aa <- length(amino_acids)

    # Every ordered pair; the first residue varies slowest, so the order is
    # AA, AC, ..., AY, CA, CC, ...
    first_residue  <- rep(amino_acids, each = n_aa)
    second_residue <- rep(amino_acids, times = n_aa)
    dipeptides <- paste0(first_residue, second_residue)

    c("Class", "TotalAA", "PID", amino_acids, dipeptides)
}

# Build the header once and display it.
col_titles <- column_titles()
print(col_titles)
```

Write empty .csv

```
# Create a .csv file that contains only the header row.
#
# Args:
#   protein_class: Prefix of the output file name; the file is written as
#                  "<protein_class>_aac_dpc.csv".
#
# Returns:
#   The name of the file that was written.
write_empty_csv <- function(protein_class = "C") {
    out_file <- paste0(protein_class, "_aac_dpc.csv")

    # A one-row matrix so the titles are written as a single line.
    header_row <- matrix(column_titles(), nrow = 1)

    write.table(
        header_row,
        out_file,
        sep = ",",
        col.names = FALSE,
        row.names = FALSE,
        eol = "\n"
    )

    out_file
}

# Create the header-only output file (default class prefix "C") and keep its name.
file_name <- write_empty_csv(protein_class = "C")
```

## Calculate AAC and DPC values function

```
# Compute one row of AAC/DPC values for a protein and append it to a .csv file.
#
# Args:
#   peptide:       The amino acid sequence as a single string.
#   protein_class: Class label; "C" is written as 0 in the Class column, any
#                  other value as 1. Also used as the prefix of the PID.
#   i:             Index of the protein; pasted after protein_class to form
#                  the PID (e.g. "M12").
#   file_name:     Path of the .csv file the row is appended to.
#
# Side effects:
#   Appends one comma-separated line of 423 values to file_name.
calc_aac_dpc <- function(peptide, protein_class = "C", i, file_name) {
    # Take the titles from column_titles() instead of a global variable so the
    # function does not depend on state created in another chunk.
    titles <- column_titles()
    motifs <- titles[4:length(titles)]

    # First column is class
    class_flag <- if (protein_class == "C") 0 else 1
    # Second column is total number of amino acids
    total_aa <- nchar(peptide)
    # Third column is 'Protein ID', PID
    pid <- paste0(protein_class, i)

    # Columns 4:423 - AAC/DPC.
    # A zero-width lookahead makes str_count() count overlapping occurrences;
    # a plain pattern counts non-overlapping matches only, so "AAA" would
    # yield one "AA" instead of two. Single-residue counts are unaffected.
    # NOTE(review): di-peptide frequencies are often normalised by
    # total_aa - 1; total_aa is kept to match the formula documented above.
    counts <- str_count(peptide, paste0("(?=", motifs, ")"))
    frequencies <- counts / total_aa

    # The PID is text, so the whole row is written as character values.
    row_values <- c(class_flag, total_aa, pid, frequencies)
    write(row_values,
          file = file_name,
          append = TRUE,
          ncolumns = length(row_values),
          sep = ",")
}
```

## Run Myoglobin

```
# RUN Myoglobin: append one AAC/DPC row per myoglobin sequence.
# Iterate over the sequences that were actually read instead of a fixed count,
# so a .fasta file of a different size neither fails nor is silently truncated.
for (i in seq_along(myoglobins)) {
    peptide <- myoglobins[[i]][1]
    calc_aac_dpc(peptide, protein_class = "M", i, file_name)
}
```

## Run Control / Human-NOT-myoglobin 

- Import data - Read peptide lines

```
# Read a protein .fasta file. Same definition as the read_fasta declared in
# the myoglobin import chunk.
#
# Args:
#   file: Path to the .fasta file.
#
# Returns:
#   The value of read.fasta() called with seqtype = "AA", as.string = TRUE,
#   seqonly = FALSE and strip.desc = TRUE.
read_fasta <- function(file) {
    read.fasta(
        file       = file,
        seqtype    = "AA",
        as.string  = TRUE,
        seqonly    = FALSE,
        strip.desc = TRUE
    )
}

# Path of the control (human, not hemoglobin, not myoglobin) sequences, then load them.
file <- "./00-data/ORIGINAL_DATA/uniprot-human+NOT+hemoglobin+NOT+myoglobin+random.fasta"
controls <- read_fasta(file = file)
```

## Run Controls

```
# Append one AAC/DPC row per control sequence.
# Iterate over the sequences that were actually read instead of a fixed count,
# so a .fasta file of a different size neither fails nor is silently truncated.
for (i in seq_along(controls)) {
    peptide <- controls[[i]][1]
    calc_aac_dpc(peptide, protein_class = "C", i, file_name)
}
```

## KEEP AAC ONLY FOR RAW DATA

```
# NOTE(review): this directory ("aac_dpc_values") differs from the
# "02-aac_dpc_values" directory used by the other chunks - confirm the path.
file <- "./00-data/aac_dpc_values/C+M_aac_dpc.csv"

# The name contains "+", so it must be back-quoted; unquoted, R parses
# `C + M_aac_dpc <- ...` and fails with an invalid assignment target.
`C+M_aac_dpc` <- read.csv(file,
                          stringsAsFactors = FALSE)
# View(`C+M_aac_dpc`)

# Select 1st thru 23rd variables
# (Class, TotalAA, PID and the 20 single amino acid columns)
c_m_RAW_AAC <- `C+M_aac_dpc`[1:23]
```

- To A Comma Delimited Text File

```
# Write the raw AAC table as a comma-delimited file.
# No setwd() here: the output path is already relative to the project
# directory, and changing into the target directory first would make it
# resolve to a nested "00-data/02-aac_dpc_values" folder inside it.
write.table(c_m_RAW_AAC,
            file = "./00-data/02-aac_dpc_values/c_m_RAW_AAC.csv",
            sep = ",",
            row.names = FALSE)
```

## Transform {C, F, I} from c_m_RAW_AAC

```
library(readr)

# Read back the raw AAC table from the same project-relative path the
# preceding chunk writes it to ("./00-data/..."), matching the other chunks.
file <- "./00-data/02-aac_dpc_values/c_m_RAW_AAC.csv"

# Class is read as a factor with levels "0" and "1".
c_m_RAW_AAC <- read_csv(file,
                        col_types = cols(Class = col_factor(levels = c("0", "1"))))

# Work on a copy so the raw table is left untouched.
c_m_TRANSFORMED_AAC <- c_m_RAW_AAC
```

1. Transform C, F, I using sqrt(x)
2. Columns: C=5, F=8, I=11

```
# Replace three amino acid columns by their square roots, addressed by position.
c_m_TRANSFORMED_AAC[[5]]  <- sqrt(c_m_TRANSFORMED_AAC[[5]])   # C
c_m_TRANSFORMED_AAC[[8]]  <- sqrt(c_m_TRANSFORMED_AAC[[8]])   # F
c_m_TRANSFORMED_AAC[[11]] <- sqrt(c_m_TRANSFORMED_AAC[[11]])  # I

# Save the transformed table, header row included.
file <- "./00-data/02-aac_dpc_values/c_m_TRANSFORMED.csv"
write_csv(c_m_TRANSFORMED_AAC,
          file = file,
          col_names = TRUE)
```