-
Notifications
You must be signed in to change notification settings - Fork 1
/
BacDiveR_check.Rmd
100 lines (84 loc) · 2.6 KB
/
BacDiveR_check.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
---
title: "BacDiveR_check"
author: "Ilya"
date: "1/31/2019"
output: github_document
---
### Look at data for one species from BacDive
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
#### Install and load packages
```{r packages}
# Attach package `x`, installing it first when it is not available.
#
# x: name of the package, as a single character string.
#
# Availability is probed with requireNamespace() instead of looking the name
# up in installed.packages(): the latter scans every installed package and is
# documented as unsuitable for checking whether one named package is usable.
# A missing package is installed together with its dependencies, then the
# package is attached with library(), which raises an error if it still
# cannot be loaded.
pkgTest <- function(x) {
  if (!requireNamespace(x, quietly = TRUE)) {
    install.packages(
      x,
      dependencies = TRUE,
      repos = "http://cran.us.r-project.org"
    )
  }
  library(x, character.only = TRUE)
}
# Packages used by this notebook; "rstan" is currently disabled.
neededPackages <- c(
  "data.table",
  "glue",
  "rlang",
  "tibble",
  "tidyselect",
  "dplyr",
  "ggplot2",
  "tidyr"
)
# "rstan"

# Install (when missing) and attach every listed package, one at a time.
for (package in neededPackages) {
  pkgTest(package)
}
```
```{r}
# Read the saved BacDive table for one species.
D <- readRDS("Data for Bacillus halotolerans.rds")

# Which columns does the table have?
names(D)

# Distinct values of the `field` column
unique(D$field)

# Distinct values of the `section` column
unique(D$section)

# Restrict to the "taxonomy_name" section
test <- subset(D, section == "taxonomy_name")

# Distinct subsections inside that section
unique(test$subsection)

# Restrict further to the "strains" subsection
test_strains <- subset(test, subsection == "strains")

# Fields recorded for strains
unique(test_strains$field)

# Cross-tabulate is_type_strain values against bacdive_id
test_strain_type <- subset(test_strains, field == "is_type_strain")
table(test_strain_type$bacdive_id, test_strain_type$value)
```
#### Find out what values should be there for one ID
```{r}
# All rows belonging to a single BacDive ID, and the distinct values they hold.
id_1 <- subset(D, bacdive_id == "100619")
unique(id_1$value)
```
### Try spread and dcast
```{r}
# ---- Approach 1: tidyr::spread ----

# Keep only the three columns needed to reshape from long to wide.
D <- D[, c("bacdive_id", "field", "value")]

# Tried and rejected: recoding NA values before spreading, e.g.
#   D$value[is.na(D$value)] <- -9999
# This gets a duplicate error, as does replacing NA with blank.

# De-duplicate rows.
D_uni <- distinct(D)
dim(D_uni)

# Keep the NA-valued rows aside for inspection, then remove them.
D_na <- D_uni[is.na(D_uni$value), ]
D_uni <- D_uni[!is.na(D_uni$value), ]
dim(D_uni)

# Drop the ID_reference* fields. The explicit is.na() guard keeps this
# equivalent to filtering with `field != ...` once per name.
D_uni <- subset(
  D_uni,
  !is.na(field) &
    !(field %in% c("ID_reference", "ID_reference1", "ID_reference2"))
)

# Alternatives that were tried:
#   Spread_1 = spread(D_uni, key = c(bacdive_id, field), value = value)
#   Spread_1 = spread(D_uni, key = bacdive_id, value = value)
# The second makes columns equal to ID and rows equal to field name.

# One row per bacdive_id, one column per field.
Spread_2 <- spread(D_uni, key = field, value = value)
write.csv(Spread_2, file = "bacdive_1_species_wide.csv")

# ---- Approach 2: data.table::dcast (this also works) ----
library(data.table)

D_back <- D
D <- D_back

# As above, recoding NA to -9999 here gets a duplicate error.

# De-duplicate rows; NA values are left in place for this approach.
D_uni <- distinct(D)
D_uni <- subset(
  D_uni,
  !is.na(field) &
    !(field %in% c("ID_reference", "ID_reference1", "ID_reference2"))
)

# Casting the un-deduplicated table was the earlier attempt:
#   D_cast = dcast(setDT(D), bacdive_id ~ field, value.var = "value")
D_cast <- dcast(setDT(D_uni), bacdive_id ~ field, value.var = "value")
write.csv(D_cast, file = "bacdive_1_species_wide_dcast.csv")
```