library(tidyverse)

# Load the survey records and open them in the data viewer.
surveys <- read_csv("data/portal_data_joined.csv")
View(surveys)

# is.na() returns a logical vector: TRUE wherever weight is missing.
is.na(surveys$weight)

# That logical test can be used as input to filter():
# keep only the rows whose weight is missing.
surveys %>%
  filter(is.na(weight))

# The not operator (!) changes TRUE to FALSE and FALSE to TRUE,
# so this keeps the rows whose weight is present.
surveys %>%
  filter(!is.na(weight))

# Mean weight of the rows that have one. mean() needs a numeric vector,
# not a whole data frame, so pull the column out of the table first.
surveys %>%
  filter(!is.na(weight)) %>%
  pull(weight) %>%
  mean()

# The same number, kept inside a one-row table via summarize().
surveys %>%
  filter(!is.na(weight)) %>%
  summarize(mean_weight = mean(weight))
# Per-sex mean weight; na.rm = TRUE tells mean() to skip missing weights.
grouped_surveys <- summarise(
  group_by(surveys, sex),
  mean_weight = mean(weight, na.rm = TRUE)
)
library("tidyverse")

# Mean weight for every sex / species_id combination.
surveys %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))

# Per-sex mean weight again, stored and then opened in the viewer.
grouped_surveys <- surveys %>%
  group_by(sex) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))
View(grouped_surveys)

# Same per-sex mean, but with the missing weights filtered out up front
# instead of being skipped inside mean().
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex) %>%
  summarise(mean_weight = mean(weight))
# Subset to records collected before 1995, keeping only year, sex and weight.
# The cutoff follows the Exercise 1 statement in this script ("before 1995");
# the earlier `year<1999` did not match it.
surveys %>%
  filter(year < 1995) %>%
  select(year, sex, weight)
library(tidyverse)
surveys %>%
  filter(year < 1995) %>%
  select(year, sex, weight)

# Equivalent: select first, then filter. This works because `year` is one of
# the retained columns.
surveys %>%
  select(year, sex, weight) %>%
  filter(year < 1995)
### Mutate
# Add weight_kg (weight divided by 1000) and print the result.
surveys %>%
  mutate(weight_kg = weight / 1000)

### Mutate
# Same derived column, this time stored and opened in the viewer.
mut_surveys <- surveys %>%
  mutate(weight_kg = weight / 1000)
View(mut_surveys)

# Function-call form of the same step.
mut_surveys <- mutate(surveys, weight_kg = weight / 1000)
View(mut_surveys)

# Two derived columns created in a single mutate() call.
mut_surveys <- mutate(
  surveys,
  weight_kg = weight / 1000,
  weight_kg_2 = weight * 2 / 1000
)
View(mut_surveys)

# Preview the result with head() and its default row count ...
surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_kg_2 = weight * 2 / 1000
  ) %>%
  head()

# ... and limited to the first three rows.
surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_kg_2 = weight * 2 / 1000
  ) %>%
  head(n = 3)
################# Summarizing data by categories ###############################
# Overall mean weight. na.rm = TRUE is required: mean() returns NA as soon as
# a single weight is missing.
surveys %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))
View(surveys)

# Mean weight by sex
surveys %>%
  group_by(sex) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))

# Mean weight by sex and species id
surveys %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))
# is.na() returns a logical vector. The column has to be reached through the
# data frame (surveys$weight); a bare `weight` is not an object at top level.
is.na(surveys$weight)
View(surveys)

# The logical test can be used as input to filter():
# keep the rows whose weight is missing.
surveys %>%
  filter(is.na(weight))

# Negated logical vector: TRUE where weight is present.
!is.na(surveys$weight)

# The not operator (!) changes TRUE to FALSE and FALSE to TRUE,
# so this keeps the rows whose weight is present.
surveys %>%
  filter(!is.na(weight))

# Filter out missing values rather than ignoring them inside mean().
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight))
# Print only the first 15 rows. The row count has to be passed by name
# (n = 15); a bare positional print(15) does not set the number of rows shown.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight)) %>%
  print(n = 15)

# Print the first 20 rows.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight)) %>%
  print(n = 20)
# Several summary statistics at once: mean and minimum weight for each
# sex / species_id group, computed on rows that have a weight.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(
    mean_weight = mean(weight),
    min_weight = min(weight)
  )

# tally() counts the records in each group: here, per sex.
surveys %>%
  group_by(sex) %>%
  tally()

# Record counts per sex / species_id combination.
surveys %>%
  group_by(sex, species_id) %>%
  tally()
########################### Spread ############################################
# Mean weight for each genus within each plot, on rows that have a weight.
surveys_gw <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(genus, plot_id) %>%
  summarise(mean_weight = mean(weight))

# Wide layout: one column per genus; genus/plot pairs with no data become 0.
# Only the head of the result is printed.
surveys_gw %>%
  spread(key = genus, value = mean_weight, fill = 0) %>%
  head()

# Same reshaping without a fill value, stored and opened in the viewer.
surveys_spread <- spread(surveys_gw, key = genus, value = mean_weight)
View(surveys_spread)
######################### Data wrangling in R ######################
#### Based on the data carpentry ecology lessons:
####       http://www.datacarpentry.org/R-ecology-lesson/03-dplyr.html

# Installing packages: only install tidyverse when it is not already
# available, so re-running the script does not reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Loading data
surveys <- read_csv("data/portal_data_joined.csv")

############################## The Verbs! ################################
### Select: keep only the named columns.
select(surveys, plot_id, species_id, weight)

### Filter: keep only the rows that satisfy the condition.
filter(surveys, year == 1995)

### Pipes: chain the verbs, feeding each result into the next step.
surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)

# Same pipeline, stored in a new object and then printed.
surveys_sml <- surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)
surveys_sml
# Exercise #1:
### Using pipes, subset the survey data to include individuals collected
###     before 1995 and retain only the columns year, sex, and weight.
# The cutoff is 1995, as the exercise states (the earlier `year<1999` let
# four extra years through).
surveys %>%
  filter(year < 1995) %>%
  select(year, sex, weight)

### OR ###
# Selecting first also works because `year` is among the retained columns.
surveys %>%
  select(year, sex, weight) %>%
  filter(year < 1995)
### Mutate
# Pipe form: add weight_kg (weight divided by 1000).
mut_surveys <- surveys %>%
  mutate(weight_kg = weight / 1000)

# Same as the function-call form.
mut_surveys <- mutate(surveys, weight_kg = weight / 1000)

# Create multiple columns in one mutate() call.
mut_surveys <- mutate(
  surveys,
  weight_kg = weight / 1000,
  weight_kg_2 = weight * 2 / 1000
)

# Show only the first 3 rows of the result.
surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_kg_2 = weight * 2 / 1000
  ) %>%
  head(n = 3)
##### Exercise 2
# Create a new data frame from the survey data that meets the following criteria:
# 1. Contains only the species_id column and a new column called hindfoot_half
# 2. hindfoot_half contains values that are half the hindfoot_length values.
# 3. Only include records from 1990 and after
# Hint: think about how the commands should be ordered to produce this data frame!
surveys %>%
  # "1990 and after" includes 1990 itself, hence >= rather than >.
  filter(year >= 1990) %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  # select() comes last: `year` and `hindfoot_length` are needed by the
  # steps above and must still exist when those run.
  select(species_id, hindfoot_half)
################# Summarizing data by categories ###############################
# Mean weight over the whole table; missing weights are skipped by na.rm.
summarise(surveys, mean_weight = mean(weight, na.rm = TRUE))

# One mean per sex.
surveys %>%
  group_by(sex) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))

# One mean per sex / species_id combination.
surveys %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))
################### Removing missing values with filter ########################
# Logical vector: TRUE where weight is present.
!is.na(surveys$weight)

# A logical test can be used as input to filter(); this keeps the rows
# whose weight is missing.
filter(surveys, is.na(weight))

# The not operator (!) flips TRUE and FALSE, keeping rows with a weight.
filter(surveys, !is.na(weight))

# Filter out missing values rather than ignoring them inside mean().
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight))

# Same summary, printing the first 20 rows.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight)) %>%
  print(n = 20)

# Several summary statistics at once: mean and minimum weight per group.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(
    mean_weight = mean(weight),
    min_weight = min(weight)
  )

# Tally: count the records in each sex / species_id category.
surveys %>%
  group_by(sex, species_id) %>%
  tally()
################################# Exercise 3 ####################################################################
# How many individuals were caught in each plot_type surveyed?
# Use group_by() and summarize() to find the mean, min, and max hindfoot length for each species (using species_id).
# What was the heaviest animal measured in each year? Return the columns year, genus, species_id, and weight.
# You saw above how to count the number of individuals of each sex using a combination of group_by() and tally().
# How could you get the same result using group_by() and summarize()? Hint: see ?n.

# Number of individuals per plot type.
surveys %>%
  group_by(plot_type) %>%
  tally()

# Hindfoot length (mean / min / max) per species, using only the rows
# that have a hindfoot_length.
surveys %>%
  group_by(species_id) %>%
  filter(!is.na(hindfoot_length)) %>%
  summarise(
    mean_hfl = mean(hindfoot_length),
    min_hfl = min(hindfoot_length),
    max_hfl = max(hindfoot_length)
  )
# Heaviest animal measured in each year.
# Within each year, keep the row(s) whose weight equals that year's maximum.
# `weight` stays in the selection: the exercise asks for it, and the next
# step needs it.
big_animal <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(year) %>%
  filter(weight == max(weight)) %>%
  select(year, genus, species_id, weight) %>%
  arrange(year)

# filter() needs a logical condition, so compare weight to its maximum.
# big_animal is still grouped by year, so ungroup first; otherwise the
# comparison runs per year and returns every row again.
# NOTE(review): presumably the intent is the heaviest record overall - confirm.
big_species <- big_animal %>%
  ungroup() %>%
  filter(weight == max(weight))
# Only install tidyverse when it is missing, so re-running the script does
# not reinstall the package each time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
########################### Spread ############################################
# Mean weight for each genus within each plot, on rows that have a weight.
surveys_gw <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(genus, plot_id) %>%
  summarize(mean_weight = mean(weight))
library(tidyverse)
########################### Spread ############################################
# Mean weight for each genus within each plot, on rows that have a weight.
surveys_gw <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(genus, plot_id) %>%
  summarise(mean_weight = mean(weight))
View(surveys_gw)

# Wide layout: one column per genus holding that genus' mean weight.
surveys_spread <- spread(surveys_gw, key = genus, value = mean_weight)
View(surveys_spread)

# Same reshaping, with genus/plot pairs that have no data filled with 0.
surveys_spread <- spread(
  surveys_gw,
  key = genus,
  value = mean_weight,
  fill = 0
)
############################### Gather #########################################
# Back to long layout: the genus columns Baiomys through Spermophilus are
# stacked into a genus / mean_weight pair. Only the head is printed.
surveys_spread %>%
  gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>%
  head()
View(surveys_gw)

# Step 1: stack hindfoot_length and weight into a measurement / value pair.
long_data <- gather(
  surveys,
  key = measurement,
  value = value,
  hindfoot_length, weight
)
View(long_data)

# Step 2: mean of each measurement per plot type and year, on rows that
# have a value.
mean_values <- long_data %>%
  filter(!is.na(value)) %>%
  group_by(measurement, plot_type, year) %>%
  summarise(mean = mean(value))
View(mean_values)

# Step 3: spread() them into a data set with a column
# for hindfoot_length and weight.
spread(mean_values, key = measurement, value = mean)
# Keep only the records that pass all four filters: non-empty species_id,
# weight present, hindfoot_length present, non-empty sex.
surveys_complete <- surveys %>%
  filter(
    species_id != "",
    !is.na(weight),
    !is.na(hindfoot_length),
    sex != ""
  )
View(surveys_complete)

# Number of records per species. The source table is surveys_complete;
# `species_complete` is not defined anywhere in this script.
species_count <- surveys_complete %>%
  group_by(species_id) %>%
  tally()
View(species_count)

# Restrict the counts to species with more than 50 records ...
species_count <- surveys_complete %>%
  group_by(species_id) %>%
  tally() %>%
  filter(n > 50)

# ... and keep only the records that belong to those species.
surveys_complete <- surveys_complete %>%
  filter(species_id %in% species_count$species_id)
################################# Exporting data #######################################################
# Write the cleaned table to a CSV file. The output location is passed
# positionally: readr deprecated the `path` argument name in favour of
# `file`, and the positional form works under either name.
write_csv(surveys_complete, "data/surveys_complete.csv")
library(tidyverse)
surveys <- read_csv("data/portal_data_joined.csv")

# Add weight_kg (weight divided by 1000) and inspect the result.
surveys_kg <- mutate(surveys, weight_kg = weight / 1000)
View(surveys_kg)

# Pipe form of the same step; the result is printed, not stored.
surveys %>%
  mutate(weight_kg = weight / 1000)
View(surveys_kg)

# Two derived columns in one call. They need distinct names: reusing
# `weight_kg` for the second one overwrites the first, leaving a single
# column that holds the doubled value.
surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_kg_2 = weight / 1000 * 2
  )

surveys_kg_2 <- surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_kg_2 = weight / 1000 * 2
  )
View(surveys_kg_2)
View(surveys_kg)
View(surveys_kg_2)

# NOTE(review): this reassignment keeps only what head() returns (the first
# rows), so surveys_kg_2 no longer holds the full table - confirm intended.
surveys_kg_2 <- surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_kg_2 = weight / 1000 * 2
  ) %>%
  head()
View(surveys_kg_2)
