################## Install packages ############################################
#.libPaths(new = "D:/r-packages")

#install.packages(c("tidyverse", "plyr"))

################## Data cleaning starts at import ##############################

# Only the required parameter: the path to the file
surveys <- read.csv(file = "data/surveys_no_header.csv")

# The header parameter: does the csv have a header row?
surveys <- read.csv(file = "data/surveys_no_header.csv", header = FALSE)

# The col.names parameter: give the dataset column names
surveys <- read.csv(
  file = "data/surveys_no_header.csv",
  header = FALSE,
  col.names = c(
    "recordID", "mo", "dy", "yr",
    "plot", "species", "scientificName", "locality",
    "decimalLatitude", "decimalLongitude", "county", "state",
    "country", "sex", "hfl", "wgt"
  )
)

# Check column classes: not everything makes sense as a factor
str(surveys)

# The colClasses parameter: state the data type of every column up front
# (one entry per column, in the same order as col.names)
surveys <- read.csv(
  file = "data/surveys_no_header.csv",
  header = FALSE,
  col.names = c(
    "recordID", "mo", "dy", "yr",
    "plot", "species", "scientificName", "locality",
    "decimalLatitude", "decimalLongitude", "county", "state",
    "country", "sex", "hfl", "wgt"
  ),
  colClasses = c(
    "character", "factor", "factor", "factor",
    "factor", "factor", "factor", "character",
    "numeric", "numeric", "factor", "factor",
    "factor", "factor", "numeric", "numeric"
  )
)

# Confirm the data type changes
str(surveys)

#notice that some of the factor variables have "" as the first level

#The na.strings parameter: tell R what labels represent missing data

# Re-import with na.strings so that both the literal text "NA" and empty
# fields are read as missing values. na.strings takes a character vector,
# so the label is written as the string "NA" rather than the logical NA.
surveys <- read.csv(
  file = "data/surveys_no_header.csv",
  header = FALSE,
  col.names = c(
    "recordID", "mo", "dy", "yr",
    "plot", "species", "scientificName", "locality",
    "decimalLatitude", "decimalLongitude", "county", "state",
    "country", "sex", "hfl", "wgt"
  ),
  colClasses = c(
    "character", "factor", "factor", "factor",
    "factor", "factor", "factor", "character",
    "numeric", "numeric", "factor", "factor",
    "factor", "factor", "numeric", "numeric"
  ),
  na.strings = c("NA", "")
)

# See that the "" levels are gone from the factors
str(surveys)

########################### Data type conversions ##############################
## What if you want to change the data type after it's loaded?

class(surveys$plot)                         # the plot variable is a factor

surveys$plot <- as.character(surveys$plot)  # factor to character
class(surveys$plot)

surveys$plot <- as.numeric(surveys$plot)    # character to numeric
class(surveys$plot)

surveys$plot <- as.factor(surveys$plot)     # numeric to factor
class(surveys$plot)

# Year is currently stored as a factor: what if we want to convert to a number?
year <- as.numeric(surveys$yr) # wrong: WHY?
# (as.numeric() on a factor returns the internal integer level codes,
#  not the numbers shown in the labels)

# A slow way to do this: go through character first, then to numeric
year <- as.character(surveys$yr) # convert from factor to character
year <- as.numeric(year)         # convert from character to number

################## Faceting with levels and factors ############################

# Work on a copy of the column so we don't mess up the data frame
sex <- surveys$sex

levels(sex)   # what are the levels?
nlevels(sex)  # how many levels are there?
summary(sex)  # how many values are in each category?

library(dplyr)

# Collapse the P, R and Z levels into a single "other" level
sex <- recode(sex, P = "other", R = "other", Z = "other")
summary(sex)

##################### Exercise: fix "Dipodomys\xe6sp." and "Onychomys\xe6sp."
# replace \xe6 with a space using recode factor

# Replace the two levels containing the stray \xe6 byte with clean labels
surveys$scientificName <- recode(
  surveys$scientificName,
  "Dipodomys\xe6sp." = "Dipodomys sp.",
  "Onychomys\xe6sp." = "Onychomys sp."
)
levels(surveys$scientificName)

levels(surveys$scientificName)   # lots of similar-looking levels
nlevels(surveys$scientificName)  # number of levels

# Strip leading/trailing whitespace from the names
surveys$scientificName <- trimws(surveys$scientificName)
nlevels(surveys$scientificName)  # error?
class(surveys$scientificName)    # it's a character

# Convert back to a factor so the levels are rebuilt from the trimmed text
surveys$scientificName <- as.factor(surveys$scientificName)
nlevels(surveys$scientificName)  # fewer levels
levels(surveys$scientificName)   # reduced duplications

#################### Fuzzy string matching (clustering) ########################
library(stringdist) #contains functions that compares strings

# Small examples of how string distance is counted
stringdist(a = "abc", b = "abc")  # no difference = 0 distance
stringdist(a = "abc", b = "abd")  # 1 difference = distance of 1
stringdist(a = "abc", b = "cba")  # 2 differences = distance of 2
stringdist(a = "abc", b = "def")  # 3 differences = max distance of 3

# Save the scientific name column into its own vector
sp_names <- surveys$scientificName

# Compare every species name to one species
# (low numbers = more similar)
stringdist(a = sp_names, b = "Amphispiza bilineata")

# Look at the levels
levels(sp_names)
# misspellings?

### Tell R what the correct spellings are (from readme)
# NOTE(review): "Amphispiza cilineata" looks like a misspelling of
# "Amphispiza bilineata" -- confirm against the readme before relying on it.
codes <- c(
  "Ammodramus savannarum",
  "Ammospermophilus harrisii",
  "Amphispiza bilineata",
  "Amphispiza cilineata",
  "Baiomys taylori",
  "Calamospiza melanocorys",
  "Callipepla squamata",
  "Campylorhynchus brunneicapillus",
  "Chaetodipus baileyi",
  "Cnemidophorus tigris",
  "Cnemidophorus uniparens",
  "Crotalus scutalatus",
  "Crotalus viridis",
  "Dipodomys merriami",
  "Dipodomys ordii",
  "Dipodomys spectabilis",
  "Dipodomys sp.",
  "Onychomys leucogaster",
  "Onychomys torridus",
  "Onychomys sp."
)

# For each raw name, find the position of its match in codes
i <- amatch(
  x = sp_names,   # the list of things you want to code
  table = codes   # the list of acceptable values
)

# Build a data frame that puts the raw text next to its assigned code
sp_names_df <- data.frame(
  rawtext = sp_names,  # list of uncorrected species names
  code = codes[i]      # looks up which code it was matched to
)

# Look at the data frame
View(sp_names_df)
# Uh oh, lots of NAs (means they weren't assigned to codes)

# Change the clustering method
i <- amatch(x = sp_names, table = codes, method = "cosine")

# Create the comparison data frame again
sp_names_df <- data.frame(
  rawtext = sp_names,
  code = codes[i]
)

# Are there any unassigned? - not at the top
sum(is.na(sp_names_df$code))

# Is this the same as the original dataset?
sum(is.na(surveys$scientificName))

sum(is.na(sp_names_df$code)) == sum(is.na(surveys$scientificName))

# Assign the coded column back to the original data frame
surveys$scientificName <- sp_names_df$code

########################## Splitting columns ###################################

# Splitting columns

library(tidyr) #contains the separate and unite functions

# Split scientificName on the space into separate genus and sp columns
surveys <- separate(
  data = surveys,           # your data frame
  col = scientificName,     # column to split
  into = c("genus", "sp"),  # names of new columns
  sep = " ",                # what to split on
  remove = FALSE            # keeps original column
)