# Moth declines are most severe in broadleaf woodlands despite a net gain in habitat 
# availability - Blumgart et al., 2022

# Script_1_Making_diversity_stats_data_frame

# This script is to get the diversity stats for each site-year using iNEXT. I will calculate estimated
# species richness (Hill number 0) and Shannon diversity (Hill number 1), aka 'effective common species'.

# The only data frame needed here is "All_years_all_species_all_sites.csv". This contains the annual
# totals of each species in each site in each year. 

# Libraries ####

library(dplyr) # For data manipulation
library(tidyr) # For data manipulation
library(iNEXT) # For calculating species richness and diversity

# Import and prepare data ####

# Read in raw species data (yearly totals for each species)
All_years <- read.csv("Data/All_years_all_species_all_sites.csv")

# The data is in long form. It's needs to be in a S x N form with species as rows and samples as
# columns. Rearrange it.

# Rename variables
All_years <- 
  All_years %>%
  rename(Year = CalYear, SITE = RIS.TrapCode, RIS_code = Code.RISgeneric, Common_name = Common.Name,
         Binomial = binomial)

# Cut it down to the years I need (1968 - 2016) and cut out the Channel Isles
All_years_2 <- 
  All_years %>%
  dplyr::filter(Year >= 1968 & Year <= 2016) %>%
  dplyr::filter(Country != "Channel Islands [United Kingdom]")

# Now get rid of records that are not identified to species level
# Make a small df to look at 
Temp <- 
  All_years_2 %>%
  dplyr::select(RIS_code, Common_name, Binomial) %>%
  dplyr::distinct()

# Remove the ones that could not be identified to species
All_years_3 <-
  All_years_2 %>%
  dplyr::filter(Binomial != "Diachrysia species") %>%
  dplyr::filter(Binomial != "Epirrhoe species") %>%
  dplyr::filter(Binomial != "Geometridae species") %>%
  dplyr::filter(Binomial != "Macaria species") %>%
  dplyr::filter(Binomial != "Mesapamea species") %>%
  dplyr::filter(Binomial != "Mesoligia species") %>%
  dplyr::filter(Binomial != "Noctuidae species") %>%
  dplyr::filter(Binomial != "Plusia species") %>%
  dplyr::filter(Binomial != "Chloroclysta species") %>%
  dplyr::filter(Binomial != "Eilema species") %>%
  dplyr::filter(Binomial != "Eupithecia species") %>%
  dplyr::filter(Binomial != "Hoplodrina species") %>%
  dplyr::filter(Binomial != "Unidentifiable macro spp") %>%
  dplyr::filter(Binomial != "Idaea species") %>%
  dplyr::filter(Binomial != "Acleris cristana") # A micro snuck through!

# Note that Acronicta species, Amphipoea species, Epirrita species and Oligia species are typically
# aggregated in the RIS so they are left in as valid taxa

# Make a consistent single taxon for Common/Lesser Common Rustic
All_years_3$Binomial[All_years_3$RIS_code == 3392] <- "Mesapamea secalis/didyma"
All_years_3$Binomial[All_years_3$RIS_code == 3047] <- "Mesapamea secalis/didyma"
All_years_3$Binomial[All_years_3$RIS_code == 3046] <- "Mesapamea secalis/didyma"

# Fix error for Pale Brindled Beauty
All_years_3$Binomial[All_years_3$RIS_code == 929] <- "Phigalia pilosaria"

# And for Plain Pug
All_years_3$Binomial[All_years_3$RIS_code == 840] <- "Eupithecia simpliciata"

# Check that all the remaining species are either true species or accepted aggregates (Amphipoea, 
# Epirrita and Oligia species)
Temp <- 
  All_years_3 %>%
  dplyr::select(RIS_code, Common_name, Binomial) %>%
  dplyr::distinct() # Looks fine

# Remove this df
rm(Temp)

# I now need to get it in the form so that each column is one site-year...

# First, cut it down to its essential variables
All_years_4 <- 
  All_years_3 %>%
  dplyr::select(Year, SITE, Binomial, Annual_count)

# Get rid of site-years with very few observations otherwise it screws up the iNEXT stuff later on. 

# Get annual totals
Annual_totals <- 
  All_years_4 %>%
  group_by(SITE, Year) %>%
  dplyr::summarise(Sum = sum(Annual_count))

# Merge this on to All_years_4
All_years_4_B <- merge(All_years_4, Annual_totals, by = c("SITE", "Year"))

# Now cut out anything with a site-year total of less than 50 (an arbitrary minimum arrived at 
# by trial and error)
All_years_4_C <- 
  All_years_4_B %>%
  dplyr::filter(Sum > 50)

# And get rid of the Sum column
All_years_4_C$Sum <- NULL

# combine Site-year into one variable and remove SITE and Year
All_years_4_C$Site_year <- paste0(All_years_4_C$SITE,"_",All_years_4_C$Year)
All_years_4_C$Year <- NULL
All_years_4_C$SITE <- NULL

# Round the counts and convert to integers. 
All_years_4_C$Annual_count <- round (All_years_4_C$Annual_count, 0)
All_years_4_C$Annual_count <- as.integer(All_years_4_C$Annual_count)

# Get rid of duplicates created by standardising the binomial names above 
All_years_4_C <- 
  All_years_4_C %>%
  dplyr::group_by(Site_year, Binomial) %>%
  dplyr::summarise(Annual_count = sum(Annual_count)) # This got rid of 7 duplicate rows

# Now convert data frame to wide form
All_years_5 <- pivot_wider(All_years_4_C, names_from = Site_year, 
                           values_from = Annual_count, values_fill = list(Annual_count = 0))

# Name rows as species
rownames(All_years_5) <- paste(All_years_5$Binomial)

# Cut off the Binomial column to make the final df for rarefying
All_years_to_rarefy <- 
  All_years_5 %>%
  dplyr::select(-Binomial)

# Turn it into a dataframe
All_years_to_rarefy <- as.data.frame(All_years_to_rarefy)

# Data prep complete

# # #

# Calculating richness with iNEXT ####

# Get the richness estimates
Richness_df <- ChaoRichness(All_years_to_rarefy)

# Now to get back the SITE and Year

# Make a colunm of the rownames
Richness_df$SITE_Year <- rownames(Richness_df)

# Use regular expressions to extract the year and site numbers
Richness_df$Year <- gsub("\\d+_", "", Richness_df$SITE_Year) # "\\d" means a digit and 
# the "+" means any number of... So this translates to: When you find any number of digits 
# following by an underscore, replace it with nothing

# Do the same for SITE
Richness_df$SITE <- gsub("_\\d+", "", Richness_df$SITE_Year)

# Now rename columns to remember that these are richness values.
Richness_df <-
  Richness_df %>%
  rename(Actual_sp_richness = Observed, Est_sp_richness = Estimator, 
         Est_sp_richness_se = Est_s.e., Est_sp_richness_low = "95% Lower", 
         Est_sp_richness_upp = "95% Upper")

# Get rid of SITE_Year
Richness_df$SITE_Year <- NULL

# # # 

# Now do the same for Shannon and Simpson indicies

# Calculating Shannon with iNEXT ####

Shannon_df <- ChaoShannon(All_years_to_rarefy, transform = TRUE) # transform = TRUE to get it back to the
# "effective number of common species scale"

# Now to get back the SITE and Year

# Make a colunm of the rownames
Shannon_df$SITE_Year <- rownames(Shannon_df)

# Use regular expressions to extract the year and site numbers
Shannon_df$Year <- gsub("\\d+_", "", Shannon_df$SITE_Year)

# Do the same for SITE
Shannon_df$SITE <- gsub("_\\d+", "", Shannon_df$SITE_Year)

# Now rename columns to remember that these are richness values.
Shannon_df <-
  Shannon_df %>%
  rename(Actual_Shannon = Observed, Est_Shannon = Estimator, 
         Est_Shannon_se = "Est_s.e", Est_Shannon_low = "95% Lower", 
         Est_Shannon_upp = "95% Upper")

# Get rid of SITE_Year
Shannon_df$SITE_Year <- NULL

# # #

# Merge them
Big_diversity_df <- merge(Richness_df, Shannon_df, by = c("SITE", "Year"), all = TRUE)

# Save the df (this file is already provided in the Data folder so does not need to be rewritten)
##write.csv(Big_diversity_df, "Data/Diversity_stats_complete.csv", row.names = FALSE)

# # # 

# End of script. This data frame is used in the next stage of the modeling in Script_2_General_analysis