# Moth declines are most severe in broadleaf woodlands despite a net gain in habitat availability - Blumgart et al., 2022
# Turning_raw_data_to_site_indices
# March 2021

# The purpose of this code is to take raw daily moth data and produce a site index (sindex)
# for each species in each year. It does this by taking into account missing values
# within a year, which it estimates based on the species' flight season using GAMs.
# For site-years where the entire flight season for a species is sampled, the sindex is
# equal to the total number of moths caught that year. For site-years where there are
# missing weeks, the sindex is an estimate of how many moths would have been caught
# that year had the trap been running for the entire period.

# This code was written primarily by Colin Harrower at the Centre for Ecology & Hydrology (CEH)
# for the purpose of producing moth abundance trends for "Moth trends for Britain and Ireland
# from the Rothamsted Insect Survey light-trap network (1968 to 2016)" (Harrower et al., 2019),
# which are available at:
# https://doi.org/10.5285/0a7d65e8-8bc8-46e5-ab72-ee64ed851583
# and also for the "Atlas of Britain and Ireland's Larger Moths" (Randle 2019).

# This code is also used in the UK Butterfly Monitoring Scheme (UKBMS) Generalized Abundance Index (GAI)
# analyses and is reproduced here with the permission of the UKBMS (https://ukbms.org/).
# See:
# Dennis, E. B., Morgan, B. J. T., Freeman, S. N., Brereton, T. M., & Roy, D. B. (2016).
# A generalized abundance index for seasonal invertebrates. Biometrics, 72(December), 1305–1314.
# https://doi.org/10.1111/biom.12506
# for more details on this technique.

# The code below has been edited by me (Dan Blumgart) and so is not identical to that used in
# the publications above. The primary difference is that the original code used raw counts whereas
# mine uses derived counts. E.g., if a moth trap was left running for 5 nights and caught 100 moths,
# the raw count would be 100 moths caught on the final night, whereas the derived counts would be
# 20 moths caught on each of the 5 nights (i.e., an averaged catch). As the script below uses maximum
# counts per week, using the derived counts avoids the excessively large counts that would result from
# using the raw count. This does not change the slopes of trends in any systematic way but reduces
# the amount of unnecessary noise in the data.
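# Illustration only, not part of the pipeline: a minimal sketch of the derived-count idea described
# above. This assumes DAILY_COUNT was derived as COUNT / DAYS_FOR_COUNT; the raw species files read
# in below already contain the derived DAILY_COUNT column.
trap_example = data.frame(COUNT = 100, DAYS_FOR_COUNT = 5) # 100 moths over a 5-night trap run
trap_example$DAILY_COUNT = trap_example$COUNT / trap_example$DAYS_FOR_COUNT
trap_example$DAILY_COUNT # 20 moths per night, rather than 100 on the final night
rm(trap_example)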
# Add libraries ####
library("mgcv")    # For GAMs
library("nlme")    # For mixed linear models
library("ggplot2") # For plotting
library("dplyr")   # For data manipulation
#
#
#
#
# Define functions ####

# clean_data_daily takes the maximum of the derived daily abundance in each week. The argument
# season_lim is not used here and is set to its maximum extent. I.e., the season limit runs for all
# weeks of the year: 1 to 53.
clean_data_daily = function(sp_data, tp_col = "WEEKNO", add_cols = NULL, season_lim = c(1,53)){
  # Remove any negative counts (shouldn't be any but better safe!)
  rm_inds = which(sp_data$DAILY_COUNT < 0)
  if(length(rm_inds) > 0){
    sp_data = sp_data[-rm_inds,]
  }
  # Create vector of columns by which the sp_data is to be reduced
  by_cols = c("SPECIES","SITE","YEAR",tp_col, add_cols)
  # Now determine the max value for each unique combination of the by_cols
  temp = aggregate(list(DAILY_COUNT = sp_data[,"DAILY_COUNT"]), sp_data[,by_cols], FUN = max, na.rm = TRUE)
  temp = temp[order(temp$YEAR, temp$SITE, temp[,tp_col]),]
  row.names(temp) = NULL
  # Remove any counts outside of season
  if(!is.null(season_lim)){
    stopifnot(season_lim[1] < season_lim[2])
    rm_inds = which(temp[,tp_col] < season_lim[1] | temp[,tp_col] > season_lim[2])
    if(length(rm_inds) > 0){
      temp = temp[-rm_inds,]
    }
  }
  # Remove any sites which never have a positive count within season
  site_totals = tapply(temp$DAILY_COUNT, temp$SITE, sum)
  rm_sites = names(which(site_totals == 0))
  if(length(rm_sites) > 0){
    rm_inds = which(temp$SITE %in% rm_sites)
    temp = temp[-rm_inds,]
  }
  # Round the weekly maximums up to integers
  temp$DAILY_COUNT = ceiling(temp$DAILY_COUNT)
  return(temp)
}
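# A quick toy illustration of clean_data_daily (made-up example data, not part of the pipeline):
# the two catches in week 20 are reduced to their maximum and values are rounded up.
toy_counts = data.frame(SPECIES = 1, SITE = 1, YEAR = 2000,
                        WEEKNO = c(20, 20, 21), DAILY_COUNT = c(2.5, 4.0, 1.2))
clean_data_daily(toy_counts) # one row per week: ceiling(max(2.5, 4.0)) = 4, ceiling(1.2) = 2
rm(toy_counts)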
# est_fp_spline estimates the flight period for each species.
est_fp_spline = function(yr_data, min_visit = NULL, min_pos = NULL, add_anchor = FALSE,
                         tp_col = "WEEKNO", season_lim = c(1,53), verbose = FALSE, maxit = NULL,
                         site_effect = FALSE, gam_func = "gam"){
  # Data checks and argument checks
  # Check that the data contains the expected columns
  if(any(!c("SITE","DAILY_COUNT",tp_col) %in% names(yr_data))){
    stop("yr_data must contain columns titled SITE, DAILY_COUNT, and ", tp_col)
  }
  # Check that yr_data only contains data for 1 year and 1 species
  if(length(unique(yr_data$YEAR)) > 1){
    stop("yr_data contains data for more than 1 year")
  }
  if(length(unique(yr_data$SPECIES)) > 1){
    stop("yr_data contains data for more than 1 species")
  }
  # Check that there is data
  if(nrow(yr_data) == 0){
    warning("Data frame passed to flight period GAM contains no data (0 rows) returning NULL")
    return(NULL)
  }
  # If min_pos and min_visit supplied then check that min_pos <= min_visit
  if(!is.null(min_visit) & !is.null(min_pos)){
    if(min_pos > min_visit){
      stop("Minimum number of positive counts cannot be greater than the minimum number of visits")
    }
  }
  # Clean the yr_data
  yr_data = clean_data_daily(yr_data, tp_col = tp_col, season_lim = season_lim)
  # Double check that there is still data; if not then set nm_data to NULL and return
  if(nrow(yr_data) == 0){
    nm_data = NULL
    # Output progress statement
    if(verbose) cat("WARNING: No positive counts for this species/year combination (exiting and returning NULL)\n")
    return(nm_data)
  }
  # Attempt to load mgcv package and return error if not found
  temp = require(mgcv)
  if(!temp) stop("Required package 'mgcv' not found")
  # Determine the number of visits and positive counts for each site
  visit_inds = which(yr_data$DAILY_COUNT >= 0) # A visit should always have a 0 or a positive count
  n_visit = sapply(tapply(yr_data[visit_inds,tp_col], yr_data$SITE[visit_inds], unique), length)
  pos_inds = which(yr_data$DAILY_COUNT > 0)
  n_pos = sapply(tapply(yr_data[pos_inds,tp_col], yr_data$SITE[pos_inds], unique), length)
  # Cleanup
  rm(visit_inds, pos_inds)
  # If min_visit and/or min_pos not null then find sites passing criteria
  if(!is.null(min_pos) & !is.null(min_visit)){
    pass_sites = sort(intersect(names(n_pos)[n_pos >= min_pos], names(n_visit)[n_visit >= min_visit]))
    # Output progress statement
    if(verbose) cat("Excluding sites that have fewer than ",min_visit," visits or ",min_pos," positive counts\n", sep="")
  } else if(!is.null(min_pos)){
    pass_sites = names(n_pos)[n_pos >= min_pos]
    # Output progress statement
    if(verbose) cat("Excluding sites that have fewer than ",min_pos," positive counts\n", sep="")
  } else if(!is.null(min_visit)){
    pass_sites = names(n_visit)[n_visit >= min_visit]
    # Output progress statement
    if(verbose) cat("Excluding sites that have fewer than ",min_visit," visits\n", sep="")
  }
  # Determine which sites are to be excluded based on min_visit and min_pos criteria and then exclude them
  if(exists("pass_sites")){
    # Initialise excluded site vector as all site codes
    exc_sites = unique(yr_data$SITE)
    # Remove any site codes that are in pass_sites
    exc_sites = sort(exc_sites[which(!exc_sites %in% pass_sites)])
    # Output progress statement (if verbose = TRUE)
    if(verbose){
      if(length(exc_sites) > 0){
        cat("  Excluding ", length(exc_sites), " of ", length(exc_sites) + length(pass_sites),
            " sites:\n  ", paste(exc_sites, collapse=", "), "\n", sep="")
      } else {
        cat("  No sites excluded as none found that fall below specified thresholds\n", sep="")
      }
    }
    # Now drop these sites from yr_data
    gam_inds = which(yr_data$SITE %in% pass_sites)
    yr_data = yr_data[gam_inds,]
    rm(pass_sites)
  }
  # Check that there is still data
  if(nrow(yr_data) > 0){
    # Are anchor points to be added (needed for figuring out which indices to use for analysis)
    if(add_anchor){
      # Determine anchor days/weeks
      # If tp is dayno then add anchors of 5 consecutive days on each side of the range, where the
      # anchor days run from 10 to 14 days outside the range limits
      if(toupper(tp_col) == "DAYNO"){
        anc_tp = c(season_lim[1]-14:10, season_lim[2]+10:14)
      }
      # If tp is weekno then add anchors to the 5 consecutive weeks on each side of the range
      if(toupper(tp_col) == "WEEKNO"){
        anc_tp = c(season_lim[1]-1:5, season_lim[2]+1:5)
      }
      # Create an anchor dataset (for only pass sites) - only needs columns that are used in fitting GAM
      site_codes = unique(yr_data$SITE)
      anc_data = data.frame(SITE = rep(as.numeric(site_codes), each = 10),
                            ANC_TP = rep(anc_tp, length(site_codes)), DAILY_COUNT = 0)
      # Rename the ANC_TP column
      names(anc_data)[2] = tp_col
      rm(site_codes, anc_tp)
    }
    # Fit GAM
    # Output progress statement
    if(verbose) cat("Fitting GAM\n", sep="")
    # Determine no of sites in GAM dataset
    n_sites = length(unique(yr_data$SITE))
    # If maxit (maximum number of iterations for GAM convergence) not supplied then use default
    if(is.null(maxit)){
      maxit = gam.control()$maxit
    }
    # Setup model formula using the correct tp column and including a site term if there is more than 1 site
    mod_form = as.formula(paste0("DAILY_COUNT ~ s(",tp_col,",bs =\"cr\")", ifelse(n_sites > 1 & site_effect, "+factor(SITE)","")))
    # If there is only 1 site and verbose is true then print statement to say that site term has been removed
    if(verbose & n_sites == 1) cat("  Only one site so removing site term from GAM model\n", sep="")
    # Now try fitting the model (using the function named in gam_func)
    if(add_anchor){
      gam_obj = try(do.call(gam_func, list(formula = mod_form, data = rbind(yr_data[,c("SITE",tp_col,"DAILY_COUNT")], anc_data), family = poisson(link="log"), control = list(maxit = maxit))), silent = TRUE)
    } else {
      gam_obj = try(do.call(gam_func, list(formula = mod_form, data = yr_data[,c("SITE",tp_col,"DAILY_COUNT")], family = poisson(link="log"), control = list(maxit = maxit))), silent = TRUE)
    }
    # If it hasn't failed then check that it has converged
    if(!"try-error" %in% class(gam_obj)){
      if(gam_obj$iter == maxit | !gam_obj$converged){
        class(gam_obj) = "try-error"
        # Output progress statement
        if(verbose) cat("  GAM failed to converge\n", sep="")
      } else {
        # Output progress statement
        if(verbose) cat("  GAM fitting successful (", gam_obj$iter, " / ", maxit, " iterations)\n", sep="")
      }
    }
    if(!"try-error" %in% class(gam_obj)){
      # Output progress statement
      if(verbose) cat("Estimate NM values\n")
      # If the model includes a site term then values should be predicted for each site and
      # normalised individually for each site, after which the values for every site are identical.
      # To save time/resources we can instead predict for the 1st site only and normalise that,
      # giving the same values without predicting for the rest of the sites. This also means the
      # same method can be used regardless of whether the model includes a site term, as there is
      # only a single set of WEEKNO/DAYNO values and the SITE column (holding the 1st site code) is
      # only used if the model includes a site term.
      # Setup data to be used to predict
      pred_data = data.frame(SITE = yr_data$SITE[1], tp_col = season_lim[1]:season_lim[2])
      # Rename the tp_col column
      names(pred_data)[grepl("tp_col", names(pred_data))] = tp_col
      # Output progress statement
      if(verbose) cat("  Predicting values from fitted model\n")
      # Now predict values from the model
      pred_data$FITTED = predict(gam_obj, newdata = pred_data, type="response")
      # Output progress statement
      if(verbose) cat("  Standardising values to produce NM values\n")
      # Normalise the fitted values so that the NM values follow the fitted curve but sum to 1
      pred_data$NM = pred_data$FITTED/sum(pred_data$FITTED)
      # Save to nm_data, adding SPECIES and YEAR
      nm_data = data.frame(SPECIES = yr_data$SPECIES[1], YEAR = yr_data$YEAR[1], pred_data[,c(tp_col,"NM")])
      # Output progress statement
      if(verbose) cat("NM values calculated\n")
    } else {
      nm_data = NULL
      # Output progress
      if(verbose) cat("ERROR: GAM fitting failed (exiting and returning NULL)\n")
    }
  } else {
    nm_data = NULL
    # Output progress statement
    if(verbose) cat("WARNING: No data for this species/year combination (exiting and returning NULL)\n")
  }
  return(nm_data)
}
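# A quick sanity check of est_fp_spline on simulated data (illustration only, not part of the
# pipeline): three sites with Poisson counts peaking around week 26 of a single year.
set.seed(1)
toy_yr = data.frame(SPECIES = 1, YEAR = 2000, SITE = rep(1:3, each = 53), WEEKNO = rep(1:53, times = 3))
toy_yr$DAILY_COUNT = rpois(nrow(toy_yr), lambda = 20*dnorm(toy_yr$WEEKNO, mean = 26, sd = 3))
toy_nm = est_fp_spline(toy_yr, tp_col = "WEEKNO", season_lim = c(1,53))
sum(toy_nm$NM) # the NM values sum to 1 across the season
# plot(toy_nm$WEEKNO, toy_nm$NM, type = "l") # the flight curve peaks near week 26
rm(toy_yr, toy_nm)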
# Create function check_nm. This function looks at the flight period and judges whether it's any good.
# For moths with low counts, individual counts can create dubious flight periods. E.g., if a species
# is only recorded once in the North at an unusual time of year, there will be a big spike at that point
# that misrepresents where the flight period really was. In this case, the nearest 'good' year is used
# as an estimate. The parameters for a 'good' year can be altered based on how much of the
# flight period is allowed to occur in one week and how little overlap with other years it's allowed to have.
# The default is a maximum of 70% of the flight season in one week and a minimum overlap of 30% with
# the overall average year.
check_nm = function(sp_data, sp_nm, max_nm = 0.7, min_overlap = 0.3, tp_col = "WEEKNO", season_lim = c(1,26)){
  # Create small wrapper function used later to check nm across all year/bootstrap combinations
  nm_overlap = function(cur_nm, cnt_nm){
    prop_over = (2 - sum(abs(cur_nm - cnt_nm)))/2
    max_nm = max(cur_nm)
    ret_obj = c(prop_over = prop_over, max_nm = max_nm)
    return(ret_obj)
  }
  # Get indices of sp_data that are within the flight period
  inds = which(sp_data$WEEKNO >= season_lim[1] & sp_data$WEEKNO <= season_lim[2])
  # Estimate a normalised flight period directly from the counts by calculating weekly sums and then
  # dividing by the total number of unique sites in the data (not the number of sites for that week,
  # as this can cause issues with dodgy records outside of the normal flight period for a species:
  # e.g., if only 1 record ever comes from week 52 and it is a moderate count, it can end up being
  # the highest value if you take the mean for each week. Likewise, the total sum can be biased by
  # 1 single large count).
  wk_sum = tapply(sp_data$DAILY_COUNT[inds], sp_data[inds,tp_col], sum, na.rm = TRUE)
  tot_n_sites = length(unique(sp_data$SITE[inds]))
  wk_avg = wk_sum/tot_n_sites
  # Setup object to hold normalised count flight period that spans entire season (may be weeks/days without counts)
  cnt_nm = setNames(rep(0, length(season_lim[1]:season_lim[2])), season_lim[1]:season_lim[2])
  # Normalise weekly mean count
  cnt_nm[names(wk_avg)] = wk_avg / sum(wk_avg)
  # Figure out which grouping columns to work over when applying nm checks (e.g. year or year & iboot)
  grp_cols = names(sp_nm)[names(sp_nm) %in% c("YEAR","iboot")]
  # Now calculate prop overlap & max nm for each combination of grouping columns
  prop_over = aggregate(sp_nm[,"NM",drop=FALSE], sp_nm[,grp_cols,drop=FALSE], FUN = nm_overlap, cnt_nm = cnt_nm)
  # Find combinations where values don't meet criteria
  fail_inds = which(prop_over$NM[,"prop_over"] < min_overlap | prop_over$NM[,"max_nm"] > max_nm |
                      is.nan(prop_over$NM[,"prop_over"]) | is.nan(prop_over$NM[,"max_nm"]))
  # Find which rows in sp_nm correspond to these dodgy year/boot combinations
  rm_inds = which(apply(sp_nm[,grp_cols,drop=FALSE],1,paste,collapse="_") %in% apply(prop_over[fail_inds,grp_cols,drop=FALSE],1,paste,collapse="_"))
  # If any combinations fail then exclude them from sp_nm before returning the remainder
  if(length(rm_inds) > 0){
    sp_nm = sp_nm[-rm_inds,]
  }
  return(sp_nm)
}
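# Worked example of the overlap statistic used in nm_overlap above: for two normalised curves a and
# b, (2 - sum(abs(a - b)))/2 is the proportion of the two distributions that overlaps (toy values):
a = c(0.2, 0.5, 0.3); b = c(0.25, 0.5, 0.25)
(2 - sum(abs(a - b)))/2 # 0.95, i.e. 95% overlap; identical curves give 1, disjoint curves give 0
rm(a, b)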
# Specify a function called nm_fill_miss. This is complementary to the last function and fills in a
# bad flight period with the nearest good one.
nm_fill_miss = function(sp_data, si_data, nm_data){
  # Determine if the NM data is missing any years for which there is data (excluding site index only
  # data) and if so also return which is the closest year with data
  miss_yrs = miss_nm_yrs(sp_data, nm_data)
  # If missing years found then create NM values for these years by using the nearest year; if not
  # then do nothing and return nm_data unchanged
  if(!is.null(miss_yrs)){
    # Print warning to console
    cat("NM values missing for years present in actual data\n",
        paste(sprintf("  No NM values for %s so using values for %s", miss_yrs[,1], miss_yrs[,2]), collapse="\n"), sep="")
    # Get indices of values in NM which are to be used to replace missing years
    add_inds = lapply(miss_yrs$USE, function(x,y){ which(y == x) }, nm_data$YEAR)
    # Extract the NM data for years that are to replace missing years
    add_nm = nm_data[unlist(add_inds),]
    # Alter the year in the replacement to be the year(s) that were missing
    add_nm$YEAR = rep(miss_yrs$MISSING, sapply(add_inds, length))
    # Add the replacement data into the nm_data
    nm_data = rbind(nm_data, add_nm)
  }
  return(nm_data)
}

# miss_nm_yrs finds any years that are present in the count data but missing from the NM data,
# along with the nearest year that does have NM values.
miss_nm_yrs = function(sp_data, nm_data){
  data_yrs = sort(unique(sp_data$YEAR))
  nm_yrs = sort(unique(nm_data$YEAR))
  miss_yrs = data_yrs[which(!data_yrs %in% nm_yrs)]
  if(length(miss_yrs) > 0){
    near_yrs = find_nearest(miss_yrs, nm_yrs)
    out_obj = data.frame(MISSING = miss_yrs, USE = near_yrs)
  } else {
    out_obj = NULL
  }
  return(out_obj)
}

# find_nearest returns, for each value in x, the value in vec that is closest to it.
find_nearest = function(x, vec){
  # Check x and vec are numeric
  # Convert x to numeric if factor or character (assuming these can be safely converted to numbers)
  if(class(x) == "factor"){
    x = as.numeric(as.character(x))
  } else if(class(x) == "character"){
    x = as.numeric(x)
  }
  # Convert vec to numeric if factor or character (assuming these can be safely converted to numbers)
  if(class(vec) == "factor"){
    vec = as.numeric(as.character(vec))
  } else if(class(vec) == "character"){
    vec = as.numeric(vec)
  }
  ret_obj = rep(NA, length(x))
  for(i in 1:length(x)){
    ret_obj[i] = vec[which.min(abs(vec - x[i]))]
  }
  return(ret_obj)
}
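# A quick illustration of find_nearest (toy values, not part of the pipeline):
find_nearest(c(1970, 1999), c(1968, 1975, 2000)) # returns 1968 and 2000, the nearest years in vec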
#
#
#
#
# Read in raw data ####

# Find the raw data files. You will need to change the path.
f_list = list.files("Data/Raw_moth_species_data")
# This lists the raw data I have for 432 species. Each species has its own separate file.
# These files contain daily data with the following variables:
# [1] "SITE"           "SPECIES"        "YEAR"           "MONTH"          "DAY"            "WEEKNO"
# [7] "DAYNO"          "DAYS_FOR_COUNT" "COUNT"          "DAILY_COUNT"    "CATCH_DATE"

# Create species codes using the file names
spp_codes = as.numeric(gsub("Sp ([[:digit:]]{1,})_count[.]rds","\\1",f_list))
# This simply extracts the number from each file name. The codes are the moth numbers in the
# Rothamsted moth coding system.

# Read in the moth traits and extract only the names
Moth_traits <- read.csv("Data/Moth_traits.csv")
names <- Moth_traits %>%
  dplyr::rename(COMMON_NAME = Common_name, RIS_CODE = RIS_code) %>% # Change names to match code
  dplyr::select(COMMON_NAME, RIS_CODE)

# Read in site info
site_info <- read.csv("Data/Site_info.csv")
# Cut down and rename so it's only SITE and REGION
site_info <- site_info %>%
  dplyr::rename(REGION = N_S) %>% # Rename to match code
  dplyr::select(SITE, REGION)

# The start of the loop ####
for(i_sp in 1:length(spp_codes)){

  # Read in one of the data files. This reads in the data for a single species.
  # E.g., 382 = Hebrew Character
  sp_data = readRDS(paste0("Data/Raw_moth_species_data/Sp ",spp_codes[i_sp],"_count.rds"))

  # Merge site_info with sp_data. The sp_data now contains a REGION column
  sp_data = merge(sp_data, site_info[,c("SITE","REGION")], by = "SITE", all.x = TRUE, sort = FALSE)

  # Create a vector that has all years (1968 - 2016)
  yrs = sort(unique(sp_data$YEAR))

  # Determine which regions the species occurs in (it may only occur in one). E.g., if there are no
  # records in the north then, when the two data frames were merged above, the all.x argument will
  # have dropped any sites with no records for that moth, so 'North' will not appear in the REGION
  # column.
  sp_reg = sort(unique(sp_data$REGION))

  # Make a combination of all years and all regions the moth occurs in. For illustration
  # (hypothetical values), a species recorded in both regions over 1968-1970 would give:
  #   REGION YEAR
  #   North  1968
  #   South  1968
  #   North  1969
  #   ...
  # and one flight curve (a set of weekly NM values) is estimated per row in the loop below.
  combs = expand.grid(list(REGION = sp_reg, YEAR = yrs))

  # Setup empty object to store results
  temp_sp_nm = vector("list", nrow(combs))

  # Print progress to screen
  print(paste0("Starting ", names$COMMON_NAME[names$RIS_CODE == spp_codes[i_sp]], ", No. ", spp_codes[i_sp]))

  # Now loop through all years and estimate NM
  for(i_comb in 1:nrow(combs)){
    # Show progress to screen
    cat("Year =", combs$YEAR[i_comb], "-", as.character(combs$REGION[i_comb]), "\n")
    # If there is no data to fit a model to, the model comes back NULL and nothing is added to the
    # dataframe. This is where you can set min_pos to exclude sites with very low figures that may
    # be mistakes. You can also set season limits.
    if(sum(subset(sp_data, YEAR == combs$YEAR[i_comb] & REGION == combs$REGION[i_comb])[,"DAILY_COUNT"]) > 0){
      temp_sp_nm[[i_comb]] = est_fp_spline(yr_data = subset(sp_data, YEAR == combs$YEAR[i_comb] & REGION == combs$REGION[i_comb]), tp_col = "WEEKNO", season_lim = c(1,53))
      if(!is.null(temp_sp_nm[[i_comb]])){
        temp_sp_nm[[i_comb]][,"REGION"] = combs$REGION[i_comb]
      }
    }
  }

  # Combine list into a single data frame
  sp_nm = do.call("rbind", temp_sp_nm)
  rm(temp_sp_nm) # Remove the temporary list

  # This has now estimated the flight period as a curve for each region in each year for this
  # species. It has done it on a weekly basis and given each WEEKNO an NM value which specifies the
  # proportion of individuals caught in that week for that year in that region. This will now be
  # used to judge how much of the flight season was sampled at each site in each year.

  # Print progress to screen
  print(paste0("Busy doing ", names$COMMON_NAME[names$RIS_CODE == spp_codes[i_sp]], ", No. ", spp_codes[i_sp]))

  # Check NM values, remove any years failing the check_nm checks, and then fill missing years in
  # the flight curves with the nearest curve. The check_nm function arguments 'max_nm =' and
  # 'min_overlap =' are used for quality control. max_nm sets a limit on what proportion of the
  # flight curve is allowed to occur in a single week. min_overlap sets the minimum amount the
  # flight curve for each year should overlap with the flight curve for all years combined. The
  # defaults are max_nm = 0.7, min_overlap = 0.3. This filters out anomalous flight curves.
  # The function tells you if any curves have been replaced.

  # Check which regions are present in the data
  pre_reg = unique(sp_nm$REGION)

  # For northern sites first
  if("North" %in% pre_reg){
    test_north = check_nm(subset(sp_data, REGION == "North"), subset(sp_nm, REGION == "North"), season_lim = c(1,53))
    # The nrow check avoids an error for species with no decent flight periods in the north and
    # sets test_north to NULL instead
    if(nrow(test_north) > 0){
      test_north = nm_fill_miss(subset(sp_data, REGION == "North"), nm_data = test_north)
    } else {
      test_north = NULL
    }
  } else {
    test_north = NULL
  }

  # And for southern sites
  if("South" %in% pre_reg){
    test_south = check_nm(subset(sp_data, REGION == "South"), subset(sp_nm, REGION == "South"), season_lim = c(1,53))
    test_south = nm_fill_miss(subset(sp_data, REGION == "South"), nm_data = test_south)
  } else {
    test_south = NULL
  }

  # Bind the two resulting dfs together and remove the unbound ones.
  # The sp_nm is now overwritten with the improved curves.
  sp_nm = rbind(test_north, test_south)
  rm(test_north, test_south)
  # The flight curve estimation is now complete. Go back to the raw data to extract the weekly
  # maximum daily counts. The clean_data_daily function takes the sp_data dataframe and finds the
  # maximum DAILY_COUNT in each week. It then aggregates the table down to contain only the
  # maximums, then rounds each daily count up to an integer.

  # Weekly maximums with divisions by days sampled
  cleaned_daily = clean_data_daily(sp_data, add_cols = "REGION", season_lim = c(1,53))

  # Add on the NMs
  comb_data = merge(cleaned_daily, sp_nm, by = c("SPECIES","YEAR","WEEKNO","REGION"), all.x = TRUE)
  # In comb_data, every week with a positive count of ANY species is included as a row of data even
  # if it is a zero count for this particular species. For weeks in which no species of any kind
  # were recorded there is no row of data. As the all.x = TRUE argument was used, only the NMs with
  # corresponding weeks are preserved. So, when the NMs are summed up for a given year, they give
  # the total proportion of the flight period sampled.

  sp_yr_summary = aggregate(comb_data[,c("DAILY_COUNT","NM")], by = comb_data[,c("SPECIES","YEAR","REGION","SITE")], sum, na.rm = TRUE)
  # The object sp_yr_summary gives a total count for each site-year combination sampled.
  # It also gives the proportion of the flight season sampled, which is 'NM'.
  # The creation of sp_yr_summary sums up all weekly counts and all flight curve proportion
  # estimates (NM) for each SPECIES-YEAR-REGION-SITE combination trapped. Summing up all the
  # sampled NMs gives the proportion of the total flight curve sampled. This is then used to
  # estimate indices for incomplete years.

  # Add the raw totals for each year as a column. This will be used to produce the sindex values I
  # actually use. In the sp_data file, the sum of the COUNT (not DAILY_COUNT!) is the sum total of
  # all the moths recorded at that site in that year. I just need to sum them up and merge them on.
  Raw_totals <- sp_data %>%
    group_by(SPECIES, SITE, YEAR) %>%
    dplyr::summarise(Raw_total = sum(COUNT))

  # Now merge it on to sp_yr_summary
  sp_yr_summary <- merge(sp_yr_summary, Raw_totals, by = c("SPECIES", "SITE", "YEAR"), all.x = TRUE)

  # Now make a sindex value based on the raw yearly counts (My_raw_sindex).
  # Worked example (hypothetical numbers): if the weeks the trap actually ran cover NM = 0.8 of the
  # estimated flight curve and Raw_total = 40 moths were caught, then
  # My_raw_sindex = round(40 / 0.8) = 50, i.e. the estimated catch had the full season been sampled.
  sp_yr_summary$My_raw_sindex <- round(sp_yr_summary$Raw_total/sp_yr_summary$NM)

  # A quick recap of what the variables in sp_yr_summary mean:
  # SPECIES - The RIS code of the species
  # SITE - The RIS code for the site
  # YEAR - The calendar year
  # REGION - Whether this site is in the North or South
  # DAILY_COUNT - The summed weekly maximums of derived data (i.e., the daily total divided by how
  #   many days were in the sample) for this site-year
  # NM - The proportion of the site-year sampled (number of weeks with at least one positive count
  #   for any moth during the flight period of the species in question)
  # Raw_total - The total number of this species of moth caught in this site-year.
  # My_raw_sindex - The Raw_total divided by NM and rounded to the nearest integer. This will be
  #   used as the abundance index in the next stage of modelling.

  # Print progress to screen
  print(paste0("Just finished ", names$COMMON_NAME[names$RIS_CODE == spp_codes[i_sp]], ", No. ", spp_codes[i_sp]))
  print(paste0("Total progress: ", round(i_sp/length(spp_codes)*100), "%"))

  # Now save the final output as an RDS file (readable only in R)
  saveRDS(sp_yr_summary, file = file.path("Data/Sindex_files", paste0(spp_codes[i_sp], ".rds")))
}
#
#
#
#
# End of script