Cleaning data

A common step during data analysis is to clean up the raw data. We fix any obvious errors, edit column names, exclude rows we do not want, and save the cleaned-up data set. We then do the analysis on the cleaned data set.

Load libraries

library(readr) # read and write tabular data
library(dplyr) # manipulate data
library(pdftools) # process PDFs
library(stringr) # work with strings
library(here) # build file paths

iNaturalist

Cleaning up iNaturalist file

Save the file paths as objects.
observations_raw_path <- tempfile()
zip_path <- here('data/raw/observations-513664.csv.zip')
unzip(zipfile = zip_path, exdir = observations_raw_path)

Warning in unzip(zipfile = zip_path, exdir = observations_raw_path): error 1 in
extracting from zip file

observations_cleaned_path <- here('data/cleaned/cnc-los-angeles-observations.csv')
The CSV we downloaded from iNaturalist had several problems.
- some observations did not have a species
- some observations did not have location information
- some observations were not in Los Angeles
- there were some fields that we did not need
- information about observations with threatened, native, and introduced species had to be downloaded separately
In order to simplify things for the workshop attendees, we fixed the various issues and created a cleaned-up CSV.

These are the steps we took to produce the cleaned-up file. The raw file is at ‘/var/folders/zw/0kb57jnn5nv9k3pgxsz86n7h0000gq/T//Rtmpel1mxP/file5e7c583e3ff2’, and the cleaned file is at ‘/Users/wyk/Development/science/city_nature_challenge/NHMLA_workshop/after_inaturalist_lesson/data/cleaned/cnc-los-angeles-observations.csv’.
tmp <- tempfile()
zip_path <- here('data/raw/observations-513664.zip')
unzip(zipfile = zip_path, exdir = tmp)

inat_raw <- read_csv(paste(tmp, 'observations-513664.csv', sep='/'))
Use colSums(is.na()) to count the number of rows that have NA values for each column.
colSums(is.na(inat_raw))
id uuid
0 0
observed_on_string observed_on
0 0
time_observed_at time_zone
5862 0
user_id user_login
0 0
user_name created_at
72542 0
updated_at quality_grade
0 0
license url
48002 0
image_url sound_url
2530 192990
tag_list description
186884 169938
num_identification_agreements num_identification_disagreements
0 0
captive_cultivated oauth_application_id
0 71706
place_guess latitude
440 438
longitude positional_accuracy
438 40407
private_place_guess private_latitude
193738 193738
private_longitude public_positional_accuracy
193738 36853
geoprivacy taxon_geoprivacy
180034 144575
coordinates_obscured positioning_method
0 176278
positioning_device species_guess
174658 29909
scientific_name common_name
1671 12223
iconic_taxon_name taxon_id
1862 1670
taxon_kingdom_name taxon_phylum_name
1846 3906
taxon_subphylum_name taxon_superclass_name
15073 191497
taxon_class_name taxon_subclass_name
6165 142186
taxon_superorder_name taxon_order_name
183410 10661
taxon_suborder_name taxon_superfamily_name
148094 153153
taxon_family_name taxon_subfamily_name
16037 79392
taxon_supertribe_name taxon_tribe_name
190875 105525
taxon_subtribe_name taxon_genus_name
168941 28054
taxon_genushybrid_name taxon_species_name
193738 55501
taxon_hybrid_name taxon_subspecies_name
193139 190760
taxon_variety_name taxon_form_name
192195 193738
All rows have id, observed_on, and user_id. Some rows don’t have scientific_name, latitude, or longitude.

Use filter() to select the observations we want. !is.na() selects rows that are not NA, meaning rows that have a value. We select observations that have latitude, longitude, and scientific_name. We also exclude observations outside of Los Angeles. We save the cleaned-up data in a new object called inat.
inat <- inat_raw %>%
  filter(!is.na(latitude) &
         !is.na(longitude) &
         !is.na(scientific_name)) %>%
  filter(latitude < 40)
The original data frame ‘inat_raw’ had 193K rows; the cleaned data frame ‘inat’ has 191K rows.
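To see exactly how many rows were removed, we could compare the row counts directly. This is a minimal check, assuming inat_raw and inat are still in memory:

nrow(inat_raw)
nrow(inat)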
We can double-check our work. latitude, longitude, and scientific_name now have zero NA values.
colSums(is.na(inat))
id uuid
0 0
observed_on_string observed_on
0 0
time_observed_at time_zone
5819 0
user_id user_login
0 0
user_name created_at
71186 0
updated_at quality_grade
0 0
license url
47396 0
image_url sound_url
2243 190893
tag_list description
184803 168134
num_identification_agreements num_identification_disagreements
0 0
captive_cultivated oauth_application_id
0 71018
place_guess latitude
1 0
longitude positional_accuracy
0 40057
private_place_guess private_latitude
191638 191638
private_longitude public_positional_accuracy
191638 36540
geoprivacy taxon_geoprivacy
178390 142538
coordinates_obscured positioning_method
0 174275
positioning_device species_guess
172680 28434
scientific_name common_name
0 10525
iconic_taxon_name taxon_id
190 0
taxon_kingdom_name taxon_phylum_name
174 2221
taxon_subphylum_name taxon_superclass_name
13372 189405
taxon_class_name taxon_subclass_name
4465 140237
taxon_superorder_name taxon_order_name
181350 8944
taxon_suborder_name taxon_superfamily_name
146151 151144
taxon_family_name taxon_subfamily_name
14269 77518
taxon_supertribe_name taxon_tribe_name
188777 103590
taxon_subtribe_name taxon_genus_name
166888 26258
taxon_genushybrid_name taxon_species_name
191638 53654
taxon_hybrid_name taxon_subspecies_name
191043 188661
taxon_variety_name taxon_form_name
190096 191638
We want to delete unneeded columns to keep the CSV under 100 MB for GitHub.
names(inat)
[1] "id" "uuid"
[3] "observed_on_string" "observed_on"
[5] "time_observed_at" "time_zone"
[7] "user_id" "user_login"
[9] "user_name" "created_at"
[11] "updated_at" "quality_grade"
[13] "license" "url"
[15] "image_url" "sound_url"
[17] "tag_list" "description"
[19] "num_identification_agreements" "num_identification_disagreements"
[21] "captive_cultivated" "oauth_application_id"
[23] "place_guess" "latitude"
[25] "longitude" "positional_accuracy"
[27] "private_place_guess" "private_latitude"
[29] "private_longitude" "public_positional_accuracy"
[31] "geoprivacy" "taxon_geoprivacy"
[33] "coordinates_obscured" "positioning_method"
[35] "positioning_device" "species_guess"
[37] "scientific_name" "common_name"
[39] "iconic_taxon_name" "taxon_id"
[41] "taxon_kingdom_name" "taxon_phylum_name"
[43] "taxon_subphylum_name" "taxon_superclass_name"
[45] "taxon_class_name" "taxon_subclass_name"
[47] "taxon_superorder_name" "taxon_order_name"
[49] "taxon_suborder_name" "taxon_superfamily_name"
[51] "taxon_family_name" "taxon_subfamily_name"
[53] "taxon_supertribe_name" "taxon_tribe_name"
[55] "taxon_subtribe_name" "taxon_genus_name"
[57] "taxon_genushybrid_name" "taxon_species_name"
[59] "taxon_hybrid_name" "taxon_subspecies_name"
[61] "taxon_variety_name" "taxon_form_name"
Setting a column to NULL will remove the column.
inat$uuid <- NULL
inat$observed_on_string <- NULL
inat$time_zone <- NULL
inat$num_identification_agreements <- NULL
inat$num_identification_disagreements <- NULL
inat$oauth_application_id <- NULL
inat$place_guess <- NULL
inat$private_place_guess <- NULL
inat$private_longitude <- NULL
inat$private_latitude <- NULL
inat$positioning_method <- NULL
inat$positioning_device <- NULL
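As an aside, the same columns could be dropped in a single dplyr call instead of repeated NULL assignments. This is only an alternative sketch, not the approach used above:

inat <- inat %>%
  select(-uuid, -observed_on_string, -time_zone,
         -num_identification_agreements, -num_identification_disagreements,
         -oauth_application_id, -place_guess, -private_place_guess,
         -private_longitude, -private_latitude,
         -positioning_method, -positioning_device)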
Use filter() and dim() to see how often the taxon name columns were used.
inat %>% filter(!is.na(taxon_subphylum_name)) %>% dim
[1] 178266 50

inat %>% filter(!is.na(taxon_superclass_name)) %>% dim
[1] 2233 50

inat %>% filter(!is.na(taxon_subclass_name)) %>% dim
[1] 51401 50

inat %>% filter(!is.na(taxon_superorder_name)) %>% dim
[1] 10288 50

inat %>% filter(!is.na(taxon_suborder_name)) %>% dim
[1] 45487 50

inat %>% filter(!is.na(taxon_superfamily_name)) %>% dim
[1] 40494 50

inat %>% filter(!is.na(taxon_subfamily_name)) %>% dim
[1] 114120 50

inat %>% filter(!is.na(taxon_supertribe_name)) %>% dim
[1] 2861 50

inat %>% filter(!is.na(taxon_tribe_name)) %>% dim
[1] 88048 50

inat %>% filter(!is.na(taxon_subtribe_name)) %>% dim
[1] 24750 50

inat %>% filter(!is.na(taxon_genushybrid_name)) %>% dim
[1] 0 50

inat %>% filter(!is.na(taxon_hybrid_name)) %>% dim
[1] 595 50

inat %>% filter(!is.na(taxon_subspecies_name)) %>% dim
[1] 2977 50

inat %>% filter(!is.na(taxon_variety_name)) %>% dim
[1] 1542 50

inat %>% filter(!is.na(taxon_form_name)) %>% dim
[1] 0 50
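Instead of one filter() and dim() call per column, the number of non-NA values in every column could be computed in a single step. An alternative sketch:

colSums(!is.na(inat))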
inat$species_guess <- NULL
inat$taxon_subphylum_name <- NULL
inat$taxon_superclass_name <- NULL
inat$taxon_subclass_name <- NULL
inat$taxon_superorder_name <- NULL
inat$taxon_suborder_name <- NULL
inat$taxon_superfamily_name <- NULL
inat$taxon_subfamily_name <- NULL
inat$taxon_supertribe_name <- NULL
inat$taxon_tribe_name <- NULL
inat$taxon_subtribe_name <- NULL
inat$taxon_genushybrid_name <- NULL
inat$taxon_hybrid_name <- NULL
inat$taxon_variety_name <- NULL
inat$taxon_form_name <- NULL
names(inat)
[1] "id" "observed_on"
[3] "time_observed_at" "user_id"
[5] "user_login" "user_name"
[7] "created_at" "updated_at"
[9] "quality_grade" "license"
[11] "url" "image_url"
[13] "sound_url" "tag_list"
[15] "description" "captive_cultivated"
[17] "latitude" "longitude"
[19] "positional_accuracy" "public_positional_accuracy"
[21] "geoprivacy" "taxon_geoprivacy"
[23] "coordinates_obscured" "scientific_name"
[25] "common_name" "iconic_taxon_name"
[27] "taxon_id" "taxon_kingdom_name"
[29] "taxon_phylum_name" "taxon_class_name"
[31] "taxon_order_name" "taxon_family_name"
[33] "taxon_genus_name" "taxon_species_name"
[35] "taxon_subspecies_name"
We want to save the cleaned-up data set so we can use it later. We can save a data frame to a CSV using the write_csv() function from the readr package. The first argument is the name of the data frame, and the second is the path of the new file we want to create.
write_csv(inat, file= observations_cleaned_path, na = "" )
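Since the goal is to keep the CSV under GitHub's 100 MB limit, the size of the written file could be checked directly with base R. A minimal sketch:

file.size(observations_cleaned_path) / 1024^2 # file size in megabytes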
Adding more information
In order to get information about observations with threatened, introduced, and native species, we had to download a separate CSV for each option. We want to merge all these files into one cleaned observations file.
inat_data <- read_csv(observations_cleaned_path)
threatened_raw <- read_csv(here('data/raw/observations-514065_threatened.csv'))
introduced_raw <- read_csv(here('data/raw/observations-514069_introduced.csv'))
native_raw <- read_csv(here('data/raw/observations-514076_native.csv'))
Get threatened species.
threatened_data <- threatened_raw %>%
  mutate(threatened = TRUE)

threatened_data <- threatened_data[!duplicated(threatened_data), ]

threatened_data
# A tibble: 337 × 2
taxon_id threatened
<dbl> <lgl>
1 48662 TRUE
2 1454382 TRUE
3 59575 TRUE
4 210624 TRUE
5 221090 TRUE
6 80308 TRUE
7 144205 TRUE
8 78804 TRUE
9 61164 TRUE
10 77658 TRUE
# ℹ 327 more rows
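The duplicate removal with duplicated() above could also be written with dplyr's distinct(). A sketch of that alternative:

threatened_data <- threatened_raw %>%
  mutate(threatened = TRUE) %>%
  distinct()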
Get introduced species.
introduced_data <- introduced_raw %>%
  mutate(establishment_means = 'introduced')

introduced_data <- introduced_data[!duplicated(introduced_data), ]

introduced_data
# A tibble: 1,440 × 2
taxon_id establishment_means
<dbl> <chr>
1 480298 introduced
2 13858 introduced
3 187316 introduced
4 51271 introduced
5 46020 introduced
6 236935 introduced
7 552449 introduced
8 53168 introduced
9 162972 introduced
10 53025 introduced
# ℹ 1,430 more rows
Get native species.
native_data <- native_raw %>%
  mutate(establishment_means = 'native')

native_data <- native_data[!duplicated(native_data), ]

native_data
# A tibble: 3,058 × 2
taxon_id establishment_means
<dbl> <chr>
1 54247 native
2 36100 native
3 36204 native
4 6930 native
5 7470 native
6 43115 native
7 48662 native
8 7089 native
9 50822 native
10 29053 native
# ℹ 3,048 more rows
Combine the introduced and native data frames.
establishment_means <- rbind(introduced_data, native_data)

establishment_means
# A tibble: 4,498 × 2
taxon_id establishment_means
<dbl> <chr>
1 480298 introduced
2 13858 introduced
3 187316 introduced
4 51271 introduced
5 46020 introduced
6 236935 introduced
7 552449 introduced
8 53168 introduced
9 162972 introduced
10 53025 introduced
# ℹ 4,488 more rows
Remove rows where a taxon_id is listed as both native and introduced.
non_duplicates <- establishment_means %>%
  count(taxon_id) %>%
  arrange(desc(n)) %>%
  filter(n == 1)

keep_ids <- non_duplicates$taxon_id

length(keep_ids)

[1] 4436

dedup_establishment_means <- establishment_means %>%
  filter(taxon_id %in% keep_ids)
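The same filtering could be written without the intermediate keep_ids vector by keeping only taxon_id values that appear once. An alternative sketch:

dedup_establishment_means <- establishment_means %>%
  group_by(taxon_id) %>%
  filter(n() == 1) %>%
  ungroup()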
Add the threatened info to the main iNaturalist data frame.
combined_data <- left_join(inat_data, threatened_data)
Joining with `by = join_by(taxon_id, threatened)`
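By default left_join() joins on every column name the two data frames share, which is why the message above lists both taxon_id and threatened. If we only want to match on the taxon ID, the key could be passed explicitly. A hedged sketch, assuming taxon_id is the only intended key:

combined_data <- left_join(inat_data, threatened_data, by = 'taxon_id')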
Add the native and introduced info to the main iNaturalist data frame.
combined_data <- left_join(combined_data, dedup_establishment_means)
Joining with `by = join_by(taxon_id, establishment_means)`
Save the file.
write_csv(combined_data, observations_cleaned_path, na = "")
LASAN
Indicator species
Convert PDF to CSV
pdf <- pdf_text(here("data/raw/LASAN/cnt061142_indicator_species.pdf")) %>%
  readr::read_lines()
my_list <- list()

for (line in pdf[5:189]) {
  if (line != '') {
    # capture the group, scientific name, and common name columns from each table line
    matches <- str_match(line,
      "^([a-zA-Z]+) +([a-zA-Z]+ [a-zA-Z.()]+ ([a-zA-Z.()]+ [a-zA-Z.()]+)?) +([-'a-zA-Z ]+) +"
    )
    row <- c(matches[1,2], matches[1,3], matches[1,5], 'species') %>% str_squish()
    my_list[[length(my_list)+1]] <- row
  }
}

stats_df <- plyr::ldply(my_list)
colnames(stats_df) <- c('group', 'scientific name', 'common name', 'taxon rank')

stats_df <- stats_df %>% filter_all(any_vars(!is.na(.)))
stats_df <- stats_df %>% filter(group != 'Group')
stats_df[2,'common name'] <- paste(stats_df[2,'common name'], 'Salamander')
stats_df[18,'common name'] <- paste(stats_df[18,'common name'], 'Jerusalem crickets')
stats_df[22,'common name'] <- paste(stats_df[22,'common name'], 'hairstreak')
stats_df[23,'common name'] <- paste(stats_df[23,'common name'], 'butterfly')
stats_df[23,'taxon rank'] <- 'subspecies'
stats_df[25,'scientific name'] <- 'Mutillidae'
stats_df[25,'taxon rank'] <- 'family'
stats_df[26,'scientific name'] <- 'Pogonomyrmex'
stats_df[26,'taxon rank'] <- 'genus'
write_csv(stats_df, here('data/cleaned/LA_city_indicator_species.csv'))