word <- function (string, start = 1L, end = start, sep = fixed(" ")) {
  n <- max(length(string), length(start), length(end))
  string <- rep(string, length.out = n)
  start <- rep(start, length.out = n)
  end <- rep(end, length.out = n)
  require(stringr)
  breaks <- str_locate_all(string, sep)
  words <- lapply(breaks, invert_match)
  len <- vapply(words, nrow, integer(1))
  # Adjust
  neg_start <- !is.na(start) & start < 0L
  start[neg_start] <- start[neg_start] + len[neg_start] + 1L
  neg_end <- !is.na(end) & end < 0L
  end[neg_end] <- end[neg_end] + len[neg_end] + 1L
  start[start > len] <- NA
  start <- pmin(len, start)  
  starts <- mapply(function(word, loc) word[loc, "start"], words, start)
  end[end > len] <- NA
  end <- pmin(len, end)
  ends <- mapply(function(word, loc) word[loc, "end"], words, end)
  # return
  str_sub(string, starts, ends)
}

# letter <- make.abbr(toupper(validnames))
# names = toupper(validnames)
make.lettercode <- function(validnames) {
  result <- data.frame(original=validnames)
#  names <- toupper(names)
  validnames <- gsub('\"', '', validnames, perl=TRUE, useBytes=TRUE) # \"Gruenliches etwas
  validnames <- gsub('^x ', 'x', validnames) # genus hybrids
  validnames <- gsub(' * ', ' ', validnames, fixed=TRUE, useBytes=TRUE) # Seggregates
  validnames <- gsub(' s. str. ', ' ', validnames, fixed=TRUE, useBytes=TRUE)
  validnames <- gsub(' s. l. ', ' ', validnames, fixed=TRUE, useBytes=TRUE)
  len <- sapply(gregexpr("\\W+", validnames), length) + 1
  hybrids <- word(validnames, 2, sep=fixed(" ")) %in% c('x','X') & grepl(' ', validnames, fixed = T)
  validnames[hybrids] <- paste(word(validnames[hybrids], 1, sep=fixed(" ")), word(validnames[hybrids], 3, len[hybrids], sep=fixed(" ")), sep=' ')
  validnames <- make.names(validnames, unique = FALSE) # Make syntactically valid names out of character vectors, from here on use fixed(".")
  validnames <- gsub("\\.[\\.]+", ".", validnames)
  validnames <- gsub("\\.$", "", validnames) # no dot at the end

  result$validnames <- validnames
  parsed <- strsplit(validnames, ".", fixed = TRUE)
 # Abbreviate genus to 4 letters
  gen.arg <- sapply(parsed, "[[", 1)
  gen.unique <- unique(gen.arg)
  gen <- abbreviate(gen.unique, minlength = 4, strict = TRUE)
  for(i in 1:3) gen[nchar(gen) < 4] <- paste(gen[nchar(gen) < 4], '', sep = ' ') # fill short genus names with spaces
  set.seed(123)
  if(any(duplicated(gen)))
   for(d in which(duplicated(gen))) {
    t <- 1
    repeat{
      if(t > 50) gen[d] <- paste(substr(gen[d], start = 1, stop = 3), sample(1:9, size = 1), sep='') else 
        gen[d] <- paste(substr(gen[d], start = 1, stop = 3), sample(unlist(strsplit(gen.arg[d], NULL)), size = 1), sep='')
      if(!gen[d] %in% gen[-d]) break
      t = (t + 1)
      if(t > 100) break
    }
  }
 if(any(duplicated(gen))) {
   warning(paste('Genus', paste(names(gen[duplicated(gen)]), collapse=', '), 'can not be singularized to 4 letter abbreviations.'))
   
 }
 result$genus <- names(gen)[match(gen.arg, names(gen))]
 result$genuscode <- gen[match(gen.arg, names(gen))]
  # Special signs for special taxon ranks
  ranks <- toupper(word(validnames, 3, sep=fixed(".")))
  ranks[toupper(word(validnames, 2, sep=fixed("."))) == 'SER'] <- 'SER'
  ranks[toupper(word(validnames, 2, sep=fixed("."))) == 'SUBSECT'] <- 'SSE'
  ranks[toupper(word(validnames, 2, sep=fixed("."))) == 'SUBGEN'] <- 'SGE'
  ranks[toupper(word(validnames, 2, sep=fixed("."))) %in% c('SPECIES','SP','SPEC') | is.na(word(validnames, 2, sep=fixed(".")))] <- 'GAT'
  ranks[is.na(ranks)] <- 'SPE'
  ranks[ranks == 'SSP'] <- 'SUBSP'

result$rank <- ranks
#### Abbreviate epitheta
for(e in unique(names(gen))) {
 subset <- result[result$genus == e, ]
 lettercode <- subset$genuscode
 validnames <- subset$validnames
 rank <- subset$rank
 for(i in 1:nrow(subset))  
   lettercode[i] <- switch(rank[i],
       # Abbreviate Series with $ before 1. epitheta
       SER = paste(lettercode[i], '$', substr(word(validnames[i], 3, sep=fixed(".")),1,1), sep=''),
       # Abbreviate Section with $ before 1. epitheta
       SECT = paste(lettercode[i], '$', substr(word(validnames[i], 3, sep=fixed(".")),1,1), sep=''),
       SSE = paste(lettercode[i], '$', substr(word(validnames[i], 3, sep=fixed(".")),1,1), sep=''),
       SGE = paste(lettercode[i], '$', substr(word(validnames[i], 3, sep=fixed(".")),1,1), sep=''),
       # Abbreviate Aggregates with / before 1. epitheta
       AGG = paste(lettercode[i], '#', substr(word(validnames[i], 3, sep=fixed(".")),1,2), sep=''),
       # Subspecies with - between first characters of epi 2 and 3
       SUBSP = paste(lettercode[i], substr(word(validnames[i], 2, sep=fixed(".")),1,1), '-', substr(word(validnames[i], 4, sep=fixed(".")),1,1), sep=''),
       # Variant with ; between epi 2 and 3
       VAR = paste(lettercode[i], substr(word(validnames[i], 2, sep=fixed(".")),1,1), ';', substr(word(validnames[i], 4, sep=fixed(".")),1,1), sep=''),
       # Forma with , between epi 2 and 3
       FOR = paste(lettercode[i], substr(word(validnames[i], 2, sep=fixed(".")),1,1), ',', substr(word(validnames[i], 4, sep=fixed(".")),1,1), sep=''),
       # Hybrids with * between epi 2 and 3
       X = paste(lettercode[i], substr(word(validnames[i], 2, sep=fixed(".")),1,1), '*', substr(word(validnames[i], 4, sep=fixed(".")),1,1), sep=''),
       # Genus with -SP
       GAT = paste(lettercode[i], '-SP', sep=''),
       # Species
       SPE = paste(lettercode[i], substr(word(validnames[i], 2, sep=fixed(".")),1,3), sep=''),
       # Everything else
       paste(lettercode[i], substr(word(validnames[i], 2, sep=fixed(".")),1,3), sep='')
     )
  if(any(duplicated(lettercode)))
    for(d in which(duplicated(lettercode))) {
      t <- 1
      repeat{
        if(t > 50) lettercode[d] <- paste(substr(lettercode[d], start = 1, stop = 6), sample(0:9, size = 1), sep='') else 
          if(t > 30) lettercode[d] <- paste(substr(lettercode[d], start = 1, stop = 4), sample(unlist(strsplit(word(validnames[d], 2, sep=fixed(".")), NULL)), size = 1), substr(lettercode[d], start = 6, stop = 7), sep='')
          lettercode[d] <- paste(substr(lettercode[d], start = 1, stop = 6), sample(unlist(strsplit(word(validnames[d], 2, sep=fixed(".")), NULL)), size = 1), sep='')
        if(!lettercode[d] %in% lettercode[-d]) break
        t = (t + 1)
        if(t > 100) break
      }
    }
  if(any(duplicated(lettercode))) {
    warning(paste('Epithet abbreviation can not be singularized to 3 letter abbreviations.'))
    print(e)
    print(validnames[duplicated(lettercode)])
    print(lettercode[duplicated(lettercode)])
  }
 subset$lettercode <- lettercode
 result$lettercode[match(rownames(subset), rownames(result))] <- subset$lettercode
}
 out <- result$lettercode
 names(out) <- result$original
print(names(out[is.na(out)]))
# letter <- out
 return(out)
}
# result[result$genus == 'EUPHRASIA',]
### Tests
# validnames <- c("Aa maderoi", "Poa sp.", "Cladina rangiferina ssp. rangiferina","Cladonia cornuta", "Cladonia cornuta var. groenlandica","Cladonia rangiformis", "Bryoerythrophyllum", 'Cladophora species', 'Viola reichenbachiana x riviniana', 'Achillea nobilis subsp. neilreichii','Achillea nobilis subsp. nobilis')
# make.abbrev(validnames)
# 
# taxa <- read.csv2('TaxonNamesToAbbreviate.csv', as.is=TRUE)[,1]
# taxabbr <- make.abbrev(toupper(taxa))
# 
# taxabbr <- make.abbrev(taxa)


