golib/helper/pluralize.go

508 lines
13 KiB
Go

//
// pluralize.go
// Copyright (C) 2023 tiglog <me@tiglog.com>
//
// Distributed under terms of the MIT license.
//
package helper
import (
"fmt"
"regexp"
"strconv"
"strings"
)
// PluralizeRule -- one rewrite rule: a compiled pattern plus the replacement
// template applied to whatever the pattern matches.
//
// The field order must stay as is: rules are created with positional
// composite literals.
type PluralizeRule struct {
	// expression is the compiled pattern tested against a word.
	expression *regexp.Regexp
	// replacement is the template substituted for the match; `$N` stands for
	// capture group N (`$0` is the whole match).
	replacement string
}
// PluralizeClient -- holds every rule set used to pluralize and singularize
// words. Create one with NewPluralizeClient so the built-in rules are loaded.
type PluralizeClient struct {
	// Pattern rules; the most recently added rule is tried first.
	pluralRules, singularRules []PluralizeRule
	// Lower-cased words that are returned unchanged.
	uncountables map[string]bool
	// irregularSingles maps singular -> plural, irregularPlurals maps
	// plural -> singular; keys and values are lower-cased.
	irregularSingles, irregularPlurals map[string]string
	// interpolateExpr finds `$N` references inside replacement templates.
	interpolateExpr *regexp.Regexp
}
// NewPluralizeClient -- create a client with the built-in irregular, plural,
// singular and uncountable rules already loaded.
func NewPluralizeClient() *PluralizeClient {
	client := new(PluralizeClient)
	client.init()
	return client
}
// init -- reset every rule collection and load the built-in rule sets.
func (c *PluralizeClient) init() {
	// Matches `$N` (one or two digits) inside a replacement template.
	c.interpolateExpr = regexp.MustCompile(`\$(\d{1,2})`)

	c.pluralRules = []PluralizeRule{}
	c.singularRules = []PluralizeRule{}
	c.uncountables = map[string]bool{}
	c.irregularSingles = map[string]string{}
	c.irregularPlurals = map[string]string{}

	// The load order is significant: rules are tried last-added first, and
	// the uncountable patterns are appended to both rule lists, so they must
	// be loaded after the plural and singular tables to take precedence.
	c.loadIrregularRules()
	c.loadPluralizationRules()
	c.loadSingularizationRules()
	c.loadUncountableRules()
}
// Pluralize -- return the form of word that agrees with count: the singular
// when count is exactly 1, the plural for every other value.
//
// word: the word to inflect
// count: how many of the word exist
// inclusive: when true the result is prefixed with the count (e.g. "3 ducks")
func (c *PluralizeClient) Pluralize(word string, count int, inclusive bool) string {
	var inflected string
	if count == 1 {
		inflected = c.Singular(word)
	} else {
		inflected = c.Plural(word)
	}

	if !inclusive {
		return inflected
	}
	return fmt.Sprintf("%d %s", count, inflected)
}
// Plural -- return the plural form of word, keeping its letter case style.
func (c *PluralizeClient) Plural(word string) string {
	// Singular irregulars are replaced, plural irregulars are kept as they are.
	toPlural := c.replaceWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)
	return toPlural(word)
}
// IsPlural -- report whether word is already plural, i.e. it is a known
// irregular plural or the plural rules leave it unchanged.
func (c *PluralizeClient) IsPlural(word string) bool {
	alreadyPlural := c.checkWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)
	return alreadyPlural(word)
}
// Singular -- return the singular form of word, keeping its letter case style.
func (c *PluralizeClient) Singular(word string) string {
	// Plural irregulars are replaced, singular irregulars are kept as they are.
	toSingular := c.replaceWord(c.irregularPlurals, c.irregularSingles, c.singularRules)
	return toSingular(word)
}
// IsSingular -- report whether word is already singular, i.e. it is a known
// irregular singular or the singular rules leave it unchanged.
func (c *PluralizeClient) IsSingular(word string) bool {
	alreadySingular := c.checkWord(c.irregularPlurals, c.irregularSingles, c.singularRules)
	return alreadySingular(word)
}
// AddPluralRule -- register a pluralization rule. Rules added later are tried
// before rules added earlier.
//
// rule: a pattern (when it starts with `(`) or a plain word to match
// replacement: the replacement template; `$N` inserts capture group N
//
// The rule is compiled with regexp.MustCompile, so an invalid pattern panics.
func (c *PluralizeClient) AddPluralRule(rule string, replacement string) {
	entry := PluralizeRule{expression: sanitizeRule(rule), replacement: replacement}
	c.pluralRules = append(c.pluralRules, entry)
}
// AddSingularRule -- register a singularization rule. Rules added later are
// tried before rules added earlier.
//
// rule: a pattern (when it starts with `(`) or a plain word to match
// replacement: the replacement template; `$N` inserts capture group N
//
// The rule is compiled with regexp.MustCompile, so an invalid pattern panics.
func (c *PluralizeClient) AddSingularRule(rule string, replacement string) {
	entry := PluralizeRule{expression: sanitizeRule(rule), replacement: replacement}
	c.singularRules = append(c.singularRules, entry)
}
// AddUncountableRule -- mark a word, or every word matching a pattern, as
// having identical singular and plural forms.
func (c *PluralizeClient) AddUncountableRule(word string) {
	if isExpr(word) {
		// `$0` re-emits the whole match, so both directions leave the word
		// untouched.
		c.AddPluralRule(word, `$0`)
		c.AddSingularRule(word, `$0`)
		return
	}

	// Plain words are looked up directly, keyed by their lower-case form.
	c.uncountables[strings.ToLower(word)] = true
}
// AddIrregularRule -- register an irregular singular/plural pair. Both forms
// are stored lower-cased; a later pair overwrites an earlier one that shares
// a key.
func (c *PluralizeClient) AddIrregularRule(single string, plural string) {
	lowerSingle, lowerPlural := strings.ToLower(single), strings.ToLower(plural)

	c.irregularSingles[lowerSingle] = lowerPlural
	c.irregularPlurals[lowerPlural] = lowerSingle
}
// replaceWord -- build a function that converts a word from one form to the
// other.
//
// replaceMap: irregular words to convert (source form -> target form)
// keepMap: irregular words that are already in the target form
// rules: pattern rules applied when neither map knows the word
func (c *PluralizeClient) replaceWord(replaceMap map[string]string, keepMap map[string]string, rules []PluralizeRule) func(w string) string { //nolint:lll
	return func(word string) string {
		// The maps are keyed by lower-case words.
		lowered := strings.ToLower(word)

		// Already in the target form: hand it back with the caller's casing.
		if _, keep := keepMap[lowered]; keep {
			return restoreCase(word, lowered)
		}

		// Known irregular word: direct replacement.
		if target, found := replaceMap[lowered]; found {
			return restoreCase(word, target)
		}

		// Otherwise fall back to the pattern rules.
		return c.sanitizeWord(lowered, word, rules)
	}
}
// checkWord -- build a predicate that reports whether a word is already in
// the target form.
//
// replaceMap: irregular words in the other form (these yield false)
// keepMap: irregular words in the target form (these yield true)
// rules: pattern rules; a word they leave unchanged counts as a match
func (c *PluralizeClient) checkWord(replaceMap map[string]string, keepMap map[string]string, rules []PluralizeRule) func(w string) bool {
	return func(word string) bool {
		lowered := strings.ToLower(word)

		if _, inTargetForm := keepMap[lowered]; inTargetForm {
			return true
		}
		if _, inOtherForm := replaceMap[lowered]; inOtherForm {
			return false
		}

		// Compare in lower case so case restoration cannot affect the result.
		converted := c.sanitizeWord(lowered, lowered, rules)
		return converted == lowered
	}
}
// interpolate -- expand the `$N` references in a replacement template.
//
// str: the replacement template, e.g. `$1ies`
// args: the match groups; args[0] is the whole match, args[N] is group N
//
// A reference to a group that args does not contain expands to the empty
// string instead of panicking with an index out of range.
func (c *PluralizeClient) interpolate(str string, args []string) string {
	return c.interpolateExpr.ReplaceAllStringFunc(str, func(ref string) string {
		// ref is "$" followed by one or two digits, so the conversion only
		// fails if interpolateExpr is ever changed to match something else.
		group, err := strconv.Atoi(ref[1:])
		if err != nil || group >= len(args) {
			return ""
		}
		return args[group]
	})
}
// replace -- apply a single rule to word and return the rewritten word.
//
// Only the first match of the rule is rewritten. The replacement template is
// interpolated with the capture groups of that match and then re-cased to
// follow the matched text. When the match is empty (e.g. an optional suffix
// that is absent) the character just before the match position decides the
// casing instead. A word the rule does not match is returned unchanged.
func (c *PluralizeClient) replace(word string, rule PluralizeRule) string {
	loc := rule.expression.FindStringSubmatchIndex(word)
	if loc == nil {
		return word
	}
	start, end := loc[0], loc[1]

	// Collect the capture groups; a group that took no part in the match is
	// reported as -1 and stays an empty string.
	args := make([]string, len(loc)/2)
	for g := range args {
		if lo, hi := loc[2*g], loc[2*g+1]; lo >= 0 {
			args[g] = word[lo:hi]
		}
	}
	result := c.interpolate(rule.replacement, args)

	caseRef := word[start:end]
	if caseRef == `` && start > 0 {
		// Empty match: borrow the casing of the preceding byte. The guard
		// keeps an empty match at offset 0 from slicing before the string.
		caseRef = word[start-1 : start]
	}
	return word[:start] + restoreCase(caseRef, result) + word[end:]
}
// sanitizeWord -- run the pattern rules against word and return the result.
//
// token: the lower-cased word, used for the uncountable lookup
// word: the word as supplied, used for matching and as the return value
// rules: the rule list to apply
func (c *PluralizeClient) sanitizeWord(token string, word string, rules []PluralizeRule) string {
	// Nothing to do for an empty word.
	if token == "" {
		return word
	}

	// Uncountable words never change.
	if _, uncountable := c.uncountables[token]; uncountable {
		return word
	}

	// Walk the rules from the last added to the first so that specific rules
	// win over general ones; only the first matching rule is applied.
	for remaining := len(rules); remaining > 0; remaining-- {
		candidate := rules[remaining-1]
		if candidate.expression.MatchString(word) {
			return c.replace(word, candidate)
		}
	}
	return word
}
// sanitizeRule -- compile a rule string into a regular expression.
//
// A rule that looks like an expression (see isExpr) is compiled as written.
// Any other rule is wrapped so that it must match the whole word, ignoring
// case; the rule text itself is not escaped. An invalid pattern panics.
func sanitizeRule(rule string) *regexp.Regexp {
	pattern := rule
	if !isExpr(rule) {
		pattern = `(?i)^` + rule + `$`
	}
	return regexp.MustCompile(pattern)
}
// restoreCase -- return token re-cased to follow the casing style of word.
//
// word: the text whose casing style is copied
// token: the text to re-case
//
// The styles recognised, in order, are: identical text, all lower case, all
// upper case and a leading upper-case character; anything else yields a
// lower-cased token. The leading character is handled as a whole UTF-8
// character, and an empty token is returned as an empty string.
func restoreCase(word string, token string) string {
	// Tokens are an exact match.
	if word == token {
		return token
	}

	// Lower cased words. E.g. "hello".
	if word == strings.ToLower(word) {
		return strings.ToLower(token)
	}

	// Upper cased words. E.g. "WHISKY".
	if word == strings.ToUpper(word) {
		return strings.ToUpper(token)
	}

	// Title cased words. E.g. "Title".
	if wordHead, _ := splitFirstRune(word); wordHead == strings.ToUpper(wordHead) {
		tokenHead, tokenTail := splitFirstRune(token)
		return strings.ToUpper(tokenHead) + strings.ToLower(tokenTail)
	}

	// Mixed casing with a lower-case start. E.g. "tEST".
	return strings.ToLower(token)
}

// splitFirstRune -- split s right after its first UTF-8 encoded character.
// Both parts are empty for an empty string.
func splitFirstRune(s string) (head string, tail string) {
	// Ranging over a string yields the byte offset of each character, so the
	// first non-zero offset is where the second character starts.
	for offset := range s {
		if offset > 0 {
			return s[:offset], s[offset:]
		}
	}
	return s, ""
}
// isExpr -- report whether s represents an expression, which is decided
// solely by its first character being `(`. An empty string is not an
// expression.
func isExpr(s string) bool {
	return strings.HasPrefix(s, `(`)
}
// loadIrregularRules -- register the built-in irregular word pairs. The table
// order is kept because a later pair overwrites an earlier one that shares a
// singular or plural form.
func (c *PluralizeClient) loadIrregularRules() { //nolint:funlen
	// Each entry is {singular, plural}.
	pairs := [][2]string{
		// Pronouns.
		{`I`, `we`},
		{`me`, `us`},
		{`he`, `they`},
		{`she`, `they`},
		{`them`, `them`},
		{`myself`, `ourselves`},
		{`yourself`, `yourselves`},
		{`itself`, `themselves`},
		{`herself`, `themselves`},
		{`himself`, `themselves`},
		{`themself`, `themselves`},
		{`is`, `are`},
		{`was`, `were`},
		{`has`, `have`},
		{`this`, `these`},
		{`that`, `those`},
		{`my`, `our`},
		{`its`, `their`},
		{`his`, `their`},
		{`her`, `their`},
		// Words ending with a consonant and `o`.
		{`echo`, `echoes`},
		{`dingo`, `dingoes`},
		{`volcano`, `volcanoes`},
		{`tornado`, `tornadoes`},
		{`torpedo`, `torpedoes`},
		// Ends with `us`.
		{`genus`, `genera`},
		{`viscus`, `viscera`},
		// Ends with `ma`.
		{`stigma`, `stigmata`},
		{`stoma`, `stomata`},
		{`dogma`, `dogmata`},
		{`lemma`, `lemmata`},
		{`schema`, `schemata`},
		{`anathema`, `anathemata`},
		// Other irregular rules.
		{`ox`, `oxen`},
		{`axe`, `axes`},
		{`die`, `dice`},
		{`yes`, `yeses`},
		{`foot`, `feet`},
		{`eave`, `eaves`},
		{`goose`, `geese`},
		{`tooth`, `teeth`},
		{`quiz`, `quizzes`},
		{`human`, `humans`},
		{`proof`, `proofs`},
		{`carve`, `carves`},
		{`valve`, `valves`},
		{`looey`, `looies`},
		{`thief`, `thieves`},
		{`groove`, `grooves`},
		{`pickaxe`, `pickaxes`},
		{`passerby`, `passersby`},
		{`canvas`, `canvases`},
		{`sms`, `sms`},
	}

	for i := range pairs {
		c.AddIrregularRule(pairs[i][0], pairs[i][1])
	}
}
// loadPluralizationRules -- register the built-in pluralization patterns.
// Rules are tried last-added first, so the table runs from the most general
// rule to the most specific one.
func (c *PluralizeClient) loadPluralizationRules() {
	// Each entry is {pattern, replacement template}.
	table := [][2]string{
		{`(?i)s?$`, `s`},
		{`(?i)[^[:ascii:]]$`, `$0`},
		{`(?i)([^aeiou]ese)$`, `$1`},
		{`(?i)(ax|test)is$`, `$1es`},
		{`(?i)(alias|[^aou]us|t[lm]as|gas|ris)$`, `$1es`},
		{`(?i)(e[mn]u)s?$`, `$1s`},
		{`(?i)([^l]ias|[aeiou]las|[ejzr]as|[iu]am)$`, `$1`},
		{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1i`}, //nolint:lll,misspell
		{`(?i)(alumn|alg|vertebr)(?:a|ae)$`, `$1ae`},
		{`(?i)(seraph|cherub)(?:im)?$`, `$1im`},
		{`(?i)(her|at|gr)o$`, `$1oes`},
		{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|automat|quor)(?:a|um)$`, `$1a`}, //nolint:lll,misspell
		{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)(?:a|on)$`, `$1a`},
		{`(?i)sis$`, `ses`},
		{`(?i)(?:(kni|wi|li)fe|(ar|l|ea|eo|oa|hoo)f)$`, `$1$2ves`},
		{`(?i)([^aeiouy]|qu)y$`, `$1ies`},
		{`(?i)([^ch][ieo][ln])ey$`, `$1ies`},
		{`(?i)(x|ch|ss|sh|zz)$`, `$1es`},
		{`(?i)(matr|cod|mur|sil|vert|ind|append)(?:ix|ex)$`, `$1ices`},
		{`(?i)\b((?:tit)?m|l)(?:ice|ouse)$`, `$1ice`},
		{`(?i)(pe)(?:rson|ople)$`, `$1ople`},
		{`(?i)(child)(?:ren)?$`, `$1ren`},
		{`(?i)eaux$`, `$0`},
		{`(?i)m[ae]n$`, `men`},
		{`thou`, `you`},
	}

	for i := range table {
		c.AddPluralRule(table[i][0], table[i][1])
	}
}
// loadSingularizationRules -- register the built-in singularization patterns.
// Rules are tried last-added first, so the table runs from the most general
// rule to the most specific one.
func (c *PluralizeClient) loadSingularizationRules() {
	// Each entry is {pattern, replacement template}.
	table := [][2]string{
		{`(?i)s$`, ``},
		{`(?i)(ss)$`, `$1`},
		{`(?i)(wi|kni|(?:after|half|high|low|mid|non|night|[^\w]|^)li)ves$`, `$1fe`},
		{`(?i)(ar|(?:wo|[ae])l|[eo][ao])ves$`, `$1f`},
		{`(?i)ies$`, `y`},
		{`(?i)(dg|ss|ois|lk|ok|wn|mb|th|ch|ec|oal|is|ck|ix|sser|ts|wb)ies$`, `$1ie`},
		{`(?i)\b(l|(?:neck|cross|hog|aun)?t|coll|faer|food|gen|goon|group|hipp|junk|vegg|(?:pork)?p|charl|calor|cut)ies$`, `$1ie`}, //nolint:lll
		{`(?i)\b(mon|smil)ies$`, `$1ey`},
		{`(?i)\b((?:tit)?m|l)ice$`, `$1ouse`},
		{`(?i)(seraph|cherub)im$`, `$1`},
		{`(?i)(x|ch|ss|sh|zz|tto|go|cho|alias|[^aou]us|t[lm]as|gas|(?:her|at|gr)o|[aeiou]ris)(?:es)?$`, `$1`},
		{`(?i)(analy|diagno|parenthe|progno|synop|the|empha|cri|ne)(?:sis|ses)$`, `$1sis`},
		{`(?i)(movie|twelve|abuse|e[mn]u)s$`, `$1`},
		{`(?i)(test)(?:is|es)$`, `$1is`},
		{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1us`}, //nolint:lll,misspell
		{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|quor)a$`, `$1um`}, //nolint:lll,misspell
		{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)a$`, `$1on`},
		{`(?i)(alumn|alg|vertebr)ae$`, `$1a`},
		{`(?i)(cod|mur|sil|vert|ind)ices$`, `$1ex`},
		{`(?i)(matr|append)ices$`, `$1ix`},
		{`(?i)(pe)(rson|ople)$`, `$1rson`},
		{`(?i)(child)ren$`, `$1`},
		{`(?i)(eau)x?$`, `$1`},
		{`(?i)men$`, `man`},
	}

	for i := range table {
		c.AddSingularRule(table[i][0], table[i][1])
	}
}
// loadUncountableRules -- register the built-in uncountable words and
// patterns. Plain words are registered first, then the patterns, in table
// order.
func (c *PluralizeClient) loadUncountableRules() { //nolint:funlen
	// Singular words with no plurals.
	words := []string{
		`adulthood`,
		`advice`,
		`agenda`,
		`aid`,
		`aircraft`,
		`alcohol`,
		`ammo`,
		`analytics`,
		`anime`,
		`athletics`,
		`audio`,
		`bison`,
		`blood`,
		`bream`,
		`buffalo`,
		`butter`,
		`carp`,
		`cash`,
		`chassis`,
		`chess`,
		`clothing`,
		`cod`,
		`commerce`,
		`cooperation`,
		`corps`,
		`debris`,
		`diabetes`,
		`digestion`,
		`elk`,
		`energy`,
		`equipment`,
		`excretion`,
		`expertise`,
		`firmware`,
		`flounder`,
		`fun`,
		`gallows`,
		`garbage`,
		`graffiti`,
		`hardware`,
		`headquarters`,
		`health`,
		`herpes`,
		`highjinks`,
		`homework`,
		`housework`,
		`information`,
		`jeans`,
		`justice`,
		`kudos`,
		`labour`,
		`literature`,
		`machinery`,
		`mackerel`,
		`mail`,
		`media`,
		`mews`,
		`moose`,
		`music`,
		`mud`,
		`manga`,
		`news`,
		`only`,
		`personnel`,
		`pike`,
		`plankton`,
		`pliers`,
		`police`,
		`pollution`,
		`premises`,
		`rain`,
		`research`,
		`rice`,
		`salmon`,
		`scissors`,
		`series`,
		`sewage`,
		`shambles`,
		`shrimp`,
		`software`,
		`staff`,
		`swine`,
		`tennis`,
		`traffic`,
		`transportation`,
		`trout`,
		`tuna`,
		`wealth`,
		`welfare`,
		`whiting`,
		`wildebeest`,
		`wildlife`,
		`you`,
	}

	// Regexes.
	patterns := []string{
		`(?i)pok[eé]mon$`,
		`(?i)[^aeiou]ese$`, // "chinese", "japanese"
		`(?i)deer$`,        // "deer", "reindeer"
		`(?i)(fish)$`,      // "fish", "blowfish", "angelfish"
		`(?i)measles$`,
		`(?i)o[iu]s$`, // "carnivorous"
		`(?i)pox$`,    // "chickenpox", "smallpox"
		`(?i)sheep$`,
	}

	for i := range words {
		c.AddUncountableRule(words[i])
	}
	for i := range patterns {
		c.AddUncountableRule(patterns[i])
	}
}