//
// pluralize.go
// Copyright (C) 2023 tiglog
//
// Distributed under terms of the MIT license.
//

// PluralizeRule -- a compiled match expression paired with its replacement
// template. The template may reference capture groups of the expression as
// `$0` (the whole match) through `$99`.
type PluralizeRule struct {
	expression  *regexp.Regexp
	replacement string
}

// PluralizeClient -- converts English words between singular and plural form.
//
// Rules are consulted from the most recently added to the oldest, so rules
// added through the Add* methods take precedence over the built-in ones.
// The Add* methods mutate the client without any locking.
type PluralizeClient struct {
	pluralRules      []PluralizeRule
	singularRules    []PluralizeRule
	uncountables     map[string]bool
	irregularSingles map[string]string // lower-cased singular -> plural
	irregularPlurals map[string]string // lower-cased plural -> singular
	interpolateExpr  *regexp.Regexp    // matches `$N` references in replacement templates
}

// NewPluralizeClient -- create a client preloaded with the built-in irregular,
// pluralization, singularization and uncountable rules.
func NewPluralizeClient() *PluralizeClient {
	client := &PluralizeClient{}
	client.init()

	return client
}

// init allocates the rule collections and loads the built-in rule sets.
func (c *PluralizeClient) init() {
	c.pluralRules = make([]PluralizeRule, 0)
	c.singularRules = make([]PluralizeRule, 0)
	c.uncountables = make(map[string]bool)
	c.irregularSingles = make(map[string]string)
	c.irregularPlurals = make(map[string]string)

	// Load order matters: rules are matched last-added-first, so the
	// uncountable expressions loaded at the end win over everything else.
	c.loadIrregularRules()
	c.loadPluralizationRules()
	c.loadSingularizationRules()
	c.loadUncountableRules()
	c.interpolateExpr = regexp.MustCompile(`\$(\d{1,2})`)
}

// Pluralize -- Pluralize or singularize a word based on the passed in count.
//
// word: the word to pluralize
// count: how many of the word exist; exactly 1 yields the singular form,
// every other value (including 0 and negatives) yields the plural form
// inclusive: whether to prefix with the number (e.g. 3 ducks)
func (c *PluralizeClient) Pluralize(word string, count int, inclusive bool) string {
	inflect := c.Plural
	if count == 1 {
		inflect = c.Singular
	}

	result := inflect(word)
	if inclusive {
		return fmt.Sprintf("%d %s", count, result)
	}

	return result
}

// Plural -- Pluralize a word.
func (c *PluralizeClient) Plural(word string) string {
	return c.replaceWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)(word)
}

// IsPlural -- Check if a word is plural.
func (c *PluralizeClient) IsPlural(word string) bool {
	return c.checkWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)(word)
}

// Singular -- Singularize a word.
func (c *PluralizeClient) Singular(word string) string {
	return c.replaceWord(c.irregularPlurals, c.irregularSingles, c.singularRules)(word)
}

// IsSingular -- Check if a word is singular.
func (c *PluralizeClient) IsSingular(word string) bool {
	return c.checkWord(c.irregularPlurals, c.irregularSingles, c.singularRules)(word)
}

// AddPluralRule -- Add a pluralization rule to the collection.
//
// A rule starting with `(` is compiled as a regular expression as-is; any
// other rule is matched case-insensitively against the whole word.
// Panics if the resulting pattern is not a valid regular expression.
func (c *PluralizeClient) AddPluralRule(rule string, replacement string) {
	c.pluralRules = append(c.pluralRules, PluralizeRule{
		expression:  sanitizeRule(rule),
		replacement: replacement,
	})
}

// AddSingularRule -- Add a singularization rule to the collection.
//
// A rule starting with `(` is compiled as a regular expression as-is; any
// other rule is matched case-insensitively against the whole word.
// Panics if the resulting pattern is not a valid regular expression.
func (c *PluralizeClient) AddSingularRule(rule string, replacement string) {
	c.singularRules = append(c.singularRules, PluralizeRule{
		expression:  sanitizeRule(rule),
		replacement: replacement,
	})
}

// AddUncountableRule -- Add an uncountable word rule.
//
// A plain word is stored lower-cased in the uncountable set. A word starting
// with `(` is treated as a regular expression and registered as both a plural
// and a singular rule that leaves the match untouched.
func (c *PluralizeClient) AddUncountableRule(word string) {
	if !isExpr(word) {
		c.uncountables[strings.ToLower(word)] = true
		return
	}

	// `$0` re-emits the whole match, so matching words are returned unchanged.
	c.AddPluralRule(word, `$0`)
	c.AddSingularRule(word, `$0`)
}

// AddIrregularRule -- Add an irregular word definition.
//
// Both forms are stored lower-cased. A later definition that reuses a singular
// or plural form overwrites the earlier mapping for that form.
func (c *PluralizeClient) AddIrregularRule(single string, plural string) {
	p := strings.ToLower(plural)
	s := strings.ToLower(single)

	c.irregularSingles[s] = p
	c.irregularPlurals[p] = s
}

// replaceWord builds a converter for one direction (to plural or to singular).
//
// replaceMap maps words of the source form to the target form, keepMap holds
// words that are already in the target form, and rules are the expression
// rules applied when neither map knows the word.
func (c *PluralizeClient) replaceWord(replaceMap map[string]string, keepMap map[string]string, rules []PluralizeRule) func(w string) string { //nolint:lll
	return func(word string) string {
		// The maps are keyed by lower-cased words.
		token := strings.ToLower(word)

		// Already in the target form: only normalise the case.
		if _, ok := keepMap[token]; ok {
			return restoreCase(word, token)
		}

		// Check against the replacement map for a direct word replacement.
		if replacement, ok := replaceMap[token]; ok {
			return restoreCase(word, replacement)
		}

		// Run all the rules against the word.
		return c.sanitizeWord(token, word, rules)
	}
}

// checkWord builds a predicate reporting whether a word is already in the
// target form of the given rule set. Arguments have the same meaning as in
// replaceWord.
func (c *PluralizeClient) checkWord(replaceMap map[string]string, keepMap map[string]string, rules []PluralizeRule) func(w string) bool { //nolint:lll
	return func(word string) bool {
		token := strings.ToLower(word)

		if _, ok := keepMap[token]; ok {
			return true
		}

		if _, ok := replaceMap[token]; ok {
			return false
		}

		// The word is in the target form when converting it changes nothing.
		return c.sanitizeWord(token, token, rules) == token
	}
}

// interpolate expands every `$N` reference (one or two digits) in str with
// args[N]. A reference to a group that args does not contain expands to the
// empty string.
func (c *PluralizeClient) interpolate(str string, args []string) string {
	return c.interpolateExpr.ReplaceAllStringFunc(str, func(ref string) string {
		// ref is `$` followed by one or two digits, so Atoi only fails if the
		// expression is changed; treat that like a missing group.
		n, err := strconv.Atoi(ref[1:])
		if err != nil || n >= len(args) {
			return ``
		}

		return args[n]
	})
}

// replace applies rule to word and returns the rewritten word.
//
// The replacement text is derived from the first match of the rule: the
// template is interpolated with that match's capture groups and its case is
// adapted to the matched text. When the match is empty (e.g. `s?$` on a word
// without a trailing "s") the case is taken from the character preceding the
// match instead. Every match of the expression is replaced by that same text.
func (c *PluralizeClient) replace(word string, rule PluralizeRule) string {
	loc := rule.expression.FindStringSubmatchIndex(word)
	if loc == nil {
		return word
	}

	// Collect the whole match and the capture groups; groups that did not
	// participate in the match are reported as -1 and become empty strings.
	args := make([]string, 0, len(loc)/2)
	for i := 0; i+1 < len(loc); i += 2 {
		group := ``
		if loc[i] >= 0 {
			group = word[loc[i]:loc[i+1]]
		}
		args = append(args, group)
	}

	start, match := loc[0], args[0]
	replacement := c.interpolate(rule.replacement, args)

	switch {
	case match != ``:
		replacement = restoreCase(match, replacement)
	case start > 0:
		replacement = restoreCase(lastChar(word[:start]), replacement)
	}
	// An empty match at the very start has no character to borrow the case
	// from, so the interpolated replacement is used verbatim.

	return rule.expression.ReplaceAllLiteralString(word, replacement)
}

// sanitizeWord converts word with the first matching rule.
//
// token is the lower-cased word used for the uncountable lookup. The word is
// returned unchanged when it is empty, uncountable, or matched by no rule.
func (c *PluralizeClient) sanitizeWord(token string, word string, rules []PluralizeRule) string {
	// If empty string
	if len(token) == 0 {
		return word
	}
	// If does not need fixup
	if _, ok := c.uncountables[token]; ok {
		return word
	}

	// Iterate over the sanitization rules and use the first one to match.
	// NOTE: iterate rules array in reverse order specific => general rules
	for i := len(rules) - 1; i >= 0; i-- {
		if rules[i].expression.MatchString(word) {
			return c.replace(word, rules[i])
		}
	}

	return word
}

// sanitizeRule compiles rule into a regular expression. Expressions (see
// isExpr) are compiled verbatim; anything else becomes a case-insensitive
// whole-word match. The plain form is not escaped, so regexp metacharacters
// in it keep their meaning. Panics if the pattern does not compile.
func sanitizeRule(rule string) *regexp.Regexp {
	if isExpr(rule) {
		return regexp.MustCompile(rule)
	}

	return regexp.MustCompile(`(?i)^` + rule + `$`)
}

// restoreCase returns token with the letter case pattern of word applied:
// all lower case, all upper case, or title case (first character upper case).
// Any other mixed-case word yields a lower-cased token.
func restoreCase(word string, token string) string {
	switch {
	// Tokens are an exact match.
	case word == token:
		return token
	// Lower cased words. E.g. "hello".
	case word == strings.ToLower(word):
		return strings.ToLower(token)
	// Upper cased words. E.g. "WHISKY".
	case word == strings.ToUpper(word):
		return strings.ToUpper(token)
	}

	// Title cased words. E.g. "Title". Work on whole characters rather than
	// single bytes so that a multi-byte first letter ("Éclair") is recognised
	// and an empty token cannot cause an out-of-range slice.
	if head := firstChar(word); head == strings.ToUpper(head) {
		tokenHead := firstChar(token)
		return strings.ToUpper(tokenHead) + strings.ToLower(token[len(tokenHead):])
	}

	// Lower cased words. E.g. "test".
	return strings.ToLower(token)
}

// firstChar returns the first UTF-8 encoded character of s, or "" if s is empty.
func firstChar(s string) string {
	for i := range s {
		if i > 0 {
			return s[:i]
		}
	}

	return s
}

// lastChar returns the last UTF-8 encoded character of s, or "" if s is empty.
func lastChar(s string) string {
	start := 0
	for i := range s {
		start = i
	}

	return s[start:]
}

// isExpr -- helper to detect if string represents an expression by checking first character to be `(`.
// An empty string is not an expression.
func isExpr(s string) bool {
	return strings.HasPrefix(s, `(`)
}

// loadIrregularRules registers the built-in irregular singular/plural pairs.
func (c *PluralizeClient) loadIrregularRules() { //nolint:funlen
	var irregularRules = []struct {
		single string
		plural string
	}{
		// Pronouns.
		{`I`, `we`},
		{`me`, `us`},
		{`he`, `they`},
		{`she`, `they`},
		{`them`, `them`},
		{`myself`, `ourselves`},
		{`yourself`, `yourselves`},
		{`itself`, `themselves`},
		{`herself`, `themselves`},
		{`himself`, `themselves`},
		{`themself`, `themselves`},
		{`is`, `are`},
		{`was`, `were`},
		{`has`, `have`},
		{`this`, `these`},
		{`that`, `those`},
		{`my`, `our`},
		{`its`, `their`},
		{`his`, `their`},
		{`her`, `their`},
		// Words ending with a consonant and `o`.
		{`echo`, `echoes`},
		{`dingo`, `dingoes`},
		{`volcano`, `volcanoes`},
		{`tornado`, `tornadoes`},
		{`torpedo`, `torpedoes`},
		// Ends with `us`.
		{`genus`, `genera`},
		{`viscus`, `viscera`},
		// Ends with `ma`.
		{`stigma`, `stigmata`},
		{`stoma`, `stomata`},
		{`dogma`, `dogmata`},
		{`lemma`, `lemmata`},
		{`schema`, `schemata`},
		{`anathema`, `anathemata`},
		// Other irregular rules.
		{`ox`, `oxen`},
		{`axe`, `axes`},
		{`die`, `dice`},
		{`yes`, `yeses`},
		{`foot`, `feet`},
		{`eave`, `eaves`},
		{`goose`, `geese`},
		{`tooth`, `teeth`},
		{`quiz`, `quizzes`},
		{`human`, `humans`},
		{`proof`, `proofs`},
		{`carve`, `carves`},
		{`valve`, `valves`},
		{`looey`, `looies`},
		{`thief`, `thieves`},
		{`groove`, `grooves`},
		{`pickaxe`, `pickaxes`},
		{`passerby`, `passersby`},
		{`canvas`, `canvases`},
		{`sms`, `sms`},
	}

	for _, r := range irregularRules {
		c.AddIrregularRule(r.single, r.plural)
	}
}

// loadPluralizationRules registers the built-in pluralization rules, ordered
// from the most general to the most specific (they are matched in reverse).
func (c *PluralizeClient) loadPluralizationRules() {
	var pluralizationRules = []struct {
		rule        string
		replacement string
	}{
		{`(?i)s?$`, `s`},
		{`(?i)[^[:ascii:]]$`, `$0`},
		{`(?i)([^aeiou]ese)$`, `$1`},
		{`(?i)(ax|test)is$`, `$1es`},
		{`(?i)(alias|[^aou]us|t[lm]as|gas|ris)$`, `$1es`},
		{`(?i)(e[mn]u)s?$`, `$1s`},
		{`(?i)([^l]ias|[aeiou]las|[ejzr]as|[iu]am)$`, `$1`},
		{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1i`}, //nolint:lll,misspell
		{`(?i)(alumn|alg|vertebr)(?:a|ae)$`, `$1ae`},
		{`(?i)(seraph|cherub)(?:im)?$`, `$1im`},
		{`(?i)(her|at|gr)o$`, `$1oes`},
		{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|automat|quor)(?:a|um)$`, `$1a`}, //nolint:lll,misspell
		{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)(?:a|on)$`, `$1a`},
		{`(?i)sis$`, `ses`},
		{`(?i)(?:(kni|wi|li)fe|(ar|l|ea|eo|oa|hoo)f)$`, `$1$2ves`},
		{`(?i)([^aeiouy]|qu)y$`, `$1ies`},
		{`(?i)([^ch][ieo][ln])ey$`, `$1ies`},
		{`(?i)(x|ch|ss|sh|zz)$`, `$1es`},
		{`(?i)(matr|cod|mur|sil|vert|ind|append)(?:ix|ex)$`, `$1ices`},
		{`(?i)\b((?:tit)?m|l)(?:ice|ouse)$`, `$1ice`},
		{`(?i)(pe)(?:rson|ople)$`, `$1ople`},
		{`(?i)(child)(?:ren)?$`, `$1ren`},
		{`(?i)eaux$`, `$0`},
		{`(?i)m[ae]n$`, `men`},
		{`thou`, `you`},
	}

	for _, r := range pluralizationRules {
		c.AddPluralRule(r.rule, r.replacement)
	}
}

// loadSingularizationRules registers the built-in singularization rules,
// ordered from the most general to the most specific (they are matched in
// reverse).
func (c *PluralizeClient) loadSingularizationRules() {
	var singularizationRules = []struct {
		rule        string
		replacement string
	}{
		{`(?i)s$`, ``},
		{`(?i)(ss)$`, `$1`},
		{`(?i)(wi|kni|(?:after|half|high|low|mid|non|night|[^\w]|^)li)ves$`, `$1fe`},
		{`(?i)(ar|(?:wo|[ae])l|[eo][ao])ves$`, `$1f`},
		{`(?i)ies$`, `y`},
		{`(?i)(dg|ss|ois|lk|ok|wn|mb|th|ch|ec|oal|is|ck|ix|sser|ts|wb)ies$`, `$1ie`},
		{`(?i)\b(l|(?:neck|cross|hog|aun)?t|coll|faer|food|gen|goon|group|hipp|junk|vegg|(?:pork)?p|charl|calor|cut)ies$`, `$1ie`}, //nolint:lll
		{`(?i)\b(mon|smil)ies$`, `$1ey`},
		{`(?i)\b((?:tit)?m|l)ice$`, `$1ouse`},
		{`(?i)(seraph|cherub)im$`, `$1`},
		{`(?i)(x|ch|ss|sh|zz|tto|go|cho|alias|[^aou]us|t[lm]as|gas|(?:her|at|gr)o|[aeiou]ris)(?:es)?$`, `$1`},
		{`(?i)(analy|diagno|parenthe|progno|synop|the|empha|cri|ne)(?:sis|ses)$`, `$1sis`},
		{`(?i)(movie|twelve|abuse|e[mn]u)s$`, `$1`},
		{`(?i)(test)(?:is|es)$`, `$1is`},
		{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1us`}, //nolint:lll,misspell
		{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|quor)a$`, `$1um`}, //nolint:lll,misspell
		{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)a$`, `$1on`},
		{`(?i)(alumn|alg|vertebr)ae$`, `$1a`},
		{`(?i)(cod|mur|sil|vert|ind)ices$`, `$1ex`},
		{`(?i)(matr|append)ices$`, `$1ix`},
		{`(?i)(pe)(rson|ople)$`, `$1rson`},
		{`(?i)(child)ren$`, `$1`},
		{`(?i)(eau)x?$`, `$1`},
		{`(?i)men$`, `man`},
	}

	for _, r := range singularizationRules {
		c.AddSingularRule(r.rule, r.replacement)
	}
}

// loadUncountableRules registers the built-in uncountable words and
// expressions, i.e. words whose singular and plural forms are identical.
func (c *PluralizeClient) loadUncountableRules() { //nolint:funlen
	var uncountableRules = []string{
		// Singular words with no plurals.
		`adulthood`,
		`advice`,
		`agenda`,
		`aid`,
		`aircraft`,
		`alcohol`,
		`ammo`,
		`analytics`,
		`anime`,
		`athletics`,
		`audio`,
		`bison`,
		`blood`,
		`bream`,
		`buffalo`,
		`butter`,
		`carp`,
		`cash`,
		`chassis`,
		`chess`,
		`clothing`,
		`cod`,
		`commerce`,
		`cooperation`,
		`corps`,
		`debris`,
		`diabetes`,
		`digestion`,
		`elk`,
		`energy`,
		`equipment`,
		`excretion`,
		`expertise`,
		`firmware`,
		`flounder`,
		`fun`,
		`gallows`,
		`garbage`,
		`graffiti`,
		`hardware`,
		`headquarters`,
		`health`,
		`herpes`,
		`highjinks`,
		`homework`,
		`housework`,
		`information`,
		`jeans`,
		`justice`,
		`kudos`,
		`labour`,
		`literature`,
		`machinery`,
		`mackerel`,
		`mail`,
		`media`,
		`mews`,
		`moose`,
		`music`,
		`mud`,
		`manga`,
		`news`,
		`only`,
		`personnel`,
		`pike`,
		`plankton`,
		`pliers`,
		`police`,
		`pollution`,
		`premises`,
		`rain`,
		`research`,
		`rice`,
		`salmon`,
		`scissors`,
		`series`,
		`sewage`,
		`shambles`,
		`shrimp`,
		`software`,
		`staff`,
		`swine`,
		`tennis`,
		`traffic`,
		`transportation`,
		`trout`,
		`tuna`,
		`wealth`,
		`welfare`,
		`whiting`,
		`wildebeest`,
		`wildlife`,
		`you`,
		// Regexes.
		`(?i)pok[eé]mon$`,  // "pokemon", "pokémon"
		`(?i)[^aeiou]ese$`, // "chinese", "japanese"
		`(?i)deer$`,        // "deer", "reindeer"
		`(?i)(fish)$`,      // "fish", "blowfish", "angelfish"
		`(?i)measles$`,     // "measles"
		`(?i)o[iu]s$`,      // "carnivorous"
		`(?i)pox$`,         // "chickenpox", "smallpox"
		`(?i)sheep$`,       // "sheep"
	}

	for _, w := range uncountableRules {
		c.AddUncountableRule(w)
	}
}