508 lines
13 KiB
Go
508 lines
13 KiB
Go
|
//
|
||
|
// pluralize.go
|
||
|
// Copyright (C) 2023 tiglog <me@tiglog.com>
|
||
|
//
|
||
|
// Distributed under terms of the MIT license.
|
||
|
//
|
||
|
|
||
|
package helper
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"regexp"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
)
|
||
|
|
||
|
// PluralizeRule -- A single rewrite rule: a compiled pattern plus the
// replacement template applied to whatever the pattern matches.
type PluralizeRule struct {
	// expression is the compiled pattern tested against a word.
	expression *regexp.Regexp
	// replacement is the template substituted for the match; `$N`
	// placeholders refer to capture groups of expression.
	replacement string
}
|
||
|
|
||
|
// PluralizeClient -- pluralize client.
|
||
|
type PluralizeClient struct {
|
||
|
pluralRules []PluralizeRule
|
||
|
singularRules []PluralizeRule
|
||
|
uncountables map[string]bool
|
||
|
irregularSingles map[string]string
|
||
|
irregularPlurals map[string]string
|
||
|
interpolateExpr *regexp.Regexp
|
||
|
}
|
||
|
|
||
|
func NewPluralizeClient() *PluralizeClient {
|
||
|
client := PluralizeClient{}
|
||
|
client.init()
|
||
|
|
||
|
return &client
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) init() {
|
||
|
c.pluralRules = make([]PluralizeRule, 0)
|
||
|
c.singularRules = make([]PluralizeRule, 0)
|
||
|
c.uncountables = make(map[string]bool)
|
||
|
c.irregularSingles = make(map[string]string)
|
||
|
c.irregularPlurals = make(map[string]string)
|
||
|
|
||
|
c.loadIrregularRules()
|
||
|
c.loadPluralizationRules()
|
||
|
c.loadSingularizationRules()
|
||
|
c.loadUncountableRules()
|
||
|
c.interpolateExpr = regexp.MustCompile(`\$(\d{1,2})`)
|
||
|
}
|
||
|
|
||
|
// Pluralize -- Pluralize or singularize a word based on the passed in count.
|
||
|
//
|
||
|
// word: the word to pluralize
|
||
|
// count: how many of the word exist
|
||
|
// inclusive: whether to prefix with the number (e.g. 3 ducks)
|
||
|
func (c *PluralizeClient) Pluralize(word string, count int, inclusive bool) string {
|
||
|
pluralized := func() func(string) string {
|
||
|
if count == 1 {
|
||
|
return c.Singular
|
||
|
}
|
||
|
|
||
|
return c.Plural
|
||
|
}
|
||
|
|
||
|
if inclusive {
|
||
|
return fmt.Sprintf("%d %s", count, pluralized()(word))
|
||
|
}
|
||
|
|
||
|
return pluralized()(word)
|
||
|
}
|
||
|
|
||
|
// Plural -- Pluralize a word.
|
||
|
func (c *PluralizeClient) Plural(word string) string {
|
||
|
return c.replaceWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)(word)
|
||
|
}
|
||
|
|
||
|
// IsPlural -- Check if a word is plural.
|
||
|
func (c *PluralizeClient) IsPlural(word string) bool {
|
||
|
return c.checkWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)(word)
|
||
|
}
|
||
|
|
||
|
// Singular -- Singularize a word.
|
||
|
func (c *PluralizeClient) Singular(word string) string {
|
||
|
return c.replaceWord(c.irregularPlurals, c.irregularSingles, c.singularRules)(word)
|
||
|
}
|
||
|
|
||
|
// IsSingular -- Check if a word is singular.
|
||
|
func (c *PluralizeClient) IsSingular(word string) bool {
|
||
|
return c.checkWord(c.irregularPlurals, c.irregularSingles, c.singularRules)(word)
|
||
|
}
|
||
|
|
||
|
// AddPluralRule -- Add a pluralization rule to the collection.
|
||
|
func (c *PluralizeClient) AddPluralRule(rule string, replacement string) {
|
||
|
c.pluralRules = append(c.pluralRules, PluralizeRule{sanitizeRule(rule), replacement})
|
||
|
}
|
||
|
|
||
|
// AddSingularRule -- Add a singularization rule to the collection.
|
||
|
func (c *PluralizeClient) AddSingularRule(rule string, replacement string) {
|
||
|
c.singularRules = append(c.singularRules, PluralizeRule{sanitizeRule(rule), replacement})
|
||
|
}
|
||
|
|
||
|
// AddUncountableRule -- Add an uncountable word rule.
|
||
|
func (c *PluralizeClient) AddUncountableRule(word string) {
|
||
|
if !isExpr(word) {
|
||
|
c.uncountables[strings.ToLower(word)] = true
|
||
|
return
|
||
|
}
|
||
|
|
||
|
c.AddPluralRule(word, `$0`)
|
||
|
c.AddSingularRule(word, `$0`)
|
||
|
}
|
||
|
|
||
|
// AddIrregularRule -- Add an irregular word definition.
|
||
|
func (c *PluralizeClient) AddIrregularRule(single string, plural string) {
|
||
|
p := strings.ToLower(plural)
|
||
|
s := strings.ToLower(single)
|
||
|
|
||
|
c.irregularSingles[s] = p
|
||
|
c.irregularPlurals[p] = s
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) replaceWord(replaceMap map[string]string, keepMap map[string]string, rules []PluralizeRule) func(w string) string { //nolint:lll
|
||
|
f := func(word string) string {
|
||
|
// Get the correct token and case restoration functions.
|
||
|
var token = strings.ToLower(word)
|
||
|
|
||
|
// Check against the keep object map.
|
||
|
if _, ok := keepMap[token]; ok {
|
||
|
return restoreCase(word, token)
|
||
|
}
|
||
|
|
||
|
// Check against the replacement map for a direct word replacement.
|
||
|
if replaceToken, ok := replaceMap[token]; ok {
|
||
|
return restoreCase(word, replaceToken)
|
||
|
}
|
||
|
|
||
|
// Run all the rules against the word.
|
||
|
return c.sanitizeWord(token, word, rules)
|
||
|
}
|
||
|
|
||
|
return f
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) checkWord(replaceMap map[string]string, keepMap map[string]string, rules []PluralizeRule) func(w string) bool {
|
||
|
f := func(word string) bool {
|
||
|
var token = strings.ToLower(word)
|
||
|
|
||
|
if _, ok := keepMap[token]; ok {
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
if _, ok := replaceMap[token]; ok {
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
return c.sanitizeWord(token, token, rules) == token
|
||
|
}
|
||
|
|
||
|
return f
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) interpolate(str string, args []string) string {
|
||
|
lookup := map[string]string{}
|
||
|
|
||
|
for _, submatch := range c.interpolateExpr.FindAllStringSubmatch(str, -1) {
|
||
|
element, _ := strconv.Atoi(submatch[1])
|
||
|
lookup[submatch[0]] = args[element]
|
||
|
}
|
||
|
|
||
|
result := c.interpolateExpr.ReplaceAllStringFunc(str, func(repl string) string {
|
||
|
return lookup[repl]
|
||
|
})
|
||
|
|
||
|
return result
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) replace(word string, rule PluralizeRule) string {
|
||
|
return rule.expression.ReplaceAllStringFunc(word, func(w string) string {
|
||
|
match := rule.expression.FindString(word)
|
||
|
index := rule.expression.FindStringIndex(word)[0]
|
||
|
args := rule.expression.FindAllStringSubmatch(word, -1)[0]
|
||
|
|
||
|
result := c.interpolate(rule.replacement, args)
|
||
|
|
||
|
if match == `` {
|
||
|
return restoreCase(word[index-1:index], result)
|
||
|
}
|
||
|
return restoreCase(match, result)
|
||
|
})
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) sanitizeWord(token string, word string, rules []PluralizeRule) string {
|
||
|
// If empty string
|
||
|
if len(token) == 0 {
|
||
|
return word
|
||
|
}
|
||
|
// If does not need fixup
|
||
|
if _, ok := c.uncountables[token]; ok {
|
||
|
return word
|
||
|
}
|
||
|
|
||
|
// Iterate over the sanitization rules and use the first one to match.
|
||
|
// NOTE: iterate rules array in reverse order specific => general rules
|
||
|
for i := len(rules) - 1; i >= 0; i-- {
|
||
|
if rules[i].expression.MatchString(word) {
|
||
|
return c.replace(word, rules[i])
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return word
|
||
|
}
|
||
|
|
||
|
func sanitizeRule(rule string) *regexp.Regexp {
|
||
|
if isExpr(rule) {
|
||
|
return regexp.MustCompile(rule)
|
||
|
}
|
||
|
|
||
|
return regexp.MustCompile(`(?i)^` + rule + `$`)
|
||
|
}
|
||
|
|
||
|
// restoreCase -- Re-case token so that it follows the letter case of word.
//
// word: the text whose casing is the template
// token: the text to re-case
//
// Returns token unchanged when it equals word; lower-cased when word is all
// lower case; upper-cased when word is all upper case; with its first
// character upper-cased and the rest lower-cased when word starts with a
// character that is its own upper-case form; and lower-cased otherwise.
// First characters are handled as whole runes, and an empty token is
// returned as is.
func restoreCase(word string, token string) string {
	// Tokens are an exact match.
	if word == token {
		return token
	}

	// Lower cased words. E.g. "hello". The empty word also lands here.
	if word == strings.ToLower(word) {
		return strings.ToLower(token)
	}

	// Upper cased words. E.g. "WHISKY".
	if word == strings.ToUpper(word) {
		return strings.ToUpper(token)
	}

	// Title cased words. E.g. "Title". word cannot be empty at this point
	// because the empty string equals its own lower-case form. Slice by rune
	// so that a multi-byte first character is compared as a whole.
	first := string([]rune(word)[:1])
	if first == strings.ToUpper(first) {
		runes := []rune(token)
		if len(runes) == 0 {
			return token
		}

		return strings.ToUpper(string(runes[:1])) + strings.ToLower(string(runes[1:]))
	}

	// Mixed case starting with a lower-case character. E.g. "tEST".
	return strings.ToLower(token)
}
|
||
|
|
||
|
// isExpr -- Report whether s should be treated as a regular expression rather
// than a literal word. A string counts as an expression when its first
// character is `(`; the empty string is not an expression.
func isExpr(s string) bool {
	// HasPrefix is safe on the empty string, whereas slicing s[:1] panics.
	return strings.HasPrefix(s, `(`)
}
|
||
|
|
||
|
func (c *PluralizeClient) loadIrregularRules() { //nolint:funlen
|
||
|
var irregularRules = []struct {
|
||
|
single string
|
||
|
plural string
|
||
|
}{
|
||
|
// Pronouns.
|
||
|
{`I`, `we`},
|
||
|
{`me`, `us`},
|
||
|
{`he`, `they`},
|
||
|
{`she`, `they`},
|
||
|
{`them`, `them`},
|
||
|
{`myself`, `ourselves`},
|
||
|
{`yourself`, `yourselves`},
|
||
|
{`itself`, `themselves`},
|
||
|
{`herself`, `themselves`},
|
||
|
{`himself`, `themselves`},
|
||
|
{`themself`, `themselves`},
|
||
|
{`is`, `are`},
|
||
|
{`was`, `were`},
|
||
|
{`has`, `have`},
|
||
|
{`this`, `these`},
|
||
|
{`that`, `those`},
|
||
|
{`my`, `our`},
|
||
|
{`its`, `their`},
|
||
|
{`his`, `their`},
|
||
|
{`her`, `their`},
|
||
|
// Words ending in with a consonant and `o`.
|
||
|
{`echo`, `echoes`},
|
||
|
{`dingo`, `dingoes`},
|
||
|
{`volcano`, `volcanoes`},
|
||
|
{`tornado`, `tornadoes`},
|
||
|
{`torpedo`, `torpedoes`},
|
||
|
// Ends with `us`.
|
||
|
{`genus`, `genera`},
|
||
|
{`viscus`, `viscera`},
|
||
|
// Ends with `ma`.
|
||
|
{`stigma`, `stigmata`},
|
||
|
{`stoma`, `stomata`},
|
||
|
{`dogma`, `dogmata`},
|
||
|
{`lemma`, `lemmata`},
|
||
|
{`schema`, `schemata`},
|
||
|
{`anathema`, `anathemata`},
|
||
|
// Other irregular rules.
|
||
|
{`ox`, `oxen`},
|
||
|
{`axe`, `axes`},
|
||
|
{`die`, `dice`},
|
||
|
{`yes`, `yeses`},
|
||
|
{`foot`, `feet`},
|
||
|
{`eave`, `eaves`},
|
||
|
{`goose`, `geese`},
|
||
|
{`tooth`, `teeth`},
|
||
|
{`quiz`, `quizzes`},
|
||
|
{`human`, `humans`},
|
||
|
{`proof`, `proofs`},
|
||
|
{`carve`, `carves`},
|
||
|
{`valve`, `valves`},
|
||
|
{`looey`, `looies`},
|
||
|
{`thief`, `thieves`},
|
||
|
{`groove`, `grooves`},
|
||
|
{`pickaxe`, `pickaxes`},
|
||
|
{`passerby`, `passersby`},
|
||
|
{`canvas`, `canvases`},
|
||
|
{`sms`, `sms`},
|
||
|
}
|
||
|
|
||
|
for _, r := range irregularRules {
|
||
|
c.AddIrregularRule(r.single, r.plural)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) loadPluralizationRules() {
|
||
|
var pluralizationRules = []struct {
|
||
|
rule string
|
||
|
replacement string
|
||
|
}{
|
||
|
{`(?i)s?$`, `s`},
|
||
|
{`(?i)[^[:ascii:]]$`, `$0`},
|
||
|
{`(?i)([^aeiou]ese)$`, `$1`},
|
||
|
{`(?i)(ax|test)is$`, `$1es`},
|
||
|
{`(?i)(alias|[^aou]us|t[lm]as|gas|ris)$`, `$1es`},
|
||
|
{`(?i)(e[mn]u)s?$`, `$1s`},
|
||
|
{`(?i)([^l]ias|[aeiou]las|[ejzr]as|[iu]am)$`, `$1`},
|
||
|
{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1i`}, //nolint:lll,misspell
|
||
|
{`(?i)(alumn|alg|vertebr)(?:a|ae)$`, `$1ae`},
|
||
|
{`(?i)(seraph|cherub)(?:im)?$`, `$1im`},
|
||
|
{`(?i)(her|at|gr)o$`, `$1oes`},
|
||
|
{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|automat|quor)(?:a|um)$`, `$1a`}, //nolint:lll,misspell
|
||
|
{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)(?:a|on)$`, `$1a`},
|
||
|
{`(?i)sis$`, `ses`},
|
||
|
{`(?i)(?:(kni|wi|li)fe|(ar|l|ea|eo|oa|hoo)f)$`, `$1$2ves`},
|
||
|
{`(?i)([^aeiouy]|qu)y$`, `$1ies`},
|
||
|
{`(?i)([^ch][ieo][ln])ey$`, `$1ies`},
|
||
|
{`(?i)(x|ch|ss|sh|zz)$`, `$1es`},
|
||
|
{`(?i)(matr|cod|mur|sil|vert|ind|append)(?:ix|ex)$`, `$1ices`},
|
||
|
{`(?i)\b((?:tit)?m|l)(?:ice|ouse)$`, `$1ice`},
|
||
|
{`(?i)(pe)(?:rson|ople)$`, `$1ople`},
|
||
|
{`(?i)(child)(?:ren)?$`, `$1ren`},
|
||
|
{`(?i)eaux$`, `$0`},
|
||
|
{`(?i)m[ae]n$`, `men`},
|
||
|
{`thou`, `you`},
|
||
|
}
|
||
|
|
||
|
for _, r := range pluralizationRules {
|
||
|
c.AddPluralRule(r.rule, r.replacement)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) loadSingularizationRules() {
|
||
|
var singularizationRules = []struct {
|
||
|
rule string
|
||
|
replacement string
|
||
|
}{
|
||
|
{`(?i)s$`, ``},
|
||
|
{`(?i)(ss)$`, `$1`},
|
||
|
{`(?i)(wi|kni|(?:after|half|high|low|mid|non|night|[^\w]|^)li)ves$`, `$1fe`},
|
||
|
{`(?i)(ar|(?:wo|[ae])l|[eo][ao])ves$`, `$1f`},
|
||
|
{`(?i)ies$`, `y`},
|
||
|
{`(?i)(dg|ss|ois|lk|ok|wn|mb|th|ch|ec|oal|is|ck|ix|sser|ts|wb)ies$`, `$1ie`},
|
||
|
{`(?i)\b(l|(?:neck|cross|hog|aun)?t|coll|faer|food|gen|goon|group|hipp|junk|vegg|(?:pork)?p|charl|calor|cut)ies$`, `$1ie`}, //nolint:lll
|
||
|
{`(?i)\b(mon|smil)ies$`, `$1ey`},
|
||
|
{`(?i)\b((?:tit)?m|l)ice$`, `$1ouse`},
|
||
|
{`(?i)(seraph|cherub)im$`, `$1`},
|
||
|
{`(?i)(x|ch|ss|sh|zz|tto|go|cho|alias|[^aou]us|t[lm]as|gas|(?:her|at|gr)o|[aeiou]ris)(?:es)?$`, `$1`},
|
||
|
{`(?i)(analy|diagno|parenthe|progno|synop|the|empha|cri|ne)(?:sis|ses)$`, `$1sis`},
|
||
|
{`(?i)(movie|twelve|abuse|e[mn]u)s$`, `$1`},
|
||
|
{`(?i)(test)(?:is|es)$`, `$1is`},
|
||
|
{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1us`}, //nolint:lll,misspell
|
||
|
{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|quor)a$`, `$1um`}, //nolint:lll,misspell
|
||
|
{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)a$`, `$1on`},
|
||
|
{`(?i)(alumn|alg|vertebr)ae$`, `$1a`},
|
||
|
{`(?i)(cod|mur|sil|vert|ind)ices$`, `$1ex`},
|
||
|
{`(?i)(matr|append)ices$`, `$1ix`},
|
||
|
{`(?i)(pe)(rson|ople)$`, `$1rson`},
|
||
|
{`(?i)(child)ren$`, `$1`},
|
||
|
{`(?i)(eau)x?$`, `$1`},
|
||
|
{`(?i)men$`, `man`},
|
||
|
}
|
||
|
|
||
|
for _, r := range singularizationRules {
|
||
|
c.AddSingularRule(r.rule, r.replacement)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (c *PluralizeClient) loadUncountableRules() { //nolint:funlen
|
||
|
var uncountableRules = []string{
|
||
|
// Singular words with no plurals.
|
||
|
`adulthood`,
|
||
|
`advice`,
|
||
|
`agenda`,
|
||
|
`aid`,
|
||
|
`aircraft`,
|
||
|
`alcohol`,
|
||
|
`ammo`,
|
||
|
`analytics`,
|
||
|
`anime`,
|
||
|
`athletics`,
|
||
|
`audio`,
|
||
|
`bison`,
|
||
|
`blood`,
|
||
|
`bream`,
|
||
|
`buffalo`,
|
||
|
`butter`,
|
||
|
`carp`,
|
||
|
`cash`,
|
||
|
`chassis`,
|
||
|
`chess`,
|
||
|
`clothing`,
|
||
|
`cod`,
|
||
|
`commerce`,
|
||
|
`cooperation`,
|
||
|
`corps`,
|
||
|
`debris`,
|
||
|
`diabetes`,
|
||
|
`digestion`,
|
||
|
`elk`,
|
||
|
`energy`,
|
||
|
`equipment`,
|
||
|
`excretion`,
|
||
|
`expertise`,
|
||
|
`firmware`,
|
||
|
`flounder`,
|
||
|
`fun`,
|
||
|
`gallows`,
|
||
|
`garbage`,
|
||
|
`graffiti`,
|
||
|
`hardware`,
|
||
|
`headquarters`,
|
||
|
`health`,
|
||
|
`herpes`,
|
||
|
`highjinks`,
|
||
|
`homework`,
|
||
|
`housework`,
|
||
|
`information`,
|
||
|
`jeans`,
|
||
|
`justice`,
|
||
|
`kudos`,
|
||
|
`labour`,
|
||
|
`literature`,
|
||
|
`machinery`,
|
||
|
`mackerel`,
|
||
|
`mail`,
|
||
|
`media`,
|
||
|
`mews`,
|
||
|
`moose`,
|
||
|
`music`,
|
||
|
`mud`,
|
||
|
`manga`,
|
||
|
`news`,
|
||
|
`only`,
|
||
|
`personnel`,
|
||
|
`pike`,
|
||
|
`plankton`,
|
||
|
`pliers`,
|
||
|
`police`,
|
||
|
`pollution`,
|
||
|
`premises`,
|
||
|
`rain`,
|
||
|
`research`,
|
||
|
`rice`,
|
||
|
`salmon`,
|
||
|
`scissors`,
|
||
|
`series`,
|
||
|
`sewage`,
|
||
|
`shambles`,
|
||
|
`shrimp`,
|
||
|
`software`,
|
||
|
`staff`,
|
||
|
`swine`,
|
||
|
`tennis`,
|
||
|
`traffic`,
|
||
|
`transportation`,
|
||
|
`trout`,
|
||
|
`tuna`,
|
||
|
`wealth`,
|
||
|
`welfare`,
|
||
|
`whiting`,
|
||
|
`wildebeest`,
|
||
|
`wildlife`,
|
||
|
`you`,
|
||
|
// Regexes.
|
||
|
`(?i)pok[eé]mon$`, //
|
||
|
`(?i)[^aeiou]ese$`, // "chinese", "japanese"
|
||
|
`(?i)deer$`, // "deer", "reindeer"
|
||
|
`(?i)(fish)$`, // "fish", "blowfish", "angelfish"
|
||
|
`(?i)measles$`, //
|
||
|
`(?i)o[iu]s$`, // "carnivorous"
|
||
|
`(?i)pox$`, // "chickpox", "smallpox"
|
||
|
`(?i)sheep$`, //
|
||
|
}
|
||
|
|
||
|
for _, w := range uncountableRules {
|
||
|
c.AddUncountableRule(w)
|
||
|
}
|
||
|
}
|