golib/helper/pluralize.go

//
// pluralize.go
// Copyright (C) 2023 tiglog <me@tiglog.com>
//
// Distributed under terms of the MIT license.
//

package helper

import (
	"fmt"
	"regexp"
	"strconv"
	"strings"
)

// PluralizeRule -- pluralize rule expression and replacement value.
//
// A rule pairs a compiled pattern with the text substituted for its match.
// The replacement may reference submatches of the pattern as `$N`, which are
// expanded by PluralizeClient.interpolate before the substitution is applied.
type PluralizeRule struct {
	// Compiled pattern tested against the word.
	expression  *regexp.Regexp
	// Substitution template; `$0` is the whole match, `$1`.. are capture groups.
	replacement string
}

// PluralizeClient -- pluralize client.
//
// Holds the rule sets consulted when pluralizing or singularizing a word.
// Use NewPluralizeClient to obtain a client with the built-in rules loaded;
// the maps are only allocated by init, so a zero-value client cannot accept
// irregular or uncountable words.
type PluralizeClient struct {
	// Rules used by Plural / IsPlural; scanned from last added to first.
	pluralRules      []PluralizeRule
	// Rules used by Singular / IsSingular; scanned from last added to first.
	singularRules    []PluralizeRule
	// Lower-cased words that are returned unchanged by the rule engine.
	uncountables     map[string]bool
	// Lower-cased irregular singular form -> its plural form.
	irregularSingles map[string]string
	// Lower-cased irregular plural form -> its singular form.
	irregularPlurals map[string]string
	// Matches the `$N` (one or two digit) references inside a replacement.
	interpolateExpr  *regexp.Regexp
}

// NewPluralizeClient -- Create a client with the built-in irregular,
// pluralization, singularization and uncountable rules already loaded.
func NewPluralizeClient() *PluralizeClient {
	c := new(PluralizeClient)
	c.init()

	return c
}

// init -- Reset the client to its default state: fresh (empty) rule
// collections, the `$N` interpolation pattern, and then the built-in rules.
func (c *PluralizeClient) init() {
	c.interpolateExpr = regexp.MustCompile(`\$(\d{1,2})`)

	c.uncountables = map[string]bool{}
	c.irregularSingles = map[string]string{}
	c.irregularPlurals = map[string]string{}
	c.pluralRules = []PluralizeRule{}
	c.singularRules = []PluralizeRule{}

	// The order is significant: rules are matched from last added to first,
	// so the uncountable patterns (loaded last) win over the generic rules.
	loaders := []func(){
		c.loadIrregularRules,
		c.loadPluralizationRules,
		c.loadSingularizationRules,
		c.loadUncountableRules,
	}
	for _, load := range loaders {
		load()
	}
}

// Pluralize -- Return the singular or plural form of a word depending on count.
//
//	word: the word to pluralize
//	count: how many of the word exist; exactly 1 selects the singular form,
//	       every other value (including 0 and negatives) selects the plural
//	inclusive: whether to prefix with the number (e.g. 3 ducks)
func (c *PluralizeClient) Pluralize(word string, count int, inclusive bool) string {
	var form string
	if count == 1 {
		form = c.Singular(word)
	} else {
		form = c.Plural(word)
	}

	if !inclusive {
		return form
	}

	return fmt.Sprintf("%d %s", count, form)
}

// Plural -- Return the plural form of a word, preserving its letter casing.
func (c *PluralizeClient) Plural(word string) string {
	toPlural := c.replaceWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)

	return toPlural(word)
}

// IsPlural -- Report whether a word is already in its plural form, i.e.
// pluralizing it would leave it unchanged.
func (c *PluralizeClient) IsPlural(word string) bool {
	alreadyPlural := c.checkWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)

	return alreadyPlural(word)
}

// Singular -- Return the singular form of a word, preserving its letter casing.
func (c *PluralizeClient) Singular(word string) string {
	toSingular := c.replaceWord(c.irregularPlurals, c.irregularSingles, c.singularRules)

	return toSingular(word)
}

// IsSingular -- Report whether a word is already in its singular form, i.e.
// singularizing it would leave it unchanged.
func (c *PluralizeClient) IsSingular(word string) bool {
	alreadySingular := c.checkWord(c.irregularPlurals, c.irregularSingles, c.singularRules)

	return alreadySingular(word)
}

// AddPluralRule -- Append a pluralization rule to the collection.
//
//	rule: a regular expression (must start with `(`) or a plain word, which
//	      is matched whole and case-insensitively
//	replacement: substitution text; may reference submatches as `$N`
//
// Rules added later take precedence over rules added earlier.
func (c *PluralizeClient) AddPluralRule(rule string, replacement string) {
	entry := PluralizeRule{
		expression:  sanitizeRule(rule),
		replacement: replacement,
	}
	c.pluralRules = append(c.pluralRules, entry)
}

// AddSingularRule -- Append a singularization rule to the collection.
//
//	rule: a regular expression (must start with `(`) or a plain word, which
//	      is matched whole and case-insensitively
//	replacement: substitution text; may reference submatches as `$N`
//
// Rules added later take precedence over rules added earlier.
func (c *PluralizeClient) AddSingularRule(rule string, replacement string) {
	entry := PluralizeRule{
		expression:  sanitizeRule(rule),
		replacement: replacement,
	}
	c.singularRules = append(c.singularRules, entry)
}

// AddUncountableRule -- Register a word, or a pattern of words, that has the
// same singular and plural form.
//
//	word: a plain word (stored lower-cased) or a regular expression starting
//	      with `(`
func (c *PluralizeClient) AddUncountableRule(word string) {
	if isExpr(word) {
		// A pattern becomes an identity rule (`$0` = the whole match) in
		// both directions.
		c.AddPluralRule(word, `$0`)
		c.AddSingularRule(word, `$0`)

		return
	}

	c.uncountables[strings.ToLower(word)] = true
}

// AddIrregularRule -- Register an irregular singular/plural pair.
//
// Both forms are stored lower-cased. Registering a form that already exists
// overwrites its previous mapping.
func (c *PluralizeClient) AddIrregularRule(single string, plural string) {
	singleKey, pluralKey := strings.ToLower(single), strings.ToLower(plural)

	c.irregularPlurals[pluralKey] = singleKey
	c.irregularSingles[singleKey] = pluralKey
}

// replaceWord -- Build a function that converts a word to the target form.
//
//	replaceMap: irregular words in the source form -> their target form
//	keepMap: irregular words that are already in the target form
//	rules: the regular rules to fall back on
//
// The returned function restores the casing of its input on the result.
func (c *PluralizeClient) replaceWord(
	replaceMap map[string]string,
	keepMap map[string]string,
	rules []PluralizeRule,
) func(w string) string {
	return func(word string) string {
		lowered := strings.ToLower(word)

		// Direct replacement from the irregular map, unless the word is
		// already in the target form, in which case it is kept as is.
		target, known := replaceMap[lowered]
		if _, kept := keepMap[lowered]; kept {
			target, known = lowered, true
		}

		if known {
			return restoreCase(word, target)
		}

		// Not an irregular word: run the regular rules against it.
		return c.sanitizeWord(lowered, word, rules)
	}
}

// checkWord -- Build a function that reports whether a word is already in the
// target form.
//
//	replaceMap: irregular words in the source form -> their target form
//	keepMap: irregular words that are already in the target form
//	rules: the regular rules to fall back on
//
// For non-irregular words the check is performed on the lower-cased word: it
// is in the target form when applying the rules leaves it unchanged.
func (c *PluralizeClient) checkWord(
	replaceMap map[string]string,
	keepMap map[string]string,
	rules []PluralizeRule,
) func(w string) bool {
	return func(word string) bool {
		lowered := strings.ToLower(word)
		_, kept := keepMap[lowered]
		_, replaceable := replaceMap[lowered]

		switch {
		case kept:
			return true
		case replaceable:
			return false
		default:
			return c.sanitizeWord(lowered, lowered, rules) == lowered
		}
	}
}

// interpolate -- Expand the `$N` references in a replacement string.
//
//	str: the replacement template, e.g. `$1ies`
//	args: the submatches of the rule; args[0] is the whole match, args[N]
//	      the N-th capture group
//
// A reference to a group that does not exist in args expands to the empty
// string rather than panicking, so a mismatched custom rule cannot crash the
// caller.
func (c *PluralizeClient) interpolate(str string, args []string) string {
	return c.interpolateExpr.ReplaceAllStringFunc(str, func(ref string) string {
		// ref is "$" followed by one or two digits, so Atoi only fails if
		// interpolateExpr was replaced with an incompatible pattern.
		idx, err := strconv.Atoi(ref[1:])
		if err != nil || idx < 0 || idx >= len(args) {
			return ""
		}

		return args[idx]
	})
}

// replace -- Apply a single rule to a word.
//
//	word: the word being transformed, in its original casing
//	rule: the rule to apply; normally already known to match word
//
// The replacement text is computed once from the first match of the rule:
// its `$N` references are expanded and the casing of the matched text is
// restored onto the result. When the match is empty (e.g. `s?$` on "dog")
// the character just before the match position supplies the casing instead;
// if there is no such character the result is lower-cased. Every match of
// the rule in word is then replaced with that text. The word is returned
// unchanged when the rule does not match at all.
func (c *PluralizeClient) replace(word string, rule PluralizeRule) string {
	args := rule.expression.FindStringSubmatch(word)
	if args == nil {
		return word
	}

	caseSource := args[0]
	if caseSource == "" {
		// Borrow the casing of the preceding character. Decode it as a full
		// rune so a multi-byte character is not cut in half, and skip this
		// entirely at index 0 where there is nothing to borrow from.
		if index := rule.expression.FindStringIndex(word)[0]; index > 0 {
			before := []rune(word[:index])
			caseSource = string(before[len(before)-1:])
		}
	}

	replacement := restoreCase(caseSource, c.interpolate(rule.replacement, args))

	return rule.expression.ReplaceAllLiteralString(word, replacement)
}

// sanitizeWord -- Run a word through a rule set and return the result.
//
//	token: the lower-cased word, used for the uncountable lookup
//	word: the word the rules are matched against and applied to
//	rules: candidate rules; the last matching one wins
//
// Empty and uncountable words are returned untouched, as is any word that no
// rule matches.
func (c *PluralizeClient) sanitizeWord(token string, word string, rules []PluralizeRule) string {
	_, uncountable := c.uncountables[token]
	if token == "" || uncountable {
		return word
	}

	// Rules are stored general => specific, so walk them back to front and
	// stop at the first (most specific) match.
	for pos := len(rules); pos > 0; pos-- {
		candidate := rules[pos-1]
		if candidate.expression.MatchString(word) {
			return c.replace(word, candidate)
		}
	}

	return word
}

// sanitizeRule -- Compile a rule string into a regular expression.
//
// A rule recognised by isExpr is compiled verbatim. Anything else is treated
// as a whole word: it is anchored at both ends and matched case-insensitively
// (the text is not escaped, so regexp metacharacters keep their meaning).
// Panics if the resulting pattern does not compile.
func sanitizeRule(rule string) *regexp.Regexp {
	pattern := rule
	if !isExpr(rule) {
		pattern = `(?i)^` + rule + `$`
	}

	return regexp.MustCompile(pattern)
}

// restoreCase -- Re-apply the letter casing of word onto token.
//
//	word: the original text whose casing serves as the template
//	token: the replacement text to be re-cased
//
// The casing styles are tried in order: exact match, all lower case, all
// upper case, then title case (first character upper-cased, the remainder
// lower-cased). Anything else falls back to lower case. An empty token is
// returned as is.
func restoreCase(word string, token string) string {
	// Tokens are an exact match.
	if word == token {
		return token
	}

	// Nothing to re-case; this also keeps the title-case branch below from
	// slicing an empty string.
	if token == "" {
		return token
	}

	// Lower cased words. E.g. "hello".
	if word == strings.ToLower(word) {
		return strings.ToLower(token)
	}

	// Upper cased words. E.g. "WHISKY".
	if word == strings.ToUpper(word) {
		return strings.ToUpper(token)
	}

	// Title cased words. E.g. "Title".
	// Work on runes, not bytes, so a multi-byte first letter such as "É" is
	// compared and capitalised as a whole character. word is non-empty here:
	// an empty word equals its own lower-cased form and returned above.
	first := string([]rune(word)[:1])
	if first == strings.ToUpper(first) {
		letters := []rune(token)

		return strings.ToUpper(string(letters[:1])) + strings.ToLower(string(letters[1:]))
	}

	// Lower cased words. E.g. "test".
	return strings.ToLower(token)
}

// isExpr -- helper to detect if string represents an expression by checking first character to be `(`.
//
// The empty string is not an expression (it previously caused a panic).
func isExpr(s string) bool {
	return strings.HasPrefix(s, `(`)
}

// loadIrregularRules -- Register the built-in irregular word pairs.
//
// The table is applied top to bottom; where several singulars share one
// plural (e.g. "they"), the last entry decides what that plural singularizes
// to.
func (c *PluralizeClient) loadIrregularRules() { //nolint:funlen
	// Each entry is {singular, plural}.
	pairs := [][2]string{
		// Pronouns.
		{`I`, `we`},
		{`me`, `us`},
		{`he`, `they`},
		{`she`, `they`},
		{`them`, `them`},
		{`myself`, `ourselves`},
		{`yourself`, `yourselves`},
		{`itself`, `themselves`},
		{`herself`, `themselves`},
		{`himself`, `themselves`},
		{`themself`, `themselves`},
		{`is`, `are`},
		{`was`, `were`},
		{`has`, `have`},
		{`this`, `these`},
		{`that`, `those`},
		{`my`, `our`},
		{`its`, `their`},
		{`his`, `their`},
		{`her`, `their`},
		// Words ending with a consonant and `o`.
		{`echo`, `echoes`},
		{`dingo`, `dingoes`},
		{`volcano`, `volcanoes`},
		{`tornado`, `tornadoes`},
		{`torpedo`, `torpedoes`},
		// Ends with `us`.
		{`genus`, `genera`},
		{`viscus`, `viscera`},
		// Ends with `ma`.
		{`stigma`, `stigmata`},
		{`stoma`, `stomata`},
		{`dogma`, `dogmata`},
		{`lemma`, `lemmata`},
		{`schema`, `schemata`},
		{`anathema`, `anathemata`},
		// Other irregular rules.
		{`ox`, `oxen`},
		{`axe`, `axes`},
		{`die`, `dice`},
		{`yes`, `yeses`},
		{`foot`, `feet`},
		{`eave`, `eaves`},
		{`goose`, `geese`},
		{`tooth`, `teeth`},
		{`quiz`, `quizzes`},
		{`human`, `humans`},
		{`proof`, `proofs`},
		{`carve`, `carves`},
		{`valve`, `valves`},
		{`looey`, `looies`},
		{`thief`, `thieves`},
		{`groove`, `grooves`},
		{`pickaxe`, `pickaxes`},
		{`passerby`, `passersby`},
		{`canvas`, `canvases`},
		{`sms`, `sms`},
	}

	for i := range pairs {
		c.AddIrregularRule(pairs[i][0], pairs[i][1])
	}
}

// loadPluralizationRules -- Register the built-in pluralization rules.
//
// The table runs from the most general rule to the most specific one; since
// matching walks the rules back to front, entries further down take
// precedence.
func (c *PluralizeClient) loadPluralizationRules() {
	// Each entry is {pattern, replacement}.
	table := [][2]string{
		{`(?i)s?$`, `s`},
		{`(?i)[^[:ascii:]]$`, `$0`},
		{`(?i)([^aeiou]ese)$`, `$1`},
		{`(?i)(ax|test)is$`, `$1es`},
		{`(?i)(alias|[^aou]us|t[lm]as|gas|ris)$`, `$1es`},
		{`(?i)(e[mn]u)s?$`, `$1s`},
		{`(?i)([^l]ias|[aeiou]las|[ejzr]as|[iu]am)$`, `$1`},
		{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1i`}, //nolint:lll,misspell
		{`(?i)(alumn|alg|vertebr)(?:a|ae)$`, `$1ae`},
		{`(?i)(seraph|cherub)(?:im)?$`, `$1im`},
		{`(?i)(her|at|gr)o$`, `$1oes`},
		{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|automat|quor)(?:a|um)$`, `$1a`}, //nolint:lll,misspell
		{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)(?:a|on)$`, `$1a`},
		{`(?i)sis$`, `ses`},
		{`(?i)(?:(kni|wi|li)fe|(ar|l|ea|eo|oa|hoo)f)$`, `$1$2ves`},
		{`(?i)([^aeiouy]|qu)y$`, `$1ies`},
		{`(?i)([^ch][ieo][ln])ey$`, `$1ies`},
		{`(?i)(x|ch|ss|sh|zz)$`, `$1es`},
		{`(?i)(matr|cod|mur|sil|vert|ind|append)(?:ix|ex)$`, `$1ices`},
		{`(?i)\b((?:tit)?m|l)(?:ice|ouse)$`, `$1ice`},
		{`(?i)(pe)(?:rson|ople)$`, `$1ople`},
		{`(?i)(child)(?:ren)?$`, `$1ren`},
		{`(?i)eaux$`, `$0`},
		{`(?i)m[ae]n$`, `men`},
		{`thou`, `you`},
	}

	for i := range table {
		c.AddPluralRule(table[i][0], table[i][1])
	}
}

// loadSingularizationRules -- Register the built-in singularization rules.
//
// The table runs from the most general rule to the most specific one; since
// matching walks the rules back to front, entries further down take
// precedence.
func (c *PluralizeClient) loadSingularizationRules() {
	// Each entry is {pattern, replacement}.
	table := [][2]string{
		{`(?i)s$`, ``},
		{`(?i)(ss)$`, `$1`},
		{`(?i)(wi|kni|(?:after|half|high|low|mid|non|night|[^\w]|^)li)ves$`, `$1fe`},
		{`(?i)(ar|(?:wo|[ae])l|[eo][ao])ves$`, `$1f`},
		{`(?i)ies$`, `y`},
		{`(?i)(dg|ss|ois|lk|ok|wn|mb|th|ch|ec|oal|is|ck|ix|sser|ts|wb)ies$`, `$1ie`},
		{`(?i)\b(l|(?:neck|cross|hog|aun)?t|coll|faer|food|gen|goon|group|hipp|junk|vegg|(?:pork)?p|charl|calor|cut)ies$`, `$1ie`}, //nolint:lll
		{`(?i)\b(mon|smil)ies$`, `$1ey`},
		{`(?i)\b((?:tit)?m|l)ice$`, `$1ouse`},
		{`(?i)(seraph|cherub)im$`, `$1`},
		{`(?i)(x|ch|ss|sh|zz|tto|go|cho|alias|[^aou]us|t[lm]as|gas|(?:her|at|gr)o|[aeiou]ris)(?:es)?$`, `$1`},
		{`(?i)(analy|diagno|parenthe|progno|synop|the|empha|cri|ne)(?:sis|ses)$`, `$1sis`},
		{`(?i)(movie|twelve|abuse|e[mn]u)s$`, `$1`},
		{`(?i)(test)(?:is|es)$`, `$1is`},
		{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1us`},              //nolint:lll,misspell
		{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|quor)a$`, `$1um`}, //nolint:lll,misspell
		{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)a$`, `$1on`},
		{`(?i)(alumn|alg|vertebr)ae$`, `$1a`},
		{`(?i)(cod|mur|sil|vert|ind)ices$`, `$1ex`},
		{`(?i)(matr|append)ices$`, `$1ix`},
		{`(?i)(pe)(rson|ople)$`, `$1rson`},
		{`(?i)(child)ren$`, `$1`},
		{`(?i)(eau)x?$`, `$1`},
		{`(?i)men$`, `man`},
	}

	for i := range table {
		c.AddSingularRule(table[i][0], table[i][1])
	}
}

// loadUncountableRules -- Register the built-in uncountable words and
// patterns, i.e. those whose singular and plural forms are identical.
func (c *PluralizeClient) loadUncountableRules() { //nolint:funlen
	// Singular words with no plurals.
	words := []string{
		`adulthood`,
		`advice`,
		`agenda`,
		`aid`,
		`aircraft`,
		`alcohol`,
		`ammo`,
		`analytics`,
		`anime`,
		`athletics`,
		`audio`,
		`bison`,
		`blood`,
		`bream`,
		`buffalo`,
		`butter`,
		`carp`,
		`cash`,
		`chassis`,
		`chess`,
		`clothing`,
		`cod`,
		`commerce`,
		`cooperation`,
		`corps`,
		`debris`,
		`diabetes`,
		`digestion`,
		`elk`,
		`energy`,
		`equipment`,
		`excretion`,
		`expertise`,
		`firmware`,
		`flounder`,
		`fun`,
		`gallows`,
		`garbage`,
		`graffiti`,
		`hardware`,
		`headquarters`,
		`health`,
		`herpes`,
		`highjinks`,
		`homework`,
		`housework`,
		`information`,
		`jeans`,
		`justice`,
		`kudos`,
		`labour`,
		`literature`,
		`machinery`,
		`mackerel`,
		`mail`,
		`media`,
		`mews`,
		`moose`,
		`music`,
		`mud`,
		`manga`,
		`news`,
		`only`,
		`personnel`,
		`pike`,
		`plankton`,
		`pliers`,
		`police`,
		`pollution`,
		`premises`,
		`rain`,
		`research`,
		`rice`,
		`salmon`,
		`scissors`,
		`series`,
		`sewage`,
		`shambles`,
		`shrimp`,
		`software`,
		`staff`,
		`swine`,
		`tennis`,
		`traffic`,
		`transportation`,
		`trout`,
		`tuna`,
		`wealth`,
		`welfare`,
		`whiting`,
		`wildebeest`,
		`wildlife`,
		`you`,
	}

	// Regexes; these are appended to both rule collections, in this order.
	patterns := []string{
		`(?i)pok[eé]mon$`,  //
		`(?i)[^aeiou]ese$`, // "chinese", "japanese"
		`(?i)deer$`,        // "deer", "reindeer"
		`(?i)(fish)$`,      // "fish", "blowfish", "angelfish"
		`(?i)measles$`,     //
		`(?i)o[iu]s$`,      // "carnivorous"
		`(?i)pox$`,         // "chickenpox", "smallpox"
		`(?i)sheep$`,       //
	}

	for i := range words {
		c.AddUncountableRule(words[i])
	}

	for i := range patterns {
		c.AddUncountableRule(patterns[i])
	}
}