dict: 日常更新； close #375

2026-05-13 23:16:54 +00:00 · 2023-06-29 20:44:38 +08:00
parent d9cf59d5f2
commit d00aac9ef4
14 changed files with 479 additions and 152 deletions
--- a/others/script/main.go
+++ b/others/script/main.go
@@ -22,9 +22,13 @@ func main() {
 	rime.CnEn()
 	fmt.Println("--------------------------------------------------")

+	// 为没注音的词汇半自动注音
+	rime.Pinyin(rime.ExtPath)
+	fmt.Println("--------------------------------------------------")
+
 	// 为 ext、tencent 没权重的词条加上权重，有权重的改为下面设置的权重
-	rime.AddWeight(rime.ExtPath, rime.DefaultWeight)
-	rime.AddWeight(rime.TencentPath, rime.DefaultWeight)
+	rime.AddWeight(rime.ExtPath, 100)
+	rime.AddWeight(rime.TencentPath, 100)
 	fmt.Println("--------------------------------------------------")

 	// 检查
--- a/others/script/rime/pinyin.go
+++ b/others/script/rime/pinyin.go
@@ -9,6 +9,7 @@ import (
 	"strconv"
 	"strings"
 	"time"
+	"unicode"
 	"unicode/utf8"

 	"github.com/yanyiwu/gojieba"
@@ -215,36 +216,40 @@ var onlyOne = map[string]string{
 }

 func init() {
-	// 从 base 准备结巴的词典和词组拼音映射
-	baseFile, err := os.Open(BasePath)
-	if err != nil {
-		log.Fatalln(err)
-	}
-	defer baseFile.Close()
-	sc := bufio.NewScanner(baseFile)
-	isMark := false
-	for sc.Scan() {
-		line := sc.Text()
-		if !isMark {
-			if strings.HasPrefix(line, mark) {
-				isMark = true
-			}
-			continue
-		}
-		if strings.HasPrefix(line, "#") || line == "" {
-			continue
-		}
-		parts := strings.Split(line, "\t")
-		if len(parts) != 3 {
-			log.Fatalln("len(parts) != 3", line)
-		}
-		text, code := parts[0], parts[1]
-		weight, err := strconv.Atoi(parts[2])
+	// 从 base、ext 准备结巴的词典和词组拼音映射
+	for _, dictPath := range []string{BasePath, ExtPath} {
+		file, err := os.Open(dictPath)
 		if err != nil {
-			log.Fatalln(err, line)
+			log.Fatalln(err)
 		}
-		jieba.AddWordEx(text, weight, "")
-		wordPinyin[text] = append(wordPinyin[text], code)
+
+		sc := bufio.NewScanner(file)
+		isMark := false
+		for sc.Scan() {
+			line := sc.Text()
+			if !isMark {
+				if strings.HasPrefix(line, mark) {
+					isMark = true
+				}
+				continue
+			}
+			if strings.HasPrefix(line, "#") || line == "" {
+				continue
+			}
+			parts := strings.Split(line, "\t")
+			if len(parts) < 2 || !isAllLower(parts[1]) {
+				continue
+			}
+			text, code := parts[0], parts[1]
+			weight, err := strconv.Atoi(parts[2])
+			if err != nil {
+				log.Fatalln(err, line)
+			}
+			jieba.AddWordEx(text, weight, "")
+			wordPinyin[text] = append(wordPinyin[text], code)
+		}
+
+		file.Close()
 	}

 	// 拷贝 hanPinyin 到 hanziPinyin，再从 onlyOne 替换掉映射中的注音
@@ -287,16 +292,21 @@ func Pinyin(dictPath string) {
 		}

 		parts := strings.Split(line, "\t")
-		if len(parts) <= 1 {
-			fmt.Println("parts <= 1:", line)
-		}
 		text := parts[0]
-		// parts[1] 不是权重或已经注音（包含空格），不再注音
-		// if _, err := strconv.Atoi(parts[1]); err != nil || strings.Contains(parts[1], " ") {
-		// 	continue
-		// }
-		// 注音
-		code := generatePinyin(text)
+		var code string
+		// parts[1] 可能是：空、已经注音完成、注音到一半（含有未能自动注音的多音字汉字）
+		// 注音完成的，不再注音，其余的进行注音
+		if len(parts) == 1 { // 只有汉字
+			code = generatePinyin(text)
+		} else if len(parts) == 2 || len(parts) == 3 {
+			if isAllLower(parts[1]) { // 全小写，不包含汉字，代表已经注音完成
+				code = parts[1]
+			} else { // 注音到一半（含有汉字），重新注音
+				code = generatePinyin(text)
+			}
+		} else {
+			log.Fatalln("分割错误：", line)
+		}
 		lines[i] = text + "\t" + code
 	}

@@ -358,3 +368,16 @@ func GeneratePinyinTest(s string) {
 	r := generatePinyin(s)
 	fmt.Printf("%s %q\n", words, r)
 }
+
+// isAllLower reports whether every rune in s, other than the space character, is a lowercase letter.
+func isAllLower(s string) bool {
+	for _, ch := range s {
+		if ch == ' ' { // spaces are skipped and never cause a false result
+			continue
+		}
+		if !unicode.IsLower(ch) { // any other non-lowercase rune (digit, uppercase, Han character, ...) fails the check
+			return false
+		}
+	}
+	return true // also reached when s is empty or contains only spaces
+}
--- a/others/script/rime/rime.go
+++ b/others/script/rime/rime.go
@@ -23,7 +23,6 @@ type lemma struct {

 var (
 	mark          = "# +_+"      // 词库中的标记符号，表示从这行开始进行检查或排序
-	DefaultWeight = 100          // ext、tencent 词库中默认的权重
 	RimeDir       = getRimeDir() // Rime 配置目录

 	EmojiMapPath = filepath.Join(RimeDir, "others/emoji-map.txt")
--- a/others/script/rime/错别字.txt
+++ b/others/script/rime/错别字.txt
@@ -1,3 +1,5 @@
+# 在 # -_- 上面是错词；下面是包含错词但不是错词。
+的一逼
 觉对
 奇情片
 不还意思
--- a/others/script/rime/需要注音.txt
+++ b/others/script/rime/需要注音.txt
@@ -237,3 +237,4 @@
 还要
 一圈
 长图
+降息