package main import ( "bufio" "fmt" "io" "log" "os" "path/filepath" "runtime" "sort" "strings" "unicode" "unicode/utf8" ) func main() { if len(os.Args) == 1 || os.Args[1] == "-h" || os.Args[1] == "--help" { fmt.Printf("usage: %s <file1> [<file2> [... <fileN>]]n", filepath.Base(os.Args[0])) os.Exit(1) } frequencyForWord := map[string]int{} // 與:make(map[string]int)相同 for _, filename := range commandLineFiles(os.Args[1:]) { updateFrequencies(filename, frequencyForWord) } reportByWords(frequencyForWord) wordsForFrequency := invertStringIntMap(frequencyForWord) reportByFrequency(wordsForFrequency) } func commandLineFiles(files []string) []string { if runtime.GOOS == "windows" { args := make([]string, 0, len(files)) for _, name := range files { if matches, err := filepath.Glob(name); err != nil { args = append(args, name) // 無效模式 } else if matches != nil { args = append(args, matches...) } } return args } return files } func updateFrequencies(filename string, frequencyForWord map[string]int) { var file *os.File var err error if file, err = os.Open(filename); err != nil { log.Println("failed to open the file: ", err) return } defer file.Close() readAndUpdateFrequencies(bufio.NewReader(file), frequencyForWord) } func readAndUpdateFrequencies(reader *bufio.Reader, frequencyForWord map[string]int) { for { line, err := reader.ReadString('n') for _, word := range SplitOnNonLetters(strings.TrimSpace(line)) { if len(word) > utf8.UTFMax || utf8.RuneCountInString(word) > 1 { frequencyForWord[strings.ToLower(word)] += 1 } } if err != nil { if err != io.EOF { log.Println("failed to finish reading the file: ", err) } break } } } func SplitOnNonLetters(s string) []string { notALetter := func(char rune) bool { return !unicode.IsLetter(char) } return strings.FieldsFunc(s, notALetter) } func invertStringIntMap(intForString map[string]int) map[int][]string { stringsForInt := make(map[int][]string, len(intForString)) for key, value := range intForString { stringsForInt[value] = append(stringsForInt[value], key) } return stringsForInt } func reportByWords(frequencyForWord map[string]int) { words := make([]string, 0, len(frequencyForWord)) wordWidth, frequencyWidth := 0, 0 for word, frequency := range frequencyForWord { words = append(words, word) if width := utf8.RuneCountInString(word); width > wordWidth { wordWidth = width } if width := len(fmt.Sprint(frequency)); width > frequencyWidth { frequencyWidth = width } } sort.Strings(words) gap := wordWidth + frequencyWidth - len("Word") - len("Frequency") fmt.Printf("Word %*s%sn", gap, " ", "Frequency") for _, word := range words { fmt.Printf("%-*s %*dn", wordWidth, word, frequencyWidth, frequencyForWord[word]) } } func reportByFrequency(wordsForFrequency map[int][]string) { frequencies := make([]int, 0, len(wordsForFrequency)) for frequency := range wordsForFrequency { frequencies = append(frequencies, frequency) } sort.Ints(frequencies) width := len(fmt.Sprint(frequencies[len(frequencies)-1])) fmt.Println("Frequency → Words") for _, frequency := range frequencies { words := wordsForFrequency[frequency] sort.Strings(words) fmt.Printf("%*d %sn", width, frequency, strings.Join(words, ", ")) } }程式的執行結果如下所示。
PS D:code> go run .main.go small-file.txt
Word Frequency
ability 1
about 1
above 3
years 1
you 128
Frequency → Words
1 ability, about, absence, absolute, absolutely, abuse, accessible, ...
2 accept, acquired, after, against, applies, arrange, assumptions, ...
...
128 you
151 or
192 to
221 of
345 the
map[string]int
型別的結構來儲存每一個單詞的頻率,但是要得到第二種輸出結果我們需要將整個對映反轉成多值型別的對映,如 map[int][]string
,也就是說,鍵是頻率而值則是所有具有這個頻率的單詞。func main() { if len(os.Args) == 1 || os.Args[1] == "-h" || os.Args[1] == "--help" { fmt.Printf("usage: %s <file1> [<file2> [... <fileN>]]n", filepath.Base(os.Args[0])) os.Exit(1) } frequencyForWord := map[string]int{} // 與:make(map[string]int)相同 for _, filename := range commandLineFiles(os.Args[1:]) { updateFrequencies(filename, frequencyForWord) } reportByWords(frequencyForWord) wordsForFrequency := invertStringIntMap(frequencyForWord) reportByFrequency(wordsForFrequency) }main() 函數首先分析命令列引數,之後再進行相應處理。
func commandLineFiles(files []string) []string { if runtime.GOOS == "windows" { args := make([]string, 0, len(files)) for _, name := range files { if matches, err := filepath.Glob(name); err != nil { args = append(args, name) // 無效模式 } else if matches != nil { args = append(args, matches...) } } return args } return files }因為 Unix 類系統(如 Linux 或 Mac OS X 等)的命令列工具預設會自動處理萬用字元(也就是說,*.txt 能匹配任意字尾為 .txt 的檔案,如 README.txt 和 INSTALL.txt 等),而 Windows 平台的命令列工具(CMD)不支援萬用字元,所以如果使用者在命令列輸入 *.txt,那麼程式只能接收到 *.txt。
func updateFrequencies(filename string, frequencyForWord map[string]int) { var file *os.File var err error if file, err = os.Open(filename); err != nil { log.Println("failed to open the file: ", err) return } defer file.Close() readAndUpdateFrequencies(bufio.NewReader(file), frequencyForWord) }updateFrequencies() 函數純粹就是用來處理檔案的,它開啟給定的檔案,並使用 defer 在函數返回時關閉檔案,這裡我們將檔案作為一個 *bufio.Reader(使用 bufio.NewReader() 函數建立)傳給 readAndUpdateFrequencies() 函數,因為這個函數是以字串的形式一行一行地讀取資料的,所以實際的工作都是在 readAndUpdateFrequencies() 函數裡完成的,程式碼如下。
func readAndUpdateFrequencies(reader *bufio.Reader, frequencyForWord map[string]int) { for { line, err := reader.ReadString('n') for _, word := range SplitOnNonLetters(strings.TrimSpace(line)) { if len(word) > utf8.UTFMax || utf8.RuneCountInString(word) > 1 { frequencyForWord[strings.ToLower(word)] += 1 } } if err != nil { if err != io.EOF { log.Println("failed to finish reading the file: ", err) } break } } }第一部分的程式碼我們應該很熟悉了,用了一個無限迴圈來一行一行地讀一個檔案,當讀到檔案結尾或者出現錯誤的時候就退出迴圈,將錯誤報告給使用者但並不退出程式,因為還有很多其他的檔案需要去處理。
func SplitOnNonLetters(s string) []string { notALetter := func(char rune) bool { return !unicode.IsLetter(char) } return strings.FieldsFunc(s, notALetter) }SplitOnNonLetters() 函數用來在非單詞字元上對一個字串進行切分,首先我們為 strings.FieldsFunc() 函數建立一個匿名函數 notALetter,如果傳入的是字元那就返回 false,否則返回 true,然後返回撥用函數 strings.FieldsFunc() 的結果,呼叫的時候將給定的字串和 notALetter 作為它的引數。
func reportByWords(frequencyForWord map[string]int) { words := make([]string, 0, len(frequencyForWord)) wordWidth, frequencyWidth := 0, 0 for word, frequency := range frequencyForWord { words = append(words, word) if width := utf8.RuneCountInString(word); width > wordWidth { wordWidth = width } if width := len(fmt.Sprint(frequency)); width > frequencyWidth { frequencyWidth = width } } sort.Strings(words) gap := wordWidth + frequencyWidth - len("Word") - len("Frequency") fmt.Printf("Word %*s%sn", gap, " ", "Frequency") for _, word := range words { fmt.Printf("%-*s %*dn", wordWidth, word, frequencyWidth, frequencyForWord[word]) } }計算出了 frequencyForWord 之後,呼叫 reportByWords() 將它的資料列印出來,因為我們需要將輸出結果按照字母順序排序好,所以首先要建立一個空的容量足夠大的 []string 切片來儲存所有在 frequencyForWord 裡的單詞。
%*s
可以實現的列印固定長度的空白,也可以使用 %s
來列印 strings.Repeat(" ", gap) 返回的字串。func invertStringIntMap(intForString map[string]int) map[int][]string { stringsForInt := make(map[int][]string, len(intForString)) for key, value := range intForString { stringsForInt[value] = append(stringsForInt[value], key) } return stringsForInt }上面的函數首先建立一個空的對映,用來儲存反轉的結果,但是我們並不知道它到底要儲存多少個項,因此我們假設它和原來的對映容量一樣大,然後簡單地遍歷原來的對映,將它的值作為鍵儲存到反轉的對映里,並將鍵增加到對應的值裡去,新的對映的值就是一個字串切片,即使原來的對映有多個鍵對應同一個值,也不會丟掉任何資料。
func reportByFrequency(wordsForFrequency map[int][]string) { frequencies := make([]int, 0, len(wordsForFrequency)) for frequency := range wordsForFrequency { frequencies = append(frequencies, frequency) } sort.Ints(frequencies) width := len(fmt.Sprint(frequencies[len(frequencies)-1])) fmt.Println("Frequency → Words") for _, frequency := range frequencies { words := wordsForFrequency[frequency] sort.Strings(words) fmt.Printf("%*d %sn", width, frequency, strings.Join(words, ", ")) } }這個函數的結構和 reportByWords() 函數很相似,它首先建立一個切片用來儲存頻率,並按照頻率升序排列,然後再計算需要容納的最大長度並以此作為第一列的寬度,之後輸出報告的標題,最後,遍歷輸出所有的頻率並按照字母升序輸出對應的單詞,如果一個頻率有超過兩個對應的單詞則單詞之間使用逗號分隔開。