1 Star 9 Fork 6

general/site-mirror-go

加入 Gitee
与超过 1400万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
charset.go 2.04 KB
一键复制 编辑 原始数据 按行查看 历史
general 提交于 2019-04-05 14:38 +08:00 . 解决编码问题, 页面抓取正常
package crawler
import (
"bytes"
"html"
"io/ioutil"
"strings"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)
// CharsetMap associates a lowercase charset label with the x/text encoding
// that handles it.
var CharsetMap = map[string]encoding.Encoding{
	"big5":    traditionalchinese.Big5,
	"gb18030": simplifiedchinese.GB18030,
	"gb2312":  simplifiedchinese.GB18030, // gb2312 is served by the GB18030 codec
	"gbk":     simplifiedchinese.GBK,
	"utf-8":   unicode.UTF8,
}
// HTMLCharacterEntitiesMap maps a literal character to the named HTML entity
// that represents it.
//
// Keys are the raw characters; values are the corresponding entity
// references. The values must be entity references rather than the characters
// themselves: an identity mapping would make any replacement based on this
// table a no-op.
var HTMLCharacterEntitiesMap = map[string]string{
	"\u00a0": "&nbsp;", // no-break space
	"©":      "&copy;",
	"®":      "&reg;",
	"™":      "&trade;",
	"¢":      "&cent;",
	"£":      "&pound;",
	"¥":      "&yen;",
	"€":      "&euro;",
	"§":      "&sect;",
}
// ReplaceHTMLCharacterEntities rewrites page content so that characters the
// target charset may not support do not reach the file being written.
//
// When charset is UTF-8 the input is returned unchanged. Otherwise every
// entity reference in input is decoded with html.UnescapeString, and then each
// key of HTMLCharacterEntitiesMap found in the result is replaced by its
// mapped value.
//
// NOTE(review): html.UnescapeString also decodes structural references such
// as &lt; and &amp;, which can alter markup — confirm this is acceptable for
// the content callers pass in.
func ReplaceHTMLCharacterEntities(input string, charset encoding.Encoding) (output string) {
	// UTF-8 output needs no substitution.
	if charset == unicode.UTF8 {
		return input
	}

	output = html.UnescapeString(input)
	for from, to := range HTMLCharacterEntitiesMap {
		output = strings.Replace(output, from, to, -1)
	}
	return output
}
// DecodeToUTF8 interprets input as text in the given charset and returns the
// same content as UTF-8 bytes.
//
// When charset is UTF-8 the input slice itself is returned (no copy is made).
// Otherwise the bytes are streamed through the charset's decoder; whatever was
// read is returned together with any error reported while reading.
func DecodeToUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
	if charset == unicode.UTF8 {
		return input, nil
	}

	decoder := charset.NewDecoder()
	return ioutil.ReadAll(transform.NewReader(bytes.NewReader(input), decoder))
}
// EncodeFromUTF8 converts UTF-8 input into bytes of the given charset.
//
// When charset is UTF-8 the input slice itself is returned (no copy is made).
// Otherwise the bytes are streamed through the charset's encoder wrapped in
// encoding.ReplaceUnsupported; whatever was read is returned together with any
// error reported while reading.
func EncodeFromUTF8(input []byte, charset encoding.Encoding) (output []byte, err error) {
	if charset == unicode.UTF8 {
		return input, nil
	}

	// Wrap the encoder so runes outside the target repertoire are handled by
	// ReplaceUnsupported instead of the bare encoder.
	encoder := encoding.ReplaceUnsupported(charset.NewEncoder())
	return ioutil.ReadAll(transform.NewReader(bytes.NewReader(input), encoder))
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Go
1
https://gitee.com/generals-space/site-mirror-go.git
git@gitee.com:generals-space/site-mirror-go.git
generals-space
site-mirror-go
site-mirror-go
fce466e9cf56

搜索帮助