Ai
1 Star 0 Fork 0

coorrer/unipdf

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
cmap_parser.go 12.08 KB
一键复制 编辑 原始数据 按行查看 历史
xushuai 提交于 2025-09-23 15:05 +08:00 . init
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
package cmap
import (
"errors"
"fmt"
"io"
"gitee.com/coodder/unipdf/common"
)
// parse reads the CMap stream object by object until EOF and loads it into `cmap`.
//
// Section operators (begincodespacerange, begincidrange, beginbfchar, beginbfrange) and the
// CIDSystemInfo, CMapName, CMapType and CMapVersion keys are handed to their dedicated parsers.
// `usecmap` stores the name object that immediately preceded it. Every other object is skipped.
//
// The first error returned by parseObject (other than io.EOF) or by a section parser aborts
// parsing and is returned to the caller.
func (cmap *CMap) parse() error {
	// last is the object read on the previous iteration; `usecmap` takes it as its argument.
	var last cmapObject
	for {
		obj, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			common.Log.Debug("ERROR: parsing CMap: %v", err)
			return err
		}

		// sectionErr collects the result of whichever section parser (if any) is dispatched.
		var sectionErr error
		switch v := obj.(type) {
		case cmapOperand:
			switch v.Operand {
			case begincodespacerange:
				sectionErr = cmap.parseCodespaceRange()
			case begincidrange:
				sectionErr = cmap.parseCIDRange()
			case beginbfchar:
				sectionErr = cmap.parseBfchar()
			case beginbfrange:
				sectionErr = cmap.parseBfrange()
			case usecmap:
				if last == nil {
					common.Log.Debug("ERROR: usecmap with no arg")
					return ErrBadCMap
				}
				base, isName := last.(cmapName)
				if !isName {
					common.Log.Debug("ERROR: usecmap arg not a name %#v", last)
					return ErrBadCMap
				}
				cmap.usecmap = base.Name
			case cidSystemInfo:
				// Some PDF generators leave the "/" off CIDSystemInfo, so it shows up as an
				// operand instead of a name.
				// e.g. ~/testdata/459474_809.pdf
				sectionErr = cmap.parseSystemInfo()
			}
		case cmapName:
			switch v.Name {
			case cidSystemInfo:
				sectionErr = cmap.parseSystemInfo()
			case cmapname:
				sectionErr = cmap.parseName()
			case cmaptype:
				sectionErr = cmap.parseType()
			case cmapversion:
				sectionErr = cmap.parseVersion()
			}
		}
		if sectionErr != nil {
			return sectionErr
		}
		last = obj
	}
}
// parseName reads the value of a CMapName entry and stores it in `cmap.name`.
// CMap names are defined like this: /CMapName /83pv-RKSJ-H def
//
// At most 10 objects are read while looking for the terminating `def`. If `def` is not found
// within that budget ErrBadCMap is returned. Errors from parseObject are returned unchanged.
func (cmap *CMap) parseName() error {
	const maxObjects = 10

	var name string
	for n := 0; n < maxObjects; n++ {
		obj, err := cmap.parseObject()
		if err != nil {
			return err
		}
		switch v := obj.(type) {
		case cmapName:
			name = v.Name
		case cmapOperand:
			if v.Operand == "def" {
				cmap.name = name
				return nil
			}
			// This is not an error because some PDF files don't have valid PostScript names.
			// e.g. ~/testdata/Papercut vs Equitrac.pdf
			//      /CMapName /Adobe-SI-*Courier New-6164-0 def
			// We just append the non-existent operator "New-6164-0" to the name.
			common.Log.Debug("parseName: State error. o=%#v name=%#q", obj, name)
			if name != "" {
				name = fmt.Sprintf("%s %s", name, v.Operand)
			}
			common.Log.Debug("parseName: Recovered. name=%#q", name)
		}
	}

	common.Log.Debug("ERROR: parseName: No def ")
	return ErrBadCMap
}
// parseType reads the value of a CMapType entry and stores it in `cmap.ctype`.
// CMap types are defined like this: /CMapType 1 def
//
// Up to 3 objects are read. An integer object sets the type and `def` ends the entry. Any other
// operand yields ErrBadCMap. If `def` is not seen within the 3 objects, the value read so far
// (0 if none) is still stored and nil is returned.
func (cmap *CMap) parseType() error {
	ctype := 0
	for remaining := 3; remaining > 0; remaining-- {
		obj, err := cmap.parseObject()
		if err != nil {
			return err
		}
		switch v := obj.(type) {
		case cmapInt:
			ctype = int(v.val)
		case cmapOperand:
			if v.Operand != "def" {
				common.Log.Error("parseType: state error. o=%#v", obj)
				return ErrBadCMap
			}
			cmap.ctype = ctype
			return nil
		}
	}
	cmap.ctype = ctype
	return nil
}
// parseVersion reads the value of a CMapVersion entry and stores it in `cmap.version`.
// The entry is a key, a value and `def`; the value may be an integer, a float or a string.
// We don't need the version. We do this to eat up the version code in the cmap definition
// to reduce unhandled parse object warnings.
//
// Up to 3 objects are read. Any operand other than `def` yields ErrBadCMap. Objects of an
// unexpected type are logged and skipped. If `def` is not seen within the 3 objects, the value
// read so far (possibly empty) is still stored and nil is returned.
func (cmap *CMap) parseVersion() error {
	var version string
	for remaining := 3; remaining > 0; remaining-- {
		obj, err := cmap.parseObject()
		if err != nil {
			return err
		}
		switch v := obj.(type) {
		case cmapInt:
			version = fmt.Sprintf("%d", v.val)
		case cmapFloat:
			version = fmt.Sprintf("%f", v.val)
		case cmapString:
			version = v.String
		case cmapOperand:
			if v.Operand != "def" {
				common.Log.Debug("ERROR: parseVersion: state error. o=%#v", obj)
				return ErrBadCMap
			}
			cmap.version = version
			return nil
		default:
			common.Log.Debug("ERROR: parseVersion: Bad type. o=%#v", obj)
		}
	}
	cmap.version = version
	return nil
}
// parseSystemInfo reads a CIDSystemInfo definition and stores it in `cmap.systemInfo`.
//
// Two layouts are handled. The first is a single dictionary object, which must contain string
// entries "Registry" and "Ordering" and an integer entry "Supplement". The second is the
// PostScript form:
//
//	/CIDSystemInfo 3 dict dup begin
//	  /Registry (Adobe) def
//	  /Ordering (Japan1) def
//	  /Supplement 1 def
//	end def
//
// ErrBadCMap is returned when a dictionary entry is missing or has the wrong type, or when
// neither a dictionary nor an `end` operand is found within 50 objects. Errors from parseObject
// are returned unchanged. `cmap.systemInfo` is only written on success.
func (cmap *CMap) parseSystemInfo() error {
	var (
		info       CIDSystemInfo
		key        string // most recent name seen after `begin`
		insideDict bool   // set once `begin` has been read
		wantValue  bool   // set between a key name and its `def`
		finished   bool
	)

	// 50 is a generous but arbitrary limit to prevent an endless loop on badly formed cmap files.
	for count := 0; count < 50 && !finished; count++ {
		obj, err := cmap.parseObject()
		if err != nil {
			return err
		}

		switch v := obj.(type) {
		case cmapDict:
			// A missing key yields a nil entry, which fails the type assertion just like an
			// entry of the wrong type does.
			registry, ok := v.Dict["Registry"].(cmapString)
			if !ok {
				common.Log.Debug("ERROR: Bad System Info")
				return ErrBadCMap
			}
			ordering, ok := v.Dict["Ordering"].(cmapString)
			if !ok {
				common.Log.Debug("ERROR: Bad System Info")
				return ErrBadCMap
			}
			supplement, ok := v.Dict["Supplement"].(cmapInt)
			if !ok {
				common.Log.Debug("ERROR: Bad System Info")
				return ErrBadCMap
			}
			info.Registry = registry.String
			info.Ordering = ordering.String
			info.Supplement = int(supplement.val)
			finished = true

		case cmapOperand:
			switch v.Operand {
			case "begin":
				insideDict = true
			case "def":
				wantValue = false
			case "end":
				finished = true
			}

		case cmapName:
			if insideDict {
				key = v.Name
				wantValue = true
			}

		case cmapString:
			if !wantValue {
				break
			}
			switch key {
			case "Registry":
				info.Registry = v.String
			case "Ordering":
				info.Ordering = v.String
			}

		case cmapInt:
			if wantValue && key == "Supplement" {
				info.Supplement = int(v.val)
			}
		}
	}

	if !finished {
		common.Log.Debug("ERROR: Parsed System Info dict incorrectly")
		return ErrBadCMap
	}
	cmap.systemInfo = info
	return nil
}
// parseCodespaceRange parses the body of a `begincodespacerange` section of a CMap.
//
// The section is a sequence of <low> <high> hex string pairs terminated by the
// `endcodespacerange` operand. Every valid pair is appended to `cmap.codespaces`.
//
// Returns nil when the terminating operand is reached. If the input ends (io.EOF) before the
// terminator, the pairs read so far are kept and ErrBadCMap is returned only if
// `cmap.codespaces` is empty. An error is returned when a bound is not a hex string, when the
// two bounds have different byte lengths, or when high < low (ErrBadCMap).
func (cmap *CMap) parseCodespaceRange() error {
	for {
		o, err := cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}

		hexLow, ok := o.(cmapHexString)
		if !ok {
			if op, isOperand := o.(cmapOperand); isOperand {
				if op.Operand == endcodespacerange {
					return nil
				}
				return errors.New("unexpected operand")
			}
			// Any other object type is malformed input. Without this return the loop would
			// carry on with a zero-value hexLow and could record a bogus codespace.
			return errors.New("non-hex low")
		}

		o, err = cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}

		hexHigh, ok := o.(cmapHexString)
		if !ok {
			return errors.New("non-hex high")
		}

		if len(hexLow.b) != len(hexHigh.b) {
			return errors.New("unequal number of bytes in range")
		}

		low := hexToCharCode(hexLow)
		high := hexToCharCode(hexHigh)
		if high < low {
			common.Log.Debug("ERROR: Bad codespace. low=0x%02x high=0x%02x", low, high)
			return ErrBadCMap
		}

		numBytes := hexHigh.numBytes
		cspace := Codespace{NumBytes: numBytes, Low: low, High: high}
		cmap.codespaces = append(cmap.codespaces, cspace)

		common.Log.Trace("Codespace low: 0x%X, high: 0x%X", low, high)
	}

	// Reached only when the input ended without `endcodespacerange`.
	if len(cmap.codespaces) == 0 {
		common.Log.Debug("ERROR: No codespaces in cmap.")
		return ErrBadCMap
	}

	return nil
}
// parseCIDRange parses the body of a `begincidrange` section of a CMap.
//
// The section is a sequence of <codeStart> <codeEnd> cidStart triplets terminated by the
// `endcidrange` operand. For each triplet, every character code in [codeStart, codeEnd] is
// mapped in `cmap.codeToCID` to consecutive CIDs beginning at cidStart.
//
// Returns nil when the terminating operand is reached, or when the input ends (io.EOF); a
// triplet cut short by EOF is discarded. An error is returned when an interval bound is not a
// hex string, when the bounds have different byte lengths, when start > end (ErrBadCMap), or
// when cidStart is not a non-negative integer.
func (cmap *CMap) parseCIDRange() error {
	for {
		// Parse character code interval start.
		o, err := cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}

		hexStart, ok := o.(cmapHexString)
		if !ok {
			if op, isOperand := o.(cmapOperand); isOperand && op.Operand == endcidrange {
				return nil
			}
			// Any other operand or object type is malformed input. Without this return a
			// non-operand object would let the loop carry on with a zero-value hexStart.
			return errors.New("cid interval start must be a hex string")
		}
		charcodeStart := hexToCharCode(hexStart)

		// Parse character code interval end.
		o, err = cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}

		hexEnd, ok := o.(cmapHexString)
		if !ok {
			return errors.New("cid interval end must be a hex string")
		}
		if len(hexStart.b) != len(hexEnd.b) {
			return errors.New("unequal number of bytes in range")
		}

		charcodeEnd := hexToCharCode(hexEnd)
		if charcodeStart > charcodeEnd {
			common.Log.Debug("ERROR: invalid CID range. start=0x%02x end=0x%02x", charcodeStart, charcodeEnd)
			return ErrBadCMap
		}

		// Parse interval start CID.
		o, err = cmap.parseObject()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}

		cidStart, ok := o.(cmapInt)
		if !ok {
			return errors.New("cid start value must be an decimal number")
		}
		if cidStart.val < 0 {
			return errors.New("invalid cid start value")
		}

		// Fill charcode to CID map.
		cid := cidStart.val
		for charcode := charcodeStart; charcode <= charcodeEnd; charcode++ {
			cmap.codeToCID[charcode] = CharCode(cid)
			cid++
		}

		common.Log.Trace("CID range: <0x%X> <0x%X> %d", charcodeStart, charcodeEnd, cidStart.val)
	}

	return nil
}
// parseBfchar parses the body of a `beginbfchar` section of a CMap file.
//
// The section is a sequence of <srcCode> <target> pairs terminated by the `endbfchar` operand.
// Each pair is stored in `cmap.codeToUnicode`. A target given as a name instead of a hex string
// is logged and mapped to MissingCodeRune.
//
// Returns nil when `endbfchar` is reached or when the input ends (io.EOF); a pair cut short by
// EOF is discarded. Any other operand or object type yields an error.
func (cmap *CMap) parseBfchar() error {
	for {
		// Source character code.
		srcObj, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		var code CharCode
		switch src := srcObj.(type) {
		case cmapHexString:
			code = hexToCharCode(src)
		case cmapOperand:
			if src.Operand != endbfchar {
				return errors.New("unexpected operand")
			}
			return nil
		default:
			return errors.New("unexpected type")
		}

		// Target the source code maps to.
		dstObj, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		var target rune
		switch dst := dstObj.(type) {
		case cmapHexString:
			target = hexToRune(dst)
		case cmapName:
			common.Log.Debug("ERROR: Unexpected name. %#v", dst)
			target = MissingCodeRune
		case cmapOperand:
			if dst.Operand == endbfchar {
				return nil
			}
			common.Log.Debug("ERROR: Unexpected operand. %#v", dst)
			return ErrBadCMap
		default:
			common.Log.Debug("ERROR: Unexpected type. %#v", dstObj)
			return ErrBadCMap
		}

		cmap.codeToUnicode[code] = target
	}
}
// parseBfrange parses the body of a `beginbfrange` section of a CMap file.
//
// The section is a sequence of triplets terminated by the `endbfrange` operand:
//
//	<srcCodeFrom> <srcCodeTo> <target>
//
// where target is either a hex string or an array of hex strings. With a hex string, the codes
// in [srcCodeFrom, srcCodeTo] map to consecutive runes starting at that value. With an array,
// the i-th code maps to the i-th element, and the array must have exactly one element per code.
// Results are stored in `cmap.codeToUnicode`.
//
// Returns nil when `endbfrange` is reached or when the input ends (io.EOF); a triplet cut short
// by EOF is discarded. Malformed triplets yield an error.
func (cmap *CMap) parseBfrange() error {
	for {
		// First element: start of the source code range, or the section terminator.
		first, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		var lo CharCode
		switch v := first.(type) {
		case cmapHexString:
			lo = hexToCharCode(v)
		case cmapOperand:
			if v.Operand != endbfrange {
				return errors.New("unexpected operand")
			}
			return nil
		default:
			return errors.New("unexpected type")
		}

		// Second element: end of the source code range.
		second, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		var hi CharCode
		switch v := second.(type) {
		case cmapHexString:
			hi = hexToCharCode(v)
		case cmapOperand:
			common.Log.Debug("ERROR: Imcomplete bfrange triplet")
			return ErrBadCMap
		default:
			common.Log.Debug("ERROR: Unexpected type %T", second)
			return ErrBadCMap
		}

		// Third element: the target(s).
		third, err := cmap.parseObject()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
		// NOTE(review): if CharCode is a fixed-width unsigned type and hi is its maximum value,
		// `code <= hi` below can never become false — confirm against the CharCode definition.
		switch v := third.(type) {
		case cmapHexString:
			// <codeFrom> <codeTo> <dst>, maps [from,to] to [dst,dst+to-from].
			for code, r := lo, hexToRune(v); code <= hi; code, r = code+1, r+1 {
				cmap.codeToUnicode[code] = r
			}
		case cmapArray:
			if len(v.Array) != int(hi-lo)+1 {
				common.Log.Debug("ERROR: Invalid number of items in array")
				return ErrBadCMap
			}
			for code := lo; code <= hi; code++ {
				hexs, isHex := v.Array[code-lo].(cmapHexString)
				if !isHex {
					return errors.New("non-hex string in array")
				}
				cmap.codeToUnicode[code] = hexToRune(hexs)
			}
		default:
			common.Log.Debug("ERROR: Unexpected type %T", third)
			return ErrBadCMap
		}
	}
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/coorrer/unipdf.git
git@gitee.com:coorrer/unipdf.git
coorrer
unipdf
unipdf
v1.2.0

搜索帮助