fix: util.
This commit is contained in:
76
util/docx_util.go
Normal file
76
util/docx_util.go
Normal file
@ -0,0 +1,76 @@
|
||||
package util
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
docx "github.com/fumiama/go-docx"
|
||||
)
|
||||
|
||||
func DocxToStructuredPrompt(filename string) (string, error) {
|
||||
f, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
doc, err := docx.Parse(f, fi.Size())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var sb strings.Builder
|
||||
sb.WriteString(fmt.Sprintf("# 文件:%s\n\n", filename))
|
||||
|
||||
for _, item := range doc.Document.Body.Items {
|
||||
switch v := item.(type) {
|
||||
case *docx.Paragraph:
|
||||
// 直接用 fmt.Sprint 利用庫的 Stringer
|
||||
text := fmt.Sprint(v)
|
||||
text = strings.TrimSpace(text)
|
||||
if text != "" {
|
||||
sb.WriteString(text + "\n\n")
|
||||
}
|
||||
|
||||
case *docx.Table:
|
||||
sb.WriteString("## 表格\n")
|
||||
|
||||
// 先印表頭(可選)
|
||||
sb.WriteString("| ")
|
||||
|
||||
// 假設第一行是表頭(很多文件如此),或全部當內容
|
||||
for i, row := range v.TableRows {
|
||||
var cells []string
|
||||
for _, cell := range row.TableCells {
|
||||
// 這裡是重點:cell 本身沒有 String(),但可以遍歷它的 Paragraphs
|
||||
var cellText strings.Builder
|
||||
for _, p := range cell.Paragraphs {
|
||||
cellText.WriteString(fmt.Sprint(p))
|
||||
cellText.WriteString(" ")
|
||||
}
|
||||
cells = append(cells, strings.TrimSpace(cellText.String()))
|
||||
}
|
||||
|
||||
sb.WriteString(strings.Join(cells, " | "))
|
||||
sb.WriteString(" |\n")
|
||||
|
||||
// 如果想加 markdown 表頭分隔線(只在第一行後加)
|
||||
if i == 0 {
|
||||
sb.WriteString("| " + strings.Repeat("--- | ", len(cells)) + "\n")
|
||||
}
|
||||
}
|
||||
sb.WriteString("\n")
|
||||
|
||||
default:
|
||||
// 忽略圖片、頁首等
|
||||
}
|
||||
}
|
||||
|
||||
return sb.String(), nil
|
||||
}
|
||||
Reference in New Issue
Block a user