You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

pdf.go 1.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. // Copyright 2023 The casbin Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package txt
import (
	"fmt"
	"strings"

	"github.com/ledongthuc/pdf"
)
  19. func getTextFromPdf(path string) (string, error) {
  20. f, r, err := pdf.Open(path)
  21. if err != nil {
  22. return "", err
  23. }
  24. defer f.Close()
  25. totalPage := r.NumPage()
  26. var mergedTexts []string
  27. for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
  28. p := r.Page(pageIndex)
  29. if p.V.IsNull() {
  30. continue
  31. }
  32. var lastTextStyle pdf.Text
  33. var mergedSentence string
  34. texts := p.Content().Text
  35. for _, text := range texts {
  36. if text.Y == lastTextStyle.Y {
  37. mergedSentence += text.S
  38. } else {
  39. if mergedSentence != "" {
  40. mergedTexts = append(mergedTexts, mergedSentence)
  41. }
  42. lastTextStyle = text
  43. mergedSentence = text.S
  44. }
  45. }
  46. if mergedSentence != "" {
  47. mergedTexts = append(mergedTexts, mergedSentence)
  48. }
  49. }
  50. mergedText := strings.Join(mergedTexts, "\n")
  51. return mergedText, nil
  52. }