You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

vector_embedding.go 4.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. // Copyright 2023 The casbin Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package object
  15. import (
  16. "context"
  17. "fmt"
  18. "path/filepath"
  19. "time"
  20. "github.com/casbin/casibase/embedding"
  21. "github.com/casbin/casibase/storage"
  22. "github.com/casbin/casibase/txt"
  23. "github.com/casbin/casibase/util"
  24. "golang.org/x/time/rate"
  25. )
  26. func filterTextFiles(files []*storage.Object) []*storage.Object {
  27. fileTypes := txt.GetSupportedFileTypes()
  28. fileTypeMap := map[string]bool{}
  29. for _, fileType := range fileTypes {
  30. fileTypeMap[fileType] = true
  31. }
  32. res := []*storage.Object{}
  33. for _, file := range files {
  34. ext := filepath.Ext(file.Key)
  35. if fileTypeMap[ext] {
  36. res = append(res, file)
  37. }
  38. }
  39. return res
  40. }
  41. func getFilteredFileObjects(provider string, prefix string) ([]*storage.Object, error) {
  42. files, err := storage.ListObjects(provider, prefix)
  43. if err != nil {
  44. return nil, err
  45. }
  46. return filterTextFiles(files), nil
  47. }
  48. func addEmbeddedVector(embeddingProviderObj embedding.EmbeddingProvider, text string, storeName string, fileName string) (bool, error) {
  49. data, err := embeddingProviderObj.QueryVector(text, 5)
  50. // data, err := model.GetEmbeddingSafe(authToken, text)
  51. if err != nil {
  52. return false, err
  53. }
  54. displayName := text
  55. if len(text) > 25 {
  56. displayName = text[:25]
  57. }
  58. vector := &Vector{
  59. Owner: "admin",
  60. Name: fmt.Sprintf("vector_%s", util.GetRandomName()),
  61. CreatedTime: util.GetCurrentTime(),
  62. DisplayName: displayName,
  63. Store: storeName,
  64. File: fileName,
  65. Text: text,
  66. Data: data,
  67. }
  68. return AddVector(vector)
  69. }
  70. func addVectorsForStore(embeddingProviderObj embedding.EmbeddingProvider, storageProviderName string, key string, storeName string) (bool, error) {
  71. var affected bool
  72. var err error
  73. objs, err := getFilteredFileObjects(storageProviderName, key)
  74. if err != nil {
  75. return false, err
  76. }
  77. timeLimiter := rate.NewLimiter(rate.Every(time.Minute), 3)
  78. for _, obj := range objs {
  79. var text string
  80. fileExt := filepath.Ext(obj.Key)
  81. text, err = txt.GetParsedTextFromUrl(obj.Url, fileExt)
  82. if err != nil {
  83. return false, err
  84. }
  85. textSections := txt.GetTextSections(text)
  86. for i, textSection := range textSections {
  87. if timeLimiter.Allow() {
  88. fmt.Printf("[%d/%d] Generating embedding for store: [%s]'s text section: %s\n", i+1, len(textSections), storeName, textSection)
  89. affected, err = addEmbeddedVector(embeddingProviderObj, textSection, storeName, obj.Key)
  90. } else {
  91. err = timeLimiter.Wait(context.Background())
  92. if err != nil {
  93. return false, err
  94. }
  95. fmt.Printf("[%d/%d] Generating embedding for store: [%s]'s text section: %s\n", i+1, len(textSections), storeName, textSection)
  96. affected, err = addEmbeddedVector(embeddingProviderObj, textSection, storeName, obj.Key)
  97. }
  98. }
  99. }
  100. return affected, err
  101. }
  102. func getRelatedVectors(owner string) ([]*Vector, error) {
  103. vectors, err := GetVectors(owner)
  104. if err != nil {
  105. return nil, err
  106. }
  107. if len(vectors) == 0 {
  108. return nil, fmt.Errorf("no knowledge vectors found")
  109. }
  110. return vectors, nil
  111. }
  112. func GetNearestVectorText(embeddingProvider embedding.EmbeddingProvider, owner string, text string) (string, error) {
  113. qVector, err := embeddingProvider.QueryVector(text, 5)
  114. // qVector, err := embedding.GetEmbeddingSafe(authToken, question)
  115. if err != nil {
  116. return "", err
  117. }
  118. if qVector == nil {
  119. return "", fmt.Errorf("no qVector found")
  120. }
  121. vectors, err := getRelatedVectors(owner)
  122. if err != nil {
  123. return "", err
  124. }
  125. var nVectors [][]float32
  126. for _, candidate := range vectors {
  127. nVectors = append(nVectors, candidate.Data)
  128. }
  129. i := getNearestVectorIndex(qVector, nVectors)
  130. return vectors[i].Text, nil
  131. }