|
- /*
- Copyright 2021 The KubeEdge Authors.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- */
-
- package dataset
-
- import (
- "bufio"
- "encoding/json"
- "fmt"
- "os"
- "path/filepath"
- "strings"
- "time"
-
- "k8s.io/klog/v2"
-
- "github.com/kubeedge/sedna/cmd/sedna-lc/app/options"
- sednav1 "github.com/kubeedge/sedna/pkg/apis/sedna/v1alpha1"
- "github.com/kubeedge/sedna/pkg/globalmanager/runtime"
- "github.com/kubeedge/sedna/pkg/localcontroller/db"
- clienttypes "github.com/kubeedge/sedna/pkg/localcontroller/gmclient"
- "github.com/kubeedge/sedna/pkg/localcontroller/storage"
- "github.com/kubeedge/sedna/pkg/localcontroller/util"
- workertypes "github.com/kubeedge/sedna/pkg/localcontroller/worker"
- )
-
const (
	// MonitorDataSourceIntervalSeconds is the number of seconds between two
	// consecutive checks of a data source.
	MonitorDataSourceIntervalSeconds = 60

	// KindName is the resource kind name of a dataset.
	KindName = "dataset"

	// CSVFormat is the comma-separated values format with an extra header row.
	// It can be used in structured data scenarios.
	CSVFormat = "csv"

	// TXTFormat is the line-separated format.
	// It can be used in unstructured data scenarios.
	TXTFormat = "txt"
)
-
- // DatasetManager defines dataset manager
- type Manager struct {
- Client clienttypes.ClientI
- DatasetMap map[string]*Dataset
- VolumeMountPrefix string
- }
-
- // Dataset defines config for dataset
- type Dataset struct {
- *sednav1.Dataset
- DataSource *DataSource `json:"dataSource"`
- Done chan struct{}
- URLPrefix string
- Storage storage.Storage
- }
-
// DataSource describes the samples read from a dataset's data URL.
type DataSource struct {
	// TrainSamples holds one entry per sample line (the csv header is excluded).
	TrainSamples []string
	// NumberOfSamples is the number of entries in TrainSamples.
	NumberOfSamples int
	// Header is the first row of a csv file; it is empty for txt data.
	Header string
}
-
- // New creates a dataset manager
- func New(client clienttypes.ClientI, options *options.LocalControllerOptions) *Manager {
- dm := Manager{
- Client: client,
- DatasetMap: make(map[string]*Dataset),
- VolumeMountPrefix: options.VolumeMountPrefix,
- }
-
- return &dm
- }
-
- // Start starts dataset manager
- func (dm *Manager) Start() error {
- return nil
- }
-
- // GetDatasetChannel gets dataset
- func (dm *Manager) GetDataset(name string) (*Dataset, bool) {
- d, ok := dm.DatasetMap[name]
- return d, ok
- }
-
- // Insert inserts dataset to db
- func (dm *Manager) Insert(message *clienttypes.Message) error {
- name := util.GetUniqueIdentifier(message.Header.Namespace, message.Header.ResourceName, message.Header.ResourceKind)
- first := false
- dataset, ok := dm.DatasetMap[name]
- if !ok {
- dataset = &Dataset{}
- dataset.Storage = storage.Storage{IsLocalStorage: false}
- dataset.Done = make(chan struct{})
- dm.DatasetMap[name] = dataset
- first = true
- }
-
- if err := json.Unmarshal(message.Content, dataset); err != nil {
- return err
- }
-
- credential := dataset.ObjectMeta.Annotations[runtime.SecretAnnotationKey]
- if credential != "" {
- if err := dataset.Storage.SetCredential(credential); err != nil {
- return fmt.Errorf("failed to set dataset(name=%s)'s storage credential, error: %+v", name, err)
- }
- }
-
- isLocalURL, err := dataset.Storage.IsLocalURL(dataset.Spec.URL)
- if err != nil {
- return fmt.Errorf("dataset(name=%s)'s url is invalid, error: %+v", name, err)
- }
- if isLocalURL {
- dataset.Storage.IsLocalStorage = true
- }
-
- if first {
- go dm.monitorDataSources(name)
- }
-
- if err := db.SaveResource(name, dataset.TypeMeta, dataset.ObjectMeta, dataset.Spec); err != nil {
- return err
- }
-
- return nil
- }
-
- // Delete deletes dataset config in db
- func (dm *Manager) Delete(message *clienttypes.Message) error {
- name := util.GetUniqueIdentifier(message.Header.Namespace, message.Header.ResourceName, message.Header.ResourceKind)
-
- if ds, ok := dm.DatasetMap[name]; ok && ds.Done != nil {
- close(ds.Done)
- }
-
- delete(dm.DatasetMap, name)
-
- if err := db.DeleteResource(name); err != nil {
- return err
- }
-
- return nil
- }
-
- // monitorDataSources monitors the data url of specified dataset
- func (dm *Manager) monitorDataSources(name string) {
- ds, ok := dm.DatasetMap[name]
- if !ok || ds == nil {
- return
- }
-
- dataURL := ds.Spec.URL
- if ds.Storage.IsLocalStorage {
- dataURL = util.AddPrefixPath(dm.VolumeMountPrefix, dataURL)
- }
-
- ds.URLPrefix = strings.TrimRight(dataURL, filepath.Base(dataURL))
- samplesNumber := 0
- for {
- select {
- case <-ds.Done:
- return
- default:
- }
-
- dataSource, err := ds.getDataSource(dataURL, ds.Spec.Format)
- if err != nil {
- klog.Errorf("dataset(name=%s) get samples from %s failed, error: %+v", name, dataURL, err)
- } else {
- ds.DataSource = dataSource
- if samplesNumber != dataSource.NumberOfSamples {
- samplesNumber = dataSource.NumberOfSamples
- klog.Infof("dataset(name=%s) get samples from data source(url=%s) successfully. number of samples: %d",
- name, dataURL, dataSource.NumberOfSamples)
-
- header := clienttypes.MessageHeader{
- Namespace: ds.Namespace,
- ResourceKind: ds.Kind,
- ResourceName: ds.Name,
- Operation: clienttypes.StatusOperation,
- }
-
- if err := dm.Client.WriteMessage(struct {
- NumberOfSamples int `json:"numberOfSamples"`
- }{
- dataSource.NumberOfSamples,
- }, header); err != nil {
- klog.Errorf("dataset(name=%s) publish samples info failed, error: %+v", name, err)
- }
- }
- }
- <-time.After(MonitorDataSourceIntervalSeconds * time.Second)
- }
- }
-
- // getDataSource gets data source info
- func (ds *Dataset) getDataSource(dataURL string, format string) (*DataSource, error) {
- if err := ds.validFormat(format); err != nil {
- return nil, err
- }
-
- localURL, err := ds.Storage.Download(dataURL, "")
-
- if !ds.Storage.IsLocalStorage {
- defer os.RemoveAll(localURL)
- }
-
- if err != nil {
- return nil, err
- }
-
- return ds.readByLine(localURL, format)
- }
-
- // readByLine reads file by line
- func (ds *Dataset) readByLine(url string, format string) (*DataSource, error) {
- samples, err := GetSamples(url)
- if err != nil {
- klog.Errorf("read file %s failed, error: %v", url, err)
- return nil, err
- }
-
- numberOfSamples := 0
- dataSource := DataSource{}
- switch strings.ToLower(format) {
- case TXTFormat:
- numberOfSamples += len(samples)
- case CSVFormat:
- // the first row of csv file is header
- if len(samples) == 0 {
- return nil, fmt.Errorf("file %s is empty", url)
- }
- dataSource.Header = samples[0]
- samples = samples[1:]
- numberOfSamples += len(samples)
-
- default:
- return nil, fmt.Errorf("invaild file format")
- }
-
- dataSource.TrainSamples = samples
- dataSource.NumberOfSamples = numberOfSamples
-
- return &dataSource, nil
- }
-
- func (dm *Manager) GetName() string {
- return KindName
- }
-
- func (dm *Manager) AddWorkerMessage(message workertypes.MessageContent) {
- // dummy
- }
-
- // GetSamples gets samples in a file
- func GetSamples(url string) ([]string, error) {
- var samples = []string{}
- if !util.IsExists(url) {
- return nil, fmt.Errorf("url(%s) does not exist", url)
- }
-
- if !util.IsFile(url) {
- return nil, fmt.Errorf("url(%s) is not a file, not vaild", url)
- }
-
- file, err := os.Open(url)
- if err != nil {
- klog.Errorf("read %s failed, error: %v", url, err)
- return samples, err
- }
-
- fileScanner := bufio.NewScanner(file)
- for fileScanner.Scan() {
- samples = append(samples, fileScanner.Text())
- }
-
- if err = file.Close(); err != nil {
- klog.Errorf("close file(url=%s) failed, error: %v", url, err)
- return samples, err
- }
-
- return samples, nil
- }
-
- // validFormat checks data format is valid
- func (ds *Dataset) validFormat(format string) error {
- for _, v := range []string{TXTFormat, CSVFormat} {
- if strings.ToLower(format) == v {
- return nil
- }
- }
-
- return fmt.Errorf("dataset format(%s) is invalid", format)
- }
|