using NumSharp; using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Security.Cryptography; using System.Text; using System.Text.RegularExpressions; using TensorFlowNET.Examples.Utility; namespace TensorFlowNET.Examples { public class DataHelpers { public static Dictionary build_word_dict(string path) { var contents = File.ReadAllLines(path); var words = new List(); foreach (var content in contents) words.AddRange(clean_str(content).Split(' ').Where(x => x.Length > 1)); var word_counter = words.GroupBy(x => x) .Select(x => new { Word = x.Key, Count = x.Count() }) .OrderByDescending(x => x.Count) .ToArray(); var word_dict = new Dictionary(); word_dict[""] = 0; word_dict[""] = 1; word_dict[""] = 2; foreach (var word in word_counter) word_dict[word.Word] = word_dict.Count; return word_dict; } public static (int[][], int[]) build_word_dataset(string path, Dictionary word_dict, int document_max_len) { var contents = File.ReadAllLines(path); var x = contents.Select(c => (clean_str(c) + " ") .Split(' ').Take(document_max_len) .Select(w => word_dict.ContainsKey(w) ? word_dict[w] : word_dict[""]).ToArray()) .ToArray(); for (int i = 0; i < x.Length; i++) if (x[i].Length == document_max_len) x[i][document_max_len - 1] = word_dict[""]; else Array.Resize(ref x[i], document_max_len); var y = contents.Select(c => int.Parse(c.Substring(0, c.IndexOf(','))) - 1).ToArray(); return (x, y); } public static (int[][], int[], int) build_char_dataset(string path, string model, int document_max_len, int? limit = null, bool shuffle=true) { string alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’'\"/|_#$%ˆ&*˜‘+=<>()[]{} "; /*if (step == "train") df = pd.read_csv(TRAIN_PATH, names =["class", "title", "content"]);*/ var char_dict = new Dictionary(); char_dict[""] = 0; char_dict[""] = 1; foreach (char c in alphabet) char_dict[c.ToString()] = char_dict.Count; var contents = File.ReadAllLines(path); if (shuffle) new Random(17).Shuffle(contents); //File.WriteAllLines("text_classification/dbpedia_csv/train_6400.csv", contents.Take(6400)); var size = limit == null ? contents.Length : limit.Value; var x = new int[size][]; var y = new int[size]; var tenth = size / 10; var percent = 0; for (int i = 0; i < size; i++) { if ((i + 1) % tenth == 0) { percent += 10; Console.WriteLine($"\t{percent}%"); } string[] parts = contents[i].ToLower().Split(",\"").ToArray(); string content = parts[2]; content = content.Substring(0, content.Length - 1); var a = new int[document_max_len]; for (int j = 0; j < document_max_len; j++) { if (j >= content.Length) a[j] = char_dict[""]; else a[j] = char_dict.ContainsKey(content[j].ToString()) ? char_dict[content[j].ToString()] : char_dict[""]; } x[i] = a; y[i] = int.Parse(parts[0]); } return (x, y, alphabet.Length + 2); } /// /// Loads MR polarity data from files, splits the data into words and generates labels. /// Returns split sentences and labels. /// /// /// /// public static (string[], NDArray) load_data_and_labels(string positive_data_file, string negative_data_file) { Directory.CreateDirectory("CnnTextClassification"); Utility.Web.Download(positive_data_file, "CnnTextClassification", "rt -polarity.pos"); Utility.Web.Download(negative_data_file, "CnnTextClassification", "rt-polarity.neg"); // Load data from files var positive_examples = File.ReadAllLines("CnnTextClassification/rt-polarity.pos") .Select(x => x.Trim()) .ToArray(); var negative_examples = File.ReadAllLines("CnnTextClassification/rt-polarity.neg") .Select(x => x.Trim()) .ToArray(); var x_text = new List(); x_text.AddRange(positive_examples); x_text.AddRange(negative_examples); x_text = x_text.Select(x => clean_str(x)).ToList(); var positive_labels = positive_examples.Select(x => new int[2] { 0, 1 }).ToArray(); var negative_labels = negative_examples.Select(x => new int[2] { 1, 0 }).ToArray(); var y = np.concatenate(new NDArray[] { new int[][][] { positive_labels, negative_labels } }); return (x_text.ToArray(), y); } private static string clean_str(string str) { str = Regex.Replace(str, "[^A-Za-z0-9(),!?]", " "); str = Regex.Replace(str, ",", " "); return str; } /// /// Padding /// /// /// the char to pad with /// a list of list where each sublist has same length public static (int[][], int[]) pad_sequences(int[][] sequences, int pad_tok = 0) { int max_length = sequences.Select(x => x.Length).Max(); return _pad_sequences(sequences, pad_tok, max_length); } public static (int[][][], int[][]) pad_sequences(int[][][] sequences, int pad_tok = 0) { int max_length_word = sequences.Select(x => x.Select(w => w.Length).Max()).Max(); int[][][] sequence_padded; var sequence_length = new int[sequences.Length][]; for (int i = 0; i < sequences.Length; i++) { // all words are same length now var (sp, sl) = _pad_sequences(sequences[i], pad_tok, max_length_word); sequence_length[i] = sl; } int max_length_sentence = sequences.Select(x => x.Length).Max(); (sequence_padded, _) = _pad_sequences(sequences, np.repeat(pad_tok, max_length_word).GetData().ToArray(), max_length_sentence); (sequence_length, _) = _pad_sequences(sequence_length, 0, max_length_sentence); return (sequence_padded, sequence_length); } private static (int[][], int[]) _pad_sequences(int[][] sequences, int pad_tok, int max_length) { var sequence_length = new int[sequences.Length]; for (int i = 0; i < sequences.Length; i++) { sequence_length[i] = sequences[i].Length; Array.Resize(ref sequences[i], max_length); } return (sequences, sequence_length); } private static (int[][][], int[]) _pad_sequences(int[][][] sequences, int[] pad_tok, int max_length) { var sequence_length = new int[sequences.Length]; for (int i = 0; i < sequences.Length; i++) { sequence_length[i] = sequences[i].Length; Array.Resize(ref sequences[i], max_length); for (int j = 0; j < max_length - sequence_length[i]; j++) { sequences[i][max_length - j - 1] = new int[pad_tok.Length]; Array.Copy(pad_tok, sequences[i][max_length - j - 1], pad_tok.Length); } } return (sequences, sequence_length); } public static string CalculateMD5Hash(string input) { // step 1, calculate MD5 hash from input MD5 md5 = System.Security.Cryptography.MD5.Create(); byte[] inputBytes = System.Text.Encoding.ASCII.GetBytes(input); byte[] hash = md5.ComputeHash(inputBytes); // step 2, convert byte array to hex string StringBuilder sb = new StringBuilder(); for (int i = 0; i < hash.Length; i++) { sb.Append(hash[i].ToString("X2")); } return sb.ToString(); } } }