#python #neural_networks #machine_learning #numpy #keras
I've written a neural-network script, or more precisely the part that prepares the input data. But I'm not sure I've done everything correctly for the model to be able to train properly, and would really appreciate a review from people who know this area. Code:

import sklearn
import numpy as np
from collections import Counter
from keras.models import model_from_json
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split as tts

labels_lexicon = ['_label_0', '_label_1', '_label_2']  # list of categories


def get_data_from_the_file():
    # read "label token token ..." lines and collect the 5000 most common tokens
    labels, descriptions, lexicon, lexicon_base = [], [], [], []
    for i, line in enumerate(open('testtext.txt', 'r', encoding='utf8', errors='ignore')):
        content = line.split()
        labels.append([content[0]])
        descriptions.append(content[1:])
        lexicon_base += content[1:]
    count_lexicon = Counter(lexicon_base).most_common(5000)
    for count_item in count_lexicon:
        lexicon.append(count_item[0])
    return labels, descriptions, lexicon


labels, descriptions, lexicon = get_data_from_the_file()


def get_descriptions_to_index(lexicon):
    # map each lexicon word to its index
    cache = {}
    word2index = {}
    for i, word in enumerate(lexicon):
        if cache.get(word) is None:
            cache[word] = i
            word2index[word] = i
    return word2index


word2index = get_descriptions_to_index(lexicon)


def get_labels_to_index(labels_lexicon):
    # map each label to its index
    cache = {}
    labels2index = {}
    for i, word in enumerate(labels_lexicon):
        if cache.get(word) is None:
            cache[word] = i
            labels2index[word] = i
    return labels2index


labels2index = get_labels_to_index(labels_lexicon)

# replace words and labels with their indices, skipping out-of-lexicon words
list_of_tokenize_descriptions = []
list_of_tokenize_labels = []

for descriptions_arrays in descriptions:
    prepare_list_of_tokenize_descriptions = []
    for descriptions_piece in descriptions_arrays:
        if word2index.get(descriptions_piece) is not None:
            prepare_list_of_tokenize_descriptions.append(word2index[descriptions_piece])
    list_of_tokenize_descriptions.append(prepare_list_of_tokenize_descriptions)

for labels_arrays in labels:
    prepare_list_of_tokenize_labels = []
    for labels_piece in labels_arrays:
        if labels2index.get(labels_piece) is not None:
            prepare_list_of_tokenize_labels.append(labels2index[labels_piece])
    list_of_tokenize_labels.append(prepare_list_of_tokenize_labels)

# turn the index lists into binary (multi-hot) vectors
x_matrix_list = []
y_matrix_list = []

for i in range(len(list_of_tokenize_descriptions)):
    matrix_i = np.zeros(len(lexicon), dtype=int)
    line = list_of_tokenize_descriptions[i]
    for index in line:
        matrix_i[index] = 1
    x_matrix_list.append(matrix_i)

for i in range(len(list_of_tokenize_labels)):
    matrix_i = np.zeros(len(labels_lexicon), dtype=int)
    line = list_of_tokenize_labels[i]
    for index in line:
        matrix_i[index] = 1
    y_matrix_list.append(matrix_i)

x_train, x_test, y_train, y_test = tts(np.array(x_matrix_list), np.array(y_matrix_list), test_size=0.3)

Here is a link to the dataset.
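For comparison, the same binary bag-of-words encoding can be produced in a few lines with scikit-learn's CountVectorizer and LabelBinarizer. This is a minimal sketch, not part of the original question; the toy `lines` list stands in for the contents of testtext.txt:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

# toy stand-in for testtext.txt: first token is the label, the rest is the text
lines = ['_label_0 rehabilitation program year',
         '_label_1 other words entirely',
         '_label_2 more words again']
labels = [ln.split()[0] for ln in lines]
texts = [' '.join(ln.split()[1:]) for ln in lines]

# binary bag-of-words over the 5000 most frequent tokens,
# equivalent to the manual multi-hot loop above
vec = CountVectorizer(max_features=5000, binary=True)
x = vec.fit_transform(texts).toarray()

# one-hot label matrix, equivalent to the manual labels2index loop
lb = LabelBinarizer()
y = lb.fit_transform(labels)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)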
Answers
Answer 1
In this case I would use keras.preprocessing.text.Tokenizer:

from pathlib import Path
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer  # this import was missing in the original snippet


def get_data(filename, num_words=5000, frac=1.0):
    # read each line as a single column, then split off the leading label
    data = (pd.read_csv(filename, header=None, names=['text'], sep='~')
              .sample(frac=frac))
    data[['label', 'text']] = data.pop('text').str.split(n=1, expand=True)
    data = data.dropna()
    data = data.loc[data['label'].str.contains(r'^_label')]
    # build vocabulary
    tok = Tokenizer(num_words=num_words)
    tok.fit_on_texts(data['text'])
    # convert texts to sequences
    X = tok.texts_to_sequences(data['text'])
    # one-hot encode the labels
    lb = LabelBinarizer()
    Y = pd.DataFrame(lb.fit_transform(data['label']), columns=lb.classes_, index=data.index)
    return (pad_sequences(X, maxlen=num_words), Y, tok)


path = Path(r'D:\temp\.data')
filename = path / 'testtext.txt'
num_words = 1000

X, Y, tok = get_data(filename, num_words=num_words)

# split data set to train / dev
X_train, X_dev, Y_train, Y_dev = \
    train_test_split(X, Y, test_size=0.2, random_state=123, stratify=Y)

print('X_train.shape:\t{}\t\tY_train.shape:\t{}'.format(X_train.shape, Y_train.shape))
print('X_dev.shape:\t{}\t\tY_dev.shape:\t{}'.format(X_dev.shape, Y_dev.shape))

Output:

X_train.shape:  (26850, 1000)    Y_train.shape:  (26850, 3)
X_dev.shape:    (6713, 1000)     Y_dev.shape:    (6713, 3)

Here is what we get:

In [4]: X
Out[4]:
array([[  0,   0,   0, ..., 250, 154,  16],
       [  0,   0,   0, ..., 112, 121,  84],
       [  0,   0,   0, ...,  72,  49,  44],
       ...,
       [  0,   0,   0, ...,   5, 109,  99],
       [  0,   0,   0, ..., 158, 513,  78],
       [  0,   0,   0, ...,   0,   0, 138]])

In [5]: X.shape
Out[5]: (33563, 1000)

In [6]: Y
Out[6]:
       _label_0  _label_1  _label_2
30455         1         0         0
19423         1         0         0
29907         0         1         0
12779         0         1         0
28342         0         0         1
27583         0         0         1
28096         0         1         0
21411         1         0         0
23425         1         0         0
33227         1         0         0
...         ...       ...       ...
28788         1         0         0
17329         0         1         0
5339          0         1         0
9461          0         0         1
31315         0         0         1
23199         1         0         0
6752          0         1         0
164           1         0         0
24283         0         0         1
25055         0         0         1

[33563 rows x 3 columns]

In [7]: Y.shape
Out[7]: (33563, 3)

In [10]: tok.index_word[250]
Out[10]: 'реабилитац'

In [11]: tok.index_word[154]
Out[11]: 'инвалид'

In [12]: tok.index_word[16]
Out[12]: 'год'
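The snippet imports the model-building layers (Sequential, Embedding, LSTM, Dense) but stops after the train/dev split. Continuing from it, here is a minimal sketch of a classifier that could consume this data; the embedding size, LSTM units, epochs, and batch size are illustrative assumptions, not from the original answer:

# Minimal sketch continuing the snippet above; hyperparameters are illustrative.
# Tokenizer(num_words=1000) produces indices in [0, num_words), and
# pad_sequences(maxlen=num_words) makes every row num_words long.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=64, input_length=num_words))
model.add(LSTM(64, dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))  # one output per label column
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train.values,
          validation_data=(X_dev, Y_dev.values),
          epochs=3, batch_size=64)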