Tuesday, December 24, 2019

Preprocessing text for training a classification model in Keras

#python #neural_networks #machine_learning #numpy #keras


I've written a neural network script, or more precisely the part that prepares the input data, but I'm not sure I've done everything right for the model to be able to train correctly. I'd really appreciate a review from people who know this area.

Code:

import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split as tts


labels_lexicon = ['_label_0', '_label_1', '_label_2']  # list of categories

def get_data_from_the_file():
    labels, descriptions, lexicon, lexicon_base = [], [], [], []
    with open('testtext.txt', 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            content = line.split()
            if not content:                    # skip blank lines
                continue
            labels.append([content[0]])        # first token is the label
            descriptions.append(content[1:])   # the rest is the description text
            lexicon_base += content[1:]

    # keep the 5000 most frequent words as the vocabulary
    count_lexicon = Counter(lexicon_base).most_common(5000)
    for count_item in count_lexicon:
        lexicon.append(count_item[0])

    return labels, descriptions, lexicon

labels, descriptions, lexicon = get_data_from_the_file()

def get_descriptions_to_index(lexicon):
    # map each vocabulary word to a unique integer index
    word2index = {}
    for i, word in enumerate(lexicon):
        if word not in word2index:
            word2index[word] = i
    return word2index

word2index = get_descriptions_to_index(lexicon)


def get_labels_to_index(labels_lexicon):
    # map each label to a unique integer index
    labels2index = {}
    for i, label in enumerate(labels_lexicon):
        if label not in labels2index:
            labels2index[label] = i
    return labels2index

labels2index = get_labels_to_index(labels_lexicon)

list_of_tokenize_descriptions = []
list_of_tokenize_labels = []

# replace every word in a description with its vocabulary index,
# silently dropping out-of-vocabulary words
for description in descriptions:
    tokenized = []
    for word in description:
        if word in word2index:
            tokenized.append(word2index[word])
    list_of_tokenize_descriptions.append(tokenized)

# replace every label with its integer index
for label_list in labels:
    tokenized = []
    for label in label_list:
        if label in labels2index:
            tokenized.append(labels2index[label])
    list_of_tokenize_labels.append(tokenized)

x_matrix_list = []
y_matrix_list = []

# multi-hot encode each description over the vocabulary (bag of words)
for line in list_of_tokenize_descriptions:
    row = np.zeros(len(lexicon), dtype=int)
    for index in line:
        row[index] = 1
    x_matrix_list.append(row)

# one-hot encode each label
for line in list_of_tokenize_labels:
    row = np.zeros(len(labels_lexicon), dtype=int)
    for index in line:
        row[index] = 1
    y_matrix_list.append(row)

x_train, x_test, y_train, y_test = tts(np.array(x_matrix_list),
                                       np.array(y_matrix_list),
                                       test_size=0.3)
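
Not part of the original question, but as a sanity check it may help to see a classifier these arrays could feed. Below is a minimal sketch assuming a plain feed-forward model; the layer sizes, dropout rate, batch size, and epoch count are illustrative guesses, not tuned values.

from keras.models import Sequential
from keras.layers import Dense, Dropout

# input: a multi-hot bag-of-words vector of length len(lexicon)
# output: a softmax over the len(labels_lexicon) classes
model = Sequential([
    Dense(128, activation='relu', input_shape=(len(lexicon),)),
    Dropout(0.5),  # illustrative regularization, not a tuned rate
    Dense(len(labels_lexicon), activation='softmax'),
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=32,
          validation_data=(x_test, y_test))

A dense network is the natural fit here because the multi-hot encoding discards word order; a recurrent model would need the index sequences themselves, as in the answer below.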


Here is a link to the dataset.


Answers

Answer 1



In this case I would use keras.preprocessing.text.Tokenizer:

from pathlib import Path

import pandas as pd
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split


def get_data(filename, num_words=5000, frac=1.0):
    data = (pd.read_csv(filename, header=None, names=['text'], sep='~')
              .sample(frac=frac))
    data[['label', 'text']] = data.pop('text').str.split(n=1, expand=True)
    data = data.dropna()
    data = data.loc[data['label'].str.contains(r'^_label')]
    # build vocabulary
    tok = Tokenizer(num_words=num_words)
    tok.fit_on_texts(data['text'])
    # convert texts to sequences
    X = tok.texts_to_sequences(data['text'])
    lb = LabelBinarizer()
    Y = pd.DataFrame(lb.fit_transform(data['label']),
                     columns=lb.classes_, index=data.index)
    return (pad_sequences(X, maxlen=num_words), Y, tok)


path = Path(r'D:\temp\.data')
filename = path / 'testtext.txt'
num_words = 1000

X, Y, tok = get_data(filename, num_words=num_words)

# split data set to train / dev
X_train, X_dev, Y_train, Y_dev = \
    train_test_split(X, Y, test_size=0.2, random_state=123, stratify=Y)

print('X_train.shape:\t{}\t\tY_train.shape:\t{}'.format(X_train.shape, Y_train.shape))
print('X_dev.shape:\t{}\t\tY_dev.shape:\t{}'.format(X_dev.shape, Y_dev.shape))

Output:

X_train.shape:  (26850, 1000)   Y_train.shape:  (26850, 3)
X_dev.shape:    (6713, 1000)    Y_dev.shape:    (6713, 3)

What we end up with:

In [4]: X
Out[4]:
array([[  0,   0,   0, ..., 250, 154,  16],
       [  0,   0,   0, ..., 112, 121,  84],
       [  0,   0,   0, ...,  72,  49,  44],
       ...,
       [  0,   0,   0, ...,   5, 109,  99],
       [  0,   0,   0, ..., 158, 513,  78],
       [  0,   0,   0, ...,   0,   0, 138]])

In [5]: X.shape
Out[5]: (33563, 1000)

In [6]: Y
Out[6]:
       _label_0  _label_1  _label_2
30455         1         0         0
19423         1         0         0
29907         0         1         0
12779         0         1         0
28342         0         0         1
27583         0         0         1
28096         0         1         0
21411         1         0         0
23425         1         0         0
33227         1         0         0
...         ...       ...       ...
28788         1         0         0
17329         0         1         0
5339          0         1         0
9461          0         0         1
31315         0         0         1
23199         1         0         0
6752          0         1         0
164           1         0         0
24283         0         0         1
25055         0         0         1

[33563 rows x 3 columns]

In [7]: Y.shape
Out[7]: (33563, 3)

In [10]: tok.index_word[250]
Out[10]: 'реабилитац'

In [11]: tok.index_word[154]
Out[11]: 'инвалид'

In [12]: tok.index_word[16]
Out[12]: 'год'
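
The answer stops at data preparation, but its imports (Embedding, LSTM, Dense, Dropout) hint at the intended model. A possible continuation is sketched below; it is not part of the original answer, and the embedding size, LSTM width, and training settings are assumptions.

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# a small sequence classifier over the padded index sequences
model = Sequential([
    Embedding(input_dim=num_words, output_dim=64, input_length=X.shape[1]),
    LSTM(64),
    Dropout(0.5),  # illustrative regularization
    Dense(Y.shape[1], activation='softmax'),  # 3 classes in this dataset
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, Y_train.values, epochs=3, batch_size=128,
          validation_data=(X_dev, Y_dev.values))

Unlike the bag-of-words model sketched after the question's code, this one sees word order, which is the main reason to keep the sequences instead of collapsing them into multi-hot vectors.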
