
Поиск по вопросам

вторник, 24 декабря 2019 г.

Препроцессинг текста для обучения модели классификации в Keras

#python #нейронные_сети #машинное_обучение #numpy #keras

Я написал скрипт нейронной сети, а точнее часть с подготовкой данных на вход. Но
не уверен, правильно ли всё сделал для того, чтобы модель смогла корректно обучаться.
Очень нужна оценка знающих людей.


import sklearn
import numpy as np
from collections import Counter
from keras.models import model_from_json
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split as tts

labels_lexicon = ['_label_0', '_label_1', '_label_2'] # list of categories

def get_data_from_the_file():
  """Read ``testtext.txt`` and return its labels, descriptions and lexicon.

  Every non-empty line is split on whitespace: the first token is taken as
  the label and the remaining tokens as the description words.

  NOTE(review): the loop bodies were missing from the original snippet. How
  ``labels``, ``descriptions`` and ``lexicon`` are filled is reconstructed
  from the way the returned values are iterated later in the script (each
  label entry is iterated as a sequence of label strings) -- confirm.

  Returns:
    tuple: ``(labels, descriptions, lexicon)`` where
      * ``labels`` is a list with one single-element list ``[label]`` per line,
      * ``descriptions`` is a list with one list of words per line,
      * ``lexicon`` is the list of at most 5000 most frequent description
        words, most frequent first.
  """
  labels, descriptions, lexicon_base = [], [], []
  # The context manager guarantees the file is closed even if parsing fails.
  with open('testtext.txt', 'r', encoding='utf8', errors='ignore') as source:
    for line in source:
      content = line.split()
      if not content:
        # Blank line: there is no label token to read.
        continue
      labels.append([content[0]])
      descriptions.append(content[1:])
      lexicon_base += content[1:]

  # most_common() yields (word, count) pairs; only the words are kept.
  lexicon = [word for word, _ in Counter(lexicon_base).most_common(5000)]

  return labels, descriptions, lexicon

# Read the data file once; the three results are used by the steps below.
labels, descriptions, lexicon = get_data_from_the_file()

def get_descriptions_to_index(lexicon):
    """Map every distinct word of ``lexicon`` to the position of its first occurrence.

    Args:
        lexicon: Iterable of hashable words.

    Returns:
        dict: ``word -> index``. A repeated word keeps the index of its first
        occurrence, so the indices have gaps if ``lexicon`` contains
        duplicates. An empty ``lexicon`` gives an empty dict.
    """
    word2index = {}
    for i, word in enumerate(lexicon):
        # setdefault() only stores the index the first time a word is seen.
        word2index.setdefault(word, i)
    return word2index
# Lookup table from a lexicon word to its integer index.
word2index = get_descriptions_to_index(lexicon)

def get_labels_to_index(labels_lexicon):
    """Map every distinct label of ``labels_lexicon`` to the position of its first occurrence.

    Args:
        labels_lexicon: Iterable of hashable label names.

    Returns:
        dict: ``label -> index``. A repeated label keeps the index of its
        first occurrence. An empty input gives an empty dict.
    """
    labels2index = {}
    for i, word in enumerate(labels_lexicon):
        # setdefault() only stores the index the first time a label is seen.
        labels2index.setdefault(word, i)
    return labels2index
# Lookup table from a label name to its integer index.
labels2index = get_labels_to_index(labels_lexicon)

# Replace every word / label by its integer index.
# NOTE(review): the bodies of the two `if` statements were missing from the
# original snippet; appending the looked-up index is the reconstruction that
# matches how these lists are used further down -- confirm.
list_of_tokenize_descriptions = []
list_of_tokenize_labels = []

for descriptions_arrays in descriptions:
    # Words that are not in the lexicon are dropped.
    prepare_list_of_tokenize_descriptions = [
        word2index[descriptions_piece]
        for descriptions_piece in descriptions_arrays
        if descriptions_piece in word2index
    ]
    # Appended even when empty, so descriptions and labels stay aligned by row.
    list_of_tokenize_descriptions.append(prepare_list_of_tokenize_descriptions)

for labels_arrays in labels:
    # Labels that are not in labels_lexicon are dropped.
    prepare_list_of_tokenize_labels = [
        labels2index[labels_piece]
        for labels_piece in labels_arrays
        if labels_piece in labels2index
    ]
    list_of_tokenize_labels.append(prepare_list_of_tokenize_labels)

# Turn the index lists into fixed-length 0/1 indicator vectors.
x_matrix_list = []
y_matrix_list = []

for line in list_of_tokenize_descriptions:
  # One position per lexicon word; 1 marks a word present in the description.
  matrix_i = np.zeros(len(lexicon), dtype=int)
  for index in line:
    matrix_i[index] = 1
  # The original never stored the vector, so x_matrix_list stayed empty.
  x_matrix_list.append(matrix_i)

for line in list_of_tokenize_labels:
  # One position per category; 1 marks the category of the sample.
  matrix_i = np.zeros(len(labels_lexicon), dtype=int)
  for index in line:
    matrix_i[index] = 1
  # The original never stored the vector, so y_matrix_list stayed empty.
  y_matrix_list.append(matrix_i)

# Split the samples into train and test parts.
# NOTE(review): the original call was cut off after the second argument, so
# any further arguments (e.g. test_size, random_state) are unknown; the call
# is closed here with the library defaults -- confirm the intended values.
x_train, x_test, y_train, y_test = tts(np.array(x_matrix_list), np.array(y_matrix_list))

Здесь ссылка на dataset.


Ответ 1

Я бы в данном случае воспользовался классом keras.preprocessing.text.Tokenizer:

from pathlib import Path
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.model_selection import train_test_split


def get_data(filename, num_words=5000, frac=1.0):
    data = (pd.read_csv(filename, header=None, names=['text'], sep='~')
              .sample(frac=frac))
    data[['label','text']] = data.pop('text').str.split(n=1, expand=True)
    data = data.dropna()
    data = data.loc[data['label'].str.contains(r'^_label')]
    # build vocabulary
    tok = Tokenizer(num_words=num_words)
    tok.fit_on_texts(data['text'])
    # convert texts to sequences
    X = tok.texts_to_sequences(data['text'])
    lb = LabelBinarizer()
    Y = pd.DataFrame(lb.fit_transform(data['label']), columns=lb.classes_, index=data.index)
    return (pad_sequences(X, maxlen=num_words), Y, tok)


path = Path(r'D:\temp\.data')
filename = path / 'testtext.txt'
num_words = 1000

X, Y, tok = get_data(filename, num_words=num_words)

# split data set to train / dev
X_train, X_dev, Y_train, Y_dev = \
    train_test_split(X, Y, test_size=0.2, random_state=123, stratify=Y)

print('X_train.shape:\t{}\t\tY_train.shape:\t{}'.format(X_train.shape, Y_train.shape))
print('X_dev.shape:\t{}\t\tY_dev.shape:\t{}'.format(X_dev.shape, Y_dev.shape))

Вывод:

X_train.shape:  (26850, 1000)       Y_train.shape:  (26850, 3)
X_dev.shape:    (6713, 1000)        Y_dev.shape:    (6713, 3)

Что у нас получилось:

In [4]: X
Out[4]:
array([[  0,   0,   0, ..., 250, 154,  16],
       [  0,   0,   0, ..., 112, 121,  84],
       [  0,   0,   0, ...,  72,  49,  44],
       ...,
       [  0,   0,   0, ...,   5, 109,  99],
       [  0,   0,   0, ..., 158, 513,  78],
       [  0,   0,   0, ...,   0,   0, 138]])

In [5]: X.shape
Out[5]: (33563, 1000)

In [6]: Y
Out[6]:
       _label_0  _label_1  _label_2
30455         1         0         0
19423         1         0         0
29907         0         1         0
12779         0         1         0
28342         0         0         1
27583         0         0         1
28096         0         1         0
21411         1         0         0
23425         1         0         0
33227         1         0         0
...         ...       ...       ...
28788         1         0         0
17329         0         1         0
5339          0         1         0
9461          0         0         1
31315         0         0         1
23199         1         0         0
6752          0         1         0
164           1         0         0
24283         0         0         1
25055         0         0         1

[33563 rows x 3 columns]

In [7]: Y.shape
Out[7]: (33563, 3)

In [10]: tok.index_word[250]
Out[10]: 'реабилитац'

In [11]: tok.index_word[154]
Out[11]: 'инвалид'

In [12]: tok.index_word[16]
Out[12]: 'год'

Комментариев нет:

Отправить комментарий