A while ago I learned about BERT. I have since tried a few of its derivatives, so I'm leaving the code here as a memo.
In the article below, I also build a classifier on top of the plain base BERT model with TensorFlow: nlab-notebook.com
Introduction
If you already know exactly which model you want to use, writing the code as below is fine, but the library also provides AutoModel-style wrapper classes that can load any supported model by name, so I plan to write those up someday.
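For reference, a minimal sketch of that Auto-class style, assuming the same distilbert-base-uncased checkpoint used below (any checkpoint supported by the Auto classes would work the same way):

from transformers import AutoTokenizer, TFAutoModel

# The Auto classes resolve the right tokenizer/model class from the checkpoint name
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModel.from_pretrained(checkpoint)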
DistilBERT
Tokenizer
import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel

MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Convert a list of texts into the model's input format
def to_features(texts, max_length):
    shape = (len(texts), max_length)
    # input_ids (attention_mask and token_type_ids could be handled the same way)
    input_ids = np.zeros(shape, dtype="int32")
    # attention_mask = np.zeros(shape, dtype="int32")
    for i, text in enumerate(texts):
        encoded_dict = tokenizer.encode_plus(
            text, max_length=max_length, padding="max_length", truncation=True
        )
        input_ids[i] = encoded_dict["input_ids"]
        # attention_mask[i] = encoded_dict["attention_mask"]
    return [tf.cast(input_ids, tf.int32)]
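As a quick sanity check, passing a couple of dummy sentences (made up here) through to_features should produce a (2, 16) tensor of token ids:

# Two dummy sentences, just to confirm the output shape
sample_texts = ["hello world", "a slightly longer second sentence"]
features = to_features(sample_texts, max_length=16)
print(features[0].shape)  # -> (2, 16)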
Apply to the data
max_length = 500
x_train = to_features(train_texts, max_length)
y_train = tf.cast(train_labels, tf.int32)
x_valid = to_features(valid_texts, max_length)
y_valid = tf.cast(valid_labels, tf.int32)
Example of building a classification model
from tensorflow import keras
from tensorflow.keras import optimizers, losses, metrics
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

def build_model(transformer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype="int32", name="input_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Multi-class classification using the features of the first token ([CLS])
    cls_token = sequence_output[:, 0, :]
    out = Dense(128, activation="relu")(cls_token)
    out = Dense(4, activation="softmax")(out)
    model = Model(inputs=input_word_ids, outputs=out)
    adam = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer=adam, loss="sparse_categorical_crossentropy", metrics=["acc"])
    return model

transformer_layer = TFDistilBertModel.from_pretrained(MODEL_NAME)
# To freeze the transformer (exclude it from training), uncomment the next line
# transformer_layer.trainable = False
model = build_model(transformer_layer, max_len=500)
model.summary()
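Since EarlyStopping and ModelCheckpoint are imported above, a minimal training call might look like the following; the batch size, epoch count, and checkpoint filename are arbitrary placeholders:

callbacks = [
    # Stop when validation loss stops improving, keeping the best weights
    EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True),
    # Save only the weights; serializing the full transformers model to HDF5 can be fragile
    ModelCheckpoint("distilbert_clf.h5", monitor="val_loss",
                    save_best_only=True, save_weights_only=True),
]
history = model.fit(
    x_train, y_train,
    validation_data=(x_valid, y_valid),
    batch_size=16,
    epochs=10,
    callbacks=callbacks,
)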
RoBERTa
Tokenizer
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, DebertaV2Tokenizer

# MDL_PATH = "microsoft/deberta-v3-base"
MDL_PATH = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(MDL_PATH)

# Convert a list of texts into the model's input format
def to_features(texts, max_length):
    shape = (len(texts), max_length)
    # input_ids and attention_mask (RoBERTa does not use token_type_ids)
    input_ids = np.zeros(shape, dtype="int32")
    attention_mask = np.zeros(shape, dtype="int32")
    for i, text in enumerate(texts):
        encoded_dict = tokenizer.encode_plus(
            text, max_length=max_length, padding="max_length", truncation=True
        )
        input_ids[i] = encoded_dict["input_ids"]
        attention_mask[i] = encoded_dict["attention_mask"]
    return [tf.cast(input_ids, tf.int32), tf.cast(attention_mask, tf.int32)]
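Unlike the DistilBERT version, this one also returns the attention mask. A quick look at what encode_plus produces for a single (arbitrary) sentence:

enc = tokenizer.encode_plus("a short example", max_length=8,
                            padding="max_length", truncation=True)
print(enc["input_ids"])       # token ids, padded with tokenizer.pad_token_id
print(enc["attention_mask"])  # 1 for real tokens, 0 for padding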
Apply to the data
max_length = 500
x_train = to_features(train_texts, max_length)
y_train = tf.cast(train_labels, tf.int32)
x_valid = to_features(valid_texts, max_length)
y_valid = tf.cast(valid_labels, tf.int32)
Example classifier
from tensorflow import keras
from tensorflow.keras import optimizers, losses, metrics
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

filters = 250     # number of channels
kernel_size = 3   # filter size

def build_model(roberta_model, max_len=512):
    input_ids = Input(shape=(max_len,), dtype="int32", name="input_ids")
    attention_mask = Input(shape=(max_len,), dtype="int32", name="attention_mask")
    sequence_output = roberta_model(input_ids=input_ids, attention_mask=attention_mask)[0]
    # Instead of classifying from the first token's features alone
    # (cls_token = sequence_output[:, 0, :]), convolve over the whole
    # sequence and max-pool before the multi-class classification head
    out = Conv1D(filters, kernel_size, padding="valid", activation="relu", strides=1)(sequence_output)
    out = GlobalMaxPooling1D()(out)
    out = Dense(512, activation="relu")(out)
    out = Dropout(0.2)(out)
    out = Dense(4, activation="softmax")(out)
    model = Model(inputs=[input_ids, attention_mask], outputs=out)
    adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer=adam, loss="sparse_categorical_crossentropy", metrics=["acc"])
    return model
from transformers import TFAutoModel, TFRobertaModel, TFDebertaV2Model

roberta_model = TFRobertaModel.from_pretrained(MDL_PATH)
# To freeze the transformer (exclude it from training), uncomment the next line
# roberta_model.trainable = False
model = build_model(roberta_model, max_len=500)
model.summary()
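Convolving over all token embeddings with Conv1D and max-pooling lets n-gram-like features from the entire sequence contribute, rather than relying only on the first token's vector. Training proceeds exactly as in the DistilBERT section; at inference time the model outputs softmax probabilities over the four classes, so a sketch like the following (variable names are illustrative) recovers the predicted labels:

probs = model.predict(x_valid)          # shape: (num_samples, 4) softmax probabilities
pred_labels = np.argmax(probs, axis=1)  # index of the highest-probability class
print(pred_labels[:10])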
Summary
It's mostly just code, but I hope it helps someone. And when I forget how to write this myself, I'll come back to this article.