A while ago I learned about BERT. I have since tried a few of its derivatives, so I'm leaving the code here as a memo.
In the article below, I also build a classifier on top of the plain base BERT model with TensorFlow: nlab-notebook.com
Introduction
If you already know exactly which model you want to use, writing the code as below is fine, but the library also provides AutoModel-style wrapper classes that can load any supported model by name, so I plan to write those up someday.
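For reference, a minimal sketch of that Auto-class style, assuming the same distilbert-base-uncased checkpoint used below (any checkpoint supported by the Auto classes would work the same way):

from transformers import AutoTokenizer, TFAutoModel

# The Auto classes resolve the right tokenizer/model class from the checkpoint name
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModel.from_pretrained(checkpoint)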
DistilBERT
Tokenizer
import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel

MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Convert a list of texts into the model's input format
def to_features(texts, max_length):
    shape = (len(texts), max_length)
    # input_ids (attention_mask and token_type_ids could be handled the same way)
    input_ids = np.zeros(shape, dtype="int32")
    # attention_mask = np.zeros(shape, dtype="int32")
    for i, text in enumerate(texts):
        encoded_dict = tokenizer.encode_plus(
            text, max_length=max_length, padding="max_length", truncation=True
        )
        input_ids[i] = encoded_dict["input_ids"]
        # attention_mask[i] = encoded_dict["attention_mask"]
    return [tf.cast(input_ids, tf.int32)]
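As a quick sanity check, passing a couple of dummy sentences (made up here) through to_features should produce a (2, 16) tensor of token ids:

# Two dummy sentences, just to confirm the output shape
sample_texts = ["hello world", "a slightly longer second sentence"]
features = to_features(sample_texts, max_length=16)
print(features[0].shape)  # -> (2, 16)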
Apply to the data
max_length = 500
x_train = to_features(train_texts, max_length)
y_train = tf.cast(train_labels, tf.int32)
x_valid = to_features(valid_texts, max_length)
y_valid = tf.cast(valid_labels, tf.int32)
Example of building a classification model
from tensorflow import keras
from tensorflow.keras import optimizers, losses, metrics
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

def build_model(transformer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype="int32", name="input_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Multi-class classification using the features of the first token ([CLS])
    cls_token = sequence_output[:, 0, :]
    out = Dense(128, activation="relu")(cls_token)
    out = Dense(4, activation="softmax")(out)
    model = Model(inputs=input_word_ids, outputs=out)
    adam = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer=adam, loss="sparse_categorical_crossentropy", metrics=["acc"])
    return model

transformer_layer = TFDistilBertModel.from_pretrained(MODEL_NAME)
# To freeze the transformer (exclude it from training), uncomment the next line
# transformer_layer.trainable = False
model = build_model(transformer_layer, max_len=500)
model.summary()
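Since EarlyStopping and ModelCheckpoint are imported above, a minimal training call might look like the following; the batch size, epoch count, and checkpoint filename are arbitrary placeholders:

callbacks = [
    # Stop when validation loss stops improving, keeping the best weights
    EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True),
    # Save only the weights; serializing the full transformers model to HDF5 can be fragile
    ModelCheckpoint("distilbert_clf.h5", monitor="val_loss",
                    save_best_only=True, save_weights_only=True),
]
history = model.fit(
    x_train, y_train,
    validation_data=(x_valid, y_valid),
    batch_size=16,
    epochs=10,
    callbacks=callbacks,
)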
RoBERTa
Tokenizer
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, DebertaV2Tokenizer

# MDL_PATH = "microsoft/deberta-v3-base"
MDL_PATH = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(MDL_PATH)

# Convert a list of texts into the model's input format
def to_features(texts, max_length):
    shape = (len(texts), max_length)
    # input_ids and attention_mask (RoBERTa does not use token_type_ids)
    input_ids = np.zeros(shape, dtype="int32")
    attention_mask = np.zeros(shape, dtype="int32")
    for i, text in enumerate(texts):
        encoded_dict = tokenizer.encode_plus(
            text, max_length=max_length, padding="max_length", truncation=True
        )
        input_ids[i] = encoded_dict["input_ids"]
        attention_mask[i] = encoded_dict["attention_mask"]
    return [tf.cast(input_ids, tf.int32), tf.cast(attention_mask, tf.int32)]
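Unlike the DistilBERT version, this one also returns the attention mask. A quick look at what encode_plus produces for a single (arbitrary) sentence:

enc = tokenizer.encode_plus("a short example", max_length=8,
                            padding="max_length", truncation=True)
print(enc["input_ids"])       # token ids, padded with tokenizer.pad_token_id
print(enc["attention_mask"])  # 1 for real tokens, 0 for padding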
Apply to the data
max_length = 500
x_train = to_features(train_texts, max_length)
y_train = tf.cast(train_labels, tf.int32)
x_valid = to_features(valid_texts, max_length)
y_valid = tf.cast(valid_labels, tf.int32)
Example classifier
from tensorflow import keras
from tensorflow.keras import optimizers, losses, metrics
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

filters = 250     # number of channels
kernel_size = 3   # filter size

def build_model(roberta_model, max_len=512):
    input_ids = Input(shape=(max_len,), dtype="int32", name="input_ids")
    attention_mask = Input(shape=(max_len,), dtype="int32", name="attention_mask")
    sequence_output = roberta_model(input_ids=input_ids, attention_mask=attention_mask)[0]
    # Instead of classifying from the first token's features alone
    # (cls_token = sequence_output[:, 0, :]), convolve over the whole
    # sequence and max-pool before the multi-class classification head
    out = Conv1D(filters, kernel_size, padding="valid", activation="relu", strides=1)(sequence_output)
    out = GlobalMaxPooling1D()(out)
    out = Dense(512, activation="relu")(out)
    out = Dropout(0.2)(out)
    out = Dense(4, activation="softmax")(out)
    model = Model(inputs=[input_ids, attention_mask], outputs=out)
    adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer=adam, loss="sparse_categorical_crossentropy", metrics=["acc"])
    return model
from transformers import TFAutoModel, TFRobertaModel, TFDebertaV2Model

roberta_model = TFRobertaModel.from_pretrained(MDL_PATH)
# To freeze the transformer (exclude it from training), uncomment the next line
# roberta_model.trainable = False
model = build_model(roberta_model, max_len=500)
model.summary()
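Convolving over all token embeddings with Conv1D and max-pooling lets n-gram-like features from the entire sequence contribute, rather than relying only on the first token's vector. Training proceeds exactly as in the DistilBERT section; at inference time the model outputs softmax probabilities over the four classes, so a sketch like the following (variable names are illustrative) recovers the predicted labels:

probs = model.predict(x_valid)          # shape: (num_samples, 4) softmax probabilities
pred_labels = np.argmax(probs, axis=1)  # index of the highest-probability class
print(pred_labels[:10])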
Summary
It's mostly just code, but I hope it helps someone. And when I forget how to write this myself, I'll come back to this article.