[Python] Building a KoELECTRA-Based Sentiment Feature Model: Attempt #2-3-1

2023. 5. 14. 12:44 | Research Project / Sentiment Feature Model

2. Implementation Attempt

 

3) Pre-training BERT

※ Detailed comments on the code will continue to be added as time permits.

(1) KoELECTRA code source: GitHub - monologg/KoELECTRA (Pretrained ELECTRA Model for Korean)

 

https://github.com/monologg/KoELECTRA

 

*Modules required by the KoELECTRA pre-training code

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import json

import tensorflow.compat.v1 as tf  # TensorFlow 1.x compatibility API

import configure_pretraining
from model import modeling
from model import optimization
from pretrain import pretrain_data
from pretrain import pretrain_helpers
from util import training_utils
from util import utils

-configure_pretraining.py

-modeling.py

-optimization.py

-pretrain_helpers.py

 

(2) Pre-training configuration - configure_pretraining.py

(i) The PretrainingConfig class

class PretrainingConfig(object):
    """Defines pre-training hyperparameters."""

    def __init__(self, model_name, data_dir, **kwargs):
        self.model_name = model_name
        self.debug = False  # debug mode for quickly running things
        self.do_train = True  # pre-train ELECTRA
        self.do_eval = False  # evaluate generator/discriminator on unlabeled data

        # loss functions
        self.electra_objective = True  # if False, use the BERT objective instead
        self.gen_weight = 1.0  # masked language modeling / generator loss
        self.disc_weight = 50.0  # discriminator loss
        self.mask_prob = 0.15  # percent of input tokens to mask out / replace

        # optimization
        self.learning_rate = 5e-4
        self.lr_decay_power = 1.0  # linear weight decay by default
        self.weight_decay_rate = 0.01
        self.num_warmup_steps = 10000

        # training settings
        self.iterations_per_loop = 200
        self.save_checkpoints_steps = 1000
        self.num_train_steps = 1000000
        self.num_eval_steps = 100

        # model settings
        self.model_size = "base"  # one of "small", "base", or "large"
        # override the default transformer hparams for the provided model size; see
        # modeling.BertConfig for the possible hparams and util.training_utils for
        # the defaults
        self.model_hparam_overrides = (
            kwargs["model_hparam_overrides"]
            if "model_hparam_overrides" in kwargs else {})
        self.embedding_size = None  # bert hidden size by default
        self.vocab_size = 32200  # number of tokens in the vocabulary
        self.do_lower_case = False  # lowercase the input?

        # generator settings
        self.uniform_generator = False  # generator is uniform at random
        self.untied_generator_embeddings = False  # tie generator/discriminator token embeddings
        self.untied_generator = True  # tie all generator/discriminator weights?
        self.generator_layers = 1.0  # frac of discriminator layers for generator
        self.generator_hidden_size = 0.25  # frac of discrim hidden size for gen
        self.disallow_correct = False  # force the generator to sample incorrect
        # tokens (so 15% of tokens are always fake)
        self.temperature = 1.0  # temperature for sampling from generator

        # batch sizes
        self.max_seq_length = 128
        self.train_batch_size = 128
        self.eval_batch_size = 128

        # TPU settings
        self.use_tpu = True
        self.num_tpu_cores = 8
        self.tpu_job_name = None
        self.tpu_name = None  # cloud TPU to use for training
        self.tpu_zone = None  # GCE zone where the Cloud TPU is located in
        self.gcp_project = None  # project name for the Cloud TPU-enabled project

        # default locations of data files
        self.pretrain_tfrecords = os.path.join(
            data_dir, "pretrain_tfrecords/pretrain_data.tfrecord*")
        self.vocab_file = os.path.join(data_dir, "vocab.txt")
        self.model_dir = os.path.join(data_dir, "models", model_name)
        results_dir = os.path.join(self.model_dir, "results")
        self.results_txt = os.path.join(results_dir, "unsup_results.txt")
        self.results_pkl = os.path.join(results_dir, "unsup_results.pkl")

        # update defaults with passed-in hyperparameters
        self.update(kwargs)

        self.max_predictions_per_seq = int((self.mask_prob + 0.005) * self.max_seq_length)

        # debug-mode settings
        if self.debug:
            self.train_batch_size = 8
            self.num_train_steps = 20
            self.eval_batch_size = 4
            self.iterations_per_loop = 1
            self.num_eval_steps = 2

        # defaults for different-sized model
        if self.model_size == "small":
            self.embedding_size = 128
        # Here are the hyperparameters we used for larger models; see Table 6 in the paper for the full hyperparameters
        else:
            self.max_seq_length = 512
            self.learning_rate = 2e-4
            if self.model_size == "base":
                self.embedding_size = 768
                self.generator_hidden_size = 0.33333
                self.train_batch_size = 256
            else:
                self.embedding_size = 1024
                self.mask_prob = 0.25
                self.train_batch_size = 2048

        # passed-in-arguments override (for example) debug-mode defaults
        self.update(kwargs)

    def update(self, kwargs):
        for k, v in kwargs.items():
            if k not in self.__dict__:
                raise ValueError("Unknown hparam " + k)
            self.__dict__[k] = v
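
*Usage sketch: the snippet below is a hypothetical example (model_name and data_dir are made-up values) of how PretrainingConfig could be instantiated; note that configure_pretraining.py itself also imports os, which the path settings above rely on. Any keyword argument must match an existing attribute, otherwise update() raises "Unknown hparam".

config = PretrainingConfig(
    model_name="koelectra-base",  # hypothetical run name
    data_dir="data",              # hypothetical data directory
    model_size="base",
    train_batch_size=256)

print(config.model_dir)       # e.g. data/models/koelectra-base (built with os.path.join)
print(config.learning_rate)   # 2e-4, set by the non-"small" model-size defaults above

# An unknown key is rejected:
# PretrainingConfig("koelectra-base", "data", foo=1)  -> ValueError: Unknown hparam foo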

 

(3) The KoELECTRA model - modeling.py

(i) The BertConfig class

*Constructor (initial settings)

class BertConfig(object):
  """Configuration for 'BertModel' (ELECTRA uses the same model as BERT)."""

  def __init__(self,
               vocab_size,  # total number of tokens in the vocabulary
               hidden_size=768,  # size of the encoder (hidden) layers
               num_hidden_layers=12,  # number of hidden layers in the Transformer encoder
               num_attention_heads=12,  # number of attention heads in each attention layer
               intermediate_size=3072,  # size of the encoder's intermediate (e.g., feed-forward) layer
               hidden_act="gelu",  # non-linear activation function used in the encoder
               hidden_dropout_prob=0.1,  # dropout probability for all dense layers in the embeddings and encoder
               attention_probs_dropout_prob=0.1,  # dropout rate applied to the attention probabilities
               max_position_embeddings=512,  # maximum input sequence length
               type_vocab_size=2,  # number of segment (token type) values
               initializer_range=0.02):  # stddev of the truncated normal used to initialize all weights
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

 

*Four data format conversion methods

  @classmethod
  def from_dict(cls, json_object):  # builds a BertConfig from parameters stored as a Python dictionary (key, value)
    config = BertConfig(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      config.__dict__[key] = value  # __dict__: stores the config's settings as a dictionary
    return config

  @classmethod
  def from_json_file(cls, json_file):  # loads the BERT settings from a JSON file, then builds the config via from_dict
    with tf.io.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):  # returns a deep copy of the instance's settings as a Python dictionary
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):  # copies the instance to a dictionary and serializes it as a JSON string
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
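
*Conversion sketch: assuming the @classmethod decorator shown above and modeling.py's own imports (json, copy, six), the four helpers round-trip a config through a dictionary/JSON representation.

config = BertConfig(vocab_size=32200)              # KoELECTRA-Base vocabulary size
json_text = config.to_json_string()                # pretty-printed JSON string
restored = BertConfig.from_dict(json.loads(json_text))
assert restored.hidden_size == config.hidden_size == 768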

 

(ii) Building the BERT (or ELECTRA) model - the BertModel class

※ ELECTRA uses the same Transformer architecture as BERT.

*Constructor (initial settings)

class BertModel(object):
  def __init__(self,
               bert_config,  # a BertConfig instance
               is_training,  # True: the model is used for training; False: it is used for evaluation
               input_ids,  # tensor of token ids obtained by tokenizing the input sentences
               input_mask=None,  # tensor marking which token positions are valid (used to build the attention mask)
               token_type_ids=None,  # tensor of segment information
               use_one_hot_embeddings=True,  # (optional) whether to use one-hot word embeddings (True recommended on TPU, False on CPU/GPU)
               scope=None,  # (optional) variable scope (defaults to "electra")
               embedding_size=None,
               input_embeddings=None,
               input_reprs=None,
               update_embeddings=True,
               untied_embeddings=False):
    bert_config = copy.deepcopy(bert_config)  # deep-copy the BertConfig so the caller's settings are not modified
    if not is_training:  # if the model is used for evaluation rather than training (is_training=False),
      bert_config.hidden_dropout_prob = 0.0
      bert_config.attention_probs_dropout_prob = 0.0  # set the dropout probabilities to 0, i.e. disable dropout

    input_shape = get_shape_list(token_type_ids, expected_rank=2)  # shape of token_type_ids (segment info), via get_shape_list
    batch_size = input_shape[0]  # batch size
    seq_length = input_shape[1]  # sequence length

    if input_mask is None:  # if no input_mask is given,
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)  # use an all-ones [batch_size, seq_length] tensor as the mask

    assert token_type_ids is not None  # token_type_ids must always be provided

    if input_reprs is None:
      if input_embeddings is None:
        with tf.variable_scope(
            (scope if untied_embeddings else "electra") + "/embeddings",
            reuse=tf.AUTO_REUSE):
          # Perform embedding lookup on the word ids
          if embedding_size is None:  # if no embedding size was given,
            embedding_size = bert_config.hidden_size  # default to BERT's hidden (encoder layer) size
          (self.token_embeddings, self.embedding_table) = embedding_lookup(
              input_ids=input_ids,
              vocab_size=bert_config.vocab_size,
              embedding_size=embedding_size,
              initializer_range=bert_config.initializer_range,
              word_embedding_name="word_embeddings",
              use_one_hot_embeddings=use_one_hot_embeddings)
      else:  # if input_embeddings is not None, use it as the token embeddings directly
        self.token_embeddings = input_embeddings

      with tf.variable_scope(
          (scope if untied_embeddings else "electra") + "/embeddings",
          reuse=tf.AUTO_REUSE):
        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.token_embeddings,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=bert_config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=bert_config.initializer_range,
            max_position_embeddings=bert_config.max_position_embeddings,
            dropout_prob=bert_config.hidden_dropout_prob)
    else:
      self.embedding_output = input_reprs
    if not update_embeddings:
      self.embedding_output = tf.stop_gradient(self.embedding_output)

    with tf.variable_scope(scope, default_name="electra"):
      if self.embedding_output.shape[-1] != bert_config.hidden_size:
        self.embedding_output = tf.layers.dense(
            self.embedding_output, bert_config.hidden_size,
            name="embeddings_project")

      with tf.variable_scope("encoder"):
        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
        # mask of shape [batch_size, seq_length, seq_length] which is used
        # for the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            token_type_ids, input_mask)

        # Run the stacked transformer. Output shapes
        # sequence_output: [batch_size, seq_length, hidden_size]
        # pooled_output: [batch_size, hidden_size]
        # all_encoder_layers: [n_layers, batch_size, seq_length, hidden_size].
        # attn_maps: [n_layers, batch_size, n_heads, seq_length, seq_length]
        (self.all_layer_outputs, self.attn_maps) = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=bert_config.hidden_size,
            num_hidden_layers=bert_config.num_hidden_layers,
            num_attention_heads=bert_config.num_attention_heads,
            intermediate_size=bert_config.intermediate_size,
            intermediate_act_fn=get_activation(bert_config.hidden_act),
            hidden_dropout_prob=bert_config.hidden_dropout_prob,
            attention_probs_dropout_prob=
            bert_config.attention_probs_dropout_prob,
            initializer_range=bert_config.initializer_range,
            do_return_all_layers=True)  # run transformer_model: builds the stacked multi-head-attention Transformer encoder
        self.sequence_output = self.all_layer_outputs[-1]
        self.pooled_output = self.sequence_output[:, 0]
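
*Construction sketch: a minimal, hypothetical example of wiring BertModel into a TF1-style graph, assuming the rest of modeling.py (embedding_lookup, embedding_postprocessor, transformer_model, ...) is available in the same module. The shapes are illustrative, not KoELECTRA's actual training setup.

tf.disable_eager_execution()                  # BertModel relies on graph-mode variable scopes

bert_config = BertConfig(vocab_size=32200)
input_ids = tf.placeholder(tf.int32, shape=[None, 128])
input_mask = tf.placeholder(tf.int32, shape=[None, 128])
token_type_ids = tf.placeholder(tf.int32, shape=[None, 128])

model = BertModel(
    bert_config=bert_config,
    is_training=False,                        # disables dropout inside the model
    input_ids=input_ids,
    input_mask=input_mask,
    token_type_ids=token_type_ids,
    use_one_hot_embeddings=False)             # False is the recommended setting on CPU/GPU

sequence_output = model.sequence_output      # [batch_size, seq_length, hidden_size]
pooled_output = model.pooled_output          # [batch_size, hidden_size], the first ([CLS]) token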

 

(iii) get_pooled_output and the other accessor methods

  def get_pooled_output(self):
    return self.pooled_output

  def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the final hidden of the transformer encoder.
    """
    return self.sequence_output

  def get_all_encoder_layers(self):
    return self.all_layer_outputs

  def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the output of the embedding layer, after summing the word
      embeddings with the positional embeddings and the token type embeddings,
      then performing layer normalization. This is the input to the transformer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    return self.embedding_table

(iv) The gelu function

def gelu(input_tensor):
  """Gaussian Error Linear Unit.

  This is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415

  Args:
    input_tensor: float Tensor to perform activation.

  Returns:
    `input_tensor` with the GELU activation applied.
  """
  cdf = 0.5 * (1.0 + tf.math.erf(input_tensor / tf.sqrt(2.0)))
  return input_tensor * cdf
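
*Numeric check: the same exact (erf-based) GELU, reproduced with the Python standard library only, so the values can be verified without TensorFlow.

import math

def gelu_ref(x):
    # reference GELU on a Python float: x * Phi(x), where Phi is the standard normal CDF
    return x * 0.5 * (1.0 + math.erf(x / math.sqrt(2.0)))

for x in (-2.0, -1.0, 0.0, 1.0, 2.0):
    print(x, round(gelu_ref(x), 4))
# -2.0 -0.0455 / -1.0 -0.1587 / 0.0 0.0 / 1.0 0.8413 / 2.0 1.9545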

(v) The get_activation function

def get_activation(activation_string):
  """Maps a string to a Python function, e.g., "relu" => 'tf.nn.relu'.

  Args:
    activation_string: String name of the activation function.

  Returns:
    A Python function corresponding to the activation function. If
    'activation_string' is None, empty, or "linear", this will return None.
    If 'activation_string' is not a string, it will return 'activation_string'.

  Raises:
    ValueError: The 'activation_string' does not correspond to a known
      activation.
  """

  # We assume that anything that's not a string is already an activation
  # function, so we just return it.
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:
    return None

  act = activation_string.lower()
  if act == "linear":
    return None
  elif act == "relu":
    return tf.nn.relu
  elif act == "gelu":
    return gelu
  elif act == "tanh":
    return tf.tanh
  else:
    raise ValueError("Unsupported activation: %s" % act)

(vi) The get_assignment_map_from_checkpoint function

def get_assignment_map_from_checkpoint(tvars, init_checkpoint, prefix=""):
  """Compute the union of the current variables and checkpoint variables."""
  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  initialized_variable_names = {}
  assignment_map = collections.OrderedDict()
  for x in tf.train.list_variables(init_checkpoint):
    (name, var) = (x[0], x[1])
    if prefix + name not in name_to_variable:
      continue
    assignment_map[name] = prefix + name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ":0"] = 1

  return assignment_map, initialized_variable_names

(vii) The dropout function

def dropout(input_tensor, dropout_prob):
  """Perform dropout.

  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value (NOT of
      *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of 'input_tensor' with dropout applied.
  """
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor

  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return output
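
※ Note: in the TF1-compatible API used here, the second positional argument of tf.nn.dropout is keep_prob (the probability of keeping a value), so passing 1.0 - dropout_prob is equivalent to rate=dropout_prob in the newer API. Kept values are rescaled by 1/keep_prob (inverted dropout), which is why no extra scaling is needed at evaluation time.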

(viii) The layer_norm function

def layer_norm(input_tensor, name=None):
  """Run layer normalization on the last dimension of the tensor."""
  return contrib_layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)

(ix) The layer_norm_and_dropout function

def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Runs layer normalization followed by dropout."""
  output_tensor = layer_norm(input_tensor, name)
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor

(x) The create_initializer function

def create_initializer(initializer_range=0.02):
  """Creates a 'truncated_normal_initializer' with the given range."""
  return tf.truncated_normal_initializer(stddev=initializer_range)

 

(xi) The embedding_lookup function

*Looks up word embeddings for a tensor of token ids

def embedding_lookup(input_ids,  # tensor of token ids produced by tokenizing the sentences
                     vocab_size,  # total number of tokens in the vocabulary
                     embedding_size=128,  # dimensionality of the word embeddings
                     initializer_range=0.02,  # range (stddev) for the embedding initializer
                     word_embedding_name="word_embeddings",  # string name of the embedding table
                     use_one_hot_embeddings=False):  # True: use one-hot multiplication for the lookup; False: use tf.nn.embedding_lookup() (one-hot tends to be faster on TPU)
                     # tf.nn.embedding_lookup(): TensorFlow helper that gathers, for each token id, the corresponding row of the embedding table

  # This function assumes that the input is of shape [batch_size, seq_length, num_inputs].
  original_dims = input_ids.shape.ndims
  if original_dims == 2:  # if the input is a 2D tensor,
    input_ids = tf.expand_dims(input_ids, axis=[-1])  # expand it to a 3D tensor of shape [batch_size, seq_length, 1]

  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))
  # creates the [vocab_size, embedding_size] embedding table, initialized from a truncated normal distribution

  if original_dims == 3:  # if the original input was already a 3D tensor,
    input_shape = get_shape_list(input_ids)  # input shape as a list
    tf.reshape(input_ids, [-1, input_shape[-1]])  # reshape the input
    output = tf.matmul(input_ids, embedding_table)  # matrix-multiply the input with the embedding table
    output = tf.reshape(output,
                        [input_shape[0], input_shape[1], embedding_size])  # reshape the result back to [batch_size, seq_length, embedding_size]
  else:  # otherwise (the original input was a 2D tensor of token ids),
    if use_one_hot_embeddings:  # if one-hot embeddings are used,
      flat_input_ids = tf.reshape(input_ids, [-1])  # flatten the token ids into a 1D tensor
      one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)  # one-hot vectors with vocab_size elements
      output = tf.matmul(one_hot_input_ids, embedding_table)  # multiply the one-hot matrix by the embedding table
    else:  # if one-hot embeddings are not used,
      output = tf.nn.embedding_lookup(embedding_table, input_ids)  # gather the embedding rows directly with tf.nn.embedding_lookup

    input_shape = get_shape_list(input_ids)  # shape of the token-id input, as a list

    output = tf.reshape(output,
                        input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return output, embedding_table  #float Tensor of shape [batch_size, seq_length, embedding_size]
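
*Equivalence sketch: a small NumPy check of the idea behind the one-hot path above. Multiplying one-hot token ids by the embedding table gives the same rows as indexing the table directly, which is what tf.nn.embedding_lookup does.

import numpy as np

vocab_size, embedding_size = 6, 3
table = np.arange(vocab_size * embedding_size, dtype=np.float32).reshape(vocab_size, embedding_size)
ids = np.array([4, 0, 2])

one_hot = np.eye(vocab_size, dtype=np.float32)[ids]   # [3, vocab_size]
assert np.array_equal(one_hot @ table, table[ids])    # both are [3, embedding_size]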

 

(xii) The embedding_postprocessor function

def embedding_postprocessor(input_tensor,  # tensor of shape [batch_size, seq_length, embedding_size]
                            use_token_type=False,  # whether to add token type (segment) embeddings
                            token_type_ids=None,  # segment information for the input
                            token_type_vocab_size=16,  # number of distinct segment ids
                            token_type_embedding_name="token_type_embeddings",  # variable name for the segment embedding table
                            use_position_embeddings=True,  # whether to add position embeddings
                            position_embedding_name="position_embeddings",  # variable name for the position embedding table
                            initializer_range=0.02,  # range (stddev) for the weight initializer
                            max_position_embeddings=512,  # maximum sequence length
                            dropout_prob=0.1):  # dropout probability applied to the final output tensor
  """Performs various post-processing on a word embedding tensor.
  Returns:
    float tensor with same shape as 'input_tensor'.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)  # input shape as a list, via get_shape_list: [batch_size, seq_length, embedding dim]
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor  # start from the input tensor = the token embedding result ★

  if use_token_type:  # if segment embeddings are used,
    if token_type_ids is None:
      raise ValueError("'token_type_ids' must be specified if"
                       "'use_token_type' is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))  # create a [number of segment ids, embedding dim] table and store it in token_type_table
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)  # one-hot is always used here; vectors have token_type_vocab_size elements
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)  # matrix product of the one-hot matrix and token_type_table
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings  # add to output = token embeddings + segment embeddings ★

  if use_position_embeddings:  # if position embeddings are used,
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)  # assert that seq_length <= max_position_embeddings
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])  # slice the first seq_length rows (all columns) out of the full position embedding table
      num_dims = len(output.shape.as_list())  # rank (number of dimensions) of output

      # Only the last two dimensions are relevant (`seq_length` and `width`), so
      # we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)  # reshape the position embeddings to position_broadcast_shape
      output += position_embeddings  # add the position embeddings to output = token + segment + position embeddings ★
      
  output = layer_norm_and_dropout(output, dropout_prob)
  return output
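
*Shape sketch: a NumPy illustration of what embedding_postprocessor accumulates into output: token embeddings + segment (token type) embeddings + position embeddings, all ending up as one [batch_size, seq_length, width] tensor before layer normalization and dropout.

import numpy as np

batch_size, seq_length, width, n_segments = 2, 8, 16, 2
token_emb = np.random.randn(batch_size, seq_length, width)
token_type_ids = np.zeros((batch_size, seq_length), dtype=int)   # all segment 0 here
token_type_table = np.random.randn(n_segments, width)
segment_emb = token_type_table[token_type_ids]                   # same result as one-hot @ table
position_emb = np.random.randn(seq_length, width)                # rows 0..seq_length-1 of the position table

output = token_emb + segment_emb + position_emb                  # position_emb broadcasts over the batch
print(output.shape)                                              # (2, 8, 16)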

(xiii) The create_attention_mask_from_input_mask function

def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create 3D attention mask from a 2D tensor mask.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank=2)
  to_seq_length = to_shape[1]

  to_mask = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume that `from_tensor` is a mask (although it could be). We
  # don't actually care if we attend *from* padding tokens (only *to* padding)
  # tokens so we create a tensor of all ones.
  #
  # `broadcast_ones` = [batch_size, from_seq_length, 1]
  broadcast_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Here we broadcast along two dimensions to create the mask.
  mask = broadcast_ones * to_mask

  return mask
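
*Broadcast sketch: a tiny NumPy version of the operation above. A [batch_size, to_seq_length] padding mask becomes a [batch_size, from_seq_length, to_seq_length] attention mask in which every query row shares the same "which keys are real tokens" pattern.

import numpy as np

to_mask = np.array([[1, 1, 1, 0]], dtype=np.float32)      # one sequence; the last position is padding
broadcast_ones = np.ones((1, 4, 1), dtype=np.float32)     # [batch_size, from_seq_length, 1]
mask = broadcast_ones * to_mask[:, None, :]               # [batch_size, from_seq_length, to_seq_length]
print(mask[0])
# [[1. 1. 1. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 0.]]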

(xiv) The attention_layer function

def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from 'from_tensor' to 'to_tensor'.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If 'from_tensor' and 'to_tensor' are the same, then
  this is self-attention. Each timestep in 'from_tensor' attends to the
  corresponding sequence in 'to_tensor', and returns a fixed-width vector.

  This function first projects 'from_tensor' into a "query" tensor and
  'to_tensor' into "key" and "value" tensors. These are (effectively) a list
  of tensors of length 'num_attention_heads', where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions in
      the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
      * from_seq_length, num_attention_heads * size_per_head]. If False, the
      output will be of shape [batch_size, from_seq_length, num_attention_heads
      * size_per_head].
    batch_size: (Optional) int. If the input is 2D, this might be the batch size
      of the 3D version of the 'from_tensor' and 'to_tensor'.
    from_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the 'from_tensor'.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the 'to_tensor'.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If 'do_return_2d_tensor' is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """
  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])

    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of 'from_tensor' must match the rank of 'to_tensor'.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if batch_size is None or from_seq_length is None or to_seq_length is None:
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for 'batch_size', 'from_seq_length', and 'to_seq_length' "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = 'from_tensor' sequence length
  #   T = 'to_tensor' sequence length
  #   N = 'num_attention_heads'
  #   H = 'size_per_head'

  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # 'query_layer' = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # 'key_layer' = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # 'value_layer' = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # 'query_layer' = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # 'key_layer' = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  # 'attention_scores' = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # 'attention_mask' = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # 'attention_probs' = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # 'value_layer' = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # 'value_layer' = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # 'context_layer' = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # 'context_layer' = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # 'context_layer' = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # 'context_layer' = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer, attention_probs
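
*Core computation sketch: the heart of attention_layer for a single batch and a single head (B=1, N=1), written in NumPy: scaled dot-product scores, softmax, then a probability-weighted sum of the value vectors.

import numpy as np

F, T, H = 3, 3, 4                                   # from/to lengths and size_per_head
rng = np.random.default_rng(0)
q, k, v = rng.normal(size=(F, H)), rng.normal(size=(T, H)), rng.normal(size=(T, H))

scores = q @ k.T / np.sqrt(H)                       # [F, T] raw attention scores
probs = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)   # softmax over T
context = probs @ v                                 # [F, H] context vectors
print(context.shape)                                # (3, 4)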

(xv) The transformer_model function

def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # We keep the representation as a 2D tensor to avoid re-shaping it back and
  # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
  # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
  # help the optimizer.
  prev_output = reshape_to_matrix(input_tensor)

  attn_maps = []
  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head, probs = attention_layer(
              from_tensor=prev_output,
              to_tensor=prev_output,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)
          attn_maps.append(probs)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + prev_output)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        prev_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        prev_output = dropout(prev_output, hidden_dropout_prob)
        prev_output = layer_norm(prev_output + attention_output)
        all_layer_outputs.append(prev_output)

  attn_maps = tf.stack(attn_maps, 0)
  if do_return_all_layers:
    return tf.stack([reshape_from_matrix(layer, input_shape)
                     for layer in all_layer_outputs], 0), attn_maps
  else:
    return reshape_from_matrix(prev_output, input_shape), attn_maps
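
*Size check: for the Base-size settings used in this project (hidden_size=768, num_attention_heads=12, num_hidden_layers=12), the per-head size works out as below; a non-divisible pair would trigger the ValueError at the top of transformer_model.

hidden_size, num_attention_heads = 768, 12
attention_head_size = hidden_size // num_attention_heads
print(attention_head_size)   # 64
# stacked outputs: all_layer_outputs -> [num_hidden_layers, batch_size, seq_length, hidden_size]
#                  attn_maps         -> [num_hidden_layers, batch_size, num_attention_heads, seq_length, seq_length]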

 

(xvi) The get_shape_list function

def get_shape_list(tensor, expected_rank=None, name=None):  # returns the shape of a tensor (e.g., the segment-info tensor) as a list
  if isinstance(tensor, np.ndarray) or isinstance(tensor, list):  # if the input is a numpy array or a Python list,
    shape = np.array(tensor).shape  # shape after converting to a numpy array (e.g., [dim 1, dim 2, ...])
    if isinstance(expected_rank, six.integer_types):  # if expected_rank is an integer,
      assert len(shape) == expected_rank  # the rank (number of dimensions) must equal expected_rank
    elif expected_rank is not None:  # if expected_rank is not an integer but also not None (e.g., a list of allowed ranks),
      assert len(shape) in expected_rank  # the rank must be one of the values in expected_rank
    return shape  # return the shape and exit
  # Below: the input is a TensorFlow tensor rather than a numpy array or list
  if name is None:  # if no name is given,
    name = tensor.name  # use the tensor's name attribute

  if expected_rank is not None:  # if expected_rank is given,
    assert_rank(tensor, expected_rank, name)  # check the rank with assert_rank

  shape = tensor.shape.as_list()  # static shape of the tensor as a list

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:  # if a dimension is not statically known,
      non_static_indexes.append(index)  # record its index in non_static_indexes

  if not non_static_indexes:  # if every dimension is statically known,
    return shape  # return the static shape and exit

  dyn_shape = tf.shape(tensor)  # otherwise, get the dynamic shape of the tensor
  for index in non_static_indexes:  # for every dimension whose static value is None,
    shape[index] = dyn_shape[index]  # fill it in with the corresponding dynamic shape value
  return shape  # return the (mixed static/dynamic) shape
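
*Behavior sketch: get_shape_list handles numpy/list inputs and TF tensors differently. NumPy inputs return their static shape directly; for TF tensors, statically known dimensions come back as Python ints and unknown ones as dynamic shape tensors (graph mode is assumed for the placeholder below).

print(get_shape_list(np.zeros((2, 3)), expected_rank=2))   # (2, 3)

tf.disable_eager_execution()
x = tf.placeholder(tf.int32, shape=[None, 128])
print(get_shape_list(x, expected_rank=2))                  # [<dynamic batch-size tensor>, 128]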

 

(xvii) The reshape_to_matrix and reshape_from_matrix functions

def reshape_to_matrix(input_tensor):
  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
  ndims = input_tensor.shape.ndims
  if ndims < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
  if ndims == 2:
    return input_tensor

  width = input_tensor.shape[-1]
  output_tensor = tf.reshape(input_tensor, [-1, width])
  return output_tensor

 

def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
  if len(orig_shape_list) == 2:
    return output_tensor

  output_shape = get_shape_list(output_tensor)

  orig_dims = orig_shape_list[0:-1]
  width = output_shape[-1]

  return tf.reshape(output_tensor, orig_dims + [width])

 

(xviii) The assert_rank function

def assert_rank(tensor, expected_rank, name=None):
  if name is None:  # if no name is given,
    name = tensor.name  # use the tensor's name attribute

  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):  # if expected_rank is an integer,
    expected_rank_dict[expected_rank] = True  # build a dictionary of the form {expected_rank: True}
  else:  # otherwise (expected_rank is a list/tuple of allowed ranks),
    for x in expected_rank:
      expected_rank_dict[x] = True  # build a dictionary with each allowed rank x as a key and True as the value

  actual_rank = tensor.shape.ndims  # the tensor's actual rank (number of dimensions)
  if actual_rank not in expected_rank_dict:  # if the actual rank is not among the expected ranks,
    scope_name = tf.get_variable_scope().name
    raise ValueError(
        "For the tensor '%s' in scope '%s', the actual rank "
        "'%d' (shape = %s) is not equal to the expected rank '%s'" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))  # raise an error