# Source code for claf.model.token_classification.mixin


from pathlib import Path
import logging

import numpy as np
import torch
import pycm
from pycm.pycm_obj import pycmVectorError

from claf.decorator import arguments_required
import claf.utils as common_utils
from claf.model import cls_utils
from claf.metric.classification import macro_f1, macro_precision, macro_recall
from seqeval.metrics import accuracy_score as conlleval_accuracy
from seqeval.metrics import f1_score as conlleval_f1

logger = logging.getLogger(__name__)


class TokenClassification:
    """
    Token Classification Mixin Class

    Provides prediction, metric and logging helpers for token-level
    (sequence tagging) models. The methods below read the following
    attributes from the object they are mixed into:

        - self._dataset: used for get_id(), get_ground_truth(), tag_idx2text,
          helper and get_tag_texts_with_idxs()
        - self._log_dir: directory under which "predictions/" files are written
        - self._train_counter: object exposing get_display()
        - self.training: truthy while training, falsy otherwise

    write_predictions() additionally delegates to the next class in the MRO,
    so the host class must inherit a write_predictions() implementation.
    """

    def make_predictions(self, output_dict):
        """
        Make predictions with model's output_dict

        * Args:
            output_dict: model's output dictionary consisting of
                - sequence_embed: embedding vector of the sequence
                - tag_logits: representing unnormalized log probabilities of the tag

                - tag_idxs: target tag idxs
                - data_idx: data idx
                - loss: a scalar loss to be optimized

        * Returns:
            predictions: prediction dictionary consisting of
                - key: 'id' (sequence id)
                - value: dictionary consisting of
                    - tag_idxs
        """

        data_indices = output_dict["data_idx"]

        # One argmax per sequence: tag_logits is iterated sequence by sequence.
        pred_tag_idxs = [
            torch.argmax(tag_logit, dim=-1).tolist() for tag_logit in output_dict["tag_logits"]
        ]

        # Iterating the index tensor directly yields one element per example,
        # so no intermediate list is needed.
        return {
            self._dataset.get_id(data_idx.item()): {"tag_idxs": tag_idxs}
            for data_idx, tag_idxs in zip(data_indices, pred_tag_idxs)
        }

    @arguments_required(["sequence"])
    def predict(self, output_dict, arguments, helper):
        """
        Inference by raw_feature

        * Args:
            output_dict: model's output dictionary consisting of
                - sequence_embed: embedding vector of the sequence
                - tag_logits: representing unnormalized log probabilities of the tags.
            arguments: arguments dictionary consisting of user_input
            helper: dictionary to get the classification result, consisting of
                - tag_idx2text: dictionary converting tag_idx to tag_text

        * Returns: output dict (dict) consisting of
            - tag_logits: representing unnormalized log probabilities of the tags
            - tag_idxs: predicted tag idxs
            - tag_texts: predicted tag texts
            - tag_dict: result of cls_utils.get_tag_dict(sequence, tag_texts)
        """

        sequence = arguments["sequence"]

        # Only the first entry of tag_logits is used
        # (presumably a batch holding a single sequence -- confirm with caller).
        tag_logits = output_dict["tag_logits"][0]
        tag_idxs = [tag_logit.argmax(dim=-1) for tag_logit in tag_logits]

        idx2text = helper["tag_idx2text"]
        tag_texts = [idx2text[tag_idx.item()] for tag_idx in tag_idxs]

        return {
            "tag_logits": tag_logits,
            "tag_idxs": tag_idxs,
            "tag_texts": tag_texts,
            "tag_dict": cls_utils.get_tag_dict(sequence, tag_texts),
        }

    def make_metrics(self, predictions):
        """
        Make metrics with prediction dictionary

        * Args:
            predictions: prediction dictionary consisting of
                - key: 'id' (sequence id)
                - value: dictionary consisting of
                    - tag_idxs

        * Returns:
            metrics: metric dictionary consisting of
                - 'accuracy': sequence level accuracy
                - 'tag_accuracy': tag level accuracy
                - 'macro_f1': tag prediction macro(unweighted mean) f1
                - 'macro_precision': tag prediction macro(unweighted mean) precision
                - 'macro_recall': tag prediction macro(unweighted mean) recall
                - 'conlleval_accuracy': seqeval accuracy_score over the tag texts
                - 'conlleval_f1': seqeval f1_score over the tag texts

        * Raises:
            pycmVectorError: any pycm vector error other than the
                "fewer than 2 classes" case, which is handled by returning
                all-ones metrics.
        """

        idx2text = self._dataset.tag_idx2text

        pred_tags = []
        target_tags = []
        accurate_sequence = []

        for data_id, pred in predictions.items():
            target = self._dataset.get_ground_truth(data_id)
            pred_tag_idxs = pred["tag_idxs"]
            target_tag_idxs = target["tag_idxs"]

            pred_tags.append([idx2text[tag_idx] for tag_idx in pred_tag_idxs])
            target_tags.append([idx2text[tag_idx] for tag_idx in target_tag_idxs])

            # array_equal returns False for sequences of different length,
            # whereas an elementwise `==` on mismatched shapes errors out.
            accurate_sequence.append(1 if np.array_equal(target_tag_idxs, pred_tag_idxs) else 0)

        flat_pred_tags = list(common_utils.flatten(pred_tags))
        flat_target_tags = list(common_utils.flatten(target_tags))

        # confusion matrix
        try:
            pycm_obj = pycm.ConfusionMatrix(
                actual_vector=flat_target_tags, predict_vector=flat_pred_tags
            )
        except pycmVectorError as e:
            # NOTE: this relies on the exact message text raised by pycm.
            if str(e) == "Number of the classes is lower than 2":
                logger.warning("Number of tags in the batch is 1. Sanity check is highly recommended.")
                return {
                    "accuracy": 1.,
                    "tag_accuracy": 1.,

                    "macro_f1": 1.,
                    "macro_precision": 1.,
                    "macro_recall": 1.,

                    "conlleval_accuracy": 1.,
                    "conlleval_f1": 1.,
                }
            raise

        self.write_predictions(
            {"target": flat_target_tags, "predict": flat_pred_tags}, pycm_obj=pycm_obj
        )

        sequence_accuracy = sum(accurate_sequence) / len(accurate_sequence)

        return {
            "accuracy": sequence_accuracy,
            "tag_accuracy": pycm_obj.Overall_ACC,

            "macro_f1": macro_f1(pycm_obj),
            "macro_precision": macro_precision(pycm_obj),
            "macro_recall": macro_recall(pycm_obj),

            "conlleval_accuracy": conlleval_accuracy(target_tags, pred_tags),
            "conlleval_f1": conlleval_f1(target_tags, pred_tags),
        }

    def write_predictions(self, predictions, file_path=None, is_dict=True, pycm_obj=None):
        """
        Override write_predictions() in ModelBase to log confusion matrix

        * Args:
            predictions: predictions forwarded unchanged to the parent implementation
            file_path: forwarded unchanged to the parent implementation
            is_dict: forwarded unchanged to the parent implementation
            pycm_obj: optional pycm confusion matrix object; when given, its
                statistics and confusion matrix are saved under
                "<log_dir>/predictions/"
        """

        super().write_predictions(predictions, file_path=file_path, is_dict=is_dict)

        if pycm_obj is None:
            return

        data_type = "train" if self.training else "valid"
        predictions_dir = Path(self._log_dir) / "predictions"
        file_prefix = f"predictions-{data_type}-{self._train_counter.get_display()}"

        pycm_obj.save_csv(str(predictions_dir / f"{file_prefix}-stats"))
        cls_utils.write_confusion_matrix_to_csv(
            str(predictions_dir / f"{file_prefix}-confusion_matrix"), pycm_obj
        )

    def print_examples(self, index, inputs, predictions):
        """
        Print evaluation examples

        * Args:
            index: data index
            inputs: mini-batch inputs
            predictions: prediction dictionary consisting of
                - key: 'id' (sequence id)
                - value: dictionary consisting of
                    - tag_idxs

        * Returns:
            print(Sequence, Target Tags, Target Slots, Predicted Tags, Predicted Slots)
        """

        data_idx = inputs["labels"]["data_idx"][index].item()
        data_id = self._dataset.get_id(data_idx)

        example = self._dataset.helper["examples"][data_id]
        sequence = example["sequence"]
        target_tag_texts = example["tag_texts"]

        pred_tag_idxs = predictions[data_id]["tag_idxs"]
        pred_tag_texts = self._dataset.get_tag_texts_with_idxs(pred_tag_idxs)

        print()
        print("- Sequence:", sequence)
        print("- Target:")
        print("    Tags:", target_tag_texts)
        print("    (Slots)", cls_utils.get_tag_dict(sequence, target_tag_texts))
        print("- Predict:")
        print("    Tags:", pred_tag_texts)
        print("    (Slots)", cls_utils.get_tag_dict(sequence, pred_tag_texts))
        print()