Sentiment Analysis Model in Android Studio

I am facing this error in my Android Studio app when I pass the input attributes to the model:
Failed to run interpreter: Internal error: Failed to run on the given Interpreter: tensorflow/lite/kernels/gather.cc:158 indices_has_only_positive_elements was not true.
Node number 17 (GATHER) failed to invoke.
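
From the message it looks like the GATHER node (the embedding lookup) is receiving an index it cannot handle. To narrow this down I use a small debug sketch like the one below before running the model; it only assumes the same `interpreter`, `vocabulary`, and the `int[][]` input-ID array built in my `preprocessInput()` further down (here called `inputIdsTensor`):

// Debug sketch: log the input tensors the model expects and scan my token IDs
// for negative or out-of-range values before running the interpreter.
for (int i = 0; i < interpreter.getInputTensorCount(); i++) {
    org.tensorflow.lite.Tensor t = interpreter.getInputTensor(i);
    Log.d("SentimentAnalysis", "Expected input " + i + ": shape="
            + java.util.Arrays.toString(t.shape()) + ", dtype=" + t.dataType());
}
for (int id : inputIdsTensor[0]) {
    if (id < 0 || id >= vocabulary.size()) {
        Log.w("SentimentAnalysis", "Suspicious token id: " + id);
    }
}

If any IDs show up as suspicious there, that would point at my preprocessing rather than the model itself.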

Code in Android Studio
package com.example.rs4u_v2;

import android.util.Log;

import org.tensorflow.lite.Interpreter;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SentimentAnalysis {
private static final String MODEL_PATH = "rs2u_model_Q_V7.tflite";
private static final int MAX_SEQ_LENGTH = 128; // Adjust based on your model's input size
private Map<String, Integer> vocabulary;
private Interpreter interpreter;

//private BertTokenizer bertTokenizer;
public SentimentAnalysis(Interpreter tflite, Map<String, Integer> vocab) {
    interpreter = tflite;
    vocabulary = vocab;
    if (interpreter == null) {
        Log.e("SentimentAnalysis", "Interpreter is not initialized.");
    }
}

public int analyzeSentiment(int ratings, String texts) {
    if (interpreter == null) {
        Log.e("SentimentAnalysis", "Interpreter is not initialized.");
        return -1;
    } else {
        Log.d("SentimentAnalysis", "Performing sentiment analysis...");
    }

    Object[] inputTensor = preprocessInput(ratings, texts);

    if (inputTensor == null) {
        Log.e("SentimentAnalysis", "Failed to preprocess input.");
        return -1; // or any other error code to indicate failure
    }

    // Debug logging for input tensors
    Log.d("SentimentAnalysis", "Input Tensor 1: " + arrayToString(inputTensor[0]));
    Log.d("SentimentAnalysis", "Input Tensor 2: " + arrayToString(inputTensor[1]));

    Map<Integer, Object> outputTensor = new HashMap<>();
    float[][] probabilities = new float[1][3];  // Assuming 3 classes in your model

    outputTensor.put(0, probabilities);
    Log.d("SentimentAnalysis", "Output Tensor (outputTensor.put(0, probabilities);): " + outputTensor);
    Log.d("SentimentAnalysis", "Input Tensor: " + arrayToStringg(inputTensor));

    //interpreter.runForMultipleInputsOutputs(inputTensor, outputTensor);
    try {
        interpreter.runForMultipleInputsOutputs(inputTensor, outputTensor);
        Log.d("SentimentAnalysis", "interpreter: " + inputTensor);
        Log.d("SentimentAnalysis", "After run the model: " + outputTensor);
    } catch (Exception e) {
        Log.e("SentimentAnalysis", "Failed to run interpreter: " + e.getMessage());
        return -1; // or any other error code to indicate failure
    }

    Log.d("SentimentAnalysis", "Output Tensor: " + arrayToString(probabilities));

    int predictedClass = argmax(probabilities[0]);

    Log.d("SentimentAnalysis", "Predicted Class: " + predictedClass);
    // Map predicted class to human-readable label
    String predictedLabel;
    switch (predictedClass) {
        case 0:
            predictedLabel = "Negative";
            break;
        case 1:
            predictedLabel = "Neutral";
            break;
        case 2:
            predictedLabel = "Positive";
            break;
        default:
            predictedLabel = "Unknown";
    }

    // Display or return the predicted sentiment label
    Log.d("SentimentAnalysis", "Predicted Sentiment: " + predictedLabel);

    // Reset the probabilities array back to its initial state
    for (int i = 0; i < probabilities[0].length; i++) {
        probabilities[0][i] = 0.0f; // Set all probabilities to zero
    }


    return predictedClass;
}

private Object[] preprocessInput(int ratings, String texts) {
    // Tokenize the input text using the provided vocabulary and special tokens
    // Pad or truncate the tokenized sequence to fit MAX_SEQ_LENGTH
    // Construct input tensors (input_ids and attention_mask)

    // Preprocess the input text
    texts = texts.trim(); // Trim leading and trailing whitespace
    texts = texts.replaceAll("\\s+", " "); // Replace multiple consecutive spaces with a single space
    texts = texts.toLowerCase(); // Convert text to lowercase

    List<Integer> inputIds = new ArrayList<>();
    List<Integer> attentionMask = new ArrayList<>();

    // Add rating as the first token (adjusted to fit within vocabulary size)
    inputIds.add(Math.min(ratings, vocabulary.size() - 1));
    attentionMask.add(1); // Set attention mask to 1 for the first token

    // Tokenize the input text
    String[] tokens = texts.split(" "); // Assuming space-separated tokens
    for (String token : tokens) {
        int tokenId = vocabulary.containsKey(token) ? vocabulary.get(token) : 0; // Assign 0 for unknown tokens
        Log.d("SentimentAnalysis", "Token: " + token + ", Token ID: " + tokenId+1);
        inputIds.add(tokenId+1);
        attentionMask.add(1); // Set attention mask to 1 for input tokens
    }

    int checkTokenIdSize = 0;
    // Pad or truncate sequences to fit MAX_SEQ_LENGTH
    while (inputIds.size() < MAX_SEQ_LENGTH) {
        inputIds.add(0); // Pad with zeros
        attentionMask.add(0); // Set attention mask to 0 for padding tokens
        checkTokenIdSize++;
    }
    Log.d("SentimentAnalysis", "Token Size: " + checkTokenIdSize);
    inputIds = inputIds.subList(0, MAX_SEQ_LENGTH);
    attentionMask = attentionMask.subList(0, MAX_SEQ_LENGTH);

    // Convert lists to arrays
    int[] inputIdsArray = inputIds.stream().mapToInt(Integer::intValue).toArray();
    int[] attentionMaskArray = attentionMask.stream().mapToInt(Integer::intValue).toArray();

    // Construct input tensor arrays
    int[][] inputIdsTensor = {inputIdsArray};
    int[][] attentionMaskTensor = {attentionMaskArray};

    return new Object[]{inputIdsTensor, attentionMaskTensor};
}

private List<Integer> tokenizeText(String text) {
    List<Integer> tokenIds = new ArrayList<>();

    for (char c : text.toCharArray()) {
        String character = String.valueOf(c);
        if (vocabulary.containsKey(character)) {
            tokenIds.add(vocabulary.get(character));
        } else {
            // Handle unknown characters if needed
            tokenIds.add(0); // Using 0 as a placeholder for unknown tokens
        }
    }

    return tokenIds;
}


private int argmax(float[] array) {
    int maxIndex = -1;
    float maxVal = Float.NEGATIVE_INFINITY; // Float.MIN_VALUE is the smallest positive float, not the most negative value

    for (int i = 0; i < array.length; i++) {
        if (array[i] > maxVal) {
            maxVal = array[i];
            maxIndex = i;
        }
    }

    return maxIndex;
}

private String arrayToStringg(Object[] array) {
    StringBuilder sb = new StringBuilder("[");
    for (int i = 0; i < array.length; i++) {
        sb.append(array[i]);
        if (i < array.length - 1) {
            sb.append(", ");
        }
    }
    sb.append("]");
    return sb.toString();
}

private String arrayToString(Object array) {
    StringBuilder sb = new StringBuilder("[");
    if (array instanceof int[][]) {
        int[][] arr = (int[][]) array;
        for (int i = 0; i < arr.length; i++) {
            for (int j = 0; j < arr[i].length; j++) {
                sb.append(arr[i][j]);
                if (j < arr[i].length - 1) {
                    sb.append(", ");
                }
            }
            if (i < arr.length - 1) {
                sb.append("; ");
            }
        }
    } else if (array instanceof float[][]) {
        float[][] arr = (float[][]) array;
        for (int i = 0; i < arr.length; i++) {
            for (int j = 0; j < arr[i].length; j++) {
                sb.append(arr[i][j]);
                if (j < arr[i].length - 1) {
                    sb.append(", ");
                }
            }
            if (i < arr.length - 1) {
                sb.append("; ");
            }
        }
    }
    sb.append("]");
    return sb.toString();
}

private String arrayToString(float[] array) {
    StringBuilder sb = new StringBuilder("[");
    for (int i = 0; i < array.length; i++) {
        sb.append(array[i]);
        if (i < array.length - 1) {
            sb.append(", ");
        }
    }
    sb.append("]");
    return sb.toString();
}

}

Code of the model (Python)
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tf2onnx
import onnx

print("start")

# Load Yelp dataset (replace 'path_to_yelp_dataset.csv' with the actual path to your Yelp dataset file)
filename = 'C:/FYP/yelp_review.csv'

# Read CSV file
df = pd.read_csv(filename, encoding='utf-8', on_bad_lines="skip", engine="python")

# Limit the dataset size to 10000 rows
df = df.head(10000)

# Assuming your dataset has 'stars' as the rating and 'text' as the review text
data = {'text': df['text'].values, 'stars': df['stars'].values}

# Map star ratings to sentiment classes
data['sentiment'] = pd.cut(data['stars'], bins=[0, 2, 3, 5], labels=['negative', 'neutral', 'positive'])

# Convert the dictionary to a Pandas DataFrame
df_data = pd.DataFrame(data)

print("start split")

# Split the dataset into training, validation, and test sets
train_data, test_data = train_test_split(df_data, test_size=0.2, random_state=42)

# If you want to further split for validation, you can do the following
valid_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

print("start tokenize")

# BERT tokenizer and model (using bert-base-cased)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3)  # 3 classes: negative, neutral, positive

# Save original tokenizer configuration and vocabulary
tokenizer.save_pretrained('C:/FYP/RS4U_Model/original_tokenizer')
original_vocab = tokenizer.save_vocabulary('C:/FYP/RS4U_Model/original_vocabulary')

print("start create dataset")

# Define a custom dataset
class CustomDataset(tf.keras.utils.Sequence):
    def __init__(self, texts, labels, tokenizer, max_length=128, batch_size=8):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.batch_size = batch_size
        self.label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

    #def __len__(self):
        #return len(self.texts) // self.batch_size
    def __len__(self):
        return (len(self.texts) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        batch_texts = self.texts[idx * self.batch_size: (idx + 1) * self.batch_size]
        batch_labels = self.labels[idx * self.batch_size: (idx + 1) * self.batch_size]

        # Convert string labels to numerical values
        batch_labels = [self.label_mapping[label] for label in batch_labels]

        # Tokenize the batch of texts
        tokens = self.tokenizer.batch_encode_plus(
            batch_texts,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='tf'
        )
        input_ids = tf.ensure_shape(tokens['input_ids'], (None, self.max_length))
        attention_mask = tf.ensure_shape(tokens['attention_mask'], (None, self.max_length))

        return {
            'input_ids': tokens['input_ids'],
            'attention_mask': tokens['attention_mask'],
            'label': tf.convert_to_tensor(batch_labels, dtype=tf.int32)
        }

print("start create dataloader")

# Tokenize and create DataLoader
def create_dataloader(data, tokenizer, max_length=128, batch_size=8):
    dataset = CustomDataset(texts=data['text'], labels=data['sentiment'], tokenizer=tokenizer, max_length=max_length, batch_size=batch_size)
    dataloader = tf.data.Dataset.from_generator(lambda: dataset, output_signature={
        'input_ids': tf.TensorSpec(shape=(None, max_length), dtype=tf.int32),
        'attention_mask': tf.TensorSpec(shape=(None, max_length), dtype=tf.int32),
        'label': tf.TensorSpec(shape=(None,), dtype=tf.int32)
    })
    return dataloader

max_length = 128
batch_size = 8
train_dataloader = create_dataloader(train_data, tokenizer, max_length=max_length, batch_size=batch_size)
valid_dataloader = create_dataloader(valid_data, tokenizer, max_length=max_length, batch_size=batch_size)
test_dataloader = create_dataloader(test_data, tokenizer, max_length=max_length, batch_size=batch_size)

print("start training")

# Training loop
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
criterion = SparseCategoricalCrossentropy(from_logits=True)

num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = tf.data.experimental.cardinality(train_dataloader).numpy()
    #num_batches = len(train_dataloader)

    for batch in train_dataloader:
        inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
        labels = batch['label']

        with tf.GradientTape() as tape:
            outputs = model(inputs, training=True)
            loss = criterion(labels, outputs.logits)

        total_loss += loss.numpy()

        # Backward pass and optimization
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Calculate average training loss
    avg_train_loss = total_loss / num_batches

    # Validation
    all_preds = []
    all_labels = []
    for batch in valid_dataloader:
        inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
        labels = batch['label']

        outputs = model(inputs, training=False)
        preds = tf.argmax(outputs.logits, axis=1)
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

    # Calculate accuracy on validation set
    accuracy_valid = accuracy_score(all_labels, all_preds)

    print(f'Epoch {epoch + 1}/{num_epochs}, Avg Train Loss: {avg_train_loss:.4f}, Validation Accuracy: {accuracy_valid:.4f}')

# Testing on the original model trained using the test set
all_preds_test = []
all_labels_test = []

for batch in test_dataloader:
    inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
    labels = batch['label']

    outputs = model(inputs, training=False)
    preds = tf.argmax(outputs.logits, axis=1)
    all_preds_test.extend(preds.numpy())
    all_labels_test.extend(labels.numpy())

# Calculate accuracy, precision, recall, and F1 score on the test set
accuracy_test = accuracy_score(all_labels_test, all_preds_test)
precision = precision_score(all_labels_test, all_preds_test, average='weighted')
recall = recall_score(all_labels_test, all_preds_test, average='weighted')
f1 = f1_score(all_labels_test, all_preds_test, average='weighted')
print(f'Accuracy (Original Model): {accuracy_test:.4f}')
print(f'Precision (Original Model): {precision:.4f}')
print(f'Recall (Original Model): {recall:.4f}')
print(f'F1 Score (Original Model): {f1:.4f}')

# Confusion matrix on the test set
conf_matrix = confusion_matrix(all_labels_test, all_preds_test)
print('Confusion Matrix (Original Model):')
print('                 Predicted Positive   Predicted Negative')
print(f'Actual Positive  {conf_matrix[0, 0]}   {conf_matrix[0, 1]}')
print(f'Actual Negative  {conf_matrix[1, 0]}   {conf_matrix[1, 1]}')

# Save the results to a text file
results_file = 'C:/FYP/Model/results_original.txt'
with open(results_file, 'w') as file:
    file.write(f'Test Accuracy (Original Model): {accuracy_test:.4f}\n')
    file.write(f'Precision (Original Model): {precision:.4f}\n')
    file.write(f'Recall (Original Model): {recall:.4f}\n')
    file.write(f'F1 Score (Original Model): {f1:.4f}\n')
    file.write('Confusion Matrix (Original Model):\n')
    file.write('                 Predicted Positive   Predicted Negative\n')
    file.write(f'Actual Positive  {conf_matrix[0, 0]}   {conf_matrix[0, 1]}\n')
    file.write(f'Actual Negative  {conf_matrix[1, 0]}   {conf_matrix[1, 1]}\n')

Save the trained model

#model.save(‘C:/FYP/RS4U_Model/tf_model’)
model.save_pretrained(‘C:/FYP/RS4U_Model/tf_model’)

print(“Done 1st saving”)

Save the tokenizer associated with the fine-tuned model

tokenizer.save_pretrained(‘C:/FYP/RS4U_Model/rs4u_tokenizer’)
rs4u_vocab = tokenizer.save_vocabulary(‘C:/FYP/RS4U_Model/rs4u_vocabulary’)

print(“start load the model”)

Load the model and tokenizer

loaded_model = TFBertForSequenceClassification.from_pretrained(‘bert-base-cased’)
tokenizer = BertTokenizer.from_pretrained(‘bert-base-cased’)

print("Convert the model")

# Convert the TensorFlow model to ONNX format
dummy_input = {
    'input_ids': tf.zeros((1, 128), dtype=tf.int32),
    'attention_mask': tf.zeros((1, 128), dtype=tf.int32),
}

# Define the input signature as a list of TensorSpec
input_signature = [
    tf.TensorSpec(shape=(None, 128), dtype=tf.int32, name='input_ids'),
    tf.TensorSpec(shape=(None, 128), dtype=tf.int32, name='attention_mask'),
]

onnx_model, _ = tf2onnx.convert.from_keras(
    loaded_model,
    input_signature=input_signature,  # Pass the input signature here
)

print("save the onnx model")

# Save the ONNX model
onnx.save_model(onnx_model, 'C:/FYP/RS4U_Model/rs4u_model.onnx')

print("Save as TensorFlow Lite model")

# Convert the Keras model to TensorFlow Lite with quantization
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()
with open('C:/FYP/RS4U_Model/rs4u_model.tflite', 'wb') as f:
    f.write(tflite_model)