MindSpore/models
dataset.py
huangbo committed on 2021-09-18 09:22: fixes textcnn bug
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Data operations used in train.py and eval.py
"""
import os
import math
import random
import codecs
from pathlib import Path
import numpy as np
import pandas as pd
import mindspore.dataset as ds
class Generator:
    """Wrap a list of (token_ids, label) pairs so ds.GeneratorDataset can index it."""
def __init__(self, input_list):
self.input_list = input_list
def __getitem__(self, item):
return np.array(self.input_list[item][0], dtype=np.int32), np.array(self.input_list[item][1], dtype=np.int32)
def __len__(self):
return len(self.input_list)
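# Note: ds.GeneratorDataset drives Generator through __getitem__/__len__ and maps each
# returned pair onto the "data" and "label" columns declared in column_names below.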
class DataProcessor:
"""
    base class for dataset preprocessing; subclasses fill in self.train, self.test,
    self.Vocab and self.doConvert before the create_*_dataset methods are called
"""
def get_dict_len(self):
"""
get number of different words in the whole dataset
"""
if self.doConvert:
return len(self.Vocab)
return -1
def collect_weight(self, glove_path, embed_size):
""" collect weight """
vocab_size = self.get_dict_len()
embedding_index = {}
        with open(glove_path, 'r', encoding='utf-8') as f:
for line in f:
values = line.split()
word = values[0]
vec = np.array(values[1:], dtype='float32')
embedding_index[word] = vec
weight_np = np.zeros((vocab_size, embed_size)).astype(np.float32)
for word, vec in embedding_index.items():
try:
index = self.Vocab[word]
except KeyError:
continue
weight_np[index, :] = vec
return weight_np
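    # Assumed GloVe input format (standard plain-text GloVe files): one token per line
    # followed by `embed_size` floats, e.g. "the 0.418 0.24968 -0.41242 ...".
    # Vocabulary words that never appear in the GloVe file keep the all-zero rows
    # created by np.zeros above.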
def create_train_dataset(self, epoch_size, batch_size, collect_weight=False, glove_path='', embed_size=50):
if collect_weight:
weight_np = self.collect_weight(glove_path, embed_size)
np.savetxt('./weight.txt', weight_np)
dataset = ds.GeneratorDataset(source=Generator(input_list=self.train),
column_names=["data", "label"], shuffle=False)
dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
return dataset
def create_test_dataset(self, batch_size):
dataset = ds.GeneratorDataset(source=Generator(input_list=self.test),
column_names=["data", "label"], shuffle=False)
dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
return dataset
class MovieReview(DataProcessor):
"""
preprocess MovieReview dataset
"""
def __init__(self, root_dir, maxlen, split):
"""
input:
root_dir: the root directory path of the MR dataset
maxlen: set the max length of the sentence
            split: fraction of the dataset used for training, e.g. 0.9
"""
self.path = root_dir
self.feelMap = {
'neg': 0,
'pos': 1
}
self.files = []
self.doConvert = False
mypath = Path(self.path)
if not mypath.exists() or not mypath.is_dir():
print("please check the root_dir!")
raise ValueError
# walk through the root_dir
for root, _, filename in os.walk(self.path):
for each in filename:
self.files.append(os.path.join(root, each))
break
        # check that exactly two files (positive / negative) were collected
        if len(self.files) != 2:
            raise ValueError("expected 2 files in root_dir, found {}".format(len(self.files)))
# begin to read data
self.word_num = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")
self.Pos = []
self.Neg = []
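        # re-encode the raw MR files from Latin-1 to UTF-8 in place before parsing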
for filename in self.files:
with codecs.open(filename, 'r', 'Latin1') as f:
ff = f.read()
with codecs.open(filename, 'w', 'utf-8') as file_object:
file_object.write(ff)
self.read_data(filename)
self.PosNeg = self.Pos + self.Neg
self.text2vec(maxlen=maxlen)
self.split_dataset(split=split)
def read_data(self, filePath):
"""
read text into memory
input:
            filePath: path of the file to read
"""
        with open(filePath, 'r', encoding='utf-8') as f:
for sentence in f.readlines():
sentence = sentence.replace('\n', '')\
.replace('"', '')\
.replace('\'', '')\
.replace('.', '')\
.replace(',', '')\
.replace('[', '')\
.replace(']', '')\
.replace('(', '')\
.replace(')', '')\
.replace(':', '')\
.replace('--', '')\
.replace('-', '')\
.replace('\\', '')\
.replace('0', '')\
.replace('1', '')\
.replace('2', '')\
.replace('3', '')\
.replace('4', '')\
.replace('5', '')\
.replace('6', '')\
.replace('7', '')\
.replace('8', '')\
.replace('9', '')\
.replace('`', '')\
.replace('=', '')\
.replace('$', '')\
.replace('/', '')\
.replace('*', '')\
.replace(';', '')\
.replace('<b>', '')\
.replace('%', '')
sentence = sentence.split(' ')
sentence = list(filter(lambda x: x, sentence))
if sentence:
self.word_num += len(sentence)
self.maxlen = self.maxlen if self.maxlen >= len(sentence) else len(sentence)
self.minlen = self.minlen if self.minlen <= len(sentence) else len(sentence)
if 'pos' in filePath:
self.Pos.append([sentence, self.feelMap['pos']])
else:
self.Neg.append([sentence, self.feelMap['neg']])
def text2vec(self, maxlen):
"""
        convert each sentence into a fixed-length list of int word indices
input:
maxlen: max length of the sentence
"""
# Vocab = {word : index}
self.Vocab = dict()
for SentenceLabel in self.Pos+self.Neg:
vector = [0]*maxlen
for index, word in enumerate(SentenceLabel[0]):
if index >= maxlen:
break
if word not in self.Vocab.keys():
self.Vocab[word] = len(self.Vocab)
vector[index] = len(self.Vocab) - 1
else:
vector[index] = self.Vocab[word]
SentenceLabel[0] = vector
self.doConvert = True
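    # Worked example (illustrative): with maxlen=5 and an empty vocabulary, the sentence
    # ["a", "great", "movie"] becomes [0, 1, 2, 0, 0]; unused positions stay 0, which is
    # also the index assigned to the first vocabulary word, so index 0 does double duty
    # as padding.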
def split_dataset(self, split):
"""
        split the dataset into a training set and a test set
        input:
            split: fraction of the dataset used for training; the data is cut into
                   int(1/(1-split)) trunks and one trunk is held out as the test set
"""
trunk_pos_size = math.ceil((1-split)*len(self.Pos))
trunk_neg_size = math.ceil((1-split)*len(self.Neg))
trunk_num = int(1/(1-split))
pos_temp = list()
neg_temp = list()
for index in range(trunk_num):
pos_temp.append(self.Pos[index*trunk_pos_size:(index+1)*trunk_pos_size])
neg_temp.append(self.Neg[index*trunk_neg_size:(index+1)*trunk_neg_size])
self.test = pos_temp.pop(2)+neg_temp.pop(2)
self.train = [i for item in pos_temp+neg_temp for i in item]
random.shuffle(self.train)
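# Example of the trunk split above (illustrative numbers): with split=0.9 the positive and
# negative lists are each cut into 10 trunks, trunk index 2 of each becomes the test set,
# and the remaining trunks are flattened into self.train and shuffled.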
class Subjectivity(DataProcessor):
"""
preprocess Subjectivity dataset
"""
def __init__(self, root_dir, maxlen, split):
self.path = root_dir
self.feelMap = {
'neg': 0,
'pos': 1
}
self.files = []
self.doConvert = False
mypath = Path(self.path)
if not mypath.exists() or not mypath.is_dir():
print("please check the root_dir!")
raise ValueError
# walk through the root_dir
for root, _, filename in os.walk(self.path):
for each in filename:
self.files.append(os.path.join(root, each))
break
# begin to read data
self.word_num = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")
self.Pos = []
self.Neg = []
for filename in self.files:
self.read_data(filename)
self.PosNeg = self.Pos + self.Neg
self.text2vec(maxlen=maxlen)
self.split_dataset(split=split)
def read_data(self, filePath):
"""
read text into memory
input:
            filePath: path of the file to read
"""
with open(filePath, 'r', encoding="ISO-8859-1") as f:
for sentence in f.readlines():
sentence = sentence.replace('\n', '')\
.replace('"', '')\
.replace('\'', '')\
.replace('.', '')\
.replace(',', '')\
.replace('[', '')\
.replace(']', '')\
.replace('(', '')\
.replace(')', '')\
.replace(':', '')\
.replace('--', '')\
.replace('-', '')\
.replace('\\', '')\
.replace('0', '')\
.replace('1', '')\
.replace('2', '')\
.replace('3', '')\
.replace('4', '')\
.replace('5', '')\
.replace('6', '')\
.replace('7', '')\
.replace('8', '')\
.replace('9', '')\
.replace('`', '')\
.replace('=', '')\
.replace('$', '')\
.replace('/', '')\
.replace('*', '')\
.replace(';', '')\
.replace('<b>', '')\
.replace('%', '')
sentence = sentence.split(' ')
sentence = list(filter(lambda x: x, sentence))
if sentence:
self.word_num += len(sentence)
self.maxlen = self.maxlen if self.maxlen >= len(sentence) else len(sentence)
self.minlen = self.minlen if self.minlen <= len(sentence) else len(sentence)
if 'quote' in filePath:
self.Pos.append([sentence, self.feelMap['pos']])
elif 'plot' in filePath:
self.Neg.append([sentence, self.feelMap['neg']])
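    # Assumption about the SUBJ dataset layout: the file whose name contains 'quote' holds
    # subjective review snippets (mapped to 'pos') and the file whose name contains 'plot'
    # holds objective plot sentences (mapped to 'neg').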
def text2vec(self, maxlen):
"""
        convert each sentence into a fixed-length list of int word indices
input:
maxlen: max length of the sentence
"""
# Vocab = {word : index}
self.Vocab = dict()
for SentenceLabel in self.Pos+self.Neg:
vector = [0]*maxlen
for index, word in enumerate(SentenceLabel[0]):
if index >= maxlen:
break
if word not in self.Vocab.keys():
self.Vocab[word] = len(self.Vocab)
vector[index] = len(self.Vocab) - 1
else:
vector[index] = self.Vocab[word]
SentenceLabel[0] = vector
self.doConvert = True
def split_dataset(self, split):
"""
        split the dataset into a training set and a test set
        input:
            split: fraction of the dataset used for training; the data is cut into
                   int(1/(1-split)) trunks and one trunk is held out as the test set
"""
trunk_pos_size = math.ceil((1-split)*len(self.Pos))
trunk_neg_size = math.ceil((1-split)*len(self.Neg))
trunk_num = int(1/(1-split))
pos_temp = list()
neg_temp = list()
for index in range(trunk_num):
pos_temp.append(self.Pos[index*trunk_pos_size:(index+1)*trunk_pos_size])
neg_temp.append(self.Neg[index*trunk_neg_size:(index+1)*trunk_neg_size])
self.test = pos_temp.pop(2)+neg_temp.pop(2)
self.train = [i for item in pos_temp+neg_temp for i in item]
random.shuffle(self.train)
class SST2(DataProcessor):
"""
preprocess SST2 dataset
"""
def __init__(self, root_dir, maxlen, split):
self.path = root_dir
self.files = []
self.train = []
self.test = []
self.doConvert = False
mypath = Path(self.path)
if not mypath.exists() or not mypath.is_dir():
print("please check the root_dir!")
raise ValueError
# walk through the root_dir
for root, _, filename in os.walk(self.path):
for each in filename:
self.files.append(os.path.join(root, each))
break
# begin to read data
self.word_num = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")
for filename in self.files:
if 'train' in filename or 'dev' in filename:
with codecs.open(filename, 'r') as f:
ff = f.read()
with codecs.open(filename, 'w', 'utf-8') as file_object:
file_object.write(ff)
self.read_data(filename)
self.text2vec(maxlen=maxlen)
self.split_dataset(split=split)
def read_data(self, filePath):
"""
read text into memory
input:
            filePath: path of the file to read
"""
df = pd.read_csv(filePath, delimiter='\t')
for sentence, label in zip(df['sentence'], df['label']):
sentence = sentence.replace('\n', '')\
.replace('"', '')\
.replace('\'', '')\
.replace('.', '')\
.replace(',', '')\
.replace('[', '')\
.replace(']', '')\
.replace('(', '')\
.replace(')', '')\
.replace(':', '')\
.replace('--', '')\
.replace('-', '')\
.replace('\\', '')\
.replace('0', '')\
.replace('1', '')\
.replace('2', '')\
.replace('3', '')\
.replace('4', '')\
.replace('5', '')\
.replace('6', '')\
.replace('7', '')\
.replace('8', '')\
.replace('9', '')\
.replace('`', '')\
.replace('=', '')\
.replace('$', '')\
.replace('/', '')\
.replace('*', '')\
.replace(';', '')\
.replace('<b>', '')\
.replace('%', '')
sentence = sentence.split(' ')
sentence = list(filter(lambda x: x, sentence))
if sentence:
self.word_num += len(sentence)
self.maxlen = self.maxlen if self.maxlen >= len(sentence) else len(sentence)
self.minlen = self.minlen if self.minlen <= len(sentence) else len(sentence)
if 'train' in filePath:
self.train.append([sentence, label])
elif 'dev' in filePath:
self.test.append([sentence, label])
def text2vec(self, maxlen):
"""
        convert each sentence into a fixed-length list of int word indices
input:
maxlen: max length of the sentence
"""
# Vocab = {word : index}
self.Vocab = dict()
for SentenceLabel in self.train+self.test:
vector = [0]*maxlen
for index, word in enumerate(SentenceLabel[0]):
if index >= maxlen:
break
if word not in self.Vocab.keys():
self.Vocab[word] = len(self.Vocab)
vector[index] = len(self.Vocab) - 1
else:
vector[index] = self.Vocab[word]
SentenceLabel[0] = vector
self.doConvert = True
def split_dataset(self, split):
"""
        SST2 already ships separate train and dev files, so the split itself is done in
        read_data; this method only shuffles the training set (`split` is kept for
        interface compatibility with the other DataProcessor subclasses)
"""
random.shuffle(self.train)
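# A minimal usage sketch, not part of the original module: the root_dir path, maxlen,
# split and batch_size values below are illustrative assumptions, not the values used
# by the model's released config.
if __name__ == '__main__':
    # build the MovieReview pipeline from a hypothetical local copy of the MR dataset
    instance = MovieReview(root_dir='./data/rt-polaritydata', maxlen=51, split=0.9)
    train_ds = instance.create_train_dataset(epoch_size=1, batch_size=64)
    test_ds = instance.create_test_dataset(batch_size=64)
    print("vocab size:", instance.get_dict_len())
    print("train batches:", train_ds.get_dataset_size())
    print("test batches:", test_ds.get_dataset_size())
    # peek at one batch; data is int32 of shape (batch_size, maxlen), label of shape (batch_size,)
    for data, label in train_ds.create_tuple_iterator():
        print("data shape:", data.shape, "label shape:", label.shape)
        break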