ESCI Challenge for Improving Product Search
Task 2 Dataset EDA
A simple visualization of the Task 2 data
In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
In [2]:
product_df = pd.read_csv('product_catalogue-v0.1.csv')
test_df = pd.read_csv('test_public-v0.1.csv')
train_df = pd.read_csv('train-v0.1.csv')
train_df = train_df.merge(product_df, left_on=['product_id', 'query_locale'],
                          right_on=['product_id', 'product_locale'], how='left')
test_df = test_df.merge(product_df, left_on=['product_id', 'query_locale'],
                        right_on=['product_id', 'product_locale'], how='left')
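The merge is a left join on (product_id, locale), so any query–product pair whose product is missing from the catalogue for that locale comes back with NaN in the product columns. A quick sanity check (a sketch; it counts rows where the right-side merge key product_locale is empty, which only happens when no catalogue row matched):
In [ ]:
# Sketch: rows that did not find a catalogue match in the left join above.
missing_train = train_df['product_locale'].isnull().sum()
missing_test = test_df['product_locale'].isnull().sum()
print('Train rows without a catalogue match:', missing_train)
print('Test rows without a catalogue match:', missing_test)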
EDA ¶
In [3]:
print("Total number of train data = " , train_df.shape[0])
print("Total number of test data = " , test_df.shape[0])
print("Total number of product = " , product_df.shape[0])
Train Data Sample¶
In [4]:
row = dict(train_df.iloc[0, :])
for name, value in row.items():
    print(f'{name}: {value}')
Test Data Sample¶
In [5]:
row = dict(test_df.iloc[0, :])
for name, value in row.items():
    print(f'{name}: {value}')
Quick view of Train Dataframe¶
In [6]:
train_df.head()
Out[6]:
In [7]:
lens = lambda x: len(str(x))
length_train_df = pd.DataFrame()
for name in train_df.columns:
    if ('product' in name or 'query' in name) and 'locale' not in name:
        length_train_df[f'{name}_length'] = train_df[name].apply(lens)
round(length_train_df.describe(), 2)
Out[7]:
Label Count¶
In [8]:
plt.bar(
    x=np.unique(train_df["esci_label"]),
    height=[list(train_df["esci_label"]).count(i) for i in np.unique(train_df["esci_label"])],
)
plt.xlabel("Classes")
plt.ylabel("Number of Rows")
plt.title('ESCI Label Distribution')
plt.show()
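The overall class balance may hide differences between locales. A small sketch with pd.crosstab, normalised within each locale:
In [ ]:
# Sketch: ESCI label distribution broken down by locale, as row-normalised shares.
label_by_locale = pd.crosstab(train_df['query_locale'], train_df['esci_label'], normalize='index')
round(label_by_locale, 3)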
Null Values¶
In [9]:
train_df.isnull().sum()
Out[9]:
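If any of the text columns contain NaN (see the counts above), note that str(NaN) later becomes the literal string 'nan', which the tokenizer will encode as an ordinary token. One option is to fill those columns with empty strings first; a sketch (not applied to train_df here, with the column list mirroring the fields concatenated further below):
In [ ]:
# Sketch: fill NaN text fields with empty strings so that str() conversion
# later does not produce the literal token 'nan'.
text_cols = ['product_title', 'product_bullet_point', 'product_brand',
             'product_color_name', 'product_description']
filled = train_df[text_cols].fillna('')
filled.isnull().sum()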
STRING FIELD DISTRIBUTION¶
Length¶
In [10]:
lens = lambda x: len(str(x))
length_train_df = pd.DataFrame()
for name in train_df.columns:
    if ('product' in name or 'query' in name) and ('locale' not in name and 'id' not in name):
        length_train_df[f'{name}_length'] = train_df[name].apply(lens)
In [11]:
for name in length_train_df.columns:
    plt.hist(length_train_df[name], bins=50)
    plt.xlabel("length")
    plt.ylabel("count")
    plt.title(name)
    plt.show()
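Fields such as product_description tend to be heavily long-tailed, so a few upper quantiles are often easier to read than the histograms alone. A small sketch:
In [ ]:
# Sketch: upper quantiles of the character lengths, useful when choosing a
# truncation length for the text fields.
round(length_train_df.quantile([0.5, 0.9, 0.95, 0.99]), 1)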
Token Length In A Pretrained Model¶
Using bert-base-multilingual-cased as an example¶
In [12]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
In [13]:
df = train_df.sample(100000, random_state=2022)
token_len = []
bar = tqdm(df.iterrows(), total=df.shape[0])
for i, row in bar:
    # Concatenate the query and product fields with the tokenizer's separator token.
    text = [row['query'], row['product_title'], row['product_bullet_point'], row['product_brand'],
            row['product_color_name'], row['product_locale'], row['product_description']]
    text = [str(t) for t in text]
    text = f' {tokenizer.sep_token} '.join(text)
    encoded_text = tokenizer.encode_plus(text, add_special_tokens=False)
    input_ids = encoded_text["input_ids"]
    token_len.append(len(input_ids))
In [14]:
plt.hist(token_len,bins=100)
plt.show()
In [15]:
pd.Series(token_len).describe()
Out[15]:
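bert-base-multilingual-cased accepts at most 512 tokens per sequence, so the share of sampled rows that would be truncated is worth knowing. A minimal sketch over the token lengths computed above:
In [ ]:
# Sketch: fraction of sampled rows whose concatenated text exceeds the
# 512-token limit of bert-base-multilingual-cased.
token_len_s = pd.Series(token_len)
print('share over 512 tokens:', (token_len_s > 512).mean())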
In [16]:
import re

def cleanText(text):
    # Strip HTML tags such as <b>...</b> from the text fields.
    pattern = re.compile(r'<[^>]+>', re.S)
    text = pattern.sub('', text)
    return text

token_len = []
bar = tqdm(df.iterrows(), total=df.shape[0])
for i, row in bar:
    text = [row['query'], row['product_title'], row['product_bullet_point'], row['product_brand'],
            row['product_color_name'], row['product_locale'], row['product_description']]
    text = [cleanText(str(t)) for t in text]
    text = f' {tokenizer.sep_token} '.join(text)
    encoded_text = tokenizer.encode_plus(text, add_special_tokens=False)
    input_ids = encoded_text["input_ids"]
    token_len.append(len(input_ids))
In [18]:
plt.hist(token_len,bins=100)
plt.show()
In [19]:
pd.Series(token_len).describe()
Out[19]:
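When the concatenated text is actually fed to a model, it would typically be encoded with special tokens and truncation enabled. A minimal sketch for a single cleaned example (max_length=512 is an assumption matching this model's limit, not something fixed above):
In [ ]:
# Sketch: encode one cleaned example with truncation and special tokens,
# as one might do when preparing model inputs (max_length=512 assumed).
row = df.iloc[0]
fields = [row['query'], row['product_title'], row['product_bullet_point'], row['product_brand'],
          row['product_color_name'], row['product_locale'], row['product_description']]
text = f' {tokenizer.sep_token} '.join(cleanText(str(f)) for f in fields)
encoded = tokenizer.encode_plus(text, add_special_tokens=True, truncation=True, max_length=512)
len(encoded['input_ids'])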