!pip install parsivar
!pip install git+https://github.com/RoboEpics/roboepics-client.git
from roboepics_client.roboepics_client import RoboEpicsClient
problem_id = 4
problem_enter_id = None # fill this value with your id
roboepics_client = RoboEpicsClient(problem_id, problem_enter_id)
Downloading and extracting data¶
We used a permanent Google Drive link for faster download.
!gdown --id 1uYwzBe8nLhOQ2Q3rCScEXvljLkQqJrHc
!7z x data.7z
!ls
Utils¶
from __future__ import unicode_literals
import collections
import gc
import json
import re
import os
import numpy as np
import pandas as pd
import parsivar
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
def read_json(path, n_lines_to_read=None):
"""
read json file line by line iteratively (generator function).
use this function to read json files when you have memory limitations.
"""
with open(path) as f:
for i, line in enumerate(tqdm(f)):
if n_lines_to_read == i:
break
yield json.loads(line)
parsivar_normalizer = parsivar.Normalizer(statistical_space_correction=True)
char_mappings = {
"٥": "5",
"А": "a",
"В": "b",
"Е": "e",
"Н": "h",
"Р": "P",
"С": "C",
"Т": "T",
"а": "a",
"г": "r",
"е": "e",
"к": "k",
"м": "m",
"о": "o",
"р": "p",
"ڈ": "د",
"ڇ": "چ",
# Persian digits (will be replaced by English ones)
"۰": "0",
"۱": "1",
"۲": "2",
"۳": "3",
"۴": "4",
"۵": "5",
"۶": "6",
"۷": "7",
"۸": "8",
"۹": "9",
".": ".",
# Arabic digits (will be replaced by English ones)
"٠": "0",
"١": "1",
"٢": "2",
"٣": "3",
"٤": "4",
"٥": "5",
"٦": "6",
"٧": "7",
"٨": "8",
"٩": "9",
# Special Arabic characters (will be replaced by Persian equivalents)
"ك": "ک",
"ى": "ی",
"ي": "ی",
"ؤ": "و",
"ئ": "ی",
"إ": "ا",
"أ": "ا",
"آ": "ا",
"ة": "ه",
"ء": "ی",
# French accented letters (will be replaced by English ones)
"à": "a",
"ä": "a",
"ç": "c",
"é": "e",
"è": "e",
"ê": "e",
"ë": "e",
"î": "i",
"ï": "i",
"ô": "o",
"ù": "u",
"û": "u",
"ü": "u",
# Comma (will be replaced by a dot, for floating-point numbers)
",": ".",
# Ampersand (will be replaced by the word "and")
"&": " and ",
# Diacritics and tatvil (will be removed)
"ّ": "", # tashdid
"َ": "", # a
"ِ": "", # e
"ُ": "", # o
"ـ": "", # tatvil
# Spaces
"": "", # 0x9E -> ZERO WIDTH JOINER
"": " ", # 0x9D -> ZERO WIDTH NON-JOINER
# Arabic Presentation Forms-A (will be replaced by Persian equivalents)
"ﭐ": "ا",
"ﭑ": "ا",
"ﭖ": "پ",
"ﭗ": "پ",
"ﭘ": "پ",
"ﭙ": "پ",
"ﭞ": "ت",
"ﭟ": "ت",
"ﭠ": "ت",
"ﭡ": "ت",
"ﭺ": "چ",
"ﭻ": "چ",
"ﭼ": "چ",
"ﭽ": "چ",
"ﮊ": "ژ",
"ﮋ": "ژ",
"ﮎ": "ک",
"ﮏ": "ک",
"ﮐ": "ک",
"ﮑ": "ک",
"ﮒ": "گ",
"ﮓ": "گ",
"ﮔ": "گ",
"ﮕ": "گ",
"ﮤ": "ه",
"ﮥ": "ه",
"ﮦ": "ه",
"ﮪ": "ه",
"ﮫ": "ه",
"ﮬ": "ه",
"ﮭ": "ه",
"ﮮ": "ی",
"ﮯ": "ی",
"ﮰ": "ی",
"ﮱ": "ی",
"ﯼ": "ی",
"ﯽ": "ی",
"ﯾ": "ی",
"ﯿ": "ی",
# Arabic Presentation Forms-B (will be removed)
"ﹰ": "",
"ﹱ": "",
"ﹲ": "",
"ﹳ": "",
"ﹴ": "",
"": "",
"ﹶ": "",
"ﹷ": "",
"ﹸ": "",
"ﹹ": "",
"ﹺ": "",
"ﹻ": "",
"ﹼ": "",
"ﹽ": "",
"ﹾ": "",
"ﹿ": "",
# Arabic Presentation Forms-B (will be replaced by Persian equivalents)
"ﺀ": "ی",
"ﺁ": "ا",
"ﺂ": "ا",
"ﺃ": "ا",
"ﺄ": "ا",
"ﺅ": "و",
"ﺆ": "و",
"ﺇ": "ا",
"ﺈ": "ا",
"ﺉ": "ی",
"ﺊ": "ی",
"ﺋ": "ی",
"ﺌ": "ی",
"ﺍ": "ا",
"ﺎ": "ا",
"ﺏ": "ب",
"ﺐ": "ب",
"ﺑ": "ب",
"ﺒ": "ب",
"ﺓ": "ه",
"ﺔ": "ه",
"ﺕ": "ت",
"ﺖ": "ت",
"ﺗ": "ت",
"ﺘ": "ت",
"ﺙ": "ث",
"ﺚ": "ث",
"ﺛ": "ث",
"ﺜ": "ث",
"ﺝ": "ج",
"ﺞ": "ج",
"ﺟ": "ج",
"ﺠ": "ج",
"ﺡ": "ح",
"ﺢ": "ح",
"ﺣ": "ح",
"ﺤ": "ح",
"ﺥ": "خ",
"ﺦ": "خ",
"ﺧ": "خ",
"ﺨ": "خ",
"ﺩ": "د",
"ﺪ": "د",
"ﺫ": "ذ",
"ﺬ": "ذ",
"ﺭ": "ر",
"ﺮ": "ر",
"ﺯ": "ز",
"ﺰ": "ز",
"ﺱ": "س",
"ﺲ": "س",
"ﺳ": "س",
"ﺴ": "س",
"ﺵ": "ش",
"ﺶ": "ش",
"ﺷ": "ش",
"ﺸ": "ش",
"ﺹ": "ص",
"ﺺ": "ص",
"ﺻ": "ص",
"ﺼ": "ص",
"ﺽ": "ض",
"ﺾ": "ض",
"ﺿ": "ض",
"ﻀ": "ض",
"ﻁ": "ط",
"ﻂ": "ط",
"ﻃ": "ط",
"ﻄ": "ط",
"ﻅ": "ظ",
"ﻆ": "ظ",
"ﻇ": "ظ",
"ﻈ": "ظ",
"ﻉ": "ع",
"ﻊ": "ع",
"ﻋ": "ع",
"ﻌ": "ع",
"ﻍ": "غ",
"ﻎ": "غ",
"ﻏ": "غ",
"ﻐ": "غ",
"ﻑ": "ف",
"ﻒ": "ف",
"ﻓ": "ف",
"ﻔ": "ف",
"ﻕ": "ق",
"ﻖ": "ق",
"ﻗ": "ق",
"ﻘ": "ق",
"ﻙ": "ک",
"ﻚ": "ک",
"ﻛ": "ک",
"ﻜ": "ک",
"ﻝ": "ل",
"ﻞ": "ل",
"ﻟ": "ل",
"ﻠ": "ل",
"ﻡ": "م",
"ﻢ": "م",
"ﻣ": "م",
"ﻤ": "م",
"ﻥ": "ن",
"ﻦ": "ن",
"ﻧ": "ن",
"ﻨ": "ن",
"ﻩ": "ه",
"ﻪ": "ه",
"ﻫ": "ه",
"ﻬ": "ه",
"ﻭ": "و",
"ﻮ": "و",
"ﻯ": "ی",
"ﻰ": "ی",
"ﻱ": "ی",
"ﻲ": "ی",
"ﻳ": "ی",
"ﻴ": "ی",
"ﻵ": "لا",
"ﻶ": "لا",
"ﻷ": "لا",
"ﻸ": "لا",
"ﻹ": "لا",
"ﻺ": "لا",
"ﻻ": "لا",
"ﻼ": "لا",
}
valid_chars = [
" ",
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"A",
"B",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"J",
"K",
"L",
"M",
"N",
"O",
"P",
"Q",
"R",
"S",
"T",
"U",
"V",
"W",
"X",
"Y",
"Z",
"a",
"b",
"c",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"q",
"r",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"آ",
"ئ",
"ا",
"ب",
"ت",
"ث",
"ج",
"ح",
"خ",
"د",
"ذ",
"ر",
"ز",
"س",
"ش",
"ص",
"ض",
"ط",
"ظ",
"ع",
"غ",
"ف",
"ق",
"ل",
"م",
"ن",
"ه",
"و",
"پ",
"چ",
"ژ",
"ک",
"گ",
"ی",
]
def _replace_rep(t):
"Replace repetitions at the character level: ccc -> c"
def __replace_rep(m):
c, cc = m.groups()
return f"{c}"
re_rep = re.compile(r"(\S)(\1{2,})")
return re_rep.sub(__replace_rep, t)
def _replace_wrep(t):
"Replace word repetitions: word word word -> word"
def __replace_wrep(m):
c, cc = m.groups()
return f"{c}"
re_wrep = re.compile(r"(\b\w+\W+)(\1{2,})")
return re_wrep.sub(__replace_wrep, t)
def _normalize_text(x):
"""normalize a sentence"""
x = str(x)
x = parsivar_normalizer.normalize(x) # apply `parsivar` normalizations
x = re.sub(r"[\u200c\r\n]", " ", x) # remove half space and new line characters
x = x.lower()
x = "".join(
[char_mappings[xx] if xx in char_mappings else xx for xx in x]
) # substitute bad characters with appropriate ones
x = re.sub(
r"[^{}]".format("".join(valid_chars)), " ", x
) # keep only valid characters and replace the rest with a space
x = re.sub(r"[a-z]+", r" \g<0> ", x) # put spaces around latin words
x = re.sub(r"[0-9]+", r" \g<0> ", x) # put spaces around numbers
x = re.sub(r"\s+", " ", x) # collapse consecutive whitespace into a single space
x = _replace_rep(x)
x = _replace_wrep(x)
return x.strip()
def normalize_texts(X, use_tqdm=False):
"""normalize a list of sentences"""
if use_tqdm:
X = [_normalize_text(x) for x in tqdm(X)]
else:
X = [_normalize_text(x) for x in X]
return X
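# quick, illustrative sanity check of the normalizer (the sample string below is
# made up and not taken from the dataset; the comments describe the expected kind
# of output, not an exact value)
sample = "گوشﻲ موبایل سامسونگ ۱۲۸ گیگ!!!"
print(_normalize_text(sample))
# roughly: the Arabic presentation form "ﻲ" is mapped to "ی", the Persian digits
# "۱۲۸" become "128", "!" is not a valid character so it is replaced by a space,
# and any character/word repetitions would be collapsed
print(normalize_texts([sample], use_tqdm=False))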
class JSONListWriter:
"""
auxiliary class to write a list of dictionaries into a json file,
one item per line.
"""
def __init__(self, file_path):
self.fd = None
self.file_path = file_path
self.delimiter = "\n"
def open(self):
self.fd = open(self.file_path, "w")
self.first_item_written = False
return self
def close(self):
self.fd.close()
self.fd = None
def write_item(self, obj):
if self.first_item_written:
self.fd.write(self.delimiter)
self.fd.write(json.dumps(obj))
self.first_item_written = True
def __enter__(self):
return self.open()
def __exit__(self, type, value, traceback):
self.close()
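As a quick illustration of how the two helpers above fit together (the file name and records below are arbitrary), writing a couple of items and reading them back looks like this:
# arbitrary demo file; one JSON object per line
with JSONListWriter("demo_items.json") as writer:
    writer.write_item({"id": 1, "text": "سلام"})
    writer.write_item({"id": 2, "text": "hello"})
for record in read_json("demo_items.json"):
    print(record)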
Preprocess¶
In this section, we preprocess the data. It consists of the following steps:
- set and extract a product name for each base product
- normalize the product name
- identify and exclude invalid products that don't have a seller
- aggregate clicks based on search_id
- normalize raw_query
- aggregate searches based on raw_query
- aggregate results, clicks, and page views for each aggregated search
- normalize offline test queries
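To make the target format concrete, one line of the aggregated search file will look roughly like the record below (the field names match the code that follows; all ids, queries, and counts are invented for illustration):
{"raw_query": "گوشی سامسونگ", "raw_query_normalized": "گوشی سامسونگ",
 "results": ["5f1a...", "60bc..."], "result_counts": [7, 3],
 "pages": [1, 2], "page_counts": [6, 4],
 "clicks": ["5f1a..."], "click_counts": [5]}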
# setting paths for inputs
data_folder = "./data"
products_path = os.path.join(data_folder, "base_products.json")
products_normalized_path = os.path.join(data_folder, "base_products_normalized.json")
search_log_train_path = os.path.join(data_folder, "search_log_train.json")
click_log_train_path = os.path.join(data_folder, "click_log_train.json")
queries_test_offline_path = os.path.join(data_folder, "queries_test_offline.json")
queries_test_offline_normalized_path = os.path.join(
data_folder, "queries_test_offline_normalized.json"
)
# paths for the aggregated data to be made
search_clicks_file_path = os.path.join(
data_folder, f"searches_clicks_joined_train.json"
)
search_click_merged_path = os.path.join(data_folder, f"searches_merged_train.json")
def make_base_product_names(products_path: str, products_normalized_path: str):
"""
assign each base product a name.
it is extracted from sellers of that product.
"""
with JSONListWriter(products_normalized_path) as file:
for product in read_json(products_path):
pr_name = ""
for seller in product["sellers"]:
pr_name += " " + seller["name1"] + " " + seller["name2"]
words = [w.strip() for w in pr_name.split()]
words = set(
[w for w in words if w != ""]
) # create a set of all words from seller product names
pr_name = (" ".join(words)).strip()
if (
pr_name == ""
): # exclude this product if its name is an empty string (does not have sellers)
continue
product["product_name"] = pr_name
product["product_name_normalized"] = _normalize_text(pr_name)
file.write_item(product)
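# for illustration, a hypothetical input record (field names as used above, values invented):
#   {"_id": "abc123", "sellers": [{"name1": "گوشی سامسونگ", "name2": "Galaxy A52"},
#                                 {"name1": "گوشی سامسونگ Galaxy", "name2": "A52 128GB"}]}
# the function unions the unique words of all seller names, so "product_name" becomes
# something like "گوشی سامسونگ Galaxy A52 128GB" (word order is arbitrary because a set
# is used), and "product_name_normalized" is its _normalize_text output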
def aggregate_clicks(search_path, click_path, tag, valid_base_ids):
"""aggregate clicks on each search record and injects it into the search record"""
search_clicks_dict = {}
for i, click_row in enumerate(
read_json(click_path)
): # aggregate clicks on search_id
search_id = click_row["search_log_id"]
base_product_id = click_row["base_product_id"]
list_of_clicks = search_clicks_dict.get(search_id, [])
list_of_clicks.append(base_product_id)
search_clicks_dict[search_id] = list_of_clicks
invalid_results, invalid_clicks, invalid_searches = 0, 0, 0
with JSONListWriter(
search_clicks_file_path
) as file: # write the result in a new file
for i, search_row in enumerate(read_json(search_path)):
search_id = search_row["_id"]
search_results = search_row["result"]
results = [
r for r in search_results if r in valid_base_ids
] # omit results that are not valid products
results_set = set(results)
clicks = search_clicks_dict.get(search_id, [])
clicks = [
c for c in clicks if c in results_set
] # omit clicks on invalid products
invalid_results += len(search_results) - len(results)
invalid_clicks += len(search_clicks_dict.get(search_id, [])) - len(clicks)
if len(clicks) == 0:
invalid_searches += 1
continue
search_row["raw_query"] = search_row["raw_query"].strip()
search_row["raw_query_normalized"] = _normalize_text(
search_row["raw_query"]
) # store the normalized raw_query
search_row["result"] = results
search_row["clicks"] = clicks
file.write_item(search_row)
print(
f"invalid searches: {invalid_searches}, "
+ f"invalid results: {invalid_results}, "
+ f"invalid clicks: {invalid_clicks}"
)
def aggregate_searches(tag):
"aggregates searches based on raw query."
search_clicks_path = os.path.join(data_folder, f"searches_clicks_joined_{tag}.json")
groups = {}
normalized_query_mapping = {}
# aggregate searches on raw_query
# the following counters are created for each aggregated search:
# results counter: how many times each product was shown to the user when this raw_query was searched
# clicks counter: how many times each product was clicked when this raw_query was searched
# pages counter: how many times each page was viewed by the user when this raw_query was searched
for i, search in enumerate(read_json(search_clicks_path)):
raw_query = search["raw_query"]
normalized_query_mapping[raw_query] = search["raw_query_normalized"]
counters = groups.get(raw_query, {})
groups[raw_query] = counters
counters.setdefault("results", collections.Counter())
counters.setdefault("pages", collections.Counter())
counters.setdefault("clicks", collections.Counter())
counters["results"].update(search["result"])
counters["pages"].update([search["page"]])
counters["clicks"].update(search["clicks"])
new_df = []
for raw_query, counters in tqdm(groups.items()):
results_counter = counters["results"].most_common() # sort based on views
pages_counter = counters["pages"].most_common() # sort based on views
clicks_counter = counters["clicks"].most_common() # sort based on clicks
new_df.append(
{
"raw_query": raw_query,
"raw_query_normalized": normalized_query_mapping[raw_query],
"results": [k for k, v in results_counter],
"result_counts": [v for k, v in results_counter],
"pages": [k for k, v in pages_counter],
"page_counts": [v for k, v in pages_counter],
"clicks": [k for k, v in clicks_counter],
"click_counts": [v for k, v in clicks_counter],
}
)
print("Number of unique queries after merge:", len(new_df))
pd.DataFrame(new_df).to_json(
search_click_merged_path,
orient="records",
lines=True,
)
def normalize_test_queries(queries_test_path, queries_test_normalized_path):
"""normalize test queries"""
with JSONListWriter(queries_test_normalized_path) as file:
for query in read_json(queries_test_path):
normalized_query = _normalize_text(query)
file.write_item(normalized_query)
make_base_product_names(products_path, products_normalized_path)
print("\nProduct names created and saved in:", products_normalized_path)
valid_base_ids = set(
[product["_id"] for product in read_json(products_normalized_path)]
)
print("\nList of valid products created")
aggregate_clicks(
search_path=search_log_train_path,
click_path=click_log_train_path,
tag="train",
valid_base_ids=valid_base_ids,
)
print("\nSearches and clicks in the training set are merged")
aggregate_searches("train")
print("\nSearches are aggregated wrt the raw query")
normalize_test_queries(queries_test_offline_path, queries_test_offline_normalized_path)
print("Test queries are normalized")
Extracting features¶
In this section, we extract features for each product and each searched query.
At the end, a `dat` file is created, which is the training data for the LambdaMART model.
Here, as a baseline, we embed each sentence as follows:
- extract the tf-idf vector of the sentence
- project the tf-idf vector into a lower-dimensional space with a random projection
So, here is an overview of the steps in this section:
- embed product names as low-dimensional vectors
- embed train raw_queries as low-dimensional vectors
- embed offline test raw_queries as low-dimensional vectors
- create the `dat` file for model training
- store the embedded product names and queries in files
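As a shape check before the real run, the embedding described above is just a sparse tf-idf matrix multiplied by a dense random matrix. A minimal toy version is sketched below; the sentences, vocabulary size, and embedding dimension are made up, and the projection matrix is sized from the fitted vocabulary in case the corpus has fewer terms than max_features:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

toy_sentences = ["گوشی سامسونگ", "گوشی موبایل اپل", "لپ تاپ ایسوس"]
toy_vectorizer = TfidfVectorizer(max_features=16)
toy_tfidf = toy_vectorizer.fit_transform(toy_sentences)  # sparse, (3, n_terms)
toy_dim = 4
toy_projection = np.random.rand(toy_tfidf.shape[1], toy_dim)
toy_embedded = toy_tfidf.dot(toy_projection)  # dense, (3, toy_dim)
print(toy_tfidf.shape, toy_embedded.shape)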
%reset -f # reset notebook due to memory limitations
import gc
import json
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm, trange
def read_json(path, n_lines_to_read=None):
"""
read json file line by line iteratively (generator function).
use this function to read json files when you have memory limitations.
"""
with open(path) as f:
for i, line in enumerate(tqdm(f)):
if n_lines_to_read == i:
break
yield json.loads(line)
data_folder = "./data"
merged_searches_path_train = os.path.join(data_folder, "searches_merged_train.json")
products_path = os.path.join(data_folder, "base_products.json")
products_normalized_path = os.path.join(data_folder, "base_products_normalized.json")
queries_path_test = os.path.join(data_folder, "queries_test_offline.json")
queries_normalized_path_test = os.path.join(
data_folder, "queries_test_offline_normalized.json"
)
random_projection_path = os.path.join(data_folder, "random_projection.npz")
product_features_path = os.path.join(data_folder, "product_features.npz")
query_train_features_path = os.path.join(data_folder, "query_train_features.npz")
query_test_features_path = os.path.join(data_folder, "query_test_features.npz")
train_dat_path = os.path.join(data_folder, f"train_emb.dat")
vocab_size = 4000 # vocab size for tfidf
emb_dim = 256
# number of lines from the merged searches to use
sample_num_train = 10000
sample_num_test = None
print("read base products!")
products_df = pd.read_json(products_normalized_path, orient="records", lines=True)
products_df = products_df.drop("sellers", axis=1)
product_id_dict = {_id: ind for ind, _id in enumerate(products_df["_id"])}
product_names = products_df["product_name"]
print("read merged searches!")
merged_searches_train_df = list(read_json(merged_searches_path_train, sample_num_train))
merged_searches_train_df = pd.DataFrame(merged_searches_train_df)
queries_train = merged_searches_train_df["raw_query"]
queries_test = list(read_json(queries_path_test, sample_num_test))
product_names_normalized = products_df["product_name_normalized"]
queries_train_normalized = merged_searches_train_df["raw_query_normalized"]
queries_test_normalized = list(read_json(queries_normalized_path_test, sample_num_test))
del products_df # free memory
gc.collect()
random_projection_mat = np.random.rand(vocab_size, emb_dim) # random projection matrix
vectorizer = TfidfVectorizer(max_features=vocab_size, lowercase=True, use_idf=True) # tfidf vectorizer
vectorizer.fit(product_names_normalized) # fit vectorizer
# transform product names with tfidf vectorizer
products_tfidf = vectorizer.transform(product_names_normalized)
# project the tfidf vector with random projection matrix
products_projected = products_tfidf.dot(random_projection_mat)
del products_tfidf # free memory
gc.collect()
queries_train_tfidf = vectorizer.transform(queries_train_normalized)
queries_train_projected = queries_train_tfidf.dot(random_projection_mat)
del queries_train_tfidf
gc.collect()
queries_test_tfidf = vectorizer.transform(queries_test_normalized)
queries_test_projected = queries_test_tfidf.dot(random_projection_mat)
del queries_test_tfidf
gc.collect()
def make_dat_file(
dat_file_path,
merged_searches,
query_features,
product_features,
n_candidates=None,
):
"""
create a `dat` file which is the training data for the LambdaMART model.
The file format of the training and test files is the same as for SVMlight,
with the exception that the lines in the input files have to be sorted by increasing qid.
The first lines may contain comments and are ignored if they start with #.
Each of the following lines represents one training example and is of the following format:
<line> .=. <target> qid:<qid> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info>
<target> .=. <float>
<qid> .=. <positive integer>
<feature> .=. <positive integer>
<value> .=. <float>
<info> .=. <string>
The target value and each of the feature/value pairs are separated by a space character.
Feature/value pairs MUST be ordered by increasing feature number.
Features with value zero can be skipped.
The target value defines the order of the examples for each query.
Implicitly, the target values are used to generate pairwise preference constraints as described in [Joachims, 2002c].
A preference constraint is included for all pairs of examples in the example_file, for which the target value differs.
The special feature "qid" can be used to restrict the generation of constraints.
Two examples are considered for a pairwise preference constraint only if the value of "qid" is the same.
For example, given the example_file
3 qid:1 1:1 2:1 3:0 4:0.2 5:0 # 1A
2 qid:1 1:0 2:0 3:1 4:0.1 5:1 # 1B
1 qid:1 1:0 2:1 3:0 4:0.4 5:0 # 1C
1 qid:1 1:0 2:0 3:1 4:0.3 5:0 # 1D
1 qid:2 1:0 2:0 3:1 4:0.2 5:0 # 2A
2 qid:2 1:1 2:0 3:1 4:0.4 5:0 # 2B
1 qid:2 1:0 2:0 3:1 4:0.1 5:0 # 2C
1 qid:2 1:0 2:0 3:1 4:0.2 5:0 # 2D
2 qid:3 1:0 2:0 3:1 4:0.1 5:1 # 3A
3 qid:3 1:1 2:1 3:0 4:0.3 5:0 # 3B
4 qid:3 1:1 2:0 3:0 4:0.4 5:1 # 3C
1 qid:3 1:0 2:1 3:1 4:0.5 5:0 # 3D
the following set of pairwise constraints is generated (examples are referred to by the info-string after the # character):
1A>1B, 1A>1C, 1A>1D, 1B>1C, 1B>1D, 2B>2A, 2B>2C, 2B>2D, 3C>3A, 3C>3B, 3C>3D, 3B>3A, 3B>3D, 3A>3D
more information:
- https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html#embedding-additional-information-inside-libsvm-file
- https://www.cs.cornell.edu/people/tj/svm_light/svm_rank.html
"""
with open(dat_file_path, "w") as file:
for qid, (_, merged_search) in enumerate(tqdm(merged_searches.iterrows())):
if n_candidates is None:
limit = len(merged_search["results"])
else:
limit = min(n_candidates, len(merged_search["results"]))
clicks = dict(zip(merged_search["clicks"], merged_search["click_counts"]))
for candidate_product_id in merged_search["results"][:limit]:
candidate_score = clicks.get(candidate_product_id, 0)
candidate_score = np.log2(candidate_score + 1)
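# the raw click count is mapped to a graded relevance label via log2(count + 1), so more
# frequently clicked products get higher targets while very large counts do not dominate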
p_idx = product_id_dict[candidate_product_id]
feature = np.concatenate((product_features[p_idx], query_features[qid]))
feature = np.around(feature, 3)
file.write(
f"{candidate_score} qid:{qid} "
+ " ".join([f"{i}:{s}" for i, s in enumerate(feature)])
+ "\n"
)
make_dat_file(
train_dat_path,
merged_searches_train_df,
queries_train_projected,
products_projected,
)
# look at the first 5 lines of train_emb.dat
!head -5 ./data/train_emb.dat
np.savez(random_projection_path, random_projection_mat)
np.savez(product_features_path, products_projected)
np.savez(query_train_features_path, queries_train_projected)
np.savez(query_test_features_path, queries_test_projected)
Train¶
In this section, the model is trained and saved into a file.
%reset -f
import os
import xgboost as xgb
data_folder = "./data"
train_dat_path = os.path.join(data_folder, f"train_emb.dat")
model_path = os.path.join(data_folder, "ranker.json")
train_data = xgb.DMatrix(train_dat_path)
param = {
"max_depth": 20,
"eta": 0.3,
"objective": "rank:ndcg",
"verbosity": 1,
"num_parallel_tree": 1,
"tree_method": "gpu_hist",
"eval_metric": ["ndcg", "ndcg@10"],
}
eval_list = [(train_data, "train")]
model = xgb.train(
param,
train_data,
num_boost_round=200,
evals=eval_list,
)
model.save_model(model_path)
Inference¶
The last phase is the inference step.
In this step, we must suggest 10 products for each test query.
But since the result set for each test query is not given,
we need a method to extract candidate products for each test query.
A naive approach would be to give all products as candidates for each query and let the model sort them, but this is neither performant nor easy for the model to predict.
Here, as a baseline, we do the following:
- create an inverted index of word occurrences in product names, e.g. mobile -> [ualkj, iousl, ...]
- split the query and extract the candidates of each word
- intersect the word candidates (excluding words with zero candidates)
- if the intersection results in an empty set: select the word with the smallest non-zero number of candidates
Then, we predict a score for each candidate product with the trained model and select the top 10.
Finally, the result.csv file is generated and submitted.
%reset -f
import itertools
import gc
import json
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm, trange
def read_json(path, n_lines_to_read=None):
"""
read json file line by line iteratively (generator function).
use this function to read json files when you have memory limitations.
"""
with open(path) as f:
for i, line in enumerate(tqdm(f)):
if n_lines_to_read == i:
break
yield json.loads(line)
data_folder = "./data"
merged_searches_path_train = os.path.join(data_folder, "searches_merged_train.json")
products_path = os.path.join(data_folder, "base_products.json")
products_normalized_path = os.path.join(data_folder, "base_products_normalized.json")
queries_path_test = os.path.join(data_folder, "queries_test_offline.json")
queries_normalized_path_test = os.path.join(
data_folder, "queries_test_offline_normalized.json"
)
random_projection_path = os.path.join(data_folder, "random_projection.npz")
product_features_path = os.path.join(data_folder, "product_features.npz")
query_train_features_path = os.path.join(data_folder, "query_train_features.npz")
query_test_features_path = os.path.join(data_folder, "query_test_features.npz")
train_dat_path = os.path.join(data_folder, f"train_emb.dat")
model_path = os.path.join(data_folder, "ranker.json")
uploading_result_path = os.path.join(data_folder, "result.csv")
# number of test queries to choose
sample_num_test = None
print("read base products!")
products_df = pd.read_json(products_normalized_path, orient="records", lines=True)
products_df = products_df.drop("sellers", axis=1)
product_id_dict = {_id: ind for ind, _id in enumerate(products_df["_id"])}
product_names = products_df["product_name"]
product_ids = products_df["_id"]
print("read merged searches!")
queries_test = list(read_json(queries_path_test, sample_num_test))
product_names_normalized = products_df["product_name_normalized"]
queries_test_normalized = list(read_json(queries_normalized_path_test, sample_num_test))
del products_df # free memory
gc.collect()
# load embedded vectors
products_projected = np.load(product_features_path)["arr_0"]
queries_test_projected = np.load(query_test_features_path)["arr_0"]
# create the inverted index
word_inverted_index = {}
for i, product_name in enumerate(tqdm(product_names_normalized)):
for word in product_name.split():
word_index = word_inverted_index.get(word, set())
word_index.add(i)
word_inverted_index[word] = word_index
def intersection_find(query):
candidates = None
for word in query.split():
word_candidates = word_inverted_index.get(word, set())
if len(word_candidates) == 0: # skip word with 0 candidates
continue
elif candidates is None: # the first word with candidates
candidates = word_candidates
else: # do the intersection
candidates = candidates.intersection(word_candidates)
if candidates is None: # all words had 0 candidates
candidates = set()
# if the intersection itself is empty, the caller (find_candidate_products)
# falls back to minimum_word_candidate_find
return candidates
def minimum_word_candidate_find(query):
words = query.split()
n_candidates_per_word = [
len(word_inverted_index.get(word, set())) for word in words
]
sorted_indices = np.argsort(
n_candidates_per_word
) # sort words with respect to their number of candidates
for idx in sorted_indices:
if n_candidates_per_word[idx] > 0:
return word_inverted_index.get(words[idx], set())
return []
def find_candidate_products(query):
candidates = intersection_find(query)
if len(candidates) == 0:
candidates = minimum_word_candidate_find(query)
return [product_ids[p_idx] for p_idx in candidates]
param = {}
model = xgb.Booster(**param)
model.load_model(model_path)
model.set_param({"predictor": "gpu_predictor", "gpu_id": 0})
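# note: "gpu_predictor" assumes a GPU runtime (matching the "gpu_hist" training setting above);
# on a CPU-only machine this set_param line can be dropped or changed to "cpu_predictor"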
The inference cell follows. Due to memory limitations, and to improve inference performance, candidates are scored in batches.
For each test query:
- extract candidates by the described method
- for each candidate:
- add the candidate to the batch
- if the number of items in the batch reaches BATCH_SIZE:
- predict scores for items in batch
- store predictions in a dictionary
- select top products for the queries that have predicted scores for all of their candidates
BATCH_SIZE = int(50e3)
TOP_K = 10
def get_query_product_feature(product_id, query_feature, product_features):
p_idx = product_id_dict[product_id]
feature = np.concatenate((product_features[p_idx], query_feature))
return feature
def batch_predict(result_df, batch_queries):
batch_feat = list(
itertools.chain(*[result_df[t_query]["features"] for t_query in batch_queries])
) # stack features of all candidates
batch_feat = np.stack(batch_feat) # convert the features to a numpy array
batch_data = xgb.DMatrix(batch_feat) # convert it to xgboost datatype
batch_predictions = model.predict(batch_data)
return batch_predictions
def store_predictions(result_df, batch_queries, batch_predictions):
batch_prediction_idx = 0
done_queries = []
for query in batch_queries:
query_result = result_df[query]
n_candidates_in_batch = len(query_result["features"])
query_result["features"] = [] # free memory
new_predictions = batch_predictions[
batch_prediction_idx : batch_prediction_idx + n_candidates_in_batch
]
query_result["predictions"].extend(new_predictions)
batch_prediction_idx += n_candidates_in_batch
if len(query_result["candidates"]) == len(
query_result["predictions"]
): # all candidate scores have been predicted
select_best_products(query_result)
done_queries.append(query)
for query in done_queries: # remove completed queries from batch
batch_queries.remove(query)
def select_best_products(query_result):
predictions = np.array(query_result["predictions"])
t_candidates = query_result["candidates"]
query_result["predictions"] = [] # free memory
query_result["candidates"] = [] # free memory
# sort candidates based on prediction scores
sorted_inds = np.argsort(predictions)[::-1][:TOP_K]
# select the candidate product ids in descending score order
selected_products = [t_candidates[ii] for ii in sorted_inds]
# pad the suggestions with a dummy `00000` id (10 products must be suggested)
selected_products += ["00000"] * (TOP_K - len(selected_products))
# store suggestions
query_result["suggestions"] = selected_products
# result_df: query -> {"features": ..., "predictions": .., "candidates": ..., "suggestions": ...}
result_df = {}
batch_queries = set() # queries in batch
n_candidates_in_batch = 0
for test_qid, (test_query, test_query_normalized) in enumerate(
zip(tqdm(queries_test), queries_test_normalized)
):
candidates = find_candidate_products(test_query_normalized)
batch_queries.add(test_query)
test_query_result = {"features": [], "predictions": [], "candidates": candidates}
result_df[test_query] = test_query_result
for j, candidate_id in enumerate(candidates):
test_query_result["features"].append(
get_query_product_feature(
candidate_id, queries_test_projected[test_qid], products_projected
)
)
n_candidates_in_batch += 1
if n_candidates_in_batch == BATCH_SIZE or (
test_qid == len(queries_test) - 1
and j == len(candidates) - 1 # last candidate of last query
):
batch_predictions = batch_predict(result_df, batch_queries)
store_predictions(result_df, batch_queries, batch_predictions)
n_candidates_in_batch = 0
new_result_df = [[t_query] + value["suggestions"] for t_query, value in result_df.items()]
new_result_df = pd.DataFrame(new_result_df)
new_result_df.to_csv(uploading_result_path, header=False, index=False)
print(new_result_df.shape)
from roboepics_client.roboepics_client import RoboEpicsClient
problem_id = 4 # You need to provide an appropriate value for this variable
problem_enter_id = None # fill this value with your id
roboepics_client = RoboEpicsClient(problem_id, problem_enter_id)
roboepics_client.submit(uploading_result_path)