!pip install parsivar
!pip install git+https://github.com/RoboEpics/roboepics-client.git
from roboepics_client.roboepics_client import RoboEpicsClient
problem_id = 4
problem_enter_id = None # fill this value with your id
roboepics_client = RoboEpicsClient(problem_id, problem_enter_id)
Downloading and extracting data¶
We used a permanent Google Drive link for faster download.
!gdown --id 1uYwzBe8nLhOQ2Q3rCScEXvljLkQqJrHc
!7z x data.7z
!ls
Utils¶
from __future__ import unicode_literals
import collections
import gc
import json
import re
import os
import numpy as np
import pandas as pd
import parsivar
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
def read_json(path, n_lines_to_read=None):
"""
read json file line by line iteratively (generator function).
use this function to read json files when you have memory limitations.
"""
with open(path) as f:
for i, line in enumerate(tqdm(f)):
if n_lines_to_read == i:
break
yield json.loads(line)
parsivar_normalizer = parsivar.Normalizer(statistical_space_correction=True)
char_mappings = {
"٥": "5",
"А": "a",
"В": "b",
"Е": "e",
"Н": "h",
"Р": "P",
"С": "C",
"Т": "T",
"а": "a",
"г": "r",
"е": "e",
"к": "k",
"м": "m",
"о": "o",
"р": "p",
"ڈ": "د",
"ڇ": "چ",
# Persian digits (will be replaced by English ones)
"۰": "0",
"۱": "1",
"۲": "2",
"۳": "3",
"۴": "4",
"۵": "5",
"۶": "6",
"۷": "7",
"۸": "8",
"۹": "9",
".": ".",
# Arabic digits (will be replaced by English ones)
"٠": "0",
"١": "1",
"٢": "2",
"٣": "3",
"٤": "4",
"٥": "5",
"٦": "6",
"٧": "7",
"٨": "8",
"٩": "9",
# Special Arabic characters (will be replaced by Persian equivalents)
"ك": "ک",
"ى": "ی",
"ي": "ی",
"ؤ": "و",
"ئ": "ی",
"إ": "ا",
"أ": "ا",
"آ": "ا",
"ة": "ه",
"ء": "ی",
# French accented letters (will be replaced by English ones)
"à": "a",
"ä": "a",
"ç": "c",
"é": "e",
"è": "e",
"ê": "e",
"ë": "e",
"î": "i",
"ï": "i",
"ô": "o",
"ù": "u",
"û": "u",
"ü": "u",
# Comma (will be replaced by a dot, for floating-point numbers)
",": ".",
# Ampersand (will be replaced by the word "and")
"&": " and ",
# Diacritics and tatvil (will be removed)
"ّ": "", # tashdid
"َ": "", # a
"ِ": "", # e
"ُ": "", # o
"ـ": "", # tatvil
# Spaces
"": "", # 0x9E -> ZERO WIDTH JOINER
"": " ", # 0x9D -> ZERO WIDTH NON-JOINER
# Arabic Presentation Forms-A (will be replaced by Persian equivalents)
"ﭐ": "ا",
"ﭑ": "ا",
"ﭖ": "پ",
"ﭗ": "پ",
"ﭘ": "پ",
"ﭙ": "پ",
"ﭞ": "ت",
"ﭟ": "ت",
"ﭠ": "ت",
"ﭡ": "ت",
"ﭺ": "چ",
"ﭻ": "چ",
"ﭼ": "چ",
"ﭽ": "چ",
"ﮊ": "ژ",
"ﮋ": "ژ",
"ﮎ": "ک",
"ﮏ": "ک",
"ﮐ": "ک",
"ﮑ": "ک",
"ﮒ": "گ",
"ﮓ": "گ",
"ﮔ": "گ",
"ﮕ": "گ",
"ﮤ": "ه",
"ﮥ": "ه",
"ﮦ": "ه",
"ﮪ": "ه",
"ﮫ": "ه",
"ﮬ": "ه",
"ﮭ": "ه",
"ﮮ": "ی",
"ﮯ": "ی",
"ﮰ": "ی",
"ﮱ": "ی",
"ﯼ": "ی",
"ﯽ": "ی",
"ﯾ": "ی",
"ﯿ": "ی",
# Arabic Presentation Forms-B (will be removed)
"ﹰ": "",
"ﹱ": "",
"ﹲ": "",
"ﹳ": "",
"ﹴ": "",
"": "",
"ﹶ": "",
"ﹷ": "",
"ﹸ": "",
"ﹹ": "",
"ﹺ": "",
"ﹻ": "",
"ﹼ": "",
"ﹽ": "",
"ﹾ": "",
"ﹿ": "",
# Arabic Presentation Forms-B (will be replaced by Persian equivalents)
"ﺀ": "ی",
"ﺁ": "ا",
"ﺂ": "ا",
"ﺃ": "ا",
"ﺄ": "ا",
"ﺅ": "و",
"ﺆ": "و",
"ﺇ": "ا",
"ﺈ": "ا",
"ﺉ": "ی",
"ﺊ": "ی",
"ﺋ": "ی",
"ﺌ": "ی",
"ﺍ": "ا",
"ﺎ": "ا",
"ﺏ": "ب",
"ﺐ": "ب",
"ﺑ": "ب",
"ﺒ": "ب",
"ﺓ": "ه",
"ﺔ": "ه",
"ﺕ": "ت",
"ﺖ": "ت",
"ﺗ": "ت",
"ﺘ": "ت",
"ﺙ": "ث",
"ﺚ": "ث",
"ﺛ": "ث",
"ﺜ": "ث",
"ﺝ": "ج",
"ﺞ": "ج",
"ﺟ": "ج",
"ﺠ": "ج",
"ﺡ": "ح",
"ﺢ": "ح",
"ﺣ": "ح",
"ﺤ": "ح",
"ﺥ": "خ",
"ﺦ": "خ",
"ﺧ": "خ",
"ﺨ": "خ",
"ﺩ": "د",
"ﺪ": "د",
"ﺫ": "ذ",
"ﺬ": "ذ",
"ﺭ": "ر",
"ﺮ": "ر",
"ﺯ": "ز",
"ﺰ": "ز",
"ﺱ": "س",
"ﺲ": "س",
"ﺳ": "س",
"ﺴ": "س",
"ﺵ": "ش",
"ﺶ": "ش",
"ﺷ": "ش",
"ﺸ": "ش",
"ﺹ": "ص",
"ﺺ": "ص",
"ﺻ": "ص",
"ﺼ": "ص",
"ﺽ": "ض",
"ﺾ": "ض",
"ﺿ": "ض",
"ﻀ": "ض",
"ﻁ": "ط",
"ﻂ": "ط",
"ﻃ": "ط",
"ﻄ": "ط",
"ﻅ": "ظ",
"ﻆ": "ظ",
"ﻇ": "ظ",
"ﻈ": "ظ",
"ﻉ": "ع",
"ﻊ": "ع",
"ﻋ": "ع",
"ﻌ": "ع",
"ﻍ": "غ",
"ﻎ": "غ",
"ﻏ": "غ",
"ﻐ": "غ",
"ﻑ": "ف",
"ﻒ": "ف",
"ﻓ": "ف",
"ﻔ": "ف",
"ﻕ": "ق",
"ﻖ": "ق",
"ﻗ": "ق",
"ﻘ": "ق",
"ﻙ": "ک",
"ﻚ": "ک",
"ﻛ": "ک",
"ﻜ": "ک",
"ﻝ": "ل",
"ﻞ": "ل",
"ﻟ": "ل",
"ﻠ": "ل",
"ﻡ": "م",
"ﻢ": "م",
"ﻣ": "م",
"ﻤ": "م",
"ﻥ": "ن",
"ﻦ": "ن",
"ﻧ": "ن",
"ﻨ": "ن",
"ﻩ": "ه",
"ﻪ": "ه",
"ﻫ": "ه",
"ﻬ": "ه",
"ﻭ": "و",
"ﻮ": "و",
"ﻯ": "ی",
"ﻰ": "ی",
"ﻱ": "ی",
"ﻲ": "ی",
"ﻳ": "ی",
"ﻴ": "ی",
"ﻵ": "لا",
"ﻶ": "لا",
"ﻷ": "لا",
"ﻸ": "لا",
"ﻹ": "لا",
"ﻺ": "لا",
"ﻻ": "لا",
"ﻼ": "لا",
}
valid_chars = [
" ",
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"A",
"B",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"J",
"K",
"L",
"M",
"N",
"O",
"P",
"Q",
"R",
"S",
"T",
"U",
"V",
"W",
"X",
"Y",
"Z",
"a",
"b",
"c",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"q",
"r",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"آ",
"ئ",
"ا",
"ب",
"ت",
"ث",
"ج",
"ح",
"خ",
"د",
"ذ",
"ر",
"ز",
"س",
"ش",
"ص",
"ض",
"ط",
"ظ",
"ع",
"غ",
"ف",
"ق",
"ل",
"م",
"ن",
"ه",
"و",
"پ",
"چ",
"ژ",
"ک",
"گ",
"ی",
]
def _replace_rep(t):
"Replace repetitions at the character level: ccc -> c"
def __replace_rep(m):
c, cc = m.groups()
return f"{c}"
re_rep = re.compile(r"(\S)(\1{2,})")
return re_rep.sub(__replace_rep, t)
def _replace_wrep(t):
"Replace word repetitions: word word word -> word"
def __replace_wrep(m):
c, cc = m.groups()
return f"{c}"
re_wrep = re.compile(r"(\b\w+\W+)(\1{2,})")
return re_wrep.sub(__replace_wrep, t)
def _normalize_text(x):
"""normalize a sentence"""
x = str(x)
x = parsivar_normalizer.normalize(x) # apply `parsivar` normalizations
x = re.sub(r"[\u200c\r\n]", " ", x) # remove half space and new line characters
x = x.lower()
x = "".join(
[char_mappings[xx] if xx in char_mappings else xx for xx in x]
) # substitute bad characters with appropriate ones
x = re.sub(
r"[^{}]".format("".join(valid_chars)), " ", x
) # keep only valid characters and replace the rest with a space
x = re.sub(r"[a-z]+", r" \g<0> ", x) # put spaces around latin words
x = re.sub(r"[0-9]+", r" \g<0> ", x) # put spaces around numbers
x = re.sub(r"\s+", " ", x) # collapse consecutive whitespace into a single space
x = _replace_rep(x)
x = _replace_wrep(x)
return x.strip()
def normalize_texts(X, use_tqdm=False):
"""normalize a list of sentences"""
if use_tqdm:
X = [_normalize_text(x) for x in tqdm(X)]
else:
X = [_normalize_text(x) for x in X]
return X
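# quick, illustrative sanity check of the normalizer (the sample string below is
# made up and not taken from the dataset; the comments describe the expected kind
# of output, not an exact value)
sample = "گوشﻲ موبایل سامسونگ ۱۲۸ گیگ!!!"
print(_normalize_text(sample))
# roughly: the Arabic presentation form "ﻲ" is mapped to "ی", the Persian digits
# "۱۲۸" become "128", "!" is not a valid character so it is replaced by a space,
# and any character/word repetitions would be collapsed
print(normalize_texts([sample], use_tqdm=False))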
class JSONListWriter:
"""
auxiliary class to write a list of dictionaries into a json file,
one item per line.
"""
def __init__(self, file_path):
self.fd = None
self.file_path = file_path
self.delimiter = "\n"
def open(self):
self.fd = open(self.file_path, "w")
self.first_item_written = False
return self
def close(self):
self.fd.close()
self.fd = None
def write_item(self, obj):
if self.first_item_written:
self.fd.write(self.delimiter)
self.fd.write(json.dumps(obj))
self.first_item_written = True
def __enter__(self):
return self.open()
def __exit__(self, type, value, traceback):
self.close()
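As a quick illustration of how the two helpers above fit together (the file name and records below are arbitrary), writing a couple of items and reading them back looks like this:
# arbitrary demo file; one JSON object per line
with JSONListWriter("demo_items.json") as writer:
    writer.write_item({"id": 1, "text": "سلام"})
    writer.write_item({"id": 2, "text": "hello"})
for record in read_json("demo_items.json"):
    print(record)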
Preprocess¶
In this section, we preprocess the data. It consists of the following steps:
- set and extract a product name for each base product
- normalize the product name
- identify and exclude invalid products that don't have a seller
- aggregate clicks based on search_id
- normalize raw_query
- aggregate searches based on raw_query
- aggregate results, clicks, and page views for each aggregated search
- normalize offline test queries
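To make the target format concrete, one line of the aggregated search file will look roughly like the record below (the field names match the code that follows; all ids, queries, and counts are invented for illustration):
{"raw_query": "گوشی سامسونگ", "raw_query_normalized": "گوشی سامسونگ",
 "results": ["5f1a...", "60bc..."], "result_counts": [7, 3],
 "pages": [1, 2], "page_counts": [6, 4],
 "clicks": ["5f1a..."], "click_counts": [5]}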
# setting paths for inputs
data_folder = "./data"
products_path = os.path.join(data_folder, "base_products.json")
products_normalized_path = os.path.join(data_folder, "base_products_normalized.json")
search_log_train_path = os.path.join(data_folder, "search_log_train.json")
click_log_train_path = os.path.join(data_folder, "click_log_train.json")
queries_test_offline_path = os.path.join(data_folder, "queries_test_offline.json")
queries_test_offline_normalized_path = os.path.join(
data_folder, "queries_test_offline_normalized.json"
)
# paths for the aggregated data to be made
search_clicks_file_path = os.path.join(
data_folder, f"searches_clicks_joined_train.json"
)
search_click_merged_path = os.path.join(data_folder, f"searches_merged_train.json")
def make_base_product_names(products_path: str, products_normalized_path: str):
"""
assign each base product a name.
it is extracted from sellers of that product.
"""
with JSONListWriter(products_normalized_path) as file:
for product in read_json(products_path):
pr_name = ""
for seller in product["sellers"]:
pr_name += " " + seller["name1"] + " " + seller["name2"]
words = [w.strip() for w in pr_name.split()]
words = set(
[w for w in words if w != ""]
) # create a set of all words from seller product names
pr_name = (" ".join(words)).strip()
if (
pr_name == ""
): # exclude this product if its name is an empty string (does not have sellers)
continue
product["product_name"] = pr_name
product["product_name_normalized"] = _normalize_text(pr_name)
file.write_item(product)
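# for illustration, a hypothetical input record (field names as used above, values invented):
#   {"_id": "abc123", "sellers": [{"name1": "گوشی سامسونگ", "name2": "Galaxy A52"},
#                                 {"name1": "گوشی سامسونگ Galaxy", "name2": "A52 128GB"}]}
# the function unions the unique words of all seller names, so "product_name" becomes
# something like "گوشی سامسونگ Galaxy A52 128GB" (word order is arbitrary because a set
# is used), and "product_name_normalized" is its _normalize_text output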
def aggregate_clicks(search_path, click_path, tag, valid_base_ids):
"""aggregate clicks on each search record and injects it into the search record"""
search_clicks_dict = {}
for i, click_row in enumerate(
read_json(click_path)
): # aggregate clicks on search_id
search_id = click_row["search_log_id"]
base_product_id = click_row["base_product_id"]
list_of_clicks = search_clicks_dict.get(search_id, [])
list_of_clicks.append(base_product_id)
search_clicks_dict[search_id] = list_of_clicks
invalid_results, invalid_clicks, invalid_searches = 0, 0, 0
with JSONListWriter(
search_clicks_file_path
) as file: # write the result in a new file
for i, search_row in enumerate(read_json(search_path)):
search_id = search_row["_id"]
search_results = search_row["result"]
results = [
r for r in search_results if r in valid_base_ids
] # omit results that are not valid products
results_set = set(results)
clicks = search_clicks_dict.get(search_id, [])
clicks = [
c for c in clicks if c in results_set
] # omit clicks on invalid products
invalid_results += len(search_results) - len(results)
invalid_clicks += len(search_clicks_dict.get(search_id, [])) - len(clicks)
if len(clicks) == 0:
invalid_searches += 1
continue
search_row["raw_query"] = search_row["raw_query"].strip()
search_row["raw_query_normalized"] = _normalize_text(
search_row["raw_query"]
) # store the normalized raw_query
search_row["result"] = results
search_row["clicks"] = clicks
file.write_item(search_row)
print(
f"invalid searches: {invalid_searches}, "
+ f"invalid results: {invalid_results}, "
+ f"invalid clicks: {invalid_clicks}"
)
def aggregate_searches(tag):
"aggregates searches based on raw query."
search_clicks_path = os.path.join(data_folder, f"searches_clicks_joined_{tag}.json")
groups = {}
normalized_query_mapping = {}
# aggregate searches on raw_query
# the following counters are created for each aggregated search:
# results counter: how many times each product was shown to the user when this raw_query was searched
# clicks counter: how many times each product was clicked when this raw_query was searched
# pages counter: how many times each page was viewed by the user when this raw_query was searched
for i, search in enumerate(read_json(search_clicks_path)):
raw_query = search["raw_query"]
normalized_query_mapping[raw_query] = search["raw_query_normalized"]
counters = groups.get(raw_query, {})
groups[raw_query] = counters
counters.setdefault("results", collections.Counter())
counters.setdefault("pages", collections.Counter())
counters.setdefault("clicks", collections.Counter())
counters["results"].update(search["result"])
counters["pages"].update([search["page"]])
counters["clicks"].update(search["clicks"])
new_df = []
for raw_query, counters in tqdm(groups.items()):
results_counter = counters["results"].most_common() # sort based on views
pages_counter = counters["pages"].most_common() # sort based on views
clicks_counter = counters["clicks"].most_common() # sort based on clicks
new_df.append(
{
"raw_query": raw_query,
"raw_query_normalized": normalized_query_mapping[raw_query],
"results": [k for k, v in results_counter],
"result_counts": [v for k, v in results_counter],
"pages": [k for k, v in pages_counter],
"page_counts": [v for k, v in pages_counter],
"clicks": [k for k, v in clicks_counter],
"click_counts": [v for k, v in clicks_counter],
}
)
print("Number of unique queries after merge:", len(new_df))
pd.DataFrame(new_df).to_json(
search_click_merged_path,
orient="records",
lines=True,
)
def normalize_test_queries(queries_test_path, queries_test_normalized_path):
"""normalize test queries"""
with JSONListWriter(queries_test_normalized_path) as file:
for query in read_json(queries_test_path):
normalized_query = _normalize_text(query)
file.write_item(normalized_query)
make_base_product_names(products_path, products_normalized_path)
print("\nProduct names created and saved in:", products_normalized_path)
valid_base_ids = set(
[product["_id"] for product in read_json(products_normalized_path)]
)
print("\nList of valid products created")
aggregate_clicks(
search_path=search_log_train_path,
click_path=click_log_train_path,
tag="train",
valid_base_ids=valid_base_ids,
)
print("\nSearches and clicks in the training set are merged")
aggregate_searches("train")
print("\nSearches are aggregated wrt the raw query")
normalize_test_queries(queries_test_offline_path, queries_test_offline_normalized_path)
print("Test queries are normalized")
Extracting features¶
In this section, we extract features for each product and each searched query.
At the end, a `dat` file is created, which is the training data for the LambdaMART model.
Here, as a baseline, we embed each sentence as follows:
- extract the tf-idf vector of the sentence
- project the tf-idf vector into a lower-dimensional space with a random projection
So, here is an overview of the steps in this section:
- embed product names as low-dimensional vectors
- embed train raw_queries as low-dimensional vectors
- embed offline test raw_queries as low-dimensional vectors
- create the `dat` file for model training
- store the embedded product names and queries in files
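As a shape check before the real run, the embedding described above is just a sparse tf-idf matrix multiplied by a dense random matrix. A minimal toy version is sketched below; the sentences, vocabulary size, and embedding dimension are made up, and the projection matrix is sized from the fitted vocabulary in case the corpus has fewer terms than max_features:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

toy_sentences = ["گوشی سامسونگ", "گوشی موبایل اپل", "لپ تاپ ایسوس"]
toy_vectorizer = TfidfVectorizer(max_features=16)
toy_tfidf = toy_vectorizer.fit_transform(toy_sentences)  # sparse, (3, n_terms)
toy_dim = 4
toy_projection = np.random.rand(toy_tfidf.shape[1], toy_dim)
toy_embedded = toy_tfidf.dot(toy_projection)  # dense, (3, toy_dim)
print(toy_tfidf.shape, toy_embedded.shape)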
%reset -f # reset notebook due to memory limitations
import gc
import json
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm, trange
def read_json(path, n_lines_to_read=None):
"""
read json file line by line iteratively (generator function).
use this function to read json files when you have memory limitations.
"""
with open(path) as f:
for i, line in enumerate(tqdm(f)):
if n_lines_to_read == i:
break
yield json.loads(line)
data_folder = "./data"
merged_searches_path_train = os.path.join(data_folder, "searches_merged_train.json")
products_path = os.path.join(data_folder, "base_products.json")
products_normalized_path = os.path.join(data_folder, "base_products_normalized.json")
queries_path_test = os.path.join(data_folder, "queries_test_offline.json")
queries_normalized_path_test = os.path.join(
data_folder, "queries_test_offline_normalized.json"
)
random_projection_path = os.path.join(data_folder, "random_projection.npz")
product_features_path = os.path.join(data_folder, "product_features.npz")
query_train_features_path = os.path.join(data_folder, "query_train_features.npz")
query_test_features_path = os.path.join(data_folder, "query_test_features.npz")
train_dat_path = os.path.join(data_folder, f"train_emb.dat")
vocab_size = 4000 # vocab size for tfidf
emb_dim = 256
# number of lines from the merged searches to use
sample_num_train = 10000
sample_num_test = None
print("read base products!")
products_df = pd.read_json(products_normalized_path, orient="records", lines=True)
products_df = products_df.drop("sellers", axis=1)
product_id_dict = {_id: ind for ind, _id in enumerate(products_df["_id"])}
product_names = products_df["product_name"]
print("read merged searches!")
merged_searches_train_df = list(read_json(merged_searches_path_train, sample_num_train))
merged_searches_train_df = pd.DataFrame(merged_searches_train_df)
queries_train = merged_searches_train_df["raw_query"]
queries_test = list(read_json(queries_path_test, sample_num_test))
product_names_normalized = products_df["product_name_normalized"]
queries_train_normalized = merged_searches_train_df["raw_query_normalized"]
queries_test_normalized = list(read_json(queries_normalized_path_test, sample_num_test))
del products_df # free memory
gc.collect()
random_projection_mat = np.random.rand(vocab_size, emb_dim) # random projection matrix
vectorizer = TfidfVectorizer(max_features=vocab_size, lowercase=True, use_idf=True) # tfidf vectorizer
vectorizer.fit(product_names_normalized) # fit vectorizer
# transform product names with tfidf vectorizer
products_tfidf = vectorizer.transform(product_names_normalized)
# project the tfidf vector with random projection matrix
products_projected = products_tfidf.dot(random_projection_mat)
del products_tfidf # free memory
gc.collect()
queries_train_tfidf = vectorizer.transform(queries_train_normalized)
queries_train_projected = queries_train_tfidf.dot(random_projection_mat)
del queries_train_tfidf
gc.collect()
queries_test_tfidf = vectorizer.transform(queries_test_normalized)
queries_test_projected = queries_test_tfidf.dot(random_projection_mat)
del queries_test_tfidf
gc.collect()
def make_dat_file(
dat_file_path,
merged_searches,
query_features,
product_features,
n_candidates=None,
):
"""
create a `dat` file which is the training data for the LambdaMART model.
The file format of the training and test files is the same as for SVMlight,
with the exception that the lines in the input files have to be sorted by increasing qid.
The first lines may contain comments and are ignored if they start with #.
Each of the following lines represents one training example and is of the following format:
<line> .=. <target> qid:<qid> <feature>:<value> <feature>:<value> ... <feature>:<value> # <info>
<target> .=. <float>
<qid> .=. <positive integer>
<feature> .=. <positive integer>
<value> .=. <float>
<info> .=. <string>
The target value and each of the feature/value pairs are separated by a space character.
Feature/value pairs MUST be ordered by increasing feature number.
Features with value zero can be skipped.
The target value defines the order of the examples for each query.
Implicitly, the target values are used to generate pairwise preference constraints as described in [Joachims, 2002c].
A preference constraint is included for all pairs of examples in the example_file, for which the target value differs.
The special feature "qid" can be used to restrict the generation of constraints.
Two examples are considered for a pairwise preference constraint only if the value of "qid" is the same.
For example, given the example_file
3 qid:1 1:1 2:1 3:0 4:0.2 5:0 # 1A
2 qid:1 1:0 2:0 3:1 4:0.1 5:1 # 1B
1 qid:1 1:0 2:1 3:0 4:0.4 5:0 # 1C
1 qid:1 1:0 2:0 3:1 4:0.3 5:0 # 1D
1 qid:2 1:0 2:0 3:1 4:0.2 5:0 # 2A
2 qid:2 1:1 2:0 3:1 4:0.4 5:0 # 2B
1 qid:2 1:0 2:0 3:1 4:0.1 5:0 # 2C
1 qid:2 1:0 2:0 3:1 4:0.2 5:0 # 2D
2 qid:3 1:0 2:0 3:1 4:0.1 5:1 # 3A
3 qid:3 1:1 2:1 3:0 4:0.3 5:0 # 3B
4 qid:3 1:1 2:0 3:0 4:0.4 5:1 # 3C
1 qid:3 1:0 2:1 3:1 4:0.5 5:0 # 3D
the following set of pairwise constraints is generated (examples are referred to by the info-string after the # character):
1A>1B, 1A>1C, 1A>1D, 1B>1C, 1B>1D, 2B>2A, 2B>2C, 2B>2D, 3C>3A, 3C>3B, 3C>3D, 3B>3A, 3B>3D, 3A>3D
more information:
- https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html#embedding-additional-information-inside-libsvm-file
- https://www.cs.cornell.edu/people/tj/svm_light/svm_rank.html
"""
with open(dat_file_path, "w") as file:
for qid, (_, merged_search) in enumerate(tqdm(merged_searches.iterrows())):
if n_candidates is None:
limit = len(merged_search["results"])
else:
limit = min(n_candidates, len(merged_search["results"]))
clicks = dict(zip(merged_search["clicks"], merged_search["click_counts"]))
for candidate_product_id in merged_search["results"][:limit]:
candidate_score = clicks.get(candidate_product_id, 0)
candidate_score = np.log2(candidate_score + 1)
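# the raw click count is mapped to a graded relevance label via log2(count + 1), so more
# frequently clicked products get higher targets while very large counts do not dominate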
p_idx = product_id_dict[candidate_product_id]
feature = np.concatenate((product_features[p_idx], query_features[qid]))
feature = np.around(feature, 3)
file.write(
f"{candidate_score} qid:{qid} "
+ " ".join([f"{i}:{s}" for i, s in enumerate(feature)])
+ "\n"
)
make_dat_file(
train_dat_path,
merged_searches_train_df,
queries_train_projected,
products_projected,
)
# look at the first 5 lines of train_emb.dat
!head -5 ./data/train_emb.dat
np.savez(random_projection_path, random_projection_mat)
np.savez(product_features_path, products_projected)
np.savez(query_train_features_path, queries_train_projected)
np.savez(query_test_features_path, queries_test_projected)
Train¶
In this section, the model is trained and saved into a file.
%reset -f
import os
import xgboost as xgb
data_folder = "./data"
train_dat_path = os.path.join(data_folder, f"train_emb.dat")
model_path = os.path.join(data_folder, "ranker.json")
train_data = xgb.DMatrix(train_dat_path)
param = {
"max_depth": 20,
"eta": 0.3,
"objective": "rank:ndcg",
"verbosity": 1,
"num_parallel_tree": 1,
"tree_method": "gpu_hist",
"eval_metric": ["ndcg", "ndcg@10"],
}
eval_list = [(train_data, "train")]
model = xgb.train(
param,
train_data,
num_boost_round=200,
evals=eval_list,
)
model.save_model(model_path)
Inference¶
The last phase is the inference step.
In this step, we must suggest 10 products for each test query.
But since the result set for each test query is not given,
we need a method to extract candidate products for each test query.
A naive approach would be to give all products as candidates for each query and let the model sort them, but this is neither performant nor easy for the model to predict.
Here, as a baseline, we do the following:
- create an inverted index of word occurrences in product names, e.g. mobile -> [ualkj, iousl, ...]
- split the query and extract the candidates of each word
- intersect the word candidates (excluding words with zero candidates)
- if the intersection results in an empty set: select the word with the smallest non-zero number of candidates
Then, we predict a score for each candidate product with the trained model and select the top 10.
Finally, the result.csv file is generated and submitted.
%reset -f
import itertools
import gc
import json
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm, trange
def read_json(path, n_lines_to_read=None):
"""
read json file line by line iteratively (generator function).
use this function to read json files when you have memory limitations.
"""
with open(path) as f:
for i, line in enumerate(tqdm(f)):
if n_lines_to_read == i:
break
yield json.loads(line)
data_folder = "./data"
merged_searches_path_train = os.path.join(data_folder, "searches_merged_train.json")
products_path = os.path.join(data_folder, "base_products.json")
products_normalized_path = os.path.join(data_folder, "base_products_normalized.json")
queries_path_test = os.path.join(data_folder, "queries_test_offline.json")
queries_normalized_path_test = os.path.join(
data_folder, "queries_test_offline_normalized.json"
)
random_projection_path = os.path.join(data_folder, "random_projection.npz")
product_features_path = os.path.join(data_folder, "product_features.npz")
query_train_features_path = os.path.join(data_folder, "query_train_features.npz")
query_test_features_path = os.path.join(data_folder, "query_test_features.npz")
train_dat_path = os.path.join(data_folder, f"train_emb.dat")
model_path = os.path.join(data_folder, "ranker.json")
uploading_result_path = os.path.join(data_folder, "result.csv")
# number of test queries to choose
sample_num_test = None
print("read base products!")
products_df = pd.read_json(products_normalized_path, orient="records", lines=True)
products_df = products_df.drop("sellers", axis=1)
product_id_dict = {_id: ind for ind, _id in enumerate(products_df["_id"])}
product_names = products_df["product_name"]
product_ids = products_df["_id"]
print("read merged searches!")
queries_test = list(read_json(queries_path_test, sample_num_test))
product_names_normalized = products_df["product_name_normalized"]
queries_test_normalized = list(read_json(queries_normalized_path_test, sample_num_test))
del products_df # free memory
gc.collect()
# load embedded vectors
products_projected = np.load(product_features_path)["arr_0"]
queries_test_projected = np.load(query_test_features_path)["arr_0"]
# create the inverted index
word_inverted_index = {}
for i, product_name in enumerate(tqdm(product_names_normalized)):
for word in product_name.split():
word_index = word_inverted_index.get(word, set())
word_index.add(i)
word_inverted_index[word] = word_index
def intersection_find(query):
candidates = None
for word in query.split():
word_candidates = word_inverted_index.get(word, set())
if len(word_candidates) == 0: # skip word with 0 candidates
continue
elif candidates is None: # the first word with candidates
candidates = word_candidates
else: # do the intersection
candidates = candidates.intersection(word_candidates)
if candidates is None: # all words had 0 candidates
candidates = set()
# if the intersection itself is empty, the caller (find_candidate_products)
# falls back to minimum_word_candidate_find
return candidates
def minimum_word_candidate_find(query):
words = query.split()
n_candidates_per_word = [
len(word_inverted_index.get(word, set())) for word in words
]
sorted_indices = np.argsort(
n_candidates_per_word
) # sort words with respect to their number of candidates
for idx in sorted_indices:
if n_candidates_per_word[idx] > 0:
return word_inverted_index.get(words[idx], set())
return []
def find_candidate_products(query):
candidates = intersection_find(query)
if len(candidates) == 0:
candidates = minimum_word_candidate_find(query)
return [product_ids[p_idx] for p_idx in candidates]
param = {}
model = xgb.Booster(**param)
model.load_model(model_path)
model.set_param({"predictor": "gpu_predictor", "gpu_id": 0})
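# note: "gpu_predictor" assumes a GPU runtime (matching the "gpu_hist" training setting above);
# on a CPU-only machine this set_param line can be dropped or changed to "cpu_predictor"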
The inference cell follows. Due to memory limitations, and to improve inference performance, candidates are scored in batches.
For each test query:
- extract candidates by the described method
- for each candidate:
- add the candidate to the batch
- if the number of items in the batch reaches BATCH_SIZE:
- predict scores for items in batch
- store predictions in a dictionary
- select top products for the queries that have predicted scores for all of their candidates
BATCH_SIZE = int(50e3)
TOP_K = 10
def get_query_product_feature(product_id, query_feature, product_features):
p_idx = product_id_dict[product_id]
feature = np.concatenate((product_features[p_idx], query_feature))
return feature
def batch_predict(result_df, batch_queries):
batch_feat = list(
itertools.chain(*[result_df[t_query]["features"] for t_query in batch_queries])
) # stack features of all candidates
batch_feat = np.stack(batch_feat) # convert the features to a numpy array
batch_data = xgb.DMatrix(batch_feat) # convert it to xgboost datatype
batch_predictions = model.predict(batch_data)
return batch_predictions
def store_predictions(result_df, batch_queries, batch_predictions):
batch_prediction_idx = 0
done_queries = []
for query in batch_queries:
query_result = result_df[query]
n_candidates_in_batch = len(query_result["features"])
query_result["features"] = [] # free memory
new_predictions = batch_predictions[
batch_prediction_idx : batch_prediction_idx + n_candidates_in_batch
]
query_result["predictions"].extend(new_predictions)
batch_prediction_idx += n_candidates_in_batch
if len(query_result["candidates"]) == len(
query_result["predictions"]
): # all candidate scores have been predicted
select_best_products(query_result)
done_queries.append(query)
for query in done_queries: # remove completed queries from batch
batch_queries.remove(query)
def select_best_products(query_result):
predictions = np.array(query_result["predictions"])
t_candidates = query_result["candidates"]
query_result["predictions"] = [] # free memory
query_result["candidates"] = [] # free memory
# sort candidates based on prediction scores
sorted_inds = np.argsort(predictions)[::-1][:TOP_K]
# select the candidate product ids in descending score order
selected_products = [t_candidates[ii] for ii in sorted_inds]
# pad the suggestions with a dummy `00000` id (10 products must be suggested)
selected_products += ["00000"] * (TOP_K - len(selected_products))
# store suggestions
query_result["suggestions"] = selected_products
# result_df: query -> {"features": ..., "predictions": .., "candidates": ..., "suggestions": ...}
result_df = {}
batch_queries = set() # queries in batch
n_candidates_in_batch = 0
for test_qid, (test_query, test_query_normalized) in enumerate(
zip(tqdm(queries_test), queries_test_normalized)
):
candidates = find_candidate_products(test_query_normalized)
batch_queries.add(test_query)
test_query_result = {"features": [], "predictions": [], "candidates": candidates}
result_df[test_query] = test_query_result
for j, candidate_id in enumerate(candidates):
test_query_result["features"].append(
get_query_product_feature(
candidate_id, queries_test_projected[test_qid], products_projected
)
)
n_candidates_in_batch += 1
if n_candidates_in_batch == BATCH_SIZE or (
test_qid == len(queries_test) - 1
and j == len(candidates) - 1 # last candidate of last query
):
batch_predictions = batch_predict(result_df, batch_queries)
store_predictions(result_df, batch_queries, batch_predictions)
n_candidates_in_batch = 0
new_result_df = [[t_query] + value["suggestions"] for t_query, value in result_df.items()]
new_result_df = pd.DataFrame(new_result_df)
new_result_df.to_csv(uploading_result_path, header=False, index=False)
print(new_result_df.shape)
from roboepics_client.roboepics_client import RoboEpicsClient
problem_id = 4 # You need to provide an appropriate value for this variable
problem_enter_id = None # fill this value with your id
roboepics_client = RoboEpicsClient(problem_id, problem_enter_id)
roboepics_client.submit(uploading_result_path)