Task 1: Next Product Recommendation

rule baseline

A simple baseline: recall candidates for each session from a per-item counter of observed next items (LB 0.29+).

In [1]:

import warnings
warnings.simplefilter('ignore')

import gc
import re
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm

In [2]:

# df_prod = pd.read_csv('data/products_train.csv')
# df_prod

In [3]:

# Load the training sessions. Each row holds the viewed items (prev_items,
# stored as a single string that str2list parses later), the labelled
# next_item and the session locale.
df_sess = pd.read_csv('data/sessions_train.csv')
df_sess

Out[3]:

	prev_items	next_item	locale
0	['B09W9FND7K' 'B09JSPLN1M']	B09M7GY217	DE
1	['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...	B001B4THSA	DE
2	['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...	B0767DTG2Q	DE
3	['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...	B0B4R9NN4B	DE
4	['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8']	B0BGVBKWGZ	DE
...	...	...	...
3606244	['B086CYFSKW' 'B0874F9859' 'B086CYFSKW']	B07B5TYD76	IT
3606245	['B09NRZKZ7V' 'B08WJTPV93']	B08L1P4C3D	IT
3606246	['B085JFX7MP' 'B085JGHW8R']	B01MPWVD44	IT
3606247	['B00B0UING2' 'B00B0UING2']	B00D3HYEZ4	IT
3606248	['B092S9D1SD' 'B09XQQ1S72' 'B0852MS7QC' 'B0B1V...	B0B7RX65YP	IT

3606249 rows × 3 columns

In [4]:

# Load the Task 1 test sessions: same layout as the training file but without
# the next_item label, which is what the notebook has to predict.
df_test = pd.read_csv('data/sessions_test_task1.csv')
df_test

Out[4]:

	prev_items	locale
0	['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...	DE
1	['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS']	DE
2	['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...	DE
3	['B08KQBYV43' '3955350843' '3955350843' '39553...	DE
4	['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...	DE
...	...	...
316966	['B077SZ2C3Y' 'B0B14M3VZX']	UK
316967	['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851...	UK
316968	['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1...	UK
316969	['B01MCQMORK' 'B09JYZ325W']	UK
316970	['B0B8JX92YJ' 'B09TN4MP6V' 'B0BG2LZQSL']	UK

316971 rows × 2 columns

In [5]:

def str2list(x):
    """Parse the string form of a prev_items cell into a list of item ids.

    The cell looks like ``"['A' 'B'\\n 'C']"``: ids wrapped in single quotes,
    separated by whitespace (possibly including line breaks) and enclosed in
    square brackets. Brackets and quotes are stripped and the remainder is
    split on any run of whitespace, so an empty ``"[]"`` yields ``[]``.
    """
    # Delete the three decoration characters in one pass; str.split() with no
    # argument already treats '\n' and '\r' as separators and never returns
    # empty tokens.
    drop_decorations = str.maketrans('', '', "[]'")
    return x.translate(drop_decorations).split()

In [6]:

# Build item -> list of every item observed directly after it in the training
# sessions. Each consecutive pair inside prev_items is one transition, and the
# labelled next_item is the successor of the last viewed item.
next_item_dict = defaultdict(list)

# Iterate the two columns directly instead of df.iterrows(): same values in the
# same order, without building a Series object for each of the rows.
for raw_prev_items, next_item in tqdm(
    zip(df_sess['prev_items'], df_sess['next_item']), total=len(df_sess)
):
    prev_items = str2list(raw_prev_items)
    if not prev_items:
        # An empty session has no item to attach the label to; indexing
        # prev_items[0] / prev_items[-1] here would raise IndexError.
        continue
    # Shift the sequence by one so every item lines up with its successor.
    # A one-item session reduces to the single pair (item, next_item), so no
    # separate short-session branch is needed.
    successors = prev_items[1:] + [next_item]
    for item, successor in zip(prev_items, successors):
        next_item_dict[item].append(successor)

In [7]:

# Test sessions have no label, but the transitions inside their prev_items are
# still observed, so append them to the same successor lists.
for raw_prev_items in tqdm(df_test['prev_items'], total=len(df_test)):
    viewed = str2list(raw_prev_items)
    # Pair each item with the one right after it; zip stops at the shorter
    # input, so sessions with fewer than two items contribute nothing.
    for current, following in zip(viewed, viewed[1:]):
        next_item_dict[current].append(following)

In [8]:

# Reduce every successor list to its (at most) 100 most frequent items,
# ordered from most to least common.
next_item_map = {
    item: [successor for successor, _ in Counter(successors).most_common(100)]
    for item, successors in tqdm(next_item_dict.items())
}

In [9]:

# Flatten the item -> successors mapping into a long table with one row per
# observed transition (used below to rank globally popular next items).
k = list(next_item_dict.keys())
v = list(next_item_dict.values())

df_next = (
    pd.DataFrame({'item': k, 'next_item': v})
    .explode('next_item')
    .reset_index(drop=True)
)
df_next

Out[9]:

	item	next_item
0	B09W9FND7K	B09JSPLN1M
1	B09W9FND7K	B09JSPLN1M
2	B09W9FND7K	B09JSPLN1M
3	B09W9FND7K	B09JSPLN1M
4	B09W9FND7K	B078WW2WN5
...	...	...
16340339	B0BFPLN8FQ	B0BF5GT13X
16340340	B09737CD6H	B08DJ9SQFY
16340341	B084GZ3QZ7	B00CBAW8SE
16340342	B081TXFKS5	B004RN5I6W
16340343	B07H31QX5Q	B08R7685RH

16340344 rows × 2 columns

In [10]:

# The 200 items that appear most often as a successor, most frequent first;
# used as the popularity fallback when padding predictions.
next_item_counts = df_next['next_item'].value_counts()
top200 = next_item_counts.head(200).index.tolist()

In [11]:

# The last viewed item of each test session drives the recommendation.
df_test['last_item'] = [str2list(raw)[-1] for raw in df_test['prev_items']]
# Look up its ranked successors; last items that never appeared as a
# predecessor have no entry in the map and come back as NaN.
candidates = df_test['last_item'].map(next_item_map)
df_test['next_item_prediction'] = candidates
df_test

Out[11]:

	prev_items	locale	last_item	next_item_prediction
0	['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...	DE	B099NQFMG7	[B099NS1XPG, B08496TCCQ, B01BVG1XJS, B099NR3X6...
1	['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS']	DE	B00R9RZ9ZS	[B004ZXMV4Q, B00R9R5ND6, B095TQTZXY, B086J6RTT...
2	['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...	DE	B07G7Q5N6G	[B08C9Q7QVK, B07G7Q5N6G, B07YSRXJD3, B0B5QNFWJ...
3	['B08KQBYV43' '3955350843' '3955350843' '39553...	DE	3955350843	[395535086X, 3955350843, B0829LZFT1, 377247695...
4	['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...	DE	B09J945WQR	[B09J8V18FL, B09J8T6TTH, B09J8SKX9G, B09J8V9RQ...
...	...	...	...	...
316966	['B077SZ2C3Y' 'B0B14M3VZX']	UK	B0B14M3VZX	[B08X9L5RGD, B07V5FL8G6, B09Y4HKGKT, B09MW64JGM]
316967	['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851...	UK	B081YDH55K	[B0989BHLSY, B09CPNS7XV, B09895QPQF, B09CPP92Q...
316968	['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1...	UK	B09HL11V5B	[B09HKZBNZH, B09HZSRJWW, B09HL141QC, B09HX9VGW...
316969	['B01MCQMORK' 'B09JYZ325W']	UK	B09JYZ325W	[B07TR5LQSL, B08FB464L7, B09JYZ325W, B08JG8TSC...
316970	['B0B8JX92YJ' 'B09TN4MP6V' 'B0BG2LZQSL']	UK	B0BG2LZQSL	[B09XPX59JK, B09TN4MP6V, B08LGRK6MT]

316971 rows × 4 columns

In [12]:

# Assemble exactly 100 recommendations per test session: the ranked successors
# of the last viewed item first, topped up with globally popular items.
preds = []

for pred_orig, raw_prev_items in tqdm(
    zip(df_test['next_item_prediction'], df_test['prev_items']), total=len(df_test)
):
    if isinstance(pred_orig, float):
        # NaN left by Series.map: the last item has no known successor, so
        # fall back to pure popularity.
        pred = top200[:100]
    else:
        # Copy before padding. The mapped value is the list object held in
        # next_item_map, shared by every row with the same last item.
        # Appending to it in place would leak one session's padding into the
        # others and skip their own prev_items filter.
        pred = list(pred_orig[:100])
        if len(pred) < 100:
            # Skip items already recommended or already viewed in this session.
            excluded = set(pred)
            excluded.update(str2list(raw_prev_items))
            for candidate in top200:
                if candidate not in excluded:
                    pred.append(candidate)
                    if len(pred) >= 100:
                        break
    preds.append(pred)

In [13]:

# Replace the raw mapped candidates with the padded lists built in the
# previous cell (preds has one entry per row of df_test, in row order).
df_test['next_item_prediction'] = preds
df_test

Out[13]:

	prev_items	locale	last_item	next_item_prediction
0	['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...	DE	B099NQFMG7	[B099NS1XPG, B08496TCCQ, B01BVG1XJS, B099NR3X6...
1	['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS']	DE	B00R9RZ9ZS	[B004ZXMV4Q, B00R9R5ND6, B095TQTZXY, B086J6RTT...
2	['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...	DE	B07G7Q5N6G	[B08C9Q7QVK, B07G7Q5N6G, B07YSRXJD3, B0B5QNFWJ...
3	['B08KQBYV43' '3955350843' '3955350843' '39553...	DE	3955350843	[395535086X, 3955350843, B0829LZFT1, 377247695...
4	['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...	DE	B09J945WQR	[B09J8V18FL, B09J8T6TTH, B09J8SKX9G, B09J8V9RQ...
...	...	...	...	...
316966	['B077SZ2C3Y' 'B0B14M3VZX']	UK	B0B14M3VZX	[B08X9L5RGD, B07V5FL8G6, B09Y4HKGKT, B09MW64JG...
316967	['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851...	UK	B081YDH55K	[B0989BHLSY, B09CPNS7XV, B09895QPQF, B09CPP92Q...
316968	['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1...	UK	B09HL11V5B	[B09HKZBNZH, B09HZSRJWW, B09HL141QC, B09HX9VGW...
316969	['B01MCQMORK' 'B09JYZ325W']	UK	B09JYZ325W	[B07TR5LQSL, B08FB464L7, B09JYZ325W, B08JG8TSC...
316970	['B0B8JX92YJ' 'B09TN4MP6V' 'B0BG2LZQSL']	UK	B0BG2LZQSL	[B09XPX59JK, B09TN4MP6V, B08LGRK6MT, B07QPV9Z7...

316971 rows × 4 columns

In [14]:

# Sanity check: summarise how many predictions each session ended up with.
prediction_lengths = df_test['next_item_prediction'].map(len)
prediction_lengths.describe()

Out[14]:

count    316971.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: next_item_prediction, dtype: float64

In [66]:

# Write the submission file: only the locale and the list of predicted items
# per session, serialised to parquet through the pyarrow engine.
df_test[['locale', 'next_item_prediction']].to_parquet('submission_task1.parquet', engine='pyarrow')

In [ ]:

Content

4662

Show Comments

Comments

MPDF

Over 1 year ago

great share

xiaoye_hua

Over 1 year ago

Thanks. Heng. Really helpful !

Just for discussion: I think this idea tries to find the following 2 patterns:

Users’ preference: which item do users usually click after this item?
The original sorting on the website (possibly position bias or something similar): maybe in the original recommended results, item1 is always followed by item2, and that is why users always click item2 after item1.

WDYT? Happy to hear your opinion. Thanks.

Liked by

You must login before you can post a comment.