Loading

Task 1: Next Product Recommendation

rule baseline

recall by next_item counter

heng_zheng

A simple baseline: recall candidates by counting next-item transitions (LB 0.29+).

In [1]:
import warnings
warnings.simplefilter('ignore')

import gc
import re
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
In [2]:
# df_prod = pd.read_csv('data/products_train.csv')
# df_prod
In [3]:
# Training sessions: one row per session with the viewed items (`prev_items`,
# stored as a stringified array), the `next_item` column and the `locale`.
df_sess = pd.read_csv('data/sessions_train.csv')
df_sess
Out[3]:
prev_items next_item locale
0 ['B09W9FND7K' 'B09JSPLN1M'] B09M7GY217 DE
1 ['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B... B001B4THSA DE
2 ['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ... B0767DTG2Q DE
3 ['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM... B0B4R9NN4B DE
4 ['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'] B0BGVBKWGZ DE
... ... ... ...
3606244 ['B086CYFSKW' 'B0874F9859' 'B086CYFSKW'] B07B5TYD76 IT
3606245 ['B09NRZKZ7V' 'B08WJTPV93'] B08L1P4C3D IT
3606246 ['B085JFX7MP' 'B085JGHW8R'] B01MPWVD44 IT
3606247 ['B00B0UING2' 'B00B0UING2'] B00D3HYEZ4 IT
3606248 ['B092S9D1SD' 'B09XQQ1S72' 'B0852MS7QC' 'B0B1V... B0B7RX65YP IT

3606249 rows × 3 columns

In [4]:
# Test sessions for task 1: `prev_items` and `locale` only, no `next_item`.
df_test = pd.read_csv('data/sessions_test_task1.csv')
df_test
Out[4]:
prev_items locale
0 ['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC... DE
1 ['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'] DE
2 ['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7... DE
3 ['B08KQBYV43' '3955350843' '3955350843' '39553... DE
4 ['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB... DE
... ... ...
316966 ['B077SZ2C3Y' 'B0B14M3VZX'] UK
316967 ['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851... UK
316968 ['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1... UK
316969 ['B01MCQMORK' 'B09JYZ325W'] UK
316970 ['B0B8JX92YJ' 'B09TN4MP6V' 'B0BG2LZQSL'] UK

316971 rows × 2 columns

In [5]:
def str2list(x):
    """Parse a stringified item array such as "['A' 'B']" into ['A', 'B'].

    Square brackets and single quotes are removed, then the remainder is
    split on runs of whitespace (spaces, newlines, carriage returns), so
    the result never contains empty tokens.
    """
    without_markup = x.translate(str.maketrans('', '', "[]'"))
    return without_markup.split()
In [6]:
# Transition table: item -> list of every item observed directly after it.
# Repeats are kept on purpose so the lists can be frequency-counted later.
next_item_dict = defaultdict(list)

# Iterate over the two columns directly; `iterrows()` builds a Series for
# every one of the millions of rows and is far slower for the same result.
for prev_items_raw, next_item in tqdm(
    zip(df_sess['prev_items'], df_sess['next_item']),
    total=len(df_sess),
):
    prev_items = str2list(prev_items_raw)
    # Pair each viewed item with its successor; the session's `next_item`
    # is the successor of the last viewed item. A session with no parsed
    # items yields no pairs instead of raising IndexError.
    successors = prev_items[1:] + [next_item]
    for item, following in zip(prev_items, successors):
        next_item_dict[item].append(following)
In [7]:
# Test sessions carry no `next_item`, but the transitions inside each session
# are still added to the table. NOTE: this appends to the existing
# `next_item_dict`, so running the cell twice counts these transitions twice.
for session in tqdm(df_test['prev_items'], total=len(df_test)):
    viewed = str2list(session)
    # zip() over the list and its one-step shift yields every consecutive
    # pair; sessions with fewer than two items contribute nothing.
    for item, following in zip(viewed, viewed[1:]):
        next_item_dict[item].append(following)
In [8]:
# For every item keep at most its 100 most frequent successors,
# ordered from most to least frequent.
next_item_map = {}

for item, followers in tqdm(next_item_dict.items()):
    ranked = Counter(followers).most_common(100)
    next_item_map[item] = [candidate for candidate, _count in ranked]
In [9]:
# Long-format table of all observed transitions: one row per
# (item, following item) occurrence, repeats included.
df_next = (
    pd.DataFrame({
        'item': list(next_item_dict.keys()),
        'next_item': list(next_item_dict.values()),
    })
    .explode('next_item')
    .reset_index(drop=True)
)
df_next
Out[9]:
item next_item
0 B09W9FND7K B09JSPLN1M
1 B09W9FND7K B09JSPLN1M
2 B09W9FND7K B09JSPLN1M
3 B09W9FND7K B09JSPLN1M
4 B09W9FND7K B078WW2WN5
... ... ...
16340339 B0BFPLN8FQ B0BF5GT13X
16340340 B09737CD6H B08DJ9SQFY
16340341 B084GZ3QZ7 B00CBAW8SE
16340342 B081TXFKS5 B004RN5I6W
16340343 B07H31QX5Q B08R7685RH

16340344 rows × 2 columns

In [10]:
# The 200 items that occur most often as a successor, most frequent first.
top200 = df_next['next_item'].value_counts().head(200).index.tolist()
In [11]:
# Candidates are looked up from the last item of each test session; a last
# item that is not a key of `next_item_map` maps to NaN.
df_test['last_item'] = df_test['prev_items'].map(lambda raw: str2list(raw)[-1])
df_test['next_item_prediction'] = df_test['last_item'].map(next_item_map)
df_test
Out[11]:
prev_items locale last_item next_item_prediction
0 ['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC... DE B099NQFMG7 [B099NS1XPG, B08496TCCQ, B01BVG1XJS, B099NR3X6...
1 ['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'] DE B00R9RZ9ZS [B004ZXMV4Q, B00R9R5ND6, B095TQTZXY, B086J6RTT...
2 ['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7... DE B07G7Q5N6G [B08C9Q7QVK, B07G7Q5N6G, B07YSRXJD3, B0B5QNFWJ...
3 ['B08KQBYV43' '3955350843' '3955350843' '39553... DE 3955350843 [395535086X, 3955350843, B0829LZFT1, 377247695...
4 ['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB... DE B09J945WQR [B09J8V18FL, B09J8T6TTH, B09J8SKX9G, B09J8V9RQ...
... ... ... ... ...
316966 ['B077SZ2C3Y' 'B0B14M3VZX'] UK B0B14M3VZX [B08X9L5RGD, B07V5FL8G6, B09Y4HKGKT, B09MW64JGM]
316967 ['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851... UK B081YDH55K [B0989BHLSY, B09CPNS7XV, B09895QPQF, B09CPP92Q...
316968 ['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1... UK B09HL11V5B [B09HKZBNZH, B09HZSRJWW, B09HL141QC, B09HX9VGW...
316969 ['B01MCQMORK' 'B09JYZ325W'] UK B09JYZ325W [B07TR5LQSL, B08FB464L7, B09JYZ325W, B08JG8TSC...
316970 ['B0B8JX92YJ' 'B09TN4MP6V' 'B0BG2LZQSL'] UK B0BG2LZQSL [B09XPX59JK, B09TN4MP6V, B08LGRK6MT]

316971 rows × 4 columns

In [12]:
# Pad every session's candidate list to 100 items using the globally most
# popular successors in `top200`.
preds = []

for prev_items_raw, candidates in tqdm(
    zip(df_test['prev_items'], df_test['next_item_prediction']),
    total=len(df_test),
):
    if not isinstance(candidates, list):
        # The lookup produced NaN (last item has no recorded successors):
        # fall back to the most popular items.
        pred = top200[:100]
    else:
        # Slice into a NEW list. The object stored in the column is the very
        # list held in `next_item_map`; appending to it in place would change
        # the map and leak one session's padding into every other session
        # that ends on the same item.
        pred = candidates[:100]
        if len(pred) < 100:
            # Skip items that are already candidates or were already viewed
            # in this session; a set keeps the membership test cheap.
            excluded = set(pred) | set(str2list(prev_items_raw))
            for popular in top200:
                if len(pred) >= 100:
                    break
                if popular not in excluded:
                    pred.append(popular)
    preds.append(pred)
In [13]:
# Overwrite the raw candidate lists with the padded predictions
# (`preds` holds one list per test row, in row order).
df_test['next_item_prediction'] = preds
df_test
Out[13]:
prev_items locale last_item next_item_prediction
0 ['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC... DE B099NQFMG7 [B099NS1XPG, B08496TCCQ, B01BVG1XJS, B099NR3X6...
1 ['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'] DE B00R9RZ9ZS [B004ZXMV4Q, B00R9R5ND6, B095TQTZXY, B086J6RTT...
2 ['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7... DE B07G7Q5N6G [B08C9Q7QVK, B07G7Q5N6G, B07YSRXJD3, B0B5QNFWJ...
3 ['B08KQBYV43' '3955350843' '3955350843' '39553... DE 3955350843 [395535086X, 3955350843, B0829LZFT1, 377247695...
4 ['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB... DE B09J945WQR [B09J8V18FL, B09J8T6TTH, B09J8SKX9G, B09J8V9RQ...
... ... ... ... ...
316966 ['B077SZ2C3Y' 'B0B14M3VZX'] UK B0B14M3VZX [B08X9L5RGD, B07V5FL8G6, B09Y4HKGKT, B09MW64JG...
316967 ['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851... UK B081YDH55K [B0989BHLSY, B09CPNS7XV, B09895QPQF, B09CPP92Q...
316968 ['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1... UK B09HL11V5B [B09HKZBNZH, B09HZSRJWW, B09HL141QC, B09HX9VGW...
316969 ['B01MCQMORK' 'B09JYZ325W'] UK B09JYZ325W [B07TR5LQSL, B08FB464L7, B09JYZ325W, B08JG8TSC...
316970 ['B0B8JX92YJ' 'B09TN4MP6V' 'B0BG2LZQSL'] UK B0BG2LZQSL [B09XPX59JK, B09TN4MP6V, B08LGRK6MT, B07QPV9Z7...

316971 rows × 4 columns

In [14]:
# Sanity check: summary statistics of the per-row prediction-list lengths.
df_test['next_item_prediction'].map(len).describe()
Out[14]:
count    316971.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: next_item_prediction, dtype: float64
In [66]:
# Write the submission file containing only the locale and prediction columns.
df_test.loc[:, ['locale', 'next_item_prediction']].to_parquet(
    'submission_task1.parquet',
    engine='pyarrow',
)
In [ ]:


Comments

MPDF
11 months ago

great share

xiaoye_hua
11 months ago

Thanks, Heng. Really helpful!

Just for discussion: I think this idea tries to find the following 2 patterns:

  1. Users’ preference: which item do users always click after this item?
  2. The original sorting on the website (possibly position bias or something similar): maybe in the original recommendation results item1 is always followed by item2, and that is the reason why users always click item2 after item1.

WDYT? Happy to hear your opinion. Thanks.

You must login before you can post a comment.

Execute