Key points:
Task description: Competition — 2021 DCIC Smart Transportation: collision detection based on connected-vehicle big data, using signals collected from vehicles. The collected signals are numerous, and road conditions and driving habits differ from user to user, so there is no single obvious way to identify collisions accurately; here we use connected-vehicle big data to identify both whether a vehicle collided and when. The vehicle label file looks like this:
| 车号 | Label | CollectTime |
|---|---|---|
| 1 | 1 | 2020/8/30 21:36:09 |
| 2 | 0 | |
| 3 | 1 | 2020/8/12 8:36:46 |
joblib=1.0.1
lightgbm=3.2.0
numpy=1.20.2
pandas=1.2.3
scikit-learn=0.24.1
tqdm=4.60.0
1. The raw data have many missing timestamps and contain some outliers.
2. The raw features are strongly correlated with one another, but only weakly correlated with the collision label.
3. The data are severely imbalanced.
4. Predicting the exact collision moment is difficult.

# !unzip -o 'datasets/data.zip' -d 'datasets/'
# Install the pinned package versions with pip; the lightgbm version affects the model results
!pip list
!pip install lightgbm==3.2.0
!pip install joblib==1.0.1 lightgbm==3.2.0 numpy==1.20.2 pandas==1.2.3 scikit-learn==0.24.1 tqdm==4.60.0
Package Version
---------------------------------- -------------------
joblib                             1.0.1
numpy                              1.20.1
pandas                             1.2.4
scikit-learn                       0.24.1
scipy                              1.6.2
tqdm                               4.59.0
... (remaining packages omitted)
Collecting lightgbm==3.2.0
Downloading lightgbm-3.2.0-py3-none-win_amd64.whl (1.0 MB)
Requirement already satisfied: scikit-learn!=0.22.0 in d:\anaconda3\lib\site-packages (from lightgbm==3.2.0) (0.24.1)
Requirement already satisfied: scipy in d:\anaconda3\lib\site-packages (from lightgbm==3.2.0) (1.6.2)
Requirement already satisfied: numpy in d:\anaconda3\lib\site-packages (from lightgbm==3.2.0) (1.20.1)
Requirement already satisfied: wheel in d:\anaconda3\lib\site-packages (from lightgbm==3.2.0) (0.36.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\anaconda3\lib\site-packages (from scikit-learn!=0.22.0->lightgbm==3.2.0) (2.1.0)
Requirement already satisfied: joblib>=0.11 in d:\anaconda3\lib\site-packages (from scikit-learn!=0.22.0->lightgbm==3.2.0) (1.0.1)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.0
Requirement already satisfied: joblib==1.0.1 in d:\anaconda3\lib\site-packages (1.0.1)
Requirement already satisfied: lightgbm==3.2.0 in d:\anaconda3\lib\site-packages (3.2.0)
Collecting numpy==1.20.2
Downloading numpy-1.20.2-cp38-cp38-win_amd64.whl (13.7 MB)
Collecting pandas==1.2.3
Downloading pandas-1.2.3-cp38-cp38-win_amd64.whl (9.3 MB)
Requirement already satisfied: scikit-learn==0.24.1 in d:\anaconda3\lib\site-packages (0.24.1)
Collecting tqdm==4.60.0
Downloading tqdm-4.60.0-py2.py3-none-any.whl (75 kB)
Requirement already satisfied: scipy in d:\anaconda3\lib\site-packages (from lightgbm==3.2.0) (1.6.2)
Requirement already satisfied: wheel in d:\anaconda3\lib\site-packages (from lightgbm==3.2.0) (0.36.2)
Requirement already satisfied: python-dateutil>=2.7.3 in d:\anaconda3\lib\site-packages (from pandas==1.2.3) (2.8.1)
Requirement already satisfied: pytz>=2017.3 in d:\anaconda3\lib\site-packages (from pandas==1.2.3) (2021.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\anaconda3\lib\site-packages (from scikit-learn==0.24.1) (2.1.0)
Requirement already satisfied: six>=1.5 in d:\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas==1.2.3) (1.15.0)
Installing collected packages: numpy, tqdm, pandas
Attempting uninstall: numpy
Found existing installation: numpy 1.20.1
Uninstalling numpy-1.20.1:
Successfully uninstalled numpy-1.20.1
ERROR: Could not install packages due to an OSError: [WinError 5] 拒绝访问。: 'D:\\anaconda3\\Lib\\site-packages\\~umpy\\core\\_multiarray_tests.cp38-win_amd64.pyd' Consider using the `--user` option or check the permissions.
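The numpy upgrade above failed with `[WinError 5] 拒绝访问` ("access denied"): on Windows the running kernel still holds numpy's files open, so the fix is to restart the kernel and re-run the install (or add `--user`). Since the lightgbm version affects the model results, it is worth sanity-checking that the pinned versions are actually active; a minimal sketch:

```python
from importlib import metadata

# Pinned versions from the install commands above; a silent mismatch
# (e.g. after the failed numpy upgrade) can change the model's output.
pins = {'joblib': '1.0.1', 'lightgbm': '3.2.0', 'numpy': '1.20.2',
        'pandas': '1.2.3', 'scikit-learn': '0.24.1', 'tqdm': '4.60.0'}
for pkg, want in pins.items():
    try:
        have = metadata.version(pkg)
    except metadata.PackageNotFoundError:
        have = '(not installed)'
    flag = '' if have == want else '  <-- mismatch'
    print(f'{pkg}: pinned {want}, installed {have}{flag}')
```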
import pandas as pd
import numpy as np
from joblib import Parallel,delayed
from tqdm.notebook import tqdm
import os
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import KFold,StratifiedKFold,StratifiedShuffleSplit
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")
# Parallelism for merging the csv files; set this to the number of CPU cores you have (default 4)
jobs = 4
label_path = './datasets/data/train/train_labels.csv'
trn_path = './datasets/data/train'
test_path = './datasets/data/test'
result_path = './work/prediction_result'
if not os.path.exists(result_path):  # create the output directory if it does not exist
    os.makedirs(result_path)
(1) EV batteries have short-circuit protection: on a violent impact the airbag controller signals the battery management system (BMS), and the BMS opens the battery relay to cut the high-voltage circuit and prevent a battery fire. So at a collision the battery main-negative relay switches from connected to open;
(2) In the vast majority of records, the battery main-negative relay is in the open state at the instant of the collision;
(3) At a collision, the vehicle speed changes sharply;
df_label = pd.read_csv(label_path)
# Vehicles 5, 19, 36 and 94 are anomalous; correct their labels to 0
df_label.loc[4,:] = (5,0,np.NaN)
df_label.loc[18,:] = (19,0,np.NaN)
df_label.loc[35,:] = (36,0,np.NaN)
df_label.loc[93,:] = (94,0,np.NaN)
# Vehicle 77's collision time is wrong; correct it to 13:37:12
df_label.loc[76,:] = (77,1,'2020/10/20 13:37:12')
# df_label.to_csv('./原始数据/train_labels_revised.csv',index=False)
In the collision-state model, rules are first used to pick out the vehicles with violent collisions, and a LightGBM model then predicts the remaining vehicles. The two stages share part of the feature engineering and sampling, so those details are described below; the rule model is detailed in 3.1.4 and the LightGBM model in 3.2.4.
Feature engineering considers two aspects: state information and motion information. For state information the two start/stop features matter most: when the relay switches from connected to open, if_off ramps from -5 back to -1 and is 0 at all other times; when the relay switches from open to connected, if_on ramps from -1 down to -5 and is 0 otherwise. For motion information we construct the vehicle's instantaneous acceleration, local acceleration and acceleration statistics, and bin the main features. These new features correlate much more strongly with the collision label, which makes the subsequent sampling and modelling easier.
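As a toy illustration of the if_off construction (assuming the relay category code is 1 for connected and 0 for open, as the later code implies): summing the differences against the previous five samples produces exactly the -5 → -1 ramp just after the relay opens.

```python
import numpy as np
import pandas as pd

# Toy relay signal: connected (1) for six samples, then open (0) for six,
# mimicking '电池包主负继电器状态cate' around a collision.
s = pd.Series([1]*6 + [0]*6)

a = pd.DataFrame()
for i in np.arange(5):
    a['d'+str(i)] = s - s.shift(i+1)   # difference vs. the (i+1)-th previous sample
if_off = a.sum(axis=1)
print(if_off.tolist())  # 0 everywhere except -5,-4,-3,-2,-1 right after the relay opens
```

The mirror-image if_on feature uses `s.shift(-i-1)` and ramps from -1 down to -5 just before the relay reconnects.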

The csv_find() function gathers the csv file paths in batch: csvs holds the csv files for vehicles 1-120 in the training folder, and csvs_test those for vehicles 121-260 in the test folder;
the read_csv() function cleans each csv file and performs feature engineering;
the applyParallel_concat() function concatenates all the cleaned data;
the col_feature1() function under-samples the data for the rule model;
the col_feature2() function under-samples and engineers features for the LightGBM model.
def csv_find(parent_path, file_flag):
    '''Gather csv file paths in batch'''
    df_paths = []
    for root, dirs, files in os.walk(parent_path):  # os.walk yields (dir path, subdirs, filenames)
        for file in files:
            if file.find(file_flag) != -1:  # keep only csv files
                df_paths.append(root + '/' + file)
    return df_paths
csvs = csv_find(trn_path, file_flag='csv')
csvs_test = csv_find(test_path, file_flag='csv')
csvs.remove(label_path)
def read_csv(path):
    del_cols = ['车辆行驶里程','驾驶员需求扭矩值']
    # '车速' is treated separately
    num_cols = ['低压蓄电池电压','整车当前总电流','整车当前总电压']
    '''Data cleaning'''
    # read the file
    df0 = pd.read_csv(path, index_col=False, low_memory=False)
    # drop rows with too many missing values (keep rows with at least 17 non-null fields)
    df = df0.dropna(axis=0, thresh=17)
    df = df.rename(columns={'采集时间':'CollectTime'})
    df['CollectTime'] = pd.to_datetime(df['CollectTime'], format='%Y-%m-%d %H:%M:%S')
    # de-duplicate
    df.drop_duplicates(subset=['车号','CollectTime'], keep='first', inplace=True)
    # sort by time
    df = df.sort_values('CollectTime').reset_index(drop=True)
    # drop uninformative features
    df = df.drop(del_cols, axis=1)
    # gear state: map '驻车' (park) to '空档' (neutral)
    df['整车当前档位状态'] = df['整车当前档位状态'].replace('驻车','空档')
    # battery main-negative relay state: map '粘连' (stuck) to '断开' (open)
    df['电池包主负继电器状态'] = df['电池包主负继电器状态'].replace('粘连','断开')
    # drop rows where the driver-seat occupancy sensor failed
    df = df[df['主驾驶座占用状态'] != '传感器故障']
    '''Feature construction'''
    df['time_delta'] = df['CollectTime'].diff().dt.total_seconds()
    df['time_delta_5'] = (df['CollectTime'] - df['CollectTime'].shift(5)).dt.total_seconds()
    # Feature 1: start/stop indicators — e.g. vehicle 162, row 80 (2020-12-02 13:04:13)
    a = pd.DataFrame()
    b = pd.DataFrame()
    df['电池包主负继电器状态cate'] = df['电池包主负继电器状态'].astype('category').cat.codes
    for i in np.arange(5):
        a['电池状态'+str(i)] = df['电池包主负继电器状态cate'] - df['电池包主负继电器状态cate'].shift(i+1)
        b['电池状态'+str(i)] = df['电池包主负继电器状态cate'] - df['电池包主负继电器状态cate'].shift(-i-1)
    df['if_off'] = a.sum(axis=1)
    df['if_on'] = b.sum(axis=1)
    # Feature 2: speed features
    df['v_diff1'] = df['车速'].diff()/df['time_delta']  # for the rule classifier
    df['v_diff2'] = -df['车速'].shift(3).rolling(window=3).mean()/df['time_delta_5']  # rule classifier (mean of 3 of the previous 5 samples)
    df['v_diff3'] = df['车速'].diff()  # for the rules & LGBM
    df['v_diff4'] = df['v_diff1'].shift(-1)  # for time prediction
    df['a_min5'] = df['v_diff1'].rolling(window=3).min()   # for the LGBM model
    df['a_mean5'] = df['v_diff1'].rolling(window=3).mean() # for the LGBM model
    df['a_max3'] = df['v_diff1'].rolling(window=3).min()   # for the LGBM model (note: despite the name this is a rolling min, kept as in the original run)
    df = df.iloc[5:, :]
    # drop rows whose timestamps span too long a gap
    df = df[df['time_delta_5'] < 90]
    return df
def applyParallel_concat(paths, func, jobs=4):
    ret = Parallel(n_jobs=jobs)(delayed(func)(csv) for csv in tqdm(paths))
    return pd.concat(ret)
df_trnall = applyParallel_concat(csvs,read_csv,jobs=jobs)
print('df_trnall',df_trnall.shape)
df_testall = applyParallel_concat(csvs_test,read_csv,jobs=jobs)
print('df_testall',df_testall.shape)
# df_trnall.columns
df_trnall (3928449, 29)
df_testall (4285948, 29)
By domain knowledge, the battery main-negative relay is open after a collision; moreover, every training label falls near the instant the relay opens, i.e. where if_off lies in the -3 to -5 range. To allow for a parked vehicle being rear-ended we add the condition speed > 0, and we add condition 3 to discard the normal low-speed rows that appear during start-up (condition 3 is in the col_feature2 function).

def col_feature1(df):
    # filter by rules
    df = df[(df['电池包主负继电器状态cate']==0)]    # stop condition 1
    # vehicle 68 has off_or_on == -3
    df = df[(df['if_off']<-2) | (df['车速']!=0)]  # stop condition 2
    return df
df_trnall2 = col_feature1(df_trnall)
df_testall2 = col_feature1(df_testall)
print('trn',df_trnall2.shape)
print('test',df_testall2.shape)
trn (9178, 29)
test (10772, 29)
Since a collision is a continuous process, the 5 seconds before and after each collision time are also marked as collisions. This grows the positive training labels from 49 to 154 rows and improves the model's generalisation.

# Re-sample the training labels
df_label['CollectTime'] = pd.to_datetime(df_label['CollectTime'], format='%Y-%m-%d %H:%M:%S')
# expand the number of positive labels
df_label_new = df_label
df_label1 = df_label[df_label['Label']==1]
for kind, kind_df in df_label1.groupby('车号'):
    # add labels for the 5 seconds before and after
    for t in np.arange(5):
        new_row1 = pd.DataFrame({'车号': kind, 'Label': 1,
                                 'CollectTime': kind_df['CollectTime'].iloc[0] + pd.Timedelta(seconds=t+1)}, index=[1])
        new_row2 = pd.DataFrame({'车号': kind, 'Label': 1,
                                 'CollectTime': kind_df['CollectTime'].iloc[0] - pd.Timedelta(seconds=t+1)}, index=[1])
        df_label_new = df_label_new.append(new_row1, ignore_index=True)
        df_label_new = df_label_new.append(new_row2, ignore_index=True)
# merge the filtered training data with the re-sampled labels
df = pd.merge(df_trnall2,df_label_new, on=['车号', 'CollectTime'], how='left')
df['Label'] = df['Label'].fillna(0)
df1 = df[df['Label']==1]
print('标签1数据量',df1.shape)
print('标签1车辆数量',df1['车号'].nunique())
标签1数据量 (154, 30)
标签1车辆数量 49
Judging from the constructed speed features, the vehicles with violent collisions are easy to separate, so a rule model predicts them first. To preserve generalisation and avoid overfitting, the separating thresholds are set very wide.

# classify by speed difference
feat = (df['v_diff1']<-9)|(df['v_diff2']<-6)|(df['v_diff3']<-40)
feat_test = (df_testall2['v_diff1']<-9)|(df_testall2['v_diff2']<-6)|(df_testall2['v_diff3']<-40)
print('trn规则分类车辆数:',df[feat]['车号'].nunique())
print('trn规则分类车辆:',df[feat]['车号'].sort_values().unique())
print('-'*100)
print('test规则分类车辆数:',df_testall2[feat_test]['车号'].nunique())
print('test规则分类车辆:',df_testall2[feat_test]['车号'].sort_values().unique())
trn规则分类车辆数: 30
trn规则分类车辆: [  1   3  12  15  17  35  38  41  44  47  50  56  58  61  62  64  65  68  69  74  76  77  85  98 105 110 112 113 119 120]
----------------------------------------------------------------------------------------------------
test规则分类车辆数: 45
test规则分类车辆: [132 133 135 138 140 147 150 152 154 158 161 163 164 168 170 172 175 176 186 187 193 197 201 202 205 206 207 209 210 212 213 222 223 227 230 231 233 235 244 249 252 253 254 255 260]
## Extracting the results
For the rule-classified data (df_trn_label and df_testall2), group by vehicle ID and take the earliest timestamp as the result, stored in sub_trn and sub_rule.
df_trn_label = df[feat]  # vehicles predicted by the rules
sub_trn = pd.DataFrame(columns=['车号','Label_pred','CollectTime_pred'])
sub_trn = pd.merge(df_label, sub_trn, on=['车号'], how='left')
for kind, kind_df in df_trn_label.groupby('车号'):
    sub_trn.loc[sub_trn[sub_trn['车号']==kind].index, 'Label_pred'] = 1
    sub_trn.loc[sub_trn[sub_trn['车号']==kind].index, 'CollectTime_pred'] = kind_df['CollectTime'].iloc[0]
sub_trn['CollectTime_pred'] = pd.to_datetime(sub_trn['CollectTime_pred'], format='%Y-%m-%d %H:%M:%S')
print('trn预测最大时间差:', np.abs(sub_trn['CollectTime_pred']-sub_trn['CollectTime']).max())
# extract the vehicles that still need prediction
cols = ['车号','Label','CollectTime']
trn_for_pred = sub_trn[sub_trn['Label_pred']!=1][cols]
print('trn需预测数据:', trn_for_pred.shape)
# rule-model submission
sub_rule = pd.DataFrame(columns=['车号','Label','CollectTime'])
sub_rule['车号'] = np.arange(121, 261)
for kind, kind_df in df_testall2[feat_test].groupby('车号'):
    sub_rule.loc[sub_rule[sub_rule['车号']==kind].index, 'Label'] = 1
    sub_rule.loc[sub_rule[sub_rule['车号']==kind].index, 'CollectTime'] = kind_df['CollectTime'].iloc[0]
test_for_pred = sub_rule[sub_rule['Label']!=1][cols]
# trn_for_pred.to_csv('./train_label_for_pred.csv', index=False)
# test_for_pred.to_csv('./test_label_for_pred.csv', index=False)
# sub_rule.to_csv('./submit_rule.csv', index=False)
trn预测最大时间差: 0 days 00:00:04
trn需预测数据: (90, 3)
Since the rule model has already classified part of the vehicles, LightGBM only needs to predict the remaining vehicles stored in trn_for_pred and test_for_pred.
csvs_LGB = []
csvs_test_LGB = []
for i in trn_for_pred['车号']:
    path = trn_path + '/' + str(i) + '.csv'
    csvs_LGB.append(path)
for i in test_for_pred['车号']:
    path = test_path + '/' + str(i) + '.csv'
    csvs_test_LGB.append(path)
print('训练集csv数:', len(csvs_LGB))
print('测试集csv数:', len(csvs_test_LGB))
训练集csv数: 90
测试集csv数: 95
df_trnall_LGB = applyParallel_concat(csvs_LGB,read_csv,jobs=jobs)
print('df_trnall',df_trnall_LGB.shape)
df_testall_LGB = applyParallel_concat(csvs_test_LGB,read_csv,jobs=jobs)
print('df_testall',df_testall_LGB.shape)
df_trnall_LGB.columns
df_trnall (2923559, 29)
df_testall (2954171, 29)
Index(['车号', 'CollectTime', '加速踏板位置', '电池包主负继电器状态', '电池包主正继电器状态', '制动踏板状态',
'驾驶员离开提示', '主驾驶座占用状态', '驾驶员安全带状态', '手刹状态', '整车钥匙状态', '低压蓄电池电压',
'整车当前档位状态', '整车当前总电流', '整车当前总电压', '车速', '方向盘转角', 'time_delta',
'time_delta_5', '电池包主负继电器状态cate', 'if_off', 'if_on', 'v_diff1',
'v_diff2', 'v_diff3', 'v_diff4', 'a_min5', 'a_mean5', 'a_max3'],
dtype='object')
The features in the LightGBM model differ slightly from those in the rule model; the col_feature2() function performs this feature engineering.
def col_feature2(df):
    cate_cols = ['制动踏板状态', '驾驶员离开提示', '主驾驶座占用状态', '驾驶员安全带状态',
                 '手刹状态', '整车钥匙状态', '整车当前档位状态']
    cate_cols2 = []
    # encode the categorical features
    for col in cate_cols:
        df[col+'cate'] = df[col].astype('category').cat.codes
        cate_cols2.append(col+'cate')
    df_code_dict = {col: {code: cate for code, cate in enumerate(df[col].astype('category').cat.categories)}
                    for col in cate_cols}
    df['整车钥匙状态catestd'] = df['整车钥匙状态cate'].rolling(window=5, center=True).std()
    # filter by rules
    df = df[(df['电池包主负继电器状态cate']==0)]    # stop condition 1
    df = df[(df['if_on']==0) | (df['车速']>20)]   # stop condition 2: drop start-up rows, unless the speed is anomalously high
    df = df[(df['if_off']<-3) | (df['车速']!=0)]  # stop condition 3: drop shutdown rows, unless the speed is anomalously above 0
    # bin the speed
    bin1 = [-0.1, 0.01, 0.5, 1.5, 3, 100]
    df['v_bin'] = pd.cut(df['车速'], bin1, labels=False)
    # accelerator-pedal feature
    df['a0'] = df.apply(lambda x: 1 if (x['加速踏板位置'] > 0) else 0, axis=1)
    df['a_min5'] = df['a_min5'] - df['v_bin']*df['v_bin']/2 - df['a0']*1.5
    df['a_mean5'] = df['a_mean5'] - df['v_bin']*df['v_bin']/2 - df['a0']*1.5
    # df['a_min5'] = df['a_min5'] - df['v_bin']*df['v_bin'] - df['a0']*3
    # df['a_mean5'] = df['a_mean5'] - df['v_bin']*df['v_bin'] - df['a0']*3
    df['v_diff1'] = df['v_diff1'] - df['v_bin']*df['v_bin'] - df['a0']*3
    df['v_diff3'] = df['v_diff3'] - df['v_bin']*df['v_bin'] - df['a0']*3
    # bin the acceleration
    bin1 = [-100, -15, -10, -6, -3, -0.01, 0.1, 100]
    df['v_diff3_bin'] = pd.cut(df['v_diff3'], bin1, labels=False)
    ori_cols = ['加速踏板位置', '电池包主负继电器状态', '电池包主正继电器状态', '制动踏板状态',
                '驾驶员离开提示', '主驾驶座占用状态', '驾驶员安全带状态', '手刹状态', '整车钥匙状态', '低压蓄电池电压',
                '整车当前档位状态', '整车当前总电流', '整车当前总电压', '车速', '方向盘转角']
    cate_cols = ['电池包主负继电器状态cate', '制动踏板状态cate', '驾驶员离开提示cate', '主驾驶座占用状态cate',
                 '驾驶员安全带状态cate', '手刹状态cate', '整车钥匙状态cate', '整车当前档位状态cate']
    choose_cols = ['time_delta', 'time_delta_5', 'if_on', 'a0', 'v_bin', 'v_diff3', 'v_diff2', 'v_diff4']
    df = df.drop(ori_cols+cate_cols+choose_cols, axis=1)
    return df, df_code_dict
df_trnall2_LGB, df_code_dict1 = col_feature2(df_trnall_LGB)
print('trn', df_trnall2_LGB.shape)
df_testall2_LGB, df_code_dict2 = col_feature2(df_testall_LGB)
print('test', df_testall2_LGB.shape)
print(df_trnall2_LGB.columns)
# df_testall2_LGB.to_csv('./test_data.csv',encoding='GBK',index=False)
# df_code_dict1 == df_code_dict2
# df_code_dict1
trn (4528, 9)
test (5100, 9)
Index(['车号', 'CollectTime', 'if_off', 'v_diff1', 'a_min5', 'a_mean5', 'a_max3',
'整车钥匙状态catestd', 'v_diff3_bin'],
dtype='object')
The labels were already re-sampled into df_label_new, so a plain pd.merge() is enough to join them on here.

df_LGB = pd.merge(df_trnall2_LGB, df_label_new, on=['车号', 'CollectTime'], how='left')
df_LGB['Label'] = df_LGB['Label'].fillna(0)
df1_LGB = df_LGB[df_LGB['Label']==1]
print('训练数据label==1数量:',df1_LGB.shape)
print('训练数据label==1车数量:',df1_LGB['车号'].nunique())
# df.to_csv('./train_data.csv',encoding='GBK',index=False)
# df1.to_csv('./train_label_data.csv',encoding='GBK',index=False)
训练数据label==1数量: (47, 10)
训练数据label==1车数量: 19
To strengthen the tree model's predictive power, the raw features are cross-corrected; after the correction, these main features correlate visibly more strongly with the label. The state prediction is then obtained through stratified sampling and parameter tuning. Because the label re-sampling can leave the same vehicle with several adjacent predicted times, the earliest one is taken as the prediction.

## rename the columns
cols = {'车号':'Num', '整车钥匙状态catestd':'key_std'}
df_LGB = df_LGB.rename(columns=cols)
df_test = df_testall2_LGB.rename(columns=cols)
# training features
f_names = [x for x in df_LGB.columns if x not in ['Num','CollectTime','Label']]
test_data = df_test[f_names]
print('data', df_LGB.shape)
print('testdata', test_data.shape)
print(df_LGB.columns)
data (4528, 10)
testdata (5100, 7)
Index(['Num', 'CollectTime', 'if_off', 'v_diff1', 'a_min5', 'a_mean5',
'a_max3', 'key_std', 'v_diff3_bin', 'Label'],
dtype='object')
The data passed to split.split() is trn_for_pred (one row per vehicle); trn_data and val_data are then built from the sampled vehicle IDs. This guarantees the split is by number of vehicles rather than by number of label rows.

split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=2021)
# split by vehicle ID from the label table, then select the matching rows from df_LGB, so trn and val share no vehicle
for trn_idx, val_idx in split.split(trn_for_pred['车号'], trn_for_pred['Label']):
    # be sure to use .values here (otherwise `in` checks the Series index), and in the right place
    trn_data = df_LGB[df_LGB['Num'].apply(lambda x: x in trn_for_pred.iloc[trn_idx]['车号'].values)].reset_index(drop=True)
    val_data = df_LGB[df_LGB['Num'].apply(lambda x: x in trn_for_pred.iloc[val_idx]['车号'].values)].reset_index(drop=True)
trn_x, trn_y = trn_data[f_names], trn_data.loc[:,'Label']
val_x, val_y = val_data[f_names], val_data.loc[:,'Label']
print('trn Label', trn_data['Label'].value_counts())
print('val Label', val_data['Label'].value_counts())
print('trn车数量', trn_data['Num'].nunique())
print('val车数量', val_data['Num'].nunique())
trn Label
0.0    3080
1.0      34
Name: Label, dtype: int64
val Label
0.0    1401
1.0      13
Name: Label, dtype: int64
trn车数量 63
val车数量 27
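A minimal sketch of the same vehicle-disjointness idea with sklearn's GroupShuffleSplit and made-up toy data (this is not what the notebook uses — StratifiedShuffleSplit on the one-row-per-vehicle label table additionally preserves the class ratio — but it shows the guarantee directly):

```python
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# Toy frame: six rows from three vehicles. Splitting on the 'Num' group
# guarantees that no vehicle contributes rows to both train and validation.
toy = pd.DataFrame({'Num':   [1, 1, 2, 2, 3, 3],
                    'Label': [1, 0, 0, 0, 1, 0]})
gss = GroupShuffleSplit(n_splits=1, test_size=0.34, random_state=0)
trn_idx, val_idx = next(gss.split(toy, groups=toy['Num']))
overlap = set(toy.loc[trn_idx, 'Num']) & set(toy.loc[val_idx, 'Num'])
print('vehicles in both splits:', overlap)  # always the empty set
```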
# tuning raised val F1 from 0.83 to 0.93 and the leaderboard score from 0.85 to 0.925; for imbalanced samples keep colsample as small as possible
model = LGBMClassifier(n_jobs=8,
                       n_estimators=30000,
                       objective='binary',
                       boosting_type='gbdt',
                       learning_rate=0.104962713,
                       subsample=0.9,
                       colsample_bytree=1,
                       is_unbalance=True,
                       # scale_pos_weight=30,
                       reg_lambda=0,
                       num_leaves=100,
                       random_state=2021)
model.fit(trn_x, trn_y,
          eval_set=[(val_x, val_y)],
          eval_metric='logloss',
          early_stopping_rounds=2000,
          verbose=3000)
# predict
pred_val = model.predict(val_x)
pred_trn = model.predict(trn_x)
pred_test = model.predict(test_data)
# scores
f1_trn = f1_score(pred_trn, trn_y)
f1_val = f1_score(pred_val, val_y)
print('trn F1', f1_trn)
print('val F1', f1_val)
Training until validation scores don't improve for 2000 rounds
[3000]  valid_0's binary_logloss: 0.00013201
[6000]  valid_0's binary_logloss: 0.0001186
[9000]  valid_0's binary_logloss: 0.000114999
[12000] valid_0's binary_logloss: 0.000114034
[15000] valid_0's binary_logloss: 0.000113444
[18000] valid_0's binary_logloss: 0.000112927
[21000] valid_0's binary_logloss: 0.000112591
[24000] valid_0's binary_logloss: 0.000112362
[27000] valid_0's binary_logloss: 0.000112196
[30000] valid_0's binary_logloss: 0.000112071
Did not meet early stopping. Best iteration is:
[29999] valid_0's binary_logloss: 0.000112069
trn F1 1.0
val F1 1.0
The LightGBM predictions are collected per vehicle into sub; sub_rule and sub are then merged into the overall prediction sub_all.

df_test['label_pred'] = pred_test
df_test_sub = df_test[df_test['label_pred']==1]
sub = pd.DataFrame(columns=['车号','Label','CollectTime'])
sub['车号'] = np.arange(121, 261)
for kind, kind_df in df_test_sub.groupby('Num'):
    sub.loc[sub[sub['车号']==kind].index, 'Label'] = 1
    sub.loc[sub[sub['车号']==kind].index, 'CollectTime'] = kind_df['CollectTime'].iloc[0]
print('submit_pred:', sub[sub['Label']==1].shape)
# sub.to_csv('./submit_pred.csv', index=False)  # ,encoding='GBK'
submit_pred: (19, 3)
### Submission
# sub_all = pd.read_csv('./submit_rule.csv')
sub_all = sub_rule.copy()
a = sub[sub['Label']==1].index.values
sub_all.iloc[a] = sub[sub['Label']==1]
sub_all['Label'] = sub_all['Label'].fillna(0)    # fill missing labels with 0
sub_all['Label'] = sub_all['Label'].astype(int)  # cast to int
# sub_all.to_csv('./submit_all.csv', index=False)
print('总提交正样本数:', sub_all[sub_all['Label']==1].shape[0])
总提交正样本数: 64
In the collision-state model, the label re-sampling and some of the local features make the predicted times inaccurate, so the times in the state-prediction results need correcting.
The vast majority of labels sit at the if_off = -5 timestamp, so training rows with if_off = -5 are first marked 1 and the rest 0, turning time prediction into a binary classification problem. Instantaneous features are then built for it, and the simple, highly interpretable K-nearest-neighbours algorithm is used. Finally, for rows predicted Time_label = 0 the collision time is corrected to the following timestamp, giving the final result.
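A minimal sketch of the binary formulation described above, with made-up feature values (the two columns stand in for v_diff1 and v_diff4; label 1 means the if_off == -5 timestamp coincides with the true collision time):

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Hypothetical candidate timestamps: strongly negative instantaneous
# acceleration tends to mark the true collision instant (label 1).
X = np.array([[-15.0, -2.0], [-12.0, -1.0], [-14.0, -3.0],
              [ -0.5,  0.0], [ -1.0,  0.2], [ -0.8,  0.1]])
y = np.array([1, 1, 1, 0, 0, 0])

X_std = StandardScaler().fit_transform(X)
knn = KNeighborsClassifier(n_neighbors=3, p=1, metric='minkowski')  # Manhattan distance, as in the notebook
knn.fit(X_std, y)
print(knn.predict(X_std))  # the two clusters separate cleanly
```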

Both the training-set and test-set predictions need correcting, so the predicted results are re-sampled here (note: the earlier re-sampling was of the true labels, while this one is of the predicted labels; they are not the same).
After re-sampling, the times are merged with the feature data to find the nearby if_off == -5 rows, and the training labels are corrected accordingly.
label_resample() produces the training re-sampled times time1 and the test re-sampled times time1_test, which are then merged with the feature data to extract the if_off == -5 rows.

df_test_label = sub_all[sub_all['Label']==1]
df_test_label['CollectTime'] = pd.to_datetime(df_test_label['CollectTime'],format='%Y-%m-%d %H:%M:%S')
def label_resample(df):
    df_new = df[['车号','CollectTime']]
    for kind, kind_df in df.groupby('车号'):
        for t in np.arange(5):
            new_row1 = pd.DataFrame({'车号': kind,
                                     'CollectTime': kind_df['CollectTime'].iloc[0] + pd.Timedelta(seconds=t+1)}, index=[1])
            new_row2 = pd.DataFrame({'车号': kind,
                                     'CollectTime': kind_df['CollectTime'].iloc[0] - pd.Timedelta(seconds=t+1)}, index=[1])
            df_new = df_new.append(new_row1, ignore_index=True)
            df_new = df_new.append(new_row2, ignore_index=True)
    return df_new
time1 = label_resample(df_label[df_label['Label']==1])
print('trn',time1.shape)
print('trn车数量:',df_label[df_label['Label']==1].shape)
time1_test = label_resample(df_test_label)
print('test',time1_test.shape)
print('test车数量:',df_test_label.shape)
trn (539, 2)
trn车数量: (49, 3)
test (704, 2)
test车数量: (64, 3)
# find the nearby if_off == -5 rows among the predictions
df_trnall2_time = df_trnall2[df_trnall2['if_off']==-5]
df_trn_time = pd.merge(df_trnall2_time, time1, on=['车号', 'CollectTime'], how='inner')
# training TimeLabel: Time_Label = 1 where if_off == -5 coincides with the true time
df_trn_time = pd.merge(df_trn_time, df_label[df_label['Label']==1], on=['车号', 'CollectTime'], how='left')
df_trn_time['Label'] = df_trn_time['Label'].fillna(0)
print('Trn Label==1总数量:', df_trn_time.shape)
print('Trn车if_off==-5 的数量:', df_trn_time[df_trn_time['Label']==1].shape)
# same for the test set
df_testall2_time = df_testall2[df_testall2['if_off']==-5]
df_test_time = pd.merge(df_testall2_time, time1_test, on=['车号', 'CollectTime'], how='inner')
print('Test Label==1总数量:', df_test_time.shape)
Trn Label==1总数量: (49, 30)
Trn车if_off==-5 的数量: (38, 30)
Test Label==1总数量: (64, 29)
The collision time is limited by the data-collection resolution, sensor latency and manual annotation, so it carries inherent random error and is hard to pin down to within one second.
We therefore construct only four features: the current instantaneous acceleration, the next instantaneous acceleration, a rear-end indicator, and the driver-seat occupancy state.
def col_feature3(df):
    cate_cols = ['主驾驶座占用状态']
    # encode the categorical features
    for col in cate_cols:
        df[col+'cate'] = df[col].astype('category').cat.codes
    df['a1'] = df['v_diff1'].apply(lambda x: 1 if x > 0 else 0)
    # df['a2'] = df['v_diff4'].apply(lambda x: 1 if x > 0 else 0)
    ori_cols = ['加速踏板位置', '电池包主负继电器状态', '电池包主正继电器状态', '制动踏板状态',
                '驾驶员离开提示', '主驾驶座占用状态', '驾驶员安全带状态', '手刹状态', '整车钥匙状态', '低压蓄电池电压',
                '整车当前档位状态', '整车当前总电流', '整车当前总电压', '车速', '方向盘转角']
    cate_cols = ['电池包主负继电器状态cate']
    choose_cols = ['time_delta', 'time_delta_5', 'if_on', 'if_off', 'v_diff3', 'v_diff2', 'a_min5', 'a_mean5', 'a_max3']
    df = df.drop(ori_cols+cate_cols+choose_cols, axis=1)
    return df
df_trn_time2 = col_feature3(df_trn_time)
df_test_time2 = col_feature3(df_test_time)
print(df_trn_time2.columns)
Index(['车号', 'CollectTime', 'v_diff1', 'v_diff4', 'Label', '主驾驶座占用状态cate',
'a1'],
dtype='object')
# training features
f_names2 = [x for x in df_trn_time2.columns if x not in ['车号','CollectTime','Label']]
x_time_data = df_trn_time2[f_names2]
y_time_data = df_trn_time2['Label']
test_time_data = df_test_time2[f_names2]
print(test_time_data.columns)
std = StandardScaler()
x_time_data = std.fit_transform(x_time_data)
test_time_data = std.fit_transform(test_time_data)  # note: re-fits the scaler on the test set, kept as in the original run; std.transform(test_time_data) would avoid leakage
print(x_time_data.shape, test_time_data.shape)
Index(['v_diff1', 'v_diff4', '主驾驶座占用状态cate', 'a1'], dtype='object')
(49, 4) (64, 4)
pred_test_Kfold = pd.DataFrame()
f1_trn, f1_val = [], []
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for iteration, (trn_idx, val_idx) in enumerate(fold.split(x_time_data, y_time_data)):
    # print('------------{} fold -------------'.format(iteration))
    x_trn_time, y_trn_time = x_time_data[trn_idx], y_time_data[trn_idx]
    x_val_time, y_val_time = x_time_data[val_idx], y_time_data[val_idx]
    knn = KNeighborsClassifier(n_neighbors=5, p=1, metric='minkowski')
    knn.fit(x_trn_time, y_trn_time)
    # predict
    pred_trn = knn.predict(x_trn_time)
    pred_val = knn.predict(x_val_time)
    pred_test_Kfold['label'+str(iteration)] = knn.predict(test_time_data)
    # scores
    f1_trn.append(f1_score(pred_trn, y_trn_time))
    f1_val.append(f1_score(pred_val, y_val_time))
print('trn F1', f1_trn)
print('val F1', f1_val)
print('trn_F1mean', np.mean(f1_trn))
print('val_F1mean', np.mean(f1_val))
trn F1 [0.8857142857142858, 0.8823529411764706, 0.8695652173913044, 0.8695652173913044, 0.8732394366197184]
val F1 [0.75, 0.888888888888889, 0.888888888888889, 0.888888888888889, 0.8750000000000001]
trn_F1mean 0.8760874196586167
val_F1mean 0.8583333333333334
pred_test_Kfold['Time_Pred'] = pred_test_Kfold.mean(axis=1)
pred_test_time = pred_test_Kfold['Time_Pred'].apply(lambda x: 1 if x>0.6 else 0)
df_test_time2['Time_Pred'] = pred_test_time
df_test_time2[df_test_time2['Time_Pred']==0]
| | 车号 | CollectTime | v_diff1 | v_diff4 | 主驾驶座占用状态cate | a1 | Time_Pred |
|---|---|---|---|---|---|---|---|
| 1 | 132 | 2021-01-30 07:47:55 | -15.2190 | 0.000000 | 1 | 0 | 0 |
| 4 | 138 | 2020-11-26 19:07:39 | -18.5310 | -1.117500 | 1 | 0 | 0 |
| 19 | 168 | 2020-11-10 14:40:15 | -14.1565 | -10.703000 | 1 | 0 | 0 |
| 39 | 207 | 2020-12-23 16:39:46 | -14.6250 | -7.078125 | 0 | 0 | 0 |
The rows with Time_Pred = 0 are located and their time is corrected to the if_off == -4 timestamp, which is submitted as the final result.

# find the time corresponding to if_off == -4
df_testall2_time_if_off4 = df_testall2[df_testall2['if_off']==-4]
df_test_time_if_off4 = pd.merge(df_testall2_time_if_off4,time1_test, on=['车号', 'CollectTime'], how='inner')
df_test_time_if_off4['Time_Pred'] = pred_test_time
df_test_time_if_off4.shape
(64, 30)
Time1_test_sub = df_test_time2.loc[df_test_time2['Time_Pred']==1,['车号','CollectTime']]
Time2_test_sub = df_test_time_if_off4.loc[df_test_time_if_off4['Time_Pred']==0,['车号','CollectTime']]
Time_test_sub = pd.concat([Time1_test_sub,Time2_test_sub])
# submit the final result
sub_all_time = pd.merge(sub_all[['车号','Label']],Time_test_sub,on=['车号'], how='left')
sub_all_time.to_csv(result_path+'/result.csv',index=False)
print('Label1数量:',sub_all_time[sub_all_time['Label']==1].shape)
print('-------------------end-------------------')
Label1数量: (64, 3)
-------------------end-------------------
