Getting started
pip install jupyterlab
Start the JupyterLab server:
jupyter-lab --ip 0.0.0.0 --port 8888 --no-browser --allow-root
pip install transformers datasets
pip install opencv-python
pip install jupyter torchvision torch torchmetrics scikit-learn peft ipykernel
pip install seaborn
pip install imbalanced-learn
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
PD_SK
from sklearn import ensemble,preprocessing,model_selection,metrics
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import random
ord('A')
chr(65)
plt.pie(s['digit'],labels=s.index,autopct='%4.2f%%',startangle=90,explode=[0,0,0,0.2],shadow=True,colors=['orange','lightgreen','lightblue','pink'])
r = plt.bar(s.index,s['digit'],color='pink',label='pnn')
plt.bar(s.index,s['digit'],bottom=s['digit'],color='lightgreen',label='grn')
plt.legend()
GDB = ensemble.AdaBoostRegressor()
params = {'n_estimators':range(38,42,2),
'learning_rate':np.linspace(0.2,1.8,4)
}
gs = model_selection.GridSearchCV(GDB,params,cv=3)
data = pd.read_csv('Alice_Springs_2019.csv')
data = data[~data.isna().any(axis=1)]
data.reset_index(inplace=True,drop=True)
data['timestamp']= pd.to_datetime(data['timestamp'],errors='coerce')
data['timestamp'][0].strftime('%Y-%m-%d')
data['timestamp'][0].day
data['Year'] = data['timestamp'].apply(lambda x:x.year)
data['Month'] = data['timestamp'].apply(lambda x:x.month)
data['Day'] = data['timestamp'].apply(lambda x:x.day)
data['Hour'] = data['timestamp'].apply(lambda x:x.hour)
pd_data = data.drop(columns=['timestamp'])
ss_feature = preprocessing.StandardScaler()
ss_ap = preprocessing.StandardScaler()
pd_data.iloc[:,1:7] = ss_feature.fit_transform(pd_data.iloc[:,1:7])
pd_data['Active_Power']= ss_ap.fit_transform(pd.DataFrame(pd_data['Active_Power']))
X = pd_data.iloc[:,1:]
Y = pd_data['Active_Power']
gs.fit(X,Y)
import joblib
import pickle
joblib.dump(gs.best_estimator_,'./adaboost.est')
model = joblib.load('./adaboost.est')
metric = metrics.r2_score
pred = model.predict(X)
metric(Y,pred)
tx,ex,ty,ey = model_selection.train_test_split(X,Y,test_size=0.2)
from imblearn.over_sampling import RandomOverSampler
metric(ey,gs.predict(ex))
gs.best_score_
tx,ex,ty,ey = model_selection.train_test_split(X,Y,test_size=0.2)
gs.fit(tx,ty)
gs.best_score_
pd_data.groupby(['Year','Month']).agg({'Active_Power':lambda x:str(x.idxmax())+' '+str( x.max())})
btm,bins,_ = plt.hist(pd_data['Active_Power'],color='lightblue',label='standardized',density=True)
plt.hist(data['Active_Power'],bottom=btm,bins=bins,color='pink',label='original',density=True)
plt.legend()
plt.xlabel('power')
plt.ylabel('density')
plt.title('Hist')
plt.savefig('hist.jpg',dpi=300,format='jpg')
import seaborn as sns
cormatrix = data.iloc[:,1:].corr()
cormatrix['Active_Power'][1:]
sns.heatmap(cormatrix,cmap='twilight_r')
sns.kdeplot(data['Active_Power'])
plt.hist(data['Active_Power'],color='pink',label='origin',density=True)
sns.scatterplot(x=data['Global_Horizontal_Radiation'],y=data['Active_Power'])
plt.cm.PuRd
plt.scatter(x=data['Global_Horizontal_Radiation'],y=data['Active_Power'],c=data['Active_Power'],linewidths=0.1,edgecolors='black',alpha=0.8,cmap='tab20_r')
plt.colorbar()
sns.regplot(x=data['Global_Horizontal_Radiation'],y=data['Active_Power'],line_kws={'color':'red'})
plt.figure(figsize=(24,5))
plt.plot_date(x=data['timestamp'][:5000],y=data['Active_Power'][:5000],fmt='-o',color='red')
plt.figure(figsize=(24,5))
plt.plot(data['Active_Power'][:5000])
rst = plt.boxplot(data['Active_Power'])
plt.show()
diff = data['Active_Power'].quantile(0.75)-data['Active_Power'].quantile(0.25)
up_bound = data['Active_Power'].median()+diff
low_bound = data['Active_Power'].median()-diff
data[(data['Active_Power']<low_bound) | (data['Active_Power']>up_bound)]
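The bounds above are median plus/minus one IQR; for comparison, a minimal sketch of the usual boxplot convention (Tukey's fences, Q1/Q3 plus/minus 1.5*IQR) on the same column, added here only as a cross-check:
q1 = data['Active_Power'].quantile(0.25)
q3 = data['Active_Power'].quantile(0.75)
iqr = q3 - q1
tukey_low, tukey_up = q1 - 1.5 * iqr, q3 + 1.5 * iqr
data[(data['Active_Power'] < tukey_low) | (data['Active_Power'] > tukey_up)]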
LLM
RTE
import torch
from transformers import AutoModelForSequenceClassification,AutoTokenizer,Trainer,TrainingArguments
from datasets import Dataset
from peft import LoraConfig,get_peft_model,prepare_model_for_kbit_training,PeftModel,TaskType
import pandas as pd
import os
llm_model_path = "C:/gemma-2b-it"
label_list = ['entailment', 'not_entailment']
tknizer = AutoTokenizer.from_pretrained(llm_model_path)
def data_tk_fn(item):
    return tknizer(item['sentence'],return_tensors='pt',padding='max_length',max_length=100,truncation=True)

def merge_sentence(item):
    s1 = ' '.join(item['sentence1'].split()[:80])
    s2 = ' '.join(item['sentence2'].split()[:20])
    sentence = f'sentence1:{s1}\nsentence2:{s2}'
    return sentence
def load_data(data_path):
    df_data = pd.read_csv(data_path,sep='\t',index_col='index')
    df_data.dropna(axis=0,ignore_index=True,inplace=True)
    df_data.label = df_data['label'].apply(lambda x: label_list.index(x))
    df_data = df_data.sample(frac=1,ignore_index=True)
    df_data.rename(columns={'label':'labels'},inplace=True)
    df_data['sentence'] = df_data.apply(merge_sentence,axis=1)
    ds = Dataset.from_pandas(df_data)
    return ds.map(function=data_tk_fn,batched=True).remove_columns(['sentence1', 'sentence2','sentence'])
train_path = 'train.tsv'
data_path = train_path
df_data = pd.read_csv(data_path,sep='\t',index_col='index')
df_data.dropna(axis=0,ignore_index=True,inplace=True)
df_data.label = df_data['label'].apply(lambda x: label_list.index(x))
df_data = df_data.sample(frac=1,ignore_index=True)
df_data.rename(columns={'label':'labels'},inplace=True)
df_data['sentence'] = df_data.apply(merge_sentence,axis=1)
ds = Dataset.from_pandas(df_data)
ds.map(function=data_tk_fn,batched=True).remove_columns(['sentence1', 'sentence2','sentence'])
df_data['s1_words'] = df_data['sentence1'].apply(lambda x:len(x.split()))
df_data['s2_words'] = df_data['sentence2'].apply(lambda x:len(x.split()))
df_data['s1_words'].describe()
df_data['s1_words'].hist()
df_data['s2_words'].describe()
df_data['s2_words'].hist()
train_ds = load_data(train_path)
dev_ds = load_data('dev.tsv')
llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_path,num_labels =2 )
lora_config = LoraConfig(task_type=TaskType.SEQ_CLS,
r =3,
inference_mode = False,
target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'])
llm_model = prepare_model_for_kbit_training(llm_model)
peft_model = get_peft_model(llm_model,lora_config)
peft_model.print_trainable_parameters()
train_arg = TrainingArguments(output_dir='./output/',
eval_strategy='steps',
logging_first_step=True,
logging_steps=10,
log_level='info',
logging_strategy='steps',
save_strategy='epoch',
per_device_train_batch_size=2)
train_arg.batch_eval_metrics
from torchmetrics.classification import Accuracy
from transformers import trainer_utils
import numpy as np
from transformers.data import data_collator
acc_metric = Accuracy(task='multiclass',num_classes=2,top_k = 1)
def metric(eval_preds):
    predictions = eval_preds.predictions
    label_ids = eval_preds.label_ids
    pred = predictions.argmax(axis=-1)
    # torch.tensor keeps the integer dtype expected by the multiclass Accuracy metric
    acc = acc_metric(torch.tensor(pred),torch.tensor(label_ids))
    return {'Accuracy':float(acc)}
trainer = Trainer(peft_model,train_arg,train_dataset=train_ds,eval_dataset=dev_ds,compute_metrics=metric)
trainer.train()
MRPC
# import packages
import pandas as pd
import numpy as np
import torch
from transformers import TrainingArguments,AutoModelForSequenceClassification,AutoTokenizer,Trainer
from peft import LoraModel,LoraConfig,prepare_model_for_kbit_training,TaskType,get_peft_model,PeftModel
from datasets import Dataset
from transformers import trainer_utils
def forge_text(data_row):
    string1 = data_row['#1 String']
    string2 = data_row['#2 String']
    forged_string = f'String_1:{string1}\nString_2:{string2}'
    return forged_string
# gemma_tokenizer = AutoTokenizer.from_pretrained('D:/gemma-transformers-2b-it-v3')
gemma_tokenizer = AutoTokenizer.from_pretrained('C:/gemma-2b-it')
def ds_tokenize(item):
    return gemma_tokenizer(item['text'],padding='max_length',max_length=96,truncation=True,return_tensors='pt').to('cuda')

def load_data(tsv_path):
    data = pd.read_csv(tsv_path,sep='\t',on_bad_lines='skip').sample(frac=0.5)
    data['text'] = data.apply(forge_text,axis=1)
    if 'Quality' in data.columns:
        data_dict = {'text':data['text'],'labels':data['Quality']}
    else:
        data_dict = {'text':data['text']}
    ds = Dataset.from_dict(data_dict)
    return ds.map(ds_tokenize,batched=True).remove_columns('text')
train_ds = load_data('./train.tsv')
dev_ds = load_data('./dev.tsv')
test_ds = load_data('./test.tsv')
gemma_tokenizer('hello world',return_tensors='pt')
# gemma_model = AutoModelForSequenceClassification.from_pretrained('D:/gemma-transformers-2b-it-v3',num_labels=2,device_map='cuda')
gemma_model = AutoModelForSequenceClassification.from_pretrained('D:/gemma-transformers-2b-it-v3',num_labels=2)
gemma_model
config = LoraConfig(
task_type=TaskType.SEQ_CLS,
target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],
inference_mode=False,
r = 4
)
model = prepare_model_for_kbit_training(gemma_model)
model = get_peft_model(model,config)
model.print_trainable_parameters()
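The Trainer below passes compute_metrics=metric, but metric is not defined in this MRPC section; a minimal sketch, assuming the same accuracy computation as the RTE section above:
def metric(eval_preds):
    # argmax over the two class logits, then plain accuracy against the labels (assumption)
    pred = eval_preds.predictions.argmax(axis=-1)
    return {'Accuracy': float((pred == eval_preds.label_ids).mean())}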
train_args = TrainingArguments(
output_dir='./output/',
auto_find_batch_size=True,
learning_rate=1e-4,
num_train_epochs=5,
logging_dir='./log/'
)
trainer = Trainer(
    model,
    train_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    compute_metrics=metric
)
trainer.train()
SST-2
import pandas as pd
import torch
import peft
import transformers
import torchmetrics
from tqdm import tqdm
file_path = './train.tsv'
pd_data = pd.read_csv(file_path,sep='\t')
pd_data.rename(columns={'label':'labels'},inplace=True)
pd_data.columns
from transformers import AutoModelForSequenceClassification,AutoTokenizer,Trainer,TrainingArguments
from datasets import Dataset
llm = AutoModelForSequenceClassification.from_pretrained('C:/gemma-2b-it/',num_labels=2)
tknizer = AutoTokenizer.from_pretrained('C:/gemma-2b-it/')
llm
def ds_map_fn(item):
    return tknizer(item['sentence'],truncation=transformers.tokenization_utils_base.TruncationStrategy.LONGEST_FIRST,padding=transformers.utils.generic.PaddingStrategy.LONGEST,return_tensors='pt')
file_path = './train.tsv'
def get_dataset(file_path,sample=1.0):
    pd_data = pd.read_csv(file_path,sep='\t')
    pd_data.rename(columns={'label':'labels'},inplace=True)
    pd_data = pd_data.sample(frac=sample)
    ds = Dataset.from_pandas(pd_data)
    return ds
ds = get_dataset(file_path)
from torch.utils.data import DataLoader
from peft import LoraConfig,prepare_model_for_kbit_training,get_peft_model
lora_config = LoraConfig(task_type=peft.utils.peft_types.TaskType.SEQ_CLS,
r=4,
target_modules=['k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'])
peft_model = prepare_model_for_kbit_training(llm)
peft_model = get_peft_model(peft_model,lora_config)
peft_model.print_trainable_parameters()
Epoch = 3
Batchsize = 10
lr = 5e-4
opt = torch.optim.AdamW(params=peft_model.parameters(),lr=lr)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=opt,step_size=3,gamma=0.8)
metric = torchmetrics.Accuracy('multiclass',num_classes=2)
train_ds = get_dataset('./train.tsv')
eval_ds = get_dataset('./dev.tsv')
train_dl = DataLoader(dataset=train_ds,batch_size=Batchsize)
eval_dl = DataLoader(dataset=eval_ds,batch_size=Batchsize)
def eval(metric,epoch,peft_model,eval_dl):
    with torch.no_grad():
        total_loss = 0
        peft_model.eval()
        t_bar = tqdm(eval_dl,position=0)
        t_bar.set_description(f'Eval Epoch:{epoch}')
        batch = 0
        metric.reset()
        metric.to('cuda')
        for item in t_bar:
            labels = item['labels'].long().to('cuda')
            sentence = item['sentence']
            tk_rst = tknizer(sentence,padding=True,truncation=True,return_tensors='pt').to('cuda')
            tk_rst['labels'] = labels
            rst = peft_model.forward(**tk_rst)
            loss = rst.loss
            total_loss = total_loss+float(loss)
            epoch_avg_loss = total_loss/(batch+1)
            logits = rst.logits
            pred = logits.argmax(-1)
            batch_acc = metric(pred,labels)
            avg_acc = metric.compute()
            t_bar.set_postfix({
                'Loss':float(loss),'AVG_LOSS':epoch_avg_loss,'acc':float(batch_acc),'AVG_acc':float(avg_acc)})
            batch = batch+1
        peft_model.train()
        return epoch_avg_loss
def train(peft_model,Epoch,train_dl,eval_dl,opt,lr_scheduler,metric):
    peft_model.cuda()
    best_eval_loss = 1000
    for epoch in range(Epoch):
        total_loss = 0
        peft_model.train()
        t_bar = tqdm(train_dl,position=0)
        t_bar.set_description(f'Train Epoch:{epoch:4d}')
        batch = 0
        for item in t_bar:
            labels = item['labels'].long().to('cuda')
            sentence = item['sentence']
            tk_rst = tknizer(sentence,padding=True,truncation=True,return_tensors='pt').to('cuda')
            tk_rst['labels'] = labels
            rst = peft_model.forward(**tk_rst)
            loss = rst.loss
            loss.backward()
            opt.step()
            opt.zero_grad()
            total_loss = total_loss+float(loss)
            epoch_avg_loss = total_loss/(batch+1)
            t_bar.set_postfix({
                'BatchLoss':float(loss),'Epoch_AVG_LOSS':epoch_avg_loss})
            if batch%500 == 0:
                eval_loss = eval(metric,epoch,peft_model,eval_dl)
                lr_scheduler.step()
                if eval_loss < best_eval_loss:
                    peft_model.save_pretrained(f'./sst_gemma_best_{epoch}_{batch}')
                    best_eval_loss = eval_loss
            batch = batch+1
train(peft_model,Epoch,train_dl,eval_dl,opt,lr_scheduler,metric)
peft_model.save_pretrained('./sst_gemma')
from peft import PeftModel
loaded_model = PeftModel.from_pretrained(llm,'./sst_gemma',is_trainable=True)
loaded_model.print_trainable_parameters()
test_d = pd.read_csv('test.tsv',sep='\t',index_col='index')
sentences = test_d['sentence'].tolist()
tk_rst = tknizer(sentences,truncation=True,padding=True,return_tensors='pt')
tk_rst.keys()
tk_rst['input_ids'],tk_rst['attention_mask']
loaded_model.forward(input_ids=tk_rst['input_ids'])
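The bare forward call above omits the attention mask and device placement; a minimal inference sketch (looking up the device from the model parameters is an assumption about where the fine-tuned model currently lives):
device = next(loaded_model.parameters()).device
with torch.no_grad():
    out = loaded_model(input_ids=tk_rst['input_ids'].to(device), attention_mask=tk_rst['attention_mask'].to(device))
pred_labels = out.logits.argmax(-1).cpu().tolist()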
IMG_SEG
import cv2
import os
import numpy as np
import pandas as pd
import json
from matplotlib import pyplot as plt
import torchvision as v
import torch
import random
img_dir = './JPEGImages'
anno_dir = './labelme_anno'
OP_H = 540
OP_W = 960
anno_filelist = os.listdir(anno_dir)
anno_file = anno_filelist[0]
print(anno_file)
Mode = 'Train'
def get_img_anno(anno_file,OP_H,OP_W,Mode='Train'):
    with open(os.path.join(anno_dir,anno_file)) as f:
        anno_info = json.load(f)
    img_h = anno_info['imageHeight']
    img_w = anno_info['imageWidth']
    img_path = anno_info['imagePath']
    img_path = img_path[1:]
    img_shapes = anno_info['shapes']
    regions = []
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    randomcrop = v.transforms.RandomCrop((OP_H,OP_W))
    resize = v.transforms.Resize((OP_H,OP_W),interpolation=v.transforms.InterpolationMode.NEAREST)
    to_tensor = v.transforms.ToTensor()
    to_PilImg = v.transforms.ToPILImage()
    color_jitter = v.transforms.ColorJitter(0.2,0.2,0.2)
    img_tensor = to_tensor(img)
    if random.random()<0.2 or Mode != 'Train':
        # resize branch: scale the image and the polygon points to the output size
        img_tensor = resize(img_tensor)
        if Mode == 'Train':
            img_tensor = color_jitter(img_tensor)
        for shape in img_shapes:
            points = shape['points']
            for i in range(len(points)):
                points[i][0] = points[i][0]/img_w*OP_W
                points[i][1] = points[i][1]/img_h*OP_H
            regions.append(np.array(points,np.int32))
        label_img = cv2.fillPoly(np.zeros((OP_H,OP_W)),regions,color=1)
        label_img = to_tensor(label_img)
    else:
        # crop branch: build the full-size mask, then random-crop image and mask together
        img_tensor = color_jitter(img_tensor)
        for shape in img_shapes:
            points = shape['points']
            regions.append(np.array(points,np.int32))
        label_img = cv2.fillPoly(np.zeros((img_h,img_w)),regions,color=1)
        label_img = to_tensor(label_img)
    img_all = torch.concat((img_tensor,label_img),dim=0)
    cut_ok = 0
    cut_times = 0
    while cut_ok == 0:
        img_all_cut = randomcrop(img_all)
        cut_img = img_all_cut[0:-1,:,:]
        cut_label = img_all_cut[-1,:,:]
        cut_times = cut_times+1
        # accept the crop once it keeps more than 60% of the labelled pixels (or after 500 tries)
        if cut_label.sum()/label_img.sum() > 0.6 or cut_times>500:
            cut_ok = 1
            label_img = cut_label
            img_tensor = cut_img
    return img_tensor,label_img.squeeze()
img,label = get_img_anno(anno_file,OP_H,OP_W)
topil = v.transforms.ToPILImage()
label == 1
img*label+(label == 0)
s = img.clone()
rgb_mask = torch.zeros((3,540,960))
rgb_mask[0] = label
rgb_mask[1] = label
merged = s+rgb_mask*0.2
merged = merged.clip(0,1)
topil(merged)
topil(label)
anno_filelist = os.listdir(anno_dir)
random.shuffle(anno_filelist)
train_anno = anno_filelist[0:100]
eval_anno = anno_filelist[100:]
import torch.utils
import torch.utils.data
class IMG_SEG_DS(torch.utils.data.Dataset):
    def __init__(self,anno_filelist,Mode='Train',OP_H=540,OP_W=960) -> None:
        super().__init__()
        self.anno_filelist = anno_filelist
        self.OP_H = OP_H
        self.OP_W = OP_W
        self.Mode = Mode
    def __getitem__(self, idx):
        anno_file = self.anno_filelist[idx]
        img,label = get_img_anno(anno_file,self.OP_H,self.OP_W,self.Mode)
        return img,label
    def __len__(self):
        return len(self.anno_filelist)
train_ds = IMG_SEG_DS(train_anno)
eval_ds = IMG_SEG_DS(eval_anno,Mode='eval')
class IMG_SEG(torch.nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
    def forward(self,x):
        return x
model = v.models.segmentation.deeplabv3_resnet50(pretrained = False,pretrained_backbone=False,num_classes=2).cuda()
Epoch = 45
Batchsize = 2
lr = 5e-4
opt = torch.optim.AdamW(params=model.parameters(),lr=lr)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=opt,step_size=3,gamma=0.6)
loss_fn = torch.nn.CrossEntropyLoss()
train_dl = torch.utils.data.DataLoader(train_ds,Batchsize,shuffle=True)
eval_dl = torch.utils.data.DataLoader(eval_ds,batch_size=2)
from tqdm import tqdm
import time
def Evaluate(model,eval_dl,Epoch):
    model.eval()
    with torch.no_grad():
        tqdm_bar = tqdm(eval_dl)
        tqdm_bar.set_description(desc=f'Evaluation in Epoch:{Epoch}',refresh=True)
        total_loss = 0
        for j,item in enumerate(tqdm_bar):
            img,label = item
            img = img.to('cuda',torch.float32)
            label = label.to('cuda',torch.long)
            pred_logits = model(img)
            loss = loss_fn(pred_logits['out'],label)
            batch_loss = float(loss.cpu())
            total_loss = total_loss + batch_loss
            avg_loss = total_loss/(j+1)
            tqdm_bar.set_postfix({
                'Eval_Avg_loss': avg_loss})
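Evaluate above only tracks the cross-entropy loss; a small sketch of a foreground-IoU check that could be run alongside it (my addition, not part of the original loop):
def foreground_iou(pred_logits, label):
    # pred_logits: (N, 2, H, W) model output; label: (N, H, W) ground-truth mask in {0, 1}
    pred = pred_logits.argmax(1)
    inter = ((pred == 1) & (label == 1)).sum().item()
    union = ((pred == 1) | (label == 1)).sum().item()
    return inter / union if union > 0 else float('nan')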
def Train(model,train_dl,eval_dl,Epoch):
    for i in range(Epoch):
        model.train()
        tqdm_bar = tqdm(train_dl)
        tqdm_bar.set_description(desc=f'Train Epoch:{i}',refresh=True)
        total_loss = 0
        for j,item in enumerate(tqdm_bar):
            img,label = item
            img = img.to('cuda',torch.float32)
            label = label.to('cuda',torch.long)
            pred_logits = model(img)
            loss = loss_fn(pred_logits['out'],label)
            loss.backward()
            opt.step()
            opt.zero_grad()
            batch_loss = float(loss.cpu())
            total_loss = total_loss + batch_loss
            avg_loss = total_loss/(j+1)
            tqdm_bar.set_postfix({
                'Batch_Loss':batch_loss,'Epoch_avg_loss': avg_loss})
        lr_scheduler.step()
        Evaluate(model,eval_dl,i)
Train(model,train_dl,eval_dl,Epoch)
model.eval()
img,label = eval_ds[5]
rst = model(img.unsqueeze(0).float().cuda())
infer_tensor = rst['out'].argmax(1)
topil(infer_tensor.float())
torch.save(model.state_dict(),'res_seg_sd.ptd')
new_model_1 = v.models.segmentation.deeplabv3_resnet50(pretrained=False,pretrained_backbone=False,num_classes=2).cuda()
state_dict= torch.load('res_seg_sd.ptd')
new_model_1.load_state_dict(state_dict)
new_model_1.cuda()
new_model_1.eval()
img,label = eval_ds[5]
rst = new_model_1(img.unsqueeze(0).float().cuda())
infer_tensor = rst['out'].argmax(1)
topil(infer_tensor.float())
img,label = eval_ds[6]
topil(img)
torch.save(model,'res_seg_model.ptm')
new_model = torch.load('res_seg_model.ptm')
new_model.cuda()
new_model.eval()
img,label = eval_ds[6]
rst = new_model(img.unsqueeze(0).float().cuda())
infer_tensor = rst['out'].argmax(1)
topil(infer_tensor.float())
infer_tensor.squeeze()
from torchvision import transforms as T
import torch
a = [[
[1,1,4,4],
[7,0,3,3]
]]
a = torch.IntTensor(a)
a.shape
T.InterpolationMode.BILINEAR
r1 = T.Resize((10,10),interpolation=T.InterpolationMode.NEAREST)
r2 = T.Resize((10,10),interpolation=T.InterpolationMode.BILINEAR)
r1(a)
r2(a)
a = torch.Tensor([[10,-10,10,-10,10]])
b = torch.Tensor([[1,0,1,0,1]])
loss = torch.nn.BCEWithLogitsLoss()
loss(a,b)
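As a sanity check, the same value can be reproduced by applying the sigmoid and binary cross-entropy by hand:
sig_a = torch.sigmoid(a)
manual_bce = -(b*torch.log(sig_a) + (1-b)*torch.log(1-sig_a)).mean()
manual_bce  # should match loss(a,b)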
sig = torch.nn.Sigmoid()
rst = (sig(a)>0.5).int()
dim1,dim2 = torch.where(rst==1)
i = 1
rst[dim1[i],dim2[i]]
xgboost
import xgboost as xgb
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
print(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
param = {
'max_depth': 3, # the maximum depth of each tree
'eta': 0.3,  # learning rate (shrinkage) applied at each boosting step
'objective': 'multi:softprob',  # multiclass objective that outputs class probabilities
'num_class': 3}  # the number of classes in this dataset
num_round = 20 # the number of training iterations
bst = xgb.train(param, dtrain, num_round)
bst.dump_model('dump.raw.txt')
preds = bst.predict(dtest)
import numpy as np
best_preds = np.asarray([np.argmax(line) for line in preds])
from sklearn.metrics import precision_score
precision_score(y_test, best_preds, average='macro')
import xgboost as xgb
n_estimators = 50
params = {'n_estimators':n_estimators, 'booster':'gbtree', 'max_depth':5, 'learning_rate':0.05,
'objective':'reg:squarederror', 'subsample':1, 'colsample_bytree':1}
clf = xgb.XGBRegressor(**params)
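clf is only constructed above; a minimal fit/predict sketch on made-up data (X_demo and y_demo are illustrative, not from the original notebook):
X_demo = np.random.rand(200, 4)
y_demo = X_demo @ np.array([1.0, -2.0, 0.5, 3.0]) + 0.1*np.random.randn(200)
clf.fit(X_demo, y_demo)
clf.predict(X_demo[:5])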
Data mining
# kmeans
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
def get_best_cluster_num(X,cluster_nums):
    metric_score = []
    models = []
    for t in cluster_nums:
        kmeans_model = KMeans(n_clusters=t).fit(X)
        score = metrics.silhouette_score(X, kmeans_model.labels_,metric='euclidean')
        models.append(kmeans_model)
        metric_score.append(score)
    best_idx = np.array(metric_score).argmax()
    print(best_idx)
    best_cls = cluster_nums[best_idx]
    return models[best_idx],best_cls
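get_best_cluster_num is defined but not called in this excerpt; a minimal usage sketch on synthetic blobs (the make_blobs data is illustrative only):
from sklearn.datasets import make_blobs
X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
best_model, best_k = get_best_cluster_num(X_demo, range(2, 8))
print(best_k)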
# anomaly detection
import pandas as pd
tmpdf = pd.DataFrame({'p':[1]*4+[0]*4+[66]+[0]*5,'q':[3]*4+[0]*4+[66]+[0]*5})
from sklearn.cluster import DBSCAN
DBSCAN(eps=1, min_samples=3).fit_predict(tmpdf)
from sklearn.ensemble import IsolationForest
IsolationForest().fit_predict(tmpdf)
from sklearn.svm import OneClassSVM
OneClassSVM(nu=0.3).fit_predict(tmpdf)
from sklearn.cluster import KMeans
KMeans(n_clusters=2).fit_predict(tmpdf)
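To compare the four detectors on the same toy frame side by side, a small sketch (my addition; -1 marks the points DBSCAN, IsolationForest, and OneClassSVM flag as outliers, while KMeans only returns cluster ids):
pd.DataFrame({
    'dbscan': DBSCAN(eps=1, min_samples=3).fit_predict(tmpdf),
    'iforest': IsolationForest().fit_predict(tmpdf),
    'ocsvm': OneClassSVM(nu=0.3).fit_predict(tmpdf),
    'kmeans': KMeans(n_clusters=2).fit_predict(tmpdf),
})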
from sklearn.cluster import KMeans
wcss = []
# elbow method: sweep the number of clusters and record WCSS
for i in range(1,11):
    # initialize KMeans with i clusters
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(X3)  # X3: feature matrix prepared elsewhere (not shown in this excerpt)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11),wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.savefig('elbow.png')
plt.show()
#print(words[250:300])
kmeans = KMeans(n_clusters=3, n_init=20)  # n_init: number of k-means runs with different centroid seeds (n_jobs was removed from recent scikit-learn)
kmeans.fit(X3)
common_words = kmeans.cluster_centers_.argsort()[:,-1:-26:-1]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
weather_df_clus_temp = weather_df[["Tm", "Tx", "Tn", "xm", "ym"]]
weather_df_clus_temp = StandardScaler().fit_transform(weather_df_clus_temp)
db = DBSCAN(eps=0.3, min_samples=10).fit(weather_df_clus_temp)
labels = db.labels_
print(labels[500:560])
weather_df["Clus_Db"]=labels
realClusterNum=len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))
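The PCA and t-SNE cells below use a digits object that is not loaded in this excerpt; presumably the scikit-learn handwritten digits dataset, e.g.:
from sklearn.datasets import load_digits
digits = load_digits()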
from sklearn.decomposition import PCA
pca = PCA(n_components=2)  # build a PCA model with two components
pca.fit(digits.data)  # fit PCA on the digits data
digits_pca = pca.transform(digits.data)  # project the data onto the first two principal components
colors=["#476A2A","#7851B8","#BD3430","#4A2D4E","#875525","#A83683","#4E656E","#853541","#3A3120","#535D8E"]
plt.figure(figsize=(10,10))
plt.xlim(digits_pca[:,0].min(),digits_pca[:,0].max())
plt.ylim(digits_pca[:,1].min(),digits_pca[:,1].max())
for i in range(len(digits.data)):  # draw each sample as a text marker
    plt.text(digits_pca[i,0],digits_pca[i,1],str(digits.target[i]),color=colors[digits.target[i]],fontdict={"weight":"bold","size":9})
plt.xlabel("First principal component")
plt.ylabel("Second principal component")
from sklearn.manifold import TSNE
tsne = TSNE(random_state=42)  # use fit_transform rather than fit, since TSNE has no separate transform method
digits_tsne = tsne.fit_transform(digits.data)  # this takes a while to run
plt.figure(figsize=(10,10))
plt.xlim(digits_tsne[:,0].min(),digits_tsne[:,0].max()+1)
plt.ylim(digits_tsne[:,1].min(),digits_tsne[:,1].max()+1)
for i in range(len(digits.data)):  # draw each sample as a text marker
    plt.text(digits_tsne[i,0],digits_tsne[i,1],str(digits.target[i]),color=colors[digits.target[i]],fontdict={"weight":"bold","size":9})
plt.xlabel("First component")
plt.ylabel("Second component")