提升 5 您所在的位置:网站首页 m1芯片有gpu吗 提升 5

提升 5

2023-12-02 21:40| 来源: 网络整理| 查看: 265

二,环境配置

0,检查mac型号

点击桌面左上角mac图标——>关于本机——>概览,确定是m1芯片,了解内存大小(最好有16G以上,8G可能不太够用)。

1,下载 miniforge3 (miniforge3可以理解成 miniconda/annoconda 的社区版,提供了更稳定的对M1芯片的支持)

https://github.com/conda-forge/miniforge/#download

备注: annoconda 在 2022年5月开始也发布了对 mac m1芯片的官方支持,但还是推荐社区发布的miniforge3,开源且更加稳定。

2,安装 miniforge3

chmod +x ~/Downloads/Miniforge3-MacOSX-arm64.sh

sh ~/Downloads/Miniforge3-MacOSX-arm64.sh

source ~/miniforge3/bin/activate

3,安装 pytorch (v1.12版本已经正式支持了用于mac m1芯片gpu加速的mps后端。)

pip install "torch>=1.12" -i https://pypi.tuna.tsinghua.edu.cn/simple

4,测试环境

import torch

# Both must be CALLED: printing the bound methods themselves would show
# "<function ...>" instead of the True/False the following text promises.
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())

如果输出都是True的话,那么恭喜你配置成功了。

三,范例代码

核心操作非常简单,和使用cuda类似,训练前把模型和数据都移动到torch.device("mps")就可以了。

import torch
from torch import nn
import torchvision
from torchvision import transforms
import torch.nn.functional as F
import os, sys, time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
from copy import deepcopy
# NOTE(review): this Accuracy is shadowed by the local Accuracy class defined
# below; the torchmetrics import is kept only for parity with the original.
from torchmetrics import Accuracy

def printlog(info):
    """Print a '=' separator line with the current timestamp, then *info*.

    Used to visually mark the start of each training epoch in the console.
    """
    # .now() must be called; the original text dropped the parentheses.
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % nowtime)
    print(str(info) + "\n")

# ================================================================================
# 1. Prepare the data
# ================================================================================

# ToTensor must be instantiated; it converts PIL images to [0,1] float tensors.
transform = transforms.Compose([transforms.ToTensor()])

ds_train = torchvision.datasets.MNIST(root="mnist/", train=True, download=True, transform=transform)
ds_val = torchvision.datasets.MNIST(root="mnist/", train=False, download=True, transform=transform)

dl_train = torch.utils.data.DataLoader(ds_train, batch_size=128, shuffle=True, num_workers=2)
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=128, shuffle=False, num_workers=2)

# ================================================================================
# 2. Define the model
# ================================================================================

def create_net():
    """Build a small CNN for 10-class MNIST classification.

    Returns:
        nn.Sequential: conv/pool/dropout feature extractor followed by an
        adaptive max-pool (so any input spatial size maps to 1x1) and a
        2-layer classifier head producing 10 logits.
    """
    # nn.Sequential, nn.Flatten and nn.ReLU are classes and must be instantiated.
    net = nn.Sequential()
    net.add_module("conv1", nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3))
    net.add_module("pool1", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("conv2", nn.Conv2d(in_channels=64, out_channels=512, kernel_size=3))
    net.add_module("pool2", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("dropout", nn.Dropout2d(p=0.1))
    net.add_module("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1)))
    net.add_module("flatten", nn.Flatten())
    net.add_module("linear1", nn.Linear(512, 1024))
    net.add_module("relu", nn.ReLU())
    net.add_module("linear2", nn.Linear(1024, 10))
    return net

net = create_net()
print(net)

# Evaluation metric
class Accuracy(nn.Module):
    """Running classification accuracy.

    forward() returns the accuracy of the current batch and accumulates
    correct/total counts; compute() returns the accuracy accumulated since
    the last reset().
    """

    def __init__(self):
        super().__init__()
        # Counters are non-trainable nn.Parameters so that .to(device)
        # on the enclosing ModuleDict moves them along with the model.
        self.correct = nn.Parameter(torch.tensor(0.0), requires_grad=False)
        self.total = nn.Parameter(torch.tensor(0.0), requires_grad=False)

    def forward(self, preds: torch.Tensor, targets: torch.Tensor):
        # preds are logits/probabilities; take the arg-max class per sample.
        preds = preds.argmax(dim=-1)
        m = (preds == targets).sum()
        n = targets.shape[0]
        self.correct += m
        self.total += n
        return m / n

    def compute(self):
        return self.correct.float() / self.total

    def reset(self):
        # In-place zeroing keeps the tensors registered as the same Parameters.
        self.correct -= self.correct
        self.total -= self.total

# ================================================================================
# 3. Train the model
# ================================================================================

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
metrics_dict = nn.ModuleDict({"acc": Accuracy()})

# ========================= move the model to mps =============================
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
net.to(device)
loss_fn.to(device)
metrics_dict.to(device)
# =============================================================================

epochs = 20
ckpt_path = 'checkpoint.pt'

# early-stopping settings
monitor = "val_acc"
patience = 5
mode = "max"

history = {}

for epoch in range(1, epochs + 1):
    printlog("Epoch {0} / {1}".format(epoch, epochs))

    # 1, train -------------------------------------------------
    net.train()
    total_loss, step = 0, 0
    loop = tqdm(enumerate(dl_train), total=len(dl_train), ncols=100)
    # Deep-copy so train/val each accumulate into independent counters.
    train_metrics_dict = deepcopy(metrics_dict)

    for i, batch in loop:
        features, labels = batch

        # ===================== move the data to mps ==========================
        features = features.to(device)
        labels = labels.to(device)
        # ======================================================================

        # forward
        preds = net(features)
        loss = loss_fn(preds, labels)

        # backward
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # metrics
        step_metrics = {"train_" + name: metric_fn(preds, labels).item()
                        for name, metric_fn in train_metrics_dict.items()}
        step_log = dict({"train_loss": loss.item()}, **step_metrics)

        total_loss += loss.item()
        step += 1
        if i != len(dl_train) - 1:
            loop.set_postfix(**step_log)
        else:
            # Last batch of the epoch: log epoch-level averages and reset metrics.
            epoch_loss = total_loss / step
            epoch_metrics = {"train_" + name: metric_fn.compute().item()
                             for name, metric_fn in train_metrics_dict.items()}
            epoch_log = dict({"train_loss": epoch_loss}, **epoch_metrics)
            loop.set_postfix(**epoch_log)
            for name, metric_fn in train_metrics_dict.items():
                metric_fn.reset()

    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]

    # 2, validate -------------------------------------------------
    net.eval()
    total_loss, step = 0, 0
    loop = tqdm(enumerate(dl_val), total=len(dl_val), ncols=100)
    val_metrics_dict = deepcopy(metrics_dict)

    with torch.no_grad():
        for i, batch in loop:
            features, labels = batch

            # ===================== move the data to mps ======================
            features = features.to(device)
            labels = labels.to(device)
            # ==================================================================

            # forward
            preds = net(features)
            loss = loss_fn(preds, labels)

            # metrics
            step_metrics = {"val_" + name: metric_fn(preds, labels).item()
                            for name, metric_fn in val_metrics_dict.items()}
            step_log = dict({"val_loss": loss.item()}, **step_metrics)

            total_loss += loss.item()
            step += 1
            if i != len(dl_val) - 1:
                loop.set_postfix(**step_log)
            else:
                epoch_loss = (total_loss / step)
                epoch_metrics = {"val_" + name: metric_fn.compute().item()
                                 for name, metric_fn in val_metrics_dict.items()}
                epoch_log = dict({"val_loss": epoch_loss}, **epoch_metrics)
                loop.set_postfix(**epoch_log)
                for name, metric_fn in val_metrics_dict.items():
                    metric_fn.reset()

    epoch_log["epoch"] = epoch
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]

    # 3, early-stopping -------------------------------------------------
    arr_scores = history[monitor]
    best_score_idx = np.argmax(arr_scores) if mode == "max" else np.argmin(arr_scores)
    if best_score_idx == len(arr_scores) - 1:
        # Current epoch is the best so far: checkpoint the weights.
        torch.save(net.state_dict(), ckpt_path)
        # NOTE(review): the original format strings were garbled by extraction;
        # reconstructed to a plausible message — confirm against the source repo.
        print("<<<<<< reach best {0} : {1} >>>>>>".format(
            monitor, arr_scores[best_score_idx]), file=sys.stderr)
    if len(arr_scores) - best_score_idx > patience:
        print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
            monitor, patience), file=sys.stderr)
        break

# Restore the best checkpointed weights before using the model.
net.load_state_dict(torch.load(ckpt_path))
dfhistory = pd.DataFrame(history)

四,使用torchkeras支持Mac M1芯片加速

我在最新的3.3.0的torchkeras版本中引入了对 mac m1芯片的支持,当存在可用的 mac m1芯片/ GPU 时,会默认使用它们进行加速,无需做任何配置。

使用范例如下。😋😋😋

!pip install "torchkeras>=3.3.0"

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchkeras  # Attention this line

# ================================================================================
# 1. Prepare the data
# ================================================================================
import torchvision
from torchvision import transforms

# ToTensor must be instantiated; it converts PIL images to [0,1] float tensors.
transform = transforms.Compose([transforms.ToTensor()])

ds_train = torchvision.datasets.MNIST(root="mnist/", train=True, download=True, transform=transform)
ds_val = torchvision.datasets.MNIST(root="mnist/", train=False, download=True, transform=transform)

dl_train = torch.utils.data.DataLoader(ds_train, batch_size=128, shuffle=True, num_workers=2)
dl_val = torch.utils.data.DataLoader(ds_val, batch_size=128, shuffle=False, num_workers=2)

# Grab a single batch so `features` can feed the model summary below.
for features, labels in dl_train:
    break

# ================================================================================
# 2. Define the model
# ================================================================================

def create_net():
    """Build a small CNN for 10-class MNIST classification.

    Returns:
        nn.Sequential: conv/pool/dropout feature extractor followed by an
        adaptive max-pool (so any input spatial size maps to 1x1) and a
        2-layer classifier head producing 10 logits.
    """
    # nn.Sequential, nn.Flatten and nn.ReLU are classes and must be instantiated.
    net = nn.Sequential()
    net.add_module("conv1", nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3))
    net.add_module("pool1", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("conv2", nn.Conv2d(in_channels=64, out_channels=512, kernel_size=3))
    net.add_module("pool2", nn.MaxPool2d(kernel_size=2, stride=2))
    net.add_module("dropout", nn.Dropout2d(p=0.1))
    net.add_module("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1)))
    net.add_module("flatten", nn.Flatten())
    net.add_module("linear1", nn.Linear(512, 1024))
    net.add_module("relu", nn.ReLU())
    net.add_module("linear2", nn.Linear(1024, 10))
    return net

net = create_net()
print(net)

# Evaluation metric
class Accuracy(nn.Module):
    """Running classification accuracy.

    forward() returns the accuracy of the current batch and accumulates
    correct/total counts; compute() returns the accuracy accumulated since
    the last reset().
    """

    def __init__(self):
        super().__init__()
        # Counters are non-trainable nn.Parameters so that moving the metric
        # to a device (cpu/mps) moves them along with the model.
        self.correct = nn.Parameter(torch.tensor(0.0), requires_grad=False)
        self.total = nn.Parameter(torch.tensor(0.0), requires_grad=False)

    def forward(self, preds: torch.Tensor, targets: torch.Tensor):
        # preds are logits/probabilities; take the arg-max class per sample.
        preds = preds.argmax(dim=-1)
        m = (preds == targets).sum()
        n = targets.shape[0]
        self.correct += m
        self.total += n
        return m / n

    def compute(self):
        return self.correct.float() / self.total

    def reset(self):
        # In-place zeroing keeps the tensors registered as the same Parameters.
        self.correct -= self.correct
        self.total -= self.total

# ================================================================================
# 3. Train the model
# ================================================================================
# loss_fn, optimizer and the metric must all be instantiated / called.
model = torchkeras.KerasModel(net,
                              loss_fn=nn.CrossEntropyLoss(),
                              optimizer=torch.optim.Adam(net.parameters(), lr=0.001),
                              metrics_dict={"acc": Accuracy()})

from torchkeras import summary
summary(model, input_data=features)

# if gpu/mps is available, it will be used automatically, otherwise cpu is used.
dfhistory = model.fit(train_data=dl_train,
                      val_data=dl_val,
                      epochs=15,
                      patience=5,
                      monitor="val_acc", mode="max",
                      ckpt_path='checkpoint.pt')

# ================================================================================
# 4. Evaluate the model
# ================================================================================
model.evaluate(dl_val)

# ================================================================================
# 5. Use the model
# ================================================================================
model.predict(dl_val)[0:10]

# ================================================================================
# 6. Save the model
# ================================================================================
# The best net parameters have been saved at ckpt_path='checkpoint.pt' during training.
net_clone = create_net()
net_clone.load_state_dict(torch.load("checkpoint.pt"))

五,M1芯片与CPU和Nvidia GPU速度对比

使用以上代码作为范例,分别在CPU, mac m1芯片,以及Nvidia GPU上 运行。

得到的运行速度截图如下:

纯CPU跑效果

Mac M1 芯片加速效果

Tesla P100 GPU加速效果

纯CPU跑一个epoch大约是3min 18s。

使用mac m1芯片加速,一个epoch大约是33 s,相比CPU跑,加速约6倍。

这和pytorch官网显示的训练过程平均加速7倍相当。

使用Nvidia Tesla P100 GPU加速,一个epoch大约是 8s,相比CPU跑,加速约25倍。

整体来说Mac M1芯片对 深度学习训练过程的加速还是非常显著的,通常达到5到7倍左右。

不过目前看和企业中最常使用的高端的Tesla P100 GPU相比,还是有2到4倍的训练速度差异,可以视做一个mini版的GPU吧。

因此Mac M1芯片比较适合本地训练一些中小规模的模型,快速迭代idea,使用起来还是蛮香的。

尤其是本来就打算想换个电脑的,用mac做开发本来比windows好使多了。

- EOF -

加主页君微信,不仅Python技能+1

主页君日常还会在个人微信分享 Python相关工具、资源和 精选技术文章,不定期分享一些 有意思的活动、 岗位内推以及 如何用技术做业余项目

加个微信,打开一扇窗

点击标题可跳转

1、 搞懂 Transformer 结构,看这篇 PyTorch 实现就够了

2、 冷门但好用的 Python 库推荐一波

3、 10个超级实用的数据可视化图表总结!

觉得本文对你有帮助?请分享给更多人

推荐关注「Python开发者」,提升Python技能

点赞和在看就是最大的支持❤️返回搜狐,查看更多



【本文地址】

公司简介

联系我们

今日新闻

    推荐新闻

    专题文章
      CopyRight 2018-2019 实验室设备网 版权所有