Jelajahi Sumber

完善文档,CUDA 安装

zhzhenqin 5 bulan lalu
induk
melakukan
0d53bf17d0
2 mengubah file dengan 90 tambahan dan 1 penghapusan
  1. 37 0
      docs/N卡驱动及CUDA安装.md
  2. 53 1
      docs/YOLO模型训练.md

+ 37 - 0
docs/N卡驱动及CUDA安装.md

@@ -2,6 +2,22 @@
 
 ![onnxruntime-cuda-version-mapping](images/onnxruntime-cuda-version-mapping.png)
 
+### 前置安装
+
+Ubuntu 和 Debian 系统所需要的库:
+
+```shell
+sudo apt update
+sudo apt install net-tools
+sudo apt install vim wget curl git
+sudo apt install gcc make build-essential 
+sudo apt install -y libssl-dev zlib1g-dev
+sudo apt install -y libbz2-dev libreadline-dev libsqlite3-dev llvm
+sudo apt install -y libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev
+sudo apt install -y libbz2-dev libssl-dev libncurses5-dev libsqlite3-dev libreadline-dev 
+sudo apt install -y tk-dev libgdbm-dev libdb-dev libpcap-dev xz-utils libexpat1-dev liblzma-dev libffi-dev libc6-dev
+```
+
 ## Nvidia 驱动安装
 
 查看显卡型号
@@ -28,6 +44,27 @@ sudo ./NVIDIA-1060-Linux-x86_64-550.100.run
 
 ## CUDA 安装
 
+分别到Nvidia 官网下载驱动和 CUDA 工具包。
+
+**CUDA**
+https://developer.nvidia.com/cuda-toolkit-archive
+
+**cuDNN**
+https://developer.nvidia.com/rdp/cudnn-archive
+
+下载后,上传到指定目录。执行安装:
+
+```shell
+sudo sh cuda_11.4.0_470.57.02_linux.run
+```
+安装期间依次操作:
+- 一路回车
+- accept
+- n(不要安装 driver,已有驱动)
+- y
+- y
+- y
+
 ```log
 ===========
 = Summary =

+ 53 - 1
docs/YOLO模型训练.md

@@ -1,7 +1,7 @@
 ## YOLO 采用命令训练数据集
 
 ```shell
-yolo train data=coco128.yaml model=yolov8n.pt epochs=10 lr0=0.01 --device='0,1'
+yolo train data=coco128.yaml model=yolov8n.pt epochs=10 lr0=0.01 device='0,1'
 
 yolo task=detect mode=train model=yolov8x.yaml data=mydata.yaml epochs=10 batch=16
 
@@ -23,6 +23,7 @@ yolo task=segment mode=predict model=yolov8x-seg.pt source='/kaggle/input/person
 实际运行:
 ```shell
 yolo train data=/home/jxft/datasets/hyd-action.yaml model=/home/jxft/datasets/yolov8/yolov8n.pt epochs=100 lr0=0.01
+yolo train data=/home/yiidata/datasets/hyd-action.yaml model=/home/yiidata/datasets/yolov8/yolov8n.pt epochs=10 lr0=0.01 device='0'
 ```
 
 ## YOLO 采用代码测试数据集
@@ -62,6 +63,57 @@ results = model.train(  # 开始训练模型
 )
 ```
 
+## Q&A
+
+**1. 报错 RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasSgemm`**
+
+报错日志:
+
+```log
+Transferred 319/355 items from pretrained weights
+TensorBoard: Start with 'tensorboard --logdir runs/detect/train3', view at http://localhost:6006/
+Freezing layer 'model.22.dfl.conv.weight'
+AMP: running Automatic Mixed Precision (AMP) checks with YOLOv8n...
+Traceback (most recent call last):
+  File "/usr/local/bin/yolo", line 8, in <module>
+    sys.exit(entrypoint())
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/cfg/__init__.py", line 582, in entrypoint
+    getattr(model, mode)(**overrides)  # default args from model
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/engine/model.py", line 667, in train
+    self.trainer.train()
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/engine/trainer.py", line 198, in train
+    self._do_train(world_size)
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/engine/trainer.py", line 312, in _do_train
+    self._setup_train(world_size)
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/engine/trainer.py", line 256, in _setup_train
+    self.amp = torch.tensor(check_amp(self.model), device=self.device)
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/utils/checks.py", line 655, in check_amp
+    assert amp_allclose(YOLO("yolov8n.pt"), im)
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/utils/checks.py", line 642, in amp_allclose
+    a = m(im, device=device, verbose=False)[0].boxes.data  # FP32 inference
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/engine/model.py", line 176, in __call__
+    return self.predict(source, stream, **kwargs)
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/engine/model.py", line 444, in predict
+    self.predictor.setup_model(model=self.model, verbose=is_cli)
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/engine/predictor.py", line 297, in setup_model
+    self.model = AutoBackend(
+  File "/usr/local/python3/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
+    return func(*args, **kwargs)
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/nn/autobackend.py", line 144, in __init__
+    model = model.fuse(verbose=verbose)
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/nn/tasks.py", line 184, in fuse
+    m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
+  File "/usr/local/python3/lib/python3.8/site-packages/ultralytics/utils/torch_utils.py", line 196, in fuse_conv_and_bn
+    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
+RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
+```
+
+Linux 系统,删除 LD_LIBRARY_PATH 环境变量即可解决。
+
+```shell
+unset LD_LIBRARY_PATH
+```
+
 ## 参考
 
 - https://blog.csdn.net/qq_32892383/article/details/136505299