Docker Guide
install docker
# step 1: install tools
sudo apt-get update
sudo apt-get -y install apt-transport-https ca-certificates curl software-properties-common
# Step 2: add the Docker repository GPG key
curl -fsSL http://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add -
# Step 3: add apt repo
sudo add-apt-repository "deb [arch=amd64] http://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable"
# Step 4: install docker-ce
sudo apt-get -y update
sudo apt-get -y install docker-ce
install a specific version of docker-ce
# Step 1: search versions
# apt-cache madison docker-ce
# docker-ce | 17.03.1~ce-0~ubuntu-xenial | http://mirrors.aliyun.com/docker-ce/linux/ubuntu xenial/stable amd64 Packages
# docker-ce | 17.03.0~ce-0~ubuntu-xenial | http://mirrors.aliyun.com/docker-ce/linux/ubuntu xenial/stable amd64 Packages
# Step 2: install given version
# sudo apt-get -y install docker-ce=17.03.1~ce-0~ubuntu-xenial
test docker
sudo docker version
Client:
Version: 18.06.1-ce
API version: 1.38
Go version: go1.10.3
Git commit: e68fc7a
Built: Tue Aug 21 17:24:56 2018
OS/Arch: linux/amd64
Experimental: false
Server:
Engine:
Version: 18.06.1-ce
API version: 1.38 (minimum version 1.12)
Go version: go1.10.3
Git commit: e68fc7a
Built: Tue Aug 21 17:23:21 2018
OS/Arch: linux/amd64
Experimental: false
docker namespace
host
id
uid=1000(kezunlin) gid=1000(kezunlin) groups=1000(kezunlin),4(adm),24(cdrom),27(sudo),30(dip),46(plugdev),113(lpadmin),128(sambashare)
sudo docker images
sudo docker run -it --name kzl -v /home/kezunlin/workspace/:/home/kezunlin/workspace nvidia/cuda
container
root@6f167ef72a80:/home/kezunlin/workspace# ll
total 48
drwxrwxr-x 12 1000 1000 4096 Nov 30 10:04 ./
drwxr-xr-x 3 root root 4096 Nov 30 10:14 ../
drwxrwxr-x 10 1000 1000 4096 Dec 5 2017 MyGit/
drwxrwxr-x 12 1000 1000 4096 Oct 31 03:01 blog/
drwxrwxr-x 5 1000 1000 4096 Sep 20 07:33 opencv/
drwxrwxr-x 4 1000 1000 4096 Oct 31 07:55 openmp/
drwxrwxr-x 5 1000 1000 4096 Jan 9 2018 qt/
drwxrwxr-x 2 1000 1000 4096 Jan 4 2018 ros/
drwxrwxr-x 4 1000 1000 4096 Nov 16 2017 voc/
drwxrwxr-x 5 1000 1000 4096 Aug 7 03:19 vs/
root@6f167ef72a80:/home/kezunlin/workspace# touch 1.txt
root@6f167ef72a80:/home/kezunlin/workspace# id
uid=0(root) gid=0(root) groups=0(root)
host
ll /home/kezunlin/workspace/
total 48
drwxrwxr-x 12 kezunlin kezunlin 4096 11月 30 18:14 ./
drwxr-xr-x 47 kezunlin kezunlin 4096 11月 30 18:04 ../
-rw-r--r-- 1 root root 0 11月 30 18:14 1.txt
drwxrwxr-x 12 kezunlin kezunlin 4096 10月 31 11:01 blog/
drwxrwxr-x 5 kezunlin kezunlin 4096 9月 20 15:33 opencv/
drwxrwxr-x 4 kezunlin kezunlin 4096 10月 31 15:55 openmp/
drwxrwxr-x 5 kezunlin kezunlin 4096 1月 9 2018 qt/
drwxrwxr-x 2 kezunlin kezunlin 4096 1月 4 2018 ros/
drwxrwxr-x 4 kezunlin kezunlin 4096 11月 16 2017 voc/
drwxrwxr-x 5 kezunlin kezunlin 4096 8月 7 11:19 vs/
install nvidia-docker2
The machine running the CUDA container only requires the NVIDIA driver; the CUDA toolkit does not have to be installed.
Host系统只需要安装NVIDIA driver即可运行CUDA container。
install
remove nvidia-docker 1.0
# If you have nvidia-docker 1.0 installed: we need to remove it and all existing GPU containers
docker volume ls -q -f driver=nvidia-docker | xargs -r -I{} -n1 docker ps -q -a -f volume={} | xargs -r docker rm -f
sudo apt-get purge -y nvidia-docker
Add the package repositories
vim repo.sh
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \
sudo apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
sudo tee /etc/apt/sources.list.d/nvidia-docker.list
run scripts
chmod +x repo.sh
./repo.sh
Install nvidia-docker2 and reload the Docker daemon configuration
sudo apt-get install -y nvidia-docker2
sudo pkill -SIGHUP dockerd
test
sudo docker run --runtime=nvidia --rm nvidia/cuda nvidia-smi
output
Unable to find image 'nvidia/cuda:latest' locally
latest: Pulling from nvidia/cuda
8ee29e426c26: Pull complete
6e83b260b73b: Pull complete
e26b65fd1143: Pull complete
40dca07f8222: Pull complete
b420ae9e10b3: Pull complete
a579c1327556: Pull complete
b440bb8df79e: Pull complete
de3b2ccf9562: Pull complete
a69a544d350e: Pull complete
02348b5db71c: Pull complete
Digest: sha256:5996fa2fc0666972360502fe32118286177b879a8a1a834a176e7786021b8cee
Status: Downloaded newer image for nvidia/cuda:latest
Mon Sep 3 10:08:27 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.130 Driver Version: 384.130 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 1060 Off | 00000000:01:00.0 Off | N/A |
| N/A 59C P8 8W / N/A | 408MiB / 6072MiB | 40% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
+-----------------------------------------------------------------------------+
or by tty
sudo docker run --runtime=nvidia -t -i --privileged nvidia/cuda bash
root@8f3ebd5ecbb6:/# nvidia-smi
Tue Sep 4 01:26:31 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.130 Driver Version: 384.130 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 1060 Off | 00000000:01:00.0 Off | N/A |
| N/A 56C P0 31W / N/A | 374MiB / 6072MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
+-----------------------------------------------------------------------------+
Advanced Topics
Default runtime
The default runtime used by the Docker® Engine is runc. Our runtime can become the default one by configuring the docker daemon with --default-runtime=nvidia. Doing so will remove the need to add the --runtime=nvidia argument to docker run. It is also the only way to have GPU access during docker build.
Environment variables
The behavior of the runtime can be modified through environment variables (such as NVIDIA_VISIBLE_DEVICES).
Those environment variables are consumed by nvidia-container-runtime and are documented in the nvidia-container-runtime documentation.
Our official CUDA images use default values for these variables.
docker command
sudo docker image list
REPOSITORY TAG IMAGE ID CREATED SIZE
nvidia/cuda latest 04a9ce0dec6d 3 weeks ago 1.96GB
sudo docker run -it --privileged nvidia/cuda bash
docker build --network=host -t anakin:$tag . -f $DockerfilePath
kubernetes with GPU
kubernetes 对于 GPU 的支持截止到 1.9 版本,算是经历了3个阶段:
- kubernetes 1.3 版本开始支持GPU,但是只支持单个 GPU卡;
- kubernetes 1.6 版本开始支持多个 GPU 卡;
- kubernetes 1.8 版本以 device plugin 方式提供对GPU的支持。
ls /dev/nvidia*
/dev/nvidia0 /dev/nvidia2 /dev/nvidia4 /dev/nvidia6 /dev/nvidiactl
/dev/nvidia1 /dev/nvidia3 /dev/nvidia5 /dev/nvidia7
- Kubernetes 1.8~1.9,通过 k8s-device-plugin 获取每个 Node 上 GPU 的信息,根据这些信息对 GPU 资源进行管理和调度。需要结合 nvidia-docker2 使用。k8s-device-plugin 也是由 nvidia 提供,在 kubernetes 中可以以 DaemonSet 方式运行。
Reference
- nvidia-docker2#prerequisites
- docker command
- GPU 在 docker 和 kubernetes 中的使用
- Kubernetes GPU使用指南
- scheduling-gpus
History
- 20180903: created.