Steps for Adding a Compute Node to the Cluster¶
Summary of the dependency packages to install
su - root
# Install NFS
yum -y install nfs-utils
# Install the NTP client
yum -y install ntp ntpdate
# Install OpenLDAP
yum -y install openldap openldap-clients nss-pam-ldapd sssd sssd-ldap \
policycoreutils-python authconfig
# Install Munge
yum -y install epel-release munge munge-libs munge-devel
# Install the Slurm dependencies
yum -y install gcc gcc-c++ readline-devel perl-ExtUtils-MakeMaker \
pam-devel rpm-build mysql-devel python3 perl-Switch
# The above commands combined into one
yum -y install nfs-utils ntp ntpdate openldap openldap-clients \
nss-pam-ldapd sssd sssd-ldap policycoreutils-python authconfig \
epel-release munge munge-libs munge-devel gcc gcc-c++ readline-devel \
perl-ExtUtils-MakeMaker pam-devel rpm-build mysql-devel python3 \
perl-Switch
Compute node added in this document:
Node IP: 218.197.110.27
Hostname: i-10fab2ed
Alias: scow-gpu1
All of the following steps are performed as the root user.
Hostname and hosts¶
- Set the hostname
# Run each command on the corresponding node; the new node only needs the last line
hostnamectl set-hostname scow-master
hostnamectl set-hostname scow-login
hostnamectl set-hostname scow-cn01
hostnamectl set-hostname scow-cn02
hostnamectl set-hostname scow-gpu1
- Configure hosts
vim /etc/hosts
218.197.100.99 i-465E8F84 scow-master
218.197.100.85 i-60769374 scow-login
218.197.100.96 i-81A867B6 scow-cn01
218.197.100.97 i-74DD6876 scow-cn02
218.197.110.27 i-10fab2ed scow-gpu1
- Disable the firewall, SELinux, dnsmasq, and swap
systemctl disable --now firewalld
systemctl disable --now dnsmasq
systemctl disable --now NetworkManager
setenforce 0
sed -i 's#SELINUX=permissive#SELINUX=disabled#g' /etc/sysconfig/selinux
sed -i 's#SELINUX=permissive#SELINUX=disabled#g' /etc/selinux/config
reboot
getenforce
swapoff -a && sysctl -w vm.swappiness=0
sed -ri '/^[^#]*swap/s@^@#@' /etc/fstab
- Passwordless SSH login from the management node to the compute node
Append the contents of the master node root user's .ssh/id_rsa.pub to this node's .ssh/authorized_keys file.
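A minimal sketch of this step, assuming the master node already has an RSA key pair for root (generate one with ssh-keygen if not) and that password login to the new node is still possible:
# Run on scow-master; copies root's public key into the new node's authorized_keys
ssh-copy-id -i ~/.ssh/id_rsa.pub root@scow-gpu1
# Verify that passwordless login now works
ssh root@scow-gpu1 hostname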
Configure NFS¶
- Install and start the NFS and RPC services
yum install -y nfs-utils
- View the directories exported by the server
# 218.197.100.85 is the NFS server IP
showmount -e 218.197.100.85
# Expected output:
Export list for 218.197.100.85:
/public *
- Mount the server's shared directory
# Create the mount point
mkdir /public
# Mount /public exported by the 218.197.100.85 server onto the local /public directory
mount 218.197.100.85:/public /public -o proto=tcp -o nolock
# Make the mount persistent across reboots
vim /etc/fstab
# Add the following line at the end of the file
218.197.100.85:/public /public nfs rw,auto,nofail,noatime,nolock,intr,tcp,actimeo=1800 0 0
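One way to confirm the new fstab entry is valid without rebooting (this assumes nothing is currently using /public):
umount /public
# Mounts everything listed in /etc/fstab; /public should reappear without errors
mount -a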
- Check the mount
df -h | grep public
# Expected output:
218.197.100.85:/public   79G   56M   75G   1% /public
Configure NTP¶
- Install dependency packages
su - root
yum -y install ntp ntpdate
- Configure the time server on the client
vim /etc/ntp.conf
#server 0.centos.pool.ntp.org iburst
#server 1.centos.pool.ntp.org iburst
#server 2.centos.pool.ntp.org iburst
#server 3.centos.pool.ntp.org iburst
server 218.197.100.99
systemctl restart ntpd
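If the new node's clock is far from the server's, ntpd can take a long time to converge. A common workaround, assuming the management node's NTP service is reachable, is a one-time manual sync before starting the daemon:
systemctl stop ntpd
# One-shot sync against the cluster NTP server
ntpdate 218.197.100.99
systemctl start ntpd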
- Allow the compute node network on the server side
ssh scow-master
vim /etc/ntp.conf
# Add:
restrict 218.197.110.0 mask 255.255.255.0 nomodify notrap
systemctl restart ntpd
- Check the NTP service
ntpq -p
     remote           refid      st t when poll reach   delay   offset  jitter
==============================================================================
 218.197.100.99  LOCAL(0)         6 u    2   64    1    0.829   -3.034   0.000
Configure the OpenLDAP Client¶
- Run the client.sh script to configure the client
- Check whether the configuration succeeded
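The check itself is not shown in the original; a minimal sketch, assuming some account (here the hypothetical user ldaptest) already exists in the LDAP directory, is to resolve it through NSS/SSSD on the new node:
# Both commands should return the LDAP account's uid/gid if the client is configured correctly
getent passwd ldaptest
id ldaptest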
Configure Slurm¶
Install Munge¶
- Create the munge user
The munge user must have the same UID and GID on the management node and on all compute (and login) nodes.
groupadd -g 1108 munge
useradd -m -c "Munge Uid 'N' Gid Emporium" -d /var/lib/munge -u 1108 -g munge -s /sbin/nologin munge
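A quick way to confirm the IDs really match is to run the following on every node (the numbers assume the 1108/1108 values used above):
id munge
# Expected output: uid=1108(munge) gid=1108(munge) groups=1108(munge)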
Munge is an authentication service that validates the UID and GID of processes on local or remote hosts.
# Install Munge
yum install epel-release -y
yum install munge munge-libs munge-devel -y
# The cluster-wide key lives on the management node and must be copied from it to the compute node
scp -p /etc/munge/munge.key root@scow-gpu1:/etc/munge/
# Set ownership and permissions on all nodes
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
# Start the Munge service
systemctl start munge
systemctl enable munge
# Generate and inspect a local credential
munge -n

munge -n | unmunge
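To confirm that the copied key actually matches the management node's, a common cross-node test is to decode a locally generated credential on the remote side (assumes SSH access to scow-master):
# Should report STATUS: Success (0) if both nodes share the same munge.key
munge -n | ssh scow-master unmunge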

Install Slurm¶
- Create the slurm user
groupadd -g 1109 slurm
useradd -m -c "Slurm manager" -d /var/lib/slurm -u 1109 -g slurm -s /bin/bash slurm
- Install the Slurm dependencies
yum install gcc gcc-c++ readline-devel perl-ExtUtils-MakeMaker pam-devel rpm-build mysql-devel python3 perl-Switch -y
- Create directories and set permissions
mkdir -p /root/rpmbuild/RPMS/x86_64
mkdir -p /etc/slurm
mkdir /var/spool/slurmd
chown slurm: /var/spool/slurmd
mkdir /var/log/slurm
chown slurm: /var/log/slurm
mkdir /var/spool/slurmctld
chown slurm: /var/spool/slurmctld
- Modify the Slurm configuration (cluster-wide, i.e. on the management node and all compute nodes)
ssh scow-master
vim /etc/slurm/slurm.conf
# Add:
GresTypes=gpu
NodeName=i-10fab2ed NodeAddr=218.197.110.27 CPUs=2 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:1 RealMemory=200 Procs=1 State=UNKNOWN
# Add the GPU resource configuration (a sketch of gres.conf is given below)
vim /etc/slurm/gres.conf
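The contents of gres.conf are not shown in the original; a minimal sketch for a node with a single NVIDIA GPU might look like the following (the device path /dev/nvidia0 is an assumption and must match the actual device on scow-gpu1):
# /etc/slurm/gres.conf
NodeName=i-10fab2ed Name=gpu File=/dev/nvidia0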
- Copy the compiled Slurm RPM packages and configuration files from the management node
scp -r /root/rpmbuild/RPMS/x86_64 root@scow-gpu1:/root/rpmbuild/RPMS/x86_64
scp -r /etc/slurm/*.conf root@scow-gpu1:/etc/slurm/
- Install Slurm
yum localinstall /root/rpmbuild/RPMS/x86_64/slurm-*
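A quick sanity check after the install is to confirm the compute node reports the same Slurm version as the management node:
# Prints the installed Slurm version
slurmd -V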
- Start the service
systemctl start slurmd
systemctl enable slurmd
# Check the service with systemctl status slurmd and make sure it is running normally
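Once slurmd is running, the new node should show up in the cluster. A minimal check, run on the management node (the State=RESUME command is only needed if the node is stuck in a down or drained state):
sinfo
scontrol show node i-10fab2ed
# Only if the node stays down/drained:
scontrol update NodeName=i-10fab2ed State=RESUME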