Steps for Adding a Compute Node to the Cluster

Installing the dependency packages (summary)

su - root
# Install NFS
yum -y install nfs-utils

# Install the NTP client
yum -y install ntp ntpdate

# Install OpenLDAP
yum -y install openldap openldap-clients nss-pam-ldapd sssd sssd-ldap \
     policycoreutils-python authconfig

# Install Munge
yum -y install epel-release munge munge-libs munge-devel

# Install the Slurm dependencies
yum -y install gcc gcc-c++ readline-devel perl-ExtUtils-MakeMaker \
     pam-devel rpm-build mysql-devel python3 perl-Switch

# All of the above merged into a single command
yum -y install nfs-utils ntp ntpdate openldap openldap-clients \
     nss-pam-ldapd sssd sssd-ldap policycoreutils-python authconfig \
     epel-release munge munge-libs munge-devel gcc gcc-c++ readline-devel \
     perl-ExtUtils-MakeMaker pam-devel rpm-build mysql-devel python3 \
     perl-Switch

The compute node added in this document:
Node IP: 218.197.110.27
Hostname: i-10fab2ed
Alias: scow-gpu1

All of the following steps are performed as the root user.

Hostnames and the hosts file

  1. Set the hostname (each command is run on its own node; the new node gets scow-gpu1)
    hostnamectl set-hostname scow-master
    hostnamectl set-hostname scow-login
    hostnamectl set-hostname scow-cn01
    hostnamectl set-hostname scow-cn02
    hostnamectl set-hostname scow-gpu1
    
  2. Configure the hosts file
    vim /etc/hosts
    
    218.197.100.99 i-465E8F84 scow-master
    218.197.100.85 i-60769374 scow-login
    218.197.100.96 i-81A867B6 scow-cn01
    218.197.100.97 i-74DD6876 scow-cn02
    218.197.110.27 i-10fab2ed scow-gpu1
    
  3. Disable the firewall, SELinux, dnsmasq, and swap

    systemctl disable --now firewalld 
    systemctl disable --now dnsmasq
    systemctl disable --now NetworkManager
    
    setenforce 0
    # If the config file still reads SELINUX=enforcing, adjust the sed pattern to match
    sed -i 's#SELINUX=permissive#SELINUX=disabled#g' /etc/sysconfig/selinux
    sed -i 's#SELINUX=permissive#SELINUX=disabled#g' /etc/selinux/config
    reboot
    # After the reboot, confirm SELinux is disabled
    getenforce
    
    
    swapoff -a && sysctl -w vm.swappiness=0
    sed -ri '/^[^#]*swap/s@^@#@' /etc/fstab
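    # Optional check: with swap disabled, the Swap line of free -h should show all zeros
    free -h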
    

  4. Passwordless SSH from the management node to the compute node

    Append the contents of .ssh/id_rsa.pub for the root user on the master node to the local .ssh/authorized_keys file, e.g. as sketched below.
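    A minimal sketch, assuming root on the master node already has an RSA key pair (run ssh-keygen -t rsa first if not):

    # Run on scow-master; prompts once for the new node's root password
    ssh-copy-id -i /root/.ssh/id_rsa.pub root@scow-gpu1
    # Verify that passwordless login now works
    ssh root@scow-gpu1 hostname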
    

Configure NFS

  1. Install and start the NFS and RPC services
    yum install -y nfs-utils
    
  2. View the directories exported by the server
    # 218.197.100.85 is the NFS server's IP
    showmount -e 218.197.100.85
    
    # Expected output:
    Export list for 218.197.100.85:
    /public *
    
  3. Mount the server's shared directory
    # Create the mount point
    mkdir /public
    # Mount the shared /public directory exported by 218.197.100.85 at the local /public
    mount 218.197.100.85:/public /public -o proto=tcp -o nolock
    
    # Set up automatic mounting at boot
    vim /etc/fstab
    # Append at the end of the file
    218.197.100.85:/public /public nfs rw,auto,nofail,noatime,nolock,intr,tcp,actimeo=1800 0 0
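    # Optional sanity check: mount -a re-reads /etc/fstab and errors out on a bad entry
    mount -a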
    
  4. Check the mount
    df -h |grep public
    
    # Expected output:
    218.197.100.85:/public     79G   56M   75G   1% /public
    

Configure NTP

  1. Install dependency packages

    su - root
    yum -y install ntp ntpdate
    

  2. Configure the time server on the client (comment out the default pool servers and point at the master)

    vim /etc/ntp.conf
    
    #server 0.centos.pool.ntp.org iburst
    #server 1.centos.pool.ntp.org iburst
    #server 2.centos.pool.ntp.org iburst
    #server 3.centos.pool.ntp.org iburst
    server 218.197.100.99
    
    systemctl restart ntpd
    

  3. Allow the compute node's network on the NTP server

    ssh scow-master
    vim /etc/ntp.conf
    
    Add:
    restrict 218.197.110.0 mask 255.255.255.0 nomodify notrap
    
    systemctl restart ntpd
    

  4. Check the NTP service

    ntpq -p
    
         remote           refid      st t when poll reach   delay   offset  jitter
    ==============================================================================
     218.197.100.99  LOCAL(0)         6 u    2   64    1    0.829   -3.034   0.000
    

Configure the OpenLDAP client

  1. Run the client.sh script to configure the client
  2. Verify that the configuration succeeded, e.g. with the sketch below
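    A minimal verification sketch; ldapuser01 stands in for any account that actually exists in your LDAP directory:

    # Resolve the account through NSS/SSSD and print its identity
    getent passwd ldapuser01
    id ldapuser01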

Configure Slurm

Install Munge

  1. Create the munge user

    The munge user must have the same UID and GID on the management node and on the compute (login) nodes.

    groupadd -g 1108 munge
    useradd -m -c "Munge Uid 'N' Gid Emporium" -d /var/lib/munge -u 1108 -g munge -s /sbin/nologin munge
  2. Deploy Munge

    Munge is an authentication service that validates the UID and GID of processes on local or remote hosts.

    # Install Munge
    yum install epel-release -y
    yum install munge munge-libs munge-devel -y

    # The global key lives on the management node and must be copied from there to the compute node
    scp -p /etc/munge/munge.key root@scow-gpu1:/etc/munge/

    # Set ownership and permissions on every node
    chown munge: /etc/munge/munge.key
    chmod 400 /etc/munge/munge.key

    # Start the Munge service
    systemctl start munge
    systemctl enable munge
  3. Test the Munge service

    # Generate a credential locally
    munge -n

    # Decode a credential locally
    munge -n | unmunge
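    The standard cross-node check, assuming Munge is already running on scow-master with the same key:

    # Encode locally, decode on the management node; unmunge should report STATUS: Success
    munge -n | ssh scow-master unmunge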

Install Slurm

  1. Create the slurm user
    groupadd -g 1109 slurm
    useradd -m -c "Slurm manager" -d /var/lib/slurm -u 1109 -g slurm -s /bin/bash slurm
    
  2. Install the Slurm dependencies
    yum install gcc gcc-c++ readline-devel perl-ExtUtils-MakeMaker pam-devel rpm-build mysql-devel python3 perl-Switch -y
    
  3. Create directories and set permissions
    mkdir -p /root/rpmbuild/RPMS/x86_64
    mkdir -p /etc/slurm
    
    mkdir /var/spool/slurmd
    chown slurm: /var/spool/slurmd
    
    mkdir /var/log/slurm
    chown slurm: /var/log/slurm
    
    mkdir /var/spool/slurmctld
    chown slurm: /var/spool/slurmctld
    
  4. Update the Slurm configuration (on the whole cluster: the management node and all compute nodes)
    ssh scow-master
    vim /etc/slurm/slurm.conf

    # Add
    GresTypes=gpu
    NodeName=i-10fab2ed NodeAddr=218.197.110.27 CPUs=2 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:1 RealMemory=200 State=UNKNOWN

    # Add the GPU resource configuration (a sketch follows)
    vim /etc/slurm/gres.conf
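    # A minimal gres.conf sketch; /dev/nvidia0 is an assumed device path, adjust to the actual GPU
    NodeName=i-10fab2ed Name=gpu File=/dev/nvidia0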
    
  5. Copy the compiled Slurm RPM packages and configuration files from the management node
    scp -r /root/rpmbuild/RPMS/x86_64 root@scow-gpu1:/root/rpmbuild/RPMS/x86_64
    scp -r /etc/slurm/*.conf  root@scow-gpu1:/etc/slurm/
    
  6. Install Slurm

    yum localinstall /root/rpmbuild/RPMS/x86_64/slurm-*
    

  7. Start the service

    systemctl start slurmd
    systemctl enable slurmd
    
    # Check the service with systemctl status slurmd and make sure every service is in a healthy state
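
    A quick check from the management node that the new node has joined the cluster (partition and node state depend on your slurm.conf):

    ssh scow-master
    # The new node should be listed; if it sits in a down or drained state, resume it
    scontrol show node i-10fab2ed
    scontrol update nodename=i-10fab2ed state=resume
    sinfo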
    
