Hi,
I need help to solve a configuration problem of Maui:
I have recently installed the Torque on Scientific Linux 5.2. The cluster has 9 compute nodes with totally 144 cores.
The Torque works correctly, and submitting the jobs are done successfully.
arm00:~:echo "sleep 30" | qsub -l nodes=2
19.arm00.cluster
arm00:~:qstat
Job id Name User Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
19.arm00 STDIN pooladsaz 0 Q batch
But the pbs_sched gives the following error when all the nodes are available:
[root@arm00 maui-3.2.6p21# pbs_sched
Error reading line 127 of file sched_config
help_starving_jobs true ALL
Error reading line 133 of file sched_config
sort_queues true ALL
Error reading line 156 of file sched_config
sort_by
Segmentation fault
I tried to install the Maui to replace the scheduling server, but I guess Maui doesn't communicate with Torque since no active or even idle job is listed by showq when at least one job is waiting in the queue.
The [root@arm00 maui-3.2.6p21]# showq
ACTIVE JOBS--------------------
JOBNAME USERNAME STATE PROC REMAINING STARTTIME
0 Active Jobs 0 of 0 Processors Active (0.00%)
IDLE JOBS----------------------
JOBNAME USERNAME STATE PROC WCLIMIT QUEUETIME
0 Idle Jobs
BLOCKED JOBS----------------
JOBNAME USERNAME STATE PROC WCLIMIT QUEUETIME
Total Jobs: 0 Active Jobs: 0 Idle Jobs: 0 Blocked Jobs: 0
The file maui.cfg contains:
[root@arm00 maui-3.2.6p21]# more maui.cfg
# MAUI configuration example
SERVERHOST arm00.cluster
ADMIN1 root
ADMINHOSTS arm00.cluster
RMHOST[0] arm00.cluster
RMCFG[0] TYPE=PBS
SERVERPORT 42559
SERVERMODE NORMAL
# Set PBS server polling interval. If you have short # queues or/and jobs it is worth to set a short interval. (10 sec
onds)
RMPOLLINTERVAL 00:00:10
# a max. 10 MByte log file in a logical location
LOGFILE /var/log/maui.log
LOGFILEMAXSIZE 10000000
LOGLEVEL 1
# Set the delay to 1 minute before Maui tries to run a job again, # in case it failed to run the first time.
# The default value is 1 hour.
DEFERTIME 00:01:00
# Necessary for MPI grid jobs
ENABLEMULTIREQJOBS TRUE
And checknode gives the following error:
[root@arm00 maui-3.2.6p21]# checknode arm01.cluster
ERROR: 'checknode' failed
ERROR: cannot locate node 'arm01.cluster'
and the pbs batch system has the following settings:
qmgr -c "set server scheduling=true"
qmgr -c "create queue batch queue_type=execution"
qmgr -c "set queue batch started=true"
qmgr -c "set queue batch enabled=true"
qmgr -c "set queue batch resources_default.nodes=1"
qmgr -c "set queue batch resources_default.walltime=3600"
qmgr -c "set server default_queue=batch"
qmgr -c "set server managers +=
root@arm00.cluster"
qmgr -c "set server operators +=
root@arm00.cluster"
qmgr -c "set server scheduling = TRUE"
Could you help me to solve this problem?
Thanks in advance.
Davoud
PS.:
the output of showconfig and diagnose -n are shown below:
[root@arm00 maui-3.2.6p21]# diagnose -n
diagnosing node table (5120 slots)
Name State Procs Memory Disk Swap Speed Opsys Arch Par Load Res Classes Network Features
----- --- 0:0 0:0 0:0 0:0
Total Nodes: 0 (Active: 0 Idle: 0 Down: 0)
,
[root@arm00 maui-3.2.6p21# showconfig
# Maui version 3.2.6p21 (PID: 17201)
# global policies
REJECTNEGPRIOJOBS[0] FALSE
ENABLENEGJOBPRIORITY[0] FALSE
ENABLEMULTINODEJOBS[0] TRUE
ENABLEMULTIREQJOBS[0] FALSE
BFPRIORITYPOLICY[0] [NONE]
JOBPRIOACCRUALPOLICY QUEUEPOLICY
NODELOADPOLICY ADJUSTSTATE
USEMACHINESPEED FALSE
USESYSTEMQUEUETIME TRUE
USELOCALMACHINEPRIORITY FALSE
NODEUNTRACKEDLOADFACTOR 1.2
JOBNODEMATCHPOLICY[0]
JOBMAXSTARTTIME[0] INFINITY
METAMAXTASKS[0] 0
NODESETPOLICY[0] [NONE]
NODESETATTRIBUTE[0] [NONE]
NODESETLIST[0]
NODESETDELAY[0] 00:00:00
NODESETPRIORITYTYPE[0] MINLOSS
NODESETTOLERANCE[0] 0.00
BACKFILLPOLICY[0] FIRSTFIT
BACKFILLDEPTH[0] 0
BACKFILLPROCFACTOR[0] 0
BACKFILLMAXSCHEDULES[0] 10000
BACKFILLMETRIC[0] PROCS
BFCHUNKDURATION[0] 00:00:00
BFCHUNKSIZE[0] 0
PREEMPTPOLICY[0] REQUEUE
MINADMINSTIME[0] 00:00:00
RESOURCELIMITPOLICY[0]
NODEAVAILABILITYPOLICY[0] COMBINED:[DEFAULT]
NODEALLOCATIONPOLICY[0] MINRESOURCE
TASKDISTRIBUTIONPOLICY[0] DEFAULT
RESERVATIONPOLICY[0] CURRENTHIGHEST
RESERVATIONRETRYTIME[0] 00:00:00
RESERVATIONTHRESHOLDTYPE[0] NONE
RESERVATIONTHRESHOLDVALUE[0] 0
FSPOLICY [NONE]
FSPOLICY [NONE]
FSINTERVAL 12:00:00
FSDEPTH 8
FSDECAY 1.00
# Priority Weights
SERVICEWEIGHT[0] 1
TARGETWEIGHT[0] 1
CREDWEIGHT[0] 1
ATTRWEIGHT[0] 1
FSWEIGHT[0] 1
RESWEIGHT[0] 1
USAGEWEIGHT[0] 1
QUEUETIMEWEIGHT[0] 1
XFACTORWEIGHT[0] 0
SPVIOLATIONWEIGHT[0] 0
BYPASSWEIGHT[0] 0
TARGETQUEUETIMEWEIGHT[0] 0
TARGETXFACTORWEIGHT[0] 0
USERWEIGHT[0] 0
GROUPWEIGHT[0] 0
ACCOUNTWEIGHT[0] 0
QOSWEIGHT[0] 0
CLASSWEIGHT[0] 0
FSUSERWEIGHT[0] 0
FSGROUPWEIGHT[0] 0
FSACCOUNTWEIGHT[0] 0
FSQOSWEIGHT[0] 0
FSCLASSWEIGHT[0] 0
ATTRATTRWEIGHT[0] 0
ATTRSTATEWEIGHT[0] 0
NODEWEIGHT[0] 0
PROCWEIGHT[0] 0
MEMWEIGHT[0] 0
SWAPWEIGHT[0] 0
DISKWEIGHT[0] 0
PSWEIGHT[0] 0
PEWEIGHT[0] 0
WALLTIMEWEIGHT[0] 0
UPROCWEIGHT[0] 0
UJOBWEIGHT[0] 0
CONSUMEDWEIGHT[0] 0
USAGEEXECUTIONTIMEWEIGHT[0] 0
REMAININGWEIGHT[0] 0
PERCENTWEIGHT[0] 0
XFMINWCLIMIT[0] 00:02:00
# partition DEFAULT policies
REJECTNEGPRIOJOBS[1] FALSE
ENABLENEGJOBPRIORITY[1] FALSE
ENABLEMULTINODEJOBS[1] TRUE
ENABLEMULTIREQJOBS[1] FALSE
BFPRIORITYPOLICY[1] [NONE]
JOBPRIOACCRUALPOLICY QUEUEPOLICY
NODELOADPOLICY ADJUSTSTATE
JOBNODEMATCHPOLICY[1]
JOBMAXSTARTTIME[1] INFINITY
METAMAXTASKS[1] 0
NODESETPOLICY[1] [NONE]
NODESETATTRIBUTE[1] [NONE]
NODESETLIST[1]
NODESETDELAY[1] 00:00:00
NODESETPRIORITYTYPE[1] MINLOSS
NODESETTOLERANCE[1] 0.00
# Priority Weights
XFMINWCLIMIT[1] 00:00:00
CLASSCFG[[NONE]] DEFAULT.FEATURES=[NONE]
CLASSCFG[[ALL]] DEFAULT.FEATURES=[NONE]
QOSPRIORITY[0] 0
QOSQTWEIGHT[0] 0
QOSXFWEIGHT[0] 0
QOSTARGETXF[0] 0.00
QOSTARGETQT[0] 00:00:00
QOSFLAGS[0]
QOSPRIORITY[1] 0
QOSQTWEIGHT[1] 0
QOSXFWEIGHT[1] 0
QOSTARGETXF[1] 0.00
QOSTARGETQT[1] 00:00:00
QOSFLAGS[1]
# SERVER MODULES: MX
SERVERMODE NORMAL
SERVERNAME
SERVERHOST arm00
SERVERPORT 42559
LOGFILE maui.log
LOGFILEMAXSIZE 10000000
LOGFILEROLLDEPTH 1
LOGLEVEL 3
LOGFACILITY fALL
SERVERHOMEDIR /usr/local/maui/
TOOLSDIR /usr/local/maui/tools/
LOGDIR /usr/local/maui/log/
STATDIR /usr/local/maui/stats/
LOCKFILE /usr/local/maui/maui.pid
SERVERCONFIGFILE /usr/local/maui/maui.cfg
CHECKPOINTFILE /usr/local/maui/maui.ck
CHECKPOINTINTERVAL 00:05:00
CHECKPOINTEXPIRATIONTIME 3:11:20:00
TRAPJOB
TRAPNODE
TRAPFUNCTION
RESDEPTH 24
RMPOLLINTERVAL 00:00:30
NODEACCESSPOLICY SHARED
ALLOCLOCALITYPOLICY [NONE]
SIMTIMEPOLICY [NONE]
ADMIN1 root
ADMINHOSTS ALL
NODEPOLLFREQUENCY 0
DISPLAYFLAGS
DEFAULTDOMAIN
DEFAULTCLASSLIST [DEFAULT:1]
FEATURENODETYPEHEADER
FEATUREPROCSPEEDHEADER
FEATUREPARTITIONHEADER
DEFERTIME 1:00:00
DEFERCOUNT 24
DEFERSTARTCOUNT 1
JOBPURGETIME 0
NODEPURGETIME 2140000000
APIFAILURETHRESHHOLD 6
NODESYNCTIME 600
JOBSYNCTIME 600
JOBMAXOVERRUN 00:10:00
NODEMAXLOAD 0.0
PLOTMINTIME 120
PLOTMAXTIME 245760
PLOTTIMESCALE 11
PLOTMINPROC 1
PLOTMAXPROC 512
PLOTPROCSCALE 9
SCHEDCFG[] MODE=NORMAL SERVER=arm00:42559
# RM MODULES: PBS SSS WIKI NATIVE
SIMWORKLOADTRACEFILE workload
SIMRESOURCETRACEFILE resource
SIMAUTOSHUTDOWN OFF
SIMSTARTTIME 0
SIMSCALEJOBRUNTIME FALSE
SIMFLAGS
SIMJOBSUBMISSIONPOLICY CONSTANTJOBDEPTH
SIMINITIALQUEUEDEPTH 16
SIMWCACCURACY 0.00
SIMWCACCURACYCHANGE 0.00
SIMNODECOUNT 0
SIMNODECONFIGURATION NORMAL
SIMWCSCALINGPERCENT 100
SIMCOMRATE 0.10
SIMCOMTYPE ROUNDROBIN
COMINTRAFRAMECOST 0.30
COMINTERFRAMECOST 0.30
SIMSTOPITERATION -1
SIMEXITITERATION -1