HARM
harm and utilities
mpi_init_grmhd.c

#include "decs.h"


// Initialize MPI for GRMHD code
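// A typical call from the code's entry point might look like the sketch below
// (illustrative only; the actual driver's main() is not part of this file):
//
//   int main(int argc, char *argv[])
//   {
//     init_MPI_GRMHD(&argc, &argv); // must run before anything that depends on myid
//     ...
//   }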
int init_MPI_GRMHD(int *argc, char **argv[])
{

  stderrfprintf("Begin: init_MPI_GRMHD\n"); fflush(stderr);
  // init MPI (assumes nothing in set_arrays.c is used here) : always done non-blocking:
  // also called if USEMPI==0
  init_MPI_general(argc, argv);
  // NOW CAN USE myid to determine whether to print things out
  //  stderrfprintf("Did NOT init_MPI_GRMHD\n"); fflush(stderr);


#if(USEOPENMP)
  init_OPENMP_general(stderr);
#endif


  // always do the below since it just sets defaults if not doing liaisonmode
  stderrfprintf("Begin grmhd_init_mpi_liaisonmode_globalset()\n");
  grmhd_init_mpi_liaisonmode_globalset();
  stderrfprintf("End grmhd_init_mpi_liaisonmode_globalset()\n");


#if(USEMPI)
  // this is a non-blocking local operation
  MPI_Comm_rank(MPI_COMM_GRMHD, &myid); // proc id within the GRMHD context only
#endif


  // currently INIT provides args to the rest of the processes
  stderrfprintf("Begin myargs(*argc,*argv)\n");
  myargs(*argc,*argv);
  stderrfprintf("End myargs(*argc,*argv)\n");

  // set default MPIid (must come after myargs())
  stderrfprintf("Begin init_default_MPI_GRMHD_myid()\n");
  init_default_MPI_GRMHD_myid();
  stderrfprintf("End init_default_MPI_GRMHD_myid()\n");
  // report MPIid[myid] ordering
  if((PRODUCTION<=2 && myid==0) || PRODUCTION<=1) report_myid(stderr);


#if(USEOPENMP)
  // Setup OpenMP (currently just the number of threads, which was set from the user arguments)
#endif


  // Create "myid" for the HARM code, not to be used in MPI functions
  // Allows translation and overloading of nodes in a desirable way
#if(USEMPI)
  init_MPI_GRMHD_myid();
#endif


  // for file names
  sprintf(myidtxt, ".grmhd.%04d", MPIid[myid]);
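  // e.g. if MPIid[myid]==12, myidtxt becomes ".grmhd.0012" (%04d zero-pads the true rank to 4 digits)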

  // rest of initialization (still shouldn't include domain decomposition setup -- just other most basic setups)
  init_MPI_setupfilesandgrid(*argc, *argv);

  // report MPIid[myid] ordering
  //  MPI_Barrier(MPI_COMM_GRMHD);
  stderrfprintf("Begin report_myid()\n");
  if(myid==0 && logfull_file) report_myid(logfull_file);
  if(log_file) report_myid(log_file);
  stderrfprintf("End report_myid()\n");


#if(USEOPENMP)
  // output to logfull_file
#endif

#if(DOINGLIAISON)
  // liaison-related test code:
  test_nonliaison();
#endif

  return (0);

}

// for testing LIAISON+GRMHD code communication
void test_nonliaison(void)
{
  int myint;

  myint=myid;

#if(USEMPI&&DOINGLIAISON)
  MPI_Bcast(&myint,1,MPI_INT,MPIid[0],MPI_COMM_GRMHD_LIAISON);
  dualfprintf(fail_file,"myid=%d myint=%d\n",myid,myint);
#endif

  myexit(0);

}


// Set default MPIid[] mapping
// must come after assigning the rank to myid and after getting numprocs
int init_default_MPI_GRMHD_myid(void)
{
  int proc;

  for(proc=0;proc<numprocs;proc++){
    MPIid[proc]=proc; // true MPIid
  }


  if(MPIid[myid]!=myid){
    stderrfprintf("Failure to setup default MPIid[myid]=%d: myid=%d numprocs=%d\n",MPIid[myid],myid,numprocs); fflush(stderr);
    for(proc=0;proc<numprocs;proc++){
      stderrfprintf("MPIid[proc=%d]=%d\n",proc,MPIid[proc]); fflush(stderr);
    }
    myexit(1486754);
  }

  return(0);

}

int init_MPI_GRMHD_myid(void)
{
  int truempiid;

  //
  // Note: ranks are only used in the code in the following MPI functions so far:
  //
  // MPI_Reduce()    : 2nd-to-last argument is the root rank
  // MPI_Bcast()     : 2nd-to-last argument is the rank of the root
  // MPI_Irecv()     : 4th argument is the rank of the source
  // MPI_Isend()     : 4th argument is the rank of the destination
  // MPI_Issend()    : 4th argument is the rank of the destination
  // MPI_Group_incl(): 3rd argument is an array of ranks, but it is only used BEFORE the GRMHD ranks are created, so no mapping. It is used in mpi_init.c, so the "ranks" array is required to be the set of mapped ranks [as it currently is].
  // MPI_Comm_rank() : 2nd argument is the rank, but it gets the rank rather than setting or using one
  //
  // And note that we are only mapping *ranks*, NOT tags!
  //
  // Ensure all calls are converted, e.g.:
  //   grep "MPI_Irecv" *.c *.h | grep -v MPIid | less
  //
  // DO NOT CHANGE THE RANK FOR:
  //
  // MPI_Type_create_darray(): 2nd argument is the rank [the rank here is the rank within the array geometry, which is fixed to be the GRMHD "myid" rank]
  //
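  // A converted call then looks like the following sketch (illustrative only; buf, count,
  // othermyid, and tag are placeholders rather than names from this code):
  //
  //   MPI_Isend(buf, count, MPI_DOUBLE, MPIid[othermyid], tag, MPI_COMM_GRMHD, &request);
  //
  // i.e. the HARM id is passed through MPIid[] for the rank argument, while the tag is left unmapped.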

  // Depending upon grid sectioning or other considerations, allow a user function call here that chooses some way to distribute the MPI processes.
  // For example, to keep L2 cache coherency one might require (say) 32x4x4 in 3D per *node*. But if doing totalsize[1]=1024 and grid sectioning, then those outer procs are not used!

  // However, one can create a look-up table so that when an MPI command is called, any given MPI process is fed myid but the desired MPIid is returned. So one can redistribute how MPI procs are arranged (even in real time, if the on-grid data were transferred around) in order to overload certain nodes, with (say) half of the MPI procs on each node not being used.

  // Note that below we are creating a particular direction for the mapping: the mapping returns the MPI id for a given HARM id. We choose this direction because the HARM id is set up to have a specific meaning w.r.t. the spatial grid. Also, the HARM id appears in more places than the MPI id, which only appears in MPI commands when the true MPI rank really needs to be known.

  // Note that the default was already set as MPIid[myid]=myid, where MPIid is the true MPI id used for MPI commands.

  // One way to distribute is to take the 2nd half of the grid (in real space) and double it up onto the 1st half.

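  // As a concrete sketch of the mapping direction (illustrative only; uses the same id convention as report_myid() below):
  //
  //   origid    = ranki + rankj*ncpux1 + rankk*ncpux1*ncpux2;  // HARM id with fixed spatial meaning
  //   truempiid = MPIid[origid];                               // true rank actually handed to MPI commands
  //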

  //
  // Take Ranger for example:
  //
  // http://www.tacc.utexas.edu/services/userguides/ranger/
  //
  // When reordering tasks, one should ensure that each core is overloaded in the correct way, with the correct socket and core affinity for a given MPI task.
  //
  // Ranger has 16 cores/node. One sets:
  //
  //   -pe <tpn>way <NoNx16>
  //
  // where <tpn> is the number of MPI tasks per node and NoN is the number of nodes (so <NoNx16> is the total number of cores requested). Then we run HARM and choose:
  //
  //   ./grmhd OMPPERTASK ncpux1 ncpux2 ncpux3
  //
  // Also note that on Ranger each socket has its own memory! I didn't know this. The documentation pages also explain how to assign socket/core affinity so that memory accesses are more efficient. This is important, for example, in order to avoid a thread or task on one socket trying to use or allocate memory on another socket, since that would use the PCI bus, which is very slow. Also, when overloading a socket, we want to have MPI tasks assigned with the correct affinity to a socket -- otherwise we'll have NO gain!
  //
  // So now, let us set:
  //
  //   -pe 8way 256
  //
  // and one sets (although the code can override this, and the affinity file mentioned below doesn't use it):
  //
  //   export OMP_NUM_THREADS=4
  //
  // and run using:
  //
  //   ./grmhd 4 8 2 1
  //
  // Ranger will allocate 256/16=16 nodes with 8 MPI tasks per node. If we run using:
  //
  //   ibrun tacc_affinity ./grmhd 4 8 2 1
  //
  // then the affinity setup will use round robin to assign MPI tasks to sockets. That is, for 8way it will assign the first node's 8 MPI tasks as:
  //
  //   rank0 -> Socket0   rank1 -> Socket0
  //   rank2 -> Socket1   rank3 -> Socket1
  //   rank4 -> Socket2   rank5 -> Socket2
  //   rank6 -> Socket3   rank7 -> Socket3
  //
  // That's round robin with 8way set. Clearly, then, for each node with 4 sockets we must KNOW that this is in fact the ordering. If it is, then we can take those 8 MPI tasks and ensure that ranks 1,3,5,7 are associated with the outer radial regions beyond the initially active grid sectioning. Only then will each socket be able to launch 4 OpenMP threads that efficiently stay on a single socket for each MPI task.
  //
  // BTW, one can write one's own ordering of affinity very easily. The script is just /share/sge/default/pe_scripts/tacc_affinity and appears to be very easy to control. However, let's assume the default "tacc_affinity" affinity method and modify our code as below.
  //
  // Within the code, we reorder the MPIid[]'s (see the new code in mpi_init_grmhd.c) so that (e.g. in 1D for demonstration purposes) MPI tasks (ranks) 0-15 are instead ordered as 0,2,4,6,8,10,12,14 in physical model space. This assumes that the MPI setup has done a good job of affinity itself, which on TACC is true.
  //
  // Then, assuming grid sectioning operates on roughly half the grid at any one time, MPI tasks 0 and 1 won't be running at the same time. All MPI tasks will use 4 cores per task, so they only access their own memory channel and don't go across the PCI bus on Ranger.
  //
  // So this reordering of MPI task numbers depends critically (at least for multi-socket systems where each socket accesses different memory) on knowing how the default affinity is defined. And it depends upon the fact that we chose "8way" but tricked the system into a setup that really means "4way", since ultimately we used 4 OpenMP threads per MPI task.
  //


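  // A minimal sketch (hypothetical, not from the original code) of the 1D interleaving described
  // above, as a user routine like theproblem_set_myid() might implement it: the first half of the
  // HARM ids (the initially active section) gets the even true ranks, the second half the odd ones.
  //
  //   int harmid;
  //   for(harmid=0;harmid<numprocs;harmid++){
  //     if(harmid<numprocs/2) MPIid[harmid] = 2*harmid;                // active half -> even ranks
  //     else                  MPIid[harmid] = 2*(harmid-numprocs/2)+1; // inactive half -> odd ranks
  //   }
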
  // true id for this proc:
  //   truempiid=MPIid[myid];


  // Call user function [can have myid==0 set up all CPUs, or just have all CPUs do the same setup]
  theproblem_set_myid();


#if(USEMPI)
  // Might have myid==0 set up all CPUs, but not MPIid[0], since that is unknown by all CPUs at first and might change. So use rank 0 as the broadcast root in case myid==0 set up all CPUs.
  MPI_Bcast(MPIid,truenumprocs,MPI_INT,0,MPI_COMM_GRMHD);
#endif

  return(0);

}

// report listings of MPIid[myid]
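// For example (illustrative only), with ncpux1=4, ncpux2=2, ncpux3=1 and the default identity
// mapping, the rank grid printed is:
//
//   rankk=0::
//   0000 0001 0002 0003
//   0004 0005 0006 0007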
int report_myid(FILE *out)
{
  int ranki,rankj,rankk,origid;

  fprintf(out,"BEGIN Rank orders in physical model space\n");

  fprintf(out,"\n");
  for(rankk=0;rankk<ncpux3;rankk++){
    fprintf(out,"rankk=%d::\n",rankk); // report each k-section
    for(rankj=0;rankj<ncpux2;rankj++){
      for(ranki=0;ranki<ncpux1;ranki++){
        origid=ranki + rankj*ncpux1 + rankk*ncpux1*ncpux2;
        fprintf(out,"%04d",MPIid[origid]);
        if(ranki!=ncpux1-1) fprintf(out," ");
        else fprintf(out,"\n");
      }
      if(rankj==ncpux2-1) fprintf(out,"\n");
    }
    if(rankk==ncpux3-1) fprintf(out,"\n");
  }
  fprintf(out,"\n");

  fprintf(out,"END Rank orders in physical model space\n");

  return(0);
}