// Comparison predicate over two LocalData_t entries: yields true when
// a.start is less than b.start. doSchedule hands it to std::stable_sort
// to order both the send and the receive schedule.
// NOTE(review): the name says "rank" but the visible comparison is on the
// `start` field -- confirm which ordering is intended.
// NOTE(review): the braces around the body are not present in this excerpt;
// the leading "27" / "29" look like line numbers carried over from the
// original listing, not code.
27 bool cmp_rank(
const LocalData_t& a,
const LocalData_t& b)
29 return a.start < b.start;
// Fragment of the send-side schedule computation -- presumably the body of
// computeSendBlock1D, judging by the "compute Send Schedule" banner below
// and the use of `sd`, `stopo`, `dtopo`, `param` and `vOut`.
// NOTE(review): the line carrying the function name and the first
// parameters is not part of this excerpt, and the gaps in the embedded
// listing numbers (39, 43, 45, ...) mean statements, braces and
// preprocessor lines are missing. Comments below describe only what is
// visible.
39 const Topology_t &stopo,
const Topology_t &dtopo,
// Debug trace: banner plus the topology totals and the global/local
// descriptor fields.
43 cerr <<
"\nIn compute Send Schedule--------------------\n";
45 fprintf(stderr,
"stopo: %ld\tdtopo: %ld\n",stopo.total, dtopo.total);
// NOTE(review): the format specifiers (%ld / %d) assume the field types of
// GlobalData_t / LocalData_t, which are not visible here -- verify.
46 fprintf(stderr,
"gd.len %ld\tgd.cyclic: %ld\tsd.start %d\tsd.len %d\n", gd.len, gd.cyclic, sd.start, sd.len);
// Branch taken when source and destination topologies have the same total.
// Only a trace of `sd` is visible inside it; the rest of the branch is not
// in this excerpt.
49 if (stopo.total == dtopo.total) {
53 fprintf(stderr,
" rank:%d start:%d len:%d base:%p\n", sd.rank, sd.start, sd.len, sd.base);
// Make room for up to dtopo.total additional schedule entries.
57 vOut.reserve(vOut.size() + dtopo.total);
// Block size as computed by blockSize() for the source topology.
59 unsigned slbsz =
blockSize(gd.len, stopo.total, param);
// Bounds of the locally held range; their assignment is not visible here.
64 unsigned long slow, shigh;
// Block size as computed by blockSize() for the destination topology.
67 unsigned dlbsz =
blockSize(gd.len, dtopo.total, param);
74 fprintf(stderr,
" loop from %d to %d width dtotal: %ld\n", fpid, lpid, dtopo.total);
// One schedule entry per index in [fpid, lpid] (inclusive upper bound).
// fpid / lpid are declared on lines that are not part of this excerpt.
78 for(
unsigned i=fpid; i <= lpid; i++) {
// Append one new entry to vOut and fill it in through the reference `s`.
80 vOut.resize(vOut.size()+1);
81 LocalData_t& s = vOut[vOut.size()-1];
// Entry start = the larger of `slow` and i*dlbsz.
// NOTE(review): `tmp` is a 32-bit-style `unsigned` while slow/shigh are
// unsigned long; the product i*dlbsz is computed in `unsigned`.
84 unsigned tmp = i*dlbsz;
85 s.start = ( slow >= tmp)?slow:tmp;
// Entry end = the smaller of `shigh` and `tmp`.
// NOTE(review): listing lines 86-87 are missing; `tmp` is presumably
// reassigned there, since with the value above end could never exceed
// s.start -- confirm against the full file.
88 unsigned end = ( shigh <= tmp)?shigh:tmp;
// Unsigned subtraction: wraps around if end < s.start.
90 s.len = end - s.start;
// Offset from sd.base by the entry's distance from sd.start, scaled by
// gd.unit_size.
92 s.base = sd.base + ((s.start - sd.start) * gd.unit_size);
95 fprintf(stderr,
" s1: to:%d start:%d len:%d base:%p\n", s.rank, s.start, s.len, s.base);
// Second visible section, built from whole blocks: stbsz is slbsz times
// the source topology total.
102 unsigned stbsz = slbsz * stopo.total;
// One entry per block index b; nbbloc is declared outside this excerpt.
106 for(
unsigned b=0; b<nbbloc; b++) {
// gb is computed from b, stopo.total and sd.rank; its use is not visible.
107 unsigned gb = b * stopo.total + sd.rank;
110 vOut.resize(vOut.size()+1);
111 LocalData_t& s = vOut[vOut.size()-1];
// Start = b whole strides of stbsz plus this rank's offset sd.rank*slbsz.
114 s.start = (stbsz*b) + (sd.rank*slbsz);
// Offset b*slbsz (scaled by gd.unit_size) from sd.base.
116 s.base = sd.base + ( b * slbsz * gd.unit_size );
// Trace output compiled only when DEBUG_INTERNAL is defined.
118 #ifdef DEBUG_INTERNAL 119 fprintf(stderr,
" s2: to:%d start:%d len:%d base:%p\n", s.rank, s.start, s.len, s.base);
125 #ifdef DEBUG_INTERNAL 126 cerr <<
"\nIn compute Send Schedule-------------------- done\n";
// Fragment of the receive-side schedule computation -- presumably the body
// of computeReceiveBlock1D, judging by the "compute Receive Schedule" banner
// below and the use of `dd`, `stopo`, `dtopo`, `param` and `vOut`.
// NOTE(review): the line carrying the function name and the first
// parameters is not part of this excerpt, and gaps in the embedded listing
// numbers mean statements, braces and #endif lines are missing. Comments
// below describe only what is visible.
136 const Topology_t &stopo,
const Topology_t &dtopo,
// Trace output compiled only when DEBUG_INTERNAL is defined.
139 #ifdef DEBUG_INTERNAL 140 cerr <<
"\nIn compute Receive Schedule--------------------\n";
142 fprintf(stderr,
"stopo: %ld\tdtopo: %ld\n",stopo.total, dtopo.total);
// NOTE(review): the format specifiers assume field types that are not
// visible here -- verify.
143 fprintf(stderr,
"gd.len %ld\tdd.start %d\tdd.len %d\n", gd.len, dd.start, dd.len);
// Branch taken when source and destination topologies have the same total.
// Only a trace of `dd` is visible inside it.
147 if (stopo.total == dtopo.total) {
149 #ifdef DEBUG_INTERNAL 150 fprintf(stderr,
" rank:%d start:%d len:%d base:%p\n", dd.rank, dd.start, dd.len, dd.base);
// Make room for up to stopo.total additional schedule entries.
154 vOut.reserve(vOut.size()+stopo.total);
// Block size as computed by blockSize() for the source topology.
156 unsigned slbsz =
blockSize(gd.len, stopo.total, param);
// Path taken when gd.cyclic is zero.
158 if (gd.cyclic == 0) {
// Locally held destination range: [dlow, dhigh) = [dd.start, dd.start + dd.len).
160 unsigned long dlow = dd.start;
161 unsigned long dhigh = dlow + dd.len;
167 #ifdef DEBUG_INTERNAL 168 fprintf(stderr,
" loop from %d to %d width stotal: %ld\n", fpid, lpid, stopo.total);
// One schedule entry per index in [fpid, lpid] (inclusive upper bound).
// fpid / lpid are declared on lines that are not part of this excerpt.
172 for(
unsigned i=fpid; i <= lpid; i++) {
// Append one new entry to vOut and fill it in through the reference `s`.
174 vOut.resize(vOut.size()+1);
175 LocalData_t& s = vOut[vOut.size()-1];
// Entry start = the larger of dlow and i*slbsz.
178 unsigned tmp = i*slbsz;
179 s.start = ( dlow >= tmp)?dlow:tmp;
// Entry end = the smaller of dhigh and `tmp`.
// NOTE(review): listing lines 180-181 are missing; `tmp` is presumably
// reassigned there, since with the value above end could never exceed
// s.start -- confirm against the full file.
182 unsigned end = ( dhigh <= tmp)?dhigh:tmp;
// Unsigned subtraction: wraps around if end < s.start.
184 s.len = end - s.start;
// Offset from dd.base by the entry's distance from dd.start, scaled by
// gd.unit_size.
186 s.base = dd.base + ((s.start - dd.start) * gd.unit_size);
188 #ifdef DEBUG_INTERNAL 189 fprintf(stderr,
" r1: from:%d start:%d len:%d base:%p\n", s.rank, s.start, s.len, s.base);
// Second visible section, built from whole blocks of the destination
// topology: dtbsz is dlbsz times the destination topology total.
195 unsigned dlbsz =
blockSize(gd.len, dtopo.total, param);
196 unsigned dtbsz = dlbsz * dtopo.total;
// One entry per block index b; nbbloc is declared outside this excerpt.
200 for(
unsigned b=0; b<nbbloc; b++) {
// gb is computed from b, dtopo.total and dd.rank; its use is not visible.
201 unsigned gb = b * dtopo.total + dd.rank;
204 vOut.resize(vOut.size()+1);
205 LocalData_t& s = vOut[vOut.size()-1];
// Start = b whole strides of dtbsz plus this rank's offset dd.rank*dlbsz.
208 s.start = dtbsz*b + dd.rank*dlbsz;
// Offset b*dlbsz (scaled by gd.unit_size) from dd.base.
210 s.base = dd.base + b * dlbsz * gd.unit_size;
212 #ifdef DEBUG_INTERNAL 213 fprintf(stderr,
" r2: from:%d start:%d len:%d base:%p\n", s.rank, s.start, s.len, s.base);
// Executes a previously computed communication schedule.
// Entries whose peer process id equals ld.rank are collected as local
// transfers and copied with memcpy; the visible MPI_Irecv / MPI_Isend calls
// post the other entries, which are then completed with MPI_Waitall.
//   gd         - global descriptor; gd.unit_size scales element counts.
//   ld         - local descriptor; ld.rank is compared with peer ids.
//   ctopo      - topology passed to getProcId() to map a rank to a peer id.
//   sched_send - send schedule; sorted in place with cmp_rank.
//   sched_recv - receive schedule; sorted in place with cmp_rank.
//   comm       - opaque pointer that is cast to MPI_Comm* and dereferenced.
// NOTE(review): gaps in the embedded listing numbers mean several lines are
// missing from this excerpt (declarations of ri, si and err, else branches,
// #endif lines, closing braces). Comments describe only what is visible.
227 void doSchedule(
const GlobalData_t& gd,
const LocalData_t& ld,
const Topology_t &ctopo,
228 vector<LocalData_t>& sched_send, vector<LocalData_t>& sched_recv,
void* comm) {
230 #ifdef DEBUG_INTERNAL 231 cerr <<
"\nIn doSchedule--------------------\n";
// NOTE(review): no null check on `comm` is visible before the dereference.
234 MPI_Comm mpi_comm = *(MPI_Comm*) comm;
// NOTE(review): prints MPI_Comm handles with %d; MPI_Comm is an opaque
// type and is not an int in every MPI implementation -- verify.
237 fprintf(stderr,
" MPI_COMM_WORLD=%d mpi_comm=%d\n", MPI_COMM_WORLD, mpi_comm);
// Nothing to do unless at least one of the two schedules is non-empty.
240 if (sched_send.size() || sched_recv.size()) {
// Request/status storage, one slot per schedule entry.
// NOTE(review): arrays with a run-time bound are a compiler extension, not
// standard C++, and one of the two sizes can be zero here because the
// guard above is an OR.
242 MPI_Request sreq[sched_send.size()];
243 MPI_Request rreq[sched_recv.size()];
248 MPI_Status sstat[sched_send.size()];
249 MPI_Status rstat[sched_recv.size()];
// Entries whose peer is this process; they hold pointers into the schedule
// vectors, which are not resized in the visible code after this point.
251 vector<LocalData_t*> local_recv;
252 vector<LocalData_t*> local_send;
// Order both schedules with cmp_rank (which compares the `start` field).
260 if (sched_send.size()) std::stable_sort(sched_send.begin(), sched_send.end(),
cmp_rank);
261 if (sched_recv.size()) std::stable_sort(sched_recv.begin(), sched_recv.end(),
cmp_rank);
268 cerr <<
" #sched_recv: " << sched_recv.size() << endl;
// Receive pass: map each entry's rank to a peer id.
270 for(
unsigned i=0; i < sched_recv.size(); i++) {
271 unsigned from =
getProcId(sched_recv[i].rank, ctopo);
// Peer is this process: remember the entry for the local memcpy phase.
272 if (from == ld.rank) {
274 fprintf(stderr,
" recv: schedr no=%d start=%d len=%d from=%d LOCAL\n", i,
275 sched_recv[i].start, sched_recv[i].len, from);
277 local_recv.push_back(&sched_recv[i]);
// Presumably the else branch (the `else` itself is not visible): post a
// non-blocking receive of len*unit_size bytes with tag 51.
280 fprintf(stderr,
" recv: schedr no=%d start=%d len=%d from=%d base=%p\n", i,
281 sched_recv[i].start, sched_recv[i].len, from, sched_recv[i].base);
// The request is stored at rreq[ri]; ri is declared outside this excerpt.
// NOTE(review): the MPI count parameter is an int; len*unit_size is not
// range-checked in the visible code.
284 int err = MPI_Irecv(sched_recv[i].base, sched_recv[i].len*gd.unit_size,
285 MPI_BYTE, from, 51, mpi_comm, &rreq[ri++]);
// NOTE(review): "EROR" in this and the following messages looks like a
// typo for "ERROR"; it is runtime text and is left unchanged here.
286 if (err!= MPI_SUCCESS) {
287 cerr <<
"EROR IN MPI_Irecv: return value is "<<err<<endl;
294 cerr <<
" #sched_send: " << sched_send.size() << endl;
// Send pass, mirroring the receive pass.
296 for(
unsigned i=0; i < sched_send.size(); i++) {
297 unsigned to =
getProcId(sched_send[i].rank, ctopo);
// NOTE(review): the condition selecting the LOCAL case (presumably
// `to == ld.rank`) is on a line that is not part of this excerpt.
300 fprintf(stderr,
" send: scheds no=%d start=%d len=%d to=%d LOCAL\n", i,
301 sched_send[i].start, sched_send[i].len, to);
303 local_send.push_back(&sched_send[i]);
306 fprintf(stderr,
" send: scheds no=%d start=%d len=%d to=%d base=%p\n", i,
307 sched_send[i].start, sched_send[i].len, to, sched_send[i].base);
// Non-blocking send of len*unit_size bytes with the same tag (51); the
// request is stored at sreq[si], with si declared outside this excerpt.
310 int err = MPI_Isend(sched_send[i].base, sched_send[i].len*gd.unit_size,
311 MPI_BYTE, to, 51, mpi_comm, &sreq[si++]);
312 if (err!= MPI_SUCCESS) {
313 cerr <<
"EROR IN MPI_Isend: return value is "<<err<<endl;
// Local phase: the two local lists are paired by index, so they are
// expected to have the same length.
// NOTE(review): only an error message is visible on mismatch; if nothing on
// the missing lines stops execution, the loop below indexes local_send[i]
// for every i in local_recv and could run past the shorter list.
319 if (local_recv.size() != local_send.size()) {
320 cerr <<
"Error: local recv & send have different size: " << local_recv.size() <<
" " << local_send.size() << endl;
322 for(
unsigned i=0; i < local_recv.size(); i++) {
// Each paired send/recv entry is expected to carry the same length.
323 if (local_recv[i]->len != local_send[i]->len) {
324 cerr <<
"Error: local recv & send have different len for i= "<<i<<
" :" << local_recv[i]->len <<
" " << local_send[i]->len << endl;
// NOTE(review): this trace reads sched_send[i], but i indexes the local
// lists; local_send[i] is probably what was meant.
327 fprintf(stderr,
" local: scheds no=%d start=%d len=%d\n", i,
328 sched_send[i].start, sched_send[i].len);
// Copy the local send buffer straight into the local receive buffer.
330 memcpy(local_recv[i]->base, local_send[i]->base, local_send[i]->len*gd.unit_size);
335 #ifdef DEBUG_INTERNAL 336 cerr <<
"WAITING local communications to end...\n";
// Complete the si posted sends, then the ri posted receives. `err` is
// assigned here without a visible declaration in this scope.
340 err = MPI_Waitall(si, sreq, sstat);
341 if (err!= MPI_SUCCESS) {
342 cerr <<
"EROR IN MPI_WaitAll for send: return value is "<<err<<endl;
344 err = MPI_Waitall(ri, rreq, rstat);
345 if (err!= MPI_SUCCESS) {
346 cerr <<
"EROR IN MPI_WaitAll for recv: return value is "<<err<<endl;
348 #ifdef DEBUG_INTERNAL 349 cerr <<
"WAITING local communications to end...ok \n";
// Flat list of function signatures (no bodies and no terminating
// semicolons). It looks like a generated index of the functions in this
// file rather than compilable declarations -- TODO confirm and drop it from
// the real source if so.
// The `static` entries are file-local helpers; computeSendBlock1D,
// computeReceiveBlock1D, doSchedule and cmp_rank are listed without `static`.
static unsigned BlockNumberOfElementProc(const unsigned glen, const unsigned rank, const unsigned nbprocs, const unsigned bsz, const unsigned pos)
static unsigned NumberOfBlockProc(const unsigned glen, const unsigned nbprocs, const unsigned bsz, const unsigned rank)
void computeSendBlock1D(const GlobalData_t &gd, const LocalData_t &sd, const Topology_t &stopo, const Topology_t &dtopo, const ParisBlock_param_t *param, vector< LocalData_t > &vOut)
static unsigned OwnerBlock(const unsigned bid, const unsigned nbprocs)
static void computeBlockBounds(unsigned long *low, unsigned long *high, const unsigned glen, const unsigned rank, const unsigned nbprocs, const unsigned bsz, const unsigned pos)
// NOTE(review): getProcId takes Topology_t by value, whereas the other
// functions in this list take it by const reference.
static unsigned getProcId(unsigned rank, Topology_t topo)
static unsigned getProcRangeInf(unsigned low, unsigned bsz)
void doSchedule(const GlobalData_t &gd, const LocalData_t &ld, const Topology_t &ctopo, vector< LocalData_t > &sched_send, vector< LocalData_t > &sched_recv, void *comm)
static unsigned blockSize(const unsigned glen, const unsigned nbprocs, const ParisBlock_param_t *param)
static unsigned getProcRangeSup(unsigned high, unsigned bsz)
void computeReceiveBlock1D(const GlobalData_t &gd, const LocalData_t &dd, const Topology_t &stopo, const Topology_t &dtopo, const ParisBlock_param_t *param, vector< LocalData_t > &vOut)
bool cmp_rank(const LocalData_t &a, const LocalData_t &b)