6 #include <Padico/MPCircuit.h> 14 #undef STANDALONE_FILE 16 #define DEBUG_INTERNAL 20 inline char* mymalloc(
unsigned sz) {
21 static unsigned base=0x1000000;
// Listing fragment: the tail of a parameter list followed by the visible body
// lines of the send-schedule computation (presumably computeSendBlock1D, whose
// full signature appears in the index at the end of this listing -- confirm).
// The numbers at the start of statements are the original file's line numbers;
// several original lines (closing braces, preprocessor closers, the
// computation of fpid/lpid, the assignment of s.rank) are not present here.
//
// Visible behaviour: for each index i in [fpid, lpid] one LocalData_t is
// appended to vOut; its start/len/base are derived from the local source
// description sd and from the block size returned by blockSize().
28 #ifdef STANDALONE_FILE 29 #include "../Generated/Concret.h" 30 #include "../Generated/XServiceType.h" 40 const Topology_t &stopo,
const Topology_t &dtopo,
41 vector<LocalData_t>& vOut) {
// Trace the inputs on stderr.
44 cerr <<
"\nIn compute Send Schedule--------------------\n";
46 fprintf(stderr,
"stopo: %ld\tdtopo: %ld\n",stopo.total, dtopo.total);
47 fprintf(stderr,
"gd.len %ld\tsd.start %d\tsd.len %d\n",gd.len, sd.start, sd.len);
// Case where source and destination topologies have the same total. Only a
// trace of sd is visible for this branch; the rest of it is not in the listing.
50 if (stopo.total == dtopo.total) {
54 fprintf(stderr,
" rank:%d start:%d len:%d base:%p\n", sd.rank, sd.start, sd.len, sd.base);
// Reserve capacity for dtopo.total additional entries before the loop appends.
58 vOut.reserve(vOut.size()+dtopo.total);
// Block size computed by blockSize() from gd.len and dtopo.total.
60 unsigned dbsz =
blockSize(gd.len, dtopo.total);
// [slow, shigh) is the index range described by sd (start, start + len).
62 unsigned long slow = sd.start;
63 unsigned long shigh = slow + sd.len;
// fpid/lpid are defined on original lines that are missing from this listing.
70 fprintf(stderr,
" loop from %d to %d width dtotal: %ld\n", fpid, lpid, dtopo.total);
// Inclusive loop over block indices fpid..lpid.
74 for(
unsigned i=fpid; i <= lpid; i++) {
// Append a default-constructed entry and fill it in place through s.
76 vOut.resize(vOut.size()+1);
77 LocalData_t& s = vOut[vOut.size()-1];
// tmp = first index of block i; the entry starts at max(slow, tmp).
80 unsigned tmp = i*dbsz;
81 s.start = ( slow >= tmp)?slow:tmp;
// end = min(shigh, tmp).
// NOTE(review): original lines 82-83 are missing from this listing; tmp is
// presumably reassigned there to the upper bound of block i. As listed, tmp
// still equals i*dbsz, which would make end <= s.start -- confirm against
// the real file.
84 unsigned end = ( shigh <= tmp)?shigh:tmp;
86 s.len = end - s.start;
// Advance sd.base by the offset of s.start inside sd, scaled by gd.unit_size.
88 s.base = sd.base + ((s.start - sd.start) * gd.unit_size);
91 fprintf(stderr,
" s: to:%d start:%d len:%d base:%p\n", s.rank, s.start, s.len, s.base);
96 cerr <<
"\nIn compute Send Schedule-------------------- done\n";
// Listing fragment: the tail of a parameter list followed by the visible body
// lines of the receive-schedule computation (presumably computeReceiveBlock1D,
// whose full signature appears in the index at the end of this listing --
// confirm). It mirrors the send-side code with the roles swapped: the block
// size comes from stopo.total and the local description is dd.
// Several original lines (closing braces, #endif, the computation of
// fpid/lpid, the assignment of s.rank) are not present in this listing.
106 const Topology_t &stopo,
const Topology_t &dtopo,
107 vector<LocalData_t>& vOut) {
// Trace the inputs on stderr (guarded by DEBUG_INTERNAL).
109 #ifdef DEBUG_INTERNAL 110 cerr <<
"\nIn compute Receive Schedule--------------------\n";
112 fprintf(stderr,
"stopo: %ld\tdtopo: %ld\n",stopo.total, dtopo.total);
113 fprintf(stderr,
"gd.len %ld\tdd.start %d\tdd.len %d\n", gd.len, dd.start, dd.len);
// Case where source and destination topologies have the same total. Only a
// trace of dd is visible for this branch; the rest of it is not in the listing.
117 if (stopo.total == dtopo.total) {
119 #ifdef DEBUG_INTERNAL 120 fprintf(stderr,
" rank:%d start:%d len:%d base:%p\n", dd.rank, dd.start, dd.len, dd.base);
// Reserve capacity for stopo.total additional entries before the loop appends.
124 vOut.reserve(vOut.size()+stopo.total);
// Block size computed by blockSize() from gd.len and stopo.total.
126 unsigned sbsz =
blockSize(gd.len, stopo.total);
// [dlow, dhigh) is the index range described by dd (start, start + len).
128 unsigned long dlow = dd.start;
129 unsigned long dhigh = dlow + dd.len;
// fpid/lpid are defined on original lines that are missing from this listing.
135 #ifdef DEBUG_INTERNAL 136 fprintf(stderr,
" loop from %d to %d width stotal: %ld\n", fpid, lpid, stopo.total);
// Inclusive loop over block indices fpid..lpid.
140 for(
unsigned i=fpid; i <= lpid; i++) {
// Append a default-constructed entry and fill it in place through s.
142 vOut.resize(vOut.size()+1);
143 LocalData_t& s = vOut[vOut.size()-1];
// tmp = first index of block i; the entry starts at max(dlow, tmp).
146 unsigned tmp = i*sbsz;
147 s.start = ( dlow >= tmp)?dlow:tmp;
// end = min(dhigh, tmp).
// NOTE(review): original lines 148-149 are missing from this listing; tmp is
// presumably reassigned there to the upper bound of block i. As listed, tmp
// still equals i*sbsz, which would make end <= s.start -- confirm against
// the real file.
150 unsigned end = ( dhigh <= tmp)?dhigh:tmp;
152 s.len = end - s.start;
// Advance dd.base by the offset of s.start inside dd, scaled by gd.unit_size.
154 s.base = dd.base + ((s.start - dd.start) * gd.unit_size);
156 #ifdef DEBUG_INTERNAL 157 fprintf(stderr,
" r: from:%d start:%d len:%d base:%p\n", s.rank, s.start, s.len, s.base);
// Executes a previously computed communication schedule.
//
//   gd          supplies unit_size, used to turn element counts into byte counts.
//   ld          ld.rank is compared with the resolved peer id to spot transfers
//               whose peer is this process.
//   ctopo       forwarded to getProcId() to resolve a schedule entry's rank.
//   sched_send  entries to send; sched_recv  entries to receive.
//   comm        opaque handle, cast to padico_mpcircuit_t.
//
// Visible flow: post padico_mpcircuit_Irecv for receive entries, issue
// padico_mpcircuit_send for send entries, memcpy the entries collected in
// local_recv/local_send, then padico_mpcircuit_waitAll on the posted receives.
// This listing omits several original lines (else branches, closing braces,
// the declaration of ri, the end of the send call), so the exact branch
// structure must be confirmed against the real file.
170 void doSchedule(
const GlobalData_t& gd,
const LocalData_t& ld,
const Topology_t &ctopo,
171 vector<LocalData_t>& sched_send, vector<LocalData_t>& sched_recv,
void* comm) {
173 cerr <<
"\nIn doSchedule--------------------\n";
175 padico_mpcircuit_t schd_mpc = (padico_mpcircuit_t) comm;
// Nothing below runs when both schedules are empty.
177 if (sched_send.size() || sched_recv.size()) {
// One request slot per receive entry.
// NOTE(review): an array whose bound is a run-time value is a compiler
// extension, not standard C++; a std::vector<void*> would be portable.
180 void* rreq[sched_recv.size()];
// Entries whose peer is this process are collected here and copied with memcpy.
185 vector<LocalData_t*> local_recv;
186 vector<LocalData_t*> local_send;
196 cerr <<
" #sched_recv: " << sched_recv.size() << endl;
198 for(
unsigned i=0; i < sched_recv.size(); i++) {
199 unsigned from =
getProcId(sched_recv[i].rank, ctopo);
// Peer is this process: remember the entry instead of posting a receive.
200 if (from == ld.rank) {
202 fprintf(stderr,
" recv: schedr no=%d start=%d len=%d from=%d LOCAL\n", i,
203 sched_recv[i].start, sched_recv[i].len, from);
205 local_recv.push_back(&sched_recv[i]);
208 fprintf(stderr,
" recv: schedr no=%d start=%d len=%d from=%d base=%p\n", i,
209 sched_recv[i].start, sched_recv[i].len, from, sched_recv[i].base);
// Post the receive (message tag 51) and keep its request for waitAll below.
// presumably this sits in the else branch of the test above -- the else line
// is not in the listing.
213 rreq[ri++] = padico_mpcircuit_Irecv(sched_recv[i].base, sched_recv[i].len*gd.unit_size,
214 from, 51, schd_mpc );
221 cerr <<
" #sched_send: " << sched_send.size() << endl;
223 for(
unsigned i=0; i < sched_send.size(); i++) {
224 unsigned to =
getProcId(sched_send[i].rank, ctopo);
// The test selecting the LOCAL case (original line 225) is not in the listing;
// presumably it mirrors the receive loop (to == ld.rank) -- confirm.
227 fprintf(stderr,
" send: scheds no=%d start=%d len=%d to=%d LOCAL\n", i,
228 sched_send[i].start, sched_send[i].len, to);
230 local_send.push_back(&sched_send[i]);
233 fprintf(stderr,
" send: scheds no=%d start=%d len=%d to=%d base=%p\n", i,
234 sched_send[i].start, sched_send[i].len, to, sched_send[i].base);
// The remaining arguments of this call are on an original line that is
// missing from the listing.
238 padico_mpcircuit_send(sched_send[i].base, sched_send[i].len*gd.unit_size,
// Local transfers are paired by position, so both lists must have equal size.
// NOTE(review): the loop below reads local_send[i] for every i below
// local_recv.size(); whether a size mismatch stops execution before the loop
// is not visible here -- if it does not, local_send[i] can be out of range.
245 if (local_recv.size() != local_send.size()) {
246 cerr <<
"Error: local recv & send have different size: " << local_recv.size() <<
" " << local_send.size() << endl;
248 for(
unsigned i=0; i < local_recv.size(); i++) {
249 if (local_recv[i]->len != local_send[i]->len) {
250 cerr <<
"Error: local recv & send have different len for i= "<<i<<
" :" << local_recv[i]->len <<
" " << local_send[i]->len << endl;
// NOTE(review): i indexes local_recv/local_send here, but the trace prints
// sched_send[i]; it likely should print local_send[i] (affects output only).
253 fprintf(stderr,
" local: scheds no=%d start=%d len=%d\n", i,
254 sched_send[i].start, sched_send[i].len);
// Copy the paired entry directly; the byte count is taken from the send side.
257 memcpy(local_recv[i]->base, local_send[i]->base, local_send[i]->len*gd.unit_size);
264 #ifdef DEBUG_INTERNAL 265 cerr <<
"WAITING local communications to end...\n";
// Wait for the ri receives posted above.
268 padico_mpcircuit_waitAll(rreq, ri);
269 #ifdef DEBUG_INTERNAL 270 cerr <<
"WAITING local communications to end...ok \n";
// Stand-alone driver, present only under STANDALONE_FILE. The listing's first
// line shows "#undef STANDALONE_FILE", so this is presumably compiled out in
// the normal build -- confirm.
// Visible steps: set up gd/sd and the two topologies, allocate and dump the
// local data, synchronise with padico_mpcircuit_barrier, then print every
// element of vdarray. Many original lines (declarations of gd, sd, vdarray,
// schd_mpc, the fill-loop body, the return) are missing from this listing.
280 #ifdef STANDALONE_FILE 281 int simSendDataBlock1D(
unsigned int glen,
int total,
int rank,
int dtotal,
const PaCO::distLoc_t& mode) {
285 Topology_t stopo, dtopo;
288 vector<unsigned> destid;
// Element size is that of xservice_data_t.
291 gd.unit_size =
sizeof(xservice_data_t);
// sd.base is assigned twice in the visible lines; presumably these are the
// two arms of a conditional whose control lines are not in the listing.
299 sd.base = (
char*) 0x1000;
// NOTE(review): the malloc result is not checked in the visible lines and no
// matching free is visible.
301 sd.base = (
char*) malloc(sd.len*gd.unit_size);
302 xservice_data_t* p=(xservice_data_t*) sd.base;
// Loop over the sd.len local elements (the body is not in the listing).
303 for(
unsigned i=0; i < sd.len; i++) {
// Print the sd.len local elements on stderr.
309 cerr <<
"Dumping data: ";
310 for(
unsigned k=0; k < sd.len; k++)
311 cerr <<
" " << ((xservice_data_t*)sd.base)[k];
316 dtopo.total = dtotal;
324 padico_mpcircuit_barrier(schd_mpc);
// Report what was produced: one vdarray entry per destination id in destid.
326 cerr <<
"\n #vdarray: " << vdarray.size() <<
"\n";
327 for(
unsigned i=0; i< vdarray.size(); i++) {
328 cout <<
"Dumping vdarray["<<i<<
"] to " << destid[i] <<
" :\n";
329 cout <<
" topo / gd : " << vdarray[i]->topo().total <<
" / " << vdarray[i]->gd().len <<
" / " << vdarray[i]->gd().unit_size << endl;
330 cout <<
" dist # " << vdarray[i]->dist().length() << endl;
// One line per distribution entry: rank, start, len and getDataLength(j).
331 for(
unsigned j=0; j< vdarray[i]->dist().length(); j++) {
332 cout <<
" rank/low/len (len): " << vdarray[i]->dist()[j].rank <<
" / " << vdarray[i]->dist()[j].start <<
" / " << vdarray[i]->dist()[j].len <<
" ( " << vdarray[i]->getDataLength(j) <<
" )" <<endl;
// Command-line entry point of the test driver.
// Expected arguments (per the usage message): len source_total dest_total_nodes.
// Visible steps: MPI_Init, parse the three numbers, query the MPI rank, print
// progress banners. The argument-count test, the declarations of rank/pid,
// the call into the simulation and the shutdown are on original lines that
// are missing from this listing.
348 int main(
int argc,
char** argv) {
350 MPI_Init(&argc, &argv);
353 fprintf(stderr,
"Usage: %s len source_total dest_total_nodes\n", argv[0]);
357 unsigned glen, stotal, dtotal;
// NOTE(review): these variables are unsigned, but "%d" expects int*; "%u" is
// the matching conversion. The sscanf return values are also not checked, so
// a non-numeric argument leaves the variable uninitialised.
359 sscanf(argv[1],
"%d",&glen);
360 sscanf(argv[2],
"%d",&stotal);
361 sscanf(argv[3],
"%d",&dtotal);
// NOTE(review): "%d" is likewise used to print the unsigned values below.
363 fprintf(stderr,
"Distribution block 1D with len=%d stotal=%d dtotal=%d\n", glen, stotal, dtotal);
// Rank of this process in MPI_COMM_WORLD.
366 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
370 cerr << endl <<
"------------------------------- " << rank <<
" / " << stotal << endl;
374 cerr << pid <<
": ENDING !!\n";
void computeSendBlock1D(const GlobalData_t &gd, const LocalData_t &sd, const Topology_t &stopo, const Topology_t &dtopo, vector< LocalData_t > &vOut)
static unsigned getProcId(unsigned rank, Topology_t topo)
void doSchedule(const GlobalData_t &gd, const LocalData_t &ld, const Topology_t &ctopo, vector< LocalData_t > &sched_send, vector< LocalData_t > &sched_recv, void *comm)
void computeSendDataBlock1D(const GlobalData_t &gd, const LocalData_t &sd, const Topology_t &stopo, const Topology_t &dtopo, const ParisBlock_param_t *param, vAbstrait &vdarray, vector< unsigned > &destid, const PaCO::distLoc_t &mode, void *comm)
static unsigned getProcRangeInf(unsigned low, unsigned bsz)
static unsigned computeBlockBoundInf0(unsigned bsz, unsigned rank)
int main(int argc, char *argv[])
static unsigned blockSize(const unsigned glen, const unsigned nbprocs, const ParisBlock_param_t *param)
static unsigned localBlockLengthO(unsigned glen, unsigned rank, unsigned total, unsigned bsz)
static unsigned getProcRangeSup(unsigned high, unsigned bsz)
void computeReceiveBlock1D(const GlobalData_t &gd, const LocalData_t &dd, const Topology_t &stopo, const Topology_t &dtopo, vector< LocalData_t > &vOut)