<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:p="urn:schemas-microsoft-com:office:powerpoint" xmlns:a="urn:schemas-microsoft-com:office:access" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882" xmlns:rs="urn:schemas-microsoft-com:rowset" xmlns:z="#RowsetSchema" xmlns:b="urn:schemas-microsoft-com:office:publisher" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet" xmlns:odc="urn:schemas-microsoft-com:office:odc" xmlns:oa="urn:schemas-microsoft-com:office:activation" xmlns:html="http://www.w3.org/TR/REC-html40" xmlns:q="http://schemas.xmlsoap.org/soap/envelope/" xmlns:D="DAV:" xmlns:mt="http://schemas.microsoft.com/sharepoint/soap/meetings/" xmlns:x2="http://schemas.microsoft.com/office/excel/2003/xml" xmlns:ois="http://schemas.microsoft.com/sharepoint/soap/ois/" xmlns:dir="http://schemas.microsoft.com/sharepoint/soap/directory/" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" xmlns:dsp="http://schemas.microsoft.com/sharepoint/dsp" xmlns:udc="http://schemas.microsoft.com/data/udc" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:sub="http://schemas.microsoft.com/sharepoint/soap/2002/1/alerts/" xmlns:ec="http://www.w3.org/2001/04/xmlenc#" xmlns:sp="http://schemas.microsoft.com/sharepoint/" xmlns:sps="http://schemas.microsoft.com/sharepoint/soap/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:udcs="http://schemas.microsoft.com/data/udc/soap" xmlns:udcxf="http://schemas.microsoft.com/data/udc/xmlfile" xmlns:udcp2p="http://schemas.microsoft.com/data/udc/parttopart" xmlns:wf="http://schemas.microsoft.com/sharepoint/soap/workflow/" xmlns:dsss="http://schemas.microsoft.com/office/2006/digsig-setup" xmlns:dssi="http://schemas.microsoft.com/office/2006/digsig" 
xmlns:mdssi="http://schemas.openxmlformats.org/package/2006/digital-signature" xmlns:mver="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns:mrels="http://schemas.openxmlformats.org/package/2006/relationships" xmlns:spwp="http://microsoft.com/sharepoint/webpartpages" xmlns:ex12t="http://schemas.microsoft.com/exchange/services/2006/types" xmlns:ex12m="http://schemas.microsoft.com/exchange/services/2006/messages" xmlns:pptsl="http://schemas.microsoft.com/sharepoint/soap/SlideLibrary/" xmlns:spsl="http://microsoft.com/webservices/SharePointPortalServer/PublishedLinksService" xmlns:Z="urn:schemas-microsoft-com:" xmlns:st="" xmlns="http://www.w3.org/TR/REC-html40">

<head>
<meta http-equiv=Content-Type content="text/html; charset=us-ascii">
<meta name=Generator content="Microsoft Word 12 (filtered medium)">
<style>
<!--
 /* Font Definitions */
 @font-face
        {font-family:Wingdings;
        panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:Cambria;
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:Tahoma;
        panose-1:2 11 6 4 3 5 4 4 2 4;}
 /* Style Definitions */
 p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0in;
        margin-bottom:.0001pt;
        font-size:12.0pt;
        font-family:"Times New Roman","serif";}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:blue;
        text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
        {mso-style-priority:99;
        color:purple;
        text-decoration:underline;}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
        {mso-style-priority:34;
        margin-top:0in;
        margin-right:0in;
        margin-bottom:0in;
        margin-left:.5in;
        margin-bottom:.0001pt;
        font-size:12.0pt;
        font-family:"Times New Roman","serif";}
p.emailquote, li.emailquote, div.emailquote
        {mso-style-name:emailquote;
        mso-margin-top-alt:auto;
        margin-right:0in;
        mso-margin-bottom-alt:auto;
        margin-left:1.0pt;
        font-size:12.0pt;
        font-family:"Times New Roman","serif";}
span.EmailStyle18
        {mso-style-type:personal;
        font-family:"Calibri","sans-serif";
        color:#1F497D;}
span.EmailStyle19
        {mso-style-type:personal;
        font-family:"Calibri","sans-serif";
        color:#1F497D;}
span.EmailStyle20
        {mso-style-type:personal;
        font-family:"Calibri","sans-serif";
        color:#1F497D;}
span.EmailStyle21
        {mso-style-type:personal-reply;
        font-family:"Calibri","sans-serif";
        color:#1F497D;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-size:10.0pt;}
@page Section1
        {size:8.5in 11.0in;
        margin:1.0in 1.0in 1.0in 1.0in;}
div.Section1
        {page:Section1;}
 /* List Definitions */
 @list l0
        {mso-list-id:921522559;
        mso-list-type:hybrid;
        mso-list-template-ids:-1947046762 1292258822 67698691 67698693 67698689 67698691 67698693 67698689 67698691 67698693;}
@list l0:level1
        {mso-level-start-at:8;
        mso-level-number-format:bullet;
        mso-level-text:-;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;
        font-family:"Calibri","sans-serif";
        mso-fareast-font-family:Calibri;
        mso-bidi-font-family:Arial;}
@list l0:level2
        {mso-level-number-format:bullet;
        mso-level-text:o;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-.25in;
        font-family:"Courier New";}
ol
        {margin-bottom:0in;}
ul
        {margin-bottom:0in;}
-->
</style>
<!--[if gte mso 9]><xml>
 <o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
 <o:shapelayout v:ext="edit">
  <o:idmap v:ext="edit" data="1" />
 </o:shapelayout></xml><![endif]-->
</head>

<body lang=EN-US link=blue vlink=purple>

<div class=Section1>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>While putting together the next scenario I realized that it
would be easy enough to modify the worker code in scenario #1 to enable master
fault-tolerance.<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Pseudo
code for the hardened worker:<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
color:#548DD4;background:white'>int</span><span style='font-size:10.0pt;
font-family:"Courier New";color:black;background:white'> main()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>{</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#984806'>MPI_Init</span>()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>> <span style='color:#984806'>MPI_Comm_set_errhandler</span>(MPI_COMM_WORLD,
MPI_ERRORS_RETURN);</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#548DD4'>for</span>(;;)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    {</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        rc = <span
style='color:#984806'>MPI_Recv</span>(src=0, &query, MPI_COMM_WORLD);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#548DD4'>if</span>(rc != MPI_SUCCESS)</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         rc =
MPI_Comm_Restart_rank(MPI_COMM_WORLD, 0);</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         <span
style='color:#548DD4'>if</span>(rc == MPI_SUCCESS)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>             continue;<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>></span><span style='font-size:10.0pt;font-family:
"Calibri","sans-serif"'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         exit(1);
</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     }</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#548DD4'>if</span>(is_done_msg(query))</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>           
<b>break</b>;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>       
process_query(&query, &answer);</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#984806'>MPI_Send</span>(dst=0, &answer, MPI_COMM_WORLD);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    }</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New"'> </span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#984806'>MPI_Finalize</span>()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>}</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>This change seems easy enough… Note that we need to check
the error code only after receive and not after send; the assumption is that if
send failed so will receive.<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Other assumptions:<o:p></o:p></span></p>

<p class=MsoListParagraph style='text-indent:-.25in;mso-list:l0 level1 lfo1'><![if !supportLists]><span
style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'><span
style='mso-list:Ignore'>-<span style='font:7.0pt "Times New Roman"'>         
</span></span></span><![endif]><span dir=LTR></span><span style='font-size:
11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>Eventually all ranks will
detect that the master failed and will call restart_rank<o:p></o:p></span></p>

<p class=MsoListParagraph style='margin-left:1.0in;text-indent:-.25in;
mso-list:l0 level2 lfo1'><![if !supportLists]><span style='font-size:11.0pt;
font-family:"Courier New";color:#1F497D'><span style='mso-list:Ignore'>o<span
style='font:7.0pt "Times New Roman"'>   </span></span></span><![endif]><span
dir=LTR></span><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>However, they do not start another copy, but only block until the
process is started.<o:p></o:p></span></p>

<p class=MsoListParagraph style='text-indent:-.25in;mso-list:l0 level1 lfo1'><![if !supportLists]><span
style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'><span
style='mso-list:Ignore'>-<span style='font:7.0pt "Times New Roman"'>         
</span></span></span><![endif]><span dir=LTR></span><span style='font-size:
11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>Any outstanding message
sent from the worker to the master after it failed is flushed out by the MPI
implementation (as most implementations do today).<o:p></o:p></span></p>

<p class=MsoListParagraph style='text-indent:-.25in;mso-list:l0 level1 lfo1'><![if !supportLists]><span
style='font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'><span
style='mso-list:Ignore'>-<span style='font:7.0pt "Times New Roman"'>         
</span></span></span><![endif]><span dir=LTR></span><span style='font-size:
11.0pt;font-family:"Calibri","sans-serif";color:#1F497D'>Use exit(1) if
restart_rank fails (rather than MPI_Abort) for the case where this rank could
not start the master but other ranks could; in that case the master will
restart the failing rank. Calling MPI_Abort will abort the entire job. (hmmm…
possibly calling MPI_Abort(MPI_COMM_SELF) might be okay).<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Now, what is the point of restarting the master if all messages
get flushed and there isn’t actually any state held by any rank??<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Well, the first thing that comes to mind is the startup time of a
large job. You would need to restart all processes on all nodes. The other is resource
allocation on a batch system. If the job gives up the resources just to be able
to restart immediately, the batch system might (a) not have the resources
available immediately as other jobs are running and expanding, or (b) queue the job
to restart at a later time because there are other jobs in the queue already.<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>So in this case the reason to implement master recovery would be
latency to completion in case of a failure.<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Another reason would be perception, not to bother the user with the
indication that a job failed.<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>What do you think?<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Thanks,<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>.Erez<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<div>

<div style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in'>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>From:</span></b><span
style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'> Erez Haba <br>
<b>Sent:</b> Wednesday, February 25, 2009 3:21 PM<br>
<b>To:</b> MPI 3.0 Fault Tolerance and Dynamic Process Control working Group<br>
<b>Subject:</b> RE: MPI Fault Tolerance scenarios<o:p></o:p></span></p>

</div>

</div>

<p class=MsoNormal><o:p> </o:p></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Updated the wiki page code to simplify how the code reads (the
error case after MPI_Waitany)<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><a
href="https://svn.mpi-forum.org/trac/mpi-forum-web/wiki/ft/scenarios_and_solutions">https://svn.mpi-forum.org/trac/mpi-forum-web/wiki/ft/scenarios_and_solutions</a><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<div>

<div style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in'>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>From:</span></b><span
style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>
mpi3-ft-bounces@lists.mpi-forum.org
[mailto:mpi3-ft-bounces@lists.mpi-forum.org] <b>On Behalf Of </b>Erez Haba<br>
<b>Sent:</b> Wednesday, February 25, 2009 10:06 AM<br>
<b>To:</b> MPI 3.0 Fault Tolerance and Dynamic Process Control working Group<br>
<b>Subject:</b> Re: [Mpi3-ft] MPI Fault Tolerance scenarios<o:p></o:p></span></p>

</div>

</div>

<p class=MsoNormal><o:p> </o:p></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Thanks Greg for catching this. I fixed setting the
‘repairing[i] = false’ in the example below and on the wiki page.<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>Added the lines<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#548DD4'>else</span></span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>        
repairing[i] = false;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#1F497D'>}</span><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<div>

<div style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in'>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>From:</span></b><span
style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>
mpi3-ft-bounces@lists.mpi-forum.org
[mailto:mpi3-ft-bounces@lists.mpi-forum.org] <b>On Behalf Of </b>Erez Haba<br>
<b>Sent:</b> Wednesday, February 18, 2009 12:04 PM<br>
<b>To:</b> MPI 3.0 Fault Tolerance and Dynamic Process Control working Group<br>
<b>Subject:</b> Re: [Mpi3-ft] MPI Fault Tolerance scenarios<o:p></o:p></span></p>

</div>

</div>

<p class=MsoNormal><o:p> </o:p></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'>I’ve posted this scenario on the FT wiki pages<o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><a
href="https://svn.mpi-forum.org/trac/mpi-forum-web/wiki/ft/scenarios_and_solutions">https://svn.mpi-forum.org/trac/mpi-forum-web/wiki/ft/scenarios_and_solutions</a><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<p class=MsoNormal><span style='font-size:11.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

<div>

<div style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in'>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>From:</span></b><span
style='font-size:10.0pt;font-family:"Tahoma","sans-serif"'>
mpi3-ft-bounces@lists.mpi-forum.org
[mailto:mpi3-ft-bounces@lists.mpi-forum.org] <b>On Behalf Of </b>Erez Haba<br>
<b>Sent:</b> Tuesday, February 17, 2009 6:53 PM<br>
<b>To:</b> MPI 3.0 Fault Tolerance and Dynamic Process Control working Group<br>
<b>Subject:</b> [Mpi3-ft] MPI Fault Tolerance scenarios<o:p></o:p></span></p>

</div>

</div>

<p class=MsoNormal><o:p> </o:p></p>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Hello
all,<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>In
our last meeting we decided to build a set of FT scenarios/programs to help us
understand the details of the interface needed to support those scenarios. We
also decided to start with very simple scenarios and add more complex ones as
we understand the former better.  I hope that starting with simple
scenarios will help us build a solid foundation on which we can build the more
complex solutions.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>When
we build an FT solution we will focus on the scenario as described, without
complicating the solution just because it would be needed later for a more
complex one. The time will come later to modify the solution as we acquire more
knowledge and build the foundations. Hence, any proposal or change that we make
needs to fit <i><u>exactly</u></i> the scenario (and all those that we
previously looked at) but no more.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>For
example in the first scenario that we’ll look at there is no need for
saving communicator state or error callback; but they might be required later.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Note
that these scenarios focus on process FT rather than checkpoint/restart or
network degradation. I assume we’ll do the latter later.<o:p></o:p></span></p>

</div>

<div style='margin-top:24.0pt'>

<p class=MsoNormal><b><span style='font-size:13.5pt;font-family:"Cambria","serif";
color:#365F91'>Scenario #1: Very Simple Master-Workers</span></b><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div style='margin-top:10.0pt'>

<p class=MsoNormal><b><span style='font-family:"Cambria","serif";color:#4F81BD'>Description</span></b><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>This
is a very simple master-workers scenario. However simple, we were asked many
times by customers to support FT in this scenario.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>In
this case the MPI application runs with n processes, where rank 0 is used as
the master and n-1 ranks are used as workers.  The master generates work
(either by getting it directly from user input, or reading a file) and sends it
for processing to a free worker rank. The master sends requests and receives
replies using MPI point-to-point communication.  The workers wait for the
incoming message, upon arrival the worker computes the result and sends it back
to the master.  The master stores the result to a log file.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Hardening</span></b><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>: The goal is to
harden the workers, the master itself is not FT, thus if it fails the entire
application fails. In this case the workers are FT, and are replaced to keep
computation power for this application. (a twist: if a worker cannot be
recovered the master can work with a smaller set of clients up to a low
watermark).<o:p></o:p></span></p>

</div>

<div style='margin-top:10.0pt'>

<p class=MsoNormal><b><span style='font-family:"Cambria","serif";color:#4F81BD'>Worker</span></b><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>The
worker waits on a blocking receive; when a message arrives it processes it. If a <i>done</i>
message arrives the worker finalizes MPI and exits normally.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Hardening</span></b><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>: There is no
special requirement for hardening here. If the worker encounters a
communication problem with the master, it means that the master is down and
it’s okay to abort the entire job. Thus, it will use the default error
handler (which aborts on errors).  Note that we do not need to modify the
client at all to make the application FT (except the master).<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Pseudo
code for the hardened worker:<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
color:#548DD4;background:white'>int</span><span style='font-size:10.0pt;
font-family:"Courier New";color:black;background:white'> main()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>{</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#984806'>MPI_Init</span>()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#548DD4'>for</span>(;;)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    {</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#984806'>MPI_Recv</span>(src=0, &query, MPI_COMM_WORLD);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#548DD4'>if</span>(is_done_msg(query))</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>           
<b>break</b>;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>       
process_query(&query, &answer);</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#984806'>MPI_Send</span>(dst=0, &answer, MPI_COMM_WORLD);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    }</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New"'> </span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#984806'>MPI_Finalize</span>()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>}</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Notice
that for this FT code there is no requirement for the worker to rejoin the
comm, as the only communicator used is MPI_COMM_WORLD.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div style='margin-top:10.0pt'>

<p class=MsoNormal><b><span style='font-family:"Cambria","serif";color:#4F81BD'>Master</span></b><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>The
master code reads queries from a stream and passes them on to the workers to process.
The master goes through several phases. In the initialization phase it sends
the first request to each one of the ranks; in the second one it shuts down any
unnecessary ranks (if the job is too small); in the third phase it enters its
progress engine where it handles replies (answers), process recovery and
termination (on input end).<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Hardening</span></b><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>: It is the
responsibility of the master to restart any failing workers and make sure that
the request (query) did not get lost if a worker fails. Hence, every time an
error is detected the master will move the worker into repairing state and move
its workload to other workers.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>The
master runs with errors returned rather than aborting (MPI_ERRORS_RETURN).<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><i><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>One
thing to note about the following code: it is not optimized. I did not try to
overlap computation with communication (which is possible); I tried to keep it
as simple as possible for the purpose of discussion.</span></i><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Pseudo
code for the hardened master; the code needed for repairing the failed ranks is
highlighted in yellow.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
color:#548DD4;background:white'>int</span><span style='font-size:10.0pt;
font-family:"Courier New";color:black;background:white'> main()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>{</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#984806'>MPI_Init</span>()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>> <span style='color:#984806'>MPI_Comm_set_errhandler</span>(MPI_COMM_WORLD,
MPI_ERRORS_RETURN);</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#984806'>MPI_Comm_size</span>(MPI_COMM_WORLD,
&n);</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    MPI_Request r[n] = MPI_REQUEST_NULL;</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    QueryMessage q[n];</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    AnswerMessage a[n];</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#548DD4'>int</span>
active_workers = 0;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>> <span style='color:#548DD4'>bool</span> repairing[n]
= false;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    //</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    // Phase 1: send initial requests</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    //</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#548DD4'>for</span>(int
i = 1; i < n; i++)</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    {</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#548DD4'>if</span>(get_next_query(stream, &q[i]) == eof)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>           
<span style='color:#548DD4'>break</span>;</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        active_workers++;</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#984806'>MPI_Send</span>(dest=i, &q[i], MPI_COMM_WORLD);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        rc = <span
style='color:#984806'>MPI_Irecv</span>(src=i, buffer=&a[i],
request=&r[i], MPI_COMM_WORLD)</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#548DD4'>if</span>(rc != MPI_SUCCESS)</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>        
start_repair(i, repairing, q, a, r, stream); </span><span style='font-size:
10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     }</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    }</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    //</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    // Phase 2: finalize any unnecessary ranks</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    //</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#548DD4'>for</span>(int
i = active_workers + 1; i < n; i++)</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    {</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#984806'>MPI_Send</span>(dest=i, &done_msg, MPI_COMM_WORLD);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    }</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    //</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    // The progress engine. Get answers; send
new requests and handle</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    // process repairs</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    //</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#548DD4'>while</span>(active_workers
!= 0)</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    {</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        rc = <span
style='color:#984806'>MPI_Waitany</span>(n, r, &i, MPI_STATUS_IGNORE);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#548DD4'>if</span>(!repairing[i])</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         <span
style='color:#548DD4'>if</span>(rc != MPI_SUCCESS)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>        
    start_repair(i, repairing, q, a, r, stream)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>            
continue;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         }</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>           
process_answer(&a[i]);</span><span style='font-size:10.0pt;font-family:
"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     }</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#548DD4'>else</span> <span style='color:#548DD4'>if</span>(rc != MPI_SUCCESS)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>        
active_workers--;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#1F497D'>}</span><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#548DD4'>else</span></span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>        
repairing[i] = false;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     <span style='color:
#1F497D'>}</span><o:p></o:p></span></p>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif";
color:#1F497D'><o:p> </o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#548DD4'>if</span>(get_next_query(stream, &q[i]) == eof)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>           
active_workers--;</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>           
<span style='color:#984806'>MPI_Send</span>(dest=i, &done_msg)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        }</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        <span
style='color:#548DD4'>else</span></span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>           
<span style='color:#984806'>MPI_Send</span>(dest=i, &q[i])</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>           
rc = <span style='color:#984806'>MPI_Irecv</span>(src=i, buffer=&a[i],
request=&r[i], MPI_COMM_WORLD)</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         <span
style='color:#548DD4'>if</span>(rc != MPI_SUCCESS)</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         {</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>        
    start_repair(i, repairing, q, a, r, stream);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>         }</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>        }</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    }</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>    <span style='color:#984806'>MPI_Finalize</span>()</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:white'>}</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>> <span style='color:#548DD4'>void</span>
start_repair(int i, int repairing[], Query q[], Answer a[], MPI_Request r[],
Stream stream)</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>> {</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     repairing[i] = true;</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     push_query_back(stream,
&q[i]);</span><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>>     MPI_Comm_I<span
style='color:#1F497D'>restart_rank</span>(MPI_COMM_WORLD, i, &r[i]);</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Courier New";
background:yellow'>>>> }</span><span style='font-size:10.0pt;
font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div style='margin-top:10.0pt'>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Cambria","serif";
color:#4F81BD'>Logic description (without FT)</span></b><span style='font-size:
10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>The
master code keeps track of the number of active workers through the
active_workers variable. It is solely used for the purpose of shutdown. When
the master is out of input, it shuts down the workers by sending them a <i>‘done’</i>
message. It decreases the number of active workers and finalizes when this
number reaches zero.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>The
master’s progress engine waits on a vector of requests (note that entry 0
is not used, so as to simplify the code); once it gets an answer it processes it
and sends the next query to that worker until it’s out of input.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div style='margin-top:10.0pt'>

<p class=MsoNormal><b><span style='font-size:10.0pt;font-family:"Cambria","serif";
color:#4F81BD'>Logic description (with FT)</span></b><span style='font-size:
10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>The
master detects a faulty client either synchronously when it tries to initiate an
async receive (no need to check the send, the assumption is that if send
failed, so will the receive call), or async when the async receive completes
with an error. Once an error is detected (and identified as a faulty client, more
about this later), the master starts an async repair of that client. If the
repair succeeds, new work is sent to that client. If it does not, the number of
active workers is decreased and the master has to live with less processing
power.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>The
code above assumes that if the returned code is an error, it should repair the
worker; however as we discussed, there could very well be many different
reasons for an error here, not all of which are related to process failure; for
that we might use something along the lines of<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal style='text-indent:.5in'><span style='font-size:10.0pt;
font-family:"Courier New";color:#0070C0;background:white'>if</span><span
style='font-size:10.0pt;font-family:"Courier New";color:black;background:white'>(</span><span
style='font-size:10.0pt;font-family:"Courier New";color:#984806;background:
white'>MPI_Error_event</span><span style='font-size:10.0pt;font-family:"Courier New";
color:black;background:white'>(rc) == MPI_EVENT_PROCESS_DOWN)...</span><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>it
would be the responsibility of the MPI implementation to encode or store the
event related to the returned error code.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><i><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>(Note:
in MPICH2 there is a mechanism that enables encoding extended error information
in the error code, which then can be retrieved using MPI_Error_string)</span></i><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div style='margin-top:10.0pt'>

<p class=MsoNormal><b><span style='font-family:"Cambria","serif";color:#4F81BD'>Conclusions</span></b><span
style='font-size:10.0pt;font-family:"Calibri","sans-serif"'><o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>I
believe that the solution above describes what we have discussed in the last
meeting. The required APIs to support this FT are really minimal but
already cover a good set of users.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Please,
send your comments.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Thoughts?<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>Thanks,<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>.Erez<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>P.S.
I will post this on the FT wiki pages (with the feedback).<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'>P.P.S.
there is one more scenario that we discussed, an extension of the
master-workers model. I will try to get it written up as soon as possible.<o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

<div>

<p class=MsoNormal><span style='font-size:10.0pt;font-family:"Calibri","sans-serif"'> <o:p></o:p></span></p>

</div>

</div>

</body>

</html>