// Sample code: accurate to better than 10^-4 (average absolute error about 10^-5) and roughly 10 times faster on average than the library arcsine. Apologies for the line breaks.
/* Floating-point type used by asin4 and by its coefficient tables. */
typedef double mytype; // float may be tried here as well

/*
 * Polynomial in x that asin4 evaluates when x < asin4_split1.
 * Coefficients are stored lowest power first (index 0 is the constant term).
 */
const mytype asin4_params1[5] = {
    6.32559537178112e-05,
    9.97002719101181e-01,
    3.23729856176963e-02,
    3.89287300071597e-02,
    1.93549238398372e-01
};

/*
 * Polynomial in x that asin4 evaluates when
 * asin4_split1 <= x < asin4_split2, lowest power first.
 */
const mytype asin4_params2[7] = {
    2.09625797161885e+01,
    -1.74835553411477e+02,
    6.13575281494908e+02,
    -1.14033116228467e+03,
    1.19159992307311e+03,
    -6.63957441058529e+02,
    1.54421991537526e+02
};

/*
 * Polynomial in sqrt(1 - x) that asin4 evaluates for the remaining
 * arguments (x not below asin4_split2), lowest power first.
 */
const mytype asin4_params3[4] = {
    1.57080010233116e+00,
    -1.41437401362252e+00,
    1.84777752400778e-03,
    -1.24625163381900e-01
};

/* Thresholds at which asin4 switches from one polynomial to the next. */
const mytype asin4_split1 = 0.6;
const mytype asin4_split2 = 0.925;
/*
 * Fast piecewise-polynomial approximation of the arcsine.
 *
 * The magnitude of the argument selects one of three fits:
 *   |x| <  asin4_split1                : degree-4 polynomial in |x|
 *   asin4_split1 <= |x| < asin4_split2 : degree-6 polynomial in |x|
 *   otherwise                          : degree-3 polynomial in sqrt(1 - |x|)
 *
 * Negative arguments are handled through the odd symmetry of arcsine,
 * asin(-x) == -asin(x); the polynomial pieces themselves are only ever
 * evaluated at non-negative values.
 *
 * x:       value whose arcsine is wanted.
 * returns: the approximation; NaN when x is NaN, and the result of
 *          sqrt() on a negative number (normally NaN) when |x| > 1.
 *
 * Relies on sqrt() being declared (from <math.h>) before this definition.
 * NOTE(review): the accuracy figures quoted with this code were presumably
 * measured on [0, 1] only - confirm before depending on them.
 */
static inline mytype asin4(mytype x)
{
    /* Fold the argument onto the non-negative half and remember the sign. */
    const int negative = x < 0;
    const mytype ax = negative ? -x : x;
    mytype result;

    if (ax < asin4_split1) {
        /* Horner evaluation, lowest-order coefficient first. */
        result = asin4_params1[0] + ax * (asin4_params1[1] + ax * (asin4_params1[2]
               + ax * (asin4_params1[3] + ax * (asin4_params1[4]))));
    } else if (ax < asin4_split2) {
        result = asin4_params2[0] + ax * (asin4_params2[1] + ax * (asin4_params2[2]
               + ax * (asin4_params2[3] + ax * (asin4_params2[4]
               + ax * (asin4_params2[5] + ax * asin4_params2[6])))));
    } else {
        /* Last piece is a polynomial in sqrt(1 - |x|) rather than in |x|. */
        const mytype root = sqrt(1 - ax);
        result = asin4_params3[0] + root * (asin4_params3[1]
               + root * (asin4_params3[2] + root * asin4_params3[3]));
    }

    return negative ? -result : result;
}











